Spaces:

pwc-india
/

data_discovery_source_target_mapping

Sleeping

App Files Files Community

data_discovery_source_target_mapping / backup /1DATA CATALOG.py

pwc-india

Upload 108 files

41c4cf9 verified about 1 month ago

raw

history blame contribute delete

11.5 kB

	import pandas as pd
	import numpy as np
	import streamlit as st
	import sdv
	from sdv.datasets.local import load_csvs
	from sdv.metadata import MultiTableMetadata
	from sdv.multi_table import HMASynthesizer
	import time
	import os
	import gc
	import warnings
	from PIL import Image
	from sdv.metadata import SingleTableMetadata
	import pyodbc
	import google.generativeai as genai
	import textwrap
	from streamlit_extras.stylable_container import stylable_container
	genai.configure(api_key='AIzaSyDgS-r-wKmJJ6g2SawaV8ULa-DpTvRjBa0')
	genai_mod = genai.GenerativeModel(
	model_name='models/gemini-1.5-pro-latest'
	)

	st.set_page_config(page_title='DATA DISCOVERY')
	st.title('AUTOMATED DATA CATALOGUE')
	st.subheader('SELECT SOURCE')
	select1=st.selectbox('SOURCE NAME',('DB_10001','Marcopolo_db'),key='dbname',index=None,placeholder='Select database name')
	if select1 =='DB_10001':
	datasets = load_csvs(
	folder_name='C:\Applications\MARCO POLO O AIML\DATA CATALOG\BIKE_STORE_DATABASE',
	read_csv_parameters={
	'skipinitialspace': True,
	'encoding': 'utf_8'
	})
	st.markdown(f"System has found :orange[{str(len(datasets))} tables] in the source. Please proceed with selection of mode of discovery.")
	select_main = st.selectbox('Please Select Mode of Discovery',('Single Table Discovery','Multi Table Discovery'),key='mainname',index=None,placeholder='Select Mode of Discovery')
	if select_main == 'Multi Table Discovery':
	with st.spinner('Performing Data Discovery'):
	time.sleep(2)
	st.success('Data cataloguing complete!')
	datasets = load_csvs(
	folder_name='C:\Applications\MARCO POLO O AIML\DATA CATALOG\BIKE_STORE_DATABASE',
	read_csv_parameters={
	'skipinitialspace': True,
	'encoding': 'utf_8'
	})
	metadata = MultiTableMetadata()
	metadata.detect_from_csvs(folder_name='C:\Applications\MARCO POLO O AIML\DATA CATALOG\BIKE_STORE_DATABASE')
	python_dict = metadata.to_dict()
	st.markdown('---')
	st.subheader('DATA CATALOG')
	# st.json(python_dict)
	brands=datasets['brands']
	categories=datasets['categories']
	customers=datasets['CUSTOMER_MASTER_TBL_1']
	orderitems=datasets['order_items']
	orders=datasets['orders']
	products=datasets['products']
	staffs=datasets['staffs']
	stocks=datasets['stocks']
	stores=datasets['stores']
	tables=python_dict['tables']
	table_names=[*tables]
	col1, col2, col3 = st.columns([2,2,2])

	with col1:
	def view_callback():
	st.session_state.tdet = False
	view= st.button("LIST TABLES",key='view',on_click=view_callback)
	with col2:
	if 'tdet' not in st.session_state:
	st.session_state.tdet = False
	tdet1 = st.button("SHOW TABLE DETAILS")
	with col3:
	rel=st.button('SHOW RELATIONSHIPS',key='rel',on_click=view_callback)

	if tdet1:
	st.session_state.tdet = tdet1
	if view:
	#st.write(python_dict)

	st.write(pd.DataFrame(table_names,columns=['TABLE NAME']))

	if rel:
	rlist1=python_dict['relationships']
	rdf=pd.DataFrame(columns=['PARENT TABLE','CHILD TABLE','PARENT PRIMARY KEY','CHILD FOREIGN KEY'])
	for i in range(len(rlist1)):
	rlist=rlist1[i]
	nrow=pd.DataFrame({'PARENT TABLE':rlist['parent_table_name'],'CHILD TABLE':rlist['child_table_name'],'PARENT PRIMARY KEY':rlist['parent_primary_key'],'CHILD FOREIGN KEY':rlist['child_foreign_key']},index=[i])
	rdf=pd.concat([rdf,nrow],ignore_index=True)
	st.write(rdf)
	if st.session_state.tdet is True:
	def tdet_callback():
	st.session_state.tdet=True
	st.subheader('Select table name to view')
	sbox1=st.selectbox('TABLE NAME',table_names,index=None,placeholder='Select table name',on_change=tdet_callback)
	col4, col5 = st.columns([1, 3])
	with col4:
	preview= st.button("PREVIEW TABLE",key='preview')
	with col5:
	cdet = st.button("GET COLUMN DETAILS",key='prof')
	if preview:
	st.write(datasets[sbox1])
	if cdet:
	cdetails=pd.DataFrame(columns=['Column Name','Data Type','Personal Identifiable Information'])
	t_dict=tables[sbox1]
	c_dict=t_dict['columns']
	i=0
	for key in c_dict:
	e_dict=c_dict[key]
	if 'pii' in e_dict:
	p='YES'
	else:
	p='NO'
	if e_dict['sdtype']=='datetime':
	v=e_dict['sdtype']+': '+e_dict['datetime_format']
	else:
	v=e_dict['sdtype']
	new_row=pd.DataFrame({'Column Name':key,'Data Type':v,'Personal Identifiable Information':p},index=[i])
	cdetails=pd.concat([cdetails, new_row],ignore_index=True)
	i=i+1
	if 'primary_key' in t_dict:
	st.write('Primary Key:',t_dict['primary_key'])
	else:
	st.write('Primary Key: No key can be detected')
	st.write(cdetails)

	if select_main == 'Single Table Discovery':
	metadata = SingleTableMetadata()
	conn = pyodbc.connect("Driver={ODBC Driver 17 for SQL Server};"
	"Server=ipzilnpxsssp001.database.windows.net;"
	"Database=Marcopolo_DB;"
	"UID=ssikder004;"
	"PWD=Marcopolo@123" )
	query1="select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='Client' ORDER BY TABLE_NAME ASC"
	table1=pd.read_sql_query(query1,con=conn)
	table1['TABLE_NAME']=table1['TABLE_NAME'].astype('str')
	table_selector=st.selectbox('SOURCE TABLE NAME',['brands','categories','CUSTOMER_MASTER_TBL_1','orders','order_items','products','staffs','stocks','stores'],index=None,placeholder='Select table for automated column mapping')
	if table_selector is not None:
	st.markdown('---')
	query2="select * from [Client].["+table_selector+"]"
	df = pd.read_sql_query(query2,con=conn)
	main_list=df.columns.to_list()
	sub_list=['ID','LOADID','FILE_NAME']
	if any(main_list[i:i+len(sub_list)] == sub_list for i in range(len(main_list) - len(sub_list) + 1)):
	df=df.drop(['ID','LOADID','FILE_NAME'],axis=1)
	conn.close()
	metadata = SingleTableMetadata()
	metadata.detect_from_dataframe(df)
	python_dict = metadata.to_dict()
	if f'cont_{table_selector}' not in st.session_state:
	with st.spinner("Processing Table"):
	# Create a GenerativeModel instance
	genai_mod = genai.GenerativeModel(
	model_name='models/gemini-1.5-pro-latest'
	)
	if 'primary_key' in python_dict:
	primary_key = python_dict['primary_key']
	else:
	primary_key = "Could Not be Identified"


	story = f""" Details of the table:
	table columns: {str(list(df.columns))}
	column datatypes: {str(df.dtypes.to_string())}
	table sample data: {df.head(10).to_string()}
	"""
	response = genai_mod.generate_content(textwrap.dedent("""
	You are a SAP Data Migration expert. Please return a narration about the data. The narration should Include primary key name(if any) and a intellectual guess about the table schema. The data is a SAP data, you have to guess the object name/class name/schema name etc. of that data. Don't add unnecessary details. Strictly stick to the informations provided only.
	Important: Please consider All fields are mandetorily during your analysis.

	Here is the table details:

	""") + story + f"The Primary Key is:{primary_key}" )
	st.write(response.usage_metadata)
	st.session_state[f'cont_{table_selector}'] = response.text
	with stylable_container(
	key=f"container_with_border",
	css_styles="""
	{
	border: 1px solid white;
	border-radius: 0.5rem;
	padding: calc(1em - 1px);
	width: 110%; /* Set container width to 100% */
	}
	"""
	):
	st.write(st.session_state[f'cont_{table_selector}'])
	col9, col10, col11 = st.columns([2, 3, 9])
	with col9:
	preview= st.button("PREVIEW TABLE",key='preview')
	# with col10:
	# cdet = st.button("GET COLUMN DETAILS",key='prof')
	if preview:
	st.dataframe(df)
	# if cdet:
	# cdetails=pd.DataFrame(columns=['Column Name','Data Type','Personal Identifiable Information'])
	# c_dict=python_dict['columns']
	# i=0
	# for key in c_dict:
	# e_dict=c_dict[key]
	# if 'pii' in e_dict:
	# p='YES'
	# else:
	# p='NO'
	# if e_dict['sdtype']=='datetime':
	# v=e_dict['sdtype']+': '+e_dict['datetime_format']
	# else:
	# v=e_dict['sdtype']
	# new_row=pd.DataFrame({'Column Name':key,'Data Type':v,'Personal Identifiable Information':p},index=[i])
	# cdetails=pd.concat([cdetails, new_row],ignore_index=True)
	# i=i+1
	# if 'primary_key' in python_dict:
	# st.write('Primary Key:',python_dict['primary_key'])
	# else:
	# st.write('Primary Key: No key can be detected')
	# st.write(cdetails)