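"""Streamlit app that flags potential duplicate records in a SQL Server table.

Columns are suggested via missing-value and uniqueness heuristics, rows are
scored with fuzzywuzzy similarity, and matching rows are surfaced as
potential duplicate clusters.
"""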
import pandas as pd
from fuzzywuzzy import fuzz
import numpy as np
import streamlit as st
import pyodbc
from streamlit_extras.stateful_button import button
st.set_page_config(page_title='DUPLICATE RECORDS DETECTION', layout='wide')
st.title('Detect Duplicate Records')
st.subheader('SELECT TABLE')
conn = pyodbc.connect("Driver={ODBC Driver 17 for SQL Server};"
                      "Server=sql-ext-dev-uks-001.database.windows.net;"
                      "Database=sqldb-ext-dev-uks-001;"
                      "UID=dbadmin;"
                      "PWD=mYpa$$w0rD")
query1 = "select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='dbo' ORDER BY TABLE_NAME ASC"
table1 = pd.read_sql_query(query1, con=conn)  # lists the available dbo tables; the selector below uses a fixed list
table1['TABLE_NAME'] = table1['TABLE_NAME'].astype('str')
table_selector = st.selectbox('SOURCE TABLE NAME', ['TCM', 'TCVM', 'TEM', 'TPM', 'TPP', 'TPT', 'TRM', 'TSCM', 'TSM'],
                              index=None, placeholder='Select table for automated column mapping')
btn11 = button('RUN', key='run11')
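# Everything below runs only after a source table is selected and RUN is pressed.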
if table_selector is not None and btn11:
    st.markdown('---')
    query2 = "select * from [dbo].[" + table_selector + "]"
    df = pd.read_sql_query(query2, con=conn)
    conn.close()
    st.subheader('Data Preview')
    data1 = df.copy()  # untouched copy of the source data
    # Drop technical/load columns that should not influence duplicate detection.
    if set(['ID', 'LOADID', 'FILE_NAME']).issubset(df.columns):
        df = df.drop(['ID', 'LOADID', 'FILE_NAME'], axis=1)
    df = df.replace(r'^\s*$', np.nan, regex=True)  # treat blank strings as missing
    if 'SORTL' in df.columns.values.tolist():
        df.drop('SORTL', axis=1, inplace=True)
    main_col = st.multiselect('PLEASE PROVIDE CONTEXT FOR DEDUPLICATION', df.columns.values.tolist(),
                              placeholder='Select entity for deduplication')
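    # Heuristic column suggestion: keep columns with under 20% missing values,
    # then keep those whose uniqueness ratio falls between 25% and 75% --
    # columns that vary enough to discriminate but still repeat across rows.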
    if main_col:
        mp = df.isnull().mean() * 100  # missing percentage per column
        col = mp[mp < 20].index.tolist()
        up = df[col].apply(lambda x: len(x.unique()) / len(x) * 100)  # unique percentage per column
        up.sort_values(ascending=False, inplace=True)
        col = up[(up >= 25) & (up <= 75)].index.tolist()
        df = df.replace(np.nan, '')
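        # With multiple context columns, concatenate them into a synthetic
        # 'main_column' so the combined context is compared as one value.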
        if len(main_col) > 1:
            if bool(set(col) & set(main_col)):
                col = list(set(col) - set(main_col))
            df['main_column'] = ''
            df['main_column'] = df['main_column'].astype(str)
            st.markdown('---')
            st.write('Note: main_column contains the concatenated data of the selected context columns')
            for c in main_col:
                df[c] = df[c].astype(str)
                df['main_column'] = df['main_column'] + '_' + df[c]
            col.insert(0, 'main_column')
            rem_col = list(set(df.columns.values.tolist()) - set(col))
        else:
            if main_col[0] in col:
                col.remove(main_col[0])
            col.insert(0, main_col[0])
            rem_col = list(set(df.columns.values.tolist()) - set(col))
        st.write('COLUMNS SUGGESTED BY AI FOR DETERMINING DUPLICATES:\n', pd.DataFrame(col, columns=['Column Name']))
        more_col = st.multiselect('DO YOU WANT TO INCLUDE ANY MORE COLUMN(s)', rem_col,
                                  placeholder='Select optional columns to check for potential duplicates')
        button1 = button('CHECK DUPLICATES', key='btn12')
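        # Fuzzy duplicate detection: rows are grouped per column by comparing
        # each row with its sorted neighbour; rows that land in the same group
        # across all columns form a potential duplicate cluster.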
        if button1:
            if more_col:
                col = col + more_col
            grp_col = [x + '_based_group' for x in col]
            sort_col = []
            last_row_index = len(df) - 1
            threshold = 80  # minimum fuzz.ratio (0-100) for two neighbouring rows to share a group
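            # Pass 1 per column: sort on all columns processed so far, then give
            # each row a similarity score (fuzz.ratio) against the previous row.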
            for j in range(len(col)):
                df[col[j]] = df[col[j]].astype(str)
                df[col[j]] = df[col[j]].str.upper()
                df[col[j]] = df[col[j]].replace(np.nan, '', regex=True)
                sort_col = sort_col + [col[j]]
                df.sort_values(sort_col, inplace=True)
                df = df.reset_index(drop=True)
                fuzz_col = col[j] + '_fuzzy_ratio'
                df.at[0, fuzz_col] = 100  # first row has no predecessor; seed value only
                for i in range(1, len(df)):  # score every row against its sorted predecessor
                    current = df[col[j]].iloc[i]
                    previous = df[col[j]].iloc[i - 1]
                    fuzzy_ratio = fuzz.ratio(previous, current)
                    df.at[i, fuzz_col] = fuzzy_ratio
                df[fuzz_col] = pd.to_numeric(df[fuzz_col], errors='coerce')
                # Pass 2 per column: assign group ids. A row joins its
                # predecessor's group when the similarity clears the threshold;
                # otherwise numbering restarts whenever the previous column's
                # group changes, so group ids are nested within parent groups.
                group_counter = 1
                fuzz_group = col[j] + '_based_group'
                df.at[0, fuzz_group] = group_counter
                group = df.at[0, fuzz_group]
                for i in range(1, len(df)):
                    if df.at[i, fuzz_col] > threshold:
                        df.at[i, fuzz_group] = df.at[i - 1, fuzz_group]
                    else:
                        if j >= 1:
                            if df.at[i, col[j - 1] + '_based_group'] != group:
                                group_counter = 1
                                group = df.at[i, col[j - 1] + '_based_group']
                            else:
                                group_counter += 1
                        else:
                            group_counter += 1
                        df.at[i, fuzz_group] = group_counter
                # threshold = threshold * 0.9
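            # Rows that share the same group id for every column get the same
            # composite cluster key; any key that appears more than once marks
            # a potential duplicate cluster.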
            df['Potential_Duplicate_Cluster'] = df[grp_col].astype(int).astype(str).agg('_'.join, axis=1)
            df['DUPLICATES?'] = df.duplicated(subset='Potential_Duplicate_Cluster', keep=False).map({True: 'Y', False: 'N'})
            df = df[df['DUPLICATES?'] == 'Y']

            # Highlight flagged rows in the result grid.
            def highlight_y(val):
                color = 'lightcoral' if val == 'Y' else 'white'
                return f'background-color: {color}'

            styled_df = df.style.applymap(highlight_y, subset=['DUPLICATES?'])
            st.markdown('---')
            st.subheader('Results')
            # st.write(styled_df)
            edited_df = st.data_editor(styled_df, num_rows='dynamic')
            # out = df.to_csv()
            # st.download_button(label='DOWNLOAD DUPLICATE CLUSTER DATA', data=out, file_name='Duplicate_Clusters.csv', mime='text/csv')