# pages/4DEDUPLICATION.py
# NOTE: removed Hugging Face page chrome (uploader banner, commit hash
# "4ab64e0", raw/history/blame links, file size) that was pasted above the
# code and would break the Python parser.
from logging import PlaceHolder
import pandas as pd
from fuzzywuzzy import fuzz
import numpy as np
import streamlit as st
import pyodbc
from streamlit_extras.stateful_button import button
import pymssql
############
from streamlit_app import sidebar
st.set_page_config(page_title='DUPLICATE RECORDS DETECTION', layout= 'wide')
######
def main():
    """Streamlit page: detect potential duplicate records in a SQL table.

    Workflow:
      1. Connect to SQL Server and let the user pick a source table.
      2. Auto-suggest candidate matching columns: <20% missing values and a
         uniqueness ratio between 25% and 75%.
      3. For each chosen column, fuzzy-compare consecutive sorted rows
         (fuzz.ratio, threshold 80) and assign hierarchical group ids; rows
         sharing the full chain of per-column group ids form one cluster.
      4. Display only the rows flagged as potential duplicates.
    """
    # st.title('PAGE TITLE') # Change this for each page
    sidebar()
    ########
    st.title('Detect Duplicate Records')
    st.subheader('SELECT TABLE')
    # SECURITY NOTE(review): credentials are hard-coded in source; move them
    # to st.secrets or environment variables before deployment.
    # FIX: pymssql.connect() takes discrete arguments, not a single
    # ODBC-style "key=value;..." string (the original call fails at runtime).
    conn = pymssql.connect(server='sql-ext-dev-uks-001.database.windows.net',
                           user='dbadmin',
                           password='mYpa$$w0rD',
                           database='sqldb-ext-dev-uks-001')
    query1 = "select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='dbo' ORDER BY TABLE_NAME ASC"
    table1 = pd.read_sql_query(query1, con=conn)
    table1['TABLE_NAME'] = table1['TABLE_NAME'].astype('str')
    table_selector = st.selectbox('SOURCE TABLE NAME',
                                  ['TCM', 'TCVM', 'TEM', 'TPM', 'TPP', 'TPT', 'TRM', 'TSCM', 'TSM'],
                                  index=None,
                                  placeholder='Select table for automated column mapping')
    btn11 = button('RUN', key='run11')
    if table_selector is not None and btn11:
        st.markdown('---')
        # NOTE(review): the table name comes from a fixed selectbox list, so
        # the concatenation below is not injectable in practice; use a quoted
        # identifier helper if this list ever becomes user-supplied.
        query2 = "select * from [dbo].[" + table_selector + "]"
        df = pd.read_sql_query(query2, con=conn)
        conn.close()
        st.subheader('Data Preview')
        # Drop load/bookkeeping columns so they do not influence matching.
        if set(['ID', 'LOADID', 'FILE_NAME']).issubset(df.columns):
            df = df.drop(['ID', 'LOADID', 'FILE_NAME'], axis=1)
        df = df.replace(r'^\s*$', np.nan, regex=True)  # blank strings -> NaN
        if 'SORTL' in df.columns.values.tolist():
            df.drop('SORTL', axis=1, inplace=True)
        main_col = st.multiselect('PLEASE PROVIDE CONTEXT FOR DEDUPLICATION',
                                  df.columns.values.tolist(),
                                  placeholder='Select entity for deduplication')
        if main_col:
            mp = df.isnull().mean() * 100  # Missing Percentage per column
            col = mp[mp < 20].index.tolist()
            up = df[col].apply(lambda x: len(x.unique()) / len(x) * 100)  # Unique Percentage
            up.sort_values(ascending=False, inplace=True)
            col = up[(up >= 25) & (up <= 75)].index.tolist()
            df = df.replace(np.nan, '')
            if len(main_col) > 1:
                # Context columns are concatenated into one synthetic
                # 'main_column' that leads the comparison hierarchy.
                if bool(set(col) & set(main_col)):
                    col = list(set(col) - set(main_col))
                df['main_column'] = ''
                df['main_column'] = df['main_column'].astype(str)
                st.markdown('---')
                st.write('Note: Main_column comprises of concatenated data of above selected context columns')
                for i, val in enumerate(main_col):
                    df[main_col[i]] = df[main_col[i]].astype(str)
                    df['main_column'] = df['main_column'] + '_' + df[main_col[i]]
                col.insert(0, 'main_column')
                rem_col = list(set(df.columns.values.tolist()) - set(col))
            else:
                # Single context column: make sure it is first in the list.
                if main_col[0] in col:
                    col.remove(main_col[0])
                col.insert(0, main_col[0])
                rem_col = list(set(df.columns.values.tolist()) - set(col))
            st.write('COLUMNS SUGGESTED BY AI FOR DETERMINING DUPLICATES:\n', pd.DataFrame(col, columns=['Column Name']))
            more_col = st.multiselect('DO YOU WANT TO INCLUDE ANY MORE COLUMN(s)', rem_col, placeholder='Select optional columns to check for potential duplicates')
            button1 = button('CHECK DUPLICATES', key='btn12')
            if button1:
                if more_col:
                    col = col + more_col
                grp_col = [x + '_based_group' for x in col]
                sort_col = []
                threshold = 80  # fuzz.ratio score above which rows share a group
                for j in range(len(col)):
                    df[col[j]] = df[col[j]].astype(str)
                    df[col[j]] = df[col[j]].str.upper()
                    df[col[j]] = df[col[j]].replace(np.nan, '', regex=True)
                    # Sort by every column processed so far so similar values
                    # land on consecutive rows for pairwise comparison.
                    sort_col = sort_col + [col[j]]
                    df.sort_values(sort_col, inplace=True)
                    df = df.reset_index(drop=True)
                    fuzz_col = col[j] + '_fuzzy_ratio'
                    df.at[0, fuzz_col] = 100  # first row anchors a group
                    # FIX: compute the ratio for every row, including the last
                    # one. The original hard-coded the last row to 100, which
                    # merged it into the previous group regardless of content.
                    for i in range(1, len(df)):
                        current = df[col[j]].iloc[i]
                        previous = df[col[j]].iloc[i - 1]
                        df.at[i, fuzz_col] = fuzz.ratio(previous, current)
                    df[fuzz_col] = pd.to_numeric(df[fuzz_col], errors='coerce')
                    group_counter = 1
                    fuzz_group = col[j] + '_based_group'
                    df.at[0, fuzz_group] = group_counter
                    group = df.at[0, fuzz_group]
                    for i in range(1, len(df)):
                        if df.at[i, fuzz_col] > threshold:
                            # Similar to the previous row: inherit its group.
                            df.at[i, fuzz_group] = df.at[i - 1, fuzz_group]
                        else:
                            if j >= 1:
                                # FIX: compare the parent column's GROUP id,
                                # not its fuzzy ratio (the original read
                                # '_fuzzy_ratio', a typo): restart numbering
                                # whenever we cross into a new parent group.
                                if df.at[i, col[j - 1] + '_based_group'] != group:
                                    group_counter = 1
                                    group = df.at[i, col[j - 1] + '_based_group']
                                else:
                                    group_counter += 1
                            else:
                                group_counter += 1
                            df.at[i, fuzz_group] = group_counter
                # Cluster id = chain of per-column group ids, e.g. "1_2_1".
                df['Potential_Duplicate_Cluster'] = df[grp_col].astype(int).astype(str).agg('_'.join, axis=1)
                df['DUPLICATES?'] = df.duplicated(subset='Potential_Duplicate_Cluster', keep=False).map({True: 'Y', False: 'N'})
                df = df[df['DUPLICATES?'] == 'Y']

                def highlight_y(val):
                    # Red background for flagged duplicates in the rendered table.
                    color = 'lightcoral' if val == 'Y' else 'white'
                    return f'background-color: {color}'

                styled_df = df.style.applymap(highlight_y, subset=['DUPLICATES?'])
                st.markdown('---')
                st.subheader('Results')
                edited_df = st.data_editor(styled_df, num_rows='dynamic')
######
if __name__ == '__main__':
main()