File size: 6,767 Bytes
8ead88a 4ab64e0 8ead88a 4ab64e0 173bd24 8ead88a 4ab64e0 8ead88a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
from logging import PlaceHolder
import pandas as pd
from fuzzywuzzy import fuzz
import numpy as np
import streamlit as st
import pyodbc
from streamlit_extras.stateful_button import button
import pymssql
############
from streamlit_app import sidebar
st.set_page_config(page_title='DUPLICATE RECORDS DETECTION', layout= 'wide')
######
def main():
# st.title('PAGE TITLE') # Change this for each page
sidebar()
########
st.title('Detect Duplicate Records')
st.subheader('SELECT TABLE')
conn = pymssql.connect( "Server=sql-ext-dev-uks-001.database.windows.net;"
"Database=sqldb-ext-dev-uks-001;"
"UID=dbadmin;"
"PWD=mYpa$$w0rD" )
query1="select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='dbo' ORDER BY TABLE_NAME ASC"
table1=pd.read_sql_query(query1,con=conn)
table1['TABLE_NAME']=table1['TABLE_NAME'].astype('str')
table_selector=st.selectbox('SOURCE TABLE NAME',['TCM', 'TCVM','TEM', 'TPM', 'TPP', 'TPT', 'TRM', 'TSCM', 'TSM'],index=None,placeholder='Select table for automated column mapping')
btn11=button('RUN',key='run11')
if table_selector is not None and btn11:
st.markdown('---')
query2="select * from [dbo].["+table_selector+"]"
df = pd.read_sql_query(query2,con=conn)
conn.close()
st.subheader('Data Preview')
data1=df.copy()
if set(['ID','LOADID','FILE_NAME']).issubset(df.columns):
df=df.drop(['ID','LOADID','FILE_NAME'],axis=1)
df = df.replace(r'^\s*$', np.nan, regex=True)
if 'SORTL' in df.columns.values.tolist():
df.drop('SORTL',axis=1,inplace=True)
main_col=st.multiselect('PLEASE PROVIDE CONTEXT FOR DEDUPLICATION',df.columns.values.tolist(),placeholder='Select entity for deduplication')
if main_col:
mp = df.isnull().mean()*100 ## Missing Percentage
col = mp[mp<20].index.tolist()
print(col)
up = df[col].apply(lambda x: len(x.unique())/len(x)*100) ## Unique Percentage
up.sort_values(ascending=False,inplace=True)
col = up[(up>=25)&(up<=75)].index.tolist()
df=df.replace(np.nan,'')
if len(main_col)>1:
if bool(set(col)&set(main_col)):
col=list(set(col)-set(main_col))
df['main_column']=''
df['main_column']=df['main_column'].astype(str)
st.markdown('---')
st.write('Note: Main_column comprises of concatenated data of above selected context columns')
for i,val in enumerate(main_col):
df[main_col[i]]=df[main_col[i]].astype(str)
df['main_column']=df['main_column']+'_'+df[main_col[i]]
col.insert(0,'main_column')
rem_col=list(set(df.columns.values.tolist())-set(col))
else:
if main_col[0] in col:
col.remove(main_col[0])
col.insert(0,main_col[0])
rem_col=list(set(df.columns.values.tolist())-set(col))
st.write('COLUMNS SUGGESTED BY AI FOR DETERMINING DUPLICATES:\n',pd.DataFrame(col,columns=['Column Name']))
more_col=st.multiselect('DO YOU WANT TO INCLUDE ANY MORE COLUMN(s)',rem_col,placeholder='Select optional columns to check for potential duplicates')
button1=button('CHECK DUPLICATES', key='btn12')
if button1:
if more_col:
col=col+more_col
grp_col = [x+'_based_group' for x in col]
sort_col=[]
last_row_index = len(df)-1
print(col)
threshold=80 ## Threshold is set to 80
for j in range(len(col)):
df[col[j]]=df[col[j]].astype(str)
df[col[j]]=df[col[j]].str.upper()
df[col[j]] = df[col[j]].replace(np.nan, '', regex=True)
sort_col = sort_col+[col[j]]
df.sort_values(sort_col, inplace=True)
df = df.reset_index(drop=True)
fuzz_col = col[j]+'_fuzzy_ratio'
df.at[0,fuzz_col]=100
df.at[last_row_index,fuzz_col]=100
for i in range(1,last_row_index):
current = df[col[j]].iloc[i]
previous = df[col[j]].iloc[i-1]
fuzzy_ratio = fuzz.ratio(previous,current)
df.at[i,fuzz_col] = fuzzy_ratio
df[fuzz_col] = pd.to_numeric(df[fuzz_col], errors='coerce')
group_counter = 1
fuzz_group = col[j]+'_based_group'
df.at[0,fuzz_group] = group_counter
group = df.at[0,fuzz_group]
for i in range (1, len(df)):
if df.at[i,fuzz_col] > threshold:
df.at[i,fuzz_group] = df.at[i-1,fuzz_group]
else:
if j>=1:
if df.at[i,col[j-1]+'_fuzzy_ratio'] != group:
group_counter = 1
group = df.at[i,col[j-1]+'_based_group']
else:
group_counter +=1
else:
group_counter += 1
df.at[i,fuzz_group] = group_counter
#threshold=threshold*0.9
df['Potential_Duplicate_Cluster'] = df[grp_col].astype(int).astype(str).agg('_'.join, axis=1)
df['DUPLICATES?']=df.duplicated(subset='Potential_Duplicate_Cluster',keep=False).map({True:'Y',False:'N'})
df = df[df['DUPLICATES?']=='Y']
# Define a function to apply the style
def highlight_y(val):
color = 'lightcoral' if val=='Y' else 'white'
return f'background-color: {color}'
# Apply styles
styled_df = df.style.applymap(highlight_y,subset=['DUPLICATES?'])
st.markdown('---')
st.subheader('Results')
#st.write(styled_df)
edited_df=st.data_editor(styled_df,num_rows='dynamic')
#out=df.to_csv()
#st.download_button(label='DOWNLOAD DUPLICATE CLUSTER DATA',data=out, file_name='Duplicate_Clusters.csv',mime='text/csv')
######
if __name__ == '__main__':
main()
|