|
from logging import PlaceHolder |
|
import pandas as pd |
|
from fuzzywuzzy import fuzz |
|
import numpy as np |
|
import streamlit as st |
|
import pyodbc |
|
from streamlit_extras.stateful_button import button |
|
import pymssql |
|
|
|
from streamlit_app import sidebar |
|
|
|
# Must be the first Streamlit command executed on this page: sets the browser
# tab title and makes the app use the full browser width.
st.set_page_config(page_title='DUPLICATE RECORDS DETECTION', layout= 'wide')
|
|
|
|
|
def main():
    """Streamlit page: detect potential duplicate records in a SQL Server table.

    Workflow:
      1. The user picks a source table and the "context" column(s) that define
         a record's identity.
      2. Heuristics (null-rate < 20%, uniqueness between 25% and 75%) suggest
         additional candidate columns for duplicate detection.
      3. Rows are sorted and compared pairwise with fuzzywuzzy; rows whose
         similarity exceeds a fixed threshold share a group, and clusters with
         more than one member are flagged as potential duplicates.
    """
    sidebar()

    st.title('Detect Duplicate Records')
    st.subheader('SELECT TABLE')

    # NOTE(security): credentials are hard-coded here -- move them to
    # st.secrets or environment variables before deploying.
    # BUG FIX: pymssql.connect() takes discrete keyword arguments, not an
    # ODBC-style "Server=...;Database=..." connection string (that string was
    # previously passed as the server name, so the connection could not open).
    conn = pymssql.connect(
        server='sql-ext-dev-uks-001.database.windows.net',
        database='sqldb-ext-dev-uks-001',
        user='dbadmin',
        password='mYpa$$w0rD',
    )
    query1 = "select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='dbo' ORDER BY TABLE_NAME ASC"
    table1 = pd.read_sql_query(query1, con=conn)
    table1['TABLE_NAME'] = table1['TABLE_NAME'].astype('str')
    table_selector = st.selectbox(
        'SOURCE TABLE NAME',
        ['TCM', 'TCVM', 'TEM', 'TPM', 'TPP', 'TPT', 'TRM', 'TSCM', 'TSM'],
        index=None,
        placeholder='Select table for automated column mapping')
    btn11 = button('RUN', key='run11')
    if table_selector is not None and btn11:
        st.markdown('---')
        # table_selector comes from the fixed selectbox list above, so this
        # concatenation cannot inject arbitrary SQL.
        query2 = "select * from [dbo].[" + table_selector + "]"
        df = pd.read_sql_query(query2, con=conn)
        conn.close()
        st.subheader('Data Preview')
        data1 = df.copy()  # untouched copy; presumably meant for a preview widget -- TODO confirm
        # Drop load-metadata columns; they are never part of a duplicate check.
        if set(['ID', 'LOADID', 'FILE_NAME']).issubset(df.columns):
            df = df.drop(['ID', 'LOADID', 'FILE_NAME'], axis=1)

        # Treat empty / whitespace-only cells as missing values.
        df = df.replace(r'^\s*$', np.nan, regex=True)
        if 'SORTL' in df.columns.values.tolist():
            df.drop('SORTL', axis=1, inplace=True)
        main_col = st.multiselect(
            'PLEASE PROVIDE CONTEXT FOR DEDUPLICATION',
            df.columns.values.tolist(),
            placeholder='Select entity for deduplication')
        if main_col:
            # Candidate columns: fewer than 20% nulls ...
            mp = df.isnull().mean() * 100
            col = mp[mp < 20].index.tolist()
            # ... and 25-75% unique values (too unique looks like an ID,
            # too repetitive carries no signal).
            up = df[col].apply(lambda x: len(x.unique()) / len(x) * 100)
            up.sort_values(ascending=False, inplace=True)
            col = up[(up >= 25) & (up <= 75)].index.tolist()
            df = df.replace(np.nan, '')
            if len(main_col) > 1:
                # Concatenate the selected context columns into one synthetic
                # key column so they are compared as a single unit.
                if bool(set(col) & set(main_col)):
                    col = list(set(col) - set(main_col))
                df['main_column'] = ''
                df['main_column'] = df['main_column'].astype(str)
                st.markdown('---')
                st.write('Note: Main_column comprises of concatenated data of above selected context columns')
                for c in main_col:
                    df[c] = df[c].astype(str)
                    df['main_column'] = df['main_column'] + '_' + df[c]
                col.insert(0, 'main_column')
                rem_col = list(set(df.columns.values.tolist()) - set(col))
            else:
                # Single context column: make sure it leads the column list.
                if main_col[0] in col:
                    col.remove(main_col[0])
                col.insert(0, main_col[0])
                rem_col = list(set(df.columns.values.tolist()) - set(col))

            st.write('COLUMNS SUGGESTED BY AI FOR DETERMINING DUPLICATES:\n', pd.DataFrame(col, columns=['Column Name']))
            more_col = st.multiselect(
                'DO YOU WANT TO INCLUDE ANY MORE COLUMN(s)',
                rem_col,
                placeholder='Select optional columns to check for potential duplicates')
            button1 = button('CHECK DUPLICATES', key='btn12')
            if button1:
                if more_col:
                    col = col + more_col
                grp_col = [x + '_based_group' for x in col]
                sort_col = []
                threshold = 80  # fuzz.ratio score above which rows share a group
                for j in range(len(col)):
                    df[col[j]] = df[col[j]].astype(str)
                    df[col[j]] = df[col[j]].str.upper()
                    df[col[j]] = df[col[j]].replace(np.nan, '', regex=True)
                    sort_col = sort_col + [col[j]]
                    df.sort_values(sort_col, inplace=True)
                    df = df.reset_index(drop=True)
                    # Similarity of each row to the row directly above it
                    # (after sorting, near-duplicates end up adjacent).
                    fuzz_col = col[j] + '_fuzzy_ratio'
                    df.at[0, fuzz_col] = 100  # first row has no predecessor
                    # BUG FIX: compute the ratio for every remaining row,
                    # including the last one (it was previously hard-coded to
                    # 100, silently merging it into the preceding group).
                    for i in range(1, len(df)):
                        current = df[col[j]].iloc[i]
                        previous = df[col[j]].iloc[i - 1]
                        df.at[i, fuzz_col] = fuzz.ratio(previous, current)
                    df[fuzz_col] = pd.to_numeric(df[fuzz_col], errors='coerce')
                    # Assign group ids: a row joins the previous row's group
                    # when similar enough, otherwise starts/continues a new
                    # numbering scoped to the previous column's group.
                    group_counter = 1
                    fuzz_group = col[j] + '_based_group'
                    df.at[0, fuzz_group] = group_counter
                    group = df.at[0, fuzz_group]
                    for i in range(1, len(df)):
                        if df.at[i, fuzz_col] > threshold:
                            df.at[i, fuzz_group] = df.at[i - 1, fuzz_group]
                        else:
                            if j >= 1:
                                # BUG FIX: restart numbering when the previous
                                # column's *group* changes; the original
                                # compared against its '_fuzzy_ratio' column,
                                # which is a similarity score, not a group id
                                # (the assignment below shows the intent).
                                if df.at[i, col[j - 1] + '_based_group'] != group:
                                    group_counter = 1
                                    group = df.at[i, col[j - 1] + '_based_group']
                                else:
                                    group_counter += 1
                            else:
                                group_counter += 1
                            df.at[i, fuzz_group] = group_counter

                # One cluster label per unique combination of per-column groups;
                # any label shared by 2+ rows marks those rows as duplicates.
                df['Potential_Duplicate_Cluster'] = df[grp_col].astype(int).astype(str).agg('_'.join, axis=1)
                df['DUPLICATES?'] = df.duplicated(subset='Potential_Duplicate_Cluster', keep=False).map({True: 'Y', False: 'N'})
                df = df[df['DUPLICATES?'] == 'Y']

                def highlight_y(val):
                    # Shade flagged duplicate rows in the data editor.
                    color = 'lightcoral' if val == 'Y' else 'white'
                    return f'background-color: {color}'

                # NOTE: Styler.applymap is deprecated in pandas >= 2.1 in
                # favour of Styler.map; kept for compatibility with the
                # pandas version this app currently pins.
                styled_df = df.style.applymap(highlight_y, subset=['DUPLICATES?'])

                st.markdown('---')
                st.subheader('Results')

                edited_df = st.data_editor(styled_df, num_rows='dynamic')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Standard script entry point: render the page only when executed directly
# (Streamlit runs this file as the top-level script).
if __name__ == '__main__':

    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|