File size: 6,767 Bytes
8ead88a
 
 
 
 
 
 
 
4ab64e0
 
8ead88a
 
4ab64e0
 
 
 
 
 
 
173bd24
 
 
 
 
 
 
 
 
 
 
 
8ead88a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4ab64e0
 
 
8ead88a
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from logging import PlaceHolder
import pandas as pd
from fuzzywuzzy import fuzz
import numpy as np
import streamlit as st
import pyodbc
from streamlit_extras.stateful_button import button
import pymssql
############
from streamlit_app import sidebar

st.set_page_config(page_title='DUPLICATE RECORDS DETECTION', layout= 'wide')

######
def main():
    # st.title('PAGE TITLE')  # Change this for each page
    sidebar()
########

    st.title('Detect Duplicate Records')
    st.subheader('SELECT TABLE')
    conn = pymssql.connect( "Server=sql-ext-dev-uks-001.database.windows.net;"
                            "Database=sqldb-ext-dev-uks-001;"
                            "UID=dbadmin;"
                            "PWD=mYpa$$w0rD" )
    query1="select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='dbo' ORDER BY TABLE_NAME ASC"
    table1=pd.read_sql_query(query1,con=conn)
    table1['TABLE_NAME']=table1['TABLE_NAME'].astype('str')
    table_selector=st.selectbox('SOURCE TABLE NAME',['TCM', 'TCVM','TEM', 'TPM', 'TPP', 'TPT', 'TRM', 'TSCM', 'TSM'],index=None,placeholder='Select table for automated column mapping')
    btn11=button('RUN',key='run11')
    if table_selector is not None and btn11:
        st.markdown('---')
        query2="select * from [dbo].["+table_selector+"]"
        df = pd.read_sql_query(query2,con=conn)
        conn.close()
        st.subheader('Data Preview')
        data1=df.copy()
        if set(['ID','LOADID','FILE_NAME']).issubset(df.columns):
            df=df.drop(['ID','LOADID','FILE_NAME'],axis=1)

        df = df.replace(r'^\s*$', np.nan, regex=True)
        if 'SORTL' in df.columns.values.tolist():
            df.drop('SORTL',axis=1,inplace=True)
        main_col=st.multiselect('PLEASE PROVIDE CONTEXT FOR DEDUPLICATION',df.columns.values.tolist(),placeholder='Select entity for deduplication')
        if main_col:
            mp = df.isnull().mean()*100                               ## Missing Percentage
            col = mp[mp<20].index.tolist()
            print(col)
            up = df[col].apply(lambda x: len(x.unique())/len(x)*100)  ## Unique Percentage
            up.sort_values(ascending=False,inplace=True)
            col = up[(up>=25)&(up<=75)].index.tolist()
            df=df.replace(np.nan,'')
            if len(main_col)>1:
                if bool(set(col)&set(main_col)):
                    col=list(set(col)-set(main_col))
                df['main_column']=''
                df['main_column']=df['main_column'].astype(str)
                st.markdown('---')
                st.write('Note: Main_column comprises of concatenated data of above selected context columns')
                for i,val in enumerate(main_col):
                    df[main_col[i]]=df[main_col[i]].astype(str)
                    df['main_column']=df['main_column']+'_'+df[main_col[i]]
                col.insert(0,'main_column')
                rem_col=list(set(df.columns.values.tolist())-set(col))
            else:
                if main_col[0] in col:
                    col.remove(main_col[0])
                col.insert(0,main_col[0])
                rem_col=list(set(df.columns.values.tolist())-set(col))
            
            st.write('COLUMNS SUGGESTED BY AI FOR DETERMINING DUPLICATES:\n',pd.DataFrame(col,columns=['Column Name']))
            more_col=st.multiselect('DO YOU WANT TO INCLUDE ANY MORE COLUMN(s)',rem_col,placeholder='Select optional columns to check for potential duplicates')
            button1=button('CHECK DUPLICATES', key='btn12')
            if button1:
                if more_col:
                    col=col+more_col
                grp_col = [x+'_based_group' for x in col]
                sort_col=[]
                last_row_index = len(df)-1
                print(col)
                threshold=80                                              ## Threshold is set to 80
                for j in range(len(col)):
                    df[col[j]]=df[col[j]].astype(str)
                    df[col[j]]=df[col[j]].str.upper()
                    df[col[j]] = df[col[j]].replace(np.nan, '', regex=True)
                    sort_col = sort_col+[col[j]]
                    df.sort_values(sort_col, inplace=True)
                    df = df.reset_index(drop=True)
                    fuzz_col = col[j]+'_fuzzy_ratio'
                    df.at[0,fuzz_col]=100
                    df.at[last_row_index,fuzz_col]=100
                    for i in range(1,last_row_index):
                        current = df[col[j]].iloc[i]
                        previous = df[col[j]].iloc[i-1]
                        fuzzy_ratio = fuzz.ratio(previous,current)
                        df.at[i,fuzz_col] = fuzzy_ratio
                    df[fuzz_col] = pd.to_numeric(df[fuzz_col], errors='coerce')
                    group_counter = 1
                    fuzz_group = col[j]+'_based_group'
                    df.at[0,fuzz_group] = group_counter
                    group = df.at[0,fuzz_group]
                    for i in range (1, len(df)):
                        if df.at[i,fuzz_col] > threshold:
                            df.at[i,fuzz_group] = df.at[i-1,fuzz_group]
                        else:
                            if j>=1:
                                if df.at[i,col[j-1]+'_fuzzy_ratio'] != group:
                                    group_counter = 1
                                    group = df.at[i,col[j-1]+'_based_group']
                                else:
                                    group_counter +=1
                            else:
                                group_counter += 1
                        df.at[i,fuzz_group] = group_counter
                    #threshold=threshold*0.9
                df['Potential_Duplicate_Cluster'] = df[grp_col].astype(int).astype(str).agg('_'.join, axis=1)
                df['DUPLICATES?']=df.duplicated(subset='Potential_Duplicate_Cluster',keep=False).map({True:'Y',False:'N'})
                df = df[df['DUPLICATES?']=='Y']
                # Define a function to apply the style
                def highlight_y(val):
                    color = 'lightcoral' if val=='Y' else 'white'
                    return f'background-color: {color}'

                # Apply styles
                styled_df = df.style.applymap(highlight_y,subset=['DUPLICATES?'])
                
                st.markdown('---')
                st.subheader('Results')
                #st.write(styled_df)
                edited_df=st.data_editor(styled_df,num_rows='dynamic')
                
                #out=df.to_csv()
                #st.download_button(label='DOWNLOAD DUPLICATE CLUSTER DATA',data=out, file_name='Duplicate_Clusters.csv',mime='text/csv')

            

######
if __name__ == '__main__':
    main()