File size: 6,230 Bytes
526fd5a
 
 
 
 
 
d85cd72
62af0a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fbec7e3
526fd5a
 
62af0a0
 
526fd5a
62af0a0
7bbe80f
62af0a0
 
 
 
 
 
 
 
 
526fd5a
62af0a0
 
 
 
526fd5a
62af0a0
 
 
 
 
526fd5a
62af0a0
 
 
526fd5a
62af0a0
 
526fd5a
62af0a0
 
 
 
 
 
 
 
 
 
526fd5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1faefb1
 
 
 
526fd5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1faefb1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import pandas as pd
import gradio as gr
import re 
from datetime import timedelta


def process_data(files_mindbody, files_medserv, tolerance, progress=gr.Progress()):

    try:
        mindbody = load_data(files_mindbody)
        medserv = load_data(files_medserv)
    except Exception as e:
        print(f"An error occurred while loading data: {e}")
        return None
    
    try:
        # Remove multiple commas from the 'Client' column
        medserv['Client'] = medserv['Client'].str.replace(r',+', ',', regex=True)
        mindbody['Client'] = mindbody['Client'].str.replace(r',+', ',', regex=True)
        
        # Split 'Client' names into first name and last name components for both DataFrames
        medserv[['Last Name', 'First Name']] = medserv['Client'].str.split(',', expand=True)
        mindbody[['Last Name', 'First Name']] = mindbody['Client'].str.split(',', expand=True)
    except Exception as e:
        print(f"An error occurred while processing client names: {e}")
    
    try:
        # Split dates if they contain commas in the 'DOS' column of medserv
        medserv['DOS'] = medserv['DOS'].astype(str)
        medserv['DOS'] = medserv['DOS'].str.split(',')
        medserv = medserv.explode('DOS')
        
        # Attempt to convert dates using multiple formats
        formats_to_try = ['%d/%m/%Y', '%Y-%m-%d']  # Add more formats as needed
        for format_to_try in formats_to_try:
            try:
                medserv['DOS'] = pd.to_datetime(medserv['DOS'].str.strip(), format=format_to_try)
                break  # Break out of loop if conversion succeeds
            except ValueError:
                continue  # Continue to next format if conversion fails
    except Exception as e:
        print(f"An error occurred while processing dates in medserv: {e}")
    
    unmatched_rows = []

    try:
        rows = len(mindbody)

        # Iterate through each row in the mindbody DataFrame
        for idx in progress.tdqm(range(rows), desc='Analyzing files...'):
            # Extract relevant information from the current row
            date = mindbody.iloc[idx]['DOS']
            first_name = mindbody.iloc[idx]['First Name']
            last_name = mindbody.iloc[idx]['Last Name']

            # Define the range of dates to search for a match in medserv
            date_range = [date - timedelta(days=i) for i in range(tolerance, -tolerance-1, -1)]
            # Remove the time component from the dates in date_range
            date_range = [d.date() for d in date_range]

            # Filter medserv based on the date range and name criteria
            matches = medserv[((medserv['DOS'].dt.date.isin(date_range)) & 
                               ((medserv['First Name'].str.lower() == first_name.lower()) | 
                                (medserv['Last Name'].str.lower() == last_name.lower())))]

            # If no match is found, append the row to the unmatched_rows list
            if matches.empty:
                unmatched_rows.append(mindbody.iloc[idx])
    except Exception as e:
        print(f"An error occurred while analyzing files: {e}")

    try:
        # Create a DataFrame from the unmatched_rows list
        unmatched_df = pd.DataFrame(unmatched_rows, columns=mindbody.columns)

        # Specify the columns to include in the output Excel file
        columns_to_include = ['DOS', 'Client ID', 'Client', 'Sale ID', 'Item name', 'Location', 'Item Total']

        # Format the 'DOS' column to remove time part
        unmatched_df['DOS'] = unmatched_df['DOS'].dt.strftime('%d-%m-%Y')

        output_file_path = 'Comparison Results.xlsx'
        unmatched_df[columns_to_include].to_excel(output_file_path, index=False)
        
        return output_file_path
    except Exception as e:
        print(f"An error occurred while creating the output file: {e}")
        return None



def load_data(files):
    # Check if a single file or multiple files are provided
    filepaths = [file.name for file in files]
    
    # Load and concatenate multiple files if provided
    dfs = []
    for filepath in filepaths:
        if filepath.endswith('.xlsx') or filepath.endswith('.xls'):
            dfs.append(pd.read_excel(filepath))
        else:
            raise gr.Error("Unsupported file format: Please provide a .xls or .xlsx file")
    
    # Concatenate dataframes if more than one file is provided
    if len(dfs) > 1:
        df = pd.concat(dfs, ignore_index=True)
    else:
        df = dfs[0]
    
    # Find and rename the date column to 'DOS'
    date_column = find_date_column(df)
    if date_column:
        df.rename(columns={date_column: 'DOS'}, inplace=True)

    # Find and rename the name column to 'Client'
    name_column = find_name_column(df)
    if name_column:
        df.rename(columns={name_column: 'Client'}, inplace=True)

    return df


def find_name_column(df):
    name_pattern = r"^[A-Za-z'-]+,\s[A-Za-z'-]+(?:\s[A-Za-z'-]+)*$"  # Regex pattern for last name, first name(s)

    max_count = 0
    name_column = None

    for column in df.columns:
        # Count matches of the name pattern in each column
        matches = df[column].astype(str).apply(lambda x: bool(re.match(name_pattern, x)))
        valid_count = matches.sum()  # Sum of True values indicating valid names

        # Select the column with the maximum count of valid names
        if valid_count > max_count:
            max_count = valid_count
            name_column = column

    return name_column


def find_date_column(df):
    # Check if 'Treatment dates' column exists
    if 'Treatment dates' in df.columns:
        return 'Treatment dates'

    date_pattern = r"\b\d{2,4}[-/]\d{1,2}[-/]\d{2,4}\b"  # Regex pattern for common date formats

    max_count = 0
    date_column = None

    for column in df.columns:
        # Count matches of the date pattern in each column
        matches = df[column].astype(str).str.contains(date_pattern, na=False)
        valid_count = matches.sum()  # Sum of True values indicating valid dates

        # Select the column with the maximum count of valid dates
        if valid_count > max_count:
            max_count = valid_count
            date_column = column

    return date_column