File size: 6,230 Bytes
526fd5a d85cd72 62af0a0 fbec7e3 526fd5a 62af0a0 526fd5a 62af0a0 7bbe80f 62af0a0 526fd5a 62af0a0 526fd5a 62af0a0 526fd5a 62af0a0 526fd5a 62af0a0 526fd5a 62af0a0 526fd5a 1faefb1 526fd5a 1faefb1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
import pandas as pd
import gradio as gr
import re
from datetime import timedelta
def process_data(files_mindbody, files_medserv, tolerance, progress=gr.Progress()):
    """Find MindBody rows that have no counterpart in MedServ and export them.

    Both uploads are loaded with ``load_data`` (which renames the detected
    date column to 'DOS' and the detected name column to 'Client'). A
    MindBody row counts as matched when some MedServ row has a 'DOS' within
    ``tolerance`` days of it AND shares either the first name or the last
    name (case-insensitive).

    Args:
        files_mindbody: Uploaded MindBody file(s), passed through to ``load_data``.
        files_medserv: Uploaded MedServ file(s), passed through to ``load_data``.
        tolerance: Number of days before/after the MindBody date that still
            counts as a match. Coerced to ``int`` because ``range`` rejects
            floats.
        progress: Gradio progress tracker used to report loop progress.

    Returns:
        The path of the written Excel file ('Comparison Results.xlsx'), or
        ``None`` if loading the inputs or writing the output failed.
    """
    try:
        mindbody = load_data(files_mindbody)
        medserv = load_data(files_medserv)
    except Exception as e:
        print(f"An error occurred while loading data: {e}")
        return None

    try:
        # Remove multiple commas from the 'Client' column
        medserv['Client'] = medserv['Client'].str.replace(r',+', ',', regex=True)
        mindbody['Client'] = mindbody['Client'].str.replace(r',+', ',', regex=True)
        # Split 'Client' names into first name and last name components for both DataFrames
        medserv[['Last Name', 'First Name']] = medserv['Client'].str.split(',', expand=True)
        mindbody[['Last Name', 'First Name']] = mindbody['Client'].str.split(',', expand=True)
    except Exception as e:
        print(f"An error occurred while processing client names: {e}")

    try:
        # A single MedServ cell may hold several comma-separated dates;
        # give each date its own row.
        medserv['DOS'] = medserv['DOS'].astype(str)
        medserv['DOS'] = medserv['DOS'].str.split(',')
        medserv = medserv.explode('DOS')
        # Attempt to convert dates using multiple formats
        formats_to_try = ['%d/%m/%Y', '%Y-%m-%d']  # Add more formats as needed
        for format_to_try in formats_to_try:
            try:
                medserv['DOS'] = pd.to_datetime(medserv['DOS'].str.strip(), format=format_to_try)
                break  # Break out of loop if conversion succeeds
            except ValueError:
                continue  # Continue to next format if conversion fails
        else:
            # No format parsed the column; say so instead of failing silently.
            print("An error occurred while processing dates in medserv: "
                  "no supported date format matched the 'DOS' column")
    except Exception as e:
        print(f"An error occurred while processing dates in medserv: {e}")

    unmatched_rows = []
    try:
        # Loop-invariant projections of medserv, computed once rather than
        # once per MindBody row.
        medserv_dates = medserv['DOS'].dt.date
        medserv_first = medserv['First Name'].str.lower()
        medserv_last = medserv['Last Name'].str.lower()
        # range() only accepts integers; UI widgets may deliver floats.
        day_span = int(tolerance)
        day_offsets = range(day_span, -day_span - 1, -1)

        # Iterate through each row in the mindbody DataFrame.
        # NOTE: the Gradio method is `tqdm`; the previous `tdqm` spelling
        # raised AttributeError, which was swallowed below and left the
        # result empty.
        for idx in progress.tqdm(range(len(mindbody)), desc='Analyzing files...'):
            row = mindbody.iloc[idx]
            date = row['DOS']
            first_name = row['First Name']
            last_name = row['Last Name']
            # Calendar dates (time component dropped) that count as a match
            date_range = [(date - timedelta(days=i)).date() for i in day_offsets]
            # Date must fall in the window AND first OR last name must agree
            is_match = (medserv_dates.isin(date_range) &
                        ((medserv_first == first_name.lower()) |
                         (medserv_last == last_name.lower())))
            # If no match is found, append the row to the unmatched_rows list
            if not is_match.any():
                unmatched_rows.append(row)
    except Exception as e:
        print(f"An error occurred while analyzing files: {e}")

    try:
        # Create a DataFrame from the unmatched_rows list
        unmatched_df = pd.DataFrame(unmatched_rows, columns=mindbody.columns)
        # Specify the columns to include in the output Excel file
        columns_to_include = ['DOS', 'Client ID', 'Client', 'Sale ID', 'Item name', 'Location', 'Item Total']
        # Format the 'DOS' column to remove time part
        unmatched_df['DOS'] = unmatched_df['DOS'].dt.strftime('%d-%m-%Y')
        output_file_path = 'Comparison Results.xlsx'
        unmatched_df[columns_to_include].to_excel(output_file_path, index=False)
        return output_file_path
    except Exception as e:
        print(f"An error occurred while creating the output file: {e}")
        return None
def load_data(files):
    """Load one or more uploaded Excel files into a single DataFrame.

    After loading, the column detected by ``find_date_column`` is renamed to
    'DOS' and the column detected by ``find_name_column`` is renamed to
    'Client' (each only if a column was detected).

    Args:
        files: A list of uploaded files, a single uploaded file, or ``None``.
            Each entry is either an object exposing its path as ``.name`` or
            a plain string path.

    Returns:
        A DataFrame holding the rows of all files, concatenated in order.

    Raises:
        gr.Error: If no file was provided or a file is not .xls/.xlsx.
    """
    # Normalise the input: a missing upload or a single file object is
    # treated like a list so the loop below has one shape to handle.
    if files is None:
        files = []
    elif not isinstance(files, (list, tuple)):
        files = [files]
    if not files:
        # Previously this surfaced as a bare IndexError on dfs[0].
        raise gr.Error("No files provided: Please provide a .xls or .xlsx file")

    # Load every file, rejecting anything that is not an Excel workbook
    dfs = []
    for file in files:
        filepath = file if isinstance(file, str) else file.name
        # Compare case-insensitively so e.g. 'REPORT.XLSX' is accepted
        if not filepath.lower().endswith(('.xlsx', '.xls')):
            raise gr.Error("Unsupported file format: Please provide a .xls or .xlsx file")
        dfs.append(pd.read_excel(filepath))

    # Concatenate dataframes if more than one file is provided
    if len(dfs) > 1:
        df = pd.concat(dfs, ignore_index=True)
    else:
        df = dfs[0]

    # Find and rename the date column to 'DOS'
    date_column = find_date_column(df)
    if date_column:
        df.rename(columns={date_column: 'DOS'}, inplace=True)

    # Find and rename the name column to 'Client'
    name_column = find_name_column(df)
    if name_column:
        df.rename(columns={name_column: 'Client'}, inplace=True)
    return df
def find_name_column(df):
    """Return the column whose values most often look like "Last, First".

    Every column is converted to strings and each value is tested against a
    "last name, first name(s)" pattern. The column with the strictly highest
    number of matching values wins; on a tie the earliest column is kept.

    Args:
        df: DataFrame to inspect.

    Returns:
        The label of the best-matching column, or ``None`` when no column
        contains any matching value.
    """
    # Last name, a comma, one whitespace character, then one or more given names
    name_regex = re.compile(r"^[A-Za-z'-]+,\s[A-Za-z'-]+(?:\s[A-Za-z'-]+)*$")

    def looks_like_name(text):
        return name_regex.match(text) is not None

    best_column, best_hits = None, 0
    for candidate in df.columns:
        as_text = df[candidate].astype(str)
        hits = as_text.apply(looks_like_name).sum()
        # Strict comparison keeps the first column when counts are equal
        if hits > best_hits:
            best_column, best_hits = candidate, hits
    return best_column
def find_date_column(df):
    """Return the column that most likely holds dates.

    A column literally named 'Treatment dates' is returned immediately.
    Otherwise every column is converted to strings and the number of values
    containing a date-like token (digits separated by '-' or '/') is
    counted; the column with the strictly highest count wins, and on a tie
    the earliest column is kept.

    Args:
        df: DataFrame to inspect.

    Returns:
        The label of the chosen column, or ``None`` when no column contains
        any date-like value.
    """
    preferred = 'Treatment dates'
    if preferred in df.columns:
        return preferred

    # Digit groups of 2-4, 1-2 and 2-4 digits separated by '-' or '/'
    date_regex = r"\b\d{2,4}[-/]\d{1,2}[-/]\d{2,4}\b"
    best_column, best_hits = None, 0
    for candidate in df.columns:
        as_text = df[candidate].astype(str)
        hits = as_text.str.contains(date_regex, na=False).sum()
        # Strict comparison keeps the first column when counts are equal
        if hits > best_hits:
            best_column, best_hits = candidate, hits
    return best_column
|