Zack committed on
Commit
55913f5
·
1 Parent(s): 782735f

Revert "fix: Drop all null columns"

Browse files

This reverts commit 782735f0419f445257d3ed831349d34b3a0b8c25.

Files changed (1) hide show
  1. app.py +43 -3
app.py CHANGED
@@ -10,12 +10,14 @@ scaler = json.load(f)
10
 
11
  TIME_STEPS = 288
12
 
 
13
  def create_sequences(values, time_steps=TIME_STEPS):
14
  output = []
15
  for i in range(len(values) - time_steps + 1):
16
  output.append(values[i : (i + time_steps)])
17
  return np.stack(output)
18
 
 
19
  def normalize_data(data):
20
  df_test_value = (data - scaler["mean"]) / scaler["std"]
21
  return df_test_value
@@ -29,17 +31,21 @@ def plot_test_data(df_test_value):
29
  return fig
30
 
31
  def get_anomalies(df_test_value):
 
32
  x_test = create_sequences(df_test_value.values)
33
  model = from_pretrained_keras("keras-io/timeseries-anomaly-detection")
34
 
 
35
  x_test_pred = model.predict(x_test)
36
  test_mae_loss = np.mean(np.abs(x_test_pred - x_test), axis=1)
37
  test_mae_loss = test_mae_loss.reshape((-1))
38
 
 
39
  anomalies = test_mae_loss > scaler["threshold"]
40
  return anomalies
41
 
42
  def plot_anomalies(df_test_value, data, anomalies):
 
43
  anomalous_data_indices = []
44
  for data_idx in range(TIME_STEPS - 1, len(df_test_value) - TIME_STEPS + 1):
45
  if np.all(anomalies[data_idx - TIME_STEPS + 1 : data_idx]):
@@ -54,38 +60,73 @@ def plot_anomalies(df_test_value, data, anomalies):
54
  return fig
55
 
56
  def clean_data(df):
 
 
 
 
57
  if "timestamp" in df.columns and "value" in df.columns:
58
  df["timestamp"] = pd.to_datetime(df["timestamp"])
59
  return df
 
 
60
  elif "Date" in df.columns and "Hour" in df.columns and "Hourly_Labor_Hours_Total" in df.columns:
 
61
  df["timestamp"] = pd.to_datetime(df["Date"]) + pd.to_timedelta(df["Hour"].astype(int), unit='h')
 
 
62
  df.loc[df["timestamp"].dt.hour == 24, "timestamp"] = df["timestamp"] + pd.DateOffset(days=1)
63
  df["timestamp"] = df["timestamp"].dt.floor('h')
 
 
64
  df = df[["timestamp", "Hourly_Labor_Hours_Total"]]
 
 
65
  df.rename(columns={"Hourly_Labor_Hours_Total": "value"}, inplace=True)
 
66
  elif "Date_CY" in df.columns and "Hour" in df.columns and "Net_Sales_CY" in df.columns:
 
67
  df["timestamp"] = pd.to_datetime(df["Date_CY"]) + pd.to_timedelta(df["Hour"].astype(int), unit='h')
 
 
68
  df.loc[df["timestamp"].dt.hour == 24, "timestamp"] = df["timestamp"] - pd.DateOffset(days=1)
69
  df["timestamp"] = df["timestamp"].dt.floor('h')
 
 
70
  df = df[["timestamp", "Net_Sales_CY"]]
 
 
71
  df.rename(columns={"Net_Sales_CY": "value"}, inplace=True)
 
 
72
  df = df.dropna(subset=['value'])
 
73
  return df
 
74
  else:
75
  raise ValueError("Dataframe does not contain necessary columns.")
76
 
77
  def master(file):
 
78
  data = pd.read_csv(file.name)
79
- print(f"Original data shape: {data.shape}") # Debug statement
 
80
  data = clean_data(data)
81
- print(f"Cleaned data shape: {data.shape}") # Debug statement
 
82
  data['timestamp'] = pd.to_datetime(data['timestamp'])
 
83
  data.set_index("timestamp", inplace=True)
 
 
84
  if len(data) < TIME_STEPS:
85
  return "Not enough data to create sequences. Need at least {} records.".format(TIME_STEPS)
 
86
  df_test_value = normalize_data(data)
 
87
  plot1 = plot_test_data(df_test_value)
 
88
  anomalies = get_anomalies(df_test_value)
 
89
  plot2 = plot_anomalies(df_test_value, data, anomalies)
90
  return plot2
91
 
@@ -101,4 +142,3 @@ iface = gr.Interface(
101
  )
102
 
103
  iface.launch()
104
-
 
10
 
11
  TIME_STEPS = 288
12
 
13
+ # Generated training sequences for use in the model.
14
  def create_sequences(values, time_steps=TIME_STEPS):
15
  output = []
16
  for i in range(len(values) - time_steps + 1):
17
  output.append(values[i : (i + time_steps)])
18
  return np.stack(output)
19
 
20
+
21
  def normalize_data(data):
22
  df_test_value = (data - scaler["mean"]) / scaler["std"]
23
  return df_test_value
 
31
  return fig
32
 
33
  def get_anomalies(df_test_value):
34
+ # Create sequences from test values.
35
  x_test = create_sequences(df_test_value.values)
36
  model = from_pretrained_keras("keras-io/timeseries-anomaly-detection")
37
 
38
+ # Get test MAE loss.
39
  x_test_pred = model.predict(x_test)
40
  test_mae_loss = np.mean(np.abs(x_test_pred - x_test), axis=1)
41
  test_mae_loss = test_mae_loss.reshape((-1))
42
 
43
+ # Detect all the samples which are anomalies.
44
  anomalies = test_mae_loss > scaler["threshold"]
45
  return anomalies
46
 
47
  def plot_anomalies(df_test_value, data, anomalies):
48
+ # data i is an anomaly if samples [(i - timesteps + 1) to (i)] are anomalies
49
  anomalous_data_indices = []
50
  for data_idx in range(TIME_STEPS - 1, len(df_test_value) - TIME_STEPS + 1):
51
  if np.all(anomalies[data_idx - TIME_STEPS + 1 : data_idx]):
 
60
  return fig
61
 
62
  def clean_data(df):
63
+ # Drop rows with any null data
64
+ # df = df.dropna()
65
+
66
+ # Check if the DataFrame already contains the correct columns
67
  if "timestamp" in df.columns and "value" in df.columns:
68
  df["timestamp"] = pd.to_datetime(df["timestamp"])
69
  return df
70
+
71
+ # Check if DataFrame contains the columns to be converted
72
  elif "Date" in df.columns and "Hour" in df.columns and "Hourly_Labor_Hours_Total" in df.columns:
73
+ # Convert "Date" and "Hour" columns into datetime format
74
  df["timestamp"] = pd.to_datetime(df["Date"]) + pd.to_timedelta(df["Hour"].astype(int), unit='h')
75
+
76
+ # Handle the case where hour is 24
77
  df.loc[df["timestamp"].dt.hour == 24, "timestamp"] = df["timestamp"] + pd.DateOffset(days=1)
78
  df["timestamp"] = df["timestamp"].dt.floor('h')
79
+
80
+ # Keep only necessary columns
81
  df = df[["timestamp", "Hourly_Labor_Hours_Total"]]
82
+
83
+ # Rename column
84
  df.rename(columns={"Hourly_Labor_Hours_Total": "value"}, inplace=True)
85
+
86
  elif "Date_CY" in df.columns and "Hour" in df.columns and "Net_Sales_CY" in df.columns:
87
+ # Convert "Date_CY" and "Hour" columns into datetime format
88
  df["timestamp"] = pd.to_datetime(df["Date_CY"]) + pd.to_timedelta(df["Hour"].astype(int), unit='h')
89
+
90
+ # Handle the case where hour is 24
91
  df.loc[df["timestamp"].dt.hour == 24, "timestamp"] = df["timestamp"] - pd.DateOffset(days=1)
92
  df["timestamp"] = df["timestamp"].dt.floor('h')
93
+
94
+ # Keep only necessary columns
95
  df = df[["timestamp", "Net_Sales_CY"]]
96
+
97
+ # Rename column
98
  df.rename(columns={"Net_Sales_CY": "value"}, inplace=True)
99
+
100
+ # Drop rows where 'value' is NaN
101
  df = df.dropna(subset=['value'])
102
+
103
  return df
104
+
105
  else:
106
  raise ValueError("Dataframe does not contain necessary columns.")
107
 
108
  def master(file):
109
+ # read file
110
  data = pd.read_csv(file.name)
111
+
112
+ # clean data
113
  data = clean_data(data)
114
+
115
+ # Convert timestamp to datetime after cleaning
116
  data['timestamp'] = pd.to_datetime(data['timestamp'])
117
+
118
  data.set_index("timestamp", inplace=True)
119
+
120
+ # Check if data has enough records to create sequences
121
  if len(data) < TIME_STEPS:
122
  return "Not enough data to create sequences. Need at least {} records.".format(TIME_STEPS)
123
+
124
  df_test_value = normalize_data(data)
125
+ # plot input test data
126
  plot1 = plot_test_data(df_test_value)
127
+ # predict
128
  anomalies = get_anomalies(df_test_value)
129
+ #plot anomalous data points
130
  plot2 = plot_anomalies(df_test_value, data, anomalies)
131
  return plot2
132
 
 
142
  )
143
 
144
  iface.launch()