I tried to implement an autoencoder-based anomaly detector finding anomalies in the dataset KDDTrain+. This is actually a pretty straight forward implementation. Unfortunately I failed in implementing the training procedure in such a way that it is reproducible. I train the network on a single CPU only and I seeded all the involved packages - especially tensorflow - but I do not get the desired result.
The code I implemented is the following one:
import pandas as pd
import numpy as np
import tensorflow as tf
SEED = 42
tf.config.experimental.enable_op_determinism()
tf.random.set_seed(SEED)
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
def load_and_prepare_data():
# load data from file
col_names = ["duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", "land", "wrong_fragment",
"urgent", "hot", "num_failed_logins", "logged_in",
"num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations", "num_shells",
"num_access_files", "num_outbound_cmds",
"is_host_login", "is_guest_login", "count", "srv_count", "serror_rate", "srv_serror_rate",
"rerror_rate", "srv_rerror_rate", "same_srv_rate",
"diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
"dst_host_same_srv_rate", "dst_host_diff_srv_rate",
"dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate", "dst_host_serror_rate",
"dst_host_srv_serror_rate", "dst_host_rerror_rate",
"dst_host_srv_rerror_rate", "label"]
df = pd.read_csv("../data/KDDTrain+_20Percent.txt", header=None, names=col_names, index_col=False)
# produce numerical labels for categorical data
categorical_variables = ['protocol_type', 'service', 'flag', 'land', 'logged_in', 'is_host_login', 'is_guest_login']
categorical_data = pd.get_dummies(df[categorical_variables])
numerical_variables = list(set(df.columns.values.tolist()) - set(categorical_variables))
numerical_variables.remove('label')
numerical_data = df[numerical_variables].copy()
df_preprocessed = pd.concat([numerical_data, categorical_data], axis=1)
# create data split
labels = df['label'].copy()
label_encoder = LabelEncoder()
integer_labels = label_encoder.fit_transform(labels)
n = int(len(df_preprocessed) * 0.75)
x_train = df_preprocessed.loc[0:n, :]
y_train = integer_labels[0:n]
x_test = df_preprocessed.loc[n:len(df_preprocessed), :]
y_test = integer_labels[n:len(df_preprocessed)]
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_train = x_train.astype(np.float32)
x_test = scaler.transform(x_test)
x_test = x_test.astype(np.float32)
return x_train, y_train, x_test, y_test
def build_model(input_dim, latent_space_dim, num_neurons_per_layer_list, activation_func):
# input layer
input_data = tf.keras.layers.Input(shape=(input_dim,), name='encoder_input')
# hidden layers of encoder
num_hidden_layers = len(num_neurons_per_layer_list)
encoder = tf.keras.layers.Dense(units=num_neurons_per_layer_list[0], activation=activation_func, name='encoder_0',
kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED),
bias_initializer=tf.keras.initializers.GlorotUniform(seed=SEED))(input_data)
for i in range(1, num_hidden_layers):
encoder = tf.keras.layers.Dense(units=num_neurons_per_layer_list[i], activation=activation_func,
name='encoder_{:d}'.format(i),
kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED),
bias_initializer=tf.keras.initializers.GlorotUniform(seed=SEED))(encoder)
# bottleneck layer
latent_encoding = tf.keras.layers.Dense(latent_space_dim, activation='linear', name='latent_encoding',
kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED),
bias_initializer=tf.keras.initializers.GlorotUniform(seed=SEED))(encoder)
# hidden layers of decoder
decoder = tf.keras.layers.Dense(units=num_neurons_per_layer_list[num_hidden_layers - 1], activation=activation_func,
name='decoder_0',
kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED),
bias_initializer=tf.keras.initializers.GlorotUniform(seed=SEED))(latent_encoding)
for i in range(1, num_hidden_layers):
decoder = tf.keras.layers.Dense(units=num_neurons_per_layer_list[num_hidden_layers - 1 - i],
activation=activation_func,
name='decoder_{:d}'.format(i),
kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED),
bias_initializer=tf.keras.initializers.GlorotUniform(seed=SEED))(decoder)
# output layer
reconstructed_data = tf.keras.layers.Dense(units=input_dim, activation='linear', name='reconstructed_data',
kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED),
bias_initializer=tf.keras.initializers.GlorotUniform(seed=SEED))(decoder)
autoencoder_model = tf.keras.models.Model(input_data, reconstructed_data)
return autoencoder_model
def main():
x_train, y_train, x_test, y_test = load_and_prepare_data()
input_dim = x_train.shape[1]
latent_space_dim = 4
num_neurons_per_layer_list = [16, 48, 64, 96]
activation_func = 'relu'
autoencoder_model = build_model(input_dim, latent_space_dim, num_neurons_per_layer_list, activation_func)
learning_rate = 1e-4
loss_function = 'mse'
# opt = optimizers.Adam(learning_rate=learning_rate)
autoencoder_model.compile(optimizer='adam', loss=loss_function)
history = autoencoder_model.fit(x_train, x_train, shuffle=False, epochs=10, batch_size=512,
validation_data=(x_test, x_test))
main()
I expect to get on every single run the same losses, the same weights and biases in the network and the same evaluation result.
Running the same code twice, I get the following results:
First run:
Epoch 1/5 79/79 [=======================] - 1s 7ms/step - loss: 0.0779 - val_loss: 0.0628
Epoch 2/5 79/79 [=======================] - 0s 5ms/step - loss: 0.0514 - val_loss: 0.0397
Epoch 3/5 79/79 [=======================] - 0s 4ms/step - loss: 0.0311 - val_loss: 0.0236
Epoch 4/5 79/79 [=======================] - 0s 5ms/step - loss: 0.0193 - val_loss: 0.0157
Epoch 5/5 79/79 [=======================] - 0s 5ms/step - loss: 0.0146 - val_loss: 0.0130
Second run:
Epoch 1/5 79/79 [=======================] - 1s 7ms/step - loss: 0.0726 - val_loss: 0.0589
Epoch 2/5 79/79 [=======================] - 0s 5ms/step - loss: 0.0475 - val_loss: 0.0363
Epoch 3/5 79/79 [=======================] - 0s 5ms/step - loss: 0.0297 - val_loss: 0.0233
Epoch 4/5 79/79 [=======================] - 0s 5ms/step - loss: 0.0185 - val_loss: 0.0144
Epoch 5/5 79/79 [=======================] - 0s 5ms/step - loss: 0.0131 - val_loss: 0.0115