
I tried to implement an autoencoder-based anomaly detector that finds anomalies in the KDDTrain+ dataset. This is actually a pretty straightforward implementation. Unfortunately, I have not managed to implement the training procedure in a way that is reproducible. I train the network on a single CPU only and I seeded all the involved packages, especially TensorFlow, but I do not get the desired result.

The code I implemented is the following:

import pandas as pd
import numpy as np

import tensorflow as tf
SEED = 42
tf.config.experimental.enable_op_determinism()
tf.random.set_seed(SEED)

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler


def load_and_prepare_data():

    # load data from file
    col_names = ["duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", "land", "wrong_fragment",
                 "urgent", "hot", "num_failed_logins", "logged_in",
                 "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations", "num_shells",
                 "num_access_files", "num_outbound_cmds",
                 "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate", "srv_serror_rate",
                 "rerror_rate", "srv_rerror_rate", "same_srv_rate",
                 "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
                 "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
                 "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate", "dst_host_serror_rate",
                 "dst_host_srv_serror_rate", "dst_host_rerror_rate",
                 "dst_host_srv_rerror_rate", "label"]

    df = pd.read_csv("../data/KDDTrain+_20Percent.txt", header=None, names=col_names, index_col=False)

    # produce numerical labels for categorical data
    categorical_variables = ['protocol_type', 'service', 'flag', 'land', 'logged_in', 'is_host_login', 'is_guest_login']
    categorical_data = pd.get_dummies(df[categorical_variables])

    numerical_variables = list(set(df.columns.values.tolist()) - set(categorical_variables))
    numerical_variables.remove('label')
    numerical_data = df[numerical_variables].copy()

    df_preprocessed = pd.concat([numerical_data, categorical_data], axis=1)

    # create data split
    labels = df['label'].copy()
    label_encoder = LabelEncoder()
    integer_labels = label_encoder.fit_transform(labels)

    n = int(len(df_preprocessed) * 0.75)
    # use positional slicing so features and labels stay aligned and the splits do not overlap
    x_train = df_preprocessed.iloc[:n, :]
    y_train = integer_labels[:n]

    x_test = df_preprocessed.iloc[n:, :]
    y_test = integer_labels[n:]

    scaler = MinMaxScaler()
    x_train = scaler.fit_transform(x_train)
    x_train = x_train.astype(np.float32)

    x_test = scaler.transform(x_test)
    x_test = x_test.astype(np.float32)

    return x_train, y_train, x_test, y_test


def build_model(input_dim, latent_space_dim, num_neurons_per_layer_list, activation_func):
    # input layer
    input_data = tf.keras.layers.Input(shape=(input_dim,), name='encoder_input')

    # hidden layers of encoder
    num_hidden_layers = len(num_neurons_per_layer_list)
    encoder = tf.keras.layers.Dense(units=num_neurons_per_layer_list[0], activation=activation_func, name='encoder_0',
                                    kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED),
                                    bias_initializer=tf.keras.initializers.GlorotUniform(seed=SEED))(input_data)
    for i in range(1, num_hidden_layers):
        encoder = tf.keras.layers.Dense(units=num_neurons_per_layer_list[i], activation=activation_func,
                                        name='encoder_{:d}'.format(i),
                                        kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED),
                                        bias_initializer=tf.keras.initializers.GlorotUniform(seed=SEED))(encoder)

    # bottleneck layer
    latent_encoding = tf.keras.layers.Dense(latent_space_dim, activation='linear', name='latent_encoding',
                                            kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED),
                                            bias_initializer=tf.keras.initializers.GlorotUniform(seed=SEED))(encoder)

    # hidden layers of decoder
    decoder = tf.keras.layers.Dense(units=num_neurons_per_layer_list[num_hidden_layers - 1], activation=activation_func,
                                    name='decoder_0',
                                    kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED),
                                    bias_initializer=tf.keras.initializers.GlorotUniform(seed=SEED))(latent_encoding)
    for i in range(1, num_hidden_layers):
        decoder = tf.keras.layers.Dense(units=num_neurons_per_layer_list[num_hidden_layers - 1 - i],
                                        activation=activation_func,
                                        name='decoder_{:d}'.format(i),
                                        kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED),
                                        bias_initializer=tf.keras.initializers.GlorotUniform(seed=SEED))(decoder)

    # output layer
    reconstructed_data = tf.keras.layers.Dense(units=input_dim, activation='linear', name='reconstructed_data',
                                               kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED),
                                               bias_initializer=tf.keras.initializers.GlorotUniform(seed=SEED))(decoder)

    autoencoder_model = tf.keras.models.Model(input_data, reconstructed_data)
    return autoencoder_model


def main():
    x_train, y_train, x_test, y_test = load_and_prepare_data()
    input_dim = x_train.shape[1]

    latent_space_dim = 4
    num_neurons_per_layer_list = [16, 48, 64, 96]
    activation_func = 'relu'

    autoencoder_model = build_model(input_dim, latent_space_dim, num_neurons_per_layer_list, activation_func)
    learning_rate = 1e-4
    loss_function = 'mse'
    # opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    autoencoder_model.compile(optimizer='adam', loss=loss_function)

    history = autoencoder_model.fit(x_train, x_train, shuffle=False, epochs=5, batch_size=512,
                                    validation_data=(x_test, x_test))

main()

On every single run I expect to get the same losses, the same weights and biases in the network, and the same evaluation result.

Running the same code twice, I get the following results:

  • First run:

    Epoch 1/5 79/79 [=======================] - 1s 7ms/step - loss: 0.0779 - val_loss: 0.0628

    Epoch 2/5 79/79 [=======================] - 0s 5ms/step - loss: 0.0514 - val_loss: 0.0397

    Epoch 3/5 79/79 [=======================] - 0s 4ms/step - loss: 0.0311 - val_loss: 0.0236

    Epoch 4/5 79/79 [=======================] - 0s 5ms/step - loss: 0.0193 - val_loss: 0.0157

    Epoch 5/5 79/79 [=======================] - 0s 5ms/step - loss: 0.0146 - val_loss: 0.0130

  • Second run:

    Epoch 1/5 79/79 [=======================] - 1s 7ms/step - loss: 0.0726 - val_loss: 0.0589

    Epoch 2/5 79/79 [=======================] - 0s 5ms/step - loss: 0.0475 - val_loss: 0.0363

    Epoch 3/5 79/79 [=======================] - 0s 5ms/step - loss: 0.0297 - val_loss: 0.0233

    Epoch 4/5 79/79 [=======================] - 0s 5ms/step - loss: 0.0185 - val_loss: 0.0144

    Epoch 5/5 79/79 [=======================] - 0s 5ms/step - loss: 0.0131 - val_loss: 0.0115

  • Please trim your code to make it easier to find your problem. Follow these guidelines to create a minimal reproducible example. Commented Apr 2, 2024 at 13:27
  • I split up the code basically into three functions: * load_and_prepare_data(): data is loaded from file, some preprocessing is applied - the train-test split is returned to the caller * build_model(): the model is generated and returned to the caller * main(): calls the above-mentioned methods and triggers training Commented Apr 2, 2024 at 14:10
  • Gradient descent is a stochastic optimization method - why do you expect to get "same losses, the same weights and biases"? Commented Apr 3, 2024 at 12:34
  • Can you provide a plot of the evaluation metrics across epochs, with the different runs (preferably in the same plot)? Commented Apr 3, 2024 at 12:35
  • By seeding all the involved packages I should get reproducible results - by making the training procedure deterministic, I am able to remove stochasticity (i.e. stochastic noise) while doing hyperparameter optimisation. ... Actually I found out that loading the data is creating the problems in the above example. Commented Apr 4, 2024 at 14:42

1 Answer


Finally I was able to find a solution to my problem. First of all, it is necessary to seed all the random number generators involved. It is convenient to use the method tensorflow.keras.utils.set_random_seed(seed_val) - calling this method is equivalent to calling

  • random.seed(seed_val)
  • numpy.random.seed(seed_val)
  • tensorflow.random.set_seed(seed_val)
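Putting these calls together, a minimal sketch of such a seeding block (assuming TensorFlow 2.x, placed at the top of the script before any data loading or model building) could look like this:

import tensorflow as tf

SEED = 42

# seeds Python's random module, NumPy and TensorFlow in one call
tf.keras.utils.set_random_seed(SEED)

# additionally ask TensorFlow to use deterministic op implementations
tf.config.experimental.enable_op_determinism()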

An important requirement for deterministically reproducing training results is to avoid using GPUs. To do this, use the calls

  • os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
  • tensorflow.config.threading.set_intra_op_parallelism_threads(1)
  • tensorflow.config.threading.set_inter_op_parallelism_threads(1)

The first of these commands restricts training to the CPU. The latter two commands disable parallel execution on the CPU side. Note that these settings may increase the runtime of the code.
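The order of these calls matters: CUDA_VISIBLE_DEVICES should be set before TensorFlow initializes its devices, and the threading configuration has to happen before TensorFlow executes its first op, otherwise it may raise a RuntimeError. A minimal sketch (assuming TensorFlow 2.x):

import os

# hide all GPUs from TensorFlow; set before TensorFlow initializes its devices
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

import tensorflow as tf

# run all CPU work single-threaded; call before the first op is executed
tf.config.threading.set_intra_op_parallelism_threads(1)
tf.config.threading.set_inter_op_parallelism_threads(1)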

Finally, in the example I posted, the problem was data-related. At one point I extracted all the numerical variables/features by means of

numerical_variables = list(set(df.columns.values.tolist()) - set(categorical_variables))

This line of code does not give reproducible results because the column names pass through a Python set: the iteration order of a set of strings depends on string hashes, which are randomized between interpreter runs, so the resulting column order (and therefore the layout of the network input) changes from run to run. Sorting the resulting list alphabetically, e.g. by means of numerical_variables.sort(), solves the problem.
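Applied to load_and_prepare_data() from the question, the order-stable selection could look like this (a small sketch; df and categorical_variables are the ones defined there):

# sorted() turns the set back into a list with a deterministic, alphabetical order
numerical_variables = sorted(set(df.columns.values.tolist()) - set(categorical_variables))
numerical_variables.remove('label')
numerical_data = df[numerical_variables].copy()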

Notes

  • I used tensorflow==2.12.0. As far as I know all the above statements and explanations are valid for tensorflow>=2.0.0. For seeding applications that use earlier tensorflow versions, see here - which in any case gives, in my opinion, a very good explanation of the topic.

  • As far as I know one can use the commands

    • os.environ['TF_DETERMINISTIC_OPS'] = '1'
    • os.environ['TF_CUDNN_DETERMINISTIC'] = '1'

    to achieve determinism as well when running the training on a GPU. But I didn't test it.
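    If one wants to try this, these variables are typically set before importing TensorFlow, e.g. (untested sketch):

    import os

    # request deterministic op / cuDNN behaviour; typically set before importing TensorFlow
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'

    import tensorflow as tf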
