Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions docs/pacemaker/inputfile.md
Original file line number Diff line number Diff line change
Expand Up @@ -332,8 +332,12 @@ fit:
## will not be added until the list of functions of the previous body-order is exhausted
## power_order - the order of adding new basis functions is defined by the "power rank" p of a function.
## p = len(ns) + sum(ns) + sum(ls). Functions with the smallest p are added first
#ladder_type: body_order

#ladder_type: body_order

# early stopping
## min_relative_train_loss_per_iter: 5e-5
## min_relative_test_loss_per_iter: 1e-5
## early_stopping_patience: 200

## callbacks during the fitting. Module quick_validation.py should be available for import
## see example/pacemaker_with_callback for more details and examples
Expand Down
2 changes: 1 addition & 1 deletion lib/pybind11/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# All rights reserved. Use of this source code is governed by a
# BSD-style license that can be found in the LICENSE file.

cmake_minimum_required(VERSION 3.4)
cmake_minimum_required(VERSION 3.7)

# The `cmake_minimum_required(VERSION 3.4...3.22)` syntax does not work with
# some versions of VS that have a patched CMake 3.11. This forces us to emulate
Expand Down
5 changes: 5 additions & 0 deletions src/pyace/data/input_template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,11 @@ fit:
# ladder_step: 100
# ladder_type: power_order

# Early stopping
# min_relative_train_loss_per_iter: 5e-5
# min_relative_test_loss_per_iter: 1e-5
# early_stopping_patience: 200

#################################################################
## Backend specification section
#################################################################
Expand Down
95 changes: 92 additions & 3 deletions src/pyace/generalfit.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,10 @@ def save_dataset(dataframe, fname):
log.info("Dataset saved into {}".format(fname))


class TestLossChangeTooSmallException(StopIteration):
    """Signals early stopping: the relative loss change per iteration stayed
    above (i.e. slower than) the configured negative threshold for the whole
    patience window.

    NOTE(review): despite the name, this is raised for both TRAIN and TEST
    loss stagnation (see ``detect_early_stopping``). It subclasses
    ``StopIteration`` — presumably so the fitting backend can catch it and
    terminate minimization gracefully; confirm in the backend adapter.
    """
    pass


class GeneralACEFit:
"""
Main fitting wrapper class
Expand All @@ -150,6 +154,7 @@ def __init__(self,
seed=None,
callbacks=None
):
self.early_stopping_occured = None
self.seed = seed
if self.seed is not None:
log.info("Set numpy random seed to {}".format(self.seed))
Expand Down Expand Up @@ -293,11 +298,11 @@ def __init__(self,

self.data_config = data_config
self.weighting_policy_spec = self.fit_config.get(FIT_WEIGHTING_KW)
display_step = backend_config.get('display_step', 20)
self.display_step = backend_config.get('display_step', 20)
if self.ladder_scheme:
self.metrics_aggregator = MetricsAggregator(extended_display_step=display_step)
self.metrics_aggregator = MetricsAggregator(extended_display_step=self.display_step)
else:
self.metrics_aggregator = MetricsAggregator(extended_display_step=display_step,
self.metrics_aggregator = MetricsAggregator(extended_display_step=self.display_step,
ladder_metrics_filename=None)
self.fit_backend = FitBackendAdapter(backend_config,
fit_metrics_callback=self.fit_metric_callback,
Expand Down Expand Up @@ -354,6 +359,24 @@ def __init__(self,

self.loss_spec = LossFunctionSpecification(**loss_spec_dict)

# attributes for early stopping
self.train_loss_list = []
self.test_loss_list = []
self.early_stopping_occured = False
self.early_stopping_patience = fit_config.get("early_stopping_patience", 200)
self.min_relative_train_loss_per_iter = fit_config.get('min_relative_train_loss_per_iter')
self.min_relative_test_loss_per_iter = fit_config.get('min_relative_test_loss_per_iter')
if self.min_relative_train_loss_per_iter:
self.min_relative_train_loss_per_iter=-abs(self.min_relative_train_loss_per_iter)
log.info(
f"Slowest relative change of TRAIN loss is set to {self.min_relative_train_loss_per_iter :+1.2e}/iter, " +
f"patience = {self.early_stopping_patience} iters")
if self.min_relative_test_loss_per_iter:
self.min_relative_test_loss_per_iter = -abs(self.min_relative_test_loss_per_iter)
log.info(
f"Slowest relative change of TEST loss is set to {self.min_relative_test_loss_per_iter :+1.2e}/iter, " +
f"patience = {self.early_stopping_patience} iters")

def set_core_rep(self, basis_conf):
# automatic repulsion selection
if "repulsion" in self.fit_config and self.fit_config["repulsion"] == "auto":
Expand All @@ -372,11 +395,71 @@ def fit_metric_callback(self, metrics_dict, extended_display_step=None):
metrics_dict["cycle_step"] = self.current_fit_cycle
metrics_dict["ladder_step"] = self.current_ladder_step
self.metrics_aggregator.fit_metric_callback(metrics_dict, extended_display_step=extended_display_step)
self.train_loss_list.append(metrics_dict['loss'])
self.log_d_rel_loss(metrics_dict["iter_num"], mode='train')
if self.min_relative_train_loss_per_iter is not None:
self.detect_early_stopping(mode='train')

def test_metric_callback(self, metrics_dict, extended_display_step=None):
    """Forward per-evaluation TEST metrics to the aggregator and record the
    TEST loss for early-stopping detection.

    :param metrics_dict: metrics of the current evaluation; must contain
        'loss' and 'iter_num'; 'cycle_step'/'ladder_step' are added here
    :param extended_display_step: forwarded to the MetricsAggregator
    :raises TestLossChangeTooSmallException: indirectly, via
        ``detect_early_stopping`` when the TEST loss stagnates
    """
    # tag the metrics with the current fit cycle / ladder step for aggregation
    metrics_dict["cycle_step"] = self.current_fit_cycle
    metrics_dict["ladder_step"] = self.current_ladder_step
    self.metrics_aggregator.test_metric_callback(metrics_dict, extended_display_step=extended_display_step)
    # record the TEST loss trajectory (sampled every `display_step` iterations,
    # judging by compute_d_rel_loss_d_step — TODO confirm against the backend)
    self.test_loss_list.append(metrics_dict['loss'])
    self.log_d_rel_loss(metrics_dict["iter_num"], mode='test')
    # early stopping on TEST loss only when a threshold was configured
    if self.min_relative_test_loss_per_iter is not None:
        self.detect_early_stopping(mode='test')

def compute_d_rel_loss_d_step(self, loss_list, mode):
    """Return the relative loss change per iteration over the patience window.

    :param loss_list: recorded loss trajectory (train or test)
    :param mode: 'train' or 'test'; TEST losses are sampled only every
        `display_step` iterations, so the step size differs
    :return: numpy array of (L[i+1]-L[i]) / L[i] / iter_step — normally
        small negative numbers while the fit is still improving
    """
    # TEST metrics arrive once per display_step iterations; TRAIN every iteration
    iter_step = 1 if mode != 'test' else self.display_step
    window = int(np.ceil(self.early_stopping_patience / iter_step))
    # keep only the last `window` samples of the trajectory
    recent = np.asarray(loss_list[-window:])
    # finite difference, normalized to a per-iteration relative change
    return np.diff(recent) / recent[:-1] / iter_step

def log_d_rel_loss(self, iter_num, mode):
    """Periodically log the most recent relative loss change per iteration.

    Logs only every `display_step` iterations, never at iteration 0, and
    stays silent once early stopping has already been triggered.

    :param iter_num: current iteration number
    :param mode: 'train' or 'test'
    """
    # guard clauses: off-step iterations, iteration 0, or already stopped
    if iter_num <= 0 or iter_num % self.display_step != 0 or self.early_stopping_occured:
        return
    rel_changes = self.compute_d_rel_loss_d_step(self.get_loss_list(mode), mode)
    if len(rel_changes) == 0:
        # trajectory too short to form a finite difference yet
        return
    log.info(f"Last relative {mode.upper()} loss change {rel_changes[-1] :+1.2e}/iter")

def get_loss_list(self, mode):
    """Return the recorded loss trajectory for the given mode.

    :param mode: 'train' or 'test'
    :return: list of loss values accumulated by the metric callbacks
    :raises AssertionError: for any other mode value
    """
    assert mode in ('train', 'test'), f"Unsupported {mode=}"
    return self.train_loss_list if mode == 'train' else self.test_loss_list

def detect_early_stopping(self, mode):
    """Check the recent loss trajectory and trigger early stopping if the
    loss has stagnated.

    Compares the relative per-iteration loss changes over (roughly) the last
    `early_stopping_patience` iterations against the configured negative
    threshold; if even the best (most negative) recent change is slower than
    the threshold, fitting is stopped by raising an exception.

    :param mode: 'train' or 'test' — which loss trajectory to inspect
    :raises TestLossChangeTooSmallException: when the loss stagnates;
        subclasses StopIteration so the optimizer loop can catch it
    """
    loss_list = self.get_loss_list(mode)
    if self.early_stopping_occured:
        # early stopping already occurred
        return

    # TEST loss is recorded only every `display_step` iterations
    iter_step = self.display_step if mode == 'test' else 1
    min_loss_depth = int(np.ceil(self.early_stopping_patience / iter_step))

    if len(loss_list) - 1 < min_loss_depth:  # -1 because test loss is written at it=0
        # trajectory is not long enough
        return

    d_rel_loss_d_step = self.compute_d_rel_loss_d_step(loss_list, mode)
    if len(d_rel_loss_d_step) == 0:
        # BUGFIX: when early_stopping_patience <= iter_step the window holds a
        # single sample and there are no finite differences; min() below would
        # raise ValueError on the empty array. Nothing to decide on yet.
        return

    min_relative_loss_per_iter = self.min_relative_test_loss_per_iter if mode == 'test' else self.min_relative_train_loss_per_iter
    min_d_rel_loss_d_step = min(d_rel_loss_d_step)  # best (most negative) recent change; computed once
    if min_d_rel_loss_d_step > min_relative_loss_per_iter:
        # even the best recent change is slower than the threshold -> early stopping
        last_d_rel_loss_d_step = d_rel_loss_d_step[-1]
        msg = f"EARLY STOPPING: Too small or even positive {mode.upper()} loss change (best={min_d_rel_loss_d_step:+1.2e} / iter, " + \
              f"last={last_d_rel_loss_d_step:+1.2e}/iter, " + \
              f"threshold = {min_relative_loss_per_iter :+1.2e}/iter) " + \
              f"within last {self.early_stopping_patience} iterations. Stopping"
        log.info(msg)
        self.early_stopping_occured = True
        raise TestLossChangeTooSmallException(msg)

def fit(self) -> BBasisConfiguration:
gc.collect()
Expand Down Expand Up @@ -474,6 +557,7 @@ def cycle_fitting(self, bbasisconfig: BBasisConfiguration) -> BBasisConfiguratio
num_of_parameters))
log.info("Running fit backend")
self.current_fit_iteration = 0
self.reset_early_stopping()
current_bbasisconfig = self.fit_backend.fit(
current_bbasisconfig,
dataframe=self.fitting_data, loss_spec=self.loss_spec, fit_config=self.fit_config,
Expand Down Expand Up @@ -535,6 +619,11 @@ def cycle_fitting(self, bbasisconfig: BBasisConfiguration) -> BBasisConfiguratio
save_interim_potential(current_best_bbasisconfig, potential_filename="interim_potential_best_cycle.yaml")
return current_best_bbasisconfig

def reset_early_stopping(self):
    """Clear early-stopping state before starting a new fit cycle."""
    # drop both loss trajectories and re-arm the early-stopping detector
    self.train_loss_list = []
    self.test_loss_list = []
    self.early_stopping_occured = False

@staticmethod
def apply_gaussian_noise(current_bbasisconfig, trainable_parameters_dict, noise_abs_sigma, noise_rel_sigma):
cur_bbasis = ACEBBasisSet(current_bbasisconfig)
Expand Down
2 changes: 1 addition & 1 deletion src/pyace/metrics_aggregator.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,7 @@ def print_detailed_metrics(fit_metrics_dict, title='Iteration:'):
log.info('{:<12}'.format(title) +
"#{iter_num:<5}".format(iter_num=iter_num) +
'{:<14}'.format('({numeval} evals):'.format(numeval=fit_metrics_dict["eval_count"])) +
'{:>10}'.format('Loss: ') + "{loss: >3.6f}".format(loss=total_loss) +
'{:>10}'.format('Loss: ') + "{loss: >1.4e}".format(loss=total_loss) +
'{str1:>21}{rmse_epa:>.2f} ({low_rmse_e:>.2f}) meV/at' \
.format(str1=" | RMSE Energy(low): ",
rmse_epa=1e3 * fit_metrics_dict["rmse_epa"],
Expand Down
15 changes: 12 additions & 3 deletions src/pyace/preparedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -669,9 +669,18 @@ def prepare_datasets(self):
self.fitting_data, self.test_data = train_test_split(self.fitting_data, test_size=test_size)
self.test_data = self.process_dataset(self.test_data)

# apply weights (TODO: for joint train+test?)
self.fitting_data = apply_weights(self.fitting_data, self.weighting_policy_spec, self.ignore_weights)
self.test_data = apply_weights(self.test_data, self.weighting_policy_spec, self.ignore_weights)
# apply weights
if self.test_data is not None:
# for joint train+test
self.fitting_data["train"] = True
self.test_data["train"] = False
joint_df = pd.concat([self.fitting_data, self.test_data], axis=0)
joint_df = apply_weights(joint_df, self.weighting_policy_spec, self.ignore_weights)
self.fitting_data = joint_df.query("train").reset_index(drop=True)
self.test_data = joint_df.query("~train").reset_index(drop=True)
# self.test_data = apply_weights(self.test_data, self.weighting_policy_spec, self.ignore_weights)
else:
self.fitting_data = apply_weights(self.fitting_data, self.weighting_policy_spec, self.ignore_weights)

# decrease augmented weights
aug_factor = self.data_config.get("aug_factor", 1e-4)
Expand Down
8 changes: 6 additions & 2 deletions tests/test-CLI/Cu-I/input.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -72,15 +72,19 @@ fit:

## scipy.minimze algorithm: BFGS / L-BFGS-B / Nelder-Mead / etc...
optimizer: BFGS
repulsion: auto

## maximum number of scipy.minimize iterations
maxiter: 20

# early stopping
min_relative_train_loss_per_iter: 5e-5
min_relative_test_loss_per_iter: 1e-5
early_stopping_patience: 10

#################################################################
## Backend specification section
#################################################################
backend:
evaluator: tensorpot
batch_size: 100
display_step: 50
display_step: 10