DeepLabCut · MMathisLab · Jul 24, 2023 · Jul 18, 2023 · Jul 19, 2023 · Jul 19, 2023
diff --git a/deeplabcut/pose_estimation_tensorflow/core/evaluate.py b/deeplabcut/pose_estimation_tensorflow/core/evaluate.py
@@ -13,6 +13,7 @@
 import argparse
 import os
 from pathlib import Path
+from typing import List
 import numpy as np
 import pandas as pd
 from tqdm import tqdm
@@ -259,7 +260,7 @@ def return_evaluate_network_data(
     >>> deeplabcut._evaluate_network_data('/analysis/project/reaching-task/config.yaml', shuffle=[1])
     --------
     If you want to plot
-    >>> deeplabcut.evaluate_network('/analysis/project/reaching-task/config.yaml',shuffle=[1],True)
+    >>> deeplabcut.evaluate_network('/analysis/project/reaching-task/config.yaml',shuffle=[1],plotting=True)
     """
 
     import os
@@ -494,6 +495,55 @@ def return_evaluate_network_data(
             return results
 
 
+def keypoint_error(
+    df_error: pd.DataFrame,
+    df_error_p_cutoff: pd.DataFrame,
+    train_indices: List[int],
+    test_indices: List[int],
+) -> pd.DataFrame:
+    """Computes the mean error for each bodypart
+
+    The error dataframes can be in single animal format (non-hierarchical columns, one
+    column for each bodypart) or multi-animal format (hierarchical columns with 3
+    levels: "scorer", "individuals", "bodyparts"). The inputs are not modified.
+
+    Args:
+        df_error: dataframe containing the RMSE error for each image, individual and
+            bodypart
+        df_error_p_cutoff: dataframe containing the RMSE error with p-cutoff for each
+            image, individual and bodypart
+        train_indices: the positions of rows in the dataframe that are in the train set
+        test_indices: the positions of rows in the dataframe that are in the test set
+
+    Returns:
+        A numeric dataframe indexed by "Error Type" with 4 rows (train and test error,
+        with and without p-cutoff) and one column for each bodypart. NaN entries are
+        ignored when averaging.
+    """
+    subsets = [
+        ("Train error (px)", df_error, train_indices),
+        ("Test error (px)", df_error, test_indices),
+        ("Train error (px) with p-cutoff", df_error_p_cutoff, train_indices),
+        ("Test error (px) with p-cutoff", df_error_p_cutoff, test_indices),
+    ]
+
+    error_rows = []
+    for _, df, indices in subsets:
+        df_flat = df.iloc[indices, :]
+        if isinstance(df_flat.columns, pd.MultiIndex):
+            # MA projects have column indices "scorer", "individuals" and "bodyparts"
+            # Drop the scorer level, and put individuals in rows
+            df_flat = df_flat.droplevel("scorer", axis=1).stack(level="individuals")
+
+        # Keep the row label out of the series so that the values stay numeric
+        error_rows.append(df_flat.mean())
+
+    # One column per error type; transpose to get one row per error type
+    row_names = [name for name, _, _ in subsets]
+    keypoint_error_df = pd.concat(error_rows, axis=1, keys=row_names).T
+    return keypoint_error_df.rename_axis(index="Error Type")
+
+
 def evaluate_network(
     config,
     Shuffles=[1],
@@ -504,6 +554,7 @@ def evaluate_network(
     gputouse=None,
     rescale=False,
     modelprefix="",
+    per_keypoint_evaluation: bool = False,
 ):
     """Evaluates the network.
 
@@ -557,6 +608,10 @@ def evaluate_network(
         Directory containing the deeplabcut models to use when evaluating the network.
         By default, the models are assumed to exist in the project folder.
 
+    per_keypoint_evaluation: bool, default=False
+        Compute the train and test RMSE for each keypoint, and save the results to
+        a {model_name}-keypoint-results.csv in the evaluation-results folder
+
     Returns
     -------
     None
@@ -609,6 +664,7 @@ def evaluate_network(
             comparisonbodyparts=comparisonbodyparts,
             gputouse=gputouse,
             modelprefix=modelprefix,
+            per_keypoint_evaluation=per_keypoint_evaluation,
         )
     else:
         from deeplabcut.utils.auxfun_videos import imread, imresize
@@ -720,7 +776,9 @@ def evaluate_network(
                         )
                     ),
                 )
-                auxiliaryfunctions.attempt_to_make_folder(evaluationfolder, recursive=True)
+                auxiliaryfunctions.attempt_to_make_folder(
+                    evaluationfolder, recursive=True
+                )
                 # path_train_config = modelfolder / 'train' / 'pose_cfg.yaml'
 
                 # Check which snapshots are available and sort them by # iterations
@@ -900,6 +958,15 @@ def evaluate_network(
                         ]
                         final_result.append(results)
 
+                        if per_keypoint_evaluation:
+                            df_keypoint_error = keypoint_error(
+                                RMSE, RMSEpcutoff, trainIndices, testIndices
+                            )
+                            kpt_filename = DLCscorer + "-keypoint-results.csv"
+                            df_keypoint_error.to_csv(
+                                Path(evaluationfolder) / kpt_filename
+                            )
+
                         if show_errors:
                             print(
                                 "Results for",

diff --git a/deeplabcut/pose_estimation_tensorflow/core/evaluate_multianimal.py b/deeplabcut/pose_estimation_tensorflow/core/evaluate_multianimal.py
@@ -18,7 +18,10 @@
 from scipy.spatial import cKDTree
 from tqdm import tqdm
 
-from deeplabcut.pose_estimation_tensorflow.core.evaluate import make_results_file
+from deeplabcut.pose_estimation_tensorflow.core.evaluate import (
+    make_results_file,
+    keypoint_error,
+)
 from deeplabcut.pose_estimation_tensorflow.config import load_config
 from deeplabcut.pose_estimation_tensorflow.lib import crossvalutils
 from deeplabcut.utils import visualization
@@ -106,6 +109,7 @@ def evaluate_multianimal_full(
     comparisonbodyparts="all",
     gputouse=None,
     modelprefix="",
+    per_keypoint_evaluation: bool = False,
 ):
     from deeplabcut.pose_estimation_tensorflow.core import (
         predict,
@@ -495,6 +499,18 @@ def evaluate_multianimal_full(
                         ]
                         final_result.append(results)
 
+                        if per_keypoint_evaluation:
+                            df_keypoint_error = keypoint_error(
+                                error,
+                                error[mask],
+                                trainIndices,
+                                testIndices,
+                            )
+                            kpt_filename = DLCscorer + "-keypoint-results.csv"
+                            df_keypoint_error.to_csv(
+                                Path(evaluationfolder) / kpt_filename
+                            )
+
                         if show_errors:
                             string = (
                                 "Results for {} training iterations, training fraction of {}, and shuffle {}:\n"

diff --git a/examples/testscript.py b/examples/testscript.py
@@ -177,7 +177,9 @@
     deeplabcut.train_network(path_config_file)
 
     print("EVALUATE")
-    deeplabcut.evaluate_network(path_config_file, plotting=True)
+    deeplabcut.evaluate_network(
+        path_config_file, plotting=True, per_keypoint_evaluation=True
+    )
     # deeplabcut.evaluate_network(path_config_file,plotting=True,trainingsetindex=33)
     print("CUT SHORT VIDEO AND ANALYZE (with dynamic cropping!)")
 

diff --git a/examples/testscript_multianimal.py b/examples/testscript_multianimal.py
@@ -153,7 +153,9 @@
     print("Network trained.")
 
     print("Evaluating network...")
-    deeplabcut.evaluate_network(config_path, plotting=True)
+    deeplabcut.evaluate_network(
+        config_path, plotting=True, per_keypoint_evaluation=True
+    )
 
     print("Network evaluated....")
 
@@ -296,7 +298,9 @@
     print("Network trained.")
 
     print("Evaluating network...")
-    deeplabcut.evaluate_network(config_path, plotting=True)
+    deeplabcut.evaluate_network(
+        config_path, plotting=True, per_keypoint_evaluation=True
+    )
 
     print("Network evaluated....")
 

diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py
@@ -0,0 +1,154 @@
+#
+# DeepLabCut Toolbox (deeplabcut.org)
+# © A. & M.W. Mathis Labs
+# https://github.com/DeepLabCut/DeepLabCut
+#
+# Please see AUTHORS for contributors.
+# https://github.com/DeepLabCut/DeepLabCut/blob/master/AUTHORS
+#
+# Licensed under GNU Lesser General Public License v3.0
+#
+import numpy as np
+import pandas as pd
+import pytest
+
+import deeplabcut.pose_estimation_tensorflow as pet
+
+
+def make_single_animal_rmse_df(
+    bodyparts, train_indices, test_indices, error_data=None
+) -> pd.DataFrame:
+    """Builds a single-animal error dataframe (all errors 1.0 by default)"""
+    if error_data is not None:
+        return pd.DataFrame(error_data, columns=bodyparts)
+
+    num_images = len(train_indices) + len(test_indices)
+    return pd.DataFrame(np.ones((num_images, len(bodyparts))), columns=bodyparts)
+
+
+def make_multi_animal_rmse_df(
+    scorer, individuals, bodyparts, train_indices, test_indices, error_data=None
+) -> pd.DataFrame:
+    """Builds a multi-animal error dataframe (all errors 1.0 by default)
+
+    Columns are the product of the scorer, the individuals and the bodyparts, in
+    levels named "scorer", "individuals" and "bodyparts".
+    """
+    level_names = ["scorer", "individuals", "bodyparts"]
+    level_values = [[scorer], individuals, bodyparts]
+    header = pd.MultiIndex.from_product(level_values, names=level_names)
+    if error_data is not None:
+        return pd.DataFrame(error_data, columns=header)
+
+    num_images = len(train_indices) + len(test_indices)
+    num_keypoints = len(individuals) * len(bodyparts)
+    return pd.DataFrame(np.ones((num_images, num_keypoints)), columns=header)
+
+
+KEYPOINT_ERROR_NAMES = [  # row labels looked up in the keypoint_error output
+    "Train error (px)",
+    "Test error (px)",
+    "Train error (px) with p-cutoff",
+    "Test error (px) with p-cutoff",
+]
+
+KEYPOINT_ERROR_TEST_DATA = [  # (inputs, expected [train, test] error per bodypart)
+    (  # single animal, default error data (all ones)
+        {
+            "df_error": make_single_animal_rmse_df(
+                bodyparts=["leg", "arm", "head"],
+                train_indices=[0, 1, 3],
+                test_indices=[2, 4],
+            ),
+            "train_indices": [0, 1, 3],
+            "test_indices": [2, 4],
+        },
+        {
+            "leg": [1.0, 1.0],  # train, test
+            "arm": [1.0, 1.0],  # train, test
+            "head": [1.0, 1.0],  # train, test
+        },
+    ),
+    (  # single animal, explicit error data containing a NaN
+        {
+            "df_error": make_single_animal_rmse_df(
+                bodyparts=["leftHand", "rightHand"],
+                train_indices=[0, 2],
+                test_indices=[1, 3],
+                error_data=[
+                    [1.0, np.nan],
+                    [1.0, 0.0],
+                    [0.0, 10.0],
+                    [5.0, 5.0],
+                ],
+            ),
+            "train_indices": [0, 2],
+            "test_indices": [1, 3],
+        },
+        {
+            "leftHand": [0.5, 3.0],  # train, test
+            "rightHand": [10.0, 2.5],  # train, test
+        },
+    ),
+    (  # NOTE(review): identical to the first case
+        {
+            "df_error": make_single_animal_rmse_df(
+                bodyparts=["leg", "arm", "head"],
+                train_indices=[0, 1, 3],
+                test_indices=[2, 4],
+            ),
+            "train_indices": [0, 1, 3],
+            "test_indices": [2, 4],
+        },
+        {
+            "leg": [1.0, 1.0],  # train, test
+            "arm": [1.0, 1.0],  # train, test
+            "head": [1.0, 1.0],  # train, test
+        },
+    ),
+    (  # multi animal, two individuals, error data containing NaNs
+        {
+            "df_error": make_multi_animal_rmse_df(
+                scorer="john",
+                individuals=["individual_1", "individual_2"],
+                bodyparts=["leftArm", "rightArm"],
+                train_indices=[0, 1, 3],
+                test_indices=[2],
+                error_data=[
+                    # individual_1, individual_2
+                    # leftArm, rightArm, leftArm, rightArm
+                    [1.0, np.nan, 1.0, 2.0],
+                    [2.0, 0.0, 1.0, np.nan],
+                    [3.0, 10.0, 1.0, np.nan],
+                    [10.0, 4.0, np.nan, np.nan],
+                ],
+            ),
+            "train_indices": [0, 1, 3],
+            "test_indices": [2],
+        },
+        {
+            "leftArm": [3.0, 2.0],  # train, test
+            "rightArm": [2.0, 10.0],  # train, test
+        },
+    ),
+]
+
+
+@pytest.mark.parametrize("inputs, expected_values", KEYPOINT_ERROR_TEST_DATA)
+def test_evaluate_keypoint_error(inputs, expected_values):
+    """Checks the per-bodypart train and test errors of keypoint_error
+
+    The same dataframe is passed as the error with and without p-cutoff, so the
+    rows with p-cutoff must match the rows without.
+    """
+    kpt_error = pet.keypoint_error(
+        inputs["df_error"],
+        inputs["df_error"],
+        inputs["train_indices"],
+        inputs["test_indices"],
+    )
+    for bodypart, (train_error, test_error) in expected_values.items():
+        for error_name in KEYPOINT_ERROR_NAMES:
+            expected = train_error if "train" in error_name.lower() else test_error
+            # approx: the means are floats, exact equality is fragile
+            assert kpt_error.loc[error_name, bodypart] == pytest.approx(expected)