Merged
53 commits

- 3678c57 added rtmpose head (n-poulsen, Oct 22, 2024)
- 094ba6f added CSPNext backbone (n-poulsen, Oct 22, 2024)
- ebebb3b add code to load pretrained CSPNext weights (n-poulsen, Oct 22, 2024)
- ed0cbee update number of backbone channels (n-poulsen, Oct 22, 2024)
- 6a66683 update weight init (n-poulsen, Oct 22, 2024)
- 34f1baf added possibility for SequentialLR, tests, docs, default for RTMPose (n-poulsen, Oct 24, 2024)
- f737e53 fix scheduler/optimizer config update (n-poulsen, Oct 25, 2024)
- 62e288f fix tests (n-poulsen, Oct 28, 2024)
- 0b49323 made top down crop work on non-square images, in numpy to make faster :) (n-poulsen, Oct 30, 2024)
- 6ed1ab3 added example to run top-down video analysis without the detector (n-poulsen, Oct 30, 2024)
- 8aab808 improved inits (n-poulsen, Oct 30, 2024)
- f13f88e edit rtmpose_x for default square bbox (n-poulsen, Oct 30, 2024)
- 5add563 add random bbox transform; fix top_down_crop (n-poulsen, Oct 31, 2024)
- a8470d2 add missing transform method (n-poulsen, Oct 31, 2024)
- 27679eb fix transform - label must be int (n-poulsen, Oct 31, 2024)
- 5ae8bea only apply top-down crop on non-empty image (n-poulsen, Oct 31, 2024)
- 57da127 bug fix: collating scales (n-poulsen, Oct 31, 2024)
- 49c6a4a set image dtype when no bounding box (n-poulsen, Oct 31, 2024)
- eebbfe5 fix simcc keypoint weights when -1 or 2 (n-poulsen, Oct 31, 2024)
- 083dcba update scheduler settings for rtmpose (n-poulsen, Oct 31, 2024)
- 5d2dc6c fix rtmpose-m config (n-poulsen, Oct 31, 2024)
- 91e40eb bug fix: bbox from keypoints (n-poulsen, Nov 1, 2024)
- 1697e12 update lr and switch to linear warmup (n-poulsen, Nov 1, 2024)
- 8e6ad27 bug fix - input size (n-poulsen, Nov 1, 2024)
- 3635698 added documentation (n-poulsen, Nov 1, 2024)
- 76b1414 added code to download backbone weights from huggingface (n-poulsen, Nov 7, 2024)
- 1bd0564 bug fix; added s config (n-poulsen, Nov 8, 2024)
- 5f9dd9c fix LR to 5e-05 after cosine annealing LR (n-poulsen, Nov 8, 2024)
- 26d9227 init learning rate to 5e-3 (n-poulsen, Nov 8, 2024)
- fe88d4f Merge branch 'pytorch_dlc' into niels/rtmpose (n-poulsen, Nov 8, 2024)
- be15e9e update default LR (n-poulsen, Nov 8, 2024)
- d8d8581 Merged pytorch_dlc changes (n-poulsen, Nov 8, 2024)
- 4528fab Merge branch 'pytorch_dlc' into niels/rtmpose (n-poulsen, Nov 11, 2024)
- 3ee1f4c Merge branch 'pytorch_dlc' into niels/rtmpose (n-poulsen, Nov 22, 2024)
- f336d2d Merge branch 'pytorch_dlc' into niels/rtmpose (n-poulsen, Nov 25, 2024)
- 97970fe fix rtmpose_s config (n-poulsen, Nov 25, 2024)
- 694b467 bug fix - target generation (n-poulsen, Nov 25, 2024)
- b18c69b bug fix: CSPNeXt config (n-poulsen, Nov 28, 2024)
- e3e7513 update user guide (n-poulsen, Nov 28, 2024)
- 445ad4a update user guide (n-poulsen, Nov 28, 2024)
- 6938ee4 Merge branch 'pytorch_dlc' into niels/rtmpose (n-poulsen, Dec 3, 2024)
- 2c3e6bd Merge branch 'pytorch_dlc' into niels/rtmpose (n-poulsen, Dec 4, 2024)
- 9d9cd34 fix NaNs propagating to non-visible keypoints (n-poulsen, Dec 4, 2024)
- 9d07696 update default configs (n-poulsen, Dec 4, 2024)
- e4d8699 address github comments (n-poulsen, Dec 10, 2024)
- 7158a52 Merge branch 'pytorch_dlc' into niels/rtmpose (n-poulsen, Dec 10, 2024)
- ba536a2 fix import for top_down_crop (n-poulsen, Dec 12, 2024)
- 4328068 Merge branch 'pytorch_dlc' into niels/rtmpose (n-poulsen, Dec 13, 2024)
- 9e075e9 crop sampling after affine aug (n-poulsen, Dec 13, 2024)
- d187038 RTMPose helper method (n-poulsen, Dec 13, 2024)
- ed403eb bug fix: pose inference runner (n-poulsen, Dec 13, 2024)
- 7f6b092 Merge branch 'pytorch_dlc' into niels/rtmpose (MMathisLab, Dec 14, 2024)
- 279adba Merge branch 'pytorch_dlc' into niels/rtmpose (n-poulsen, Dec 19, 2024)
3 changes: 3 additions & 0 deletions .gitignore
@@ -21,6 +21,9 @@ snapshot-*
# Modelzoo checkpoints
deeplabcut/modelzoo/checkpoints/

# PyTorch backbone weights
deeplabcut/pose_estimation_pytorch/models/backbones/pretrained_weights/

# Wandb files
wandb/

1 change: 1 addition & 0 deletions deeplabcut/__init__.py
@@ -30,6 +30,7 @@
"DLC loaded in light mode; you cannot use any GUI (labeling, relabeling and standalone GUI)"
)

from deeplabcut.core.engine import Engine
from deeplabcut.create_project import (
create_new_project,
create_new_project_3d,
74 changes: 62 additions & 12 deletions deeplabcut/pose_estimation_pytorch/README.md
@@ -420,16 +420,20 @@ train(

### Running Video Analysis outside a DeepLabCut Project

DeepLabCut provides high-level APIs (via the GUI or the python package) to analyze your
data. The usage of this API assumes the existence of a DLC project (with a `config.yaml`
file, etc.).

Sometimes it is more convenient to run a model on your data directly via a low-level
API. We also use this API under the hood, in particular for the Model Zoo. Check out the
example below:

```python
from pathlib import Path

from deeplabcut.pose_estimation_pytorch import Task
from deeplabcut.pose_estimation_pytorch.apis.analyze_videos import video_inference
from deeplabcut.pose_estimation_pytorch.config import read_config_as_dict
from deeplabcut.pose_estimation_pytorch.task import Task
from deeplabcut.pose_estimation_pytorch.apis.utils import get_inference_runners

train_dir = Path("/Users/Jaylen/my-dlc-models/train")
@@ -447,30 +447,76 @@ detector_batch_size = 8

# read model configuration
model_cfg = read_config_as_dict(pytorch_config_path)
bodyparts = model_cfg["metadata"]["bodyparts"]
unique_bodyparts = model_cfg["metadata"]["unique_bodyparts"]
with_identity = model_cfg["metadata"].get("with_identity", False)

pose_task = Task(model_cfg["method"])
pose_runner, detector_runner = get_inference_runners(
model_config=model_cfg,
snapshot_path=snapshot_path,
max_individuals=max_num_animals,
num_bodyparts=len(bodyparts),
num_unique_bodyparts=len(unique_bodyparts),
batch_size=batch_size,
with_identity=with_identity,
transform=None,
detector_batch_size=detector_batch_size,
detector_path=detector_snapshot_path,
detector_transform=None,
)

predictions = video_inference(
video=video_path,
task=pose_task,
pose_runner=pose_runner,
detector_runner=detector_runner,
with_identity=False,
)
```


### Running Top-Down Video Analysis with Existing Bounding Boxes

When `deeplabcut.pose_estimation_pytorch.apis.analyze_videos.video_inference` is called
with a top-down model, it is assumed that a detector snapshot is given as well to obtain
bounding boxes with which to run pose estimation. It's possible that you've already
obtained bounding boxes for your video (with another object detector or through some
other means), and you want to re-use those bounding boxes instead of running an object
detector again.

You can easily do so by writing a bit of custom code, as shown in the example below:

```python
from pathlib import Path

import numpy as np
from deeplabcut.pose_estimation_pytorch import get_inference_runners
from deeplabcut.pose_estimation_pytorch.apis import VideoIterator
from deeplabcut.pose_estimation_pytorch.config import read_config_as_dict
from tqdm import tqdm

# create an iterator for your video
video = VideoIterator("/Users/Jayson/my-cool-video.mp4")

# dummy bboxes - you can load yours from a file or in another way
# the bboxes should be in `xywh` format, i.e. (x_top_left, y_top_left, width, height)
bounding_boxes = [
dict( # frame 0 bounding boxes
bboxes=np.array([[12, 37, 120, 78]]),
),
dict( # frame 1 bounding boxes
bboxes=np.array([[17, 45, 128, 73], [532, 34, 117, 87]]),
),
# ...
dict( # frame N bboxes - the list must contain one entry per frame in the video!
bboxes=np.array([[17, 45, 128, 73], [532, 34, 117, 87]]),
),
]
video.set_context(bounding_boxes)
max_individuals = np.max([len(context["bboxes"]) for context in bounding_boxes])

# run inference!
model_cfg = read_config_as_dict("/Users/Jayson/pytorch_config.yaml")
pose_runner, _ = get_inference_runners(
model_config=model_cfg,
snapshot_path=Path("/Users/Jayson/model-snapshot.pt"),
max_individuals=max_individuals,
batch_size=32,
)

# your predictions will be a list, containing the predictions made for each frame
# as a dict (with keys for "bodyparts" but also "bboxes")!
predictions = pose_runner.inference(images=tqdm(video))
```
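Note that many detectors return boxes in corner `(x1, y1, x2, y2)` format rather than the `xywh` layout expected above. A small conversion helper (a sketch, not part of the DeepLabCut API) bridges the two:

```python
import numpy as np

def xyxy_to_xywh(bboxes: np.ndarray) -> np.ndarray:
    """Convert (x1, y1, x2, y2) corner boxes to (x, y, width, height) boxes."""
    bboxes = np.asarray(bboxes, dtype=float)
    converted = bboxes.copy()
    converted[:, 2] = bboxes[:, 2] - bboxes[:, 0]  # width = x2 - x1
    converted[:, 3] = bboxes[:, 3] - bboxes[:, 1]  # height = y2 - y1
    return converted

# the first frame's box from the example above, as corners: becomes 120x78 at (12, 37)
print(xyxy_to_xywh(np.array([[12, 37, 132, 115]])))
```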
1 change: 1 addition & 0 deletions deeplabcut/pose_estimation_pytorch/__init__.py
@@ -11,6 +11,7 @@
from deeplabcut.pose_estimation_pytorch.apis import (
analyze_videos,
convert_detections2tracklets,
get_inference_runners,
evaluate_network,
extract_maps,
extract_save_all_maps,
22 changes: 19 additions & 3 deletions deeplabcut/pose_estimation_pytorch/apis/__init__.py
@@ -10,16 +10,32 @@
#

from deeplabcut.pose_estimation_pytorch.apis.analyze_images import (
analyze_image_folder,
analyze_images,
superanimal_analyze_images,
)
from deeplabcut.pose_estimation_pytorch.apis.analyze_videos import analyze_videos
from deeplabcut.pose_estimation_pytorch.apis.analyze_videos import (
analyze_videos,
video_inference,
VideoIterator,
)
from deeplabcut.pose_estimation_pytorch.apis.convert_detections_to_tracklets import (
convert_detections2tracklets,
)
from deeplabcut.pose_estimation_pytorch.apis.evaluate import evaluate_network
from deeplabcut.pose_estimation_pytorch.apis.evaluate import (
evaluate,
evaluate_network,
)
from deeplabcut.pose_estimation_pytorch.apis.export import export_model
from deeplabcut.pose_estimation_pytorch.apis.train import train_network
from deeplabcut.pose_estimation_pytorch.apis.train import (
train,
train_network,
)
from deeplabcut.pose_estimation_pytorch.apis.utils import (
get_detector_inference_runner,
get_inference_runners,
get_pose_inference_runner,
)
from deeplabcut.pose_estimation_pytorch.apis.visualization import (
extract_maps,
extract_save_all_maps,
36 changes: 28 additions & 8 deletions deeplabcut/pose_estimation_pytorch/apis/utils.py
@@ -423,9 +423,9 @@ def build_bboxes_dict_for_dataframe(
def get_inference_runners(
model_config: dict,
snapshot_path: str | Path,
max_individuals: int,
num_bodyparts: int,
num_unique_bodyparts: int,
max_individuals: int | None = None,
num_bodyparts: int | None = None,
num_unique_bodyparts: int | None = None,
batch_size: int = 1,
device: str | None = None,
with_identity: bool = False,
@@ -439,9 +439,12 @@
Args:
model_config: the pytorch configuration file
snapshot_path: the path of the snapshot from which to load the weights
max_individuals: the maximum number of individuals per image
num_bodyparts: the number of bodyparts predicted by the model
num_unique_bodyparts: the number of unique_bodyparts predicted by the model
max_individuals: the maximum number of individuals per image (if None, uses the
individuals defined in the model_config metadata)
num_bodyparts: the number of bodyparts predicted by the model (if None, uses the
bodyparts defined in the model_config metadata)
num_unique_bodyparts: the number of unique_bodyparts predicted by the model (if
None, uses the unique bodyparts defined in the model_config metadata)
batch_size: the batch size to use for the pose model.
with_identity: whether the pose model has an identity head
device: if defined, overwrites the device selection from the model config
Expand All @@ -457,6 +460,13 @@ def get_inference_runners(
a runner for pose estimation
a runner for detection, if detector_path is not None
"""
if max_individuals is None:
max_individuals = len(model_config["metadata"]["individuals"])
if num_bodyparts is None:
num_bodyparts = len(model_config["metadata"]["bodyparts"])
if num_unique_bodyparts is None:
num_unique_bodyparts = len(model_config["metadata"]["unique_bodyparts"])

pose_task = Task(model_config["method"])
if device is None:
device = resolve_device(model_config)
@@ -482,10 +492,15 @@
if device == "mps":
detector_device = "cpu"

crop_cfg = model_config["data"]["inference"].get("top_down_crop", {})
width, height = crop_cfg.get("width", 256), crop_cfg.get("height", 256)
margin = crop_cfg.get("margin", 0)

pose_preprocessor = build_top_down_preprocessor(
color_mode=model_config["data"]["colormode"],
transform=transform,
cropped_image_size=(256, 256),
top_down_crop_size=(width, height),
top_down_crop_margin=margin,
)
pose_postprocessor = build_top_down_postprocessor(
max_individuals=max_individuals,
@@ -636,10 +651,15 @@ def get_pose_inference_runner(
with_identity=with_identity,
)
else:
crop_cfg = model_config["data"]["inference"].get("top_down_crop", {})
width, height = crop_cfg.get("width", 256), crop_cfg.get("height", 256)
margin = crop_cfg.get("margin", 0)

pose_preprocessor = build_top_down_preprocessor(
color_mode=model_config["data"]["colormode"],
transform=transform,
cropped_image_size=(256, 256),
top_down_crop_size=(width, height),
top_down_crop_margin=margin,
)
pose_postprocessor = build_top_down_postprocessor(
max_individuals=max_individuals,
19 changes: 19 additions & 0 deletions deeplabcut/pose_estimation_pytorch/config/backbones/cspnext_m.yaml
@@ -0,0 +1,19 @@
model:
backbone:
type: CSPNeXt
model_name: cspnext_m
freeze_bn_stats: false
freeze_bn_weights: false
deepen_factor: 0.67
widen_factor: 0.75
backbone_output_channels: 768
runner:
optimizer:
type: AdamW
params:
lr: 0.0005
scheduler:
type: LRListScheduler
params:
lr_list: [ [ 1e-4 ], [ 1e-5 ] ]
milestones: [ 90, 190 ]
19 changes: 19 additions & 0 deletions deeplabcut/pose_estimation_pytorch/config/backbones/cspnext_s.yaml
@@ -0,0 +1,19 @@
model:
backbone:
type: CSPNeXt
model_name: cspnext_s
freeze_bn_stats: false
freeze_bn_weights: false
deepen_factor: 0.33
widen_factor: 0.5
backbone_output_channels: 512
runner:
optimizer:
type: AdamW
params:
lr: 0.0005
scheduler:
type: LRListScheduler
params:
lr_list: [ [ 1e-4 ], [ 1e-5 ] ]
milestones: [ 90, 190 ]
19 changes: 19 additions & 0 deletions deeplabcut/pose_estimation_pytorch/config/backbones/cspnext_x.yaml
@@ -0,0 +1,19 @@
model:
backbone:
type: CSPNeXt
model_name: cspnext_x
freeze_bn_stats: false
freeze_bn_weights: false
deepen_factor: 1.33
widen_factor: 1.25
backbone_output_channels: 1280
runner:
optimizer:
type: AdamW
params:
lr: 0.0005
scheduler:
type: LRListScheduler
params:
lr_list: [ [ 1e-4 ], [ 1e-5 ] ]
milestones: [ 90, 190 ]
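In these configs, `LRListScheduler` appears to set the learning rate to the corresponding `lr_list` entry once each milestone epoch is reached. That reading of its semantics is an assumption; the helper below is illustrative only, not the DeepLabCut implementation:

```python
def lr_at_epoch(
    epoch: int,
    base_lr: float,
    lr_list: list[list[float]],
    milestones: list[int],
) -> float:
    """Piecewise-constant schedule: base_lr until the first milestone, then the
    lr_list entry of the most recently passed milestone."""
    lr = base_lr
    for milestone, values in zip(milestones, lr_list):
        if epoch >= milestone:
            lr = values[0]  # one value per parameter group; single group here
    return lr

# With the config above: 5e-4 for epochs 0-89, 1e-4 for 90-189, 1e-5 afterwards
schedule = [lr_at_epoch(e, 5e-4, [[1e-4], [1e-5]], [90, 190]) for e in (0, 90, 200)]
print(schedule)  # -> [0.0005, 0.0001, 1e-05]
```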
@@ -1,6 +1,9 @@
colormode: RGB
inference:
normalize_images: true
top_down_crop:
width: 256
height: 256
train:
affine:
p: 0.5
@@ -13,3 +16,6 @@ train:
hist_eq: false
motion_blur: false
normalize_images: true
top_down_crop:
width: 256
height: 256
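These `top_down_crop` entries are the values read by the runner-building code in the `utils.py` diff above. A minimal sketch of that parsing, with the same 256x256, zero-margin fallbacks (the helper name is hypothetical):

```python
def read_crop_settings(model_cfg: dict) -> tuple[tuple[int, int], int]:
    """Return ((width, height), margin) for the top-down crop, defaulting to
    a 256x256 crop with no margin when the config omits the section."""
    crop_cfg = model_cfg["data"]["inference"].get("top_down_crop", {})
    size = (crop_cfg.get("width", 256), crop_cfg.get("height", 256))
    return size, crop_cfg.get("margin", 0)

cfg = {"data": {"inference": {"top_down_crop": {"width": 192, "height": 256}}}}
print(read_crop_settings(cfg))  # -> ((192, 256), 0)
print(read_crop_settings({"data": {"inference": {}}}))  # -> ((256, 256), 0)
```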
@@ -40,6 +40,6 @@ runner:
train_settings:
batch_size: 1
dataloader_workers: 0
dataloader_pin_memory: true
dataloader_pin_memory: false
display_iters: 500
epochs: 250
@@ -39,7 +39,7 @@ model:
heatmap_config:
channels:
- 270
- 64
- 18
- "num_bodyparts + 1" # num_bodyparts + center keypoint
num_blocks: 1
dilation_rate: 1
@@ -39,7 +39,7 @@ model:
heatmap_config:
channels:
- 480
- 64
- 32
- "num_bodyparts + 1" # num_bodyparts + center keypoint
num_blocks: 1
dilation_rate: 1
@@ -39,7 +39,7 @@ model:
heatmap_config:
channels:
- 720
- 64 # TODO: Check channels
- 48
- "num_bodyparts + 1" # num_bodyparts + center keypoint
num_blocks: 1
dilation_rate: 1