2 changes: 2 additions & 0 deletions .github/workflows/lint.yml
@@ -24,3 +24,5 @@ jobs:
go-version-file: go.mod
- name: golangci-lint
uses: golangci/golangci-lint-action@1e7e51e771db61008b38414a730f564565cf7c20 # v9.2.0
with:
only-new-issues: ${{ github.event_name == 'pull_request' }}
1 change: 1 addition & 0 deletions .gitignore
@@ -1,2 +1,3 @@
*~
/deployment-tracker
.idea/
10 changes: 9 additions & 1 deletion README.md
@@ -23,6 +23,7 @@ deployment records to GitHub's artifact metadata API.
- **Real-time tracking**: Sends deployment records when pods are
created or deleted
- **Graceful shutdown**: Properly drains work queue before terminating
- **Runtime risks**: Tracks runtime risks declared through pod and owner annotations

## How It Works

@@ -82,14 +83,21 @@ The `DN_TEMPLATE` supports the following placeholders:
- `{{deploymentName}}` - Name of the owning Deployment
- `{{containerName}}` - Container name

## Runtime Risks

You can track runtime risks through annotations. Add the `github.com/runtime-risks` annotation with a comma-separated list of supported runtime risk values. The annotation is aggregated from the pod and its owner reference objects.

The currently supported runtime risk values are listed in the [Create Deployment Record API docs](https://docs.github.com/en/rest/orgs/artifact-metadata?apiVersion=2022-11-28#create-an-artifact-deployment-record). Invalid values are ignored.
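
For example, a Deployment might declare its runtime risks like this (a minimal sketch; the name, image, and risk values are placeholders, so substitute supported values from the API docs above):

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: example-app   # placeholder name
  annotations:
    # Placeholder risk values; use supported values from the API docs above.
    github.com/runtime-risks: "risk-a,risk-b"
spec:
  selector:
    matchLabels:
      app: example-app
  template:
    metadata:
      labels:
        app: example-app
    spec:
      containers:
        - name: app
          image: ghcr.io/example/app:latest   # placeholder image
```

Because annotations are aggregated across the ownership chain, annotating the Deployment covers every pod it owns; the annotation can also be set directly on the pod template.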


## Kubernetes Deployment

A complete deployment manifest is provided in `deploy/manifest.yaml`,
which includes:

- **Namespace**: `deployment-tracker`
- **ServiceAccount**: Identity for the controller pod
- **ClusterRole**: Minimal permissions (`get`, `list`, `watch` on pods)
- **ClusterRole**: Minimal permissions (`get`, `list`, `watch` on pods; `get` on other supported objects)
- **ClusterRoleBinding**: Binds the ServiceAccount to the ClusterRole
- **Deployment**: Runs the controller with security hardening

11 changes: 10 additions & 1 deletion cmd/deployment-tracker/main.go
@@ -13,6 +13,7 @@ import (
"time"

"github.com/github/deployment-tracker/internal/controller"
"k8s.io/client-go/metadata"

"github.com/prometheus/client_golang/prometheus/promhttp"
"k8s.io/client-go/kubernetes"
@@ -112,6 +113,14 @@ func main() {
os.Exit(1)
}

// Create metadata client
metadataClient, err := metadata.NewForConfig(k8sCfg)
if err != nil {
slog.Error("Error creating Kubernetes metadata client",
"error", err)
os.Exit(1)
}

// Start the metrics server
var promSrv = &http.Server{
Addr: ":" + metricsPort,
@@ -151,7 +160,7 @@ func main() {
cancel()
}()

cntrl, err := controller.New(clientset, namespace, excludeNamespaces, &cntrlCfg)
cntrl, err := controller.New(clientset, metadataClient, namespace, excludeNamespaces, &cntrlCfg)
if err != nil {
slog.Error("Failed to create controller",
"error", err)
5 changes: 4 additions & 1 deletion deploy/manifest.yaml
@@ -20,6 +20,9 @@ rules:
- apiGroups: ["apps"]
resources: ["deployments"]
verbs: ["get"]
- apiGroups: ["apps"]
resources: ["replicasets"]
verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
@@ -66,7 +69,7 @@ spec:
- name: PHYSICAL_ENVIRONMENT
value: "iad-moda1"
- name: CLUSTER
value: "kommendorkapten"
value: "test-cluster"
- name: BASE_URL
value: "http://artifact-registry.artifact-registry.svc.cluster.local:9090"
resources:
164 changes: 150 additions & 14 deletions internal/controller/controller.go
@@ -5,13 +5,17 @@ import (
"errors"
"fmt"
"log/slog"
"slices"
"strings"
"sync"
"time"

"github.com/github/deployment-tracker/pkg/deploymentrecord"
"github.com/github/deployment-tracker/pkg/image"
"github.com/github/deployment-tracker/pkg/metrics"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/metadata"

corev1 "k8s.io/api/core/v1"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
@@ -29,6 +33,8 @@ const (
EventCreated = "CREATED"
// EventDeleted indicates that a pod has been deleted.
EventDeleted = "DELETED"
// RuntimeRiskAnnotationKey represents the annotation key for runtime risks.
RuntimeRiskAnnotationKey = "github.com/runtime-risks"
)

// PodEvent represents a pod event to be processed.
@@ -38,21 +44,27 @@ type PodEvent struct {
DeletedPod *corev1.Pod // Only populated for delete events
}

// AggregatePodMetadata represents combined metadata for a pod and its ownership hierarchy.
type AggregatePodMetadata struct {
RuntimeRisks map[deploymentrecord.RuntimeRisk]bool
}

// Controller is the Kubernetes controller for tracking deployments.
type Controller struct {
clientset kubernetes.Interface
podInformer cache.SharedIndexInformer
workqueue workqueue.TypedRateLimitingInterface[PodEvent]
apiClient *deploymentrecord.Client
cfg *Config
clientset kubernetes.Interface
metadataClient metadata.Interface
podInformer cache.SharedIndexInformer
workqueue workqueue.TypedRateLimitingInterface[PodEvent]
apiClient *deploymentrecord.Client
cfg *Config
// best effort cache to avoid redundant posts
// post requests are idempotent, so if this cache fails due to
// restarts or other events, nothing will break.
observedDeployments sync.Map
}

// New creates a new deployment tracker controller.
func New(clientset kubernetes.Interface, namespace string, excludeNamespaces string, cfg *Config) (*Controller, error) {
func New(clientset kubernetes.Interface, metadataClient metadata.Interface, namespace string, excludeNamespaces string, cfg *Config) (*Controller, error) {
// Create informer factory
factory := createInformerFactory(clientset, namespace, excludeNamespaces)

@@ -84,11 +96,12 @@ func New(clientset kubernetes.Interface, namespace string, excludeNamespaces str
}

cntrl := &Controller{
clientset: clientset,
podInformer: podInformer,
workqueue: queue,
apiClient: apiClient,
cfg: cfg,
clientset: clientset,
metadataClient: metadataClient,
podInformer: podInformer,
workqueue: queue,
apiClient: apiClient,
cfg: cfg,
}

// Add event handlers to the informer
@@ -334,16 +347,26 @@ func (c *Controller) processEvent(ctx context.Context, event PodEvent) error {

var lastErr error

// Gather aggregate metadata for adds/updates
var runtimeRisks []deploymentrecord.RuntimeRisk
if status != deploymentrecord.StatusDecommissioned {
aggMetadata := c.aggregateMetadata(ctx, podToPartialMetadata(pod))
for risk := range aggMetadata.RuntimeRisks {
runtimeRisks = append(runtimeRisks, risk)
}
slices.Sort(runtimeRisks)
}

// Record info for each container in the pod
for _, container := range pod.Spec.Containers {
if err := c.recordContainer(ctx, pod, container, status, event.EventType); err != nil {
if err := c.recordContainer(ctx, pod, container, status, event.EventType, runtimeRisks); err != nil {
lastErr = err
}
}

// Also record init containers
for _, container := range pod.Spec.InitContainers {
if err := c.recordContainer(ctx, pod, container, status, event.EventType); err != nil {
if err := c.recordContainer(ctx, pod, container, status, event.EventType, runtimeRisks); err != nil {
lastErr = err
}
}
@@ -371,7 +394,7 @@ func (c *Controller) deploymentExists(ctx context.Context, namespace, name strin
}

// recordContainer records a single container's deployment info.
func (c *Controller) recordContainer(ctx context.Context, pod *corev1.Pod, container corev1.Container, status, eventType string) error {
func (c *Controller) recordContainer(ctx context.Context, pod *corev1.Pod, container corev1.Container, status, eventType string, runtimeRisks []deploymentrecord.RuntimeRisk) error {
dn := getARDeploymentName(pod, container, c.cfg.Template)
digest := getContainerDigest(pod, container.Name)

@@ -424,6 +447,7 @@ func (c *Controller) recordContainer(ctx context.Context, pod *corev1.Pod, conta
c.cfg.Cluster,
status,
dn,
runtimeRisks,
)

if err := c.apiClient.PostOne(ctx, record); err != nil {
@@ -457,6 +481,7 @@ func (c *Controller) recordContainer(ctx context.Context, pod *corev1.Pod, conta
"name", record.Name,
"deployment_name", record.DeploymentName,
"status", record.Status,
"runtime_risks", record.RuntimeRisks,
"digest", record.Digest,
)

@@ -473,6 +498,94 @@ func (c *Controller) recordContainer(ctx context.Context, pod *corev1.Pod, conta
return nil
}

// aggregateMetadata returns aggregated metadata for a pod and its owners.
func (c *Controller) aggregateMetadata(ctx context.Context, obj *metav1.PartialObjectMetadata) AggregatePodMetadata {
aggMetadata := AggregatePodMetadata{
RuntimeRisks: make(map[deploymentrecord.RuntimeRisk]bool),
}
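// Breadth-first traversal over the pod's ownership chain (e.g. Pod -> ReplicaSet -> Deployment),
// tracking visited UIDs to guard against ownership cycles.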
queue := []*metav1.PartialObjectMetadata{obj}
visited := make(map[types.UID]bool)

for len(queue) > 0 {
current := queue[0]
queue = queue[1:]

if visited[current.GetUID()] {
slog.Warn("Already visited object, skipping to avoid cycles",
"UID", current.GetUID(),
"name", current.GetName(),
)
continue
}
visited[current.GetUID()] = true

extractMetadataFromObject(current, &aggMetadata)
c.addOwnersToQueue(ctx, current, &queue)
}

return aggMetadata
}

// addOwnersToQueue takes a current object and looks up its owners, adding them to the queue for processing
// to collect their metadata.
func (c *Controller) addOwnersToQueue(ctx context.Context, current *metav1.PartialObjectMetadata, queue *[]*metav1.PartialObjectMetadata) {
ownerRefs := current.GetOwnerReferences()

for _, owner := range ownerRefs {
ownerObj, err := c.getOwnerMetadata(ctx, current.GetNamespace(), owner)
if err != nil {
slog.Warn("Failed to get owner object for metadata collection",
"namespace", current.GetNamespace(),
"owner_kind", owner.Kind,
"owner_name", owner.Name,
"error", err,
)
continue
}

if ownerObj == nil {
continue
}

*queue = append(*queue, ownerObj)
}
}

// getOwnerMetadata retrieves partial object metadata for an owner ref.
func (c *Controller) getOwnerMetadata(ctx context.Context, namespace string, owner metav1.OwnerReference) (*metav1.PartialObjectMetadata, error) {
gvr := schema.GroupVersionResource{
Group: "apps",
Version: "v1",
}

switch owner.Kind {
case "ReplicaSet":
gvr.Resource = "replicasets"
case "Deployment":
gvr.Resource = "deployments"
default:
slog.Debug("Unsupported owner kind for runtime risk collection",
"kind", owner.Kind,
"name", owner.Name,
)
return nil, nil
}

obj, err := c.metadataClient.Resource(gvr).Namespace(namespace).Get(ctx, owner.Name, metav1.GetOptions{})
if err != nil {
if k8serrors.IsNotFound(err) {
slog.Debug("Owner object not found for metadata collection",
"namespace", namespace,
"owner_kind", owner.Kind,
"owner_name", owner.Name,
)
return nil, nil
}
return nil, err
}
return obj, nil
}

func getCacheKey(dn, digest string) string {
return dn + "||" + digest
}
@@ -580,3 +693,26 @@ func getDeploymentName(pod *corev1.Pod) string {
}
return ""
}

// extractMetadataFromObject extracts runtime risk annotations from an object into the aggregate metadata.
func extractMetadataFromObject(obj *metav1.PartialObjectMetadata, aggMetadata *AggregatePodMetadata) {
annotations := obj.GetAnnotations()
if risks, exists := annotations[RuntimeRiskAnnotationKey]; exists {
for _, risk := range strings.Split(risks, ",") {
r := deploymentrecord.ValidateRuntimeRisk(risk)
if r != "" {
aggMetadata.RuntimeRisks[r] = true
}
}
}
}

func podToPartialMetadata(pod *corev1.Pod) *metav1.PartialObjectMetadata {
return &metav1.PartialObjectMetadata{
TypeMeta: metav1.TypeMeta{
APIVersion: "v1",
Kind: "Pod",
},
ObjectMeta: pod.ObjectMeta,
}
}