Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions pkg/env/sensor.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,20 @@ var (
// 100 (per flow) * 1000 (flows) * 100 (buffer size) = 10 MB
NetworkFlowBufferSize = RegisterIntegerSetting("ROX_SENSOR_NETFLOW_OFFLINE_BUFFER_SIZE", 100)

// NetworkFlowClosedConnRememberDuration controls how long the categorized update computer will track
// timestamps for closed connections to handle late-arriving updates.
NetworkFlowClosedConnRememberDuration = registerDurationSetting("ROX_NETFLOW_CLOSED_CONN_REMEMBER_DURATION", 6*time.Minute)

// NetworkFlowDeduperHashingAlgorithm selects the hashing algorithm used for the deduper in the process of
// computing the updates for Central.
// Available choices and their effects (case-insensitive):
// - "FNV64" (default): Uses 64-bit FNV-1a algorithm that optimizes the memory consumption of Sensor.
// It is one of the fastest available 64-bit hashes with decent collision probability.
// - "String": Uses CPU-optimized string concatenation to produce a hash. This implementation makes the deduper
// use more memory than FNV64 (roughly 3x more) but optimizes the CPU performance. It may be preferred
// on less active clusters with little network traffic and few processes, or when CPU resources are limited.
NetworkFlowDeduperHashingAlgorithm = RegisterSetting("ROX_NETFLOW_DEDUPER_HASHING_ALGORITHM", WithDefault("FNV64"))

// ProcessIndicatorBufferSize indicates how many process indicators will be kept in Sensor while offline.
// 1 Item in the buffer = ~300 bytes
// 50000 * 300 = 15 MB
Expand Down
4 changes: 2 additions & 2 deletions sensor/common/networkflow/manager/enrichment_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ func TestEnrichConnection_BusinessLogicPaths(t *testing.T) {
enrichTickerC := make(chan time.Time)
defer close(enrichTickerC)

m, mockEntityStore, mockExternalSrc, _ := createManager(mockCtrl, updatecomputer.NewLegacy(), enrichTickerC)
m, mockEntityStore, mockExternalSrc, _ := createManager(mockCtrl, updatecomputer.NewTransitionBased(), enrichTickerC)

// Setup mocks
mocks := newMockExpectations(mockEntityStore, mockExternalSrc)
Expand Down Expand Up @@ -364,7 +364,7 @@ func TestEnrichContainerEndpoint_EdgeCases(t *testing.T) {
enrichTickerC := make(chan time.Time)
defer close(enrichTickerC)

m, mockEntityStore, _, _ := createManager(mockCtrl, updatecomputer.NewLegacy(), enrichTickerC)
m, mockEntityStore, _, _ := createManager(mockCtrl, updatecomputer.NewTransitionBased(), enrichTickerC)

// Setup mocks
mocks := newMockExpectations(mockEntityStore, nil)
Expand Down
4 changes: 2 additions & 2 deletions sensor/common/networkflow/manager/manager_enrich_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ func (s *TestNetworkFlowManagerEnrichmentTestSuite) TestEnrichConnection() {
enrichTickerC := make(chan time.Time)
defer close(enrichTickerC)
defer mockCtrl.Finish()
m, mockEntityStore, mockExternalSrc, _ := createManager(mockCtrl, updatecomputer.NewLegacy(), enrichTickerC)
m, mockEntityStore, mockExternalSrc, _ := createManager(mockCtrl, updatecomputer.NewTransitionBased(), enrichTickerC)
srcID := "src-id"
dstID := "dst-id"

Expand Down Expand Up @@ -494,7 +494,7 @@ func (s *TestNetworkFlowManagerEnrichmentTestSuite) TestEnrichContainerEndpoint(

for name, tc := range cases {
s.Run(name, func() {
m, mockEntityStore, _, _ := createManager(mockCtrl, updatecomputer.NewLegacy(), enrichTickerC)
m, mockEntityStore, _, _ := createManager(mockCtrl, updatecomputer.NewTransitionBased(), enrichTickerC)

// Setup environment variables
s.T().Setenv(env.ProcessesListeningOnPort.EnvVar(), strconv.FormatBool(tc.plopFeatEnabled))
Expand Down
22 changes: 14 additions & 8 deletions sensor/common/networkflow/manager/manager_impl.go
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,8 @@ func (m *networkFlowManager) updateEnrichmentCollectionsSize() {
// Number of entities (connections, endpoints) stored in memory for the purposes of not losing data while offline.
concurrency.WithRLock(&m.activeConnectionsMutex, func() {
flowMetrics.EnrichmentCollectionsSize.WithLabelValues("activeConnections", "connections").Set(float64(len(m.activeConnections)))
})
concurrency.WithRLock(&m.activeEndpointsMutex, func() {
flowMetrics.EnrichmentCollectionsSize.WithLabelValues("activeEndpoints", "endpoints").Set(float64(len(m.activeEndpoints)))
})

Expand All @@ -399,8 +401,10 @@ func (m *networkFlowManager) updateEnrichmentCollectionsSize() {

func (m *networkFlowManager) enrichAndSend() {
m.updateEnrichmentCollectionsSize()
// Takes host connections & endpoints and updates them by enriching with additional data.
// Updates m.activeEndpoints and m.activeConnections if lastSeen was reported as null by the Collector.
// currentEnrichedConnsAndEndpoints takes connections, endpoints, and processes (i.e., enriched-entities, short EE)
// and updates them by adding data from different sources (enriching).
// It updates m.activeEndpoints and m.activeConnections if EE is open (i.e., lastSeen is set to null by Collector).
// Enriched-entities for which the enrichment should be retried are not returned from currentEnrichedConnsAndEndpoints!
currentConns, currentEndpoints, currentProcesses := m.currentEnrichedConnsAndEndpoints()

// The new changes are sent to Central using the update computer implementation.
Expand All @@ -416,18 +420,20 @@ func (m *networkFlowManager) enrichAndSend() {
flowMetrics.NumUpdatesSentToCentralGauge.WithLabelValues("endpoints").Set(float64(len(updatedEndpoints)))
flowMetrics.NumUpdatesSentToCentralGauge.WithLabelValues("processes").Set(float64(len(updatedProcesses)))

// Run periodic cleanup after all tasks here are done.
defer m.updateComputer.PeriodicCleanup(time.Now(), time.Minute)

if len(updatedConns)+len(updatedEndpoints) > 0 {
if sent := m.sendConnsEps(updatedConns, updatedEndpoints); sent {
// Update the UpdateComputer's internal state after sending updates to Central.
// This is important for update computers that rely on the state from the previous tick.
m.updateComputer.UpdateState(currentConns, currentEndpoints, nil)
// Inform the updateComputer that sending has succeeded
m.updateComputer.OnSuccessfulSend(currentConns, currentEndpoints, nil)
}
}

if env.ProcessesListeningOnPort.BooleanSetting() && len(updatedProcesses) > 0 {
if sent := m.sendProcesses(updatedProcesses); sent {
// Update the UpdateComputer's internal state after sending updates to Central.
// This is important for update computers that rely on the state from the previous tick.
m.updateComputer.UpdateState(nil, nil, currentProcesses)
// Inform the updateComputer that sending has succeeded
m.updateComputer.OnSuccessfulSend(nil, nil, currentProcesses)
}
}
metrics.SetNetworkFlowBufferSizeGauge(len(m.sensorUpdates))
Expand Down
4 changes: 2 additions & 2 deletions sensor/common/networkflow/manager/manager_impl_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ func (s *NetworkFlowManagerTestSuite) TestManagerOfflineMode() {
enrichTickerC := make(chan time.Time)
defer close(enrichTickerC)
defer mockCtrl.Finish()
m, mockEntity, _, mockDetector := createManager(mockCtrl, updatecomputer.NewLegacy(), enrichTickerC)
m, mockEntity, _, mockDetector := createManager(mockCtrl, updatecomputer.NewTransitionBased(), enrichTickerC)
states := []struct {
testName string
notify common.SensorComponentEvent
Expand Down Expand Up @@ -375,7 +375,7 @@ func (s *NetworkFlowManagerTestSuite) TestExpireMessage() {
enrichTickerC := make(chan time.Time)
defer close(enrichTickerC)
defer mockCtrl.Finish()
m, mockEntity, _, mockDetector := createManager(mockCtrl, updatecomputer.NewLegacy(), enrichTickerC)
m, mockEntity, _, mockDetector := createManager(mockCtrl, updatecomputer.NewTransitionBased(), enrichTickerC)
go m.enrichConnections(enrichTickerC)
mockEntity.EXPECT().LookupByContainerID(gomock.Any()).Times(1).DoAndReturn(func(_ any) (clusterentities.ContainerMetadata, bool, bool) {
return clusterentities.ContainerMetadata{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ func (b *sendNetflowsSuite) SetupTest() {
b.mockCtrl = gomock.NewController(b.T())
enrichTickerC := make(chan time.Time)
defer close(enrichTickerC)
b.m, b.mockEntity, _, b.mockDetector = createManager(b.mockCtrl, updatecomputer.NewLegacy(), enrichTickerC)
b.m, b.mockEntity, _, b.mockDetector = createManager(b.mockCtrl, updatecomputer.NewTransitionBased(), enrichTickerC)

b.fakeTicker = make(chan time.Time)
go b.m.enrichConnections(b.fakeTicker)
Expand Down
16 changes: 8 additions & 8 deletions sensor/common/networkflow/manager/purger_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ func (s *NetworkFlowPurgerTestSuite) TestPurgerStartWithTicker() {
s.T().Setenv(env.EnrichmentPurgerTickerCycle.EnvVar(), "1s")
s.Equal(time.Second, nonZeroPurgerCycle())

m, mockEntityStore, _, _ := createManager(mockCtrl, updatecomputer.NewLegacy(), enrichTickerC)
m, mockEntityStore, _, _ := createManager(mockCtrl, updatecomputer.NewTransitionBased(), enrichTickerC)
purger := NewNetworkFlowPurger(mockEntityStore, time.Hour, WithManager(m))
s.NoError(purger.Start())
// Enable the ticker after going online - send the same signal that activates the manager
Expand All @@ -50,7 +50,7 @@ func (s *NetworkFlowPurgerTestSuite) TestDisabledPurger() {
// Disabling the purger
s.T().Setenv(env.EnrichmentPurgerTickerCycle.EnvVar(), "0s")

m, mockEntityStore, _, _ := createManager(mockCtrl, updatecomputer.NewLegacy(), enrichTickerC)
m, mockEntityStore, _, _ := createManager(mockCtrl, updatecomputer.NewTransitionBased(), enrichTickerC)
purger := NewNetworkFlowPurger(mockEntityStore, time.Hour, WithManager(m), WithPurgerTicker(s.T(), purgerTickerC))

s.ErrorContains(purger.Start(), "purger is disabled")
Expand All @@ -72,7 +72,7 @@ func (s *NetworkFlowPurgerTestSuite) TestPurgerWithoutManager() {
enrichTickerC := make(chan time.Time)
defer close(enrichTickerC)
defer mockCtrl.Finish()
_, mockEntityStore, _, _ := createManager(mockCtrl, updatecomputer.NewLegacy(), enrichTickerC)
_, mockEntityStore, _, _ := createManager(mockCtrl, updatecomputer.NewTransitionBased(), enrichTickerC)
// Don't set manager to explicitly simulate disconnected purger
purger := NewNetworkFlowPurger(mockEntityStore, time.Hour, WithPurgerTicker(s.T(), purgerTickerC))

Expand Down Expand Up @@ -124,7 +124,7 @@ func (s *NetworkFlowPurgerTestSuite) TestPurgerWithManager() {
now := time.Now()
lastUpdateTS := timestamp.FromGoTime(now.Add(-tc.lastUpdateTime))

m, mockEntityStore, _, _ := createManager(mockCtrl, updatecomputer.NewLegacy(), enrichTickerC)
m, mockEntityStore, _, _ := createManager(mockCtrl, updatecomputer.NewTransitionBased(), enrichTickerC)
purger := NewNetworkFlowPurger(mockEntityStore, tc.purgerMaxAge, WithManager(m), WithPurgerTicker(s.T(), purgerTickerC))

expectationsEndpointPurger(mockEntityStore, tc.isKnownEndpoint, true, false)
Expand Down Expand Up @@ -214,7 +214,7 @@ func (s *NetworkFlowPurgerTestSuite) TestPurgerHostConnsEndpoints() {
now := time.Now()
lastUpdateTS := timestamp.FromGoTime(now.Add(-tc.lastUpdateTime))

m, mockEntityStore, _, _ := createManager(mockCtrl, updatecomputer.NewLegacy(), enrichTickerC)
m, mockEntityStore, _, _ := createManager(mockCtrl, updatecomputer.NewTransitionBased(), enrichTickerC)
expectationsEndpointPurger(mockEntityStore, tc.isKnownEndpoint, tc.foundContainerID, false)
ep := createEndpointPair(timestamp.FromGoTime(now.Add(-tc.firstSeen)), lastUpdateTS)
concurrency.WithLock(&m.connectionsByHostMutex, func() {
Expand Down Expand Up @@ -283,7 +283,7 @@ func (s *NetworkFlowPurgerTestSuite) TestPurgerHostConnsConnections() {
now := time.Now()
lastUpdateTS := timestamp.FromGoTime(now.Add(-tc.lastUpdateTime))

m, mockEntityStore, _, _ := createManager(mockCtrl, updatecomputer.NewLegacy(), enrichTickerC)
m, mockEntityStore, _, _ := createManager(mockCtrl, updatecomputer.NewTransitionBased(), enrichTickerC)
expectationsEndpointPurger(mockEntityStore, true, tc.foundContainerID, tc.containerIDHistorical)

pair := createConnectionPair().
Expand Down Expand Up @@ -348,7 +348,7 @@ func (s *NetworkFlowPurgerTestSuite) TestPurgerActiveConnections() {
now := time.Now()
lastUpdateTS := timestamp.FromGoTime(now.Add(-tc.lastUpdateTime))

m, mockEntityStore, _, _ := createManager(mockCtrl, updatecomputer.NewLegacy(), enrichTickerC)
m, mockEntityStore, _, _ := createManager(mockCtrl, updatecomputer.NewTransitionBased(), enrichTickerC)
expectationsEndpointPurger(mockEntityStore, true, tc.foundContainerID, tc.containerIDHistorical)

pair := createConnectionPair().
Expand Down Expand Up @@ -438,7 +438,7 @@ func (s *NetworkFlowPurgerTestSuite) TestPurgerActiveEndpoints() {
now := time.Now()
lastUpdateTS := timestamp.FromGoTime(now.Add(-tc.lastUpdateTime))

m, mockEntityStore, _, _ := createManager(mockCtrl, updatecomputer.NewLegacy(), enrichTickerC)
m, mockEntityStore, _, _ := createManager(mockCtrl, updatecomputer.NewTransitionBased(), enrichTickerC)
expectationsEndpointPurger(mockEntityStore, tc.isKnownEndpoint, tc.foundContainerID, tc.containerIDHistorical)

ep := createEndpointPair(timestamp.FromGoTime(now.Add(-tc.firstSeen)), lastUpdateTS)
Expand Down
10 changes: 7 additions & 3 deletions sensor/common/networkflow/updatecomputer/interface.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package updatecomputer

import (
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/stackrox/rox/generated/storage"
"github.com/stackrox/rox/pkg/timestamp"
Expand All @@ -14,15 +16,17 @@ type UpdateComputer interface {
ComputeUpdatedEndpoints(current map[indicator.ContainerEndpoint]timestamp.MicroTS) []*storage.NetworkEndpoint
ComputeUpdatedProcesses(current map[indicator.ProcessListening]timestamp.MicroTS) []*storage.ProcessListeningOnPortFromSensor

// UpdateState brings the update computer to the desired state.
// Required for tests and implementations that store the last sent state (e.g., Legacy).
UpdateState(currentConns map[indicator.NetworkConn]timestamp.MicroTS,
// OnSuccessfulSend contains actions that should be executed after successful sending of updates to Central.
OnSuccessfulSend(currentConns map[indicator.NetworkConn]timestamp.MicroTS,
currentEndpoints map[indicator.ContainerEndpoint]timestamp.MicroTS,
currentProcesses map[indicator.ProcessListening]timestamp.MicroTS)

// ResetState resets all internal state (used when clearing historical data).
ResetState()

// PeriodicCleanup should be run periodically to clean up the temporal data.
PeriodicCleanup(now time.Time, cleanupInterval time.Duration)

// RecordSizeMetrics records metrics for length and byte-size of the collections used in updateComputer.
RecordSizeMetrics(gv1, gv2 *prometheus.GaugeVec)
}
7 changes: 5 additions & 2 deletions sensor/common/networkflow/updatecomputer/legacy.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package updatecomputer

import (
"maps"
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/stackrox/rox/generated/storage"
Expand Down Expand Up @@ -67,10 +68,10 @@ func (l *Legacy) ComputeUpdatedProcesses(current map[indicator.ProcessListening]
})
}

// UpdateState updates the internal LastSentState maps with the currentState state.
// OnSuccessfulSend updates the internal LastSentState maps with the currentState state.
// Providing nil will skip updates for respective map.
// Providing empty map will reset the state for given state.
func (l *Legacy) UpdateState(currentConns map[indicator.NetworkConn]timestamp.MicroTS,
func (l *Legacy) OnSuccessfulSend(currentConns map[indicator.NetworkConn]timestamp.MicroTS,
currentEndpoints map[indicator.ContainerEndpoint]timestamp.MicroTS,
currentProcesses map[indicator.ProcessListening]timestamp.MicroTS,
) {
Expand All @@ -93,6 +94,8 @@ func (l *Legacy) UpdateState(currentConns map[indicator.NetworkConn]timestamp.Mi
}
}

func (l *Legacy) PeriodicCleanup(_ time.Time, _ time.Duration) {}

// ResetState clears all internal LastSentState maps
func (l *Legacy) ResetState() {
l.lastSentStateMutex.Lock()
Expand Down
40 changes: 40 additions & 0 deletions sensor/common/networkflow/updatecomputer/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package updatecomputer

import (
"github.com/prometheus/client_golang/prometheus"
"github.com/stackrox/rox/pkg/metrics"
)

// init registers the transition-based update computer's metric collectors
// with the default Prometheus registry. MustRegister panics if any collector
// is already registered, so this package must be imported at most once per
// registry lifetime.
func init() {
	prometheus.MustRegister(
		UpdateEvents,
		UpdateEventsGauge,
		periodicCleanupDurationSeconds,
	)
}

// Metric collectors describing the decisions made by the transition-based
// update computer (see categorizeUpdate). All three are registered in init.
var (
	// UpdateEvents is a monotonically increasing counter of categorizeUpdate
	// decisions, labeled by open/closed transition, entity kind, the action
	// taken (whether the update was sent to Central), and the reason.
	UpdateEvents = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: metrics.PrometheusNamespace,
		Subsystem: metrics.SensorSubsystem.String(),
		Name:      "update_computer_update_events_total",
		// NOTE: trailing spaces inside the concatenated segments are required
		// so that the rendered help text keeps word separation.
		Help: "Counts the internal update events for the categorizeUpdate method in TransitionBased updateComputer. " +
			"The 'transition' allows counting the transitions of connections between states 'open' and 'closed'. " +
			"Action stores the decision whether a given update was sent to Central.",
	}, []string{"transition", "entity", "action", "reason"})
	// UpdateEventsGauge carries the same labels as UpdateEvents but reflects
	// only the events observed within a single tick.
	UpdateEventsGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: metrics.PrometheusNamespace,
		Subsystem: metrics.SensorSubsystem.String(),
		Name:      "update_computer_update_events_current",
		Help: "Counts the internal update events for the categorizeUpdate method in TransitionBased updateComputer in a single tick. " +
			"The 'transition' allows counting the transitions of connections between states 'open' and 'closed' in a given tick. " +
			"Action stores the decision whether a given update was sent to Central.",
	}, []string{"transition", "entity", "action", "reason"})
	// periodicCleanupDurationSeconds observes the wall-clock duration of each
	// PeriodicCleanup pass; buckets span 10ms to 5s.
	periodicCleanupDurationSeconds = prometheus.NewHistogram(prometheus.HistogramOpts{
		Namespace: metrics.PrometheusNamespace,
		Subsystem: metrics.SensorSubsystem.String(),
		Name:      "update_computer_periodic_cleanup_duration_seconds",
		Help:      "Time in seconds taken to perform a single periodic cleanup on the transition-based update computer.",
		Buckets:   []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5},
	})
)
Loading
Loading