Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ require (
github.com/aws/smithy-go v1.23.0
github.com/cenkalti/backoff/v3 v3.2.2
github.com/cenkalti/backoff/v4 v4.3.0
github.com/cespare/xxhash v1.1.0
github.com/cloudflare/cfssl v1.6.5
github.com/cockroachdb/pebble/v2 v2.1.0
github.com/containers/image/v5 v5.36.2
Expand Down
3 changes: 3 additions & 0 deletions go.sum

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 0 additions & 10 deletions pkg/env/sensor.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,16 +52,6 @@ var (
// updates sent to Central. Setting this to `true` enables the behavior as in 4.8 and earlier.
NetworkFlowUseLegacyUpdateComputer = RegisterBooleanSetting("ROX_NETFLOW_USE_LEGACY_UPDATE_COMPUTER", false)

// NetworkFlowDeduperHashingAlgorithm selects the hashing algorithm used for the deduper in the process of
// computing the updates for Central.
// Available choices and their effects (case-insensitive):
// - "FNV64" (default): Uses 64-bit FNV-1a algorithm that optimizes the memory consumption of Sensor.
// It is one of the fastest available 64-bit hashes with decent collision probability.
// - "String": Uses CPU-optimized string concatenation to produce a hash. This implementation makes the deduper
// use more memory than FNV64 (roughly 3x more) but optimizes the CPU performance. It may be preferred
// on less active clusters with little network traffic and processes or when CPU resource is limited.
NetworkFlowDeduperHashingAlgorithm = RegisterSetting("ROX_NETFLOW_DEDUPER_HASHING_ALGORITHM", WithDefault("FNV64"))

// ProcessIndicatorBufferSize indicates how many process indicators will be kept in Sensor while offline.
// 1 Item in the buffer = ~300 bytes
// 50000 * 300 = 15 MB
Expand Down
41 changes: 13 additions & 28 deletions sensor/common/networkflow/manager/indicator/indicator.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,11 @@ import (
"github.com/stackrox/rox/pkg/timestamp"
)

// HashingAlgo selects the algorithm for hashing the connection/endpoint/process fingerprinting.
// It is passed to the Key methods of the indicator types to choose how the key is produced.
type HashingAlgo int

const (
// HashingAlgoString makes Key return the key built by keyString (plain string concatenation).
// It is the zero value of HashingAlgo.
HashingAlgoString HashingAlgo = iota
// HashingAlgoHash makes Key return the key built by keyHash.
// Key also falls back to keyHash for any value other than HashingAlgoString.
HashingAlgoHash
)
// BinaryHash represents a 64-bit hash for memory-efficient key storage.
// Values of this type are returned by the BinaryKey methods of the indicator types.
// Using uint64 directly avoids conversion overhead and provides faster map operations
// compared to [8]byte (single-instruction comparison vs byte-by-byte).
// Switching to a 128-bit hash would require using [16]byte.
type BinaryHash uint64

// ProcessInfo represents process information used in indicators
type ProcessInfo struct {
Expand Down Expand Up @@ -53,13 +51,8 @@ func (i *NetworkConn) ToProto(ts timestamp.MicroTS) *storage.NetworkFlow {
return proto
}

func (i *NetworkConn) Key(h HashingAlgo) string {
switch h {
case HashingAlgoString:
return i.keyString()
default:
return i.keyHash()
}
// Key returns the string key of this NetworkConn as computed by keyHash.
func (i *NetworkConn) Key() string {
return i.keyHash()
}

// ContainerEndpoint is a key in Sensor's maps that track active endpoints. Its set of fields should be minimal.
Expand All @@ -85,13 +78,9 @@ func (i *ContainerEndpoint) ToProto(ts timestamp.MicroTS) *storage.NetworkEndpoi
return proto
}

func (i *ContainerEndpoint) Key(h HashingAlgo) string {
switch h {
case HashingAlgoString:
return i.keyString()
default:
return i.keyHash()
}
// BinaryKey generates a binary hash for memory-efficient storage in dedupers.
// It returns the BinaryHash computed by binaryKeyHash for this ContainerEndpoint.
func (i *ContainerEndpoint) BinaryKey() BinaryHash {
return i.binaryKeyHash()
}

// ProcessListening represents a listening process.
Expand Down Expand Up @@ -138,11 +127,7 @@ func (i *ProcessListening) ToProto(ts timestamp.MicroTS) *storage.ProcessListeni
return proto
}

func (i *ProcessListening) Key(h HashingAlgo) string {
switch h {
case HashingAlgoString:
return i.keyString()
default:
return i.keyHash()
}
// BinaryKey generates a binary hash for memory-efficient storage in dedupers.
// It returns the BinaryHash computed by binaryKeyHash for this ProcessListening.
func (i *ProcessListening) BinaryKey() BinaryHash {
return i.binaryKeyHash()
}
136 changes: 43 additions & 93 deletions sensor/common/networkflow/manager/indicator/key.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,99 +2,34 @@ package indicator

import (
"hash"
"hash/fnv"
"strconv"
"strings"
"unsafe"

"github.com/cespare/xxhash"
"github.com/stackrox/rox/generated/storage"
)

// keyString builds a ':'-delimited string that uniquely identifies a given NetworkConn indicator.
// Assumption: two NetworkConn values are identical (for the network-graph purposes) when their keys are identical.
// This is the CPU-optimized variant: it is faster than `keyHash`, but the resulting string takes more memory.
func (i *NetworkConn) keyString() string {
	var sb strings.Builder
	// Reserve 82 chars up front to avoid re-sizing. The estimate is based on typical field lengths:
	// 2 x 36 chars for the UUIDv4 IDs + 1 char for the delimiter + at most 9 chars for port and protocol.
	sb.Grow(82)
	// The delimiter plays an important role in avoiding collisions
	// (is "appserver" an "app:server" or an "apps:erver"?).
	sb.WriteString(i.SrcEntity.ID)
	sb.WriteByte(':')
	sb.WriteString(i.DstEntity.ID)
	formatPortAndProtocol(&sb, i.DstPort, i.Protocol)
	return sb.String()
}
var hashDelimiter = []byte{0}

// keyHash produces a string that uniquely identifies a given NetworkConn indicator.
// Assumption: Two NetworkConn's are identical (for the network-graph purposes) when their keys are identical.
// This is memory-optimized implementation that is slower than `keyString`, but the resulting string takes less memory.
func (i *NetworkConn) keyHash() string {
h := fnv.New64a()
// For a collection of 10^N uniformly distributed items, the 64-bit FNV-1a hash has an approximate collision probability of 2.71x10^(2N-20).
// For example: for 100M items (N=8), the collision probability is about 2.71x10^-4 (roughly 0.027%).
h := xxhash.New()
// Collision probability example: for 100M uniformly distributed items, the collision probability of a 64-bit hash is about 2.71x10^-4 (roughly 0.027%).
// For lower collision probabilities, one needs to use a fast 128bit hash, for example: XXH3_128 (LLM recommendation).
hashStrings(h, i.SrcEntity.ID, i.DstEntity.ID)
hashPortAndProtocol(h, i.DstPort, i.Protocol)
return hashToHexString(h.Sum64())
}

// keyString builds a ':'-delimited string that uniquely identifies a given ContainerEndpoint indicator.
// Assumption: two ContainerEndpoint values are identical (for the network-graph purposes) when their keys are identical.
// This is the CPU-optimized variant: it is faster than `keyHash`, but the resulting string takes more memory.
func (i *ContainerEndpoint) keyString() string {
	var sb strings.Builder
	// Reserve 45 chars, an estimate based on typical ID lengths:
	// 36 chars for the UUIDv4 entity ID + at most 9 chars for port and protocol.
	sb.Grow(45)
	_, _ = sb.WriteString(i.Entity.ID)
	formatPortAndProtocol(&sb, i.Port, i.Protocol)
	return sb.String()
}

// keyHash hashes the fields of a ContainerEndpoint indicator with 64-bit FNV-1a and returns the
// hash formatted by hashToHexString.
// Assumption: two ContainerEndpoint values are identical (for the network-graph purposes) when their keys are identical.
// This is the memory-optimized variant: it is slower than `keyString`, but the resulting string takes less memory.
func (i *ContainerEndpoint) keyHash() string {
	hasher := fnv.New64a()
	hashStrings(hasher, i.Entity.ID)
	hashPortAndProtocol(hasher, i.Port, i.Protocol)
	sum := hasher.Sum64()
	return hashToHexString(sum)
}

// keyString builds a ':'-delimited string that uniquely identifies a given ProcessListening indicator.
// Assumption: two ProcessListening values are identical (for the network-graph & PLoP purposes) when their keys are identical.
// This is the CPU-optimized variant: it is faster than `keyHash`, but the resulting string takes more memory.
func (i *ProcessListening) keyString() string {
	var sb strings.Builder
	// A precise pre-allocation size is hard to compute because many fields have variable length.
	// The estimate is partially a gut feeling: 5 strings of assumed 30 chars each + 5 chars for
	// delimiters = 155 chars, plus at most 9 chars for port and protocol.
	sb.Grow(165)

	// Some fields are skipped to save memory - they should not be required to ensure uniqueness.
	buildStringKey(
		&sb,
		i.PodID,
		i.ContainerName,
		i.Process.ProcessName,
		i.Process.ProcessExec,
		i.Process.ProcessArgs,
	)
	formatPortAndProtocol(&sb, i.Port, i.Protocol)
	return sb.String()
}

// keyHash hashes the fields of a ProcessListening indicator with 64-bit FNV-1a and returns the
// hash formatted by hashToHexString.
// Assumption: two ProcessListening values are identical (for the network-graph & PLoP purposes) when their keys are identical.
// This is the memory-optimized variant: it is slower than `keyString`, but the resulting string takes less memory.
func (i *ProcessListening) keyHash() string {
	hasher := fnv.New64a()
	// Process and container identity, following `ProcessIndicatorUniqueKey`.
	hashStrings(
		hasher,
		i.PodID,
		i.ContainerName,
		i.Process.ProcessName,
		i.Process.ProcessExec,
		i.Process.ProcessArgs,
	)
	// Endpoint identity, as in containerEndpoint.
	hashPortAndProtocol(hasher, i.Port, i.Protocol)
	sum := hasher.Sum64()
	return hashToHexString(sum)
}

// Common hash computation utilities
func hashPortAndProtocol(h hash.Hash64, port uint16, protocol storage.L4Protocol) {
portBytes := [2]byte{byte(port >> 8), byte(port)}
_, _ = h.Write(portBytes[:]) // FNV never returns errors, but being explicit

protocolBytes := [4]byte{
buf := [6]byte{
byte(port >> 8), byte(port),
byte(protocol >> 24), byte(protocol >> 16),
byte(protocol >> 8), byte(protocol),
}
_, _ = h.Write(protocolBytes[:])
_, _ = h.Write(buf[:]) // xxhash never returns errors, but being explicit
}

// hashToHexString is performance-optimized implementation of fmt.Sprintf("%016x", hash).
Expand All @@ -114,29 +49,44 @@ func hashToHexString(hash uint64) string {
return string(buf)
}

// formatPortAndProtocol appends ":<port>:<protocol>" to buf, with both values written as
// base-10 unsigned integers.
func formatPortAndProtocol(buf *strings.Builder, port uint16, protocol storage.L4Protocol) {
	// Each field is emitted the same way: a ':' delimiter followed by its decimal representation.
	for _, field := range [...]uint64{uint64(port), uint64(protocol)} {
		buf.WriteByte(':')
		buf.WriteString(strconv.FormatUint(field, 10))
	}
}

// buildStringKey appends parts to buf, separated by ':'. Nothing is written when parts is empty.
// It is a memory allocation optimized version of `buf.WriteString(strings.Join(parts..., ":"))`.
// Benchmarks show 50% runtime and 50% memory allocation for this implementation when compared against `strings.Join`.
func buildStringKey(buf *strings.Builder, parts ...string) {
	if len(parts) == 0 {
		return
	}
	// The first part goes in bare; every following part is preceded by the delimiter.
	buf.WriteString(parts[0])
	for _, next := range parts[1:] {
		buf.WriteByte(':')
		buf.WriteString(next)
	}
}

func hashStrings(h hash.Hash64, strs ...string) {
for i, s := range strs {
if i > 0 {
_, _ = h.Write([]byte{0}) // Use null byte as delimiter to avoid hash collisions
_, _ = h.Write(hashDelimiter) // Use null byte as delimiter to avoid hash collisions
}
// Use zero-copy conversion from string to []byte using unsafe to avoid allocation.
// This is safe because:
// 1. h.Write() doesn't modify data (io.Writer contract)
// 2. xxhash doesn't retain references
// 3. string s remains alive during the call
if len(s) > 0 {
//#nosec G103 -- Audited: zero-copy string-to-bytes conversion for performance
b := unsafe.Slice(unsafe.StringData(s), len(s))
_, _ = h.Write(b)
}
_, _ = h.Write([]byte(s))
}
}

// Binary key generation methods for ContainerEndpoint

// binaryKeyHash hashes the entity ID, port and protocol of a ContainerEndpoint indicator with
// xxhash and returns the 64-bit sum as a BinaryHash.
// This is the memory-optimized variant: the hash is returned directly, without string conversion.
func (i *ContainerEndpoint) binaryKeyHash() BinaryHash {
	digest := xxhash.New()
	hashStrings(digest, i.Entity.ID)
	hashPortAndProtocol(digest, i.Port, i.Protocol)
	sum := digest.Sum64()
	return BinaryHash(sum)
}

// Binary key generation methods for ProcessListening

// binaryKeyHash hashes the process, container and endpoint fields of a ProcessListening indicator
// with xxhash and returns the 64-bit sum as a BinaryHash.
// This is the memory-optimized variant: the hash is returned directly, without string conversion.
func (i *ProcessListening) binaryKeyHash() BinaryHash {
	digest := xxhash.New()
	// Process and container identity, following `ProcessIndicatorUniqueKey`.
	hashStrings(
		digest,
		i.PodID,
		i.ContainerName,
		i.Process.ProcessName,
		i.Process.ProcessExec,
		i.Process.ProcessArgs,
	)
	// Endpoint identity, as in containerEndpoint.
	hashPortAndProtocol(digest, i.Port, i.Protocol)
	sum := digest.Sum64()
	return BinaryHash(sum)
}
Loading
Loading