Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog/69073.fixed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
``LoadAuth.get_tok`` now distinguishes between corrupt token blobs (removed from the store) and transient backend errors such as Redis connection drops or NFS hangs (token kept, request treated as not-authenticated). Previously a single backend hiccup could log every authenticated user out by deleting valid tokens.
59 changes: 43 additions & 16 deletions salt/auth/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,30 +239,57 @@ def get_tok(self, tok):
Return the name associated with the token, or False if the token is
not valid
"""
tdata = {}
try:
tdata = self.tokens["{}.get_token".format(self.opts["eauth_tokens"])](
self.opts, tok
)
except salt.exceptions.SaltDeserializationError:
log.warning("Failed to load token %r - removing broken/empty file.", tok)
rm_tok = True
else:
if not tdata:
return {}
rm_tok = False
except salt.exceptions.SaltDeserializationError as exc:
# The on-disk / in-store token blob is corrupt and cannot
# be parsed. Removing it is the right call -- a corrupt
# token can never authenticate anyway, and leaving it
# around makes every subsequent ``get_tok`` for the same
# id keep failing. ``%r`` on the exception gives the
# operator the class and message inline (e.g. msgpack
# format error, truncated file) without spamming a full
# traceback into a hot-path WARNING; the full traceback is
# available via the companion ``log.debug`` for deeper
# investigation.
log.warning(
"Token %r could not be deserialized (%r); removing it from the store.",
tok,
exc,
)
log.debug("Token deserialization traceback:", exc_info=True)
self.rm_token(tok)
return {}
except OSError as exc:
# Transient backend error (Redis connection blip, NFS hang,
# hung disk). The token itself is fine; do NOT delete it --
# that would log every authenticated user out on every
# backend hiccup. Return an empty dict so the caller treats
# this request as not-authenticated; the next request will
# retry against the backend and succeed once it recovers.
# Same logging pattern as above -- exception class + message
# at WARNING, full traceback at DEBUG so a flapping deploy
# stays diagnoseable without GB/hour of stack frames.
log.warning(
"Token store transient error reading %r (%r); treating as "
"not-authenticated for this request without removing the "
"token from the store.",
tok,
exc,
)
log.debug("Token store transient-error traceback:", exc_info=True)
return {}

if not tdata:
return {}
if tdata.get("expire", 0) < time.time():
# If expire isn't present in the token it's invalid and needs
# to be removed. Also, if it's present and has expired - in
# other words, the expiration is before right now, it should
# be removed.
rm_tok = True

if rm_tok:
# Expired token: drop it from the store. ``expire`` defaults
# to 0 if missing, so a malformed-but-deserializable token
# without an ``expire`` key falls into this branch too.
self.rm_token(tok)
return {}

return tdata

def list_tokens(self):
Expand Down
101 changes: 101 additions & 0 deletions tests/pytests/unit/auth/test_auth.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import salt.auth
import salt.config
import salt.exceptions
from tests.support.mock import MagicMock


def test_cve_2021_3244(tmp_path):
Expand Down Expand Up @@ -31,3 +33,102 @@ def test_cve_2021_3244(tmp_path):
t_data = auth.get_tok(t_data["token"])
assert not t_data
assert not token_file.exists()


# ---------------------------------------------------------------------------
# ``LoadAuth.get_tok`` exception handling.
#
# A backend read failure must be classified by *cause*, not collapsed to
# "delete the token":
#
# * ``SaltDeserializationError`` — the stored blob cannot be parsed. The
# token is permanently unusable; remove it so subsequent reads do not
# keep failing on the same corrupt entry.
# * ``OSError`` / ``IOError`` — transient (Redis connection drop, NFS
# hang, full disk). The token itself is fine. Returning ``{}`` while
# leaving the token in place makes this request not-authenticated and
# lets the next request retry once the backend recovers. **Deleting on
# transient I/O errors logs every authenticated user out on every
# backend hiccup**, which is the regression these tests guard against.
# * Expired (deserialized successfully but past ``expire``) — remove.
# ---------------------------------------------------------------------------


def _make_auth(tmp_path, get_token_side_effect):
"""Build a ``LoadAuth`` whose ``localfs.get_token`` is replaced by
``get_token_side_effect``. ``rm_token`` is wrapped with a
``MagicMock`` so the test can assert on whether the production code
chose to delete the token. The backing token directory is real so
``rm_token`` would not blow up if it were called -- we are checking
*intent*, not state."""
token_dir = tmp_path / "tokens"
token_dir.mkdir()
opts = {
"extension_modules": "",
"optimization_order": [0, 1, 2],
"token_expire": 60,
"keep_acl_in_token": False,
"eauth_tokens": "localfs",
"token_dir": str(token_dir),
"token_expire_user_override": False,
"external_auth": {"auto": {"foo": []}},
}
auth = salt.auth.LoadAuth(opts)

if callable(get_token_side_effect):
auth.tokens["localfs.get_token"] = get_token_side_effect
else:
auth.tokens["localfs.get_token"] = MagicMock(side_effect=get_token_side_effect)

auth.tokens["localfs.rm_token"] = MagicMock()
return auth


def test_get_tok_returns_empty_and_keeps_token_on_io_error(tmp_path):
"""Headline regression: a transient backend error (e.g. Redis
connection drop) must NOT cause the token to be deleted. The
previous implementation either propagated the exception or deleted
the token -- both wrong. Correct behaviour is to return ``{}`` and
leave the token alone so the next request can retry."""
auth = _make_auth(tmp_path, OSError("redis connection reset"))

result = auth.get_tok("any-token-id")

assert result == {}
auth.tokens["localfs.rm_token"].assert_not_called()


def test_get_tok_removes_token_on_deserialization_error(tmp_path):
"""A corrupt token blob is permanently unusable; removing it is
correct because every subsequent read would fail the same way."""
auth = _make_auth(
tmp_path,
salt.exceptions.SaltDeserializationError("bad msgpack"),
)

result = auth.get_tok("corrupt-token-id")

assert result == {}
auth.tokens["localfs.rm_token"].assert_called_once_with(
auth.opts, "corrupt-token-id"
)


def test_get_tok_removes_expired_token(tmp_path):
"""Expired tokens are deserializable but past their ``expire``
timestamp. They must be removed so the store does not accumulate
dead entries."""
expired_blob = {
"token": "expired-token-id",
"expire": time.time() - 60,
"name": "foo",
"eauth": "auto",
}
auth = _make_auth(tmp_path, lambda opts, tok: expired_blob)

result = auth.get_tok("expired-token-id")

assert result == {}
auth.tokens["localfs.rm_token"].assert_called_once_with(
auth.opts, "expired-token-id"
)
Loading