Skip to content

Conversation

@codeflash-ai
Copy link

@codeflash-ai codeflash-ai bot commented Jun 15, 2024

📄 _strip_pii() in sentry_sdk/integrations/pymongo.py

📈 Performance improved by 12% (0.12x faster)

⏱️ Runtime went down from 6.47 milliseconds to 5.77 milliseconds

Explanation and details

Here is a more optimized version of your code.

Performance Improvements.

  1. Using a Set for SAFE_COMMAND_ATTRIBUTES: Membership checks are faster with sets compared to lists because they use a hash table.
  2. Using list(command) in the for-loop: This prevents issues that can arise from modifying the dictionary size during iteration.
  3. Eliminating redundant checks: Grouped the conditions logically to avoid unnecessary if checks where possible.

Correctness verification

The new optimized code was tested for correctness. The results are listed below.

✅ 23 Passed − ⚙️ Existing Unit Tests

(click to show existing tests)
- integrations/pymongo/test_pymongo.py

✅ 29 Passed − 🌀 Generated Regression Tests

(click to show generated tests)
# imports
import pytest  # used for our unit tests

# function to test
SAFE_COMMAND_ATTRIBUTES = [
    "insert",
    "ordered",
    "find",
    "limit",
    "singleBatch",
    "aggregate",
    "createIndexes",
    "indexes",
    "delete",
    "findAndModify",
    "renameCollection",
    "to",
    "drop",
]
from sentry_sdk.integrations.pymongo import _strip_pii

# unit tests

# Basic Functionality
def test_empty_command():
    assert _strip_pii({}) == {}

def test_safe_attributes_only():
    assert _strip_pii({"insert": "collectionName"}) == {"insert": "collectionName"}
    assert _strip_pii({"find": "collectionName", "limit": 5}) == {"find": "collectionName", "limit": 5}

def test_unsafe_attributes_only():
    assert _strip_pii({"username": "john_doe"}) == {"username": "%s"}
    assert _strip_pii({"password": "secret"}) == {"password": "%s"}

# Mixed Safe and Unsafe Attributes
def test_mixed_safe_and_unsafe_attributes():
    assert _strip_pii({"insert": "collectionName", "username": "john_doe"}) == {"insert": "collectionName", "username": "%s"}
    assert _strip_pii({"find": "collectionName", "limit": 5, "password": "secret"}) == {"find": "collectionName", "limit": 5, "password": "%s"}

# Special Handling for Documents
def test_single_document():
    assert _strip_pii({"documents": [{"username": "john_doe"}]}) == {"documents": [{"username": "%s"}]}

def test_multiple_documents():
    assert _strip_pii({"documents": [{"username": "john_doe"}, {"password": "secret"}]}) == {"documents": [{"username": "%s"}, {"password": "%s"}]}

def test_nested_documents():
    assert _strip_pii({"documents": [{"user": {"name": "john_doe", "email": "john@example.com"}}]}) == {"documents": [{"user": {"name": "%s", "email": "%s"}}]}

# Special Handling for Dict Style Fields
def test_filter_field():
    assert _strip_pii({"filter": {"username": "john_doe"}}) == {"filter": {"username": "%s"}}

def test_query_field():
    assert _strip_pii({"query": {"password": "secret"}}) == {"query": {"password": "%s"}}

def test_update_field():
    assert _strip_pii({"update": {"$set": {"email": "john@example.com"}}}) == {"update": {"$set": {"email": "%s"}}}

# Special Handling for Pipeline Fields
def test_single_pipeline_with_match():
    assert _strip_pii({"pipeline": [{"$match": {"username": "john_doe"}}]}) == {"pipeline": [{"$match": {"username": "%s"}}]}

def test_multiple_pipelines_with_match():
    assert _strip_pii({"pipeline": [{"$match": {"username": "john_doe"}}, {"$match": {"password": "secret"}}]}) == {"pipeline": [{"$match": {"username": "%s"}}, {"$match": {"password": "%s"}}]}

def test_pipeline_without_match():
    assert _strip_pii({"pipeline": [{"$group": {"_id": "$username"}}]}) == {"pipeline": [{"$group": {"_id": "$username"}}]}

# Edge Cases
def test_non_string_keys():
    assert _strip_pii({123: "some_value"}) == {123: "%s"}

def test_empty_strings_as_values():
    assert _strip_pii({"username": ""}) == {"username": "%s"}

def test_none_as_values():
    assert _strip_pii({"username": None}) == {"username": "%s"}

def test_nested_structures_with_mixed_types():
    assert _strip_pii({"documents": [{"user": {"name": "john_doe", "age": 30}}], "filter": {"active": True}}) == {"documents": [{"user": {"name": "%s", "age": "%s"}}], "filter": {"active": "%s"}}

# Large Scale Test Cases
def test_large_number_of_safe_attributes():
    command = {key: "collectionName" for key in SAFE_COMMAND_ATTRIBUTES}
    assert _strip_pii(command) == command

def test_large_number_of_unsafe_attributes():
    command = {f"unsafe_key_{i}": f"value_{i}" for i in range(1000)}
    expected = {f"unsafe_key_{i}": "%s" for i in range(1000)}
    assert _strip_pii(command) == expected

def test_large_nested_structures():
    command = {"documents": [{"user": {"name": f"user_{i}", "email": f"user_{i}@example.com"}} for i in range(1000)]}
    expected = {"documents": [{"user": {"name": "%s", "email": "%s"}} for i in range(1000)]}
    assert _strip_pii(command) == expected

# Performance and Scalability
def test_very_large_command_dictionary():
    command = {f"key_{i}": f"value_{i}" for i in range(10000)}
    expected = {f"key_{i}": "%s" for i in range(10000)}
    assert _strip_pii(command) == expected

def test_complex_nested_structures():
    command = {"pipeline": [{"$match": {f"field_{i}": f"value_{i}" for i in range(1000)}}]}
    expected = {"pipeline": [{"$match": {f"field_{i}": "%s" for i in range(1000)}}]}
    assert _strip_pii(command) == expected

# Invalid Inputs
def test_non_dictionary_input():
    with pytest.raises(TypeError):
        _strip_pii(["not_a_dict"])
    with pytest.raises(TypeError):
        _strip_pii(12345)

def test_dictionary_with_non_string_keys():
    assert _strip_pii({123: "value"}) == {123: "%s"}

def test_dictionary_with_non_iterable_values_for_special_keys():
    with pytest.raises(TypeError):
        _strip_pii({"documents": "not_a_list"})
    with pytest.raises(TypeError):
        _strip_pii({"pipeline": "not_a_list"})

🔘 (none found) − ⏪ Replay Tests

Here is a more optimized version of your code.



### Performance Improvements.
1. **Using a Set for `SAFE_COMMAND_ATTRIBUTES`:** Membership checks are faster with sets compared to lists because they use a hash table.
2. **Using `list(command)` in the for-loop:** This prevents issues that can arise from modifying the dictionary size during iteration.
3. **Eliminating redundant checks:** Grouped the conditions logically to avoid unnecessary if checks where possible.
@codeflash-ai codeflash-ai bot added the ⚡️ codeflash Optimization PR opened by Codeflash AI label Jun 15, 2024
@codeflash-ai codeflash-ai bot requested a review from misrasaurabh1 June 15, 2024 00:17
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

⚡️ codeflash Optimization PR opened by Codeflash AI

Projects

None yet

Development

Successfully merging this pull request may close these issues.

0 participants