Skip to content

Commit 468710b

Browse files
authored
Sanity Checks for DB (#44)
1 parent e358319 commit 468710b

8 files changed

Lines changed: 487 additions & 46 deletions

File tree

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,7 @@ message(STATUS "Binary name: ${NDD_BINARY_NAME}")
253253
# Add new src/*.cpp files here when they should be compiled into ndd.
254254
set(NDD_CORE_SOURCES
255255
src/sparse/inverted_index.cpp
256+
src/utils/system_sanity/system_sanity.cpp
256257
)
257258

258259
# Build non-main project sources separately so they can be compiled in parallel

docs/logs.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@ The same overload shapes apply to `LOG_WARN` and `LOG_ERROR`.
6868
## Rules
6969

7070
- Explicit numeric codes are preferred for stable operational logs.
71+
- Each numeric code must map to exactly one production log site. Do not reuse a code for a
72+
different message or path.
7173
- Code-less logs are valid and must never receive synthesized IDs.
7274
- Prefer logging at request boundaries, lifecycle transitions, and rare failure paths.
7375
- Do not add logs in hot loops or per-vector/per-result paths.
@@ -85,7 +87,7 @@ The same overload shapes apply to `LOG_WARN` and `LOG_ERROR`.
8587
- `1400s` WAL logs
8688
- `1500s` metadata logs
8789
- `1600s` vector storage logs
88-
- `1700s` CPU compatibility logs
90+
- `1700s` system sanity check logs (CPU compatibility, disk, memory, ulimits)
8991
- `2000s` index manager logs
9092
- `2100s` HNSW load/cache logs
9193

src/core/ndd.hpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -626,7 +626,7 @@ class IndexManager {
626626
LOG_INFO(2017, index_id, "Saving dirty index during shutdown");
627627
saveIndex(index_id);
628628
} catch(const std::exception& e) {
629-
LOG_ERROR(2017,
629+
LOG_ERROR(2015,
630630
index_id,
631631
"Failed to save index during shutdown: " << e.what());
632632
}
@@ -909,7 +909,7 @@ class IndexManager {
909909
std::shared_lock<std::shared_mutex> lock(indices_mutex_);
910910
auto it = indices_.find(index_id);
911911
if(it != indices_.end() && it->second && it->second->is_dirty) {
912-
LOG_INFO(2023, index_id, "Saving dirty index before reload");
912+
LOG_INFO(2055, index_id, "Saving dirty index before reload");
913913
saveIndex(index_id);
914914
}
915915
}
@@ -1167,7 +1167,7 @@ class IndexManager {
11671167
throw;
11681168
} catch(const std::exception& e) {
11691169
LOG_ERROR(2027, index_id, "Batch insertion failed: " << e.what());
1170-
return false;
1170+
throw std::runtime_error(std::string("Batch insertion failed: ") + e.what());
11711171
}
11721172
}
11731173

@@ -1983,7 +1983,7 @@ inline void IndexManager::executeBackupJob(const std::string& index_id, const st
19831983

19841984
// Check stop_token before expensive operations
19851985
if (st.stop_requested()) {
1986-
LOG_INFO(2046, index_id, "Backup cancelled");
1986+
LOG_INFO(2056, index_id, "Backup cancelled before backup work started");
19871987
backup_store_.clearActiveBackup(username);
19881988
return;
19891989
}
@@ -2004,7 +2004,7 @@ inline void IndexManager::executeBackupJob(const std::string& index_id, const st
20042004

20052005
// Check again after acquiring lock (shutdown may have been requested while waiting)
20062006
if (st.stop_requested()) {
2007-
LOG_INFO(2047, index_id, "Backup cancelled");
2007+
LOG_INFO(2057, index_id, "Backup cancelled");
20082008
backup_store_.clearActiveBackup(username);
20092009
return;
20102010
}

src/main.cpp

Lines changed: 38 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,7 @@
3737
#include "core/ndd.hpp"
3838
#include "auth.hpp"
3939
#include "quant/common.hpp"
40-
#include "cpu_compat_check/check_avx_compat.hpp"
41-
#include "cpu_compat_check/check_arm_compat.hpp"
40+
#include "system_sanity/system_sanity.hpp"
4241

4342
using ndd::quant::quantLevelToString;
4443
using ndd::quant::stringToQuantLevel;
@@ -142,32 +141,6 @@ inline nlohmann::ordered_json make_index_info_payload(const IndexInfo& info) {
142141
return payload;
143142
}
144143

145-
/**
146-
* Checks if the CPU is compatible with all
147-
* the instruction sets being used for x86, ARM and MAC Mxx
148-
*/
149-
bool is_cpu_compatible() {
150-
bool ret = true;
151-
152-
#if defined(USE_AVX2) && (defined(__x86_64__) || defined(_M_X64))
153-
ret &= is_avx2_compatible();
154-
#endif //AVX2 checks
155-
156-
#if defined(USE_AVX512) && (defined(__x86_64__) || defined(_M_X64))
157-
ret &= is_avx512_compatible();
158-
#endif //AVX512 checks
159-
160-
#if defined(USE_NEON)
161-
ret &= is_neon_compatible();
162-
#endif
163-
164-
#if defined(USE_SVE2)
165-
ret &= is_sve2_compatible();
166-
#endif
167-
168-
return ret;
169-
}
170-
171144
// Read file contents
172145
std::string read_file(const std::string& path) {
173146
std::ifstream file(path, std::ios::binary);
@@ -233,9 +206,9 @@ int main(int argc, char** argv) {
233206
return 1;
234207
}
235208

236-
if(!is_cpu_compatible()) {
237-
LOG_ERROR(1004, "CPU is not compatible; server startup aborted");
238-
return 0;
209+
if(!run_startup_sanity_checks()) {
210+
LOG_ERROR(1799, "Server startup aborted due to failed sanity checks");
211+
return 1;
239212
}
240213

241214
LOG_INFO("SERVER_ID: " << settings::SERVER_ID);
@@ -253,6 +226,7 @@ int main(int argc, char** argv) {
253226
LOG_INFO("DEFAULT_MAX_ELEMENTS_INCREMENT: " << settings::DEFAULT_MAX_ELEMENTS_INCREMENT);
254227
LOG_INFO("DEFAULT_MAX_ELEMENTS_INCREMENT_TRIGGER: "
255228
<< settings::DEFAULT_MAX_ELEMENTS_INCREMENT_TRIGGER);
229+
LOG_INFO("MINIMUM_REQUIRED_DRAM_MB: " << settings::MINIMUM_REQUIRED_DRAM_MB);
256230

257231
// Path to React build directory
258232
// Get the executable's directory and resolve frontend/dist relative to it
@@ -263,7 +237,6 @@ int main(int argc, char** argv) {
263237

264238
// Initialize index manager with persistence config
265239
std::string data_dir = settings::DATA_DIR;
266-
std::filesystem::create_directories(data_dir);
267240

268241
PersistenceConfig persistence_config{
269242
settings::SAVE_EVERY_N_UPDATES, // Save every n updates
@@ -282,10 +255,11 @@ int main(int argc, char** argv) {
282255

283256
// ========== GENERAL ==========
284257
// Health check endpoint (no auth required)
285-
CROW_ROUTE(app, "/api/v1/health").methods("GET"_method)([](const crow::request& req) {
258+
// CROW_ROUTE(app, "/api/v1/health").methods("GET"_method)([](const crow::request& req) {
259+
CROW_ROUTE(app, "/api/v1/health").methods("GET"_method)([]() {
286260
crow::json::wvalue response(
287261
{{"status", "ok"},
288-
{"timestamp", std::chrono::system_clock::now().time_since_epoch().count()}});
262+
{"timestamp", (std::int64_t)std::chrono::system_clock::now().time_since_epoch().count()}});
289263
PRINT_LOG_TIME();
290264
ndd::printSparseSearchDebugStats();
291265
ndd::printSparseUpdateDebugStats();
@@ -450,7 +424,7 @@ int main(int argc, char** argv) {
450424
body.has("sparse_model") ? std::string(body["sparse_model"].s()) : "None";
451425
const auto sparse_model = ndd::sparseScoringModelFromString(sparse_model_str);
452426
if(!sparse_model.has_value()) {
453-
LOG_WARN(1019, index_id, "Invalid sparse_model: " << sparse_model_str);
427+
LOG_WARN(1025, index_id, "Invalid sparse_model: " << sparse_model_str);
454428
return json_error(
455429
400,
456430
"Invalid sparse_model. Must be one of: None, default, endee_bm25");
@@ -470,7 +444,7 @@ int main(int argc, char** argv) {
470444
index_manager.createIndex(index_id, config, UserType::Admin, size_in_millions);
471445
return crow::response(200, "Index created successfully");
472446
} catch(const std::runtime_error& e) {
473-
LOG_WARN(1019, index_id, "Create-index request failed: " << e.what());
447+
LOG_WARN(1026, index_id, "Create-index request failed: " << e.what());
474448
return json_error(409, e.what());
475449
} catch(const std::exception& e) {
476450
return json_error_500(
@@ -936,6 +910,10 @@ int main(int argc, char** argv) {
936910
// Verify content type is application/msgpack or application/json
937911
auto content_type = req.get_header_value("Content-Type");
938912

913+
if(is_disk_full()){
914+
return json_error(400, "Batch insertion aborted: Not enough storage space");
915+
}
916+
939917
if(content_type == "application/json") {
940918
auto body = crow::json::load(req.body);
941919
if(!body) {
@@ -999,7 +977,14 @@ int main(int argc, char** argv) {
999977

1000978
try {
1001979
bool success = index_manager.addVectors(index_id, vectors);
1002-
return crow::response(success ? 200 : 400);
980+
if(!success) {
981+
LOG_WARN(1066,
982+
ctx.username,
983+
index_name,
984+
"Insert request failed without detailed error from addVectors");
985+
return json_error(400, "Batch insertion failed");
986+
}
987+
return crow::response(200);
1003988
} catch(const std::runtime_error& e) {
1004989
LOG_WARN(1041, ctx.username, index_name, "Insert request rejected: " << e.what());
1005990
return json_error(400, e.what());
@@ -1017,13 +1002,27 @@ int main(int argc, char** argv) {
10171002
auto vectors = obj.as<std::vector<ndd::HybridVectorObject>>();
10181003
LOG_DEBUG("Batch size (Hybrid): " << vectors.size());
10191004
bool success = index_manager.addVectors(index_id, vectors);
1020-
return crow::response(success ? 200 : 400);
1005+
if(!success) {
1006+
LOG_WARN(1067,
1007+
ctx.username,
1008+
index_name,
1009+
"Insert request failed without detailed error from addVectors");
1010+
return json_error(400, "Batch insertion failed");
1011+
}
1012+
return crow::response(200);
10211013
} catch(...) {
10221014
// Fallback to VectorObject
10231015
auto vectors = obj.as<std::vector<ndd::VectorObject>>();
10241016
LOG_DEBUG("Batch size (Dense): " << vectors.size());
10251017
bool success = index_manager.addVectors(index_id, vectors);
1026-
return crow::response(success ? 200 : 400);
1018+
if(!success) {
1019+
LOG_WARN(1068,
1020+
ctx.username,
1021+
index_name,
1022+
"Insert request failed without detailed error from addVectors");
1023+
return json_error(400, "Batch insertion failed");
1024+
}
1025+
return crow::response(200);
10271026
}
10281027
} catch(const std::runtime_error& e) {
10291028
LOG_WARN(1042, ctx.username, index_name, "Insert request rejected: " << e.what());

src/storage/vector_storage.hpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,9 @@ class VectorStore {
280280
MDBX_val data{const_cast<uint8_t*>(vector_bytes.data()), vector_bytes.size()};
281281

282282
int rc = mdbx_put(txn, dbi_, &key, &data, MDBX_UPSERT);
283+
if(rc != MDBX_SUCCESS) {
284+
return rc;
285+
}
283286
}
284287
return MDBX_SUCCESS;
285288
};
@@ -449,7 +452,7 @@ class MetaStore {
449452
}
450453
};
451454

452-
auto write_batch = [&](MDBX_txn* txn) {
455+
auto write_batch = [&](MDBX_txn* txn) -> int {
453456
for(const auto& [numeric_id, meta] : batch) {
454457
msgpack::sbuffer sbuf;
455458
msgpack::pack(sbuf, meta);
@@ -459,6 +462,7 @@ class MetaStore {
459462

460463
int rc = mdbx_put(txn, dbi_, &key, &data, MDBX_UPSERT);
461464
if(rc != MDBX_SUCCESS) {
465+
return rc;
462466
}
463467
}
464468
return MDBX_SUCCESS;

src/utils/settings.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,15 @@ namespace settings {
7070
// Maximum number of elements in the index
7171
constexpr size_t MAX_VECTORS_ADMIN = 1'000'000'000;
7272

73+
74+
// Minimum number of bytes required in the filesystem; below this the out-of-storage sequence is triggered
75+
constexpr size_t MINIMUM_REQUIRED_FS_BYTES = (10 * GB);
76+
77+
// System sanity check thresholds
78+
constexpr size_t DEFAULT_MINIMUM_REQUIRED_DRAM_MB = (4 * 1024); // 4 GB, expressed in MB
79+
constexpr size_t MINIMUM_OPEN_FILES = 5000;
80+
constexpr size_t DEFAULT_MINIMUM_CPU_CORES = 2;
81+
7382
// Buffer for early exit in search base layer
7483
constexpr int EARLY_EXIT_BUFFER_INSERT = 16;
7584
constexpr int EARLY_EXIT_BUFFER_QUERY = 8;
@@ -192,6 +201,12 @@ namespace settings {
192201
return env ? std::stoull(env) : DEFAULT_NUM_RECOVERY_THREADS;
193202
}();
194203

204+
// Minimum available DRAM in MB (configurable via NDD_MIN_DRAM_MB)
205+
inline static size_t MINIMUM_REQUIRED_DRAM_MB = [] {
206+
const char* env = std::getenv("NDD_MIN_DRAM_MB");
207+
return env ? std::stoull(env) : DEFAULT_MINIMUM_REQUIRED_DRAM_MB;
208+
}();
209+
195210
inline static bool ENABLE_DEBUG_LOG = [] {
196211
const char* env = std::getenv("NDD_DEBUG_LOG");
197212
return env ? (std::string(env) == "1" || std::string(env) == "true")
@@ -384,6 +399,8 @@ namespace settings {
384399
oss << "ENABLE_DEBUG_LOG: " << (ENABLE_DEBUG_LOG ? "true" : "false") << "\n";
385400
oss << "AUTH_ENABLED: " << (AUTH_ENABLED ? "true" : "false") << "\n";
386401
oss << "DEFAULT_USERNAME: " << DEFAULT_USERNAME << "\n";
402+
oss << "MINIMUM_REQUIRED_DRAM_MB: " << MINIMUM_REQUIRED_DRAM_MB << "\n";
403+
oss << "MINIMUM_OPEN_FILES: " << MINIMUM_OPEN_FILES << "\n";
387404
oss << "\n=== MDBX Map Sizes (bit shifts) ===\n";
388405
oss << "INDEX_META_MAP_SIZE_BITS: " << INDEX_META_MAP_SIZE_BITS << "\n";
389406
oss << "INDEX_META_MAP_SIZE_MAX_BITS: " << INDEX_META_MAP_SIZE_MAX_BITS << "\n";

0 commit comments

Comments
 (0)