-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Expand file tree
/
Copy pathcode_verify_rules.py
More file actions
3120 lines (2861 loc) · 127 KB
/
code_verify_rules.py
File metadata and controls
3120 lines (2861 loc) · 127 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""Static-analysis rules for scripts/code-verify.py.
Adds CLAUDE.md-derived semantic checks on top of the formatter's existing
style rules. C++ rules use tree-sitter's C++ grammar to walk a real AST;
QML rules stay line-based on top of the tokenizer that already lives in
code-verify.py.
Each rule returns a list of (line, kind, message) tuples. The driver in
code-verify.py wraps them as Violations and routes them through the
existing flag-only / auto-fixable pipeline. Every rule here is flag-only.
Tree-sitter is the only new dependency. The module degrades gracefully:
when tree-sitter or tree-sitter-cpp aren't importable, C++ semantic
checks are silently skipped and the formatter still runs its line-based
rules. The CI install pins both in tests/requirements.txt.
`code-verify off / on` fences mask every rule here too, same as the
existing rules — the driver passes the fence mask in.
"""
from __future__ import annotations
import re
from dataclasses import dataclass
from pathlib import Path
# Optional tree-sitter bootstrap. Any failure (missing package, or an
# exception raised while building the language / parser objects) leaves both
# handles as None so the C++ semantic checks can be skipped.
try:
    import tree_sitter
    import tree_sitter_cpp

    _CPP_LANG = tree_sitter.Language(tree_sitter_cpp.language())
    _CPP_PARSER = tree_sitter.Parser(_CPP_LANG)
except Exception:
    _CPP_LANG = None
    _CPP_PARSER = None

# True only when both the language and the parser were built successfully.
HAS_TREE_SITTER = _CPP_PARSER is not None
# ---------------------------------------------------------------------------
# Public types
# ---------------------------------------------------------------------------
@dataclass(frozen=True)
class Finding:
    """One rule hit: where it was found, which rule fired, and the advice text.

    Instances are immutable (frozen dataclass) and compare by field value.
    """

    # Line number the finding is reported on (the rules in this module
    # compute it as a 1-based source line).
    line: int
    # Rule identifier, e.g. "perf-pow-call".
    kind: str
    # Human-readable explanation shown to the user.
    message: str
# ---------------------------------------------------------------------------
# Hotpath method names (CLAUDE.md: never allocate on the dashboard path)
# ---------------------------------------------------------------------------
# Methods named here are walked for new/make_shared/append calls. The names
# come straight from CLAUDE.md's "Threading Rules" / "Hotpath" sections.
#
# Entries are bare (unqualified) method names: `_recursion_findings` tests
# membership with the plain function name, and `_VIRTUAL_HOTPATH_RE` is
# assembled from this set as a regex alternation.
_HOTPATH_METHODS = frozenset(
{
"hotpathRxFrame",
"hotpathRxSourceFrame",
"processData",
"onReadyRead",
"onFrameReady",
"onRawDataReceived",
"appendChunk",
"frameTimestamp",
"applyTransform",
"parseProjectFrame",
"updateData",
"updateLineSeries",
"pushSample",
}
)
# Calls / patterns banned on the hotpath. Each entry is (regex, message).
# Note the shape differs from the perf tables in this module, which carry
# (regex, kind, message) triples. The consumer of this list is outside the
# portion of the file visible here.
_HOTPATH_BANNED_CALLS = [
(re.compile(r"\bnew\s+[A-Za-z_]"), "`new` allocation on hotpath"),
(re.compile(r"\bstd::make_shared\b"), "`std::make_shared` allocation on hotpath"),
(re.compile(r"\bstd::make_unique\b"), "`std::make_unique` allocation on hotpath"),
(re.compile(r"\.append\("), "`.append(` (likely Qt container resize) on hotpath"),
(re.compile(r"\.push_back\("), "`.push_back(` on hotpath (pre-reserve at init)"),
(re.compile(r"\bemit\b"), "bare `emit` on hotpath -- use `Q_EMIT`"),
]
# ---------------------------------------------------------------------------
# CPU-microarchitecture / performance rules
# ---------------------------------------------------------------------------
#
# These rules apply knowledge of how compiled C++ behaves at the assembly /
# register / branch-predictor / cache level. The cycle counts in the rule
# messages are representative for current Intel (Skylake-derived) and ARM
# (Cortex-A7x/A78) microarchitectures; exact numbers vary with the target.
# All rules ship as advisory -- the goal is a checklist for a follow-up
# human / LLM pass, not a CI gate.
# Heavy types -- known to be expensive to copy by value. Even implicitly
# shared Qt containers (QString/QByteArray/QList/...) pay an atomic refcount
# bump on the COW pointer, which is a `lock`-prefix instruction on x86 or an
# `ldxr/stxr` loop on ARM without LSE. std:: containers do a full deep copy.
#
# Entries are bare type names without template arguments:
# `_parameter_perf_findings` cuts the parameter's type text at the first `<`
# or space before testing membership, so `QList<int>` is looked up as `QList`.
_HEAVY_TYPES = frozenset(
{
"QString",
"QByteArray",
"QStringList",
"QVariant",
"QVariantMap",
"QVariantList",
"QVariantHash",
"QList",
"QVector",
"QMap",
"QHash",
"QSet",
"QQueue",
"QStack",
"QJsonObject",
"QJsonArray",
"QJsonDocument",
"QJsonValue",
"QImage",
"QPixmap",
"QPolygon",
"QPolygonF",
"QPainterPath",
"QBitArray",
"QDateTime",
"std::string",
"std::wstring",
"std::vector",
"std::map",
"std::unordered_map",
"std::list",
"std::deque",
"std::set",
"std::unordered_set",
"std::multimap",
"std::unordered_multimap",
}
)
# Reference-counted smart-pointer templates. `_parameter_perf_findings`
# reports a by-value parameter whose base type (template arguments stripped)
# appears here as `perf-shared-ptr-by-value`; it is checked only after the
# `_HEAVY_TYPES` lookup misses.
_REFCOUNTED_TYPES = frozenset(
{
"std::shared_ptr",
"QSharedPointer",
"QSharedDataPointer",
"QExplicitlySharedDataPointer",
"boost::shared_ptr",
}
)
# File-wide perf patterns: scanned inside every function body, not just
# hotpath methods. Cost matters everywhere these appear; the user can
# wrap a region in `// code-verify off` when the slow path is intentional
# (init code that builds a regex once, error path that throws, etc.).
#
# Each entry is a `(compiled regex, kind, message)` triple -- the shape
# `_scan_body_lines` unpacks. Order matters there: the first pattern that
# matches a line wins and later entries are not tried for that line.
_PERF_BODY_PATTERNS = [
# `/ <floating-literal>` -- compilers do NOT fold `a / 2.5` to
# `a * 0.4` without `-ffast-math` (would lose 1 ULP for non-exact
# reciprocals). Multiplying by a precomputed reciprocal is ~3 cyc
# vs ~12-22 cyc for divsd.
# The literal must contain a decimal point: `x / 2` and `x / 1e5` are not
# matched by this pattern. The lookbehind rejects a `/` that directly
# follows another operator character (e.g. the second slash of `//`, `*/`).
(
re.compile(
r"(?<![*/=<>!&|^])/\s*(?:\d+\.\d*|\.\d+|\d+\.\d*[eE][+-]?\d+)" r"[fFlL]?"
),
"perf-divide-by-float-literal",
"`/` with a floating-point literal -- compilers don't fold to "
"reciprocal multiply (would lose IEEE accuracy without -ffast-math). "
"Precompute `constexpr double kInvX = 1.0 / X;` and multiply (~3 cyc "
"mulsd vs ~12-22 cyc divsd).",
),
# `pow(x, N)` -- libm transcendental, goes through `exp(log(x) * y)`.
# 40+ cyc on Intel, similar on ARM. Caller-saved FPU/SIMD state gets
# clobbered too.
(
re.compile(r"\b(?:std::)?pow\s*\("),
"perf-pow-call",
"`pow(...)` -- libm transcendental via `exp(log(x) * y)` (40+ cyc on "
"Intel/ARM) and clobbers caller-saved FPU/SIMD state. For small "
"integer exponents write the multiply (`x*x`, `x*x*x`); for "
"`pow(x, 0.5)` use `std::sqrt(x)`; for `pow(2.0, n)` use "
"`std::ldexp(1.0, n)` (single mantissa-shift insn).",
),
# `dynamic_cast<T>` -- walks the inheritance graph via RTTI typeinfo
# string comparisons; 50-200+ cyc worst case and a function call.
(
re.compile(r"\bdynamic_cast\s*<"),
"perf-dynamic-cast",
"`dynamic_cast<...>` -- walks the inheritance graph via RTTI typeinfo "
"string compares (50-200+ cyc worst case, runtime call). Use a "
"discriminating enum + `static_cast`, or pre-resolve the cast once "
"(store the typed pointer at object init).",
),
# malloc / free family -- same arena-mutex cost as `new`/`delete`,
# just less visible. Both Linux glibc and Windows HeapAlloc serialize
# on a per-arena mutex; on contended workloads this is a real cost.
(
re.compile(
r"\b(?:malloc|calloc|realloc|free|aligned_alloc|posix_memalign)" r"\s*\("
),
"perf-malloc-family",
"C heap call -- malloc/free contend on a per-arena mutex (glibc, "
"RtlHeap) and aren't pipelineable. In a hot loop, reuse a "
"pre-reserved buffer or a small-object pool.",
),
# `QRegularExpression(...)` constructor -- compiles the regex to a
# state machine, heap-allocates capture tables. If invoked in a loop,
# the regex gets recompiled every iteration.
# The trailing `[^)]` means an empty-argument `QRegularExpression()` is
# not matched.
(
re.compile(r"\bQRegularExpression\s*\([^)]"),
"perf-regex-construct",
"`QRegularExpression(...)` constructor -- compiles a DFA/NFA state "
"machine and heap-allocates capture state. Build the regex once "
"(file-scope `static const`, or a class member init) and reuse the "
"`.match(...)` path each iteration.",
),
# `.arg(...).arg(...)` chains -- each call returns a new QString
# (heap alloc + copy). Two .arg()s = two allocs. Pass all args in
# one call (`s.arg(a, b, c)`) or use QStringBuilder (`%` operator
# with `<QStringBuilder>` included).
# `[^()]*` only matches when the first `.arg(...)` has no nested
# parentheses in its argument list.
(
re.compile(r"\.arg\s*\([^()]*\)\s*\.arg\s*\("),
"perf-arg-chain",
"`.arg(...).arg(...)` chain -- each call allocates a fresh QString "
"(heap + memcpy). Combine into one call (`.arg(a, b, c)`) or include "
"`<QStringBuilder>` and use the `%` operator (single allocation, "
"sized exactly).",
),
]
# Hotpath-only perf patterns: too noisy to flag file-wide in this codebase
# (qDebug and QString allocation are pervasive in setup/teardown/error
# paths and aren't wrong there). The hotpath methods listed in
# `_HOTPATH_METHODS` run at kHz+ rates -- THAT'S where the cost bites.
#
# Each entry is a `(compiled regex, kind, message)` triple, the shape
# `_scan_body_lines` unpacks (first match per line wins). Two kinds get
# extra handling there: `perf-divide-runtime-divisor` is skipped on
# reciprocal-cache lines, and both it and `perf-modulo-runtime-divisor` are
# skipped when every divisor on the line is a known compile-time name.
_HOTPATH_PERF_PATTERNS = [
# QString / QByteArray construction with a literal -- each call hits
# the heap allocator (malloc on Linux, RtlAllocateHeap on Windows),
# contended on the arena mutex, not pipelineable. Cache the result
# at init or hoist into a file-scope `static const`.
#
# `QStringLiteral("...")` is deliberately NOT flagged: by design it
# constant-folds into a static read-only QString with zero heap touch
# (that's why Qt has it). The other entries are the genuine heap-
# allocating constructors/conversions.
(
re.compile(
r"\bQString\s*\(\s*[\"R]"
r"|\bQByteArray\s*\(\s*[\"R]"
r"|\.toUtf8\s*\(\s*\)"
r"|\.toStdString\s*\(\s*\)"
r"|\.toLatin1\s*\(\s*\)"
r"|\.toLocal8Bit\s*\(\s*\)"
r"|\bQString::fromUtf8\s*\("
r"|\bQString::fromLatin1\s*\("
),
"perf-string-alloc-hotpath",
"string construction/conversion on the hotpath -- heap allocation + "
"memcpy. malloc contends on a per-arena mutex; the new buffer "
"pollutes L1 (32-48 KB). Cache the QString at init, or use a "
"fixed stack buffer for transient formatting.",
),
# qDebug / qWarning -- builds a QDebug stream object, takes the global
# message-handler mutex, formats and writes. Even filtered-out
# categories pay the format cost because `<<` is eager. Hundreds of
# cycles minimum per call; thousands when the handler dispatches to
# a Console widget that re-enters the event loop.
(
re.compile(r"\bq(?:Debug|Info|Warning|Critical|Fatal)\s*\("),
"perf-log-on-hotpath",
"Qt logging call on the hotpath -- builds a QDebug stream, takes "
"the global message-handler mutex, formats and writes. `<<` is "
"eager: even filtered-out categories pay the format cost. Gate "
"behind `#ifdef SERIAL_STUDIO_DEBUG` or move to a sampled counter.",
),
# `throw` on the hotpath -- exception throw runs the personality
# routine, walks DWARF / SEH unwind tables (1000s of cycles per
# frame), mispredicts every catch on the way out, trashes the
# return-address stack. `noexcept` callers crash hard.
# The pattern requires a word character after `throw`, so a bare
# re-throw (`throw;`) is not matched.
(
re.compile(r"\bthrow\s+\w"),
"perf-throw-on-hotpath",
"`throw` on the hotpath -- stack unwinding via DWARF/SEH personality "
"routines (1000s of cycles), mispredicts every catch frame, trashes "
"the return-address stack predictor. Return an error code, an "
"`std::expected`-style variant, or a sentinel value instead.",
),
# Mutex / lock-guard acquisition on the hotpath -- ~20 cyc lock-prefix
# RMW on x86, ldaxr+stxr+DMB on ARM, serializes the store buffer, and
# contended bouncing thrashes the L1 line. Outside the kHz frame path
# the cost is irrelevant; locks are the right answer for once-per-event
# state mutation. Only flag inside known-hot methods.
(
re.compile(
r"\b(?:QMutexLocker|QReadLocker|QWriteLocker|QRecursiveMutex"
r"|std::lock_guard|std::unique_lock|std::scoped_lock"
r"|std::shared_lock)\b"
),
"perf-lock-acquire",
"lock acquisition on the hotpath -- atomic RMW with full memory "
"barrier (~20 cyc x86 `lock`-prefix, ldaxr+stxr+DMB on ARM), "
"serializes the store buffer; contended bouncing thrashes the L1 "
"line. Prefer thread-local / SPSC / per-core state, or a relaxed "
"`std::atomic` when the invariant fits a single word.",
),
# Bare mutex.lock() / lockForRead() calls -- same physical cost.
# Only the `obj.method(` form is matched; calls through `->` are not.
(
re.compile(r"\b\w+\.(?:lock|try_lock|lockForRead|lockForWrite|tryLock)\s*\("),
"perf-lock-acquire",
"explicit `.lock()`/`.try_lock()`/`.lockForRead()` call on the "
"hotpath -- same `lock`-prefix RMW cost as the locker types.",
),
# Integer / float division by a non-literal divisor on the hotpath.
# `idiv`/`udiv` is the slowest ALU op (20-40 cyc Skylake/Zen, not
# pipelined; 12-40 cyc Cortex-A78). When the divisor is constexpr the
# compiler emits a magic-number multiply; the hotpath cost only bites
# when the divisor is a true runtime variable. `sizeof(...)` is
# compile-time and skipped via lookahead. Reciprocal-cache lines
# (`auto inv = 1.0 / x`) are skipped via _is_reciprocal_cache_line.
(
re.compile(r"(?<![*/=<>!&|^])/\s*(?!/)(?!sizeof\b)[A-Za-z_]\w*"),
"perf-divide-runtime-divisor",
"`/` with a non-literal divisor on the hotpath -- division is the "
"slowest ALU op (divsd ~11-22 cyc Skylake, fdiv ~10-40 cyc Cortex-A78; "
"idiv 20-40 cyc, not pipelined). Cache the reciprocal once "
"(`r = 1.0 / d`) and multiply in the loop, or use a bit-shift for "
"power-of-two integer cases.",
),
# Modulo by a non-literal divisor on the hotpath. Same idiv cost as
# integer divide; power-of-two N can be replaced with `& (N - 1)`.
(
re.compile(r"(?<![%=*/+\-<>!&|^])%\s*[A-Za-z_]\w*"),
"perf-modulo-runtime-divisor",
"`%` with a non-literal divisor on the hotpath -- emits `idiv`/`udiv` "
"(20-40 cyc x86, 12-40 cyc ARM). For power-of-two N use `& (N - 1)` "
"(single-cycle `and`); for runtime divisors hoist out of the loop or "
"use a libdivide-style precomputed magic-number multiply.",
),
]
# Header line-pattern: a hotpath method declared `virtual`. Every call
# site emits an indirect branch through the vtable; the predictor learns
# monomorphic sites but can't inline, and polymorphic sites mispredict
# (15-20 cycle bubble on x86, similar on ARM). `final` partially helps
# when the dynamic type is known.
#
# Capture group 1 is the matched method name. `[^;{]*` keeps the match
# inside one declaration (it cannot run past a `;` or `{`). The names are
# regex-escaped and sorted so the compiled pattern is the same on every run.
_VIRTUAL_HOTPATH_RE = re.compile(
r"\bvirtual\b[^;{]*\b("
+ "|".join(sorted(re.escape(n) for n in _HOTPATH_METHODS))
+ r")\s*\("
)
# Generic atomic-type detector used by the false-sharing rule. Catches
# `std::atomic<T>`, `std::atomic_int`, `std::atomic_flag`, and the Qt
# `QAtomicInt`/`QAtomicPointer<T>`/`QAtomicInteger<T>` family.
# The first alternative needs either a `<` (template form) or whitespace
# followed by a name after the type, so a bare mention of `std::atomic`
# with nothing declared after it does not match. The false-sharing rule
# itself is outside the portion of the file visible here.
_ATOMIC_DECL_RE = re.compile(
r"\b(?:std::atomic(?:_[a-z0-9_]+)?\s*(?:<|\s+m?_?\w)"
r"|std::atomic_flag\b"
r"|QAtomic(?:Int|Pointer|Integer)\b)"
)
# Local fixed-size array declaration with a numeric size, e.g.
# `char buf[8192];`, `double samples[2048] = {};`.
# Capture group 1 is the decimal element count. Only the listed scalar
# element types are recognised, and the size must be a plain integer
# literal (a named constant between the brackets does not match).
_STACK_ARRAY_RE = re.compile(
r"\b(?:char|signed\s+char|unsigned\s+char|int8_t|uint8_t|int|short|long"
r"|size_t|ptrdiff_t|int16_t|int32_t|int64_t|uint16_t|uint32_t|uint64_t"
r"|float|double|wchar_t|qint8|qint16|qint32|qint64|quint8|quint16"
r"|quint32|quint64|qreal)\s+"
r"(?:const\s+)?"
r"\w+\s*\[\s*(\d+)\s*\]\s*[;={,]"
)
def _walk_to_function_declarator(decl):
    """Follow nested `declarator` fields down to a `function_declarator`.

    Starting at ``decl``, repeatedly step into the child stored under the
    ``declarator`` field (this is how pointer / reference declarators wrap
    the real one) until a node of type ``function_declarator`` is reached.

    Returns that node, or ``None`` when ``decl`` is ``None``, when the chain
    ends without reaching a function declarator, or when none is found
    within the first 16 nodes of the chain (a guard against runaway walks).
    """
    node = decl
    for _ in range(16):
        if node is None:
            break
        if node.type == "function_declarator":
            return node
        node = node.child_by_field_name("declarator")
    return None
def _sink_param_names(func_node, src: bytes) -> set:
    """Collect identifier names that the function appears to treat as sinks.

    A sink parameter is one where pass-by-value + move is the right call
    instead of pass-by-const-ref. Names are gathered textually from:

      - `std::move(<name>)` anywhere in the function body;
      - `return <name>;` anywhere in the function body (the value flows
        out as the return value, which the compiler implicitly moves from);
      - `std::move(<name>)` inside any `field_initializer_list` child of
        ``func_node`` (constructor member initialisers).

    The scan is purely regex-based: it does not check that a name is
    actually a parameter, nor that a returned name was mutated first.
    Callers are expected to intersect the result with real parameter names.
    """
    moved_name = r"\bstd::move\s*\(\s*([A-Za-z_]\w*)\s*\)"
    returned_name = r"\breturn\s+([A-Za-z_]\w*)\s*;"

    sinks: set = set()
    body = func_node.child_by_field_name("body")
    if body is not None:
        text = _node_text(body, src)
        sinks.update(re.findall(moved_name, text))
        sinks.update(re.findall(returned_name, text))

    for child in func_node.children:
        if child.type == "field_initializer_list":
            sinks.update(re.findall(moved_name, _node_text(child, src)))
    return sinks
def _parameter_perf_findings(func_node, src: bytes, fenced) -> list:
    """Report by-value parameters whose type is expensive to copy.

    Applies to every function, not only hotpath methods. For each
    `parameter_declaration` of the function's parameter list the type text
    is reduced to a base name (leading qualifiers stripped, then cut at the
    first `<` or space) and looked up in `_HEAVY_TYPES`, then in
    `_REFCOUNTED_TYPES`.

    Skipped parameters:
      - pointer / reference / rvalue-reference declarators (not by value);
      - parameters whose name is in `_sink_param_names(...)` -- the
        by-value + `std::move` idiom is intentional there;
      - parameters on a line for which ``fenced(line)`` is true.

    Args:
        func_node: function node; its `declarator` field is walked to the
            innermost function declarator.
        src: source bytes the node offsets refer to.
        fenced: callable taking a line number, true when that line is masked.

    Returns:
        A list of `Finding` objects (possibly empty).
    """
    # Declarator node types that mean the parameter is NOT passed by value.
    indirect_declarators = (
        "pointer_declarator",
        "reference_declarator",
        "abstract_pointer_declarator",
        "abstract_reference_declarator",
        "rvalue_reference_declarator",
        "abstract_rvalue_reference_declarator",
    )
    leading_qualifiers = ("const ", "constexpr ", "volatile ", "mutable ", "register ")

    def _identifier_below(node):
        """Descend from a declarator to its identifier node, or None."""
        while node is not None and node.type != "identifier":
            kids = node.children
            # Prefer a direct identifier child; otherwise step into the
            # first child that itself has children and keep looking.
            step = next((k for k in kids if k.type == "identifier"), None)
            if step is None:
                step = next(
                    (k for k in kids if hasattr(k, "children") and k.children),
                    None,
                )
            node = step
        return node

    fdecl = _walk_to_function_declarator(func_node.child_by_field_name("declarator"))
    if fdecl is None:
        return []
    params = fdecl.child_by_field_name("parameters")
    if params is None:
        return []

    sink_params = _sink_param_names(func_node, src)
    findings: list = []
    for param in params.children:
        if param.type != "parameter_declaration":
            continue
        param_type = param.child_by_field_name("type")
        param_decl = param.child_by_field_name("declarator")
        if param_type is None:
            continue
        if param_decl is not None and param_decl.type in indirect_declarators:
            continue

        # Resolve the parameter name (unnamed parameters have none) so the
        # sink-parameter idiom can be suppressed.
        ident = _identifier_below(param_decl)
        pname = _node_text(ident, src) if ident is not None else None
        if pname is not None and pname in sink_params:
            continue

        type_text = _node_text(param_type, src).strip()
        base = type_text
        # Single pass over the qualifiers, in this order; each one is
        # stripped as many times as it repeats at the front.
        for qualifier in leading_qualifiers:
            while base.startswith(qualifier):
                base = base[len(qualifier):].lstrip()
        cut_points = [pos for pos in (base.find("<"), base.find(" ")) if pos >= 0]
        if cut_points:
            base = base[: min(cut_points)]

        line = _line_of(param)
        if fenced(line):
            continue

        if base in _HEAVY_TYPES:
            kind = "perf-large-by-value-param"
            message = (
                f"`{type_text}` passed by value -- forces a copy in the "
                f"prologue (atomic ref-bump for Qt COW types: `lock`-prefix "
                f"on x86, ldxr+stxr on ARM without LSE; full deep memcpy "
                f"for std:: containers). Pass `const {base}&` and copy "
                f"only when you genuinely keep a local copy."
            )
        elif base in _REFCOUNTED_TYPES:
            kind = "perf-shared-ptr-by-value"
            message = (
                f"`{type_text}` by value -- two atomic refcount ops per "
                f"call (`lock add`/`lock sub` on x86, ~20 cyc each; "
                f"ldxr/stxr loop on ARM without LSE/v8.1 atomics). "
                f"Pass `const {base}<...>&` and copy only when you "
                f"actually store the pointer."
            )
        else:
            continue
        findings.append(Finding(line, kind, message))
    return findings
def _init_only_decl_line_span(body, src: bytes) -> set:
    """Return the 1-based line numbers covered by run-once declarations.

    A declaration counts as run-once when its specifiers include
    `constexpr`, or both `static` and `const` (which also covers
    `static constexpr`). Specifiers are read from the declaration's
    `storage_class_specifier` and `type_qualifier` children.

    Runtime-cost rules reason about per-call cost; these initialisers are
    not on the per-call path, so every physical line the declaration spans
    (first through last, inclusive) is exempted -- however many lines the
    initialiser occupies.

    Returns an empty set when ``body`` is ``None``.
    """
    exempt: set = set()
    if body is None:
        return exempt

    for decl in _walk(body):
        if decl.type != "declaration":
            continue
        words = {
            _node_text(child, src).strip()
            for child in decl.children
            if child.type in ("storage_class_specifier", "type_qualifier")
        }
        if "constexpr" in words or {"static", "const"} <= words:
            # start_point / end_point rows are 0-based; +1 converts to
            # 1-based, and the extra +1 makes the range end inclusive.
            exempt.update(range(decl.start_point[0] + 1, decl.end_point[0] + 2))
    return exempt
def _cold_branch_line_span(body, src: bytes) -> set:
    """Return the 1-based line numbers that sit inside cold-path branches.

    Two constructs are treated as cold:
      - the body of a `catch_clause` (taken from its `body` field, falling
        back to its first `compound_statement` child);
      - every non-attribute child of an `attributed_statement` whose text
        contains `[[unlikely]]` or `[[gnu::unlikely]]` within its first
        64 characters.

    Both are reached only on error / overflow / exception, not on the
    steady-state path the perf rules target, so e.g. a `qWarning(...)`
    there is not a hotpath log call.

    Returns an empty set when ``body`` is ``None``.
    """
    cold: set = set()
    if body is None:
        return cold

    def _mark(span_node):
        # Rows are 0-based; the range below is 1-based and end-inclusive.
        cold.update(range(span_node.start_point[0] + 1, span_node.end_point[0] + 2))

    for node in _walk(body):
        if node.type == "catch_clause":
            handler = node.child_by_field_name("body")
            if handler is None:
                handler = next(
                    (k for k in node.children if k.type == "compound_statement"),
                    None,
                )
            if handler is not None:
                _mark(handler)
        elif node.type == "attributed_statement":
            head = _node_text(node, src)[:64]
            if "[[unlikely]]" in head or "[[gnu::unlikely]]" in head:
                for kid in node.children:
                    if kid.type != "attribute_declaration":
                        _mark(kid)
    return cold
# A literal `1` / `1.0` / `1.0f` / `1.0L` numerator followed by `/` and the
# start of an identifier or a parenthesised expression.
_RECIPROCAL_CACHE_RE = re.compile(r"\b1(?:\.0+f?|\.0+L?|\.0+|\b)\s*/\s*[A-Za-z_(]")
# Captures the identifier used as divisor of a `/` or `%`. `sizeof` divisors
# and a `/` that starts a `//` comment are excluded by the lookaheads.
_DIV_OR_MOD_DIVISOR_RE = re.compile(
    r"(?<![*/=<>!&|^])[/%]\s*(?!/)(?!sizeof\b)([A-Za-z_]\w*)"
)


def _is_reciprocal_cache_line(scrubbed: str) -> bool:
    """Tell whether a line computes a reciprocal with a literal-1 numerator.

    Examples that match: `const float inv = 1.0f / x;`,
    `auto r = 1.0 / qMax(...);`. Caching the reciprocal once and multiplying
    in the loop is the RECOMMENDED fix for runtime-divisor cost, so the
    divide rule must not flag these lines.

    Detection is purely by the literal-1 numerator pattern; the line is not
    checked for actually being a declaration.
    """
    return _RECIPROCAL_CACHE_RE.search(scrubbed) is not None
# Well-known math/system identifiers that resolve to compile-time constants
# even though they're macros (M_PI family) or constants the compiler
# substitutes via the standard library headers. Treating these as
# compile-time means the divisor / modulo rules don't fire on them.
_KNOWN_COMPILE_TIME_NAMES = frozenset(
    {
        "M_PI",
        "M_PI_2",
        "M_PI_4",
        "M_1_PI",
        "M_2_PI",
        "M_2_SQRTPI",
        "M_E",
        "M_LOG2E",
        "M_LOG10E",
        "M_LN2",
        "M_LN10",
        "M_SQRT2",
        "M_SQRT1_2",
        "INT8_MAX",
        "INT16_MAX",
        "INT32_MAX",
        "INT64_MAX",
        "UINT8_MAX",
        "UINT16_MAX",
        "UINT32_MAX",
        "UINT64_MAX",
        "CHAR_BIT",
        "CHAR_MAX",
        "CHAR_MIN",
    }
)


def _compile_time_constants_in_scope(body, src: bytes) -> set:
    """Return the names the divisor / modulo rules may treat as constants.

    The result always starts from a fresh copy of
    `_KNOWN_COMPILE_TIME_NAMES` (well-known math / limit macros). When
    ``body`` is given, the name declared by every `init_declarator` of every
    `constexpr`-qualified `declaration` found in it is added.

    Dividing by one of these is folded by the compiler into a
    multiply-by-magic-number, so no runtime `idiv` is emitted and the rules
    can skip such lines.

    Only `constexpr` declarations contribute names; the identifier is found
    by repeatedly stepping into a direct `identifier` child of the
    declarator, so declarators without one contribute nothing.
    """
    names: set = set(_KNOWN_COMPILE_TIME_NAMES)
    if body is None:
        return names

    def _declared_identifier(node):
        """Step through direct `identifier` children; None if there is none."""
        while node is not None and node.type != "identifier":
            node = next((k for k in node.children if k.type == "identifier"), None)
        return node

    for node in _walk(body):
        if node.type != "declaration":
            continue
        has_constexpr = any(
            c.type == "type_qualifier" and _node_text(c, src).strip() == "constexpr"
            for c in node.children
        )
        if not has_constexpr:
            continue
        for init in node.children:
            if init.type != "init_declarator":
                continue
            ident = _declared_identifier(init.child_by_field_name("declarator"))
            if ident is not None:
                names.add(_node_text(ident, src))
    return names
def _scan_body_lines(body, src: bytes, fname: str, fenced, patterns) -> list:
    """Apply `(regex, kind, message)` triples to each line of a function body.

    Every physical line of the body text is scrubbed of strings and line
    comments and then tested against ``patterns`` in order; the first
    pattern that matches produces the line's single `Finding` and the rest
    are not tried, so one problematic expression doesn't fire every pattern.

    Whole lines are skipped when:
      - ``fenced(line)`` is true;
      - the line belongs to a `constexpr` / `static const` declaration
        (init code, not per-call);
      - the line is inside an `[[unlikely]]` statement or a `catch` body
        (cold path).

    Individual patterns are skipped on a line when:
      - the kind is `perf-divide-runtime-divisor` and the line is a
        reciprocal-cache computation (`1.0 / x`) -- that IS the
        recommended fix;
      - the kind is the divide or modulo runtime-divisor rule and every
        identifier divisor on the line is a known compile-time name.

    Args:
        body: function body node, or ``None`` (yields an empty list).
        src: source bytes the node refers to.
        fname: function name; accepted for signature parity but not used
            by this function.
        fenced: callable taking a 1-based line number.
        patterns: iterable of `(compiled regex, kind, message)` triples.

    Returns:
        A list of `Finding` objects in line order.
    """
    if body is None:
        return []

    divide_kind = "perf-divide-runtime-divisor"
    divisor_kinds = (divide_kind, "perf-modulo-runtime-divisor")

    skipped_lines = _init_only_decl_line_span(body, src) | _cold_branch_line_span(
        body, src
    )
    folded_names = _compile_time_constants_in_scope(body, src)
    first_line = body.start_point[0] + 1

    hits: list = []
    raw_lines = _node_text(body, src).split("\n")
    for abs_line, raw in enumerate(raw_lines, start=first_line):
        if fenced(abs_line) or abs_line in skipped_lines:
            continue
        scrubbed = _strip_strings_and_line_comments(raw)
        caches_reciprocal = _is_reciprocal_cache_line(scrubbed)
        # The compiler folds the division when every divisor on the line is
        # a compile-time name; a line with no identifier divisor at all
        # does not count as "folded".
        divisors = _DIV_OR_MOD_DIVISOR_RE.findall(scrubbed)
        folds_away = bool(divisors) and all(d in folded_names for d in divisors)

        for pat, kind, msg in patterns:
            if caches_reciprocal and kind == divide_kind:
                continue
            if folds_away and kind in divisor_kinds:
                continue
            if pat.search(scrubbed):
                hits.append(Finding(abs_line, kind, msg))
                break
    return hits
def _recursion_findings(func_node, fname: str, body, src: bytes, fenced) -> list:
    """Flag direct self-recursion inside a hotpath method.

    Recursion at kHz rates blows the i-cache (200+ cyc for an L2 miss),
    trashes the RAS predictor (mispredict on every return), and prevents
    inlining. Nothing is reported unless ``fname`` is in `_HOTPATH_METHODS`.

    Only **stack** recursion counts. A `call_expression` is a self-call
    when its callee is the bare identifier `fname(...)` or a field
    expression on `this` (`this->fname(...)`). Not counted:
      - calls nested inside a `lambda_expression` (they run when the lambda
        runs, not on this frame);
      - qualified calls such as `Base::fname(...)`;
      - calls on another object (`other.fname(...)`, `other->fname(...)`).

    At most one finding is emitted per source line, and fenced lines are
    skipped. ``func_node`` is accepted but not used by this function.

    Returns a list of `Finding` objects (empty when ``body`` is ``None``,
    ``fname`` is empty, or ``fname`` is not a hotpath method).
    """
    if body is None or not fname or fname not in _HOTPATH_METHODS:
        return []

    def _inside_lambda(call):
        # Climb towards `body`; stop early if a lambda encloses the call.
        # NOTE(review): the stop condition uses object identity (`is not`);
        # confirm the tree-sitter binding hands back the same Node object
        # for `body` when reached via `.parent`, otherwise the climb simply
        # continues up to the root.
        ancestor = call.parent
        while ancestor is not None and ancestor is not body:
            if ancestor.type == "lambda_expression":
                return True
            ancestor = ancestor.parent
        return False

    def _targets_self(callee):
        if callee.type == "identifier":
            return _node_text(callee, src) == fname
        if callee.type != "field_expression":
            return False
        receiver = callee.child_by_field_name("argument")
        member = callee.child_by_field_name("field")
        return (
            receiver is not None
            and member is not None
            and receiver.type == "this"
            and _node_text(member, src) == fname
        )

    findings: list = []
    reported_lines: set = set()
    for node in _walk(body):
        if node.type != "call_expression" or _inside_lambda(node):
            continue
        callee = node.child_by_field_name("function")
        if callee is None or not _targets_self(callee):
            continue
        line = node.start_point[0] + 1
        if fenced(line) or line in reported_lines:
            continue
        reported_lines.add(line)
        findings.append(
            Finding(
                line,
                "perf-recursive-hotpath",
                f"hotpath `{fname}` calls itself -- recursion on a kHz "
                f"frame loop blows the i-cache (200+ cyc per L2 miss), "
                f"mispredicts the RAS on every return, and prevents the "
                f"compiler from inlining. Rewrite iteratively (explicit "
                f"work-list / std::stack).",
            )
        )
    return findings
# Matches an `m_`-prefixed member name followed by a `.load(` call, either
# directly (`m_running.load(`) or after a closing parenthesis
# (`m_running).load(`).
# NOTE(review): presumably used to recognise run-loop conditions that poll
# an atomic flag -- the consumer is outside the portion of the file visible
# here; confirm against the caller.
_RUN_LOOP_COND_RE = re.compile(r"\bm_\w+\s*(?:\.load\s*\(|\)\s*\.\s*load\s*\()")
# Qt event-handler suffixes. These methods fire only on user input, geometry
# changes, focus changes, etc. -- cold paths by construction. Any per-call
# divide/modulo/pow inside them is irrelevant to the steady-state hotpath.
_QT_EVENT_HANDLER_SUFFIXES = (
    "Event",  # mousePressEvent, wheelEvent, keyPressEvent, paintEvent...
    "EventFilter",  # e.g. fooEventFilter (case-sensitive: NOT `eventFilter`)
    "ChangeEvent",  # e.g. windowStateChangeEvent (already covered by "Event")
)

# Handlers that the suffix test cannot reach: `eventFilter` starts with a
# lower-case `e`, and `geometryChange` does not end in `Event` at all.
_QT_EVENT_HANDLER_EXACT_NAMES = frozenset(
    {
        "eventFilter",
        "geometryChange",
    }
)


def _is_qt_event_handler(fname: str) -> bool:
    """True when @a fname follows Qt's event-handler naming convention.

    Qt's QObject / QWidget / QQuickItem event handlers are dispatched once
    per user gesture or window event -- nowhere near the kHz frame rate the
    perf rules target.

    A name matches when it is one of `_QT_EVENT_HANDLER_EXACT_NAMES`
    (`eventFilter`, `geometryChange`) or when it ends with one of
    `_QT_EVENT_HANDLER_SUFFIXES` and is strictly longer than that suffix
    (`mousePressEvent`, `wheelEvent`, `paintEvent`, ...). A name that
    consists of the suffix alone, such as `Event`, does not match, and
    neither does an empty / ``None`` name.
    """
    if not fname:
        return False
    if fname in _QT_EVENT_HANDLER_EXACT_NAMES:
        return True
    return any(
        fname.endswith(suffix) and len(fname) > len(suffix)
        for suffix in _QT_EVENT_HANDLER_SUFFIXES
    )
# Names that identify the QQuickPaintedItem / QPainter render path. Such
# functions run at most at the screen refresh rate (~60 Hz), far below the
# kHz frame loop the perf rules are aimed at.
_PAINT_METHOD_NAMES = frozenset({"paint", "render"})
_PAINT_METHOD_PREFIXES = ("draw", "render", "paint")


def _is_paint_method(fname: str) -> bool:
    """Tell whether @a fname names a paint / render method.

    Accepted names are the exact entries of `_PAINT_METHOD_NAMES` (`paint`,
    `render`) and any name made of a `_PAINT_METHOD_PREFIXES` entry followed
    directly by an uppercase character (`paintEvent`, `drawXAxis`,
    `drawGrid`, `renderTile`, `paintBackground`, ...). A bare prefix that is
    not an exact name (`draw`) and lowercase continuations (`paintbrush`,
    `draw_grid`) are rejected.

    Paint callbacks are exempt from the hotpath rules because they fire at
    the display refresh rate at most.

    Returns False for an empty or None @a fname.
    """
    if not fname:
        return False
    if fname in _PAINT_METHOD_NAMES:
        return True
    # Slicing the single character after the prefix yields "" when the name
    # is just the prefix; "".isupper() is False, so no length check is needed.
    return any(
        fname.startswith(prefix)
        and fname[len(prefix) : len(prefix) + 1].isupper()
        for prefix in _PAINT_METHOD_PREFIXES
    )
def _is_constexpr_or_consteval(func_node, src: bytes) -> bool:
    """Report whether a function node carries a compile-time specifier.

    Only the direct children of @a func_node are inspected. A child counts
    when its node type is `type_qualifier` or `storage_class_specifier` and
    its text, stripped of surrounding whitespace, is exactly `constexpr`,
    `consteval`, or `constinit`.

    Callers use the result to skip runtime-cost rules for such functions:
    their bodies are intended for compile-time evaluation, so a non-literal
    divisor / modulo / pow that survives constant folding is a deliberate
    choice at a runtime call site.
    """
    specifier_kinds = ("type_qualifier", "storage_class_specifier")
    compile_time_keywords = ("constexpr", "consteval", "constinit")
    return any(
        child.type in specifier_kinds
        and _node_text(child, src).strip() in compile_time_keywords
        for child in func_node.children
    )
def _is_long_running_loop_function(body, src: bytes) -> bool:
    """Heuristic for thread-entry / event-loop style functions.

    Returns True when any `while`, `for`, or `do` statement yielded by
    `_walk(body)` has a condition whose text matches `_RUN_LOOP_COND_RE`
    (a member atomic read such as `m_running.load()`). Such a function is
    taken to run once per thread start, not once per frame, so a large
    stack buffer placed in front of the loop amortizes.

    The condition is taken from the node's `condition` field; when that
    field is absent, the first child of type `condition_clause` or
    `parenthesized_expression` is used instead. Loops without any such
    node are ignored.

    Nesting (including preprocessor wrappers such as `#ifdef Q_OS_WIN`) is
    handled by whatever `_walk` descends into.

    Returns False when @a body is None.
    """
    if body is None:
        return False
    loop_kinds = {"while_statement", "for_statement", "do_statement"}
    fallback_kinds = {"condition_clause", "parenthesized_expression"}
    for candidate in _walk(body):
        if candidate.type not in loop_kinds:
            continue
        condition = candidate.child_by_field_name("condition")
        if condition is None:
            # No named field: fall back to the first condition-shaped child.
            condition = next(
                (kid for kid in candidate.children if kid.type in fallback_kinds),
                None,
            )
        if condition is not None and _RUN_LOOP_COND_RE.search(
            _node_text(condition, src)
        ):
            return True
    return False
def _large_stack_buffer_findings(body, src: bytes, fenced) -> list:
    """Flag local fixed-size arrays declared with 1024 or more elements.

    The body text is scanned line by line. Each line is first passed
    through `_strip_strings_and_line_comments`, then searched with
    `_STACK_ARRAY_RE`; capture group 1 is read as the element count. Only
    the first match on a line is considered, and a count that does not
    parse as a decimal integer is skipped.

    The threshold is on the element COUNT, not the byte size: `int[1024]`
    and `char[4096]` are reported, while `double[512]` (also 4 KB) is not.

    Rationale carried by the finding: large arrays add frame-setup cost,
    pollute L1 when the function is called in a hot loop, and risk stack
    overflow on deep call paths.

    No findings are produced when @a body is None, or when
    `_is_long_running_loop_function` classifies the function as a
    long-running loop (e.g. `pipeReadLoopWin`): there the frame is set up
    once per thread start and the buffer is reused on every iteration.

    @a fenced is called with a 1-based absolute line number; lines for
    which it returns a truthy value are skipped.

    Returns a list of `Finding` objects with rule id
    `perf-large-stack-buffer`.
    """
    if body is None or _is_long_running_loop_function(body, src):
        return []
    body_lines = _node_text(body, src).split("\n")
    first_line = body.start_point[0] + 1  # tree-sitter rows are 0-based
    findings: list = []
    for line_no, raw in enumerate(body_lines, start=first_line):
        if fenced(line_no):
            continue
        match = _STACK_ARRAY_RE.search(_strip_strings_and_line_comments(raw))
        if match is None:
            continue
        try:
            n = int(match.group(1))
        except ValueError:
            continue
        if n >= 1024:
            findings.append(
                Finding(
                    line_no,
                    "perf-large-stack-buffer",
                    f"local array of {n} elements on the stack -- frame-setup "
                    f"cost, pollutes L1 (32-48 KB) when called in a hot loop, "
                    f"risks overflow on deep call paths (Windows default 1 MB, "
                    f"Linux 8 MB). Promote to a member, thread_local, or a "
                    f"pre-reserved buffer.",
                )
            )
    return findings
def _adjacent_atomic_findings(class_node, src: bytes, fenced) -> list:
    """Report atomic members declared within four lines of another atomic.

    Walks the `field_declaration` children of the class body in order and
    remembers the line of the most recent atomic member (one whose text
    matches `_ATOMIC_DECL_RE`). A member is reported when it sits 1 to 4
    lines after that remembered line, is not fenced, and does not contain
    `alignas`.

    Bookkeeping rules:

    * Children that are not `field_declaration` are ignored and do not
      affect the remembered line.
    * A non-atomic field, or a pointer/reference-to-atomic field, clears
      the remembered line. For pointers and references the atomic that
      could suffer false sharing lives in some other object.
    * A field containing `alignas` is never reported itself, but its line
      is remembered, so an unaligned atomic right after it is still
      flagged.
    * A fenced atomic is not reported but is still remembered.

    Motivation (as stated in the finding): atomics declared this close
    together are expected to share a cache line, so cross-core writes
    cause false sharing.

    Returns a list of `Finding` objects with rule id
    `perf-false-sharing-risk`; empty when the class has no body.
    """
    body = class_node.child_by_field_name("body")
    if body is None:
        return []
    no_neighbour = -100  # far enough back that the 4-line window never hits
    message = (
        "adjacent atomic members will share a cache line "
        "(64 B Intel/AArch64, up to 128 B on Apple Silicon "
        "M-series via the 128 B speculative line). Cross-core "
        "writes thrash MESI/MOESI invalidations (50-200x slowdown "
        "vs uncontended). Add `alignas(64)` / "
        "`alignas(std::hardware_destructive_interference_size)` "
        "or insert `char _pad[64 - sizeof(prev)];` between them."
    )
    findings: list = []
    last_atomic_line = no_neighbour
    for member in body.children:
        if member.type != "field_declaration":
            continue
        decl = _node_text(member, src)
        # Non-atomic fields and pointer/reference-to-atomic fields both
        # break the adjacency chain.
        if not _ATOMIC_DECL_RE.search(decl) or re.search(
            r"atomic\w*\s*(?:<[^>]*>)?\s*[*&]", decl
        ):
            last_atomic_line = no_neighbour
            continue
        here = _line_of(member)
        if (
            "alignas" not in decl
            and not fenced(here)
            and 0 < here - last_atomic_line <= 4
        ):
            findings.append(Finding(here, "perf-false-sharing-risk", message))
        last_atomic_line = here
    return findings
def _virtual_hotpath_findings(src_text: str, path: Path, fenced) -> list:
"""Header line-scan: a hotpath method declared `virtual`. Every call
site emits a vtable load + indirect branch (5-10 cyc best case, 15-20
cyc misprediction penalty on polymorphic sites) and the compiler
can't inline through it. If there's only one implementation, mark
`final` (devirtualizes when the dynamic type is statically known)
or drop `virtual` entirely.
Skipped when the method is taken as a Qt member-function pointer