Skip to content

Commit fe60913

Browse files
schuayCommit Bot
authored andcommitted
[regexp] Consistent expectations for output registers
... between the interpreter and generated code. Prior to this CL, pre- and post conditions on the output register array differed between the interpreter and generated code. Interpreter Pre: `output` fits captures and temporary registers. Post: None. Generated code Pre: `output` fits capture registers. Post: `output` is modified if and only if the match succeeded. This CL changes the interpreter to match generated code pre- and post conditions by allocating space for temporary registers inside the interpreter. Drive-by: Add MaxRegisterCount, RegistersForCaptureCount helpers. Bug: chromium:1067270 Change-Id: I2900ef2f31207d817ec7ead3e0e2215b23b398f0 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2135642 Commit-Queue: Jakob Gruber <jgruber@chromium.org> Reviewed-by: Leszek Swirski <leszeks@chromium.org> Cr-Commit-Position: refs/heads/master@{#67268}
1 parent 3a83b13 commit fe60913

11 files changed

Lines changed: 166 additions & 146 deletions

File tree

src/builtins/builtins-regexp-gen.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -528,7 +528,7 @@ TNode<HeapObject> RegExpBuiltinsAssembler::RegExpExecInternal(
528528
data, JSRegExp::kIrregexpCaptureCountIndex));
529529
// capture_count is the number of captures without the match itself.
530530
// Required registers = (capture_count + 1) * 2.
531-
STATIC_ASSERT(Internals::IsValidSmi((JSRegExp::kMaxCaptures + 1) << 1));
531+
STATIC_ASSERT(Internals::IsValidSmi((JSRegExp::kMaxCaptures + 1) * 2));
532532
TNode<Smi> register_count =
533533
SmiShl(SmiAdd(capture_count, SmiConstant(1)), 1);
534534

src/objects/js-regexp-inl.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ JSRegExp::Type JSRegExp::TypeTag() const {
3434
return static_cast<JSRegExp::Type>(smi.value());
3535
}
3636

37-
int JSRegExp::CaptureCount() {
37+
int JSRegExp::CaptureCount() const {
3838
switch (TypeTag()) {
3939
case ATOM:
4040
return 0;
@@ -45,6 +45,11 @@ int JSRegExp::CaptureCount() {
4545
}
4646
}
4747

48+
int JSRegExp::MaxRegisterCount() const {
49+
CHECK_EQ(TypeTag(), IRREGEXP);
50+
return Smi::ToInt(DataAt(kIrregexpMaxRegisterCountIndex));
51+
}
52+
4853
JSRegExp::Flags JSRegExp::GetFlags() {
4954
DCHECK(this->data().IsFixedArray());
5055
Object data = this->data();

src/objects/js-regexp.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,11 @@ class JSRegExp : public TorqueGeneratedJSRegExp<JSRegExp, JSObject> {
113113
static constexpr int kMaxCaptures = 1 << 16;
114114

115115
// Number of captures (without the match itself).
116-
inline int CaptureCount();
116+
inline int CaptureCount() const;
117+
// Each capture (including the match itself) needs two registers.
118+
static int RegistersForCaptureCount(int count) { return (count + 1) * 2; }
119+
120+
inline int MaxRegisterCount() const;
117121
inline Flags GetFlags();
118122
inline String Pattern();
119123
inline Object CaptureNameMap();

src/objects/objects.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4212,7 +4212,8 @@ Handle<RegExpMatchInfo> RegExpMatchInfo::ReserveCaptures(
42124212
Isolate* isolate, Handle<RegExpMatchInfo> match_info, int capture_count) {
42134213
DCHECK_GE(match_info->length(), kLastMatchOverhead);
42144214

4215-
int capture_register_count = (capture_count + 1) * 2;
4215+
int capture_register_count =
4216+
JSRegExp::RegistersForCaptureCount(capture_count);
42164217
const int required_length = kFirstCaptureIndex + capture_register_count;
42174218
Handle<RegExpMatchInfo> result = Handle<RegExpMatchInfo>::cast(
42184219
EnsureSpaceInFixedArray(isolate, match_info, required_length));

src/regexp/regexp-compiler.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ class RecursionCheck {
223223
// a fixed array or a null handle depending on whether it succeeded.
224224
RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
225225
bool one_byte)
226-
: next_register_(2 * (capture_count + 1)),
226+
: next_register_(JSRegExp::RegistersForCaptureCount(capture_count)),
227227
unicode_lookaround_stack_register_(kNoRegister),
228228
unicode_lookaround_position_register_(kNoRegister),
229229
work_list_(nullptr),

src/regexp/regexp-interpreter.cc

Lines changed: 78 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,48 @@ class BacktrackStack {
144144
DISALLOW_COPY_AND_ASSIGN(BacktrackStack);
145145
};
146146

147+
// Registers used during interpreter execution. These consist of output
148+
// registers in indices [0, output_register_count[ which will contain matcher
149+
// results as a {start,end} index tuple for each capture (where the whole match
150+
// counts as implicit capture 0); and internal registers in indices
151+
// [output_register_count, total_register_count[.
152+
class InterpreterRegisters {
153+
public:
154+
using RegisterT = int;
155+
156+
InterpreterRegisters(int total_register_count, RegisterT* output_registers,
157+
int output_register_count)
158+
: registers_(total_register_count),
159+
output_registers_(output_registers),
160+
output_register_count_(output_register_count) {
161+
// TODO(jgruber): Use int32_t consistently for registers. Currently, CSA
162+
// uses int32_t while runtime uses int.
163+
STATIC_ASSERT(sizeof(int) == sizeof(int32_t));
164+
DCHECK_GE(output_register_count, 2); // At least 2 for the match itself.
165+
DCHECK_GE(total_register_count, output_register_count);
166+
DCHECK_LE(total_register_count, RegExpMacroAssembler::kMaxRegisterCount);
167+
DCHECK_NOT_NULL(output_registers);
168+
169+
// Initialize the output register region to -1 signifying 'no match'.
170+
std::memset(registers_.data(), -1,
171+
output_register_count * sizeof(RegisterT));
172+
}
173+
174+
const RegisterT& operator[](size_t index) const { return registers_[index]; }
175+
RegisterT& operator[](size_t index) { return registers_[index]; }
176+
177+
void CopyToOutputRegisters() {
178+
MemCopy(output_registers_, registers_.data(),
179+
output_register_count_ * sizeof(RegisterT));
180+
}
181+
182+
private:
183+
static constexpr int kStaticCapacity = 64; // Arbitrary.
184+
base::SmallVector<RegisterT, kStaticCapacity> registers_;
185+
RegisterT* const output_registers_;
186+
const int output_register_count_;
187+
};
188+
147189
IrregexpInterpreter::Result ThrowStackOverflow(Isolate* isolate,
148190
RegExp::CallOrigin call_origin) {
149191
CHECK(call_origin == RegExp::CallOrigin::kFromRuntime);
@@ -305,12 +347,12 @@ bool CheckBitInTable(const uint32_t current_char, const byte* const table) {
305347
#endif // DEBUG
306348

307349
template <typename Char>
308-
IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array,
309-
String subject_string,
310-
Vector<const Char> subject, int* registers,
311-
int current, uint32_t current_char,
312-
RegExp::CallOrigin call_origin,
313-
const uint32_t backtrack_limit) {
350+
IrregexpInterpreter::Result RawMatch(
351+
Isolate* isolate, ByteArray code_array, String subject_string,
352+
Vector<const Char> subject, int* output_registers,
353+
int output_register_count, int total_register_count, int current,
354+
uint32_t current_char, RegExp::CallOrigin call_origin,
355+
const uint32_t backtrack_limit) {
314356
DisallowHeapAllocation no_gc;
315357

316358
#if V8_USE_COMPUTED_GOTO
@@ -364,6 +406,8 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array,
364406
const byte* pc = code_array.GetDataStartAddress();
365407
const byte* code_base = pc;
366408

409+
InterpreterRegisters registers(total_register_count, output_registers,
410+
output_register_count);
367411
BacktrackStack backtrack_stack;
368412

369413
uint32_t backtrack_count = 0;
@@ -471,6 +515,7 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array,
471515
BYTECODE(SUCCEED) {
472516
isolate->counters()->regexp_backtracks()->AddSample(
473517
static_cast<int>(backtrack_count));
518+
registers.CopyToOutputRegisters();
474519
return IrregexpInterpreter::SUCCESS;
475520
}
476521
BYTECODE(ADVANCE_CP) {
@@ -952,24 +997,25 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array,
952997

953998
// static
954999
IrregexpInterpreter::Result IrregexpInterpreter::Match(
955-
Isolate* isolate, JSRegExp regexp, String subject_string, int* registers,
956-
int registers_length, int start_position, RegExp::CallOrigin call_origin) {
957-
if (FLAG_regexp_tier_up) {
958-
regexp.TierUpTick();
959-
}
1000+
Isolate* isolate, JSRegExp regexp, String subject_string,
1001+
int* output_registers, int output_register_count, int start_position,
1002+
RegExp::CallOrigin call_origin) {
1003+
if (FLAG_regexp_tier_up) regexp.TierUpTick();
9601004

9611005
bool is_one_byte = String::IsOneByteRepresentationUnderneath(subject_string);
9621006
ByteArray code_array = ByteArray::cast(regexp.Bytecode(is_one_byte));
1007+
int total_register_count = regexp.MaxRegisterCount();
9631008

964-
return MatchInternal(isolate, code_array, subject_string, registers,
965-
registers_length, start_position, call_origin,
966-
regexp.BacktrackLimit());
1009+
return MatchInternal(isolate, code_array, subject_string, output_registers,
1010+
output_register_count, total_register_count,
1011+
start_position, call_origin, regexp.BacktrackLimit());
9671012
}
9681013

9691014
IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal(
9701015
Isolate* isolate, ByteArray code_array, String subject_string,
971-
int* registers, int registers_length, int start_position,
972-
RegExp::CallOrigin call_origin, uint32_t backtrack_limit) {
1016+
int* output_registers, int output_register_count, int total_register_count,
1017+
int start_position, RegExp::CallOrigin call_origin,
1018+
uint32_t backtrack_limit) {
9731019
DCHECK(subject_string.IsFlat());
9741020

9751021
// Note: Heap allocation *is* allowed in two situations if calling from
@@ -980,27 +1026,23 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal(
9801026
// after interrupts have run.
9811027
DisallowHeapAllocation no_gc;
9821028

983-
// Reset registers to -1 (=undefined).
984-
// This is necessary because registers are only written when a
985-
// capture group matched.
986-
// Resetting them ensures that previous matches are cleared.
987-
memset(registers, -1, sizeof(registers[0]) * registers_length);
988-
9891029
uc16 previous_char = '\n';
9901030
String::FlatContent subject_content = subject_string.GetFlatContent(no_gc);
9911031
if (subject_content.IsOneByte()) {
9921032
Vector<const uint8_t> subject_vector = subject_content.ToOneByteVector();
9931033
if (start_position != 0) previous_char = subject_vector[start_position - 1];
9941034
return RawMatch(isolate, code_array, subject_string, subject_vector,
995-
registers, start_position, previous_char, call_origin,
996-
backtrack_limit);
1035+
output_registers, output_register_count,
1036+
total_register_count, start_position, previous_char,
1037+
call_origin, backtrack_limit);
9971038
} else {
9981039
DCHECK(subject_content.IsTwoByte());
9991040
Vector<const uc16> subject_vector = subject_content.ToUC16Vector();
10001041
if (start_position != 0) previous_char = subject_vector[start_position - 1];
10011042
return RawMatch(isolate, code_array, subject_string, subject_vector,
1002-
registers, start_position, previous_char, call_origin,
1003-
backtrack_limit);
1043+
output_registers, output_register_count,
1044+
total_register_count, start_position, previous_char,
1045+
call_origin, backtrack_limit);
10041046
}
10051047
}
10061048

@@ -1009,11 +1051,11 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal(
10091051
// This method is called through an external reference from RegExpExecInternal
10101052
// builtin.
10111053
IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromJs(
1012-
Address subject, int32_t start_position, Address, Address, int* registers,
1013-
int32_t registers_length, Address, RegExp::CallOrigin call_origin,
1014-
Isolate* isolate, Address regexp) {
1054+
Address subject, int32_t start_position, Address, Address,
1055+
int* output_registers, int32_t output_register_count, Address,
1056+
RegExp::CallOrigin call_origin, Isolate* isolate, Address regexp) {
10151057
DCHECK_NOT_NULL(isolate);
1016-
DCHECK_NOT_NULL(registers);
1058+
DCHECK_NOT_NULL(output_registers);
10171059
DCHECK(call_origin == RegExp::CallOrigin::kFromJs);
10181060

10191061
DisallowHeapAllocation no_gc;
@@ -1028,38 +1070,18 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromJs(
10281070
return IrregexpInterpreter::RETRY;
10291071
}
10301072

1031-
// In generated code, registers are allocated on the stack. The given
1032-
// `registers` argument is only guaranteed to hold enough space for permanent
1033-
// registers (i.e. for captures), and not for temporary registers used only
1034-
// during matcher execution. We match that behavior in the interpreter by
1035-
// using a SmallVector as internal register storage.
1036-
static constexpr int kBaseRegisterArraySize = 64; // Arbitrary.
1037-
const int internal_register_count =
1038-
Smi::ToInt(regexp_obj.DataAt(JSRegExp::kIrregexpMaxRegisterCountIndex));
1039-
base::SmallVector<int, kBaseRegisterArraySize> internal_registers(
1040-
internal_register_count);
1041-
1042-
Result result =
1043-
Match(isolate, regexp_obj, subject_string, internal_registers.data(),
1044-
internal_register_count, start_position, call_origin);
1045-
1046-
// Copy capture registers to the output array.
1047-
if (result == IrregexpInterpreter::SUCCESS) {
1048-
CHECK_GE(internal_registers.size(), registers_length);
1049-
MemCopy(registers, internal_registers.data(),
1050-
registers_length * sizeof(registers[0]));
1051-
}
1052-
1053-
return result;
1073+
return Match(isolate, regexp_obj, subject_string, output_registers,
1074+
output_register_count, start_position, call_origin);
10541075
}
10551076

10561077
#endif // !COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
10571078

10581079
IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromRuntime(
10591080
Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject_string,
1060-
int* registers, int registers_length, int start_position) {
1061-
return Match(isolate, *regexp, *subject_string, registers, registers_length,
1062-
start_position, RegExp::CallOrigin::kFromRuntime);
1081+
int* output_registers, int output_register_count, int start_position) {
1082+
return Match(isolate, *regexp, *subject_string, output_registers,
1083+
output_register_count, start_position,
1084+
RegExp::CallOrigin::kFromRuntime);
10631085
}
10641086

10651087
} // namespace internal

src/regexp/regexp-interpreter.h

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,36 +23,42 @@ class V8_EXPORT_PRIVATE IrregexpInterpreter : public AllStatic {
2323

2424
// In case a StackOverflow occurs, a StackOverflowException is created and
2525
// EXCEPTION is returned.
26-
static Result MatchForCallFromRuntime(Isolate* isolate,
27-
Handle<JSRegExp> regexp,
28-
Handle<String> subject_string,
29-
int* registers, int registers_length,
30-
int start_position);
26+
static Result MatchForCallFromRuntime(
27+
Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject_string,
28+
int* output_registers, int output_register_count, int start_position);
3129

3230
// In case a StackOverflow occurs, EXCEPTION is returned. The caller is
3331
// responsible for creating the exception.
32+
//
3433
// RETRY is returned if a retry through the runtime is needed (e.g. when
3534
// interrupts have been scheduled or the regexp is marked for tier-up).
35+
//
3636
// Arguments input_start, input_end and backtrack_stack are
3737
// unused. They are only passed to match the signature of the native irregex
3838
// code.
39+
//
40+
// Arguments output_registers and output_register_count describe the results
41+
// array, which will contain register values of all captures if SUCCESS is
42+
// returned. For all other return codes, the results array remains unmodified.
3943
static Result MatchForCallFromJs(Address subject, int32_t start_position,
4044
Address input_start, Address input_end,
41-
int* registers, int32_t registers_length,
45+
int* output_registers,
46+
int32_t output_register_count,
4247
Address backtrack_stack,
4348
RegExp::CallOrigin call_origin,
4449
Isolate* isolate, Address regexp);
4550

4651
static Result MatchInternal(Isolate* isolate, ByteArray code_array,
47-
String subject_string, int* registers,
48-
int registers_length, int start_position,
52+
String subject_string, int* output_registers,
53+
int output_register_count,
54+
int total_register_count, int start_position,
4955
RegExp::CallOrigin call_origin,
5056
uint32_t backtrack_limit);
5157

5258
private:
5359
static Result Match(Isolate* isolate, JSRegExp regexp, String subject_string,
54-
int* registers, int registers_length, int start_position,
55-
RegExp::CallOrigin call_origin);
60+
int* output_registers, int output_register_count,
61+
int start_position, RegExp::CallOrigin call_origin);
5662
};
5763

5864
} // namespace internal

src/regexp/regexp-macro-assembler.h

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,14 @@ struct DisjunctDecisionRow {
2828
class RegExpMacroAssembler {
2929
public:
3030
// The implementation must be able to handle at least:
31-
static const int kMaxRegister = (1 << 16) - 1;
32-
static const int kMaxCPOffset = (1 << 15) - 1;
33-
static const int kMinCPOffset = -(1 << 15);
34-
35-
static const int kTableSizeBits = 7;
36-
static const int kTableSize = 1 << kTableSizeBits;
37-
static const int kTableMask = kTableSize - 1;
31+
static constexpr int kMaxRegisterCount = (1 << 16);
32+
static constexpr int kMaxRegister = kMaxRegisterCount - 1;
33+
static constexpr int kMaxCPOffset = (1 << 15) - 1;
34+
static constexpr int kMinCPOffset = -(1 << 15);
35+
36+
static constexpr int kTableSizeBits = 7;
37+
static constexpr int kTableSize = 1 << kTableSizeBits;
38+
static constexpr int kTableMask = kTableSize - 1;
3839

3940
static constexpr int kUseCharactersValue = -1;
4041

0 commit comments

Comments
 (0)