Skip to content

Commit ff340d8

Browse files
Milad Fa, V8 LUCI CQ
authored and committed
S390 [simd]: Implement vector load and extend
This CL takes advantage of the z15 `load byte reverse element` instruction to optimize Simd LoadExtend opcodes. On the simulator we only run `load element` as reversing is not required. Change-Id: Ia34ac86f93e987656596b3116771a30f64009416 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3119048 Reviewed-by: Junliang Yan <junyan@redhat.com> Commit-Queue: Milad Fa <mfarazma@redhat.com> Cr-Commit-Position: refs/heads/main@{#76517}
1 parent ba25a52 commit ff340d8

8 files changed

Lines changed: 149 additions & 23 deletions

File tree

src/codegen/s390/constants-s390.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1562,7 +1562,13 @@ using SixByteInstr = uint64_t;
15621562
V(vstbr, VSTBR, \
15631563
0xE60E) /* type = VRX VECTOR STORE BYTE REVERSED ELEMENTS */ \
15641564
V(vlbrrep, VLBRREP, \
1565-
0xE605) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENT AND REPLICATE */
1565+
0xE605) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENT AND REPLICATE */ \
1566+
V(vlebrh, VLEBRH, \
1567+
0xE601) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENT (16) */ \
1568+
V(vlebrf, VLEBRF, \
1569+
0xE603) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENT (32) */ \
1570+
V(vlebrg, VLEBRG, \
1571+
0xE602) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENT (64) */
15661572

15671573
#define S390_RIE_G_OPCODE_LIST(V) \
15681574
V(lochi, LOCHI, \

src/codegen/s390/macro-assembler-s390.cc

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3949,6 +3949,31 @@ LOAD_SPLAT_LIST(LOAD_SPLAT)
39493949
#undef LOAD_SPLAT
39503950
#undef LOAD_SPLAT_LIST
39513951

3952+
// Generates the TurboAssembler::LoadAndExtend<name>LE(dst, mem) helpers.
// Each LOAD_EXTEND_LIST entry is (name suffix, unpack instruction, value
// passed as the last Condition argument of the unpack instruction).
// NOTE(review): the last value presumably selects the source element size
// (0 = byte, 1 = halfword, 2 = word) -- confirm against the z/Architecture
// definition of vuph / vuplh.
#define LOAD_EXTEND_LIST(V) \
3953+
V(32x2U, vuplh, 2) \
3954+
V(32x2S, vuph, 2) \
3955+
V(16x4U, vuplh, 1) \
3956+
V(16x4S, vuph, 1) \
3957+
V(8x8U, vuplh, 0) \
3958+
V(8x8S, vuph, 0)
3959+
3960+
// Body of every generated helper:
//  1. Bring 64 bits from |mem| into kScratchDoubleReg. When
//     VECTOR_ENHANCE_FACILITY_2 is supported and the offset fits in an
//     unsigned 12-bit field, a single vlebrg is emitted (the `load byte
//     reverse element` instruction named in the commit description).
//     Otherwise the value is loaded with LoadU64LE into r1 and inserted with
//     vlvg, so r1 is clobbered on that path.
//  2. Widen the loaded elements into |dst| with the entry's unpack
//     instruction.
// kScratchDoubleReg is clobbered on both paths.
#define LOAD_EXTEND(name, unpack_instr, condition) \
3961+
void TurboAssembler::LoadAndExtend##name##LE(Simd128Register dst, \
3962+
const MemOperand& mem) { \
3963+
if (CpuFeatures::IsSupported(VECTOR_ENHANCE_FACILITY_2) && \
3964+
is_uint12(mem.offset())) { \
3965+
vlebrg(kScratchDoubleReg, mem, Condition(0)); \
3966+
} else { \
3967+
LoadU64LE(r1, mem); \
3968+
vlvg(kScratchDoubleReg, r1, MemOperand(r0, 0), Condition(3)); \
3969+
} \
3970+
unpack_instr(dst, kScratchDoubleReg, Condition(0), Condition(0), \
3971+
Condition(condition)); \
3972+
}
3973+
LOAD_EXTEND_LIST(LOAD_EXTEND)
3974+
// Release both helper macros (generator and list), mirroring the
// LOAD_SPLAT / LOAD_SPLAT_LIST pair, so neither leaks past this section.
#undef LOAD_EXTEND
3975+
#undef LOAD_EXTEND_LIST
3976+
39523977
#else
39533978
void TurboAssembler::LoadU64LE(Register dst, const MemOperand& mem,
39543979
Register scratch) {
@@ -4036,6 +4061,25 @@ LOAD_SPLAT_LIST(LOAD_SPLAT)
40364061
#undef LOAD_SPLAT
40374062
#undef LOAD_SPLAT_LIST
40384063

4064+
// Generates the TurboAssembler::LoadAndExtend<name>LE(dst, mem) helpers for
// this preprocessor branch. Each LOAD_EXTEND_LIST entry is (name suffix,
// unpack instruction, value passed as the last Condition argument of the
// unpack instruction).
// NOTE(review): presumably this is the simulator path mentioned in the commit
// description ("we only run `load element` as reversing is not required") --
// confirm against the enclosing #if condition.
#define LOAD_EXTEND_LIST(V) \
4065+
V(32x2U, vuplh, 2) \
4066+
V(32x2S, vuph, 2) \
4067+
V(16x4U, vuplh, 1) \
4068+
V(16x4S, vuph, 1) \
4069+
V(8x8U, vuplh, 0) \
4070+
V(8x8S, vuph, 0)
4071+
4072+
// Body of every generated helper: load one 64-bit element from |mem| into
// kScratchDoubleReg with vleg (no byte reversal), then widen it into |dst|
// with the entry's unpack instruction. kScratchDoubleReg is clobbered.
#define LOAD_EXTEND(name, unpack_instr, condition) \
4073+
void TurboAssembler::LoadAndExtend##name##LE(Simd128Register dst, \
4074+
const MemOperand& mem) { \
4075+
vleg(kScratchDoubleReg, mem, Condition(0)); \
4076+
unpack_instr(dst, kScratchDoubleReg, Condition(0), Condition(0), \
4077+
Condition(condition)); \
4078+
}
4079+
LOAD_EXTEND_LIST(LOAD_EXTEND)
4080+
// Release both helper macros (generator and list), mirroring the
// LOAD_SPLAT / LOAD_SPLAT_LIST pair, so neither leaks past this section.
#undef LOAD_EXTEND
4081+
#undef LOAD_EXTEND_LIST
4082+
40394083
#endif
40404084

40414085
// Load And Test (Reg <- Reg)

src/codegen/s390/macro-assembler-s390.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -396,6 +396,12 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
396396
void LoadAndSplat32x4LE(Simd128Register dst, const MemOperand& mem);
397397
void LoadAndSplat16x8LE(Simd128Register dst, const MemOperand& mem);
398398
void LoadAndSplat8x16LE(Simd128Register dst, const MemOperand& mem);
399+
// Load-and-extend helpers: each takes a destination SIMD register and a
// memory operand. The definitions are generated by the LOAD_EXTEND macro in
// macro-assembler-s390.cc, which loads 64 bits from |mem| and widens them
// into |dst| with vuph (S variants) or vuplh (U variants).
// NOTE(review): S/U presumably mean sign- vs zero-extension -- confirm
// against the z/Architecture definition of vuph / vuplh.
void LoadAndExtend8x8ULE(Simd128Register dst, const MemOperand& mem);
400+
void LoadAndExtend8x8SLE(Simd128Register dst, const MemOperand& mem);
401+
void LoadAndExtend16x4ULE(Simd128Register dst, const MemOperand& mem);
402+
void LoadAndExtend16x4SLE(Simd128Register dst, const MemOperand& mem);
403+
void LoadAndExtend32x2ULE(Simd128Register dst, const MemOperand& mem);
404+
void LoadAndExtend32x2SLE(Simd128Register dst, const MemOperand& mem);
399405

400406
// Load And Test
401407
void LoadAndTest32(Register dst, Register src);

src/compiler/backend/s390/code-generator-s390.cc

Lines changed: 52 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2218,28 +2218,6 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
22182218
__ vl(i.OutputSimd128Register(), operand, Condition(0));
22192219
break;
22202220
}
2221-
#define LOAD_SPLAT(type) \
2222-
AddressingMode mode = kMode_None; \
2223-
MemOperand operand = i.MemoryOperand(&mode); \
2224-
Simd128Register dst = i.OutputSimd128Register(); \
2225-
__ LoadAndSplat##type##LE(dst, operand);
2226-
case kS390_S128Load8Splat: {
2227-
LOAD_SPLAT(8x16);
2228-
break;
2229-
}
2230-
case kS390_S128Load16Splat: {
2231-
LOAD_SPLAT(16x8);
2232-
break;
2233-
}
2234-
case kS390_S128Load32Splat: {
2235-
LOAD_SPLAT(32x4);
2236-
break;
2237-
}
2238-
case kS390_S128Load64Splat: {
2239-
LOAD_SPLAT(64x2);
2240-
break;
2241-
}
2242-
#undef LOAD_SPLAT
22432221
case kS390_StoreWord8:
22442222
ASSEMBLE_STORE_INTEGER(StoreU8);
22452223
break;
@@ -3409,6 +3387,58 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
34093387
__ vpkls(dst, dst, kScratchDoubleReg, Condition(0), Condition(3));
34103388
break;
34113389
}
3390+
#define LOAD_SPLAT(type) \
3391+
AddressingMode mode = kMode_None; \
3392+
MemOperand operand = i.MemoryOperand(&mode); \
3393+
Simd128Register dst = i.OutputSimd128Register(); \
3394+
__ LoadAndSplat##type##LE(dst, operand);
3395+
case kS390_S128Load64Splat: {
3396+
LOAD_SPLAT(64x2);
3397+
break;
3398+
}
3399+
case kS390_S128Load32Splat: {
3400+
LOAD_SPLAT(32x4);
3401+
break;
3402+
}
3403+
case kS390_S128Load16Splat: {
3404+
LOAD_SPLAT(16x8);
3405+
break;
3406+
}
3407+
case kS390_S128Load8Splat: {
3408+
LOAD_SPLAT(8x16);
3409+
break;
3410+
}
3411+
#undef LOAD_SPLAT
3412+
#define LOAD_EXTEND(type) \
3413+
AddressingMode mode = kMode_None; \
3414+
MemOperand operand = i.MemoryOperand(&mode); \
3415+
Simd128Register dst = i.OutputSimd128Register(); \
3416+
__ LoadAndExtend##type##LE(dst, operand);
3417+
case kS390_S128Load32x2U: {
3418+
LOAD_EXTEND(32x2U);
3419+
break;
3420+
}
3421+
case kS390_S128Load32x2S: {
3422+
LOAD_EXTEND(32x2S);
3423+
break;
3424+
}
3425+
case kS390_S128Load16x4U: {
3426+
LOAD_EXTEND(16x4U);
3427+
break;
3428+
}
3429+
case kS390_S128Load16x4S: {
3430+
LOAD_EXTEND(16x4S);
3431+
break;
3432+
}
3433+
case kS390_S128Load8x8U: {
3434+
LOAD_EXTEND(8x8U);
3435+
break;
3436+
}
3437+
case kS390_S128Load8x8S: {
3438+
LOAD_EXTEND(8x8S);
3439+
break;
3440+
}
3441+
#undef LOAD_EXTEND
34123442
case kS390_StoreCompressTagged: {
34133443
CHECK(!instr->HasOutput());
34143444
size_t index = 0;

src/compiler/backend/s390/instruction-codes-s390.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -376,6 +376,12 @@ namespace compiler {
376376
V(S390_S128Load16Splat) \
377377
V(S390_S128Load32Splat) \
378378
V(S390_S128Load64Splat) \
379+
V(S390_S128Load8x8S) \
380+
V(S390_S128Load8x8U) \
381+
V(S390_S128Load16x4S) \
382+
V(S390_S128Load16x4U) \
383+
V(S390_S128Load32x2S) \
384+
V(S390_S128Load32x2U) \
379385
V(S390_StoreSimd128) \
380386
V(S390_LoadSimd128) \
381387
V(S390_StoreCompressTagged) \

src/compiler/backend/s390/instruction-scheduler-s390.cc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,12 @@ int InstructionScheduler::GetTargetInstructionFlags(
363363
case kS390_S128Load16Splat:
364364
case kS390_S128Load32Splat:
365365
case kS390_S128Load64Splat:
366+
case kS390_S128Load8x8S:
367+
case kS390_S128Load8x8U:
368+
case kS390_S128Load16x4S:
369+
case kS390_S128Load16x4U:
370+
case kS390_S128Load32x2S:
371+
case kS390_S128Load32x2U:
366372
return kIsLoadOperation;
367373

368374
case kS390_StoreWord8:

src/compiler/backend/s390/instruction-selector-s390.cc

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2808,6 +2808,24 @@ void InstructionSelector::VisitLoadTransform(Node* node) {
28082808
case LoadTransformation::kS128Load64Splat:
28092809
opcode = kS390_S128Load64Splat;
28102810
break;
2811+
case LoadTransformation::kS128Load8x8S:
2812+
opcode = kS390_S128Load8x8S;
2813+
break;
2814+
case LoadTransformation::kS128Load8x8U:
2815+
opcode = kS390_S128Load8x8U;
2816+
break;
2817+
case LoadTransformation::kS128Load16x4S:
2818+
opcode = kS390_S128Load16x4S;
2819+
break;
2820+
case LoadTransformation::kS128Load16x4U:
2821+
opcode = kS390_S128Load16x4U;
2822+
break;
2823+
case LoadTransformation::kS128Load32x2S:
2824+
opcode = kS390_S128Load32x2S;
2825+
break;
2826+
case LoadTransformation::kS128Load32x2U:
2827+
opcode = kS390_S128Load32x2U;
2828+
break;
28112829
default:
28122830
UNREACHABLE();
28132831
}

src/execution/s390/simulator-s390.cc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -760,6 +760,7 @@ void Simulator::EvalTableInit() {
760760
V(vlr, VLR, 0xE756) /* type = VRR_A VECTOR LOAD */ \
761761
V(vstef, VSTEF, 0xE70B) /* type = VRX VECTOR STORE ELEMENT (32) */ \
762762
V(vlef, VLEF, 0xE703) /* type = VRX VECTOR LOAD ELEMENT (32) */ \
763+
V(vleg, VLEG, 0xE702) /* type = VRX VECTOR LOAD ELEMENT (64) */ \
763764
V(vavgl, VAVGL, 0xE7F0) /* type = VRR_C VECTOR AVERAGE LOGICAL */ \
764765
V(va, VA, 0xE7F3) /* type = VRR_C VECTOR ADD */ \
765766
V(vs, VS, 0xE7F7) /* type = VRR_C VECTOR SUBTRACT */ \
@@ -3205,6 +3206,15 @@ EVALUATE(VLEF) {
32053206
return length;
32063207
}
32073208

3209+
// Simulator handler for VLEG (listed in the opcode table as
// "VRX VECTOR LOAD ELEMENT (64)"): reads one doubleword from the decoded
// operand address and stores it into lane |m3| of SIMD register |r1|.
EVALUATE(VLEG) {
3210+
  DCHECK_OPCODE(VLEG);
3211+
  // Decodes the VRX fields into r1, x2, b2, d2 and m3.
  DECODE_VRX_INSTRUCTION(r1, x2, b2, d2, m3);
3212+
  const intptr_t element_address = GET_ADDRESS(x2, b2, d2);
3213+
  const uint64_t doubleword = ReadDW(element_address);
3214+
  set_simd_register_by_lane<uint64_t>(r1, m3, doubleword);
3215+
  // NOTE(review): |length| is not declared here; presumably it is provided
  // by the decode macro -- confirm.
  return length;
3216+
}
3217+
32083218
// TODO(john): unify most fp binary operations
32093219
template <class T, class Operation>
32103220
inline static void VectorBinaryOp(Simulator* sim, int dst, int src1, int src2,

0 commit comments

Comments
 (0)