Skip to content

Commit 48fc70c

Browse files
committed
Add support for C <= 4 in AVX version
1 parent 86b13cb commit 48fc70c

File tree

2 files changed

+42
-26
lines changed

2 files changed

+42
-26
lines changed

aten/src/ATen/native/cpu/UpSampleKernel.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1808,7 +1808,8 @@ void upsample_bilinear2d_aa_kernel_impl(
18081808
c10::optional<double> scales_w) {
18091809
#ifdef CPU_CAPABILITY_AVX2
18101810
// Don't worry about the weird "== 1" check: it is only a temporary hack to compare the AVX implementation against the fallback, and it will be removed.
1811-
if ((input[0][0][0][0].item<uint8_t>() == 1) && (input.dtype() == at::kByte)) { // TODO: add more assumptions as needed
1811+
// TODO: add more assumptions as needed
1812+
if ((input[0][0][0][0].item<uint8_t>() == 1) && (input.dtype() == at::kByte) && (input.size(1) <= 4)) {
18121813
input[0][0][0][0] = 0; // TODO: remove this atrocity !!!
18131814
beepidiboop(input, output);
18141815
// std::cout << "fast path" << std::endl;

aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h

Lines changed: 40 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
#pragma once
22
#ifdef CPU_CAPABILITY_AVX2
33

4-
5-
#include <ATen/core/Tensor.h>
64
#include <ATen/Context.h>
75
#include <ATen/Dispatch.h>
86
#include <ATen/Parallel.h>
97
#include <ATen/TensorIterator.h>
8+
#include <ATen/core/Tensor.h>
109
#include <ATen/cpu/vec/vec.h>
1110
#include <ATen/native/UpSample.h>
1211
#include <ATen/native/cpu/utils.h>
@@ -72,33 +71,40 @@ static inline double bilinear_filter(double x) {
7271
return 0.0;
7372
}
7473

75-
void unpack_rgb(uint8_t* unpacked, const uint8_t* packed, int num_pixels)
74+
void unpack_rgb(
75+
uint8_t* unpacked,
76+
const uint8_t* packed,
77+
int num_pixels,
78+
int num_channels)
7679
// TODO: maybe use faster version from
7780
// https://github.com/python-pillow/Pillow/pull/2693/files
7881
{
79-
int i;
80-
for (i = 0; i < num_pixels; i++) {
81-
unpacked[0] = packed[0];
82-
unpacked[1] = packed[1];
83-
unpacked[2] = packed[2];
84-
unpacked[3] = 255;
82+
for (const auto i : c10::irange(num_pixels)) {
83+
for (const auto j : c10::irange(num_channels)) {
84+
unpacked[j] = packed[j];
85+
}
86+
for (const auto j : c10::irange(num_channels, 4)) {
87+
unpacked[j] = 255;
88+
}
8589
unpacked += 4;
86-
packed += 3;
90+
packed += num_channels;
8791
}
8892
}
8993

90-
void pack_rgb(uint8_t* packed, const uint8_t* unpacked, int num_pixels)
94+
void pack_rgb(
95+
uint8_t* packed,
96+
const uint8_t* unpacked,
97+
int num_pixels,
98+
int num_channels)
9199
// TODO: maybe use faster version from
92100
// https://github.com/python-pillow/Pillow/pull/2693/files
93101
{
94-
int i;
95-
/* RGB triplets */
96-
for (i = 0; i < num_pixels; i++) {
97-
packed[0] = unpacked[0];
98-
packed[1] = unpacked[1];
99-
packed[2] = unpacked[2];
100-
packed += 3;
102+
for (const auto i : c10::irange(num_pixels)) {
103+
for (const auto j : c10::irange(num_channels)) {
104+
packed[j] = unpacked[j];
105+
}
101106
unpacked += 4;
107+
packed += num_channels;
102108
}
103109
}
104110

@@ -324,8 +330,7 @@ void ImagingResampleVertical_8bpc(
324330
// use the same buffer for normalized coefficients
325331
kk = (INT16*)prekk;
326332
coefs_precision = normalize_coeffs_8bpc(yout, ksize, prekk);
327-
// std::cout << "BB " << coefs_precision << std::endl;
328-
333+
// std::cout << "BB " << coefs_precision << std::endl;
329334

330335
for (yy = 0; yy < yout; yy++) {
331336
k = &kk[yy * ksize];
@@ -455,6 +460,8 @@ UINT32* ImagingResampleInner(
455460
void beepidiboop(const at::Tensor& input, const at::Tensor& output) {
456461
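// Assume shape is N, C, H, W with C <= 4 (checked by the caller) and layout is channels_last -- NOTE(review): layout is not verified here, confirm at the call site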
457462

463+
auto batch_size = input.size(0);
464+
auto num_channels = input.size(1);
458465
auto xin = input.size(3);
459466
auto yin = input.size(2);
460467
auto xout = output.size(3);
@@ -464,19 +471,27 @@ void beepidiboop(const at::Tensor& input, const at::Tensor& output) {
464471

465472
UINT32* unpacked_input_p = (UINT32*)malloc(sizeof(UINT32) * num_pixels_input);
466473

467-
for (const auto i : c10::irange(input.size(0))) {
468-
const uint8_t* packed_input_p = (const uint8_t*)input[i].data_ptr<uint8_t>();
469-
unpack_rgb((uint8_t*)unpacked_input_p, packed_input_p, num_pixels_input);
474+
for (const auto i : c10::irange(batch_size)) {
475+
const uint8_t* packed_input_p =
476+
(const uint8_t*)input[i].data_ptr<uint8_t>();
477+
unpack_rgb(
478+
(uint8_t*)unpacked_input_p,
479+
packed_input_p,
480+
num_pixels_input,
481+
num_channels);
470482

471483
UINT32* unpacked_output_p =
472484
ImagingResampleInner(unpacked_input_p, xin, yin, xout, yout);
473485

474486
uint8_t* packed_output_p = (uint8_t*)output[i].data_ptr<uint8_t>();
475487
pack_rgb(
476-
packed_output_p, (const uint8_t*)unpacked_output_p, num_pixels_output);
488+
packed_output_p,
489+
(const uint8_t*)unpacked_output_p,
490+
num_pixels_output,
491+
num_channels);
477492
}
478493

479-
free(unpacked_input_p);
494+
free(unpacked_input_p);
480495
}
481496

482497
void ImagingResampleHorizontalConvolution8u4x(

0 commit comments

Comments
 (0)