11#pragma once
22#ifdef CPU_CAPABILITY_AVX2
33
4-
5- #include < ATen/core/Tensor.h>
64#include < ATen/Context.h>
75#include < ATen/Dispatch.h>
86#include < ATen/Parallel.h>
97#include < ATen/TensorIterator.h>
8+ #include < ATen/core/Tensor.h>
109#include < ATen/cpu/vec/vec.h>
1110#include < ATen/native/UpSample.h>
1211#include < ATen/native/cpu/utils.h>
@@ -72,33 +71,40 @@ static inline double bilinear_filter(double x) {
7271 return 0.0 ;
7372}
7473
// Expands `num_pixels` tightly packed pixels of `num_channels` bytes each
// (read from `packed`) into fixed-width 4-byte pixels written to `unpacked`.
//
// For every pixel the first `num_channels` bytes are copied verbatim and the
// remaining bytes of the 4-byte slot are set to 255. `unpacked` must have room
// for 4 * num_pixels bytes and `packed` for num_channels * num_pixels bytes.
//
// The number of bytes copied per pixel is clamped to [0, 4]: an unpacked pixel
// only has four slots, so copying more would write past the slot (and past the
// end of the buffer on the last pixel). `packed` still advances by
// `num_channels` so the input stride is honoured.
//
// TODO: maybe use faster version from
// https://github.com/python-pillow/Pillow/pull/2693/files
void unpack_rgb(
    uint8_t* unpacked,
    const uint8_t* packed,
    int num_pixels,
    int num_channels) {
  constexpr int kUnpackedChannels = 4;

  int copied = num_channels;
  if (copied < 0) {
    copied = 0;
  } else if (copied > kUnpackedChannels) {
    copied = kUnpackedChannels;
  }

  for (int pixel = 0; pixel < num_pixels; ++pixel) {
    for (int c = 0; c < copied; ++c) {
      unpacked[c] = packed[c];
    }
    // Pad the unused trailing channels with 255.
    for (int c = copied; c < kUnpackedChannels; ++c) {
      unpacked[c] = 255;
    }
    unpacked += kUnpackedChannels;
    packed += num_channels;
  }
}
8993
// Compacts `num_pixels` fixed-width 4-byte pixels (read from `unpacked`) into
// tightly packed pixels of `num_channels` bytes each, written to `packed`.
//
// For every pixel the first `num_channels` bytes of the 4-byte slot are copied
// and the remaining bytes of the slot are dropped. `unpacked` must hold
// 4 * num_pixels bytes and `packed` must have room for
// num_channels * num_pixels bytes.
//
// The number of bytes copied per pixel is clamped to [0, 4]: an unpacked pixel
// only has four slots, so reading more would run into the next pixel (and past
// the end of the buffer on the last pixel). `packed` still advances by
// `num_channels` so the output stride is honoured; any bytes beyond the fourth
// channel are left untouched.
//
// TODO: maybe use faster version from
// https://github.com/python-pillow/Pillow/pull/2693/files
void pack_rgb(
    uint8_t* packed,
    const uint8_t* unpacked,
    int num_pixels,
    int num_channels) {
  constexpr int kUnpackedChannels = 4;

  int copied = num_channels;
  if (copied < 0) {
    copied = 0;
  } else if (copied > kUnpackedChannels) {
    copied = kUnpackedChannels;
  }

  for (int pixel = 0; pixel < num_pixels; ++pixel) {
    for (int c = 0; c < copied; ++c) {
      packed[c] = unpacked[c];
    }
    unpacked += kUnpackedChannels;
    packed += num_channels;
  }
}
104110
@@ -324,8 +330,7 @@ void ImagingResampleVertical_8bpc(
324330 // use the same buffer for normalized coefficients
325331 kk = (INT16*)prekk;
326332 coefs_precision = normalize_coeffs_8bpc (yout, ksize, prekk);
327- // std::cout << "BB " << coefs_precision << std::endl;
328-
333+ // std::cout << "BB " << coefs_precision << std::endl;
329334
330335 for (yy = 0 ; yy < yout; yy++) {
331336 k = &kk[yy * ksize];
@@ -455,6 +460,8 @@ UINT32* ImagingResampleInner(
455460void beepidiboop (const at::Tensor& input, const at::Tensor& output) {
456461 // Assume shape is 1, 3, H, W and layout is channels_last
457462
463+ auto batch_size = input.size (0 );
464+ auto num_channels = input.size (1 );
458465 auto xin = input.size (3 );
459466 auto yin = input.size (2 );
460467 auto xout = output.size (3 );
@@ -464,19 +471,27 @@ void beepidiboop(const at::Tensor& input, const at::Tensor& output) {
464471
465472 UINT32* unpacked_input_p = (UINT32*)malloc (sizeof (UINT32) * num_pixels_input);
466473
467- for (const auto i : c10::irange (input.size (0 ))) {
468- const uint8_t * packed_input_p = (const uint8_t *)input[i].data_ptr <uint8_t >();
469- unpack_rgb ((uint8_t *)unpacked_input_p, packed_input_p, num_pixels_input);
474+ for (const auto i : c10::irange (batch_size)) {
475+ const uint8_t * packed_input_p =
476+ (const uint8_t *)input[i].data_ptr <uint8_t >();
477+ unpack_rgb (
478+ (uint8_t *)unpacked_input_p,
479+ packed_input_p,
480+ num_pixels_input,
481+ num_channels);
470482
471483 UINT32* unpacked_output_p =
472484 ImagingResampleInner (unpacked_input_p, xin, yin, xout, yout);
473485
474486 uint8_t * packed_output_p = (uint8_t *)output[i].data_ptr <uint8_t >();
475487 pack_rgb (
476- packed_output_p, (const uint8_t *)unpacked_output_p, num_pixels_output);
488+ packed_output_p,
489+ (const uint8_t *)unpacked_output_p,
490+ num_pixels_output,
491+ num_channels);
477492 }
478493
479- free (unpacked_input_p);
494+ free (unpacked_input_p);
480495}
481496
482497void ImagingResampleHorizontalConvolution8u4x (
0 commit comments