8 changes: 0 additions & 8 deletions aten/src/ATen/native/LegacyBridge.cpp
@@ -22,14 +22,6 @@ namespace {

// TODO: Maybe the foo_ variants should call th_foo_

Tensor norm(const Tensor & self, Scalar p) {
if (_has_native(self)) {
return native_norm(self, p);
} else {
return th_norm(self, p);
}
}

Tensor clone(const Tensor& self) {
if (_has_native(self)) {
return native_clone(self);
56 changes: 51 additions & 5 deletions aten/src/ATen/native/ReduceOps.cpp
@@ -19,6 +19,7 @@ namespace native {

DEFINE_DISPATCH(sum_kernel);
DEFINE_DISPATCH(prod_kernel);
DEFINE_DISPATCH(norm_kernel);

static inline Tensor integer_upcast(const Tensor& self, optional<ScalarType> dtype) {
ScalarType scalarType = self.type().scalarType();
@@ -584,23 +585,68 @@ Tensor& _sum_out(Tensor &result, const Tensor &self, IntList dims, bool keepdim)
return reduce_multi_associative_out<_sum, _sum_out>(result, self, dims, keepdim);
}

Tensor norm(const Tensor& self, Scalar p, int64_t dim, bool keepdim) {
Tensor result = self.type().tensor();
return at::native::norm_out(result, self, p, dim, keepdim);
Tensor& _norm_out_cpu(Tensor& result, const Tensor& self, Scalar p, int64_t dim_, bool keepdim) {
int64_t dim = maybe_wrap_dim(dim_, self.dim());
if (_dimreduce_return_trivial(result, self, 0, dim, keepdim))
return result;
if (self.is_contiguous() && result.is_contiguous()) {
_dimreduce_setup(result, self, dim);
norm_kernel(kCPU, result, self, p, dim);
if (!keepdim) {
result.squeeze_(dim);
}
return result;
} else {
return at::_th_norm_out(result, self, p, dim, keepdim);
}
}

Tensor &norm_out(Tensor &result, const Tensor &self, Scalar p, int64_t dim, bool keepdim) {
Tensor& norm_out(Tensor &result, const Tensor &self, Scalar p, int64_t dim, bool keepdim) {
AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA,
"norm only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend()));
AT_CHECK(at::isFloatingType(self.type().scalarType()), "norm only supports floating-point dtypes");
dim = maybe_wrap_dim(dim, self.dim());
if (_dimreduce_return_trivial(result, self, 0, dim, keepdim)) {
return result;
} else {
return at::_th_norm_out(result, self, p, dim, keepdim);
if (self.is_cuda()) {
return at::_th_norm_out(result, self, p, dim, keepdim);
} else {
return _norm_out_cpu(result, self, p, dim, keepdim);
}
}
}

Tensor _norm(const Tensor &self, Scalar p) {
if (self.type().is_sparse()) {
return at::native_norm(self, p);
} else {
AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA,
"norm only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend()));
AT_CHECK(at::isFloatingType(self.type().scalarType()), "norm only supports floating-point dtypes");
if (self.is_cuda()) {
return at::th_norm(self, p);
} else {
if (self.is_contiguous()) {
Tensor result = CPU(kFloat).scalarTensor(0).toType(self.type());
norm_kernel(kCPU, result, self, p, nullopt);
return result;
} else {
return at::th_norm(self, p);
}
}
}
}

Tensor norm(const Tensor& self, Scalar p, int64_t dim, bool keepdim) {
Tensor result = self.type().tensor();
return at::native::norm_out(result, self, p, dim, keepdim);
}

Tensor norm(const Tensor& self, Scalar p) {
return at::native::_norm(self, p);
}
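
For context, a minimal usage sketch (not part of the diff) of how the two overloads above are reached; the tensor shape, values, and the factory call are illustrative assumptions:

#include <ATen/ATen.h>

// Hypothetical call sites; both overloads route into the functions above.
void norm_usage_sketch() {
  at::Tensor t = at::randn({4, 5});               // assumed: contiguous CPU float tensor
  at::Tensor full = at::norm(t, 2);               // norm(self, p) -> _norm -> vectorized norm_kernel
  at::Tensor rows = at::norm(t, 2, /*dim=*/1,
                             /*keepdim=*/false);  // norm(self, p, dim, keepdim) -> norm_out -> _norm_out_cpu
}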

Tensor all(const Tensor& self, int64_t dim, bool keepdim) {
Tensor result = self.type().tensor();
return at::native::all_out(result, self, dim, keepdim);
154 changes: 154 additions & 0 deletions aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
@@ -183,9 +183,163 @@ static void prod_kernel_impl(Tensor& result, const Tensor& self, at::optional<in
});
}

template<typename scalar_t>
struct NormReduction {
// reduction width in number of scalar elements
static constexpr int WIDTH = 128 / sizeof(scalar_t);
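// e.g. 32 floats or 16 doubles: 128 bytes, i.e. four Vec256 accumulators spanning two 64-byte cache lines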
using Vec = Vec256<scalar_t>;

static void apply(Tensor& res, const Tensor& self, Scalar p, at::optional<int64_t> dim) {
auto out_ = res.data<scalar_t>();
auto data_ = self.data<scalar_t>();
auto numel = self.numel();
float pval = 0.0;
if (p.isIntegral()){
pval = p.to<int64_t>();
} else if (p.isFloatingPoint()) {
pval = p.to<float>();
}
if (!dim.has_value()) {
*out_ = reduce_all(data_, numel, pval);
return;
}
int64_t n = self.size(*dim);
int64_t stride = self.stride(*dim);
// A contiguous tensor does not need to hold a meaningful stride
// if the corresponding size is 1
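// e.g. a contiguous tensor of sizes [4, 1, 3] reduced over dim 1: the recomputed
// stride below is 3, the product of the sizes after dim.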
if (n == 1) {
stride = 1;
for (int64_t i = self.ndimension() - 1; i > *dim; i--) {
stride *= self.size(i);
}
}
int64_t batch = numel / n;
parallel_for(0, batch, 1, [=](int64_t begin, int64_t end) {
for (int64_t bi = begin; bi < end; bi++) {
int64_t b = bi / stride;
int64_t i = bi % stride;
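// View the input as [outer, n, inner] with inner == stride: b indexes the outer
// block, i the inner offset, and the reduction runs over the middle dimension.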
const scalar_t* data = &data_[b * n * stride + i];
out_[bi] = norm_reduce(data, n, stride, pval);
}
});
}

static scalar_t reduce_all(const scalar_t* data_, int64_t size, float pval) {
scalar_t sum = parallel_reduce(
0,
size,
internal::GRAIN_SIZE,
(scalar_t)0,
[=](int64_t begin, int64_t end, scalar_t init) {
const scalar_t* data = &data_[begin];
int64_t n = end - begin;
scalar_t result = norm_reduce(data, n, 1, pval);
return result;
},
std::plus<scalar_t>());
return sum;
}

static scalar_t norm_reduce(const scalar_t* data, int64_t n, int64_t stride, float pval) {
scalar_t result = 0.0;
if (stride == 1 && (pval == 1 || pval == 2 || pval == 3) && n >= WIDTH) {
int64_t n_rounded = round_down(n, WIDTH);
scalar_t result1 = norm_reduce128(data, n_rounded, pval);
scalar_t result2 = norm_reduce_sequential(data + n_rounded, n - n_rounded, stride, pval);
result = std::pow(std::pow(result1, pval) + std::pow(result2, pval), 1.0/pval);
} else {
result = norm_reduce_sequential(data, n, stride, pval);
}
return result;
}

static scalar_t norm_reduce_sequential(const scalar_t* data, int64_t n, int64_t stride, float pval) {
scalar_t result = 0.0;
if (pval == 0) {
for (int64_t k = 0; k < n; k++) {
result += (data[k * stride] != 0.0);
}
} else if (pval == 1) {
for (int64_t k = 0; k < n; k++) {
result += std::abs(data[k * stride]);
}
} else if (pval == 2) {
for (int64_t k = 0; k < n; k++) {
result += data[k * stride] * data[k * stride];
}
result = std::sqrt(result);
} else if (pval == 3) {
for (int64_t k = 0; k < n; k++) {
result += std::abs(data[k * stride] * data[k * stride] * data[k * stride]);
}
result = std::pow(result, 1.0/3);
} else if (std::isinf(pval)) {
for (int64_t k = 0; k < n; k++) {
result = std::abs(data[k * stride]) > result ? std::abs(data[k * stride]) : result;
}
} else {
for (int64_t k = 0; k < n; k++) {
result += std::pow(std::abs(data[k * stride]), pval);
}
result = std::pow(result, 1.0/pval);
}
return result;
}

// Reduce down a column of WIDTH elements (128 bytes) with the given number n
// n is already rounded down to a multiple of WIDTH
static scalar_t norm_reduce128(const scalar_t* data, int64_t n, float pval) {
scalar_t result = 0.0;
Vec acc[4] = {0.0, 0.0, 0.0, 0.0}; // 128 bytes (two cache lines)
static_assert(sizeof(acc) == 128, "accumulator should be 128 bytes");
int64_t rows = n / WIDTH;
if (pval == 1){
for (int row = 0; row < rows; row ++) {
for (int j = 0; j != 4; j++) {
auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]);
acc[j] = acc[j] + val.abs();
}
}
}
else if (pval == 2) {
for (int row = 0; row < rows; row ++) {
for (int j = 0; j != 4; j++) {
auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]);
acc[j] = acc[j] + val * val;
}
}
}
else if (pval == 3) {
for (int row = 0; row < rows; row ++) {
for (int j = 0; j != 4; j++) {
auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]);
acc[j] = acc[j] + (val * val * val).abs();
}
}
}
scalar_t buf[WIDTH] = {0};
for (int j = 0; j != 4; j++) {
acc[j].store(&buf[j * Vec::size]);
}
for (int i = 0; i < WIDTH; i++) {
result += buf[i];
}
result = std::pow(result, 1.0/pval);
return result;
}
};

static void norm_kernel_impl(Tensor& result, const Tensor& self, Scalar p, at::optional<int64_t> dim) {
AT_DISPATCH_FLOATING_TYPES(self.type(), "norm", [&] {
NormReduction<scalar_t>::apply(result, self, p, dim);
});
}

} // anonymous namespace

REGISTER_DISPATCH(sum_kernel, &sum_kernel_impl);
REGISTER_DISPATCH(prod_kernel, &prod_kernel_impl);
REGISTER_DISPATCH(norm_kernel, &norm_kernel_impl);

}} // namespace at::native
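
To make the vectorized code above easier to check, here is a scalar reference (a sketch, not part of the diff; the function name and the [outer, n, inner] naming are mine) of what the dim-reduction path computes for a general p > 0; p == 0 and p == inf are special-cased separately in norm_reduce_sequential:

#include <cmath>
#include <cstdint>

// Reference semantics: out[b * inner + i] = (sum_k |in[b * n * inner + k * inner + i]|^p)^(1/p),
// where the input is viewed as [outer, n, inner] and the reduction runs over the middle dim.
template <typename scalar_t>
void norm_dim_reference(scalar_t* out, const scalar_t* in,
                        int64_t outer, int64_t n, int64_t inner, float p) {
  for (int64_t b = 0; b < outer; b++) {
    for (int64_t i = 0; i < inner; i++) {
      scalar_t acc = 0;
      for (int64_t k = 0; k < n; k++) {
        acc += std::pow(std::abs(in[b * n * inner + k * inner + i]), p);
      }
      out[b * inner + i] = std::pow(acc, scalar_t(1) / p);
    }
  }
}
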
3 changes: 3 additions & 0 deletions aten/src/ATen/native/cpu/ReduceOpsKernel.h
@@ -11,4 +11,7 @@ using reduce_fn = void(*)(Tensor &, const Tensor &, at::optional<int64_t>);
DECLARE_DISPATCH(reduce_fn, sum_kernel);
DECLARE_DISPATCH(reduce_fn, prod_kernel);

using reduce_norm_fn = void(*)(Tensor &, const Tensor &, Scalar, at::optional<int64_t>);
DECLARE_DISPATCH(reduce_norm_fn, norm_kernel);

}} // namespace at::native
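
Taken together, the three macro sites follow ATen's dispatch-stub pattern for CPU kernels; roughly (a simplified summary, not the actual macro expansion):

// ReduceOpsKernel.h     DECLARE_DISPATCH(reduce_norm_fn, norm_kernel);       declares the norm_kernel stub
// ReduceOps.cpp         DEFINE_DISPATCH(norm_kernel);                        defines the stub object
// ReduceOpsKernel.cpp   REGISTER_DISPATCH(norm_kernel, &norm_kernel_impl);   registers the CPU implementation
//                       (files under native/cpu are typically built per CPU capability, so the stub can
//                        select a vectorized build at runtime)
// Call sites            norm_kernel(kCPU, result, self, p, dim);             route through the stub to norm_kernel_impl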