8 changes: 0 additions & 8 deletions aten/src/ATen/native/LegacyBridge.cpp
@@ -22,14 +22,6 @@ namespace {

// TODO: Maybe the foo_ variants should call th_foo_

Tensor norm(const Tensor & self, Scalar p) {
if (_has_native(self)) {
return native_norm(self, p);
} else {
return th_norm(self, p);
}
}

Tensor clone(const Tensor& self) {
if (_has_native(self)) {
return native_clone(self);
56 changes: 51 additions & 5 deletions aten/src/ATen/native/ReduceOps.cpp
@@ -19,6 +19,7 @@ namespace native {

DEFINE_DISPATCH(sum_kernel);
DEFINE_DISPATCH(prod_kernel);
DEFINE_DISPATCH(norm_kernel);

static inline Tensor integer_upcast(const Tensor& self, optional<ScalarType> dtype) {
ScalarType scalarType = self.type().scalarType();
@@ -584,23 +585,68 @@ Tensor& _sum_out(Tensor &result, const Tensor &self, IntList dims, bool keepdim)
return reduce_multi_associative_out<_sum, _sum_out>(result, self, dims, keepdim);
}

Tensor norm(const Tensor& self, Scalar p, int64_t dim, bool keepdim) {
Tensor result = self.type().tensor();
return at::native::norm_out(result, self, p, dim, keepdim);
Tensor& _norm_out_cpu(Tensor& result, const Tensor& self, Scalar p, int64_t dim_, bool keepdim) {
int64_t dim = maybe_wrap_dim(dim_, self.dim());
if (_dimreduce_return_trivial(result, self, 0, dim, keepdim))
return result;
if (self.is_contiguous() && result.is_contiguous()) {
_dimreduce_setup(result, self, dim);
norm_kernel(kCPU, result, self, p, dim);
if (!keepdim) {
result.squeeze_(dim);
}
return result;
} else {
return at::_th_norm_out(result, self, p, dim, keepdim);
}
}

Tensor &norm_out(Tensor &result, const Tensor &self, Scalar p, int64_t dim, bool keepdim) {
Tensor& norm_out(Tensor &result, const Tensor &self, Scalar p, int64_t dim, bool keepdim) {
AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA,
"norm only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend()));
AT_CHECK(at::isFloatingType(self.type().scalarType()), "norm only supports floating-point dtypes");
dim = maybe_wrap_dim(dim, self.dim());
if (_dimreduce_return_trivial(result, self, 0, dim, keepdim)) {
return result;
} else {
return at::_th_norm_out(result, self, p, dim, keepdim);
if (self.is_cuda()) {
return at::_th_norm_out(result, self, p, dim, keepdim);
} else {
return _norm_out_cpu(result, self, p, dim, keepdim);
}
}
}

Tensor _norm(const Tensor &self, Scalar p) {
if (self.type().is_sparse()) {
return at::native_norm(self, p);
} else {
AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA,
"norm only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend()));
AT_CHECK(at::isFloatingType(self.type().scalarType()), "norm only supports floating-point dtypes");
if (self.is_cuda()) {
return at::th_norm(self, p);
} else {
if (self.is_contiguous()) {
Tensor result = CPU(kFloat).scalarTensor(0).toType(self.type());
norm_kernel(kCPU, result, self, p, nullopt);
return result;
} else {
return at::th_norm(self, p);
}
}
}
}

Tensor norm(const Tensor& self, Scalar p, int64_t dim, bool keepdim) {
Tensor result = self.type().tensor();
return at::native::norm_out(result, self, p, dim, keepdim);
}

Tensor norm(const Tensor& self, Scalar p) {
return at::native::_norm(self, p);
}
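
For context, a minimal usage sketch (not part of the diff) of how the two overloads above are reached; the tensor shape, values, and the factory call are illustrative assumptions:

#include <ATen/ATen.h>

// Hypothetical call sites; both overloads route into the functions above.
void norm_usage_sketch() {
  at::Tensor t = at::randn({4, 5});               // assumed: contiguous CPU float tensor
  at::Tensor full = at::norm(t, 2);               // norm(self, p) -> _norm -> vectorized norm_kernel
  at::Tensor rows = at::norm(t, 2, /*dim=*/1,
                             /*keepdim=*/false);  // norm(self, p, dim, keepdim) -> norm_out -> _norm_out_cpu
}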

Tensor all(const Tensor& self, int64_t dim, bool keepdim) {
Tensor result = self.type().tensor();
return at::native::all_out(result, self, dim, keepdim);
154 changes: 154 additions & 0 deletions aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
@@ -183,9 +183,163 @@ static void prod_kernel_impl(Tensor& result, const Tensor& self, at::optional<in
});
}

template<typename scalar_t>
struct NormReduction {
// reduction width in number of scalar elements
static constexpr int WIDTH = 128 / sizeof(scalar_t);
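// e.g. 32 floats or 16 doubles: 128 bytes, i.e. four Vec256 accumulators spanning two 64-byte cache lines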
using Vec = Vec256<scalar_t>;

static void apply(Tensor& res, const Tensor& self, Scalar p, at::optional<int64_t> dim) {
auto out_ = res.data<scalar_t>();
auto data_ = self.data<scalar_t>();
auto numel = self.numel();
float pval = 0.0;
if (p.isIntegral()){
pval = p.to<int64_t>();
} else if (p.isFloatingPoint()) {
pval = p.to<float>();
}
if (!dim.has_value()) {
*out_ = reduce_all(data_, numel, pval);
return;
}
int64_t n = self.size(*dim);
int64_t stride = self.stride(*dim);
// A contiguous tensor does not need to hold a meaningful stride
// if the corresponding size is 1
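// e.g. a contiguous tensor of sizes [4, 1, 3] reduced over dim 1: the recomputed
// stride below is 3, the product of the sizes after dim.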
if (n == 1) {
stride = 1;
for (int64_t i = self.ndimension() - 1; i > *dim; i--) {
stride *= self.size(i);
}
}
int64_t batch = numel / n;
parallel_for(0, batch, 1, [=](int64_t begin, int64_t end) {
for (int64_t bi = begin; bi < end; bi++) {
int64_t b = bi / stride;
int64_t i = bi % stride;
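// View the input as [outer, n, inner] with inner == stride: b indexes the outer
// block, i the inner offset, and the reduction runs over the middle dimension.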
const scalar_t* data = &data_[b * n * stride + i];
out_[bi] = norm_reduce(data, n, stride, pval);
}
});
}

static scalar_t reduce_all(const scalar_t* data_, int64_t size, float pval) {
scalar_t sum = parallel_reduce(
0,
size,
internal::GRAIN_SIZE,
(scalar_t)0,
[=](int64_t begin, int64_t end, scalar_t init) {
const scalar_t* data = &data_[begin];
int64_t n = end - begin;
scalar_t result = norm_reduce(data, n, 1, pval);
return result;
},
std::plus<scalar_t>());
return sum;
}

static scalar_t norm_reduce(const scalar_t* data, int64_t n, int64_t stride, float pval) {
scalar_t result = 0.0;
if (stride == 1 && (pval == 1 || pval == 2 || pval == 3) && n >= WIDTH) {
int64_t n_rounded = round_down(n, WIDTH);
scalar_t result1 = norm_reduce128(data, n_rounded, pval);
scalar_t result2 = norm_reduce_sequential(data + n_rounded, n - n_rounded, stride, pval);
result = std::pow(std::pow(result1, pval) + std::pow(result2, pval), 1.0/pval);
} else {
result = norm_reduce_sequential(data, n, stride, pval);
}
return result;
}

static scalar_t norm_reduce_sequential(const scalar_t* data, int64_t n, int64_t stride, float pval) {
scalar_t result = 0.0;
if (pval == 0) {
for (int64_t k = 0; k < n; k++) {
result += (data[k * stride] != 0.0);
}
} else if (pval == 1) {
for (int64_t k = 0; k < n; k++) {
result += std::abs(data[k * stride]);
}
} else if (pval == 2) {
for (int64_t k = 0; k < n; k++) {
result += data[k * stride] * data[k * stride];
}
result = std::sqrt(result);
} else if (pval == 3) {
for (int64_t k = 0; k < n; k++) {
result += std::abs(data[k * stride] * data[k * stride] * data[k * stride]);
}
result = std::pow(result, 1.0/3);
} else if (std::isinf(pval)) {
for (int64_t k = 0; k < n; k++) {
result = std::abs(data[k * stride]) > result ? std::abs(data[k * stride]) : result;
}
} else {
for (int64_t k = 0; k < n; k++) {
result += std::pow(std::abs(data[k * stride]), pval);
}
result = std::pow(result, 1.0/pval);
}
return result;
}

// Reduce down a column of WIDTH elements (128 bytes) with the given number n
// n is already rounded down to a multiple of WIDTH
static scalar_t norm_reduce128(const scalar_t* data, int64_t n, float pval) {
scalar_t result = 0.0;
Vec acc[4] = {0.0, 0.0, 0.0, 0.0}; // 128 bytes (two cache lines)
static_assert(sizeof(acc) == 128, "accumulator should be 128 bytes");
int64_t rows = n / WIDTH;
if (pval == 1){
for (int row = 0; row < rows; row ++) {
for (int j = 0; j != 4; j++) {
auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]);
acc[j] = acc[j] + val.abs();
}
}
}
else if (pval == 2) {
for (int row = 0; row < rows; row ++) {
for (int j = 0; j != 4; j++) {
auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]);
acc[j] = acc[j] + val * val;
}
}
}
else if (pval == 3) {
for (int row = 0; row < rows; row ++) {
for (int j = 0; j != 4; j++) {
auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]);
acc[j] = acc[j] + (val * val * val).abs();
}
}
}
scalar_t buf[WIDTH] = {0};
for (int j = 0; j != 4; j++) {
acc[j].store(&buf[j * Vec::size]);
}
for (int i = 0; i < WIDTH; i++) {
result += buf[i];
}
result = std::pow(result, 1.0/pval);
return result;
}
};

static void norm_kernel_impl(Tensor& result, const Tensor& self, Scalar p, at::optional<int64_t> dim) {
AT_DISPATCH_FLOATING_TYPES(self.type(), "norm", [&] {
NormReduction<scalar_t>::apply(result, self, p, dim);
});
}

} // anonymous namespace

REGISTER_DISPATCH(sum_kernel, &sum_kernel_impl);
REGISTER_DISPATCH(prod_kernel, &prod_kernel_impl);
REGISTER_DISPATCH(norm_kernel, &norm_kernel_impl);

}} // namespace at::native
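
To make the vectorized code above easier to check, here is a scalar reference (a sketch, not part of the diff; the function name and the [outer, n, inner] naming are mine) of what the dim-reduction path computes for a general p > 0; p == 0 and p == inf are special-cased separately in norm_reduce_sequential:

#include <cmath>
#include <cstdint>

// Reference semantics: out[b * inner + i] = (sum_k |in[b * n * inner + k * inner + i]|^p)^(1/p),
// where the input is viewed as [outer, n, inner] and the reduction runs over the middle dim.
template <typename scalar_t>
void norm_dim_reference(scalar_t* out, const scalar_t* in,
                        int64_t outer, int64_t n, int64_t inner, float p) {
  for (int64_t b = 0; b < outer; b++) {
    for (int64_t i = 0; i < inner; i++) {
      scalar_t acc = 0;
      for (int64_t k = 0; k < n; k++) {
        acc += std::pow(std::abs(in[b * n * inner + k * inner + i]), p);
      }
      out[b * inner + i] = std::pow(acc, scalar_t(1) / p);
    }
  }
}
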
3 changes: 3 additions & 0 deletions aten/src/ATen/native/cpu/ReduceOpsKernel.h
@@ -11,4 +11,7 @@ using reduce_fn = void(*)(Tensor &, const Tensor &, at::optional<int64_t>);
DECLARE_DISPATCH(reduce_fn, sum_kernel);
DECLARE_DISPATCH(reduce_fn, prod_kernel);

using reduce_norm_fn = void(*)(Tensor &, const Tensor &, Scalar, at::optional<int64_t>);
DECLARE_DISPATCH(reduce_norm_fn, norm_kernel);

}} // namespace at::native
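
Taken together, the three macro sites follow ATen's dispatch-stub pattern for CPU kernels; roughly (a simplified summary, not the actual macro expansion):

// ReduceOpsKernel.h     DECLARE_DISPATCH(reduce_norm_fn, norm_kernel);       declares the norm_kernel stub
// ReduceOps.cpp         DEFINE_DISPATCH(norm_kernel);                        defines the stub object
// ReduceOpsKernel.cpp   REGISTER_DISPATCH(norm_kernel, &norm_kernel_impl);   registers the CPU implementation
//                       (files under native/cpu are typically built per CPU capability, so the stub can
//                        select a vectorized build at runtime)
// Call sites            norm_kernel(kCPU, result, self, p, dim);             route through the stub to norm_kernel_impl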