7 changes: 1 addition & 6 deletions aten/src/ATen/CMakeLists.txt
@@ -397,7 +397,7 @@ INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake"
DESTINATION "${AT_INSTALL_SHARE_DIR}/cmake/ATen")

# https://stackoverflow.com/questions/11096471/how-can-i-install-a-hierarchy-of-files-using-cmake
-FOREACH(HEADER ${base_h})
+FOREACH(HEADER ${base_h} ${cuda_h} ${cudnn_h})
string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}/" "" HEADER_SUB ${HEADER})
GET_FILENAME_COMPONENT(DIR ${HEADER_SUB} DIRECTORY)
INSTALL(FILES ${HEADER} DESTINATION ${AT_INSTALL_INCLUDE_DIR}/ATen/${DIR})
@@ -406,11 +406,6 @@ FOREACH(HEADER ${generated_h} ${cuda_generated_h})
# NB: Assumed to be flat
INSTALL(FILES ${HEADER} DESTINATION ${AT_INSTALL_INCLUDE_DIR}/ATen)
ENDFOREACH()
-FOREACH(HEADER ${cuda_h})
-string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}/" "" HEADER_SUB ${HEADER})
-GET_FILENAME_COMPONENT(DIR ${HEADER_SUB} DIRECTORY)
-INSTALL(FILES ${HEADER} DESTINATION ${AT_INSTALL_INCLUDE_DIR}/ATen/${DIR})
-ENDFOREACH()
INSTALL(FILES ${CMAKE_BINARY_DIR}/aten/src/ATen/Declarations.yaml
DESTINATION ${AT_INSTALL_SHARE_DIR}/ATen)

13 changes: 7 additions & 6 deletions aten/src/ATen/cudnn/Descriptors.h
@@ -5,6 +5,7 @@
#include "cudnn-wrapper.h"
#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>
#include "ATen/cuda/ATenCUDAGeneral.h"
#include <cuda.h>

#if CUDNN_VERSION < 7000
@@ -100,7 +101,7 @@ struct DescriptorDeleter {
// initialized the first time you call set() or any other initializing
// function.
template <typename T, cudnnStatus_t (*ctor)(T**), cudnnStatus_t (*dtor)(T*)>
-class Descriptor
+class AT_CUDA_API Descriptor
{
public:
// TODO: Figure out why const-correctness doesn't work here
@@ -129,7 +130,7 @@ class Descriptor
std::unique_ptr<T, DescriptorDeleter<T, dtor>> desc_;
};

-class TensorDescriptor
+class AT_CUDA_API TensorDescriptor
: public Descriptor<cudnnTensorStruct,
&cudnnCreateTensorDescriptor,
&cudnnDestroyTensorDescriptor>
@@ -181,7 +182,7 @@ class FilterDescriptor
}
};

-struct ConvolutionDescriptor
+struct AT_CUDA_API ConvolutionDescriptor
: public Descriptor<cudnnConvolutionStruct,
&cudnnCreateConvolutionDescriptor,
&cudnnDestroyConvolutionDescriptor>
@@ -200,7 +201,7 @@ struct ConvolutionDescriptor
}
};

-struct SpatialTransformerDescriptor
+struct AT_CUDA_API SpatialTransformerDescriptor
: public Descriptor<cudnnSpatialTransformerStruct,
&cudnnCreateSpatialTransformerDescriptor,
&cudnnDestroySpatialTransformerDescriptor>
@@ -239,7 +240,7 @@ inline cudnnStatus_t cudnnRestoreDropoutDescriptor(

#endif // CUDNN_VERSION

-struct DropoutDescriptor
+struct AT_CUDA_API DropoutDescriptor
: public Descriptor<cudnnDropoutStruct,
&cudnnCreateDropoutDescriptor,
&cudnnDestroyDropoutDescriptor>
@@ -281,7 +282,7 @@ struct DropoutDescriptor
}
};

-struct RNNDescriptor
+struct AT_CUDA_API RNNDescriptor
: public Descriptor<cudnnRNNStruct,
&cudnnCreateRNNDescriptor,
&cudnnDestroyRNNDescriptor>
80 changes: 80 additions & 0 deletions test/cpp_extensions/cudnn_extension.cpp
@@ -0,0 +1,80 @@
/*
 * CuDNN ReLU extension. A simple function, but it illustrates the general
 * structure of most CuDNN extensions:
 * 1) Check arguments. The at::check* functions provide a standard way to
 *    validate input and emit pretty errors.
* 2) Create descriptors. Most CuDNN functions require creating and setting a
* variety of descriptors.
* 3) Apply the CuDNN function.
* 4) Destroy your descriptors.
* 5) Return something (optional).
*/

#include <torch/torch.h>

#include <ATen/cudnn/Descriptors.h> // for TensorDescriptor
#include <ATen/cudnn/Exceptions.h> // for CUDNN_CHECK
#include <ATen/cudnn/Handles.h> // for getCudnnHandle

// Name of the function in the Python module, also used in error messages by
// the at::check* functions.
const char* cudnn_relu_name = "cudnn_relu";

// Check arguments to cudnn_relu
void cudnn_relu_check(const at::Tensor& inputs, const at::Tensor& outputs) {
// Create TensorArgs. These record the name and position of each tensor
// argument.
at::TensorArg arg_inputs(inputs, "inputs", 0);
at::TensorArg arg_outputs(outputs, "outputs", 1);
// Check arguments. No need to return anything. These functions will throw an
// error if they fail. Messages are populated using information from
// TensorArgs.
at::checkContiguous(cudnn_relu_name, arg_inputs);
at::checkScalarType(cudnn_relu_name, arg_inputs, at::kFloat);
at::checkBackend(cudnn_relu_name, arg_inputs.tensor, at::kCUDA);
at::checkContiguous(cudnn_relu_name, arg_outputs);
at::checkScalarType(cudnn_relu_name, arg_outputs, at::kFloat);
at::checkBackend(cudnn_relu_name, arg_outputs.tensor, at::kCUDA);
at::checkSameSize(cudnn_relu_name, arg_inputs, arg_outputs);
}

void cudnn_relu(const at::Tensor& inputs, const at::Tensor& outputs) {
// Most CuDNN extensions will follow a similar pattern.
// Step 1: Check inputs. This will throw an error if inputs are invalid, so no
// need to check return codes here.
cudnn_relu_check(inputs, outputs);
// Step 2: Create descriptors
cudnnHandle_t cuDnn = at::native::getCudnnHandle();
// Note: 4 is the minimum dim for a TensorDescriptor. Input and output are the
// same size, type, and contiguous, so one descriptor is sufficient.
at::native::TensorDescriptor input_tensor_desc(inputs, 4);
cudnnActivationDescriptor_t activationDesc;
// Note: Always check the return value of cuDNN functions with CUDNN_CHECK
at::native::CUDNN_CHECK(cudnnCreateActivationDescriptor(&activationDesc));
at::native::CUDNN_CHECK(cudnnSetActivationDescriptor(
activationDesc,
/*mode=*/CUDNN_ACTIVATION_RELU,
/*reluNanOpt=*/CUDNN_PROPAGATE_NAN,
/*coef=*/1.));
// Step 3: Apply CuDNN function
float alpha = 1.;
float beta = 0.;
at::native::CUDNN_CHECK(cudnnActivationForward(
cuDnn,
activationDesc,
&alpha,
input_tensor_desc.desc(),
inputs.data_ptr(),
&beta,
input_tensor_desc.desc(), // output descriptor same as input
outputs.data_ptr()));
// Step 4: Destroy descriptors
at::native::CUDNN_CHECK(cudnnDestroyActivationDescriptor(activationDesc));
// Step 5: Return something (optional)
}

// Create the pybind11 module
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
// Use the same name as the check functions so error messages make sense
m.def(cudnn_relu_name, &cudnn_relu, "CuDNN ReLU");
}
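For orientation, a minimal sketch of how an extension like this can be built and called through the JIT machinery this PR extends. The module name and source path are illustrative, and -lcudnn is assumed because the extension calls cudnn* functions directly; the test below does essentially the same thing, including the Windows cudnn.lib variant:

import torch
import torch.utils.cpp_extension

# Build and load the extension. Linking cuDNN explicitly is assumed to be
# needed since the extension itself calls cudnn* functions.
ext = torch.utils.cpp_extension.load(
    name='cudnn_relu_demo',  # hypothetical module name
    sources=['cpp_extensions/cudnn_extension.cpp'],  # illustrative path
    extra_ldflags=['-lcudnn'],
    with_cuda=True,
    verbose=True)

x = torch.randn(16, device='cuda')
y = torch.zeros(16, device='cuda')
ext.cudnn_relu(x, y)  # writes relu(x) into y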
27 changes: 27 additions & 0 deletions test/test_cpp_extensions.py
100644 → 100755
@@ -1,7 +1,9 @@
import unittest
import sys

import torch
import torch.utils.cpp_extension
import torch.backends.cudnn
try:
import torch_test_cpp_extension.cpp as cpp_extension
except ImportError:
@@ -13,6 +15,7 @@

from torch.utils.cpp_extension import CUDA_HOME
TEST_CUDA = torch.cuda.is_available() and CUDA_HOME is not None
TEST_CUDNN = TEST_CUDA and torch.backends.cudnn.is_available()


class TestCppExtension(common.TestCase):
@@ -100,6 +103,30 @@ def test_jit_cuda_extension(self):
# 2 * sigmoid(0) = 2 * 0.5 = 1
self.assertEqual(z, torch.ones_like(z))

@unittest.skipIf(not TEST_CUDNN, "CuDNN not found")
def test_jit_cudnn_extension(self):
# Compile and load the CuDNN ReLU extension implemented above.
if sys.platform == 'win32':
extra_ldflags = ['cudnn.lib']
else:
extra_ldflags = ['-lcudnn']
module = torch.utils.cpp_extension.load(
name='torch_test_cudnn_extension',
sources=[
'cpp_extensions/cudnn_extension.cpp'
],
extra_ldflags=extra_ldflags,
verbose=True,
with_cuda=True)

x = torch.randn(100, device='cuda', dtype=torch.float32)
y = torch.zeros(100, device='cuda', dtype=torch.float32)
module.cudnn_relu(x, y) # y=relu(x)
self.assertEqual(torch.nn.functional.relu(x), y)
with self.assertRaisesRegex(RuntimeError, "same size"):
y_incorrect = torch.zeros(20, device='cuda', dtype=torch.float32)
module.cudnn_relu(x, y_incorrect)

def test_optional(self):
has_value = cpp_extension.function_taking_optional(torch.ones(5))
self.assertTrue(has_value)
32 changes: 23 additions & 9 deletions torch/utils/cpp_extension.py
@@ -416,7 +416,8 @@ def load(name,
extra_ldflags=None,
extra_include_paths=None,
build_directory=None,
-verbose=False):
+verbose=False,
+with_cuda=None):
'''
Loads a PyTorch C++ extension just-in-time (JIT).

@@ -464,6 +465,11 @@ def load(name,
to the build.
build_directory: optional path to use as build workspace.
verbose: If ``True``, turns on verbose logging of load steps.
with_cuda: Determines whether CUDA headers and libraries are added to
the build. If set to ``None`` (default), this value is
automatically determined based on whether ``sources`` contains any
``.cu`` or ``.cuh`` files. Set it to ``True`` to force CUDA headers
and libraries to be included.

Returns:
The loaded PyTorch extension as a Python module.
@@ -484,7 +490,8 @@ def load(name,
extra_ldflags,
extra_include_paths,
build_directory or _get_build_directory(name, verbose),
-verbose)
+verbose,
+with_cuda=with_cuda)
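A hedged usage sketch of the new flag on ``load``: forcing CUDA support when ``sources`` contains no ``.cu``/``.cuh`` files, so the auto-detection alone would pick ``False``. The file and module names here are illustrative:

import torch.utils.cpp_extension

# 'host_side.cpp' (hypothetical) calls CUDA APIs from plain C++, so
# extension-based auto-detection would wrongly conclude with_cuda=False.
module = torch.utils.cpp_extension.load(
    name='forced_cuda_demo',
    sources=['host_side.cpp'],
    with_cuda=True)  # force CUDA headers and libraries into the build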


def load_inline(name,
@@ -496,7 +503,8 @@ def load_inline(name,
extra_ldflags=None,
extra_include_paths=None,
build_directory=None,
-verbose=False):
+verbose=False,
+with_cuda=None):
'''
Loads a PyTorch C++ extension just-in-time (JIT) from string sources.

@@ -538,6 +546,11 @@ def load_inline(name,
functions: A list of function names for which to generate function
bindings. If a dictionary is given, it should map function names to
docstrings (which are otherwise just the function names).
with_cuda: Determines whether CUDA headers and libraries are added to
the build. If set to ``None`` (default), this value is
automatically determined based on whether ``cuda_sources`` is
provided. Set it to ``True`` to force CUDA headers and
libraries to be included.

Example:
>>> from torch.utils.cpp_extension import load_inline
@@ -552,8 +565,6 @@
'''
build_directory = build_directory or _get_build_directory(name, verbose)

-source_files = []
-
if isinstance(cpp_sources, str):
cpp_sources = [cpp_sources]
cuda_sources = cuda_sources or []
@@ -606,7 +617,8 @@ def load_inline(name,
extra_ldflags,
extra_include_paths,
build_directory,
-verbose)
+verbose,
+with_cuda=with_cuda)


def _jit_compile(name,
Expand All @@ -616,13 +628,15 @@ def _jit_compile(name,
extra_ldflags,
extra_include_paths,
build_directory,
-verbose):
+verbose,
+with_cuda=None):
baton = FileBaton(os.path.join(build_directory, 'lock'))
if baton.try_acquire():
try:
verify_ninja_availability()
check_compiler_abi_compatibility(os.environ.get('CXX', 'c++'))
-with_cuda = any(map(_is_cuda_file, sources))
+if with_cuda is None:
+    with_cuda = any(map(_is_cuda_file, sources))
extra_ldflags = _prepare_ldflags(
extra_ldflags or [],
with_cuda,
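The auto-detection above leans on ``_is_cuda_file``; a sketch of the presumed check, inferred from the ``.cu``/``.cuh`` rule in the docstrings rather than copied from the source:

import os

def _is_cuda_file(path):
    # Presumed behavior: treat .cu and .cuh files as CUDA sources.
    return os.path.splitext(path)[1] in ['.cu', '.cuh']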
@@ -837,7 +851,7 @@ def _write_ninja_file(path,
for source_file in sources:
# '/path/to/file.cpp' -> 'file'
file_name = os.path.splitext(os.path.basename(source_file))[0]
-if _is_cuda_file(source_file):
+if _is_cuda_file(source_file) and with_cuda:
rule = 'cuda_compile'
# Use a different object filename in case a C++ and CUDA file have
# the same filename but different extension (.cpp vs. .cu).