Commit 57e42f4

Committed by eellison
Merge branch 'master' of https://github.com/pytorch/pytorch into string_methods
2 parents: 2d984f2 + a21cf76

69 files changed (+2210 additions, −1553 deletions)


README.md

Lines changed: 3 additions & 9 deletions
@@ -156,8 +156,8 @@ You will get a high-quality BLAS library (MKL) and you get controlled dependency
 Once you have [Anaconda](https://www.anaconda.com/distribution/#download-section) installed, here are the instructions.
 
 If you want to compile with CUDA support, install
-- [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) 7.5 or above
-- [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v6.x or above
+- [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) 9 or above
+- [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v7 or above
 
 If you want to disable CUDA support, export environment variable `NO_CUDA=1`.
 Other potentially useful environment variables may be found in `setup.py`.

@@ -175,7 +175,7 @@ conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing
 On Linux
 ```bash
 # Add LAPACK support for the GPU if needed
-conda install -c pytorch magma-cuda90 # or [magma-cuda80 | magma-cuda92 | magma-cuda100 ] depending on your cuda version
+conda install -c pytorch magma-cuda90 # or [magma-cuda92 | magma-cuda100 ] depending on your cuda version
 ```
 
 #### Get the PyTorch Source

@@ -209,9 +209,6 @@ If the version of Visual Studio 2017 is higher than 15.4.5, installing of "VC++
 <br/> There is no guarantee of the correct building with VC++ 2017 toolsets, others than version 15.4 v14.11.
 <br/> "VC++ 2017 version 15.4 v14.11 toolset" might be installed onto already installed Visual Studio 2017 by running its installation once again and checking the corresponding checkbox under "Individual components"/"Compilers, build tools, and runtimes".
 
-For building against CUDA 8.0 Visual Studio 2015 Update 3 (version 14.0), and the [patch](https://download.microsoft.com/download/8/1/d/81dbe6bb-ed92-411a-bef5-3a75ff972c6a/vc14-kb4020481.exe) are needed to be installed too.
-The details of the patch can be found [here](https://support.microsoft.com/en-gb/help/4020481/fix-link-exe-crashes-with-a-fatal-lnk1000-error-when-you-use-wholearch).
-
 NVTX is a part of CUDA distributive, where it is called "Nsight Compute". For installing it onto already installed CUDA run CUDA installation once again and check the corresponding checkbox.
 Be sure that CUDA with Nsight Compute is installed after Visual Studio 2017.

@@ -221,9 +218,6 @@ REM [Optional] The following two lines are needed for Python 2.7, but the suppor
 set MSSdk=1
 set FORCE_PY27_BUILD=1
 
-REM [Optional] As for CUDA 8, VS2015 Update 3 is required; use the following line.
-set "CUDAHOSTCXX=%VS140COMNTOOLS%..\..\VC\bin\amd64\cl.exe"
-
 set CMAKE_GENERATOR=Visual Studio 15 2017 Win64
 set DISTUTILS_USE_SDK=1

aten/src/ATen/Declarations.cwrap

Lines changed: 20 additions & 0 deletions
@@ -92,11 +92,13 @@
   options:
     - arguments:
       - arg: THTensor* self
+        broadcast: mask inplace fallback types:Bool
      - THBoolTensor* mask
      - real value
     - zero_dim_tensor_only: True
      arguments:
      - arg: THTensor* self
+        broadcast: mask inplace fallback types:Bool
      - THBoolTensor* mask
      - THTensor* value
 ]]

@@ -118,12 +120,15 @@
   return: self
   arguments:
     - arg: THTensor* self
+      broadcast: mask inplace fallback types:Bool
    - THBoolTensor* mask
    - THTensor* source
 ]]
 [[
   name: _th_masked_select
   cname: maskedSelect
+  cpu_bool: True
+  cuda_bool: True
   variants:
     - function
   return: argument 0

@@ -137,13 +142,16 @@
 [[
   name: _th_masked_select_bool
   cname: maskedSelectBool
+  cpu_bool: True
+  cuda_bool: True
   variants:
     - function
   return: argument 0
   arguments:
     - arg: THTensor* result
       output: True
     - arg: THTensor* self
+      broadcast: mask fallback types:Bool
    - THBoolTensor* mask
 ]]
 [[

@@ -366,6 +374,8 @@
 ]]
 [[
   name: _th_and
+  cpu_bool: True
+  cuda_bool: True
   cname: __and__
   variants:
     - function

@@ -388,6 +398,8 @@
 [[
   name: _th_iand_
   cname: __iand__
+  cpu_bool: True
+  cuda_bool: True
   variants:
     - function
   return: argument 0

@@ -407,6 +419,8 @@
 [[
   name: _th_or
   cname: __or__
+  cpu_bool: True
+  cuda_bool: True
   variants:
     - function
   return: argument 0

@@ -428,6 +442,8 @@
 [[
   name: _th_ior_
   cname: __ior__
+  cpu_bool: True
+  cuda_bool: True
   variants:
     - function
   return: argument 0

@@ -447,6 +463,8 @@
 [[
   name: _th_xor
   cname: __xor__
+  cpu_bool: True
+  cuda_bool: True
   variants:
     - function
   return: argument 0

@@ -1772,6 +1790,8 @@
 [[
   name: _th_sign
   cname: sign
+  cpu_bool: True
+  cuda_bool: True
   variants:
     - function
   return: argument 0
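The recurring `cpu_bool: True` / `cuda_bool: True` pairs opt each TH binding into dispatch for Bool tensors, and the `types:Bool` broadcast annotations let a Bool mask broadcast against the input. A minimal sketch, not part of this commit, of what that enables at the ATen surface; the factory and comparison calls are assumptions about the public `at::` API of this era:

```cpp
#include <ATen/ATen.h>
#include <iostream>

int main() {
  at::Tensor vals = at::arange(6, at::kFloat);        // 0, 1, 2, 3, 4, 5
  // Comparison plus an explicit cast yields a Bool mask (illustrative).
  at::Tensor mask = vals.gt(2).to(at::kBool);         // F F F T T T
  // masked_select with a Bool mask reaches the newly flagged TH kernel.
  at::Tensor picked = at::masked_select(vals, mask);  // 3, 4, 5
  // Bool & Bool lowers to __and__, now declared with cpu_bool/cuda_bool.
  at::Tensor both = mask & vals.lt(5).to(at::kBool);
  std::cout << picked << "\n" << both << std::endl;
  return 0;
}
```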

aten/src/ATen/PTThreadPool.h

Lines changed: 0 additions & 20 deletions
This file was deleted.

aten/src/ATen/Parallel.cpp

Lines changed: 174 additions & 0 deletions
@@ -0,0 +1,174 @@
+#include <ATen/Parallel.h>
+
+#include <ATen/Config.h>
+#include <ATen/Version.h>
+
+#include <atomic>
+#include <sstream>
+#include <thread>
+
+#ifdef TH_BLAS_MKL
+#include <mkl.h>
+#endif
+
+namespace at {
+
+namespace {
+const int NOT_SET = -1;
+const int CONSUMED = -2;
+
+// Number of threads set by the user
+std::atomic<int> num_threads{NOT_SET};
+
+// Number of inter-op threads set by the user;
+// NOT_SET -> positive value -> CONSUMED
+// (CONSUMED - thread pool is initialized)
+// or
+// NOT_SET -> CONSUMED
+std::atomic<int> num_interop_threads{NOT_SET};
+
+// thread pool global instance is hidden,
+// users should use at::launch and get/set_num_interop_threads interface
+TaskThreadPoolBase& get_pool() {
+  static std::shared_ptr<TaskThreadPoolBase> pool =
+      ThreadPoolRegistry()->Create(
+          "C10",
+          /* device_id */ 0,
+          /* pool_size */ num_interop_threads.exchange(CONSUMED),
+          /* create_new */ true);
+  return *pool;
+}
+
+// Factory function for ThreadPoolRegistry
+std::shared_ptr<TaskThreadPoolBase> create_c10_threadpool(
+    int device_id,
+    int pool_size,
+    bool create_new) {
+  // For now, the only accepted device id is 0
+  AT_CHECK(device_id == 0);
+  // Create new thread pool
+  AT_CHECK(create_new);
+  return std::make_shared<PTThreadPool>(pool_size);
+}
+
+}
+
+void init_num_threads() {
+  auto nthreads = num_threads.load();
+  if (nthreads > 0) {
+    set_num_threads(nthreads);
+  } else {
+#if defined(_OPENMP) && defined(TH_BLAS_MKL)
+    // If we are using MKL and OpenMP, make sure the numbers of threads match.
+    // Otherwise, MKL and our OpenMP-enabled functions will keep changing the
+    // size of the OpenMP thread pool, resulting in worse performance (and memory
+    // leaks in GCC 5.4)
+    omp_set_num_threads(mkl_get_max_threads());
+#endif
+  }
+}
+
+void set_num_threads(int nthreads) {
+  AT_CHECK(nthreads > 0, "Expected positive number of threads");
+
+  num_threads.store(nthreads);
+#ifdef _OPENMP
+  omp_set_num_threads(nthreads);
+#endif
+#ifdef TH_BLAS_MKL
+  mkl_set_num_threads(nthreads);
+
+  // because PyTorch uses OpenMP outside of MKL invocations
+  // as well, we want this flag to be false, so that
+  // threads aren't destroyed and recreated across every
+  // MKL / non-MKL boundary of OpenMP usage
+  // See https://github.com/pytorch/pytorch/issues/13757
+  mkl_set_dynamic(false);
+#endif
+}
+
+// Explicitly calling omp_get_max_threads() as the size of the parallel
+// region might be different in the new thread;
+// Use init_num_threads() during thread initialization to ensure
+// consistent size of parallel region in different threads
+int get_num_threads() {
+#ifdef _OPENMP
+  return omp_get_max_threads();
+#else
+  return 1;
+#endif
+}
+
+namespace {
+const char* get_env_var(const char* var_name) {
+  const char* value = std::getenv(var_name);
+  return value ? value : "[not set]";
+}
+}
+
+std::string get_parallel_info() {
+  std::ostringstream ss;
+
+  ss << "ATen/Parallel:\n\tat::get_num_threads() : "
+     << at::get_num_threads() << std::endl;
+
+  ss << at::get_openmp_version() << std::endl;
+#ifdef _OPENMP
+  ss << "\tomp_get_max_threads() : " << omp_get_max_threads() << std::endl;
+#endif
+
+  ss << at::get_mkl_version() << std::endl;
+#ifdef TH_BLAS_MKL
+  ss << "\tmkl_get_max_threads() : " << mkl_get_max_threads() << std::endl;
+#endif
+
+  ss << at::get_mkldnn_version() << std::endl;
+
+  ss << "std::thread::hardware_concurrency() : "
+     << std::thread::hardware_concurrency() << std::endl;
+
+  ss << "Environment variables:" << std::endl;
+  ss << "\tOMP_NUM_THREADS : " << get_env_var("OMP_NUM_THREADS") << std::endl;
+  ss << "\tMKL_NUM_THREADS : " << get_env_var("MKL_NUM_THREADS") << std::endl;
+
+  return ss.str();
+}
+
+PTThreadPool::PTThreadPool(
+    int pool_size,
+    int numa_node_id)
+    : c10::ThreadPool(pool_size, numa_node_id) {}
+
+void PTThreadPool::init_thread() {
+  c10::setThreadName("PTThreadPool");
+  at::init_num_threads();
+}
+
+C10_REGISTER_CREATOR(ThreadPoolRegistry, C10, create_c10_threadpool);
+
+void set_num_interop_threads(int nthreads) {
+  AT_CHECK(nthreads > 0, "Expected positive number of threads");
+
+  int no_value = NOT_SET;
+  AT_CHECK(num_interop_threads.compare_exchange_strong(no_value, nthreads),
+    "Error: cannot set number of interop threads after parallel work "
+    "has started or set_num_interop_threads called");
+}
+
+int get_num_interop_threads() {
+  int nthreads = num_interop_threads.load();
+  if (nthreads > 0) {
+    return nthreads;
+  } else if (nthreads == NOT_SET) {
+    // return default value
+    return TaskThreadPoolBase::defaultNumThreads();
+  } else {
+    return get_pool().size();
+  }
+}
+
+void launch(const std::function<void()>& func) {
+  get_pool().run(func);
+}
+
+} // namespace at
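The new file gives callers a two-level knob: `set_num_threads` controls intra-op (OpenMP/MKL) parallelism, while `set_num_interop_threads` must win a compare-exchange against `NOT_SET` before the lazily built pool consumes the value, so it can only succeed once and only before the first `at::launch`. A minimal usage sketch under those assumptions, not part of this commit; the spin-wait is purely illustrative:

```cpp
#include <ATen/Parallel.h>

#include <atomic>
#include <iostream>

int main() {
  // Must run while the counter is still NOT_SET, i.e. before the first
  // at::launch; a second or late call trips the AT_CHECK above.
  at::set_num_interop_threads(4);

  std::atomic<int> done{0};
  for (int i = 0; i < 8; ++i) {
    at::launch([&done] { done.fetch_add(1); });  // runs on the shared C10 pool
  }
  while (done.load() < 8) { /* illustrative busy-wait only */ }

  std::cout << at::get_parallel_info();  // reports OpenMP/MKL thread settings
  std::cout << "interop threads: " << at::get_num_interop_threads() << std::endl;
  return 0;
}
```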
