Commit 57e42f4

Committed by eellison
Merge branch 'master' of https://github.com/pytorch/pytorch into string_methods
2 parents: 2d984f2 + a21cf76

69 files changed (+2210 additions, −1553 deletions)


README.md

Lines changed: 3 additions & 9 deletions
@@ -156,8 +156,8 @@ You will get a high-quality BLAS library (MKL) and you get controlled dependency
 Once you have [Anaconda](https://www.anaconda.com/distribution/#download-section) installed, here are the instructions.
 
 If you want to compile with CUDA support, install
-- [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) 7.5 or above
-- [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v6.x or above
+- [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) 9 or above
+- [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v7 or above
 
 If you want to disable CUDA support, export environment variable `NO_CUDA=1`.
 Other potentially useful environment variables may be found in `setup.py`.

@@ -175,7 +175,7 @@ conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing
 On Linux
 ```bash
 # Add LAPACK support for the GPU if needed
-conda install -c pytorch magma-cuda90 # or [magma-cuda80 | magma-cuda92 | magma-cuda100 ] depending on your cuda version
+conda install -c pytorch magma-cuda90 # or [magma-cuda92 | magma-cuda100 ] depending on your cuda version
 ```
 
 #### Get the PyTorch Source

@@ -209,9 +209,6 @@ If the version of Visual Studio 2017 is higher than 15.4.5, installing of "VC++
 <br/> There is no guarantee of the correct building with VC++ 2017 toolsets, others than version 15.4 v14.11.
 <br/> "VC++ 2017 version 15.4 v14.11 toolset" might be installed onto already installed Visual Studio 2017 by running its installation once again and checking the corresponding checkbox under "Individual components"/"Compilers, build tools, and runtimes".
 
-For building against CUDA 8.0 Visual Studio 2015 Update 3 (version 14.0), and the [patch](https://download.microsoft.com/download/8/1/d/81dbe6bb-ed92-411a-bef5-3a75ff972c6a/vc14-kb4020481.exe) are needed to be installed too.
-The details of the patch can be found [here](https://support.microsoft.com/en-gb/help/4020481/fix-link-exe-crashes-with-a-fatal-lnk1000-error-when-you-use-wholearch).
-
 NVTX is a part of CUDA distributive, where it is called "Nsight Compute". For installing it onto already installed CUDA run CUDA installation once again and check the corresponding checkbox.
 Be sure that CUDA with Nsight Compute is installed after Visual Studio 2017.

@@ -221,9 +218,6 @@ REM [Optional] The following two lines are needed for Python 2.7, but the suppor
 set MSSdk=1
 set FORCE_PY27_BUILD=1
 
-REM [Optional] As for CUDA 8, VS2015 Update 3 is required; use the following line.
-set "CUDAHOSTCXX=%VS140COMNTOOLS%..\..\VC\bin\amd64\cl.exe"
-
 set CMAKE_GENERATOR=Visual Studio 15 2017 Win64
 set DISTUTILS_USE_SDK=1

aten/src/ATen/Declarations.cwrap

Lines changed: 20 additions & 0 deletions
@@ -92,11 +92,13 @@
   options:
     - arguments:
       - arg: THTensor* self
+        broadcast: mask inplace fallback types:Bool
      - THBoolTensor* mask
      - real value
     - zero_dim_tensor_only: True
      arguments:
      - arg: THTensor* self
+        broadcast: mask inplace fallback types:Bool
      - THBoolTensor* mask
      - THTensor* value
 ]]

@@ -118,12 +120,15 @@
   return: self
   arguments:
     - arg: THTensor* self
+      broadcast: mask inplace fallback types:Bool
    - THBoolTensor* mask
    - THTensor* source
 ]]
 [[
   name: _th_masked_select
   cname: maskedSelect
+  cpu_bool: True
+  cuda_bool: True
   variants:
     - function
   return: argument 0

@@ -137,13 +142,16 @@
 [[
   name: _th_masked_select_bool
   cname: maskedSelectBool
+  cpu_bool: True
+  cuda_bool: True
   variants:
     - function
   return: argument 0
   arguments:
     - arg: THTensor* result
       output: True
     - arg: THTensor* self
+      broadcast: mask fallback types:Bool
    - THBoolTensor* mask
 ]]
 [[

@@ -366,6 +374,8 @@
 ]]
 [[
   name: _th_and
+  cpu_bool: True
+  cuda_bool: True
   cname: __and__
   variants:
     - function

@@ -388,6 +398,8 @@
 [[
   name: _th_iand_
   cname: __iand__
+  cpu_bool: True
+  cuda_bool: True
   variants:
     - function
   return: argument 0

@@ -407,6 +419,8 @@
 [[
   name: _th_or
   cname: __or__
+  cpu_bool: True
+  cuda_bool: True
   variants:
     - function
   return: argument 0

@@ -428,6 +442,8 @@
 [[
   name: _th_ior_
   cname: __ior__
+  cpu_bool: True
+  cuda_bool: True
   variants:
     - function
   return: argument 0

@@ -447,6 +463,8 @@
 [[
   name: _th_xor
   cname: __xor__
+  cpu_bool: True
+  cuda_bool: True
   variants:
     - function
   return: argument 0

@@ -1772,6 +1790,8 @@
 [[
   name: _th_sign
   cname: sign
+  cpu_bool: True
+  cuda_bool: True
   variants:
     - function
   return: argument 0
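The recurring `cpu_bool: True` / `cuda_bool: True` pairs opt each TH binding into dispatch for Bool tensors, and the `types:Bool` broadcast annotations let a Bool mask broadcast against the input. A minimal sketch, not part of this commit, of what that enables at the ATen surface; the factory and comparison calls are assumptions about the public `at::` API of this era:

```cpp
#include <ATen/ATen.h>
#include <iostream>

int main() {
  at::Tensor vals = at::arange(6, at::kFloat);        // 0, 1, 2, 3, 4, 5
  // Comparison plus an explicit cast yields a Bool mask (illustrative).
  at::Tensor mask = vals.gt(2).to(at::kBool);         // F F F T T T
  // masked_select with a Bool mask reaches the newly flagged TH kernel.
  at::Tensor picked = at::masked_select(vals, mask);  // 3, 4, 5
  // Bool & Bool lowers to __and__, now declared with cpu_bool/cuda_bool.
  at::Tensor both = mask & vals.lt(5).to(at::kBool);
  std::cout << picked << "\n" << both << std::endl;
  return 0;
}
```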

aten/src/ATen/PTThreadPool.h

Lines changed: 0 additions & 20 deletions
This file was deleted.

aten/src/ATen/Parallel.cpp

Lines changed: 174 additions & 0 deletions
@@ -0,0 +1,174 @@
+#include <ATen/Parallel.h>
+
+#include <ATen/Config.h>
+#include <ATen/Version.h>
+
+#include <atomic>
+#include <sstream>
+#include <thread>
+
+#ifdef TH_BLAS_MKL
+#include <mkl.h>
+#endif
+
+namespace at {
+
+namespace {
+const int NOT_SET = -1;
+const int CONSUMED = -2;
+
+// Number of threads set by the user
+std::atomic<int> num_threads{NOT_SET};
+
+// Number of inter-op threads set by the user;
+// NOT_SET -> positive value -> CONSUMED
+// (CONSUMED - thread pool is initialized)
+// or
+// NOT_SET -> CONSUMED
+std::atomic<int> num_interop_threads{NOT_SET};
+
+// thread pool global instance is hidden,
+// users should use at::launch and get/set_num_interop_threads interface
+TaskThreadPoolBase& get_pool() {
+  static std::shared_ptr<TaskThreadPoolBase> pool =
+      ThreadPoolRegistry()->Create(
+          "C10",
+          /* device_id */ 0,
+          /* pool_size */ num_interop_threads.exchange(CONSUMED),
+          /* create_new */ true);
+  return *pool;
+}
+
+// Factory function for ThreadPoolRegistry
+std::shared_ptr<TaskThreadPoolBase> create_c10_threadpool(
+    int device_id,
+    int pool_size,
+    bool create_new) {
+  // For now, the only accepted device id is 0
+  AT_CHECK(device_id == 0);
+  // Create new thread pool
+  AT_CHECK(create_new);
+  return std::make_shared<PTThreadPool>(pool_size);
+}
+
+}
+
+void init_num_threads() {
+  auto nthreads = num_threads.load();
+  if (nthreads > 0) {
+    set_num_threads(nthreads);
+  } else {
+#if defined(_OPENMP) && defined(TH_BLAS_MKL)
+    // If we are using MKL and OpenMP, make sure the numbers of threads match.
+    // Otherwise, MKL and our OpenMP-enabled functions will keep changing the
+    // size of the OpenMP thread pool, resulting in worse performance (and memory
+    // leaks in GCC 5.4)
+    omp_set_num_threads(mkl_get_max_threads());
+#endif
+  }
+}
+
+void set_num_threads(int nthreads) {
+  AT_CHECK(nthreads > 0, "Expected positive number of threads");
+
+  num_threads.store(nthreads);
+#ifdef _OPENMP
+  omp_set_num_threads(nthreads);
+#endif
+#ifdef TH_BLAS_MKL
+  mkl_set_num_threads(nthreads);
+
+  // because PyTorch uses OpenMP outside of MKL invocations
+  // as well, we want this flag to be false, so that
+  // threads aren't destroyed and recreated across every
+  // MKL / non-MKL boundary of OpenMP usage
+  // See https://github.com/pytorch/pytorch/issues/13757
+  mkl_set_dynamic(false);
+#endif
+}
+
+// Explicitly calling omp_get_max_threads() as the size of the parallel
+// region might be different in the new thread;
+// Use init_num_threads() during thread initialization to ensure
+// consistent size of parallel region in different threads
+int get_num_threads() {
+#ifdef _OPENMP
+  return omp_get_max_threads();
+#else
+  return 1;
+#endif
+}
+
+namespace {
+const char* get_env_var(const char* var_name) {
+  const char* value = std::getenv(var_name);
+  return value ? value : "[not set]";
+}
+}
+
+std::string get_parallel_info() {
+  std::ostringstream ss;
+
+  ss << "ATen/Parallel:\n\tat::get_num_threads() : "
+     << at::get_num_threads() << std::endl;
+
+  ss << at::get_openmp_version() << std::endl;
+#ifdef _OPENMP
+  ss << "\tomp_get_max_threads() : " << omp_get_max_threads() << std::endl;
+#endif
+
+  ss << at::get_mkl_version() << std::endl;
+#ifdef TH_BLAS_MKL
+  ss << "\tmkl_get_max_threads() : " << mkl_get_max_threads() << std::endl;
+#endif
+
+  ss << at::get_mkldnn_version() << std::endl;
+
+  ss << "std::thread::hardware_concurrency() : "
+     << std::thread::hardware_concurrency() << std::endl;
+
+  ss << "Environment variables:" << std::endl;
+  ss << "\tOMP_NUM_THREADS : " << get_env_var("OMP_NUM_THREADS") << std::endl;
+  ss << "\tMKL_NUM_THREADS : " << get_env_var("MKL_NUM_THREADS") << std::endl;
+
+  return ss.str();
+}
+
+PTThreadPool::PTThreadPool(
+    int pool_size,
+    int numa_node_id)
+    : c10::ThreadPool(pool_size, numa_node_id) {}
+
+void PTThreadPool::init_thread() {
+  c10::setThreadName("PTThreadPool");
+  at::init_num_threads();
+}
+
+C10_REGISTER_CREATOR(ThreadPoolRegistry, C10, create_c10_threadpool);
+
+void set_num_interop_threads(int nthreads) {
+  AT_CHECK(nthreads > 0, "Expected positive number of threads");
+
+  int no_value = NOT_SET;
+  AT_CHECK(num_interop_threads.compare_exchange_strong(no_value, nthreads),
+    "Error: cannot set number of interop threads after parallel work "
+    "has started or set_num_interop_threads called");
+}
+
+int get_num_interop_threads() {
+  int nthreads = num_interop_threads.load();
+  if (nthreads > 0) {
+    return nthreads;
+  } else if (nthreads == NOT_SET) {
+    // return default value
+    return TaskThreadPoolBase::defaultNumThreads();
+  } else {
+    return get_pool().size();
+  }
+}
+
+void launch(const std::function<void()>& func) {
+  get_pool().run(func);
+}
+
+} // namespace at
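The new file gives callers a two-level knob: `set_num_threads` controls intra-op (OpenMP/MKL) parallelism, while `set_num_interop_threads` must win a compare-exchange against `NOT_SET` before the lazily built pool consumes the value, so it can only succeed once and only before the first `at::launch`. A minimal usage sketch under those assumptions, not part of this commit; the spin-wait is purely illustrative:

```cpp
#include <ATen/Parallel.h>

#include <atomic>
#include <iostream>

int main() {
  // Must run while the counter is still NOT_SET, i.e. before the first
  // at::launch; a second or late call trips the AT_CHECK above.
  at::set_num_interop_threads(4);

  std::atomic<int> done{0};
  for (int i = 0; i < 8; ++i) {
    at::launch([&done] { done.fetch_add(1); });  // runs on the shared C10 pool
  }
  while (done.load() < 8) { /* illustrative busy-wait only */ }

  std::cout << at::get_parallel_info();  // reports OpenMP/MKL thread settings
  std::cout << "interop threads: " << at::get_num_interop_threads() << std::endl;
  return 0;
}
```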
