
Commit 650c10f

Author: Peter Yeh
Merge remote-tracking branch 'upstream/master' into caffe2_ops_miopen
* upstream/master: (47 commits)
  use THCThrustAllocator in BCECriterion (pytorch#8188)
  Add more annotations for arguments in ATen schema (pytorch#8192)
  fix lint
  Fix scalar check for sparse tensors. (pytorch#8197)
  [Caffe2] Merging setup.py with setup_caffe2.py (pytorch#8129)
  [auto] Update onnx to 4e65fd8 - fuse consecutive squeezes (onnx/onnx#1078) onnx/onnx@4e65fd8
  Add a loop unrolling pass to PyTorch JIT (pytorch#7672)
  Fix protobuf options (pytorch#8184)
  Yangqing as an ONNX codeowner (pytorch#8185)
  [ONNX] Fix type_as symbolic (pytorch#8183)
  fix caffe2 docker build (pytorch#7411)
  Add retry logic to sccache download for Windows build (pytorch#7697)
  Better conv error message basing on weight shape (pytorch#8051)
  Fix a corner case for ReShapeOp (pytorch#8178)
  Get rid of SOVERSION (again). (pytorch#8132)
  Add back onnx console scripts dropped during migration from onnx-caffe2 (pytorch#8143)
  use the correct datatype format (pytorch#8144)
  [JIT] Support a single TensorList argument anywhere in the argument list + index_put (pytorch#8173)
  Export getCudnnHandle (pytorch#7726)
  [Caffe2] Update elementwise ops to support numpy style boradcast (pytorch#8070)
  ...
2 parents 8e5fe41 + e6044e5

File tree: 235 files changed, +8588 -4202 lines


.jenkins/caffe2/build.sh

Lines changed: 86 additions & 61 deletions
@@ -2,9 +2,15 @@
 
 set -ex
 
+# The INSTALL_PREFIX here must match up with test.sh
+INSTALL_PREFIX="/usr/local/caffe2"
 LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 ROOT_DIR=$(cd "$LOCAL_DIR"/../.. && pwd)
+CMAKE_ARGS=()
+
 
+# Setup SCCACHE
+###############################################################################
 # Setup sccache if SCCACHE_BUCKET is set
 if [ -n "${SCCACHE_BUCKET}" ]; then
   mkdir -p ./sccache
@@ -61,24 +67,29 @@ report_compile_cache_stats() {
   fi
 }
 
-CMAKE_ARGS=("-DBUILD_BINARY=ON")
-CMAKE_ARGS+=("-DUSE_OBSERVERS=ON")
-CMAKE_ARGS+=("-DUSE_ZSTD=ON")
 
-if [[ $BUILD_ENVIRONMENT == *-aten-* ]]; then
-  if [[ CMAKE_ARGS != *USE_ATEN* ]] && [[ CMAKE_ARGS != *BUILD_ATEN* ]]; then
-    CMAKE_ARGS+=("-DBUILD_ATEN=ON")
-  fi
+###############################################################################
+# Explicitly set Python executable.
+###############################################################################
+# On Ubuntu 16.04 the default Python is still 2.7.
+PYTHON="$(which python)"
+if [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then
+  PYTHON=$(which "python${BASH_REMATCH[1]}")
+  CMAKE_ARGS+=("-DPYTHON_EXECUTABLE=${PYTHON}")
 fi
 
-# Run build script from scripts if applicable
+
+###############################################################################
+# Use special scripts for Android, conda, and setup builds
+###############################################################################
 if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
   export ANDROID_NDK=/opt/ndk
+  CMAKE_ARGS+=("-DBUILD_BINARY=ON")
+  CMAKE_ARGS+=("-DUSE_OBSERVERS=ON")
+  CMAKE_ARGS+=("-DUSE_ZSTD=ON")
   "${ROOT_DIR}/scripts/build_android.sh" ${CMAKE_ARGS[*]} "$@"
   exit 0
-fi
-if [[ "${BUILD_ENVIRONMENT}" == conda* ]]; then
-
+elif [[ "${BUILD_ENVIRONMENT}" == conda* ]]; then
   # click (required by onnx) wants these set
   # TODO don't think this fixes the problem for conda3 yet
   export LANG=C.UTF-8
@@ -96,51 +107,50 @@ if [[ "${BUILD_ENVIRONMENT}" == conda* ]]; then
   PROTOBUF_INCDIR=/opt/conda/include pip install -b /tmp/pip_install_onnx "file://${ROOT_DIR}/third_party/onnx#egg=onnx"
   report_compile_cache_stats
   exit 0
+elif [[ $BUILD_ENVIRONMENT == *setup* ]]; then
+  rm -rf $INSTALL_PREFIX && mkdir $INSTALL_PREFIX
+  PYTHONPATH=$INSTALL_PREFIX $PYTHON setup_caffe2.py develop --install-dir $INSTALL_PREFIX
+  exit 0
 fi
 
-# Run cmake from ./build_caffe2 directory so it doesn't conflict with
-# standard PyTorch build directory. Eventually these won't need to
-# be separate.
-rm -rf build_caffe2
-mkdir build_caffe2
-cd ./build_caffe2
 
-INSTALL_PREFIX="/usr/local/caffe2"
+###############################################################################
+# Set cmake args
+###############################################################################
+CMAKE_ARGS+=("-DBUILD_BINARY=ON")
+CMAKE_ARGS+=("-DUSE_OBSERVERS=ON")
+CMAKE_ARGS+=("-DUSE_ZSTD=ON")
 CMAKE_ARGS+=("-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}")
 
-# Explicitly set Python executable.
-# On Ubuntu 16.04 the default Python is still 2.7.
-PYTHON="$(which python)"
-if [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then
-  PYTHON=$(which "python${BASH_REMATCH[1]}")
-  CMAKE_ARGS+=("-DPYTHON_EXECUTABLE=${PYTHON}")
+if [[ $BUILD_ENVIRONMENT == *-aten-* ]]; then
+  if [[ CMAKE_ARGS != *USE_ATEN* ]] && [[ CMAKE_ARGS != *BUILD_ATEN* ]]; then
+    CMAKE_ARGS+=("-DBUILD_ATEN=ON")
+  fi
 fi
+if [[ $BUILD_ENVIRONMENT == *mkl* ]]; then
+  CMAKE_ARGS+=("-DBLAS=MKL")
+fi
+if [[ $BUILD_ENVIRONMENT == *cuda* ]]; then
+  CMAKE_ARGS+=("-DUSE_CUDA=ON")
+  CMAKE_ARGS+=("-DCUDA_ARCH_NAME=Maxwell")
+  CMAKE_ARGS+=("-DUSE_NNPACK=OFF")
+
+  # Explicitly set path to NVCC such that the symlink to ccache or sccache is used
+  CMAKE_ARGS+=("-DCUDA_NVCC_EXECUTABLE=${CACHE_WRAPPER_DIR}/nvcc")
 
-case "${BUILD_ENVIRONMENT}" in
-  *-mkl*)
-    CMAKE_ARGS+=("-DBLAS=MKL")
-    ;;
-  *-cuda*)
-    CMAKE_ARGS+=("-DUSE_CUDA=ON")
-    CMAKE_ARGS+=("-DCUDA_ARCH_NAME=Maxwell")
-    CMAKE_ARGS+=("-DUSE_NNPACK=OFF")
-
-    # Explicitly set path to NVCC such that the symlink to ccache or sccache is used
-    CMAKE_ARGS+=("-DCUDA_NVCC_EXECUTABLE=${CACHE_WRAPPER_DIR}/nvcc")
-
-    # Ensure FindCUDA.cmake can infer the right path to the CUDA toolkit.
-    # Setting PATH to resolve to the right nvcc alone isn't enough.
-    # See /usr/share/cmake-3.5/Modules/FindCUDA.cmake, block at line 589.
-    export CUDA_PATH="/usr/local/cuda"
-
-    # Ensure the ccache symlink can still find the real nvcc binary.
-    export PATH="/usr/local/cuda/bin:$PATH"
-    ;;
-  *-rocm*)
-    export LANG=C.UTF-8
-    export LC_ALL=C.UTF-8
-    export HCC_AMDGPU_TARGET=gfx900
-esac
+  # Ensure FindCUDA.cmake can infer the right path to the CUDA toolkit.
+  # Setting PATH to resolve to the right nvcc alone isn't enough.
+  # See /usr/share/cmake-3.5/Modules/FindCUDA.cmake, block at line 589.
+  export CUDA_PATH="/usr/local/cuda"
+
+  # Ensure the ccache symlink can still find the real nvcc binary.
+  export PATH="/usr/local/cuda/bin:$PATH"
+fi
+if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then
+  export LANG=C.UTF-8
+  export LC_ALL=C.UTF-8
+  export HCC_AMDGPU_TARGET=gfx900
+fi
 
 # Try to include Redis support for Linux builds
 if [ "$(uname)" == "Linux" ]; then
@@ -154,14 +164,6 @@ if [ "$(uname)" == "Darwin" ]; then
   CMAKE_ARGS+=("-DBUILD_CUSTOM_PROTOBUF=ON")
 fi
 
-# We test the presence of cmake3 (for platforms like Centos and Ubuntu 14.04)
-# and use that if so.
-if [[ -x "$(command -v cmake3)" ]]; then
-  CMAKE_BINARY=cmake3
-else
-  CMAKE_BINARY=cmake
-fi
-
 # Use a speciallized onnx namespace in CI to catch hardcoded onnx namespace
 CMAKE_ARGS+=("-DONNX_NAMESPACE=ONNX_NAMESPACE_FOR_C2_CI")
 
@@ -173,17 +175,35 @@ if [[ -n "$INTEGRATED" ]]; then
   CMAKE_ARGS+=("-DCAFFE2_LINK_LOCAL_PROTOBUF=OFF")
 fi
 
-# Configure
-${CMAKE_BINARY} "${ROOT_DIR}" ${CMAKE_ARGS[*]} "$@"
-
-# Build
+# We test the presence of cmake3 (for platforms like Centos and Ubuntu 14.04)
+# and use that if so.
+if [[ -x "$(command -v cmake3)" ]]; then
+  CMAKE_BINARY=cmake3
+else
+  CMAKE_BINARY=cmake
+fi
 # sccache will fail for CUDA builds if all cores are used for compiling
 if [[ "${BUILD_ENVIRONMENT}" == *-cuda* ]] && [ -n "${SCCACHE}" ]; then
   MAX_JOBS=`expr $(nproc) - 1`
 else
   MAX_JOBS=$(nproc)
 fi
 
+
+###############################################################################
+# Configure and make
+###############################################################################
+# Run cmake from ./build_caffe2 directory so it doesn't conflict with
+# standard PyTorch build directory. Eventually these won't need to
+# be separate.
+rm -rf build_caffe2
+mkdir build_caffe2
+cd ./build_caffe2
+
+# Configure
+${CMAKE_BINARY} "${ROOT_DIR}" ${CMAKE_ARGS[*]} "$@"
+
+# Build
 if [ "$(uname)" == "Linux" ]; then
   make "-j${MAX_JOBS}" install
 else
@@ -193,6 +213,11 @@ fi
 
 report_compile_cache_stats
 
+
+###############################################################################
+# Install ONNX
+###############################################################################
+
 # Install ONNX into a local directory
 pip install --user -b /tmp/pip_install_onnx "file://${ROOT_DIR}/third_party/onnx#egg=onnx"
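The Python-detection hunk above relies on bash's =~ operator, which fills the BASH_REMATCH array with the regex capture groups. A minimal standalone sketch of that mechanism; the BUILD_ENVIRONMENT values here (py2-gcc5 and friends) are hypothetical, chosen only to exercise the regex:

  for BUILD_ENVIRONMENT in py2-gcc5 py3.6-clang7 py3-cuda9; do
    if [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then
      # BASH_REMATCH[1] holds the first capture group, e.g. "2" or "3.6"
      echo "${BUILD_ENVIRONMENT} -> python${BASH_REMATCH[1]}"
    fi
  done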

.jenkins/pytorch/test.sh

Lines changed: 2 additions & 2 deletions
@@ -36,10 +36,10 @@ fi
 
 export ATEN_DISABLE_AVX=
 export ATEN_DISABLE_AVX2=
-if [[ "${JOB_BASE_NAME}" == *NO_AVX* ]]; then
+if [[ "${JOB_BASE_NAME}" == *-NO_AVX-* ]]; then
   export ATEN_DISABLE_AVX=1
 fi
-if [[ "${JOB_BASE_NAME}" == *NO_AVX2* ]]; then
+if [[ "${JOB_BASE_NAME}" == *-NO_AVX2-* ]]; then
   export ATEN_DISABLE_AVX2=1
 fi
 
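The added hyphens in the new globs are the whole fix: the old pattern *NO_AVX* also matches any name containing NO_AVX2, so AVX was wrongly disabled on AVX2-only jobs too. A small sketch with an assumed job name (illustrative, not taken from CI):

  JOB_BASE_NAME="pytorch-linux-xenial-py3-NO_AVX2-test"  # hypothetical job name
  [[ "${JOB_BASE_NAME}" == *NO_AVX* ]] && echo "old glob matches: would also set ATEN_DISABLE_AVX=1"
  [[ "${JOB_BASE_NAME}" == *-NO_AVX-* ]] || echo "new glob does not match: only AVX2 is disabled"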

.jenkins/pytorch/win-build.sh

Lines changed: 10 additions & 2 deletions
@@ -44,7 +44,15 @@ set MAGMA_HOME=%cd%\\magma
 
 :: Install sccache
 mkdir %CD%\\tmp_bin
-if "%REBUILD%"=="" ( aws s3 cp s3://ossci-windows/sccache.exe %CD%\\tmp_bin\\sccache.exe --quiet )
+if "%REBUILD%"=="" (
+  :check_sccache
+  %CD%\\tmp_bin\\sccache.exe --show-stats || (
+    taskkill /im sccache.exe /f /t || set ERRORLEVEL=0
+    del %CD%\\tmp_bin\\sccache.exe
+    aws s3 cp s3://ossci-windows/sccache.exe %CD%\\tmp_bin\\sccache.exe
+    goto :check_sccache
+  )
+)
 
 :: Install Miniconda3
 if "%REBUILD%"=="" (
@@ -73,7 +81,7 @@ set CUDNN_ROOT_DIR=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v9.0
 :: Target only our CI GPU machine's CUDA arch to speed up the build
 set TORCH_CUDA_ARCH_LIST=5.2
 
-sccache --stop-server || set ERRORLEVEL=0
+sccache --stop-server
 sccache --start-server
 sccache --zero-stats
 set CC=sccache cl
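The new batch block keeps re-downloading sccache until the binary answers --show-stats, replacing the single --quiet copy. A rough bash analog of the same control flow, for readers less used to batch labels and goto (local paths are illustrative; the S3 URL is the one used above):

  until ./tmp_bin/sccache --show-stats; do
    pkill -f sccache || true          # counterpart of: taskkill /im sccache.exe /f /t
    rm -f ./tmp_bin/sccache
    aws s3 cp s3://ossci-windows/sccache.exe ./tmp_bin/sccache
  done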

CMakeLists.txt

Lines changed: 9 additions & 0 deletions
@@ -30,6 +30,15 @@ if(NOT DEFINED BLAS_SET_BY_USER)
   set(BLAS_SET_BY_USER ${BLAS_SET_BY_USER} CACHE STRING "Marks whether BLAS was manually set by user or auto-detected")
 endif()
 
+# These lines are an attempt to make find_package(cuda) pick up
+# libcuda.dylib, and not cuda.framework. It doesn't work all
+# the time, but it seems to help for some users.
+# TODO: replace this with a more robust fix
+if(APPLE)
+  set(CMAKE_FIND_FRAMEWORK LAST)
+  set(CMAKE_FIND_APPBUNDLE LAST)
+endif()
+
 # ---[ Options.
 # Note to developers: if you add an option below, make sure you also add it to
 # cmake/Summary.cmake so that the summary prints out the option values.
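For local debugging, the same search-order behavior can be forced at configure time rather than in CMakeLists.txt; a hypothetical macOS invocation (the build directory is an assumption):

  cmake -DCMAKE_FIND_FRAMEWORK=LAST -DCMAKE_FIND_APPBUNDLE=LAST ..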

CODEOWNERS

Lines changed: 5 additions & 5 deletions
@@ -11,9 +11,9 @@
 /requirements.txt @apaszke @soumith @colesbury @gchanan @zdevito @ezyang
 /torch/csrc/api/ @apaszke @soumith @colesbury @gchanan @zdevito @ezyang @ebetica @goldsborough
 /test/cpp/api/ @apaszke @soumith @colesbury @gchanan @zdevito @ezyang @ebetica @goldsborough
-/torch/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer
-/torch/csrc/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer
-/torch/csrc/jit/passes/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer
-/test/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer
-/scripts/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer
+/torch/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer @Yangqing
+/torch/csrc/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer @Yangqing
+/torch/csrc/jit/passes/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer @Yangqing
+/test/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer @Yangqing
+/scripts/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer @Yangqing
 /torch/lib/c10d/ @apaszke @pietern @teng-li

aten/src/ATen/Backtrace.h

Lines changed: 2 additions & 0 deletions
@@ -1,3 +1,5 @@
+#pragma once
+
 #include <cstddef>
 #include <string>
 #include <typeinfo>

aten/src/ATen/CPUApplyUtils.h

Lines changed: 27 additions & 28 deletions
@@ -354,28 +354,26 @@ inline void CPU_tensor_parallel_apply1(
     int64_t grain_size = internal::TBB_GRAIN_SIZE) {
   if (!_apply_preamble({tensor1}))
     return;
-  if (tensor1.numel() < grain_size) {
-    CPU_tensor_apply1<scalar1>(tensor1, op);
-    return;
-  }
-  auto range = tbb::blocked_range<size_t>(0, tensor1.numel());
   if (tensor1.ndimension() < 8) {
-    tbb::parallel_for(
-        range, [&tensor1, &op](const tbb::blocked_range<size_t> r) {
+    parallel_for(
+        0,
+        tensor1.numel(),
+        grain_size,
+        [&tensor1, &op](int64_t begin, int64_t end) {
          apply_op(
-              r.end() - r.begin(),
-              r.begin(),
+              end - begin,
+              begin,
              op,
              strided_tensor_iter_fixed<scalar1, 8>(tensor1, true));
        });
   } else {
-    tbb::parallel_for(
-        range, [&tensor1, &op](const tbb::blocked_range<size_t> r) {
+    parallel_for(
+        0,
+        tensor1.numel(),
+        grain_size,
+        [&tensor1, &op](int64_t begin, int64_t end) {
          apply_op(
-              r.end() - r.begin(),
-              r.begin(),
-              op,
-              strided_tensor_iter<scalar1>(tensor1));
+              end - begin, begin, op, strided_tensor_iter<scalar1>(tensor1));
        });
   }
 }
@@ -388,27 +386,28 @@ inline void CPU_tensor_parallel_apply2(
     int64_t grain_size = internal::TBB_GRAIN_SIZE) {
   if (!_apply_preamble({tensor1, tensor2}))
    return;
-  if ((tensor1.numel() + tensor2.numel()) < grain_size) {
-    CPU_tensor_apply2<scalar1, scalar2>(tensor1, tensor2, op);
-    return;
-  }
-  auto range = tbb::blocked_range<size_t>(0, tensor1.numel());
   if (tensor1.ndimension() < 8 && tensor2.ndimension() < 8) {
-    tbb::parallel_for(
-        range, [&tensor1, &tensor2, &op](const tbb::blocked_range<size_t> r) {
+    parallel_for(
+        0,
+        tensor1.numel(),
+        grain_size,
+        [&tensor1, &tensor2, &op](int64_t begin, int64_t end) {
          apply_op(
-              r.end() - r.begin(),
-              r.begin(),
+              end - begin,
+              begin,
              op,
              strided_tensor_iter_fixed<scalar1, 8>(tensor1),
              strided_tensor_iter_fixed<scalar2, 8>(tensor2));
        });
   } else {
-    tbb::parallel_for(
-        range, [&tensor1, &tensor2, &op](const tbb::blocked_range<size_t> r) {
+    parallel_for(
+        0,
+        tensor1.numel(),
+        grain_size,
+        [&tensor1, &tensor2, &op](int64_t begin, int64_t end) {
          apply_op(
-              r.end() - r.begin(),
-              r.begin(),
+              end - begin,
+              begin,
              op,
              strided_tensor_iter<scalar1>(tensor1),
              strided_tensor_iter<scalar2>(tensor2));

aten/src/ATen/Context.h

Lines changed: 3 additions & 0 deletions
@@ -86,6 +86,9 @@ class AT_API Context {
   cudaStream_t getCurrentCUDAStream() const {
     return detail::getCUDAHooks().getCurrentCUDAStream(thc_state.get());
   }
+  cudaStream_t getCurrentCUDAStreamOnDevice(int64_t device) const {
+    return detail::getCUDAHooks().getCurrentCUDAStreamOnDevice(thc_state.get(), device);
+  }
   cudaDeviceProp* getCurrentDeviceProperties() const {
     return detail::getCUDAHooks().getCurrentDeviceProperties(thc_state.get());
   }
