Skip to content

Commit e2f1eff

Browse files
committed
Update on "Additional error checking for torch.cuda.nccl APIs."
`torch.cuda.nccl` APIs didn't throw appropriate errors when called with inputs/outputs of the wrong type, which resulted in cryptic errors instead. This change adds error checks with explicit error messages for these APIs. Differential Revision: [D23206069](https://our.internmc.facebook.com/intern/diff/D23206069/) [ghstack-poisoned]
2 parents 43fd86b + cbdaa20 commit e2f1eff

File tree

142 files changed

+5230
-2172
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

142 files changed

+5230
-2172
lines changed

.circleci/config.yml

Lines changed: 52 additions & 53 deletions
Large diffs are not rendered by default.

.circleci/scripts/setup_ci_environment.sh

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -47,31 +47,44 @@ else
4747
sudo rm -rf /etc/apt/sources.list.d/nvidia-docker.list
4848
fi
4949

50+
#######################################
# Append a single line to the environment file.
# Globals:   BASH_ENV (read) - path of the env file; /tmp/env is used
#            when it is unset or empty.
# Arguments: $1 - the line to append, e.g. "NAME=value"
# Outputs:   appends "$1" followed by a newline to the env file
# Returns:   status of the write
#######################################
add_to_env_file() {
  local content
  content=$1
  # BASH_ENV should be set by CircleCI
  # printf rather than echo: a value such as "-n" or "-e" is written
  # literally instead of being parsed as an echo option.
  printf '%s\n' "${content}" >> "${BASH_ENV:-/tmp/env}"
}
56+
57+
add_to_env_file "IN_CIRCLECI=1"
58+
add_to_env_file "COMMIT_SOURCE=${CIRCLE_BRANCH:-}"
59+
add_to_env_file "BUILD_ENVIRONMENT=${BUILD_ENVIRONMENT}"
60+
add_to_env_file "CIRCLE_PULL_REQUEST=${CIRCLE_PULL_REQUEST}"
61+
62+
5063
if [[ "${BUILD_ENVIRONMENT}" == *-build ]]; then
51-
echo "declare -x IN_CIRCLECI=1" > /home/circleci/project/env
52-
echo "declare -x COMMIT_SOURCE=${CIRCLE_BRANCH:-}" >> /home/circleci/project/env
53-
echo "declare -x SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> /home/circleci/project/env
64+
add_to_env_file "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2"
65+
66+
SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
67+
MEMORY_LIMIT_MAX_JOBS=8 # the "large" resource class on CircleCI has 32 CPU cores, if we use all of them we'll OOM
68+
MAX_JOBS=$(( ${SCCACHE_MAX_JOBS} > ${MEMORY_LIMIT_MAX_JOBS} ? ${MEMORY_LIMIT_MAX_JOBS} : ${SCCACHE_MAX_JOBS} ))
69+
add_to_env_file "MAX_JOBS=${MAX_JOBS}"
70+
5471
if [ -n "${USE_CUDA_DOCKER_RUNTIME:-}" ]; then
55-
echo "declare -x TORCH_CUDA_ARCH_LIST=5.2" >> /home/circleci/project/env
72+
add_to_env_file "TORCH_CUDA_ARCH_LIST=5.2"
5673
fi
57-
export SCCACHE_MAX_JOBS=`expr $(nproc) - 1`
58-
export MEMORY_LIMIT_MAX_JOBS=8 # the "large" resource class on CircleCI has 32 CPU cores, if we use all of them we'll OOM
59-
export MAX_JOBS=$(( ${SCCACHE_MAX_JOBS} > ${MEMORY_LIMIT_MAX_JOBS} ? ${MEMORY_LIMIT_MAX_JOBS} : ${SCCACHE_MAX_JOBS} ))
60-
echo "declare -x MAX_JOBS=${MAX_JOBS}" >> /home/circleci/project/env
6174

6275
if [[ "${BUILD_ENVIRONMENT}" == *xla* ]]; then
6376
# This IAM user allows write access to S3 bucket for sccache & bazels3cache
6477
set +x
65-
echo "declare -x XLA_CLANG_CACHE_S3_BUCKET_NAME=${XLA_CLANG_CACHE_S3_BUCKET_NAME:-}" >> /home/circleci/project/env
66-
echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_AND_XLA_BAZEL_S3_BUCKET_V2:-}" >> /home/circleci/project/env
67-
echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_AND_XLA_BAZEL_S3_BUCKET_V2:-}" >> /home/circleci/project/env
78+
add_to_env_file "XLA_CLANG_CACHE_S3_BUCKET_NAME=${XLA_CLANG_CACHE_S3_BUCKET_NAME:-}"
79+
add_to_env_file "AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_AND_XLA_BAZEL_S3_BUCKET_V2:-}"
80+
add_to_env_file "AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_AND_XLA_BAZEL_S3_BUCKET_V2:-}"
6881
set -x
6982
else
7083
# This IAM user allows write access to S3 bucket for sccache
7184
set +x
72-
echo "declare -x XLA_CLANG_CACHE_S3_BUCKET_NAME=${XLA_CLANG_CACHE_S3_BUCKET_NAME:-}" >> /home/circleci/project/env
73-
echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}" >> /home/circleci/project/env
74-
echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}" >> /home/circleci/project/env
85+
add_to_env_file "XLA_CLANG_CACHE_S3_BUCKET_NAME=${XLA_CLANG_CACHE_S3_BUCKET_NAME:-}"
86+
add_to_env_file "AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}"
87+
add_to_env_file "AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}"
7588
set -x
7689
fi
7790
fi
@@ -80,5 +93,5 @@ fi
8093
set +x
8194
export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V4:-}
8295
export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V4:-}
83-
eval $(aws ecr get-login --region us-east-1 --no-include-email)
96+
eval "$(aws ecr get-login --region us-east-1 --no-include-email)"
8497
set -x

.circleci/verbatim-sources/commands.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ commands:
77
name: "Calculate docker image hash"
88
command: |
99
DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
10-
echo "export DOCKER_TAG=${DOCKER_TAG}" >> ${BASH_ENV}
10+
echo "DOCKER_TAG=${DOCKER_TAG}" >> "${BASH_ENV}"
1111
1212
designate_upload_channel:
1313
description: "inserts the correct upload channel into ${BASH_ENV}"

.circleci/verbatim-sources/job-specs/binary-job-specs.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@
5252
command: |
5353
source /env
5454
cd /pytorch && export COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0)
55-
pip3 install requests && \
55+
python3 -mpip install requests && \
5656
SCRIBE_GRAPHQL_ACCESS_TOKEN=${SCRIBE_GRAPHQL_ACCESS_TOKEN} \
5757
python3 /pytorch/.circleci/scripts/upload_binary_size_to_scuba.py || exit 0
5858

.circleci/verbatim-sources/job-specs/caffe2-job-specs.yml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,10 @@
4141
4242
echo "DOCKER_IMAGE: "${DOCKER_IMAGE}
4343
time docker pull ${DOCKER_IMAGE} >/dev/null
44-
export id=$(docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${DOCKER_IMAGE})
44+
export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${DOCKER_IMAGE})
4545
docker cp /home/circleci/project/. $id:/var/lib/jenkins/workspace
4646
47-
export COMMAND='((echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./ci_build_script.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
47+
export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && ./ci_build_script.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
4848
echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
4949
5050
# Push intermediate Docker image for next phase to use
@@ -113,13 +113,13 @@
113113
echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE}
114114
time docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null
115115
if [ -n "${USE_CUDA_DOCKER_RUNTIME}" ]; then
116-
export id=$(docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --gpus all -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})
116+
export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --gpus all -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})
117117
else
118-
export id=$(docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})
118+
export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})
119119
fi
120120
docker cp /home/circleci/project/. "$id:/var/lib/jenkins/workspace"
121121
122-
export COMMAND='((echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./ci_test_script.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
122+
export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && ./ci_test_script.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
123123
echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
124124
125125
caffe2_macos_build:

0 commit comments

Comments
 (0)