Skip to content

Commit ddd6f12

Browse files
Update on "Added add/mul for nested dense [B, *, D], [B, 1, D] case (CUDA-only)"
[ghstack-poisoned]
2 parents a45096e + 842ba3d commit ddd6f12

File tree

150 files changed

+2600
-1197
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

150 files changed

+2600
-1197
lines changed

.github/actions/upload-test-artifacts/action.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,3 +139,4 @@ runs:
139139
retention-days: 14
140140
if-no-files-found: ignore
141141
path: usage_log.txt
142+
continue-on-error: true

.github/auto_request_review.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ reviewers:
77
- wconstab
88
- anjali411
99
- albanD
10-
- Krovatkin
1110
- miladm
1211
- bdhirsh
1312

.github/ci_commit_pins/triton.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
5ca1ed01016530056c4507661c24d6c21efc983d
1+
f16138d447bccc54641a9c48ffedbd449a1a40a7

.github/ci_commit_pins/vision.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
cba1c011a87dd14af10f97bcb113fa09a8e2b396
1+
d95fbaf1efd5346a4afcf5b9953df75696432265

.github/requirements/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,6 @@ The list of support files are as follows:
1717
test jobs to setup the conda environment
1818
* conda-env-macOS-X64. This is used by MacOS (x86-64) build and test
1919
jobs to setup the conda environment
20+
* Pip:
21+
* pip-requirements-macOS.txt. This is used by MacOS build and test jobs to
22+
setup the pip environment
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
boto3==1.19.12
2+
hypothesis==6.56.4
3+
expecttest==0.1.3
4+
librosa>=0.6.2
5+
mpmath==1.2.1
6+
networkx==2.8.7
7+
# Use numba-0.49.1 or older on Intel Macs, but 0.56.0 on M1 machines, as older numba is not available
8+
numba==0.56.0; platform_machine == "arm64"
9+
numba<=0.49.1; platform_machine != "arm64"
10+
opt-einsum>=3.3
11+
psutil==5.9.1
12+
pynvml==11.4.1
13+
pygments==2.12.0
14+
pytest==7.2.0
15+
pytest-xdist==3.0.2
16+
pytest-rerunfailures==10.2
17+
pytest-shard==0.1.2
18+
scipy==1.9.0
19+
sympy==1.11.1
20+
unittest-xml-reporting<=3.2.0,>=2.0.0
21+
xdoctest==1.0.2

.github/scripts/install_nvidia_utils_linux.sh

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,18 +26,29 @@ install_nvidia_driver_amzn2() {
2626
# Purge any nvidia driver installed from RHEL repo
2727
sudo yum remove -y nvidia-driver-latest-dkms
2828

29+
# Try to gather more information about the runner and its existing NVIDIA driver if any
30+
echo "Before installing NVIDIA driver"
31+
lspci
32+
lsmod
33+
modinfo nvidia || true
34+
2935
HAS_NVIDIA_DRIVER=0
3036
# Check if NVIDIA driver has already been installed
3137
if [ -x "$(command -v nvidia-smi)" ]; then
38+
set +e
3239
# The driver exists, check its version next
3340
INSTALLED_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader)
41+
NVIDIA_SMI_STATUS=$?
3442

35-
if [ "$INSTALLED_DRIVER_VERSION" != "$DRIVER_VERSION" ]; then
43+
if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
44+
echo "Failed to get NVIDIA driver version ($INSTALLED_DRIVER_VERSION). Continuing"
45+
elif [ "$INSTALLED_DRIVER_VERSION" != "$DRIVER_VERSION" ]; then
3646
echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has been installed, but we expect to have $DRIVER_VERSION instead. Continuing"
3747
else
3848
HAS_NVIDIA_DRIVER=1
3949
echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has already been installed. Skipping NVIDIA driver installation"
4050
fi
51+
set -e
4152
fi
4253

4354
if [ "$HAS_NVIDIA_DRIVER" -eq 0 ]; then
@@ -51,17 +62,25 @@ install_nvidia_driver_amzn2() {
5162
sudo rm -fv /tmp/nvidia_driver
5263
fi
5364

65+
sudo modprobe nvidia || true
66+
echo "After installing NVIDIA driver"
67+
lspci
68+
lsmod
69+
modinfo nvidia || true
70+
5471
(
5572
set +e
5673
nvidia-smi
57-
status=$?
74+
NVIDIA_SMI_STATUS=$?
75+
5876
# Allowable exit statuses for nvidia-smi, see: https://github.com/NVIDIA/gpu-operator/issues/285
59-
if [ $status -eq 0 ] || [ $status -eq 14 ]; then
60-
echo "INFO: Ignoring allowed status ${status}"
77+
if [ "$NVIDIA_SMI_STATUS" -eq 0 ] || [ "$NVIDIA_SMI_STATUS" -eq 14 ]; then
78+
echo "INFO: Ignoring allowed status ${NVIDIA_SMI_STATUS}"
6179
else
62-
echo "ERROR: nvidia-smi exited with unresolved status ${status}"
63-
exit ${status}
80+
echo "ERROR: nvidia-smi exited with unresolved status ${NVIDIA_SMI_STATUS}"
81+
exit ${NVIDIA_SMI_STATUS}
6482
fi
83+
set -e
6584
)
6685
)
6786
}

.github/workflows/_mac-test-mps.yml

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,21 @@ jobs:
3737
name: ${{ inputs.build-environment }}
3838
use-gha: true
3939

40+
# This is copied from the main macos test workflow. It was missed in the earlier fix because macos M1
41+
# runners are shared and not ephemeral, so the issue wasn't manifested if the runners with the fix were
42+
# used
43+
- name: Install macOS homebrew dependencies
44+
run: |
45+
# Install dependencies
46+
brew install libomp
47+
brew link --force libomp
48+
4049
- name: Setup miniconda
4150
uses: pytorch/test-infra/.github/actions/setup-miniconda@main
4251
with:
4352
python-version: 3.9
4453
environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
54+
pip-requirements-file: .github/requirements/pip-requirements-${{ runner.os }}.txt
4555

4656
- name: Install PyTorch
4757
env:
@@ -51,10 +61,9 @@ jobs:
5161
run: |
5262
# shellcheck disable=SC1090
5363
set -ex
54-
${CONDA_RUN} python3 -mpip install "unittest-xml-reporting<=3.2.0,>=2.0.0"
5564
# As wheels are cross-compiled they are reported as x86_64 ones
5665
ORIG_WHLNAME=$(ls -1 dist/*.whl); ARM_WHLNAME=${ORIG_WHLNAME/x86_64/arm64}; mv ${ORIG_WHLNAME} ${ARM_WHLNAME}
57-
${CONDA_RUN} python3 -mpip install dist/*.whl
66+
${CONDA_RUN} python3 -mpip install --no-index --no-deps dist/*.whl
5867
5968
- name: Run MPS tests
6069
env:

.github/workflows/_mac-test.yml

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -94,19 +94,19 @@ jobs:
9494
with:
9595
python-version: 3.8
9696
environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
97+
pip-requirements-file: .github/requirements/pip-requirements-${{ runner.os }}.txt
9798

9899
- name: Setup miniconda (arm64, py3.9)
99100
if: ${{ runner.arch == 'ARM64' }}
100101
uses: pytorch/test-infra/.github/actions/setup-miniconda@main
101102
with:
102103
python-version: 3.9
103104
environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
105+
pip-requirements-file: .github/requirements/pip-requirements-${{ runner.os }}.txt
104106

105107
- name: Start monitoring script
106108
id: monitor-script
107109
run: |
108-
${CONDA_RUN} python3 -m pip install psutil==5.9.1
109-
${CONDA_RUN} python3 -m pip install pynvml==11.4.1
110110
${CONDA_RUN} python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
111111
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
112112
@@ -144,7 +144,7 @@ jobs:
144144
export PR_BODY="${PR_BODY//[\'\"]}"
145145
arch
146146
147-
${CONDA_RUN} python3 -mpip install $(echo dist/*.whl)[opt-einsum]
147+
${CONDA_RUN} python3 -mpip install --no-index --no-deps $(echo dist/*.whl)
148148
${CONDA_RUN} .jenkins/pytorch/macos-test.sh
149149
150150
- name: Get workflow job id
@@ -190,6 +190,4 @@ jobs:
190190
GHA_WORKFLOW_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
191191
run: |
192192
set -x
193-
${CONDA_RUN} python3 -m pip install -r requirements.txt
194-
${CONDA_RUN} python3 -m pip install boto3==1.19.12
195193
${CONDA_RUN} python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test

.github/workflows/inductor.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ jobs:
2929
{ config: "inductor", shard: 5, num_shards: 7, runner: "linux.g5.4xlarge.nvidia.gpu" },
3030
{ config: "inductor", shard: 6, num_shards: 7, runner: "linux.g5.4xlarge.nvidia.gpu" },
3131
{ config: "inductor", shard: 7, num_shards: 7, runner: "linux.g5.4xlarge.nvidia.gpu" },
32+
{ config: "inductor_distributed", shard: 1, num_shards: 1, runner: "linux.g5.12xlarge.nvidia.gpu" },
3233
]}
3334
3435
linux-bionic-cuda11_6-py3_10-gcc7-inductor-test:

0 commit comments

Comments
 (0)