pytorch
diff --git a/‎.circleci/cimodel/data/pytorch_build_data.py‎
Lines changed: 3 additions & 3 deletions b/‎.circleci/cimodel/data/pytorch_build_data.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎.circleci/cimodel/data/pytorch_build_definitions.py‎
Lines changed: 3 additions & 3 deletions b/‎.circleci/cimodel/data/pytorch_build_definitions.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎.github/workflows/pull.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/pull.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.jenkins/pytorch/macos-test.sh‎
Lines changed: 0 additions & 2 deletions b/‎.jenkins/pytorch/macos-test.sh‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎.jenkins/pytorch/test.sh‎
Lines changed: 2 additions & 4 deletions b/‎.jenkins/pytorch/test.sh‎
Lines changed: 2 additions & 4 deletions
diff --git a/‎.jenkins/pytorch/win-test.sh‎
Lines changed: 0 additions & 1 deletion b/‎.jenkins/pytorch/win-test.sh‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp‎
Lines changed: 10 additions & 8 deletions b/‎aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp‎
Lines changed: 10 additions & 8 deletions
diff --git a/‎test/jit/test_tracer.py‎
Lines changed: 3 additions & 1 deletion b/‎test/jit/test_tracer.py‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎test/mobile/test_lite_script_type.py‎
Lines changed: 64 additions & 1 deletion b/‎test/mobile/test_lite_script_type.py‎
Lines changed: 64 additions & 1 deletion
diff --git a/‎test/quantization/core/test_quantized_op.py‎
Lines changed: 36 additions & 1 deletion b/‎test/quantization/core/test_quantized_op.py‎
Lines changed: 36 additions & 1 deletion
@@ -74,7 +74,7 @@ def child_constructor(self):
             "mlc": MLCConfigNode,
             "vulkan": VulkanConfigNode,
             "parallel_tbb": ParallelTBBConfigNode,
-            "noarch": NoarchConfigNode,
+            "crossref": CrossRefConfigNode,
             "parallel_native": ParallelNativeConfigNode,
             "onnx": ONNXConfigNode,
             "libtorch": LibTorchConfigNode,
@@ -171,9 +171,9 @@ def child_constructor(self):
         return ImportantConfigNode
 
 
-class NoarchConfigNode(TreeConfigNode):
+class CrossRefConfigNode(TreeConfigNode):  # tree node registered under the "crossref" config key
     def init2(self, node_name):
-        self.props["is_noarch"] = node_name
+        self.props["is_crossref"] = node_name  # looked up later via find_prop("is_crossref")
 
     def child_constructor(self):
         return ImportantConfigNode
 
@@ -239,7 +239,7 @@ def instantiate_configs(only_slow_gradcheck):
         compiler_version = fc.find_prop("compiler_version")
         is_xla = fc.find_prop("is_xla") or False
         is_asan = fc.find_prop("is_asan") or False
-        is_noarch = fc.find_prop("is_noarch") or False
+        is_crossref = fc.find_prop("is_crossref") or False
         is_onnx = fc.find_prop("is_onnx") or False
         is_pure_torch = fc.find_prop("is_pure_torch") or False
         is_vulkan = fc.find_prop("is_vulkan") or False
@@ -283,8 +283,8 @@ def instantiate_configs(only_slow_gradcheck):
             python_version = fc.find_prop("pyver")
             parms_list[0] = fc.find_prop("abbreviated_pyver")
 
-        if is_noarch:
-            parms_list_ignored_for_docker_image.append("noarch")
+        if is_crossref:
+            parms_list_ignored_for_docker_image.append("crossref")
 
         if is_onnx:
             parms_list.append("onnx")
 
@@ -138,7 +138,7 @@ jobs:
         { include: [
           { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
           { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "noarch", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
+          { config: "crossref", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
         ]}
 
   linux-bionic-cuda11_3-py3_7-clang9-build:
 
@@ -4,8 +4,6 @@
 # shellcheck source=./macos-common.sh
 source "$(dirname "${BASH_SOURCE[0]}")/macos-common.sh"
 
-export PYTORCH_TEST_SKIP_NOARCH=1
-
 conda install -y six
 pip install -q hypothesis "expecttest==0.1.3" "librosa>=0.6.2" "numba<=0.49.1" psutil "scipy==1.6.3"
 
 
@@ -62,10 +62,8 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
   export BUILD_SPLIT_CUDA=ON
 fi
 
-if [[ "$BUILD_ENVIRONMENT" == *noarch* ]]; then
-  export PYTORCH_TEST_SKIP_NOARCH=0
-else
-  export PYTORCH_TEST_SKIP_NOARCH=1
+if [[ "$BUILD_ENVIRONMENT" == *crossref* ]]; then
+  export PYTORCH_TEST_WITH_CROSSREF=1
 fi
 
 if [[ -n "$PR_NUMBER" ]] && [[ -z "$CI_MASTER" || "$CI_MASTER" == "false" ]]; then
 
@@ -26,7 +26,6 @@ export TEST_DIR_WIN
 export PYTORCH_FINAL_PACKAGE_DIR="${PYTORCH_FINAL_PACKAGE_DIR:-/c/users/circleci/workspace/build-results}"
 PYTORCH_FINAL_PACKAGE_DIR_WIN=$(cygpath -w "${PYTORCH_FINAL_PACKAGE_DIR}")
 export PYTORCH_FINAL_PACKAGE_DIR_WIN
-export PYTORCH_TEST_SKIP_NOARCH=1
 
 mkdir -p "$TMP_DIR"/build/torch
 
 
@@ -12,6 +12,7 @@
 #include <ATen/native/quantized/cudnn/utils.h>
 #include <ATen/native/utils/ParamsHash.h>
 #include <ATen/TensorUtils.h>
+#include <c10/core/MemoryFormat.h>
 #include <c10/core/QScheme.h>
 #include <c10/cuda/CUDAFunctions.h>
 #include <c10/util/ArrayRef.h>
@@ -99,18 +100,19 @@ Tensor add(Tensor qa, Tensor qb, double output_scale, int64_t output_zero_point)
     }
     qa = qa.view(new_sizes);
     qb = qb.view(new_sizes);
+  } else if (qa.dim() == 4) {
+    qa = qa.contiguous(c10::MemoryFormat::ChannelsLast);
+    qb = qb.contiguous(c10::MemoryFormat::ChannelsLast);
   }
 
-  at::Tensor add_output = at::empty(qa.sizes(), at::device(at::kCUDA).dtype(at::kFloat));
-  at::Tensor quantized_output = at::_empty_affine_quantized(
-      qa.sizes(),
-      at::device(at::kCUDA).dtype(at::ScalarType::QInt8),
-      output_scale,
-      output_zero_point);
+  auto memory_format = qa.dim() == 4 ? at::MemoryFormat::ChannelsLast : at::MemoryFormat::Contiguous;
+  at::Tensor add_output = at::empty(qa.sizes(), at::device(at::kCUDA).dtype(at::kFloat), memory_format);
+  at::Tensor quantized_output = at::_empty_affine_quantized(qa.sizes(), at::device(at::kCUDA).dtype(at::ScalarType::QInt8),
+                                                            output_scale, output_zero_point, memory_format);
   // TODO: When cudnn enables support for broadcasting, we can remove this tensor
-  at::Tensor requantize_multiplier_tensor = at::empty(quantized_output.sizes(), at::device(at::kCUDA).dtype(at::kFloat));
+  at::Tensor requantize_multiplier_tensor = at::empty(quantized_output.sizes(), at::device(at::kCUDA).dtype(at::kFloat), memory_format);
   requantize_multiplier_tensor.fill_(qa.q_scale() / output_scale);
-  at::Tensor rhs_multiplier_tensor = at::empty(quantized_output.sizes(), at::device(at::kCUDA).dtype(at::kFloat));
+  at::Tensor rhs_multiplier_tensor = at::empty(quantized_output.sizes(), at::device(at::kCUDA).dtype(at::kFloat), memory_format);
   rhs_multiplier_tensor.fill_(qb.q_scale() / qa.q_scale());
 
   cudnnHandle_t handle = at::native::getCudnnHandle();
 
@@ -17,7 +17,7 @@
 sys.path.append(pytorch_test_dir)
 from torch.testing._internal.common_utils import suppress_warnings, \
     skipIfCompiledWithoutNumpy, enable_profiling_mode_for_profiling_tests, \
-    IS_SANDCASTLE, TemporaryFileName
+    IS_SANDCASTLE, TemporaryFileName, skipIfCrossRef
 from torch.testing._internal.jit_utils import JitTestCase, enable_cpu_fuser, \
     _tmp_donotuse_dont_inline_everything, _trace, RUN_CUDA, \
     RUN_CUDA_MULTI_GPU, make_global
@@ -511,6 +511,7 @@ def to_tensor(x, y):
         self.assertEqual(to_tensor_trace(x, y), to_tensor(x, y))
 
     @skipIfCompiledWithoutNumpy
+    @skipIfCrossRef
     def test_trace_warn(self):
         def fn(x):
             int(x)  # Warning 1.
@@ -1779,6 +1780,7 @@ def forward(self, x):
 
         torch.jit.trace(Mod(), (torch.rand(3, 4),))
 
+    @skipIfCrossRef
     def test_trace_records_names(self):
         def foo(bar, baz):
             baz = bar + 3
 
@@ -3,7 +3,7 @@
 import torch
 import torch.utils.bundled_inputs
 import io
-from typing import List, NamedTuple
+from typing import Dict, List, NamedTuple
 
 from torch.jit.mobile import _load_for_lite_interpreter
 from torch.testing._internal.common_utils import TestCase, run_tests
@@ -33,6 +33,69 @@ def forward(self, a: torch.Tensor):
             mobile_module_result
         )
 
+
+    def test_typing_dict_with_namedtuple(self):  # Dict[str, NamedTuple] must survive the lite-interpreter round trip
+        class Foo(NamedTuple):
+            id: torch.Tensor
+
+        class Bar(torch.nn.Module):
+            def __init__(self):
+                super(Bar, self).__init__()
+                self.foo = Foo(torch.tensor(1))
+
+            def forward(self, a: torch.Tensor):
+                self.foo = Foo(a)
+                re: Dict[str, Foo] = dict()  # Dict with NamedTuple values: the case this test targets
+                re["test"] = Foo(a)
+                return self.foo, re["test"]
+
+        # The corresponding bytecode (kept for reference only; this test does not assert on it) is
+        # (8,
+        #  ('__torch__.___torch_mangle_2.Bar.forward',
+        #   (('instructions',
+        #     (('STOREN', 1, 2),
+        #      ('DROPR', 1, 0),
+        #      ('DICT_CONSTRUCT', 0, 0),
+        #      ('STORE', 3, 0),
+        #      ('LOAD', 3, 0),
+        #      ('LOADC', 1, 0),
+        #      ('MOVE', 2, 0),
+        #      ('NAMED_TUPLE_CONSTRUCT', 1, 1),
+        #      ('OP', 0, 0),
+        #      ('MOVE', 3, 0),
+        #      ('LOADC', 1, 0),
+        #      ('DICT_INDEX', 0, 0),
+        #      ('LOADC', 0, 0),
+        #      ('TUPLE_INDEX', 0, 0),
+        #      ('RET', 0, 0))),
+        #    ('operators', (('aten::_set_item', 'str', 3),)),
+        #    ('constants', (0, 'test')),
+        #    ('types',
+        #     ('Dict[str,__torch__.Foo[NamedTuple, [[id, Tensor]]]]',
+        #      '__torch__.Foo[NamedTuple, [[id, Tensor]]]')),
+        #    ('register_size', 3)),
+        #   (('arguments',
+        #     ((('name', 'self'),
+        #       ('type', '__torch__.___torch_mangle_2.Bar'),
+        #       ('default_value', None)),
+        #      (('name', 'a'), ('type', 'Tensor'), ('default_value', None)))),
+        #    ('returns',
+        #     ((('name', ''), ('type', 'Tensor'), ('default_value', None)),)))))
+
+        sample_input = torch.tensor(5)
+        script_module = torch.jit.script(Bar())
+
+        script_module_result = script_module(sample_input)  # reference result from the scripted module
+
+        buffer_mobile = io.BytesIO(script_module._save_to_buffer_for_lite_interpreter())
+        buffer_mobile.seek(0)
+        mobile_module = _load_for_lite_interpreter(buffer_mobile)  # reload through the lite-interpreter loader
+        mobile_module_result = mobile_module(sample_input)
+        torch.testing.assert_allclose(  # scripted and reloaded modules must produce the same output
+            script_module_result,
+            mobile_module_result
+        )
+
     def test_typing_namedtuple_custom_classtype(self):
         class Foo(NamedTuple):
             id: torch.Tensor
 
@@ -832,7 +832,7 @@ def test_qadd_relu_same_qparams(self):
     """Tests the correctness of the cudnn add and add_relu op
     (Similar to test_qadd_relu_different_qparams, will probably merge in the future)"""
     @unittest.skipIf(not TEST_CUDNN, "cudnn is not enabled.")
-    @unittest.skip("Local only - currently the qconv2d_cudnn op is bulid "
+    @unittest.skip("Local only - currently the test_qadd_relu_cudnn op is bulid "
                    "with USE_EXPERIMENTAL_CUDNN_V8_API, we can enable the test "
                    "after it is built by default")
     def test_qadd_relu_cudnn(self):
@@ -865,6 +865,41 @@ def test_qadd_relu_cudnn(self):
         np.testing.assert_equal(qCrelu, qCrelu_hat.int_repr(),
                                 "Quantized addition with ReLU failed.")
 
+    """Tests the correctness of the cudnn add and add_relu op for nhwc format"""
+    @unittest.skipIf(not TEST_CUDNN, "cudnn is not enabled.")
+    @unittest.skip("Local only - currently the test_qadd_relu_cudnn_nhwc op is bulid "
+                   "with USE_EXPERIMENTAL_CUDNN_V8_API, we can enable the test "
+                   "after it is built by default")
+    def test_qadd_relu_cudnn_nhwc(self):  # CUDA quantized add/add_relu vs. a dequantize-and-requantize reference
+        dtype = torch.qint8
+        add_relu = torch.ops.quantized.add_relu
+        add = torch.ops.quantized.add
+
+        A = torch.rand(16, 8, 4, 12).to(device="cuda")  # NOTE(review): not converted to channels_last - intended?
+        B = torch.rand(16, 8, 4, 12).to(device="cuda")
+        scale_A = 2.5
+        scale_B = 6.3
+        scale_C = 12.9
+        zero_point = 0  # used for both inputs and for the output
+        qA = torch.quantize_per_tensor(A, scale=scale_A, zero_point=zero_point,
+                                       dtype=dtype)
+        qB = torch.quantize_per_tensor(B, scale=scale_B, zero_point=zero_point,
+                                       dtype=dtype)
+        # Add ground truth: sum the dequantized inputs, move to CPU, then quantize with scale_C / zero_point
+        C = (qA.dequantize() + qB.dequantize()).to(device="cpu").numpy()
+        qC = _quantize(C, scale_C, zero_point, dtype=np_dtype[dtype])
+        qC_hat = add(qA, qB, scale=scale_C, zero_point=zero_point).to(device="cpu")
+        np.testing.assert_equal(qC, qC_hat.int_repr(),
+                                "Quantized addition failed.")
+
+        # Add + ReLU ground truth: zero out the negative entries of the float sum before quantizing
+        Crelu = C.copy()
+        Crelu[C < 0] = 0
+        qCrelu = _quantize(Crelu, scale_C, zero_point, dtype=np_dtype[dtype])
+        qCrelu_hat = add_relu(qA, qB, scale=scale_C, zero_point=zero_point).to(device="cpu")
+        np.testing.assert_equal(qCrelu, qCrelu_hat.int_repr(),
+                                "Quantized addition with ReLU failed.")
+
     """Tests the correctness of the add and add_relu op."""
     def test_qadd_relu_different_qparams(self):
         for dtype in [torch.quint8, torch.qint8, torch.qint32]: