
Commit 680c177

Update on "[Quant][FX] Add backend config for onednn backend and fuse Linear-LeakyReLU"
**Summary**
Add a backend config for the onednn backend so that it can support more post-op fusion for int8 inference. As a first step, `Linear - LeakyReLU` fusion is implemented, building on previous PRs.

**Test plan**
python test_quantization.py TestFuseFx

cc jerryzh168 jianyuh raghuramank100 jamesr66a vkuzo jgong5 leslie-fang-intel mingfeima XiaobingSuper sanchitintel ashokei jingxu10

[ghstack-poisoned]
2 parents 700d4eb + f52839f commit 680c177
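For context, here is a minimal sketch of how a backend config like this is consumed from the FX quantization API. The entry point `get_onednn_backend_config` and the `"onednn"` qconfig backend string are assumptions based on how this stack was later exposed in torch.ao.quantization; exact names at this commit may differ:

```python
import torch
from torch.ao.quantization import get_default_qconfig_mapping
from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx
# Assumed entry point; this PR stack adds an onednn BackendConfig.
from torch.ao.quantization.backend_config import get_onednn_backend_config

class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(8, 8)
        self.leaky_relu = torch.nn.LeakyReLU(0.01)

    def forward(self, x):
        return self.leaky_relu(self.linear(x))

m = M().eval()
example_inputs = (torch.randn(1, 8),)
backend_config = get_onednn_backend_config()
qconfig_mapping = get_default_qconfig_mapping("onednn")

prepared = prepare_fx(m, qconfig_mapping, example_inputs,
                      backend_config=backend_config)
prepared(*example_inputs)  # calibration pass
quantized = convert_fx(prepared, backend_config=backend_config)
# With the onednn config, Linear + LeakyReLU is matched as one fusion
# pattern and quantized as a single unit for int8 inference.
```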

File tree

144 files changed: +2604 −2050 lines changed


.github/ci_commit_pins/multipy.txt

Lines changed: 1 addition & 0 deletions
```diff
@@ -0,0 +1 @@
+7dd29931fa8e9bb7c970f05f8c0dc13b69e17494
```

.github/ci_commit_pins/xla.txt

Lines changed: 1 addition & 1 deletion
```diff
@@ -1 +1 @@
-d43ed3c4d24ea17df0a57fed7f338a9c7b7d1ef0
+06cc6d4f596e764502e3ce5ab998c8986fd52e5b
```

.jenkins/pytorch/common_utils.sh

Lines changed: 5 additions & 1 deletion
```diff
@@ -158,10 +158,12 @@ function setup_torchdeploy_deps(){

 function checkout_install_torchdeploy() {
   local commit
+  commit=$(get_pinned_commit multipy)
   setup_torchdeploy_deps
   pushd ..
   git clone --recurse-submodules https://github.com/pytorch/multipy.git
   pushd multipy
+  git checkout "${commit}"
   python multipy/runtime/example/generate_examples.py
   pip install -e . --install-option="--cudatests"
   popd
@@ -197,7 +199,9 @@ function checkout_install_torchbench() {
   git clone https://github.com/pytorch/benchmark torchbench
   pushd torchbench
   git checkout no_torchaudio
-  python install.py
+  # Occasionally the installation may fail on one model but it is ok to continue
+  # to install and test other models
+  python install.py --continue_on_fail
   popd
 }

```

.lintrunner.toml

Lines changed: 1 addition & 0 deletions
```diff
@@ -783,6 +783,7 @@ include_patterns = [
     'benchmarks/dynamo/**/*.py',
     'torch/_inductor/**/*.py',
     'test/inductor/**/*.py',
+    'test/test_weak.py',
     'torch/onnx/**/*.py',
     'torch/package/**/*.py',
     'torch/_decomp/**/*.py',
```

CONTRIBUTING.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -90,6 +90,7 @@ To develop PyTorch on your machine, here are some tips:
 * Python >= 3.7 (3.7.6+ recommended)

 ### Instructions
+_**Note**: If you get stuck at any step, check out the [tips and debugging](#tips-and-debugging) section below for common solutions_

 1. Uninstall all existing PyTorch installs. You may need to run `pip
    uninstall torch` multiple times. You'll know `torch` is fully
```

aten/src/ATen/FunctionalStorageImpl.cpp

Lines changed: 10 additions & 0 deletions
```diff
@@ -65,6 +65,16 @@ const Tensor apply_update(const FunctionalStorageImpl::Update& update, const Ten


 c10::SymInt get_nbytes(const Tensor& value) {
+  // The functionalization story when wrapping tensors that don't have storage
+  // is a bit wonky, but fortunately for some models (e.g., dlrm) we never
+  // actually perform mutations on these tensors, so you never really get
+  // called out on it. For now, functionalization still creates "storages"
+  // for these tensors (which is wrong), but we don't give them any space.
+  // A more proper fix would be to have a SparseFunctionalTensorWrapper that
+  // models sparse correctly.
+  if (value.is_sparse()) {
+    return 0;
+  }
   if (value.unsafeGetTensorImpl()->has_symbolic_sizes_strides()) {
     // Today, the two implementations of SymInt are in Python (proxy tensor),
     // and lazy tensor (LTC/XLA).
```
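The reason nbytes is ill-defined here is that a sparse tensor carries no single dense storage; its data is split across an indices tensor and a values tensor. A quick Python illustration using the standard torch API (not code from this commit):

```python
import torch

# Two non-zero entries out of 1000; the data lives in an indices tensor
# and a values tensor, not in one contiguous buffer.
s = torch.sparse_coo_tensor(
    indices=torch.tensor([[3, 512]]),
    values=torch.tensor([1.0, 2.0]),
    size=(1000,),
).coalesce()

print(s.is_sparse)        # True
print(s.indices().shape)  # torch.Size([1, 2])
print(s.values().shape)   # torch.Size([2])
# There is no single storage whose byte count describes `s`, which is
# why get_nbytes() above reports 0 for sparse tensors for now.
```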

aten/src/ATen/core/boxing/KernelFunction_impl.h

Lines changed: 2 additions & 2 deletions
```diff
@@ -64,7 +64,7 @@ inline typename remove_symint<c10::SymInt>::type unpackSymInt(c10::SymInt x) {

 template <>
 inline typename remove_symint<c10::SymIntArrayRef>::type unpackSymInt(c10::SymIntArrayRef x) {
-  return C10_AS_INTARRAYREF_SLOW(x);
+  return c10::asIntArrayRefSlow(x);
 }

 template <>
@@ -74,7 +74,7 @@ inline typename remove_symint<c10::optional<c10::SymInt>>::type unpackSymInt(c10

 template <>
 inline typename remove_symint<at::OptionalSymIntArrayRef>::type unpackSymInt(at::OptionalSymIntArrayRef x) {
-  return x.has_value() ? c10::make_optional(C10_AS_INTARRAYREF_SLOW(*x)) : c10::nullopt;
+  return x.has_value() ? c10::make_optional(c10::asIntArrayRefSlow(*x)) : c10::nullopt;
 }

 template<class Return, class... Args>
```

aten/src/ATen/core/union_type.cpp

Lines changed: 2 additions & 2 deletions
```diff
@@ -228,7 +228,7 @@ UnionType::UnionType(std::vector<TypePtr> reference, TypeKind kind) : SharedType
 }

 UnionTypePtr UnionType::create(std::vector<TypePtr> reference) {
-  auto union_type = new UnionType(std::move(reference));
+  UnionTypePtr union_type(new UnionType(std::move(reference)));

   // Some very special-cased logic for `Optional`. This will be deleted
   // in a later PR
@@ -267,7 +267,7 @@ UnionTypePtr UnionType::create(std::vector<TypePtr> reference) {
    }
  }

-  return UnionTypePtr(union_type);
+  return union_type;
 }

 c10::optional<TypePtr> UnionType::subtractTypeSet(std::vector<TypePtr>& to_subtract) const {
```

aten/src/ATen/cuda/CUDAGeneratorImpl.cpp

Lines changed: 2 additions & 0 deletions
```diff
@@ -101,6 +101,7 @@ CUDAGeneratorImpl::CUDAGeneratorImpl(DeviceIndex device_index)
   : c10::GeneratorImpl{Device(DeviceType::CUDA, device_index),
               DispatchKeySet(c10::DispatchKey::CUDA)} {
   at::cuda::assertNotCapturing("Cannot construct a new CUDAGeneratorImpl");
+  no_reset_rnn_state_.clear();
 }

 /**
@@ -113,6 +114,7 @@ void CUDAGeneratorImpl::set_current_seed(uint64_t seed) {
   at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::set_current_seed");
   seed_ = seed;
   philox_offset_per_thread_ = 0;
+  no_reset_rnn_state_.clear();
 }

 #define CAPTURE_DEFAULT_GENS_MSG \
```
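At the Python level, the intended effect (as read from this change) is that re-seeding the CUDA generator invalidates cached cuDNN RNN dropout state, so an RNN re-run under the same seed reproduces its dropout masks. A hedged repro sketch, requiring a CUDA build with cuDNN; the printed result reflects the goal of this change, not something verified here:

```python
import torch

if torch.cuda.is_available():
    rnn = torch.nn.LSTM(16, 16, num_layers=2, dropout=0.5).cuda().train()
    x = torch.ones(5, 3, 16, device="cuda")

    torch.cuda.manual_seed(42)  # clears no_reset_rnn_state_ ...
    y1, _ = rnn(x)              # ... so this call regenerates dropout state

    torch.cuda.manual_seed(42)  # same seed, flag cleared again
    y2, _ = rnn(x)

    # With the reset applied, both runs should use identical dropout masks.
    print(torch.allclose(y1, y2))
```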

aten/src/ATen/cuda/CUDAGeneratorImpl.h

Lines changed: 6 additions & 0 deletions
```diff
@@ -4,6 +4,7 @@
 #include <ATen/cuda/detail/PhiloxCudaStateRaw.cuh>
 #include <ATen/Context.h>
 #include <limits>
+#include <atomic>

 namespace at {
 /**
@@ -104,6 +105,10 @@ struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl {
   uint64_t capture_epilogue();
   PhiloxCudaState philox_cuda_state(uint64_t increment);

+  bool reset_rnn_state() {
+    return !no_reset_rnn_state_.test_and_set();
+  }
+
   // Temporarily accommodates call sites that use philox_engine_inputs.
   // Allows incremental refactor of call sites to use philox_cuda_state.
   std::pair<uint64_t, uint64_t> philox_engine_inputs(uint64_t increment);
@@ -118,6 +123,7 @@ struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl {
   int64_t* offset_extragraph_{};
   uint32_t offset_intragraph_ = 0;
   bool graph_expects_this_gen_ = false;
+  std::atomic_flag no_reset_rnn_state_;
 };

 namespace cuda {
```
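The `std::atomic_flag` added above implements a consume-once latch: `clear()` arms it (on construction and whenever the seed changes), and the first subsequent `reset_rnn_state()` call returns true while atomically disarming it, so exactly one caller sees the pending reset. A rough Python sketch of the same pattern, for illustration only (a lock stands in for the atomic flag):

```python
import threading

class ResetLatch:
    """Consume-once latch mirroring the atomic_flag pattern above.

    arm() marks that a reset is pending (analogous to
    no_reset_rnn_state_.clear()); consume() returns True exactly once
    per arming (analogous to !no_reset_rnn_state_.test_and_set()).
    A sketch for illustration, not the PyTorch implementation.
    """

    def __init__(self):
        self._lock = threading.Lock()
        self._armed = True  # a freshly constructed generator starts armed

    def arm(self):
        with self._lock:
            self._armed = True

    def consume(self):
        with self._lock:
            was_armed, self._armed = self._armed, False
            return was_armed

latch = ResetLatch()
assert latch.consume() is True   # first caller sees the pending reset
assert latch.consume() is False  # subsequent callers do not
latch.arm()                      # e.g., the seed changed
assert latch.consume() is True
```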
