Skip to content

Commit ea7652e

Browse files
committed
Add debug and fix some bugs in CPU fuser
* avoid writing `x + 1.0000*y`, which causes a promotion from float to double
* refactor tests to make writing graphs easier (while not strictly necessary, I have some benchmarking code that I am using to make the fuser faster that is easier to write in this form)
* add an option to dump the disassembly of the CPU fused code for perf debugging
1 parent 0b7f1e5 commit ea7652e

File tree

3 files changed

+185
-98
lines changed

3 files changed

+185
-98
lines changed

torch/csrc/jit/fusion_compiler.cpp

Lines changed: 43 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@ std::string valueName(Value * n) {
201201
auto s = at::Scalar(t);
202202
return (s.isIntegral()) ?
203203
std::to_string(s.toLong()) :
204-
std::to_string(s.toDouble());
204+
(std::to_string(s.toDouble()) + "f");
205205
}
206206

207207
const char * scalarTypeName(at::ScalarType type) {
@@ -558,10 +558,15 @@ struct CUDAFusionFunction : public CompiledFusionFunction {
558558
struct TempFile {
559559
TH_DISALLOW_COPY_AND_ASSIGN(TempFile);
560560
TempFile(const std::string & t, int suffix) {
561+
// mkstemps edits its first argument in place
562+
// so we make a copy of the string here, including null terminator
561563
std::vector<char> tt(t.c_str(), t.c_str() + t.size() + 1);
562564
int fd = mkstemps(tt.data(), suffix);
563565
JIT_ASSERT(fd != -1);
564566
file_ = fdopen(fd, "r+");
567+
568+
// - 1 because tt.size() includes the null terminator,
569+
// but std::string does not expect one
565570
name_ = std::string(tt.begin(), tt.end() - 1);
566571
}
567572
const std::string & name() const {
@@ -623,18 +628,29 @@ static const std::string cpp_template = "/tmp/pytorch_fuserXXXXXX.cpp";
623628
static const std::string compile_string =
624629
"\"${cxx}\" -O3 -g -march=native -std=c++11 -fPIC -shared \"${cpp_file}\" -o \"${so_file}\"";
625630

626-
static void runCompiler(const std::string & cxx, const std::string & cpp_file, const std::string & so_file) {
631+
static void runCompiler(FusionCompilerConfig & config, const std::string & cpp_file, const std::string & so_file) {
627632
TemplateEnv env;
628-
env.s("cxx", cxx);
633+
env.s("cxx", config.cxx);
629634
env.s("cpp_file",cpp_file);
630635
env.s("so_file",so_file);
631636
std::string result = format(compile_string,env);
632637
int r = system(result.c_str());
633638
JIT_ASSERT(r == 0);
634639
}
635640

641+
642+
static const std::string disas_string =
643+
"objdump -M intel -d \"${so_file}\"";
644+
static void disas(const std::string & so_file) {
645+
TemplateEnv env;
646+
env.s("so_file", so_file);
647+
std::string cmd = format(disas_string, env);
648+
int r = system(cmd.c_str());
649+
JIT_ASSERT(r == 0);
650+
}
651+
636652
struct CPUFusionFunction : public CompiledFusionFunction {
637-
CPUFusionFunction(const std::string & name, AnnotatedGraph & agraph, const std::string & cxx)
653+
CPUFusionFunction(const std::string & name, AnnotatedGraph & agraph, FusionCompilerConfig & config)
638654
: CompiledFusionFunction(name, agraph) {
639655
TempFile so_file(so_template, 3);
640656
TempFile cpp_file(cpp_template, 4);
@@ -644,7 +660,11 @@ struct CPUFusionFunction : public CompiledFusionFunction {
644660
compilation_unit = cu.str();
645661
cpp_file.write(compilation_unit);
646662
cpp_file.sync();
647-
runCompiler(cxx, cpp_file.name(), so_file.name());
663+
runCompiler(config, cpp_file.name(), so_file.name());
664+
if(config.debug) {
665+
std::cout << compilation_unit << "\n";
666+
disas(so_file.name());
667+
}
648668
so_lib.reset(new DynamicLibrary(so_file.name().c_str()));
649669
kernel = reinterpret_cast<void(*)(uint32_t, void**)>(so_lib->sym(name.c_str()));
650670
}
@@ -690,7 +710,7 @@ std::shared_ptr<CompiledFusionFunction> FusionCompiler::getOrCompile(AnnotatedGr
690710
#endif
691711
} else {
692712
JIT_ASSERT(canCompileOnCPU());
693-
raw_func = new CPUFusionFunction(name, agraph, cxx);
713+
raw_func = new CPUFusionFunction(name, agraph, config_);
694714
}
695715
it = cache.emplace(key_, std::shared_ptr<CompiledFusionFunction>(raw_func)).first;
696716
}
@@ -711,15 +731,23 @@ std::shared_ptr<CompiledFusionFunction> FusionCompiler::getOrCompile(Node* fusio
711731
return getOrCompile(agraph);
712732
}
713733

714-
void FusionCompiler::debugLaunchGraph(Graph & graph, bool is_cuda, at::ArrayRef<at::Tensor> inputs, at::ArrayRef<at::Tensor> outputs) {
734+
735+
std::shared_ptr<CompiledFusionFunction> FusionCompiler::getOrCompile(Graph & graph,
736+
bool is_cuda,
737+
at::ArrayRef<at::Tensor> inputs,
738+
at::ArrayRef<at::Tensor> outputs) {
715739
AnnotatedGraph agraph(graph, is_cuda);
716740
for(auto & i : inputs) {
717-
agraph.input_desc.emplace_back(i);
741+
agraph.input_desc.emplace_back(i);
718742
}
719743
for(auto & i : outputs) {
720-
agraph.output_desc.emplace_back(i);
744+
agraph.output_desc.emplace_back(i);
721745
}
722-
auto func = getOrCompile(agraph);
746+
return getOrCompile(agraph);
747+
}
748+
749+
void FusionCompiler::debugLaunchGraph(Graph & graph, bool is_cuda, at::ArrayRef<at::Tensor> inputs, at::ArrayRef<at::Tensor> outputs) {
750+
auto func = getOrCompile(graph, is_cuda, inputs, outputs);
723751
func->launch_with_tensors(inputs, outputs);
724752
}
725753

@@ -736,13 +764,13 @@ static bool programExists(const std::string & program) {
736764
FusionCompiler::FusionCompiler() {
737765
const char * cxx_env = getenv("CXX");
738766
if(cxx_env != nullptr) {
739-
cxx = cxx_env;
740-
} else {
741-
cxx = "g++";
767+
config_.cxx = cxx_env;
742768
}
743-
if(!programExists(cxx)) {
744-
cxx = "";
769+
if(!programExists(config_.cxx)) {
770+
config_.cxx = "";
745771
}
772+
const char * debug_env = getenv("PYTORCH_FUSION_DEBUG");
773+
config_.debug = debug_env && atoi(debug_env) != 0;
746774
}
747775

748776
//TODO: thread safety

torch/csrc/jit/fusion_compiler.h

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,11 @@ struct CompiledFusionFunction {
119119
std::vector<ConcatDesc> concat_desc;
120120
};
121121

122+
struct FusionCompilerConfig {
123+
std::string cxx = "g++"; // compiler location
124+
bool debug = false; // emit debugging information about fusions
125+
};
126+
122127
// caching compiler
123128
struct FusionCompiler {
124129
TH_DISALLOW_COPY_AND_ASSIGN(FusionCompiler);
@@ -129,16 +134,21 @@ struct FusionCompiler {
129134
// uses type annotations in fusion_group to create Annotated graph
130135
std::shared_ptr<CompiledFusionFunction> getOrCompile(Node * fusion_group);
131136

137+
// uses inputs/outputs as examples to infer contiguity, does not run the graph
138+
std::shared_ptr<CompiledFusionFunction> getOrCompile(Graph & graph,
139+
bool is_cuda,
140+
at::ArrayRef<at::Tensor> inputs,
141+
at::ArrayRef<at::Tensor> outputs);
132142
// debugging function that lets you do everything from compilation to execution
133143
// in one step.
134144
// this should not be used in the hot path of execution because it has to serialize
135145
// the graph each time
136146
void debugLaunchGraph(Graph & graph, bool is_cuda, at::ArrayRef<at::Tensor> inputs, at::ArrayRef<at::Tensor> outputs);
137147
bool canCompileOnCPU() const {
138-
return cxx.size() > 0;
148+
return config_.cxx.size() > 0;
139149
}
140150
private:
141-
std::string cxx; // compiler location
151+
FusionCompilerConfig config_;
142152
std::unordered_map<std::string, std::shared_ptr<CompiledFusionFunction>> cache;
143153
};
144154

0 commit comments

Comments
 (0)