Skip to content

Commit e5ea34a

Browse files
caisq authored and tensorflower-gardener committed
tfdb: Debug nodes inserter
EXPERIMENTAL: Insert special debug ops (e.g., DebugIdentity) into the graph for debugging. Currently, each debug op needs to take exactly one input and have the string attribute "tensor_name" to indicate what tensor it watches.

For example, before the node insertion, the graph may look like:

  A:0 -----------1----------> B
       |
        ---------2-----------> C

wherein the output slot 0 of node A feeds as the input to node B through edge 1 and to node C through edge 2.

After the node insertion, assuming both B and C have non-Ref input, the graph becomes:

  A:0 ---3---> Copy -----------4----------> B
                |
                 ---------5--------> C
                |
                 ---------6--------> X

If a node (e.g., B) has Ref input, the graph becomes:

           ----------------4---------------> B
           |
  A:0 ---3-----> Copy -----------5----------> C
                  |
                   -----------6--------> X

In other words, we do not feed Refs to deep-copies to downstream nodes.

The Copy node is the inserted deep-copy node that copies the input tensor on-device (e.g., CPU-to-CPU or GPU-to-GPU deep copy), which reduces the likelihood of racy updates during debug tensor-watching. X is the newly created debug node that transforms the input (copy of the watched tensor) into a debug signal.

DebugIdentity is the simplest debugging paradigm, in which the debug signal (i.e., X:0) equals the tensor itself. More sophisticated debug ops can be used to transform the tensor into other useful debug signals. An example is the added DebugNanCounter op.

If the nodes (A, B and C) are located on GPU and the edges from A to B or C are HOST_MEMORY, the CopyHost op will be used instead of the Copy op.

A reserved string attribute "debug_url" is created for the debug ops to make it possible to send debug signals to files or RPC calls in the future.

Other points worth noting:
* The debug ops have control-edge connections to the original destination node, in order to ensure that the debug signals are deterministically generated before the destination node executes.
* More than one debug op can be added to watch a tensor.
* A new field called "DebugTensorWatch" is added to RunOptions to support debug node insertion.
* A new method GPUUtil::CopyGPUTensorToSameGPU has been added to make GPU-to-GPU deep-copy of tensors possible.
* The two test files (debug_gateway_test.cc and debug_gateway_gpu_test.cc) have been consolidated to the former, by using the GOOGLE_CUDA macro.

Change: 127562075
1 parent 25352e6 commit e5ea34a

17 files changed

Lines changed: 1364 additions & 255 deletions

tensorflow/core/BUILD

Lines changed: 23 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -560,6 +560,8 @@ filegroup(
560560
"client/**/*.cc",
561561
"common_runtime/**/*.h",
562562
"common_runtime/**/*.cc",
563+
"debug/**/*.h",
564+
"debug/**/*.cc",
563565
"framework/**/*.h",
564566
"framework/**/*.cc",
565567
"graph/**/*.h",
@@ -1085,6 +1087,7 @@ tf_cuda_library(
10851087
linkstatic = 1,
10861088
deps = [
10871089
":core_cpu_internal",
1090+
":debug_graph_utils",
10881091
":framework",
10891092
":gpu_tracer",
10901093
":lib",
@@ -1114,6 +1117,23 @@ tf_cuda_library(
11141117
alwayslink = 1,
11151118
)
11161119

1120+
# Builds debug/debug_graph_utils.{h,cc}, the header that
# common_runtime/direct_session.h includes.
tf_cuda_library(
1121+
name = "debug_graph_utils",
1122+
srcs = ["debug/debug_graph_utils.cc"],
1123+
hdrs = ["debug/debug_graph_utils.h"],
1124+
copts = tf_copts(),
1125+
linkstatic = 1,
1126+
deps = [
1127+
":core_cpu_internal",
1128+
":framework",
1129+
":lib",
1130+
":lib_internal",
1131+
":proto_text",
1132+
":protos_all_cc",
1133+
],
1134+
alwayslink = 1,
1135+
)
1136+
11171137
cc_library(
11181138
name = "example_parser_configuration",
11191139
srcs = ["example/example_parser_configuration.cc"],
@@ -1580,42 +1600,8 @@ tf_cc_test(
15801600
],
15811601
)
15821602

1583-
tf_cc_test(
1584-
name = "debug/debug_gateway_test",
1585-
size = "small",
1586-
linkstatic = tf_kernel_tests_linkstatic(),
1587-
deps = [
1588-
":core",
1589-
":core_cpu",
1590-
":core_cpu_internal",
1591-
":debug_gateway_internal",
1592-
":direct_session_internal",
1593-
":framework",
1594-
":framework_internal",
1595-
":lib",
1596-
":lib_internal",
1597-
":ops",
1598-
":protos_all_cc",
1599-
":test",
1600-
":test_main",
1601-
":testlib",
1602-
"//tensorflow/cc:cc_ops",
1603-
"//tensorflow/core/kernels:control_flow_ops",
1604-
"//tensorflow/core/kernels:cwise_op",
1605-
"//tensorflow/core/kernels:dense_update_ops",
1606-
"//tensorflow/core/kernels:fifo_queue_op",
1607-
"//tensorflow/core/kernels:identity_op",
1608-
"//tensorflow/core/kernels:matmul_op",
1609-
"//tensorflow/core/kernels:ops_util",
1610-
"//tensorflow/core/kernels:queue_ops",
1611-
"//tensorflow/core/kernels:session_ops",
1612-
"//tensorflow/core/kernels:variable_ops",
1613-
"//third_party/eigen3",
1614-
],
1615-
)
1616-
16171603
tf_cc_test_gpu(
1618-
name = "debug/debug_gateway_gpu_test",
1604+
name = "debug/debug_gateway_test",
16191605
size = "small",
16201606
args = ["--heap_check=local"],
16211607
linkstatic = tf_kernel_tests_linkstatic(),
@@ -1625,6 +1611,7 @@ tf_cc_test_gpu(
16251611
":core_cpu",
16261612
":core_cpu_internal",
16271613
":debug_gateway_internal",
1614+
":debug_graph_utils",
16281615
":direct_session",
16291616
":direct_session_internal",
16301617
":framework",
@@ -1637,6 +1624,7 @@ tf_cc_test_gpu(
16371624
":test_main",
16381625
":testlib",
16391626
"//tensorflow/cc:cc_ops",
1627+
"//tensorflow/core/kernels:debug_ops",
16401628
"//tensorflow/core/kernels:ops_util",
16411629
],
16421630
)

tensorflow/core/common_runtime/direct_session.cc

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,15 @@ Status DirectSession::Run(const RunOptions& run_options,
284284
}
285285
thread::ThreadPool* pool = thread_pools_[run_options.inter_op_thread_pool()];
286286

287+
// EXPERIMENTAL: Options that allow the client to insert nodes into partition
288+
// graphs for debugging.
289+
if (!run_options.debug_tensor_watch_opts().empty()) {
290+
debug_node_inserter_.reset(
291+
new DebugNodeInserter(run_options.debug_tensor_watch_opts()));
292+
} else {
293+
debug_node_inserter_.reset(nullptr);
294+
}
295+
287296
// Check if we already have an executor for these arguments.
288297
ExecutorsAndKeys* executors_and_keys;
289298
RunStateArgs run_state_args;
@@ -794,6 +803,12 @@ Status DirectSession::GetOrCreateExecutors(
794803

795804
partition_graph = iter->second.release();
796805
optimizer.Optimize(lib, device, &partition_graph);
806+
807+
// EXPERIMENTAL: tfdb inserts debug nodes (i.e., probes) to the graph
808+
if (debug_node_inserter_) {
809+
TF_RETURN_IF_ERROR(
810+
debug_node_inserter_->InsertNodes(partition_graph, params.device));
811+
}
797812
iter->second.reset(partition_graph);
798813

799814
TF_RETURN_IF_ERROR(EnsureMemoryTypes(DeviceType(device->device_type()),

tensorflow/core/common_runtime/direct_session.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ limitations under the License.
2929
#include "tensorflow/core/common_runtime/executor.h"
3030
#include "tensorflow/core/common_runtime/rendezvous_mgr.h"
3131
#include "tensorflow/core/common_runtime/simple_graph_execution_state.h"
32+
#include "tensorflow/core/debug/debug_graph_utils.h"
3233
#include "tensorflow/core/framework/cancellation.h"
3334
#include "tensorflow/core/framework/graph.pb.h"
3435
#include "tensorflow/core/framework/session_state.h"
@@ -253,7 +254,9 @@ class DirectSession : public Session {
253254

254255
TF_DISALLOW_COPY_AND_ASSIGN(DirectSession);
255256

257+
// EXPERIMENTAL: debugger (tfdb) related
256258
friend class DebugGateway;
259+
std::unique_ptr<DebugNodeInserter> debug_node_inserter_;
257260
};
258261

259262
} // end namespace tensorflow

tensorflow/core/common_runtime/gpu/gpu_util.cc

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -426,4 +426,32 @@ uint64 GPUUtil::Checksum(const Tensor& tensor) {
426426
tensor.TotalBytes(), 0);
427427
}
428428

429+
// static
// Copies the contents of 'src_gpu_tensor' into 'dst_gpu_tensor' by issuing a
// memcpy on the stream that PrepareCopy() returns for 'gpu_device' and
// 'device_context'.
//
// 'done' is always invoked exactly once: with the PrepareCopy() error if
// preparation fails (in which case no copy is issued), otherwise with
// Status::OK().
//
// The number of bytes copied is src_gpu_tensor->TotalBytes(); this function
// itself does not check that 'dst_gpu_tensor' is large enough.
430+
void GPUUtil::CopyGPUTensorToSameGPU(Device* gpu_device,
431+
const DeviceContext* device_context,
432+
const Tensor* src_gpu_tensor,
433+
Tensor* dst_gpu_tensor,
434+
StatusCallback done) {
435+
VLOG(1) << "CopyGPUTensorToSameGPU";
436+
const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
437+
gpu::Stream* send_stream = nullptr;
438+
Status s = PrepareCopy(gpu_device, device_context, *src_gpu_tensor,
439+
dst_gpu_tensor, &dev_info, &send_stream);
440+
if (!s.ok()) {
441+
done(s);
442+
return;
443+
}
444+
445+
const int64 total_bytes = src_gpu_tensor->TotalBytes();
446+
// Empty tensors have nothing to copy; skip issuing the memcpy entirely.
if (total_bytes > 0) {
447+
void* src_ptr = GetBase(src_gpu_tensor);
448+
DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes);
449+
void* dst_ptr = GetBase(dst_gpu_tensor);
450+
DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes);
451+
send_stream->ThenMemcpy(&gpu_dst_ptr, gpu_src_ptr, total_bytes);
452+
}
453+
454+
// NOTE(review): 'done' runs right after the memcpy is issued, with no
// explicit wait in this function. Presumably the copy is asynchronous and
// consumers rely on ordering within the same stream -- confirm.
done(Status::OK());
455+
}
456+
429457
} // namespace tensorflow

tensorflow/core/common_runtime/gpu/gpu_util.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,16 @@ class GPUUtil {
100100
AllocatorAttributes dst_alloc_attr,
101101
const Tensor* input, Tensor* output,
102102
StatusCallback done);
103+
104+
// Deep-copying of GPU tensor on the same device.
105+
// 'src_gpu_tensor''s and 'dst_gpu_tensor''s backing memory must be on
106+
// 'gpu_device' and 'dst_gpu_tensor' must be allocated to be of the same
107+
// size as 'src_gpu_tensor'. 'done' is invoked with the resulting status.
108+
static void CopyGPUTensorToSameGPU(Device* gpu_device,
109+
const DeviceContext* device_context,
110+
const Tensor* src_gpu_tensor,
111+
Tensor* dst_gpu_tensor,
112+
StatusCallback done);
103113
};
104114

105115
} // namespace tensorflow

0 commit comments

Comments
 (0)