Closed
31 commits
bd0238a
Small comment fix
nouiz Sep 27, 2019
1542db8
Add tf_xla_supported_nodes flag to limit the nodes XLA considers.
nouiz Sep 27, 2019
e194f62
More Ops in the whitelist category.
nouiz Sep 30, 2019
08af421
Add more XLA whitelist shortcuts
nouiz Oct 1, 2019
c44dc1f
Better user error detection, less verbose output, and better error messages.
nouiz Oct 1, 2019
727edfd
Add an EXTRA category
nouiz Oct 1, 2019
65f614b
Add the flag value TF_XLA_FLAGS=--tf_xla_auto_jit=fusible to enable X…
nouiz Nov 1, 2019
85ed59b
Update documentation of the new flag
nouiz Nov 5, 2019
7cd6479
Use the absl containers.
nouiz Nov 5, 2019
4785944
Move code to a function to make it clearer.
nouiz Nov 5, 2019
03250e3
Rename the new flag to tf_xla_supported_ops
nouiz Nov 5, 2019
771393e
Fix typo and add comment.
nouiz Nov 14, 2019
eb9081b
Replace a series of ifs with a table to make the code simpler to understand.
nouiz Nov 15, 2019
aee8d90
Small update (typo, clang-format, const)
nouiz Nov 18, 2019
5599e95
XLALite: put BiasAddGrad in the right section and add the missing Bia…
nouiz Nov 19, 2019
902c64a
Code formatting
nouiz Nov 19, 2019
177ffc7
Repair the XLALite flag shortcut after the rebase
nouiz Nov 27, 2019
4576f25
clang-format
nouiz Nov 27, 2019
edf5517
Add TopKV2 to the XLALite whitelist. The XLA version is much faster than TF's.
nouiz Nov 28, 2019
7123fa0
Remove some duplicate names.
nouiz Dec 2, 2019
cec299a
Better documentation of the new parameter
nouiz Dec 2, 2019
0bdd35e
Rename tf_xla_supported_ops to tf_xla_ops_to_cluster
nouiz Dec 2, 2019
4bf1930
Add a warning about an experimental feature and simplify the code.
nouiz Dec 2, 2019
1f2a4cb
Remove the unique_ptr to simplify the code.
nouiz Dec 2, 2019
2654215
Small code simplification.
nouiz Dec 2, 2019
f370243
Convert non-trivial global destructors to local static variables.
nouiz Dec 3, 2019
4dcdcad
Rename a category
nouiz Dec 3, 2019
75e8b01
Add test to make sure all XLA-supported operations are in the XLALite wh…
nouiz Dec 3, 2019
9ff205f
Move the static inside the function to be safer.
nouiz Dec 9, 2019
0f45e20
Fix compilation failure in the new XLA test after the interface change.
nouiz Dec 13, 2019
cd2e988
Print the name that should be used to enable it. This
nouiz Dec 16, 2019
29 changes: 28 additions & 1 deletion tensorflow/compiler/jit/flags.cc
@@ -48,6 +48,15 @@ bool SetterForXlaAutoJitFlag(const string& value) {
     return true;
   }
 
+  if (value == "fusible") {
+    mark_for_compilation_flags->xla_auto_jit_flag
+        .optimization_level_single_gpu = 1;
+    mark_for_compilation_flags->xla_auto_jit_flag.optimization_level_general =
+        1;
+    mark_for_compilation_flags->tf_xla_ops_to_cluster = "FUSIBLE";
+    return true;
+  }
+
   absl::string_view value_sv(value);
   if (!absl::ConsumePrefix(&value_sv, "single-gpu(") ||
       !absl::ConsumeSuffix(&value_sv, ")") ||
@@ -65,7 +74,9 @@ void AppendMarkForCompilationPassFlagsInternal(std::vector<Flag>* flag_list) {
Flag("tf_xla_auto_jit", SetterForXlaAutoJitFlag, "0",
"Control compilation of operators into XLA computations on CPU and "
"GPU devices. 0 = use ConfigProto setting; -1 = off; 1 = on for "
"things very likely to be improved; 2 = on for everything. "
"things very likely to be improved; 2 = on for everything; "
"(experimental) fusible = only for Tensorflow operations that XLA "
"knows how to fuse. "
"If set to single-gpu(<N>) then this resolves to <N> for single-GPU "
"graphs (graphs that have at least one node placed on a GPU and no "
"more than one GPU is in use through the entire graph) and 0 "
@@ -78,6 +89,22 @@ void AppendMarkForCompilationPassFlagsInternal(std::vector<Flag>* flag_list) {
Flag("tf_xla_max_cluster_size",
&mark_for_compilation_flags->tf_xla_max_cluster_size,
"Maximum number of operators in an XLA compilation."),
Flag("tf_xla_ops_to_cluster",
&mark_for_compilation_flags->tf_xla_ops_to_cluster,
"(experimental) "
"Limit the operations clustered by XLA to these operations. "
"If multiple, separate them with commas. Shortcuts: "
" PW: All point-wise operations."
" RED: All reduction operations."
" MISC: Mixed operations."
" PWRED: TF operations that get converted to PW+RED operation in XLA."
" REDUCEWINDOW: TF operations like MaxPool/AvgPool that get "
"converted to ReduceWindow in XLA."
" REDUCEWINDOWPW: Operation that get converted to ReduceWindow + PW "
"(LRN, LRNGrad)."
" BN: TF FusedBatchNorm* operations."
" FUSIBLE: All TF operations that XLA can fuse (All the above). "
"You can also put any TF operation name, e.g. 'FUSIBLE,Matmul'."),
Flag("tf_xla_clustering_debug",
&mark_for_compilation_flags->tf_xla_clustering_debug,
"Dump graphs during XLA compilation."),
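The diff above wires the new fusible value into SetterForXlaAutoJitFlag and documents it in the tf_xla_auto_jit help text. As a minimal usage sketch (my illustration, not part of this PR), assuming a TensorFlow build that parses the TF_XLA_FLAGS environment variable at import time, as this file does:

import os

# TF_XLA_FLAGS is read when TensorFlow loads, so set it before the import.
# Per SetterForXlaAutoJitFlag above, "fusible" turns auto-clustering on at
# level 1 and restricts clustering to the FUSIBLE op category.
os.environ["TF_XLA_FLAGS"] = "--tf_xla_auto_jit=fusible"

import tensorflow as tf  # the flags take effect from here on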
3 changes: 3 additions & 0 deletions tensorflow/compiler/jit/flags.h
@@ -55,6 +55,9 @@ struct MarkForCompilationPassFlags {
   // Maximum number of operators in an XLA compilation.
   int32 tf_xla_max_cluster_size;
 
+  // If non-empty, limit XLA clustering to the following TF operations.
+  string tf_xla_ops_to_cluster;
+
   // Dump graphs during XLA compilation.
   bool tf_xla_clustering_debug;
 
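Since tf_xla_ops_to_cluster is declared here as a plain comma-separated string, category shortcuts and concrete op names can be mixed, as the help text's 'FUSIBLE,Matmul' example suggests. A hedged sketch of that (illustrative values; note the TF op is registered as MatMul):

import os

# Illustrative: cluster everything XLA can fuse, plus MatMul, with
# auto-clustering fully on. Again, set before importing TensorFlow.
os.environ["TF_XLA_FLAGS"] = (
    "--tf_xla_auto_jit=2 --tf_xla_ops_to_cluster=FUSIBLE,MatMul"
)

import tensorflow as tf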