@@ -48,6 +48,15 @@ bool SetterForXlaAutoJitFlag(const string& value) {
     return true;
   }

+  if (value == "fusible") {
+    mark_for_compilation_flags->xla_auto_jit_flag
+        .optimization_level_single_gpu = 1;
+    mark_for_compilation_flags->xla_auto_jit_flag.optimization_level_general =
+        1;
+    mark_for_compilation_flags->tf_xla_ops_to_cluster = "FUSIBLE";
+    return true;
+  }
+
   absl::string_view value_sv(value);
   if (!absl::ConsumePrefix(&value_sv, "single-gpu(") ||
       !absl::ConsumeSuffix(&value_sv, ")") ||
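
Note (illustration, not part of the patch): the new "fusible" value is a shortcut that turns auto-jit on at level 1 for both the single-GPU and general cases and restricts clustering to the FUSIBLE op set. Below is a minimal C++ sketch of how a host program could opt into this mode through the TF_XLA_FLAGS environment variable; the flag value comes from this patch, while the surrounding program and the use of setenv() are assumptions for the example.

// Sketch only: requesting the new "fusible" auto-jit mode from a host program.
// TF_XLA_FLAGS must be set before TensorFlow first parses its XLA flags.
// setenv() is POSIX; the program around it is assumed, not TensorFlow code.
#include <cstdlib>

int main() {
  setenv("TF_XLA_FLAGS", "--tf_xla_auto_jit=fusible", /*overwrite=*/1);
  // ... build and run the TensorFlow graph/session here ...
  return 0;
}
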
@@ -65,7 +74,9 @@ void AppendMarkForCompilationPassFlagsInternal(std::vector<Flag>* flag_list) {
       Flag("tf_xla_auto_jit", SetterForXlaAutoJitFlag, "0",
            "Control compilation of operators into XLA computations on CPU and "
            "GPU devices. 0 = use ConfigProto setting; -1 = off; 1 = on for "
-           "things very likely to be improved; 2 = on for everything. "
+           "things very likely to be improved; 2 = on for everything; "
+           "(experimental) fusible = only for TensorFlow operations that XLA "
+           "knows how to fuse. "
            "If set to single-gpu(<N>) then this resolves to <N> for single-GPU "
            "graphs (graphs that have at least one node placed on a GPU and no "
            "more than one GPU is in use through the entire graph) and 0 "
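
Note (illustration, not part of the patch): the help text above refers to the single-gpu(<N>) form, which the setter in the first hunk parses with absl string helpers. A self-contained sketch of that parsing pattern follows; ParseSingleGpuLevel is a hypothetical helper name used only for this example, not the function from the patch.

// Sketch only: parsing a "single-gpu(<N>)" value with the same absl helpers
// used in SetterForXlaAutoJitFlag. ParseSingleGpuLevel is hypothetical.
#include <cstdint>
#include <iostream>

#include "absl/strings/numbers.h"
#include "absl/strings/string_view.h"
#include "absl/strings/strip.h"

bool ParseSingleGpuLevel(absl::string_view value, int32_t* level) {
  // Peel off the "single-gpu(" prefix and ")" suffix, then read the integer.
  return absl::ConsumePrefix(&value, "single-gpu(") &&
         absl::ConsumeSuffix(&value, ")") && absl::SimpleAtoi(value, level);
}

int main() {
  int32_t level = 0;
  if (ParseSingleGpuLevel("single-gpu(2)", &level)) {
    std::cout << "single-GPU optimization level: " << level << "\n";  // 2
  }
  return 0;
}
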
@@ -78,6 +89,23 @@ void AppendMarkForCompilationPassFlagsInternal(std::vector<Flag>* flag_list) {
       Flag("tf_xla_max_cluster_size",
            &mark_for_compilation_flags->tf_xla_max_cluster_size,
            "Maximum number of operators in an XLA compilation."),
+      Flag(
+          "tf_xla_ops_to_cluster",
+          &mark_for_compilation_flags->tf_xla_ops_to_cluster,
+          "(experimental) "
+          "Limit the operations clustered by XLA to these operations. "
+          "If multiple, separate them with commas. Shortcuts: "
+          " PW: All point-wise operations."
+          " RED: All reduction operations."
+          " MISC: Mixed operations."
+          " PWRED: TF operations that get converted to PW+RED operations in XLA."
+          " REDUCEWINDOW: TF operations like MaxPool/AvgPool that get "
+          "converted to ReduceWindow in XLA."
+          " REDUCEWINDOWPW: Operations that get converted to ReduceWindow + PW "
+          "(LRN, LRNGrad)."
+          " BN: TF FusedBatchNorm* operations."
+          " FUSIBLE: All TF operations that XLA can fuse (all of the above). "
+          "You can also put any TF operation name, e.g. 'FUSIBLE,MatMul'."),
       Flag("tf_xla_clustering_debug",
            &mark_for_compilation_flags->tf_xla_clustering_debug,
            "Dump graphs during XLA compilation."),