[ROOFLINE] Calculate roofline from existing TIR PrimFunc (#11238)

Tristan Konolige · web-flow · commit d2a7f93bebca · 2022-05-09T14:45:09.000-07:00
Refactor roofline_analysis to use a pass instrument to save TIR code
from compilation for feature extraction. This should support different
compilation pipelines and avoid compiling the module twice.
diff --git a/python/tvm/utils/roofline.py b/python/tvm/utils/roofline.py
@@ -22,6 +22,8 @@
 from ..target import Target
 from ..runtime import profiler_vm, profiling, Device, num_threads
 from ..script import tir as T
+from ..ir.instrument import pass_instrument
+from ..ir.expr import GlobalVar
 
 
 def _create_args(mod: IRModule, dev: Device, func_name: str = "main"):
@@ -36,16 +38,6 @@ def _create_args(mod: IRModule, dev: Device, func_name: str = "main"):
     return args
 
 
-def _estimated_features(mod: IRModule, params: Dict[str, nd.NDArray], target: Target):
-    comp = relay.vm.VMCompiler()
-    mod, params = comp.optimize(mod, params=params, target=target)
-    return {
-        prim.attrs["hash"]: (name, auto_scheduler.feature.named_features_from_primfunc(prim))
-        for name, prim in mod.functions.items()
-        if isinstance(prim, tir.PrimFunc)
-    }
-
-
 def _detect_vec_width_registers(
     target: Target, vec_width: Optional[int], num_vector_registers: Optional[int]
 ):
@@ -226,60 +218,98 @@ def estimate_peak_bandwidth(target: Target, dev: Device, vec_width: Optional[int
     return a.numpy().size * 4 / times.min  # 4 bytes per float32
 
 
-def roofline_analysis(
-    mod: IRModule, params: Dict[str, nd.NDArray], target: Union[str, Target], dev: Device
+@pass_instrument
+class SaveLoweredTIR:
+    """Save TIR functions from right before final lowering. Right now this
+    means right before tir.MakePackedAPI."""
+
+    def __init__(self):
+        self.functions = {}
+        self.done = False
+
+    def run_after_pass(self, mod, info):
+        if not self.done:
+            if info.name == "tir.MakePackedAPI":
+                self.done = True
+            else:
+                for v, func in mod.functions.items():
+                    self.functions[v] = func
+
+
+def roofline_from_existing(
+    report: profiling.Report,
+    tir_functions: Dict[GlobalVar, tir.PrimFunc],
+    target: Target,
+    dev: Device,
 ) -> profiling.Report:
-    """
-    Create a profiling report that contains roofline and other estimated
-    statistics from running a module on the VM.
+    """Add roofline and other estimated statistics to an existing profiling report.
 
-    These statistics are calculated by analyzing the lowered TIR of each
-    operator, so they are estimates of the true values. The statistics are:
-      - Bound: Is the operator memory or compute bound. This is computed by
-        assuming that the operator could perfectly cache all loads -- each byte
-        of memory is only loaded once.
-      - Percent of Theoretical Optimal: What percent of theoretical optimal for
-        the bound. i.e. percent of peak memory bandwidth if memory bound,
-        percent of peak FLOP/s if compute bound.
-      - Loaded Bytes: estimation of the number of bytes loaded from main memory.
-      - Estimated Flops: estimated number of floating point operations.
-      - Arithmetic Intensity: ratio of FLOPs per byte of data.
-      - FLOP/s: floating point operations per second.
-      - Bandwidth: Number of bytes loaded per second.
+    :py:func:`roofline_analysis` should always be used instead of this function
+    unless you need a custom compilation pipeline.
 
-    Parameters
-    ----------
-    mod : IRModule
-      Uncompiled input module>
+    Calculating roofline statistics requires features extracted from the TIR
+    functions in addition to per-operator runtime information (`report`) of the
+    same TIR functions. The features and TIR functions are not included with the
+    compiled library used to generate the per-operator runtime. It is essential
+    that the per-operator information comes from the exact same compilation
+    pipeline as the TIR functions.
 
-    params : Dict[str, nd.NDArray]
 
-    target : Union[str, Target]
-      Target to run on.
+    Example
+    -------
+
+    .. code:: python
+
+        import tvm
+        import tvm.relay
+
+        mod, params = tvm.relay.testing.mlp.get_workload()
+
+        # it is recommended to use SaveLoweredTIR to get out the tir primfuncs
+        save_tir = tvm.utils.roofline.SaveLoweredTIR()
+        with tvm.transform.PassContext(opt_level=3, instruments=[save_tir]):
+            lib = relay.vm.compile(mod, params=params, target=target)
+
+        vmexec = profiler_vm.VirtualMachineProfiler(lib, dev)
+        report = vmexec.profile(*inputs)
+
+        roofline_report = roofline_from_existing(report, save_tir.functions, target, dev)
 
+
+    Parameters
+    ----------
+    report : Report
+        Existing profiling report from :py:meth:`VirtualMachineProfiler.profile`.
+    tir_functions : Dict[GlobalVar, PrimFunc]
+        TIR primfuncs from the module run to generate `report`. It is necessary
+        that these functions come before the `tir.MakePackedAPI` pass and are
+        compatible with auto_scheduler featurization.
+        :py:class:`SaveLoweredTIR` is the recommended way to collect these
+        functions.
+    target : Target
+        TVM target that `report` was generated with.
     dev : Device
-      Device to run on.
+        Device that `report` was generated with.
 
     Returns
     -------
-
-    report : profiling.Report
-      Profiling report which includes the estimated statistics.
+    profiling.Report
+        New profiling report that includes all information from `report`
+        along with additional roofline metrics. See
+        :py:func:`roofline_analysis` for more information on which metrics
+        are included.
     """
-    if isinstance(target, str):
-        target = Target(target)
     peak_bandwidth = estimate_peak_bandwidth(target, dev)
     peak_flops = estimate_peak_fma_flops(target, dev)
 
     ridge_point = peak_flops / peak_bandwidth
 
-    all_features = _estimated_features(mod, params, target)
-
-    lib = relay.vm.compile(mod, params=params, target=target)
-    vmexec = profiler_vm.VirtualMachineProfiler(lib, dev)
+    all_features = {
+        prim.attrs["hash"]: (name, auto_scheduler.feature.named_features_from_primfunc(prim))
+        for name, prim in tir_functions.items()
+        if isinstance(prim, tir.PrimFunc) and "hash" in prim.attrs.keys()
+    }
 
-    args = _create_args(mod, dev)
-    report = vmexec.profile(*args)
     new_calls = []
     for call in report.calls:
         if "Hash" in call.keys():
@@ -313,3 +343,71 @@ def roofline_analysis(
         else:
             new_calls.append(call)
     return profiling.Report(new_calls, report.device_metrics)
+
+
+def roofline_analysis(
+    mod: IRModule, params: Dict[str, nd.NDArray], target: Union[str, Target], dev: Device
+) -> profiling.Report:
+    """
+    Create a profiling report that contains roofline and other estimated
+    statistics from running a module on the VM.
+
+    The roofline model measures how close an operator gets to the best possible
+    memory bandwidth or FLOP/s depending on whether it is memory or compute
+    bound. This computation uses the runtime of the operator along with two
+    numbers extracted from the TIR code: bytes of memory touched and number of
+    floating point operations.
+
+    These statistics are calculated by analyzing the lowered TIR of each
+    operator, so they are estimates of the true values. The statistics are:
+      - Bound: Is the operator memory or compute bound. This is computed by
+        assuming that the operator could perfectly cache all loads -- each byte
+        of memory is only loaded once.
+      - Percent of Theoretical Optimal: What percent of theoretical optimal for
+        the bound. i.e. percent of peak memory bandwidth if memory bound,
+        percent of peak FLOP/s if compute bound.
+      - Loaded Bytes: estimation of the number of bytes loaded from main memory.
+      - Estimated Flops: estimated number of floating point operations.
+      - Arithmetic Intensity: ratio of FLOPs per byte of data.
+      - FLOP/s: floating point operations per second.
+      - Bandwidth: Number of bytes loaded per second.
+
+    Parameters
+    ----------
+    mod : IRModule
+      Uncompiled input module.
+
+    params : Dict[str, nd.NDArray]
+
+    target : Union[str, Target]
+      Target to run on.
+
+    dev : Device
+      Device to run on.
+
+    Returns
+    -------
+
+    report : profiling.Report
+      Profiling report which includes the estimated statistics.
+    """
+    if isinstance(target, str):
+        target = Target(target)
+
+    save_tir = SaveLoweredTIR()
+    # copy existing context but add our instrument
+    pass_ctx = transform.PassContext.current()
+    with transform.PassContext(
+        opt_level=pass_ctx.opt_level,
+        required_pass=pass_ctx.required_pass,
+        disabled_pass=pass_ctx.disabled_pass,
+        instruments=list(pass_ctx.instruments) + [save_tir],
+        config=pass_ctx.config,
+    ):
+        lib = relay.vm.compile(mod, params=params, target=target)
+    vmexec = profiler_vm.VirtualMachineProfiler(lib, dev)
+
+    args = _create_args(mod, dev)
+    report = vmexec.profile(*args)
+
+    return roofline_from_existing(report, save_tir.functions, target, dev)
diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc
@@ -740,7 +740,7 @@ class PerStoreFeatureExtractor : public StmtExprVisitor {
     // TODO(tkonolige): add arithmetic counts from this statement to counts of inner stores.
     ana_.Bind(node->var, node->value);
     ICHECK(variable_definition_stack_.size() > 0)
-        << "Variable definition out size of a for loop is not handled by feature extraction";
+        << "Variable definition outside of a for loop is not handled by feature extraction";
     variable_definition_stack_.back().push_back(std::make_tuple(node->var, node->value));
     StmtExprVisitor::VisitStmt_(node);
   }

Original file line number	Diff line number	Diff line change
`@@ -740,7 +740,7 @@ class PerStoreFeatureExtractor : public StmtExprVisitor {`
`740`	`740`	`// TODO(tkonolige): add arithmetic counts from this statement to counts of inner stores.`
`741`	`741`	`ana_.Bind(node->var, node->value);`
`742`	`742`	`ICHECK(variable_definition_stack_.size() > 0)`
`743`		`- << "Variable definition out size of a for loop is not handled by feature extraction";`
	`743`	`+ << "Variable definition outside of a for loop is not handled by feature extraction";`
`744`	`744`	`variable_definition_stack_.back().push_back(std::make_tuple(node->var, node->value));`
`745`	`745`	`StmtExprVisitor::VisitStmt_(node);`
`746`	`746`	`}`