Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,6 @@ endif()
if(USE_PROFILER)
message(STATUS "Build with profiler...")

add_definitions(-DUSE_PROFILER=1)
tvm_file_glob(GLOB RUNTIME_GRAPH_EXECUTOR_DEBUG_SRCS src/runtime/graph_executor/debug/*.cc)
list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_EXECUTOR_DEBUG_SRCS})
set_source_files_properties(${RUNTIME_GRAPH_EXECUTOR_SRCS}
Expand Down
27 changes: 26 additions & 1 deletion include/tvm/runtime/profiling.h
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,25 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i

/*!
* \brief Wrap a timer function to measure the time cost of a given packed function.
*
* Approximate implementation:
* \code{.py}
 * f()  # warmup
 * for i in range(repeat):
* f_preproc()
* while True:
* start = time()
* for j in range(number):
* f()
* duration_ms = time() - start
* if duration_ms >= min_repeat_ms:
* break
* else:
 * number = (min_repeat_ms / (duration_ms / number)) + 1
* if cooldown_interval_ms and i % repeats_to_cooldown == 0:
* sleep(cooldown_interval_ms)
* \endcode
*
* \param f The function argument.
* \param dev The device.
* \param number The number of times to run this function for taking average.
Expand All @@ -554,10 +573,16 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i
* minimum duration requirement of one `repeat`.
* i.e., When the run time of one `repeat` falls below this time,
* the `number` parameter will be automatically increased.
* \param f_preproc The function to be executed before we excetute time evaluator.
* \param cooldown_interval_ms The cooldown interval in milliseconds between the number of repeats
* defined by `repeats_to_cooldown`.
* \param repeats_to_cooldown The number of repeats before the
* cooldown is activated.
Comment thread
Icemist marked this conversation as resolved.
* \param f_preproc The function to be executed before we execute time
* evaluator.
* \return f_timer A timer function.
*/
PackedFunc WrapTimeEvaluator(PackedFunc f, Device dev, int number, int repeat, int min_repeat_ms,
int cooldown_interval_ms, int repeats_to_cooldown,
PackedFunc f_preproc = nullptr);

} // namespace profiling
Expand Down
2 changes: 1 addition & 1 deletion python/tvm/auto_scheduler/testing/tune_relay.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ def f_per_layer(rt_mod, dev, input_data):
graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000)
print("|graph_nodes| = ", len(graph_nodes))
print("|graph_time| = ", len(graph_time))
graph_nodes_time = {k: float(v) for k, v in zip(graph_nodes, graph_time)}
graph_nodes_time = {k: float(np.mean(v)) for k, v in zip(graph_nodes, graph_time)}
for k, v in graph_nodes_time.items():
print(f"{k} : {v:.3f}")

Expand Down
142 changes: 125 additions & 17 deletions python/tvm/contrib/debugger/debug_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import logging
import os
import shutil
import struct
import tempfile

import tvm._ffi
Expand Down Expand Up @@ -222,13 +223,19 @@ def _run_per_layer(self):
output_tensors.append(self._get_node_output(i, j))
self.debug_datum.update_output_tensors(output_tensors)

def _run_debug(self):
def _run_debug(self, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown):
"""Execute the node specified with index will be executed.
Each debug output will be copied to the buffer
Time consumed for each execution will be set as debug output.
"""
# Get timing.
self.debug_datum._time_list = [[float(t)] for t in self.run_individual(10, 1, 1)]
self.debug_datum._time_list = self.run_individual(
number=number,
repeat=repeat,
min_repeat_ms=min_repeat_ms,
cooldown_interval_ms=cooldown_interval_ms,
repeats_to_cooldown=repeats_to_cooldown,
)

# Get outputs.
self._run_per_layer()
Expand Down Expand Up @@ -259,31 +266,127 @@ def debug_get_output(self, node, out=None):

self._debug_get_output(node_index, out)

# pylint: disable=arguments-differ
def run(
    self,
    number=10,
    repeat=1,
    min_repeat_ms=1,
    cooldown_interval_ms=0,
    repeats_to_cooldown=1,
    **input_dict,
):
    """Run forward execution of the graph with debug instrumentation.

    Timing parameters are forwarded to the per-node measurement; every
    other keyword argument is treated as a named graph input.

    Parameters
    ----------
    number: int, optional
        The number of times to run this function for taking average.
        We call these runs as one `repeat` of measurement.

    repeat: int, optional
        The number of times to repeat the measurement.
        In total, the function will be invoked (1 + number x repeat) times,
        where the first one is warm up and will be discarded.
        The returned result contains `repeat` costs,
        each of which is an average of `number` costs.

    min_repeat_ms: int, optional
        The minimum duration of one `repeat` in milliseconds.
        By default, one `repeat` contains `number` runs. If this parameter
        is set, the parameter `number` will be dynamically adjusted to meet
        the minimum duration requirement of one `repeat`, i.e., when the
        run time of one `repeat` falls below this time, `number` is
        automatically increased.

    cooldown_interval_ms: int, optional
        The cooldown interval in milliseconds between the number of repeats
        defined by `repeats_to_cooldown`.

    repeats_to_cooldown: int, optional
        The number of repeats before the cooldown is activatedated — TODO
        confirm exact trigger point against the runtime implementation.

    input_dict : dict of str to NDArray
        Named input values to feed to the graph before running.
    """
    if input_dict:
        self.set_input(**input_dict)

    timing_kwargs = {
        "number": number,
        "repeat": repeat,
        "min_repeat_ms": min_repeat_ms,
        "cooldown_interval_ms": cooldown_interval_ms,
        "repeats_to_cooldown": repeats_to_cooldown,
    }
    # Step 1. Execute the graph, collecting per-node timings and outputs.
    self._run_debug(**timing_kwargs)
    # Step 2. Dump the output tensors to the dump folder.
    self.debug_datum.dump_output_tensor()
    # Step 3. Dump the Chrome trace to the dump folder.
    self.debug_datum.dump_chrome_trace()
    # Step 4. Display the collected information.
    self.debug_datum.display_debug_result()

def run_individual(
    self, number, repeat=1, min_repeat_ms=0, cooldown_interval_ms=0, repeats_to_cooldown=1
):
    """Run each operation in the graph and get the time per op for all ops.

    number: int
        The number of times to run this function for taking average.
        We call these runs as one `repeat` of measurement.

    repeat: int, optional
        The number of times to repeat the measurement.
        In total, the function will be invoked (1 + number x repeat) times,
        where the first one is warm up and will be discarded.
        The returned result contains `repeat` costs,
        each of which is an average of `number` costs.

    min_repeat_ms: int, optional
        The minimum duration of one `repeat` in milliseconds.
        By default, one `repeat` contains `number` runs. If this parameter
        is set, the parameter `number` will be dynamically adjusted to meet
        the minimum duration requirement of one `repeat`, i.e., when the
        run time of one `repeat` falls below this time, `number` is
        automatically increased.

    cooldown_interval_ms: int, optional
        The cooldown interval in milliseconds between the number of repeats
        defined by `repeats_to_cooldown`.

    repeats_to_cooldown: int, optional
        The number of repeats before the cooldown is activated.

    Returns
    -------
    A 2-dimensional list where the dimensions are: the index of the
    operation and the repeat of the measurement (seconds per entry).
    """
    raw = self._run_individual(
        number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown
    )
    # The runtime packs the result as a native int64 node count followed by
    # `repeat` native doubles per node.
    header_fmt = "@q"
    base = struct.calcsize(header_fmt)
    (node_count,) = struct.unpack_from(header_fmt, raw, 0)
    row_fmt = "@" + "d" * repeat
    stride = struct.calcsize(row_fmt)
    return [
        list(struct.unpack_from(row_fmt, raw, base + idx * stride))
        for idx in range(node_count)
    ]

def run_individual_node(
self,
index,
number=10,
repeat=1,
min_repeat_ms=0,
cooldown_interval_ms=0,
repeats_to_cooldown=1,
):
"""Benchmark a single node in the serialized graph.

This does not do any data transfers and uses arrays already on the device.
Expand All @@ -304,27 +407,32 @@ def run_individual_node(self, index, number=10, repeat=1, min_repeat_ms=0):
The returned result contains `repeat` costs,
each of which is an average of `number` costs.

min_repeat_ms: int, optional
min_repeat_ms : int, optional
The minimum duration of one `repeat` in milliseconds.
By default, one `repeat` contains `number` runs. If this parameter is set,
the parameters `number` will be dynamically adjusted to meet the
minimum duration requirement of one `repeat`.
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
will be automatically increased.

cooldown_interval_ms: int, optional
The cooldown interval in milliseconds between the number of repeats defined by
`repeats_to_cooldown`.

repeats_to_cooldown: int, optional
The number of repeats before the cooldown is activated.

Returns
-------
A module BenchmarkResult
"""
# Results are returned as serialized strings which we deserialize
ret = self._run_individual_node(index, number, repeat, min_repeat_ms)
answer = []
for value in ret.split(","):
if value.strip() == "":
continue
answer.append(float(value))

return BenchmarkResult(answer)
res = self._run_individual_node(
index, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown
)
fmt = "@" + ("d" * repeat)
results = struct.unpack(fmt, res)
return BenchmarkResult(list(results))

def profile(self, collectors=None, **input_dict):
"""Run forward execution of the graph and collect overall and per-op
Expand Down
42 changes: 30 additions & 12 deletions python/tvm/contrib/debugger/debug_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,12 +114,10 @@ def get_graph_node_dtypes(self):
def get_output_tensors(self):
"""Get the output tensors of each operation in numpy format"""
eid = 0
order = 0
output_tensors = {}
for i, (node, time) in enumerate(zip(self._nodes_list, self._time_list)):
for i, node in enumerate(self._nodes_list):
num_outputs = self.get_graph_node_output_num(node)
for j in range(num_outputs):
order += time[0]

# the node name is not unique, so we need a consistent
# indexing based on the list ordering in the nodes
Expand Down Expand Up @@ -157,7 +155,7 @@ def s_to_us(t):
return t * 10**6

starting_times = np.zeros(len(self._time_list) + 1)
starting_times[1:] = np.cumsum([times[0] for times in self._time_list])
starting_times[1:] = np.cumsum([np.mean(times) for times in self._time_list])

def node_to_events(node, times, starting_time):
return [
Expand All @@ -170,7 +168,7 @@ def node_to_events(node, times, starting_time):
),
ChromeTraceEvent(
# Use start + duration instead of end to ensure precise timings.
ts=s_to_us(times[0] + starting_time),
ts=s_to_us(np.mean(times) + starting_time),
tid=1,
pid=1,
ph="E",
Expand Down Expand Up @@ -205,12 +203,31 @@ def _dump_graph_json(self, graph):

def get_debug_result(self, sort_by_time=True):
"""Return the debugger result"""
header = ["Node Name", "Ops", "Time(us)", "Time(%)", "Shape", "Inputs", "Outputs"]
lines = ["---------", "---", "--------", "-------", "-----", "------", "-------"]
header = [
"Node Name",
"Ops",
"Time(us)",
"Time(%)",
"Shape",
"Inputs",
"Outputs",
"Measurements(us)",
]
lines = [
"---------",
"---",
"--------",
"-------",
"-----",
"------",
"-------",
"----------------",
]
eid = 0
data = []
total_time = sum(time[0] for time in self._time_list)
total_time = sum([np.mean(time) for time in self._time_list])
for node, time in zip(self._nodes_list, self._time_list):
time_mean = np.mean(time)
num_outputs = self.get_graph_node_output_num(node)
for j in range(num_outputs):
op = node["op"]
Expand All @@ -219,11 +236,12 @@ def get_debug_result(self, sort_by_time=True):
continue
name = node["name"]
shape = str(self._output_tensor_list[eid].shape)
time_us = round(time[0] * 1e6, 3)
time_percent = round(((time[0] / total_time) * 100), 3)
time_us = round(time_mean * 1e6, 3)
time_percent = round(((time_mean / total_time) * 100), 3)
inputs = str(node["attrs"]["num_inputs"])
outputs = str(node["attrs"]["num_outputs"])
node_data = [name, op, time_us, time_percent, shape, inputs, outputs]
measurements = str([round(repeat_data * 1e6, 3) for repeat_data in time])
node_data = [name, op, time_us, time_percent, shape, inputs, outputs, measurements]
data.append(node_data)
eid += 1

Expand All @@ -232,7 +250,7 @@ def get_debug_result(self, sort_by_time=True):
data = sorted(data, key=lambda x: x[2], reverse=True)
# Insert a row for total time at the end.
rounded_total_time_us = round(total_time * 1e6, 3)
data.append(["Total_time", "-", rounded_total_time_us, "-", "-", "-", "-", "-"])
data.append(["Total_time", "-", rounded_total_time_us, "-", "-", "-", "-", "-", "-"])

fmt = ""
for i, _ in enumerate(header):
Expand Down
Loading