Skip to content

Commit 8e3e7fe

Browse files
committed
Add cooldown_interval_ms and internal measurements for profiling
1 parent 6247bf4 commit 8e3e7fe

File tree

17 files changed

+1030
-734
lines changed

17 files changed

+1030
-734
lines changed

CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -392,7 +392,6 @@ endif()
392392
if(USE_PROFILER)
393393
message(STATUS "Build with profiler...")
394394

395-
add_definitions(-DUSE_PROFILER=1)
396395
tvm_file_glob(GLOB RUNTIME_GRAPH_EXECUTOR_DEBUG_SRCS src/runtime/graph_executor/debug/*.cc)
397396
list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_EXECUTOR_DEBUG_SRCS})
398397
set_source_files_properties(${RUNTIME_GRAPH_EXECUTOR_SRCS}

include/tvm/runtime/profiling.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -543,11 +543,12 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i
543543
* minimum duration requirement of one `repeat`.
544544
* i.e., When the run time of one `repeat` falls below this time,
545545
* the `number` parameter will be automatically increased.
546+
* \param cooldown_interval_ms The cool down interval between two measurements in milliseconds.
546547
* \param f_preproc The function to be executed before we execute the time evaluator.
547548
* \return f_timer A timer function.
548549
*/
549550
PackedFunc WrapTimeEvaluator(PackedFunc f, Device dev, int number, int repeat, int min_repeat_ms,
550-
PackedFunc f_preproc = nullptr);
551+
int cooldown_interval_ms, PackedFunc f_preproc = nullptr);
551552

552553
} // namespace profiling
553554
} // namespace runtime

python/tvm/contrib/debugger/debug_executor.py

Lines changed: 92 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -222,13 +222,18 @@ def _run_per_layer(self):
222222
output_tensors.append(self._get_node_output(i, j))
223223
self.debug_datum.update_output_tensors(output_tensors)
224224

225-
def _run_debug(self):
225+
def _run_debug(self, number, repeat, min_repeat_ms, cooldown_interval_ms):
226226
"""Execute the node specified with index will be executed.
227227
Each debug output will be copied to the buffer
228228
Time consumed for each execution will be set as debug output.
229229
"""
230230
# Get timing.
231-
self.debug_datum._time_list = [[float(t)] for t in self.run_individual(10, 1, 1)]
231+
self.debug_datum._time_list = self.run_individual(
232+
number=number,
233+
repeat=repeat,
234+
min_repeat_ms=min_repeat_ms,
235+
cooldown_interval_ms=cooldown_interval_ms,
236+
)
232237

233238
# Get outputs.
234239
self._run_per_layer()
@@ -259,31 +264,98 @@ def debug_get_output(self, node, out=None):
259264

260265
self._debug_get_output(node_index, out)
261266

262-
def run(self, **input_dict):
267+
# pylint: disable=arguments-differ
268+
def run(self, number=10, repeat=1, min_repeat_ms=1, cooldown_interval_ms=0, **input_dict):
263269
"""Run forward execution of the graph with debug
264270
265271
Parameters
266272
----------
273+
number: int, optional
274+
The number of times to run this function for taking average.
275+
We call these runs one `repeat` of measurement.
276+
277+
repeat: int, optional
278+
The number of times to repeat the measurement.
279+
In total, the function will be invoked (1 + number x repeat) times,
280+
where the first one is warm up and will be discarded.
281+
The returned result contains `repeat` costs,
282+
each of which is an average of `number` costs.
283+
284+
min_repeat_ms: int, optional
285+
The minimum duration of one `repeat` in milliseconds.
286+
By default, one `repeat` contains `number` runs. If this parameter is set,
287+
the parameters `number` will be dynamically adjusted to meet the
288+
minimum duration requirement of one `repeat`.
289+
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
290+
will be automatically increased.
291+
292+
cooldown_interval_ms: int, optional
293+
The cool down interval between two measurements in milliseconds.
294+
267295
input_dict : dict of str to NDArray
268296
List of input values to be feed to
269297
"""
270298
if input_dict:
271299
self.set_input(**input_dict)
272300

273301
# Step 1. Execute the graph
274-
self._run_debug()
302+
self._run_debug(
303+
number=number,
304+
repeat=repeat,
305+
min_repeat_ms=min_repeat_ms,
306+
cooldown_interval_ms=cooldown_interval_ms,
307+
)
275308
# Step 2. Dump the output tensors to the dump folder
276309
self.debug_datum.dump_output_tensor()
277310
# Step 3. Dump the Chrome trace to the dump folder
278311
self.debug_datum.dump_chrome_trace()
279312
# Step 4. Display the collected information
280313
self.debug_datum.display_debug_result()
281314

282-
def run_individual(self, number, repeat=1, min_repeat_ms=0):
283-
ret = self._run_individual(number, repeat, min_repeat_ms)
284-
return ret.strip(",").split(",") if ret else []
315+
def run_individual(self, number, repeat=1, min_repeat_ms=0, cooldown_interval_ms=0):
316+
"""Run each operation in the graph and get the time per op for all ops.
317+
318+
number: int
319+
The number of times to run this function for taking average.
320+
We call these runs one `repeat` of measurement.
321+
322+
repeat: int, optional
323+
The number of times to repeat the measurement.
324+
In total, the function will be invoked (1 + number x repeat) times,
325+
where the first one is warm up and will be discarded.
326+
The returned result contains `repeat` costs,
327+
each of which is an average of `number` costs.
285328
286-
def run_individual_node(self, index, number=10, repeat=1, min_repeat_ms=0):
329+
min_repeat_ms: int, optional
330+
The minimum duration of one `repeat` in milliseconds.
331+
By default, one `repeat` contains `number` runs. If this parameter is set,
332+
the parameters `number` will be dynamically adjusted to meet the
333+
minimum duration requirement of one `repeat`.
334+
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
335+
will be automatically increased.
336+
337+
cooldown_interval_ms: int, optional
338+
The cool down interval between two measurements in milliseconds.
339+
340+
Returns
341+
-------
342+
A 3-dimensional array where the dimensions are: the index of the operation,
343+
the repeat and the number of the measurement.
344+
"""
345+
ret = self._run_individual(number, repeat, min_repeat_ms, cooldown_interval_ms)
346+
measurements = []
347+
for node_data in ret.strip(":").split(":"):
348+
measurements.append([])
349+
for repeat_data in node_data.strip(";").split(";"):
350+
measurements[-1].append([])
351+
for number_data in repeat_data.strip(",").split(","):
352+
if number_data:
353+
measurements[-1][-1].append(float(number_data))
354+
return measurements
355+
356+
def run_individual_node(
357+
self, index, number=10, repeat=1, min_repeat_ms=0, cooldown_interval_ms=0
358+
):
287359
"""Benchmark a single node in the serialized graph.
288360
289361
This does not do any data transfers and uses arrays already on the device.
@@ -304,27 +376,30 @@ def run_individual_node(self, index, number=10, repeat=1, min_repeat_ms=0):
304376
The returned result contains `repeat` costs,
305377
each of which is an average of `number` costs.
306378
307-
min_repeat_ms: int, optional
379+
min_repeat_ms : int, optional
308380
The minimum duration of one `repeat` in milliseconds.
309381
By default, one `repeat` contains `number` runs. If this parameter is set,
310382
the parameters `number` will be dynamically adjusted to meet the
311383
minimum duration requirement of one `repeat`.
312384
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
313385
will be automatically increased.
314386
387+
cooldown_interval_ms : int, optional
388+
The cool down interval between two measurements in milliseconds.
389+
315390
Returns
316391
-------
317392
A module BenchmarkResult
318393
"""
319394
# Results are returned as serialized strings which we deserialize
320-
ret = self._run_individual_node(index, number, repeat, min_repeat_ms)
321-
answer = []
322-
for value in ret.split(","):
323-
if value.strip() == "":
324-
continue
325-
answer.append(float(value))
326-
327-
return BenchmarkResult(answer)
395+
ret = self._run_individual_node(index, number, repeat, min_repeat_ms, cooldown_interval_ms)
396+
measurements = []
397+
for repeat_data in ret.replace(" ", "").strip(";").split(";"):
398+
measurements.append([])
399+
for number_data in repeat_data.strip(",").split(","):
400+
if number_data:
401+
measurements[-1].append(float(number_data))
402+
return BenchmarkResult(measurements)
328403

329404
def profile(self, collectors=None, **input_dict):
330405
"""Run forward execution of the graph and collect overall and per-op

python/tvm/contrib/debugger/debug_result.py

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -114,12 +114,10 @@ def get_graph_node_dtypes(self):
114114
def get_output_tensors(self):
115115
"""Get the output tensors of each operation in numpy format"""
116116
eid = 0
117-
order = 0
118117
output_tensors = {}
119-
for i, (node, time) in enumerate(zip(self._nodes_list, self._time_list)):
118+
for i, node in enumerate(self._nodes_list):
120119
num_outputs = self.get_graph_node_output_num(node)
121120
for j in range(num_outputs):
122-
order += time[0]
123121

124122
# the node name is not unique, so we need a consistent
125123
# indexing based on the list ordering in the nodes
@@ -157,7 +155,12 @@ def s_to_us(t):
157155
return t * 10**6
158156

159157
starting_times = np.zeros(len(self._time_list) + 1)
160-
starting_times[1:] = np.cumsum([times[0] for times in self._time_list])
158+
starting_times[1:] = np.cumsum(
159+
[
160+
np.mean([np.mean(repeat_data) for repeat_data in node_data])
161+
for node_data in self._time_list
162+
]
163+
)
161164

162165
def node_to_events(node, times, starting_time):
163166
return [
@@ -170,7 +173,9 @@ def node_to_events(node, times, starting_time):
170173
),
171174
ChromeTraceEvent(
172175
# Use start + duration instead of end to ensure precise timings.
173-
ts=s_to_us(times[0] + starting_time),
176+
ts=s_to_us(
177+
np.mean([np.mean(repeat_data) for repeat_data in times]) + starting_time
178+
),
174179
tid=1,
175180
pid=1,
176181
ph="E",
@@ -205,12 +210,18 @@ def _dump_graph_json(self, graph):
205210

206211
def get_debug_result(self, sort_by_time=True):
207212
"""Return the debugger result"""
208-
header = ["Node Name", "Ops", "Time(us)", "Time(%)", "Shape", "Inputs", "Outputs"]
209-
lines = ["---------", "---", "--------", "-------", "-----", "------", "-------"]
213+
header = ["Node Name", "Ops", "Time(us)", "Time(%)", "Shape", "Inputs", "Outputs", "Times"]
214+
lines = ["---------", "---", "--------", "-------", "-----", "------", "-------", "-------"]
210215
eid = 0
211216
data = []
212-
total_time = sum(time[0] for time in self._time_list)
217+
total_time = sum(
218+
[
219+
np.mean([np.mean(repeat_data) for repeat_data in node_data])
220+
for node_data in self._time_list
221+
]
222+
)
213223
for node, time in zip(self._nodes_list, self._time_list):
224+
time_mean = np.mean([np.mean(repeat_data) for repeat_data in time])
214225
num_outputs = self.get_graph_node_output_num(node)
215226
for j in range(num_outputs):
216227
op = node["op"]
@@ -219,11 +230,18 @@ def get_debug_result(self, sort_by_time=True):
219230
continue
220231
name = node["name"]
221232
shape = str(self._output_tensor_list[eid].shape)
222-
time_us = round(time[0] * 1e6, 3)
223-
time_percent = round(((time[0] / total_time) * 100), 3)
233+
time_us = round(time_mean * 1e6, 3)
234+
235+
times = str(
236+
[
237+
[round(number_data * 1e3, 3) for number_data in repeat_data]
238+
for repeat_data in time
239+
]
240+
)
241+
time_percent = round(((time_mean / total_time) * 100), 3)
224242
inputs = str(node["attrs"]["num_inputs"])
225243
outputs = str(node["attrs"]["num_outputs"])
226-
node_data = [name, op, time_us, time_percent, shape, inputs, outputs]
244+
node_data = [name, op, time_us, time_percent, shape, inputs, outputs, times]
227245
data.append(node_data)
228246
eid += 1
229247

python/tvm/contrib/graph_executor.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,7 @@ def benchmark(
356356
number=5,
357357
min_repeat_ms=None,
358358
end_to_end=False,
359+
cooldown_interval_ms=0,
359360
**kwargs,
360361
):
361362
"""Calculate runtime of a function by repeatedly calling it.
@@ -400,6 +401,9 @@ def benchmark(
400401
milliseconds. This can be used to ensure that the function is run enough to get an
401402
accurate measurement.
402403
404+
cooldown_interval_ms : Optional[float]
405+
The cool down interval between two measurements in milliseconds.
406+
403407
end_to_end : bool
404408
If set, include time to transfer input tensors to the device and time to transfer
405409
returned tensors in the total runtime. This will give accurate timings for end to end
@@ -432,5 +436,10 @@ def benchmark(
432436
if kwargs:
433437
self.set_input(**kwargs)
434438
return self.module.time_evaluator(
435-
func_name, device, repeat=repeat, number=number, min_repeat_ms=min_repeat_ms
439+
func_name,
440+
device,
441+
repeat=repeat,
442+
number=number,
443+
min_repeat_ms=min_repeat_ms,
444+
cooldown_interval_ms=cooldown_interval_ms,
436445
)()

python/tvm/meta_schedule/testing/tune_relay_auto_scheduler.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,10 @@ def f_per_layer(rt_mod, dev, input_data):
211211
graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000)
212212
print("|graph_nodes| = ", len(graph_nodes))
213213
print("|graph_time| = ", len(graph_time))
214-
graph_nodes_time = {k: float(v) for k, v in zip(graph_nodes, graph_time)}
214+
graph_nodes_time = {
215+
k: float(np.mean([np.mean(repeat_data) for repeat_data in v]))
216+
for k, v in zip(graph_nodes, graph_time)
217+
}
215218
for k, v in graph_nodes_time.items():
216219
print(f"{k} : {v:.3f}")
217220

python/tvm/meta_schedule/testing/tune_relay_meta_schedule.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,10 @@ def f_per_layer(rt_mod, dev, input_data):
174174
graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000)
175175
print("|graph_nodes| = ", len(graph_nodes))
176176
print("|graph_time| = ", len(graph_time))
177-
graph_nodes_time = {k: float(v) for k, v in zip(graph_nodes, graph_time)}
177+
graph_nodes_time = {
178+
k: float(np.mean([np.mean(repeat_data) for repeat_data in v]))
179+
for k, v in zip(graph_nodes, graph_time)
180+
}
178181
for k, v in graph_nodes_time.items():
179182
print(f"{k} : {v:.3f}")
180183

0 commit comments

Comments
 (0)