Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,6 @@ endif()
if(USE_PROFILER)
message(STATUS "Build with profiler...")

add_definitions(-DUSE_PROFILER=1)
tvm_file_glob(GLOB RUNTIME_GRAPH_EXECUTOR_DEBUG_SRCS src/runtime/graph_executor/debug/*.cc)
list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_EXECUTOR_DEBUG_SRCS})
set_source_files_properties(${RUNTIME_GRAPH_EXECUTOR_SRCS}
Expand Down
27 changes: 26 additions & 1 deletion include/tvm/runtime/profiling.h
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,25 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i

/*!
* \brief Wrap a timer function to measure the time cost of a given packed function.
*
* Approximate implementation:
* \code{.py}
 * f()  # warmup
 * for i in range(repeat):
* f_preproc()
* while True:
* start = time()
* for j in range(number):
* f()
* duration_ms = time() - start
* if duration_ms >= min_repeat_ms:
* break
* else:
 * number = (min_repeat_ms / (duration_ms / number)) + 1
* if cooldown_interval_ms and i % repeats_to_cooldown == 0:
* sleep(cooldown_interval_ms)
* \endcode
*
* \param f The function argument.
* \param dev The device.
* \param number The number of times to run this function for taking average.
Expand All @@ -554,10 +573,16 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i
* minimum duration requirement of one `repeat`.
* i.e., When the run time of one `repeat` falls below this time,
* the `number` parameter will be automatically increased.
* \param f_preproc The function to be executed before we excetute time evaluator.
* \param cooldown_interval_ms The cooldown interval in milliseconds between the number of repeats
* defined by `repeats_to_cooldown`.
* \param repeats_to_cooldown The number of repeats before the
* cooldown is activated.
Comment thread
Icemist marked this conversation as resolved.
* \param f_preproc The function to be executed before we execute time
* evaluator.
* \return f_timer A timer function.
*/
PackedFunc WrapTimeEvaluator(PackedFunc f, Device dev, int number, int repeat, int min_repeat_ms,
int cooldown_interval_ms, int repeats_to_cooldown,
PackedFunc f_preproc = nullptr);

} // namespace profiling
Expand Down
2 changes: 1 addition & 1 deletion python/tvm/auto_scheduler/testing/tune_relay.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ def f_per_layer(rt_mod, dev, input_data):
graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000)
print("|graph_nodes| = ", len(graph_nodes))
print("|graph_time| = ", len(graph_time))
graph_nodes_time = {k: float(v) for k, v in zip(graph_nodes, graph_time)}
graph_nodes_time = {k: float(np.mean(v)) for k, v in zip(graph_nodes, graph_time)}
for k, v in graph_nodes_time.items():
print(f"{k} : {v:.3f}")

Expand Down
142 changes: 125 additions & 17 deletions python/tvm/contrib/debugger/debug_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import logging
import os
import shutil
import struct
import tempfile

import tvm._ffi
Expand Down Expand Up @@ -222,13 +223,19 @@ def _run_per_layer(self):
output_tensors.append(self._get_node_output(i, j))
self.debug_datum.update_output_tensors(output_tensors)

def _run_debug(self):
def _run_debug(self, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown):
"""Execute the node specified with index will be executed.
Each debug output will be copied to the buffer
Time consumed for each execution will be set as debug output.
"""
# Get timing.
self.debug_datum._time_list = [[float(t)] for t in self.run_individual(10, 1, 1)]
self.debug_datum._time_list = self.run_individual(
number=number,
repeat=repeat,
min_repeat_ms=min_repeat_ms,
cooldown_interval_ms=cooldown_interval_ms,
repeats_to_cooldown=repeats_to_cooldown,
)

# Get outputs.
self._run_per_layer()
Expand Down Expand Up @@ -259,31 +266,127 @@ def debug_get_output(self, node, out=None):

self._debug_get_output(node_index, out)

# pylint: disable=arguments-differ
def run(
    self,
    number=10,
    repeat=1,
    min_repeat_ms=1,
    cooldown_interval_ms=0,
    repeats_to_cooldown=1,
    **input_dict,
):
    """Run forward execution of the graph with debug instrumentation.

    Timing parameters are forwarded to the per-node measurement; every
    other keyword argument is treated as a named graph input.

    Parameters
    ----------
    number: int, optional
        The number of times to run this function for taking average.
        We call these runs as one `repeat` of measurement.

    repeat: int, optional
        The number of times to repeat the measurement.
        In total, the function will be invoked (1 + number x repeat) times,
        where the first one is warm up and will be discarded.
        The returned result contains `repeat` costs,
        each of which is an average of `number` costs.

    min_repeat_ms: int, optional
        The minimum duration of one `repeat` in milliseconds.
        By default, one `repeat` contains `number` runs. If this parameter
        is set, the parameter `number` will be dynamically adjusted to meet
        the minimum duration requirement of one `repeat`, i.e., when the
        run time of one `repeat` falls below this time, `number` is
        automatically increased.

    cooldown_interval_ms: int, optional
        The cooldown interval in milliseconds between the number of repeats
        defined by `repeats_to_cooldown`.

    repeats_to_cooldown: int, optional
        The number of repeats before the cooldown is activatedated — TODO
        confirm exact trigger point against the runtime implementation.

    input_dict : dict of str to NDArray
        Named input values to feed to the graph before running.
    """
    if input_dict:
        self.set_input(**input_dict)

    timing_kwargs = {
        "number": number,
        "repeat": repeat,
        "min_repeat_ms": min_repeat_ms,
        "cooldown_interval_ms": cooldown_interval_ms,
        "repeats_to_cooldown": repeats_to_cooldown,
    }
    # Step 1. Execute the graph, collecting per-node timings and outputs.
    self._run_debug(**timing_kwargs)
    # Step 2. Dump the output tensors to the dump folder.
    self.debug_datum.dump_output_tensor()
    # Step 3. Dump the Chrome trace to the dump folder.
    self.debug_datum.dump_chrome_trace()
    # Step 4. Display the collected information.
    self.debug_datum.display_debug_result()

def run_individual(
    self, number, repeat=1, min_repeat_ms=0, cooldown_interval_ms=0, repeats_to_cooldown=1
):
    """Run each operation in the graph and get the time per op for all ops.

    number: int
        The number of times to run this function for taking average.
        We call these runs as one `repeat` of measurement.

    repeat: int, optional
        The number of times to repeat the measurement.
        In total, the function will be invoked (1 + number x repeat) times,
        where the first one is warm up and will be discarded.
        The returned result contains `repeat` costs,
        each of which is an average of `number` costs.

    min_repeat_ms: int, optional
        The minimum duration of one `repeat` in milliseconds.
        By default, one `repeat` contains `number` runs. If this parameter
        is set, the parameter `number` will be dynamically adjusted to meet
        the minimum duration requirement of one `repeat`, i.e., when the
        run time of one `repeat` falls below this time, `number` is
        automatically increased.

    cooldown_interval_ms: int, optional
        The cooldown interval in milliseconds between the number of repeats
        defined by `repeats_to_cooldown`.

    repeats_to_cooldown: int, optional
        The number of repeats before the cooldown is activated.

    Returns
    -------
    A 2-dimensional list where the dimensions are: the index of the
    operation and the repeat of the measurement (seconds per entry).
    """
    raw = self._run_individual(
        number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown
    )
    # The runtime packs the result as a native int64 node count followed by
    # `repeat` native doubles per node.
    header_fmt = "@q"
    base = struct.calcsize(header_fmt)
    (node_count,) = struct.unpack_from(header_fmt, raw, 0)
    row_fmt = "@" + "d" * repeat
    stride = struct.calcsize(row_fmt)
    return [
        list(struct.unpack_from(row_fmt, raw, base + idx * stride))
        for idx in range(node_count)
    ]

def run_individual_node(
self,
index,
number=10,
repeat=1,
min_repeat_ms=0,
cooldown_interval_ms=0,
repeats_to_cooldown=1,
):
"""Benchmark a single node in the serialized graph.

This does not do any data transfers and uses arrays already on the device.
Expand All @@ -304,27 +407,32 @@ def run_individual_node(self, index, number=10, repeat=1, min_repeat_ms=0):
The returned result contains `repeat` costs,
each of which is an average of `number` costs.

min_repeat_ms: int, optional
min_repeat_ms : int, optional
The minimum duration of one `repeat` in milliseconds.
By default, one `repeat` contains `number` runs. If this parameter is set,
the parameters `number` will be dynamically adjusted to meet the
minimum duration requirement of one `repeat`.
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
will be automatically increased.

cooldown_interval_ms: int, optional
The cooldown interval in milliseconds between the number of repeats defined by
`repeats_to_cooldown`.

repeats_to_cooldown: int, optional
The number of repeats before the cooldown is activated.

Returns
-------
A module BenchmarkResult
"""
# Results are returned as serialized strings which we deserialize
ret = self._run_individual_node(index, number, repeat, min_repeat_ms)
answer = []
for value in ret.split(","):
if value.strip() == "":
continue
answer.append(float(value))

return BenchmarkResult(answer)
res = self._run_individual_node(
index, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown
)
fmt = "@" + ("d" * repeat)
results = struct.unpack(fmt, res)
return BenchmarkResult(list(results))

def profile(self, collectors=None, **input_dict):
"""Run forward execution of the graph and collect overall and per-op
Expand Down
42 changes: 30 additions & 12 deletions python/tvm/contrib/debugger/debug_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,12 +114,10 @@ def get_graph_node_dtypes(self):
def get_output_tensors(self):
"""Get the output tensors of each operation in numpy format"""
eid = 0
order = 0
output_tensors = {}
for i, (node, time) in enumerate(zip(self._nodes_list, self._time_list)):
for i, node in enumerate(self._nodes_list):
num_outputs = self.get_graph_node_output_num(node)
for j in range(num_outputs):
order += time[0]

# the node name is not unique, so we need a consistent
# indexing based on the list ordering in the nodes
Expand Down Expand Up @@ -157,7 +155,7 @@ def s_to_us(t):
return t * 10**6

starting_times = np.zeros(len(self._time_list) + 1)
starting_times[1:] = np.cumsum([times[0] for times in self._time_list])
starting_times[1:] = np.cumsum([np.mean(times) for times in self._time_list])

def node_to_events(node, times, starting_time):
return [
Expand All @@ -170,7 +168,7 @@ def node_to_events(node, times, starting_time):
),
ChromeTraceEvent(
# Use start + duration instead of end to ensure precise timings.
ts=s_to_us(times[0] + starting_time),
ts=s_to_us(np.mean(times) + starting_time),
tid=1,
pid=1,
ph="E",
Expand Down Expand Up @@ -205,12 +203,31 @@ def _dump_graph_json(self, graph):

def get_debug_result(self, sort_by_time=True):
"""Return the debugger result"""
header = ["Node Name", "Ops", "Time(us)", "Time(%)", "Shape", "Inputs", "Outputs"]
lines = ["---------", "---", "--------", "-------", "-----", "------", "-------"]
header = [
"Node Name",
"Ops",
"Time(us)",
"Time(%)",
"Shape",
"Inputs",
"Outputs",
"Measurements(us)",
]
lines = [
"---------",
"---",
"--------",
"-------",
"-----",
"------",
"-------",
"----------------",
]
eid = 0
data = []
total_time = sum(time[0] for time in self._time_list)
total_time = sum([np.mean(time) for time in self._time_list])
for node, time in zip(self._nodes_list, self._time_list):
time_mean = np.mean(time)
num_outputs = self.get_graph_node_output_num(node)
for j in range(num_outputs):
op = node["op"]
Expand All @@ -219,11 +236,12 @@ def get_debug_result(self, sort_by_time=True):
continue
name = node["name"]
shape = str(self._output_tensor_list[eid].shape)
time_us = round(time[0] * 1e6, 3)
time_percent = round(((time[0] / total_time) * 100), 3)
time_us = round(time_mean * 1e6, 3)
time_percent = round(((time_mean / total_time) * 100), 3)
inputs = str(node["attrs"]["num_inputs"])
outputs = str(node["attrs"]["num_outputs"])
node_data = [name, op, time_us, time_percent, shape, inputs, outputs]
measurements = str([round(repeat_data * 1e6, 3) for repeat_data in time])
node_data = [name, op, time_us, time_percent, shape, inputs, outputs, measurements]
data.append(node_data)
eid += 1

Expand All @@ -232,7 +250,7 @@ def get_debug_result(self, sort_by_time=True):
data = sorted(data, key=lambda x: x[2], reverse=True)
# Insert a row for total time at the end.
rounded_total_time_us = round(total_time * 1e6, 3)
data.append(["Total_time", "-", rounded_total_time_us, "-", "-", "-", "-", "-"])
data.append(["Total_time", "-", rounded_total_time_us, "-", "-", "-", "-", "-", "-"])

fmt = ""
for i, _ in enumerate(header):
Expand Down
Loading