Skip to content

Commit 8e3e7fe

Browse files
committed
Add cooldown_interval_ms and internal measurements for profiling
1 parent 6247bf4 commit 8e3e7fe

File tree

17 files changed

+1030
-734
lines changed

17 files changed

+1030
-734
lines changed

CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -392,7 +392,6 @@ endif()
392392
if(USE_PROFILER)
393393
message(STATUS "Build with profiler...")
394394

395-
add_definitions(-DUSE_PROFILER=1)
396395
tvm_file_glob(GLOB RUNTIME_GRAPH_EXECUTOR_DEBUG_SRCS src/runtime/graph_executor/debug/*.cc)
397396
list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_EXECUTOR_DEBUG_SRCS})
398397
set_source_files_properties(${RUNTIME_GRAPH_EXECUTOR_SRCS}

include/tvm/runtime/profiling.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -543,11 +543,12 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i
543543
* minimum duration requirement of one `repeat`.
544544
* i.e., When the run time of one `repeat` falls below this time,
545545
* the `number` parameter will be automatically increased.
546+
* \param cooldown_interval_ms The cool down interval between two measurements in milliseconds.
546547
* \param f_preproc The function to be executed before we execute the time evaluator.
547548
* \return f_timer A timer function.
548549
*/
549550
PackedFunc WrapTimeEvaluator(PackedFunc f, Device dev, int number, int repeat, int min_repeat_ms,
550-
PackedFunc f_preproc = nullptr);
551+
int cooldown_interval_ms, PackedFunc f_preproc = nullptr);
551552

552553
} // namespace profiling
553554
} // namespace runtime

python/tvm/contrib/debugger/debug_executor.py

Lines changed: 92 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -222,13 +222,18 @@ def _run_per_layer(self):
222222
output_tensors.append(self._get_node_output(i, j))
223223
self.debug_datum.update_output_tensors(output_tensors)
224224

225-
def _run_debug(self):
225+
def _run_debug(self, number, repeat, min_repeat_ms, cooldown_interval_ms):
226226
"""Execute the node specified with index will be executed.
227227
Each debug output will be copied to the buffer
228228
Time consumed for each execution will be set as debug output.
229229
"""
230230
# Get timing.
231-
self.debug_datum._time_list = [[float(t)] for t in self.run_individual(10, 1, 1)]
231+
self.debug_datum._time_list = self.run_individual(
232+
number=number,
233+
repeat=repeat,
234+
min_repeat_ms=min_repeat_ms,
235+
cooldown_interval_ms=cooldown_interval_ms,
236+
)
232237

233238
# Get outputs.
234239
self._run_per_layer()
@@ -259,31 +264,98 @@ def debug_get_output(self, node, out=None):
259264

260265
self._debug_get_output(node_index, out)
261266

262-
def run(self, **input_dict):
267+
# pylint: disable=arguments-differ
268+
def run(self, number=10, repeat=1, min_repeat_ms=1, cooldown_interval_ms=0, **input_dict):
263269
"""Run forward execution of the graph with debug
264270
265271
Parameters
266272
----------
273+
number: int, optional
274+
The number of times to run this function for taking average.
275+
We call these runs one `repeat` of measurement.
276+
277+
repeat: int, optional
278+
The number of times to repeat the measurement.
279+
In total, the function will be invoked (1 + number x repeat) times,
280+
where the first one is warm up and will be discarded.
281+
The returned result contains `repeat` costs,
282+
each of which is an average of `number` costs.
283+
284+
min_repeat_ms: int, optional
285+
The minimum duration of one `repeat` in milliseconds.
286+
By default, one `repeat` contains `number` runs. If this parameter is set,
287+
the parameters `number` will be dynamically adjusted to meet the
288+
minimum duration requirement of one `repeat`.
289+
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
290+
will be automatically increased.
291+
292+
cooldown_interval_ms: int, optional
293+
The cool down interval between two measurements in milliseconds.
294+
267295
input_dict : dict of str to NDArray
268296
List of input values to be feed to
269297
"""
270298
if input_dict:
271299
self.set_input(**input_dict)
272300

273301
# Step 1. Execute the graph
274-
self._run_debug()
302+
self._run_debug(
303+
number=number,
304+
repeat=repeat,
305+
min_repeat_ms=min_repeat_ms,
306+
cooldown_interval_ms=cooldown_interval_ms,
307+
)
275308
# Step 2. Dump the output tensors to the dump folder
276309
self.debug_datum.dump_output_tensor()
277310
# Step 3. Dump the Chrome trace to the dump folder
278311
self.debug_datum.dump_chrome_trace()
279312
# Step 4. Display the collected information
280313
self.debug_datum.display_debug_result()
281314

282-
def run_individual(self, number, repeat=1, min_repeat_ms=0):
283-
ret = self._run_individual(number, repeat, min_repeat_ms)
284-
return ret.strip(",").split(",") if ret else []
315+
def run_individual(self, number, repeat=1, min_repeat_ms=0, cooldown_interval_ms=0):
316+
"""Run each operation in the graph and get the time per op for all ops.
317+
318+
number: int
319+
The number of times to run this function for taking average.
320+
We call these runs one `repeat` of measurement.
321+
322+
repeat: int, optional
323+
The number of times to repeat the measurement.
324+
In total, the function will be invoked (1 + number x repeat) times,
325+
where the first one is warm up and will be discarded.
326+
The returned result contains `repeat` costs,
327+
each of which is an average of `number` costs.
285328
286-
def run_individual_node(self, index, number=10, repeat=1, min_repeat_ms=0):
329+
min_repeat_ms: int, optional
330+
The minimum duration of one `repeat` in milliseconds.
331+
By default, one `repeat` contains `number` runs. If this parameter is set,
332+
the parameters `number` will be dynamically adjusted to meet the
333+
minimum duration requirement of one `repeat`.
334+
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
335+
will be automatically increased.
336+
337+
cooldown_interval_ms: int, optional
338+
The cool down interval between two measurements in milliseconds.
339+
340+
Returns
341+
-------
342+
A 3-dimensional array where the dimensions are: the index of the operation,
343+
the repeat and the number of the measurement.
344+
"""
345+
ret = self._run_individual(number, repeat, min_repeat_ms, cooldown_interval_ms)
346+
measurements = []
347+
for node_data in ret.strip(":").split(":"):
348+
measurements.append([])
349+
for repeat_data in node_data.strip(";").split(";"):
350+
measurements[-1].append([])
351+
for number_data in repeat_data.strip(",").split(","):
352+
if number_data:
353+
measurements[-1][-1].append(float(number_data))
354+
return measurements
355+
356+
def run_individual_node(
357+
self, index, number=10, repeat=1, min_repeat_ms=0, cooldown_interval_ms=0
358+
):
287359
"""Benchmark a single node in the serialized graph.
288360
289361
This does not do any data transfers and uses arrays already on the device.
@@ -304,27 +376,30 @@ def run_individual_node(self, index, number=10, repeat=1, min_repeat_ms=0):
304376
The returned result contains `repeat` costs,
305377
each of which is an average of `number` costs.
306378
307-
min_repeat_ms: int, optional
379+
min_repeat_ms : int, optional
308380
The minimum duration of one `repeat` in milliseconds.
309381
By default, one `repeat` contains `number` runs. If this parameter is set,
310382
the parameters `number` will be dynamically adjusted to meet the
311383
minimum duration requirement of one `repeat`.
312384
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
313385
will be automatically increased.
314386
387+
cooldown_interval_ms : int, optional
388+
The cool down interval between two measurements in milliseconds.
389+
315390
Returns
316391
-------
317392
A module BenchmarkResult
318393
"""
319394
# Results are returned as serialized strings which we deserialize
320-
ret = self._run_individual_node(index, number, repeat, min_repeat_ms)
321-
answer = []
322-
for value in ret.split(","):
323-
if value.strip() == "":
324-
continue
325-
answer.append(float(value))
326-
327-
return BenchmarkResult(answer)
395+
ret = self._run_individual_node(index, number, repeat, min_repeat_ms, cooldown_interval_ms)
396+
measurements = []
397+
for repeat_data in ret.replace(" ", "").strip(";").split(";"):
398+
measurements.append([])
399+
for number_data in repeat_data.strip(",").split(","):
400+
if number_data:
401+
measurements[-1].append(float(number_data))
402+
return BenchmarkResult(measurements)
328403

329404
def profile(self, collectors=None, **input_dict):
330405
"""Run forward execution of the graph and collect overall and per-op

python/tvm/contrib/debugger/debug_result.py

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -114,12 +114,10 @@ def get_graph_node_dtypes(self):
114114
def get_output_tensors(self):
115115
"""Get the output tensors of each operation in numpy format"""
116116
eid = 0
117-
order = 0
118117
output_tensors = {}
119-
for i, (node, time) in enumerate(zip(self._nodes_list, self._time_list)):
118+
for i, node in enumerate(self._nodes_list):
120119
num_outputs = self.get_graph_node_output_num(node)
121120
for j in range(num_outputs):
122-
order += time[0]
123121

124122
# the node name is not unique, so we need a consistent
125123
# indexing based on the list ordering in the nodes
@@ -157,7 +155,12 @@ def s_to_us(t):
157155
return t * 10**6
158156

159157
starting_times = np.zeros(len(self._time_list) + 1)
160-
starting_times[1:] = np.cumsum([times[0] for times in self._time_list])
158+
starting_times[1:] = np.cumsum(
159+
[
160+
np.mean([np.mean(repeat_data) for repeat_data in node_data])
161+
for node_data in self._time_list
162+
]
163+
)
161164

162165
def node_to_events(node, times, starting_time):
163166
return [
@@ -170,7 +173,9 @@ def node_to_events(node, times, starting_time):
170173
),
171174
ChromeTraceEvent(
172175
# Use start + duration instead of end to ensure precise timings.
173-
ts=s_to_us(times[0] + starting_time),
176+
ts=s_to_us(
177+
np.mean([np.mean(repeat_data) for repeat_data in times]) + starting_time
178+
),
174179
tid=1,
175180
pid=1,
176181
ph="E",
@@ -205,12 +210,18 @@ def _dump_graph_json(self, graph):
205210

206211
def get_debug_result(self, sort_by_time=True):
207212
"""Return the debugger result"""
208-
header = ["Node Name", "Ops", "Time(us)", "Time(%)", "Shape", "Inputs", "Outputs"]
209-
lines = ["---------", "---", "--------", "-------", "-----", "------", "-------"]
213+
header = ["Node Name", "Ops", "Time(us)", "Time(%)", "Shape", "Inputs", "Outputs", "Times"]
214+
lines = ["---------", "---", "--------", "-------", "-----", "------", "-------", "-------"]
210215
eid = 0
211216
data = []
212-
total_time = sum(time[0] for time in self._time_list)
217+
total_time = sum(
218+
[
219+
np.mean([np.mean(repeat_data) for repeat_data in node_data])
220+
for node_data in self._time_list
221+
]
222+
)
213223
for node, time in zip(self._nodes_list, self._time_list):
224+
time_mean = np.mean([np.mean(repeat_data) for repeat_data in time])
214225
num_outputs = self.get_graph_node_output_num(node)
215226
for j in range(num_outputs):
216227
op = node["op"]
@@ -219,11 +230,18 @@ def get_debug_result(self, sort_by_time=True):
219230
continue
220231
name = node["name"]
221232
shape = str(self._output_tensor_list[eid].shape)
222-
time_us = round(time[0] * 1e6, 3)
223-
time_percent = round(((time[0] / total_time) * 100), 3)
233+
time_us = round(time_mean * 1e6, 3)
234+
235+
times = str(
236+
[
237+
[round(number_data * 1e3, 3) for number_data in repeat_data]
238+
for repeat_data in time
239+
]
240+
)
241+
time_percent = round(((time_mean / total_time) * 100), 3)
224242
inputs = str(node["attrs"]["num_inputs"])
225243
outputs = str(node["attrs"]["num_outputs"])
226-
node_data = [name, op, time_us, time_percent, shape, inputs, outputs]
244+
node_data = [name, op, time_us, time_percent, shape, inputs, outputs, times]
227245
data.append(node_data)
228246
eid += 1
229247

python/tvm/contrib/graph_executor.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,7 @@ def benchmark(
356356
number=5,
357357
min_repeat_ms=None,
358358
end_to_end=False,
359+
cooldown_interval_ms=0,
359360
**kwargs,
360361
):
361362
"""Calculate runtime of a function by repeatedly calling it.
@@ -400,6 +401,9 @@ def benchmark(
400401
milliseconds. This can be used to ensure that the function is run enough to get an
401402
accurate measurement.
402403
404+
cooldown_interval_ms : Optional[float]
405+
The cool down interval between two measurements in milliseconds.
406+
403407
end_to_end : bool
404408
If set, include time to transfer input tensors to the device and time to transfer
405409
returned tensors in the total runtime. This will give accurate timings for end to end
@@ -432,5 +436,10 @@ def benchmark(
432436
if kwargs:
433437
self.set_input(**kwargs)
434438
return self.module.time_evaluator(
435-
func_name, device, repeat=repeat, number=number, min_repeat_ms=min_repeat_ms
439+
func_name,
440+
device,
441+
repeat=repeat,
442+
number=number,
443+
min_repeat_ms=min_repeat_ms,
444+
cooldown_interval_ms=cooldown_interval_ms,
436445
)()

python/tvm/meta_schedule/testing/tune_relay_auto_scheduler.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,10 @@ def f_per_layer(rt_mod, dev, input_data):
211211
graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000)
212212
print("|graph_nodes| = ", len(graph_nodes))
213213
print("|graph_time| = ", len(graph_time))
214-
graph_nodes_time = {k: float(v) for k, v in zip(graph_nodes, graph_time)}
214+
graph_nodes_time = {
215+
k: float(np.mean([np.mean(repeat_data) for repeat_data in v]))
216+
for k, v in zip(graph_nodes, graph_time)
217+
}
215218
for k, v in graph_nodes_time.items():
216219
print(f"{k} : {v:.3f}")
217220

python/tvm/meta_schedule/testing/tune_relay_meta_schedule.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,10 @@ def f_per_layer(rt_mod, dev, input_data):
174174
graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000)
175175
print("|graph_nodes| = ", len(graph_nodes))
176176
print("|graph_time| = ", len(graph_time))
177-
graph_nodes_time = {k: float(v) for k, v in zip(graph_nodes, graph_time)}
177+
graph_nodes_time = {
178+
k: float(np.mean([np.mean(repeat_data) for repeat_data in v]))
179+
for k, v in zip(graph_nodes, graph_time)
180+
}
178181
for k, v in graph_nodes_time.items():
179182
print(f"{k} : {v:.3f}")
180183

0 commit comments

Comments
 (0)