Skip to content

Commit ffef540

Browse files
simon-mo authored and dcrankshaw committed
Metrics and Monitoring V0.0.2: Model Container, Latencies, and Example (#357)
* Allow port specification for Prometheus * Add monitoring example with grafana This commit adds an example in the example folder. It helps the user to visualize Clipper metrics. The init_grafana.py script launches a grafana/grafana docker container. It adds prometheus as a data source. I attempted to add the dashboard via Grafana API but failed after trying it out for hours. This seems like a persistent issue for at least two years: grafana/grafana#2816 <Simon Mo> * Fix dict unpacking lint error * Format Code * Update yapf * Update yapf and reformat code * Check yapf version in shell script * Change `echo` to `cat` * Add python version check; almost ready * Format code with python version 2.7.12 * Allow port specification for Prometheus * Add monitoring example with grafana This commit adds an example in the example folder. It helps the user to visualize Clipper metrics. The init_grafana.py script launches a grafana/grafana docker container. It adds prometheus as a data source. I attempted to add the dashboard via Grafana API but failed after trying it out for hours. This seems like a persistent issue for at least two years: grafana/grafana#2816 <Simon Mo> * Update Metric Config; Finish Implementing Model Container * Add Integration Test * Fix Integration Tests * Format Code and Rebase from Develop * Format Code Again with python2.7.12 * Update monitoring readme; trigger Jenkins * Address Comments 1. Fix naming and typos 2. Update configuration buckets * Format Code * Address comments - use 1000.0 instead of 1000 in rpc arithmetic. - change docs * Address RPC Test issue Add a block of code to make sure the config file is found. * Update tensorflowcifar
1 parent 9ac3bd2 commit ffef540

File tree

13 files changed

+693
-52
lines changed

13 files changed

+693
-52
lines changed

clipper_admin/clipper_admin/docker/docker_container_manager.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ def __init__(self,
2525
clipper_rpc_port=7000,
2626
redis_ip=None,
2727
redis_port=6379,
28+
prometheus_port=9090,
2829
docker_network="clipper_network",
2930
extra_container_kwargs={}):
3031
"""
@@ -63,6 +64,7 @@ def __init__(self,
6364
else:
6465
self.external_redis = True
6566
self.redis_port = redis_port
67+
self.prometheus_port = prometheus_port
6668
if docker_network is "host":
6769
raise ClipperException(
6870
"DockerContainerManager does not support running Clipper on the "
@@ -156,7 +158,7 @@ def start_clipper(self, query_frontend_image, mgmt_frontend_image,
156158
setup_metric_config(query_frontend_metric_name,
157159
CLIPPER_INTERNAL_METRIC_PORT)
158160
run_metric_image(self.docker_client, self.common_labels,
159-
self.extra_container_kwargs)
161+
self.prometheus_port, self.extra_container_kwargs)
160162

161163
self.connect()
162164

clipper_admin/clipper_admin/docker/docker_metric_utils.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -80,15 +80,15 @@ def setup_metric_config(query_frontend_metric_name,
8080
yaml.dump(prom_config, f)
8181

8282

83-
def run_metric_image(docker_client, common_labels, extra_container_kwargs):
83+
def run_metric_image(docker_client, common_labels, prometheus_port,
84+
extra_container_kwargs):
8485
"""
8586
Run the prometheus image.
8687
:param docker_client: The docker client object
8788
:param common_labels: Labels to pass in
8889
:param extra_container_kwargs: Kwargs to pass in.
8990
:return: None
9091
"""
91-
9292
metric_cmd = [
9393
"--config.file=/etc/prometheus/prometheus.yml",
9494
"--storage.tsdb.path=/prometheus",
@@ -101,7 +101,7 @@ def run_metric_image(docker_client, common_labels, extra_container_kwargs):
101101
"prom/prometheus",
102102
metric_cmd,
103103
name="metric_frontend-{}".format(random.randint(0, 100000)),
104-
ports={'9090/tcp': 9090},
104+
ports={'9090/tcp': prometheus_port},
105105
volumes={
106106
'/tmp/clipper/prometheus.yml': {
107107
'bind': '/etc/prometheus/prometheus.yml',

containers/python/rpc.py

Lines changed: 93 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,12 @@
77
from datetime import datetime
88
import socket
99
import sys
10+
import os
11+
import yaml
1012
from collections import deque
1113
from multiprocessing import Pipe, Process
1214
from prometheus_client import start_http_server
13-
from prometheus_client.core import GaugeMetricFamily, REGISTRY
15+
from prometheus_client.core import Counter, Gauge, Histogram, Summary
1416

1517
INPUT_TYPE_BYTES = 0
1618
INPUT_TYPE_INTS = 1
@@ -193,8 +195,6 @@ def run(self, metric_conn):
193195
sys.stdout.flush()
194196
sys.stderr.flush()
195197

196-
pred_metric = dict(model_pred_count=0)
197-
198198
while True:
199199
socket = self.context.socket(zmq.DEALER)
200200
poller.register(socket, zmq.POLLIN)
@@ -312,13 +312,25 @@ def run(self, metric_conn):
312312

313313
response.send(socket, self.event_history)
314314

315-
pred_metric['model_pred_count'] += 1
316-
317-
metric_conn.send(pred_metric)
315+
recv_time = (t2 - t1).microseconds
316+
parse_time = (t3 - t2).microseconds
317+
handle_time = (t4 - t3).microseconds
318+
319+
model_container_metric = {}
320+
model_container_metric['pred_total'] = 1
321+
model_container_metric[
322+
'recv_time_ms'] = recv_time / 1000.0
323+
model_container_metric[
324+
'parse_time_ms'] = parse_time / 1000.0
325+
model_container_metric[
326+
'handle_time_ms'] = handle_time / 1000.0
327+
model_container_metric['end_to_end_latency_ms'] = (
328+
recv_time + parse_time + handle_time) / 1000.0
329+
metric_conn.send(model_container_metric)
318330

319331
print("recv: %f us, parse: %f us, handle: %f us" %
320-
((t2 - t1).microseconds, (t3 - t2).microseconds,
321-
(t4 - t3).microseconds))
332+
(recv_time, parse_time, handle_time))
333+
322334
sys.stdout.flush()
323335
sys.stderr.flush()
324336

@@ -499,39 +511,95 @@ def start(self, model, host, port, model_name, model_version, input_type):
499511
self.server.model_input_type = model_input_type
500512
self.server.model = model
501513

502-
parent_conn, child_conn = Pipe(duplex=True)
514+
child_conn, parent_conn = Pipe(duplex=False)
503515
metrics_proc = Process(target=run_metric, args=(child_conn, ))
504516
metrics_proc.start()
505517
self.server.run(parent_conn)
506518

507519

508520
class MetricCollector:
    """
    Record model-container metrics into Prometheus metric objects.

    Note this is no longer a Prometheus Collector.
    Instead, this is simply a class to encapsulate metric recording: the
    metrics listed under 'Model Container' in metrics_config.yaml are
    created at construction time, and collect() then updates them from the
    dicts received on the pipe.
    """

    # Name of the update method to call on a metric of each supported type.
    _UPDATE_METHOD = {
        'Counter': 'inc',
        'Gauge': 'set',
        'Histogram': 'observe',
        'Summary': 'observe',
    }

    def __init__(self, pipe_child_conn):
        """
        :param pipe_child_conn: Connection object to receive from; every
            message is a dict mapping a metric name to the value to record.
        """
        self.pipe_conn = pipe_child_conn
        self.metrics = {}
        self.name_to_type = {}

        self._load_config()

    def _load_config(self):
        """
        Read metrics_config.yaml and create the metrics it describes.

        :raises Exception: see _register_metrics.
        """
        config_file_path = 'metrics_config.yaml'

        # Make sure we are inside /container, where the config file lives.
        if os.path.basename(os.getcwd()) != 'container':
            config_file_path = os.path.join(os.getcwd(), 'container',
                                            config_file_path)

        with open(config_file_path, 'r') as f:
            # safe_load only builds plain data (dicts, lists, scalars),
            # which is all this config needs; yaml.load without an explicit
            # Loader can construct arbitrary objects.
            config = yaml.safe_load(f)

        self._register_metrics(config['Model Container'])

    def _register_metrics(self, config):
        """
        Create one Prometheus metric per entry of the config section.

        :param config: dict holding a 'prefix' entry plus, for each metric
            name, a dict with 'type', 'description' and optionally 'bucket'
            (Histogram only).
        :raises Exception: if a metric lacks a type or a description, or if
            its type is not Counter, Gauge, Histogram or Summary.
        """
        specs = dict(config)  # copy: popping 'prefix' must not alter the input
        prefix = 'clipper_{}_'.format(specs.pop('prefix'))

        for name, spec in specs.items():
            metric_type = spec.get('type')
            metric_description = spec.get('description')

            # Both fields are mandatory, so reject the spec as soon as
            # either one is missing.
            if not metric_type or not metric_description:
                raise Exception(
                    "{}: Metric Type and Metric Description are Required in Config File.".
                    format(name))

            if metric_type == 'Counter':
                self.metrics[name] = Counter(prefix + name, metric_description)
            elif metric_type == 'Gauge':
                self.metrics[name] = Gauge(prefix + name, metric_description)
            elif metric_type == 'Histogram':
                if 'bucket' in spec:
                    buckets = spec['bucket'] + [float("inf")]
                    self.metrics[name] = Histogram(
                        prefix + name, metric_description, buckets=buckets)
                else:
                    self.metrics[name] = Histogram(prefix + name,
                                                   metric_description)
            elif metric_type == 'Summary':
                self.metrics[name] = Summary(prefix + name, metric_description)
            else:
                raise Exception(
                    "Unknown Metric Type: {}. See config file.".format(
                        metric_type))

            self.name_to_type[name] = metric_type

    def collect(self):
        """
        Receive metric dicts from the pipe and record every value.

        Loops forever, so it only ends when an exception is raised.
        A Counter is incremented by the received value, a Gauge is set to
        it, and a Histogram or Summary observes it.

        :raises KeyError: if a received name was not created from the config.
        :raises Exception: if the recorded type of a metric is not supported.
        """
        while True:
            latest_metric_dict = self.pipe_conn.recv()  # This call is blocking.
            for name, value in latest_metric_dict.items():
                metric = self.metrics[name]
                update = self._UPDATE_METHOD.get(self.name_to_type[name])
                if update is None:
                    raise Exception(
                        "Unknown Metric Type for {}. See config file.".format(
                            name))
                getattr(metric, update)(value)
525594

526595

527596
def run_metric(child_conn, metric_port=1390):
    """
    This function takes a child_conn at the end of the pipe and
    receives objects to update the prometheus metrics.

    It builds a MetricCollector around child_conn, starts the Prometheus
    HTTP server and then hands control to the collector's receive loop,
    so it is recommended to run it in a separate process.

    :param child_conn: Connection object the MetricCollector receives from.
    :param metric_port: Port the metrics HTTP server listens on. Defaults
        to 1390, the value that used to be hard-coded.
    """
    collector = MetricCollector(child_conn)
    start_http_server(metric_port)
    collector.collect()

dockerfiles/RPCDockerfile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,14 @@ RUN mkdir -p /model \
99
&& apt-get update \
1010
&& apt-get install -y libzmq3 libzmq3-dev \
1111
&& conda install -y pyzmq \
12-
&& pip install prometheus_client
12+
&& pip install prometheus_client pyyaml
1313

1414
WORKDIR /container
1515

1616
COPY containers/python/__init__.py containers/python/rpc.py /container/
1717

18+
COPY monitoring/metrics_config.yaml /container/
19+
1820
ENV CLIPPER_MODEL_PATH=/model
1921

2022
# vim: set filetype=dockerfile:

dockerfiles/TensorFlowCifarDockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ FROM clipper/py-rpc:${CODE_VERSION}
33

44
MAINTAINER Dan Crankshaw <dscrankshaw@gmail.com>
55

6-
RUN conda install tensorflow
6+
RUN conda install -c conda-forge tensorflow
77

88
COPY containers/python/tf_cifar_container.py /container/
99

0 commit comments

Comments
 (0)