Skip to content

Commit 7dd59b6

Browse files
committed
add tensorflow profiling
1 parent 49b966a commit 7dd59b6

29 files changed

+654
-107
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,4 @@ tmp
1313
dist
1414
stackimpact.egg-info
1515
__pycache__
16+
venv*

README.md

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@ StackImpact is a production-grade performance profiler built for both production
88

99
#### Features
1010

11-
* Continuous hot spot profiling for CPU usage, memory allocation, blocking calls.
11+
* Continuous hot spot profiling of CPU usage, memory allocation and blocking calls.
12+
* TensorFlow profiling.
1213
* Error and exception monitoring.
1314
* Health monitoring including CPU, memory, garbage collection and other runtime metrics.
1415
* Alerts on profile anomalies.
@@ -142,6 +143,15 @@ agent.start_allocation_profiler();
142143
agent.stop_allocation_profiler();
143144
```
144145

146+
```python
147+
# Start TensorFlow profiler.
148+
agent.start_tf_profiler();
149+
```
150+
151+
```python
152+
# Stop TensorFlow profiler and report the recorded profile to the Dashboard.
153+
agent.stop_tf_profiler();
154+
```
145155

146156
#### Analyzing performance data in the Dashboard
147157

README.rst

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,9 @@ complement profiles for extensive performance analysis. Learn more at
2020
Features
2121
^^^^^^^^
2222

23-
- Continuous hot spot profiling for CPU usage, memory allocation,
23+
- Continuous hot spot profiling of CPU usage, memory allocation and
2424
blocking calls.
25+
- TensorFlow profiling.
2526
- Error and exception monitoring.
2627
- Health monitoring including CPU, memory, garbage collection and other
2728
runtime metrics.
@@ -192,6 +193,16 @@ disabled with ``auto_profiling: False``.
192193
# Stop heap allocation profiler and report the recorded profile to the Dashboard.
193194
agent.stop_allocation_profiler();
194195
196+
.. code:: python
197+
198+
# Start TensorFlow profiler.
199+
agent.start_tf_profiler();
200+
201+
.. code:: python
202+
203+
# Stop TensorFlow profiler and report the recorded profile to the Dashboard.
204+
agent.stop_tf_profiler();
205+
195206
Analyzing performance data in the Dashboard
196207
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
197208

examples/tensorflow/app.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
from __future__ import print_function
2+
import random
3+
import time
4+
import sys
5+
import threading
6+
import tensorflow as tf
7+
sys.path.append(".")
8+
import stackimpact
9+
10+
11+
# Start the StackImpact agent for this example; the key below is a
# placeholder and has to be replaced with a real agent key.
agent = stackimpact.start(
    agent_key='agent key here',
    app_name='MyTensorFlowScript',
)
14+
15+
16+
def handle_some_event():
    """Handle one simulated event inside a StackImpact profiling span.

    Resets the default TensorFlow graph, multiplies two random 1000x1000
    matrices and evaluates the product once in a new session.
    """
    # NOTE(review): the nesting below was reconstructed from a flattened
    # diff; confirm that the session is meant to run inside the span.
    with agent.profile():
        tf.reset_default_graph()

        shape = [1000, 1000]
        lhs = tf.random_normal(shape)
        rhs = tf.random_normal(shape)
        product = tf.matmul(lhs, rhs)

        with tf.Session() as session:
            session.run(product)
25+
26+
27+
# Simulate a stream of events: handle one, pause for two seconds, repeat
# until the process is terminated.
while True:
    handle_some_event()
    time.sleep(2)
31+
32+

examples/tensorflow/manual.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
from __future__ import print_function
2+
import sys
3+
import tensorflow as tf
4+
sys.path.append(".")
5+
import stackimpact
6+
7+
8+
# Automatic profiling is disabled so that the TensorFlow profiler can be
# started and stopped explicitly around the workload below.
agent = stackimpact.start(
    agent_key = 'agent key here',
    app_name = 'MyTensorFlowScript',
    auto_profiling = False)


agent.start_tf_profiler()
try:
    x = tf.random_normal([1000, 1000])
    y = tf.random_normal([1000, 1000])
    res = tf.matmul(x, y)

    with tf.Session() as sess:
        sess.run(res)
finally:
    # Stop the profiler even when the run above raises, so it is never left
    # running and whatever was recorded is still reported.
    agent.stop_tf_profiler()
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
from keras.models import Sequential
2+
from keras.layers.core import Dense, Dropout, Activation
3+
from keras.optimizers import SGD
4+
import numpy as np
5+
import sys
6+
7+
sys.path.append(".")
8+
import stackimpact
9+
10+
11+
# Automatic profiling is disabled so that the TensorFlow profiler can be
# started and stopped explicitly around the training run below.
agent = stackimpact.start(
    agent_key = 'agent key here',
    app_name = 'MyKerasScript',
    auto_profiling = False)


agent.start_tf_profiler()
try:
    # XOR truth table: the four two-bit inputs and their expected outputs.
    X = np.array([[0,0],[0,1],[1,0],[1,1]])
    y = np.array([[0],[1],[1],[0]])

    model = Sequential()
    model.add(Dense(8, input_dim=2))
    model.add(Activation('tanh'))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    sgd = SGD(lr=0.1)
    model.compile(loss='binary_crossentropy', optimizer=sgd)

    model.fit(X, y, batch_size=1, nb_epoch=1000)
    print(model.predict_proba(X))
finally:
    # Stop the profiler even when training or prediction raises, so it is
    # never left running and whatever was recorded is still reported.
    agent.stop_tf_profiler()
35+

publish.sh

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#!/bin/bash
# Release helper: regenerate README.rst, run the test suite, build a source
# distribution and upload every built bundle with twine.

# Abort on the first failing command so a broken build is never uploaded.
set -e

# README.rst is generated from README.md (setup.py reads README.rst as the
# package's long description).
pandoc --from=markdown --to=rst --output=README.rst 'README.md'

# The pattern is quoted so the shell hands it to unittest verbatim instead of
# expanding it against files in the current directory.
python3 -m unittest discover -v -s tests -p '*_test.py'

# Remove stale bundles so only the freshly built one is uploaded.
rm -f dist/*.tar.gz
python setup.py sdist

for bundle in dist/*.tar.gz; do
    echo "Publishing $bundle..."
    # Quoted so a path containing whitespace stays a single argument.
    twine upload "$bundle"
done
16+

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ def read(fname):
66

77
setup(
88
name = 'stackimpact',
9-
version = '1.2.2',
9+
version = '1.2.3',
1010
description = 'StackImpact Python Profiler',
1111
long_description = read('README.rst'),
1212
author = 'StackImpact',

stackimpact/agent.py

Lines changed: 37 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,10 @@
2626
from .profilers.cpu_profiler import CPUProfiler
2727
from .profilers.allocation_profiler import AllocationProfiler
2828
from .profilers.block_profiler import BlockProfiler
29+
from .profilers.tf_profiler import TFProfiler
2930

3031

31-
class Span:
32+
class Span(object):
3233

3334
def __init__(self, stop_func = None):
3435
if stop_func:
@@ -48,9 +49,9 @@ def __exit__(self, exc_type, exc_value, traceback):
4849
self.stop()
4950

5051

51-
class Agent:
52+
class Agent(object):
5253

53-
AGENT_VERSION = "1.2.2"
54+
AGENT_VERSION = "1.2.3"
5455
SAAS_DASHBOARD_ADDRESS = "https://agent-api.stackimpact.com"
5556

5657
def __init__(self, **kwargs):
@@ -99,11 +100,19 @@ def __init__(self, **kwargs):
99100
config.report_interval = 120
100101
self.block_reporter = ProfileReporter(self, BlockProfiler(self), config)
101102

103+
config = ProfilerConfig()
104+
config.log_prefix = 'TensorFlow profiler'
105+
config.max_profile_duration = 20
106+
config.max_span_duration = 5
107+
config.max_span_count = 30
108+
config.span_interval = 20
109+
config.report_interval = 120
110+
self.tf_reporter = ProfileReporter(self, TFProfiler(self), config)
102111

103112
self.options = None
104113

105114

106-
def get_option(self, name, default_val = None):
115+
def get_option(self, name, default_val=None):
107116
if name not in self.options:
108117
return default_val
109118
else:
@@ -151,6 +160,7 @@ def start(self, **kwargs):
151160
self.cpu_reporter.setup()
152161
self.allocation_reporter.setup()
153162
self.block_reporter.setup()
163+
self.tf_reporter.setup()
154164
self.span_reporter.setup()
155165
self.error_reporter.setup()
156166
self.process_reporter.setup()
@@ -201,6 +211,7 @@ def enable(self):
201211
self.cpu_reporter.start()
202212
self.allocation_reporter.start()
203213
self.block_reporter.start()
214+
self.tf_reporter.start()
204215
self.span_reporter.start()
205216
self.error_reporter.start()
206217
self.process_reporter.start()
@@ -212,13 +223,14 @@ def disable(self):
212223
self.cpu_reporter.stop()
213224
self.allocation_reporter.stop()
214225
self.block_reporter.stop()
226+
self.tf_reporter.stop()
215227
self.span_reporter.stop()
216228
self.error_reporter.stop()
217229
self.process_reporter.stop()
218230
self.config.set_agent_enabled(False)
219231

220232

221-
def profile(self, name = 'Default'):
233+
def profile(self, name='Default'):
222234
if not self.agent_started or self.span_active:
223235
return Span(None)
224236

@@ -232,19 +244,21 @@ def profile(self, name = 'Default'):
232244
active_reporters.append(self.allocation_reporter)
233245
if self.block_reporter.started:
234246
active_reporters.append(self.block_reporter)
247+
if self.tf_reporter.started:
248+
active_reporters.append(self.tf_reporter)
235249

236250
if len(active_reporters) > 0:
237251
selected_reporter = active_reporters[int(math.floor(random.random() * len(active_reporters)))]
238252
if not selected_reporter.start_profiling(True, True):
239253
selected_reporter = None
240254

241-
start_timestamp = timestamp()
255+
start_timestamp = time.time()
242256

243257
def stop_func():
244258
if selected_reporter:
245259
selected_reporter.stop_profiling()
246260

247-
duration = timestamp() - start_timestamp
261+
duration = time.time() - start_timestamp
248262
self.span_reporter.record_span(name, duration)
249263

250264
self.span_active = False
@@ -298,15 +312,24 @@ def stop_block_profiler(self):
298312
self._stop_profiler(self.block_reporter)
299313

300314

315+
def start_tf_profiler(self):
    """Start the TensorFlow profiler manually.

    Delegates to ``_start_profiler`` with the agent's TensorFlow reporter.
    """
    self._start_profiler(self.tf_reporter)
317+
318+
319+
def stop_tf_profiler(self):
    """Stop the TensorFlow profiler that was started manually.

    Delegates to ``_stop_profiler`` with the agent's TensorFlow reporter.
    """
    self._stop_profiler(self.tf_reporter)
321+
322+
301323
def report(self):
302324
if not self.agent_started or self.get_option('auto_profiling'):
303325
return
304326

305-
self.configLoader.load(True)
327+
self.config_loader.load(True)
306328

307-
self.cpuReporter.report(True);
308-
self.allocationReporter.report(True);
309-
self.asyncReporter.report(True);
329+
self.cpu_reporter.report(True);
330+
self.allocation_reporter.report(True);
331+
self.block_reporter.report(True);
332+
self.tf_reporter.report(True);
310333

311334
self.messageQueue.flush(True)
312335

@@ -325,13 +348,15 @@ def destroy(self):
325348
self.cpu_reporter.stop()
326349
self.allocation_reporter.stop()
327350
self.block_reporter.stop()
351+
self.tf_reporter.stop()
328352
self.error_reporter.stop()
329353
self.span_reporter.stop()
330354
self.process_reporter.stop()
331355

332356
self.cpu_reporter.destroy()
333357
self.allocation_reporter.destroy()
334358
self.block_reporter.destroy()
359+
self.tf_reporter.destroy()
335360
self.error_reporter.destroy()
336361
self.span_reporter.destroy()
337362
self.process_reporter.destroy()
@@ -421,7 +446,7 @@ def run_in_main_thread(self, func):
421446

422447

423448

424-
class TimerWraper():
449+
class TimerWraper(object):
425450
def __init__(self):
426451
self.timer = None
427452
self.cancel_lock = threading.Lock()

stackimpact/api_request.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from urllib.parse import urlencode
2121

2222

23-
class APIRequest:
23+
class APIRequest(object):
2424
def __init__(self, agent):
2525
self.agent = agent
2626

@@ -42,11 +42,12 @@ def post(self, endpoint, payload):
4242
req_body = {
4343
'runtime_type': 'python',
4444
'runtime_version': '{0.major}.{0.minor}.{0.micro}'.format(sys.version_info),
45+
'runtime_path': sys.prefix,
4546
'agent_version': self.agent.AGENT_VERSION,
4647
'app_name': self.agent.get_option('app_name'),
4748
'app_version': self.agent.get_option('app_version'),
4849
'app_environment': self.agent.get_option('app_environment'),
49-
'host_name': self.agent.get_option('host_name', socket.gethostname()),
50+
'host_name': self.agent.get_option('host_name', host_name),
5051
'process_id': os.getpid(),
5152
'run_id': self.agent.run_id,
5253
'run_ts': self.agent.run_ts,
@@ -55,9 +56,9 @@ def post(self, endpoint, payload):
5556
}
5657

5758
gzip_out = BytesIO()
58-
with gzip.GzipFile(fileobj=gzip_out, mode="w") as f:
59-
f.write(json.dumps(req_body).encode('utf-8'))
60-
f.close()
59+
with gzip.GzipFile(fileobj=gzip_out, mode="w") as out_file:
60+
out_file.write(json.dumps(req_body).encode('utf-8'))
61+
out_file.close()
6162

6263
gzip_out_val = gzip_out.getvalue()
6364
if isinstance(gzip_out_val, str):

0 commit comments

Comments
 (0)