My server runs many long-running notebooks, and I'd like to monitor the notebooks' memory usage.
Is there a way to match a PID or process name to a notebook?
My server runs many long-running notebooks, and I'd like to monitor the notebooks' memory usage.
Is there a way to match a PID or process name to a notebook?
Since the question is about monitoring notebooks' memory, I've written a complete example showing the memory consumption of the running notebooks. It is based on the excellent @jcb91 answer and a few other answers (1, 2, 3, 4).
import json
import os
import os.path
import posixpath
import subprocess
import urllib2
import pandas as pd
import psutil
def show_notebooks_table(host, port):
    """Show table with info about running jupyter notebooks.

    Args:
        host: host of the jupyter server.
        port: port of the jupyter server.

    Returns:
        DataFrame with one row per (notebook, process) pair, sorted by memory
        consumption in descending order, with the following columns:
            * index: notebook kernel id. The index is named after the
              directory prefix shared by all notebook paths.
            * path: path to notebook file, relative to that shared prefix.
            * pid: pid of the notebook process.
            * memory: notebook memory consumption in percentage.
        Kernels for which no process was found are left out. When no notebook
        is running, an empty DataFrame with the same columns is returned.
    """
    notebooks = get_running_notebooks(host, port)
    if not notebooks:
        empty = pd.DataFrame(columns=['path', 'pid', 'memory'])
        empty.index.name = ''
        return empty

    # Only a common *leading* part may be stripped from the paths. A longest
    # common substring is not suitable here: for 'proj/a.ipynb' and
    # 'proj/b.ipynb' it is '.ipynb', which is not a prefix at all.
    prefix = os.path.commonprefix([notebook['path'] for notebook in notebooks])
    # commonprefix works character by character, so cut the result back to
    # the last directory separator to avoid splitting a file or folder name.
    prefix = prefix[:prefix.rfind('/') + 1]

    df = pd.DataFrame(notebooks).set_index('kernel_id')
    df.index.name = prefix
    df['path'] = df['path'].apply(lambda path: path[len(prefix):])
    df['pid'] = pd.Series(
        [get_process_id(kernel_id) for kernel_id in df.index], index=df.index)
    # same notebook can be run in multiple processes
    df = expand_column(df, 'pid')
    # A kernel without a matching process ends up with a missing pid, and the
    # expansion may turn pids into floats; psutil needs real integers.
    df = df.dropna(subset=['pid'])
    df['pid'] = df['pid'].astype(int)
    df['memory'] = df['pid'].apply(memory_usage_psutil)
    return df.sort_values('memory', ascending=False)
def get_running_notebooks(host, port):
    """Get kernel ids and paths of the running notebooks.

    Finds which kernel corresponds to which notebook by querying the
    notebook server api for sessions.

    Args:
        host: host at which the notebook server is listening. E.g. 'localhost'.
        port: port at which the notebook server is listening. E.g. 8888.

    Returns:
        list of dicts {kernel_id: notebook kernel id, path: path to notebook file}.
        Sessions that carry no 'notebook' entry are skipped.
    """
    sessions_url = 'http://%s:%d/api/sessions' % (host, port)
    connection = urllib2.urlopen(sessions_url)
    try:
        response = connection.read()
    finally:
        # urllib2 responses are not context managers; close explicitly so the
        # socket is released even when reading fails.
        connection.close()
    sessions = json.loads(response)
    return [{'kernel_id': session['kernel']['id'],
             'path': session['notebook']['path']}
            for session in sessions if 'notebook' in session]
def get_process_id(name):
    """Look up the pids of processes matching a (partial) name or regex.

    The lookup is delegated to ``pgrep -f``.
    Source: https://stackoverflow.com/a/44712205/304209.

    Args:
        name: (partial) name or regex handed to pgrep as the pattern.

    Returns:
        list of integer pids printed by pgrep; empty when it prints nothing.

    Examples (actual values depend on the machine):
        get_process_id('kthreadd')              -> [2]
        get_process_id('watchdog')              -> [10, 11, 16, 21, 26, ...]
        get_process_id('non-existent process')  -> []
    """
    pgrep = subprocess.Popen(
        ['pgrep', '-f', name], stdout=subprocess.PIPE, shell=False)
    stdout, _ = pgrep.communicate()
    return list(map(int, stdout.split()))
def memory_usage_psutil(pid=None):
    """Report the memory usage percentage of a process, like in top.

    Source: https://stackoverflow.com/a/30014612/304209.

    Args:
        pid: pid of the process to analyze. If None, the current process
            is analyzed.

    Returns:
        memory usage of the process as reported by psutil's
        ``memory_percent()``, in percentage like in top, values in [0, 100].
    """
    target_pid = os.getpid() if pid is None else pid
    return psutil.Process(target_pid).memory_percent()
def long_substr(strings):
    """Find longest common substring in a list of strings.

    Source: https://stackoverflow.com/a/2894073/304209.

    Args:
        strings: list of strings.

    Returns:
        longest substring which is found in all of the strings; of several
        equally long candidates, the one found first in the first string.
        An empty string is returned when the list has fewer than two items
        or when its first string is empty.
    """
    if len(strings) <= 1 or not strings[0]:
        return ''

    reference = strings[0]
    best = ''
    for start in range(len(reference)):
        # Only candidates strictly longer than the current best can replace it.
        for end in range(start + len(best) + 1, len(reference) + 1):
            candidate = reference[start:end]
            if all(candidate in other for other in strings):
                best = candidate
    return best
def expand_column(dataframe, column):
    """Transform iterable column values into multiple rows.

    Source: https://stackoverflow.com/a/27266225/304209.

    Args:
        dataframe: DataFrame to process.
        column: name of the column to expand.

    Returns:
        copy of the DataFrame with the following updates:
            * rows whose column holds a single value are kept as is.
            * rows whose column holds a list of values become several rows,
              one per value of the list, sharing the original index label.
    """
    def values_as_series(row):
        return pd.Series(row[column])

    # One output column per list position, then fold those positions back
    # into rows that keep the original index label.
    wide = dataframe.apply(values_as_series, axis=1)
    expanded = wide.stack()
    expanded = expanded.reset_index(level=1, drop=True)
    expanded.name = column

    remaining = dataframe.drop(column, axis=1)
    return remaining.join(expanded)
Here is an example output of show_notebooks_table('localhost', 8888):
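This is possible, although I could only think of the rather hackish solution I outline below. In summary:
1. Get the port each kernel is listening on from the kernel connection JSON files in the notebook server's security directory.
2. Parse the output of netstat to determine which pid is listening to the ports found in step 1.
3. Query the notebook server's sessions API to find which kernel id corresponds to which notebook.

I suspect there is a much simpler way, but I'm not sure as yet where to find it.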
import glob
import os.path
import posixpath
import re
import json
import subprocess
import urllib2
# Maps running notebook kernels to the pids of their processes, in three
# steps: read each kernel's control port from its connection file, look up
# which pid listens on that port with netstat, then ask the notebook server
# which notebook each kernel belongs to.

# the url and port at which your notebook server listens
server_path = 'http://localhost'
server_port = 8888
# the security directory of the notebook server, containing its connections files
server_sec_dir = 'C:/Users/Josh/.ipython/profile_default/security/'

# part 1 : open all the connection json files to find their port numbers
kernels = {}
for json_path in glob.glob(os.path.join(server_sec_dir, 'kernel-*.json')):
    # close each connection file as soon as it is parsed
    with open(json_path, 'r') as connection_file:
        control_port = json.load(connection_file)['control_port']
    # file names look like 'kernel-<key>.json'; strip 'kernel-' and '.json'
    key = os.path.basename(json_path)[7:-5]
    kernels[control_port] = {'control_port': control_port, 'key': key}

# part2 : get netstat info for which processes use which tcp ports
netstat_ouput = subprocess.check_output(['netstat', '-ano'])
# parse the netstat output to map ports to PIDs
netstat_regex = re.compile(
    r"^\s+\w+\s+"  # protocol word
    r"\d+(\.\d+){3}:(\d+)\s+"  # local ip:port
    r"\d+(\.\d+){3}:(\d+)\s+"  # foreign ip:port
    r"LISTENING\s+"  # connection state
    r"(\d+)$"  # PID
)
for line in netstat_ouput.splitlines(False):
    match = netstat_regex.match(line)
    if match and match.lastindex == 5:
        port = int(match.group(2))
        if port in kernels:
            pid = int(match.group(5))
            kernels[port]['pid'] = pid

# reorganize kernels to use 'key' as keys
kernels = {kernel['key']: kernel for kernel in kernels.values()}

# part 3 : find which kernel corresponds to which notebook
# by querying the notebook server api for sessions
sessions_url = posixpath.join('%s:%d' % (server_path, server_port),
                              'api', 'sessions')
connection = urllib2.urlopen(sessions_url)
try:
    response = connection.read()
finally:
    # urllib2 responses are not context managers; close explicitly
    connection.close()
for session in json.loads(response):
    key = session['kernel']['id']
    if key in kernels:
        nb_path = os.path.join(session['notebook']['path'],
                               session['notebook']['name'])
        kernels[key]['nb_path'] = nb_path

# now do what you will with the dict. I just print a pretty list version:
print(json.dumps(list(kernels.values()), sort_keys=True, indent=4))
outputs (for me, at the moment):
[
{
"key": "9142896a-34ca-4c01-bc71-e5709652cac5",
"nb_path": "2015/2015-01-16\\serhsdh.ipynb",
"pid": 11436,
"port": 56173
},
{
"key": "1ddedd95-5673-45a6-b0fb-a3083debb681",
"nb_path": "Untitled0.ipynb",
"pid": 11248,
"port": 52191
},
{
"key": "330343dc-ae60-4f5c-b9b8-e5d05643df19",
"nb_path": "ipynb\\temp.ipynb",
"pid": 4680,
"port": 55446
},
{
"key": "888ad49b-5729-40c8-8d53-0e025b03ecc6",
"nb_path": "Untitled2.ipynb",
"pid": 7584,
"port": 55401
},
{
"key": "26d9ddd2-546a-40b4-975f-07403bb4e048",
"nb_path": "Untitled1.ipynb",
"pid": 10916,
"port": 55351
}
]
Adding to Dennis Golomazov's answer: to make it work with a notebook server that requires a password login,
I replaced the get_running_notebooks function with this one (source):
import requests
def get_running_notebooks(host, port, password=''):
    """Get kernel ids and paths of the running notebooks.

    Logs in to the notebook server with the given password, then queries the
    notebook server api for sessions to find which kernel corresponds to
    which notebook.

    Args:
        host: host at which the notebook server is listening. E.g. 'localhost'.
        port: port at which the notebook server is listening. E.g. 8888.
        password: password submitted to the server's login form.

    Returns:
        list of dicts {kernel_id: notebook kernel id, path: path to notebook file}.
        Sessions that carry no 'notebook' entry are skipped.

    Raises:
        requests.HTTPError: if the sessions request returns an error status
            (NOTE(review): presumably what a rejected login leads to - confirm).
    """
    base_url = 'http://{0}:{1}/'.format(host, port)
    login_url = base_url + 'login?next=%2F'
    sessions_url = posixpath.join(base_url, 'api', 'sessions')

    # The session keeps the login cookies between requests; the context
    # manager closes its connections when done.
    with requests.Session() as http:
        # Fetch the login page first: it sets the '_xsrf' cookie whose value
        # has to be sent back together with the password.
        xsrf_cookie = http.get(login_url).cookies['_xsrf']
        http.post(login_url, data={'_xsrf': xsrf_cookie, 'password': password})
        ret = http.get(sessions_url)
        # Fail with a clear HTTP error instead of trying to parse an error page.
        ret.raise_for_status()
        sessions = json.loads(ret.text)

    return [{'kernel_id': session['kernel']['id'],
             'path': session['notebook']['path']}
            for session in sessions if 'notebook' in session]