leandro commited on
Commit
df8330d
·
1 Parent(s): 965b999

initial commit

Browse files
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9
2
+
3
+ RUN useradd -m -u 1000 user
4
+ USER user
5
+ ENV PATH="/home/user/.local/bin:$PATH"
6
+
7
+ WORKDIR /app
8
+
9
+ COPY --chown=user ./requirements.txt requirements.txt
10
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
11
+
12
+ COPY --chown=user . /app
13
+ CMD ["uvicorn", "jupyter/jupyter_server:app", "--host", "0.0.0.0", "--port", "7860"]
jupyter/Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9-slim
2
+
3
+ WORKDIR /app
4
+
5
+ # Minimal requirements
6
+ RUN pip install --upgrade pip flask docker requests ipython jupyter-client ipykernel
7
+ RUN ipython kernel install --name "python3" --user
8
+
9
+ # Extra requirements
10
+ RUN pip install pandas scikit-learn matplotlib seaborn
11
+
12
+ COPY jupyter_kernel.py .
13
+
14
+ EXPOSE 5000
15
+
16
+ ENTRYPOINT ["python", "jupyter_kernel.py"]
jupyter/jupyter_kernel.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request, jsonify
2
+ from jupyter_client import KernelManager
3
+ import sys, io, time, queue, os
4
+
5
+
6
+ app = Flask(__name__)
7
+ TIMEOUT = os.getenv("TIMEOUT", 60)
8
+
9
+ km = KernelManager(kernel_name='python3')
10
+
11
+ km.start_kernel()
12
+ kc = km.client()
13
+
14
+ @app.route('/execute', methods=['POST'])
15
+ def execute_code():
16
+ global kc
17
+ code = request.json.get('code')
18
+ if not code:
19
+ return jsonify({'error': 'No code provided'}), 400
20
+
21
+ outputs = []
22
+ start_time = time.time()
23
+
24
+ _ = kc.execute(code)
25
+
26
+ success = True
27
+ error = None
28
+
29
+ while True:
30
+ try:
31
+ # if we timed out we interrupt the kernel so we can send the next request
32
+ if time.time() - start_time > TIMEOUT:
33
+ outputs.append({"text": f"Code execution has timed out (max {TIMEOUT}sec)."})
34
+ success = False
35
+ error = "TimeOut"
36
+ km.interrupt_kernel()
37
+ break
38
+
39
+ # get message from output buffer
40
+ msg = kc.get_iopub_msg(timeout=1)
41
+
42
+ # if the kernel is idle again we can wrap up
43
+ if msg['header']['msg_type'] == 'status' and msg['content']['execution_state'] == 'idle':
44
+ break
45
+
46
+ # save output messages
47
+ if msg['header']['msg_type'] == 'stream':
48
+ outputs.append({"text": msg['content']['text']})
49
+
50
+ # handle error messages
51
+ elif msg['header']['msg_type'] == 'error':
52
+ outputs.append({"text": "\n".join(msg['content']['traceback'])})
53
+ success = False
54
+ error = msg["content"]["ename"]
55
+
56
+ # handle figures
57
+ elif msg['header']['msg_type'] == 'display_data':
58
+ msg["content"]["data"]["text"] = msg["content"]["data"].pop("text/plain")
59
+ outputs.append(msg["content"]["data"])
60
+
61
+ # if queue is empty we try again, unless the kernel is dead
62
+ except queue.Empty:
63
+ print("No message received (timeout)", )
64
+ if not kc.is_alive():
65
+ outputs.append({"text": "Kernel has died!"})
66
+ success = False
67
+ error = "KernelDied"
68
+ km.restart_kernel()
69
+ break
70
+
71
+ return jsonify({
72
+ 'result': outputs,
73
+ 'success': success,
74
+ 'error': error
75
+ })
76
+
77
+ @app.route('/restart', methods=['POST'])
78
+ def restart_kernel():
79
+ global kc
80
+ km.restart_kernel()
81
+ kc = km.client()
82
+ return jsonify(status="Kernel has been restarted."), 200
83
+
84
+ @app.route('/health', methods=['GET'])
85
+ def health_check():
86
+ return jsonify(status="healthy"), 200
87
+
88
+ if __name__ == '__main__':
89
+ app.run(host='0.0.0.0', port=5000)
jupyter/jupyter_server.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request, jsonify
2
+ import uuid
3
+ import time
4
+ import docker
5
+ import requests
6
+ import atexit
7
+ import socket
8
+ import argparse
9
+ import logging
10
+ from pydantic import BaseModel, Field, ValidationError
11
+
12
+
13
+ app = Flask(__name__)
14
+ app.logger.setLevel(logging.INFO)
15
+
16
+
17
+ # CLI function to parse arguments
18
+ def parse_args():
19
+ parser = argparse.ArgumentParser(description="Jupyter server.")
20
+ parser.add_argument('--n_instances', type=int, help="Number of Jupyter instances.")
21
+ parser.add_argument('--n_cpus', type=int, default=2, help="Number of CPUs per Jupyter instance.")
22
+ parser.add_argument('--mem', type=str, default="2g", help="Amount of memory per Jupyter instance.")
23
+ parser.add_argument('--execution_timeout', type=int, default=10, help="Timeout period for a code execution.")
24
+ parser.add_argument('--port', type=int, default=5001, help="Port of main server")
25
+ return parser.parse_args()
26
+
27
+
28
+ def get_unused_port(start=50000, end=65535, exclusion=[]):
29
+ for port in range(start, end + 1):
30
+ if port in exclusion:
31
+ continue
32
+ try:
33
+ sock = socket.socket()
34
+ sock.bind(("", port))
35
+ sock.listen(1)
36
+ sock.close()
37
+ return port
38
+ except OSError:
39
+ continue
40
+ raise IOError("No free ports available in range {}-{}".format(start, end))
41
+
42
+
43
+ def create_kernel_containers(n_instances, n_cpus=2, mem="2g", execution_timeout=10):
44
+
45
+ docker_client = docker.from_env()
46
+ app.logger.info("Buidling docker image...")
47
+ image, logs = docker_client.images.build(path='./', tag='jupyter-kernel:latest')
48
+ app.logger.info("Building docker image complete.")
49
+
50
+ containers = []
51
+ port_exclusion = []
52
+ for i in range(n_instances):
53
+
54
+ free_port = get_unused_port(exclusion=port_exclusion)
55
+ port_exclusion.append(free_port) # it takes a while to startup so we don't use the same port twice
56
+ app.logger.info(f"Starting container {i} on port {free_port}...")
57
+ container = docker_client.containers.run(
58
+ "jupyter-kernel:latest",
59
+ detach=True,
60
+ mem_limit=mem,
61
+ cpuset_cpus=f"{i*n_cpus}-{(i+1)*n_cpus-1}", # Limit to CPU cores 0 and 1
62
+ remove=True,
63
+ ports={'5000/tcp': free_port},
64
+ environment={"EXECUTION_TIMEOUT": execution_timeout},
65
+ )
66
+
67
+ containers.append({"container": container, "port": free_port})
68
+
69
+ start_time = time.time()
70
+
71
+ containers_ready = []
72
+
73
+ while len(containers_ready) < n_instances:
74
+ app.logger.info("Pinging Jupyter containers to check readiness.")
75
+ if time.time() - start_time > 60:
76
+ raise TimeoutError("Container took too long to startup.")
77
+ for i in range(n_instances):
78
+ if i in containers_ready:
79
+ continue
80
+ url = f"http://localhost:{containers[i]['port']}/health"
81
+ try:
82
+ # TODO: dedicated health endpoint
83
+ response = requests.get(url)
84
+ if response.status_code == 200:
85
+ containers_ready.append(i)
86
+ except Exception as e:
87
+ # Catch any other errors that might occur
88
+ pass
89
+ time.sleep(0.5)
90
+ app.logger.info("Containers ready!")
91
+ return containers
92
+
93
+ def shutdown_cleanup():
94
+ app.logger.info("Shutting down. Stopping and removing all containers...")
95
+ for instance in app.containers:
96
+ try:
97
+ instance['container'].stop()
98
+ instance['container'].remove()
99
+ except Exception as e:
100
+ app.logger.info(f"Error stopping/removing container: {str(e)}")
101
+ app.logger.info("All containers stopped and removed.")
102
+
103
+
104
+ class ServerRequest(BaseModel):
105
+ code: str = Field(..., example="print('Hello World!')")
106
+ instance_id: int = Field(0, example=0)
107
+ restart: bool = Field(False, example=False)
108
+
109
+
110
+ @app.route('/execute', methods=['POST'])
111
+ def execute_code():
112
+ try:
113
+ input = ServerRequest(**request.json)
114
+ except ValidationError as e:
115
+ return jsonify(e.errors()), 400
116
+
117
+
118
+ port = app.containers[input.instance_id]["port"]
119
+
120
+ app.logger.info(f"Received request for instance {input.instance_id} (port={port}).")
121
+
122
+ try:
123
+ if input.restart:
124
+ response = requests.post(f'http://localhost:{port}/restart', json={})
125
+ if response.status_code==200:
126
+ app.logger.info(f"Kernel for instance {input.instance_id} restarted.")
127
+ else:
128
+ app.logger.info(f"Error when restarting kernel of instance {input.instance_id}: {response.json()}.")
129
+
130
+ response = requests.post(f'http://localhost:{port}/execute', json={'code': input.code})
131
+ result = response.json()
132
+ return result
133
+
134
+ except Exception as e:
135
+ app.logger.info(f"Error in execute_code: {str(e)}")
136
+ return jsonify({
137
+ 'result': 'error',
138
+ 'output': str(e)
139
+ }), 500
140
+
141
+
142
+ atexit.register(shutdown_cleanup)
143
+
144
+ if __name__ == '__main__':
145
+ args = parse_args()
146
+ app.containers = create_kernel_containers(
147
+ args.n_instances,
148
+ n_cpus=args.n_cpus,
149
+ mem=args.mem,
150
+ execution_timeout=args.execution_timeout
151
+ )
152
+ # don't use debug=True --> it will run main twice and thus start double the containers
153
+ app.run(debug=False, host='0.0.0.0', port=args.port)
154
+
155
+
156
+ # TODO:
157
+ # how to mount data at runtime into the container? idea: mount a (read only)
158
+ # folder into the container at startup and copy the data in there. before starting
159
+ # the kernel we could cp the necessary data into the pwd.
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ flask
2
+ jupyter-client
3
+ pydantic
4
+ docker
5
+ uvicorn