
Commit 9dbff99

Merge pull request #153 from aws/develop
Merge Release 2.4.0
2 parents: 9fae22d + 9db8e2e, commit 9dbff99

Some content is hidden: large commits have some content hidden by default, so only part of the 66-file diff is shown below.

66 files changed: +3166 -785 lines

.flake8

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+[flake8]
+#ignore =
+    # D105, # Missing docstring in magic method
+    # D100, # Missing docstring in public module
+    # D104, # Missing docstring in public package
+    # D107, # Missing docstring in __init__
+    # W503, # line break before binary operator => Conflicts with black style.
+    # D103 Missing docstring in public function
+    # E402 module level import not at top of file
+    # D101 Missing docstring in public class
+    # D102 Missing docstring in public method
+    # D205 1 blank line required between summary line and description
+    # D400 First line should end with a period
+    # D401 First line should be in imperative mood
+exclude =
+    .tox,
+    .git,
+    .pytest_cache,
+    docs/source/conf.py,
+    build,
+    dist,
+    tests/fixtures/*,
+    *.pyc,
+    *.egg-info,
+    .cache,
+    .eggs
+max-complexity = 10
+max-line-length = 120
+import-order-style = google
+application-import-names = flake8
+format = ${cyan}%(path)s${reset}:${yellow_bold}%(row)d${reset}:${green_bold}%(col)d${reset}: ${red_bold}%(code)s${reset} %(text)s

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -8,3 +8,4 @@ build/
 *.egg-info/
 .idea/
 .DS_Store
+.tox/

.isort.cfg

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+[settings]
+line_length=120
+known_future_library=future
+known_third_party=boto3,botocore,awscli,tabulate,argparse,configparser,pytest,pytest,pytest-datadir,pytest-html,pytest-rerunfailures,pytest-xdist,argparse,retrying,junitparser,Jinja2
+# 3 - Vertical Hanging Indent
+# from third_party import (
+#     lib1,
+#     lib2,
+#     lib3,
+#     lib4,
+# )
+multi_line_output=3
+include_trailing_comma=true

.travis.yml

Lines changed: 9 additions & 6 deletions
@@ -8,12 +8,15 @@ python:
   - "3.6"
 
 install:
-  - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then xargs -L 1 pip install < requirements26.txt; fi
-  - if [[ $TRAVIS_PYTHON_VERSION != '2.6' ]]; then pip install -r requirements.txt; fi
-  - pip install -e .
+  - pip install tox-travis
 
 sudo: false
 
-script:
-  - sh tests/test.sh
-  - python jobwatcher/plugins/unittests.py
+matrix:
+  include:
+    - name: Code Checks
+      python: 3.6
+      stage: linters
+      env: TOXENV=code-linters
+
+script: tox

CHANGELOG.md

Lines changed: 33 additions & 0 deletions
@@ -3,6 +3,39 @@ aws-parallelcluster-node CHANGELOG
 
 This file is used to list changes made in each version of the aws-parallelcluster-node package.
 
+2.4.0
+-----
+
+**ENHANCEMENTS**
+- Dynamically fetch compute instance type and cluster size in order to support updates
+- SGE:
+  - process nodes added to or removed from the cluster in batches in order to speed up cluster scaling.
+  - scale up only if required slots/nodes can be satisfied
+  - scale down if pending jobs have unsatisfiable CPU/nodes requirements
+  - add support for jobs in hold/suspended state (this includes job dependencies)
+  - automatically terminate and replace faulty or unresponsive compute nodes
+  - add retries in case of failures when adding or removing nodes
+- Slurm:
+  - scale up only if required slots/nodes can be satisfied
+  - scale down if pending jobs have unsatisfiable CPU/nodes requirements
+  - automatically terminate and replace faulty or unresponsive compute nodes
+- Dump logs of replaced failing compute nodes to shared home directory
+
+**CHANGES**
+- SQS messages that fail to be processed are re-queued only 3 times and not forever
+- Reset idletime to 0 when the host becomes essential for the cluster (because of min size of ASG or because there are
+  pending jobs in the scheduler queue)
+- SGE: a node is considered as busy when in one of the following states "u", "C", "s", "d", "D", "E", "P", "o".
+  This allows a quick replacement of the node without waiting for the `nodewatcher` to terminate it.
+
+**BUG FIXES**
+- Slurm: add "BeginTime", "NodeDown", "Priority" and "ReqNodeNotAvail" to the pending reasons that trigger
+  a cluster scaling
+- Add a timeout on remote commands execution so that the daemons are not stuck if the compute node is unresponsive
+- Fix an edge case that was causing the `nodewatcher` to hang forever in case the node had become essential to the
+  cluster during a call to `self_terminate`.
+
+
 2.3.1
 -----

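The retry behaviour called out under **ENHANCEMENTS** ("add retries in case of failures when adding or removing nodes") is not shown in this hunk. Purely as an illustration of a bounded-retry pattern with the `retrying` library, which this release also uses in `common/remote_command_executor.py` below, a sketch could look as follows; `add_host_to_scheduler` and the `qconf` call are hypothetical placeholders, not the actual aws-parallelcluster-node code.

```python
# Illustrative sketch only: retry a flaky scheduler call a bounded number of times instead of forever.
import subprocess

from retrying import retry


@retry(stop_max_attempt_number=3, wait_fixed=5000)  # give up after 3 attempts, wait 5 s between them
def add_host_to_scheduler(hostname):
    # check_call raises CalledProcessError on a non-zero exit code, which triggers the retry
    subprocess.check_call(["qconf", "-ah", hostname])
```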
common/remote_command_executor.py

Lines changed: 130 additions & 0 deletions
@@ -0,0 +1,130 @@
+# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file.
+# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+import collections
+import logging
+import os
+import time
+from math import ceil
+from multiprocessing import Pool
+
+from retrying import retry
+
+from common.time_utils import seconds
+from paramiko import AutoAddPolicy, SSHClient
+
+RemoteCommandResult = collections.namedtuple("RemoteCommandResult", ["return_code", "stdout", "stderr"])
+
+
+class RemoteCommandExecutionError(Exception):
+    """Signal a failure in remote command execution."""
+
+    pass
+
+
+class RemoteCommandExecutor:
+    """Execute remote commands."""
+
+    def __init__(self, hostname, user, ssh_key_file=None):
+        try:
+            if not ssh_key_file:
+                ssh_key_file = os.path.expanduser("~" + user) + "/.ssh/id_rsa"
+            self.__ssh_client = SSHClient()
+            self.__ssh_client.load_system_host_keys()
+            self.__ssh_client.set_missing_host_key_policy(AutoAddPolicy())
+            self.__ssh_client.connect(hostname=hostname, username=user, key_filename=ssh_key_file)
+            self.__user_at_hostname = "{0}@{1}".format(user, hostname)
+        except Exception as e:
+            logging.error("Failed when connecting to host %s with error: %s", hostname, e)
+            raise
+
+    def __del__(self):
+        try:
+            self.__ssh_client.close()
+        except Exception as e:
+            # Catch all exceptions if we fail to close the clients
+            logging.warning("Exception raised when closing remote clients: {0}".format(e))
+
+    def run_remote_command(self, command, timeout=seconds(5), log_error=True, fail_on_error=True):
+        """
+        Execute remote command on the configured host.
+
+        :param command: command to execute.
+        :param log_error: log errors.
+        :return: result of the execution.
+        """
+        if isinstance(command, list):
+            command = " ".join(command)
+        logging.info("Executing remote command command on {0}: {1}".format(self.__user_at_hostname, command))
+        result = None
+        try:
+            stdin, stdout, stderr = self.__ssh_client.exec_command(command, get_pty=True)
+            self._wait_for_command_execution(timeout, stdout)
+            result = RemoteCommandResult(
+                return_code=stdout.channel.recv_exit_status(),
+                stdout="\n".join(stdout.read().decode().splitlines()),
+                stderr="\n".join(stderr.read().decode().splitlines()),
+            )
+            if result.return_code != 0 and fail_on_error:
+                raise RemoteCommandExecutionError(result)
+            return result
+        except Exception:
+            if log_error and result:
+                logging.error(
+                    "Command {0} failed with error:\n{1}\nand output:\n{2}".format(
+                        command, result.stderr, result.stdout
+                    )
+                )
+            raise
+
+    @staticmethod
+    def _wait_for_command_execution(timeout, stdout):
+        # Using the non-blocking exit_status_ready to avoid being stuck forever on recv_exit_status
+        # especially when a compute node is terminated during this operation
+        while timeout > 0 and not stdout.channel.exit_status_ready():
+            timeout = timeout - 1
+            time.sleep(1)
+        if not stdout.channel.exit_status_ready():
+            raise RemoteCommandExecutionError("Timeout occurred when executing remote command")
+
+    @staticmethod
+    def run_remote_command_on_multiple_hosts(
+        command, hostnames, user, ssh_key_file=None, parallelism=10, timeout=10, fail_on_error=True
+    ):
+        if not hostnames:
+            return {}
+
+        pool = Pool(parallelism)
+        try:
+            r = pool.map_async(
+                _pickable_run_remote_command,
+                [(hostname, command, user, ssh_key_file, timeout, fail_on_error) for hostname in hostnames],
+            )
+            # The pool timeout is computed by adding 2 times the command timeout for each batch of hosts that is
+            # processed in sequence. Where the size of a batch is given by the degree of parallelism.
+            results = r.get(timeout=int(ceil(len(hostnames) / float(parallelism)) * (2 * timeout)))
+        finally:
+            pool.terminate()
+
+        return dict(results)
+
+
+@retry(stop_max_attempt_number=2)
+def _pickable_run_remote_command(args):
+    """Pickable version of the run_command method that can be used by a pool."""
+    (hostname, command, user, ssh_key_file, timeout, fail_on_error) = args
+    try:
+        remote_command_executor = RemoteCommandExecutor(hostname, user, ssh_key_file)
+        remote_command_executor.run_remote_command(command, timeout, fail_on_error=fail_on_error)
+        return hostname, True
+    except Exception as e:
+        logging.error("Failed when executing remote command on node %s with error %s", hostname, e)
+        return hostname, False
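
A brief usage sketch of the new executor, not part of the commit: the hostnames, user and command below are hypothetical placeholders; in the real daemons these values would come from the cluster configuration and the scheduler modules.

```python
# Hypothetical usage of the RemoteCommandExecutor added above (placeholder hosts, user and command).
from common.remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor

# Single host: raises RemoteCommandExecutionError on a non-zero exit code or when the
# one-second polling loop in _wait_for_command_execution runs out of budget.
try:
    executor = RemoteCommandExecutor("ip-10-0-0-1", "ec2-user")
    result = executor.run_remote_command(["uptime"], timeout=30)
    print(result.return_code, result.stdout)
except RemoteCommandExecutionError as e:
    print("remote command failed: {0}".format(e))

# Many hosts in parallel: per-host failures are reported in the returned dict rather than raised.
# With 25 hosts, parallelism=10 and timeout=10, the pool timeout is ceil(25 / 10) * (2 * 10) = 60 seconds.
statuses = RemoteCommandExecutor.run_remote_command_on_multiple_hosts(
    "uptime", ["ip-10-0-0-1", "ip-10-0-0-2"], "ec2-user", parallelism=10, timeout=10
)
failed_hosts = [host for host, succeeded in statuses.items() if not succeeded]
```

Because `_pickable_run_remote_command` catches every per-host exception and returns a boolean, the multi-host call reports success or failure per host instead of raising.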

common/schedulers/__init__.py

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
+# with the License. A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
+# limitations under the License.
