Skip to content

Commit 2b08d83

Browse files
author
Sean Smith
authored
Release 2.5.0
Merge Release 2.5.0
2 parents fc3ffe9 + 362a621 commit 2b08d83

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+1893
-377
lines changed

.flake8

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,13 @@
11
[flake8]
2-
#ignore =
3-
# D105, # Missing docstring in magic method
4-
# D100, # Missing docstring in public module
5-
# D104, # Missing docstring in public package
6-
# D107, # Missing docstring in __init__
7-
# W503, # line break before binary operator => Conflicts with black style.
8-
# D103 Missing docstring in public function
9-
# E402 module level import not at top of file
10-
# D101 Missing docstring in public class
11-
# D102 Missing docstring in public method
12-
# D205 1 blank line required between summary line and description
13-
# D400 First line should end with a period
14-
# D401 First line should be in imperative mood
2+
ignore =
3+
D105, # Missing docstring in magic method
4+
D100, # Missing docstring in public module
5+
D101, # Missing docstring in public class
6+
D102, # Missing docstring in public method
7+
D103, # Missing docstring in public function
8+
D104, # Missing docstring in public package
9+
D107, # Missing docstring in __init__
10+
W503, # line break before binary operator => Conflicts with black style.
1511
exclude =
1612
.tox,
1713
.git,

CHANGELOG.md

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,40 @@ aws-parallelcluster-node CHANGELOG
33

44
This file is used to list changes made in each version of the aws-parallelcluster-node package.
55

6+
2.5.0
7+
-----
8+
9+
**ENHANCEMENTS**
10+
- Slurm:
11+
- Add support for scheduling with GPU options. Currently supports the following GPU-related options: `-G/--gpus,
12+
--gpus-per-task, --gpus-per-node, --gres=gpu, --cpus-per-gpu`.
13+
- Add gres.conf and slurm_parallelcluster_gres.conf in order to enable GPU options. slurm_parallelcluster_gres.conf
14+
is automatically generated by node daemon and contains GPU information from compute instances. If need to specify
15+
additional GRES options manually, please modify gres.conf and avoid changing slurm_parallelcluster_gres.conf when
16+
possible.
17+
- Integrated GPU requirements into scaling logic, cluster will scale automatically to satisfy GPU/CPU requirements
18+
for pending jobs. When submitting GPU jobs, CPU/node/task information is not required but preferred in order to
19+
avoid ambiguity. If only GPU requirements are specified, cluster will scale up to the minimum number of nodes
20+
required to satisfy all GPU requirements.
21+
- Slurm daemons will now keep running when cluster is stopped for better stability. However, it is not recommended
22+
to submit jobs when the cluster is stopped.
23+
- Change jobwatcher logic to consider both GPU and CPU when making scaling decision for slurm jobs. In general,
24+
cluster will scale up to the minimum number of nodes needed to satisfy all GPU/CPU requirements.
25+
- Reduce number of calls to ASG in nodewatcher to avoid throttling, especially at cluster scale-down.
26+
27+
**CHANGES**
28+
- Increase max number of SQS messages that can be processed by sqswatcher in a single batch from 50 to 200. This
29+
improves the scaling time especially with increased ASG launch rates.
30+
- Increase faulty node termination timeout from 1 minute to 5 in order to give some additional time to the scheduler
31+
to recover when under heavy load.
32+
33+
**BUG FIXES**
34+
- Fix jobwatcher behaviour that was marking nodes locked by the nodewatcher as busy even if they had been removed
35+
already from the ASG Desired count. This was causing, in rare circumstances, a cluster overscaling.
36+
- Better handling of errors occurred when adding/removing nodes from the scheduler config.
37+
- Fix bug that was causing failures in sqswatcher when ADD and REMOVE event for the same host are fetched together.
38+
39+
640
2.4.1
741
-----
842

setup.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Copyright 2013-2015 Amazon.com, Inc. or its affiliates. All Rights Reserved.
22
#
3-
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
4-
# License. A copy of the License is located at
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with
4+
# the License. A copy of the License is located at
55
#
66
# http://aws.amazon.com/apache2.0/
77
#
@@ -10,7 +10,6 @@
1010
# limitations under the License.
1111

1212
import os
13-
import sys
1413

1514
from setuptools import find_packages, setup
1615

@@ -29,7 +28,7 @@ def read(fname):
2928
"nodewatcher = nodewatcher.nodewatcher:main",
3029
"jobwatcher = jobwatcher.jobwatcher:main",
3130
]
32-
version = "2.4.1"
31+
version = "2.5.0"
3332
requires = ["requests>=2.21.0", "boto3>=1.7.55", "retrying>=1.3.3", "configparser>=3.7.4", "paramiko>=2.4.2"]
3433

3534
setup(

src/common/remote_command_executor.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818

1919
from retrying import retry
2020

21-
from common.time_utils import seconds
2221
from paramiko import AutoAddPolicy, SSHClient
2322

2423
RemoteCommandResult = collections.namedtuple("RemoteCommandResult", ["return_code", "stdout", "stderr"])
@@ -33,6 +32,8 @@ class RemoteCommandExecutionError(Exception):
3332
class RemoteCommandExecutor:
3433
"""Execute remote commands."""
3534

35+
DEFAULT_TIMEOUT = 10
36+
3637
def __init__(self, hostname, user, ssh_key_file=None):
3738
try:
3839
if not ssh_key_file:
@@ -53,17 +54,19 @@ def __del__(self):
5354
# Catch all exceptions if we fail to close the clients
5455
logging.warning("Exception raised when closing remote clients: {0}".format(e))
5556

56-
def run_remote_command(self, command, timeout=seconds(5), log_error=True, fail_on_error=True):
57+
def run_remote_command(self, command, timeout=DEFAULT_TIMEOUT, log_error=True, fail_on_error=True):
5758
"""
5859
Execute remote command on the configured host.
5960
6061
:param command: command to execute.
62+
:param timeout: timeout for command execution in sec
6163
:param log_error: log errors.
64+
:param fail_on_error: raise Exception on command execution failures
6265
:return: result of the execution.
6366
"""
6467
if isinstance(command, list):
6568
command = " ".join(command)
66-
logging.info("Executing remote command command on {0}: {1}".format(self.__user_at_hostname, command))
69+
logging.info("Executing remote command on {0}: {1}".format(self.__user_at_hostname, command))
6770
result = None
6871
try:
6972
stdin, stdout, stderr = self.__ssh_client.exec_command(command, get_pty=True)

src/common/schedulers/converters.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
def from_xml_to_obj(xml, obj_type):
1818
"""
19-
Maps a given xml document into a python object.
19+
Map a given xml document into a python object.
2020
2121
The python object you want to map the xml into needs to define a MAPPINGS dictionary which declare how
2222
to map each tag of the xml doc into the object itself.
@@ -55,7 +55,7 @@ def from_xml_to_obj(xml, obj_type):
5555

5656
def from_table_to_obj_list(table, obj_type, separator="|"):
5757
"""
58-
Maps a given tabular output into a python object.
58+
Map a given tabular output into a python object.
5959
6060
The python object you want to map the table into needs to define a MAPPINGS dictionary which declare how
6161
to map each row element into the object itself.

src/common/schedulers/sge_commands.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,10 @@
5656
# or some combination thereof.
5757
# Refer to qstat man page for additional details.
5858
# o(rphaned) is not considered as busy since we assume a node in orphaned state is not present in ASG anymore
59-
SGE_BUSY_STATES = ["u", "C", "s", "d", "D", "E", "P"]
59+
SGE_BUSY_STATES = ["u", "C", "s", "D", "E", "P"]
60+
61+
# This state is set by nodewatcher when the node is locked and is being terminated.
62+
SGE_DISABLED_STATE = "d"
6063

6164
# If an o(rphaned) state is displayed for a queue instance, it indicates that the queue instance is no longer demanded
6265
# by the current cluster queue configuration or the host group configuration. The queue instance is kept because jobs
@@ -133,10 +136,11 @@ def remove_hosts_from_queue(hosts):
133136
def install_sge_on_compute_nodes(hosts, cluster_user):
134137
"""Start sge on compute nodes in parallel."""
135138
command = (
136-
"sudo sh -c 'cd {0} && {0}/inst_sge -noremote -x -auto /opt/parallelcluster/templates/sge/sge_inst.conf'"
139+
"sudo sh -c 'ps aux | grep [s]ge_execd || "
140+
"(cd {0} && {0}/inst_sge -noremote -x -auto /opt/parallelcluster/templates/sge/sge_inst.conf)'"
137141
).format(sge.SGE_ROOT)
138142
hostnames = [host.hostname for host in hosts]
139-
result = RemoteCommandExecutor.run_remote_command_on_multiple_hosts(command, hostnames, cluster_user)
143+
result = RemoteCommandExecutor.run_remote_command_on_multiple_hosts(command, hostnames, cluster_user, timeout=20)
140144

141145
succeeded_hosts = []
142146
for host in hosts:
@@ -206,6 +210,7 @@ def get_jobs_info(hostname_filter=None, job_state_filter=None):
206210
def get_pending_jobs_info(max_slots_filter=None, skip_if_state=None):
207211
"""
208212
Retrieve the list of pending jobs.
213+
209214
:param max_slots_filter: discard jobs that require a number of slots bigger than the given value
210215
:param skip_if_state: discard jobs that are in the given state
211216
:return: the list of filtered pending jobs.

0 commit comments

Comments
 (0)