-
Notifications
You must be signed in to change notification settings - Fork 319
[Observability] Alarm on clustermgtd not running #7209
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
4df799d
76054aa
2edb607
d3884ea
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -64,6 +64,9 @@ | |
| CW_ALARM_PERIOD_DEFAULT, | ||
| CW_LOG_GROUP_NAME_PREFIX, | ||
| CW_LOGS_CFN_PARAM_NAME, | ||
| CW_METRICS_CLUSTERMGTD_HEARTBEAT, | ||
| CW_METRICS_DIMENSION_CLUSTER_NAME, | ||
| CW_METRICS_NAMESPACE, | ||
| DEFAULT_EPHEMERAL_DIR, | ||
| EFS_PORT, | ||
| FSX_PORTS, | ||
|
|
@@ -364,38 +367,64 @@ def _cw_metric_head_node( | |
| def _add_head_node_alarms(self): | ||
| self.head_node_alarms = [] | ||
|
|
||
| # Metric-specific configurations (only specify overrides from defaults) | ||
| metrics_for_alarms = { | ||
| "Health": self._cw_metric_head_node("AWS/EC2", "StatusCheckFailed"), | ||
| "Cpu": self._cw_metric_head_node("AWS/EC2", "CPUUtilization"), | ||
| "Mem": self._cw_metric_head_node("CWAgent", "mem_used_percent"), | ||
| "Disk": self._cw_metric_head_node("CWAgent", "disk_used_percent", extra_dimensions={"path": "/"}), | ||
| "Health": { | ||
| "metric": self._cw_metric_head_node("AWS/EC2", "StatusCheckFailed"), | ||
| "threshold": 0, | ||
| }, | ||
| "Cpu": { | ||
| "metric": self._cw_metric_head_node("AWS/EC2", "CPUUtilization"), | ||
| }, | ||
| "Mem": { | ||
| "metric": self._cw_metric_head_node("CWAgent", "mem_used_percent"), | ||
| }, | ||
| "Disk": { | ||
| "metric": self._cw_metric_head_node("CWAgent", "disk_used_percent", extra_dimensions={"path": "/"}), | ||
| }, | ||
| } | ||
|
|
||
| for metric_key, metric in metrics_for_alarms.items(): | ||
| if self._condition_is_slurm(): | ||
| metrics_for_alarms["ClustermgtdHeartbeat"] = { | ||
| "metric": self._cw_metric_head_node( | ||
| CW_METRICS_NAMESPACE, | ||
| CW_METRICS_CLUSTERMGTD_HEARTBEAT, | ||
| extra_dimensions={CW_METRICS_DIMENSION_CLUSTER_NAME: self.config.cluster_name}, | ||
| ), | ||
| "evaluation_periods": 10, | ||
| "datapoints_to_alarm": 10, | ||
| "comparison_operator": cloudwatch.ComparisonOperator.LESS_THAN_THRESHOLD, | ||
| "threshold": 1, | ||
| "treat_missing_data": cloudwatch.TreatMissingData.BREACHING, | ||
| } | ||
|
|
||
| for metric_key, alarm_config in metrics_for_alarms.items(): | ||
| alarm_id = f"HeadNode{metric_key}Alarm" | ||
| alarm_name = f"{self.stack.stack_name}-HeadNode-{metric_key}" | ||
| threshold = 0 if metric_key == "Health" else CW_ALARM_PERCENT_THRESHOLD_DEFAULT | ||
| self.head_node_alarms.append( | ||
| cloudwatch.Alarm( | ||
| scope=self.stack, | ||
| id=alarm_id, | ||
| alarm_name=alarm_name, | ||
| metric=metric, | ||
| evaluation_periods=CW_ALARM_EVALUATION_PERIODS_DEFAULT, | ||
| threshold=threshold, | ||
| comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD, | ||
| datapoints_to_alarm=CW_ALARM_DATAPOINTS_TO_ALARM_DEFAULT, | ||
| ) | ||
| ) | ||
|
|
||
| self.head_node_alarms.append( | ||
| cloudwatch.CompositeAlarm( | ||
| alarm = cloudwatch.Alarm( | ||
| scope=self.stack, | ||
| id="HeadNodeAlarm", | ||
| composite_alarm_name=f"{self.stack.stack_name}-HeadNode", | ||
| alarm_rule=cloudwatch.AlarmRule.any_of(*self.head_node_alarms), | ||
| id=alarm_id, | ||
| alarm_name=alarm_name, | ||
| metric=alarm_config["metric"], | ||
| evaluation_periods=alarm_config.get("evaluation_periods", CW_ALARM_EVALUATION_PERIODS_DEFAULT), | ||
| threshold=alarm_config.get("threshold", CW_ALARM_PERCENT_THRESHOLD_DEFAULT), | ||
| comparison_operator=alarm_config.get( | ||
| "comparison_operator", cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD | ||
| ), | ||
| datapoints_to_alarm=alarm_config.get("datapoints_to_alarm", CW_ALARM_DATAPOINTS_TO_ALARM_DEFAULT), | ||
| treat_missing_data=alarm_config.get("treat_missing_data", cloudwatch.TreatMissingData.MISSING), | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. NOTES FOR THE REVIEWER
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yup, I already checked the default. |
||
| ) | ||
| alarm.node.add_dependency(self.wait_condition) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. NOTES FOR THE REVIEWER
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I understand the Alarm dependency for the new ClustermgtdHeartbeat Alarm, but why add the dependency for all the alarms?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is a good point. My reasoning is:
So, unless there is real value in creating head node alarms before the head node completes its setup, there is no reason to have different behaviors. Do you agree with that?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, agreed! |
||
| self.head_node_alarms.append(alarm) | ||
|
|
||
| composite_alarm = cloudwatch.CompositeAlarm( | ||
| scope=self.stack, | ||
| id="HeadNodeAlarm", | ||
| composite_alarm_name=f"{self.stack.stack_name}-HeadNode", | ||
| alarm_rule=cloudwatch.AlarmRule.any_of(*self.head_node_alarms), | ||
| ) | ||
| composite_alarm.node.add_dependency(self.wait_condition) | ||
| self.head_node_alarms.append(composite_alarm) | ||
|
|
||
| def _add_iam_resources(self): | ||
| head_node_iam_resources = HeadNodeIamResources( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,20 @@ | ||
| Image: | ||
| Os: alinux2 | ||
| HeadNode: | ||
| InstanceType: t3.micro | ||
| Ssh: | ||
| KeyName: String | ||
| Networking: | ||
| SubnetId: subnet-12345678 | ||
| Scheduling: | ||
| Scheduler: awsbatch | ||
| AwsBatchQueues: | ||
| - Name: queue1 | ||
| Networking: | ||
| SubnetIds: | ||
| - subnet-12345678 | ||
| ComputeResources: | ||
| - Name: compute_resource1 | ||
| InstanceTypes: | ||
| - c4.xlarge | ||
| MaxvCpus: 10 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| Image: | ||
| Os: alinux2 | ||
| HeadNode: | ||
| InstanceType: String | ||
| Ssh: | ||
| KeyName: String | ||
| Networking: | ||
| SubnetId: subnet-12345678 | ||
| Iam: | ||
| S3Access: | ||
| - BucketName: testbucketpball | ||
| EnableWriteAccess: True | ||
| Scheduling: | ||
| Scheduler: slurm | ||
| SlurmQueues: | ||
| - Name: queue1 | ||
| ComputeResources: | ||
| - Name: compute_resource1 | ||
| InstanceType: t3.micro | ||
| MinCount: 1 | ||
| MaxCount: 5 | ||
| Networking: | ||
| SubnetIds: | ||
| - subnet-12345678 | ||
| Iam: | ||
| S3Access: | ||
| - BucketName: testbucketpball | ||
| EnableWriteAccess: True |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
NOTE FOR THE REVIEWER
The refactoring of how alarms are defined here was required because the clustermgtd heartbeat introduces an alarm that has different needs than the other alarms (threshold, missing-data handling, observation period); so the logic had to be changed to accommodate more flexibility.