From 47786933705ea79408e3011de6516c7ab570ca9e Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Thu, 18 Jul 2019 17:43:04 +0530 Subject: [PATCH 01/40] AUT-544-new: Jio like deployement flow. --- .../node => chaostoolkit_nimble}/__init__.py | 0 .../actions}/__init__.py | 0 chaostoolkit_nimble/actions/base/__init__.py | 0 .../actions/base/flows/__init__.py | 0 .../actions/base/flows/user_actions.py | 33 ++++ .../actions/sample/__init__.py | 0 .../sample/sample_application_actions.py | 10 ++ chaostoolkit_nimble/controllers/__init__.py | 0 .../controllers/extensions/__init__.py | 0 .../extensions/chaosk8s/__init__.py | 0 .../extensions/chaosk8s/control.py | 40 +++++ .../extensions/shell_app/__init__.py | 0 .../extensions/shell_app/control.py | 32 ++++ chaostoolkit_nimble/core/__init__.py | 0 chaostoolkit_nimble/core/configs/__init__.py | 0 .../core/configs/chaos_exp_config_parser.py | 150 ++++++++++++++++++ .../core/extensions/__init__.py | 0 .../core/extensions/chaosk8s}/__init__.py | 0 .../core/extensions/chaosk8s}/actions.py | 2 +- .../core/extensions/chaosk8s/node/__init__.py | 0 .../core/extensions/chaosk8s}/node/actions.py | 2 +- .../core/extensions/chaosk8s}/node/probes.py | 2 +- .../core/extensions/chaosk8s/pod/__init__.py | 0 .../core/extensions/chaosk8s}/pod/actions.py | 2 +- .../core/extensions/chaosk8s}/pod/probes.py | 2 +- .../core/extensions/chaosk8s}/probes.py | 4 +- chaostoolkit_nimble/core/utils/__init__.py | 0 .../core/utils/fabric_utils.py | 62 ++++++++ chaostoolkit_nimble/core/utils/ha_utils.py | 107 +++++++++++++ chaostoolkit_nimble/core/utils/shell_utils.py | 67 ++++++++ .../components/component_attributes.yml | 61 +++++++ .../component_attributes_cloudera.yml | 38 +++++ .../component_attributes_kerberos.yml | 64 ++++++++ .../exp_templates/experiment_template.json | 44 +++++ .../kubernetes_experiment_template.json | 73 +++++++++ .../process_experiment_template.json | 43 +++++ .../shell_app/shell_app_exp.json | 44 +++++ 
.../shell_app/shell_app_exp_template.json | 43 +++++ .../resources/security/krb5.conf | 26 +++ .../resources/testbeds/open_nebula_135_35.yml | 51 ++++++ .../resources/testbeds/open_nebula_135_52.yml | 52 ++++++ chaostoolkit_nimble/tests/__init__.py | 0 chaostoolkit_nimble/tests/conftest.py | 126 +++++++++++++++ .../tests}/fixtures/invalid-k8s.txt | 0 chaostoolkit_nimble/tests/sample/__init__.py | 0 .../tests/sample/test_shell_app_exp.py | 83 ++++++++++ .../tests}/test_actions.py | 4 +- .../tests}/test_client.py | 2 +- .../tests}/test_discovery.py | 2 +- .../tests}/test_pod.py | 6 +- .../tests}/test_probes.py | 4 +- journal.json | 104 ++++++++++++ logging.cfg | 28 ++++ pytest.ini | 6 +- 54 files changed, 1400 insertions(+), 19 deletions(-) rename {chaosk8s/node => chaostoolkit_nimble}/__init__.py (100%) rename {chaosk8s/pod => chaostoolkit_nimble/actions}/__init__.py (100%) create mode 100644 chaostoolkit_nimble/actions/base/__init__.py create mode 100644 chaostoolkit_nimble/actions/base/flows/__init__.py create mode 100644 chaostoolkit_nimble/actions/base/flows/user_actions.py create mode 100644 chaostoolkit_nimble/actions/sample/__init__.py create mode 100644 chaostoolkit_nimble/actions/sample/sample_application_actions.py create mode 100644 chaostoolkit_nimble/controllers/__init__.py create mode 100644 chaostoolkit_nimble/controllers/extensions/__init__.py create mode 100644 chaostoolkit_nimble/controllers/extensions/chaosk8s/__init__.py create mode 100644 chaostoolkit_nimble/controllers/extensions/chaosk8s/control.py create mode 100644 chaostoolkit_nimble/controllers/extensions/shell_app/__init__.py create mode 100644 chaostoolkit_nimble/controllers/extensions/shell_app/control.py create mode 100644 chaostoolkit_nimble/core/__init__.py create mode 100644 chaostoolkit_nimble/core/configs/__init__.py create mode 100644 chaostoolkit_nimble/core/configs/chaos_exp_config_parser.py create mode 100644 chaostoolkit_nimble/core/extensions/__init__.py rename {chaosk8s => 
chaostoolkit_nimble/core/extensions/chaosk8s}/__init__.py (100%) rename {chaosk8s => chaostoolkit_nimble/core/extensions/chaosk8s}/actions.py (98%) create mode 100644 chaostoolkit_nimble/core/extensions/chaosk8s/node/__init__.py rename {chaosk8s => chaostoolkit_nimble/core/extensions/chaosk8s}/node/actions.py (99%) rename {chaosk8s => chaostoolkit_nimble/core/extensions/chaosk8s}/node/probes.py (90%) create mode 100644 chaostoolkit_nimble/core/extensions/chaosk8s/pod/__init__.py rename {chaosk8s => chaostoolkit_nimble/core/extensions/chaosk8s}/pod/actions.py (98%) rename {chaosk8s => chaostoolkit_nimble/core/extensions/chaosk8s}/pod/probes.py (99%) rename {chaosk8s => chaostoolkit_nimble/core/extensions/chaosk8s}/probes.py (98%) create mode 100644 chaostoolkit_nimble/core/utils/__init__.py create mode 100644 chaostoolkit_nimble/core/utils/fabric_utils.py create mode 100644 chaostoolkit_nimble/core/utils/ha_utils.py create mode 100644 chaostoolkit_nimble/core/utils/shell_utils.py create mode 100644 chaostoolkit_nimble/resources/components/component_attributes.yml create mode 100644 chaostoolkit_nimble/resources/components/component_attributes_cloudera.yml create mode 100644 chaostoolkit_nimble/resources/components/component_attributes_kerberos.yml create mode 100644 chaostoolkit_nimble/resources/exp_templates/experiment_template.json create mode 100644 chaostoolkit_nimble/resources/exp_templates/kubernetes_experiment_template.json create mode 100644 chaostoolkit_nimble/resources/exp_templates/shell_app/process_experiment_template.json create mode 100644 chaostoolkit_nimble/resources/exp_templates/shell_app/shell_app_exp.json create mode 100644 chaostoolkit_nimble/resources/exp_templates/shell_app/shell_app_exp_template.json create mode 100644 chaostoolkit_nimble/resources/security/krb5.conf create mode 100644 chaostoolkit_nimble/resources/testbeds/open_nebula_135_35.yml create mode 100644 chaostoolkit_nimble/resources/testbeds/open_nebula_135_52.yml create mode 
100644 chaostoolkit_nimble/tests/__init__.py create mode 100644 chaostoolkit_nimble/tests/conftest.py rename {tests => chaostoolkit_nimble/tests}/fixtures/invalid-k8s.txt (100%) create mode 100644 chaostoolkit_nimble/tests/sample/__init__.py create mode 100644 chaostoolkit_nimble/tests/sample/test_shell_app_exp.py rename {tests => chaostoolkit_nimble/tests}/test_actions.py (98%) rename {tests => chaostoolkit_nimble/tests}/test_client.py (96%) rename {tests => chaostoolkit_nimble/tests}/test_discovery.py (80%) rename {tests => chaostoolkit_nimble/tests}/test_pod.py (98%) rename {tests => chaostoolkit_nimble/tests}/test_probes.py (98%) create mode 100644 journal.json create mode 100644 logging.cfg diff --git a/chaosk8s/node/__init__.py b/chaostoolkit_nimble/__init__.py similarity index 100% rename from chaosk8s/node/__init__.py rename to chaostoolkit_nimble/__init__.py diff --git a/chaosk8s/pod/__init__.py b/chaostoolkit_nimble/actions/__init__.py similarity index 100% rename from chaosk8s/pod/__init__.py rename to chaostoolkit_nimble/actions/__init__.py diff --git a/chaostoolkit_nimble/actions/base/__init__.py b/chaostoolkit_nimble/actions/base/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chaostoolkit_nimble/actions/base/flows/__init__.py b/chaostoolkit_nimble/actions/base/flows/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chaostoolkit_nimble/actions/base/flows/user_actions.py b/chaostoolkit_nimble/actions/base/flows/user_actions.py new file mode 100644 index 0000000..e248bf4 --- /dev/null +++ b/chaostoolkit_nimble/actions/base/flows/user_actions.py @@ -0,0 +1,33 @@ +from nimble.core import global_constants +# class UserActions(object): +# """Actions exposed to the user for data validation.""" +# +# def __init__(self, config_parser, node_obj=NodeManager.node_obj): +# """ +# +# :type config_parser: :class:`nimble.core.configs.validation_config_parser.ValidationConfigParser` +# :type node_obj: 
:class:`nimble.core.entity.nodes.Nodes` +# """ +# self._logger = logging.getLogger(__name__) +# self.node_obj = node_obj +# self.config_parser = config_parser +# self.file_server_utils = FileServerUtils() +from nimble.core.utils.dynamic_substitution_utils import DynamicSubstitutionUtils +from nimble.core.utils.shell_utils import ShellUtils +from nimble.tests.conftest import OPTIONS_DICT + + +def run_experiment(experiments_template_path=None): + experiments_base_path = "%s/tmp/experiments" % global_constants.DEFAULT_LOCAL_ARTIFACTS_PATH + ShellUtils.execute_shell_command(ShellUtils.remove_and_create_directory(experiments_base_path)) + if not experiments_template_path: + experiments_path = OPTIONS_DICT["experimentsPath"] + ShellUtils.execute_shell_command(ShellUtils.copy(experiments_path, experiments_base_path)) + else: + ShellUtils.execute_shell_command(ShellUtils.copy(experiments_template_path, experiments_base_path)) + + experiment_file_response = ShellUtils.execute_shell_command( + ShellUtils.find_files_in_directory(experiments_base_path)) + for experiment_file in experiment_file_response.stdout.strip().split("\n"): + DynamicSubstitutionUtils.update_file(experiment_file) + ShellUtils.execute_shell_command("chaos run %s" % experiment_file) diff --git a/chaostoolkit_nimble/actions/sample/__init__.py b/chaostoolkit_nimble/actions/sample/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chaostoolkit_nimble/actions/sample/sample_application_actions.py b/chaostoolkit_nimble/actions/sample/sample_application_actions.py new file mode 100644 index 0000000..589c512 --- /dev/null +++ b/chaostoolkit_nimble/actions/sample/sample_application_actions.py @@ -0,0 +1,10 @@ +from nimble.core.entity.node_manager import NodeManager +from nimble.core.utils.components.hadoop_utils import HadoopUtils + + +def launch_application(): + hadoop_utils = HadoopUtils() + master_namenode = hadoop_utils.master_namenode + command = "sleep 5m" + 
NodeManager.node_obj.execute_command_on_node(master_namenode, command) + a = 1 diff --git a/chaostoolkit_nimble/controllers/__init__.py b/chaostoolkit_nimble/controllers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chaostoolkit_nimble/controllers/extensions/__init__.py b/chaostoolkit_nimble/controllers/extensions/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chaostoolkit_nimble/controllers/extensions/chaosk8s/__init__.py b/chaostoolkit_nimble/controllers/extensions/chaosk8s/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chaostoolkit_nimble/controllers/extensions/chaosk8s/control.py b/chaostoolkit_nimble/controllers/extensions/chaosk8s/control.py new file mode 100644 index 0000000..e5177f0 --- /dev/null +++ b/chaostoolkit_nimble/controllers/extensions/chaosk8s/control.py @@ -0,0 +1,40 @@ +import time +from typing import List + +from chaoslib.types import Configuration, \ + Experiment, Run, Secrets, Activity + + +def after_activity_control(context: Activity, state: Run, + configuration: Configuration = None, + secrets: Secrets = None, **kwargs): + """ + after-control of the activity's execution + + Called by the Chaos Toolkit after the activity is applied. The result of + the execution is passed as `state`. See + https://docs.chaostoolkit.org/reference/api/journal/#run for more + information. + """ + print("----------------STATE AFTER ACTIVITY: %s" %state) + +def after_method_control(context: Experiment, state: List[Run], + configuration: Configuration = None, + secrets: Secrets = None, **kwargs): + """ + after-control of the method's execution + + Called by the Chaos Toolkit after the activities of the method have been + applied. The `state` is the list of activity results. See + https://docs.chaostoolkit.org/reference/api/journal/#run for more + information.
+ """ + print("----------------STATE AFTER METHOD: %s" % state) + for run in state: + activity_obj = run["activity"] + activity_name = activity_obj["name"] + run_status = run["status"] + if "terminate_gracefully_pod_" in activity_name and run_status == "succeeded": + time.sleep(60) + elif "read_new_spawned_logs_for_pod" in activity_name and run_status == "succeeded": + print(run["output"].keys()) diff --git a/chaostoolkit_nimble/controllers/extensions/shell_app/__init__.py b/chaostoolkit_nimble/controllers/extensions/shell_app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chaostoolkit_nimble/controllers/extensions/shell_app/control.py b/chaostoolkit_nimble/controllers/extensions/shell_app/control.py new file mode 100644 index 0000000..a3c4dc6 --- /dev/null +++ b/chaostoolkit_nimble/controllers/extensions/shell_app/control.py @@ -0,0 +1,32 @@ +import time +from typing import List + +from chaoslib.types import Configuration, \ + Experiment, Run, Secrets, Activity + + +def after_activity_control(context: Activity, state: Run, + configuration: Configuration = None, + secrets: Secrets = None, **kwargs): + """ + after-control of the activity's execution + + Called by the Chaos Toolkit after the activity is applied. The result of + the execution is passed as `state`. See + https://docs.chaostoolkit.org/reference/api/journal/#run for more + information. + """ + print("----------------STATE AFTER ACTIVITY: %s" %state) + +def after_method_control(context: Experiment, state: List[Run], + configuration: Configuration = None, + secrets: Secrets = None, **kwargs): + """ + after-control of the method's execution + + Called by the Chaos Toolkit after the activities of the method have been + applied. The `state` is the list of activity results. See + https://docs.chaostoolkit.org/reference/api/journal/#run for more + information.
+ """ + print("----------------STATE AFTER METHOD: %s" % state) \ No newline at end of file diff --git a/chaostoolkit_nimble/core/__init__.py b/chaostoolkit_nimble/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chaostoolkit_nimble/core/configs/__init__.py b/chaostoolkit_nimble/core/configs/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chaostoolkit_nimble/core/configs/chaos_exp_config_parser.py b/chaostoolkit_nimble/core/configs/chaos_exp_config_parser.py new file mode 100644 index 0000000..d55eef5 --- /dev/null +++ b/chaostoolkit_nimble/core/configs/chaos_exp_config_parser.py @@ -0,0 +1,150 @@ +import logging +import os + +from nimble.core import global_constants +from nimble.core.adapters.sqlite.sqlite_adapter import SqliteAdapter +from nimble.core.configs.base_yaml_parser import BaseYamlParser +from nimble.core.configs.scheduler_config_factory import SchedulerConfigFactory +from nimble.core.configs.source_config_factory import SourceConfigFactory +from nimble.core.entity.node_manager import NodeManager +from nimble.core.utils.string_utils import StringUtils + + +class ChaosExpConfigParser(BaseYamlParser): + """Methods to fetch the validation attributes from the validation YAML config.""" + + def __init__(self, config_path, node_obj=NodeManager.node_obj): + """ + + :param config_path: Path of the config file to be used. 
+ :type config_path: str + :type node_obj: :class:`nimble.core.entity.nodes.Nodes` + """ + self._logger = logging.getLogger(__name__) + super(ChaosExpConfigParser, self).__init__() + self.config_path = config_path + # self.config_obj = self.load_configs(config_path) + # self.project = self.get_defaults_from_config()["project"] + # self.build = self.get_defaults_from_config()["build"] + # self.customer = self.get_defaults_from_config()["customer"] + # self.stop_jobs_flag = self.get_defaults_from_config().get("stop_jobs", True) + # self.golden_build = self.get_attribute_or_default_or_pass(self.get_defaults_from_config(), "golden_build") + # self.mail_to = self.get_defaults_from_config()["mail_to"] + # self.output_file_name = "output.txt" + # self.ib_tmp_file = "ib_tmp_file.txt" + # self.input_tmp_file = "input_tmp_file.txt" + # self.project = self.get_defaults_from_config()["project"] + # self.build = self.get_defaults_from_config()["build"] + # self.customer = self.get_defaults_from_config()["customer"] + # self.base_http_path = "modules/%s/%s/%s" % (self.project, self.golden_build, self.customer) + # self.base_latest_http_path = "modules/%s/%s_latest/%s" % (self.project, self.build, self.customer) + # self.separator = ',' + # self.sqlite_file_path = "%s/validation_entities.db" % global_constants.DEFAULT_LOCAL_ARTIFACTS_PATH + # self.urlcat_delimiter = "^^" + # self.sqlite_adapter = SqliteAdapter(db_file=self.sqlite_file_path) + # node_obj.fetch_timestamp_from_server() + + # def get_job_schedule_configs(self, job_alias): + # return SchedulerConfigFactory.get(self.get_job_schedule_source(job_alias), self.config_obj).get_configs( + # self.get_job_schedule(job_alias)) + # + # def get_ibs(self): + # return self.get_attribute_or_default_or_pass(self.config_obj, "ibs") + # + # def get_ib_attributes(self, ib_alias): + # return self.get_attribute_or_pass(self.get_ibs(), ib_alias) + # + # def get_ib_source(self, ib_alias): + # return 
self.get_attribute_or_default_or_pass(self.get_ib_attributes(ib_alias), "source") + # + # def get_ib_source_configs(self, ib_alias): + # return SourceConfigFactory.get(self.get_ib_source(ib_alias), self.config_obj).get_configs( + # self.get_ib_attributes(ib_alias)) + # + # def get_input_source_configs(self, job_alias, input_alias): + # """Get the attributes for input source for a given job and input alias from the YAML config file. + # + # :param job_alias: Job alias for which the input source attributes are to be fetched. + # :type job_alias: str + # :param input_alias: Input alias for which the source attributes are to be fetched. + # :type input_alias: str + # :return: Ordered dictionary of input source attributes. + # :rtype: :class:`collections.OrderedDict` + # """ + # return SourceConfigFactory.get(self.get_input_source(job_alias, input_alias), self.config_obj).get_configs( + # self.get_input_attributes(job_alias, input_alias)) + # + # def get_job_actual_output_source_configs(self, job_alias, output_alias): + # """Get the attributes for actual output source for a given job and input alias from the YAML config file. + # + # :param job_alias: Job alias for which the actual output source attributes are to be fetched. + # :type job_alias: str + # :param output_alias: Output alias for which the actual output source attributes are to be fetched. + # :type output_alias: str + # :return: Ordered dictionary of actual output source attributes. + # :rtype: :class:`collections.OrderedDict` + # """ + # source = self.get_job_actual_output_source(job_alias, output_alias) + # return SourceConfigFactory.get(source, self.config_obj).get_configs( + # self.get_job_actual_output_attributes(job_alias, output_alias)) + # + # def get_urlcat_command_attributes(self, job_alias, input_alias): + # """Get the urlcat command attributes for the given job and input alias from the YAML config file. + # + # :param job_alias: Job alias for which the urlcat command attributes are to be fetched. 
+ # :type job_alias: str + # :param input_alias: Input alias for which the urlcat command attributes are to be fetched. + # :type input_alias: str + # :return: Ordered dictionary of `urlcat_command` attributes. + # :rtype: :class:`collections.OrderedDict` + # """ + # return self.get_attribute_or_default_or_pass(self.get_input_attributes(job_alias, input_alias), + # "urlcat_command") + # + # def get_urlcat_command_input_select_query(self, job_alias, input_alias): + # """Get the input select query for the given job and input alias from the YAML config file. + # + # This is the query given to fetch data from sqlite which will form the input for URLCat input. The URLcat command + # will operate on the fields selected through this query and generate an output accordingly. + # + # :param job_alias: Job alias for which the input select query is to be fetched. + # :type job_alias: str + # :param input_alias: Input alias for which the input select query is to be fetched. + # :type input_alias: str + # :return: Input select query. + # :rtype: str + # """ + # return self.get_attribute_or_default_or_pass(self.get_urlcat_command_attributes(job_alias, input_alias), + # "input_select_query") + # + # def get_urlcat_regression_ibs(self): + # """Get the url paths for all four `iv` and `ibstore` ibs that are supplied as job input parameters on jenkins. + # + # :return: Return all ib's url paths or None for each ib in case ib parameters are not supplied with the job. + # :rtype: tuple + # """ + # try: + # iv_url_expected = os.environ["iv_url_expected"] + # ibstore_url_expected = os.environ["ibstore_url_expected"] + # iv_url_actual = os.environ["iv_url_actual"] + # ibstore_url_actual = os.environ["ibstore_url_actual"] + # return (iv_url_expected, ibstore_url_expected, iv_url_actual, + # ibstore_url_actual) + # except KeyError: + # return None, None, None, None + # + # def get_urlcat_ib_versions(self): + # """Get the version number for expected as well as actual ibs. 
+ # + # :return: Return both the version numbers or None for each set of ibs in case ib parameters are not supplied + # with the job. + # :rtype: tuple + # """ + # try: + # ib_version_expected = \ + # StringUtils.none_safe_string(self.get_urlcat_regression_ibs()[0]).replace("//", "/").split("/")[4] + # ib_version_actual = \ + # StringUtils.none_safe_string(self.get_urlcat_regression_ibs()[2]).replace("//", "/").split("/")[4] + # return (ib_version_expected, ib_version_actual) + # except IndexError: + # return None, None diff --git a/chaostoolkit_nimble/core/extensions/__init__.py b/chaostoolkit_nimble/core/extensions/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chaosk8s/__init__.py b/chaostoolkit_nimble/core/extensions/chaosk8s/__init__.py similarity index 100% rename from chaosk8s/__init__.py rename to chaostoolkit_nimble/core/extensions/chaosk8s/__init__.py diff --git a/chaosk8s/actions.py b/chaostoolkit_nimble/core/extensions/chaosk8s/actions.py similarity index 98% rename from chaosk8s/actions.py rename to chaostoolkit_nimble/core/extensions/chaosk8s/actions.py index 686fd65..c402ca3 100644 --- a/chaosk8s/actions.py +++ b/chaostoolkit_nimble/core/extensions/chaosk8s/actions.py @@ -9,7 +9,7 @@ from kubernetes.client.rest import ApiException from logzero import logger -from chaosk8s import create_k8s_api_client +from chaostoolkit_nimble.core.extensions import create_k8s_api_client __all__ = ["start_microservice", "kill_microservice", "scale_microservice", "remove_service_endpoint"] diff --git a/chaostoolkit_nimble/core/extensions/chaosk8s/node/__init__.py b/chaostoolkit_nimble/core/extensions/chaosk8s/node/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chaosk8s/node/actions.py b/chaostoolkit_nimble/core/extensions/chaosk8s/node/actions.py similarity index 99% rename from chaosk8s/node/actions.py rename to chaostoolkit_nimble/core/extensions/chaosk8s/node/actions.py index e92a66f..b8cd918 100644 --- 
a/chaosk8s/node/actions.py +++ b/chaostoolkit_nimble/core/extensions/chaosk8s/node/actions.py @@ -13,7 +13,7 @@ from kubernetes.client.rest import ApiException from logzero import logger -from chaosk8s import create_k8s_api_client +from chaostoolkit_nimble.core.extensions import create_k8s_api_client __all__ = ["create_node", "delete_nodes", "cordon_node", "drain_nodes", "uncordon_node"] diff --git a/chaosk8s/node/probes.py b/chaostoolkit_nimble/core/extensions/chaosk8s/node/probes.py similarity index 90% rename from chaosk8s/node/probes.py rename to chaostoolkit_nimble/core/extensions/chaosk8s/node/probes.py index ffc0984..7f2aeef 100644 --- a/chaosk8s/node/probes.py +++ b/chaostoolkit_nimble/core/extensions/chaosk8s/node/probes.py @@ -4,7 +4,7 @@ from chaoslib.types import Configuration, Secrets from kubernetes import client -from chaosk8s import create_k8s_api_client +from chaostoolkit_nimble.core.extensions import create_k8s_api_client __all__ = ["get_nodes"] diff --git a/chaostoolkit_nimble/core/extensions/chaosk8s/pod/__init__.py b/chaostoolkit_nimble/core/extensions/chaosk8s/pod/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chaosk8s/pod/actions.py b/chaostoolkit_nimble/core/extensions/chaosk8s/pod/actions.py similarity index 98% rename from chaosk8s/pod/actions.py rename to chaostoolkit_nimble/core/extensions/chaosk8s/pod/actions.py index 3f78e2f..d01b633 100644 --- a/chaosk8s/pod/actions.py +++ b/chaostoolkit_nimble/core/extensions/chaosk8s/pod/actions.py @@ -8,7 +8,7 @@ from kubernetes import client from logzero import logger -from chaosk8s import create_k8s_api_client +from chaostoolkit_nimble.core.extensions import create_k8s_api_client __all__ = ["terminate_pods"] diff --git a/chaosk8s/pod/probes.py b/chaostoolkit_nimble/core/extensions/chaosk8s/pod/probes.py similarity index 99% rename from chaosk8s/pod/probes.py rename to chaostoolkit_nimble/core/extensions/chaosk8s/pod/probes.py index 7e3e245..6b420fa 100644 --- 
a/chaosk8s/pod/probes.py +++ b/chaostoolkit_nimble/core/extensions/chaosk8s/pod/probes.py @@ -8,7 +8,7 @@ from kubernetes import client from logzero import logger -from chaosk8s import create_k8s_api_client +from chaostoolkit_nimble.core.extensions import create_k8s_api_client __all__ = [ "pods_in_phase", diff --git a/chaosk8s/probes.py b/chaostoolkit_nimble/core/extensions/chaosk8s/probes.py similarity index 98% rename from chaosk8s/probes.py rename to chaostoolkit_nimble/core/extensions/chaosk8s/probes.py index dda82f2..1aff508 100644 --- a/chaosk8s/probes.py +++ b/chaostoolkit_nimble/core/extensions/chaosk8s/probes.py @@ -7,8 +7,8 @@ from kubernetes import client, watch from logzero import logger -from chaosk8s import create_k8s_api_client -from chaosk8s.pod.probes import read_pod_logs +from chaostoolkit_nimble.core.extensions import create_k8s_api_client +from chaostoolkit_nimble.core.extensions.chaosk8s.pod import read_pod_logs __all__ = ["all_microservices_healthy", "microservice_available_and_healthy", "microservice_is_not_available", "service_endpoint_is_initialized", diff --git a/chaostoolkit_nimble/core/utils/__init__.py b/chaostoolkit_nimble/core/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chaostoolkit_nimble/core/utils/fabric_utils.py b/chaostoolkit_nimble/core/utils/fabric_utils.py new file mode 100644 index 0000000..7702a1b --- /dev/null +++ b/chaostoolkit_nimble/core/utils/fabric_utils.py @@ -0,0 +1,62 @@ +import logging + +# from nimble.core import global_constants +# from nimble.core.entity.guavus_response import GuavusResponse +# from nimble.core.utils.fabric_utils import FabricUtils +from fabric import operations +from fabric.context_managers import hide + +_LOGGER = logging.getLogger(__name__) + + + +def run_command_on_remote(command, ip, username, password, command_timeout=None, + connection_timeout=120): + """Run a shell command on a remote server. 
+ + :param command: Command that is to be executed on the shell of the remote server. + :type command: str + :param ip: Remote server ip on which command is to be fired. + :type ip: str + :param username: Username to be used for login on the remote server. + :type username: str + :param password: Password to be used for login on the remote server. + :type password: str + :param command_timeout: Time(in seconds) to wait for the given `command` to get executed, after which + the `CommandTimeout` exception will be raised. + :type command_timeout: int + :param connection_timeout: Time(in seconds) to wait for the connection to get established with the remote server, + after which the `ConnectionTimeout` exception will be raised. + :type connection_timeout: int + :return: Return the result of the command being executed on the remote server. + :rtype: :class:`operations._AttributeString` + """ + set_fabric_environment(ip, username, password, connection_timeout=connection_timeout) + with hide("output"): # pylint: disable=not-context-manager + ############ return operations.run(command, timeout=command_timeout) + return operations.run(command, timeout=command_timeout).stdout + +def set_fabric_environment(ip, username, password, sudo_password=None, + connection_timeout=120): + """Set the basic fabric environment variables upon which the other fabric utilities will be operated. + + :param ip: Remote server ip on which the action is to be performed. + :type ip: str + :param username: Username to be used for login on the remote server. + :type username: str + :param password: Password to be used for login on the remote server. + :type password: str + :param sudo_password: Sudo password to be used with the command on the remote server. + :type sudo_password: str + :param connection_timeout: Time(in seconds) to wait for the connection to get established with the remote server, + after which the `ConnectionTimeout` exception will be raised.
+ :type connection_timeout: int + """ + operations.env.host_string = ip + operations.env.user = username + operations.env.password = password + operations.env.sudo_password = sudo_password + operations.env.warn_only = True + operations.env.abort_on_prompts = True + operations.env.disable_known_hosts = True + operations.env.timeout = connection_timeout diff --git a/chaostoolkit_nimble/core/utils/ha_utils.py b/chaostoolkit_nimble/core/utils/ha_utils.py new file mode 100644 index 0000000..c40aaa1 --- /dev/null +++ b/chaostoolkit_nimble/core/utils/ha_utils.py @@ -0,0 +1,107 @@ +import logging +import random + +from nimble.core.entity.components import Components +from nimble.core.entity.node_manager import NodeManager +from nimble.core.utils.shell_utils import ShellUtils +from retrying import retry + +_LOGGER = logging.getLogger(__name__) + + +def _check_process(result): + process_id = result[0] + process_id_after_kill_process = result[1] + return process_id == process_id_after_kill_process or process_id_after_kill_process == '' + + +def fetch_process_id(component, process_name=None): + """Fetch the process id for any particular component. + + :param component: Name of the component. + :type component: str + :return: List of :class:`nimble.core.entity.guavus_response.GuavusResponse` objects. 
+ :rtype: list + """ + if process_name: + command = ShellUtils.fetch_process_id(process_name) + else: + process_name = Components.get_process_name(component) + command = ShellUtils.fetch_process_id(process_name) + _LOGGER.info("Fetching process id for process '%s' from component: %s" % (process_name, component)) + response_list = NodeManager.node_obj.execute_command_on_component(component, command, + consolidated_success_flag=False) + return response_list + + +def check_process_running(component, process_name=None): + if not process_name: + process_name = Components.get_process_name(component) + _LOGGER.info("Checking if process '%s' is running by verifying its process id" % process_name) + return NodeManager.node_obj.execute_command_on_component(component, ShellUtils.fetch_process_id(process_name), + consolidated_success_flag=True) + + +def kill_process(process_name, component, num_of_nodes=None): + """Kill the process of any particular component + + :param process_name: Name of the process + :type process_name: str + :param component: Name of the component + :type component: str + """ + node_aliases = [] + for node in NodeManager.node_obj.nodes_by_type[component]: + node_aliases.append(node.name) + if num_of_nodes: + node_aliases = random.sample(node_aliases, num_of_nodes) + command = ShellUtils.kill_process_by_name(process_name) + response_list = [] + for node_alias in node_aliases: + response_list.append( + NodeManager.node_obj.execute_command_on_node(node_alias, command, consolidated_success_flag=False)) + return response_list + + +# def process_ha(component, process_name=None): +# """This function is used to do the process HA of the components at the remote server. +# +# :return: Return object of :class:`nimble.core.entity.guavus_response.GuavusResponse`. 
+# :rtype: :class:`nimble.core.entity.guavus_response.GuavusResponse` +# """ +# failure_reason = "" +# kill_process(component, process_name) +# +# try: +# check_process(process_id, component) +# status_code = 0 +# except RetryError: +# status_code = global_constants.DEFAULT_ERROR_CODE +# guavus_response_after_kill = fetch_process_id(component) +# process_id_after_kill = guavus_response_after_kill[0].stdout +# if process_id == process_id_after_kill: +# failure_reason = "Process is not killed for component %s" % component +# elif process_id_after_kill == "": +# failure_reason = "Process is not UP and running after killed for component: %s" % component +# +# guavus_response[0].status_code = status_code +# guavus_response[0].healthcheck_response.failure_reason.append(failure_reason) +# +# return guavus_response + + +@retry(wait_fixed=3000, stop_max_delay=300000, retry_on_result=_check_process) +def check_process(process_id, component): + """ This function is used to check the process is up or not. + + :param process_id: It is the process id before killing the process. + :type process_id: int + :param component: Name of the component. + :type component: str + :return: Return the process id of the process before kill and after kill. 
+ :rtype: tuple + """ + _LOGGER.info("Running process check for process '%s' on component: %s" % (Components.get_process_name(component), component)) + guavus_response_after_kill = fetch_process_id(component) + process_id_after_kill_process = guavus_response_after_kill[0].stdout + return process_id, process_id_after_kill_process diff --git a/chaostoolkit_nimble/core/utils/shell_utils.py b/chaostoolkit_nimble/core/utils/shell_utils.py new file mode 100644 index 0000000..b2224b0 --- /dev/null +++ b/chaostoolkit_nimble/core/utils/shell_utils.py @@ -0,0 +1,67 @@ +import logging +import subprocess + +from nimble.core import global_constants + +_LOGGER = logging.getLogger(__name__) + + +class ShellUtils(object): + """Utilities related to linux shell.""" + + def __init__(self, username=global_constants.DEFAULT_SERVER_USERNAME, + password=global_constants.DEFAULT_SERVER_PASSWORD): + """ + + :param username: Username to be used for login on the remote server. Defaults to `root`. + :type username: str + :param password: Password to be used for login on the remote server. Defaults to `root@123`. + :type password: str + """ + self._username = username + self._password = password + + @staticmethod + def log_guavus_response(guavus_response, log_response=True): + """Log the `stdout`, `stderr` and/or `status_code` in `execution.log` from the given `guavus_response`. + + :type guavus_response: :class:`nimble.core.entity.GuavusResponse` + :param log_response: If True, the `stdout`, `stderr` and `status_code` will be logged from the `guavus_response` + else only the `status_code` will be logged. Defaults to `True`. 
+ :type log_response: bool + """ + if log_response: + if guavus_response.status_code == 0: + _LOGGER.info("stdout: %s\nstderr: %s\nstatus code: %s" % ( + guavus_response.stdout, guavus_response.stderr, guavus_response.status_code)) + else: + _LOGGER.error("stdout: %s\nstderr: %s\nstatus code: %s" % ( + guavus_response.stdout, guavus_response.stderr, guavus_response.status_code)) + else: + if guavus_response.status_code == 0: + _LOGGER.info("status code: %s" % guavus_response.status_code) + else: + _LOGGER.error("status code: %s" % guavus_response.status_code) + + @staticmethod + def execute_shell_command(command, log_response=True): + """Execute the given `command` on shell. + + :param command: Command that is to be executed on shell. + :type command: str + :param log_response: If True, the `stdout`, `stderr` and `status_code` will be logged from the `guavus_response` + else only the `status_code` will be logged in `execution.log`. Defaults to `True`. + :type log_response: bool + :return: Return response of the given shell `command`. 
+ :rtype: :class:`nimble.core.entity.GuavusResponse` + """ + _LOGGER.info("Executing command: %s" % command) + ######### subprocess_obj = subprocess.Popen(command, stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True) + subprocess_obj = subprocess.Popen(command, stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True, + encoding="utf-8") + response_obj = subprocess_obj.communicate() + ######################## + # guavus_response = GuavusResponse(response_obj[0].strip(), response_obj[1], subprocess_obj.returncode) + # ShellUtils.log_guavus_response(guavus_response, log_response=log_response) + # return guavus_response + return response_obj diff --git a/chaostoolkit_nimble/resources/components/component_attributes.yml b/chaostoolkit_nimble/resources/components/component_attributes.yml new file mode 100644 index 0000000..057afed --- /dev/null +++ b/chaostoolkit_nimble/resources/components/component_attributes.yml @@ -0,0 +1,61 @@ +defaults: + ui_username: admin + ui_password: Sbe7UGkRBMF8 +components: + HDFS: + linux_user: hdfs + ui_port: 50070 + AMBARI: + ui_port: 8080 + ATLAS: + ui_port: 21000 + CDAP: + ui_port: 11011 + application_port: 11015 + ELASTICSEARCH: + mode: private + ui_port: 9201 + HAPROXY: + ui_port: 5070 + ui_username: haproxy + ui_password: "!H4pr0xy$" + HBASE: + linux_user: hbase + HIVE: + linux_user: hive + application_port: 10000 + SPARK: + linux_user: hdfs + ui_port: 18081 + ZOOKEEPER: + linux_user: zookeeper + application_port: 2181 + SCHEMA_REGISTRY: + ui_port: 7788 + REDIS: + ui_port: 6379,6380 + NIFI: + ui_port: 9090 + OOZIE: + linux_user: oozie + ui_port: 11000 + PSQL: + linux_user: postgres + db_username: postgres + db_password: postgres + application_port: 5432 + KAFKA: + linux_user: kafka + application_port: 6667 + RANGER: + ui_port: 6080 + JANUSGRAPH: + linux_user: janusgraph + application_port: 8182 + PROMETHEUS: + ui_port: 9095 + GRAFANA: + ui_port: 3000 + HADOOP_YARN: + ui_port: 8088 + diff --git 
a/chaostoolkit_nimble/resources/components/component_attributes_cloudera.yml b/chaostoolkit_nimble/resources/components/component_attributes_cloudera.yml new file mode 100644 index 0000000..ede7145 --- /dev/null +++ b/chaostoolkit_nimble/resources/components/component_attributes_cloudera.yml @@ -0,0 +1,38 @@ +defaults: + linux_user: root +components: + HDFS: + ui_port: 50070 + HIVE: + application_port: 10000 + HBASE: + PSQL: + application_port: 5432 + db_username: postgres + db_password: postgres + AZKABAN: + ui_port: 8507 + ui_username: azkaban + ui_password: "!4zk4b4n$" + ZOOKEEPER: + ui_port: 2181 + KAFKA_CONNECT: + ui_port: 8083 + ELASTICSEARCH: + ui_port: 6775 + HAPROXY: + ui_port: 5070 + ui_username: haproxy + ui_password: "!H4pr0xy$" + SPARK: + ui_port: 18080 + SCHEMA_REGISTRY: + ui_port: 8081 + REDIS: + ui_port: 6379,6380 + IMPALA: + linux_user: impala + db_username: impala + application_port: 21050 + KAFKA: + application_port: 9092 diff --git a/chaostoolkit_nimble/resources/components/component_attributes_kerberos.yml b/chaostoolkit_nimble/resources/components/component_attributes_kerberos.yml new file mode 100644 index 0000000..85bce81 --- /dev/null +++ b/chaostoolkit_nimble/resources/components/component_attributes_kerberos.yml @@ -0,0 +1,64 @@ +defaults: + ui_username: admin + ui_password: Sbe7UGkRBMF8 + keytab_file_path: /etc/security/keytabs/smokeuser.headless.keytab +components: + HDFS: + linux_user: hdfs + ui_port: 50070 + AMBARI: + ui_port: 8080 + ATLAS: + ui_port: 21000 + CDAP: + ui_port: 11011 + application_port: 11015 + ELASTICSEARCH: + mode: vip + ui_port: 9201 + ui_username: elasticsearch + ui_password: elasticsearch@123 + HAPROXY: + ui_port: 5070 + ui_username: haproxy + ui_password: "!H4pr0xy$" + HBASE: + linux_user: hbase + HIVE: + linux_user: hive + application_port: 10000 + SPARK: + linux_user: hdfs + ui_port: 18081 + ZOOKEEPER: + linux_user: zookeeper + application_port: 2181 + SCHEMA_REGISTRY: + ui_port: 7788 + REDIS: + ui_port: 
6379,6380 + NIFI: + ui_port: 9090 + OOZIE: + linux_user: oozie + ui_port: 11000 + PSQL: + linux_user: postgres + db_username: postgres + db_password: postgres + application_port: 5432 + KAFKA: + linux_user: kafka + application_port: 6667 + RANGER: + ui_port: 6080 + JANUSGRAPH: + linux_user: janusgraph + application_port: 8182 + PROMETHEUS: + ui_port: 9095 + GRAFANA: + ui_port: 3000 + HADOOP_YARN: + ui_port: 8088 + diff --git a/chaostoolkit_nimble/resources/exp_templates/experiment_template.json b/chaostoolkit_nimble/resources/exp_templates/experiment_template.json new file mode 100644 index 0000000..5d73d30 --- /dev/null +++ b/chaostoolkit_nimble/resources/exp_templates/experiment_template.json @@ -0,0 +1,44 @@ +{ + "version": "1.0.0", + "title": "What is the impact on an ES service when the ES running pod goes down?", + "description": "If ES running pod goes down, the ES service should still be functioning properly.", + "tags": ["kubernetes"], + "controls": [{ + "name": "my-controls", + "provider": { + "type": "python", + "module": "custom.controllers.control" + } + }], + "steady-state-hypothesis": { + "title": "Application responds", + "probes": [{ + "type": "probe", + "name": "test_run_remote_python_cmd", + "tolerance": "hi\n", + "provider": { + "module": "custom.fabric_utils", + "type": "python", + "func": "run_command_on_remote", + "arguments": { + "command": "{{my_var}}", + "ip": "192.168.135.35", + "username":"root", + "password":"guavus@123", + "command_timeout":120, + "connection_timeout": 120 + } + } + }] + }, + "method": [{ + "type": "action", + "name": "test_method", + "provider": { + "module": "chaosk8s.pod.actions", + "type": "process", + "path": "ls", + "arguments": "/tmp" + } + }] +} diff --git a/chaostoolkit_nimble/resources/exp_templates/kubernetes_experiment_template.json b/chaostoolkit_nimble/resources/exp_templates/kubernetes_experiment_template.json new file mode 100644 index 0000000..bf1df42 --- /dev/null +++ 
b/chaostoolkit_nimble/resources/exp_templates/kubernetes_experiment_template.json @@ -0,0 +1,73 @@ +{ + "version": "1.0.0", + "title": "Does kubernetes handle pod {{pod_name}} failover?", + "description": "If the pod {{pod_name}} goes down, then kubenetes should handle its restart garcefully and the service it is running should get restored successfully.", + "tags": ["kubernetes"], + "controls": [{ + "name": "my-controls", + "provider": { + "type": "python", + "module": "mypackage.custom.controllers.control" + } + }], + "steady-state-hypothesis": { + "title": "Pod {{pod_name}} and its service {{service_name}} is up and running", + "probes": [{ + "type": "probe", + "name": "{{pod_name}}_pod_is_running", + "tolerance": true, + "provider": { + "module": "chaostoolkit_nimble.chaosk8s.pod.probes", + "type": "python", + "func": "pods_in_phase", + "arguments": { + "ns": "{{pod_ns_var}}", + "phase" : "Running", + "label_selector" : "{{pod_label_selector_var}}" + } + } + }, + { + "type": "probe", + "name": "{{service_name}}_service_exists", + "tolerance": true, + "provider": { + "module": "chaostoolkit_nimble.chaosk8s.probes", + "type": "python", + "func": "service_endpoint_is_initialized", + "arguments": { + "name": "{{service_name}}", + "ns": "{{service_ns_var}}", + "label_selector" : "{{service_label_selector_var}}" + } + } + }] + }, + "method": [{ + "type": "action", + "name": "terminate_gracefully_pod_{{pod_name}}", + "provider": { + "module": "chaostoolkit_nimble.chaosk8s.pod.actions", + "type": "python", + "func": "terminate_pods", + "arguments": { + "name_pattern": "{{pod_name}}*", + "ns": "{{pod_ns_var}}", + "grace_period": 5 + } + } + }, + { + "type": "probe", + "name": "read_new_spawned_logs_for_pod_{{pod_name}}", + "provider": { + "module": "chaostoolkit_nimble.chaosk8s.pod.probes", + "type": "python", + "func": "read_pod_logs", + "arguments": { + "ns": "{{pod_ns_var}}", + "label_selector" : "{{pod_label_selector_var}}" + } + } + }] +} diff --git 
a/chaostoolkit_nimble/resources/exp_templates/shell_app/process_experiment_template.json b/chaostoolkit_nimble/resources/exp_templates/shell_app/process_experiment_template.json new file mode 100644 index 0000000..dd265cb --- /dev/null +++ b/chaostoolkit_nimble/resources/exp_templates/shell_app/process_experiment_template.json @@ -0,0 +1,43 @@ +{ + "version": "1.0.0", + "title": "Experiment running shell commands on local and remote", + "description": "Shell command should get successfully executed on remote using python provider through fabric and on local it should run through process provider", + "tags": ["kubernetes"], + "controls": [{ + "name": "my-controls", + "provider": { + "type": "python", + "module": "mypackage.custom.controllers.control" + } + }], + "steady-state-hypothesis": { + "title": "Shell command runs on remote server", + "probes": [{ + "type": "probe", + "name": "run_shell_cmd_on_remote", + "tolerance": "{{expected_remote_command_output}}", + "provider": { + "module": "chaostoolkit_nimble.custom.fabric_utils", + "type": "python", + "func": "run_command_on_remote", + "arguments": { + "command": "{{remote_command_var}}", + "ip": "{{remote_ip_var}}", + "username":"{{remote_username_var}}", + "password":"{{remote_password_var}}", + "command_timeout": 120, + "connection_timeout": 120 + } + } + }] + }, + "method": [{ + "type": "action", + "name": "run_shell_cmd_on_local", + "provider": { + "type": "process", + "path": "{{local_command_var}}", + "arguments": "{{local_command_arguments}}" + } + }] +} diff --git a/chaostoolkit_nimble/resources/exp_templates/shell_app/shell_app_exp.json b/chaostoolkit_nimble/resources/exp_templates/shell_app/shell_app_exp.json new file mode 100644 index 0000000..873157e --- /dev/null +++ b/chaostoolkit_nimble/resources/exp_templates/shell_app/shell_app_exp.json @@ -0,0 +1,44 @@ +{ + "version": "1.0.0", + "title": "Experiment running shell commands on local and remote", + "description": "Shell command should get 
successfully executed on remote using python provider through fabric and on local it should run through process provider", + "tags": ["kubernetes"], + "controls": [{ + "name": "shell-app-controls", + "provider": { + "type": "python", + "module": "chaostoolkit_nimble.controllers.extensions.shell_app.control" + } + }], + "steady-state-hypothesis": { + "title": "Shell application is up and running", + "probes": [{ + "type": "probe", + "name": "Fetch-shell-app-process-id", + "tolerance": true, + "provider": { + "module": "chaostoolkit_nimble.core.utils.ha_utils", + "type": "python", + "func": "check_process_running", + "arguments": { + "component": "rand_dynamic_component", + "process_name": "rand_dynamic_process_name" + } + } + }] + }, + "method": [{ + "type": "action", + "name": "Kill-shell-app", + "provider": { + "module": "chaostoolkit_nimble.core.utils.ha_utils", + "type": "python", + "func": "kill_process", + "arguments": { + "component": "rand_dynamic_component", + "process_name": "rand_dynamic_process_name", + "num_of_nodes": "1" + } + } + }] +} diff --git a/chaostoolkit_nimble/resources/exp_templates/shell_app/shell_app_exp_template.json b/chaostoolkit_nimble/resources/exp_templates/shell_app/shell_app_exp_template.json new file mode 100644 index 0000000..1eb410a --- /dev/null +++ b/chaostoolkit_nimble/resources/exp_templates/shell_app/shell_app_exp_template.json @@ -0,0 +1,43 @@ +{ + "version": "1.0.0", + "title": "Experiment running shell commands on local and remote", + "description": "Shell command should get successfully executed on remote using python provider through fabric and on local it should run through process provider", + "tags": ["kubernetes"], + "controls": [{ + "name": "shell-app-controls", + "provider": { + "type": "python", + "module": "chaostoolkit_nimble.controllers.extensions.shell_application.control" + } + }], + "steady-state-hypothesis": { + "title": "Shell application is up and running", + "probes": [{ + "type": "probe", + "name": 
"Fetch-application-process-id", + "tolerance": true, + "provider": { + "module": "chaostoolkit_nimble.core.utils.ha_utils", + "type": "python", + "func": "check_process_running", + "arguments": { + "component": "{{component}}", + "process_name": "{{process_name}}" + } + } + }] + }, + "method": [{ + "type": "action", + "name": "Kill-application-process", + "provider": { + "module": "chaostoolkit_nimble.core.utils.ha_utils", + "type": "python", + "func": "kill_process", + "arguments": { + "component": "{{component}}", + "process_name": "{{process_name}}" + } + } + }] +} diff --git a/chaostoolkit_nimble/resources/security/krb5.conf b/chaostoolkit_nimble/resources/security/krb5.conf new file mode 100644 index 0000000..92a3395 --- /dev/null +++ b/chaostoolkit_nimble/resources/security/krb5.conf @@ -0,0 +1,26 @@ + +[libdefaults] + renew_lifetime = 7d + forwardable = true + default_realm = GVS.GGN + ticket_lifetime = 24h + dns_lookup_realm = false + dns_lookup_kdc = false + default_ccache_name = /tmp/krb5cc_%{uid} + #default_tgs_enctypes = aes des3-cbc-sha1 rc4 des-cbc-md5 + #default_tkt_enctypes = aes des3-cbc-sha1 rc4 des-cbc-md5 + +[domain_realm] + GVS.GGN = GVS.GGN + +[logging] + default = FILE:/var/log/krb5kdc.log + admin_server = FILE:/var/log/kadmind.log + kdc = FILE:/var/log/krb5kdc.log + +[realms] + GVS.GGN = { + admin_server = kdc-devopslabs.gvs.ggn + kdc = kdc-devopslabs.gvs.ggn + } + diff --git a/chaostoolkit_nimble/resources/testbeds/open_nebula_135_35.yml b/chaostoolkit_nimble/resources/testbeds/open_nebula_135_35.yml new file mode 100644 index 0000000..bde2ac1 --- /dev/null +++ b/chaostoolkit_nimble/resources/testbeds/open_nebula_135_35.yml @@ -0,0 +1,51 @@ +# You can choose to skip some attributes that are given in default tag if the value for that VM is same as that in the +# default tag. 
+ +defaults: + username: root + password: guavus@123 + dynamic_vm_ips: True + vip_hostname_domain: testautomation-lb-vip.cloud.in.guavus.com + vip: 192.168.135.34 + nameservice: testautomation-reflex-platform + stage: dev + inventory_name: testautomation + customer_name: guavus + platform: ambari + kerberized: False + vm_netmask: 255.255.252.0 + vm_gateway: 192.168.132.1 + type: DATANODE,ETCD,HREGION,HBASE,ZOOKEEPER,REDIS,REDIS_MASTER,REDIS_SLAVE,HADOOP_HDFS,HADOOP_YARN,KAFKA,KAFKA_BROKER,FLANNEL,DOCKER,HYPERKUBE_PROXY,HYPERKUBE_KUBELET,NIFI,NODE_MANAGER,HIVE,HIVE_SERVER,CORE_DNS,KAFKA_MM,SNMP +bare_metals: + BM1: + ip: dummy + vm: + testautomation-mgt-01: + vm_ip: 192.168.135.35 + type: MANAGEMENT,PSQL,DOCKER,HYPERKUBE_PROXY,HYPERKUBE_KUBELET,JOURNALNODE,ATLAS,RANGER,RANGER_ADMIN,RANGER_USERSYNC,RANGER_TAGSYNC,HISTORY_SERVER,SPARK2_HISTORY_SERVER,ELASTICSEARCH,JANUSGRAPH,AMBARI,GRAFANA,SNMP + hostname_domain: testautomation-mgt-01.cloud.in.guavus.com + testautomation-lb-01: + vm_ip: 192.168.135.42 + type: LB,HAPROXY,FLANNEL,DOCKER,HYPERKUBE_PROXY,HYPERKUBE_KUBELET,COROSYNC,PACEMAKER,SNMP + hostname_domain: testautomation-lb-01.cloud.in.guavus.com + testautomation-lb-02: + vm_ip: 192.168.135.43 + type: LB,HAPROXY,FLANNEL,DOCKER,HYPERKUBE_PROXY,HYPERKUBE_KUBELET,COROSYNC,PACEMAKER,SNMP + hostname_domain: testautomation-lb-02.cloud.in.guavus.com + testautomation-slv-01: + vm_ip: 192.168.135.38 + hostname_domain: testautomation-slv-01.cloud.in.guavus.com + testautomation-mst-01: + vm_ip: 192.168.135.36 + type: NAMENODE,KUBERNETES,HYPERKUBE_SCHEDULER,HYPERKUBE_APISERVER,HYPERKUBE_CONTROLLER_MANAGER,FLANNEL,DOCKER,HYPERKUBE_PROXY,HYPERKUBE_KUBELET,HMASTER,HIVE,HADOOP_HDFS,HADOOP_YARN,ZKFC,HBASE,SPARK,SCHEMA_REGISTRY,PGHA_WHATCHDOG,RESOURCE_MANAGER,JOURNALNODE,KUBERNETES_DASHBOARD,HIVE_METASTORE,CDAP,CDAP_AUTH_SERVER,CDAP_MASTER,CDAP_ROUTER,CDAP_UI + hostname_domain: testautomation-mst-01.cloud.in.guavus.com + testautomation-slv-02: + vm_ip: 192.168.135.39 + 
hostname_domain: testautomation-slv-02.cloud.in.guavus.com + testautomation-mst-02: + vm_ip: 192.168.135.37 + type: NAMENODE,KUBERNETES,HYPERKUBE_SCHEDULER,HYPERKUBE_APISERVER,HYPERKUBE_CONTROLLER_MANAGER,FLANNEL,DOCKER,HYPERKUBE_PROXY,HYPERKUBE_KUBELET,HMASTER,HIVE,HADOOP_HDFS,HADOOP_YARN,ZKFC,HBASE,SPARK,PGHA_WHATCHDOG,RESOURCE_MANAGER,JOURNALNODE,KUBERNETES_DASHBOARD,HIVE_METASTORE,CDAP,CDAP_AUTH_SERVER,CDAP_MASTER,CDAP_ROUTER,CDAP_UI + hostname_domain: testautomation-mst-02.cloud.in.guavus.com + testautomation-slv-03: + vm_ip: 192.168.135.40 + hostname_domain: testautomation-slv-03.cloud.in.guavus.com \ No newline at end of file diff --git a/chaostoolkit_nimble/resources/testbeds/open_nebula_135_52.yml b/chaostoolkit_nimble/resources/testbeds/open_nebula_135_52.yml new file mode 100644 index 0000000..0884650 --- /dev/null +++ b/chaostoolkit_nimble/resources/testbeds/open_nebula_135_52.yml @@ -0,0 +1,52 @@ +# You can choose to skip some attributes that are given in default tag if the value for that VM is same as that in the +# default tag. 
+ +defaults: + username: root + password: guavus@123 + dynamic_vm_ips: True + vip_hostname_domain: testautomation002-lb-vip.cloud.in.guavus.com + vip: 192.168.134.191 + nameservice: testautomation-reflex-platform + stage: dev + inventory_name: testautomation + customer_name: guavus + platform: ambari + kerberized: True + kerberos_client_user: ambari-qa + vm_netmask: 255.255.252.0 + vm_gateway: 192.168.132.1 + type: DATANODE,ETCD,HREGION,HBASE,ZOOKEEPER,REDIS,REDIS_MASTER,REDIS_SLAVE,HADOOP_HDFS,HADOOP_YARN,KAFKA,KAFKA_BROKER,FLANNEL,DOCKER,HYPERKUBE_PROXY,HYPERKUBE_KUBELET,NIFI,NODE_MANAGER,HIVE,HIVE_SERVER,CORE_DNS,KAFKA_MM,SNMP +bare_metals: + BM1: + ip: dummy + vm: + testautomation002-mgt-01: + vm_ip: 192.168.135.59 + type: MANAGEMENT,PSQL,DOCKER,HYPERKUBE_PROXY,HYPERKUBE_KUBELET,JOURNALNODE,ATLAS,RANGER,RANGER_ADMIN,RANGER_USERSYNC,RANGER_TAGSYNC,HISTORY_SERVER,SPARK2_HISTORY_SERVER,ELASTICSEARCH,JANUSGRAPH,AMBARI,GRAFANA,SNMP + hostname_domain: testautomation002-mgt-01.cloud.in.guavus.com + testautomation002-lb-01: + vm_ip: 192.168.135.186 + type: LB,HAPROXY,FLANNEL,DOCKER,HYPERKUBE_PROXY,HYPERKUBE_KUBELET,COROSYNC,PACEMAKER,SNMP + hostname_domain: testautomation002-lb-01.cloud.in.guavus.com + testautomation002-lb-02: + vm_ip: 192.168.135.187 + type: LB,HAPROXY,FLANNEL,DOCKER,HYPERKUBE_PROXY,HYPERKUBE_KUBELET,COROSYNC,PACEMAKER,SNMP + hostname_domain: testautomation002-lb-02.cloud.in.guavus.com + testautomation002-slv-01: + vm_ip: 192.168.135.183 + hostname_domain: testautomation002-slv-01.cloud.in.guavus.com + testautomation002-mst-01: + vm_ip: 192.168.135.60 + type: NAMENODE,KUBERNETES,HYPERKUBE_SCHEDULER,HYPERKUBE_APISERVER,HYPERKUBE_CONTROLLER_MANAGER,FLANNEL,DOCKER,HYPERKUBE_PROXY,HYPERKUBE_KUBELET,HMASTER,HIVE,HADOOP_HDFS,HADOOP_YARN,ZKFC,HBASE,SPARK,SCHEMA_REGISTRY,PGHA_WHATCHDOG,RESOURCE_MANAGER,JOURNALNODE,KUBERNETES_DASHBOARD,HIVE_METASTORE,CDAP,CDAP_AUTH_SERVER,CDAP_MASTER,CDAP_ROUTER,CDAP_UI + hostname_domain: 
testautomation002-mst-01.cloud.in.guavus.com + testautomation002-slv-02: + vm_ip: 192.168.135.184 + hostname_domain: testautomation002-slv-02.cloud.in.guavus.com + testautomation002-mst-02: + vm_ip: 192.168.135.64 + type: NAMENODE,KUBERNETES,HYPERKUBE_SCHEDULER,HYPERKUBE_APISERVER,HYPERKUBE_CONTROLLER_MANAGER,FLANNEL,DOCKER,HYPERKUBE_PROXY,HYPERKUBE_KUBELET,HMASTER,HIVE,HADOOP_HDFS,HADOOP_YARN,ZKFC,HBASE,SPARK,PGHA_WHATCHDOG,RESOURCE_MANAGER,JOURNALNODE,KUBERNETES_DASHBOARD,HIVE_METASTORE,CDAP,CDAP_AUTH_SERVER,CDAP_MASTER,CDAP_ROUTER,CDAP_UI + hostname_domain: testautomation002-mst-02.cloud.in.guavus.com + testautomation002-slv-03: + vm_ip: 192.168.135.185 + hostname_domain: testautomation002-slv-03.cloud.in.guavus.com \ No newline at end of file diff --git a/chaostoolkit_nimble/tests/__init__.py b/chaostoolkit_nimble/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chaostoolkit_nimble/tests/conftest.py b/chaostoolkit_nimble/tests/conftest.py new file mode 100644 index 0000000..0b5dd53 --- /dev/null +++ b/chaostoolkit_nimble/tests/conftest.py @@ -0,0 +1,126 @@ +# Do not change the name of the file, since pytest detects it with this name only. + +import logging.config +import os + +import pytest +from nimble.core import global_constants + +try: + os.makedirs(global_constants.DEFAULT_LOCAL_ARTIFACTS_PATH) +except Exception: + pass + +logging.config.fileConfig(global_constants.DEFAULT_LOGGING_FILE_PATH) + +# pylint: disable=wrong-import-position +from nimble.core.configs.validation_config_parser import ValidationConfigParser +from nimble.core.entity.node_manager import NodeManager +from nimble.core.utils.report_utils import ReportUtils +from nimble.actions.base.flows.user_actions import UserActions + +OPTIONS_DICT = {} +PREVIOUS_FAILED = None +ITEM_LIST = [] + + +def pytest_addoption(parser): + parser.addoption("--testbed", + help="Relative path (to the project root) of the testbed file. E.g. 
python -m pytest --testbed=resources/testbeds/open_nebula_134_192.yml") + parser.addoption("--componentAttributesConfig", + help="Relative path (to the project root) of the file containing component attributes configs. E.g. python -m pytest --componentAttributesConfig=resources/components/component_attributes_ambari.yml") + parser.addoption("--validationConfig", + help="Relative path (to the project root) of the file containing validation configs. E.g. python -m pytest --validationConfig=resources/validation/sample_validation_config.yml") + parser.addoption("--chaosExpConfig", + help="Relative path (to the project root) of the file containing chaos experiment configs. E.g. python -m pytest --validationConfig=resources/validation/chaos_exp_config.yml") + parser.addoption("--experimentsPath", + help="Relative path (to the project root) of the file containing chaos experiment json files. E.g. python -m pytest --validationConfig=resources/validation/chaos_exp_config.yml") + + +@pytest.fixture(scope="session", autouse=True) +def initialize_node_obj(request): + testbed_file = request.config.getoption("--testbed") + component_arttributes_file = request.config.getoption("--componentAttributesConfig") + if not component_arttributes_file: + component_arttributes_file = "nimble/resources/components/component_attributes.yml" + if testbed_file: + NodeManager.initialize(testbed_file, component_arttributes_file) + + +@pytest.fixture(scope="session", autouse=True) +def initialize_arguments(request): + global OPTIONS_DICT + + for option, value in list(request.config.option.__dict__.items()): + OPTIONS_DICT[option] = value + + +@pytest.fixture(scope="session") +def config_parser(initialize_arguments): # pylint: disable=redefined-outer-name,unused-argument + """Initialize the validation config parser. + + :param initialize_arguments: Fixture defined above. + :return: Return the object of the Validation config parser. 
+ :rtype: :class:`nimble.core.configs.validation_config_parser.ValidationConfigParser` + """ + return ValidationConfigParser(OPTIONS_DICT["validationConfig"]) + + +@pytest.fixture(scope="session") +def dump_allure_env_file(config_parser, initialize_node_obj): # pylint: disable=redefined-outer-name,unused-argument + """Dump the basic environment variables for Allure. + + :param config_parser: Fixture defined above. + :param initialize_node_obj: Fixture defined above. + """ + report_dict = ReportUtils.get_generic_attributes(config_parser) + ReportUtils.dump_allure_env_file(report_dict) + + +@pytest.fixture(scope="session") +def user_actions(config_parser, dump_allure_env_file): # pylint: disable=redefined-outer-name,unused-argument + """Initialize the object for user actions. + + :param config_parser: Fixture defined above. + :param dump_allure_env_file: Fixture defined above. + :rtype: :class:`nimble.actions.base.flows.user_actions.UserActions` + """ + return UserActions(config_parser) + + +def pytest_runtest_makereport(item, call): + """ + Sometimes you may have a testing situation which consists of a series of test steps. If one step fails it makes no + sense to execute further steps as they are all expected to fail anyway and their tracebacks add no insight. + This and the next hook implementations work together to abort incremental-marked tests in a class. + + + :param item: Pytest's internal fixture. + :param call: Pytest's internal fixture. + """ + if "incremental" in item.keywords: + if call.excinfo is not None: + parent = item.parent + parent._previousfailed = item # pylint: disable=protected-access + + +def pytest_runtest_setup(item): + """ + Sometimes you may have a testing situation which consists of a series of test steps. If one step fails it makes no + sense to execute further steps as they are all expected to fail anyway and their tracebacks add no insight. 
+ This and the next hook implementations work together to abort incremental-marked tests in a class. + + :param item: Pytest's internal fixture. + """ + global PREVIOUS_FAILED, ITEM_LIST + if "incremental" in item.keywords: + for previous_item in ITEM_LIST: + if PREVIOUS_FAILED is None: + PREVIOUS_FAILED = getattr(previous_item.parent, "_previousfailed", None) + if PREVIOUS_FAILED is not None: + pytest.fail("previous test failed (%s)" % PREVIOUS_FAILED.name) + try: + ITEM_LIST.pop(0) + except IndexError: + pass + ITEM_LIST.append(item) diff --git a/tests/fixtures/invalid-k8s.txt b/chaostoolkit_nimble/tests/fixtures/invalid-k8s.txt similarity index 100% rename from tests/fixtures/invalid-k8s.txt rename to chaostoolkit_nimble/tests/fixtures/invalid-k8s.txt diff --git a/chaostoolkit_nimble/tests/sample/__init__.py b/chaostoolkit_nimble/tests/sample/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chaostoolkit_nimble/tests/sample/test_shell_app_exp.py b/chaostoolkit_nimble/tests/sample/test_shell_app_exp.py new file mode 100644 index 0000000..3489f16 --- /dev/null +++ b/chaostoolkit_nimble/tests/sample/test_shell_app_exp.py @@ -0,0 +1,83 @@ +import logging + +import pytest +from nimble.core.entity.components import Components +from nimble.core.utils.dynamic_substitution_utils import DynamicSubstitutionUtils +from nimble.core.utils.multiprocessing_utils import MultiprocessingUtils + +from chaostoolkit_nimble.actions.base.flows import user_actions +from chaostoolkit_nimble.actions.sample import sample_application_actions + +_LOGGER = logging.getLogger(__name__) + + +class TestShellAppExp(): + @pytest.fixture(scope="session") + def multiprocessing_utils(self): + return MultiprocessingUtils(1) + + @pytest.fixture(scope="session") + def launch_application_on_remote(self, multiprocessing_utils): + # NodeManager.node_obj.execute_command_on_node("testautomation-mst-01", "nohup sleep 5m &") + process_list = multiprocessing_utils.run_method_in_parallel_async( 
+ sample_application_actions.launch_application()) + yield + for process in process_list: + process.terminate() + + # def test_application_ha(self, launch_application_on_remote): + def test_application_ha(self): + # ha_utils.process_ha(Components.MANAGEMENT.name, "sleep 5m") + ################## Templating + # env = Environment(loader=PackageLoader("chaostoolkit_nimble", "resources/templates/shell_application/")) + # template = env.get_template('shell_app_exp_template.json') + # process_name = "sleep 5m" + # variables = {"component": Components.MANAGEMENT.name, + # "process_name": process_name, + # "expected_process_id": "", + # } + # # print(template.render(variables)) + # json_string = template.render(variables) + # experiment_file_path = "%s/process_experiment.json" % global_constants.DEFAULT_LOCAL_ARTIFACTS_PATH + # response_obj = ShellUtils.execute_shell_command("chaos run %s" % (experiment_file_path)) + + ###############Dynamic Substitution################################################################################# + # experiment_template_file_path = "chaostoolkit_nimble/resources/exp_templates/shell_app/shell_app_exp.json" + # experiment_file_path = "%s/shell_app_exp.json" % global_constants.DEFAULT_LOCAL_ARTIFACTS_PATH + # ShellUtils.execute_shell_command("cp %s %s" % (experiment_template_file_path, experiment_file_path)) + # process_name = "sleep 5m" + # variables = {"rand_dynamic_component": Components.NAMENODE.name, + # "rand_dynamic_process_name": process_name, + # } + # # print(template.render(variables)) + # DynamicSubstitutionUtils.add(variables) + # DynamicSubstitutionUtils.update_file(experiment_file_path) + # response_obj = ShellUtils.execute_shell_command("chaos run %s" % experiment_file_path) + + ######################### User actions & Dynamic Substitution ########################################################### + experiments_template_path = "chaostoolkit_nimble/resources/exp_templates/shell_app/shell_app_exp.json" + variables = 
{"rand_dynamic_component": Components.NAMENODE.name, + "rand_dynamic_process_name": "sleep 5m", + } + DynamicSubstitutionUtils.add(variables) + user_actions.run_experiment(experiments_template_path) + +########################################################################################################## +# env = Environment(loader=PackageLoader("chaostoolkit_nimble", "resources/templates/shell_application")) +# template = env.get_template('process_experiment_template.json') +# variables = {"remote_command_var": "echo Hi", +# "remote_ip_var": "192.168.135.59", +# "remote_username_var": "root", +# "remote_password_var": "guavus@123", +# "remote_command_timeout_var": 120, +# "remote_connection_timeout_var": 120, +# "expected_remote_command_output": "Hi", +# "local_command_var": "ls", +# "local_command_arguments": "/tmp" +# } +# +# print(template.render(variables)) +# + + +# TestChaosOnApplocation().test_application_ha() diff --git a/tests/test_actions.py b/chaostoolkit_nimble/tests/test_actions.py similarity index 98% rename from tests/test_actions.py rename to chaostoolkit_nimble/tests/test_actions.py index 75e94e0..2b2574b 100644 --- a/tests/test_actions.py +++ b/chaostoolkit_nimble/tests/test_actions.py @@ -5,8 +5,8 @@ from chaoslib.exceptions import ActivityFailed from kubernetes.client.rest import ApiException -from chaosk8s.actions import start_microservice, kill_microservice -from chaosk8s.node.actions import cordon_node, create_node, delete_nodes, \ +from chaostoolkit_nimble.core.extensions import start_microservice, kill_microservice +from chaostoolkit_nimble.core.extensions.chaosk8s.node import cordon_node, create_node, delete_nodes, \ uncordon_node, drain_nodes diff --git a/tests/test_client.py b/chaostoolkit_nimble/tests/test_client.py similarity index 96% rename from tests/test_client.py rename to chaostoolkit_nimble/tests/test_client.py index 0ad8d4a..60b2f2b 100644 --- a/tests/test_client.py +++ b/chaostoolkit_nimble/tests/test_client.py @@ -2,7 
+2,7 @@ import os from unittest.mock import MagicMock, patch -from chaosk8s import create_k8s_api_client +from chaostoolkit_nimble.core.extensions import create_k8s_api_client @patch('chaosk8s.has_local_config_file', autospec=True) diff --git a/tests/test_discovery.py b/chaostoolkit_nimble/tests/test_discovery.py similarity index 80% rename from tests/test_discovery.py rename to chaostoolkit_nimble/tests/test_discovery.py index 7d8006b..f3a9d8b 100644 --- a/tests/test_discovery.py +++ b/chaostoolkit_nimble/tests/test_discovery.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -from chaosk8s import __version__, discover +from chaostoolkit_nimble.core.extensions import __version__, discover def test_discover_extension_capabilities(): diff --git a/tests/test_pod.py b/chaostoolkit_nimble/tests/test_pod.py similarity index 98% rename from tests/test_pod.py rename to chaostoolkit_nimble/tests/test_pod.py index a4507b3..646ff48 100644 --- a/tests/test_pod.py +++ b/chaostoolkit_nimble/tests/test_pod.py @@ -1,13 +1,11 @@ # -*- coding: utf-8 -*- from unittest.mock import MagicMock, patch, ANY, call -from kubernetes import client - import pytest from chaoslib.exceptions import ActivityFailed -from chaosk8s.pod.actions import terminate_pods -from chaosk8s.pod.probes import pods_in_phase, pods_not_in_phase, pods_in_conditions +from chaostoolkit_nimble.core.extensions.chaosk8s.pod import terminate_pods +from chaostoolkit_nimble.core.extensions.chaosk8s.pod import pods_in_phase, pods_not_in_phase, pods_in_conditions @patch('chaosk8s.has_local_config_file', autospec=True) diff --git a/tests/test_probes.py b/chaostoolkit_nimble/tests/test_probes.py similarity index 98% rename from tests/test_probes.py rename to chaostoolkit_nimble/tests/test_probes.py index 74887cc..9d2ce78 100644 --- a/tests/test_probes.py +++ b/chaostoolkit_nimble/tests/test_probes.py @@ -7,8 +7,8 @@ import urllib3 from chaoslib.exceptions import ActivityFailed -from chaosk8s.node.probes import get_nodes -from 
chaosk8s.probes import all_microservices_healthy, \ +from chaostoolkit_nimble.core.extensions.chaosk8s.node.probes import get_nodes +from chaostoolkit_nimble.core.extensions.chaosk8s.probes import all_microservices_healthy, \ microservice_available_and_healthy, microservice_is_not_available, \ service_endpoint_is_initialized, deployment_is_not_fully_available, \ deployment_is_fully_available, read_microservices_logs diff --git a/journal.json b/journal.json new file mode 100644 index 0000000..d711e9d --- /dev/null +++ b/journal.json @@ -0,0 +1,104 @@ +{ + "chaoslib-version": "1.5.0", + "platform": "Darwin-18.0.0-x86_64-i386-64bit", + "node": "kritika-Saxena.local", + "experiment": { + "version": "1.0.0", + "title": "Experiment running shell commands on local and remote", + "description": "Shell command should get successfully executed on remote using python provider through fabric and on local it should run through process provider", + "tags": [ + "kubernetes" + ], + "controls": [ + { + "name": "shell-app-controls", + "provider": { + "type": "python", + "module": "chaostoolkit_nimble.controllers.extensions.shell_app.control" + } + } + ], + "steady-state-hypothesis": { + "title": "Shell application is up and running", + "probes": [ + { + "type": "probe", + "name": "Fetch-shell-app-process-id", + "tolerance": true, + "provider": { + "module": "chaostoolkit_nimble.core.utils.ha_utils", + "type": "python", + "func": "check_process_running", + "arguments": { + "component": "NAMENODE", + "process_name": "sleep 5m" + } + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Kill-shell-app", + "provider": { + "module": "chaostoolkit_nimble.core.utils.ha_utils", + "type": "python", + "func": "kill_process", + "arguments": { + "component": "NAMENODE", + "process_name": "sleep 5m", + "num_of_nodes": "1" + } + } + } + ], + "dry": false + }, + "start": "2019-07-18T10:50:51.666552", + "status": "failed", + "deviated": false, + "steady_states": { + "before": { + 
"steady_state_met": false, + "probes": [ + { + "activity": { + "type": "probe", + "name": "Fetch-shell-app-process-id", + "tolerance": true, + "provider": { + "module": "chaostoolkit_nimble.core.utils.ha_utils", + "type": "python", + "func": "check_process_running", + "arguments": { + "component": "NAMENODE", + "process_name": "sleep 5m" + } + } + }, + "output": null, + "status": "failed", + "exception": [ + "Traceback (most recent call last):\n", + " File \"/Users/kritika.saxena/KR_VIRENV_CHAOS_NIMBLE_PY3/venv/lib/python3.7/site-packages/chaoslib/provider/python.py\", line 55, in run_python_activity\n return func(**arguments)\n", + " File \"/Users/kritika.saxena/chaos_folder_3/chaos_eng_automation/chaostoolkit_nimble/core/utils/ha_utils.py\", line 42, in check_process_running\n consolidated_success_flag=True)\n", + " File \"/Users/kritika.saxena/KR_VIRENV_CHAOS_NIMBLE_PY3/venv/lib/python3.7/site-packages/nimble/core/entity/nodes.py\", line 109, in execute_command_on_component\n for node_attributes in self.nodes_by_type[component]:\n", + "KeyError: 'NAMENODE'\n", + "\nDuring handling of the above exception, another exception occurred:\n\n", + "chaoslib.exceptions.ActivityFailed: KeyError: 'NAMENODE'\n" + ], + "start": "2019-07-18T10:50:51.668208", + "end": "2019-07-18T10:50:51.670135", + "duration": 0.001927, + "tolerance_met": false + } + ] + }, + "after": null + }, + "run": [], + "rollbacks": [], + "end": "2019-07-18T10:50:51.671344", + "duration": 0.025391817092895508 +} \ No newline at end of file diff --git a/logging.cfg b/logging.cfg new file mode 100644 index 0000000..cbcb624 --- /dev/null +++ b/logging.cfg @@ -0,0 +1,28 @@ +[loggers] +keys=root + +[handlers] +keys=consoleHandler,fileHandler + +[formatters] +keys=simpleFormatter + +[logger_root] +level=INFO +handlers=fileHandler + +[handler_consoleHandler] +class=StreamHandler +level=DEBUG +formatter=simpleFormatter +args=(sys.stdout,) + +[handler_fileHandler] +class=FileHandler +level=DEBUG 
+formatter=simpleFormatter +args=('target/artifacts/execution.log', 'a+') + +[formatter_simpleFormatter] +format=%(asctime)s - [%(name)s::%(funcName)s()::%(lineno)s] - %(processName)s - %(levelname)s - %(message)s +datefmt= diff --git a/pytest.ini b/pytest.ini index c34a995..a5e2dbb 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,3 +1,5 @@ [pytest] -norecursedirs=dist build htmlcov docs .eggs -addopts=-v -rxs --junitxml=junit-test-results.xml --cov=chaosk8s --cov-report term-missing:skip-covered --cov-report xml +norecursedirs=dist build htmlcov docs .eggs core target resources actions locators +#addopts=-v -rxs --junitxml=junit-test-results.xml --cov=chaosk8s --cov-report term-missing:skip-covered --cov-report xml +addopts = -s -v --html=report.html --self-contained-html --html=target/artifacts/report.html --alluredir=target/artifacts/allure/ +python_files = *.py From 0a859519452b063e2517f3aeccf8542010645be7 Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Wed, 24 Jul 2019 18:50:59 +0530 Subject: [PATCH 02/40] AUT-544-new: temporary changes --- .../core/utils/fabric_utils.py | 2 +- chaostoolkit_nimble/core/utils/ha_utils.py | 146 ++++++++++-------- requirements.txt | 4 +- 3 files changed, 85 insertions(+), 67 deletions(-) diff --git a/chaostoolkit_nimble/core/utils/fabric_utils.py b/chaostoolkit_nimble/core/utils/fabric_utils.py index 7702a1b..9da0520 100644 --- a/chaostoolkit_nimble/core/utils/fabric_utils.py +++ b/chaostoolkit_nimble/core/utils/fabric_utils.py @@ -8,7 +8,7 @@ _LOGGER = logging.getLogger(__name__) - +b = 56 def run_command_on_remote(command, ip, username, password, command_timeout=None, connection_timeout=120): diff --git a/chaostoolkit_nimble/core/utils/ha_utils.py b/chaostoolkit_nimble/core/utils/ha_utils.py index c40aaa1..2c241b6 100644 --- a/chaostoolkit_nimble/core/utils/ha_utils.py +++ b/chaostoolkit_nimble/core/utils/ha_utils.py @@ -1,45 +1,61 @@ import logging import random +from logzero import logger from 
nimble.core.entity.components import Components from nimble.core.entity.node_manager import NodeManager from nimble.core.utils.shell_utils import ShellUtils -from retrying import retry + +from chaostoolkit_nimble.core.utils import fabric_utils + + _LOGGER = logging.getLogger(__name__) -def _check_process(result): - process_id = result[0] - process_id_after_kill_process = result[1] - return process_id == process_id_after_kill_process or process_id_after_kill_process == '' +# node_obj = __import__('nimble.core.entity.node_manager').NodeManager.node_obj +# import __builtin__ +logger.info("Helooooooooo") +# node_obj = NodeManager.node_obj +# +# def _check_process(result): +# process_id = result[0] +# process_id_after_kill_process = result[1] +# return process_id == process_id_after_kill_process or process_id_after_kill_process == '' -def fetch_process_id(component, process_name=None): - """Fetch the process id for any particular component. - :param component: Name of the component. - :type component: str - :return: List of :class:`nimble.core.entity.guavus_response.GuavusResponse` objects. - :rtype: list - """ - if process_name: - command = ShellUtils.fetch_process_id(process_name) - else: - process_name = Components.get_process_name(component) - command = ShellUtils.fetch_process_id(process_name) - _LOGGER.info("Fetching process id for process '%s' from component: %s" % (process_name, component)) - response_list = NodeManager.node_obj.execute_command_on_component(component, command, - consolidated_success_flag=False) - return response_list +# def fetch_process_id(component, process_name=None): +# """Fetch the process id for any particular component. +# +# :param component: Name of the component. +# :type component: str +# :return: List of :class:`nimble.core.entity.guavus_response.GuavusResponse` objects. 
+# :rtype: list +# """ +# if process_name: +# command = ShellUtils.fetch_process_id(process_name) +# else: +# process_name = Components.get_process_name(component) +# command = ShellUtils.fetch_process_id(process_name) +# _LOGGER.info("Fetching process id for process '%s' from component: %s" % (process_name, component)) +# response_list = NodeManager.node_obj.execute_command_on_component(component, command, +# consolidated_success_flag=False) +# return response_list def check_process_running(component, process_name=None): - if not process_name: - process_name = Components.get_process_name(component) - _LOGGER.info("Checking if process '%s' is running by verifying its process id" % process_name) - return NodeManager.node_obj.execute_command_on_component(component, ShellUtils.fetch_process_id(process_name), - consolidated_success_flag=True) + # if not process_name + # process_name = Components.get_process_name(component) + # logger.info("Checking if process '%s' is running by fetching its process id" % process_name) + # testbed_file = "chaostoolkit_nimble/resources/testbeds/open_nebula_135_52.yml" + # component_arttributes_file = "chaostoolkit_nimble/resources/components/component_attributes_kerberos.yml" + # NodeManager.initialize(testbed_file, component_arttributes_file) + logger.debug("NODE_OBJ----------------: %s" % NodeManager.node_obj.vip) + logger.debug("b = ----------------: %s" % fabric_utils.b ) + # return NodeManager.node_obj.execute_command_on_component(component, ShellUtils.fetch_process_id(process_name), + # consolidated_success_flag=True) + def kill_process(process_name, component, num_of_nodes=None): @@ -63,45 +79,45 @@ def kill_process(process_name, component, num_of_nodes=None): return response_list -# def process_ha(component, process_name=None): -# """This function is used to do the process HA of the components at the remote server. -# -# :return: Return object of :class:`nimble.core.entity.guavus_response.GuavusResponse`. 
-# :rtype: :class:`nimble.core.entity.guavus_response.GuavusResponse` -# """ -# failure_reason = "" -# kill_process(component, process_name) +# # def process_ha(component, process_name=None): +# # """This function is used to do the process HA of the components at the remote server. +# # +# # :return: Return object of :class:`nimble.core.entity.guavus_response.GuavusResponse`. +# # :rtype: :class:`nimble.core.entity.guavus_response.GuavusResponse` +# # """ +# # failure_reason = "" +# # kill_process(component, process_name) +# # +# # try: +# # check_process(process_id, component) +# # status_code = 0 +# # except RetryError: +# # status_code = global_constants.DEFAULT_ERROR_CODE +# # guavus_response_after_kill = fetch_process_id(component) +# # process_id_after_kill = guavus_response_after_kill[0].stdout +# # if process_id == process_id_after_kill: +# # failure_reason = "Process is not killed for component %s" % component +# # elif process_id_after_kill == "": +# # failure_reason = "Process is not UP and running after killed for component: %s" % component +# # +# # guavus_response[0].status_code = status_code +# # guavus_response[0].healthcheck_response.failure_reason.append(failure_reason) +# # +# # return guavus_response # -# try: -# check_process(process_id, component) -# status_code = 0 -# except RetryError: -# status_code = global_constants.DEFAULT_ERROR_CODE -# guavus_response_after_kill = fetch_process_id(component) -# process_id_after_kill = guavus_response_after_kill[0].stdout -# if process_id == process_id_after_kill: -# failure_reason = "Process is not killed for component %s" % component -# elif process_id_after_kill == "": -# failure_reason = "Process is not UP and running after killed for component: %s" % component # -# guavus_response[0].status_code = status_code -# guavus_response[0].healthcheck_response.failure_reason.append(failure_reason) +# @retry(wait_fixed=3000, stop_max_delay=300000, retry_on_result=_check_process) +# def 
check_process(process_id, component): +# """ This function is used to check the process is up or not. # -# return guavus_response - - -@retry(wait_fixed=3000, stop_max_delay=300000, retry_on_result=_check_process) -def check_process(process_id, component): - """ This function is used to check the process is up or not. - - :param process_id: It is the process id before killing the process. - :type process_id: int - :param component: Name of the component. - :type component: str - :return: Return the process id of the process before kill and after kill. - :rtype: tuple - """ - _LOGGER.info("Running process check for process '%s' on component: %s" % (process_name, component)) - guavus_response_after_kill = fetch_process_id(component) - process_id_after_kill_process = guavus_response_after_kill[0].stdout - return process_id, process_id_after_kill_process +# :param process_id: It is the process id before killing the process. +# :type process_id: int +# :param component: Name of the component. +# :type component: str +# :return: Return the process id of the process before kill and after kill. 
+# :rtype: tuple +# """ +# _LOGGER.info("Running process check for process '%s' on component: %s" % (process_name, component)) +# guavus_response_after_kill = fetch_process_id(component) +# process_id_after_kill_process = guavus_response_after_kill[0].stdout +# return process_id, process_id_after_kill_process diff --git a/requirements.txt b/requirements.txt index 1e6dc39..b8931e7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,7 @@ +nimble dateparser kubernetes logzero -chaostoolkit-lib>=0.20.0 +chaostoolkit-lib pyyaml +pytest-html From 4e2cf7de05d7e2a7b0faa2711e9508db5b52a3e6 Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Fri, 26 Jul 2019 12:42:26 +0530 Subject: [PATCH 03/40] AUT-544-new: Adding chaostoolkit-reporting package in requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index b8931e7..ca16931 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ logzero chaostoolkit-lib pyyaml pytest-html +chaostoolkit-reporting From 00f001b0d358086030905bea894d74a2b0432c96 Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Thu, 1 Aug 2019 18:12:40 +0530 Subject: [PATCH 04/40] AUT-544-new: Adding chaos validation code w.r.t jio --- ...{user_actions.py => chaos_user_actions.py} | 3 +- chaostoolkit_nimble/actions/jio/__init__.py | 0 .../actions/jio/common_actions.py | 100 ++++++++ .../actions/jio/media_plane_actions.py | 223 ++++++++++++++++++ .../controllers/base/__init__.py | 0 .../controllers/base/control.py | 38 +++ .../controllers/jio/__init__.py | 0 .../controllers/jio/control.py | 47 ++++ chaostoolkit_nimble/core/utils/ha_utils.py | 22 +- .../exp_templates/jio/shell_app_exp.json | 44 ++++ .../validation/sample_validation_config.yml | 169 +++++++++++++ chaostoolkit_nimble/tests/conftest.py | 12 + chaostoolkit_nimble/tests/sample/test_jio.py | 85 +++++++ .../tests/sample/test_shell_app_exp.py | 4 +- journal.json | 92 +++++--- 15 files changed, 793 
insertions(+), 46 deletions(-) rename chaostoolkit_nimble/actions/base/flows/{user_actions.py => chaos_user_actions.py} (92%) create mode 100644 chaostoolkit_nimble/actions/jio/__init__.py create mode 100644 chaostoolkit_nimble/actions/jio/common_actions.py create mode 100644 chaostoolkit_nimble/actions/jio/media_plane_actions.py create mode 100644 chaostoolkit_nimble/controllers/base/__init__.py create mode 100644 chaostoolkit_nimble/controllers/base/control.py create mode 100644 chaostoolkit_nimble/controllers/jio/__init__.py create mode 100644 chaostoolkit_nimble/controllers/jio/control.py create mode 100644 chaostoolkit_nimble/resources/exp_templates/jio/shell_app_exp.json create mode 100644 chaostoolkit_nimble/resources/validation/sample_validation_config.yml create mode 100644 chaostoolkit_nimble/tests/sample/test_jio.py diff --git a/chaostoolkit_nimble/actions/base/flows/user_actions.py b/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py similarity index 92% rename from chaostoolkit_nimble/actions/base/flows/user_actions.py rename to chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py index e248bf4..17cf38e 100644 --- a/chaostoolkit_nimble/actions/base/flows/user_actions.py +++ b/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py @@ -17,7 +17,7 @@ from nimble.tests.conftest import OPTIONS_DICT -def run_experiment(experiments_template_path=None): +def run_experiment(experiments_template_path=None, variables_dict=None): experiments_base_path = "%s/tmp/experiments" % global_constants.DEFAULT_LOCAL_ARTIFACTS_PATH ShellUtils.execute_shell_command(ShellUtils.remove_and_create_directory(experiments_base_path)) if not experiments_template_path: @@ -29,5 +29,6 @@ def run_experiment(experiments_template_path=None): experiment_file_response = ShellUtils.execute_shell_command( ShellUtils.find_files_in_directory(experiments_base_path)) for experiment_file in experiment_file_response.stdout.strip().split("\n"): + 
DynamicSubstitutionUtils.add(variables_dict) DynamicSubstitutionUtils.update_file(experiment_file) ShellUtils.execute_shell_command("chaos run %s" % experiment_file) diff --git a/chaostoolkit_nimble/actions/jio/__init__.py b/chaostoolkit_nimble/actions/jio/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chaostoolkit_nimble/actions/jio/common_actions.py b/chaostoolkit_nimble/actions/jio/common_actions.py new file mode 100644 index 0000000..5a53181 --- /dev/null +++ b/chaostoolkit_nimble/actions/jio/common_actions.py @@ -0,0 +1,100 @@ +import logging + +from nimble.core.entity.components import Components +from nimble.core.entity.node_manager import NodeManager +from nimble.core.utils.components.hadoop_utils import HadoopUtils +from nimble.core.utils.shell_utils import ShellUtils +from nimble.core.utils.date_utils import DateUtils, Timezone + + +class CommonActions(object): + + def __init__(self): + self._logger = logging.getLogger(__name__) + self.hadoop_utils = HadoopUtils + self.date_utils = DateUtils(Timezone.UTC.value) + self.date_format = "%Y-%m-%d %H:%M:%S" + self.bin_interval = 900 + + def hdfs_keytab(self): + hadoop_utils = HadoopUtils(NodeManager.node_obj) + mgt_alias = hadoop_utils.master_namenode + hdfs_keytab = NodeManager.node_obj.execute_command_on_node(mgt_alias,"klist -tke /etc/security/keytabs/hdfs.headless.keytab |grep '@' |awk -F' ' '{print $4}'|head -1") + kinit_command = "kinit -kt /etc/security/keytabs/hdfs.headless.keytab %s" % hdfs_keytab.stdout + NodeManager.node_obj.execute_command_on_component(Components.MASTER_NAMENODE.name,ShellUtils.su("hdfs", kinit_command)) + + def computeBinTime(self, timestamp): + epoch_time = self.date_utils.convert_human_readable_to_epoch(timestamp, self.date_format) + bin_epoch = self.date_utils.round_down_epoch(epoch_time, self.bin_interval) + bin_timestamp = self.date_utils.convert_epoch_to_human_readable(bin_epoch, self.date_format) + return bin_timestamp + + def calc_end_time(self, 
start_time, duration): + epoch_start_time = self.date_utils.convert_human_readable_to_epoch(start_time[:-4], self.date_format) + epoch_end_time = str(format((float(epoch_start_time) + float(duration)), '.6f')) + end_time = self.date_utils.convert_epoch_to_human_readable(float(epoch_end_time), self.date_format) + return end_time + + def date_format_changer(self, input_date, input_date_format, output_date_format): + epoch_input = self.date_utils.convert_human_readable_to_epoch(input_date,input_date_format) + return self.date_utils.convert_epoch_to_human_readable(epoch_input, output_date_format) + + def get_time_range_list(self, min_time, max_time, frequency, date_time_format): + tmp_list = [] + time_range_list = [] + min_time_epoch = self.date_utils.convert_human_readable_to_epoch(min_time, date_time_format) + max_time_epoch = self.date_utils.convert_human_readable_to_epoch(max_time, date_time_format) + min_round_down = self.date_utils.round_down_epoch(min_time_epoch, frequency) + max_round_down = self.date_utils.round_up_epoch(max_time_epoch, frequency) + if min_round_down == max_round_down: + max_round_up = self.date_utils.round_up_epoch(max_time_epoch, frequency) + time_range_list = [(self.date_utils.convert_epoch_to_human_readable(min_round_down, date_time_format), + self.date_utils.convert_epoch_to_human_readable(max_round_up, date_time_format))] + else: + while min_round_down <= max_round_down: + tmp_list.append(min_round_down) + min_round_down += frequency + for index in range(0, (len(tmp_list) - 1)): + time_range_list.append(( + self.date_utils.convert_epoch_to_human_readable(tmp_list[index], date_time_format), + self.date_utils.convert_epoch_to_human_readable(tmp_list[index + 1], + date_time_format))) + return time_range_list + + def get_time_colums(self, frequency): + if frequency < 60: + return "year,month,day,hour,minute,seconds" + elif frequency >= 60 and frequency < 3600: + return "year,month,day,hour,minute" + elif frequency <= 3600 and frequency < 
86400: + return "year,month,day,hour" + elif frequency <= 86400 and frequency < 2592000: + return "year,month,day" + elif frequency <= 2592000 and frequency < 31536000: + return "year,month" + elif frequency <= 31536000: + return "year" + else: + self._logger.error("Incorrect Frequency Provided : %s" % frequency) + + def time_column_needful(self, frequency, timestamp_in_ns): + year = int(timestamp_in_ns[:4]) + month = int(timestamp_in_ns[4:6]) + day = int(timestamp_in_ns[6:8]) + + time_columns = self.get_time_colums(frequency) + + if len(time_columns.split(',')) == 3: + return [year, month, day] + if len(time_columns.split(',')) == 4: + hour = int(timestamp_in_ns[8:10]) + return [year, month, day, hour] + if len(time_columns.split(',')) == 5: + hour = int(timestamp_in_ns[8:10]) + minute = int(timestamp_in_ns[10:12]) + return [year, month, day, hour, minute] + if len(time_columns.split(',')) == 6: + hour = int(timestamp_in_ns[8:10]) + minute = int(timestamp_in_ns[10:12]) + second = int(timestamp_in_ns[12:14]) + return [year, month, day, hour, minute, second] \ No newline at end of file diff --git a/chaostoolkit_nimble/actions/jio/media_plane_actions.py b/chaostoolkit_nimble/actions/jio/media_plane_actions.py new file mode 100644 index 0000000..7a116a1 --- /dev/null +++ b/chaostoolkit_nimble/actions/jio/media_plane_actions.py @@ -0,0 +1,223 @@ +import re +import logging +import mmh3 +from chaostoolkit_nimble.actions.jio.common_actions import CommonActions + + +class MediaPlaneActions(object): + + def __init__(self, job_alias): + + self._logger = logging.getLogger(__name__) + self.job_alias = job_alias + self.comman_action = CommonActions() + self.seed = 42 + self.date_format_all = '%Y%m%d%H%M%S' + self.date_format_usual = "%Y-%m-%d %H:%M:%S" + self.frequency=900 + + def none_or_value(self, value): + if str(value) == "": + return None + else: + return value + + def generate_cell_id(self, mcc, mnc, cell_id): + if mcc == '' or mnc == '' or cell_id == '': + return '-1' + 
else: + if str(mnc) != 3: + final_mnc = '0' * (3 - len(str(mnc))) + str(mnc) + else: + final_mnc = mnc + + return str(mcc) + str(final_mnc) + '0' * (9 - len(str(cell_id))) + str(cell_id) + + def generate_hash(self, source_ip, source_port, dest_ip, dest_port): + hash_input_string = "%s|%s|%s|%s"%(source_ip, source_port, dest_ip, dest_port) + hash_value = mmh3.hash(hash_input_string, self.seed) + return hash_value + + def generate_binary_error_code(self, list_of_flags): + binary_string = "".join(list_of_flags) + binary_code = int(binary_string, 2) + return binary_code + + def link_specific_list(self, list): + default_list = [None, None, None, None, None, None, None, None, None, None, None, None, None, None] + if len(list) == 0: + return default_list + else: + if self.none_or_value(list[34]).startswith("2405:0203") or self.none_or_value(list[34]).startswith("2405:0204") or self.none_or_value(list[34]).startswith("2405:0205") or self.none_or_value(list[34]).startswith("2409:4000"): + return [self.none_or_value(list[34]), self.none_or_value(list[35]), self.none_or_value(list[19]), self.none_or_value(list[20]), self.none_or_value(list[45]), self.none_or_value(list[43]), self.none_or_value(list[11]), self.none_or_value(list[12]), self.none_or_value(list[22]), self.none_or_value(list[26]), self.none_or_value(list[27]), self.none_or_value(list[32]), self.none_or_value(list[33]), self.none_or_value(list[36])] + # m_u_source_ip , m_u_source_port , m_u_destination_ip , m_u_destination_port , subscriber_id , subscriber_msisdn , media_active_time_seconds , media_completed_indicator , media_long_call_indicator , media_middle_gap_indicator , media_short_call_indicator , media_single_direction_indicator , media_start_gap_indicator + # checked if its uplink record + + else: + return [self.none_or_value(list[19]), self.none_or_value(list[20]), self.none_or_value(list[34]), self.none_or_value(list[35]), self.none_or_value(list[45]), self.none_or_value(list[43]), 
self.none_or_value(list[11]), self.none_or_value(list[12]), self.none_or_value(list[22]), self.none_or_value(list[26]), self.none_or_value(list[27]), self.none_or_value(list[32]), self.none_or_value(list[33]), self.none_or_value(list[36])] + # m_u_destination_ip , m_u_destination_port , m_u_source_ip , m_u_source_port, subscriber_id , subscriber_msisdn , media_active_time_seconds , media_completed_indicator , media_long_call_indicator , media_middle_gap_indicator , media_short_call_indicator , media_single_direction_indicator , media_start_gap_indicator + # checked if its down_link record + + def validate_media_plane(self, validation_entities): + hashed_row_dict = {} + min_time = validation_entities.sqlite_adapter.select("select min(cal_timestamp_time) from %s_input1" % self.job_alias)[-1][0] + + max_time = validation_entities.sqlite_adapter.select("select max(cal_timestamp_time) from %s_input1" % self.job_alias)[-1][0] + + time_range_list = self.comman_action.get_time_range_list(min_time, max_time, self.frequency, date_time_format=self.date_format_usual) + media_header = ['m_hash_tuple', 'm_timestamp', 'm_u_source_ip', 'm_u_source_port', 'm_u_destination_ip', + 'm_u_destination_port', 'm_u_imsi', 'm_u_msisdn', 'm_u_call_duration', 'm_u_call_completed', + 'm_u_end_gap_indicator', 'm_u_long_call_indicator', 'm_u_middle_gap_indicator', + 'm_u_short_call_indicator', 'm_u_one_way_audio', 'm_u_start_gap_indicator', + 'm_u_weighted_jitter_total', 'm_u_weighted_mos_total', 'm_u_weighted_packet_loss_total', + 'm_u_weighted_rtd_total', 'm_u_jitter_sum', 'm_u_rtd_sum', 'm_u_packet_loss_sum', + 'm_u_degradation_sum', 'm_u_cell_id', 'm_d_destination_ip', 'm_d_destination_port', + 'm_d_source_ip', 'm_d_source_port', 'm_d_imsi', 'm_d_msisdn', 'm_d_call_duration', + 'm_d_call_completed', 'm_d_end_gap_indicator', 'm_d_long_call_indicator', + 'm_d_middle_gap_indicator', 'm_d_short_call_indicator', 'm_d_one_way_audio', + 'm_d_start_gap_indicator', 'm_d_weighted_jitter_total', 
'm_d_weighted_mos_total', + 'm_d_weighted_packet_loss_total', 'm_d_weighted_rtd_total', 'm_d_jitter_sum', 'm_d_rtd_sum', + 'm_d_packet_loss_sum', 'm_d_degradation_sum', 'm_d_cell_id', 'm_error_code', 'm_ue_ip', + 'm_msisdn', 'm_imsi', 'm_cell_id', 'm_call_id', 'm_weighted_mos_sum', 'm_weighted_jitter_sum', + 'm_weighted_packet_loss_sum', 'm_weighted_rtd_sum', 'm_degradation_sum', 'm_jitter_sum', + 'm_packet_loss_sum', 'm_rtd_sum', 'm_mos', 'm_jitter', 'm_packet_loss', 'm_rtd', + 'm_binned_timestamp', 'sql_timestamp'] + final_media_dump = [media_header] + time_columns = self.comman_action.get_time_colums(frequency=self.frequency) + final_media_dump[0].extend(time_columns.split(',')) + for time_range in time_range_list: + where_clause = "cal_timestamp_time >= '%s' and cal_timestamp_time < '%s'" % (time_range[0], time_range[1]) + total_dump = validation_entities.sqlite_adapter.select("select * from %s_input1 where %s" % (self.job_alias, where_clause)) + temp_list = [] + hashed_row_dict = {} + for row in total_dump[1:]: + temp_dict = {} + row = list(row) + if row[19].startswith("2405:0203") or row[19].startswith("2405:0204") or row[19].startswith("2405:0205") or row[19].startswith("2409:4000"): + hash_value = self.generate_hash(row[19], row[20], row[34], row[35]) + elif row[34].startswith("2405:0203") or row[34].startswith("2405:0204") or row[34].startswith("2405:0205") or row[34].startswith("2409:4000"): + hash_value = self.generate_hash(row[34], row[35], row[19], row[20]) + else: + continue + if hash_value in hashed_row_dict.keys(): + if row[19].startswith("2405:0203") or row[19].startswith("2405:0204") or row[19].startswith("2405:0205") or row[19].startswith("2409:4000"): + hashed_row_dict[hash_value]["downlink"] = row + elif row[34].startswith("2405:0203") or row[34].startswith("2405:0204") or row[34].startswith("2405:0205") or row[34].startswith("2409:4000"): + hashed_row_dict[hash_value]["uplink"] = row + else: + continue + else: + if 
row[19].startswith("2405:0203") or row[19].startswith("2405:0204") or row[19].startswith("2405:0205") or row[19].startswith("2409:4000"): + temp_dict = {"downlink": row, "uplink": []} + elif row[34].startswith("2405:0203") or row[34].startswith("2405:0204") or row[34].startswith("2405:0205") or row[34].startswith("2409:4000"): + temp_dict = {"downlink": [], "uplink": row} + else: + continue + hashed_row_dict[hash_value] = temp_dict + + for hash_value in hashed_row_dict.keys(): + error_code_list = ['1', '1'] + media_uplink_row = hashed_row_dict[hash_value]["uplink"] + media_downlink_row = hashed_row_dict[hash_value]["downlink"] + media_u_row = self.link_specific_list(media_uplink_row) + media_d_row = self.link_specific_list(media_downlink_row) + if len(media_uplink_row) == 0: + m_timestamp = (re.sub('[^A-Za-z0-9]+', "", media_downlink_row[1])) + m_binned_timestamp = (re.sub('[^A-Za-z0-9]+', "", self.comman_action.computeBinTime(media_downlink_row[1]))) + sql_timestamp = self.comman_action.date_format_changer(m_binned_timestamp, "%Y%m%d%H%M%S","%Y-%m-%d %H:%M:%S") + m_ueip = media_downlink_row[19] + m_msisdn = media_uplink_row[43] + m_imsi = media_uplink_row[45] + error_code_list[0] = 0 + m_u_wei_mos = 0 + m_u_wei_jitter = 0 + m_u_wei_pakt_los = 0 + m_u_wei_rtd = 0 + m_u_degradation_sum = 0 + m_u_jitter_sum = 0 + m_u_pakt_los_sum = 0 + m_u_rtd_sum = 0 + m_u_cell_id = -1 + m_u_calculated_item = [str(m_u_wei_jitter), str(m_u_wei_mos), str(m_u_wei_pakt_los), str(m_u_wei_rtd), str(m_u_jitter_sum), str(m_u_rtd_sum), str(m_u_pakt_los_sum), str(m_u_degradation_sum), str(m_u_cell_id)] + m_cell_id = m_u_cell_id + + else: + m_timestamp = (re.sub('[^A-Za-z0-9]+', "", media_uplink_row[1])) + m_binned_timestamp = (re.sub('[^A-Za-z0-9]+', "", self.comman_action.computeBinTime(media_uplink_row[1]))) + sql_timestamp = self.comman_action.date_format_changer(m_binned_timestamp, "%Y%m%d%H%M%S","%Y-%m-%d %H:%M:%S") + m_ueip = media_uplink_row[34] + m_msisdn = media_uplink_row[43] + 
m_imsi = media_uplink_row[45] + m_u_wei_mos = int(media_uplink_row[38]) + m_u_wei_jitter = int(media_uplink_row[37]) + m_u_wei_pakt_los = int(media_uplink_row[39]) + m_u_wei_rtd = int(media_uplink_row[40]) + m_u_degradation_sum = int(media_uplink_row[13]) + int(media_uplink_row[14]) + int(media_uplink_row[15]) + m_u_jitter_sum = int(media_uplink_row[23]) + int(media_uplink_row[24]) + int(media_uplink_row[25]) + m_u_pakt_los_sum = str(int(media_uplink_row[28]) + int(media_uplink_row[29]) + int(media_uplink_row[30])) + m_u_rtd_sum = int(media_uplink_row[16]) + int(media_uplink_row[17]) + int(media_uplink_row[18]) + m_u_cell_id = self.generate_cell_id(media_uplink_row[6], media_uplink_row[7], media_uplink_row[3]) + m_u_calculated_item = [str(m_u_wei_jitter), str(m_u_wei_mos), str(m_u_wei_pakt_los), str(m_u_wei_rtd), str(m_u_jitter_sum), str(m_u_rtd_sum), str(m_u_pakt_los_sum), str(m_u_degradation_sum), str(m_u_cell_id)] + m_cell_id = m_u_cell_id + + if len(media_downlink_row) == 0: + m_timestamp = (re.sub('[^A-Za-z0-9]+', "", media_uplink_row[1])) + m_binned_timestamp = (re.sub('[^A-Za-z0-9]+', "", self.comman_action.computeBinTime(media_uplink_row[1]))) + sql_timestamp = self.comman_action.date_format_changer(m_binned_timestamp, "%Y%m%d%H%M%S","%Y-%m-%d %H:%M:%S") + m_ueip = media_uplink_row[34] + m_msisdn = media_downlink_row[43] + m_imsi = media_downlink_row[45] + error_code_list[1] = 0 + m_d_wei_mos = 0 + m_d_wei_jitter = 0 + m_d_wei_pakt_los = 0 + m_d_wei_rtd = 0 + m_d_degradation_sum = 0 + m_d_jitter_sum = 0 + m_d_pakt_los_sum = 0 + m_d_rtd_sum = 0 + m_d_cell_id = -1 + m_d_calculated_item = [str(m_d_wei_jitter), str(m_d_wei_mos), str(m_d_wei_pakt_los), str(m_d_wei_rtd), str(m_d_jitter_sum), str(m_d_rtd_sum), str(m_d_pakt_los_sum), str(m_d_degradation_sum), str(m_d_cell_id)] + m_cell_id = m_d_cell_id + + else: + m_timestamp = (re.sub('[^A-Za-z0-9]+', "", media_downlink_row[1])) + m_binned_timestamp = (re.sub('[^A-Za-z0-9]+', "", 
self.comman_action.computeBinTime(media_downlink_row[1]))) + sql_timestamp = self.comman_action.date_format_changer(m_binned_timestamp, "%Y%m%d%H%M%S", "%Y-%m-%d %H:%M:%S") + m_ueip = media_downlink_row[19] + m_msisdn = media_downlink_row[43] + m_imsi = media_downlink_row[45] + m_d_wei_mos = int(media_downlink_row[38]) + m_d_wei_jitter = int(media_downlink_row[37]) + m_d_wei_pakt_los = int(media_downlink_row[39]) + m_d_wei_rtd = int(media_downlink_row[40]) + m_d_degradation_sum = int(media_downlink_row[13]) + int(media_downlink_row[14]) + int(media_downlink_row[15]) + m_d_jitter_sum = int(media_downlink_row[23]) + int(media_downlink_row[24]) + int(media_downlink_row[25]) + m_d_pakt_los_sum = int(media_downlink_row[28]) + int(media_downlink_row[29]) + int(media_downlink_row[30]) + m_d_rtd_sum = int(media_downlink_row[16]) + int(media_downlink_row[17]) + int(media_downlink_row[18]) + m_d_cell_id = self.generate_cell_id(media_downlink_row[6], media_downlink_row[7], media_downlink_row[3]) + m_d_calculated_item = [str(m_d_wei_jitter), str(m_d_wei_mos), str(m_d_wei_pakt_los), str(m_d_wei_rtd), str(m_d_jitter_sum), str(m_d_rtd_sum), str(m_d_pakt_los_sum), str(m_d_degradation_sum), str(m_d_cell_id)] + m_cell_id = m_d_cell_id + + m_hash_tuple = hash_value + m_error_code = self.generate_binary_error_code(error_code_list) + m_call_id = "m_" + str(m_ueip) + "_" + str(m_hash_tuple) + m_wei_mos = int(m_u_wei_mos) + int(m_d_wei_mos) + m_wei_jitter = int(m_u_wei_jitter) + int(m_d_wei_jitter) + m_wei_pkt_los = int(m_u_wei_pakt_los) + int(m_d_wei_pakt_los) + m_wei_rtd = int(m_u_wei_rtd) + int(m_d_wei_rtd) + m_degradation_sum = int(m_u_degradation_sum) + int(m_d_degradation_sum) + m_jitter_sum = int(m_u_jitter_sum) + int(m_d_jitter_sum) + m_pkt_loss_sum = int(m_u_pakt_los_sum) + int(m_d_pakt_los_sum) + m_rtd_sum = int(m_u_rtd_sum) + int(m_d_rtd_sum) + m_mos = float(m_wei_mos)/m_degradation_sum/100 + m_jitter = float(m_wei_jitter)/m_jitter_sum + m_packet_loss = 
float(m_wei_pkt_los)/m_pkt_loss_sum/100 + m_rtd = float(m_wei_rtd)/m_rtd_sum + + time_array = self.comman_action.time_column_needful(self.frequency, m_binned_timestamp) + + media_output_list = [str(m_hash_tuple), str(m_timestamp)] + media_u_row + m_u_calculated_item + media_d_row + m_d_calculated_item + [str(m_error_code), str(m_ueip), str(m_msisdn), str(m_imsi), str(m_cell_id), str(m_call_id), str(m_wei_mos), str(m_wei_jitter), str(m_wei_pkt_los), str(m_wei_rtd), str(m_degradation_sum), str(m_jitter_sum), str(m_pkt_loss_sum), str(m_rtd_sum), str(m_mos), str(m_jitter), str(m_packet_loss), str(m_rtd), str(m_binned_timestamp), str(sql_timestamp)] + time_array + temp_list.append(media_output_list) + final_media_dump.extend(temp_list) + validation_entities.output_obj[self.job_alias]["output1"] = final_media_dump \ No newline at end of file diff --git a/chaostoolkit_nimble/controllers/base/__init__.py b/chaostoolkit_nimble/controllers/base/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chaostoolkit_nimble/controllers/base/control.py b/chaostoolkit_nimble/controllers/base/control.py new file mode 100644 index 0000000..8ae43c5 --- /dev/null +++ b/chaostoolkit_nimble/controllers/base/control.py @@ -0,0 +1,38 @@ +from chaoslib.types import Configuration, \ + Experiment, Secrets, Settings +from logzero import logger +from nimble.core import global_constants + +from nimble.core.entity.node_manager import NodeManager +from nimble.core.utils.shell_utils import ShellUtils + + +def configure_control(configuration: Configuration = None, + secrets: Secrets = None, settings: Settings = None, + experiment: Experiment = None): + """ + Configure the control's global state + + This is called once only per Chaos Toolkit's run and should be used to + initialize any state your control may require. + + The `settings` are only passed when the control is declared in the + settings file of the Chaos Toolkit. 
+ """ + # testbed_file = "/Users/kritika.saxena/chaos_folder_3/chaos_eng_automation/chaostoolkit_nimble/resources/testbeds/open_nebula_135_35.yml" + # component_arttributes_file = "/Users/kritika.saxena/chaos_folder_3/chaos_eng_automation/chaostoolkit_nimble/resources/components/component_attributes_kerberos.yml" + # global OPTIONS_DICT + # testbed_file = OPTIONS_DICT["--testbed"] + # component_arttributes_file = OPTIONS_DICT["--componentAttributesConfig"] + # testbed_file = os.environ['TESTBED_FILE'] + # component_arttributes_file = os.environ['COMPONENT_ATTRIBUTES_FILE'] + + + setup_files_base_path = "%s/setup" % global_constants.DEFAULT_LOCAL_TMP_PATH + testbed_file = ShellUtils.execute_shell_command( + ShellUtils.find_files_in_directory(setup_files_base_path, file_name_regex="open_nebula_*")).stdout + component_attributes_file = ShellUtils.execute_shell_command( + ShellUtils.find_files_in_directory(setup_files_base_path, file_name_regex="component_*")).stdout + NodeManager.initialize(testbed_file, component_attributes_file) + logger.debug("NODE_OBJ FROM BASE CONTROLLER----------------: %s" % NodeManager.node_obj.vip) + diff --git a/chaostoolkit_nimble/controllers/jio/__init__.py b/chaostoolkit_nimble/controllers/jio/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chaostoolkit_nimble/controllers/jio/control.py b/chaostoolkit_nimble/controllers/jio/control.py new file mode 100644 index 0000000..95f8cc8 --- /dev/null +++ b/chaostoolkit_nimble/controllers/jio/control.py @@ -0,0 +1,47 @@ +from time import sleep + +from chaoslib.types import Configuration, \ + Experiment, Run, Secrets, Activity +from chaostoolkit_nimble.controllers.base import control +from logzero import logger + +control.configure_control() + +def after_activity_control(context: Activity, state: Run, + configuration: Configuration = None, + secrets: Secrets = None, **kwargs): + """ + after-control of the activity's execution + + Called by the Chaos Toolkit before the activity 
is applied. The result of + the execution is passed as `state`. See + https://docs.chaostoolkit.org/reference/api/journal/#run for more + information. + """ + logger.debug("----------------STATE AFTER ACTIVITY: %s" % state) + + +def before_method_control(context: Experiment, + configuration: Configuration = None, + secrets: Secrets = None, **kwargs): + """ + before-control of the method's execution + + Called by the Chaos Toolkit before the activities of the method are + applied. + """ + logger.debug("----------------CONFIGURATION BEFORE METHOD: %s" % configuration) + sleep(120) + + +def after_method_control(context: Experiment, + configuration: Configuration = None, + secrets: Secrets = None, **kwargs): + """ + before-control of the method's execution + + Called by the Chaos Toolkit before the activities of the method are + applied. + """ + logger.debug("----------------CONFIGURATION AFTER METHOD: %s" % configuration) + sleep(120) diff --git a/chaostoolkit_nimble/core/utils/ha_utils.py b/chaostoolkit_nimble/core/utils/ha_utils.py index 2c241b6..a9fb0f0 100644 --- a/chaostoolkit_nimble/core/utils/ha_utils.py +++ b/chaostoolkit_nimble/core/utils/ha_utils.py @@ -45,16 +45,13 @@ def check_process_running(component, process_name=None): - # if not process_name - # process_name = Components.get_process_name(component) - # logger.info("Checking if process '%s' is running by fetching its process id" % process_name) - # testbed_file = "chaostoolkit_nimble/resources/testbeds/open_nebula_135_52.yml" - # component_arttributes_file = "chaostoolkit_nimble/resources/components/component_attributes_kerberos.yml" - # NodeManager.initialize(testbed_file, component_arttributes_file) - logger.debug("NODE_OBJ----------------: %s" % NodeManager.node_obj.vip) - logger.debug("b = ----------------: %s" % fabric_utils.b ) - # return NodeManager.node_obj.execute_command_on_component(component, ShellUtils.fetch_process_id(process_name), - # consolidated_success_flag=True) + if not 
process_name: + process_name = Components.get_process_name(component) + logger.info("Checking if process '%s' is running by fetching its process id." % process_name) + logger.debug("NODE_OBJ FROM ha_utils: ----------------: %s" % NodeManager.node_obj.vip) + # logger.debug("b = ----------------: %s" % fabric_utils.b ) + return NodeManager.node_obj.execute_command_on_component(component, ShellUtils.fetch_process_id(process_name), + consolidated_success_flag=True) @@ -70,12 +67,13 @@ def kill_process(process_name, component, num_of_nodes=None): for node in NodeManager.node_obj.nodes_by_type[component]: node_aliases.append(node.name) if num_of_nodes: - node_aliases = random.sample(node_aliases, num_of_nodes) + node_aliases = random.sample(node_aliases, int(num_of_nodes)) command = ShellUtils.kill_process_by_name(process_name) response_list = [] for node_alias in node_aliases: + logger.debug("Killing process '%s' on node '%s'" % (process_name, node_alias)) response_list.append( - NodeManager.node_obj.execute_command_on_node(node_alias, command, consolidated_success_flag=False)) + NodeManager.node_obj.execute_command_on_node(node_alias, command)) return response_list diff --git a/chaostoolkit_nimble/resources/exp_templates/jio/shell_app_exp.json b/chaostoolkit_nimble/resources/exp_templates/jio/shell_app_exp.json new file mode 100644 index 0000000..ed53b65 --- /dev/null +++ b/chaostoolkit_nimble/resources/exp_templates/jio/shell_app_exp.json @@ -0,0 +1,44 @@ +{ + "version": "1.0.0", + "title": "Experiment running shell commands on local and remote", + "description": "Shell command should get successfully executed on remote using python provider through fabric and on local it should run through process provider", + "tags": ["kubernetes"], + "controls": [{ + "name": "jio-15min-job-controls", + "provider": { + "type": "python", + "module": "chaostoolkit_nimble.controllers.jio.control" + } + }], + "steady-state-hypothesis": { + "title": "Shell application is up and running", 
+ "probes": [{ + "type": "probe", + "name": "Fetch-shell-app-process-id", + "tolerance": true, + "provider": { + "module": "chaostoolkit_nimble.core.utils.ha_utils", + "type": "python", + "func": "check_process_running", + "arguments": { + "component": "rand_dynamic_component", + "process_name": "rand_dynamic_process_name" + } + } + }] + }, + "method": [{ + "type": "action", + "name": "Kill-shell-app", + "provider": { + "module": "chaostoolkit_nimble.core.utils.ha_utils", + "type": "python", + "func": "kill_process", + "arguments": { + "component": "rand_dynamic_component", + "process_name": "rand_dynamic_process_name", + "num_of_nodes": "1" + } + } + }] +} diff --git a/chaostoolkit_nimble/resources/validation/sample_validation_config.yml b/chaostoolkit_nimble/resources/validation/sample_validation_config.yml new file mode 100644 index 0000000..df9b95e --- /dev/null +++ b/chaostoolkit_nimble/resources/validation/sample_validation_config.yml @@ -0,0 +1,169 @@ +defaults: + project: platform + build: nightly + golden_build: nightly + customer: thales + mail_to: kritika.saxena@guavus.com,Jyoti.Arora@guavus.com,samarth.goel@guavus.com + headers: True + delimiter: "," + aggregate: False + sort_columns: False + config_separator: "=" +jobs: + hive_to_hive: + input: + input1: + source: hive + db_name: randynamic_db_name + table_name: randynamic_table_name + where_clause: + select_statement: '*' + group_by_clause: + output: + output1: + actual: + source: hive + db_name: aut_squad_hive_db + table_name: aut_squad_hive_table + where_clause: + ignore_columns: 
appnameid,deviceid,searchkeyword,httpreferer,deviceosversion,gds,sessionidtype4,sessionidtype3,sessionidtype2,sessionidtype1,duration,mcc,ci,att_content_type,accountid,deviceos,imei,spt2category,contenttype,tier1_id,contentupbytes,location8,devicemanufacturer,location7,location6,location5,tier2_id,location4,location3,location2,location1,contentdownbytes,att_os,mobileapp,wcsubid,recordendtime,recordendtimedateformat,apn,mnc,timezone,att_application,contenttypeid,mobileappcategory,url,appprotocol,formatedrat,att_device,ipprotocol,destinationip,ntc_id,devicename,lac,httpmethod,adidtimestamp,sourceip,att_domain,uaidagent,adid,ods3,ods2,ods1,devicegroup,responsecode,lineformat,att_sp,spt1category,serviceprovider,destinationport,downpackets,host,roamingcountry,topologylevel2,sourceport,downbytes,topologylevel1,roamingoperator,recordstarttime,recordstarttimedateformat,time + measures: output_type + psql_to_psql: + input: + input1: + source: psql + db_name: &psql_to_psql_input_db aut_squad_pgsql_db + table_name: &psql_to_psql_input_table aut_squad_pgsql_table + select_statement: '*' + where_clause: + group_by_clause: + output: + output1: + actual: + source: psql + db_name: aut_squad_pgsql_db + table_name: aut_squad_pgsql_table + where_clause: + ignore_columns: id + measures: description,name,url,displayname + hdfs_to_hdfs_parquet: + input: + input1: + source: hdfs + base_path: &hdfs_to_hdfs_parquet_input_base_path /tmp/aut_squad_dir/parquet + file_format: parquet + delimiter: "\t" + compression: gz + headers: False + output: + output1: + actual: + source: hdfs + base_path: /tmp/aut_squad_dir/parquet + aggregation_level: 0 + file_format: parquet + delimiter: "\t" + compression: gz + headers: False + &hdfs_csv hdfs_to_hdfs_csv: +# configs: +# config1: +# location: local +# base_path: "/tmp/aut_squad_dir/automation_squad.yml" +# file_format: yaml +# components: +# - MANAGEMENT +# - NAMENODE +# properties: +# "[2].tasks[0].docker_image.name": "image3" +# schedule: +# source: 
api +# module: nimble.actions.sample.sample_actions +# class: +# class_name: SampleActions +# class_parameters: +# - *hdfs_csv +# method: +# method_name: schedule_job_via_api + input: + input1: + source: hdfs + base_path: /tmp/aut_squad_dir/csv + file_format: csv + file_name_flag: False + output: + output1: + actual: + source: hdfs + base_path: /tmp/aut_squad_dir/csv + file_format: csv + ignore_diff_column_indexes: 1,3 + output2: + actual: + source: hdfs + base_path: /tmp/aut_squad_dir/csv + file_format: csv + ignore_diff_column_indexes: 1,3 + local_to_hdfs: + input: + input1: + source: local + base_path: /tmp/aut_squad_dir/csv + file_format: csv + file_name_flag: True + components: + - MASTER_NAMENODE + output: + output1: + actual: + source: hdfs + base_path: &local_to_hdfs_output_base_path /tmp/aut_squad_dir/csv + file_format: csv + fileserver_to_hdfs: + input: + input1: + source: file_server + base_path: modules/platform/validation/aut_squad_test_data/input_output_sources/fileserver/csv/ + file_format: csv + output: + output1: + actual: + source: hdfs + base_path: &fileserver_to_hdfs_output_base_path /tmp/aut_squad_dir/csv + file_format: csv + elasticsearch_to_elasticsearch: + input: + input1: + source: elasticsearch + es_index: randynamic_es_index + output: + output1: + actual: + source: elasticsearch + es_index: randynamic_es_index + hbase_to_hbase: + input: + input1: + source: hbase + table_name: randynamic_table_name + scan_filter: FirstKeyOnlyFilter() + output: + output1: + actual: + source: hbase + table_name: aut_squad_hbase_table + scan_filter: FirstKeyOnlyFilter() + media_plane: + input: + input1: + source: hdfs + base_path: /tmp/partition_date=2019-07-20 + file_format: csv + file_name_flag: False + delimiter: "," + output: + output1: + actual: + source: hive + db_name: network360_volte + table_name: media_plane_table \ No newline at end of file diff --git a/chaostoolkit_nimble/tests/conftest.py b/chaostoolkit_nimble/tests/conftest.py index 
0b5dd53..02b59fa 100644 --- a/chaostoolkit_nimble/tests/conftest.py +++ b/chaostoolkit_nimble/tests/conftest.py @@ -5,6 +5,7 @@ import pytest from nimble.core import global_constants +from nimble.core.utils.shell_utils import ShellUtils try: os.makedirs(global_constants.DEFAULT_LOCAL_ARTIFACTS_PATH) @@ -43,8 +44,19 @@ def initialize_node_obj(request): component_arttributes_file = request.config.getoption("--componentAttributesConfig") if not component_arttributes_file: component_arttributes_file = "nimble/resources/components/component_attributes.yml" + setup_files_base_path = "%s/setup" % global_constants.DEFAULT_LOCAL_TMP_PATH if testbed_file: NodeManager.initialize(testbed_file, component_arttributes_file) + ShellUtils.execute_shell_command( + ShellUtils.remove_and_create_directory(setup_files_base_path)) + testbed_file_tmp_path = "%s/%s" % (setup_files_base_path, testbed_file.rsplit("/", 1)[1]) + component_arttributes_file_tmp_path = "%s/%s" % ( + setup_files_base_path, component_arttributes_file.rsplit("/", 1)[1]) + ShellUtils.execute_shell_command(ShellUtils.copy(testbed_file, testbed_file_tmp_path)) + ShellUtils.execute_shell_command( + ShellUtils.copy(component_arttributes_file, component_arttributes_file_tmp_path)) + yield + ShellUtils.execute_shell_command(ShellUtils.remove(setup_files_base_path, recursive=True)) @pytest.fixture(scope="session", autouse=True) diff --git a/chaostoolkit_nimble/tests/sample/test_jio.py b/chaostoolkit_nimble/tests/sample/test_jio.py new file mode 100644 index 0000000..704838b --- /dev/null +++ b/chaostoolkit_nimble/tests/sample/test_jio.py @@ -0,0 +1,85 @@ +import collections + +import allure +import pytest +from chaostoolkit_nimble.actions.base.flows import chaos_user_actions +from chaostoolkit_nimble.actions.jio.media_plane_actions import MediaPlaneActions +from nimble.actions.base.regression.config_actions import ConfigActions +from nimble.core.entity.components import Components +from nimble.core.entity.node_manager 
import NodeManager +from nimble.core.utils.components.hadoop_utils import HadoopCliUtils +from nimble.core.utils.fabric_utils import FabricUtils +from nimble.core.utils.shell_utils import ShellUtils + + +class TestJio(): + job_alias = "media_plane" + # hdfs_path = "/tmp/aut_squad_dir/csv" + job_user = "ambari-qa" + database_name = "network360_volte" + table_name = "media_plane_table" + + # @pytest.fixture(scope="session") + # def sample_actions(self): + # return SampleActions(self.job_alias) + + @pytest.fixture(scope="session") + def media_plane_actions(self): + return MediaPlaneActions(self.job_alias) + + @pytest.fixture(scope="session") + def hadoop_cli_utils(self): + return HadoopCliUtils() + + # @pytest.fixture(scope="session") + # def send_data_to_hdfs(self, user_actions, hadoop_cli_utils): + # # hadoop_cli_utils.remove(self.hdfs_path, recursive=True) + # # user_actions.send_data_to_hdfs( + # # "modules/platform/validation/aut_squad_test_data/input_output_sources/hdfs/csv/", self.hdfs_path) + # # yield + # # hadoop_cli_utils.remove(self.hdfs_path, recursive=True) + # pass + + @pytest.fixture(scope="session") + def config_actions(self): + return ConfigActions() + + @pytest.fixture(scope="session") + def clean_table(self): + command = "hive -e 'drop table if exists %s.%s'" % (self.database_name, self.table_name) + NodeManager.node_obj.execute_command_on_node( + NodeManager.node_obj.get_node_aliases_by_component(Components.MANAGEMENT.name)[0], + ShellUtils.su(self.job_user, command)) + + def test_validation_on_15min_job_ha(self, config_actions, user_actions, media_plane_actions, clean_table): + with allure.step('Schedule 15 min job'): + job_base_directory = "/data/jio_copy/microapp1/" + database_name = "network360_volte" + kwargs = collections.OrderedDict() + kwargs["config_separator"] = "=" + kwargs["location"] = "local" + kwargs["base_path"] = "%s/conf/MediaPlaneJob.json" % job_base_directory + kwargs["file_format"] = "json" + kwargs["components"] = 
[Components.MANAGEMENT.name] + kwargs["properties"] = {"mediaPlaneRawInput.type": "csv", "mediaPlaneRawInput.header": "true", + "mediaPlaneRawInput.pathPrefix": "/tmp/partition_date=", + "mediaPlaneRawInput.tableName": "%s.%s" % (self.database_name, self.table_name)} + config_actions.update_configs(**kwargs) + + job_run_command = "export SPARK_HOME=/usr/hdp/2.6.5.0-292/spark2 && cd %s && nohup sh scripts/media_plane_microapp1.sh &" % ( + job_base_directory) + # guavus_response = NodeManager.node_obj.execute_command_on_node( + # NodeManager.node_obj.get_node_aliases_by_component(Components.MANAGEMENT.name)[0], + # ShellUtils.su(self.job_user, job_run_command), pty=False) + FabricUtils.run_command_on_remote_in_bg(job_run_command, "192.168.134.170", "root", + "guavus@123") + with allure.step('Perform Job HA via chaostoolkit'): + ####### To be decided where to keep all templates -- fileserver could be an option + template_path = "chaostoolkit_nimble/resources/exp_templates/jio/shell_app_exp.json" + variables_dict = {"rand_dynamic_component": Components.MANAGEMENT.name, + "rand_dynamic_process_name": "media_plane_microapp1", + } + chaos_user_actions.run_experiment(template_path, variables_dict) + with allure.step('Validate the data'): + user_actions.validate(media_plane_actions.validate_media_plane, self.job_alias) + # user_actions.validate(sample_actions.validate_hdfs_to_hdfs_csv, self.job_alias) diff --git a/chaostoolkit_nimble/tests/sample/test_shell_app_exp.py b/chaostoolkit_nimble/tests/sample/test_shell_app_exp.py index 3489f16..1f8cbed 100644 --- a/chaostoolkit_nimble/tests/sample/test_shell_app_exp.py +++ b/chaostoolkit_nimble/tests/sample/test_shell_app_exp.py @@ -5,7 +5,7 @@ from nimble.core.utils.dynamic_substitution_utils import DynamicSubstitutionUtils from nimble.core.utils.multiprocessing_utils import MultiprocessingUtils -from chaostoolkit_nimble.actions.base.flows import user_actions +from chaostoolkit_nimble.actions.base.flows import chaos_user_actions 
from chaostoolkit_nimble.actions.sample import sample_application_actions _LOGGER = logging.getLogger(__name__) @@ -60,7 +60,7 @@ def test_application_ha(self): "rand_dynamic_process_name": "sleep 5m", } DynamicSubstitutionUtils.add(variables) - user_actions.run_experiment(experiments_template_path) + chaos_user_actions.run_experiment(experiments_template_path) ########################################################################################################## # env = Environment(loader=PackageLoader("chaostoolkit_nimble", "resources/templates/shell_application")) diff --git a/journal.json b/journal.json index d711e9d..0c1c341 100644 --- a/journal.json +++ b/journal.json @@ -14,7 +14,7 @@ "name": "shell-app-controls", "provider": { "type": "python", - "module": "chaostoolkit_nimble.controllers.extensions.shell_app.control" + "module": "chaostoolkit_nimble.controllers.jio.control" } } ], @@ -30,8 +30,8 @@ "type": "python", "func": "check_process_running", "arguments": { - "component": "NAMENODE", - "process_name": "sleep 5m" + "component": "MANAGEMENT", + "process_name": "media_plane_microapp1" } } } @@ -46,8 +46,8 @@ "type": "python", "func": "kill_process", "arguments": { - "component": "NAMENODE", - "process_name": "sleep 5m", + "component": "MANAGEMENT", + "process_name": "media_plane_microapp1", "num_of_nodes": "1" } } @@ -55,12 +55,12 @@ ], "dry": false }, - "start": "2019-07-18T10:50:51.666552", - "status": "failed", + "start": "2019-08-01T12:32:31.708887", + "status": "completed", "deviated": false, "steady_states": { "before": { - "steady_state_met": false, + "steady_state_met": true, "probes": [ { "activity": { @@ -72,33 +72,63 @@ "type": "python", "func": "check_process_running", "arguments": { - "component": "NAMENODE", - "process_name": "sleep 5m" + "component": "MANAGEMENT", + "process_name": "media_plane_microapp1" } } }, - "output": null, - "status": "failed", - "exception": [ - "Traceback (most recent call last):\n", - " File 
\"/Users/kritika.saxena/KR_VIRENV_CHAOS_NIMBLE_PY3/venv/lib/python3.7/site-packages/chaoslib/provider/python.py\", line 55, in run_python_activity\n return func(**arguments)\n", - " File \"/Users/kritika.saxena/chaos_folder_3/chaos_eng_automation/chaostoolkit_nimble/core/utils/ha_utils.py\", line 42, in check_process_running\n consolidated_success_flag=True)\n", - " File \"/Users/kritika.saxena/KR_VIRENV_CHAOS_NIMBLE_PY3/venv/lib/python3.7/site-packages/nimble/core/entity/nodes.py\", line 109, in execute_command_on_component\n for node_attributes in self.nodes_by_type[component]:\n", - "KeyError: 'NAMENODE'\n", - "\nDuring handling of the above exception, another exception occurred:\n\n", - "chaoslib.exceptions.ActivityFailed: KeyError: 'NAMENODE'\n" - ], - "start": "2019-07-18T10:50:51.668208", - "end": "2019-07-18T10:50:51.670135", - "duration": 0.001927, - "tolerance_met": false + "output": true, + "status": "succeeded", + "start": "2019-08-01T12:32:31.710821", + "end": "2019-08-01T12:32:31.977866", + "duration": 0.267045, + "tolerance_met": true } ] }, - "after": null + "after": { + "steady_state_met": true, + "probes": [ + { + "activity": { + "type": "probe", + "name": "Fetch-shell-app-process-id", + "tolerance": true, + "provider": { + "module": "chaostoolkit_nimble.core.utils.ha_utils", + "type": "python", + "func": "check_process_running", + "arguments": { + "component": "MANAGEMENT", + "process_name": "media_plane_microapp1" + } + } + }, + "output": true, + "status": "succeeded", + "start": "2019-08-01T12:36:32.104985", + "end": "2019-08-01T12:36:32.237630", + "duration": 0.132645, + "tolerance_met": true + } + ] + } }, - "run": [], - "rollbacks": [], - "end": "2019-07-18T10:50:51.671344", - "duration": 0.025391817092895508 -} \ No newline at end of file + "run": [ + { + "activity": { + "type": "action", + "name": "Kill-shell-app", + "provider": { + "module": "chaostoolkit_nimble.core.utils.ha_utils", + "type": "python", + "func": "kill_process", + 
"arguments": { + "component": "MANAGEMENT", + "process_name": "media_plane_microapp1", + "num_of_nodes": "1" + } + } + }, + "output": [ + \ No newline at end of file From 5de8c1cd41e9e3158964e0f32008c3ab871d4bbd Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Thu, 1 Aug 2019 19:16:05 +0530 Subject: [PATCH 05/40] AUT-544-new: Adding chaos validation code w.r.t jio --- .../controllers/jio/control.py | 2 +- chaostoolkit_nimble/tests/sample/test_jio.py | 33 ++++++++++++++++--- journal.json | 16 ++++----- notes | 1 + 4 files changed, 39 insertions(+), 13 deletions(-) create mode 100644 notes diff --git a/chaostoolkit_nimble/controllers/jio/control.py b/chaostoolkit_nimble/controllers/jio/control.py index 95f8cc8..bf7ed70 100644 --- a/chaostoolkit_nimble/controllers/jio/control.py +++ b/chaostoolkit_nimble/controllers/jio/control.py @@ -31,7 +31,7 @@ def before_method_control(context: Experiment, applied. """ logger.debug("----------------CONFIGURATION BEFORE METHOD: %s" % configuration) - sleep(120) + sleep(15) def after_method_control(context: Experiment, diff --git a/chaostoolkit_nimble/tests/sample/test_jio.py b/chaostoolkit_nimble/tests/sample/test_jio.py index 704838b..d3a81a6 100644 --- a/chaostoolkit_nimble/tests/sample/test_jio.py +++ b/chaostoolkit_nimble/tests/sample/test_jio.py @@ -54,11 +54,14 @@ def clean_table(self): def test_validation_on_15min_job_ha(self, config_actions, user_actions, media_plane_actions, clean_table): with allure.step('Schedule 15 min job'): job_base_directory = "/data/jio_copy/microapp1/" - database_name = "network360_volte" + job_config_file = "%s/conf/MediaPlaneJob.json" % job_base_directory + job_script_file = "%s/scripts/media_plane_microapp1.sh" % job_base_directory + + ######## Update job config file kwargs = collections.OrderedDict() kwargs["config_separator"] = "=" kwargs["location"] = "local" - kwargs["base_path"] = "%s/conf/MediaPlaneJob.json" % job_base_directory + kwargs["base_path"] = job_config_file 
kwargs["file_format"] = "json" kwargs["components"] = [Components.MANAGEMENT.name] kwargs["properties"] = {"mediaPlaneRawInput.type": "csv", "mediaPlaneRawInput.header": "true", @@ -66,13 +69,35 @@ def test_validation_on_15min_job_ha(self, config_actions, user_actions, media_pl "mediaPlaneRawInput.tableName": "%s.%s" % (self.database_name, self.table_name)} config_actions.update_configs(**kwargs) + ######## Update job script file + NodeManager.node_obj.execute_command_on_node( + NodeManager.node_obj.get_node_aliases_by_component(Components.MANAGEMENT.name)[0], + ShellUtils.find_and_replace_whole_line_in_file("basedirectory=", + "basedirectory=/data/jio_copy/microapp1/", + job_script_file)) + NodeManager.node_obj.execute_command_on_node( + NodeManager.node_obj.get_node_aliases_by_component(Components.MANAGEMENT.name)[0], + ShellUtils.find_and_replace_whole_line_in_file("lastdayepoch=", + """lastdayepoch=`date -d "2019-07-20 05:30:00" +%s`""", + job_script_file)) + NodeManager.node_obj.execute_command_on_node( + NodeManager.node_obj.get_node_aliases_by_component(Components.MANAGEMENT.name)[0], + ShellUtils.find_and_replace_in_file("--timeIncrementInFilesInMin=15", + "--timeIncrementInFilesInMin=15", + job_script_file)) + NodeManager.node_obj.execute_command_on_node( + NodeManager.node_obj.get_node_aliases_by_component(Components.MANAGEMENT.name)[0], + ShellUtils.find_and_replace_in_file("--durationOfDataToProcessInMin=15", + "--durationOfDataToProcessInMin=15", + job_script_file)) + ####### Run job script on managemnet node job_run_command = "export SPARK_HOME=/usr/hdp/2.6.5.0-292/spark2 && cd %s && nohup sh scripts/media_plane_microapp1.sh &" % ( job_base_directory) # guavus_response = NodeManager.node_obj.execute_command_on_node( # NodeManager.node_obj.get_node_aliases_by_component(Components.MANAGEMENT.name)[0], # ShellUtils.su(self.job_user, job_run_command), pty=False) - FabricUtils.run_command_on_remote_in_bg(job_run_command, "192.168.134.170", "root", - 
"guavus@123") + FabricUtils.run_command_on_remote_in_bg(ShellUtils.su(self.job_user, job_run_command), "192.168.134.170", + "root", "guavus@123") with allure.step('Perform Job HA via chaostoolkit'): ####### To be decided where to keep all templates -- fileserver could be an option template_path = "chaostoolkit_nimble/resources/exp_templates/jio/shell_app_exp.json" diff --git a/journal.json b/journal.json index 0c1c341..88366d1 100644 --- a/journal.json +++ b/journal.json @@ -11,7 +11,7 @@ ], "controls": [ { - "name": "shell-app-controls", + "name": "jio-15min-job-controls", "provider": { "type": "python", "module": "chaostoolkit_nimble.controllers.jio.control" @@ -55,7 +55,7 @@ ], "dry": false }, - "start": "2019-08-01T12:32:31.708887", + "start": "2019-08-01T13:00:58.814917", "status": "completed", "deviated": false, "steady_states": { @@ -79,9 +79,9 @@ }, "output": true, "status": "succeeded", - "start": "2019-08-01T12:32:31.710821", - "end": "2019-08-01T12:32:31.977866", - "duration": 0.267045, + "start": "2019-08-01T13:00:58.816403", + "end": "2019-08-01T13:00:59.044517", + "duration": 0.228114, "tolerance_met": true } ] @@ -106,9 +106,9 @@ }, "output": true, "status": "succeeded", - "start": "2019-08-01T12:36:32.104985", - "end": "2019-08-01T12:36:32.237630", - "duration": 0.132645, + "start": "2019-08-01T13:03:14.180425", + "end": "2019-08-01T13:03:14.306980", + "duration": 0.126555, "tolerance_met": true } ] diff --git a/notes b/notes new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/notes @@ -0,0 +1 @@ + From dcaeacc94c07850ec69e896b6f301704b9b9b016 Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Thu, 1 Aug 2019 19:16:58 +0530 Subject: [PATCH 06/40] AUT-544-new: Adding chaos validation code w.r.t jio --- notes | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/notes b/notes index 8b13789..3c5401d 100644 --- a/notes +++ b/notes @@ -1 +1,13 @@ +-----------------------------------------------------------------------Nimble 
fixes/enhancements for chaos: +/Users/kritika.saxena/KR_VIRENV_CHAOS_NIMBLE_PY3_DEP/venv/lib/python3.7/site-packages/nimble/core/utils/shell_utils.py /us/bin/cp ---> cp + +return "%s | xargs --verbose -r %s" % (ShellUtils.fetch_process_id(process_name), ShellUtils.kill_process_by_id("")) + +execute_command_on node/ execute_remote_command / run_remote_command --> pty property add + +@staticmethod +def find_and_replace_whole_line_in_file(find, replace, input_file): + find = find.replace("/", "\\/") + replace = replace.replace("/", "\\/") + return "sed -i '/%s/c\%s' %s" % (find, replace, input_file) \ No newline at end of file From efc462793acd66f3a1d19f9bb2b5e1afdc4f5a5b Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Mon, 5 Aug 2019 19:33:58 +0530 Subject: [PATCH 07/40] AUT-544-new: Jinja templating + modularizing jio job schedule actions + added method for remote background command in node, shell fabric utils --- .../actions/base/flows/chaos_user_actions.py | 68 ++++--- .../actions/jio/media_plane_actions.py | 188 ++++++++++++++---- .../{extensions => chaosk8s}/__init__.py | 0 .../{extensions => }/chaosk8s/control.py | 0 .../extensions/shell_app/control.py | 32 --- .../controllers/jio/__init__.py | 0 .../chaosk8s => process}/__init__.py | 0 .../controllers/{jio => process}/control.py | 2 +- .../shell_app => core/exceptions}/__init__.py | 0 .../core/exceptions/custom_exceptions.py | 7 + chaostoolkit_nimble/core/utils/ha_utils.py | 17 +- .../resources/exp_templates/process/exp.json | 44 ++++ .../exp_dynamic.json} | 16 +- .../validation/sample_validation_config.yml | 2 +- chaostoolkit_nimble/tests/conftest.py | 2 - chaostoolkit_nimble/tests/sample/test_jio.py | 83 ++------ journal.json | 49 +++-- 17 files changed, 300 insertions(+), 210 deletions(-) rename chaostoolkit_nimble/controllers/{extensions => chaosk8s}/__init__.py (100%) rename chaostoolkit_nimble/controllers/{extensions => }/chaosk8s/control.py (100%) delete mode 100644 
chaostoolkit_nimble/controllers/extensions/shell_app/control.py delete mode 100644 chaostoolkit_nimble/controllers/jio/__init__.py rename chaostoolkit_nimble/controllers/{extensions/chaosk8s => process}/__init__.py (100%) rename chaostoolkit_nimble/controllers/{jio => process}/control.py (99%) rename chaostoolkit_nimble/{controllers/extensions/shell_app => core/exceptions}/__init__.py (100%) create mode 100644 chaostoolkit_nimble/core/exceptions/custom_exceptions.py create mode 100644 chaostoolkit_nimble/resources/exp_templates/process/exp.json rename chaostoolkit_nimble/resources/exp_templates/{jio/shell_app_exp.json => process/exp_dynamic.json} (63%) diff --git a/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py b/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py index 17cf38e..c07134d 100644 --- a/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py +++ b/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py @@ -1,34 +1,54 @@ +import logging + +import allure +import jinja2 from nimble.core import global_constants -# class UserActions(object): -# """Actions exposed to the user for data validation.""" -# -# def __init__(self, config_parser, node_obj=NodeManager.node_obj): -# """ -# -# :type config_parser: :class:`nimble.core.configs.validation_config_parser.ValidationConfigParser` -# :type node_obj: :class:`nimble.core.entity.nodes.Nodes` -# """ -# self._logger = logging.getLogger(__name__) -# self.node_obj = node_obj -# self.config_parser = config_parser -# self.file_server_utils = FileServerUtils() -from nimble.core.utils.dynamic_substitution_utils import DynamicSubstitutionUtils from nimble.core.utils.shell_utils import ShellUtils from nimble.tests.conftest import OPTIONS_DICT +_LOGGER = logging.getLogger(__name__) + +EXPERIMENTS_BASE_PATH = "%s/tmp/experiments/" % global_constants.DEFAULT_LOCAL_ARTIFACTS_PATH + -def run_experiment(experiments_template_path=None, variables_dict=None): - experiments_base_path = 
"%s/tmp/experiments" % global_constants.DEFAULT_LOCAL_ARTIFACTS_PATH - ShellUtils.execute_shell_command(ShellUtils.remove_and_create_directory(experiments_base_path)) - if not experiments_template_path: +def run_experiment(exp_template_file=None, context=None): + if not exp_template_file: experiments_path = OPTIONS_DICT["experimentsPath"] - ShellUtils.execute_shell_command(ShellUtils.copy(experiments_path, experiments_base_path)) + ShellUtils.execute_shell_command(ShellUtils.copy(experiments_path, EXPERIMENTS_BASE_PATH)) else: - ShellUtils.execute_shell_command(ShellUtils.copy(experiments_template_path, experiments_base_path)) - + render_template(exp_template_file, context) experiment_file_response = ShellUtils.execute_shell_command( - ShellUtils.find_files_in_directory(experiments_base_path)) + ShellUtils.find_files_in_directory(EXPERIMENTS_BASE_PATH)) for experiment_file in experiment_file_response.stdout.strip().split("\n"): - DynamicSubstitutionUtils.add(variables_dict) - DynamicSubstitutionUtils.update_file(experiment_file) ShellUtils.execute_shell_command("chaos run %s" % experiment_file) + html_report_path = generate_html() + allure.attach.file(html_report_path, name='Chaos experiment html report', + attachment_type=allure.attachment_type.HTML) + + +def render_template(exp_template_file, context): + ShellUtils.execute_shell_command(ShellUtils.remove_and_create_directory(EXPERIMENTS_BASE_PATH)) + template_base_dir = "chaostoolkit_nimble/resources/exp_templates" + templateLoader = jinja2.FileSystemLoader(searchpath=template_base_dir) + # templateLoader = jinja2.FileSystemLoader(searchpath="chaostoolkit_nimble/resources/exp_templates/process") + templateEnv = jinja2.Environment(loader=templateLoader) + # exp_template_file = "exp.json" + template = templateEnv.get_template(exp_template_file) + _LOGGER.info('Rendering from template: %s' % template.name) + template.stream(context).dump('%s/exp.json' % EXPERIMENTS_BASE_PATH) + + +def generate_html(): + 
journal_json_path = "journal.json" + html_report_path = "%s/chaos_report.html" % global_constants.DEFAULT_LOCAL_ARTIFACTS_PATH + command = "export LC_ALL=en_US.UTF-8 && chaos report --export-format=html5 %s %s" % ( + journal_json_path, html_report_path) + guavus_reposnse = ShellUtils.execute_shell_command(command) + return html_report_path + + +def validate(user_actions, callable_, job_alias, transfer_to_file_server=False, validation_entities=None, + dataset_alias=global_constants.DEFAULT_DATASET_ALIAS, ibs=None, mode=None, output_alias=None, **kwargs): + user_actions.validate(callable_, job_alias, transfer_to_file_server=transfer_to_file_server, + validation_entities=validation_entities, + dataset_alias=dataset_alias, ibs=ibs, mode=mode, output_alias=output_alias, **kwargs) \ No newline at end of file diff --git a/chaostoolkit_nimble/actions/jio/media_plane_actions.py b/chaostoolkit_nimble/actions/jio/media_plane_actions.py index 7a116a1..10da376 100644 --- a/chaostoolkit_nimble/actions/jio/media_plane_actions.py +++ b/chaostoolkit_nimble/actions/jio/media_plane_actions.py @@ -1,12 +1,18 @@ -import re +import collections import logging +import re + import mmh3 from chaostoolkit_nimble.actions.jio.common_actions import CommonActions +from nimble.actions.base.regression.config_actions import ConfigActions +from nimble.core.entity.components import Components +from nimble.core.entity.node_manager import NodeManager +from nimble.core.utils.shell_utils import ShellUtils class MediaPlaneActions(object): - def __init__(self, job_alias): + def __init__(self, job_alias, config_parser, job_user="ambari-qa"): self._logger = logging.getLogger(__name__) self.job_alias = job_alias @@ -14,7 +20,48 @@ def __init__(self, job_alias): self.seed = 42 self.date_format_all = '%Y%m%d%H%M%S' self.date_format_usual = "%Y-%m-%d %H:%M:%S" - self.frequency=900 + self.frequency = 900 + self.config_actions = ConfigActions() + self.job_user = job_user + actual_output_configs = 
config_parser.get_job_actual_output_source_configs(self.job_alias, "output1") + self.database_name = actual_output_configs["db_name"] + self.table_name = actual_output_configs["table_name"] + self.node_alias = NodeManager.node_obj.get_node_aliases_by_component(Components.MANAGEMENT.name)[0] + + def schedule_15_min_job(self): + job_base_directory = "/data/jio_copy/microapp1/" + job_config_file = "%s/conf/MediaPlaneJob.json" % job_base_directory + job_script_file = "%s/scripts/media_plane_microapp1.sh" % job_base_directory + + ######## Update job config file + kwargs = collections.OrderedDict() + kwargs["config_separator"] = "=" + kwargs["location"] = "local" + kwargs["base_path"] = job_config_file + kwargs["file_format"] = "json" + kwargs["components"] = [Components.MANAGEMENT.name] + kwargs["properties"] = {"mediaPlaneRawInput.type": "csv", "mediaPlaneRawInput.header": "true", + "mediaPlaneRawInput.pathPrefix": "/tmp/partition_date=", + "mediaPlaneRawInput.tableName": "%s.%s" % (self.database_name, self.table_name)} + self.config_actions.update_configs(**kwargs) + + ######## Update job script file + NodeManager.node_obj.execute_command_on_node(self.node_alias, + ShellUtils.find_and_replace_whole_line_in_file("basedirectory=", + "basedirectory=/data/jio_copy/microapp1/", + job_script_file)) + NodeManager.node_obj.execute_command_on_node(self.node_alias, + ShellUtils.find_and_replace_whole_line_in_file("lastdayepoch=", + """lastdayepoch=`date -d "2019-07-20 05:30:00" +%s`""", + job_script_file)) + NodeManager.node_obj.execute_command_on_node(self.node_alias, ShellUtils.find_and_replace_in_file( + "--timeIncrementInFilesInMin=15", "--timeIncrementInFilesInMin=15", job_script_file)) + NodeManager.node_obj.execute_command_on_node(self.node_alias, ShellUtils.find_and_replace_in_file( + "--durationOfDataToProcessInMin=15", "--durationOfDataToProcessInMin=15", job_script_file)) + ############# Run job on management node + job_run_command = "export 
SPARK_HOME=/usr/hdp/2.6.5.0-292/spark2 && cd %s && nohup scripts/media_plane_microapp1.sh >> a.out 2>>a.out &" % ( + job_base_directory) + NodeManager.node_obj.execute_remote_command_in_bg(self.node_alias, job_run_command) def none_or_value(self, value): if str(value) == "": @@ -34,7 +81,7 @@ def generate_cell_id(self, mcc, mnc, cell_id): return str(mcc) + str(final_mnc) + '0' * (9 - len(str(cell_id))) + str(cell_id) def generate_hash(self, source_ip, source_port, dest_ip, dest_port): - hash_input_string = "%s|%s|%s|%s"%(source_ip, source_port, dest_ip, dest_port) + hash_input_string = "%s|%s|%s|%s" % (source_ip, source_port, dest_ip, dest_port) hash_value = mmh3.hash(hash_input_string, self.seed) return hash_value @@ -48,23 +95,40 @@ def link_specific_list(self, list): if len(list) == 0: return default_list else: - if self.none_or_value(list[34]).startswith("2405:0203") or self.none_or_value(list[34]).startswith("2405:0204") or self.none_or_value(list[34]).startswith("2405:0205") or self.none_or_value(list[34]).startswith("2409:4000"): - return [self.none_or_value(list[34]), self.none_or_value(list[35]), self.none_or_value(list[19]), self.none_or_value(list[20]), self.none_or_value(list[45]), self.none_or_value(list[43]), self.none_or_value(list[11]), self.none_or_value(list[12]), self.none_or_value(list[22]), self.none_or_value(list[26]), self.none_or_value(list[27]), self.none_or_value(list[32]), self.none_or_value(list[33]), self.none_or_value(list[36])] + if self.none_or_value(list[34]).startswith("2405:0203") or self.none_or_value(list[34]).startswith( + "2405:0204") or self.none_or_value(list[34]).startswith("2405:0205") or self.none_or_value( + list[34]).startswith("2409:4000"): + return [self.none_or_value(list[34]), self.none_or_value(list[35]), self.none_or_value(list[19]), + self.none_or_value(list[20]), self.none_or_value(list[45]), self.none_or_value(list[43]), + self.none_or_value(list[11]), self.none_or_value(list[12]), self.none_or_value(list[22]), 
+ self.none_or_value(list[26]), self.none_or_value(list[27]), self.none_or_value(list[32]), + self.none_or_value(list[33]), self.none_or_value(list[36])] # m_u_source_ip , m_u_source_port , m_u_destination_ip , m_u_destination_port , subscriber_id , subscriber_msisdn , media_active_time_seconds , media_completed_indicator , media_long_call_indicator , media_middle_gap_indicator , media_short_call_indicator , media_single_direction_indicator , media_start_gap_indicator # checked if its uplink record else: - return [self.none_or_value(list[19]), self.none_or_value(list[20]), self.none_or_value(list[34]), self.none_or_value(list[35]), self.none_or_value(list[45]), self.none_or_value(list[43]), self.none_or_value(list[11]), self.none_or_value(list[12]), self.none_or_value(list[22]), self.none_or_value(list[26]), self.none_or_value(list[27]), self.none_or_value(list[32]), self.none_or_value(list[33]), self.none_or_value(list[36])] + return [self.none_or_value(list[19]), self.none_or_value(list[20]), self.none_or_value(list[34]), + self.none_or_value(list[35]), self.none_or_value(list[45]), self.none_or_value(list[43]), + self.none_or_value(list[11]), self.none_or_value(list[12]), self.none_or_value(list[22]), + self.none_or_value(list[26]), self.none_or_value(list[27]), self.none_or_value(list[32]), + self.none_or_value(list[33]), self.none_or_value(list[36])] # m_u_destination_ip , m_u_destination_port , m_u_source_ip , m_u_source_port, subscriber_id , subscriber_msisdn , media_active_time_seconds , media_completed_indicator , media_long_call_indicator , media_middle_gap_indicator , media_short_call_indicator , media_single_direction_indicator , media_start_gap_indicator # checked if its down_link record def validate_media_plane(self, validation_entities): hashed_row_dict = {} - min_time = validation_entities.sqlite_adapter.select("select min(cal_timestamp_time) from %s_input1" % self.job_alias)[-1][0] + min_time = \ + validation_entities.sqlite_adapter.select("select 
min(cal_timestamp_time) from %s_input1" % self.job_alias)[ + -1][ + 0] - max_time = validation_entities.sqlite_adapter.select("select max(cal_timestamp_time) from %s_input1" % self.job_alias)[-1][0] + max_time = \ + validation_entities.sqlite_adapter.select("select max(cal_timestamp_time) from %s_input1" % self.job_alias)[ + -1][ + 0] - time_range_list = self.comman_action.get_time_range_list(min_time, max_time, self.frequency, date_time_format=self.date_format_usual) + time_range_list = self.comman_action.get_time_range_list(min_time, max_time, self.frequency, + date_time_format=self.date_format_usual) media_header = ['m_hash_tuple', 'm_timestamp', 'm_u_source_ip', 'm_u_source_port', 'm_u_destination_ip', 'm_u_destination_port', 'm_u_imsi', 'm_u_msisdn', 'm_u_call_duration', 'm_u_call_completed', 'm_u_end_gap_indicator', 'm_u_long_call_indicator', 'm_u_middle_gap_indicator', @@ -87,29 +151,36 @@ def validate_media_plane(self, validation_entities): final_media_dump[0].extend(time_columns.split(',')) for time_range in time_range_list: where_clause = "cal_timestamp_time >= '%s' and cal_timestamp_time < '%s'" % (time_range[0], time_range[1]) - total_dump = validation_entities.sqlite_adapter.select("select * from %s_input1 where %s" % (self.job_alias, where_clause)) + total_dump = validation_entities.sqlite_adapter.select( + "select * from %s_input1 where %s" % (self.job_alias, where_clause)) temp_list = [] hashed_row_dict = {} for row in total_dump[1:]: temp_dict = {} row = list(row) - if row[19].startswith("2405:0203") or row[19].startswith("2405:0204") or row[19].startswith("2405:0205") or row[19].startswith("2409:4000"): + if row[19].startswith("2405:0203") or row[19].startswith("2405:0204") or row[19].startswith( + "2405:0205") or row[19].startswith("2409:4000"): hash_value = self.generate_hash(row[19], row[20], row[34], row[35]) - elif row[34].startswith("2405:0203") or row[34].startswith("2405:0204") or row[34].startswith("2405:0205") or 
row[34].startswith("2409:4000"): + elif row[34].startswith("2405:0203") or row[34].startswith("2405:0204") or row[34].startswith( + "2405:0205") or row[34].startswith("2409:4000"): hash_value = self.generate_hash(row[34], row[35], row[19], row[20]) else: continue if hash_value in hashed_row_dict.keys(): - if row[19].startswith("2405:0203") or row[19].startswith("2405:0204") or row[19].startswith("2405:0205") or row[19].startswith("2409:4000"): + if row[19].startswith("2405:0203") or row[19].startswith("2405:0204") or row[19].startswith( + "2405:0205") or row[19].startswith("2409:4000"): hashed_row_dict[hash_value]["downlink"] = row - elif row[34].startswith("2405:0203") or row[34].startswith("2405:0204") or row[34].startswith("2405:0205") or row[34].startswith("2409:4000"): + elif row[34].startswith("2405:0203") or row[34].startswith("2405:0204") or row[34].startswith( + "2405:0205") or row[34].startswith("2409:4000"): hashed_row_dict[hash_value]["uplink"] = row else: continue else: - if row[19].startswith("2405:0203") or row[19].startswith("2405:0204") or row[19].startswith("2405:0205") or row[19].startswith("2409:4000"): + if row[19].startswith("2405:0203") or row[19].startswith("2405:0204") or row[19].startswith( + "2405:0205") or row[19].startswith("2409:4000"): temp_dict = {"downlink": row, "uplink": []} - elif row[34].startswith("2405:0203") or row[34].startswith("2405:0204") or row[34].startswith("2405:0205") or row[34].startswith("2409:4000"): + elif row[34].startswith("2405:0203") or row[34].startswith("2405:0204") or row[34].startswith( + "2405:0205") or row[34].startswith("2409:4000"): temp_dict = {"downlink": [], "uplink": row} else: continue @@ -123,8 +194,10 @@ def validate_media_plane(self, validation_entities): media_d_row = self.link_specific_list(media_downlink_row) if len(media_uplink_row) == 0: m_timestamp = (re.sub('[^A-Za-z0-9]+', "", media_downlink_row[1])) - m_binned_timestamp = (re.sub('[^A-Za-z0-9]+', "", 
self.comman_action.computeBinTime(media_downlink_row[1]))) - sql_timestamp = self.comman_action.date_format_changer(m_binned_timestamp, "%Y%m%d%H%M%S","%Y-%m-%d %H:%M:%S") + m_binned_timestamp = ( + re.sub('[^A-Za-z0-9]+', "", self.comman_action.computeBinTime(media_downlink_row[1]))) + sql_timestamp = self.comman_action.date_format_changer(m_binned_timestamp, "%Y%m%d%H%M%S", + "%Y-%m-%d %H:%M:%S") m_ueip = media_downlink_row[19] m_msisdn = media_uplink_row[43] m_imsi = media_uplink_row[45] @@ -133,18 +206,22 @@ def validate_media_plane(self, validation_entities): m_u_wei_jitter = 0 m_u_wei_pakt_los = 0 m_u_wei_rtd = 0 - m_u_degradation_sum = 0 + m_u_degradation_sum = 0 m_u_jitter_sum = 0 m_u_pakt_los_sum = 0 m_u_rtd_sum = 0 m_u_cell_id = -1 - m_u_calculated_item = [str(m_u_wei_jitter), str(m_u_wei_mos), str(m_u_wei_pakt_los), str(m_u_wei_rtd), str(m_u_jitter_sum), str(m_u_rtd_sum), str(m_u_pakt_los_sum), str(m_u_degradation_sum), str(m_u_cell_id)] + m_u_calculated_item = [str(m_u_wei_jitter), str(m_u_wei_mos), str(m_u_wei_pakt_los), + str(m_u_wei_rtd), str(m_u_jitter_sum), str(m_u_rtd_sum), + str(m_u_pakt_los_sum), str(m_u_degradation_sum), str(m_u_cell_id)] m_cell_id = m_u_cell_id else: m_timestamp = (re.sub('[^A-Za-z0-9]+', "", media_uplink_row[1])) - m_binned_timestamp = (re.sub('[^A-Za-z0-9]+', "", self.comman_action.computeBinTime(media_uplink_row[1]))) - sql_timestamp = self.comman_action.date_format_changer(m_binned_timestamp, "%Y%m%d%H%M%S","%Y-%m-%d %H:%M:%S") + m_binned_timestamp = ( + re.sub('[^A-Za-z0-9]+', "", self.comman_action.computeBinTime(media_uplink_row[1]))) + sql_timestamp = self.comman_action.date_format_changer(m_binned_timestamp, "%Y%m%d%H%M%S", + "%Y-%m-%d %H:%M:%S") m_ueip = media_uplink_row[34] m_msisdn = media_uplink_row[43] m_imsi = media_uplink_row[45] @@ -152,18 +229,24 @@ def validate_media_plane(self, validation_entities): m_u_wei_jitter = int(media_uplink_row[37]) m_u_wei_pakt_los = int(media_uplink_row[39]) m_u_wei_rtd = 
int(media_uplink_row[40]) - m_u_degradation_sum = int(media_uplink_row[13]) + int(media_uplink_row[14]) + int(media_uplink_row[15]) + m_u_degradation_sum = int(media_uplink_row[13]) + int(media_uplink_row[14]) + int( + media_uplink_row[15]) m_u_jitter_sum = int(media_uplink_row[23]) + int(media_uplink_row[24]) + int(media_uplink_row[25]) - m_u_pakt_los_sum = str(int(media_uplink_row[28]) + int(media_uplink_row[29]) + int(media_uplink_row[30])) + m_u_pakt_los_sum = str( + int(media_uplink_row[28]) + int(media_uplink_row[29]) + int(media_uplink_row[30])) m_u_rtd_sum = int(media_uplink_row[16]) + int(media_uplink_row[17]) + int(media_uplink_row[18]) m_u_cell_id = self.generate_cell_id(media_uplink_row[6], media_uplink_row[7], media_uplink_row[3]) - m_u_calculated_item = [str(m_u_wei_jitter), str(m_u_wei_mos), str(m_u_wei_pakt_los), str(m_u_wei_rtd), str(m_u_jitter_sum), str(m_u_rtd_sum), str(m_u_pakt_los_sum), str(m_u_degradation_sum), str(m_u_cell_id)] + m_u_calculated_item = [str(m_u_wei_jitter), str(m_u_wei_mos), str(m_u_wei_pakt_los), + str(m_u_wei_rtd), str(m_u_jitter_sum), str(m_u_rtd_sum), + str(m_u_pakt_los_sum), str(m_u_degradation_sum), str(m_u_cell_id)] m_cell_id = m_u_cell_id if len(media_downlink_row) == 0: m_timestamp = (re.sub('[^A-Za-z0-9]+', "", media_uplink_row[1])) - m_binned_timestamp = (re.sub('[^A-Za-z0-9]+', "", self.comman_action.computeBinTime(media_uplink_row[1]))) - sql_timestamp = self.comman_action.date_format_changer(m_binned_timestamp, "%Y%m%d%H%M%S","%Y-%m-%d %H:%M:%S") + m_binned_timestamp = ( + re.sub('[^A-Za-z0-9]+', "", self.comman_action.computeBinTime(media_uplink_row[1]))) + sql_timestamp = self.comman_action.date_format_changer(m_binned_timestamp, "%Y%m%d%H%M%S", + "%Y-%m-%d %H:%M:%S") m_ueip = media_uplink_row[34] m_msisdn = media_downlink_row[43] m_imsi = media_downlink_row[45] @@ -172,18 +255,22 @@ def validate_media_plane(self, validation_entities): m_d_wei_jitter = 0 m_d_wei_pakt_los = 0 m_d_wei_rtd = 0 - 
m_d_degradation_sum = 0 + m_d_degradation_sum = 0 m_d_jitter_sum = 0 m_d_pakt_los_sum = 0 m_d_rtd_sum = 0 m_d_cell_id = -1 - m_d_calculated_item = [str(m_d_wei_jitter), str(m_d_wei_mos), str(m_d_wei_pakt_los), str(m_d_wei_rtd), str(m_d_jitter_sum), str(m_d_rtd_sum), str(m_d_pakt_los_sum), str(m_d_degradation_sum), str(m_d_cell_id)] + m_d_calculated_item = [str(m_d_wei_jitter), str(m_d_wei_mos), str(m_d_wei_pakt_los), + str(m_d_wei_rtd), str(m_d_jitter_sum), str(m_d_rtd_sum), + str(m_d_pakt_los_sum), str(m_d_degradation_sum), str(m_d_cell_id)] m_cell_id = m_d_cell_id else: m_timestamp = (re.sub('[^A-Za-z0-9]+', "", media_downlink_row[1])) - m_binned_timestamp = (re.sub('[^A-Za-z0-9]+', "", self.comman_action.computeBinTime(media_downlink_row[1]))) - sql_timestamp = self.comman_action.date_format_changer(m_binned_timestamp, "%Y%m%d%H%M%S", "%Y-%m-%d %H:%M:%S") + m_binned_timestamp = ( + re.sub('[^A-Za-z0-9]+', "", self.comman_action.computeBinTime(media_downlink_row[1]))) + sql_timestamp = self.comman_action.date_format_changer(m_binned_timestamp, "%Y%m%d%H%M%S", + "%Y-%m-%d %H:%M:%S") m_ueip = media_downlink_row[19] m_msisdn = media_downlink_row[43] m_imsi = media_downlink_row[45] @@ -191,12 +278,19 @@ def validate_media_plane(self, validation_entities): m_d_wei_jitter = int(media_downlink_row[37]) m_d_wei_pakt_los = int(media_downlink_row[39]) m_d_wei_rtd = int(media_downlink_row[40]) - m_d_degradation_sum = int(media_downlink_row[13]) + int(media_downlink_row[14]) + int(media_downlink_row[15]) - m_d_jitter_sum = int(media_downlink_row[23]) + int(media_downlink_row[24]) + int(media_downlink_row[25]) - m_d_pakt_los_sum = int(media_downlink_row[28]) + int(media_downlink_row[29]) + int(media_downlink_row[30]) - m_d_rtd_sum = int(media_downlink_row[16]) + int(media_downlink_row[17]) + int(media_downlink_row[18]) - m_d_cell_id = self.generate_cell_id(media_downlink_row[6], media_downlink_row[7], media_downlink_row[3]) - m_d_calculated_item = [str(m_d_wei_jitter), 
str(m_d_wei_mos), str(m_d_wei_pakt_los), str(m_d_wei_rtd), str(m_d_jitter_sum), str(m_d_rtd_sum), str(m_d_pakt_los_sum), str(m_d_degradation_sum), str(m_d_cell_id)] + m_d_degradation_sum = int(media_downlink_row[13]) + int(media_downlink_row[14]) + int( + media_downlink_row[15]) + m_d_jitter_sum = int(media_downlink_row[23]) + int(media_downlink_row[24]) + int( + media_downlink_row[25]) + m_d_pakt_los_sum = int(media_downlink_row[28]) + int(media_downlink_row[29]) + int( + media_downlink_row[30]) + m_d_rtd_sum = int(media_downlink_row[16]) + int(media_downlink_row[17]) + int( + media_downlink_row[18]) + m_d_cell_id = self.generate_cell_id(media_downlink_row[6], media_downlink_row[7], + media_downlink_row[3]) + m_d_calculated_item = [str(m_d_wei_jitter), str(m_d_wei_mos), str(m_d_wei_pakt_los), + str(m_d_wei_rtd), str(m_d_jitter_sum), str(m_d_rtd_sum), + str(m_d_pakt_los_sum), str(m_d_degradation_sum), str(m_d_cell_id)] m_cell_id = m_d_cell_id m_hash_tuple = hash_value @@ -210,14 +304,20 @@ def validate_media_plane(self, validation_entities): m_jitter_sum = int(m_u_jitter_sum) + int(m_d_jitter_sum) m_pkt_loss_sum = int(m_u_pakt_los_sum) + int(m_d_pakt_los_sum) m_rtd_sum = int(m_u_rtd_sum) + int(m_d_rtd_sum) - m_mos = float(m_wei_mos)/m_degradation_sum/100 - m_jitter = float(m_wei_jitter)/m_jitter_sum - m_packet_loss = float(m_wei_pkt_los)/m_pkt_loss_sum/100 - m_rtd = float(m_wei_rtd)/m_rtd_sum + m_mos = float(m_wei_mos) / m_degradation_sum / 100 + m_jitter = float(m_wei_jitter) / m_jitter_sum + m_packet_loss = float(m_wei_pkt_los) / m_pkt_loss_sum / 100 + m_rtd = float(m_wei_rtd) / m_rtd_sum time_array = self.comman_action.time_column_needful(self.frequency, m_binned_timestamp) - media_output_list = [str(m_hash_tuple), str(m_timestamp)] + media_u_row + m_u_calculated_item + media_d_row + m_d_calculated_item + [str(m_error_code), str(m_ueip), str(m_msisdn), str(m_imsi), str(m_cell_id), str(m_call_id), str(m_wei_mos), str(m_wei_jitter), str(m_wei_pkt_los), 
str(m_wei_rtd), str(m_degradation_sum), str(m_jitter_sum), str(m_pkt_loss_sum), str(m_rtd_sum), str(m_mos), str(m_jitter), str(m_packet_loss), str(m_rtd), str(m_binned_timestamp), str(sql_timestamp)] + time_array + media_output_list = [str(m_hash_tuple), str( + m_timestamp)] + media_u_row + m_u_calculated_item + media_d_row + m_d_calculated_item + [ + str(m_error_code), str(m_ueip), str(m_msisdn), str(m_imsi), str(m_cell_id), + str(m_call_id), str(m_wei_mos), str(m_wei_jitter), str(m_wei_pkt_los), + str(m_wei_rtd), str(m_degradation_sum), str(m_jitter_sum), str(m_pkt_loss_sum), + str(m_rtd_sum), str(m_mos), str(m_jitter), str(m_packet_loss), str(m_rtd), + str(m_binned_timestamp), str(sql_timestamp)] + time_array temp_list.append(media_output_list) final_media_dump.extend(temp_list) - validation_entities.output_obj[self.job_alias]["output1"] = final_media_dump \ No newline at end of file + validation_entities.output_obj[self.job_alias]["output1"] = final_media_dump diff --git a/chaostoolkit_nimble/controllers/extensions/__init__.py b/chaostoolkit_nimble/controllers/chaosk8s/__init__.py similarity index 100% rename from chaostoolkit_nimble/controllers/extensions/__init__.py rename to chaostoolkit_nimble/controllers/chaosk8s/__init__.py diff --git a/chaostoolkit_nimble/controllers/extensions/chaosk8s/control.py b/chaostoolkit_nimble/controllers/chaosk8s/control.py similarity index 100% rename from chaostoolkit_nimble/controllers/extensions/chaosk8s/control.py rename to chaostoolkit_nimble/controllers/chaosk8s/control.py diff --git a/chaostoolkit_nimble/controllers/extensions/shell_app/control.py b/chaostoolkit_nimble/controllers/extensions/shell_app/control.py deleted file mode 100644 index a3c4dc6..0000000 --- a/chaostoolkit_nimble/controllers/extensions/shell_app/control.py +++ /dev/null @@ -1,32 +0,0 @@ -import time -from typing import List - -from chaoslib.types import Configuration, \ - Experiment, Run, Secrets, Activity - - -def after_activity_control(context: 
Activity, state: Run, - configuration: Configuration = None, - secrets: Secrets = None, **kwargs): - """ - after-control of the activity's execution - - Called by the Chaos Toolkit before the activity is applied. The result of - the execution is passed as `state`. See - https://docs.chaostoolkit.org/reference/api/journal/#run for more - information. - """ - print("----------------STATE AFTER ACTIVITY: %s" %state) - -def after_method_control(context: Experiment, state: List[Run], - configuration: Configuration = None, - secrets: Secrets = None, **kwargs): - """ - after-control of the method's execution - - Called by the Chaos Toolkit after the activities of the method have been - applied. The `state` is the list of activity results. See - https://docs.chaostoolkit.org/reference/api/journal/#run for more - information. - """ - print("----------------STATE AFTER METHOD: %s" % state) \ No newline at end of file diff --git a/chaostoolkit_nimble/controllers/jio/__init__.py b/chaostoolkit_nimble/controllers/jio/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/chaostoolkit_nimble/controllers/extensions/chaosk8s/__init__.py b/chaostoolkit_nimble/controllers/process/__init__.py similarity index 100% rename from chaostoolkit_nimble/controllers/extensions/chaosk8s/__init__.py rename to chaostoolkit_nimble/controllers/process/__init__.py diff --git a/chaostoolkit_nimble/controllers/jio/control.py b/chaostoolkit_nimble/controllers/process/control.py similarity index 99% rename from chaostoolkit_nimble/controllers/jio/control.py rename to chaostoolkit_nimble/controllers/process/control.py index bf7ed70..aacaddb 100644 --- a/chaostoolkit_nimble/controllers/jio/control.py +++ b/chaostoolkit_nimble/controllers/process/control.py @@ -31,7 +31,7 @@ def before_method_control(context: Experiment, applied. 
""" logger.debug("----------------CONFIGURATION BEFORE METHOD: %s" % configuration) - sleep(15) + sleep(3) def after_method_control(context: Experiment, diff --git a/chaostoolkit_nimble/controllers/extensions/shell_app/__init__.py b/chaostoolkit_nimble/core/exceptions/__init__.py similarity index 100% rename from chaostoolkit_nimble/controllers/extensions/shell_app/__init__.py rename to chaostoolkit_nimble/core/exceptions/__init__.py diff --git a/chaostoolkit_nimble/core/exceptions/custom_exceptions.py b/chaostoolkit_nimble/core/exceptions/custom_exceptions.py new file mode 100644 index 0000000..1337ef0 --- /dev/null +++ b/chaostoolkit_nimble/core/exceptions/custom_exceptions.py @@ -0,0 +1,7 @@ +class NoDataError(Exception): + """Raised when the no data is found from the given source""" + pass + +class DataRetrievingError(Exception): + """Raised when the data cannot be retrieved from the given source""" + pass \ No newline at end of file diff --git a/chaostoolkit_nimble/core/utils/ha_utils.py b/chaostoolkit_nimble/core/utils/ha_utils.py index a9fb0f0..2618470 100644 --- a/chaostoolkit_nimble/core/utils/ha_utils.py +++ b/chaostoolkit_nimble/core/utils/ha_utils.py @@ -6,16 +6,13 @@ from nimble.core.entity.node_manager import NodeManager from nimble.core.utils.shell_utils import ShellUtils -from chaostoolkit_nimble.core.utils import fabric_utils - - - _LOGGER = logging.getLogger(__name__) - # node_obj = __import__('nimble.core.entity.node_manager').NodeManager.node_obj # import __builtin__ logger.info("Helooooooooo") + + # node_obj = NodeManager.node_obj # @@ -48,13 +45,11 @@ def check_process_running(component, process_name=None): if not process_name: process_name = Components.get_process_name(component) logger.info("Checking if process '%s' is running by fetching its process id." 
% process_name) - logger.debug("NODE_OBJ FROM ha_utils: ----------------: %s" % NodeManager.node_obj.vip) - # logger.debug("b = ----------------: %s" % fabric_utils.b ) + # logger.debug("NODE_OBJ FROM ha_utils: ----------------: %s" % NodeManager.node_obj.vip) return NodeManager.node_obj.execute_command_on_component(component, ShellUtils.fetch_process_id(process_name), consolidated_success_flag=True) - def kill_process(process_name, component, num_of_nodes=None): """Kill the process of any particular component @@ -72,10 +67,8 @@ def kill_process(process_name, component, num_of_nodes=None): response_list = [] for node_alias in node_aliases: logger.debug("Killing process '%s' on node '%s'" % (process_name, node_alias)) - response_list.append( - NodeManager.node_obj.execute_command_on_node(node_alias, command)) - return response_list - + response_list.append(NodeManager.node_obj.execute_command_on_node(node_alias, command)) + return str(response_list) # # def process_ha(component, process_name=None): # # """This function is used to do the process HA of the components at the remote server. 
diff --git a/chaostoolkit_nimble/resources/exp_templates/process/exp.json b/chaostoolkit_nimble/resources/exp_templates/process/exp.json new file mode 100644 index 0000000..92cfa8b --- /dev/null +++ b/chaostoolkit_nimble/resources/exp_templates/process/exp.json @@ -0,0 +1,44 @@ +{ + "version": "1.0.0", + "title": "Experiment with killing a process on system", + "description": "The process should auto-respawn after being killed.", + "tags": ["process"], + "controls": [{ + "name": "process-related-controls", + "provider": { + "type": "python", + "module": "chaostoolkit_nimble.controllers.process.control" + } + }], + "steady-state-hypothesis": { + "title": "Process {{rand_dynamic_process_name}} is up and running", + "probes": [{ + "type": "probe", + "name": "Fetch-process-id", + "tolerance": true, + "provider": { + "module": "chaostoolkit_nimble.core.utils.ha_utils", + "type": "python", + "func": "check_process_running", + "arguments": { + "component": "{{rand_dynamic_component}}", + "process_name": "{{rand_dynamic_process_name}}" + } + } + }] + }, + "method": [{ + "type": "action", + "name": "Kill-process", + "provider": { + "module": "chaostoolkit_nimble.core.utils.ha_utils", + "type": "python", + "func": "kill_process", + "arguments": { + "component": "{{rand_dynamic_component}}", + "process_name": "{{rand_dynamic_process_name}}", + "num_of_nodes": "1" + } + } + }] +} diff --git a/chaostoolkit_nimble/resources/exp_templates/jio/shell_app_exp.json b/chaostoolkit_nimble/resources/exp_templates/process/exp_dynamic.json similarity index 63% rename from chaostoolkit_nimble/resources/exp_templates/jio/shell_app_exp.json rename to chaostoolkit_nimble/resources/exp_templates/process/exp_dynamic.json index ed53b65..e1ce8f8 100644 --- a/chaostoolkit_nimble/resources/exp_templates/jio/shell_app_exp.json +++ b/chaostoolkit_nimble/resources/exp_templates/process/exp_dynamic.json @@ -1,20 +1,20 @@ { "version": "1.0.0", - "title": "Experiment running shell commands on local and 
remote", - "description": "Shell command should get successfully executed on remote using python provider through fabric and on local it should run through process provider", - "tags": ["kubernetes"], + "title": "Experiment with killing a process on system", + "description": "The process should auto-respawn after being killed.", + "tags": ["process"], "controls": [{ - "name": "jio-15min-job-controls", + "name": "process-related-controls", "provider": { "type": "python", - "module": "chaostoolkit_nimble.controllers.jio.control" + "module": "chaostoolkit_nimble.controllers.process.control" } }], "steady-state-hypothesis": { - "title": "Shell application is up and running", + "title": "Process is up and running", "probes": [{ "type": "probe", - "name": "Fetch-shell-app-process-id", + "name": "Fetch-process-id", "tolerance": true, "provider": { "module": "chaostoolkit_nimble.core.utils.ha_utils", @@ -29,7 +29,7 @@ }, "method": [{ "type": "action", - "name": "Kill-shell-app", + "name": "Kill-process", "provider": { "module": "chaostoolkit_nimble.core.utils.ha_utils", "type": "python", diff --git a/chaostoolkit_nimble/resources/validation/sample_validation_config.yml b/chaostoolkit_nimble/resources/validation/sample_validation_config.yml index df9b95e..8c47a9f 100644 --- a/chaostoolkit_nimble/resources/validation/sample_validation_config.yml +++ b/chaostoolkit_nimble/resources/validation/sample_validation_config.yml @@ -166,4 +166,4 @@ jobs: actual: source: hive db_name: network360_volte - table_name: media_plane_table \ No newline at end of file + table_name: media_plane_table_chaos \ No newline at end of file diff --git a/chaostoolkit_nimble/tests/conftest.py b/chaostoolkit_nimble/tests/conftest.py index 02b59fa..1414916 100644 --- a/chaostoolkit_nimble/tests/conftest.py +++ b/chaostoolkit_nimble/tests/conftest.py @@ -32,8 +32,6 @@ def pytest_addoption(parser): help="Relative path (to the project root) of the file containing component attributes configs. E.g. 
python -m pytest --componentAttributesConfig=resources/components/component_attributes_ambari.yml") parser.addoption("--validationConfig", help="Relative path (to the project root) of the file containing validation configs. E.g. python -m pytest --validationConfig=resources/validation/sample_validation_config.yml") - parser.addoption("--chaosExpConfig", - help="Relative path (to the project root) of the file containing chaos experiment configs. E.g. python -m pytest --validationConfig=resources/validation/chaos_exp_config.yml") parser.addoption("--experimentsPath", help="Relative path (to the project root) of the file containing chaos experiment json files. E.g. python -m pytest --validationConfig=resources/validation/chaos_exp_config.yml") diff --git a/chaostoolkit_nimble/tests/sample/test_jio.py b/chaostoolkit_nimble/tests/sample/test_jio.py index d3a81a6..3f5e892 100644 --- a/chaostoolkit_nimble/tests/sample/test_jio.py +++ b/chaostoolkit_nimble/tests/sample/test_jio.py @@ -1,5 +1,3 @@ -import collections - import allure import pytest from chaostoolkit_nimble.actions.base.flows import chaos_user_actions @@ -8,24 +6,21 @@ from nimble.core.entity.components import Components from nimble.core.entity.node_manager import NodeManager from nimble.core.utils.components.hadoop_utils import HadoopCliUtils -from nimble.core.utils.fabric_utils import FabricUtils from nimble.core.utils.shell_utils import ShellUtils class TestJio(): job_alias = "media_plane" - # hdfs_path = "/tmp/aut_squad_dir/csv" job_user = "ambari-qa" - database_name = "network360_volte" - table_name = "media_plane_table" + # hdfs_path = "/tmp/aut_squad_dir/csv" # @pytest.fixture(scope="session") # def sample_actions(self): # return SampleActions(self.job_alias) @pytest.fixture(scope="session") - def media_plane_actions(self): - return MediaPlaneActions(self.job_alias) + def media_plane_actions(self, config_parser): + return MediaPlaneActions(self.job_alias, config_parser, self.job_user) 
@pytest.fixture(scope="session") def hadoop_cli_utils(self): @@ -44,67 +39,23 @@ def hadoop_cli_utils(self): def config_actions(self): return ConfigActions() - @pytest.fixture(scope="session") - def clean_table(self): - command = "hive -e 'drop table if exists %s.%s'" % (self.database_name, self.table_name) - NodeManager.node_obj.execute_command_on_node( - NodeManager.node_obj.get_node_aliases_by_component(Components.MANAGEMENT.name)[0], - ShellUtils.su(self.job_user, command)) + @pytest.fixture(scope="session", autouse=True) + def clean_table(self, media_plane_actions): + command = "hive -e 'drop table if exists %s.%s'" % ( + media_plane_actions.database_name, media_plane_actions.table_name) + assert NodeManager.node_obj.execute_command_on_node(media_plane_actions.node_alias, + ShellUtils.su(self.job_user, command)).status - def test_validation_on_15min_job_ha(self, config_actions, user_actions, media_plane_actions, clean_table): + def test_validation_on_15min_job_ha(self, config_actions, user_actions, media_plane_actions): + node_alias = NodeManager.node_obj.get_node_aliases_by_component(Components.MANAGEMENT.name)[0] with allure.step('Schedule 15 min job'): - job_base_directory = "/data/jio_copy/microapp1/" - job_config_file = "%s/conf/MediaPlaneJob.json" % job_base_directory - job_script_file = "%s/scripts/media_plane_microapp1.sh" % job_base_directory - - ######## Update job config file - kwargs = collections.OrderedDict() - kwargs["config_separator"] = "=" - kwargs["location"] = "local" - kwargs["base_path"] = job_config_file - kwargs["file_format"] = "json" - kwargs["components"] = [Components.MANAGEMENT.name] - kwargs["properties"] = {"mediaPlaneRawInput.type": "csv", "mediaPlaneRawInput.header": "true", - "mediaPlaneRawInput.pathPrefix": "/tmp/partition_date=", - "mediaPlaneRawInput.tableName": "%s.%s" % (self.database_name, self.table_name)} - config_actions.update_configs(**kwargs) - - ######## Update job script file - 
NodeManager.node_obj.execute_command_on_node( - NodeManager.node_obj.get_node_aliases_by_component(Components.MANAGEMENT.name)[0], - ShellUtils.find_and_replace_whole_line_in_file("basedirectory=", - "basedirectory=/data/jio_copy/microapp1/", - job_script_file)) - NodeManager.node_obj.execute_command_on_node( - NodeManager.node_obj.get_node_aliases_by_component(Components.MANAGEMENT.name)[0], - ShellUtils.find_and_replace_whole_line_in_file("lastdayepoch=", - """lastdayepoch=`date -d "2019-07-20 05:30:00" +%s`""", - job_script_file)) - NodeManager.node_obj.execute_command_on_node( - NodeManager.node_obj.get_node_aliases_by_component(Components.MANAGEMENT.name)[0], - ShellUtils.find_and_replace_in_file("--timeIncrementInFilesInMin=15", - "--timeIncrementInFilesInMin=15", - job_script_file)) - NodeManager.node_obj.execute_command_on_node( - NodeManager.node_obj.get_node_aliases_by_component(Components.MANAGEMENT.name)[0], - ShellUtils.find_and_replace_in_file("--durationOfDataToProcessInMin=15", - "--durationOfDataToProcessInMin=15", - job_script_file)) - ####### Run job script on managemnet node - job_run_command = "export SPARK_HOME=/usr/hdp/2.6.5.0-292/spark2 && cd %s && nohup sh scripts/media_plane_microapp1.sh &" % ( - job_base_directory) - # guavus_response = NodeManager.node_obj.execute_command_on_node( - # NodeManager.node_obj.get_node_aliases_by_component(Components.MANAGEMENT.name)[0], - # ShellUtils.su(self.job_user, job_run_command), pty=False) - FabricUtils.run_command_on_remote_in_bg(ShellUtils.su(self.job_user, job_run_command), "192.168.134.170", - "root", "guavus@123") + media_plane_actions.schedule_15_min_job() with allure.step('Perform Job HA via chaostoolkit'): - ####### To be decided where to keep all templates -- fileserver could be an option - template_path = "chaostoolkit_nimble/resources/exp_templates/jio/shell_app_exp.json" - variables_dict = {"rand_dynamic_component": Components.MANAGEMENT.name, - "rand_dynamic_process_name": 
"media_plane_microapp1", - } - chaos_user_actions.run_experiment(template_path, variables_dict) + exp_template_file = "process/exp.json" + context = {"rand_dynamic_component": Components.MANAGEMENT.name, + "rand_dynamic_process_name": "media_plane_microapp1", + } + chaos_user_actions.run_experiment(exp_template_file, context) with allure.step('Validate the data'): user_actions.validate(media_plane_actions.validate_media_plane, self.job_alias) # user_actions.validate(sample_actions.validate_hdfs_to_hdfs_csv, self.job_alias) diff --git a/journal.json b/journal.json index 88366d1..ef9c1fc 100644 --- a/journal.json +++ b/journal.json @@ -4,26 +4,26 @@ "node": "kritika-Saxena.local", "experiment": { "version": "1.0.0", - "title": "Experiment running shell commands on local and remote", - "description": "Shell command should get successfully executed on remote using python provider through fabric and on local it should run through process provider", + "title": "Experiment with killing a process on system", + "description": "The process should auto-respawn after being killed.", "tags": [ - "kubernetes" + "process" ], "controls": [ { - "name": "jio-15min-job-controls", + "name": "process-related-controls", "provider": { "type": "python", - "module": "chaostoolkit_nimble.controllers.jio.control" + "module": "chaostoolkit_nimble.controllers.process.control" } } ], "steady-state-hypothesis": { - "title": "Shell application is up and running", + "title": "Process is up and running", "probes": [ { "type": "probe", - "name": "Fetch-shell-app-process-id", + "name": "Fetch-process-id", "tolerance": true, "provider": { "module": "chaostoolkit_nimble.core.utils.ha_utils", @@ -40,7 +40,7 @@ "method": [ { "type": "action", - "name": "Kill-shell-app", + "name": "Kill-process", "provider": { "module": "chaostoolkit_nimble.core.utils.ha_utils", "type": "python", @@ -55,7 +55,7 @@ ], "dry": false }, - "start": "2019-08-01T13:00:58.814917", + "start": "2019-08-05T13:42:10.450632", 
"status": "completed", "deviated": false, "steady_states": { @@ -65,7 +65,7 @@ { "activity": { "type": "probe", - "name": "Fetch-shell-app-process-id", + "name": "Fetch-process-id", "tolerance": true, "provider": { "module": "chaostoolkit_nimble.core.utils.ha_utils", @@ -79,9 +79,9 @@ }, "output": true, "status": "succeeded", - "start": "2019-08-01T13:00:58.816403", - "end": "2019-08-01T13:00:59.044517", - "duration": 0.228114, + "start": "2019-08-05T13:42:10.452512", + "end": "2019-08-05T13:42:10.742304", + "duration": 0.289792, "tolerance_met": true } ] @@ -92,7 +92,7 @@ { "activity": { "type": "probe", - "name": "Fetch-shell-app-process-id", + "name": "Fetch-process-id", "tolerance": true, "provider": { "module": "chaostoolkit_nimble.core.utils.ha_utils", @@ -106,9 +106,9 @@ }, "output": true, "status": "succeeded", - "start": "2019-08-01T13:03:14.180425", - "end": "2019-08-01T13:03:14.306980", - "duration": 0.126555, + "start": "2019-08-05T13:44:13.856913", + "end": "2019-08-05T13:44:13.991240", + "duration": 0.134327, "tolerance_met": true } ] @@ -118,7 +118,7 @@ { "activity": { "type": "action", - "name": "Kill-shell-app", + "name": "Kill-process", "provider": { "module": "chaostoolkit_nimble.core.utils.ha_utils", "type": "python", @@ -130,5 +130,14 @@ } } }, - "output": [ - \ No newline at end of file + "output": "[{'stdout': 'kill -9 6112 6113', 'stderr': '', 'status_code': 0, 'node': '192.168.134.170', 'linux_user': None, 'kerberos_client_user': None, 'comments': None, 'object_diff_response': {'intersection_count': None, 'intersection': None, 'union_count': None, 'union': None, 'lhs_diff_count': None, 'lhs_diff': None, 'rhs_diff_count': None, 'rhs_diff': None, 'lhs_extra_count': None, 'lhs_extra': None, 'rhs_extra_count': None, 'rhs_extra': None, 'mismatches_count': None, 'mismatches': None, 'column_diff_count': None, 'column_diff': None, 'actual_file': None, 'expected_file': None}, 'healthcheck_response': {'test_case_id': None, 'test_description': None, 
'failure_reason': [], 'resolution': []}, 'container_id': None}]", + "status": "succeeded", + "start": "2019-08-05T13:42:13.751279", + "end": "2019-08-05T13:42:13.853999", + "duration": 0.10272 + } + ], + "rollbacks": [], + "end": "2019-08-05T13:44:13.992637", + "duration": 123.55752611160278 +} \ No newline at end of file From 340486e778f3afaa0d31c81ab4f01fd84333e22b Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Wed, 7 Aug 2019 11:25:09 +0530 Subject: [PATCH 08/40] AUT-544-new: Working code + incremental tests . Exception handling for kill action + correction in check_process return --- .../actions/base/flows/chaos_user_actions.py | 19 ++-- .../actions/jio/media_plane_actions.py | 2 +- .../controllers/base/control.py | 10 --- .../controllers/process/control.py | 1 + .../core/exceptions/custom_exceptions.py | 7 +- chaostoolkit_nimble/core/utils/ha_utils.py | 88 ++----------------- chaostoolkit_nimble/tests/sample/test_jio.py | 52 +++-------- 7 files changed, 31 insertions(+), 148 deletions(-) diff --git a/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py b/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py index c07134d..51790e5 100644 --- a/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py +++ b/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py @@ -4,30 +4,28 @@ import jinja2 from nimble.core import global_constants from nimble.core.utils.shell_utils import ShellUtils -from nimble.tests.conftest import OPTIONS_DICT _LOGGER = logging.getLogger(__name__) EXPERIMENTS_BASE_PATH = "%s/tmp/experiments/" % global_constants.DEFAULT_LOCAL_ARTIFACTS_PATH +ShellUtils.execute_shell_command(ShellUtils.remove_and_create_directory(EXPERIMENTS_BASE_PATH)) -def run_experiment(exp_template_file=None, context=None): - if not exp_template_file: - experiments_path = OPTIONS_DICT["experimentsPath"] - ShellUtils.execute_shell_command(ShellUtils.copy(experiments_path, EXPERIMENTS_BASE_PATH)) +def run_experiment(exp_file=None, 
exp_template_file=None, context=None): + if exp_file: + ShellUtils.execute_shell_command(ShellUtils.copy(exp_file, EXPERIMENTS_BASE_PATH)) else: render_template(exp_template_file, context) experiment_file_response = ShellUtils.execute_shell_command( ShellUtils.find_files_in_directory(EXPERIMENTS_BASE_PATH)) for experiment_file in experiment_file_response.stdout.strip().split("\n"): - ShellUtils.execute_shell_command("chaos run %s" % experiment_file) + a = ShellUtils.execute_shell_command("chaos run %s" % experiment_file) html_report_path = generate_html() allure.attach.file(html_report_path, name='Chaos experiment html report', attachment_type=allure.attachment_type.HTML) def render_template(exp_template_file, context): - ShellUtils.execute_shell_command(ShellUtils.remove_and_create_directory(EXPERIMENTS_BASE_PATH)) template_base_dir = "chaostoolkit_nimble/resources/exp_templates" templateLoader = jinja2.FileSystemLoader(searchpath=template_base_dir) # templateLoader = jinja2.FileSystemLoader(searchpath="chaostoolkit_nimble/resources/exp_templates/process") @@ -45,10 +43,3 @@ def generate_html(): journal_json_path, html_report_path) guavus_reposnse = ShellUtils.execute_shell_command(command) return html_report_path - - -def validate(user_actions, callable_, job_alias, transfer_to_file_server=False, validation_entities=None, - dataset_alias=global_constants.DEFAULT_DATASET_ALIAS, ibs=None, mode=None, output_alias=None, **kwargs): - user_actions.validate(callable_, job_alias, transfer_to_file_server=transfer_to_file_server, - validation_entities=validation_entities, - dataset_alias=dataset_alias, ibs=ibs, mode=mode, output_alias=output_alias, **kwargs) \ No newline at end of file diff --git a/chaostoolkit_nimble/actions/jio/media_plane_actions.py b/chaostoolkit_nimble/actions/jio/media_plane_actions.py index 10da376..018c8a6 100644 --- a/chaostoolkit_nimble/actions/jio/media_plane_actions.py +++ b/chaostoolkit_nimble/actions/jio/media_plane_actions.py @@ -42,7 +42,7 
@@ def schedule_15_min_job(self): kwargs["components"] = [Components.MANAGEMENT.name] kwargs["properties"] = {"mediaPlaneRawInput.type": "csv", "mediaPlaneRawInput.header": "true", "mediaPlaneRawInput.pathPrefix": "/tmp/partition_date=", - "mediaPlaneRawInput.tableName": "%s.%s" % (self.database_name, self.table_name)} + "mediaPlaneProcessedOutput.tableName": "%s.%s" % (self.database_name, self.table_name)} self.config_actions.update_configs(**kwargs) ######## Update job script file diff --git a/chaostoolkit_nimble/controllers/base/control.py b/chaostoolkit_nimble/controllers/base/control.py index 8ae43c5..8aa3a8e 100644 --- a/chaostoolkit_nimble/controllers/base/control.py +++ b/chaostoolkit_nimble/controllers/base/control.py @@ -19,15 +19,6 @@ def configure_control(configuration: Configuration = None, The `settings` are only passed when the control is declared in the settings file of the Chaos Toolkit. """ - # testbed_file = "/Users/kritika.saxena/chaos_folder_3/chaos_eng_automation/chaostoolkit_nimble/resources/testbeds/open_nebula_135_35.yml" - # component_arttributes_file = "/Users/kritika.saxena/chaos_folder_3/chaos_eng_automation/chaostoolkit_nimble/resources/components/component_attributes_kerberos.yml" - # global OPTIONS_DICT - # testbed_file = OPTIONS_DICT["--testbed"] - # component_arttributes_file = OPTIONS_DICT["--componentAttributesConfig"] - # testbed_file = os.environ['TESTBED_FILE'] - # component_arttributes_file = os.environ['COMPONENT_ATTRIBUTES_FILE'] - - setup_files_base_path = "%s/setup" % global_constants.DEFAULT_LOCAL_TMP_PATH testbed_file = ShellUtils.execute_shell_command( ShellUtils.find_files_in_directory(setup_files_base_path, file_name_regex="open_nebula_*")).stdout @@ -35,4 +26,3 @@ def configure_control(configuration: Configuration = None, ShellUtils.find_files_in_directory(setup_files_base_path, file_name_regex="component_*")).stdout NodeManager.initialize(testbed_file, component_attributes_file) logger.debug("NODE_OBJ FROM BASE 
CONTROLLER----------------: %s" % NodeManager.node_obj.vip) - diff --git a/chaostoolkit_nimble/controllers/process/control.py b/chaostoolkit_nimble/controllers/process/control.py index aacaddb..b5d4754 100644 --- a/chaostoolkit_nimble/controllers/process/control.py +++ b/chaostoolkit_nimble/controllers/process/control.py @@ -7,6 +7,7 @@ control.configure_control() + def after_activity_control(context: Activity, state: Run, configuration: Configuration = None, secrets: Secrets = None, **kwargs): diff --git a/chaostoolkit_nimble/core/exceptions/custom_exceptions.py b/chaostoolkit_nimble/core/exceptions/custom_exceptions.py index 1337ef0..2fca835 100644 --- a/chaostoolkit_nimble/core/exceptions/custom_exceptions.py +++ b/chaostoolkit_nimble/core/exceptions/custom_exceptions.py @@ -1,7 +1,2 @@ -class NoDataError(Exception): - """Raised when the no data is found from the given source""" +class ChaosActionFailedError(Exception): pass - -class DataRetrievingError(Exception): - """Raised when the data cannot be retrieved from the given source""" - pass \ No newline at end of file diff --git a/chaostoolkit_nimble/core/utils/ha_utils.py b/chaostoolkit_nimble/core/utils/ha_utils.py index 2618470..69930c9 100644 --- a/chaostoolkit_nimble/core/utils/ha_utils.py +++ b/chaostoolkit_nimble/core/utils/ha_utils.py @@ -1,6 +1,7 @@ import logging import random +from chaostoolkit_nimble.core.exceptions.custom_exceptions import ChaosActionFailedError from logzero import logger from nimble.core.entity.components import Components from nimble.core.entity.node_manager import NodeManager @@ -8,46 +9,15 @@ _LOGGER = logging.getLogger(__name__) -# node_obj = __import__('nimble.core.entity.node_manager').NodeManager.node_obj -# import __builtin__ -logger.info("Helooooooooo") - - -# node_obj = NodeManager.node_obj - -# -# def _check_process(result): -# process_id = result[0] -# process_id_after_kill_process = result[1] -# return process_id == process_id_after_kill_process or 
process_id_after_kill_process == '' - - -# def fetch_process_id(component, process_name=None): -# """Fetch the process id for any particular component. -# -# :param component: Name of the component. -# :type component: str -# :return: List of :class:`nimble.core.entity.guavus_response.GuavusResponse` objects. -# :rtype: list -# """ -# if process_name: -# command = ShellUtils.fetch_process_id(process_name) -# else: -# process_name = Components.get_process_name(component) -# command = ShellUtils.fetch_process_id(process_name) -# _LOGGER.info("Fetching process id for process '%s' from component: %s" % (process_name, component)) -# response_list = NodeManager.node_obj.execute_command_on_component(component, command, -# consolidated_success_flag=False) -# return response_list - def check_process_running(component, process_name=None): if not process_name: process_name = Components.get_process_name(component) logger.info("Checking if process '%s' is running by fetching its process id." % process_name) - # logger.debug("NODE_OBJ FROM ha_utils: ----------------: %s" % NodeManager.node_obj.vip) - return NodeManager.node_obj.execute_command_on_component(component, ShellUtils.fetch_process_id(process_name), - consolidated_success_flag=True) + response_list = NodeManager.node_obj.execute_command_on_component(component, + ShellUtils.fetch_process_id(process_name), + consolidated_success_flag=False) + return all([response.stdout != "" for response in response_list]) def kill_process(process_name, component, num_of_nodes=None): @@ -67,48 +37,8 @@ def kill_process(process_name, component, num_of_nodes=None): response_list = [] for node_alias in node_aliases: logger.debug("Killing process '%s' on node '%s'" % (process_name, node_alias)) - response_list.append(NodeManager.node_obj.execute_command_on_node(node_alias, command)) + response = NodeManager.node_obj.execute_command_on_node(node_alias, command) + if "kill -9 " not in response.stdout: + raise ChaosActionFailedError("Could not 
kill process '%s' on node '%s'" % (process_name, node_alias)) + response_list.append(response) return str(response_list) - -# # def process_ha(component, process_name=None): -# # """This function is used to do the process HA of the components at the remote server. -# # -# # :return: Return object of :class:`nimble.core.entity.guavus_response.GuavusResponse`. -# # :rtype: :class:`nimble.core.entity.guavus_response.GuavusResponse` -# # """ -# # failure_reason = "" -# # kill_process(component, process_name) -# # -# # try: -# # check_process(process_id, component) -# # status_code = 0 -# # except RetryError: -# # status_code = global_constants.DEFAULT_ERROR_CODE -# # guavus_response_after_kill = fetch_process_id(component) -# # process_id_after_kill = guavus_response_after_kill[0].stdout -# # if process_id == process_id_after_kill: -# # failure_reason = "Process is not killed for component %s" % component -# # elif process_id_after_kill == "": -# # failure_reason = "Process is not UP and running after killed for component: %s" % component -# # -# # guavus_response[0].status_code = status_code -# # guavus_response[0].healthcheck_response.failure_reason.append(failure_reason) -# # -# # return guavus_response -# -# -# @retry(wait_fixed=3000, stop_max_delay=300000, retry_on_result=_check_process) -# def check_process(process_id, component): -# """ This function is used to check the process is up or not. -# -# :param process_id: It is the process id before killing the process. -# :type process_id: int -# :param component: Name of the component. -# :type component: str -# :return: Return the process id of the process before kill and after kill. 
-# :rtype: tuple -# """ -# _LOGGER.info("Running process check for process '%s' on component: %s" % (process_name, component)) -# guavus_response_after_kill = fetch_process_id(component) -# process_id_after_kill_process = guavus_response_after_kill[0].stdout -# return process_id, process_id_after_kill_process diff --git a/chaostoolkit_nimble/tests/sample/test_jio.py b/chaostoolkit_nimble/tests/sample/test_jio.py index 3f5e892..697dbfe 100644 --- a/chaostoolkit_nimble/tests/sample/test_jio.py +++ b/chaostoolkit_nimble/tests/sample/test_jio.py @@ -1,44 +1,20 @@ -import allure import pytest from chaostoolkit_nimble.actions.base.flows import chaos_user_actions from chaostoolkit_nimble.actions.jio.media_plane_actions import MediaPlaneActions -from nimble.actions.base.regression.config_actions import ConfigActions from nimble.core.entity.components import Components from nimble.core.entity.node_manager import NodeManager -from nimble.core.utils.components.hadoop_utils import HadoopCliUtils from nimble.core.utils.shell_utils import ShellUtils +@pytest.mark.incremental class TestJio(): job_alias = "media_plane" job_user = "ambari-qa" - # hdfs_path = "/tmp/aut_squad_dir/csv" - - # @pytest.fixture(scope="session") - # def sample_actions(self): - # return SampleActions(self.job_alias) @pytest.fixture(scope="session") def media_plane_actions(self, config_parser): return MediaPlaneActions(self.job_alias, config_parser, self.job_user) - @pytest.fixture(scope="session") - def hadoop_cli_utils(self): - return HadoopCliUtils() - - # @pytest.fixture(scope="session") - # def send_data_to_hdfs(self, user_actions, hadoop_cli_utils): - # # hadoop_cli_utils.remove(self.hdfs_path, recursive=True) - # # user_actions.send_data_to_hdfs( - # # "modules/platform/validation/aut_squad_test_data/input_output_sources/hdfs/csv/", self.hdfs_path) - # # yield - # # hadoop_cli_utils.remove(self.hdfs_path, recursive=True) - # pass - - @pytest.fixture(scope="session") - def config_actions(self): - 
return ConfigActions() - @pytest.fixture(scope="session", autouse=True) def clean_table(self, media_plane_actions): command = "hive -e 'drop table if exists %s.%s'" % ( @@ -46,16 +22,16 @@ def clean_table(self, media_plane_actions): assert NodeManager.node_obj.execute_command_on_node(media_plane_actions.node_alias, ShellUtils.su(self.job_user, command)).status - def test_validation_on_15min_job_ha(self, config_actions, user_actions, media_plane_actions): - node_alias = NodeManager.node_obj.get_node_aliases_by_component(Components.MANAGEMENT.name)[0] - with allure.step('Schedule 15 min job'): - media_plane_actions.schedule_15_min_job() - with allure.step('Perform Job HA via chaostoolkit'): - exp_template_file = "process/exp.json" - context = {"rand_dynamic_component": Components.MANAGEMENT.name, - "rand_dynamic_process_name": "media_plane_microapp1", - } - chaos_user_actions.run_experiment(exp_template_file, context) - with allure.step('Validate the data'): - user_actions.validate(media_plane_actions.validate_media_plane, self.job_alias) - # user_actions.validate(sample_actions.validate_hdfs_to_hdfs_csv, self.job_alias) + def test_schedule_15min_job(self, media_plane_actions): + media_plane_actions.schedule_15_min_job() + + def test_perform_15min_job_ha(self): + exp_template_file = "process/exp.json" + context = {"rand_dynamic_component": Components.MANAGEMENT.name, + "rand_dynamic_process_name": "media_plane_microapp1", + } + chaos_user_actions.run_experiment(exp_template_file=exp_template_file, context=context) + # chaos_user_actions.run_experiment(exp_file=OPTIONS_DICT["experimentsPath"]) + + def test_validation_on_15min_job_ha(self, user_actions, media_plane_actions): + user_actions.validate(media_plane_actions.validate_media_plane, self.job_alias) From 6c1b7358cb0561aceaecf66db594c59b2bd394eb Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Wed, 7 Aug 2019 12:13:10 +0530 Subject: [PATCH 09/40] AUT-544-new: Added assert on job schedule action. 
--- .../actions/jio/media_plane_actions.py | 6 ++++-- .../resources/validation/sample_validation_config.yml | 2 +- chaostoolkit_nimble/tests/sample/test_jio.py | 11 ++++++----- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/chaostoolkit_nimble/actions/jio/media_plane_actions.py b/chaostoolkit_nimble/actions/jio/media_plane_actions.py index 018c8a6..3fd1e5a 100644 --- a/chaostoolkit_nimble/actions/jio/media_plane_actions.py +++ b/chaostoolkit_nimble/actions/jio/media_plane_actions.py @@ -12,7 +12,7 @@ class MediaPlaneActions(object): - def __init__(self, job_alias, config_parser, job_user="ambari-qa"): + def __init__(self, job_alias, config_parser, job_user="ambari-qa", component=Components.MANAGEMENT.name): self._logger = logging.getLogger(__name__) self.job_alias = job_alias @@ -26,7 +26,7 @@ def __init__(self, job_alias, config_parser, job_user="ambari-qa"): actual_output_configs = config_parser.get_job_actual_output_source_configs(self.job_alias, "output1") self.database_name = actual_output_configs["db_name"] self.table_name = actual_output_configs["table_name"] - self.node_alias = NodeManager.node_obj.get_node_aliases_by_component(Components.MANAGEMENT.name)[0] + self.node_alias = NodeManager.node_obj.get_node_aliases_by_component(component)[0] def schedule_15_min_job(self): job_base_directory = "/data/jio_copy/microapp1/" @@ -62,6 +62,8 @@ def schedule_15_min_job(self): job_run_command = "export SPARK_HOME=/usr/hdp/2.6.5.0-292/spark2 && cd %s && nohup scripts/media_plane_microapp1.sh >> a.out 2>>a.out &" % ( job_base_directory) NodeManager.node_obj.execute_remote_command_in_bg(self.node_alias, job_run_command) + return NodeManager.node_obj.execute_command_on_node(self.node_alias, + ShellUtils.fetch_process_id(self.job_alias)).stdout != "" def none_or_value(self, value): if str(value) == "": diff --git a/chaostoolkit_nimble/resources/validation/sample_validation_config.yml b/chaostoolkit_nimble/resources/validation/sample_validation_config.yml 
index 8c47a9f..0133dc2 100644 --- a/chaostoolkit_nimble/resources/validation/sample_validation_config.yml +++ b/chaostoolkit_nimble/resources/validation/sample_validation_config.yml @@ -153,7 +153,7 @@ jobs: source: hbase table_name: aut_squad_hbase_table scan_filter: FirstKeyOnlyFilter() - media_plane: + media_plane_microapp1: input: input1: source: hdfs diff --git a/chaostoolkit_nimble/tests/sample/test_jio.py b/chaostoolkit_nimble/tests/sample/test_jio.py index 697dbfe..4efd731 100644 --- a/chaostoolkit_nimble/tests/sample/test_jio.py +++ b/chaostoolkit_nimble/tests/sample/test_jio.py @@ -8,12 +8,13 @@ @pytest.mark.incremental class TestJio(): - job_alias = "media_plane" + job_alias = "media_plane_microapp1" job_user = "ambari-qa" + job_running_component = Components.MANAGEMENT.name @pytest.fixture(scope="session") def media_plane_actions(self, config_parser): - return MediaPlaneActions(self.job_alias, config_parser, self.job_user) + return MediaPlaneActions(self.job_alias, config_parser, job_user=self.job_user) @pytest.fixture(scope="session", autouse=True) def clean_table(self, media_plane_actions): @@ -23,12 +24,12 @@ def clean_table(self, media_plane_actions): ShellUtils.su(self.job_user, command)).status def test_schedule_15min_job(self, media_plane_actions): - media_plane_actions.schedule_15_min_job() + assert media_plane_actions.schedule_15_min_job() def test_perform_15min_job_ha(self): exp_template_file = "process/exp.json" - context = {"rand_dynamic_component": Components.MANAGEMENT.name, - "rand_dynamic_process_name": "media_plane_microapp1", + context = {"rand_dynamic_component": self.job_running_component, + "rand_dynamic_process_name": self.job_alias, } chaos_user_actions.run_experiment(exp_template_file=exp_template_file, context=context) # chaos_user_actions.run_experiment(exp_file=OPTIONS_DICT["experimentsPath"]) From 5989e99753e55e0404bb8ca3591588c7b54663f5 Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Wed, 7 Aug 2019 16:32:23 +0530 
Subject: [PATCH 10/40] AUT-544-new: assert on chaos experiment. --- .../actions/base/flows/chaos_user_actions.py | 10 +++++---- .../controllers/process/control.py | 22 +++++++++++++++++++ chaostoolkit_nimble/tests/sample/test_jio.py | 3 ++- 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py b/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py index 51790e5..0d137ec 100644 --- a/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py +++ b/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py @@ -1,4 +1,5 @@ import logging +import re import allure import jinja2 @@ -12,6 +13,7 @@ def run_experiment(exp_file=None, exp_template_file=None, context=None): + status = None if exp_file: ShellUtils.execute_shell_command(ShellUtils.copy(exp_file, EXPERIMENTS_BASE_PATH)) else: @@ -19,18 +21,18 @@ def run_experiment(exp_file=None, exp_template_file=None, context=None): experiment_file_response = ShellUtils.execute_shell_command( ShellUtils.find_files_in_directory(EXPERIMENTS_BASE_PATH)) for experiment_file in experiment_file_response.stdout.strip().split("\n"): - a = ShellUtils.execute_shell_command("chaos run %s" % experiment_file) + response = ShellUtils.execute_shell_command("chaos run %s" % experiment_file) + status = re.search(r'.*Experiment\sended\swith\sstatus:\s(.*)', response.stderr).group(1) html_report_path = generate_html() allure.attach.file(html_report_path, name='Chaos experiment html report', attachment_type=allure.attachment_type.HTML) + assert status == "completed" def render_template(exp_template_file, context): template_base_dir = "chaostoolkit_nimble/resources/exp_templates" templateLoader = jinja2.FileSystemLoader(searchpath=template_base_dir) - # templateLoader = jinja2.FileSystemLoader(searchpath="chaostoolkit_nimble/resources/exp_templates/process") templateEnv = jinja2.Environment(loader=templateLoader) - # exp_template_file = "exp.json" template = 
templateEnv.get_template(exp_template_file) _LOGGER.info('Rendering from template: %s' % template.name) template.stream(context).dump('%s/exp.json' % EXPERIMENTS_BASE_PATH) @@ -41,5 +43,5 @@ def generate_html(): html_report_path = "%s/chaos_report.html" % global_constants.DEFAULT_LOCAL_ARTIFACTS_PATH command = "export LC_ALL=en_US.UTF-8 && chaos report --export-format=html5 %s %s" % ( journal_json_path, html_report_path) - guavus_reposnse = ShellUtils.execute_shell_command(command) + ShellUtils.execute_shell_command(command) return html_report_path diff --git a/chaostoolkit_nimble/controllers/process/control.py b/chaostoolkit_nimble/controllers/process/control.py index b5d4754..ef23515 100644 --- a/chaostoolkit_nimble/controllers/process/control.py +++ b/chaostoolkit_nimble/controllers/process/control.py @@ -8,6 +8,9 @@ control.configure_control() +# EXPERIMENT_STATUS = None + + def after_activity_control(context: Activity, state: Run, configuration: Configuration = None, secrets: Secrets = None, **kwargs): @@ -46,3 +49,22 @@ def after_method_control(context: Experiment, """ logger.debug("----------------CONFIGURATION AFTER METHOD: %s" % configuration) sleep(120) + +# def after_experiment_control(context: Experiment, state: Journal, +# configuration: Configuration = None, +# secrets: Secrets = None, **kwargs): +# """ +# after-control of the experiment's execution +# +# Called by the Chaos Toolkit after the experiment's completed. It passes the +# journal of the execution. At that stage, the after control has no influence +# over the execution however. Please see +# https://docs.chaostoolkit.org/reference/api/journal/#journal-elements +# for more information about the journal. 
+# """ +# logger.debug("Experiment State----------: %s" % state) +# logger.debug("Experiment context--------------------: %s" % context) +# global EXPERIMENT_STATUS +# EXPERIMENT_STATUS = state["status"] +# logger.debug("EXPERIMENT_STATUS %s" % EXPERIMENT_STATUS) +# assert state["status"] == "completed" diff --git a/chaostoolkit_nimble/tests/sample/test_jio.py b/chaostoolkit_nimble/tests/sample/test_jio.py index 4efd731..5df4e91 100644 --- a/chaostoolkit_nimble/tests/sample/test_jio.py +++ b/chaostoolkit_nimble/tests/sample/test_jio.py @@ -29,7 +29,8 @@ def test_schedule_15min_job(self, media_plane_actions): def test_perform_15min_job_ha(self): exp_template_file = "process/exp.json" context = {"rand_dynamic_component": self.job_running_component, - "rand_dynamic_process_name": self.job_alias, + # "rand_dynamic_process_name": self.job_alias, + "rand_dynamic_process_name": "sleep 1h", } chaos_user_actions.run_experiment(exp_template_file=exp_template_file, context=context) # chaos_user_actions.run_experiment(exp_file=OPTIONS_DICT["experimentsPath"]) From f3da0a0d6a0054e308ab800868c4a06dd48f470d Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Tue, 13 Aug 2019 18:02:03 +0530 Subject: [PATCH 11/40] Spark-executor kill use case for jio job. 
(Chaos run breaks at validating experiment) --- .../actions/base/flows/chaos_user_actions.py | 2 + .../actions/jio/media_plane_actions.py | 14 +- .../controllers/process/control.py | 22 --- chaostoolkit_nimble/core/utils/ha_utils.py | 2 + .../core/utils/spark_apps_ha_utils.py | 111 +++++++++++ .../core/utils/yarn_apps_ha_utils.py | 11 ++ .../resources/exp_templates/process/exp.json | 80 ++++---- .../resources/exp_templates/spark/__init__.py | 0 .../spark/executor_kill_exp.json | 42 +++++ .../validation/sample_validation_config.yml | 2 +- chaostoolkit_nimble/tests/sample/test_jio.py | 6 +- .../tests/sample/test_jio_spark_job.py | 177 ++++++++++++++++++ requirements.txt | 1 + 13 files changed, 402 insertions(+), 68 deletions(-) create mode 100644 chaostoolkit_nimble/core/utils/spark_apps_ha_utils.py create mode 100644 chaostoolkit_nimble/core/utils/yarn_apps_ha_utils.py create mode 100644 chaostoolkit_nimble/resources/exp_templates/spark/__init__.py create mode 100644 chaostoolkit_nimble/resources/exp_templates/spark/executor_kill_exp.json create mode 100644 chaostoolkit_nimble/tests/sample/test_jio_spark_job.py diff --git a/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py b/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py index 0d137ec..894044f 100644 --- a/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py +++ b/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py @@ -3,6 +3,7 @@ import allure import jinja2 + from nimble.core import global_constants from nimble.core.utils.shell_utils import ShellUtils @@ -21,6 +22,7 @@ def run_experiment(exp_file=None, exp_template_file=None, context=None): experiment_file_response = ShellUtils.execute_shell_command( ShellUtils.find_files_in_directory(EXPERIMENTS_BASE_PATH)) for experiment_file in experiment_file_response.stdout.strip().split("\n"): + experiment_file = 
"/Users/kritika.saxena/chaos_folder_3/chaos_eng_automation/chaostoolkit_nimble/resources/exp_templates/spark/executor_kill_exp_direct.json" response = ShellUtils.execute_shell_command("chaos run %s" % experiment_file) status = re.search(r'.*Experiment\sended\swith\sstatus:\s(.*)', response.stderr).group(1) html_report_path = generate_html() diff --git a/chaostoolkit_nimble/actions/jio/media_plane_actions.py b/chaostoolkit_nimble/actions/jio/media_plane_actions.py index 3fd1e5a..4efcda7 100644 --- a/chaostoolkit_nimble/actions/jio/media_plane_actions.py +++ b/chaostoolkit_nimble/actions/jio/media_plane_actions.py @@ -27,11 +27,12 @@ def __init__(self, job_alias, config_parser, job_user="ambari-qa", component=Com self.database_name = actual_output_configs["db_name"] self.table_name = actual_output_configs["table_name"] self.node_alias = NodeManager.node_obj.get_node_aliases_by_component(component)[0] + self.job_base_directory = "/data/jio_copy/microapp1/" + self.job_stdout_file = "a.out" def schedule_15_min_job(self): - job_base_directory = "/data/jio_copy/microapp1/" - job_config_file = "%s/conf/MediaPlaneJob.json" % job_base_directory - job_script_file = "%s/scripts/media_plane_microapp1.sh" % job_base_directory + job_config_file = "%s/conf/MediaPlaneJob.json" % self.job_base_directory + job_script_file = "%s/scripts/media_plane_microapp1.sh" % self.job_base_directory ######## Update job config file kwargs = collections.OrderedDict() @@ -59,9 +60,10 @@ def schedule_15_min_job(self): NodeManager.node_obj.execute_command_on_node(self.node_alias, ShellUtils.find_and_replace_in_file( "--durationOfDataToProcessInMin=15", "--durationOfDataToProcessInMin=15", job_script_file)) ############# Run job on management node - job_run_command = "export SPARK_HOME=/usr/hdp/2.6.5.0-292/spark2 && cd %s && nohup scripts/media_plane_microapp1.sh >> a.out 2>>a.out &" % ( - job_base_directory) - NodeManager.node_obj.execute_remote_command_in_bg(self.node_alias, job_run_command) + 
job_run_command = "export SPARK_HOME=/usr/hdp/2.6.5.0-292/spark2 && cd %s && nohup scripts/media_plane_microapp1.sh >> %s 2>>%s &" % ( + self.job_base_directory, self.job_stdout_file, self.job_stdout_file) + NodeManager.node_obj.execute_remote_command_in_bg(self.node_alias, + ShellUtils.su(self.job_user, job_run_command)) return NodeManager.node_obj.execute_command_on_node(self.node_alias, ShellUtils.fetch_process_id(self.job_alias)).stdout != "" diff --git a/chaostoolkit_nimble/controllers/process/control.py b/chaostoolkit_nimble/controllers/process/control.py index ef23515..b5d4754 100644 --- a/chaostoolkit_nimble/controllers/process/control.py +++ b/chaostoolkit_nimble/controllers/process/control.py @@ -8,9 +8,6 @@ control.configure_control() -# EXPERIMENT_STATUS = None - - def after_activity_control(context: Activity, state: Run, configuration: Configuration = None, secrets: Secrets = None, **kwargs): @@ -49,22 +46,3 @@ def after_method_control(context: Experiment, """ logger.debug("----------------CONFIGURATION AFTER METHOD: %s" % configuration) sleep(120) - -# def after_experiment_control(context: Experiment, state: Journal, -# configuration: Configuration = None, -# secrets: Secrets = None, **kwargs): -# """ -# after-control of the experiment's execution -# -# Called by the Chaos Toolkit after the experiment's completed. It passes the -# journal of the execution. At that stage, the after control has no influence -# over the execution however. Please see -# https://docs.chaostoolkit.org/reference/api/journal/#journal-elements -# for more information about the journal. 
-# """ -# logger.debug("Experiment State----------: %s" % state) -# logger.debug("Experiment context--------------------: %s" % context) -# global EXPERIMENT_STATUS -# EXPERIMENT_STATUS = state["status"] -# logger.debug("EXPERIMENT_STATUS %s" % EXPERIMENT_STATUS) -# assert state["status"] == "completed" diff --git a/chaostoolkit_nimble/core/utils/ha_utils.py b/chaostoolkit_nimble/core/utils/ha_utils.py index 69930c9..18a94b0 100644 --- a/chaostoolkit_nimble/core/utils/ha_utils.py +++ b/chaostoolkit_nimble/core/utils/ha_utils.py @@ -11,6 +11,8 @@ def check_process_running(component, process_name=None): + ############# To do + ### make this process in shell utils and here call this method only if not process_name: process_name = Components.get_process_name(component) logger.info("Checking if process '%s' is running by fetching its process id." % process_name) diff --git a/chaostoolkit_nimble/core/utils/spark_apps_ha_utils.py b/chaostoolkit_nimble/core/utils/spark_apps_ha_utils.py new file mode 100644 index 0000000..191a133 --- /dev/null +++ b/chaostoolkit_nimble/core/utils/spark_apps_ha_utils.py @@ -0,0 +1,111 @@ +import random + +from logzero import logger + +from chaostoolkit_nimble.core.exceptions.custom_exceptions import ChaosActionFailedError +from nimble.core.adapters.hadoop.base_hadoop_adapter import ApplicationState +from nimble.core.entity.node_manager import NodeManager +from nimble.core.utils.shell_utils import ShellUtils + +# spark_client_utils = SparkRestClientUtils() + + +def is_job_running_on_spark(job_name): + from nimble.core.utils.components.spark_utils import SparkRestClientUtils + spark_client_utils = SparkRestClientUtils() + logger.debug("Checking if job '%s' on spark" % job_name) + spark_client_utils.is_job_running(job_name) + + +def kill_active_executors(job_name, num_of_exec=1): + from nimble.core.utils.components.hadoop_utils import HadoopRestClientUtils + from nimble.core.utils.components.spark_utils import SparkRestClientUtils + 
hadoop_rest_client_utils = HadoopRestClientUtils() + spark_client_utils = SparkRestClientUtils() + application_id = hadoop_rest_client_utils.get_yarn_most_recent_application_id_by_job_name(job_name, + state=ApplicationState.RUNNING.value) + executors = spark_client_utils.get_application_active_executors(application_id) + for i in range(len(executors)): + if executors[i]["id"] == "driver": + executors.pop(i) + break + executors = random.sample(executors, int(num_of_exec)) + response_list = [] + for executor in executors: + executor_id = executor["id"] + node_hostname_domain = executor["hostPort"].split(":")[0] + logger.debug("Killing executor id %s on node %s" % (executor_id, node_hostname_domain)) + response = NodeManager.node_obj.execute_command_on_hostname_domain(node_hostname_domain, + ShellUtils.kill_process_by_name("spark", + pipe_command="grep -i '--executor-id %s'" % executor_id)) + if "kill -9 " not in response.stdout: + raise ChaosActionFailedError( + "Could not kill process with executor id %s on node %s" % (executor_id, node_hostname_domain)) + response_list.append(response) + return str(response_list) + +# # def kill_executor_on_node(node_alias): +# +# def kill_driver() . 
+# +# +# create +# its +# template +# also +# +# --------------yarn +# +# +# def get_all_jobs(): +# in nimble +# "curl -i -X GET http://192.168.134.192:8088/ws/v1/cluster/apps/" +# +# +# def get_all_jobs_with_state(state): +# in nimble +# "curl -i -X GET http://192.168.134.192:8088/ws/v1/cluster/apps/?state=RUNNING" +# +# +# def get_application_id(): +# in nimble +# +# +# def get_job_details(): +# in nimble +# "curl -i -X GET http://192.168.134.192:8088/ws/v1/cluster/apps/application_1563363670071_0206" +# +# +# def get_job_elapsed_time(): +# +# +# -------------------- spark +# +# +# def get_all_jobs(): +# in nimble +# "curl -i -X GET http://testautomation-mgt-01.cloud.in.guavus.com:18081/api/v1/applications/" +# X +# +# +# def get_job_details(): +# in nimble +# "curl -i -X GET http://testautomation-mgt-01.cloud.in.guavus.com:18081/api/v1/applications/application_1563363670071_0214" +# +# +# def get_job_attempt_ids(): +# in nimble +# +# +# def get_job_executors(): +# "curl -i -X GET http://testautomation-mgt-01.cloud.in.guavus.com:18081/api/v1/applications/application_1563363670071_0214/1/executors/" +# in nimble +# +# +# handlings +# for kerberos +# improve +# logging +# docstrings +# pending +# things diff --git a/chaostoolkit_nimble/core/utils/yarn_apps_ha_utils.py b/chaostoolkit_nimble/core/utils/yarn_apps_ha_utils.py new file mode 100644 index 0000000..8d80053 --- /dev/null +++ b/chaostoolkit_nimble/core/utils/yarn_apps_ha_utils.py @@ -0,0 +1,11 @@ +from logzero import logger + + +# hadoop_rest_client_utils = HadoopRestClientUtils() + + +def is_job_running_on_yarn(job_name): + from nimble.core.utils.components.hadoop_utils import HadoopRestClientUtils + hadoop_rest_client_utils = HadoopRestClientUtils() + logger.debug("Checking if job '%s' on yarn" % job_name) + return hadoop_rest_client_utils.is_yarn_job_running(job_name) diff --git a/chaostoolkit_nimble/resources/exp_templates/process/exp.json b/chaostoolkit_nimble/resources/exp_templates/process/exp.json 
index 92cfa8b..e73fcbb 100644 --- a/chaostoolkit_nimble/resources/exp_templates/process/exp.json +++ b/chaostoolkit_nimble/resources/exp_templates/process/exp.json @@ -1,44 +1,52 @@ { - "version": "1.0.0", - "title": "Experiment with killing a process on system", - "description": "The process should auto-respawn after being killed.", - "tags": ["process"], - "controls": [{ - "name": "process-related-controls", + "version": "1.0.0", + "title": "Experiment with killing a process on system", + "description": "The process should auto-respawn after being killed.", + "tags": [ + "process" + ], + "controls": [ + { + "name": "process-related-controls", + "provider": { + "type": "python", + "module": "chaostoolkit_nimble.controllers.process.control" + } + } + ], + "steady-state-hypothesis": { + "title": "Process {{rand_dynamic_process_name}} is up and running", + "probes": [ + { + "type": "probe", + "name": "Fetch-process-id", + "tolerance": true, "provider": { - "type": "python", - "module": "chaostoolkit_nimble.controllers.process.control" + "module": "chaostoolkit_nimble.core.utils.ha_utils", + "type": "python", + "func": "check_process_running", + "arguments": { + "component": "{{rand_dynamic_component}}", + "process_name": "{{rand_dynamic_process_name}}" + } } - }], - "steady-state-hypothesis": { - "title": "Process {{rand_dynamic_process_name}} is up and running", - "probes": [{ - "type": "probe", - "name": "Fetch-process-id", - "tolerance": true, - "provider": { - "module": "chaostoolkit_nimble.core.utils.ha_utils", + } + ] + }, + "method": [ + { + "type": "action", + "name": "Kill-process", + "provider": { + "module": "chaostoolkit_nimble.core.utils.ha_utils", "type": "python", - "func": "check_process_running", + "func": "kill_process", "arguments": { "component": "{{rand_dynamic_component}}", - "process_name": "{{rand_dynamic_process_name}}" - } - } - }] - }, - "method": [{ - "type": "action", - "name": "Kill-process", - "provider": { - "module": 
"chaostoolkit_nimble.core.utils.ha_utils", - "type": "python", - "func": "kill_process", - "arguments": { - "component": "{{rand_dynamic_component}}", - "process_name": "{{rand_dynamic_process_name}}", - "num_of_nodes": "1" + "process_name": "{{rand_dynamic_process_name}}", + "num_of_nodes": "1" } } - }] -} + } + ] +} \ No newline at end of file diff --git a/chaostoolkit_nimble/resources/exp_templates/spark/__init__.py b/chaostoolkit_nimble/resources/exp_templates/spark/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chaostoolkit_nimble/resources/exp_templates/spark/executor_kill_exp.json b/chaostoolkit_nimble/resources/exp_templates/spark/executor_kill_exp.json new file mode 100644 index 0000000..3e434ff --- /dev/null +++ b/chaostoolkit_nimble/resources/exp_templates/spark/executor_kill_exp.json @@ -0,0 +1,42 @@ +{ + "version": "1.0.0", + "title": "Experiment upon killing spark executor process/es on slave hosts", + "description": "The spark executor process/es should get auto-respawned after being killed.", + "tags": ["spark"], + "controls": [{ + "name": "process-related-controls", + "provider": { + "type": "python", + "module": "chaostoolkit_nimble.controllers.process.control" + } + }], + "steady-state-hypothesis": { + "title": "Job {{yarn_job_name}} is up and running on yarn", + "probes": [{ + "type": "probe", + "name": "Check-job-{{yarn_job_name}}-running-on-yarn", + "tolerance": true, + "provider": { + "module": "chaostoolkit_nimble.core.utils.yarn_apps_ha_utils", + "type": "python", + "func": "is_job_running_on_yarn", + "arguments": { + "job_name": "{{yarn_job_name}}" + } + } + }] + }, + "method": [{ + "type": "action", + "name": "Kill-active-spark-executors-for-job-{{spark_job_name}}", + "provider": { + "module": "chaostoolkit_nimble.core.utils.spark_apps_ha_utils", + "type": "python", + "func": "kill_active_executors", + "arguments": { + "job_name": "{{spark_job_name}}", + "num_of_exec": "{{num_of_exec_to_kill}}" + } + } + }] +} \ No 
newline at end of file diff --git a/chaostoolkit_nimble/resources/validation/sample_validation_config.yml b/chaostoolkit_nimble/resources/validation/sample_validation_config.yml index 0133dc2..8636604 100644 --- a/chaostoolkit_nimble/resources/validation/sample_validation_config.yml +++ b/chaostoolkit_nimble/resources/validation/sample_validation_config.yml @@ -153,7 +153,7 @@ jobs: source: hbase table_name: aut_squad_hbase_table scan_filter: FirstKeyOnlyFilter() - media_plane_microapp1: + Media_Plane: input: input1: source: hdfs diff --git a/chaostoolkit_nimble/tests/sample/test_jio.py b/chaostoolkit_nimble/tests/sample/test_jio.py index 5df4e91..dcddcff 100644 --- a/chaostoolkit_nimble/tests/sample/test_jio.py +++ b/chaostoolkit_nimble/tests/sample/test_jio.py @@ -8,7 +8,7 @@ @pytest.mark.incremental class TestJio(): - job_alias = "media_plane_microapp1" + job_alias = "Media_Plane" job_user = "ambari-qa" job_running_component = Components.MANAGEMENT.name @@ -29,8 +29,8 @@ def test_schedule_15min_job(self, media_plane_actions): def test_perform_15min_job_ha(self): exp_template_file = "process/exp.json" context = {"rand_dynamic_component": self.job_running_component, - # "rand_dynamic_process_name": self.job_alias, - "rand_dynamic_process_name": "sleep 1h", + "rand_dynamic_process_name": self.job_alias, + # "rand_dynamic_process_name": "sleep 1h", } chaos_user_actions.run_experiment(exp_template_file=exp_template_file, context=context) # chaos_user_actions.run_experiment(exp_file=OPTIONS_DICT["experimentsPath"]) diff --git a/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py b/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py new file mode 100644 index 0000000..15c5e6a --- /dev/null +++ b/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py @@ -0,0 +1,177 @@ +import pytest +from chaostoolkit_nimble.actions.base.flows import chaos_user_actions +from chaostoolkit_nimble.actions.jio.media_plane_actions import MediaPlaneActions +from 
nimble.core.entity.components import Components + +from nimble.core.entity.node_manager import NodeManager + +from nimble.core.utils.shell_utils import ShellUtils + +from nimble.core.utils.components.hadoop_utils import HadoopRestClientUtils + +from nimble.core.utils.components.spark_utils import SparkRestClientUtils + + +@pytest.mark.incremental +class TestJioSparkJob(): + job_alias = "Media_Plane" + job_user = "ambari-qa" + job_running_component = Components.MANAGEMENT.name + + @pytest.fixture(scope="session") + def media_plane_actions(self, config_parser): + return MediaPlaneActions(self.job_alias, config_parser, job_user=self.job_user) + + @pytest.fixture(scope="session") + def clean_table(self, media_plane_actions): + command = "hive -e 'drop table if exists %s.%s'" % ( + media_plane_actions.database_name, media_plane_actions.table_name) + assert NodeManager.node_obj.execute_command_on_node(media_plane_actions.node_alias, + ShellUtils.su(self.job_user, command)).status + + @pytest.fixture(scope="session") + def clean_job_stdout_files(self, media_plane_actions): + command = "cd %s && rm -rf %s" % (media_plane_actions.job_base_directory, media_plane_actions.job_stdout_file) + assert NodeManager.node_obj.execute_command_on_node(media_plane_actions.node_alias, + ShellUtils.su(self.job_user, command)).status + + ############################################# YARN APIs + + # def test_get_all_yarn_jobs(self): + # res1 = HadoopRestClientUtils().get_all_yarn_jobs() + # a = 1 + # + # def test_get_all_yarn_jobs_with_state(self): + # res2 = HadoopRestClientUtils().get_all_yarn_jobs(state=ApplicationState.FINISHED.value) + # a = 1 + # + # def test_get_yarn_most_recent_application_id_by_job_name(self): + # res3 = HadoopRestClientUtils().get_yarn_most_recent_application_id_by_job_name(self.job_alias, + # ApplicationState.FINISHED.value) + # a = 1 + # + # def test_get_yarn_job_details(self): + # res4 = 
HadoopRestClientUtils().get_yarn_job_details(HadoopRestClientUtils().get_yarn_most_recent_application_id_by_job_name(self.job_alias, + # ApplicationState.FINISHED.value)) + # a = 1 + # + # def test_get_yarn_job_attempts(self): + # res5 = HadoopRestClientUtils().get_yarn_job_attempts( + # HadoopRestClientUtils().get_yarn_most_recent_application_id_by_job_name(self.job_alias, + # ApplicationState.FINISHED.value)) + # a = 1 + # + # def test_get_yarn_job_last_attempt_id(self): + # rest6 = HadoopRestClientUtils().get_yarn_job_last_attempt_id( + # HadoopRestClientUtils().get_yarn_most_recent_application_id_by_job_name(self.job_alias, + # ApplicationState.FINISHED.value)) + # + # a = 1 + + ######################################## SPARK APIs + + # def test_get_all_applications(self): + # sres1 = SparkRestClientUtils().get_all_applications() + # a = 1 + # + # def test_get_all_applications_with_status(self): + # sres1 = SparkRestClientUtils().get_all_applications(status=ApplicationStatus.COMPLETED.value) + # a = 1 + # + # def test_get_application_details(self): + # sres2 = SparkRestClientUtils().get_application_details( + # HadoopRestClientUtils().get_yarn_most_recent_application_id_by_job_name(self.job_alias, + # ApplicationState.FINISHED.value)) + # a = 1 + # + # def test_get_application_attempts(self): + # sres3 = SparkRestClientUtils().get_application_attempts( + # HadoopRestClientUtils().get_yarn_most_recent_application_id_by_job_name(self.job_alias, + # ApplicationState.FINISHED.value)) + # a = 3 + # + # def test_get_application_last_attempt_id(self): + # sres4 = SparkRestClientUtils().get_application_last_attempt_id( + # HadoopRestClientUtils().get_yarn_most_recent_application_id_by_job_name(self.job_alias, + # ApplicationState.FINISHED.value)) + # a = 2 + # + # def test_get_application_all_executors(self): + # sres5 = SparkRestClientUtils().get_application_all_executors( + # HadoopRestClientUtils().get_yarn_most_recent_application_id_by_job_name(self.job_alias, + # 
ApplicationState.FINISHED.value)) + # a = 3 + # + # def test_get_application_active_executors(self): + # sres6 = SparkRestClientUtils().get_application_active_executors( + # HadoopRestClientUtils().get_yarn_most_recent_application_id_by_job_name(self.job_alias, + # ApplicationState.FINISHED.value)) + # a = 4 + + ######################################## CHAOS APIs + # def test_is_job_running_on_yarn(self): + # yres1 = yarn_apps_ha_utils.is_job_running_on_yarn(self.job_alias) + # a = 3 + # + # def test_is_job_running_on_spark(self): + # ssres1 = spark_apps_ha_utils.is_job_running_on_spark("Media Plane") + # a = 2 + # + # def test_kill_executor(self): + # ssres2 = spark_apps_ha_utils.kill_active_executors("Media Plane", num_of_exec=2) + # a = 4 + + # def test_kill_executor(self): + # job_name = self.job_alias + # num_of_exec = 1 + # hadoop_rest_client_utils = HadoopRestClientUtils() + # spark_client_utils = SparkRestClientUtils() + # application_id = hadoop_rest_client_utils.get_yarn_most_recent_application_id_by_job_name(job_name, + # state=ApplicationState.KILLED.value) + # executors = spark_client_utils.get_application_active_executors(application_id) + # for i in range(len(executors)): + # if executors[i]["id"] == "driver": + # executors.pop(i) + # break + # executors = random.sample(executors, int(num_of_exec)) + # response_list = [] + # for executor in executors: + # executor_id = executor["id"] + # node_hostname_domain = executor["hostPort"].split(":")[0] + # response = NodeManager.node_obj.execute_command_on_hostname_domain(node_hostname_domain, + # ShellUtils.kill_process_by_name("spark", + # pipe_command="grep -i '--executor-id %s'" % executor_id)) + # if "kill -9 " not in response.stdout: + # raise ChaosActionFailedError( + # "Could not kill process with executor id '%s' on node '%s'" % (executor_id, node_hostname_domain)) + # response_list.append(response) + # return str(response_list) + + # def test_1(self): + # arm = 
BaseHadoopAdapter().active_resource_manager() + # a = 5 + + # def test_is_job_running_on_yarn(self): + # hadoop_rest_client_utils = HadoopRestClientUtils() + # a = hadoop_rest_client_utils.is_yarn_job_running(self.job_alias) + # b = 4 + + # def test_is_job_running_on_spark(self): + # spark_client_utils = SparkRestClientUtils() + # c = spark_client_utils.is_job_running("Media Plane") + # d = 5 + + def test_schedule_15min_job(self, media_plane_actions, clean_table, clean_job_stdout_files): + assert media_plane_actions.schedule_15_min_job() + + def test_perform_15min_spark_job_ha(self, ): + exp_template_file = "spark/executor_kill_exp.json" + context = {"yarn_job_name": self.job_alias, + "spark_job_name": "Media_Plane", + "num_of_exec_to_kill": "1", + } + chaos_user_actions.run_experiment(exp_template_file=exp_template_file, context=context) + # chaos_user_actions.run_experiment(exp_file=OPTIONS_DICT["experimentsPath"]) + + def test_validation_on_15min_job_ha(self, user_actions, media_plane_actions): + user_actions.validate(media_plane_actions.validate_media_plane, self.job_alias) diff --git a/requirements.txt b/requirements.txt index ca16931..f1d5362 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ logzero chaostoolkit-lib pyyaml pytest-html +chaostoolkit chaostoolkit-reporting From a0a34775cea014b1337cfe6c87c384d03d5c9b98 Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Fri, 16 Aug 2019 16:48:59 +0530 Subject: [PATCH 12/40] Added retries and exception handlings in yarn and spark apis being used for chaos. Also added after experiment control. Code working (Breaking at chaos action). 
--- .../actions/base/flows/chaos_user_actions.py | 1 - .../actions/jio/media_plane_actions.py | 4 +- .../controllers/spark/__init__.py | 0 .../controllers/spark/control.py | 70 ++++++++++++++ .../core/utils/spark_apps_ha_utils.py | 93 +++---------------- .../core/utils/yarn_apps_ha_utils.py | 11 ++- .../spark/executor_kill_exp.json | 4 +- .../tests/sample/test_jio_spark_job.py | 81 +++++++++++----- 8 files changed, 152 insertions(+), 112 deletions(-) create mode 100644 chaostoolkit_nimble/controllers/spark/__init__.py create mode 100644 chaostoolkit_nimble/controllers/spark/control.py diff --git a/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py b/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py index 894044f..e574009 100644 --- a/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py +++ b/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py @@ -22,7 +22,6 @@ def run_experiment(exp_file=None, exp_template_file=None, context=None): experiment_file_response = ShellUtils.execute_shell_command( ShellUtils.find_files_in_directory(EXPERIMENTS_BASE_PATH)) for experiment_file in experiment_file_response.stdout.strip().split("\n"): - experiment_file = "/Users/kritika.saxena/chaos_folder_3/chaos_eng_automation/chaostoolkit_nimble/resources/exp_templates/spark/executor_kill_exp_direct.json" response = ShellUtils.execute_shell_command("chaos run %s" % experiment_file) status = re.search(r'.*Experiment\sended\swith\sstatus:\s(.*)', response.stderr).group(1) html_report_path = generate_html() diff --git a/chaostoolkit_nimble/actions/jio/media_plane_actions.py b/chaostoolkit_nimble/actions/jio/media_plane_actions.py index 4efcda7..c253f0e 100644 --- a/chaostoolkit_nimble/actions/jio/media_plane_actions.py +++ b/chaostoolkit_nimble/actions/jio/media_plane_actions.py @@ -1,8 +1,8 @@ import collections import logging +import mmh3 import re -import mmh3 from chaostoolkit_nimble.actions.jio.common_actions import CommonActions from 
nimble.actions.base.regression.config_actions import ConfigActions from nimble.core.entity.components import Components @@ -64,7 +64,7 @@ def schedule_15_min_job(self): self.job_base_directory, self.job_stdout_file, self.job_stdout_file) NodeManager.node_obj.execute_remote_command_in_bg(self.node_alias, ShellUtils.su(self.job_user, job_run_command)) - return NodeManager.node_obj.execute_command_on_node(self.node_alias, + return NodeManager.node_obj.execute_command_on_node(self.node_alias, "sleep 5s ; %s" % ShellUtils.fetch_process_id(self.job_alias)).stdout != "" def none_or_value(self, value): diff --git a/chaostoolkit_nimble/controllers/spark/__init__.py b/chaostoolkit_nimble/controllers/spark/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chaostoolkit_nimble/controllers/spark/control.py b/chaostoolkit_nimble/controllers/spark/control.py new file mode 100644 index 0000000..ee008ab --- /dev/null +++ b/chaostoolkit_nimble/controllers/spark/control.py @@ -0,0 +1,70 @@ +from chaoslib.types import Experiment, Configuration, Secrets, Activity, Run, Journal +from logzero import logger + +from chaostoolkit_nimble.controllers.base import control +from nimble.core.utils.components.hadoop_utils import HadoopRestClientUtils + +control.configure_control() +APPLICATION_ID = None + + +def after_activity_control(context: Activity, state: Run, + configuration: Configuration = None, + secrets: Secrets = None, **kwargs): + """ + after-control of the activity's execution + + Called by the Chaos Toolkit before the activity is applied. The result of + the execution is passed as `state`. See + https://docs.chaostoolkit.org/reference/api/journal/#run for more + information. 
+ """ + logger.debug("----------------STATE AFTER ACTIVITY: %s" % state) + + +def before_method_control(context: Experiment, + configuration: Configuration = None, + secrets: Secrets = None, **kwargs): + """ + before-control of the method's execution + + Called by the Chaos Toolkit before the activities of the method are + applied. + """ + logger.debug("----------------CONFIGURATION BEFORE METHOD: %s" % configuration) + + +def after_method_control(context: Experiment, + configuration: Configuration = None, + secrets: Secrets = None, **kwargs): + """ + before-control of the method's execution + + Called by the Chaos Toolkit before the activities of the method are + applied. + """ + logger.debug("----------------CONFIGURATION AFTER METHOD: %s" % configuration) + + +def after_experiment_control(context: Experiment, state: Journal, + configuration: Configuration = None, + secrets: Secrets = None, **kwargs): + """ + after-control of the experiment's execution + + Called by the Chaos Toolkit after the experiment's completed. It passes the + journal of the execution. At that stage, the after control has no influence + over the execution however. Please see + https://docs.chaostoolkit.org/reference/api/journal/#journal-elements + for more information about the journal. + """ + logger.debug("AFTER EXPERIMENT CONTROL: %s" % state) + hadoop_rest_client_utils = HadoopRestClientUtils() + if hadoop_rest_client_utils.is_yarn_job_finished(APPLICATION_ID): + job_stats = hadoop_rest_client_utils.get_yarn_job_details(APPLICATION_ID) + logger.info("Total execution time for yarn job with application id %s: %s" % ( + APPLICATION_ID, job_stats["app"]["elapsedTime"])) + else: + logger.info("Yarn job with application id %s is not in FINISHED state. Please check." 
% APPLICATION_ID) + logger.info("Stats for application id %s: %s" % ( + APPLICATION_ID, hadoop_rest_client_utils.get_yarn_job_details(APPLICATION_ID))) diff --git a/chaostoolkit_nimble/core/utils/spark_apps_ha_utils.py b/chaostoolkit_nimble/core/utils/spark_apps_ha_utils.py index 191a133..f4c99b8 100644 --- a/chaostoolkit_nimble/core/utils/spark_apps_ha_utils.py +++ b/chaostoolkit_nimble/core/utils/spark_apps_ha_utils.py @@ -1,30 +1,27 @@ import random from logzero import logger +from retrying import RetryError +from chaostoolkit_nimble.controllers.spark import control from chaostoolkit_nimble.core.exceptions.custom_exceptions import ChaosActionFailedError from nimble.core.adapters.hadoop.base_hadoop_adapter import ApplicationState from nimble.core.entity.node_manager import NodeManager +from nimble.core.utils.components.hadoop_utils import HadoopRestClientUtils +from nimble.core.utils.components.spark_utils import SparkRestClientUtils from nimble.core.utils.shell_utils import ShellUtils -# spark_client_utils = SparkRestClientUtils() - - -def is_job_running_on_spark(job_name): - from nimble.core.utils.components.spark_utils import SparkRestClientUtils - spark_client_utils = SparkRestClientUtils() - logger.debug("Checking if job '%s' on spark" % job_name) - spark_client_utils.is_job_running(job_name) - def kill_active_executors(job_name, num_of_exec=1): - from nimble.core.utils.components.hadoop_utils import HadoopRestClientUtils - from nimble.core.utils.components.spark_utils import SparkRestClientUtils hadoop_rest_client_utils = HadoopRestClientUtils() spark_client_utils = SparkRestClientUtils() - application_id = hadoop_rest_client_utils.get_yarn_most_recent_application_id_by_job_name(job_name, - state=ApplicationState.RUNNING.value) - executors = spark_client_utils.get_application_active_executors(application_id) + try: + control.APPLICATION_ID = hadoop_rest_client_utils.get_yarn_most_recent_application_id_by_job_name(job_name, + 
state=ApplicationState.RUNNING.value) + except RetryError: + raise ChaosActionFailedError( + "Could not fetch yarn application id for job %s in state %s:" % (job_name, ApplicationState.RUNNING.value)) + executors = spark_client_utils.get_application_active_executors(control.APPLICATION_ID) for i in range(len(executors)): if executors[i]["id"] == "driver": executors.pop(i) @@ -37,75 +34,9 @@ def kill_active_executors(job_name, num_of_exec=1): logger.debug("Killing executor id %s on node %s" % (executor_id, node_hostname_domain)) response = NodeManager.node_obj.execute_command_on_hostname_domain(node_hostname_domain, ShellUtils.kill_process_by_name("spark", - pipe_command="grep -i '--executor-id %s'" % executor_id)) + pipe_command='grep -i "executor-id %s"' % executor_id)) if "kill -9 " not in response.stdout: raise ChaosActionFailedError( "Could not kill process with executor id %s on node %s" % (executor_id, node_hostname_domain)) response_list.append(response) return str(response_list) - -# # def kill_executor_on_node(node_alias): -# -# def kill_driver() . 
-# -# -# create -# its -# template -# also -# -# --------------yarn -# -# -# def get_all_jobs(): -# in nimble -# "curl -i -X GET http://192.168.134.192:8088/ws/v1/cluster/apps/" -# -# -# def get_all_jobs_with_state(state): -# in nimble -# "curl -i -X GET http://192.168.134.192:8088/ws/v1/cluster/apps/?state=RUNNING" -# -# -# def get_application_id(): -# in nimble -# -# -# def get_job_details(): -# in nimble -# "curl -i -X GET http://192.168.134.192:8088/ws/v1/cluster/apps/application_1563363670071_0206" -# -# -# def get_job_elapsed_time(): -# -# -# -------------------- spark -# -# -# def get_all_jobs(): -# in nimble -# "curl -i -X GET http://testautomation-mgt-01.cloud.in.guavus.com:18081/api/v1/applications/" -# X -# -# -# def get_job_details(): -# in nimble -# "curl -i -X GET http://testautomation-mgt-01.cloud.in.guavus.com:18081/api/v1/applications/application_1563363670071_0214" -# -# -# def get_job_attempt_ids(): -# in nimble -# -# -# def get_job_executors(): -# "curl -i -X GET http://testautomation-mgt-01.cloud.in.guavus.com:18081/api/v1/applications/application_1563363670071_0214/1/executors/" -# in nimble -# -# -# handlings -# for kerberos -# improve -# logging -# docstrings -# pending -# things diff --git a/chaostoolkit_nimble/core/utils/yarn_apps_ha_utils.py b/chaostoolkit_nimble/core/utils/yarn_apps_ha_utils.py index 8d80053..605d5e1 100644 --- a/chaostoolkit_nimble/core/utils/yarn_apps_ha_utils.py +++ b/chaostoolkit_nimble/core/utils/yarn_apps_ha_utils.py @@ -1,11 +1,14 @@ from logzero import logger +from retrying import RetryError - -# hadoop_rest_client_utils = HadoopRestClientUtils() +from nimble.core.utils.components.hadoop_utils import HadoopRestClientUtils def is_job_running_on_yarn(job_name): - from nimble.core.utils.components.hadoop_utils import HadoopRestClientUtils hadoop_rest_client_utils = HadoopRestClientUtils() logger.debug("Checking if job '%s' on yarn" % job_name) - return hadoop_rest_client_utils.is_yarn_job_running(job_name) + try: + 
return hadoop_rest_client_utils.is_yarn_job_running(job_name=job_name) + except RetryError: + logger.info("Not able to fetch yarn job status.") + return False diff --git a/chaostoolkit_nimble/resources/exp_templates/spark/executor_kill_exp.json b/chaostoolkit_nimble/resources/exp_templates/spark/executor_kill_exp.json index 3e434ff..439b3fe 100644 --- a/chaostoolkit_nimble/resources/exp_templates/spark/executor_kill_exp.json +++ b/chaostoolkit_nimble/resources/exp_templates/spark/executor_kill_exp.json @@ -4,10 +4,10 @@ "description": "The spark executor process/es should get auto-respawned after being killed.", "tags": ["spark"], "controls": [{ - "name": "process-related-controls", + "name": "spark-related-controls", "provider": { "type": "python", - "module": "chaostoolkit_nimble.controllers.process.control" + "module": "chaostoolkit_nimble.controllers.spark.control" } }], "steady-state-hypothesis": { diff --git a/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py b/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py index 15c5e6a..56acb5d 100644 --- a/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py +++ b/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py @@ -1,15 +1,14 @@ +import logging + import pytest + from chaostoolkit_nimble.actions.base.flows import chaos_user_actions from chaostoolkit_nimble.actions.jio.media_plane_actions import MediaPlaneActions from nimble.core.entity.components import Components - from nimble.core.entity.node_manager import NodeManager - from nimble.core.utils.shell_utils import ShellUtils -from nimble.core.utils.components.hadoop_utils import HadoopRestClientUtils - -from nimble.core.utils.components.spark_utils import SparkRestClientUtils +_LOGGER = logging.getLogger(__name__) @pytest.mark.incremental @@ -32,8 +31,61 @@ def clean_table(self, media_plane_actions): @pytest.fixture(scope="session") def clean_job_stdout_files(self, media_plane_actions): command = "cd %s && rm -rf %s" % 
(media_plane_actions.job_base_directory, media_plane_actions.job_stdout_file) - assert NodeManager.node_obj.execute_command_on_node(media_plane_actions.node_alias, - ShellUtils.su(self.job_user, command)).status + NodeManager.node_obj.execute_command_on_node(media_plane_actions.node_alias, + ShellUtils.su(self.job_user, command)) + + def test_schedule_15min_job(self, media_plane_actions, clean_table, clean_job_stdout_files): + assert media_plane_actions.schedule_15_min_job() + + # def test_is_job_running_on_yarn(self): + # hadoop_rest_client_utils = HadoopRestClientUtils() + # try: + # a = hadoop_rest_client_utils.is_yarn_job_running(self.job_alias) + # except RetryError: + # _LOGGER.exception("Not able to fetch CDAP query status") + # + # def test_kill_active_executors(self): + # yarn_job_name = "Media_Plane" + # spark_job_name = "Media Plane" + # num_of_exec = 1 + # hadoop_rest_client_utils = HadoopRestClientUtils() + # spark_client_utils = SparkRestClientUtils() + # try: + # application_id = hadoop_rest_client_utils.get_yarn_most_recent_application_id_by_job_name(yarn_job_name, + # state=ApplicationState.RUNNING.value) + # except RetryError: + # raise ChaosActionFailedError("Could not fetch yarn application id for job %s in state %s:" % ( + # yarn_job_name, ApplicationState.RUNNING.value)) + # executors = spark_client_utils.get_application_active_executors(application_id) + # for i in range(len(executors)): + # if executors[i]["id"] == "driver": + # executors.pop(i) + # break + # executors = random.sample(executors, int(num_of_exec)) + # response_list = [] + # for executor in executors: + # executor_id = executor["id"] + # node_hostname_domain = executor["hostPort"].split(":")[0] + # logger.debug("Killing executor id %s on node %s" % (executor_id, node_hostname_domain)) + # response = NodeManager.node_obj.execute_command_on_hostname_domain(node_hostname_domain, + # ShellUtils.kill_process_by_name("spark", + # pipe_command='grep -i "executor-id %s"' % 
executor_id)) + # if "kill -9 " not in response.stdout: + # raise ChaosActionFailedError( + # "Could not kill process with executor id %s on node %s" % (executor_id, node_hostname_domain)) + # response_list.append(response) + # return str(response_list) + + def test_perform_15min_spark_job_ha(self): + exp_template_file = "spark/executor_kill_exp.json" + context = {"yarn_job_name": self.job_alias, + "num_of_exec_to_kill": "1", + } + chaos_user_actions.run_experiment(exp_template_file=exp_template_file, context=context) + # chaos_user_actions.run_experiment(exp_file=OPTIONS_DICT["experimentsPath"]) + + def test_validation_on_15min_job_ha(self, user_actions, media_plane_actions): + user_actions.validate(media_plane_actions.validate_media_plane, self.job_alias) ############################################# YARN APIs @@ -160,18 +212,3 @@ def clean_job_stdout_files(self, media_plane_actions): # spark_client_utils = SparkRestClientUtils() # c = spark_client_utils.is_job_running("Media Plane") # d = 5 - - def test_schedule_15min_job(self, media_plane_actions, clean_table, clean_job_stdout_files): - assert media_plane_actions.schedule_15_min_job() - - def test_perform_15min_spark_job_ha(self, ): - exp_template_file = "spark/executor_kill_exp.json" - context = {"yarn_job_name": self.job_alias, - "spark_job_name": "Media_Plane", - "num_of_exec_to_kill": "1", - } - chaos_user_actions.run_experiment(exp_template_file=exp_template_file, context=context) - # chaos_user_actions.run_experiment(exp_file=OPTIONS_DICT["experimentsPath"]) - - def test_validation_on_15min_job_ha(self, user_actions, media_plane_actions): - user_actions.validate(media_plane_actions.validate_media_plane, self.job_alias) From 46c929a6901741a4203d4cb8f6c320c61f0837fb Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Fri, 16 Aug 2019 18:58:28 +0530 Subject: [PATCH 13/40] Working code. 
--- .../spark/executor_kill_exp.json | 12 +- .../tests/sample/test_jio_spark_job.py | 167 +----------------- 2 files changed, 7 insertions(+), 172 deletions(-) diff --git a/chaostoolkit_nimble/resources/exp_templates/spark/executor_kill_exp.json b/chaostoolkit_nimble/resources/exp_templates/spark/executor_kill_exp.json index 439b3fe..1e42149 100644 --- a/chaostoolkit_nimble/resources/exp_templates/spark/executor_kill_exp.json +++ b/chaostoolkit_nimble/resources/exp_templates/spark/executor_kill_exp.json @@ -1,6 +1,6 @@ { "version": "1.0.0", - "title": "Experiment upon killing spark executor process/es on slave hosts", + "title": "Experiment with killing n number of spark executors for a spark job.", "description": "The spark executor process/es should get auto-respawned after being killed.", "tags": ["spark"], "controls": [{ @@ -11,30 +11,30 @@ } }], "steady-state-hypothesis": { - "title": "Job {{yarn_job_name}} is up and running on yarn", + "title": "Job {{job_name}} is up and running on yarn", "probes": [{ "type": "probe", - "name": "Check-job-{{yarn_job_name}}-running-on-yarn", + "name": "Check-job-{{job_name}}-running-on-yarn", "tolerance": true, "provider": { "module": "chaostoolkit_nimble.core.utils.yarn_apps_ha_utils", "type": "python", "func": "is_job_running_on_yarn", "arguments": { - "job_name": "{{yarn_job_name}}" + "job_name": "{{job_name}}" } } }] }, "method": [{ "type": "action", - "name": "Kill-active-spark-executors-for-job-{{spark_job_name}}", + "name": "Kill-active-spark-executors-for-job-{{job_name}}", "provider": { "module": "chaostoolkit_nimble.core.utils.spark_apps_ha_utils", "type": "python", "func": "kill_active_executors", "arguments": { - "job_name": "{{spark_job_name}}", + "job_name": "{{job_name}}", "num_of_exec": "{{num_of_exec_to_kill}}" } } diff --git a/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py b/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py index 56acb5d..f881e47 100644 --- 
a/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py +++ b/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py @@ -37,48 +37,9 @@ def clean_job_stdout_files(self, media_plane_actions): def test_schedule_15min_job(self, media_plane_actions, clean_table, clean_job_stdout_files): assert media_plane_actions.schedule_15_min_job() - # def test_is_job_running_on_yarn(self): - # hadoop_rest_client_utils = HadoopRestClientUtils() - # try: - # a = hadoop_rest_client_utils.is_yarn_job_running(self.job_alias) - # except RetryError: - # _LOGGER.exception("Not able to fetch CDAP query status") - # - # def test_kill_active_executors(self): - # yarn_job_name = "Media_Plane" - # spark_job_name = "Media Plane" - # num_of_exec = 1 - # hadoop_rest_client_utils = HadoopRestClientUtils() - # spark_client_utils = SparkRestClientUtils() - # try: - # application_id = hadoop_rest_client_utils.get_yarn_most_recent_application_id_by_job_name(yarn_job_name, - # state=ApplicationState.RUNNING.value) - # except RetryError: - # raise ChaosActionFailedError("Could not fetch yarn application id for job %s in state %s:" % ( - # yarn_job_name, ApplicationState.RUNNING.value)) - # executors = spark_client_utils.get_application_active_executors(application_id) - # for i in range(len(executors)): - # if executors[i]["id"] == "driver": - # executors.pop(i) - # break - # executors = random.sample(executors, int(num_of_exec)) - # response_list = [] - # for executor in executors: - # executor_id = executor["id"] - # node_hostname_domain = executor["hostPort"].split(":")[0] - # logger.debug("Killing executor id %s on node %s" % (executor_id, node_hostname_domain)) - # response = NodeManager.node_obj.execute_command_on_hostname_domain(node_hostname_domain, - # ShellUtils.kill_process_by_name("spark", - # pipe_command='grep -i "executor-id %s"' % executor_id)) - # if "kill -9 " not in response.stdout: - # raise ChaosActionFailedError( - # "Could not kill process with executor id %s on node %s" % 
(executor_id, node_hostname_domain)) - # response_list.append(response) - # return str(response_list) - def test_perform_15min_spark_job_ha(self): exp_template_file = "spark/executor_kill_exp.json" - context = {"yarn_job_name": self.job_alias, + context = {"job_name": self.job_alias, "num_of_exec_to_kill": "1", } chaos_user_actions.run_experiment(exp_template_file=exp_template_file, context=context) @@ -86,129 +47,3 @@ def test_perform_15min_spark_job_ha(self): def test_validation_on_15min_job_ha(self, user_actions, media_plane_actions): user_actions.validate(media_plane_actions.validate_media_plane, self.job_alias) - - ############################################# YARN APIs - - # def test_get_all_yarn_jobs(self): - # res1 = HadoopRestClientUtils().get_all_yarn_jobs() - # a = 1 - # - # def test_get_all_yarn_jobs_with_state(self): - # res2 = HadoopRestClientUtils().get_all_yarn_jobs(state=ApplicationState.FINISHED.value) - # a = 1 - # - # def test_get_yarn_most_recent_application_id_by_job_name(self): - # res3 = HadoopRestClientUtils().get_yarn_most_recent_application_id_by_job_name(self.job_alias, - # ApplicationState.FINISHED.value) - # a = 1 - # - # def test_get_yarn_job_details(self): - # res4 = HadoopRestClientUtils().get_yarn_job_details(HadoopRestClientUtils().get_yarn_most_recent_application_id_by_job_name(self.job_alias, - # ApplicationState.FINISHED.value)) - # a = 1 - # - # def test_get_yarn_job_attempts(self): - # res5 = HadoopRestClientUtils().get_yarn_job_attempts( - # HadoopRestClientUtils().get_yarn_most_recent_application_id_by_job_name(self.job_alias, - # ApplicationState.FINISHED.value)) - # a = 1 - # - # def test_get_yarn_job_last_attempt_id(self): - # rest6 = HadoopRestClientUtils().get_yarn_job_last_attempt_id( - # HadoopRestClientUtils().get_yarn_most_recent_application_id_by_job_name(self.job_alias, - # ApplicationState.FINISHED.value)) - # - # a = 1 - - ######################################## SPARK APIs - - # def 
test_get_all_applications(self): - # sres1 = SparkRestClientUtils().get_all_applications() - # a = 1 - # - # def test_get_all_applications_with_status(self): - # sres1 = SparkRestClientUtils().get_all_applications(status=ApplicationStatus.COMPLETED.value) - # a = 1 - # - # def test_get_application_details(self): - # sres2 = SparkRestClientUtils().get_application_details( - # HadoopRestClientUtils().get_yarn_most_recent_application_id_by_job_name(self.job_alias, - # ApplicationState.FINISHED.value)) - # a = 1 - # - # def test_get_application_attempts(self): - # sres3 = SparkRestClientUtils().get_application_attempts( - # HadoopRestClientUtils().get_yarn_most_recent_application_id_by_job_name(self.job_alias, - # ApplicationState.FINISHED.value)) - # a = 3 - # - # def test_get_application_last_attempt_id(self): - # sres4 = SparkRestClientUtils().get_application_last_attempt_id( - # HadoopRestClientUtils().get_yarn_most_recent_application_id_by_job_name(self.job_alias, - # ApplicationState.FINISHED.value)) - # a = 2 - # - # def test_get_application_all_executors(self): - # sres5 = SparkRestClientUtils().get_application_all_executors( - # HadoopRestClientUtils().get_yarn_most_recent_application_id_by_job_name(self.job_alias, - # ApplicationState.FINISHED.value)) - # a = 3 - # - # def test_get_application_active_executors(self): - # sres6 = SparkRestClientUtils().get_application_active_executors( - # HadoopRestClientUtils().get_yarn_most_recent_application_id_by_job_name(self.job_alias, - # ApplicationState.FINISHED.value)) - # a = 4 - - ######################################## CHAOS APIs - # def test_is_job_running_on_yarn(self): - # yres1 = yarn_apps_ha_utils.is_job_running_on_yarn(self.job_alias) - # a = 3 - # - # def test_is_job_running_on_spark(self): - # ssres1 = spark_apps_ha_utils.is_job_running_on_spark("Media Plane") - # a = 2 - # - # def test_kill_executor(self): - # ssres2 = spark_apps_ha_utils.kill_active_executors("Media Plane", num_of_exec=2) - # a = 4 - - 
# def test_kill_executor(self): - # job_name = self.job_alias - # num_of_exec = 1 - # hadoop_rest_client_utils = HadoopRestClientUtils() - # spark_client_utils = SparkRestClientUtils() - # application_id = hadoop_rest_client_utils.get_yarn_most_recent_application_id_by_job_name(job_name, - # state=ApplicationState.KILLED.value) - # executors = spark_client_utils.get_application_active_executors(application_id) - # for i in range(len(executors)): - # if executors[i]["id"] == "driver": - # executors.pop(i) - # break - # executors = random.sample(executors, int(num_of_exec)) - # response_list = [] - # for executor in executors: - # executor_id = executor["id"] - # node_hostname_domain = executor["hostPort"].split(":")[0] - # response = NodeManager.node_obj.execute_command_on_hostname_domain(node_hostname_domain, - # ShellUtils.kill_process_by_name("spark", - # pipe_command="grep -i '--executor-id %s'" % executor_id)) - # if "kill -9 " not in response.stdout: - # raise ChaosActionFailedError( - # "Could not kill process with executor id '%s' on node '%s'" % (executor_id, node_hostname_domain)) - # response_list.append(response) - # return str(response_list) - - # def test_1(self): - # arm = BaseHadoopAdapter().active_resource_manager() - # a = 5 - - # def test_is_job_running_on_yarn(self): - # hadoop_rest_client_utils = HadoopRestClientUtils() - # a = hadoop_rest_client_utils.is_yarn_job_running(self.job_alias) - # b = 4 - - # def test_is_job_running_on_spark(self): - # spark_client_utils = SparkRestClientUtils() - # c = spark_client_utils.is_job_running("Media Plane") - # d = 5 From e1e8ca29d9945d1bba5ed5c668889909335ab263 Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Mon, 19 Aug 2019 14:14:33 +0530 Subject: [PATCH 14/40] Driver kill code + Logging updation in exec kill action + ms to minutes conversion --- .../controllers/spark/control.py | 7 ++- .../core/utils/spark_apps_ha_utils.py | 46 +++++++++++++++++-- .../exp_templates/spark/driver_kill_exp.json | 41 
+++++++++++++++++ .../tests/sample/test_jio_spark_job.py | 1 + 4 files changed, 89 insertions(+), 6 deletions(-) create mode 100644 chaostoolkit_nimble/resources/exp_templates/spark/driver_kill_exp.json diff --git a/chaostoolkit_nimble/controllers/spark/control.py b/chaostoolkit_nimble/controllers/spark/control.py index ee008ab..49edf89 100644 --- a/chaostoolkit_nimble/controllers/spark/control.py +++ b/chaostoolkit_nimble/controllers/spark/control.py @@ -3,6 +3,7 @@ from chaostoolkit_nimble.controllers.base import control from nimble.core.utils.components.hadoop_utils import HadoopRestClientUtils +from nimble.core.utils.date_utils import DateUtils, Timezone control.configure_control() APPLICATION_ID = None @@ -58,12 +59,14 @@ def after_experiment_control(context: Experiment, state: Journal, https://docs.chaostoolkit.org/reference/api/journal/#journal-elements for more information about the journal. """ + date_utils = DateUtils(Timezone.UTC.value) logger.debug("AFTER EXPERIMENT CONTROL: %s" % state) hadoop_rest_client_utils = HadoopRestClientUtils() if hadoop_rest_client_utils.is_yarn_job_finished(APPLICATION_ID): job_stats = hadoop_rest_client_utils.get_yarn_job_details(APPLICATION_ID) - logger.info("Total execution time for yarn job with application id %s: %s" % ( - APPLICATION_ID, job_stats["app"]["elapsedTime"])) + logger.info("Total execution time for yarn job with application id %s: %s ms (i.e %s minutes) " % ( + APPLICATION_ID, job_stats["app"]["elapsedTime"], + date_utils.get_minutes_from_milliseconds(job_stats["app"]["elapsedTime"]))) else: logger.info("Yarn job with application id %s is not in FINISHED state. Please check." 
% APPLICATION_ID) logger.info("Stats for application id %s: %s" % ( diff --git a/chaostoolkit_nimble/core/utils/spark_apps_ha_utils.py b/chaostoolkit_nimble/core/utils/spark_apps_ha_utils.py index f4c99b8..fe105d0 100644 --- a/chaostoolkit_nimble/core/utils/spark_apps_ha_utils.py +++ b/chaostoolkit_nimble/core/utils/spark_apps_ha_utils.py @@ -16,12 +16,18 @@ def kill_active_executors(job_name, num_of_exec=1): hadoop_rest_client_utils = HadoopRestClientUtils() spark_client_utils = SparkRestClientUtils() try: + logger.info("Fetching yarn application id for the running job %s." % job_name) control.APPLICATION_ID = hadoop_rest_client_utils.get_yarn_most_recent_application_id_by_job_name(job_name, state=ApplicationState.RUNNING.value) except RetryError: raise ChaosActionFailedError( - "Could not fetch yarn application id for job %s in state %s:" % (job_name, ApplicationState.RUNNING.value)) - executors = spark_client_utils.get_application_active_executors(control.APPLICATION_ID) + "Could not fetch yarn application id for job %s. 
Job not found in '%s' state" % (job_name, ApplicationState.RUNNING.value)) + try: + logger.info("Fetching spark active executors for application id : %s" % control.APPLICATION_ID) + executors = spark_client_utils.get_application_active_executors(control.APPLICATION_ID) + except RetryError: + raise ChaosActionFailedError( + "Could not fetch spark executors for the application id :" % control.APPLICATION_ID) for i in range(len(executors)): if executors[i]["id"] == "driver": executors.pop(i) @@ -31,12 +37,44 @@ def kill_active_executors(job_name, num_of_exec=1): for executor in executors: executor_id = executor["id"] node_hostname_domain = executor["hostPort"].split(":")[0] - logger.debug("Killing executor id %s on node %s" % (executor_id, node_hostname_domain)) + logger.debug("Killing spark executor id %s on node %s" % (executor_id, node_hostname_domain)) response = NodeManager.node_obj.execute_command_on_hostname_domain(node_hostname_domain, ShellUtils.kill_process_by_name("spark", pipe_command='grep -i "executor-id %s"' % executor_id)) if "kill -9 " not in response.stdout: raise ChaosActionFailedError( - "Could not kill process with executor id %s on node %s" % (executor_id, node_hostname_domain)) + "Could not kill process with spark executor id %s on node %s" % (executor_id, node_hostname_domain)) response_list.append(response) return str(response_list) + + +def kill_driver(job_name): + hadoop_rest_client_utils = HadoopRestClientUtils() + spark_client_utils = SparkRestClientUtils() + try: + logger.info("Fetching yarn application id for the running job %s." 
% job_name) + control.APPLICATION_ID = hadoop_rest_client_utils.get_yarn_most_recent_application_id_by_job_name(job_name, + state=ApplicationState.RUNNING.value) + except RetryError: + raise ChaosActionFailedError( + "Could not fetch yarn application id for job %s in state %s:" % (job_name, ApplicationState.RUNNING.value)) + try: + logger.info("Fetching spark driver for application id : %s" % control.APPLICATION_ID) + executors = spark_client_utils.get_application_active_executors(control.APPLICATION_ID) + except RetryError: + raise ChaosActionFailedError( + "Could not fetch spark executors for the job:" % job_name) + response = None + for executor in executors: + if executor["id"] == "driver": + executor_id = executor["id"] + node_hostname_domain = executor["hostPort"].split(":")[0] + logger.debug("Killing spark driver on node %s" % node_hostname_domain) + response = NodeManager.node_obj.execute_command_on_hostname_domain(node_hostname_domain, + ShellUtils.kill_process_by_name("spark", + pipe_command='grep -i "executor-id %s"' % executor_id)) + if "kill -9 " not in response.stdout: + raise ChaosActionFailedError( + "Could not kill spark driver process on node %s" % node_hostname_domain) + break + return str(response) diff --git a/chaostoolkit_nimble/resources/exp_templates/spark/driver_kill_exp.json b/chaostoolkit_nimble/resources/exp_templates/spark/driver_kill_exp.json new file mode 100644 index 0000000..db66244 --- /dev/null +++ b/chaostoolkit_nimble/resources/exp_templates/spark/driver_kill_exp.json @@ -0,0 +1,41 @@ +{ + "version": "1.0.0", + "title": "Experiment with killing the spark driver for the spark job.", + "description": "The spark driver should get auto-respawned after being killed.", + "tags": ["spark"], + "controls": [{ + "name": "spark-related-controls", + "provider": { + "type": "python", + "module": "chaostoolkit_nimble.controllers.spark.control" + } + }], + "steady-state-hypothesis": { + "title": "Job {{job_name}} is up and running on yarn", + 
"probes": [{ + "type": "probe", + "name": "Check-job-{{job_name}}-running-on-yarn", + "tolerance": true, + "provider": { + "module": "chaostoolkit_nimble.core.utils.yarn_apps_ha_utils", + "type": "python", + "func": "is_job_running_on_yarn", + "arguments": { + "job_name": "{{job_name}}" + } + } + }] + }, + "method": [{ + "type": "action", + "name": "Kill-spark-driver-for-job-{{job_name}}", + "provider": { + "module": "chaostoolkit_nimble.core.utils.spark_apps_ha_utils", + "type": "python", + "func": "kill_driver", + "arguments": { + "job_name": "{{job_name}}" + } + } + }] +} \ No newline at end of file diff --git a/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py b/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py index f881e47..5b5c681 100644 --- a/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py +++ b/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py @@ -39,6 +39,7 @@ def test_schedule_15min_job(self, media_plane_actions, clean_table, clean_job_st def test_perform_15min_spark_job_ha(self): exp_template_file = "spark/executor_kill_exp.json" + # exp_template_file = "spark/driver_kill_exp.json" context = {"job_name": self.job_alias, "num_of_exec_to_kill": "1", } From a7d3ed00651f92c4759c9397e15051c56b19066a Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Mon, 19 Aug 2019 15:06:55 +0530 Subject: [PATCH 15/40] Grep on app id for driver kill + journal custom path --- .../actions/base/flows/chaos_user_actions.py | 10 +++++----- chaostoolkit_nimble/core/utils/spark_apps_ha_utils.py | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py b/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py index e574009..3b5d149 100644 --- a/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py +++ b/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py @@ -15,6 +15,7 @@ def run_experiment(exp_file=None, exp_template_file=None, context=None): status = None + 
journal_path = "%s/journal.json" % global_constants.DEFAULT_LOCAL_ARTIFACTS_PATH if exp_file: ShellUtils.execute_shell_command(ShellUtils.copy(exp_file, EXPERIMENTS_BASE_PATH)) else: @@ -22,9 +23,9 @@ def run_experiment(exp_file=None, exp_template_file=None, context=None): experiment_file_response = ShellUtils.execute_shell_command( ShellUtils.find_files_in_directory(EXPERIMENTS_BASE_PATH)) for experiment_file in experiment_file_response.stdout.strip().split("\n"): - response = ShellUtils.execute_shell_command("chaos run %s" % experiment_file) + response = ShellUtils.execute_shell_command("chaos run --journal-path %s %s" % (journal_path, experiment_file)) status = re.search(r'.*Experiment\sended\swith\sstatus:\s(.*)', response.stderr).group(1) - html_report_path = generate_html() + html_report_path = generate_html(journal_path) allure.attach.file(html_report_path, name='Chaos experiment html report', attachment_type=allure.attachment_type.HTML) assert status == "completed" @@ -39,10 +40,9 @@ def render_template(exp_template_file, context): template.stream(context).dump('%s/exp.json' % EXPERIMENTS_BASE_PATH) -def generate_html(): - journal_json_path = "journal.json" +def generate_html(journal_path): html_report_path = "%s/chaos_report.html" % global_constants.DEFAULT_LOCAL_ARTIFACTS_PATH command = "export LC_ALL=en_US.UTF-8 && chaos report --export-format=html5 %s %s" % ( - journal_json_path, html_report_path) + journal_path, html_report_path) ShellUtils.execute_shell_command(command) return html_report_path diff --git a/chaostoolkit_nimble/core/utils/spark_apps_ha_utils.py b/chaostoolkit_nimble/core/utils/spark_apps_ha_utils.py index fe105d0..3017010 100644 --- a/chaostoolkit_nimble/core/utils/spark_apps_ha_utils.py +++ b/chaostoolkit_nimble/core/utils/spark_apps_ha_utils.py @@ -21,7 +21,8 @@ def kill_active_executors(job_name, num_of_exec=1): state=ApplicationState.RUNNING.value) except RetryError: raise ChaosActionFailedError( - "Could not fetch yarn application 
id for job %s. Job not found in '%s' state" % (job_name, ApplicationState.RUNNING.value)) + "Could not fetch yarn application id for job %s. Job not found in '%s' state" % ( + job_name, ApplicationState.RUNNING.value)) try: logger.info("Fetching spark active executors for application id : %s" % control.APPLICATION_ID) executors = spark_client_utils.get_application_active_executors(control.APPLICATION_ID) @@ -67,12 +68,11 @@ def kill_driver(job_name): response = None for executor in executors: if executor["id"] == "driver": - executor_id = executor["id"] node_hostname_domain = executor["hostPort"].split(":")[0] logger.debug("Killing spark driver on node %s" % node_hostname_domain) response = NodeManager.node_obj.execute_command_on_hostname_domain(node_hostname_domain, ShellUtils.kill_process_by_name("spark", - pipe_command='grep -i "executor-id %s"' % executor_id)) + pipe_command='grep -i %s' % control.APPLICATION_ID)) if "kill -9 " not in response.stdout: raise ChaosActionFailedError( "Could not kill spark driver process on node %s" % node_hostname_domain) From 913017ceb50b9737b763cb1e72a63201ff5cc091 Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Mon, 19 Aug 2019 17:51:45 +0530 Subject: [PATCH 16/40] handled retryerror on after experiment control + inserted rollbacks empty blocks. 
--- chaostoolkit_nimble/controllers/spark/control.py | 10 +++++----- .../resources/exp_templates/process/exp.json | 4 +++- .../resources/exp_templates/spark/driver_kill_exp.json | 4 +++- .../exp_templates/spark/executor_kill_exp.json | 4 +++- 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/chaostoolkit_nimble/controllers/spark/control.py b/chaostoolkit_nimble/controllers/spark/control.py index 49edf89..abf9aee 100644 --- a/chaostoolkit_nimble/controllers/spark/control.py +++ b/chaostoolkit_nimble/controllers/spark/control.py @@ -1,5 +1,6 @@ from chaoslib.types import Experiment, Configuration, Secrets, Activity, Run, Journal from logzero import logger +from retrying import RetryError from chaostoolkit_nimble.controllers.base import control from nimble.core.utils.components.hadoop_utils import HadoopRestClientUtils @@ -62,12 +63,11 @@ def after_experiment_control(context: Experiment, state: Journal, date_utils = DateUtils(Timezone.UTC.value) logger.debug("AFTER EXPERIMENT CONTROL: %s" % state) hadoop_rest_client_utils = HadoopRestClientUtils() - if hadoop_rest_client_utils.is_yarn_job_finished(APPLICATION_ID): + try: + hadoop_rest_client_utils.is_yarn_job_finished(APPLICATION_ID) job_stats = hadoop_rest_client_utils.get_yarn_job_details(APPLICATION_ID) logger.info("Total execution time for yarn job with application id %s: %s ms (i.e %s minutes) " % ( APPLICATION_ID, job_stats["app"]["elapsedTime"], date_utils.get_minutes_from_milliseconds(job_stats["app"]["elapsedTime"]))) - else: - logger.info("Yarn job with application id %s is not in FINISHED state. Please check." % APPLICATION_ID) - logger.info("Stats for application id %s: %s" % ( - APPLICATION_ID, hadoop_rest_client_utils.get_yarn_job_details(APPLICATION_ID))) + except RetryError: + logger.info("Yarn job with application id %s is not in 'FINISHED' state. Please check." 
% APPLICATION_ID) \ No newline at end of file diff --git a/chaostoolkit_nimble/resources/exp_templates/process/exp.json b/chaostoolkit_nimble/resources/exp_templates/process/exp.json index e73fcbb..b492aac 100644 --- a/chaostoolkit_nimble/resources/exp_templates/process/exp.json +++ b/chaostoolkit_nimble/resources/exp_templates/process/exp.json @@ -48,5 +48,7 @@ } } } - ] + ], + "rollbacks": [ + ] } \ No newline at end of file diff --git a/chaostoolkit_nimble/resources/exp_templates/spark/driver_kill_exp.json b/chaostoolkit_nimble/resources/exp_templates/spark/driver_kill_exp.json index db66244..d514a80 100644 --- a/chaostoolkit_nimble/resources/exp_templates/spark/driver_kill_exp.json +++ b/chaostoolkit_nimble/resources/exp_templates/spark/driver_kill_exp.json @@ -37,5 +37,7 @@ "job_name": "{{job_name}}" } } - }] + }], + "rollbacks": [ + ] } \ No newline at end of file diff --git a/chaostoolkit_nimble/resources/exp_templates/spark/executor_kill_exp.json b/chaostoolkit_nimble/resources/exp_templates/spark/executor_kill_exp.json index 1e42149..008e39e 100644 --- a/chaostoolkit_nimble/resources/exp_templates/spark/executor_kill_exp.json +++ b/chaostoolkit_nimble/resources/exp_templates/spark/executor_kill_exp.json @@ -38,5 +38,7 @@ "num_of_exec": "{{num_of_exec_to_kill}}" } } - }] + }], + "rollbacks": [ + ] } \ No newline at end of file From 4232c5c6aa66a59321700f0456cc5b13d21fbcc3 Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Tue, 20 Aug 2019 14:48:38 +0530 Subject: [PATCH 17/40] Increased spark query timeout to from 90s to 180s and added spark driver + exec kill combo --- .../spark/driver_and_executor_kill_exp.json | 64 +++++++++++++ .../exp_templates/spark/driver_kill_exp.json | 88 +++++++++--------- .../spark/executor_kill_exp.json | 90 ++++++++++--------- .../tests/sample/test_jio_spark_job.py | 1 + 4 files changed, 162 insertions(+), 81 deletions(-) create mode 100644 chaostoolkit_nimble/resources/exp_templates/spark/driver_and_executor_kill_exp.json 
diff --git a/chaostoolkit_nimble/resources/exp_templates/spark/driver_and_executor_kill_exp.json b/chaostoolkit_nimble/resources/exp_templates/spark/driver_and_executor_kill_exp.json new file mode 100644 index 0000000..79e7b37 --- /dev/null +++ b/chaostoolkit_nimble/resources/exp_templates/spark/driver_and_executor_kill_exp.json @@ -0,0 +1,64 @@ +{ + "version": "1.0.0", + "title": "Experiment with killing the spark driver and n number of executors for the spark job {{job_name}}.", + "description": "The spark driver and executor process should get auto-respawned after being killed.", + "tags": [ + "spark" + ], + "controls": [ + { + "name": "spark-related-controls", + "provider": { + "type": "python", + "module": "chaostoolkit_nimble.controllers.spark.control" + } + } + ], + "steady-state-hypothesis": { + "title": "Job {{job_name}} is up and running on yarn.", + "probes": [ + { + "type": "probe", + "name": "Check job {{job_name}} is running on yarn ", + "tolerance": true, + "provider": { + "module": "chaostoolkit_nimble.core.utils.yarn_apps_ha_utils", + "type": "python", + "func": "is_job_running_on_yarn", + "arguments": { + "job_name": "{{job_name}}" + } + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Kill spark driver for the job {{job_name}} ", + "provider": { + "module": "chaostoolkit_nimble.core.utils.spark_apps_ha_utils", + "type": "python", + "func": "kill_driver", + "arguments": { + "job_name": "{{job_name}}" + } + } + }, + { + "type": "action", + "name": "Kill spark executors for job {{job_name}} ", + "provider": { + "module": "chaostoolkit_nimble.core.utils.spark_apps_ha_utils", + "type": "python", + "func": "kill_active_executors", + "arguments": { + "job_name": "{{job_name}}", + "num_of_exec": "{{num_of_exec_to_kill}}" + } + } + } + ], + "rollbacks": [ + ] +} \ No newline at end of file diff --git a/chaostoolkit_nimble/resources/exp_templates/spark/driver_kill_exp.json 
b/chaostoolkit_nimble/resources/exp_templates/spark/driver_kill_exp.json index d514a80..ed91767 100644 --- a/chaostoolkit_nimble/resources/exp_templates/spark/driver_kill_exp.json +++ b/chaostoolkit_nimble/resources/exp_templates/spark/driver_kill_exp.json @@ -1,43 +1,51 @@ { - "version": "1.0.0", - "title": "Experiment with killing the spark driver for the spark job.", - "description": "The spark driver should get auto-respawned after being killed.", - "tags": ["spark"], - "controls": [{ - "name": "spark-related-controls", - "provider": { - "type": "python", - "module": "chaostoolkit_nimble.controllers.spark.control" - } - }], - "steady-state-hypothesis": { - "title": "Job {{job_name}} is up and running on yarn", - "probes": [{ - "type": "probe", - "name": "Check-job-{{job_name}}-running-on-yarn", - "tolerance": true, - "provider": { - "module": "chaostoolkit_nimble.core.utils.yarn_apps_ha_utils", - "type": "python", - "func": "is_job_running_on_yarn", - "arguments": { - "job_name": "{{job_name}}" - } - } - }] - }, - "method": [{ - "type": "action", - "name": "Kill-spark-driver-for-job-{{job_name}}", - "provider": { - "module": "chaostoolkit_nimble.core.utils.spark_apps_ha_utils", - "type": "python", - "func": "kill_driver", - "arguments": { - "job_name": "{{job_name}}" - } - } - }], - "rollbacks": [ + "version": "1.0.0", + "title": "Experiment with killing the spark driver for the spark job {{job_name}}.", + "description": "The spark driver should get auto-respawned after being killed.", + "tags": [ + "spark" + ], + "controls": [ + { + "name": "spark-related-controls", + "provider": { + "type": "python", + "module": "chaostoolkit_nimble.controllers.spark.control" + } + } + ], + "steady-state-hypothesis": { + "title": "Job {{job_name}} is up and running on yarn.", + "probes": [ + { + "type": "probe", + "name": "Check job {{job_name}} running on yarn. 
", + "tolerance": true, + "provider": { + "module": "chaostoolkit_nimble.core.utils.yarn_apps_ha_utils", + "type": "python", + "func": "is_job_running_on_yarn", + "arguments": { + "job_name": "{{job_name}}" + } + } + } ] + }, + "method": [ + { + "type": "action", + "name": "Kill spark driver for job {{job_name}}. ", + "provider": { + "module": "chaostoolkit_nimble.core.utils.spark_apps_ha_utils", + "type": "python", + "func": "kill_driver", + "arguments": { + "job_name": "{{job_name}}" + } + } + } + ], + "rollbacks": [ + ] } \ No newline at end of file diff --git a/chaostoolkit_nimble/resources/exp_templates/spark/executor_kill_exp.json b/chaostoolkit_nimble/resources/exp_templates/spark/executor_kill_exp.json index 008e39e..e8c18ff 100644 --- a/chaostoolkit_nimble/resources/exp_templates/spark/executor_kill_exp.json +++ b/chaostoolkit_nimble/resources/exp_templates/spark/executor_kill_exp.json @@ -1,44 +1,52 @@ { - "version": "1.0.0", - "title": "Experiment with killing n number of spark executors for a spark job.", - "description": "The spark executor process/es should get auto-respawned after being killed.", - "tags": ["spark"], - "controls": [{ - "name": "spark-related-controls", - "provider": { - "type": "python", - "module": "chaostoolkit_nimble.controllers.spark.control" - } - }], - "steady-state-hypothesis": { - "title": "Job {{job_name}} is up and running on yarn", - "probes": [{ - "type": "probe", - "name": "Check-job-{{job_name}}-running-on-yarn", - "tolerance": true, - "provider": { - "module": "chaostoolkit_nimble.core.utils.yarn_apps_ha_utils", - "type": "python", - "func": "is_job_running_on_yarn", - "arguments": { - "job_name": "{{job_name}}" - } - } - }] - }, - "method": [{ - "type": "action", - "name": "Kill-active-spark-executors-for-job-{{job_name}}", - "provider": { - "module": "chaostoolkit_nimble.core.utils.spark_apps_ha_utils", - "type": "python", - "func": "kill_active_executors", - "arguments": { - "job_name": "{{job_name}}", - 
"num_of_exec": "{{num_of_exec_to_kill}}" - } - } - }], - "rollbacks": [ + "version": "1.0.0", + "title": "Experiment with killing n number of spark executors for a spark job {{job_name}}.", + "description": "The spark executor process/es should get auto-respawned after being killed.", + "tags": [ + "spark" + ], + "controls": [ + { + "name": "spark-related-controls", + "provider": { + "type": "python", + "module": "chaostoolkit_nimble.controllers.spark.control" + } + } + ], + "steady-state-hypothesis": { + "title": "Job {{job_name}} is up and running on yarn.", + "probes": [ + { + "type": "probe", + "name": "Check job {{job_name}} running on yarn. ", + "tolerance": true, + "provider": { + "module": "chaostoolkit_nimble.core.utils.yarn_apps_ha_utils", + "type": "python", + "func": "is_job_running_on_yarn", + "arguments": { + "job_name": "{{job_name}}" + } + } + } ] + }, + "method": [ + { + "type": "action", + "name": "Kill active spark executors for job {{job_name}}. ", + "provider": { + "module": "chaostoolkit_nimble.core.utils.spark_apps_ha_utils", + "type": "python", + "func": "kill_active_executors", + "arguments": { + "job_name": "{{job_name}}", + "num_of_exec": "{{num_of_exec_to_kill}}" + } + } + } + ], + "rollbacks": [ + ] } \ No newline at end of file diff --git a/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py b/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py index 5b5c681..4167127 100644 --- a/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py +++ b/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py @@ -40,6 +40,7 @@ def test_schedule_15min_job(self, media_plane_actions, clean_table, clean_job_st def test_perform_15min_spark_job_ha(self): exp_template_file = "spark/executor_kill_exp.json" # exp_template_file = "spark/driver_kill_exp.json" + # exp_template_file = "spark/driver_and_executor_kill_exp.json" context = {"job_name": self.job_alias, "num_of_exec_to_kill": "1", } From b840b21af67f9f9d9ed1d3430b5c938e1cbe1531 Mon Sep 17 00:00:00 
2001 From: "kritika.saxena" Date: Wed, 21 Aug 2019 15:26:32 +0530 Subject: [PATCH 18/40] Test functions naming change + made schedule_job a fixture now --- .../controllers/spark/control.py | 9 ++++++- .../core/utils/yarn_apps_ha_utils.py | 4 ++-- .../tests/sample/test_jio_spark_job.py | 24 +++++++++++++++---- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/chaostoolkit_nimble/controllers/spark/control.py b/chaostoolkit_nimble/controllers/spark/control.py index abf9aee..97cdda8 100644 --- a/chaostoolkit_nimble/controllers/spark/control.py +++ b/chaostoolkit_nimble/controllers/spark/control.py @@ -70,4 +70,11 @@ def after_experiment_control(context: Experiment, state: Journal, APPLICATION_ID, job_stats["app"]["elapsedTime"], date_utils.get_minutes_from_milliseconds(job_stats["app"]["elapsedTime"]))) except RetryError: - logger.info("Yarn job with application id %s is not in 'FINISHED' state. Please check." % APPLICATION_ID) \ No newline at end of file + try: + hadoop_rest_client_utils.is_yarn_job_finished(APPLICATION_ID) + job_stats = hadoop_rest_client_utils.get_yarn_job_details(APPLICATION_ID) + logger.info("Total execution time for yarn job with application id %s: %s ms (i.e %s minutes) " % ( + APPLICATION_ID, job_stats["app"]["elapsedTime"], + date_utils.get_minutes_from_milliseconds(job_stats["app"]["elapsedTime"]))) + except RetryError: + logger.info("Yarn job with application id %s is not in 'FINISHED' state. Please check." 
% APPLICATION_ID) diff --git a/chaostoolkit_nimble/core/utils/yarn_apps_ha_utils.py b/chaostoolkit_nimble/core/utils/yarn_apps_ha_utils.py index 605d5e1..14e0115 100644 --- a/chaostoolkit_nimble/core/utils/yarn_apps_ha_utils.py +++ b/chaostoolkit_nimble/core/utils/yarn_apps_ha_utils.py @@ -6,9 +6,9 @@ def is_job_running_on_yarn(job_name): hadoop_rest_client_utils = HadoopRestClientUtils() - logger.debug("Checking if job '%s' on yarn" % job_name) + logger.debug("Checking if job '%s' running on yarn" % job_name) try: return hadoop_rest_client_utils.is_yarn_job_running(job_name=job_name) except RetryError: - logger.info("Not able to fetch yarn job status.") + logger.info("Not able to fetch yarn job '%s' status." % job_name) return False diff --git a/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py b/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py index 4167127..66122c0 100644 --- a/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py +++ b/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py @@ -34,18 +34,32 @@ def clean_job_stdout_files(self, media_plane_actions): NodeManager.node_obj.execute_command_on_node(media_plane_actions.node_alias, ShellUtils.su(self.job_user, command)) - def test_schedule_15min_job(self, media_plane_actions, clean_table, clean_job_stdout_files): + @pytest.fixture + def schedule_job(self, media_plane_actions, clean_table, clean_job_stdout_files): assert media_plane_actions.schedule_15_min_job() - def test_perform_15min_spark_job_ha(self): + def test_chaos_on_executor_kill(self, schedule_job): exp_template_file = "spark/executor_kill_exp.json" - # exp_template_file = "spark/driver_kill_exp.json" - # exp_template_file = "spark/driver_and_executor_kill_exp.json" context = {"job_name": self.job_alias, "num_of_exec_to_kill": "1", } chaos_user_actions.run_experiment(exp_template_file=exp_template_file, context=context) # chaos_user_actions.run_experiment(exp_file=OPTIONS_DICT["experimentsPath"]) - def 
test_validation_on_15min_job_ha(self, user_actions, media_plane_actions): + def test_chaos_on_driver(self, schedule_job): + exp_template_file = "spark/driver_kill_exp.json" + context = {"job_name": self.job_alias, + } + chaos_user_actions.run_experiment(exp_template_file=exp_template_file, context=context) + # chaos_user_actions.run_experiment(exp_file=OPTIONS_DICT["experimentsPath"]) + + def test_chaos_on_driver_and_executor_kill(self, schedule_job): + exp_template_file = "spark/driver_and_executor_kill_exp.json" + context = {"job_name": self.job_alias, + "num_of_exec_to_kill": "1", + } + chaos_user_actions.run_experiment(exp_template_file=exp_template_file, context=context) + # chaos_user_actions.run_experiment(exp_file=OPTIONS_DICT["experimentsPath"]) + + def test_validation_post_chaos(self, user_actions, media_plane_actions): user_actions.validate(media_plane_actions.validate_media_plane, self.job_alias) From 8be84387e1a932f5956dc103641137ce11a47c66 Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Wed, 21 Aug 2019 16:20:44 +0530 Subject: [PATCH 19/40] Update Readme --- README.md | 281 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 281 insertions(+) diff --git a/README.md b/README.md index c168c79..d00eb59 100644 --- a/README.md +++ b/README.md @@ -153,3 +153,284 @@ into the master branch of the repository. Please, make sure you can abide by the rules of the DCO before submitting a PR. [dco]: https://github.com/probot/dco#how-it-works +##ChaosToolkit Overview +*The end goal of using chaostoolkit is to practice chaos engineering, and discover how your system reacts when certain anomalies are injected in it. +*By doing this in a controlled fashion, you may learn how to change the system accordingly and make it more resilient on multiple levels like application, network and platform. 
+ + +###The Various Sections of an Experiment +##### `Controls` +``` +*Here you declare the the control module, which is simply a set of functions that are called by the Chaos Toolkit when executing the experiment. +*The Controls are applied per experiment. +https://docs.chaostoolkit.org/reference/extending/create-control-extension/ +``` +##### `The steady state hypothesis` +``` +*The steady state hypothesis declares the various probes that will be applied as part of the hypothesis check. +*The hypothesis is played twice. The first time before we do anything else to ensure the system is indeed in a normal state, +The second time the hypothesis is applied is after the conditions were changed in the system, to validate it is still in a normal state. +*Hypothesis probes expect a tolerance property which tells the Chaos Toolkit how to validate a certain aspect of the state +``` + +##### `Method` +``` +*The method is the anomaly injection block which changes the conditions of our system/application. +* This section is executed only if Hypothesis (above) is successfully met, else this section would be skipped. +``` +##### `Rollbacks` +``` +*Finally, the rollback section (which is optional) tries to remediate to the changes we made on/off the system during the anomaly injection. +*This block will be executed always irrespective of the fact that Hypothesis was met or not in the first time. 
+``` + +#### `Sample Experiment json file` +``` +{ + "version": "1.0.0", + "title": "What is the impact of an expired certificate on our application chain?", + "description": "If a certificate expires, we should gracefully deal with the issue.", + "tags": ["tls"], + "controls": [{ + "name": "spark-related-controls", + "provider": { + "type": "python", + "module": "chaostoolkit_nimble.controllers.spark.control" + } + }], + "steady-state-hypothesis": { + "title": "Application responds", + "probes": [{ + "type": "probe", + "name": "the-astre-service-must-be-running", + "tolerance": true, + "provider": { + "type": "python", + "module": "os.path", + "func": "exists", + "arguments": { + "path": "astre.pid" + } + } + }, + { + "type": "probe", + "name": "the-sunset-service-must-be-running", + "tolerance": true, + "provider": { + "type": "python", + "module": "os.path", + "func": "exists", + "arguments": { + "path": "sunset.pid" + } + } + }, + { + "type": "probe", + "name": "we-can-request-sunset", + "tolerance": 200, + "provider": { + "type": "http", + "timeout": 3, + "verify_tls": false, + "url": "https://localhost:8443/city/Paris" + } + } + ] + }, + "method": [{ + "type": "action", + "name": "swap-to-expired-cert", + "provider": { + "type": "process", + "path": "cp", + "arguments": "expired-cert.pem cert.pem" + } + }, + { + "type": "probe", + "name": "read-tls-cert-expiry-date", + "provider": { + "type": "process", + "path": "openssl", + "arguments": "x509 -enddate -noout -in cert.pem" + } + }, + { + "type": "action", + "name": "restart-astre-service-to-pick-up-certificate", + "provider": { + "type": "process", + "path": "pkill", + "arguments": "--echo -HUP -F astre.pid" + } + }, + { + "type": "action", + "name": "restart-sunset-service-to-pick-up-certificate", + "provider": { + "type": "process", + "path": "pkill", + "arguments": "--echo -HUP -F sunset.pid" + }, + "pauses": { + "after": 1 + } + } + ], + "rollbacks": [{ + "type": "action", + "name": "swap-to-vald-cert", 
+ "provider": { + "type": "process", + "path": "cp", + "arguments": "valid-cert.pem cert.pem" + } + }, + { + "ref": "restart-astre-service-to-pick-up-certificate" + }, + { + "ref": "restart-sunset-service-to-pick-up-certificate" + } + ] +} +``` + + +## Jio Use Cases Solved +Job Name : Media Plane +Job frequency : 15min +Number of job instances: 1 + +Assumption : Job is running already + +### Use case 1: Kill n number of spark executors for a spark job running on yarn and validate data for that job instance. +``` +Chaos Experiment Template path - chaostoolkit_nimble/resources/exp_templates/spark/executor_kill_exp.json + +------Before experiment control: +Read the user given testbed and initialize nimble `node_obj` object. + +------Hypothesis section: +Check job is running on yarn + +------Method section (Anomaly injection): +Kill spark job any active executors for the last spark driver attempt. + +------After experiment control: +Wait for the job to complete on yarn and then fetch the job total execution time from yarn. (Time fetched: 1.33 minutes) + + +User inputs required: +Testbed config yaml +Validation config yaml +Chaos Experiment Template path: +Num of executors to kill. Default is 1. + +Pytest command: +python -m pytest -k "test_chaos_on_executor_kill or test_validation_post_chaos" --testbed=chaostoolkit_nimble/resources/testbeds/open_nebula_135_35.yml --componentAttributesConfig=chaostoolkit_nimble/resources/components/component_attributes_kerberos.yml --validationConfig=chaostoolkit_nimble/resources/validation/sample_validation_config.yml chaostoolkit_nimble/tests/sample/test_jio_spark_job.py +``` + +### Use case 2: Kill the spark driver for a spark job running on yarn and validate data for that job instance. +``` +Chaos Experiment Template - chaostoolkit_nimble/resources/exp_templates/spark/driver_kill_exp.json + +------Before experiment control: +Read the user given testbed and initialize nimble `node_obj` object. 
+ +------Hypothesis section: +Check job is running on yarn + +------Method section (Anomaly injection): +Kill the spark driver for this spark job. + +------After experiment control: +Wait for the job to complete on yarn and then fetch the job total execution time from yarn. (Time fetched: 1.74 minutes) + + +User inputs required: +Testbed config yaml +Validation config yaml +Chaos Experiment Template path: + +Pytest command: +python -m pytest -k "not(test_chaos_on_executor_kill or test_chaos_on_driver_and_executor_kill)" --testbed=chaostoolkit_nimble/resources/testbeds/open_nebula_135_35.yml --componentAttributesConfig=chaostoolkit_nimble/resources/components/component_attributes_kerberos.yml --validationConfig=chaostoolkit_nimble/resources/validation/sample_validation_config.yml chaostoolkit_nimble/tests/sample/test_jio_spark_job.py + +``` + +### Use case 3: Kill the driver and n number of executors for a spark job running on yarn and validate data for that job instance. +``` +Chaos Experiment Template used : chaostoolkit_nimble/resources/exp_templates/spark/driver_and_executor_kill_exp.json + +------Before experiment control: +Read the user given testbed and initialize nimble `node_obj` object. + +------Hypothesis section: +Check job is running on yarn + +------Method section (Anomaly injection): +Kill the spark driver for this spark job and then kill any active executors for the new spark attempt. + +------After experiment control: +Wait for the job to complete on yarn and then fetch the job total execution time from yarn. (Time fetched: 1.76 minutes) + + +User inputs required: +Testbed config yaml +Validation config yaml +Chaos Experiment Template path: +Num of executors to kill. Default is 1. 
+ +Pytest command: +python -m pytest -k "not(test_chaos_on_executor_kill or test_chaos_on_driver_kill)" --testbed=chaostoolkit_nimble/resources/testbeds/open_nebula_135_35.yml --componentAttributesConfig=chaostoolkit_nimble/resources/components/component_attributes_kerberos.yml --validationConfig=chaostoolkit_nimble/resources/validation/sample_validation_config.yml chaostoolkit_nimble/tests/sample/test_jio_spark_job.py + +``` + +## Setting up chaostoolkit-nimble on local system + +Assumptions : +Python 3 is already installed on the system +Test automation code is already checked out on the system + + + +##### Checkout 'chaos_eng_automation' repo code (i.e chaostoolkit-nimble) + +``` +1) mkdir chaos_automation ; cd chaos_automation +2) git clone https://github.com/kritika-saxena-guavus/chaos_eng_automation.git +3) cd chaos_eng_automation ; git checkout AUT-563 +``` + +##### Checkout 'st-automation' repo code (i.e nimble migrated on python 3) +``` +4) cd ../ ; git clone https://github.com/Guavus/st-automation.git +5) cd st-automation ; git checkout AUT-439-my-copy +``` + + +##### Add dependencies for projects 'chaos_eng_automation' and 'st-automation' in a virtual env +``` +1) cd ../ ; mkdir chaos_virenv ; cd chaos_virenv +2) virtualenv --python=python3 venv +3) source venv/bin/activate +4) pip install -r st-automation/requirements.txt +5) pip install -r chaos_eng_automation/requirements.txt +``` + +##### Add this virtual env in pycharm +``` +Pycharm --> Preferences --> Project interpreter --> settings --> show all --> add the chaos_virenv +``` + +##### Add the project 'chaos_eng_automation' and 'st-automation' as your main automation project dependencies +``` +1) Open the projects 'chaos_eng_automation' and 'st-automation' in the same pycharm window as that of your current project. 
+2) Pycharm --> Preferences --> Project dependencies --> Check chaos_eng_automation and 'st-automation' +3) Pycharm --> Preferences --> Project interpreter --> select the virtual env 'chaos_virenv' on both these projects +``` + + + From 85818036a493c8a58083dc3b24679faaeb6def33 Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Wed, 21 Aug 2019 16:22:29 +0530 Subject: [PATCH 20/40] Update Readme --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d00eb59..fec137a 100644 --- a/README.md +++ b/README.md @@ -301,10 +301,12 @@ The second time the hypothesis is applied is after the conditions were changed i ## Jio Use Cases Solved Job Name : Media Plane + Job frequency : 15min + Number of job instances: 1 -Assumption : Job is running already +Assumption : Job is already running on the cluster. ### Use case 1: Kill n number of spark executors for a spark job running on yarn and validate data for that job instance. ``` From 1d9f4c06047eb66109bd0f6cf31eb0fd508a5438 Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Wed, 21 Aug 2019 16:28:20 +0530 Subject: [PATCH 21/40] Readme formatting fixes. --- README.md | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index fec137a..5b60a13 100644 --- a/README.md +++ b/README.md @@ -153,34 +153,41 @@ into the master branch of the repository. Please, make sure you can abide by the rules of the DCO before submitting a PR. [dco]: https://github.com/probot/dco#how-it-works -##ChaosToolkit Overview +## ChaosToolkit Overview + *The end goal of using chaostoolkit is to practice chaos engineering, and discover how your system reacts when certain anomalies are injected in it. + *By doing this in a controlled fashion, you may learn how to change the system accordingly and make it more resilient on multiple levels like application, network and platform. 
-###The Various Sections of an Experiment +### The Various Sections of an Experiment ##### `Controls` ``` *Here you declare the the control module, which is simply a set of functions that are called by the Chaos Toolkit when executing the experiment. + *The Controls are applied per experiment. https://docs.chaostoolkit.org/reference/extending/create-control-extension/ ``` ##### `The steady state hypothesis` ``` *The steady state hypothesis declares the various probes that will be applied as part of the hypothesis check. + *The hypothesis is played twice. The first time before we do anything else to ensure the system is indeed in a normal state, The second time the hypothesis is applied is after the conditions were changed in the system, to validate it is still in a normal state. + *Hypothesis probes expect a tolerance property which tells the Chaos Toolkit how to validate a certain aspect of the state ``` ##### `Method` ``` *The method is the anomaly injection block which changes the conditions of our system/application. + * This section is executed only if Hypothesis (above) is successfully met, else this section would be skipped. ``` ##### `Rollbacks` ``` *Finally, the rollback section (which is optional) tries to remediate to the changes we made on/off the system during the anomaly injection. + *This block will be executed always irrespective of the fact that Hypothesis was met or not in the first time. ``` @@ -326,10 +333,10 @@ Wait for the job to complete on yarn and then fetch the job total execution time User inputs required: -Testbed config yaml -Validation config yaml -Chaos Experiment Template path: -Num of executors to kill. Default is 1. +* Testbed config yaml +* Validation config yaml +* Chaos Experiment Template path: +* Num of executors to kill. Default is 1. 
Pytest command: python -m pytest -k "test_chaos_on_executor_kill or test_validation_post_chaos" --testbed=chaostoolkit_nimble/resources/testbeds/open_nebula_135_35.yml --componentAttributesConfig=chaostoolkit_nimble/resources/components/component_attributes_kerberos.yml --validationConfig=chaostoolkit_nimble/resources/validation/sample_validation_config.yml chaostoolkit_nimble/tests/sample/test_jio_spark_job.py @@ -353,9 +360,9 @@ Wait for the job to complete on yarn and then fetch the job total execution time User inputs required: -Testbed config yaml -Validation config yaml -Chaos Experiment Template path: +* Testbed config yaml +* Validation config yaml +* Chaos Experiment Template path: Pytest command: python -m pytest -k "not(test_chaos_on_executor_kill or test_chaos_on_driver_and_executor_kill)" --testbed=chaostoolkit_nimble/resources/testbeds/open_nebula_135_35.yml --componentAttributesConfig=chaostoolkit_nimble/resources/components/component_attributes_kerberos.yml --validationConfig=chaostoolkit_nimble/resources/validation/sample_validation_config.yml chaostoolkit_nimble/tests/sample/test_jio_spark_job.py @@ -380,10 +387,10 @@ Wait for the job to complete on yarn and then fetch the job total execution time User inputs required: -Testbed config yaml -Validation config yaml -Chaos Experiment Template path: -Num of executors to kill. Default is 1. +* Testbed config yaml +* Validation config yaml +* Chaos Experiment Template path: +* Num of executors to kill. Default is 1. 
Pytest command: python -m pytest -k "not(test_chaos_on_executor_kill or test_chaos_on_driver_kill)" --testbed=chaostoolkit_nimble/resources/testbeds/open_nebula_135_35.yml --componentAttributesConfig=chaostoolkit_nimble/resources/components/component_attributes_kerberos.yml --validationConfig=chaostoolkit_nimble/resources/validation/sample_validation_config.yml chaostoolkit_nimble/tests/sample/test_jio_spark_job.py @@ -393,9 +400,9 @@ python -m pytest -k "not(test_chaos_on_executor_kill or test_chaos_on_driver_kil ## Setting up chaostoolkit-nimble on local system Assumptions : -Python 3 is already installed on the system -Test automation code is already checked out on the system +Python 3 is already installed on the system. +Automation code from your own solution repo is already checked out on the system. ##### Checkout 'chaos_eng_automation' repo code (i.e chaostoolkit-nimble) @@ -412,7 +419,6 @@ Test automation code is already checked out on the system 5) cd st-automation ; git checkout AUT-439-my-copy ``` - ##### Add dependencies for projects 'chaos_eng_automation' and 'st-automation' in a virtual env ``` 1) cd ../ ; mkdir chaos_virenv ; cd chaos_virenv From cff9bdb19142567b8d11470aaf11b8153bf2bb22 Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Wed, 21 Aug 2019 16:30:14 +0530 Subject: [PATCH 22/40] Readme formatting fixes. --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 5b60a13..6db6cf0 100644 --- a/README.md +++ b/README.md @@ -153,6 +153,7 @@ into the master branch of the repository. Please, make sure you can abide by the rules of the DCO before submitting a PR. [dco]: https://github.com/probot/dco#how-it-works + ## ChaosToolkit Overview *The end goal of using chaostoolkit is to practice chaos engineering, and discover how your system reacts when certain anomalies are injected in it. 
From 654c09053d9f41977e589aa42e7e3fe4d81a3d35 Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Wed, 21 Aug 2019 16:58:31 +0530 Subject: [PATCH 23/40] kubernetes documentation order changed --- README.md | 315 +++++++++++++++++++++++++++--------------------------- 1 file changed, 157 insertions(+), 158 deletions(-) diff --git a/README.md b/README.md index 6db6cf0..eae3f1b 100644 --- a/README.md +++ b/README.md @@ -1,159 +1,3 @@ -# Chaos Toolkit Kubernetes Support - -[![Build Status](https://travis-ci.org/chaostoolkit/chaostoolkit-kubernetes.svg?branch=master)](https://travis-ci.org/chaostoolkit/chaostoolkit-kubernetes) -[![codecov](https://codecov.io/gh/chaostoolkit/chaostoolkit-kubernetes/branch/master/graph/badge.svg)](https://codecov.io/gh/chaostoolkit/chaostoolkit-kubernetes) -[![Python versions](https://img.shields.io/pypi/pyversions/chaostoolkit-kubernetes.svg)](https://www.python.org/) -[![Downloads](https://pepy.tech/badge/chaostoolkit-kubernetes)](https://pepy.tech/project/chaostoolkit-kubernetes) - -This project contains activities, such as probes and actions, you can call from -your experiment through the Chaos Toolkit. - -## Install - -To be used from your experiment, this package must be installed in the Python -environment where [chaostoolkit][] already lives. 
- -[chaostoolkit]: https://github.com/chaostoolkit/chaostoolkit - -``` -$ pip install chaostoolkit-kubernetes -``` - -## Usage - -To use the probes and actions from this package, add the following to your -experiment file: - -```json -{ - "name": "all-our-microservices-should-be-healthy", - "type": "probe", - "tolerance": "true", - "provider": { - "type": "python", - "module": "chaosk8s.probes", - "func": "microservice_available_and_healthy", - "arguments": { - "name": "myapp", - "ns": "myns" - } - } -}, -{ - "type": "action", - "name": "terminate-db-pod", - "provider": { - "type": "python", - "module": "chaosk8s.pod.actions", - "func": "terminate_pods", - "arguments": { - "label_selector": "app=my-app", - "name_pattern": "my-app-[0-9]$", - "rand": true, - "ns": "default" - } - }, - "pauses": { - "after": 5 - } -} -``` - -That's it! Notice how the action gives you the way to kill one pod randomly. - -Please explore the code to see existing probes and actions. - -### Discovery - -You may use the Chaos Toolkit to discover the capabilities of this extension: - -``` -$ chaos discover chaostoolkit-kubernetes --no-install -``` - -## Configuration - -This extension to the Chaos Toolkit can use the Kubernetes configuration -found at the usual place in your HOME directory under `~/.kube/`, or, when -run from a Pod in a Kubernetes cluster, it will use the local service account. -In that case, make sure to set the `CHAOSTOOLKIT_IN_POD` environment variable -to `"true"`. 
- -You can also pass the credentials via secrets as follows: - -```json -{ - "secrets": { - "kubernetes": { - "KUBERNETES_HOST": "http://somehost", - "KUBERNETES_API_KEY": { - "type": "env", - "key": "SOME_ENV_VAR" - } - } - } -} -``` - -Then in your probe or action: - -```json -{ - "name": "all-our-microservices-should-be-healthy", - "provider": { - "type": "python", - "module": "chaosk8s.probes", - "func": "microservice_available_and_healthy", - "secrets": ["kubernetes"], - "arguments": { - "name": "myapp", - "ns": "myns" - } - } -} -``` - -You may specify the Kubernetes context you want to use as follows: - -```json -{ - "secrets": { - "kubernetes": { - "KUBERNETES_CONTEXT": "minikube" - } - } -} -``` - -Or via the environment: - -``` -$ export KUBERNETES_CONTEXT=minikube -``` - -In the same spirit, you can specify where to find your Kubernetes configuration -with: - -``` -$ export KUBECONFIG=some/path/config -``` - -## Contribute - -If you wish to contribute more functions to this package, you are more than -welcome to do so. Please fork this project, make your changes following the -usual [PEP 8][pep8] code style, add appropriate tests and submit a PR for -review. - -[pep8]: https://pycodestyle.readthedocs.io/en/latest/ - -The Chaos Toolkit projects require all contributors must sign a -[Developer Certificate of Origin][dco] on each commit they would like to merge -into the master branch of the repository. Please, make sure you can abide by -the rules of the DCO before submitting a PR. - -[dco]: https://github.com/probot/dco#how-it-works - ## ChaosToolkit Overview *The end goal of using chaostoolkit is to practice chaos engineering, and discover how your system reacts when certain anomalies are injected in it. @@ -340,7 +184,7 @@ User inputs required: * Num of executors to kill. Default is 1. 
Pytest command: -python -m pytest -k "test_chaos_on_executor_kill or test_validation_post_chaos" --testbed=chaostoolkit_nimble/resources/testbeds/open_nebula_135_35.yml --componentAttributesConfig=chaostoolkit_nimble/resources/components/component_attributes_kerberos.yml --validationConfig=chaostoolkit_nimble/resources/validation/sample_validation_config.yml chaostoolkit_nimble/tests/sample/test_jio_spark_job.py +python -m pytest -k "test_chaos_on_executor_kill or test_data_validation_post_chaos" --testbed=chaostoolkit_nimble/resources/testbeds/open_nebula_135_35.yml --componentAttributesConfig=chaostoolkit_nimble/resources/components/component_attributes_kerberos.yml --validationConfig=chaostoolkit_nimble/resources/validation/sample_validation_config.yml chaostoolkit_nimble/tests/sample/test_jio_spark_job.py ``` ### Use case 2: Kill the spark driver for a spark job running on yarn and validate data for that job instance. @@ -394,7 +238,7 @@ User inputs required: * Num of executors to kill. Default is 1. 
Pytest command: -python -m pytest -k "not(test_chaos_on_executor_kill or test_chaos_on_driver_kill)" --testbed=chaostoolkit_nimble/resources/testbeds/open_nebula_135_35.yml --componentAttributesConfig=chaostoolkit_nimble/resources/components/component_attributes_kerberos.yml --validationConfig=chaostoolkit_nimble/resources/validation/sample_validation_config.yml chaostoolkit_nimble/tests/sample/test_jio_spark_job.py +python -m pytest -k "test_chaos_on_driver_and_executor_kill or test_data_validation_post_chaos" --testbed=chaostoolkit_nimble/resources/testbeds/open_nebula_135_35.yml --componentAttributesConfig=chaostoolkit_nimble/resources/components/component_attributes_kerberos.yml --validationConfig=chaostoolkit_nimble/resources/validation/sample_validation_config.yml chaostoolkit_nimble/tests/sample/test_jio_spark_job.py ``` @@ -441,5 +285,160 @@ Pycharm --> Preferences --> Project interpreter --> settings --> show all --> ad 3) Pycharm --> Preferences --> Project interpreter --> select the virtual env 'chaos_virenv' on both these projects ``` +# Chaos Toolkit Kubernetes Support + +[![Build Status](https://travis-ci.org/chaostoolkit/chaostoolkit-kubernetes.svg?branch=master)](https://travis-ci.org/chaostoolkit/chaostoolkit-kubernetes) +[![codecov](https://codecov.io/gh/chaostoolkit/chaostoolkit-kubernetes/branch/master/graph/badge.svg)](https://codecov.io/gh/chaostoolkit/chaostoolkit-kubernetes) +[![Python versions](https://img.shields.io/pypi/pyversions/chaostoolkit-kubernetes.svg)](https://www.python.org/) +[![Downloads](https://pepy.tech/badge/chaostoolkit-kubernetes)](https://pepy.tech/project/chaostoolkit-kubernetes) + +This project contains activities, such as probes and actions, you can call from +your experiment through the Chaos Toolkit. + +## Install + +To be used from your experiment, this package must be installed in the Python +environment where [chaostoolkit][] already lives. 
+ +[chaostoolkit]: https://github.com/chaostoolkit/chaostoolkit + +``` +$ pip install chaostoolkit-kubernetes +``` + +## Usage + +To use the probes and actions from this package, add the following to your +experiment file: + +```json +{ + "name": "all-our-microservices-should-be-healthy", + "type": "probe", + "tolerance": "true", + "provider": { + "type": "python", + "module": "chaosk8s.probes", + "func": "microservice_available_and_healthy", + "arguments": { + "name": "myapp", + "ns": "myns" + } + } +}, +{ + "type": "action", + "name": "terminate-db-pod", + "provider": { + "type": "python", + "module": "chaosk8s.pod.actions", + "func": "terminate_pods", + "arguments": { + "label_selector": "app=my-app", + "name_pattern": "my-app-[0-9]$", + "rand": true, + "ns": "default" + } + }, + "pauses": { + "after": 5 + } +} +``` + +That's it! Notice how the action gives you the way to kill one pod randomly. + +Please explore the code to see existing probes and actions. + +### Discovery + +You may use the Chaos Toolkit to discover the capabilities of this extension: + +``` +$ chaos discover chaostoolkit-kubernetes --no-install +``` + +## Configuration + +This extension to the Chaos Toolkit can use the Kubernetes configuration +found at the usual place in your HOME directory under `~/.kube/`, or, when +run from a Pod in a Kubernetes cluster, it will use the local service account. +In that case, make sure to set the `CHAOSTOOLKIT_IN_POD` environment variable +to `"true"`. 
+ +You can also pass the credentials via secrets as follows: + +```json +{ + "secrets": { + "kubernetes": { + "KUBERNETES_HOST": "http://somehost", + "KUBERNETES_API_KEY": { + "type": "env", + "key": "SOME_ENV_VAR" + } + } + } +} +``` + +Then in your probe or action: + +```json +{ + "name": "all-our-microservices-should-be-healthy", + "provider": { + "type": "python", + "module": "chaosk8s.probes", + "func": "microservice_available_and_healthy", + "secrets": ["kubernetes"], + "arguments": { + "name": "myapp", + "ns": "myns" + } + } +} +``` + +You may specify the Kubernetes context you want to use as follows: + +```json +{ + "secrets": { + "kubernetes": { + "KUBERNETES_CONTEXT": "minikube" + } + } +} +``` + +Or via the environment: + +``` +$ export KUBERNETES_CONTEXT=minikube +``` + +In the same spirit, you can specify where to find your Kubernetes configuration +with: + +``` +$ export KUBECONFIG=some/path/config +``` + +## Contribute + +If you wish to contribute more functions to this package, you are more than +welcome to do so. Please fork this project, make your changes following the +usual [PEP 8][pep8] code style, add appropriate tests and submit a PR for +review. + +[pep8]: https://pycodestyle.readthedocs.io/en/latest/ + +The Chaos Toolkit projects require all contributors must sign a +[Developer Certificate of Origin][dco] on each commit they would like to merge +into the master branch of the repository. Please, make sure you can abide by +the rules of the DCO before submitting a PR. + +[dco]: https://github.com/probot/dco#how-it-works From 5fc118cb4b2ba56ac6dc7e9a8b7f29640b126cdf Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Wed, 21 Aug 2019 17:04:14 +0530 Subject: [PATCH 24/40] Spellings corrected. 
--- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index eae3f1b..7b9dd12 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ The second time the hypothesis is applied is after the conditions were changed i *This block will be executed always irrespective of the fact that Hypothesis was met or not in the first time. ``` -#### `Sample Experimnet json file` +#### `Sample Experiment json file` ``` { "version": "1.0.0", From aee6f0b9ec5326bce54abcd5fc758b4820b8c515 Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Wed, 21 Aug 2019 17:16:06 +0530 Subject: [PATCH 25/40] test_validation func name changed --- chaostoolkit_nimble/tests/sample/test_jio_spark_job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py b/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py index 66122c0..58bc775 100644 --- a/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py +++ b/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py @@ -61,5 +61,5 @@ def test_chaos_on_driver_and_executor_kill(self, schedule_job): chaos_user_actions.run_experiment(exp_template_file=exp_template_file, context=context) # chaos_user_actions.run_experiment(exp_file=OPTIONS_DICT["experimentsPath"]) - def test_validation_post_chaos(self, user_actions, media_plane_actions): + def test_data_validation_post_chaos(self, user_actions, media_plane_actions): user_actions.validate(media_plane_actions.validate_media_plane, self.job_alias) From e7a545265a3ebac9b3e2cf607f790af8b91fa6da Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Fri, 23 Aug 2019 12:19:25 +0530 Subject: [PATCH 26/40] Templating with default value + fetching templates from fileserver + release changes --- CHANGELOG.md | 308 +----------------- .../actions/base/flows/chaos_user_actions.py | 11 +- .../spark/driver_and_executor_kill_exp.json | 2 +- .../spark/executor_kill_exp.json | 2 +- 
.../tests/sample/test_jio_spark_job.py | 20 +- requirements.txt | 2 +- setup.py | 105 +----- 7 files changed, 42 insertions(+), 408 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d747104..3db9e90 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,302 +1,10 @@ # Changelog -## [Unreleased][] - -[Unreleased]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/compare/0.20.0...HEAD - -- Try to read proxy from environment variable "HTTP_PROXY" and set it - appropriately if it exists -- Add the `deployment_is_fully_available` probe to wait for a deployment to be fully available [#38][38] -- Fix calls to `delete_namespaced_*` so that the `body` argument is passed - a named argument [#42][42]. A follow up to [#34][34] -- Fix calls to `delete_nodes` so that the `body` argument is passed - a named argument [#44][44]. - -[38]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/pull/38 -[42]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/issues/42 -[44]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/pull/44 - -## [0.20.0][] - 2018-03-25 - -[0.20.0]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/compare/0.19.1...0.20.0 - -### Added - -- Add a probe to check pods conditions [PR#31][31] - -[31]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/pull/31 - -### Changed - -- Fix call to `delete_namespaced_pod` so that the `body` argument is passed - a named argument [#34][34] - -[34]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/issues/34 - - -## [0.19.1][] - 2018-10-08 - -[0.19.1]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/compare/0.18.1...0.19.1 - -### Changed - -- As part of [#21][21], we realized that passing `None` to some parameters of the - Kubernetes client API was not the right move because, in that case, the client - turns that into a `"None"` string which is not what we want. So I had to - resort to many conditionals that make the code not as clean I'd want. Sigh! 
- -[21]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/issues/21 - -## [0.18.1][] - 2018-10-08 - -[0.18.1]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/compare/0.18.0...0.18.1 - -### Changed - -- Fix: use `Succeeded` instead of `Completed` to filter successful pods created by a cronjob in the `all_microservices_healthy` probe. - -## [0.18.0][] - 2018-10-08 - -[0.18.0]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/compare/0.17.0...0.18.0 - -### Added - -- [Codecov][codecov] integration -- Renamed `FailedActivity` to `ActivityFailed` as per [chaostoolkit 0.20.0][0.20.0]. See [PR#20][20] -- Add Ability to specify a maximum percentage of pods to be killed [PR#19][19] -- Consider `Completed` pods as healthy in the `all_microservices_healthy` probe. See [PR#23][23] -- Support a new `grace_period_seconds` parameter in the `terminate_pods` action. See [PR#24][24] - -[codecov]: https://codecov.io/gh/chaostoolkit/chaostoolkit-kubernetes -[0.20.0]: https://github.com/chaostoolkit/chaostoolkit-lib/blob/master/CHANGELOG.md#0200---2018-08-09 -[20]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/pull/20 -[19]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/pull/19 - -## [0.17.0][] - 2018-09-07 - -[0.17.0]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/compare/0.16.2...0.17.0 - -### Added - -- List work nodes - -## [0.16.2][] - 2018-05-14 - -[0.16.2]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/compare/0.16.1...0.16.2 - -### Added - -- Read version from source file without importing - -## [0.16.1][] - 2018-05-14 - -[0.16.1]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/compare/0.16.0...0.16.1 - -### Added - -- Added requirements-dev.txt to MANIFEST.in so it gets packaged and distributed - -## [0.16.0][] - 2018-04-24 - -[0.16.0]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/compare/0.15.0...0.16.0 - -### Added - -- Allow to pass the Kubernetes context to authenticate from by setting - the 
`"KUBERNETES_CONTEXT"` key in the environment or the secrets object - [#15][15] - -[15]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/issues/15 - -## [0.15.0][] - 2018-04-13 - -[0.15.0]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/compare/0.14.0...0.15.0 - -### Added - -- a probe to count the number of pods -- actions to delete and create nodes -- actions to cordon, uncordon and drain nodes -- canot locate credentials automatically when ran from within a Pod if - you set the `CHAOSTOOLKIT_IN_POD: "true"` environment variable in the Pod - spec - -## [0.14.0][] - 2018-04-05 - -[0.14.0]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/compare/0.13.0...0.14.0 - -### Added - -- allow to create a Kubernetes client from a Kubernetes cluster pod - -## [0.13.0][] - 2018-03-09 - -[0.13.0]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/compare/0.12.0...0.13.0 - -### Added - -- `chaosk8s.pod.probes.pods_in_phase` to probe that all pods matching a label - are in a given pod Phase -- `chaosk8s.pod.probes.pods_not_in_phase` to probe that all pods matching a - label are not in a given pod Phase - -## [0.12.0][] - 2018-02-12 - -[0.12.0]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/compare/0.11.0...0.12.0 - -### Changed - -- Moved the `chaosk8s.probes.read_microservice_logs` to - `chaosk8s.pod.probes.read_pod_logs` for clarity -- Make name optional for `chaosk8s.pod.probes.read_pod_logs` as it usually - more preferred to use a label for that probe -- Removed the system discovery as it wasn't used by chaostoolkit anyway - -## [0.11.0][] - 2018-01-28 - -[0.11.0]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/compare/0.10.0...0.11.0 - -### Added - -- Added a pod specific set of actions - -### Changed - -- Refactor dev/test dependencies so they are not deployed on install - -## [0.10.0][] - 2018-01-22 - -[0.10.0]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/compare/0.9.0...0.10.0 - -### Changed - -- activities 
now take a `label_selector` argument to let you adjust to your - conventions when selecting resources [#7][7] - -[7]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/issues/7 - -## [0.9.0][] - 2018-01-16 - -[0.9.0]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/compare/0.8.0...0.9.0 - -### Added - -- discovery mechanism - -## [0.8.0][] - 2017-12-29 - -[0.8.0]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/compare/0.7.0...0.8.0 - -### Added - -- `read_microservices_logs` probe to fetch pod's logs - -## [0.7.0][] - 2017-12-17 - -[0.7.0]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/compare/0.6.0...0.7.0 - -### Added - -- Deployment scaler action - -### Changed - -- Updated to chaostoolkit-lib 0.8.0 - -## [0.6.0][] - 2017-12-12 - -[0.6.0]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/compare/0.5.0...0.6.0 - -### Added - -- Logging at DEBUG level for investigation -- Probe `deployment_is_not_fully_available` to wait until a deployment is not - fully available (its desired state is different from its current state) - -### Changed - -- Selecting on the name Label rather than Service as it's more commonly used -- Updated chaostoolkit-lib to 0.7.0 for configuration support - -## [0.5.0][] - 2017-12-06 - -[0.5.0]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/compare/0.4.3...0.5.0 - -### Changed - -- Updated to match chaostoolkit-lib 0.6.0 API changes -- Probes now return `True` on success so they can be nicely used from the - steady state hypothesis checks - -## [0.4.3][] - 2017-11-23 - -[0.4.3]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/compare/0.4.2...0.4.3 - -### Changed - -- Removing unwanted parameter - -## [0.4.2][] - 2017-11-20 - -[0.4.2]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/compare/0.4.1...0.4.2 - -### Changed - -- Proper verify SSL reading of the environment key - -## [0.4.1][] - 2017-11-20 - -[0.4.1]: 
https://github.com/chaostoolkit/chaostoolkit-kubernetes/compare/0.4.0...0.4.1 - -### Added - -- Passing secrets down to client function - - -## [0.4.0][] - 2017-11-20 - -[0.4.0]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/compare/0.3.0...0.4.0 - -### Added - -- Can create a client from secrets - - -## [0.3.0][] - 2017-11-20 - -[0.3.0]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/compare/0.2.0...0.3.0 - -### Added - -- Can now authenticate to the Kubernetes API endpoint either via a token, - username/password or certificate/pkey. All of this via environment variable. - By default, still looks up for ~/kube/config if it exists - - -## [0.2.0][] - 2017-10-23 - -[0.2.0]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/compare/0.1.1...0.2.0 - -### Added - -- Remove check Kubernetes service by name - -### Changed - -- Do not build a universal wheel package (no Python 2 support in chaostoolkit) - -## [0.1.1][] - 2017-10-06 - -[0.1.1]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/compare/0.1.0...0.1.1 - -### Changed - -- Package up extra files when installed from source - -## [0.1.0][] - 2017-10-06 - -[0.1.0]: https://github.com/chaostoolkit/chaostoolkit-kubernetes/tree/0.1.0 - -### Added - -- Initial release +## [Unreleased] + +## [0.0.1]: 2019-08-23 +#### Added +- Support for checking +- Support for killing spark executor for a spark application. +- Support for killing spark driver for a spark application. +- Support for killing spark driver and executor for a spark application. 
\ No newline at end of file diff --git a/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py b/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py index 3b5d149..d719b34 100644 --- a/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py +++ b/chaostoolkit_nimble/actions/base/flows/chaos_user_actions.py @@ -5,6 +5,7 @@ import jinja2 from nimble.core import global_constants +from nimble.core.utils.file_server_utils import FileServerUtils from nimble.core.utils.shell_utils import ShellUtils _LOGGER = logging.getLogger(__name__) @@ -32,12 +33,16 @@ def run_experiment(exp_file=None, exp_template_file=None, context=None): def render_template(exp_template_file, context): - template_base_dir = "chaostoolkit_nimble/resources/exp_templates" + file_server_utils = FileServerUtils() + exp_file_name = exp_template_file.rsplit("/", 1)[1] + template_base_dir = "%s/tmp/exp_templates/" % global_constants.DEFAULT_LOCAL_ARTIFACTS_PATH + ShellUtils.execute_shell_command(ShellUtils.remove_and_create_directory(template_base_dir)) + file_server_utils.download(exp_template_file, path_to_download=template_base_dir) templateLoader = jinja2.FileSystemLoader(searchpath=template_base_dir) templateEnv = jinja2.Environment(loader=templateLoader) - template = templateEnv.get_template(exp_template_file) + template = templateEnv.get_template(exp_file_name) _LOGGER.info('Rendering from template: %s' % template.name) - template.stream(context).dump('%s/exp.json' % EXPERIMENTS_BASE_PATH) + template.stream(context).dump('%s/%s' % (EXPERIMENTS_BASE_PATH, exp_file_name)) def generate_html(journal_path): diff --git a/chaostoolkit_nimble/resources/exp_templates/spark/driver_and_executor_kill_exp.json b/chaostoolkit_nimble/resources/exp_templates/spark/driver_and_executor_kill_exp.json index 79e7b37..7d0c997 100644 --- a/chaostoolkit_nimble/resources/exp_templates/spark/driver_and_executor_kill_exp.json +++ 
b/chaostoolkit_nimble/resources/exp_templates/spark/driver_and_executor_kill_exp.json @@ -54,7 +54,7 @@ "func": "kill_active_executors", "arguments": { "job_name": "{{job_name}}", - "num_of_exec": "{{num_of_exec_to_kill}}" + "num_of_exec": "{{ num_of_exec_to_kill|default('1') }}" } } } diff --git a/chaostoolkit_nimble/resources/exp_templates/spark/executor_kill_exp.json b/chaostoolkit_nimble/resources/exp_templates/spark/executor_kill_exp.json index e8c18ff..6540c0c 100644 --- a/chaostoolkit_nimble/resources/exp_templates/spark/executor_kill_exp.json +++ b/chaostoolkit_nimble/resources/exp_templates/spark/executor_kill_exp.json @@ -42,7 +42,7 @@ "func": "kill_active_executors", "arguments": { "job_name": "{{job_name}}", - "num_of_exec": "{{num_of_exec_to_kill}}" + "num_of_exec": "{{ num_of_exec_to_kill|default('1') }}" } } } diff --git a/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py b/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py index 58bc775..a2574cd 100644 --- a/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py +++ b/chaostoolkit_nimble/tests/sample/test_jio_spark_job.py @@ -39,27 +39,19 @@ def schedule_job(self, media_plane_actions, clean_table, clean_job_stdout_files) assert media_plane_actions.schedule_15_min_job() def test_chaos_on_executor_kill(self, schedule_job): - exp_template_file = "spark/executor_kill_exp.json" - context = {"job_name": self.job_alias, - "num_of_exec_to_kill": "1", - } + exp_template_file = "automation/chaos/exp_templates/spark/executor_kill_exp.json" + context = {"job_name": self.job_alias} chaos_user_actions.run_experiment(exp_template_file=exp_template_file, context=context) - # chaos_user_actions.run_experiment(exp_file=OPTIONS_DICT["experimentsPath"]) def test_chaos_on_driver(self, schedule_job): - exp_template_file = "spark/driver_kill_exp.json" - context = {"job_name": self.job_alias, - } + exp_template_file = "automation/chaos/exp_templates/spark/driver_kill_exp.json" + context = {"job_name": 
self.job_alias} chaos_user_actions.run_experiment(exp_template_file=exp_template_file, context=context) - # chaos_user_actions.run_experiment(exp_file=OPTIONS_DICT["experimentsPath"]) def test_chaos_on_driver_and_executor_kill(self, schedule_job): - exp_template_file = "spark/driver_and_executor_kill_exp.json" - context = {"job_name": self.job_alias, - "num_of_exec_to_kill": "1", - } + exp_template_file = "automation/chaos/exp_templates/spark/driver_and_executor_kill_exp.json" + context = {"job_name": self.job_alias} chaos_user_actions.run_experiment(exp_template_file=exp_template_file, context=context) - # chaos_user_actions.run_experiment(exp_file=OPTIONS_DICT["experimentsPath"]) def test_data_validation_post_chaos(self, user_actions, media_plane_actions): user_actions.validate(media_plane_actions.validate_media_plane, self.job_alias) diff --git a/requirements.txt b/requirements.txt index f1d5362..80bbc0c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -nimble dateparser kubernetes logzero @@ -7,3 +6,4 @@ pyyaml pytest-html chaostoolkit chaostoolkit-reporting +twine diff --git a/setup.py b/setup.py index 031f31c..71af68d 100644 --- a/setup.py +++ b/setup.py @@ -1,88 +1,17 @@ -#!/usr/bin/env python -"""chaostoolkit builder and installer""" -import os -import sys -import io - -import setuptools - -def get_version_from_package() -> str: - """ - Read the package version from the source without importing it. 
- """ - path = os.path.join(os.path.dirname(__file__), "chaosk8s/__init__.py") - path = os.path.normpath(os.path.abspath(path)) - with open(path) as f: - for line in f: - if line.startswith("__version__"): - token, version = line.split(" = ", 1) - version = version.replace("'", "").strip() - return version - -name = 'chaostoolkit-kubernetes' -desc = 'Chaos Toolkit Kubernetes support' - -with io.open('README.md', encoding='utf-8') as strm: - long_desc = strm.read() - -classifiers = [ - 'Development Status :: 5 - Production/Stable', - 'Intended Audience :: Developers', - 'License :: Freely Distributable', - 'Operating System :: OS Independent', - 'License :: OSI Approved :: Apache Software License', - 'Programming Language :: Python', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: Implementation', - 'Programming Language :: Python :: Implementation :: CPython' -] -author = 'chaostoolkit Team' -author_email = 'contact@chaostoolkit.org' -url = 'http://chaostoolkit.org' -license = 'Apache License Version 2.0' -packages = [ - 'chaosk8s', - 'chaosk8s.node', - 'chaosk8s.pod' -] - -needs_pytest = set(['pytest', 'test']).intersection(sys.argv) -pytest_runner = ['pytest_runner'] if needs_pytest else [] -test_require = [] -with io.open('requirements-dev.txt') as f: - test_require = [l.strip() for l in f if not l.startswith('#')] - -install_require = [] -with io.open('requirements.txt') as f: - install_require = [l.strip() for l in f if not l.startswith('#')] - - -setup_params = dict( - name=name, - version=get_version_from_package(), - description=desc, - long_description=long_desc, - classifiers=classifiers, - author=author, - author_email=author_email, - url=url, - license=license, - packages=packages, - include_package_data=True, - install_requires=install_require, - tests_require=test_require, - 
setup_requires=pytest_runner, - python_requires='>=3.5.*' -) - - -def main(): - """Package installation entry point.""" - setuptools.setup(**setup_params) - - -if __name__ == '__main__': - main() +from setuptools import setup, find_packages + +if __name__ == "__main__": + with open("requirements.txt") as f: + requirements = list(filter(lambda x: not x.startswith("pytest"), f.read().splitlines())) + + setup( + name="chaostoolkit-nimble", + packages=find_packages(), + description="Guavus Chaos Test automation framework", + version="0.0.1", + install_requires=requirements, + url="https://github.com/kritika-saxena-guavus/chaos_eng_automation", + author="Core Automation Squad", + author_email="automation-squad@guavus.com", + include_package_data=True + ) From 1890426b1c941f468e77fb482d1b64bf7fb4a5a8 Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Fri, 23 Aug 2019 12:32:43 +0530 Subject: [PATCH 27/40] Removing space from requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 80bbc0c..7077efd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,4 @@ pyyaml pytest-html chaostoolkit chaostoolkit-reporting -twine +twine \ No newline at end of file From 6e4a386ca0440df0979e4258b4a0d5eaa172a63f Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Fri, 23 Aug 2019 13:52:22 +0530 Subject: [PATCH 28/40] adding nimble in requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 7077efd..6a508e2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +nimble dateparser kubernetes logzero From 6f2baf14c4984360f288fc949368840617544a55 Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Fri, 23 Aug 2019 14:51:33 +0530 Subject: [PATCH 29/40] Restricted nimble version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 
6a508e2..d08235f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -nimble +nimble>4.0.0 dateparser kubernetes logzero From 960cc8f7d60869481d3ae418ebd2d5aa189653f6 Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Fri, 23 Aug 2019 14:57:22 +0530 Subject: [PATCH 30/40] Restricted nimble version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d08235f..9cfbe33 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -nimble>4.0.0 +nimble>=4.0.0 dateparser kubernetes logzero From a302a84f9c06562c3c136d36a9672125dad12bfe Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Wed, 28 Aug 2019 15:36:17 +0530 Subject: [PATCH 31/40] Added exception handling on testbed and component attributes yaml found in chaos env. --- chaostoolkit_nimble/controllers/base/control.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/chaostoolkit_nimble/controllers/base/control.py b/chaostoolkit_nimble/controllers/base/control.py index 8aa3a8e..03acbca 100644 --- a/chaostoolkit_nimble/controllers/base/control.py +++ b/chaostoolkit_nimble/controllers/base/control.py @@ -1,8 +1,8 @@ from chaoslib.types import Configuration, \ Experiment, Secrets, Settings from logzero import logger -from nimble.core import global_constants +from nimble.core import global_constants from nimble.core.entity.node_manager import NodeManager from nimble.core.utils.shell_utils import ShellUtils @@ -24,5 +24,8 @@ def configure_control(configuration: Configuration = None, ShellUtils.find_files_in_directory(setup_files_base_path, file_name_regex="open_nebula_*")).stdout component_attributes_file = ShellUtils.execute_shell_command( ShellUtils.find_files_in_directory(setup_files_base_path, file_name_regex="component_*")).stdout - NodeManager.initialize(testbed_file, component_attributes_file) - logger.debug("NODE_OBJ FROM BASE CONTROLLER----------------: %s" % 
NodeManager.node_obj.vip) + if testbed_file and component_attributes_file: + NodeManager.initialize(testbed_file, component_attributes_file) + logger.debug("NODE_OBJ VIP FROM BASE CONTROLLER----------------: %s" % NodeManager.node_obj.vip) + else: + raise Exception("Either testbed or component attributes yaml file not found in chaos!") From 32bdd0cd69561c8892ecb12c1c2044d1ed2d8b77 Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Wed, 28 Aug 2019 15:58:42 +0530 Subject: [PATCH 32/40] Logging correction spark_apps_ha_utils --- chaostoolkit_nimble/core/utils/spark_apps_ha_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/chaostoolkit_nimble/core/utils/spark_apps_ha_utils.py b/chaostoolkit_nimble/core/utils/spark_apps_ha_utils.py index 3017010..8e08254 100644 --- a/chaostoolkit_nimble/core/utils/spark_apps_ha_utils.py +++ b/chaostoolkit_nimble/core/utils/spark_apps_ha_utils.py @@ -24,11 +24,11 @@ def kill_active_executors(job_name, num_of_exec=1): "Could not fetch yarn application id for job %s. 
Job not found in '%s' state" % ( job_name, ApplicationState.RUNNING.value)) try: - logger.info("Fetching spark active executors for application id : %s" % control.APPLICATION_ID) + logger.info("Fetching spark active executors for application id: %s" % control.APPLICATION_ID) executors = spark_client_utils.get_application_active_executors(control.APPLICATION_ID) except RetryError: raise ChaosActionFailedError( - "Could not fetch spark executors for the application id :" % control.APPLICATION_ID) + "Could not fetch spark executors for the application id: %s" % control.APPLICATION_ID) for i in range(len(executors)): if executors[i]["id"] == "driver": executors.pop(i) @@ -60,11 +60,11 @@ def kill_driver(job_name): raise ChaosActionFailedError( "Could not fetch yarn application id for job %s in state %s:" % (job_name, ApplicationState.RUNNING.value)) try: - logger.info("Fetching spark driver for application id : %s" % control.APPLICATION_ID) + logger.info("Fetching spark driver for application id: %s" % control.APPLICATION_ID) executors = spark_client_utils.get_application_active_executors(control.APPLICATION_ID) except RetryError: raise ChaosActionFailedError( - "Could not fetch spark executors for the job:" % job_name) + "Could not fetch spark executors for the application id: %s" % control.APPLICATION_ID) response = None for executor in executors: if executor["id"] == "driver": From d29f94b24ec32c94c60b2e236f3440321a348aec Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Fri, 30 Aug 2019 12:57:14 +0530 Subject: [PATCH 33/40] Updated Readme.md --- README.md | 87 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 61 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 7b9dd12..3b4af2d 100644 --- a/README.md +++ b/README.md @@ -151,18 +151,23 @@ The second time the hypothesis is applied is after the conditions were changed i ``` -## Jio Use Cases Solved +## Jio Use Cases Implemented Job Name : Media Plane Job frequency : 
15min -Number of job instances: 1 +Number of job instances being run: 1 Assumption : Job is already running on the cluster. +** NOTE: No custom code required here by the user. These three use cases (i.e experiments) have been templatized and these templates have been stored on fileserver at location: +`http://192.168.192.201/guavus/automation/chaos/exp_templates/spark/` +You need to provide this template path as an input to run your chaos experiments. + + ### Use case 1: Kill n number of spark executors for a spark job running on yarn and validate data for that job instance. ``` -Chaos Experiment Template path - chaostoolkit_nimble/resources/exp_templates/spark/executor_kill_exp.json +Chaos Experiment Template path (exp_template_file) = "automation/chaos/exp_templates/spark/executor_kill_exp.json" ------Before experiment control: Read the user given testbed and initialize nimble `node_obj` object. @@ -189,7 +194,7 @@ python -m pytest -k "test_chaos_on_executor_kill or test_data_validation_post_c ### Use case 2: Kill the spark driver for a spark job running on yarn and validate data for that job instance. ``` -Chaos Experiment Template - chaostoolkit_nimble/resources/exp_templates/spark/driver_kill_exp.json +Chaos Experiment Template path (exp_template_file) = "automation/chaos/exp_templates/spark/driver_kill_exp.json" ------Before experiment control: Read the user given testbed and initialize nimble `node_obj` object. @@ -216,7 +221,7 @@ python -m pytest -k "not(test_chaos_on_executor_kill or test_chaos_on_driver_an ### Use case 3: Kill the driver and n number of executors for a spark job running on yarn and validate data for that job instance. 
``` -Chaos Experiment Template used : chaostoolkit_nimble/resources/exp_templates/spark/driver_and_executor_kill_exp.json +Chaos Experiment Template path (exp_template_file) = "automation/chaos/exp_templates/spark/driver_and_executor_kill_exp.json" ------Before experiment control: Read the user given testbed and initialize nimble `node_obj` object. @@ -250,27 +255,14 @@ Python 3 is already installed on the system. Automation code from your own solution repo is already checked out on the system. -##### Checkout 'chaos_eng_automation' repo code (i.e chaostoolkit-nimble) - -``` -1) mkdir chaos_automation ; cd chaos_automation -2) git clone https://github.com/kritika-saxena-guavus/chaos_eng_automation.git -3) cd chaos_eng_automation ; git checkout AUT-563 -``` - -##### Checkout 'st-automation' repo code (i.e nimble migrated on python 3) -``` -4) cd ../ ; git clone https://github.com/Guavus/st-automation.git -5) cd st-automation ; git checkout AUT-439-my-copy -``` - -##### Add dependencies for projects 'chaos_eng_automation' and 'st-automation' in a virtual env +##### Install chaostoolkit-nimble package ``` 1) cd ../ ; mkdir chaos_virenv ; cd chaos_virenv 2) virtualenv --python=python3 venv 3) source venv/bin/activate -4) pip install -r st-automation/requirements.txt -5) pip install -r chaos_eng_automation/requirements.txt +4)Remove nimble and add chaostoolkit-nimble in your requirements.txt +5)Install chaostoolkit-nimble in their virtualenv using command: +6) pip install -r --extra-index-url http://192.168.192.201:5050/simple/ --trusted-host 192.168.192.201 ``` ##### Add this virtual env in pycharm @@ -278,13 +270,56 @@ Automation code from your own solution repo is already checked out on the system Pycharm --> Preferences --> Project interpreter --> settings --> show all --> add the chaos_virenv ``` -##### Add the project 'chaos_eng_automation' and 'st-automation' as your main automation project dependencies +##### Post installation changes + +###### `Pre-requisite` 
+The testbed file names should follow the nomenclature `open_nebula_*`. + +###### `Changes required` +1)Add the chaos test case in the corresponding job's test file. +2)Update conftest.py with below piece of code + +``` +parser.addoption("--experimentsPath", + help="Relative path (to the project root) of the file containing chaos experiment json files. E.g. python -m pytest --validationConfig=resources/validation/chaos_exp_config.yml") +``` + ``` -1) Open the projects 'chaos_eng_automation' and 'st-automation' in the same pycharm window as that of your current project. -2) Pycharm --> Preferences --> Project dependencies --> Check chaos_eng_automation and 'st-automation' -3) Pycharm --> Preferences --> Project interpreter --> select the virtual env 'chaos_virenv' on both these projects +@pytest.fixture(scope="session", autouse=True) +def initialize_node_obj(request): + testbed_file = request.config.getoption("--testbed") + component_arttributes_file = request.config.getoption("--componentAttributesConfig") + if not component_arttributes_file: + component_arttributes_file = "nimble/resources/components/component_attributes.yml" + setup_files_base_path = "%s/setup" % global_constants.DEFAULT_LOCAL_TMP_PATH + if testbed_file: + NodeManager.initialize(testbed_file, component_arttributes_file) + ShellUtils.execute_shell_command( + ShellUtils.remove_and_create_directory(setup_files_base_path)) + testbed_file_tmp_path = "%s/%s" % (setup_files_base_path, testbed_file.rsplit("/", 1)[1]) + component_arttributes_file_tmp_path = "%s/%s" % ( + setup_files_base_path, component_arttributes_file.rsplit("/", 1)[1]) + ShellUtils.execute_shell_command(ShellUtils.copy(testbed_file, testbed_file_tmp_path)) + ShellUtils.execute_shell_command( + ShellUtils.copy(component_arttributes_file, component_arttributes_file_tmp_path)) + yield + ShellUtils.execute_shell_command(ShellUtils.remove(setup_files_base_path, recursive=True)) +``` +##### Resolving dependency issues on MAC +* `Install 
python 3 using below command` +``` +brew install python3 +``` +* `Chaos html report generation issue` +``` +pip install cairocffi --- already satisfied +brew uninstall py2cairo --- this will not install properly but one of its dependencies will get installed successfully. 'i.e' "cairo" +export PKG_CONFIG_PATH="/usr/local/opt/libffi/lib/pkgconfig" +pip install pycairo +brew install pandoc ``` +``` # Chaos Toolkit Kubernetes Support [![Build Status](https://travis-ci.org/chaostoolkit/chaostoolkit-kubernetes.svg?branch=master)](https://travis-ci.org/chaostoolkit/chaostoolkit-kubernetes) From 2d338a02fdf362d899ab95eafb492c4f63f99f64 Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Fri, 30 Aug 2019 13:08:07 +0530 Subject: [PATCH 34/40] Updated Readme.md --- README.md | 46 ++++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 3b4af2d..9354e07 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,11 @@ + +# chaosToolkit-nimble (Guavus Chaos Test automation framework) + +- [ChaosToolkit Overview](#ChaosToolkit Overview) +- [Jio Use Cases Implemented](#Jio Use Cases Implemented) +- [Installation](#Installation of chaostoolkit-nimble on the local system (MAC)) +- [Resolving dependency issues on local system (MAC)](#Resolving dependency issues on the local system (MAC)) + ## ChaosToolkit Overview *The end goal of using chaostoolkit is to practice chaos engineering, and discover how your system reacts when certain anomalies are injected in it. @@ -247,37 +255,34 @@ python -m pytest -k "test_chaos_on_driver_and_executor_kill or test_data_validat ``` -## Setting up chaostoolkit-nimble on local system +## Installation of chaostoolkit-nimble on the local system (MAC) -Assumptions : -Python 3 is already installed on the system. +`Assumptions` : -Automation code from your own solution repo is already checked out on the system. +* Python 3 is already installed on the system. 
+* Automation code from your own solution repo is already checked out on the system. -##### Install chaostoolkit-nimble package +##### 1. Install chaostoolkit-nimble package ``` -1) cd ../ ; mkdir chaos_virenv ; cd chaos_virenv -2) virtualenv --python=python3 venv -3) source venv/bin/activate -4)Remove nimble and add chaostoolkit-nimble in your requirements.txt -5)Install chaostoolkit-nimble in their virtualenv using command: -6) pip install -r --extra-index-url http://192.168.192.201:5050/simple/ --trusted-host 192.168.192.201 +1.1 cd ../ ; mkdir chaos_virenv ; cd chaos_virenv +1.2 virtualenv --python=python3 venv +1.3 source venv/bin/activate +1.4 Remove nimble and add chaostoolkit-nimble in your requirements.txt +1.5 Install chaostoolkit-nimble in your virtualenv using command: +1.6 pip install -r requirements.txt --extra-index-url http://192.168.192.201:5050/simple/ --trusted-host 192.168.192.201 ``` -##### Add this virtual env in pycharm +##### 2. Add this virtual env in pycharm ``` Pycharm --> Preferences --> Project interpreter --> settings --> show all --> add the chaos_virenv ``` -##### Post installation changes +##### 3. Post installation changes -###### `Pre-requisite` -The testbed file names should follow the nomenclature `open_nebula_*`. - -###### `Changes required` -1)Add the chaos test case in the corresponding job's test file. -2)Update conftest.py with below piece of code +3.1 Make sure the testbed file name follows the nomenclature `open_nebula_*`. If not, rename it accordingly. +3.2 Add the chaos test case in the corresponding job's test file.
+3.3 Update conftest.py with below piece of code ``` parser.addoption("--experimentsPath", @@ -305,7 +310,8 @@ def initialize_node_obj(request): yield ShellUtils.execute_shell_command(ShellUtils.remove(setup_files_base_path, recursive=True)) ``` -##### Resolving dependency issues on MAC + +## Resolving dependency issues on local system (MAC) * `Install python 3 using below command` ``` brew install python3 From b1fb302e59dd431c11550dfc9b1d3c6f4adc0cd7 Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Fri, 30 Aug 2019 13:28:17 +0530 Subject: [PATCH 35/40] Updated Readme.md --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9354e07..f75dc99 100644 --- a/README.md +++ b/README.md @@ -6,12 +6,15 @@ - [Installation](#Installation of chaostoolkit-nimble on the local system (MAC)) - [Resolving dependency issues on local system (MAC)](#Resolving dependency issues on the local system (MAC)) + ## ChaosToolkit Overview +* The Chaostoolkit is an open source third party tool that enables you to run chaos engineering experiments seamlessly against applications and infrastructure components to assess resiliency and fault tolerance. -*The end goal of using chaostoolkit is to practice chaos engineering, and discover how your system reacts when certain anomalies are injected in it. +* With use of Chaostoolkit you may learn how to change the system accordingly and make it more resilient on multiple levels like application, network and platform. -*By doing this in a controlled fashion, you may learn how to change the system accordingly and make it more resilient on multiple levels like application, network and platform. +* Chaostoolkit has been integrated with nimble which led to the formulation of `chaostoolkit-nimble`. +* `Chaostoolkit-nimble` is built on top of nimble and using all its core utilities. 
### The Various Sections of an Experiment ##### `Controls` From 61c376e1aa10c9a322c4440c9e9d4009e2e47562 Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Fri, 30 Aug 2019 13:33:14 +0530 Subject: [PATCH 36/40] Updated Readme.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f75dc99..68c74f5 100644 --- a/README.md +++ b/README.md @@ -200,7 +200,7 @@ User inputs required: * Num of executors to kill. Default is 1. Pytest command: -python -m pytest -k "test_chaos_on_executor_kill or test_data_validation_post_chaos" --testbed=chaostoolkit_nimble/resources/testbeds/open_nebula_135_35.yml --componentAttributesConfig=chaostoolkit_nimble/resources/components/component_attributes_kerberos.yml --validationConfig=chaostoolkit_nimble/resources/validation/sample_validation_config.yml chaostoolkit_nimble/tests/sample/test_jio_spark_job.py +python -m pytest -k "test_chaos_on_executor_kill or test_data_validation" --testbed=chaostoolkit_nimble/resources/testbeds/open_nebula_135_35.yml --componentAttributesConfig=chaostoolkit_nimble/resources/components/component_attributes_kerberos.yml --validationConfig=chaostoolkit_nimble/resources/validation/sample_validation_config.yml chaostoolkit_nimble/tests/sample/test_jio_spark_job.py ``` ### Use case 2: Kill the spark driver for a spark job running on yarn and validate data for that job instance. @@ -254,7 +254,7 @@ User inputs required: * Num of executors to kill. Default is 1. 
Pytest command: -python -m pytest -k "test_chaos_on_driver_and_executor_kill or test_data_validation_post_chaos" --testbed=chaostoolkit_nimble/resources/testbeds/open_nebula_135_35.yml --componentAttributesConfig=chaostoolkit_nimble/resources/components/component_attributes_kerberos.yml --validationConfig=chaostoolkit_nimble/resources/validation/sample_validation_config.yml chaostoolkit_nimble/tests/sample/test_jio_spark_job.py +python -m pytest -k "test_chaos_on_driver_and_executor_kill or test_data_validation" --testbed=chaostoolkit_nimble/resources/testbeds/open_nebula_135_35.yml --componentAttributesConfig=chaostoolkit_nimble/resources/components/component_attributes_kerberos.yml --validationConfig=chaostoolkit_nimble/resources/validation/sample_validation_config.yml chaostoolkit_nimble/tests/sample/test_jio_spark_job.py ``` From aa2f03d5bbc5bd99bce348800248e8f13f3b9e00 Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Mon, 2 Sep 2019 11:54:35 +0530 Subject: [PATCH 37/40] Updated Readme.md --- README.md | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 68c74f5..56e4af5 100644 --- a/README.md +++ b/README.md @@ -12,9 +12,7 @@ * With use of Chaostoolkit you may learn how to change the system accordingly and make it more resilient on multiple levels like application, network and platform. -* Chaostoolkit has been integrated with nimble which led to the formulation of `chaostoolkit-nimble`. - -* `Chaostoolkit-nimble` is built on top of nimble and using all its core utilities. +* `Chaostoolkit-nimble` is built on top of `nimble` and `chaostoolkit-kubernetes` which is a kubernetes-specific chaos extension. ### The Various Sections of an Experiment ##### `Controls` @@ -47,6 +45,17 @@ The second time the hypothesis is applied is after the conditions were changed i *This block will be executed always irrespective of the fact that Hypothesis was met or not in the first time. 
+#### `Possible Types of an activity` +###### `Probes` +``` +A probe is a way of detecting a particular set of conditions in the system that is undergoing experimentation. +``` + +###### `Action` +``` +An action is a particular activity that needs to be enacted on the system under experimentation. +``` + #### `Sample Experiment json file` ``` { From 0effd094570db4f90a5b81ec4e5022edd5949f01 Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Mon, 2 Sep 2019 12:04:47 +0530 Subject: [PATCH 38/40] Updated probes and actions in Readme.md --- README.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 56e4af5..ab53d4f 100644 --- a/README.md +++ b/README.md @@ -45,15 +45,17 @@ The second time the hypothesis is applied is after the conditions were changed i *This block will be executed always irrespective of the fact that Hypothesis was met or not in the first time. ``` -#### `Possible Types of an activity` -###### `Probes` +#### `Different Types of Activities` +###### `Probe` ``` -A probe is a way of detecting a particular set of conditions in the system that is undergoing experimentation. +* A probe is a way of detecting a particular set of conditions in the system that is undergoing experimentation. +* The hypothesis uses only probes; the method may or may not use them. ``` ###### `Action` ``` -An action is a particular activity that needs to be enacted on the system under experimentation. +* An action is a particular activity that needs to be enacted on the system under experimentation. +* Rollbacks consist only of actions; the method also uses actions.
``` #### `Sample Experiment json file` From c0ac6703344f438b6b770e4067a7227766e04e48 Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Mon, 2 Sep 2019 14:31:54 +0530 Subject: [PATCH 39/40] Readme.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ab53d4f..2fb7423 100644 --- a/README.md +++ b/README.md @@ -277,7 +277,7 @@ python -m pytest -k "test_chaos_on_driver_and_executor_kill or test_data_validat * Automation code from your own solution repo is already checked out on the system. -##### 1. Install chaostoolkit-nimble package +##### 1. Install chaostoolkit-nimble package in a virtualenv ``` 1.1 cd ../ ; mkdir chaos_virenv ; cd chaos_virenv 1.2 virtualenv --python=python3 venv From a3a7111caa4538cb5278aee2ae77dd908210d1e1 Mon Sep 17 00:00:00 2001 From: "kritika.saxena" Date: Thu, 17 Oct 2019 13:13:08 +0530 Subject: [PATCH 40/40] Changes done corresponding to nimble changes. Also release changes in setup.py --- chaostoolkit_nimble/controllers/spark/control.py | 4 ++-- chaostoolkit_nimble/core/utils/yarn_apps_ha_utils.py | 2 +- setup.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/chaostoolkit_nimble/controllers/spark/control.py b/chaostoolkit_nimble/controllers/spark/control.py index 97cdda8..5e3b043 100644 --- a/chaostoolkit_nimble/controllers/spark/control.py +++ b/chaostoolkit_nimble/controllers/spark/control.py @@ -64,14 +64,14 @@ def after_experiment_control(context: Experiment, state: Journal, logger.debug("AFTER EXPERIMENT CONTROL: %s" % state) hadoop_rest_client_utils = HadoopRestClientUtils() try: - hadoop_rest_client_utils.is_yarn_job_finished(APPLICATION_ID) + hadoop_rest_client_utils.wait_for_yarn_job_to_finish(APPLICATION_ID) job_stats = hadoop_rest_client_utils.get_yarn_job_details(APPLICATION_ID) logger.info("Total execution time for yarn job with application id %s: %s ms (i.e %s minutes) " % ( APPLICATION_ID, job_stats["app"]["elapsedTime"], 
date_utils.get_minutes_from_milliseconds(job_stats["app"]["elapsedTime"]))) except RetryError: try: - hadoop_rest_client_utils.is_yarn_job_finished(APPLICATION_ID) + hadoop_rest_client_utils.wait_for_yarn_job_to_finish(APPLICATION_ID) job_stats = hadoop_rest_client_utils.get_yarn_job_details(APPLICATION_ID) logger.info("Total execution time for yarn job with application id %s: %s ms (i.e %s minutes) " % ( APPLICATION_ID, job_stats["app"]["elapsedTime"], diff --git a/chaostoolkit_nimble/core/utils/yarn_apps_ha_utils.py b/chaostoolkit_nimble/core/utils/yarn_apps_ha_utils.py index 14e0115..aded380 100644 --- a/chaostoolkit_nimble/core/utils/yarn_apps_ha_utils.py +++ b/chaostoolkit_nimble/core/utils/yarn_apps_ha_utils.py @@ -8,7 +8,7 @@ def is_job_running_on_yarn(job_name): hadoop_rest_client_utils = HadoopRestClientUtils() logger.debug("Checking if job '%s' running on yarn" % job_name) try: - return hadoop_rest_client_utils.is_yarn_job_running(job_name=job_name) + return hadoop_rest_client_utils.wait_for_yarn_job_to_start(job_name=job_name) except RetryError: logger.info("Not able to fetch yarn job '%s' status." % job_name) return False diff --git a/setup.py b/setup.py index 71af68d..2f71647 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ name="chaostoolkit-nimble", packages=find_packages(), description="Guavus Chaos Test automation framework", - version="0.0.1", + version="0.0.2", install_requires=requirements, url="https://github.com/kritika-saxena-guavus/chaos_eng_automation", author="Core Automation Squad",