NGDC_method/test_competitors.py at main · Sorooshi/NGDC_method · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
import os
import time
import argparse
import numpy as np
from pathlib import Path
from types import SimpleNamespace
from gdcm.data.load_data import FeaturesData
from gdcm.data.preprocess import preprocess_features
from gdcm.common.utils import load_a_dict, save_a_dict, print_the_evaluated_results

np.set_printoptions(suppress=True, precision=3, linewidth=140)


def args_parser(arguments):

    _pp = arguments.pp.lower()
    _run = arguments.run
    _data_name = arguments.data_name.lower()
    _algorithm_name = arguments.algorithm_name.lower()
    _n_clusters = arguments.n_clusters
    _n_repeats = arguments.n_repeats
    _n_init = arguments.n_init

    return _pp, _run, _data_name, _algorithm_name, _n_clusters, _n_repeats, _n_init


configs = {
    "results_path": Path("/home/soroosh/Programmes/GDCM/Results"),
    "figures_path": Path("/home/soroosh/Programmes/GDCM/Figures"),
    "params_path": Path("/home/soroosh/Programmes/GDCM/Params"),
    "data_path": Path("/home/soroosh/Programmes/GDCM/Datasets"),
}

configs = SimpleNamespace(**configs)

if not configs.results_path.exists():
    configs.results_path.mkdir()

if not configs.figures_path.exists():
    configs.figures_path.mkdir()

if not configs.params_path.exists():
    configs.params_path.mkdir()

if __name__ == "__main__":

    # all the string inputs will be converted to lower case.
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--data_name", type=str, default="IRIS",
        help="Dataset's name, e.g., IRIS, or Lawyers, or dd_fix_demo."
    )

    parser.add_argument(
        "--algorithm_name", type=str, default="km_clu",
        help="None case sensitive first letter abbreviated name of an estimator proceeds "
             "  with _clu e.g., K-Means clustering := km_clu."
             "Note: First letter of the methods' name should be used for abbreviation."
    )

    parser.add_argument(
        "--run", type=int, default=0,
        help="Run the model or load the saved"
             " weights and reproduce the results."
    )

    parser.add_argument(
        "--pp", type=str, default="mm",
        help="Data preprocessing method:"
             " MinMax/Z-Scoring/etc."
    )

    parser.add_argument(
        "--n_clusters", type=int, default=5,
        help="Number of clusters/classes/discrete target values."
    )

    parser.add_argument(
        "--n_repeats", type=int, default=10,
        help="Number of repeats of a data set or of a specific distribution"
    )

    parser.add_argument(
        "--n_init", type=int, default=10,
        help="Number of repeats with different seed initialization to select the best results on a data set."
    )

    args = parser.parse_args()

    pp, run, data_name, algorithm_name, n_clusters, n_repeats, n_init = args_parser(arguments=args)

    print(
        "configuration: \n",
        "  estimator:", algorithm_name, "\n",
        "  data_name:", data_name, "\n",
        "  pre-processing:", pp, "\n",
        "  run:", run, "\n",
    )

    # Adding some details for the sake of clarity in storing and visualization
    configs.run = run
    specifier = " -alg: " + algorithm_name + \
                " -data: " + data_name + \
                " -n_init: " + str(n_init)

    configs.specifier = specifier
    configs.data_name = data_name
    configs.n_repeats = n_repeats

    # to add the repeat numbers to the data_name variable for synthetic data
    if "n=" in data_name or "k=" in data_name or "v=" in data_name:
        synthetic_data = True
    else:
        synthetic_data = False

    if run == 1:
        results = {}
        for repeat in range(1, configs.n_repeats+1):

            repeat = str(repeat)
            results[repeat] = {}

            if algorithm_name.split("_")[-1].lower() == "clu":
                print(
                    "clustering features_only data: applying competitors on " + data_name + " repeat=" + repeat, "\n"
                )

                from gdcm.algorithms.clustering_methods_competitors import ClusteringEstimators

                if synthetic_data is True:
                    dire = "F/synthetic"
                    dn = data_name + "_" + repeat

                else:
                    dire = "F"
                    dn = data_name

                data_path = os.path.join(configs.data_path, dire)
                fd = FeaturesData(name=dn, path=data_path)

                x, xn, y_true = fd.get_dataset()
                results[repeat]['y_true'] = y_true

                x = preprocess_features(x=x, pp=pp)
                if xn.shape[0] != 0:
                    xn = preprocess_features(x=xn, pp=pp)
                n_clusters = len(np.unique(y_true))

                # instantiate and fit
                start = time.process_time()
                cu = ClusteringEstimators(
                    algorithm_name=algorithm_name,
                    n_clusters=n_clusters,
                    n_init=n_init
                )

                cu.instantiate_estimator_with_parameters()
                y_pred = cu.fit_estimator(x=x, y=y_true)
                end = time.process_time()

                # save results and logs
                results[repeat]['y_pred'] = y_pred
                results[repeat]['time'] = end-start
                results[repeat]['inertia'] = cu.inertia
                results[repeat]['data_scatter'] = cu.data_scatter

            else:
                assert False, "Ill-defined algorithm name!"

        # save results dict and configs
        save_a_dict(
            a_dict=results, name=configs.specifier, save_path=configs.results_path
        )

        save_a_dict(
            a_dict=configs, name=configs.specifier, save_path=configs.params_path
        )

        print_the_evaluated_results(results=results)

    elif run != 1:

        # load results dict and configs
        results = load_a_dict(
            name=configs.specifier, save_path=configs.results_path
        )

        configs = load_a_dict(
            name=configs.specifier, save_path=configs.params_path
        )

        print("configs \n", configs.specifier, "\n")

        print_the_evaluated_results(results=results)