diff --git a/Project.toml b/Project.toml index 0ae8b54..4ae5d79 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "FeatureSelection" uuid = "33837fe5-dbff-4c9e-8c2f-c5612fe2b8b6" authors = ["Anthony D. Blaom ", "Samuel Okon name in [:x1, :x3], ignore = - true)` has the same effect as `FeatureSelector(features = [:x1, - :x3], ignore = true)`, namely to select all features, with the - exception of `:x1` and `:x3`. - -- `ignore`: whether to ignore or keep specified `features`, as - explained above - - -# Operations - -- `transform(mach, Xnew)`: select features from the table `Xnew` as - specified by the model, taking features seen during training into - account, if relevant - - -# Fitted parameters - -The fields of `fitted_params(mach)` are: - -- `features_to_keep`: the features that will be selected - - -# Example - -``` -using MLJ - -X = (ordinal1 = [1, 2, 3], - ordinal2 = coerce(["x", "y", "x"], OrderedFactor), - ordinal3 = [10.0, 20.0, 30.0], - ordinal4 = [-20.0, -30.0, -40.0], - nominal = coerce(["Your father", "he", "is"], Multiclass)); - -selector = FeatureSelector(features=[:ordinal3, ], ignore=true); - -julia> transform(fit!(machine(selector, X)), X) -(ordinal1 = [1, 2, 3], - ordinal2 = CategoricalValue{Symbol,UInt32}["x", "y", "x"], - ordinal4 = [-20.0, -30.0, -40.0], - nominal = CategoricalValue{String,UInt32}["Your father", "he", "is"],) - -``` -""" -FeatureSelector +# docstring is in "src/type_docstrings.jl" diff --git a/src/models/rfe.jl b/src/models/rfe.jl index 10a49fe..c87f1d4 100644 --- a/src/models/rfe.jl +++ b/src/models/rfe.jl @@ -22,7 +22,7 @@ const ERR_FEATURES_SEEN = ArgumentError( const MODEL_TYPES = [ :ProbabilisticRecursiveFeatureElimination, :DeterministicRecursiveFeatureElimination ] -const SUPER_TYPES = [:Deterministic, :Probabilistic] +const SUPER_TYPES = [:Probabilistic, :Deterministic] const MODELTYPE_GIVEN_SUPERTYPES = zip(MODEL_TYPES, SUPER_TYPES) for (ModelType, ModelSuperType) in MODELTYPE_GIVEN_SUPERTYPES @@ 
-181,6 +181,7 @@ function RecursiveFeatureElimination( # which is rare. throw(ERR_MODEL_TYPE) end + message = MMI.clean!(selector) isempty(message) || @warn(message) return selector diff --git a/src/type_docstrings.jl b/src/type_docstrings.jl new file mode 100644 index 0000000..7bed37b --- /dev/null +++ b/src/type_docstrings.jl @@ -0,0 +1,81 @@ +# This file cannot be included before types and all metadata are defined + +## Docstring +""" +$(MMI.doc_header(FeatureSelector)) + +Use this model to select features (columns) of a table, usually as +part of a model `Pipeline`. + + +# Training data + +In MLJ or MLJBase, bind an instance `model` to data with + + mach = machine(model, X) + +where + +- `X`: any table of input features, where "table" is in the sense of Tables.jl + +Train the machine using `fit!(mach, rows=...)`. + + +# Hyper-parameters + +- `features`: one of the following, with the behavior indicated: + + - `[]` (empty, the default): filter out all features (columns) which + were not encountered in training + + - non-empty vector of feature names (symbols): keep only the + specified features (`ignore=false`) or keep only unspecified + features (`ignore=true`) + + - function or other callable: keep a feature if the callable returns + `true` on its name. For example, specifying + `FeatureSelector(features = name -> name in [:x1, :x3], ignore = + true)` has the same effect as `FeatureSelector(features = [:x1, + :x3], ignore = true)`, namely to select all features, with the + exception of `:x1` and `:x3`. 
+ +- `ignore`: whether to ignore or keep specified `features`, as + explained above + + +# Operations + +- `transform(mach, Xnew)`: select features from the table `Xnew` as + specified by the model, taking features seen during training into + account, if relevant + + +# Fitted parameters + +The fields of `fitted_params(mach)` are: + +- `features_to_keep`: the features that will be selected + + +# Example + +``` +using MLJ + +X = (ordinal1 = [1, 2, 3], + ordinal2 = coerce(["x", "y", "x"], OrderedFactor), + ordinal3 = [10.0, 20.0, 30.0], + ordinal4 = [-20.0, -30.0, -40.0], + nominal = coerce(["Your father", "he", "is"], Multiclass)); + +selector = FeatureSelector(features=[:ordinal3, ], ignore=true); + +julia> transform(fit!(machine(selector, X)), X) +(ordinal1 = [1, 2, 3], + ordinal2 = CategoricalValue{Symbol,UInt32}["x", "y", "x"], + ordinal4 = [-20.0, -30.0, -40.0], + nominal = CategoricalValue{String,UInt32}["Your father", "he", "is"],) + +``` +""" +FeatureSelector diff --git a/test/models/featureselector.jl b/test/models/featureselector.jl index c38dc82..e89dd28 100644 --- a/test/models/featureselector.jl +++ b/test/models/featureselector.jl @@ -62,6 +62,7 @@ # Test model Metadata @test MLJBase.input_scitype(selector) == MLJBase.Table @test MLJBase.output_scitype(selector) == MLJBase.Table + @test MLJBase.package_name(selector) == "FeatureSelection" end # To be added with FeatureSelectorRule X = (n1=["a", "b", "a"], n2=["g", "g", "g"], n3=[7, 8, 9], diff --git a/test/models/rfe.jl b/test/models/rfe.jl index fb635c9..de95962 100644 --- a/test/models/rfe.jl +++ b/test/models/rfe.jl @@ -51,7 +51,9 @@ const DTM = DummyTestModels selector = RecursiveFeatureElimination(model=rf) selector2 = RecursiveFeatureElimination(model=rf2) @test selector isa FeatureSelection.DeterministicRecursiveFeatureElimination + @test selector isa MLJBase.Deterministic @test selector2 isa FeatureSelection.ProbabilisticRecursiveFeatureElimination + @test selector2 isa MLJBase.Probabilistic 
@test MLJBase.constructor(selector) == RecursiveFeatureElimination # Fit models @@ -104,8 +106,10 @@ const DTM = DummyTestModels # Traits @test MLJBase.package_name(selector) == "FeatureSelection" @test MLJBase.load_path(selector) == "FeatureSelection.RecursiveFeatureElimination" - @test MLJBase.iteration_parameter(selector) == FeatureSelection.prepend(:model, MLJBase.iteration_parameter(selector.model)) - @test MLJBase.training_losses(selector, rpt) == MLJBase.training_losses(selector.model, rpt.model_report) + @test MLJBase.iteration_parameter(selector) == + FeatureSelection.prepend(:model, MLJBase.iteration_parameter(selector.model)) + @test MLJBase.training_losses(selector, rpt) == + MLJBase.training_losses(selector.model, rpt.model_report) end @testset "Compare results for RFE with scikit-learn" begin