Commit 9bdc082

Merge branch 'dev' into entity-embedder-docstring-improvements

2 parents: b2226a9 + dc42998

File tree

3 files changed: +33 −8 lines

Project.toml

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 name = "MLJFlux"
 uuid = "094fc8d1-fd35-5302-93ea-dabda2abf845"
 authors = ["Anthony D. Blaom <anthony.blaom@gmail.com>", "Ayush Shridhar <ayush.shridhar1999@gmail.com>"]
-version = "0.6.5"
+version = "0.6.6"

 [deps]
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"

src/mlj_embedder_interface.jl

Lines changed: 21 additions & 7 deletions

@@ -82,14 +82,27 @@ are still presented a target variable in training, but they behave as transformers in
 pipelines. They are entity embedding transformers, in the sense of the article, "Entity
 Embeddings of Categorical Variables" by Cheng Guo, Felix Berkhahn.

-The atomic `model` must be an instance of `MLJFlux.NeuralNetworkClassifier`,
-`MLJFlux.NeuralNetworkBinaryClassifier`, `MLJFlux.NeuralNetworkRegressor`, or
-`MLJFlux.MultitargetNeuralNetworkRegressor`. Hyperparameters of the atomic model, in
-particular `builder` and `embedding_dims`, will effect embedding performance.
+# Training data

-The wrapped model is bound to a machine and trained exactly as the wrapped supervised
-`model`, and supports the same form of training data. In particular, a training target
-must be supplied.
+In MLJ (or MLJBase) bind an instance unsupervised `model` to data with
+
+    mach = machine(embed_model, X, y)
+
+Here:
+
+- `embed_model` is an instance of `EntityEmbedder`, which wraps a supervised MLJFlux
+  model, `model`, which must be an instance of one of these:
+  `MLJFlux.NeuralNetworkClassifier`, `NeuralNetworkBinaryClassifier`,
+  `MLJFlux.NeuralNetworkRegressor`, `MLJFlux.MultitargetNeuralNetworkRegressor`.
+
+- `X` is any table of input features supported by the model being wrapped. Features to be
+  transformed must have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)`
+  to check scitypes.
+
+- `y` is the target, which can be any `AbstractVector` supported by the model being
+  wrapped.
+
+Train the machine using `fit!(mach)`.

 # Examples

@@ -107,6 +120,7 @@ X = (
     b = categorical(rand("abcde", N)),
     c = categorical(rand("ABCDEFGHIJ", N), ordered = true),
 )
+
 y = categorical(rand("YN", N));

 # Initiate model
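Taken together, the new "Training data" section of the docstring describes a standard MLJ machine workflow. The following is a minimal end-to-end sketch of that workflow, assuming the MLJ, MLJFlux, and CategoricalArrays packages are installed; the data mirror the docstring's own example, and `epochs = 5` and the positional `EntityEmbedder(clf)` construction are illustrative assumptions, not taken from this commit:

```julia
# Hedged sketch of the workflow described in the updated docstring.
# Assumes MLJ, MLJFlux, and CategoricalArrays are available.
using MLJ, MLJFlux, CategoricalArrays

N = 200
X = (
    a = rand(Float32, N),                                    # Continuous
    b = categorical(rand("abcde", N)),                       # Multiclass
    c = categorical(rand("ABCDEFGHIJ", N), ordered = true),  # OrderedFactor
)
y = categorical(rand("YN", N))

# Wrap a supported supervised MLJFlux model in an EntityEmbedder
# (constructor form assumed; `epochs` value is illustrative):
clf = MLJFlux.NeuralNetworkClassifier(epochs = 5)
embed_model = MLJFlux.EntityEmbedder(clf)

# Bind to data and train, exactly as for the wrapped supervised model:
mach = machine(embed_model, X, y)
fit!(mach)

# Categorical features are replaced by learned continuous embeddings:
Xembedded = transform(mach, X)
```

As the docstring notes, `schema(X)` can be used beforehand to confirm which columns carry the `Multiclass` or `OrderedFactor` scitypes eligible for embedding.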

test/encoders.jl

Lines changed: 11 additions & 0 deletions

@@ -7,6 +7,7 @@
     Column3 = categorical(["b", "c", "d"]),
     Column4 = [1.0, 2.0, 3.0, 4.0, 5.0],
 )
+# Test Encoding Functionality
 map = MLJFlux.ordinal_encoder_fit(X; featinds = [2, 3])
 Xenc = MLJFlux.ordinal_encoder_transform(X, map)
 @test map[2] == Dict('a' => 1, 'b' => 2, 'c' => 3, 'd' => 4, 'e' => 5)

@@ -21,6 +22,16 @@
 @test !haskey(map, 1) # already encoded

 @test Xenc == MLJFlux.ordinal_encoder_fit_transform(X; featinds = [2, 3])[1]
+
+# Test Consistency with Types
+scs = schema(Xenc).scitypes
+ts = schema(Xenc).types
+
+# 1) all scitypes must be exactly Continuous
+@test all(scs .== Continuous)
+
+# 2) all types must be a concrete subtype of AbstractFloat (i.e. <: AbstractFloat, but ≠ AbstractFloat itself)
+@test all(t -> t <: AbstractFloat && isconcretetype(t), ts)
 end

 @testset "Generate New feature names Function Tests" begin
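The assertions added in this test encode a general invariant of the ordinal encoder: after encoding, every column has scitype `Continuous` and a concrete `AbstractFloat` eltype. A standalone sketch of the same checks follows, assuming MLJFlux, CategoricalArrays, and ScientificTypes (which provides `schema` and the `Continuous` scitype) are installed; the two-column table is hypothetical, not the one from the test suite:

```julia
# Hedged sketch of the type-consistency checks added in this commit.
using MLJFlux, CategoricalArrays
using ScientificTypes  # provides `schema` and the `Continuous` scitype

# Hypothetical table: one already-continuous column, one categorical column.
X = (
    Column1 = [1.0, 2.0, 3.0],
    Column2 = categorical(["a", "b", "a"]),
)

# Encode the categorical column (index 2), as in the test above:
Xenc = MLJFlux.ordinal_encoder_fit_transform(X; featinds = [2])[1]

# 1) every column scitype is exactly Continuous ...
@assert all(schema(Xenc).scitypes .== Continuous)

# 2) ... and every eltype is a concrete subtype of AbstractFloat
#    (e.g. Float32 or Float64, never the abstract type itself).
@assert all(t -> t <: AbstractFloat && isconcretetype(t), schema(Xenc).types)
```

Checking `isconcretetype` as well as the subtype relation rules out columns whose eltype widened to the abstract `AbstractFloat` during encoding, which would defeat type-stable downstream training.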

0 commit comments