```
begin
import MLDataUtils
import MLJGLMInterface
import MLJDecisionTreeInterface
using AlgebraOfGraphics
using CairoMakie
using CategoricalArrays
using DataFrames
using Distributions
using MLJBase
using MLJ
using StableRNGs: StableRNG
using Random
end
```

```
df = let
    # Simulate `n` rows for a two-class problem. `U` is drawn from one of two
    # overlapping normal distributions depending on the class, while `V` is
    # drawn from a single distribution regardless of the class.
    n = 80
    μ1 = 10
    μ2 = 12
    σ = 2
    d1 = Normal(μ1, σ)
    d2 = Normal(μ2, σ)
    # Seed the global RNG so that the draws below are reproducible.
    Random.seed!(123)
    # Convert to a categorical vector once; wrapping it in `categorical` a
    # second time when building the table adds nothing.
    classes = categorical(rand(["A", "B"], n))
    DataFrame(
        class = classes,
        # One draw per row, in row order. The random stream is consumed in the
        # sequence: class labels, then `U`, then `V`.
        U = [class == "A" ? rand(d1) : rand(d2) for class in classes],
        V = rand(Normal(100, 10), n),
    )
end
```

| class | U | V |
|---|---|---|
| CategoricalValue{String, UInt32} "B" | 11.5802 | 111.588 |
| CategoricalValue{String, UInt32} "A" | 10.0062 | 99.9623 |
| CategoricalValue{String, UInt32} "A" | 12.719 | 93.8876 |
| CategoricalValue{String, UInt32} "A" | 10.6082 | 110.054 |
| CategoricalValue{String, UInt32} "B" | 8.8399 | 108.481 |
| CategoricalValue{String, UInt32} "B" | 13.2351 | 99.2827 |
| CategoricalValue{String, UInt32} "A" | 9.97171 | 90.8848 |
| CategoricalValue{String, UInt32} "B" | 8.75458 | 113.906 |
| CategoricalValue{String, UInt32} "A" | 8.96639 | 89.2201 |
| CategoricalValue{String, UInt32} "B" | 10.3287 | 113.208 |
| ... | | |
| CategoricalValue{String, UInt32} "A" | 12.0734 | 101.862 |

```
let
    # Plot `U` against `V`, coloured by class.
    layers = data(df) * mapping(:U, :V; color=:class)
    draw(layers)
end
```

Training and evaluating (testing) on the same data is not particularly useful because we want to know how well our model generalizes. For more information, see topics such as overfitting. Instead, we split the data into a training set and a test set.

```
train, test = let
    # Shuffle the row indices with a fixed seed and split them at a 0.7 fraction.
    rows = eachindex(df.class)
    MLJ.partition(rows, 0.7; shuffle=true, rng=StableRNG(123))
end;
```

```
logistic_model = let
    # Load the model type from the GLM interface and instantiate it with its
    # default hyperparameters.
    Classifier = @load LinearBinaryClassifier pkg=GLM verbosity=0
    Classifier()
end;
```

```
begin
    # Bind the model to the two feature columns and the class labels, train on
    # the training rows only, and show the fitted coefficients.
    logistic = machine(logistic_model, (; df.U, df.V), df.class)
    fit!(logistic, rows=train)
    fitted_params(logistic).coef
end
```

`[0.717045, 0.0409366, -12.2757]`

The second coefficient in the linear model is close to zero. This is exactly what the model should do since `V` is random noise.

```
forest_model = let
    # An ensemble of 10 decision-tree classifiers, each with default settings.
    TreeClassifier = @load DecisionTreeClassifier pkg=DecisionTree verbosity=0
    EnsembleModel(atom=TreeClassifier(), n=10)
end;
```

```
forest = let
    # Same features and target as the logistic machine, trained on the same
    # training rows so that both models can be compared on the test rows.
    mach = machine(forest_model, (; df.U, df.V), df.class)
    fit!(mach, rows=train)
    mach
end;
```

Now that we have fitted the two models, we can compare the accuracies and plot the receiver operating characteristic.

```
let
    truths = df.class[test]

    # Predict on the test rows with `mach` and collect the ROC points in a
    # table tagged with `label`.
    function roc_frame(mach, label)
        predictions = MLJ.predict(mach, rows=test)
        fprs, tprs, _ = roc_curve(predictions, truths)
        return DataFrame(x = fprs, y = tprs, method = label)
    end

    roc_df = vcat(roc_frame(logistic, "logistic"), roc_frame(forest, "forest"))

    # Draw a smoothed line together with the raw points, one colour per method.
    layers = smooth() + visual(Scatter)
    labels = mapping(
        :x => "False positive rate",
        :y => "True positive rate",
        color=:method,
    )
    draw(data(roc_df) * layers * labels)
end
```

By doing a train and test split, we basically threw a part of the data away. For small datasets, like the dataset in this example, that is not very efficient. Therefore, we also do a k-fold cross-validation.

```
folds = let
    # NOTE(review): the shuffle below uses its own seeded generator, so this
    # global seed does not affect the folds themselves. It is kept because it
    # resets the global RNG state for whatever runs afterwards. Confirm whether
    # that is intended.
    Random.seed!(123)
    shuffled = shuffle(MersenneTwister(123), eachindex(df.class))
    # Split the shuffled row indices into 8 folds.
    MLDataUtils.kfolds(shuffled; k = 8)
end;
```

`r3(x) = round(x; digits=3); # round to three decimal places`

```
"""
    fitted_accuracy(model, train, test)

Fit `model` on the rows `train` of the notebook's `df` (features `U` and `V`,
target `class`) and return the accuracy of its modal predictions on the rows
`test`, rounded with `r3`.
"""
function fitted_accuracy(model, train, test)
    mach = machine(model, (; df.U, df.V), df.class)
    fit!(mach, rows=train)
    predicted = predict_mode(mach, rows=test)
    observed = df.class[test]
    return r3(accuracy(predicted, observed))
end;
```

```
let
    # Per-fold accuracy of the logistic model, followed by the rounded mean.
    scores = [
        fitted_accuracy(logistic_model, train_rows, test_rows)
        for (train_rows, test_rows) in folds
    ]
    (scores, r3(mean(scores)))
end
```

`([0.7, 0.7, 0.4, 0.6, 0.7, 0.8, 0.7, 0.6], 0.65)`

```
let
    # Per-fold accuracy of the tree ensemble, followed by the rounded mean.
    scores = [
        fitted_accuracy(forest_model, train_rows, test_rows)
        for (train_rows, test_rows) in folds
    ]
    (scores, r3(mean(scores)))
end
```

`([0.5, 0.6, 0.3, 0.5, 0.5, 0.7, 0.5, 0.4], 0.5)`

Package versions: AlgebraOfGraphics 0.6.0, CairoMakie 0.6.6, CategoricalArrays 0.10.1, DataFrames 1.2.2, Distributions 0.25.22, MLDataUtils 0.5.4, MLJ 0.16.10, MLJBase 0.18.23, MLJDecisionTreeInterface 0.1.3, MLJGLMInterface 0.1.7, StableRNGs 1.0.0