Data generation

begin
	import MLDataUtils
	import MLJGLMInterface
	import MLJDecisionTreeInterface

	using AlgebraOfGraphics
	using CairoMakie
	using CategoricalArrays
	using DataFrames
	using Distributions
	using MLJBase
	using MLJ
	using StableRNGs: StableRNG
	using Random
end
df = let
	n = 80
	μ1 = 10
	μ2 = 12
	σ = 2

	d1 = Normal(μ1, σ)
	d2 = Normal(μ2, σ)

	Random.seed!(123)
	classes = categorical(rand(["A", "B"], n))

	# U depends on the class (drawn from d1 or d2); V is pure noise, unrelated to class
	df = DataFrame(
		class = classes,
		U = [class == "A" ? rand(d1) : rand(d2) for class in classes],
		V = rand(Normal(100, 10), n)
	)
end
class  U        V
B      11.5802  111.588
A      10.0062  99.9623
A      12.719   93.8876
A      10.6082  110.054
B      8.8399   108.481
B      13.2351  99.2827
A      9.97171  90.8848
B      8.75458  113.906
A      8.96639  89.2201
B      10.3287  113.208
...
A      12.0734  101.862
let
	uv = data(df) * mapping(:U, :V, color=:class)
	draw(uv)
end

Train and test split

Training and evaluating (testing) on the same data is not particularly useful because we want to know how well our model generalizes to unseen data; see, for example, the topic of overfitting. Instead, we split the data into a train and a test set.

train, test = let
	rng = StableRNG(123)
	MLJ.partition(eachindex(df.class), 0.7; shuffle=true, rng)
end;
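
As a quick sanity check (not part of the original analysis), the sizes of the two index vectors should reflect the roughly 70/30 split:

(n_train = length(train), n_test = length(test))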

Model fitting

logistic_model = let
	LinearBinary = @load LinearBinaryClassifier pkg=GLM verbosity=0
	logistic_model = LinearBinary()
end;
begin
	logistic = machine(logistic_model, (U = df.U, V = df.V), df.class)
	fit!(logistic; rows=train)
	fitted_params(logistic).coef
end
[0.717045, 0.0409366, -12.2757]

The second coefficient in the linear model is close to zero. This is exactly what the model should find, since V is pure random noise and carries no information about the class.
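
To make the mapping between coefficients and features explicit, the coefficient vector can be paired with labels; a small sketch, assuming the terms are ordered as U, V, intercept, consistent with the output above:

let
	coefs = fitted_params(logistic).coef
	# Assumed term order; matches the printed coefficient vector above
	DataFrame(term = ["U", "V", "(Intercept)"], coefficient = coefs)
end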

forest_model = let
	DecisionTree = @load DecisionTreeClassifier pkg=DecisionTree verbosity=0
	tree = DecisionTree()
	EnsembleModel(atom=tree, n=10)
end;
forest = let
	forest = machine(forest_model, (U = df.U, V = df.V), df.class)
	fit!(forest; rows=train);
	forest
end;

Accuracy

Now that we have fitted the two models, we can compare their performance on the test set, starting with the receiver operating characteristic (ROC) curve.

let
	truths = df.class[test]
	
	logistic_predictions = MLJ.predict(logistic, rows=test)
	logistic_fprs, logistic_tprs, _ = roc_curve(logistic_predictions, truths)
	
	forest_predictions = MLJ.predict(forest, rows=test)
	forest_fprs, forest_tprs, _ = roc_curve(forest_predictions, truths)
	
	logistic_df = DataFrame(
		x = logistic_fprs,
		y = logistic_tprs,
		method = "logistic"
	)

	forest_df = DataFrame(
		x = forest_fprs,
		y = forest_tprs,
		method = "forest"
	)

	roc_df = vcat(logistic_df, forest_df)

	xy = data(roc_df)
	xy *= smooth() + visual(Scatter)
	xy *= mapping(
		:x => "False positive rate",
		:y => "True positive rate",
		color=:method)

	draw(xy)
end
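
Both curves can also be summarized in a single number, the area under the curve (AUC); a minimal sketch, assuming the auc measure exported by MLJ:

let
	truths = df.class[test]
	# Area under the ROC curve for both models on the held-out test rows
	logistic_auc = auc(MLJ.predict(logistic, rows=test), truths)
	forest_auc = auc(MLJ.predict(forest, rows=test), truths)
	(logistic = logistic_auc, forest = forest_auc)
end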

K-fold cross-validation

By doing a single train and test split, we effectively throw part of the data away: the test observations are never used for fitting. For small datasets, like the one in this example, that is not very efficient. Therefore, we also run a k-fold cross-validation.

folds = let
	Random.seed!(123)
	rng = MersenneTwister(123)
	indexes = shuffle(rng, eachindex(df.class))
	folds = MLDataUtils.kfolds(indexes, k = 8)
end;
r3(x) = round(x; digits=3);
function fitted_accuracy(model, train, test)
	# Fit the given model on the training rows and return the rounded test accuracy.
	mach = machine(model, (U = df.U, V = df.V), df.class)
	fit!(mach; rows=train)
	predictions = predict_mode(mach, rows=test)
	return accuracy(predictions, df.class[test]) |> r3
end;
let 
	accuracies = [fitted_accuracy(logistic_model, train, test) for (train, test) in folds]
	accuracies, mean(accuracies) |> r3
end
([0.7, 0.7, 0.4, 0.6, 0.7, 0.8, 0.7, 0.6], 0.65)
let
	accuracies = [fitted_accuracy(forest_model, train, test) for (train, test) in folds]
	accuracies, mean(accuracies) |> r3
end
([0.5, 0.6, 0.3, 0.5, 0.5, 0.7, 0.5, 0.4], 0.5)
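
The folds above were constructed by hand via MLDataUtils; MLJ also offers built-in resampling that gives a comparable estimate in one call. A sketch, assuming evaluate and CV from MLJ (the fold assignment differs, so the numbers will not match the manual folds exactly):

let
	X = (U = df.U, V = df.V)
	y = df.class
	# 8-fold cross-validation, scoring accuracy on the mode of the predicted distribution
	evaluate(logistic_model, X, y;
		resampling=CV(nfolds=8, shuffle=true, rng=StableRNG(123)),
		measure=accuracy, operation=predict_mode, verbosity=0)
end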

Version

Built with Julia 1.6.3 and

AlgebraOfGraphics 0.6.0
CairoMakie 0.6.6
CategoricalArrays 0.10.1
DataFrames 1.2.2
Distributions 0.25.22
MLDataUtils 0.5.4
MLJ 0.16.10
MLJBase 0.18.23
MLJDecisionTreeInterface 0.1.3
MLJGLMInterface 0.1.7
StableRNGs 1.0.0