library(ggplot2)    # Data visualization
library(plotly)     # Interactive data visualizations
library(psych)      # Will be used for correlation visualizations
library(rattle)     # Graphing decision trees
library(caret)      # Machine learning
library(tidymodels) # Modelling framework (rsample, recipes, parsnip, tune, yardstick)
library(tidyverse)  # Data wrangling (dplyr, tidyr, stringr, forcats)
Machine Learning in Mining
| ore_stregth | host_rock_strength | thickness | shape | dip | ore_uniformity | depth | ore_grade | mining_method |
|---|---|---|---|---|---|---|---|---|
| Strong | Moderate-strong | Thick | Tabular | Intermediate | Uniform | Moderate | Moderate | Sublevel stoping |
| Moderate | Moderate-strong | Very narrow-narrow | Tabular | Intermediate | uniform | moderate / deep | Moderate | Longwall |
| Moderate | Strong | Narrow | Tabular | Steep | variable | moderate/deep | Fairly high | Cut and Fill |
| Strong | Moderate-strong | Thick | Irregular | Intermediate | variable | moderate/deep | Fairly high | Cut and Fill |
| Weak | Moderate | Very narrow | Irregular | Steep | variable | moderate/deep | Fairly high | Cut and Fill |
| Moderate | Weak-moderate | Very narrow | Tabular | Flat | uniform | Shallow / moderate | Moderate | Breast stoping |
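The chunks below assume a data frame out_new with the columns previewed above. A minimal loading sketch, assuming the records sit in a CSV file (the file name is hypothetical, not part of the original):

# Hypothetical file name -- adjust to wherever the records are stored
out_new <- readr::read_csv("mining_method_data.csv", show_col_types = FALSE)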
table(out_new$mining_method)
Apparent mining Block Caving Breast stoping Cut and Fill
16 49 23 195
Longwall Open Cast Open Pit Room and Pillar
65 32 32 83
Shrinkage Shrinkage Stoping Solution Mining Square Set
34 32 32 32
Stull Stoping Sublevel Caving Sublevel stoping Sublevel Stoping
32 48 16 283
table(out_new$ore_stregth)
Any Fair-strong Moderate Moderate-strong
64 96 219 66
Strong Strong-very strong Very strong Very weak-weak
166 48 17 64
Weak Weak-fair Weak-moderate
83 164 17
Data cleaning
out_new <- out_new |>
  mutate(mining_method = fct_collapse(mining_method,
                                      `Room and Pillar` = c("Room and Pillar", "Open Cast"),
                                      `Shrinkage Stoping` = c("Shrinkage Stoping", "Shrinkage", "Open Pit"),
                                      `Cut and Fill` = c("Cut and Fill"),
                                      Longwall = c("Longwall"),
                                      `Sublevel stoping` = c("Sublevel stoping", "Sublevel Stoping"))) |>
  mutate(thickness = case_when(str_detect(thickness, "hick") ~ "Thick",
                               str_detect(thickness, "Narrow") ~ "Narrow",
                               .default = "Moderate"),
         ore_stregth = case_when(str_detect(ore_stregth, "eak") ~ "Weak",
                                 str_detect(ore_stregth, "oderate") ~ "Moderate",
                                 str_detect(ore_stregth, "strong") ~ "Strong",
                                 .default = "fair"),
         host_rock_strength = case_when(str_detect(host_rock_strength, "eak") ~ "Weak",
                                        str_detect(host_rock_strength, "oderate") ~ "Moderate",
                                        str_detect(host_rock_strength, "strong") ~ "Strong",
                                        .default = "fair"),
         dip = case_when(str_detect(dip, "teep") ~ "Steep",
                         str_detect(dip, "lat") ~ "Flat",
                         str_detect(str_to_lower(dip), "deep") ~ "Deep",
                         .default = "Moderate"),
         ore_grade = case_when(str_detect(ore_grade, "high") ~ "High",
                               str_detect(ore_grade, "hallow") ~ "Shallow",
                               str_detect(str_to_lower(ore_grade), "deep") ~ "Deep",
                               .default = "Moderate"),
         ore_uniformity = case_when(str_detect(str_to_lower(ore_uniformity), "variable") ~ "variable",
                                    str_detect(str_to_lower(ore_uniformity), "uniform") ~ "uniform",
                                    .default = "Moderate"),
         depth = case_when(str_detect(str_to_lower(depth), "moderate") ~ "Moderate",
                           str_detect(str_to_lower(depth), "high") ~ "high",
                           .default = depth))
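A quick sanity check (not part of the original output) confirms that the rarer labels were folded into the retained classes:

# Class counts after collapsing the factor levels
out_new |> count(mining_method, sort = TRUE)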
Distribution of mining methods
plt_df <- out_new %>%
  group_by(mining_method) %>%
  mutate(x.lab = paste0(mining_method,
                        "\n",
                        "(n=",
                        n(),
                        ")")) |>
  group_by(x.lab) |>
  tally() |>
  mutate(pct = n / sum(n) * 100,
         lbl = sprintf("%.1f%%", pct)) |>
  mutate(variable = "variable")

plt_df %>%
  ggplot() +
  aes(x = fct_reorder(x.lab, pct), y = pct, fill = variable) +
  geom_col(width = 0.5) +
  geom_text(aes(label = lbl), position = position_stack(vjust = 0.5), size = 2.5) +
  labs(title = "", y = "percentage", x = "", fill = "") +
  ggthemes::scale_fill_economist() +
  ggthemes::theme_tufte() +
  theme(legend.position = "none",
        axis.text.x = element_text(face = "bold",
                                   color = "black",
                                   size = 8, angle = 60))
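plotly is loaded at the top but never used. If an interactive version of this chart is wanted, ggplotly() can wrap the ggplot object; a small sketch (not in the original):

# Interactive version of the distribution chart
p <- plt_df |>
  ggplot(aes(x = fct_reorder(x.lab, pct), y = pct, fill = variable)) +
  geom_col(width = 0.5) +
  theme(legend.position = "none")
plotly::ggplotly(p)  # hovering shows the exact percentage for each method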
Model building
# Fix the random numbers by setting the seed
# This enables the analysis to be reproducible
set.seed(123)
# Put 70% of the data into the training set
data_split <- initial_split(out_new,
                            prop = 0.70,
                            strata = mining_method)
# Create data frames for the two sets:
train_data <- training(data_split)
test_data <- testing(data_split)
housing_rec <-
  recipe(mining_method ~ .,
         data = train_data) %>%
  step_naomit(everything(), skip = TRUE) |>
  step_dummy(all_nominal(), -all_outcomes())

housing_rec_prep <- housing_rec |>
  prep()

mine_train <- bake(housing_rec_prep, NULL)
mine_test <- bake(housing_rec_prep, test_data)
summary(housing_rec)
# A tibble: 9 × 4
variable type role source
<chr> <list> <chr> <chr>
1 ore_stregth <chr [3]> predictor original
2 host_rock_strength <chr [3]> predictor original
3 thickness <chr [3]> predictor original
4 shape <chr [3]> predictor original
5 dip <chr [3]> predictor original
6 ore_uniformity <chr [3]> predictor original
7 depth <chr [3]> predictor original
8 ore_grade <chr [3]> predictor original
9 mining_method <chr [3]> outcome original
prepped_data <-
  housing_rec |> # use the recipe object
  prep() |>      # perform the recipe on the training data
  juice()        # extract only the preprocessed data frame
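Since bake(housing_rec_prep, NULL) above already returns the preprocessed training set, prepped_data should match mine_train; a quick check (an assumption, not shown in the original output):

all.equal(prepped_data, mine_train)  # expected: TRUE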
5-fold cross-validation
set.seed(100)
cv_folds <-
  vfold_cv(train_data,
           v = 5,
           strata = mining_method)
Set up models
# Multinomial regression (via nnet)
mtl_spec <-
  multinom_reg() |>
  set_engine("nnet") |>
  set_mode("classification")

# Random forest
rf_spec <-
  rand_forest() |>
  set_engine("ranger", importance = "impurity") |>
  set_mode("classification")

# Boosted tree (XGBoost)
xgb_spec <-
  boost_tree() |>
  set_engine("xgboost") |>
  set_mode("classification")

# K-nearest neighbor
knn_spec <-
  nearest_neighbor(neighbors = 5) |> # we can adjust the number of neighbors
  set_engine("kknn") |>
  set_mode("classification")

# Single-hidden-layer neural network
nnet_spec <-
  mlp(penalty = 0.01) %>%
  set_mode("classification") |>
  set_engine("keras", verbose = 0)
mtl_wflow <-
  workflow() %>%
  add_recipe(housing_rec) |>
  add_model(mtl_spec)

rf_wflow <-
  workflow() %>%
  add_recipe(housing_rec) |>
  add_model(rf_spec)

xgb_wflow <-
  workflow() %>%
  add_recipe(housing_rec) |>
  add_model(xgb_spec)

knn_wflow <-
  workflow() %>%
  add_recipe(housing_rec) |>
  add_model(knn_spec)

nnet_wflow <-
  workflow() %>%
  add_recipe(housing_rec) |>
  add_model(nnet_spec)
# Helper to pull the tidied parsnip fit out of a fitted workflow
get_model <- function(x) {
  extract_fit_parsnip(x) |> tidy()
}
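get_model() is defined here but not called later; a usage sketch (hypothetical, relying on broom's tidy() method for nnet::multinom fits):

mtl_fit <- mtl_wflow |> fit(data = train_data)
get_model(mtl_fit)  # tidied coefficients of the fitted multinomial model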
Neural network (multinomial regression)
mtl_res <-
  mtl_wflow |>
  fit_resamples(
    resamples = cv_folds,
    metrics = metric_set(
      f_meas, yardstick::accuracy, kap,
      roc_auc, sens),
    control = control_resamples(save_pred = TRUE)
  )

mtl_res |> collect_metrics(summarize = TRUE)
# A tibble: 5 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 accuracy multiclass 0.963 5 0.00878 Preprocessor1_Model1
2 f_meas macro 0.956 5 0.00907 Preprocessor1_Model1
3 kap multiclass 0.956 5 0.00982 Preprocessor1_Model1
4 roc_auc hand_till 0.997 5 0.00114 Preprocessor1_Model1
5 sens macro 0.958 5 0.00888 Preprocessor1_Model1
Random forest
rf_res <-
  rf_wflow |>
  fit_resamples(
    resamples = cv_folds,
    metrics = metric_set(
      f_meas, yardstick::accuracy, kap,
      roc_auc, sens),
    control = control_resamples(save_pred = TRUE)
  )

rf_res |> collect_metrics(summarize = TRUE)
# A tibble: 5 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 accuracy multiclass 0.973 5 0.00471 Preprocessor1_Model1
2 f_meas macro 0.973 5 0.00550 Preprocessor1_Model1
3 kap multiclass 0.968 5 0.00542 Preprocessor1_Model1
4 roc_auc hand_till 0.997 5 0.00258 Preprocessor1_Model1
5 sens macro 0.966 5 0.00691 Preprocessor1_Model1
XGBoost
xgb_res <-
  xgb_wflow |>
  fit_resamples(
    resamples = cv_folds,
    metrics = metric_set(
      f_meas, yardstick::accuracy, kap,
      roc_auc, sens),
    control = control_resamples(save_pred = TRUE)
  )

xgb_res |> collect_metrics(summarize = TRUE)
# A tibble: 5 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 accuracy multiclass 0.963 5 0.0108 Preprocessor1_Model1
2 f_meas macro 0.970 5 0.00851 Preprocessor1_Model1
3 kap multiclass 0.956 5 0.0122 Preprocessor1_Model1
4 roc_auc hand_till 1.00 5 0.000186 Preprocessor1_Model1
5 sens macro 0.969 5 0.00900 Preprocessor1_Model1
K-nearest neighbor
knn_res <-
  knn_wflow |>
  fit_resamples(
    resamples = cv_folds,
    metrics = metric_set(
      f_meas, yardstick::accuracy, kap,
      roc_auc, sens),
    control = control_resamples(save_pred = TRUE)
  )

knn_res |> collect_metrics(summarize = TRUE)
# A tibble: 5 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 accuracy multiclass 0.970 5 0.00989 Preprocessor1_Model1
2 f_meas macro 0.969 5 0.00865 Preprocessor1_Model1
3 kap multiclass 0.965 5 0.0113 Preprocessor1_Model1
4 roc_auc hand_till 0.987 5 0.00705 Preprocessor1_Model1
5 sens macro 0.969 5 0.0119 Preprocessor1_Model1
Compare models
Extract metrics from our models to compare them:
rf_metrics <-
  rf_res |>
  collect_metrics(summarize = TRUE) %>%
  mutate(model = "Random Forest")

xgb_metrics <-
  xgb_res |>
  collect_metrics(summarize = TRUE) %>%
  mutate(model = "XGBoost")

knn_metrics <-
  knn_res |>
  collect_metrics(summarize = TRUE) %>%
  mutate(model = "Knn")

mtl_metrics <-
  mtl_res |>
  collect_metrics(summarize = TRUE) %>%
  mutate(model = "neural net")

# create a dataframe with all models
model_compare <- bind_rows(mtl_metrics,
                           rf_metrics,
                           xgb_metrics,
                           knn_metrics)
# change data structure
model_comp <-
  model_compare |>
  select(model, .metric, mean, std_err) |>
  pivot_wider(names_from = .metric, values_from = c(mean, std_err))
# show mean F1-score for every model
model_comp |>
  arrange(mean_f_meas) |>
  mutate(model = fct_reorder(model, mean_f_meas)) |> # order results
  ggplot(aes(model, mean_f_meas, fill = model)) +
  geom_col() +
  coord_flip() +
  scale_fill_brewer(palette = "Blues") +
  geom_text(
    size = 3,
    aes(label = round(mean_f_meas, 2), y = mean_f_meas + 0.08),
    vjust = 1
  )
# show mean area under the curve (auc) per model
model_comp |>
  arrange(mean_roc_auc) |>
  mutate(model = fct_reorder(model, mean_roc_auc)) %>%
  ggplot(aes(model, mean_roc_auc, fill = model)) +
  geom_col() +
  coord_flip() +
  scale_fill_brewer(palette = "Blues") +
  geom_text(
    size = 3,
    aes(label = round(mean_roc_auc, 2), y = mean_roc_auc + 0.08),
    vjust = 1
  )
# show mean accuracy per model
model_comp |>
  arrange(mean_accuracy) |>
  mutate(model = fct_reorder(model, mean_accuracy)) %>%
  ggplot(aes(model, mean_accuracy, fill = model)) +
  geom_col() +
  coord_flip() +
  scale_fill_brewer(palette = "Blues") +
  geom_text(
    size = 3,
    aes(label = round(mean_accuracy, 2), y = mean_accuracy + 0.08),
    vjust = 1
  )
Note that the model results are all quite similar. In this example we use accuracy as the performance measure for selecting the best model. Let's find the maximum mean accuracy:
model_comp |> slice_max(mean_accuracy)
# A tibble: 1 × 11
model mean_accuracy mean_f_meas mean_kap mean_roc_auc mean_sens
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Random Forest 0.973 0.973 0.968 0.997 0.966
# ℹ 5 more variables: std_err_accuracy <dbl>, std_err_f_meas <dbl>,
# std_err_kap <dbl>, std_err_roc_auc <dbl>, std_err_sens <dbl>
# Final fit on the full training set, evaluated once on the test set.
# Note: this fits mtl_wflow (the "neural net" model), not the random forest.
last_fit_rf <- last_fit(mtl_wflow,
                        split = data_split,
                        metrics = metric_set(
                          f_meas, yardstick::accuracy, kap,
                          roc_auc, sens))
Show performance metrics
last_fit_rf |>
  collect_metrics()
# A tibble: 5 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 f_meas macro 0.984 Preprocessor1_Model1
2 accuracy multiclass 0.964 Preprocessor1_Model1
3 kap multiclass 0.955 Preprocessor1_Model1
4 sens macro 0.981 Preprocessor1_Model1
5 roc_auc hand_till 0.998 Preprocessor1_Model1
last_fit_rf %>%
  collect_predictions() |>
  conf_mat(mining_method, .pred_class) |>
  autoplot(type = "heatmap") +
  scale_fill_gradient(
    high = "#07193b",
    low = "#4a6c75"
  ) +
  labs(title = "Confusion matrix (Neural Network)")
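The same confusion matrix can also be summarised numerically; calling summary() on a conf_mat object returns accuracy, kappa, and macro-averaged class statistics (a sketch, not in the original):

# Overall and macro-averaged statistics from the confusion matrix
last_fit_rf |>
  collect_predictions() |>
  conf_mat(mining_method, .pred_class) |>
  summary()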
Let's create the ROC curves. Since this is a multiclass problem, roc_curve() needs the class-probability column for every level of mining_method:
last_fit_rf |>
  collect_predictions() |>
  roc_curve(mining_method, `.pred_Apparent mining`:`.pred_Sublevel stoping`) |>
  autoplot()
Based on all of these results, the cross-validation and test-set performance statistics are very close, so we can be fairly confident that the final model would perform well when predicting new data.
Neural net diagram
library(neuralnet)
model <- neuralnet(
  mining_method ~ .,
  data = mine_train,
  hidden = c(10, 12),
  linear.output = FALSE
)
plot(model, rep = "best")
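To use the network for prediction, predict() on a neuralnet object returns one probability column per outcome level; a minimal sketch (an assumption, not in the original), assuming the columns follow the order of the factor levels:

# Predict on the baked test set and compute a rough test-set accuracy
probs <- predict(model, mine_test)                        # class-probability matrix
pred <- levels(mine_train$mining_method)[max.col(probs)]  # most probable class per row
mean(pred == mine_test$mining_method)                     # share of correct predictions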