Comparing and Machine Learning Models

Introduction
R
Machine Learning
Stats
Random Forest
XGBoost
SVM
KNN
Building different Models and Intercomparing
Author

Rutendo Zvada

Published

April 7, 2024

knitr::opts_chunk$set(cache = TRUE, 
                      echo = TRUE, 
                      message = FALSE, 
                      warning = FALSE,
                      fig.path = "static",
                      fig.height=6, 
                      fig.width = 1.777777*6,
                      fig.align='center',
                      tidy = FALSE, 
                      comment = NA, 
                      highlight = TRUE, 
                      prompt = FALSE, 
                      crop = TRUE,
                      comment = "#>",
                      collapse = TRUE)
knitr::opts_knit$set(width = 60)
library(tidyverse)
library(reshape2)
theme_set(theme_light(base_size = 16))
make_latex_decorator <- function(output, otherwise) {
  function() {
      if (knitr:::is_latex_output()) output else otherwise
  }
}
insert_pause <- make_latex_decorator(". . .", "\n")
insert_slide_break <- make_latex_decorator("----", "\n")
insert_inc_bullet <- make_latex_decorator("> *", "*")
insert_html_math <- make_latex_decorator("", "$$")
library(tidymodels)
library(tidyverse)
library(data.table)
library(DT)
library(ggthemes)
geneorama::sourceDir("CODE/functions/")
#> calculate_confusion_values.R :
#> calculate_heat_values.R :
#> calculate_violation_matrix.R :
#> calculate_violation_types.R :
#> categorize.R :
#> filter_business.R :
#> filter_crime.R :
#> filter_foodInspect.R :
#> filter_garbageCarts.R :
#> filter_sanitationComplaints.R :
#> find_bus_id_matches.R :
#> GenerateOtherLicenseInfo.R :
#> gini.R :
#> kde.R :
#> logLik.R :
#> nokey.R :
#> simulated_bin_summary.R :
#> simulated_date_diff_mean.R :
#> weather_3day_calc.R :
food <- readRDS("DATA/23_food_insp_features.Rds")
bus <- readRDS("DATA/24_bus_features.Rds")
sanitarians <- readRDS("DATA/19_inspector_assignments.Rds")
weather <- readRDS("DATA/17_mongo_weather_update.Rds")
heat_burglary <- readRDS("DATA/22_burglary_heat.Rds")
heat_garbage <- readRDS("DATA/22_garbageCarts_heat.Rds")
heat_sanitation <- readRDS("DATA/22_sanitationComplaints_heat.Rds")

##==============================================================================
## MERGE IN FEATURES
##==============================================================================
sanitarians <- sanitarians[,list(Inspection_ID=inspectionID), keyby=sanitarian]
setnames(heat_burglary, "heat_values", "heat_burglary")
setnames(heat_garbage, "heat_values", "heat_garbage")
setnames(heat_sanitation, "heat_values", "heat_sanitation")

dat <- copy(food)
dat <- dat[bus]
dat <- merge(x = dat,  y = sanitarians,  by = "Inspection_ID")
dat <- merge(x = dat, y = weather_3day_calc(weather), by = "Inspection_Date")
dat <- merge(dat, na.omit(heat_burglary),  by = "Inspection_ID")
dat <- merge(dat, na.omit(heat_garbage),  by = "Inspection_ID")
dat <- merge(dat, na.omit(heat_sanitation),  by = "Inspection_ID")

## Set the key for dat
setkey(dat, Inspection_ID)

## Remove unnecessary data
rm(food, bus, sanitarians, weather, heat_burglary, heat_garbage, heat_sanitation)

## Only the model data should be present
geneorama::lll()
#>          Class    KB       Dims
#> dat data.table 19072 22300 x 75

##==============================================================================
## FILTER ROWS
##==============================================================================
dat <- dat[LICENSE_DESCRIPTION=="Retail Food Establishment"]
dat
#>        Inspection_ID Inspection_Date                     DBA_Name
#>     1:        269961      2013-01-31                   SEVEN STAR
#>     2:        507211      2011-10-18                 PANERA BREAD
#>     3:        507212      2011-10-18     LITTLE QUIAPO RESTAURANT
#>     4:        507216      2011-10-19 SERGIO'S TAQUERIA PIZZA INC.
#>     5:        507219      2011-10-20        TARGET STORE # T-2079
#>    ---                                                           
#> 18777:       1506243      2014-10-29           CENTRAL GYROS CORP
#> 18778:       1506245      2014-10-30                TORTILLA SOUP
#> 18779:       1506247      2014-10-31             GROTA RESTAURANT
#> 18780:       1508207      2014-10-27                  R U HUNGRY?
#> 18781:       1513208      2014-10-31             EL RANCHITO FOOD
#>                        AKA_Name License Facility_Type            Risk
#>     1:               SEVEN STAR   30790 Grocery Store    Risk 3 (Low)
#>     2:             PANERA BREAD 1475890    Restaurant   Risk 1 (High)
#>     3: LITTLE QUIAPO RESTAURANT 1740130    Restaurant   Risk 1 (High)
#>     4:  SERGIO'S TAQUERIA PIZZA 1447363    Restaurant   Risk 1 (High)
#>     5:                   TARGET 1679459    Restaurant Risk 2 (Medium)
#>    ---                                                               
#> 18777:            CENTRAL GYROS   28851    Restaurant   Risk 1 (High)
#> 18778:            TORTILLA SOUP   78989    Restaurant   Risk 1 (High)
#> 18779:         GROTA RESTAURANT    6753    Restaurant   Risk 1 (High)
#> 18780:              R U HUNGRY? 2008741    Restaurant   Risk 1 (High)
#> 18781:         EL RANCHITO FOOD 1383420 Grocery Store Risk 2 (Medium)
#>                         Address    City State   Zip Inspection_Type Results
#>     1:         3352 N BROADWAY  CHICAGO    IL 60657         Canvass    Pass
#>     2:      6059 N LINCOLN AVE  CHICAGO    IL 60659         Canvass    Pass
#>     3:     6259 N MCCORMICK RD  CHICAGO    IL 60659         Canvass    Fail
#>     4:    3253 W BRYN MAWR AVE  CHICAGO    IL 60659         Canvass    Pass
#>     5:     2112 W PETERSON AVE  CHICAGO    IL 60659         Canvass    Fail
#>    ---                                                                     
#> 18777:      3127 N CENTRAL AVE  CHICAGO    IL 60634         Canvass    Pass
#> 18778:    3809-11 N HARLEM AVE  CHICAGO    IL 60634         Canvass    Pass
#> 18779: 3108-3112 N CENTRAL AVE  CHICAGO    IL 60634         Canvass    Pass
#> 18780:          1724 W 51st ST  CHICAGO    IL 60609         Canvass    Pass
#> 18781:    8327-29 S PULASKI RD  CHICAGO    IL 60652         Canvass    Pass
#>        Latitude Longitude                                 Location
#>     1: 41.94336 -87.64500 (41.943359344775146, -87.64499875300952)
#>     2: 41.99192 -87.70963  (41.99191947239194, -87.70963133440333)
#>     3: 41.99563 -87.71271  (41.99563177556418, -87.71270678169132)
#>     4: 41.98293 -87.71098 (41.982933189164974, -87.71098225381141)
#>     5: 41.99073 -87.68298  (41.99072921796059, -87.68297945359863)
#>    ---                                                            
#> 18777: 41.93758 -87.76631  (41.93758152034964, -87.76631489179631)
#> 18778: 41.94922 -87.80700  (41.94921947071411, -87.80699727423111)
#> 18779: 41.93707 -87.76659  (41.93706710815131, -87.76659233846347)
#> 18780: 41.80140 -87.66837  (41.80139599557942, -87.66836713634461)
#> 18781: 41.74137 -87.72151 (41.741374166961236, -87.72150639395679)
#>        Facility_Type_Clean criticalCount seriousCount minorCount pass_flag
#>     1:               Other             0            0          2         1
#>     2:          Restaurant             0            0          3         1
#>     3:          Restaurant             0            2          6         0
#>     4:          Restaurant             0            0          6         1
#>     5:          Restaurant             0            2          6         0
#>    ---                                                                    
#> 18777:          Restaurant             0            0          6         1
#> 18778:          Restaurant             0            0          4         1
#> 18779:          Restaurant             0            0          4         1
#> 18780:          Restaurant             0            0          1         1
#> 18781:               Other             0            0          8         1
#>        fail_flag pastFail pastCritical pastSerious pastMinor timeSinceLast
#>     1:         0        0            0           0         0     2.0000000
#>     2:         0        0            0           0         0     2.0000000
#>     3:         1        0            0           0         0     2.0000000
#>     4:         0        0            0           0         0     2.0000000
#>     5:         1        0            0           0         0     2.0000000
#>    ---                                                                    
#> 18777:         0        0            0           0         7     0.7150685
#> 18778:         0        0            0           0         7     0.5808219
#> 18779:         0        0            0           0         8     0.4657534
#> 18780:         0        0            0           0         2     0.6136986
#> 18781:         0        0            0           0         4     0.9041096
#>        firstRecord               ID LICENSE_ID ACCOUNT_NUMBER
#>     1:           1   30790-20110416    2081412          63759
#>     2:           1 1475890-20110416    2081695         207283
#>     3:           1 1740130-20110216    2070145           3107
#>     4:           1 1447363-20110216    2071895         270993
#>     5:           1 1679459-20100216    2009972          15538
#>    ---                                                       
#> 18777:           0   28851-20131016    2279699           3542
#> 18778:           0   78989-20131016    2280040          23219
#> 18779:           0    6753-20131016    2279707           3673
#> 18780:           0 2008741-20140116    2297606         349243
#> 18781:           0 1383420-20131116    2286790         266695
#>                          LEGAL_NAME       DOING_BUSINESS_AS_NAME
#>     1:           VIRGINIA DELA ROSA                   SEVEN STAR
#>     2:                  PANERA, LLC                 PANERA BREAD
#>     3:               ENELITA GARCIA     LITTLE QUIAPO RESTAURANT
#>     4: SERGIO'S TAZUERIA PIZZA INC. SERGIO'S TAQUERIA PIZZA INC.
#>     5:           TARGET CORPORATION        TARGET STORE # T-2079
#>    ---                                                          
#> 18777:         CENTRAL GYROS, CORP.           CENTRAL GYROS CORP
#> 18778:           HARLEM-GRACE, INC.                TORTILLA SOUP
#> 18779:                  GROTA, INC.             GROTA RESTAURANT
#> 18780:            R U HUNGRY?, LTD.                  R U HUNGRY?
#> 18781:       EL RANCHITO FOOD, INC.             EL RANCHITO FOOD
#>                            ADDRESS    CITY STATE ZIP_CODE WARD PRECINCT
#>     1:         3352 N BROADWAY   1 CHICAGO    IL    60657   44       33
#>     2:       6059 N LINCOLN AVE  C CHICAGO    IL    60659   50       23
#>     3:         6259 N MCCORMICK RD CHICAGO    IL    60659   50       25
#>     4:        3253 W BRYN MAWR AVE CHICAGO    IL    60659   39       48
#>     5:         2112 W PETERSON AVE CHICAGO    IL    60659   40       18
#>    ---                                                                 
#> 18777:      3127 N CENTRAL AVE 1ST CHICAGO    IL    60634   31       37
#> 18778:    3809-11 N HARLEM AVE 1ST CHICAGO    IL    60634   38        3
#> 18779: 3108-3112 N CENTRAL AVE 1ST CHICAGO    IL    60634   31       37
#> 18780:            1724 W 51ST ST 1 CHICAGO    IL    60609   20        4
#> 18781:        8327-29 S PULASKI RD CHICAGO    IL    60652   18       11
#>        WARD_PRECINCT POLICE_DISTRICT LICENSE_CODE       LICENSE_DESCRIPTION
#>     1:         44-33              19         1006 Retail Food Establishment
#>     2:         50-23              24         1006 Retail Food Establishment
#>     3:         50-25              17         1006 Retail Food Establishment
#>     4:         39-48              17         1006 Retail Food Establishment
#>     5:         40-18              24         1006 Retail Food Establishment
#>    ---                                                                     
#> 18777:         31-37              25         1006 Retail Food Establishment
#> 18778:          38-3              16         1006 Retail Food Establishment
#> 18779:         31-37              25         1006 Retail Food Establishment
#> 18780:          20-4               9         1006 Retail Food Establishment
#> 18781:         18-11               8         1006 Retail Food Establishment
#>        BUSINESS_ACTIVITY_ID                BUSINESS_ACTIVITY LICENSE_NUMBER
#>     1:                  775 Retail Sales of Perishable Foods          30790
#>     2:                  775 Retail Sales of Perishable Foods        1475890
#>     3:                  775 Retail Sales of Perishable Foods        1740130
#>     4:                  775 Retail Sales of Perishable Foods        1447363
#>     5:                  775 Retail Sales of Perishable Foods        1679459
#>    ---                                                                     
#> 18777:                  775 Retail Sales of Perishable Foods          28851
#> 18778:                  775 Retail Sales of Perishable Foods          78989
#> 18779:                  775 Retail Sales of Perishable Foods           6753
#> 18780:                  775 Retail Sales of Perishable Foods        2008741
#> 18781:                  775 Retail Sales of Perishable Foods        1383420
#>        APPLICATION_TYPE LICENSE_TERM_START_DATE LICENSE_TERM_EXPIRATION_DATE
#>     1:            RENEW              2011-04-16                   2013-04-15
#>     2:            RENEW              2011-04-16                   2013-04-15
#>     3:            RENEW              2011-02-16                   2013-02-15
#>     4:            RENEW              2011-02-16                   2013-02-15
#>     5:            RENEW              2010-02-16                   2012-02-15
#>    ---                                                                      
#> 18777:            RENEW              2013-10-16                   2015-10-15
#> 18778:            RENEW              2013-10-16                   2015-10-15
#> 18779:            RENEW              2013-10-16                   2015-10-15
#> 18780:            RENEW              2014-01-16                   2016-01-15
#> 18781:            RENEW              2013-11-16                   2015-11-15
#>        LICENSE_STATUS LATITUDE LONGITUDE    minDate    maxDate ageAtInspection
#>     1:            AAI 41.94336 -87.64500 2002-02-16 2015-04-15       10.964384
#>     2:            AAI 41.99192 -87.70963 2004-05-05 2019-04-15        7.457534
#>     3:            AAI 41.99563 -87.71271 2007-03-22 2017-02-15        4.578082
#>     4:            AAI 41.98293 -87.71098 2003-12-31 2019-02-15        7.805479
#>     5:            AAI 41.99073 -87.68298 2006-07-07 2018-02-15        5.290411
#>    ---                                                                        
#> 18777:            AAI 41.93758 -87.76631 2002-05-16 2019-10-15       12.463014
#> 18778:            AAI 41.94922 -87.80700 2002-11-16 2017-10-15       11.961644
#> 18779:            AAI 41.93707 -87.76659 2002-05-16 2019-10-15       12.468493
#> 18780:            AAI 41.80140 -87.66837 2009-12-29 2018-01-15        4.830137
#> 18781:            AAI 41.74137 -87.72151 2003-09-29 2019-11-15       11.095890
#>        consumption_on_premises_incidental_activity tobacco package_goods
#>     1:                                           0       1             0
#>     2:                                           0       0             0
#>     3:                                           0       0             0
#>     4:                                           0       0             0
#>     5:                                           0       0             1
#>    ---                                                                  
#> 18777:                                           1       0             0
#> 18778:                                           1       0             0
#> 18779:                                           1       0             0
#> 18780:                                           0       0             0
#> 18781:                                           0       0             0
#>        outdoor_patio public_place_of_amusement limited_business_license
#>     1:             0                         0                        1
#>     2:             0                         0                        0
#>     3:             0                         0                        0
#>     4:             0                         0                        0
#>     5:             0                         0                        1
#>    ---                                                                 
#> 18777:             0                         0                        0
#> 18778:             0                         0                        0
#> 18779:             0                         0                        0
#> 18780:             0                         0                        0
#> 18781:             0                         0                        0
#>        childrens_services_facility_license tavern regulated_business_license
#>     1:                                   0      0                          0
#>     2:                                   0      0                          0
#>     3:                                   0      0                          0
#>     4:                                   0      0                          0
#>     5:                                   0      0                          0
#>    ---                                                                      
#> 18777:                                   0      0                          0
#> 18778:                                   0      0                          0
#> 18779:                                   0      0                          0
#> 18780:                                   0      0                          0
#> 18781:                                   0      0                          0
#>        filling_station caterers_liquor_license mobile_food_license sanitarian
#>     1:               0                       0                   0      green
#>     2:               0                       0                   0       blue
#>     3:               0                       0                   0       blue
#>     4:               0                       0                   0       blue
#>     5:               0                       0                   0       blue
#>    ---                                                                       
#> 18777:               0                       0                   0      brown
#> 18778:               0                       0                   0      brown
#> 18779:               0                       0                   0      brown
#> 18780:               0                       0                   0      brown
#> 18781:               0                       0                   0     orange
#>        precipIntensity temperatureMax windSpeed  humidity heat_burglary
#>     1:    0.0145866667       53.49667 13.340000 0.9000000     26.992376
#>     2:    0.0019066667       59.04667 13.016667 0.5500000     13.976557
#>     3:    0.0019066667       59.04667 13.016667 0.5500000     12.611239
#>     4:    0.0027366667       56.15333 10.863333 0.6166667     35.906383
#>     5:    0.0099866667       52.73000 16.266667 0.6900000      9.530785
#>    ---                                                                 
#> 18777:    0.0018000000       65.67333  8.430000 0.6466667     19.974219
#> 18778:    0.0017333333       62.49000 11.026667 0.6800000     14.219000
#> 18779:    0.0010666667       54.76333  9.256667 0.7033333     21.580979
#> 18780:    0.0001333333       62.60000  4.690000 0.6700000     49.906934
#> 18781:    0.0010666667       54.76333  9.256667 0.7033333     32.749393
#>        heat_garbage heat_sanitation
#>     1:    12.768572       37.748787
#>     2:    12.895485       15.412267
#>     3:     8.004220        8.320330
#>     4:    26.238645       38.186618
#>     5:     3.401567        2.126788
#>    ---                             
#> 18777:    26.320516       33.413499
#> 18778:    23.563573       12.189961
#> 18779:    27.637291       27.170901
#> 18780:    22.384442       11.930061
#> 18781:    23.952219       23.257770
dat_2014<-dat |> 
  mutate(Facility_Type = fct_lump(Facility_Type,4),
         pass_flag = as.factor(pass_flag)) |> 
  filter(year(Inspection_Date)==2014) |> 
  mutate(pastFail = as.factor(pastFail))

Distribution of Facility type


plt_df <- dat_2014 %>%
  group_by(Facility_Type) %>%
  mutate(x.lab=paste0(Facility_Type,
                      "\n",
                      "(n=",
                      n(),
                      ")")) |> 
  group_by(x.lab) |> 
  tally() |> 
  mutate(pct = n/sum(n)*100,
         lbl = sprintf("%.1f%%", pct)) |> 
  mutate(variable="variable") 
  

plt_df %>%
 ggplot() +
  aes(x = fct_reorder(x.lab,pct),y=pct,fill=variable) +
  geom_col(width=0.5) +
  geom_text(aes(label=lbl),position=position_stack(vjust=0.5))+
  labs(title = "", y = "percentage", x ="",fill="") +
  ggthemes::scale_fill_economist() +
  ggthemes::theme_tufte() +
  theme(legend.position = "none",
        axis.text.x = element_text(face="bold", 
                                   color="black",
                                   size=8))


ggsave("rutendo_dist_facility.png",height=200,width=300, units = "mm",bg="white")

Flag

plt_df <- dat_2014 %>%
  group_by(pass_flag) %>%
  mutate(x.lab=paste0(pass_flag,
                      "\n",
                      "(n=",
                      n(),
                      ")")) |> 
  group_by(x.lab) |> 
  tally() |> 
  mutate(pct = n/sum(n)*100,
         lbl = sprintf("%.1f%%", pct)) |> 
  mutate(variable="variable") 
  

plt_df %>%
 ggplot() +
  aes(x = fct_reorder(x.lab,pct),y=pct,fill=variable) +
  geom_col(width=0.5) +
  geom_text(aes(label=lbl),position=position_stack(vjust=0.5))+
  labs(title = "", y = "percentage", x ="",fill="") +
  ggthemes::scale_fill_economist() +
  ggthemes::theme_tufte() +
  theme(legend.position = "none",
        axis.text.x = element_text(face="bold", 
                                   color="black",
                                   size=8))


ggsave("rutendo_dist_flag.png",height=200,width=300, units = "mm",bg="white")
dat_2014 |> 
  count(pass_flag, Facility_Type) |> 
  ggplot(aes(Facility_Type, pass_flag, fill = n)) +
  geom_tile(color = "white") +
  geom_text(aes(label = number(n, big.mark = ","))) +
  scale_fill_binned(low = "purple", high = "#07193b",
                    label = number_format(big.mark = ",")) +
  theme(legend.position = "top",
        legend.key.width = unit(15, "mm")) +
  labs(x = "Facility Type", 
       y = "Flag", 
       fill = "Frequency")


ggsave("rutendo_dist_heat.png",height=200,width=300, units = "mm",bg="white")
dat_2014 |> 
  count(pass_flag, Risk) |> 
  ggplot(aes(Risk, pass_flag, fill = n)) +
  geom_tile(color = "white") +
  geom_text(aes(label = number(n, big.mark = ","))) +
  scale_fill_binned(low = "purple", high = "#07193b",
                    label = number_format(big.mark = ",")) +
  theme(legend.position = "top",
        legend.key.width = unit(15, "mm")) +
  labs(x = "Risk", 
       y = "Flag", 
       fill = "Frequency")


ggsave("rutendo_dist_risk.png",height=200,width=300, units = "mm",bg="white")

Correlations

library(corrplot)
# dlookr: correlation heatmap
data_for_cor <- dat_2014 %>% 
                  select(Facility_Type,
                     Risk,
                     windSpeed,
                     humidity,
                     heat_burglary,
                     heat_garbage,
                     heat_sanitation,
                     sanitarian,
                     precipIntensity,
                     temperatureMax,
                     ageAtInspection,
                     tobacco,
                     caterers_liquor_license,
                     timeSinceLast) |> 
  select_if(is.numeric)

corrs = cor(data_for_cor)
corrplot(corrs, type="upper", method="color", addCoef.col = "#07193b")

library(gtsummary)
p <- dat_2014 |> 
  select(Facility_Type,Risk,tobacco,caterers_liquor_license,Results,pastFail) |> 
  mutate_all(as.factor)
# summary of descriptive statistics per river (mean,standard deviation and Analysis Of Variance)
theme_gtsummary_journal("jama")
table2<-p %>%
  tbl_summary(
    by = Results,
    type=all_categorical()~"categorical",
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",
      all_categorical() ~ "{n} / {N} ({p}%)")) %>% 
  add_p(test = all_continuous() ~ "t.test",
        pvalue_fun = function(x) style_pvalue(x, digits = 2)) %>% 
   modify_header(statistic ~ "**Test Statistic**") %>%
  bold_labels() %>% 
  modify_fmt_fun(statistic ~ style_sigfig);table2 
Characteristic Fail, N = 790 Pass, N = 2,597 Pass w/ Conditions, N = 678 Test Statistic p-value1
Facility_Type, n / N (%)


19 0.013
    Bakery 26 / 790 (3.3%) 42 / 2,597 (1.6%) 12 / 678 (1.8%)

    Grocery Store 110 / 790 (14%) 281 / 2,597 (11%) 80 / 678 (12%)

    Restaurant 628 / 790 (79%) 2,193 / 2,597 (84%) 566 / 678 (83%)

    School 9 / 790 (1.1%) 20 / 2,597 (0.8%) 2 / 678 (0.3%)

    Other 17 / 790 (2.2%) 61 / 2,597 (2.3%) 18 / 678 (2.7%)

Risk, n / N (%)


12 0.017
    Risk 1 (High) 619 / 790 (78%) 1,986 / 2,597 (76%) 525 / 678 (77%)

    Risk 2 (Medium) 151 / 790 (19%) 555 / 2,597 (21%) 150 / 678 (22%)

    Risk 3 (Low) 20 / 790 (2.5%) 56 / 2,597 (2.2%) 3 / 678 (0.4%)

tobacco, n / N (%)


1.6 0.44
    0 718 / 790 (91%) 2,397 / 2,597 (92%) 623 / 678 (92%)

    1 72 / 790 (9.1%) 200 / 2,597 (7.7%) 55 / 678 (8.1%)

caterers_liquor_license, n / N (%)


0.04 0.98
    0 777 / 790 (98%) 2,552 / 2,597 (98%) 666 / 678 (98%)

    1 13 / 790 (1.6%) 45 / 2,597 (1.7%) 12 / 678 (1.8%)

pastFail, n / N (%)


34 <0.001
    0 626 / 790 (79%) 2,269 / 2,597 (87%) 589 / 678 (87%)

    1 164 / 790 (21%) 328 / 2,597 (13%) 89 / 678 (13%)

1 Pearson’s Chi-squared test
Comments
  • Facility type and risk are significantly associated with results of inspection

Initial logistic model

glm(pass_flag ~ Facility_Type +
                     Risk +
                     windSpeed +
                     humidity +
                     heat_burglary +
                     heat_garbage +
                     heat_sanitation +
                     sanitarian +
                     temperatureMax +
                     ageAtInspection +
                     tobacco +
                     caterers_liquor_license +
                     pastFail +
                     timeSinceLast,family=binomial,data=dat_2014) |> 
  tbl_regression(exponentiate = TRUE)
Characteristic OR (95% CI)1 p-value
Facility_Type

    Bakery
    Grocery Store 1.16 (0.68 to 1.96) 0.58
    Restaurant 1.89 (1.17 to 3.03) 0.008
    School 1.34 (0.54 to 3.46) 0.53
    Other 1.64 (0.86 to 3.13) 0.13
Risk

    Risk 1 (High)
    Risk 2 (Medium) 1.49 (1.24 to 1.80) <0.001
    Risk 3 (Low) 2.40 (1.38 to 4.30) 0.002
windSpeed 1.02 (1.00 to 1.05) 0.040
humidity 1.39 (0.58 to 3.34) 0.47
heat_burglary 1.00 (1.0 to 1.00) 0.69
heat_garbage 1.00 (0.99 to 1.00) 0.063
heat_sanitation 1.00 (1.00 to 1.00) 0.69
sanitarian

    blue
    brown 7.72 (5.49 to 11.1) <0.001
    green 2.40 (1.97 to 2.92) <0.001
    orange 1.48 (1.22 to 1.80) <0.001
    purple 0.47 (0.35 to 0.63) <0.001
    yellow 3.35 (2.67 to 4.21) <0.001
temperatureMax 1.00 (1.0 to 1.00) 0.51
ageAtInspection 1.01 (0.99 to 1.02) 0.37
tobacco 1.06 (0.78 to 1.46) 0.70
caterers_liquor_license 1.28 (0.76 to 2.20) 0.35
pastFail

    0
    1 0.62 (0.51 to 0.75) <0.001
timeSinceLast 0.72 (0.63 to 0.83) <0.001
1 OR = Odds Ratio, CI = Confidence Interval

The principle question is whether we can reasonably determine the probability that a restaurant inspection will yield at least one critical violation. That is, the focus will be whether or not any critical violation is found—a binary response. We use a glmnet model to estimate the impact.

Note
  • the results above suggest that facility type has a significant effect on the results of inpection , with all facility types having odds ratios above 1 . However restaurants have increased odds of yielding at least one critical violation in comparison to bakeries and as well have a significant estimate

  • risk 2 and 3 significantly increase the odds of yielding at least one critical violation.

  • heat_garbage is a significant predictor though its effect is not that much on the final result of inspection

Model building


# Fix the random numbers by setting the seed 
# This enables the analysis to be reproducible 
set.seed(123)

# Put 3/4 of the data into the training set 
data_split <- initial_split(dat_2014, 
                           prop = 0.70, 
                           strata = pass_flag)

# Create dataframes for the two sets:
train_data <- training(data_split) 
test_data <- testing(data_split)
housing_rec <-
  recipe(pass_flag ~ Facility_Type +
                     Risk +
                     windSpeed +
                     humidity +
                     heat_burglary +
                     heat_garbage +
                     heat_sanitation +
                     sanitarian +
                     temperatureMax +
                     ageAtInspection +
                     tobacco +
                     caterers_liquor_license +
                     pastFail +
                     timeSinceLast,
         data = train_data) %>%
  step_naomit(everything(), skip = TRUE) |>
  step_dummy(all_nominal(), -all_outcomes()) |> 
  themis::step_smote(pass_flag)
housing_rec_prep<-housing_rec |> 
  prep()
summary(housing_rec)
#> # A tibble: 15 × 4
#>    variable                type      role      source  
#>    <chr>                   <list>    <chr>     <chr>   
#>  1 Facility_Type           <chr [3]> predictor original
#>  2 Risk                    <chr [3]> predictor original
#>  3 windSpeed               <chr [2]> predictor original
#>  4 humidity                <chr [2]> predictor original
#>  5 heat_burglary           <chr [2]> predictor original
#>  6 heat_garbage            <chr [2]> predictor original
#>  7 heat_sanitation         <chr [2]> predictor original
#>  8 sanitarian              <chr [3]> predictor original
#>  9 temperatureMax          <chr [2]> predictor original
#> 10 ageAtInspection         <chr [2]> predictor original
#> 11 tobacco                 <chr [2]> predictor original
#> 12 caterers_liquor_license <chr [2]> predictor original
#> 13 pastFail                <chr [3]> predictor original
#> 14 timeSinceLast           <chr [2]> predictor original
#> 15 pass_flag               <chr [3]> outcome   original

prepped_data <- 
  housing_rec |># use the recipe object
  prep() |># perform the recipe on training data
  juice() # extract only the preprocessed dataframe 

5 fold cross validation

set.seed(100)

cv_folds <-
 vfold_cv(train_data, 
          v = 5, 
          strata = pass_flag) 

setup models

# Random forest
rf_spec <- 
  rand_forest() |>
  set_engine("ranger", importance = "impurity") |>
  set_mode("classification")
#Boosted tree (XGBoost)
xgb_spec <- 
  boost_tree() |>
  set_engine("xgboost") |>
  set_mode("classification") 
# K-nearest neighbor
knn_spec <- 
  nearest_neighbor(neighbors = 5) |># we can adjust the number of neighbors 
  set_engine("kknn") |>
  set_mode("classification") 
# Decision Tree

dt_spec <-
  decision_tree() %>%
  set_mode("classification") |>
  set_engine("rpart") 
# Logistic Regression

log_spec<-logistic_reg() |> 
  set_mode("classification") |> 
  set_engine("glm")

log_wflow <-
 workflow() %>%
 add_recipe(housing_rec) |>
 add_model(log_spec) 

rf_wflow <-
 workflow() %>%
 add_recipe(housing_rec) |>
 add_model(rf_spec) 
xgb_wflow <-
 workflow() %>%
 add_recipe(housing_rec) |>
 add_model(xgb_spec)



knn_wflow <-
 workflow() %>%
 add_recipe(housing_rec) |>
 add_model(knn_spec)
dt_wflow <-
 workflow() %>%
 add_recipe(housing_rec) |>
 add_model(dt_spec)
get_model <- function(x) {
  extract_fit_parsnip(x) |>tidy()
}

Logistic Regression

log_res <-
  log_wflow |>
  fit_resamples(
    resamples = cv_folds, 
    metrics = metric_set(
      f_meas, 
      yardstick::accuracy, kap,
      roc_auc, sens),
    control = control_resamples(save_pred = TRUE)
    ) 

log_res |> collect_metrics(summarize = TRUE)
#> # A tibble: 5 × 6
#>   .metric  .estimator  mean     n std_err .config             
#>   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
#> 1 accuracy binary     0.628     5 0.0104  Preprocessor1_Model1
#> 2 f_meas   binary     0.561     5 0.00879 Preprocessor1_Model1
#> 3 kap      binary     0.250     5 0.0176  Preprocessor1_Model1
#> 4 roc_auc  binary     0.683     5 0.0121  Preprocessor1_Model1
#> 5 sens     binary     0.657     5 0.0114  Preprocessor1_Model1

Decision Tree

dt_res <-
  dt_wflow |>
  fit_resamples(
    resamples = cv_folds, 
    metrics = metric_set(
      f_meas, 
      yardstick::accuracy, kap,
      roc_auc, sens),
    control = control_resamples(save_pred = TRUE)
    ) 

dt_res |> collect_metrics(summarize = TRUE)
#> # A tibble: 5 × 6
#>   .metric  .estimator  mean     n std_err .config             
#>   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
#> 1 accuracy binary     0.637     5  0.0120 Preprocessor1_Model1
#> 2 f_meas   binary     0.495     5  0.0190 Preprocessor1_Model1
#> 3 kap      binary     0.213     5  0.0235 Preprocessor1_Model1
#> 4 roc_auc  binary     0.650     5  0.0122 Preprocessor1_Model1
#> 5 sens     binary     0.496     5  0.0336 Preprocessor1_Model1
dt_res[[3]]
#> [[1]]
#> # A tibble: 5 × 4
#>   .metric  .estimator .estimate .config             
#>   <chr>    <chr>          <dbl> <chr>               
#> 1 f_meas   binary         0.538 Preprocessor1_Model1
#> 2 accuracy binary         0.677 Preprocessor1_Model1
#> 3 kap      binary         0.290 Preprocessor1_Model1
#> 4 sens     binary         0.519 Preprocessor1_Model1
#> 5 roc_auc  binary         0.682 Preprocessor1_Model1
#> 
#> [[2]]
#> # A tibble: 5 × 4
#>   .metric  .estimator .estimate .config             
#>   <chr>    <chr>          <dbl> <chr>               
#> 1 f_meas   binary         0.507 Preprocessor1_Model1
#> 2 accuracy binary         0.649 Preprocessor1_Model1
#> 3 kap      binary         0.235 Preprocessor1_Model1
#> 4 sens     binary         0.5   Preprocessor1_Model1
#> 5 roc_auc  binary         0.664 Preprocessor1_Model1
#> 
#> [[3]]
#> # A tibble: 5 × 4
#>   .metric  .estimator .estimate .config             
#>   <chr>    <chr>          <dbl> <chr>               
#> 1 f_meas   binary         0.454 Preprocessor1_Model1
#> 2 accuracy binary         0.614 Preprocessor1_Model1
#> 3 kap      binary         0.156 Preprocessor1_Model1
#> 4 sens     binary         0.444 Preprocessor1_Model1
#> 5 roc_auc  binary         0.616 Preprocessor1_Model1
#> 
#> [[4]]
#> # A tibble: 5 × 4
#>   .metric  .estimator .estimate .config             
#>   <chr>    <chr>          <dbl> <chr>               
#> 1 f_meas   binary         0.447 Preprocessor1_Model1
#> 2 accuracy binary         0.634 Preprocessor1_Model1
#> 3 kap      binary         0.176 Preprocessor1_Model1
#> 4 sens     binary         0.410 Preprocessor1_Model1
#> 5 roc_auc  binary         0.628 Preprocessor1_Model1
#> 
#> [[5]]
#> # A tibble: 5 × 4
#>   .metric  .estimator .estimate .config             
#>   <chr>    <chr>          <dbl> <chr>               
#> 1 f_meas   binary         0.530 Preprocessor1_Model1
#> 2 accuracy binary         0.613 Preprocessor1_Model1
#> 3 kap      binary         0.209 Preprocessor1_Model1
#> 4 sens     binary         0.605 Preprocessor1_Model1
#> 5 roc_auc  binary         0.661 Preprocessor1_Model1

Random forest

rf_res <-
  rf_wflow |>
  fit_resamples(
    resamples = cv_folds, 
    metrics = metric_set(
      f_meas, 
      yardstick::accuracy, kap,
      roc_auc, sens),
    control = control_resamples(save_pred = TRUE)
    ) 

rf_res |> collect_metrics(summarize = TRUE)
#> # A tibble: 5 × 6
#>   .metric  .estimator  mean     n std_err .config             
#>   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
#> 1 accuracy binary     0.654     5 0.00523 Preprocessor1_Model1
#> 2 f_meas   binary     0.454     5 0.00892 Preprocessor1_Model1
#> 3 kap      binary     0.208     5 0.0101  Preprocessor1_Model1
#> 4 roc_auc  binary     0.665     5 0.0125  Preprocessor1_Model1
#> 5 sens     binary     0.399     5 0.0130  Preprocessor1_Model1

XGBoost

xgb_res <- 
  xgb_wflow |>
  fit_resamples(
    resamples = cv_folds, 
    metrics = metric_set(
     f_meas, 
      yardstick::accuracy, kap,
      roc_auc, sens),
    control = control_resamples(save_pred = TRUE)
    ) 

xgb_res |>collect_metrics(summarize = TRUE)
#> # A tibble: 5 × 6
#>   .metric  .estimator  mean     n std_err .config             
#>   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
#> 1 accuracy binary     0.638     5  0.0120 Preprocessor1_Model1
#> 2 f_meas   binary     0.460     5  0.0135 Preprocessor1_Model1
#> 3 kap      binary     0.190     5  0.0233 Preprocessor1_Model1
#> 4 roc_auc  binary     0.646     5  0.0181 Preprocessor1_Model1
#> 5 sens     binary     0.426     5  0.0129 Preprocessor1_Model1

K-nearest neighbor

knn_res <- 
  knn_wflow |>
  fit_resamples(
    resamples = cv_folds, 
    metrics = metric_set(
     f_meas, 
      yardstick::accuracy, kap,
      roc_auc, sens),
    control = control_resamples(save_pred = TRUE)
    ) 

knn_res |>collect_metrics(summarize = TRUE)
#> # A tibble: 5 × 6
#>   .metric  .estimator  mean     n std_err .config             
#>   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
#> 1 accuracy binary     0.601     5 0.00771 Preprocessor1_Model1
#> 2 f_meas   binary     0.457     5 0.00777 Preprocessor1_Model1
#> 3 kap      binary     0.142     5 0.0141  Preprocessor1_Model1
#> 4 roc_auc  binary     0.604     5 0.00847 Preprocessor1_Model1
#> 5 sens     binary     0.465     5 0.00900 Preprocessor1_Model1

Compare models

Extract metrics from our models to compare them:

rf_metrics <- 
  rf_res |>
  collect_metrics(summarise = TRUE) %>%
  mutate(model = "Random Forest")

xgb_metrics <- 
  xgb_res |>
  collect_metrics(summarise = TRUE) %>%
  mutate(model = "XGBoost")

knn_metrics <- 
  knn_res |>
  collect_metrics(summarise = TRUE) %>%
  mutate(model = "Knn")


log_metrics <- 
  log_res |>
  collect_metrics(summarise = TRUE) %>%
  mutate(model = "Logistic Regression")


dt_metrics <- 
  dt_res |>
  collect_metrics(summarise = TRUE) %>%
  mutate(model = "Decision Tree")

# create dataframe with all models
model_compare <- bind_rows(log_metrics,
                           rf_metrics,
                           xgb_metrics,
                           knn_metrics,
                           dt_metrics
                           ) 

# change data structure
model_comp <- 
  model_compare |>
  select(model, .metric, mean, std_err) |>
  pivot_wider(names_from = .metric, values_from = c(mean, std_err)) 


# show mean F1-Score for every model
model_comp |>
  arrange(mean_f_meas) |>
  mutate(model = fct_reorder(model, mean_f_meas)) |># order results
  ggplot(aes(fct_reorder(model, mean_f_meas),mean_f_meas, fill=model)) +
  labs(x="Model") +
  geom_col() +
  coord_flip() +
  scale_fill_tableau()+
   geom_text(
     size = 3,
     aes(label = round(mean_f_meas, 2), y = mean_f_meas + 0.08),
     vjust = 1
  )


ggsave("f_measure.png",height=200,width=300, units = "mm",bg="white")

# show mean area under the curve (auc) per model
model_comp |>
  arrange(mean_roc_auc) |>
  mutate(model = fct_reorder(model, mean_roc_auc)) %>%
  ggplot(aes(fct_reorder(model, mean_roc_auc),mean_roc_auc, fill=model)) +
  labs(x="Model") +
  geom_col() +
  coord_flip() +
  scale_fill_tableau()+
     geom_text(
     size = 3,
     aes(label = round(mean_roc_auc, 2), y = mean_roc_auc + 0.08),
     vjust = 1
  )


ggsave("roc_auc.png",height=200,width=300, units = "mm",bg="white")

# show mean accuracy per model
model_comp |>
  arrange(mean_accuracy) |>
  mutate(model = fct_reorder(model, mean_accuracy)) %>%
  ggplot(aes(fct_reorder(model, mean_accuracy),mean_accuracy, fill=model)) +
  labs(x="Model") +
  geom_col() +
  coord_flip() +
  scale_fill_tableau() + 
     geom_text(
     size = 3,
     aes(label = round(mean_accuracy, 2), y = mean_accuracy + 0.08),
     vjust = 1
  )


ggsave("rutendo_accuracy.png",height=200,width=300, units = "mm",bg="white")

Note that the model results are all quite similar. In our example we choose the Accuracy as performance measure to select the best model. Let’s find the maximum mean Accuracy:

model_comp |>slice_max(mean_roc_auc)
#> # A tibble: 1 × 11
#>   model               mean_accuracy mean_f_meas mean_kap mean_roc_auc mean_sens
#>   <chr>                       <dbl>       <dbl>    <dbl>        <dbl>     <dbl>
#> 1 Logistic Regression         0.628       0.561    0.250        0.683     0.657
#> # ℹ 5 more variables: std_err_accuracy <dbl>, std_err_f_meas <dbl>,
#> #   std_err_kap <dbl>, std_err_roc_auc <dbl>, std_err_sens <dbl>
last_fit_rf <- last_fit(log_wflow, 
                        split = data_split,
                        metrics = metric_set(
                          f_meas, 
                          yardstick::accuracy, kap,
                          roc_auc, sens)
                        )

Show performance metrics

last_fit_rf |>
  collect_metrics()
#> # A tibble: 5 × 4
#>   .metric  .estimator .estimate .config             
#>   <chr>    <chr>          <dbl> <chr>               
#> 1 f_meas   binary         0.551 Preprocessor1_Model1
#> 2 accuracy binary         0.623 Preprocessor1_Model1
#> 3 kap      binary         0.237 Preprocessor1_Model1
#> 4 sens     binary         0.639 Preprocessor1_Model1
#> 5 roc_auc  binary         0.676 Preprocessor1_Model1

last_fit_rf %>%
  collect_predictions() |>
  conf_mat(pass_flag, .pred_class) |>
  autoplot(type = "heatmap") +
  scale_fill_gradient(
  high = "#07193b",  
  low = "#4a6c75"  
)+
  labs(title="Confusion matrix (logistic Regression)")


ggsave("conf_matrix.png",height=200,width=300, units = "mm",bg="white")

Let’s create the ROC curve. Again, since the event we are predicting is the first level in the Results factor (“above”), we provide roc_curve() with the relevant class probability .pred_above:

last_fit_rf |>
  collect_predictions() |>
  roc_curve(pass_flag,.pred_0) |>
  autoplot()


ggsave("roc_curve.png",height=200,width=300, units = "mm",bg="white")
last_fit_rf |>
  pluck(".workflow", 1) |>  
  pull_workflow_fit() |>
  vip::vip()

ggsave("rutendo_vip.png",height=200,width=300, units = "mm",bg="white")