knitr::opts_chunk$set(cache = TRUE,
echo = TRUE,
message = FALSE,
warning = FALSE,
fig.path = "static",
fig.height=6,
fig.width = 1.777777*6,
fig.align='center',
tidy = FALSE,
comment = NA,
highlight = TRUE,
prompt = FALSE,
crop = TRUE,
comment = "#>",
collapse = TRUE)
knitr::opts_knit$set(width = 60)
library(tidyverse)
library(reshape2)
theme_set(theme_light(base_size = 16))
make_latex_decorator <- function(output, otherwise) {
function() {
if (knitr:::is_latex_output()) output else otherwise
}
}
insert_pause <- make_latex_decorator(". . .", "\n")
insert_slide_break <- make_latex_decorator("----", "\n")
insert_inc_bullet <- make_latex_decorator("> *", "*")
insert_html_math <- make_latex_decorator("", "$$")Comparing and Machine Learning Models
library(tidymodels)
library(tidyverse)
library(data.table)
library(DT)
library(ggthemes)geneorama::sourceDir("CODE/functions/")
#> calculate_confusion_values.R :
#> calculate_heat_values.R :
#> calculate_violation_matrix.R :
#> calculate_violation_types.R :
#> categorize.R :
#> filter_business.R :
#> filter_crime.R :
#> filter_foodInspect.R :
#> filter_garbageCarts.R :
#> filter_sanitationComplaints.R :
#> find_bus_id_matches.R :
#> GenerateOtherLicenseInfo.R :
#> gini.R :
#> kde.R :
#> logLik.R :
#> nokey.R :
#> simulated_bin_summary.R :
#> simulated_date_diff_mean.R :
#> weather_3day_calc.R :
food <- readRDS("DATA/23_food_insp_features.Rds")
bus <- readRDS("DATA/24_bus_features.Rds")
sanitarians <- readRDS("DATA/19_inspector_assignments.Rds")
weather <- readRDS("DATA/17_mongo_weather_update.Rds")
heat_burglary <- readRDS("DATA/22_burglary_heat.Rds")
heat_garbage <- readRDS("DATA/22_garbageCarts_heat.Rds")
heat_sanitation <- readRDS("DATA/22_sanitationComplaints_heat.Rds")
##==============================================================================
## MERGE IN FEATURES
##==============================================================================
sanitarians <- sanitarians[,list(Inspection_ID=inspectionID), keyby=sanitarian]
setnames(heat_burglary, "heat_values", "heat_burglary")
setnames(heat_garbage, "heat_values", "heat_garbage")
setnames(heat_sanitation, "heat_values", "heat_sanitation")
dat <- copy(food)
dat <- dat[bus]
dat <- merge(x = dat, y = sanitarians, by = "Inspection_ID")
dat <- merge(x = dat, y = weather_3day_calc(weather), by = "Inspection_Date")
dat <- merge(dat, na.omit(heat_burglary), by = "Inspection_ID")
dat <- merge(dat, na.omit(heat_garbage), by = "Inspection_ID")
dat <- merge(dat, na.omit(heat_sanitation), by = "Inspection_ID")
## Set the key for dat
setkey(dat, Inspection_ID)
## Remove unnecessary data
rm(food, bus, sanitarians, weather, heat_burglary, heat_garbage, heat_sanitation)
## Only the model data should be present
geneorama::lll()
#> Class KB Dims
#> dat data.table 19072 22300 x 75
##==============================================================================
## FILTER ROWS
##==============================================================================
dat <- dat[LICENSE_DESCRIPTION=="Retail Food Establishment"]
dat
#> Inspection_ID Inspection_Date DBA_Name
#> 1: 269961 2013-01-31 SEVEN STAR
#> 2: 507211 2011-10-18 PANERA BREAD
#> 3: 507212 2011-10-18 LITTLE QUIAPO RESTAURANT
#> 4: 507216 2011-10-19 SERGIO'S TAQUERIA PIZZA INC.
#> 5: 507219 2011-10-20 TARGET STORE # T-2079
#> ---
#> 18777: 1506243 2014-10-29 CENTRAL GYROS CORP
#> 18778: 1506245 2014-10-30 TORTILLA SOUP
#> 18779: 1506247 2014-10-31 GROTA RESTAURANT
#> 18780: 1508207 2014-10-27 R U HUNGRY?
#> 18781: 1513208 2014-10-31 EL RANCHITO FOOD
#> AKA_Name License Facility_Type Risk
#> 1: SEVEN STAR 30790 Grocery Store Risk 3 (Low)
#> 2: PANERA BREAD 1475890 Restaurant Risk 1 (High)
#> 3: LITTLE QUIAPO RESTAURANT 1740130 Restaurant Risk 1 (High)
#> 4: SERGIO'S TAQUERIA PIZZA 1447363 Restaurant Risk 1 (High)
#> 5: TARGET 1679459 Restaurant Risk 2 (Medium)
#> ---
#> 18777: CENTRAL GYROS 28851 Restaurant Risk 1 (High)
#> 18778: TORTILLA SOUP 78989 Restaurant Risk 1 (High)
#> 18779: GROTA RESTAURANT 6753 Restaurant Risk 1 (High)
#> 18780: R U HUNGRY? 2008741 Restaurant Risk 1 (High)
#> 18781: EL RANCHITO FOOD 1383420 Grocery Store Risk 2 (Medium)
#> Address City State Zip Inspection_Type Results
#> 1: 3352 N BROADWAY CHICAGO IL 60657 Canvass Pass
#> 2: 6059 N LINCOLN AVE CHICAGO IL 60659 Canvass Pass
#> 3: 6259 N MCCORMICK RD CHICAGO IL 60659 Canvass Fail
#> 4: 3253 W BRYN MAWR AVE CHICAGO IL 60659 Canvass Pass
#> 5: 2112 W PETERSON AVE CHICAGO IL 60659 Canvass Fail
#> ---
#> 18777: 3127 N CENTRAL AVE CHICAGO IL 60634 Canvass Pass
#> 18778: 3809-11 N HARLEM AVE CHICAGO IL 60634 Canvass Pass
#> 18779: 3108-3112 N CENTRAL AVE CHICAGO IL 60634 Canvass Pass
#> 18780: 1724 W 51st ST CHICAGO IL 60609 Canvass Pass
#> 18781: 8327-29 S PULASKI RD CHICAGO IL 60652 Canvass Pass
#> Latitude Longitude Location
#> 1: 41.94336 -87.64500 (41.943359344775146, -87.64499875300952)
#> 2: 41.99192 -87.70963 (41.99191947239194, -87.70963133440333)
#> 3: 41.99563 -87.71271 (41.99563177556418, -87.71270678169132)
#> 4: 41.98293 -87.71098 (41.982933189164974, -87.71098225381141)
#> 5: 41.99073 -87.68298 (41.99072921796059, -87.68297945359863)
#> ---
#> 18777: 41.93758 -87.76631 (41.93758152034964, -87.76631489179631)
#> 18778: 41.94922 -87.80700 (41.94921947071411, -87.80699727423111)
#> 18779: 41.93707 -87.76659 (41.93706710815131, -87.76659233846347)
#> 18780: 41.80140 -87.66837 (41.80139599557942, -87.66836713634461)
#> 18781: 41.74137 -87.72151 (41.741374166961236, -87.72150639395679)
#> Facility_Type_Clean criticalCount seriousCount minorCount pass_flag
#> 1: Other 0 0 2 1
#> 2: Restaurant 0 0 3 1
#> 3: Restaurant 0 2 6 0
#> 4: Restaurant 0 0 6 1
#> 5: Restaurant 0 2 6 0
#> ---
#> 18777: Restaurant 0 0 6 1
#> 18778: Restaurant 0 0 4 1
#> 18779: Restaurant 0 0 4 1
#> 18780: Restaurant 0 0 1 1
#> 18781: Other 0 0 8 1
#> fail_flag pastFail pastCritical pastSerious pastMinor timeSinceLast
#> 1: 0 0 0 0 0 2.0000000
#> 2: 0 0 0 0 0 2.0000000
#> 3: 1 0 0 0 0 2.0000000
#> 4: 0 0 0 0 0 2.0000000
#> 5: 1 0 0 0 0 2.0000000
#> ---
#> 18777: 0 0 0 0 7 0.7150685
#> 18778: 0 0 0 0 7 0.5808219
#> 18779: 0 0 0 0 8 0.4657534
#> 18780: 0 0 0 0 2 0.6136986
#> 18781: 0 0 0 0 4 0.9041096
#> firstRecord ID LICENSE_ID ACCOUNT_NUMBER
#> 1: 1 30790-20110416 2081412 63759
#> 2: 1 1475890-20110416 2081695 207283
#> 3: 1 1740130-20110216 2070145 3107
#> 4: 1 1447363-20110216 2071895 270993
#> 5: 1 1679459-20100216 2009972 15538
#> ---
#> 18777: 0 28851-20131016 2279699 3542
#> 18778: 0 78989-20131016 2280040 23219
#> 18779: 0 6753-20131016 2279707 3673
#> 18780: 0 2008741-20140116 2297606 349243
#> 18781: 0 1383420-20131116 2286790 266695
#> LEGAL_NAME DOING_BUSINESS_AS_NAME
#> 1: VIRGINIA DELA ROSA SEVEN STAR
#> 2: PANERA, LLC PANERA BREAD
#> 3: ENELITA GARCIA LITTLE QUIAPO RESTAURANT
#> 4: SERGIO'S TAZUERIA PIZZA INC. SERGIO'S TAQUERIA PIZZA INC.
#> 5: TARGET CORPORATION TARGET STORE # T-2079
#> ---
#> 18777: CENTRAL GYROS, CORP. CENTRAL GYROS CORP
#> 18778: HARLEM-GRACE, INC. TORTILLA SOUP
#> 18779: GROTA, INC. GROTA RESTAURANT
#> 18780: R U HUNGRY?, LTD. R U HUNGRY?
#> 18781: EL RANCHITO FOOD, INC. EL RANCHITO FOOD
#> ADDRESS CITY STATE ZIP_CODE WARD PRECINCT
#> 1: 3352 N BROADWAY 1 CHICAGO IL 60657 44 33
#> 2: 6059 N LINCOLN AVE C CHICAGO IL 60659 50 23
#> 3: 6259 N MCCORMICK RD CHICAGO IL 60659 50 25
#> 4: 3253 W BRYN MAWR AVE CHICAGO IL 60659 39 48
#> 5: 2112 W PETERSON AVE CHICAGO IL 60659 40 18
#> ---
#> 18777: 3127 N CENTRAL AVE 1ST CHICAGO IL 60634 31 37
#> 18778: 3809-11 N HARLEM AVE 1ST CHICAGO IL 60634 38 3
#> 18779: 3108-3112 N CENTRAL AVE 1ST CHICAGO IL 60634 31 37
#> 18780: 1724 W 51ST ST 1 CHICAGO IL 60609 20 4
#> 18781: 8327-29 S PULASKI RD CHICAGO IL 60652 18 11
#> WARD_PRECINCT POLICE_DISTRICT LICENSE_CODE LICENSE_DESCRIPTION
#> 1: 44-33 19 1006 Retail Food Establishment
#> 2: 50-23 24 1006 Retail Food Establishment
#> 3: 50-25 17 1006 Retail Food Establishment
#> 4: 39-48 17 1006 Retail Food Establishment
#> 5: 40-18 24 1006 Retail Food Establishment
#> ---
#> 18777: 31-37 25 1006 Retail Food Establishment
#> 18778: 38-3 16 1006 Retail Food Establishment
#> 18779: 31-37 25 1006 Retail Food Establishment
#> 18780: 20-4 9 1006 Retail Food Establishment
#> 18781: 18-11 8 1006 Retail Food Establishment
#> BUSINESS_ACTIVITY_ID BUSINESS_ACTIVITY LICENSE_NUMBER
#> 1: 775 Retail Sales of Perishable Foods 30790
#> 2: 775 Retail Sales of Perishable Foods 1475890
#> 3: 775 Retail Sales of Perishable Foods 1740130
#> 4: 775 Retail Sales of Perishable Foods 1447363
#> 5: 775 Retail Sales of Perishable Foods 1679459
#> ---
#> 18777: 775 Retail Sales of Perishable Foods 28851
#> 18778: 775 Retail Sales of Perishable Foods 78989
#> 18779: 775 Retail Sales of Perishable Foods 6753
#> 18780: 775 Retail Sales of Perishable Foods 2008741
#> 18781: 775 Retail Sales of Perishable Foods 1383420
#> APPLICATION_TYPE LICENSE_TERM_START_DATE LICENSE_TERM_EXPIRATION_DATE
#> 1: RENEW 2011-04-16 2013-04-15
#> 2: RENEW 2011-04-16 2013-04-15
#> 3: RENEW 2011-02-16 2013-02-15
#> 4: RENEW 2011-02-16 2013-02-15
#> 5: RENEW 2010-02-16 2012-02-15
#> ---
#> 18777: RENEW 2013-10-16 2015-10-15
#> 18778: RENEW 2013-10-16 2015-10-15
#> 18779: RENEW 2013-10-16 2015-10-15
#> 18780: RENEW 2014-01-16 2016-01-15
#> 18781: RENEW 2013-11-16 2015-11-15
#> LICENSE_STATUS LATITUDE LONGITUDE minDate maxDate ageAtInspection
#> 1: AAI 41.94336 -87.64500 2002-02-16 2015-04-15 10.964384
#> 2: AAI 41.99192 -87.70963 2004-05-05 2019-04-15 7.457534
#> 3: AAI 41.99563 -87.71271 2007-03-22 2017-02-15 4.578082
#> 4: AAI 41.98293 -87.71098 2003-12-31 2019-02-15 7.805479
#> 5: AAI 41.99073 -87.68298 2006-07-07 2018-02-15 5.290411
#> ---
#> 18777: AAI 41.93758 -87.76631 2002-05-16 2019-10-15 12.463014
#> 18778: AAI 41.94922 -87.80700 2002-11-16 2017-10-15 11.961644
#> 18779: AAI 41.93707 -87.76659 2002-05-16 2019-10-15 12.468493
#> 18780: AAI 41.80140 -87.66837 2009-12-29 2018-01-15 4.830137
#> 18781: AAI 41.74137 -87.72151 2003-09-29 2019-11-15 11.095890
#> consumption_on_premises_incidental_activity tobacco package_goods
#> 1: 0 1 0
#> 2: 0 0 0
#> 3: 0 0 0
#> 4: 0 0 0
#> 5: 0 0 1
#> ---
#> 18777: 1 0 0
#> 18778: 1 0 0
#> 18779: 1 0 0
#> 18780: 0 0 0
#> 18781: 0 0 0
#> outdoor_patio public_place_of_amusement limited_business_license
#> 1: 0 0 1
#> 2: 0 0 0
#> 3: 0 0 0
#> 4: 0 0 0
#> 5: 0 0 1
#> ---
#> 18777: 0 0 0
#> 18778: 0 0 0
#> 18779: 0 0 0
#> 18780: 0 0 0
#> 18781: 0 0 0
#> childrens_services_facility_license tavern regulated_business_license
#> 1: 0 0 0
#> 2: 0 0 0
#> 3: 0 0 0
#> 4: 0 0 0
#> 5: 0 0 0
#> ---
#> 18777: 0 0 0
#> 18778: 0 0 0
#> 18779: 0 0 0
#> 18780: 0 0 0
#> 18781: 0 0 0
#> filling_station caterers_liquor_license mobile_food_license sanitarian
#> 1: 0 0 0 green
#> 2: 0 0 0 blue
#> 3: 0 0 0 blue
#> 4: 0 0 0 blue
#> 5: 0 0 0 blue
#> ---
#> 18777: 0 0 0 brown
#> 18778: 0 0 0 brown
#> 18779: 0 0 0 brown
#> 18780: 0 0 0 brown
#> 18781: 0 0 0 orange
#> precipIntensity temperatureMax windSpeed humidity heat_burglary
#> 1: 0.0145866667 53.49667 13.340000 0.9000000 26.992376
#> 2: 0.0019066667 59.04667 13.016667 0.5500000 13.976557
#> 3: 0.0019066667 59.04667 13.016667 0.5500000 12.611239
#> 4: 0.0027366667 56.15333 10.863333 0.6166667 35.906383
#> 5: 0.0099866667 52.73000 16.266667 0.6900000 9.530785
#> ---
#> 18777: 0.0018000000 65.67333 8.430000 0.6466667 19.974219
#> 18778: 0.0017333333 62.49000 11.026667 0.6800000 14.219000
#> 18779: 0.0010666667 54.76333 9.256667 0.7033333 21.580979
#> 18780: 0.0001333333 62.60000 4.690000 0.6700000 49.906934
#> 18781: 0.0010666667 54.76333 9.256667 0.7033333 32.749393
#> heat_garbage heat_sanitation
#> 1: 12.768572 37.748787
#> 2: 12.895485 15.412267
#> 3: 8.004220 8.320330
#> 4: 26.238645 38.186618
#> 5: 3.401567 2.126788
#> ---
#> 18777: 26.320516 33.413499
#> 18778: 23.563573 12.189961
#> 18779: 27.637291 27.170901
#> 18780: 22.384442 11.930061
#> 18781: 23.952219 23.257770dat_2014<-dat |>
mutate(Facility_Type = fct_lump(Facility_Type,4),
pass_flag = as.factor(pass_flag)) |>
filter(year(Inspection_Date)==2014) |>
mutate(pastFail = as.factor(pastFail))Distribution of Facility type
plt_df <- dat_2014 %>%
group_by(Facility_Type) %>%
mutate(x.lab=paste0(Facility_Type,
"\n",
"(n=",
n(),
")")) |>
group_by(x.lab) |>
tally() |>
mutate(pct = n/sum(n)*100,
lbl = sprintf("%.1f%%", pct)) |>
mutate(variable="variable")
plt_df %>%
ggplot() +
aes(x = fct_reorder(x.lab,pct),y=pct,fill=variable) +
geom_col(width=0.5) +
geom_text(aes(label=lbl),position=position_stack(vjust=0.5))+
labs(title = "", y = "percentage", x ="",fill="") +
ggthemes::scale_fill_economist() +
ggthemes::theme_tufte() +
theme(legend.position = "none",
axis.text.x = element_text(face="bold",
color="black",
size=8))
ggsave("rutendo_dist_facility.png",height=200,width=300, units = "mm",bg="white")Flag
plt_df <- dat_2014 %>%
group_by(pass_flag) %>%
mutate(x.lab=paste0(pass_flag,
"\n",
"(n=",
n(),
")")) |>
group_by(x.lab) |>
tally() |>
mutate(pct = n/sum(n)*100,
lbl = sprintf("%.1f%%", pct)) |>
mutate(variable="variable")
plt_df %>%
ggplot() +
aes(x = fct_reorder(x.lab,pct),y=pct,fill=variable) +
geom_col(width=0.5) +
geom_text(aes(label=lbl),position=position_stack(vjust=0.5))+
labs(title = "", y = "percentage", x ="",fill="") +
ggthemes::scale_fill_economist() +
ggthemes::theme_tufte() +
theme(legend.position = "none",
axis.text.x = element_text(face="bold",
color="black",
size=8))
ggsave("rutendo_dist_flag.png",height=200,width=300, units = "mm",bg="white")dat_2014 |>
count(pass_flag, Facility_Type) |>
ggplot(aes(Facility_Type, pass_flag, fill = n)) +
geom_tile(color = "white") +
geom_text(aes(label = number(n, big.mark = ","))) +
scale_fill_binned(low = "purple", high = "#07193b",
label = number_format(big.mark = ",")) +
theme(legend.position = "top",
legend.key.width = unit(15, "mm")) +
labs(x = "Facility Type",
y = "Flag",
fill = "Frequency")
ggsave("rutendo_dist_heat.png",height=200,width=300, units = "mm",bg="white")dat_2014 |>
count(pass_flag, Risk) |>
ggplot(aes(Risk, pass_flag, fill = n)) +
geom_tile(color = "white") +
geom_text(aes(label = number(n, big.mark = ","))) +
scale_fill_binned(low = "purple", high = "#07193b",
label = number_format(big.mark = ",")) +
theme(legend.position = "top",
legend.key.width = unit(15, "mm")) +
labs(x = "Risk",
y = "Flag",
fill = "Frequency")
ggsave("rutendo_dist_risk.png",height=200,width=300, units = "mm",bg="white")Correlations
library(corrplot)
# dlookr: correlation heatmap
data_for_cor <- dat_2014 %>%
select(Facility_Type,
Risk,
windSpeed,
humidity,
heat_burglary,
heat_garbage,
heat_sanitation,
sanitarian,
precipIntensity,
temperatureMax,
ageAtInspection,
tobacco,
caterers_liquor_license,
timeSinceLast) |>
select_if(is.numeric)
corrs = cor(data_for_cor)
corrplot(corrs, type="upper", method="color", addCoef.col = "#07193b")
library(gtsummary)
p <- dat_2014 |>
select(Facility_Type,Risk,tobacco,caterers_liquor_license,Results,pastFail) |>
mutate_all(as.factor)
# summary of descriptive statistics per river (mean,standard deviation and Analysis Of Variance)
theme_gtsummary_journal("jama")
table2<-p %>%
tbl_summary(
by = Results,
type=all_categorical()~"categorical",
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} / {N} ({p}%)")) %>%
add_p(test = all_continuous() ~ "t.test",
pvalue_fun = function(x) style_pvalue(x, digits = 2)) %>%
modify_header(statistic ~ "**Test Statistic**") %>%
bold_labels() %>%
modify_fmt_fun(statistic ~ style_sigfig);table2 | Characteristic | Fail, N = 790 | Pass, N = 2,597 | Pass w/ Conditions, N = 678 | Test Statistic | p-value1 |
|---|---|---|---|---|---|
| Facility_Type, n / N (%) | 19 | 0.013 | |||
| Bakery | 26 / 790 (3.3%) | 42 / 2,597 (1.6%) | 12 / 678 (1.8%) | ||
| Grocery Store | 110 / 790 (14%) | 281 / 2,597 (11%) | 80 / 678 (12%) | ||
| Restaurant | 628 / 790 (79%) | 2,193 / 2,597 (84%) | 566 / 678 (83%) | ||
| School | 9 / 790 (1.1%) | 20 / 2,597 (0.8%) | 2 / 678 (0.3%) | ||
| Other | 17 / 790 (2.2%) | 61 / 2,597 (2.3%) | 18 / 678 (2.7%) | ||
| Risk, n / N (%) | 12 | 0.017 | |||
| Risk 1 (High) | 619 / 790 (78%) | 1,986 / 2,597 (76%) | 525 / 678 (77%) | ||
| Risk 2 (Medium) | 151 / 790 (19%) | 555 / 2,597 (21%) | 150 / 678 (22%) | ||
| Risk 3 (Low) | 20 / 790 (2.5%) | 56 / 2,597 (2.2%) | 3 / 678 (0.4%) | ||
| tobacco, n / N (%) | 1.6 | 0.44 | |||
| 0 | 718 / 790 (91%) | 2,397 / 2,597 (92%) | 623 / 678 (92%) | ||
| 1 | 72 / 790 (9.1%) | 200 / 2,597 (7.7%) | 55 / 678 (8.1%) | ||
| caterers_liquor_license, n / N (%) | 0.04 | 0.98 | |||
| 0 | 777 / 790 (98%) | 2,552 / 2,597 (98%) | 666 / 678 (98%) | ||
| 1 | 13 / 790 (1.6%) | 45 / 2,597 (1.7%) | 12 / 678 (1.8%) | ||
| pastFail, n / N (%) | 34 | <0.001 | |||
| 0 | 626 / 790 (79%) | 2,269 / 2,597 (87%) | 589 / 678 (87%) | ||
| 1 | 164 / 790 (21%) | 328 / 2,597 (13%) | 89 / 678 (13%) | ||
| 1 Pearson’s Chi-squared test | |||||
- Facility type and risk are significantly associated with results of inspection
Initial logistic model
glm(pass_flag ~ Facility_Type +
Risk +
windSpeed +
humidity +
heat_burglary +
heat_garbage +
heat_sanitation +
sanitarian +
temperatureMax +
ageAtInspection +
tobacco +
caterers_liquor_license +
pastFail +
timeSinceLast,family=binomial,data=dat_2014) |>
tbl_regression(exponentiate = TRUE)| Characteristic | OR (95% CI)1 | p-value |
|---|---|---|
| Facility_Type | ||
| Bakery | — | |
| Grocery Store | 1.16 (0.68 to 1.96) | 0.58 |
| Restaurant | 1.89 (1.17 to 3.03) | 0.008 |
| School | 1.34 (0.54 to 3.46) | 0.53 |
| Other | 1.64 (0.86 to 3.13) | 0.13 |
| Risk | ||
| Risk 1 (High) | — | |
| Risk 2 (Medium) | 1.49 (1.24 to 1.80) | <0.001 |
| Risk 3 (Low) | 2.40 (1.38 to 4.30) | 0.002 |
| windSpeed | 1.02 (1.00 to 1.05) | 0.040 |
| humidity | 1.39 (0.58 to 3.34) | 0.47 |
| heat_burglary | 1.00 (1.0 to 1.00) | 0.69 |
| heat_garbage | 1.00 (0.99 to 1.00) | 0.063 |
| heat_sanitation | 1.00 (1.00 to 1.00) | 0.69 |
| sanitarian | ||
| blue | — | |
| brown | 7.72 (5.49 to 11.1) | <0.001 |
| green | 2.40 (1.97 to 2.92) | <0.001 |
| orange | 1.48 (1.22 to 1.80) | <0.001 |
| purple | 0.47 (0.35 to 0.63) | <0.001 |
| yellow | 3.35 (2.67 to 4.21) | <0.001 |
| temperatureMax | 1.00 (1.0 to 1.00) | 0.51 |
| ageAtInspection | 1.01 (0.99 to 1.02) | 0.37 |
| tobacco | 1.06 (0.78 to 1.46) | 0.70 |
| caterers_liquor_license | 1.28 (0.76 to 2.20) | 0.35 |
| pastFail | ||
| 0 | — | |
| 1 | 0.62 (0.51 to 0.75) | <0.001 |
| timeSinceLast | 0.72 (0.63 to 0.83) | <0.001 |
| 1 OR = Odds Ratio, CI = Confidence Interval | ||
The principle question is whether we can reasonably determine the probability that a restaurant inspection will yield at least one critical violation. That is, the focus will be whether or not any critical violation is found—a binary response. We use a glmnet model to estimate the impact.
the results above suggest that facility type has a significant effect on the results of inpection , with all facility types having odds ratios above 1 . However restaurants have increased odds of yielding at least one critical violation in comparison to bakeries and as well have a significant estimate
risk 2 and 3 significantly increase the odds of yielding at least one critical violation.
heat_garbage is a significant predictor though its effect is not that much on the final result of inspection
Model building
# Fix the random numbers by setting the seed
# This enables the analysis to be reproducible
set.seed(123)
# Put 3/4 of the data into the training set
data_split <- initial_split(dat_2014,
prop = 0.70,
strata = pass_flag)
# Create dataframes for the two sets:
train_data <- training(data_split)
test_data <- testing(data_split)housing_rec <-
recipe(pass_flag ~ Facility_Type +
Risk +
windSpeed +
humidity +
heat_burglary +
heat_garbage +
heat_sanitation +
sanitarian +
temperatureMax +
ageAtInspection +
tobacco +
caterers_liquor_license +
pastFail +
timeSinceLast,
data = train_data) %>%
step_naomit(everything(), skip = TRUE) |>
step_dummy(all_nominal(), -all_outcomes()) |>
themis::step_smote(pass_flag)housing_rec_prep<-housing_rec |>
prep()summary(housing_rec)
#> # A tibble: 15 × 4
#> variable type role source
#> <chr> <list> <chr> <chr>
#> 1 Facility_Type <chr [3]> predictor original
#> 2 Risk <chr [3]> predictor original
#> 3 windSpeed <chr [2]> predictor original
#> 4 humidity <chr [2]> predictor original
#> 5 heat_burglary <chr [2]> predictor original
#> 6 heat_garbage <chr [2]> predictor original
#> 7 heat_sanitation <chr [2]> predictor original
#> 8 sanitarian <chr [3]> predictor original
#> 9 temperatureMax <chr [2]> predictor original
#> 10 ageAtInspection <chr [2]> predictor original
#> 11 tobacco <chr [2]> predictor original
#> 12 caterers_liquor_license <chr [2]> predictor original
#> 13 pastFail <chr [3]> predictor original
#> 14 timeSinceLast <chr [2]> predictor original
#> 15 pass_flag <chr [3]> outcome original
prepped_data <-
housing_rec |># use the recipe object
prep() |># perform the recipe on training data
juice() # extract only the preprocessed dataframe 5 fold cross validation
set.seed(100)
cv_folds <-
vfold_cv(train_data,
v = 5,
strata = pass_flag) setup models
# Random forest
rf_spec <-
rand_forest() |>
set_engine("ranger", importance = "impurity") |>
set_mode("classification")#Boosted tree (XGBoost)
xgb_spec <-
boost_tree() |>
set_engine("xgboost") |>
set_mode("classification") # K-nearest neighbor
knn_spec <-
nearest_neighbor(neighbors = 5) |># we can adjust the number of neighbors
set_engine("kknn") |>
set_mode("classification") # Decision Tree
dt_spec <-
decision_tree() %>%
set_mode("classification") |>
set_engine("rpart") # Logistic Regression
log_spec<-logistic_reg() |>
set_mode("classification") |>
set_engine("glm")
log_wflow <-
workflow() %>%
add_recipe(housing_rec) |>
add_model(log_spec)
rf_wflow <-
workflow() %>%
add_recipe(housing_rec) |>
add_model(rf_spec) xgb_wflow <-
workflow() %>%
add_recipe(housing_rec) |>
add_model(xgb_spec)
knn_wflow <-
workflow() %>%
add_recipe(housing_rec) |>
add_model(knn_spec)dt_wflow <-
workflow() %>%
add_recipe(housing_rec) |>
add_model(dt_spec)get_model <- function(x) {
extract_fit_parsnip(x) |>tidy()
}Logistic Regression
log_res <-
log_wflow |>
fit_resamples(
resamples = cv_folds,
metrics = metric_set(
f_meas,
yardstick::accuracy, kap,
roc_auc, sens),
control = control_resamples(save_pred = TRUE)
)
log_res |> collect_metrics(summarize = TRUE)
#> # A tibble: 5 × 6
#> .metric .estimator mean n std_err .config
#> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 accuracy binary 0.628 5 0.0104 Preprocessor1_Model1
#> 2 f_meas binary 0.561 5 0.00879 Preprocessor1_Model1
#> 3 kap binary 0.250 5 0.0176 Preprocessor1_Model1
#> 4 roc_auc binary 0.683 5 0.0121 Preprocessor1_Model1
#> 5 sens binary 0.657 5 0.0114 Preprocessor1_Model1Decision Tree
dt_res <-
dt_wflow |>
fit_resamples(
resamples = cv_folds,
metrics = metric_set(
f_meas,
yardstick::accuracy, kap,
roc_auc, sens),
control = control_resamples(save_pred = TRUE)
)
dt_res |> collect_metrics(summarize = TRUE)
#> # A tibble: 5 × 6
#> .metric .estimator mean n std_err .config
#> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 accuracy binary 0.637 5 0.0120 Preprocessor1_Model1
#> 2 f_meas binary 0.495 5 0.0190 Preprocessor1_Model1
#> 3 kap binary 0.213 5 0.0235 Preprocessor1_Model1
#> 4 roc_auc binary 0.650 5 0.0122 Preprocessor1_Model1
#> 5 sens binary 0.496 5 0.0336 Preprocessor1_Model1dt_res[[3]]
#> [[1]]
#> # A tibble: 5 × 4
#> .metric .estimator .estimate .config
#> <chr> <chr> <dbl> <chr>
#> 1 f_meas binary 0.538 Preprocessor1_Model1
#> 2 accuracy binary 0.677 Preprocessor1_Model1
#> 3 kap binary 0.290 Preprocessor1_Model1
#> 4 sens binary 0.519 Preprocessor1_Model1
#> 5 roc_auc binary 0.682 Preprocessor1_Model1
#>
#> [[2]]
#> # A tibble: 5 × 4
#> .metric .estimator .estimate .config
#> <chr> <chr> <dbl> <chr>
#> 1 f_meas binary 0.507 Preprocessor1_Model1
#> 2 accuracy binary 0.649 Preprocessor1_Model1
#> 3 kap binary 0.235 Preprocessor1_Model1
#> 4 sens binary 0.5 Preprocessor1_Model1
#> 5 roc_auc binary 0.664 Preprocessor1_Model1
#>
#> [[3]]
#> # A tibble: 5 × 4
#> .metric .estimator .estimate .config
#> <chr> <chr> <dbl> <chr>
#> 1 f_meas binary 0.454 Preprocessor1_Model1
#> 2 accuracy binary 0.614 Preprocessor1_Model1
#> 3 kap binary 0.156 Preprocessor1_Model1
#> 4 sens binary 0.444 Preprocessor1_Model1
#> 5 roc_auc binary 0.616 Preprocessor1_Model1
#>
#> [[4]]
#> # A tibble: 5 × 4
#> .metric .estimator .estimate .config
#> <chr> <chr> <dbl> <chr>
#> 1 f_meas binary 0.447 Preprocessor1_Model1
#> 2 accuracy binary 0.634 Preprocessor1_Model1
#> 3 kap binary 0.176 Preprocessor1_Model1
#> 4 sens binary 0.410 Preprocessor1_Model1
#> 5 roc_auc binary 0.628 Preprocessor1_Model1
#>
#> [[5]]
#> # A tibble: 5 × 4
#> .metric .estimator .estimate .config
#> <chr> <chr> <dbl> <chr>
#> 1 f_meas binary 0.530 Preprocessor1_Model1
#> 2 accuracy binary 0.613 Preprocessor1_Model1
#> 3 kap binary 0.209 Preprocessor1_Model1
#> 4 sens binary 0.605 Preprocessor1_Model1
#> 5 roc_auc binary 0.661 Preprocessor1_Model1Random forest
rf_res <-
rf_wflow |>
fit_resamples(
resamples = cv_folds,
metrics = metric_set(
f_meas,
yardstick::accuracy, kap,
roc_auc, sens),
control = control_resamples(save_pred = TRUE)
)
rf_res |> collect_metrics(summarize = TRUE)
#> # A tibble: 5 × 6
#> .metric .estimator mean n std_err .config
#> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 accuracy binary 0.654 5 0.00523 Preprocessor1_Model1
#> 2 f_meas binary 0.454 5 0.00892 Preprocessor1_Model1
#> 3 kap binary 0.208 5 0.0101 Preprocessor1_Model1
#> 4 roc_auc binary 0.665 5 0.0125 Preprocessor1_Model1
#> 5 sens binary 0.399 5 0.0130 Preprocessor1_Model1XGBoost
xgb_res <-
xgb_wflow |>
fit_resamples(
resamples = cv_folds,
metrics = metric_set(
f_meas,
yardstick::accuracy, kap,
roc_auc, sens),
control = control_resamples(save_pred = TRUE)
)
xgb_res |>collect_metrics(summarize = TRUE)
#> # A tibble: 5 × 6
#> .metric .estimator mean n std_err .config
#> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 accuracy binary 0.638 5 0.0120 Preprocessor1_Model1
#> 2 f_meas binary 0.460 5 0.0135 Preprocessor1_Model1
#> 3 kap binary 0.190 5 0.0233 Preprocessor1_Model1
#> 4 roc_auc binary 0.646 5 0.0181 Preprocessor1_Model1
#> 5 sens binary 0.426 5 0.0129 Preprocessor1_Model1K-nearest neighbor
knn_res <-
knn_wflow |>
fit_resamples(
resamples = cv_folds,
metrics = metric_set(
f_meas,
yardstick::accuracy, kap,
roc_auc, sens),
control = control_resamples(save_pred = TRUE)
)
knn_res |>collect_metrics(summarize = TRUE)
#> # A tibble: 5 × 6
#> .metric .estimator mean n std_err .config
#> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 accuracy binary 0.601 5 0.00771 Preprocessor1_Model1
#> 2 f_meas binary 0.457 5 0.00777 Preprocessor1_Model1
#> 3 kap binary 0.142 5 0.0141 Preprocessor1_Model1
#> 4 roc_auc binary 0.604 5 0.00847 Preprocessor1_Model1
#> 5 sens binary 0.465 5 0.00900 Preprocessor1_Model1Compare models
Extract metrics from our models to compare them:
rf_metrics <-
rf_res |>
collect_metrics(summarise = TRUE) %>%
mutate(model = "Random Forest")
xgb_metrics <-
xgb_res |>
collect_metrics(summarise = TRUE) %>%
mutate(model = "XGBoost")
knn_metrics <-
knn_res |>
collect_metrics(summarise = TRUE) %>%
mutate(model = "Knn")
log_metrics <-
log_res |>
collect_metrics(summarise = TRUE) %>%
mutate(model = "Logistic Regression")
dt_metrics <-
dt_res |>
collect_metrics(summarise = TRUE) %>%
mutate(model = "Decision Tree")
# create dataframe with all models
model_compare <- bind_rows(log_metrics,
rf_metrics,
xgb_metrics,
knn_metrics,
dt_metrics
)
# change data structure
model_comp <-
model_compare |>
select(model, .metric, mean, std_err) |>
pivot_wider(names_from = .metric, values_from = c(mean, std_err))
# show mean F1-Score for every model
model_comp |>
arrange(mean_f_meas) |>
mutate(model = fct_reorder(model, mean_f_meas)) |># order results
ggplot(aes(fct_reorder(model, mean_f_meas),mean_f_meas, fill=model)) +
labs(x="Model") +
geom_col() +
coord_flip() +
scale_fill_tableau()+
geom_text(
size = 3,
aes(label = round(mean_f_meas, 2), y = mean_f_meas + 0.08),
vjust = 1
)
ggsave("f_measure.png",height=200,width=300, units = "mm",bg="white")
# show mean area under the curve (auc) per model
model_comp |>
arrange(mean_roc_auc) |>
mutate(model = fct_reorder(model, mean_roc_auc)) %>%
ggplot(aes(fct_reorder(model, mean_roc_auc),mean_roc_auc, fill=model)) +
labs(x="Model") +
geom_col() +
coord_flip() +
scale_fill_tableau()+
geom_text(
size = 3,
aes(label = round(mean_roc_auc, 2), y = mean_roc_auc + 0.08),
vjust = 1
)
ggsave("roc_auc.png",height=200,width=300, units = "mm",bg="white")
# show mean accuracy per model
model_comp |>
arrange(mean_accuracy) |>
mutate(model = fct_reorder(model, mean_accuracy)) %>%
ggplot(aes(fct_reorder(model, mean_accuracy),mean_accuracy, fill=model)) +
labs(x="Model") +
geom_col() +
coord_flip() +
scale_fill_tableau() +
geom_text(
size = 3,
aes(label = round(mean_accuracy, 2), y = mean_accuracy + 0.08),
vjust = 1
)
ggsave("rutendo_accuracy.png",height=200,width=300, units = "mm",bg="white")Note that the model results are all quite similar. In our example we choose the Accuracy as performance measure to select the best model. Let’s find the maximum mean Accuracy:
model_comp |>slice_max(mean_roc_auc)
#> # A tibble: 1 × 11
#> model mean_accuracy mean_f_meas mean_kap mean_roc_auc mean_sens
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Logistic Regression 0.628 0.561 0.250 0.683 0.657
#> # ℹ 5 more variables: std_err_accuracy <dbl>, std_err_f_meas <dbl>,
#> # std_err_kap <dbl>, std_err_roc_auc <dbl>, std_err_sens <dbl>last_fit_rf <- last_fit(log_wflow,
split = data_split,
metrics = metric_set(
f_meas,
yardstick::accuracy, kap,
roc_auc, sens)
)Show performance metrics
last_fit_rf |>
collect_metrics()
#> # A tibble: 5 × 4
#> .metric .estimator .estimate .config
#> <chr> <chr> <dbl> <chr>
#> 1 f_meas binary 0.551 Preprocessor1_Model1
#> 2 accuracy binary 0.623 Preprocessor1_Model1
#> 3 kap binary 0.237 Preprocessor1_Model1
#> 4 sens binary 0.639 Preprocessor1_Model1
#> 5 roc_auc binary 0.676 Preprocessor1_Model1
last_fit_rf %>%
collect_predictions() |>
conf_mat(pass_flag, .pred_class) |>
autoplot(type = "heatmap") +
scale_fill_gradient(
high = "#07193b",
low = "#4a6c75"
)+
labs(title="Confusion matrix (logistic Regression)")
ggsave("conf_matrix.png",height=200,width=300, units = "mm",bg="white")Let’s create the ROC curve. Again, since the event we are predicting is the first level in the Results factor (“above”), we provide roc_curve() with the relevant class probability .pred_above:
last_fit_rf |>
collect_predictions() |>
roc_curve(pass_flag,.pred_0) |>
autoplot()
ggsave("roc_curve.png",height=200,width=300, units = "mm",bg="white")last_fit_rf |>
pluck(".workflow", 1) |>
pull_workflow_fit() |>
vip::vip()
ggsave("rutendo_vip.png",height=200,width=300, units = "mm",bg="white")