::opts_chunk$set(cache = TRUE,
knitrecho = TRUE,
message = FALSE,
warning = FALSE,
fig.path = "static",
fig.height=6,
fig.width = 1.777777*6,
fig.align='center',
tidy = FALSE,
comment = NA,
highlight = TRUE,
prompt = FALSE,
crop = TRUE,
comment = "#>",
collapse = TRUE)
::opts_knit$set(width = 60)
knitrlibrary(tidyverse)
library(reshape2)
theme_set(theme_light(base_size = 16))
<- function(output, otherwise) {
make_latex_decorator function() {
if (knitr:::is_latex_output()) output else otherwise
}
}<- make_latex_decorator(". . .", "\n")
insert_pause <- make_latex_decorator("----", "\n")
insert_slide_break <- make_latex_decorator("> *", "*")
insert_inc_bullet <- make_latex_decorator("", "$$") insert_html_math
Comparing and Machine Learning Models
library(tidymodels)
library(tidyverse)
library(data.table)
library(DT)
library(ggthemes)
::sourceDir("CODE/functions/")
geneorama#> calculate_confusion_values.R :
#> calculate_heat_values.R :
#> calculate_violation_matrix.R :
#> calculate_violation_types.R :
#> categorize.R :
#> filter_business.R :
#> filter_crime.R :
#> filter_foodInspect.R :
#> filter_garbageCarts.R :
#> filter_sanitationComplaints.R :
#> find_bus_id_matches.R :
#> GenerateOtherLicenseInfo.R :
#> gini.R :
#> kde.R :
#> logLik.R :
#> nokey.R :
#> simulated_bin_summary.R :
#> simulated_date_diff_mean.R :
#> weather_3day_calc.R :
<- readRDS("DATA/23_food_insp_features.Rds")
food <- readRDS("DATA/24_bus_features.Rds")
bus <- readRDS("DATA/19_inspector_assignments.Rds")
sanitarians <- readRDS("DATA/17_mongo_weather_update.Rds")
weather <- readRDS("DATA/22_burglary_heat.Rds")
heat_burglary <- readRDS("DATA/22_garbageCarts_heat.Rds")
heat_garbage <- readRDS("DATA/22_sanitationComplaints_heat.Rds")
heat_sanitation
##==============================================================================
## MERGE IN FEATURES
##==============================================================================
<- sanitarians[,list(Inspection_ID=inspectionID), keyby=sanitarian]
sanitarians setnames(heat_burglary, "heat_values", "heat_burglary")
setnames(heat_garbage, "heat_values", "heat_garbage")
setnames(heat_sanitation, "heat_values", "heat_sanitation")
<- copy(food)
dat <- dat[bus]
dat <- merge(x = dat, y = sanitarians, by = "Inspection_ID")
dat <- merge(x = dat, y = weather_3day_calc(weather), by = "Inspection_Date")
dat <- merge(dat, na.omit(heat_burglary), by = "Inspection_ID")
dat <- merge(dat, na.omit(heat_garbage), by = "Inspection_ID")
dat <- merge(dat, na.omit(heat_sanitation), by = "Inspection_ID")
dat
## Set the key for dat
setkey(dat, Inspection_ID)
## Remove unnecessary data
rm(food, bus, sanitarians, weather, heat_burglary, heat_garbage, heat_sanitation)
## Only the model data should be present
::lll()
geneorama#> Class KB Dims
#> dat data.table 19072 22300 x 75
##==============================================================================
## FILTER ROWS
##==============================================================================
<- dat[LICENSE_DESCRIPTION=="Retail Food Establishment"]
dat
dat#> Inspection_ID Inspection_Date DBA_Name
#> 1: 269961 2013-01-31 SEVEN STAR
#> 2: 507211 2011-10-18 PANERA BREAD
#> 3: 507212 2011-10-18 LITTLE QUIAPO RESTAURANT
#> 4: 507216 2011-10-19 SERGIO'S TAQUERIA PIZZA INC.
#> 5: 507219 2011-10-20 TARGET STORE # T-2079
#> ---
#> 18777: 1506243 2014-10-29 CENTRAL GYROS CORP
#> 18778: 1506245 2014-10-30 TORTILLA SOUP
#> 18779: 1506247 2014-10-31 GROTA RESTAURANT
#> 18780: 1508207 2014-10-27 R U HUNGRY?
#> 18781: 1513208 2014-10-31 EL RANCHITO FOOD
#> AKA_Name License Facility_Type Risk
#> 1: SEVEN STAR 30790 Grocery Store Risk 3 (Low)
#> 2: PANERA BREAD 1475890 Restaurant Risk 1 (High)
#> 3: LITTLE QUIAPO RESTAURANT 1740130 Restaurant Risk 1 (High)
#> 4: SERGIO'S TAQUERIA PIZZA 1447363 Restaurant Risk 1 (High)
#> 5: TARGET 1679459 Restaurant Risk 2 (Medium)
#> ---
#> 18777: CENTRAL GYROS 28851 Restaurant Risk 1 (High)
#> 18778: TORTILLA SOUP 78989 Restaurant Risk 1 (High)
#> 18779: GROTA RESTAURANT 6753 Restaurant Risk 1 (High)
#> 18780: R U HUNGRY? 2008741 Restaurant Risk 1 (High)
#> 18781: EL RANCHITO FOOD 1383420 Grocery Store Risk 2 (Medium)
#> Address City State Zip Inspection_Type Results
#> 1: 3352 N BROADWAY CHICAGO IL 60657 Canvass Pass
#> 2: 6059 N LINCOLN AVE CHICAGO IL 60659 Canvass Pass
#> 3: 6259 N MCCORMICK RD CHICAGO IL 60659 Canvass Fail
#> 4: 3253 W BRYN MAWR AVE CHICAGO IL 60659 Canvass Pass
#> 5: 2112 W PETERSON AVE CHICAGO IL 60659 Canvass Fail
#> ---
#> 18777: 3127 N CENTRAL AVE CHICAGO IL 60634 Canvass Pass
#> 18778: 3809-11 N HARLEM AVE CHICAGO IL 60634 Canvass Pass
#> 18779: 3108-3112 N CENTRAL AVE CHICAGO IL 60634 Canvass Pass
#> 18780: 1724 W 51st ST CHICAGO IL 60609 Canvass Pass
#> 18781: 8327-29 S PULASKI RD CHICAGO IL 60652 Canvass Pass
#> Latitude Longitude Location
#> 1: 41.94336 -87.64500 (41.943359344775146, -87.64499875300952)
#> 2: 41.99192 -87.70963 (41.99191947239194, -87.70963133440333)
#> 3: 41.99563 -87.71271 (41.99563177556418, -87.71270678169132)
#> 4: 41.98293 -87.71098 (41.982933189164974, -87.71098225381141)
#> 5: 41.99073 -87.68298 (41.99072921796059, -87.68297945359863)
#> ---
#> 18777: 41.93758 -87.76631 (41.93758152034964, -87.76631489179631)
#> 18778: 41.94922 -87.80700 (41.94921947071411, -87.80699727423111)
#> 18779: 41.93707 -87.76659 (41.93706710815131, -87.76659233846347)
#> 18780: 41.80140 -87.66837 (41.80139599557942, -87.66836713634461)
#> 18781: 41.74137 -87.72151 (41.741374166961236, -87.72150639395679)
#> Facility_Type_Clean criticalCount seriousCount minorCount pass_flag
#> 1: Other 0 0 2 1
#> 2: Restaurant 0 0 3 1
#> 3: Restaurant 0 2 6 0
#> 4: Restaurant 0 0 6 1
#> 5: Restaurant 0 2 6 0
#> ---
#> 18777: Restaurant 0 0 6 1
#> 18778: Restaurant 0 0 4 1
#> 18779: Restaurant 0 0 4 1
#> 18780: Restaurant 0 0 1 1
#> 18781: Other 0 0 8 1
#> fail_flag pastFail pastCritical pastSerious pastMinor timeSinceLast
#> 1: 0 0 0 0 0 2.0000000
#> 2: 0 0 0 0 0 2.0000000
#> 3: 1 0 0 0 0 2.0000000
#> 4: 0 0 0 0 0 2.0000000
#> 5: 1 0 0 0 0 2.0000000
#> ---
#> 18777: 0 0 0 0 7 0.7150685
#> 18778: 0 0 0 0 7 0.5808219
#> 18779: 0 0 0 0 8 0.4657534
#> 18780: 0 0 0 0 2 0.6136986
#> 18781: 0 0 0 0 4 0.9041096
#> firstRecord ID LICENSE_ID ACCOUNT_NUMBER
#> 1: 1 30790-20110416 2081412 63759
#> 2: 1 1475890-20110416 2081695 207283
#> 3: 1 1740130-20110216 2070145 3107
#> 4: 1 1447363-20110216 2071895 270993
#> 5: 1 1679459-20100216 2009972 15538
#> ---
#> 18777: 0 28851-20131016 2279699 3542
#> 18778: 0 78989-20131016 2280040 23219
#> 18779: 0 6753-20131016 2279707 3673
#> 18780: 0 2008741-20140116 2297606 349243
#> 18781: 0 1383420-20131116 2286790 266695
#> LEGAL_NAME DOING_BUSINESS_AS_NAME
#> 1: VIRGINIA DELA ROSA SEVEN STAR
#> 2: PANERA, LLC PANERA BREAD
#> 3: ENELITA GARCIA LITTLE QUIAPO RESTAURANT
#> 4: SERGIO'S TAZUERIA PIZZA INC. SERGIO'S TAQUERIA PIZZA INC.
#> 5: TARGET CORPORATION TARGET STORE # T-2079
#> ---
#> 18777: CENTRAL GYROS, CORP. CENTRAL GYROS CORP
#> 18778: HARLEM-GRACE, INC. TORTILLA SOUP
#> 18779: GROTA, INC. GROTA RESTAURANT
#> 18780: R U HUNGRY?, LTD. R U HUNGRY?
#> 18781: EL RANCHITO FOOD, INC. EL RANCHITO FOOD
#> ADDRESS CITY STATE ZIP_CODE WARD PRECINCT
#> 1: 3352 N BROADWAY 1 CHICAGO IL 60657 44 33
#> 2: 6059 N LINCOLN AVE C CHICAGO IL 60659 50 23
#> 3: 6259 N MCCORMICK RD CHICAGO IL 60659 50 25
#> 4: 3253 W BRYN MAWR AVE CHICAGO IL 60659 39 48
#> 5: 2112 W PETERSON AVE CHICAGO IL 60659 40 18
#> ---
#> 18777: 3127 N CENTRAL AVE 1ST CHICAGO IL 60634 31 37
#> 18778: 3809-11 N HARLEM AVE 1ST CHICAGO IL 60634 38 3
#> 18779: 3108-3112 N CENTRAL AVE 1ST CHICAGO IL 60634 31 37
#> 18780: 1724 W 51ST ST 1 CHICAGO IL 60609 20 4
#> 18781: 8327-29 S PULASKI RD CHICAGO IL 60652 18 11
#> WARD_PRECINCT POLICE_DISTRICT LICENSE_CODE LICENSE_DESCRIPTION
#> 1: 44-33 19 1006 Retail Food Establishment
#> 2: 50-23 24 1006 Retail Food Establishment
#> 3: 50-25 17 1006 Retail Food Establishment
#> 4: 39-48 17 1006 Retail Food Establishment
#> 5: 40-18 24 1006 Retail Food Establishment
#> ---
#> 18777: 31-37 25 1006 Retail Food Establishment
#> 18778: 38-3 16 1006 Retail Food Establishment
#> 18779: 31-37 25 1006 Retail Food Establishment
#> 18780: 20-4 9 1006 Retail Food Establishment
#> 18781: 18-11 8 1006 Retail Food Establishment
#> BUSINESS_ACTIVITY_ID BUSINESS_ACTIVITY LICENSE_NUMBER
#> 1: 775 Retail Sales of Perishable Foods 30790
#> 2: 775 Retail Sales of Perishable Foods 1475890
#> 3: 775 Retail Sales of Perishable Foods 1740130
#> 4: 775 Retail Sales of Perishable Foods 1447363
#> 5: 775 Retail Sales of Perishable Foods 1679459
#> ---
#> 18777: 775 Retail Sales of Perishable Foods 28851
#> 18778: 775 Retail Sales of Perishable Foods 78989
#> 18779: 775 Retail Sales of Perishable Foods 6753
#> 18780: 775 Retail Sales of Perishable Foods 2008741
#> 18781: 775 Retail Sales of Perishable Foods 1383420
#> APPLICATION_TYPE LICENSE_TERM_START_DATE LICENSE_TERM_EXPIRATION_DATE
#> 1: RENEW 2011-04-16 2013-04-15
#> 2: RENEW 2011-04-16 2013-04-15
#> 3: RENEW 2011-02-16 2013-02-15
#> 4: RENEW 2011-02-16 2013-02-15
#> 5: RENEW 2010-02-16 2012-02-15
#> ---
#> 18777: RENEW 2013-10-16 2015-10-15
#> 18778: RENEW 2013-10-16 2015-10-15
#> 18779: RENEW 2013-10-16 2015-10-15
#> 18780: RENEW 2014-01-16 2016-01-15
#> 18781: RENEW 2013-11-16 2015-11-15
#> LICENSE_STATUS LATITUDE LONGITUDE minDate maxDate ageAtInspection
#> 1: AAI 41.94336 -87.64500 2002-02-16 2015-04-15 10.964384
#> 2: AAI 41.99192 -87.70963 2004-05-05 2019-04-15 7.457534
#> 3: AAI 41.99563 -87.71271 2007-03-22 2017-02-15 4.578082
#> 4: AAI 41.98293 -87.71098 2003-12-31 2019-02-15 7.805479
#> 5: AAI 41.99073 -87.68298 2006-07-07 2018-02-15 5.290411
#> ---
#> 18777: AAI 41.93758 -87.76631 2002-05-16 2019-10-15 12.463014
#> 18778: AAI 41.94922 -87.80700 2002-11-16 2017-10-15 11.961644
#> 18779: AAI 41.93707 -87.76659 2002-05-16 2019-10-15 12.468493
#> 18780: AAI 41.80140 -87.66837 2009-12-29 2018-01-15 4.830137
#> 18781: AAI 41.74137 -87.72151 2003-09-29 2019-11-15 11.095890
#> consumption_on_premises_incidental_activity tobacco package_goods
#> 1: 0 1 0
#> 2: 0 0 0
#> 3: 0 0 0
#> 4: 0 0 0
#> 5: 0 0 1
#> ---
#> 18777: 1 0 0
#> 18778: 1 0 0
#> 18779: 1 0 0
#> 18780: 0 0 0
#> 18781: 0 0 0
#> outdoor_patio public_place_of_amusement limited_business_license
#> 1: 0 0 1
#> 2: 0 0 0
#> 3: 0 0 0
#> 4: 0 0 0
#> 5: 0 0 1
#> ---
#> 18777: 0 0 0
#> 18778: 0 0 0
#> 18779: 0 0 0
#> 18780: 0 0 0
#> 18781: 0 0 0
#> childrens_services_facility_license tavern regulated_business_license
#> 1: 0 0 0
#> 2: 0 0 0
#> 3: 0 0 0
#> 4: 0 0 0
#> 5: 0 0 0
#> ---
#> 18777: 0 0 0
#> 18778: 0 0 0
#> 18779: 0 0 0
#> 18780: 0 0 0
#> 18781: 0 0 0
#> filling_station caterers_liquor_license mobile_food_license sanitarian
#> 1: 0 0 0 green
#> 2: 0 0 0 blue
#> 3: 0 0 0 blue
#> 4: 0 0 0 blue
#> 5: 0 0 0 blue
#> ---
#> 18777: 0 0 0 brown
#> 18778: 0 0 0 brown
#> 18779: 0 0 0 brown
#> 18780: 0 0 0 brown
#> 18781: 0 0 0 orange
#> precipIntensity temperatureMax windSpeed humidity heat_burglary
#> 1: 0.0145866667 53.49667 13.340000 0.9000000 26.992376
#> 2: 0.0019066667 59.04667 13.016667 0.5500000 13.976557
#> 3: 0.0019066667 59.04667 13.016667 0.5500000 12.611239
#> 4: 0.0027366667 56.15333 10.863333 0.6166667 35.906383
#> 5: 0.0099866667 52.73000 16.266667 0.6900000 9.530785
#> ---
#> 18777: 0.0018000000 65.67333 8.430000 0.6466667 19.974219
#> 18778: 0.0017333333 62.49000 11.026667 0.6800000 14.219000
#> 18779: 0.0010666667 54.76333 9.256667 0.7033333 21.580979
#> 18780: 0.0001333333 62.60000 4.690000 0.6700000 49.906934
#> 18781: 0.0010666667 54.76333 9.256667 0.7033333 32.749393
#> heat_garbage heat_sanitation
#> 1: 12.768572 37.748787
#> 2: 12.895485 15.412267
#> 3: 8.004220 8.320330
#> 4: 26.238645 38.186618
#> 5: 3.401567 2.126788
#> ---
#> 18777: 26.320516 33.413499
#> 18778: 23.563573 12.189961
#> 18779: 27.637291 27.170901
#> 18780: 22.384442 11.930061
#> 18781: 23.952219 23.257770
<-dat |>
dat_2014mutate(Facility_Type = fct_lump(Facility_Type,4),
pass_flag = as.factor(pass_flag)) |>
filter(year(Inspection_Date)==2014) |>
mutate(pastFail = as.factor(pastFail))
Distribution of Facility type
<- dat_2014 %>%
plt_df group_by(Facility_Type) %>%
mutate(x.lab=paste0(Facility_Type,
"\n",
"(n=",
n(),
")")) |>
group_by(x.lab) |>
tally() |>
mutate(pct = n/sum(n)*100,
lbl = sprintf("%.1f%%", pct)) |>
mutate(variable="variable")
%>%
plt_df ggplot() +
aes(x = fct_reorder(x.lab,pct),y=pct,fill=variable) +
geom_col(width=0.5) +
geom_text(aes(label=lbl),position=position_stack(vjust=0.5))+
labs(title = "", y = "percentage", x ="",fill="") +
::scale_fill_economist() +
ggthemes::theme_tufte() +
ggthemestheme(legend.position = "none",
axis.text.x = element_text(face="bold",
color="black",
size=8))
ggsave("rutendo_dist_facility.png",height=200,width=300, units = "mm",bg="white")
Flag
<- dat_2014 %>%
plt_df group_by(pass_flag) %>%
mutate(x.lab=paste0(pass_flag,
"\n",
"(n=",
n(),
")")) |>
group_by(x.lab) |>
tally() |>
mutate(pct = n/sum(n)*100,
lbl = sprintf("%.1f%%", pct)) |>
mutate(variable="variable")
%>%
plt_df ggplot() +
aes(x = fct_reorder(x.lab,pct),y=pct,fill=variable) +
geom_col(width=0.5) +
geom_text(aes(label=lbl),position=position_stack(vjust=0.5))+
labs(title = "", y = "percentage", x ="",fill="") +
::scale_fill_economist() +
ggthemes::theme_tufte() +
ggthemestheme(legend.position = "none",
axis.text.x = element_text(face="bold",
color="black",
size=8))
ggsave("rutendo_dist_flag.png",height=200,width=300, units = "mm",bg="white")
|>
dat_2014 count(pass_flag, Facility_Type) |>
ggplot(aes(Facility_Type, pass_flag, fill = n)) +
geom_tile(color = "white") +
geom_text(aes(label = number(n, big.mark = ","))) +
scale_fill_binned(low = "purple", high = "#07193b",
label = number_format(big.mark = ",")) +
theme(legend.position = "top",
legend.key.width = unit(15, "mm")) +
labs(x = "Facility Type",
y = "Flag",
fill = "Frequency")
ggsave("rutendo_dist_heat.png",height=200,width=300, units = "mm",bg="white")
|>
dat_2014 count(pass_flag, Risk) |>
ggplot(aes(Risk, pass_flag, fill = n)) +
geom_tile(color = "white") +
geom_text(aes(label = number(n, big.mark = ","))) +
scale_fill_binned(low = "purple", high = "#07193b",
label = number_format(big.mark = ",")) +
theme(legend.position = "top",
legend.key.width = unit(15, "mm")) +
labs(x = "Risk",
y = "Flag",
fill = "Frequency")
ggsave("rutendo_dist_risk.png",height=200,width=300, units = "mm",bg="white")
Correlations
library(corrplot)
# dlookr: correlation heatmap
<- dat_2014 %>%
data_for_cor select(Facility_Type,
Risk,
windSpeed,
humidity,
heat_burglary,
heat_garbage,
heat_sanitation,
sanitarian,
precipIntensity,
temperatureMax,
ageAtInspection,
tobacco,
caterers_liquor_license,|>
timeSinceLast) select_if(is.numeric)
= cor(data_for_cor)
corrs corrplot(corrs, type="upper", method="color", addCoef.col = "#07193b")
library(gtsummary)
<- dat_2014 |>
p select(Facility_Type,Risk,tobacco,caterers_liquor_license,Results,pastFail) |>
mutate_all(as.factor)
# summary of descriptive statistics per river (mean,standard deviation and Analysis Of Variance)
theme_gtsummary_journal("jama")
<-p %>%
table2tbl_summary(
by = Results,
type=all_categorical()~"categorical",
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} / {N} ({p}%)")) %>%
add_p(test = all_continuous() ~ "t.test",
pvalue_fun = function(x) style_pvalue(x, digits = 2)) %>%
modify_header(statistic ~ "**Test Statistic**") %>%
bold_labels() %>%
modify_fmt_fun(statistic ~ style_sigfig);table2
Characteristic | Fail, N = 790 | Pass, N = 2,597 | Pass w/ Conditions, N = 678 | Test Statistic | p-value1 |
---|---|---|---|---|---|
Facility_Type, n / N (%) | 19 | 0.013 | |||
Bakery | 26 / 790 (3.3%) | 42 / 2,597 (1.6%) | 12 / 678 (1.8%) | ||
Grocery Store | 110 / 790 (14%) | 281 / 2,597 (11%) | 80 / 678 (12%) | ||
Restaurant | 628 / 790 (79%) | 2,193 / 2,597 (84%) | 566 / 678 (83%) | ||
School | 9 / 790 (1.1%) | 20 / 2,597 (0.8%) | 2 / 678 (0.3%) | ||
Other | 17 / 790 (2.2%) | 61 / 2,597 (2.3%) | 18 / 678 (2.7%) | ||
Risk, n / N (%) | 12 | 0.017 | |||
Risk 1 (High) | 619 / 790 (78%) | 1,986 / 2,597 (76%) | 525 / 678 (77%) | ||
Risk 2 (Medium) | 151 / 790 (19%) | 555 / 2,597 (21%) | 150 / 678 (22%) | ||
Risk 3 (Low) | 20 / 790 (2.5%) | 56 / 2,597 (2.2%) | 3 / 678 (0.4%) | ||
tobacco, n / N (%) | 1.6 | 0.44 | |||
0 | 718 / 790 (91%) | 2,397 / 2,597 (92%) | 623 / 678 (92%) | ||
1 | 72 / 790 (9.1%) | 200 / 2,597 (7.7%) | 55 / 678 (8.1%) | ||
caterers_liquor_license, n / N (%) | 0.04 | 0.98 | |||
0 | 777 / 790 (98%) | 2,552 / 2,597 (98%) | 666 / 678 (98%) | ||
1 | 13 / 790 (1.6%) | 45 / 2,597 (1.7%) | 12 / 678 (1.8%) | ||
pastFail, n / N (%) | 34 | <0.001 | |||
0 | 626 / 790 (79%) | 2,269 / 2,597 (87%) | 589 / 678 (87%) | ||
1 | 164 / 790 (21%) | 328 / 2,597 (13%) | 89 / 678 (13%) | ||
1 Pearson’s Chi-squared test |
- Facility type and risk are significantly associated with results of inspection
Initial logistic model
glm(pass_flag ~ Facility_Type +
+
Risk +
windSpeed +
humidity +
heat_burglary +
heat_garbage +
heat_sanitation +
sanitarian +
temperatureMax +
ageAtInspection +
tobacco +
caterers_liquor_license +
pastFail family=binomial,data=dat_2014) |>
timeSinceLast,tbl_regression(exponentiate = TRUE)
Characteristic | OR (95% CI)1 | p-value |
---|---|---|
Facility_Type | ||
Bakery | — | |
Grocery Store | 1.16 (0.68 to 1.96) | 0.58 |
Restaurant | 1.89 (1.17 to 3.03) | 0.008 |
School | 1.34 (0.54 to 3.46) | 0.53 |
Other | 1.64 (0.86 to 3.13) | 0.13 |
Risk | ||
Risk 1 (High) | — | |
Risk 2 (Medium) | 1.49 (1.24 to 1.80) | <0.001 |
Risk 3 (Low) | 2.40 (1.38 to 4.30) | 0.002 |
windSpeed | 1.02 (1.00 to 1.05) | 0.040 |
humidity | 1.39 (0.58 to 3.34) | 0.47 |
heat_burglary | 1.00 (1.0 to 1.00) | 0.69 |
heat_garbage | 1.00 (0.99 to 1.00) | 0.063 |
heat_sanitation | 1.00 (1.00 to 1.00) | 0.69 |
sanitarian | ||
blue | — | |
brown | 7.72 (5.49 to 11.1) | <0.001 |
green | 2.40 (1.97 to 2.92) | <0.001 |
orange | 1.48 (1.22 to 1.80) | <0.001 |
purple | 0.47 (0.35 to 0.63) | <0.001 |
yellow | 3.35 (2.67 to 4.21) | <0.001 |
temperatureMax | 1.00 (1.0 to 1.00) | 0.51 |
ageAtInspection | 1.01 (0.99 to 1.02) | 0.37 |
tobacco | 1.06 (0.78 to 1.46) | 0.70 |
caterers_liquor_license | 1.28 (0.76 to 2.20) | 0.35 |
pastFail | ||
0 | — | |
1 | 0.62 (0.51 to 0.75) | <0.001 |
timeSinceLast | 0.72 (0.63 to 0.83) | <0.001 |
1 OR = Odds Ratio, CI = Confidence Interval |
The principle question is whether we can reasonably determine the probability that a restaurant inspection will yield at least one critical violation. That is, the focus will be whether or not any critical violation is found—a binary response. We use a glmnet model to estimate the impact.
the results above suggest that facility type has a significant effect on the results of inpection , with all facility types having odds ratios above 1 . However restaurants have increased odds of yielding at least one critical violation in comparison to bakeries and as well have a significant estimate
risk 2 and 3 significantly increase the odds of yielding at least one critical violation.
heat_garbage is a significant predictor though its effect is not that much on the final result of inspection
Model building
# Fix the random numbers by setting the seed
# This enables the analysis to be reproducible
set.seed(123)
# Put 3/4 of the data into the training set
<- initial_split(dat_2014,
data_split prop = 0.70,
strata = pass_flag)
# Create dataframes for the two sets:
<- training(data_split)
train_data <- testing(data_split) test_data
<-
housing_rec recipe(pass_flag ~ Facility_Type +
+
Risk +
windSpeed +
humidity +
heat_burglary +
heat_garbage +
heat_sanitation +
sanitarian +
temperatureMax +
ageAtInspection +
tobacco +
caterers_liquor_license +
pastFail
timeSinceLast,data = train_data) %>%
step_naomit(everything(), skip = TRUE) |>
step_dummy(all_nominal(), -all_outcomes()) |>
::step_smote(pass_flag) themis
<-housing_rec |>
housing_rec_prepprep()
summary(housing_rec)
#> # A tibble: 15 × 4
#> variable type role source
#> <chr> <list> <chr> <chr>
#> 1 Facility_Type <chr [3]> predictor original
#> 2 Risk <chr [3]> predictor original
#> 3 windSpeed <chr [2]> predictor original
#> 4 humidity <chr [2]> predictor original
#> 5 heat_burglary <chr [2]> predictor original
#> 6 heat_garbage <chr [2]> predictor original
#> 7 heat_sanitation <chr [2]> predictor original
#> 8 sanitarian <chr [3]> predictor original
#> 9 temperatureMax <chr [2]> predictor original
#> 10 ageAtInspection <chr [2]> predictor original
#> 11 tobacco <chr [2]> predictor original
#> 12 caterers_liquor_license <chr [2]> predictor original
#> 13 pastFail <chr [3]> predictor original
#> 14 timeSinceLast <chr [2]> predictor original
#> 15 pass_flag <chr [3]> outcome original
<-
prepped_data |># use the recipe object
housing_rec prep() |># perform the recipe on training data
juice() # extract only the preprocessed dataframe
5 fold cross validation
set.seed(100)
<-
cv_folds vfold_cv(train_data,
v = 5,
strata = pass_flag)
setup models
# Random forest
<-
rf_spec rand_forest() |>
set_engine("ranger", importance = "impurity") |>
set_mode("classification")
#Boosted tree (XGBoost)
<-
xgb_spec boost_tree() |>
set_engine("xgboost") |>
set_mode("classification")
# K-nearest neighbor
<-
knn_spec nearest_neighbor(neighbors = 5) |># we can adjust the number of neighbors
set_engine("kknn") |>
set_mode("classification")
# Decision Tree
<-
dt_spec decision_tree() %>%
set_mode("classification") |>
set_engine("rpart")
# Logistic Regression
<-logistic_reg() |>
log_specset_mode("classification") |>
set_engine("glm")
<-
log_wflow workflow() %>%
add_recipe(housing_rec) |>
add_model(log_spec)
<-
rf_wflow workflow() %>%
add_recipe(housing_rec) |>
add_model(rf_spec)
<-
xgb_wflow workflow() %>%
add_recipe(housing_rec) |>
add_model(xgb_spec)
<-
knn_wflow workflow() %>%
add_recipe(housing_rec) |>
add_model(knn_spec)
<-
dt_wflow workflow() %>%
add_recipe(housing_rec) |>
add_model(dt_spec)
<- function(x) {
get_model extract_fit_parsnip(x) |>tidy()
}
Logistic Regression
<-
log_res |>
log_wflow fit_resamples(
resamples = cv_folds,
metrics = metric_set(
f_meas, ::accuracy, kap,
yardstick
roc_auc, sens),control = control_resamples(save_pred = TRUE)
)
|> collect_metrics(summarize = TRUE)
log_res #> # A tibble: 5 × 6
#> .metric .estimator mean n std_err .config
#> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 accuracy binary 0.628 5 0.0104 Preprocessor1_Model1
#> 2 f_meas binary 0.561 5 0.00879 Preprocessor1_Model1
#> 3 kap binary 0.250 5 0.0176 Preprocessor1_Model1
#> 4 roc_auc binary 0.683 5 0.0121 Preprocessor1_Model1
#> 5 sens binary 0.657 5 0.0114 Preprocessor1_Model1
Decision Tree
<-
dt_res |>
dt_wflow fit_resamples(
resamples = cv_folds,
metrics = metric_set(
f_meas, ::accuracy, kap,
yardstick
roc_auc, sens),control = control_resamples(save_pred = TRUE)
)
|> collect_metrics(summarize = TRUE)
dt_res #> # A tibble: 5 × 6
#> .metric .estimator mean n std_err .config
#> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 accuracy binary 0.637 5 0.0120 Preprocessor1_Model1
#> 2 f_meas binary 0.495 5 0.0190 Preprocessor1_Model1
#> 3 kap binary 0.213 5 0.0235 Preprocessor1_Model1
#> 4 roc_auc binary 0.650 5 0.0122 Preprocessor1_Model1
#> 5 sens binary 0.496 5 0.0336 Preprocessor1_Model1
3]]
dt_res[[#> [[1]]
#> # A tibble: 5 × 4
#> .metric .estimator .estimate .config
#> <chr> <chr> <dbl> <chr>
#> 1 f_meas binary 0.538 Preprocessor1_Model1
#> 2 accuracy binary 0.677 Preprocessor1_Model1
#> 3 kap binary 0.290 Preprocessor1_Model1
#> 4 sens binary 0.519 Preprocessor1_Model1
#> 5 roc_auc binary 0.682 Preprocessor1_Model1
#>
#> [[2]]
#> # A tibble: 5 × 4
#> .metric .estimator .estimate .config
#> <chr> <chr> <dbl> <chr>
#> 1 f_meas binary 0.507 Preprocessor1_Model1
#> 2 accuracy binary 0.649 Preprocessor1_Model1
#> 3 kap binary 0.235 Preprocessor1_Model1
#> 4 sens binary 0.5 Preprocessor1_Model1
#> 5 roc_auc binary 0.664 Preprocessor1_Model1
#>
#> [[3]]
#> # A tibble: 5 × 4
#> .metric .estimator .estimate .config
#> <chr> <chr> <dbl> <chr>
#> 1 f_meas binary 0.454 Preprocessor1_Model1
#> 2 accuracy binary 0.614 Preprocessor1_Model1
#> 3 kap binary 0.156 Preprocessor1_Model1
#> 4 sens binary 0.444 Preprocessor1_Model1
#> 5 roc_auc binary 0.616 Preprocessor1_Model1
#>
#> [[4]]
#> # A tibble: 5 × 4
#> .metric .estimator .estimate .config
#> <chr> <chr> <dbl> <chr>
#> 1 f_meas binary 0.447 Preprocessor1_Model1
#> 2 accuracy binary 0.634 Preprocessor1_Model1
#> 3 kap binary 0.176 Preprocessor1_Model1
#> 4 sens binary 0.410 Preprocessor1_Model1
#> 5 roc_auc binary 0.628 Preprocessor1_Model1
#>
#> [[5]]
#> # A tibble: 5 × 4
#> .metric .estimator .estimate .config
#> <chr> <chr> <dbl> <chr>
#> 1 f_meas binary 0.530 Preprocessor1_Model1
#> 2 accuracy binary 0.613 Preprocessor1_Model1
#> 3 kap binary 0.209 Preprocessor1_Model1
#> 4 sens binary 0.605 Preprocessor1_Model1
#> 5 roc_auc binary 0.661 Preprocessor1_Model1
Random forest
<-
rf_res |>
rf_wflow fit_resamples(
resamples = cv_folds,
metrics = metric_set(
f_meas, ::accuracy, kap,
yardstick
roc_auc, sens),control = control_resamples(save_pred = TRUE)
)
|> collect_metrics(summarize = TRUE)
rf_res #> # A tibble: 5 × 6
#> .metric .estimator mean n std_err .config
#> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 accuracy binary 0.654 5 0.00523 Preprocessor1_Model1
#> 2 f_meas binary 0.454 5 0.00892 Preprocessor1_Model1
#> 3 kap binary 0.208 5 0.0101 Preprocessor1_Model1
#> 4 roc_auc binary 0.665 5 0.0125 Preprocessor1_Model1
#> 5 sens binary 0.399 5 0.0130 Preprocessor1_Model1
XGBoost
<-
xgb_res |>
xgb_wflow fit_resamples(
resamples = cv_folds,
metrics = metric_set(
f_meas, ::accuracy, kap,
yardstick
roc_auc, sens),control = control_resamples(save_pred = TRUE)
)
|>collect_metrics(summarize = TRUE)
xgb_res #> # A tibble: 5 × 6
#> .metric .estimator mean n std_err .config
#> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 accuracy binary 0.638 5 0.0120 Preprocessor1_Model1
#> 2 f_meas binary 0.460 5 0.0135 Preprocessor1_Model1
#> 3 kap binary 0.190 5 0.0233 Preprocessor1_Model1
#> 4 roc_auc binary 0.646 5 0.0181 Preprocessor1_Model1
#> 5 sens binary 0.426 5 0.0129 Preprocessor1_Model1
K-nearest neighbor
<-
knn_res |>
knn_wflow fit_resamples(
resamples = cv_folds,
metrics = metric_set(
f_meas, ::accuracy, kap,
yardstick
roc_auc, sens),control = control_resamples(save_pred = TRUE)
)
|>collect_metrics(summarize = TRUE)
knn_res #> # A tibble: 5 × 6
#> .metric .estimator mean n std_err .config
#> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 accuracy binary 0.601 5 0.00771 Preprocessor1_Model1
#> 2 f_meas binary 0.457 5 0.00777 Preprocessor1_Model1
#> 3 kap binary 0.142 5 0.0141 Preprocessor1_Model1
#> 4 roc_auc binary 0.604 5 0.00847 Preprocessor1_Model1
#> 5 sens binary 0.465 5 0.00900 Preprocessor1_Model1
Compare models
Extract metrics from our models to compare them:
<-
rf_metrics |>
rf_res collect_metrics(summarise = TRUE) %>%
mutate(model = "Random Forest")
<-
xgb_metrics |>
xgb_res collect_metrics(summarise = TRUE) %>%
mutate(model = "XGBoost")
<-
knn_metrics |>
knn_res collect_metrics(summarise = TRUE) %>%
mutate(model = "Knn")
<-
log_metrics |>
log_res collect_metrics(summarise = TRUE) %>%
mutate(model = "Logistic Regression")
<-
dt_metrics |>
dt_res collect_metrics(summarise = TRUE) %>%
mutate(model = "Decision Tree")
# create dataframe with all models
<- bind_rows(log_metrics,
model_compare
rf_metrics,
xgb_metrics,
knn_metrics,
dt_metrics
)
# change data structure
<-
model_comp |>
model_compare select(model, .metric, mean, std_err) |>
pivot_wider(names_from = .metric, values_from = c(mean, std_err))
# show mean F1-Score for every model
|>
model_comp arrange(mean_f_meas) |>
mutate(model = fct_reorder(model, mean_f_meas)) |># order results
ggplot(aes(fct_reorder(model, mean_f_meas),mean_f_meas, fill=model)) +
labs(x="Model") +
geom_col() +
coord_flip() +
scale_fill_tableau()+
geom_text(
size = 3,
aes(label = round(mean_f_meas, 2), y = mean_f_meas + 0.08),
vjust = 1
)
ggsave("f_measure.png",height=200,width=300, units = "mm",bg="white")
# show mean area under the curve (auc) per model
|>
model_comp arrange(mean_roc_auc) |>
mutate(model = fct_reorder(model, mean_roc_auc)) %>%
ggplot(aes(fct_reorder(model, mean_roc_auc),mean_roc_auc, fill=model)) +
labs(x="Model") +
geom_col() +
coord_flip() +
scale_fill_tableau()+
geom_text(
size = 3,
aes(label = round(mean_roc_auc, 2), y = mean_roc_auc + 0.08),
vjust = 1
)
ggsave("roc_auc.png",height=200,width=300, units = "mm",bg="white")
# show mean accuracy per model
|>
model_comp arrange(mean_accuracy) |>
mutate(model = fct_reorder(model, mean_accuracy)) %>%
ggplot(aes(fct_reorder(model, mean_accuracy),mean_accuracy, fill=model)) +
labs(x="Model") +
geom_col() +
coord_flip() +
scale_fill_tableau() +
geom_text(
size = 3,
aes(label = round(mean_accuracy, 2), y = mean_accuracy + 0.08),
vjust = 1
)
ggsave("rutendo_accuracy.png",height=200,width=300, units = "mm",bg="white")
Note that the model results are all quite similar. In our example we choose the Accuracy as performance measure to select the best model. Let’s find the maximum mean Accuracy:
|>slice_max(mean_roc_auc)
model_comp #> # A tibble: 1 × 11
#> model mean_accuracy mean_f_meas mean_kap mean_roc_auc mean_sens
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Logistic Regression 0.628 0.561 0.250 0.683 0.657
#> # ℹ 5 more variables: std_err_accuracy <dbl>, std_err_f_meas <dbl>,
#> # std_err_kap <dbl>, std_err_roc_auc <dbl>, std_err_sens <dbl>
<- last_fit(log_wflow,
last_fit_rf split = data_split,
metrics = metric_set(
f_meas, ::accuracy, kap,
yardstick
roc_auc, sens) )
Show performance metrics
|>
last_fit_rf collect_metrics()
#> # A tibble: 5 × 4
#> .metric .estimator .estimate .config
#> <chr> <chr> <dbl> <chr>
#> 1 f_meas binary 0.551 Preprocessor1_Model1
#> 2 accuracy binary 0.623 Preprocessor1_Model1
#> 3 kap binary 0.237 Preprocessor1_Model1
#> 4 sens binary 0.639 Preprocessor1_Model1
#> 5 roc_auc binary 0.676 Preprocessor1_Model1
%>%
last_fit_rf collect_predictions() |>
conf_mat(pass_flag, .pred_class) |>
autoplot(type = "heatmap") +
scale_fill_gradient(
high = "#07193b",
low = "#4a6c75"
+
)labs(title="Confusion matrix (logistic Regression)")
ggsave("conf_matrix.png",height=200,width=300, units = "mm",bg="white")
Let’s create the ROC curve. Again, since the event we are predicting is the first level in the Results factor (“above”), we provide roc_curve() with the relevant class probability .pred_above:
|>
last_fit_rf collect_predictions() |>
roc_curve(pass_flag,.pred_0) |>
autoplot()
ggsave("roc_curve.png",height=200,width=300, units = "mm",bg="white")
|>
last_fit_rf pluck(".workflow", 1) |>
pull_workflow_fit() |>
::vip() vip
ggsave("rutendo_vip.png",height=200,width=300, units = "mm",bg="white")