library(gt)
library(rlang)
library(data.table)
library(brms)
library(ggplot2)
library(mgcv)
library(StanHeaders)
library(marginaleffects)
library(rstan)
library(cmdstanr)
library(tidybayes)
library(patchwork)
library(showtext)
<- "Roboto"
font_name font_add_google(font_name)
<- function(x, variables){
brms_to_draws <- x$fit@sim$samples
x <- lapply(x, \(x) x[, variables])
x_subsamples <- posterior::as_draws(x_subsamples)
x_draws return(x_draws)
}
<- function(xdt, var = 'user_id'){
get_unique_N return(xdt[, .(N = uniqueN(.SD)), .SDcols = var]$N)
}
<- function(center, inverse_logit = FALSE, format = "numeric") {
create_label_transform function(x) {
<- as.numeric(as.character(x))
x <- x + center
uncentered
if (inverse_logit) {
<- plogis(uncentered)
values else {
} <- uncentered
values
}
if (format == "numeric") {
return(round(values, 3))
else if (format == "percent") {
} return(paste0(round(values * 100, 1), "%"))
else {
} stop("Invalid format. Use 'numeric' or 'percent'.")
}
}
}
<- function(x, var){
get_center <- paste0(var, '_c')
var_c return(
unique(round(month_level_dt[[var]] -
5))
month_level_dt[[var_c]],
)
}
<- function(x, condition,
plot_marginal_effect trans.y = NULL,
translabels.x = NULL,
translabels.fill = rev(translabels.x),
binwidth = NULL,
plot_labels = NULL,
bin_alpha = .8,
patchwork_args = list(),
stack_swaps = FALSE,
swap_condition_axes = FALSE){
# x = cycle_time_full_intx_lin_remonth_m
# condition = list('month_num_c' = c(1, 3, 6, 9, 12) - 7,
# 'yr_avg_comments_per_pr_c' = c(1, 5, 7, 12, 18) - yr_avg_comments_per_pr_center)
# trans.y = scales::modulus_trans(p = .25)
# trans.x = NULL
# patchwork_args = list(nrow = 2)
# binwidth = NULL
# bin_alpha = .8
# plot_labels = list(labs(x = 'Month',
# y = 'Median cycle time (days)',
# fill = 'Comments per PR',
# color = 'Comments per PR'),
# labs(y = 'Median cycle time (days)',
# x = 'Comments per PR',
# fill = 'Month',
# color = 'Month'))
# translabels.x = list(create_label_transform(7),
# create_label_transform(yr_avg_comments_per_pr_center))
# translabels.fill = rev(translabels.x)
# stack_swaps = TRUE
# swap_condition_axes = FALSE
require(patchwork)
require(rlang)
<- \(x) return(x / (60*60*24))
s_to_day # Extract the formula terms
<- attr(model.frame(x), "terms")
terms_object # Get the variables attribute (which includes the response variable)
<- as.character(attr(terms_object, "variables"))[2]
response_variable <- quantile(x$data[[response_variable]], p = .95)
max_resp <- s_to_day(c(0, max_resp))
coords
if(stack_swaps){
<- unique(list(condition, rev(condition)))
condition_list else if (swap_condition_axes) {
} <- unique(list(rev(condition)))
condition_list else {
} <- unique(list(condition))
condition_list
}
<- lapply(1:length(condition_list), \(c_i) {
p <- condition_list[[c_i]]
cond if(!is.null(names(cond))){
<- names(cond)
c_names 1]] <- function(x) seq(from = min(x), to = max(x), length.out = 50)
cond[[else {
} <- cond
c_names
}
if (is.null(binwidth)) {
if(c_names[[1]] == 'month_num_c'){
<- 1
binwidth_x <- .005*s_to_day(diff(range(x$data[[response_variable]])))
binwidth_y else {
} <- .05*diff(range(x$data[[c_names[[1]]]]))
binwidth_x <- .0025*s_to_day(diff(range(x$data[[response_variable]])))
binwidth_y
}<- c(binwidth_x, binwidth_y)
this_binwidth else {
} if(c_i > length(binwidth)){
stop('binwidth should be a list with length equal to condition_list')
}<- binwidth[[c_i]]
this_binwidth if(length(this_binwidth) != 2){
stop('each element of binwidth should have length == 2')
}
}<- marginaleffects::plot_predictions(x, condition = cond, re_formula = NA, trans = s_to_day, draw = FALSE)
pp_data
<- ggplot(data = pp_data, aes(x = get(c_names[[1]]), y = estimate)) +
pp stat_summary_2d(data = x$data,
aes(y = s_to_day(get(response_variable)),
x = get(c_names[[1]]),
z = 1),
binwidth = this_binwidth, alpha = bin_alpha,
fun = function(x) sum(!is.na(x))) +
# geom_bin2d(data = x$data, aes(y = s_to_day(get(response_variable)), x = get(c_names[[1]])),
# binwidth = this_binwidth, alpha = bin_alpha) +
scale_fill_viridis_c(option = 'mako', guide = 'none',
begin = 1, end = .5)
if(length(cond) > 1){
<- pp +
pp ::new_scale_fill() +
ggnewscalegeom_ribbon(aes(ymin = conf.low, ymax = conf.high, group = get(c_names[[2]]),
fill = get(c_names[[2]])), alpha = .5) +
geom_line(aes(group = get(c_names[[2]]), color = get(c_names[[2]])))
if(!is.null(translabels.fill)){
<- pp + scale_color_viridis_d(aesthetics = c('fill', 'color'),
pp option = 'rocket', begin = 0, end = .7,
labels = translabels.fill[[c_i]])
else {
} <- pp + scale_color_viridis_d(labels = function(b) round(as.numeric(b), 2),
pp aesthetics = c('fill', 'color'),
option = 'rocket', begin = 0, end = .7)
}<- pp +
pp theme_minimal()
else if(length(cond) == 1) {
} <- pp +
pp ::new_scale_fill() +
ggnewscalegeom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = .5) +
geom_line() +
theme_minimal()
}
if(!is.null(translabels.x)){
<- pp +
pp scale_x_continuous(labels = translabels.x[[c_i]])
}
if(is.null(plot_labels)){
if(length(c_names) == 1){
<- pp +
pp labs(x = c_names[[1]], y = response_variable)
else if(length(c_names == 2)){
} <- pp +
pp labs(x = c_names[[1]], y = response_variable, fill = c_names[[2]], color = c_names[[2]])
}else {
} <- pp +
pp
plot_labels[[c_i]]
}<- pp +
pp coord_cartesian(y = coords,
x = range(x$data[[c_names[[1]]]]))
return(pp)
})
if(!is.null(trans.y)){
message('adding transform for y')
<- lapply(p, \(pp) pp + scale_y_continuous(transform = trans.y))
p
}
<- list(nrow = 1)
default_patchwork <- do.call(patchwork::wrap_plots, c(list(p), modifyList(default_patchwork, patchwork_args)))
p return(p)
}
<- function(x, variables){
traceplot_manyvars require(posterior)
require(data.table)
<- x$fit@sim$samples
x <- lapply(x, \(x) x[, variables])
x_subsamples <- melt(as.data.table(posterior::as_draws_df(x_subsamples)),
x_draws_df_l id.vars = c('.chain', '.iteration', '.draw'))
:= factor(.chain)]
x_draws_df_l[, .chain <- ggplot(x_draws_df_l, aes(x = .iteration, y = value)) +
p geom_line(aes(group = .chain, color = .chain), alpha = 1, linewidth = .125) +
facet_wrap(~ variable, scales = 'free') +
scale_color_viridis_d(option = 'mako', begin = 0, end = .65) +
theme_minimal() +
theme(
strip.text.x = element_text(size = 8), # Adjust text size for readability
axis.text.x = element_blank(), # Remove axis text for clarity
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.spacing = unit(0, 'lines'),
legend.position = 'none',
axis.title = element_blank()
)return(p)
}
<- theme_minimal() +
theme_clean theme(
text = element_text(family = font_name),
strip.text.x = element_blank(), # Adjust text size for readability
axis.text.x = element_blank(), # Remove axis text for clarity
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.spacing = unit(0, 'lines')
)
<- function() {
objects_in_mem # Load the data.table package
require(data.table)
# Get all objects in the global environment
<- ls(envir = .GlobalEnv)
all_objects
# Create a data table with object names and sizes
<- data.table(
object_sizes Object = all_objects,
Size = sapply(all_objects, function(x) object.size(get(x, envir = .GlobalEnv)))
)
# Sort the data table by size in descending order
setorder(object_sizes, -Size)
# Return the sorted list
return(object_sizes)
}
<- function(filename, do_this){
filename_do if(!file.exists(filename)){
<- do_this
result saveRDS(result, filename)
else {
} <- readRDS(filename)
result
}return(result)
}
Data preparation
In [1]:
In [2]:
<- '~/remote'
data_dir <- data.table::fread(file.path(data_dir, 'productivity-project/ind_full_df.csv'))
full_df <- fread(file.path(data_dir, 'productivity-project/cycle_time_and_tickets_20240829.csv'))
ct_20240829 <- fread(file.path(data_dir, 'productivity-project/comments_pr.csv'))
comments_20240903 <- fread(file.path(data_dir, 'productivity-project/defect_tickets.csv'))
defecttkts_20240903 <- fread(file.path(data_dir, 'productivity-project/degree_cent_month.csv'))
centr_20240903
:= match(tolower(month), tolower(month.abb))]
full_df[, month_num
<- unique(ct_20240829[, -'TEAM_ID'])
ct_20240829 !is.na(CT_TIME_END) & !is.na(TK_TIME_END) & !CT_TIME_END == '' & !TK_TIME_END == '',
ct_20240829[sum(CT_TIME_END == TK_TIME_END), .N)]
.(!is.na(CT_CYCLE_TIME_S) & !is.na(TK_CYCLE_TIME_S) & !CT_CYCLE_TIME_S == '' & !TK_CYCLE_TIME_S == '',
ct_20240829[sum(CT_CYCLE_TIME_S == TK_CYCLE_TIME_S), .N)]
.(is.na(CT_CYCLE_TIME_S) | CT_CYCLE_TIME_S == '') & (!is.na(TK_CYCLE_TIME_S) & !TK_CYCLE_TIME_S == ''),
ct_20240829[(N_user_id = uniqueN(ASSIGNEE_APEX_USER_ID))]
.(.N, := fifelse(is.na(CT_CYCLE_TIME_S) | CT_CYCLE_TIME_S == '', TK_CYCLE_TIME_S, CT_CYCLE_TIME_S)]
ct_20240829[, new_cycle_time na_new_ct = sum(is.na(new_cycle_time)),
ct_20240829[, .(na_old_ct = sum(is.na(CT_CYCLE_TIME_S) | CT_CYCLE_TIME_S == ''))]
<- ct_20240829[, .(N_tickets_old = sum(!is.na(CT_CYCLE_TIME_S) & !CT_CYCLE_TIME_S == ''),
ct_20240829_agg N_tickets_new = uniqueN(TICKET_IDENTITY_ID),
N_unclosed = sum(CT_TIME_END == '' & TK_TIME_END == ''),
P_unclosed = sum(CT_TIME_END == '' & TK_TIME_END == '') / uniqueN(TICKET_IDENTITY_ID),
cycle_time_old = as.numeric(median(CT_CYCLE_TIME_S, na.rm = TRUE)),
cycle_time_new = as.numeric(median(new_cycle_time, na.rm = TRUE))),
= c('ASSIGNEE_APEX_USER_ID', 'ORG_ID', 'TICKET_CREATED_MONTH', 'TICKET_CREATED_YEAR')]
by
<- c(.01, .99)
minmaxp <- qlogis(minmaxp)
range_non01_punclosed := fifelse(P_unclosed < minmaxp[[1]], range_non01_punclosed[1],
ct_20240829_agg[, q_unclosed fifelse(P_unclosed > minmaxp[[2]], range_non01_punclosed[2],
qlogis(P_unclosed)))]
setnames(ct_20240829_agg,
old = c('ORG_ID', 'ASSIGNEE_APEX_USER_ID', 'TICKET_CREATED_YEAR', 'TICKET_CREATED_MONTH'),
new = c('org_id', 'user_id', 'year', 'month_num'))
setnames(comments_20240903,
old = c('ORG_ID', 'APEX_USER_ID', 'PR_YEAR', 'PR_MONTH', 'COMMENTS_PER_PR'),
new = c('org_id', 'user_id', 'year', 'month_num', 'comments_per_pr'))
setnames(defecttkts_20240903,
old = c('ORG_ID', 'USER_ID', 'YEAR', 'MONTH_NUM', 'DEFECT_TICKET_PCT'),
new = c('org_id', 'user_id', 'year', 'month_num', 'defect_tickets_pct_indiv'))
setnames(centr_20240903,
old = c('ORG_ID', 'APEX_USER_ID', 'PR_MONTH', 'DEGREE_CENTRALITY'),
new = c('org_id', 'user_id', 'month_num', 'degree_centrality_monthly'))
<- merge(ct_20240829_agg[!is.na(cycle_time_new)], full_df, by = c('org_id', 'user_id', 'year', 'month_num'), all.y = TRUE)
full_df_cleaned_unclosed <- merge(comments_20240903, full_df_cleaned_unclosed, by = c('org_id', 'user_id', 'year', 'month_num'), all.y = TRUE)
full_df_cleaned_unclosed <- merge(defecttkts_20240903, full_df_cleaned_unclosed, by = c('org_id', 'user_id', 'year', 'month_num'), all.y = TRUE)
full_df_cleaned_unclosed <- merge(centr_20240903, full_df_cleaned_unclosed, by = c('org_id', 'user_id', 'month_num'), all.y = TRUE)
full_df_cleaned_unclosed if(! get_unique_N(full_df_cleaned_unclosed) >= get_unique_N(full_df)){
stop('Error in merging')
}
:= cycle_time_new]
full_df_cleaned_unclosed[, median_cycle_time_s := 100*degree_centrality_monthly]
full_df_cleaned_unclosed[, degree_centrality_monthly_100 := (month_num - 1) %% 3]
full_df_cleaned_unclosed[, within_quarter_month_num
<- c('avg_coding_days_per_week', 'total_merged_prs',
yr_avg_vars 'defect_tickets_pct_indiv', 'degree_centrality_monthly_100',
'comments_per_pr')
<- paste0('wi_', yr_avg_vars)
within_vars_new <- paste0('yr_avg_', yr_avg_vars)
yr_avg_vars_new <- paste0(yr_avg_vars_new, '_c')
yr_avg_vars_new_c <- c('q_unclosed', 'avg_coding_days_per_week',
center_vars_ 'total_merged_prs', 'defect_tickets_pct_indiv',
'comments_per_pr', 'degree_centrality_monthly_100')
<- c('team_size')
team_vars <- paste0(team_vars, '_c')
team_vars_new <- c('org_id', 'user_id')
factor_vars <- paste0(factor_vars, '_fac')
factor_vars_new <- c('month_num', 'within_quarter_month_num')
time_vars <- paste0(time_vars, '_c')
time_vars_new <- c('median_cycle_time_s')
outcome_vars <- unique(
full_df_cleaned_unclosed_team_size_info c('team_size', 'org_id', 'user_id', 'team_id')]
full_df_cleaned_unclosed[, team_size = mean(team_size, na.rm = TRUE)), by = c('org_id', 'user_id')]
)[, .(
<- copy(full_df_cleaned_unclosed)
full_df_cleaned_unclosed_old
<- unique(full_df_cleaned_unclosed[, c(..outcome_vars,
full_df_cleaned_unclosed
..center_vars_,
..factor_vars,'org_name',
..time_vars)]) := lapply(.SD, mean, na.rm = TRUE), .SDcols = yr_avg_vars, by = c('org_id', 'user_id')]
full_df_cleaned_unclosed[, (yr_avg_vars_new) := lapply(.SD, \(x) x - mean(x, na.rm = TRUE)),
full_df_cleaned_unclosed[, (within_vars_new) = yr_avg_vars,
.SDcols = c('org_id', 'user_id')]
by <- c(center_vars_, yr_avg_vars_new)
center_vars <- paste0(center_vars, '_c')
center_vars_new <- merge(full_df_cleaned_unclosed, full_df_cleaned_unclosed_team_size_info)
full_df_cleaned_unclosed := lapply(.SD, round, digits = 4), .SDcols = center_vars]
full_df_cleaned_unclosed[, (center_vars) := lapply(.SD, round, digits = 4), .SDcols = team_vars]
full_df_cleaned_unclosed[, (team_vars) := lapply(.SD, \(x) scale(x, scale = FALSE)[,1]), .SDcols = center_vars]
full_df_cleaned_unclosed[, (center_vars_new) := lapply(.SD, \(x) scale(x, scale = FALSE)[,1]), .SDcols = team_vars]
full_df_cleaned_unclosed[, (team_vars_new) := lapply(.SD, factor), .SDcols = factor_vars]
full_df_cleaned_unclosed[, (factor_vars_new) c('month_num_c', 'within_quarter_month_num_c') :=
full_df_cleaned_unclosed[, - 7, within_quarter_month_num - 1)]
.(month_num
<- unique(c(factor_vars, factor_vars_new,
model_vars
outcome_vars,
center_vars,
center_vars_new,
within_vars_new, yr_avg_vars_new_c,
team_vars, team_vars_new,
time_vars, time_vars_new))<- unique(c(factor_vars, factor_vars_new,
model_vars_yrpred
outcome_vars, 'q_unclosed', 'q_unclosed_c',
yr_avg_vars_new, yr_avg_vars_new_c,
team_vars, team_vars_new,
time_vars, time_vars_new))
#model_vars <- model_vars[which(! model_vars %in% c('q_unclosed', 'q_unclosed_c'))]
<- full_df_cleaned_unclosed[!is.na(org_name)]
full_df_cleaned_unclosed
<- lapply(center_vars_, \(x){
unique_users_dropping_col uniqueN(na.omit(full_df_cleaned_unclosed[, c(..center_vars, 'user_id')][, .SD, .SDcols = !c(x)])$user_id)
})names(unique_users_dropping_col) <- center_vars_
<- lapply(yr_avg_vars_new, \(x){
unique_users_dropping_col_yr uniqueN(na.omit(full_df_cleaned_unclosed[, c(..yr_avg_vars_new, 'user_id')][, .SD, .SDcols = !c(x)])$user_id)
})names(unique_users_dropping_col_yr) <- yr_avg_vars_new
unique_users_dropping_col
unique_users_dropping_col_yr
#This is a considerably smaller data set due to missingness in the predictors
<- full_df_cleaned_unclosed[, .SD, .SDcols = unique(model_vars, model_vars_yrpred)]
month_level_dt <- month_level_dt[median_cycle_time_s > 0]
month_level_dt <- na.omit(month_level_dt[, ..model_vars])
month_level_dt
<- na.omit(full_df_cleaned_unclosed[median_cycle_time_s > 0, ..model_vars_yrpred])
month_level_yrpred_dt
<- month_level_dt[within_quarter_month_num == 1]
month_level_midq_dt
<- copy(month_level_dt)
month_level_ptyp_dt set.seed(451)
:= sample(1:40, 1, replace = TRUE), by = c('org_id')] #make N chunks of data stratified by org, team
month_level_ptyp_dt[, chunk_id <- month_level_ptyp_dt[chunk_id == 1]
month_level_ptyp_dt
<- full_df[month %in% c('Feb', 'May', 'Aug', 'Nov'),
full_df_cleaned_step9 c(..factor_vars, 'team_id', 'cycle_time', 'month',
'yr_avg_coding_days_per_week',
'yr_total_merged_prs',
'yr_defect_tickets_pct',
'degree_centrality',
'yr_comments')]
<- na.omit(full_df_cleaned_step9[cycle_time > 0])
full_df_cleaned_step9
#Has fewer unique_id because of missing month-level covariates
N_user = uniqueN(user_id_fac),
month_level_dt[, .(N_org = uniqueN(org_id_fac),
N_obs = .N)]
N_user = uniqueN(user_id_fac),
month_level_yrpred_dt[, .(N_org = uniqueN(org_id_fac),
N_obs = .N)]
N_user = uniqueN(user_id),
full_df_cleaned_step9[, .(N_team = uniqueN(team_id),
N_teamuser = uniqueN(paste(team_id, user_id)),
N_org = uniqueN(org_id),
N_obs = .N)]
N_user = uniqueN(user_id_fac),
full_df_cleaned_unclosed[, .(N_org = uniqueN(org_id_fac),
N_obs = .N)]
lapply(.SD, \(x) fifelse(.N == 1, 0, sd(x, na.rm = TRUE))),
month_level_dt[, = c(outcome_vars, center_vars_new, time_vars),
.SDcols = c('user_id_fac', 'org_id_fac')] by
A lot happens here. I take the data that Carol was using it and then integrate it with ticket-level data which also includes some updated ticket-end timestamps. I then re-aggregate at the month level within-person, so that we get median cycle times for each person over that month. The predictor variables all come from the previous data set and are already aggregated at the month level or the year level.
I also keep or create some control variables. I discard repeat rows due to users working in multiple teams but I keep their average team size as a control variable. I also take the proportion of unclosed tickets for each month and transform it from \(P(unclosed) \in [0,1]\) to be \(Q(unclosed) \in (-\inf, +\inf)\) using the logistic quantile function.
I planned to run models where the predictors, where possible, vary by month, as well as where they are aggregated across the year to be individually varying but time invariant. There are trade-offs here. The time-varying data gives us a more granular view of correlated change across time, and a much closer sense of what might be causal processes. That is, we can say things like, within the same person when X changes from month to month we also see a change in cycle time. However, there are a lot of missing observations of our predictors so this reduces the size of the data to some extent. When all is cleaned up we have formatC(dim(month_level_dt)[[1]], format="f", big.mark=",", digits=0)
rows comprised of formatC(length(unique(month_level_dt$user_id_fac)), format="f", big.mark=",", digits=0)
unique users across formatC(length(unique(month_level_dt$org_id_fac)), format="f", big.mark=",", digits=0)
, with a median of formatC(month_level_dt[, .N, by = 'user_id_fac'][, .(median(N))][[1]], format="f", big.mark=",", digits=0)
(interquartile range = tmprange <- quantile(month_level_dt[, .N, by = 'user_id_fac']$N, c(.25, .75)); sprintf('[%d, %d]', tmprange[[1]], tmprange[[2]])
) observations per user.
From the data aggregated across the year we simply use all available information for each person to compute their average on each predictor, thereby “recovering” quite a few monthly observations of cycle time but with the cost of only being able to make inferences back to correlated stable individual differences. That is, we can say the sort of person who tends to have N coding days per week also tends to have lower cycle time and a steeper decrease in cycle time across the year (for example). In the data with yearly-average predictors, we have formatC(dim(month_level_yrpred_dt)[[1]], format="f", big.mark=",", digits=0)
rows comprised of formatC(length(unique(month_level_yrpred_dt$user_id_fac)), format="f", big.mark=",", digits=0)
unique users across formatC(length(unique(month_level_yrpred_dt$org_id_fac)), format="f", big.mark=",", digits=0)
, with a median of formatC(month_level_yrpred_dt[, .N, by = 'user_id_fac'][, .(median(N))][[1]], format="f", big.mark=",", digits=0)
(interquartile range = tmprange <- quantile(month_level_yrpred_dt[, .N, by = 'user_id_fac']$N, c(.25, .75)); sprintf('[%d, %d]', tmprange[[1]], tmprange[[2]])
) observations per user.
Data exploration
These plots helped me get a sense of the data distribution of the outcome (cycle time) and other variables that we were concerned might have undue influence on either the outcome itself or as a potential confound between the outcome and predictors (or their effect on the growth of our outcome over time).
Outcome distribution
In [3]:
ggplot(month_level_yrpred_dt, aes(x = median_cycle_time_s)) +
stat_density(geom = 'area', fill = 'blue', alpha = .5, position = 'identity', aes(group = org_id)) +
stat_density(geom = 'line', color = 'black', alpha = .8, position = 'identity', aes(group = org_id)) +
facet_wrap(~ org_id, scales = 'free_y') +
theme_minimal() +
theme(
strip.text.x = element_blank(), # Adjust text size for readability
axis.text = element_blank(), # Remove axis text for clarity
axis.ticks = element_blank(),
panel.grid = element_blank() # Remove grid lines for simplicity
+
) labs(y = 'Density of observations', x = 'Median Cycle Time')
ggplot(month_level_yrpred_dt, aes(x = median_cycle_time_s)) +
stat_density(geom = 'area', fill = 'blue', alpha = .5, position = 'identity', aes(group = org_id)) +
stat_density(geom = 'line', color = 'black', alpha = .8, position = 'identity', aes(group = org_id)) +
facet_wrap(~ org_id, scales = 'free_y') +
scale_x_log10() +
theme_minimal() +
theme(
strip.text.x = element_blank(), # Adjust text size for readability
axis.text = element_blank(), # Remove axis text for clarity
axis.ticks = element_blank(),
panel.grid = element_blank() # Remove grid lines for simplicity
+
) labs(y = 'Density of observations', x = 'Log Median Cycle Time')
In the above, we see extreme right skew that is often a feature of time-to-event data.
Outcome across months
In [4]:
<- 2*28*24*60*60
hours_width <- hours_width / (7*24*60*60)
N_weeks ggplot(month_level_yrpred_dt, aes(y = median_cycle_time_s, x = month_num)) +
geom_hex(binwidth = c(1, hours_width)) +
scale_x_continuous(breaks = 1:12) +
scale_y_continuous(breaks = seq(0, 12000, by = hours_width), labels = N_weeks*c(0:(length(seq(0, 12000, by = hours_width)) - 1))) +
scale_fill_continuous(
trans = "log",
name = "Count",
breaks = c(1, 10, 100, 1000)
+
) theme_minimal() +
theme(
strip.text.x = element_blank(), # Adjust text size for readability
axis.text = element_blank(), # Remove axis text for clarity
axis.ticks = element_blank(),
panel.grid = element_blank() # Remove grid lines for simplicity
+
) labs(x = 'Month', y = 'Cycle time') +
facet_wrap(~ org_id)
We can see a slight downward trend in the outliers across months for many orgs.
Outlier trends
In [5]:
ggplot(month_level_yrpred_dt, aes(x = month_num, y = median_cycle_time_s)) +
geom_point() +
theme_minimal()
ggplot(month_level_yrpred_dt[, .(max_cycle_time = max(median_cycle_time_s, na.rm = TRUE)), by = c('month_num')],
aes(x = month_num, y = max_cycle_time)) +
geom_smooth(method = 'lm') +
geom_point() +
scale_x_continuous(breaks = 1:12) +
theme_minimal()
In the above we see the monthly trend in outliers more clearly. This is what gave rise to our diving back into ticket close dates to try to recover what we didn’t have from before as well as thinking about including a measure of proportion open tickets to try to control for the possibility that we were systematically mis-measuring cycle time because of systematic differences in missing ticket data over time.
In [6]:
ggplot(ct_20240829_agg[, .(med_p_unclosed = median(P_unclosed), N = .N), by = c('org_id', 'month_num')][N > 10],
aes(x = month_num, y = med_p_unclosed)) +
geom_line() +
geom_point(size = .2, alpha = .8) +
facet_wrap(~ org_id, scales = 'free') +
theme_minimal() +
theme(
strip.text.x = element_blank(), # Adjust text size for readability
axis.text = element_blank(), # Remove axis text for clarity
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank()
+
) coord_cartesian(y = c(0,1))
#View(ind_perf_agg[, .(med_p_unclosed = median(P_unclosed), N = .N), by = c('org_id', 'month_num')][N > 10])
Above we see that across many orgs we have an uptick in unclosed tickets at the end of the year. These unclosed tickets can’t be included in the cycle time data and so will potentially have the effect of reducing observed cycle time for that month.
Predictor plots
In [7]:
ggplot(month_level_dt[, .(avg_coding_days_per_week = median(avg_coding_days_per_week), N = .N), by = c('org_id', 'month_num')][N > 10],
aes(x = month_num, y = avg_coding_days_per_week)) +
geom_line(stat = 'smooth', method = 'lm', se = FALSE, formula = y ~ x) +
geom_point(size = .2, alpha = .8) +
facet_wrap(~ org_id, scales = 'free') +
theme_minimal() +
theme(
strip.text.x = element_blank(), # Adjust text size for readability
axis.text = element_blank(), # Remove axis text for clarity
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank()
)
In [8]:
<- copy(month_level_dt)
month_level_dt_plot := rep(sample(1:144, size = 1), .N), by = 'user_id_fac']
month_level_dt_plot[, group ggplot(month_level_dt_plot, aes(x = month_num, y = avg_coding_days_per_week)) +
geom_line(aes(group = user_id_fac),
alpha = .5, size = .1,
stat = 'smooth', method = 'lm', se = FALSE, formula = y ~ x) +
facet_wrap(~ group) +
theme_minimal() +
theme(
strip.text.x = element_blank(), # Adjust text size for readability
axis.text = element_blank(), # Remove axis text for clarity
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank()
)
In [9]:
ggplot(month_level_dt[, .(total_merged_prs = median(total_merged_prs), N = .N), by = c('org_id', 'month_num')][N > 10],
aes(x = month_num, y = total_merged_prs)) +
geom_line(stat = 'smooth', method = 'lm', se = FALSE, formula = y ~ x) +
geom_point(size = .2, alpha = .8) +
facet_wrap(~ org_id, scales = 'free') +
theme_minimal() +
theme(
strip.text.x = element_blank(), # Adjust text size for readability
axis.text = element_blank(), # Remove axis text for clarity
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank()
)
In [10]:
<- copy(month_level_dt)
month_level_dt_plot := rep(sample(1:144, size = 1), .N), by = 'user_id_fac']
month_level_dt_plot[, group ggplot(month_level_dt_plot, aes(x = month_num, y = total_merged_prs)) +
geom_line(aes(group = user_id_fac),
alpha = .5, size = .1,
stat = 'smooth', method = 'lm', se = FALSE, formula = y ~ x) +
facet_wrap(~ group, scales = 'free') +
theme_minimal() +
theme(
strip.text.x = element_blank(), # Adjust text size for readability
axis.text = element_blank(), # Remove axis text for clarity
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank()
)
In [11]:
ggplot(month_level_dt[, .(defect_tickets_pct_indiv = median(defect_tickets_pct_indiv), N = .N), by = c('org_id', 'month_num')][N > 10],
aes(x = month_num, y = defect_tickets_pct_indiv)) +
geom_line(stat = 'smooth', method = 'lm', se = FALSE, formula = y ~ x) +
geom_point(size = .2, alpha = .8) +
facet_wrap(~ org_id, scales = 'free') +
theme_minimal() +
theme(
strip.text.x = element_blank(), # Adjust text size for readability
axis.text = element_blank(), # Remove axis text for clarity
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank()
)
In [12]:
<- copy(month_level_dt)
month_level_dt_plot := rep(sample(1:144, size = 1), .N), by = 'user_id_fac']
month_level_dt_plot[, group ggplot(month_level_dt_plot, aes(x = month_num, y = defect_tickets_pct_indiv)) +
geom_line(aes(group = user_id_fac),
alpha = .5, size = .1,
stat = 'smooth', method = 'lm', se = FALSE, formula = y ~ x) +
facet_wrap(~ group, scales = 'free') +
theme_minimal() +
theme(
strip.text.x = element_blank(), # Adjust text size for readability
axis.text = element_blank(), # Remove axis text for clarity
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank()
)
In [13]:
ggplot(month_level_dt[, .(degree_centrality_monthly_100 = median(degree_centrality_monthly_100), N = .N), by = c('org_id', 'month_num')][N > 10],
aes(x = month_num, y = degree_centrality_monthly_100)) +
geom_line(stat = 'smooth', method = 'lm', se = FALSE, formula = y ~ x) +
geom_point(size = .2, alpha = .8) +
facet_wrap(~ org_id, scales = 'free') +
theme_minimal() +
theme(
strip.text.x = element_blank(), # Adjust text size for readability
axis.text = element_blank(), # Remove axis text for clarity
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank()
)
In [14]:
<- copy(month_level_dt)
month_level_dt_plot := rep(sample(1:144, size = 1), .N), by = 'user_id_fac']
month_level_dt_plot[, group ggplot(month_level_dt_plot, aes(x = month_num, y = degree_centrality_monthly_100)) +
geom_line(aes(group = user_id_fac),
alpha = .5, size = .1,
stat = 'smooth', method = 'lm', se = FALSE, formula = y ~ x) +
facet_wrap(~ group, scales = 'free') +
theme_minimal() +
theme(
strip.text.x = element_blank(), # Adjust text size for readability
axis.text = element_blank(), # Remove axis text for clarity
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank()
)
In [15]:
ggplot(month_level_dt[, .(comments_per_pr = median(comments_per_pr), N = .N), by = c('org_id', 'month_num')][N > 10],
aes(x = month_num, y = comments_per_pr)) +
geom_line(stat = 'smooth', method = 'lm', se = FALSE, formula = y ~ x) +
geom_point(size = .2, alpha = .8) +
facet_wrap(~ org_id, scales = 'free') +
theme_minimal() +
theme(
strip.text.x = element_blank(), # Adjust text size for readability
axis.text = element_blank(), # Remove axis text for clarity
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank()
)
In [16]:
<- copy(month_level_dt)
month_level_dt_plot := rep(sample(1:144, size = 1), .N), by = 'user_id_fac']
month_level_dt_plot[, group ggplot(month_level_dt_plot, aes(x = month_num, y = comments_per_pr)) +
geom_line(aes(group = user_id_fac),
alpha = .5, size = .1,
stat = 'smooth', method = 'lm', se = FALSE, formula = y ~ x) +
facet_wrap(~ group, scales = 'free') +
theme_minimal() +
theme(
strip.text.x = element_blank(), # Adjust text size for readability
axis.text = element_blank(), # Remove axis text for clarity
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank()
)
In [17]:
<- "(centr_20240903|comments_20240903|ct_20240829|ct_20240829_agg|defecttkts_20240903|full_df|full_df_cleaned_step9|full_df_cleaned_unclosed|full_df_cleaned_unclosed_old|full_df_cleaned_unclosed_team_size_info|month_level_dt_plot|month_level_midq_dt|month_level_ptyp_dt)"
rm_ls_pattern <- ls(pattern = rm_ls_pattern)
ls_list if(length(ls_list) > 0) rm(list = ls_list)
gc()
Modeling
These are the goals that informed my modelling decisions (beyond those that Carol already set forth, e.g., accounting for nesting):
- Allow non-linear effects of certain predictors, especially month number, on the outcome.
- requirement:
s()
and related syntax frommgcv
package
- requirement:
- Model the outcome using a distribution appropriate for its observed shape and for its hypothesized data generating process
- requirement: response distributions for time-to-event data (i.e., log normal, gamma, Weibull)
- Include group-varying effects for intercepts and predictor variables when groups are nested and crossed (as in Carol’s original models)
- requirement:
lme4
-style random effects specifications
- requirement:
- Uncertaint quantification across the range of effects of interest including individually-varying effects
- requirement: simulation or draws from the posterior distribution
The above requirements are met by brms
which uses the Bayesian probabilistic programming language Stan. Added benefits of this method include good model diagnostics, robust fitting due to control via weakly-informative priors, and modeling
Likelihood and prior definitions
In the code blocks below I define the likelihood (i.e., model) and priors for a set of models going from most complicated to least complicated. This was part of the process of diagnosing any fitting problems starting with very simple models, and optimizing them to fit in a reasonable amount of time. I iterated through some trial and error finding reasonable prior distributions that constrained possible model-generated values to a reasonable range without being overly restrictive. This was particularly helpful in stabilizing the estimate of the shape parameter and intercept of the Weibull distribution.
In [18]:
<- bf(median_cycle_time_s ~ 1 +
cycle_time_full_intx_lin_remonth_yrpred_f +
within_quarter_month_num_c +
team_size_c s(q_unclosed_c, k = 5) +
s(month_num_c, k = 5) +
s(month_num_c, by = team_size_c, k = 5) +
s(month_num_c, by = q_unclosed_c, k = 5) +
+
yr_avg_avg_coding_days_per_week_c +
yr_avg_total_merged_prs_c +
yr_avg_defect_tickets_pct_indiv_c +
yr_avg_degree_centrality_monthly_100_c +
yr_avg_comments_per_pr_c :yr_avg_avg_coding_days_per_week_c +
month_num_c:yr_avg_total_merged_prs_c +
month_num_c:yr_avg_defect_tickets_pct_indiv_c +
month_num_c:yr_avg_degree_centrality_monthly_100_c +
month_num_c:yr_avg_comments_per_pr_c +
month_num_c| org_id_fac) + (month_num_c | org_id_fac:user_id_fac),
(month_num_c ~ 1 + (1 | org_id_fac),
shape family = brms::weibull())
<- bf(median_cycle_time_s ~ 1 +
cycle_time_full_intx_lin_remonth_f +
within_quarter_month_num_c +
team_size_c s(q_unclosed_c, k = 5) +
s(month_num_c, k = 5) +
s(month_num_c, by = team_size_c, k = 5) +
s(month_num_c, by = q_unclosed_c, k = 5) +
+
wi_avg_coding_days_per_week +
yr_avg_avg_coding_days_per_week_c +
wi_total_merged_prs +
yr_avg_total_merged_prs_c +
wi_defect_tickets_pct_indiv +
yr_avg_defect_tickets_pct_indiv_c +
wi_degree_centrality_monthly_100 +
yr_avg_degree_centrality_monthly_100_c +
wi_comments_per_pr +
yr_avg_comments_per_pr_c :yr_avg_avg_coding_days_per_week_c +
month_num_c:yr_avg_total_merged_prs_c +
month_num_c:yr_avg_defect_tickets_pct_indiv_c +
month_num_c:yr_avg_degree_centrality_monthly_100_c +
month_num_c:yr_avg_comments_per_pr_c +
month_num_c| org_id_fac) + (month_num_c | org_id_fac:user_id_fac),
(month_num_c ~ 1 + (1 | org_id_fac),
shape family = brms::weibull())
<- bf(median_cycle_time_s ~ 1 +
cycle_time_full_intx_lin_f +
within_quarter_month_num_c +
team_size_c s(q_unclosed_c, k = 5) +
s(month_num_c, k = 5) +
s(month_num_c, by = team_size_c, k = 5) +
s(month_num_c, by = q_unclosed_c, k = 5) +
+
wi_avg_coding_days_per_week +
yr_avg_avg_coding_days_per_week_c +
wi_total_merged_prs +
yr_avg_total_merged_prs_c +
wi_defect_tickets_pct_indiv +
yr_avg_defect_tickets_pct_indiv_c +
wi_degree_centrality_monthly_100 +
yr_avg_degree_centrality_monthly_100_c +
wi_comments_per_pr +
yr_avg_comments_per_pr_c :yr_avg_avg_coding_days_per_week_c +
month_num_c:yr_avg_total_merged_prs_c +
month_num_c:yr_avg_defect_tickets_pct_indiv_c +
month_num_c:yr_avg_degree_centrality_monthly_100_c +
month_num_c:yr_avg_comments_per_pr_c +
month_num_c1 | org_id_fac) + (1 | org_id_fac:user_id_fac),
(~ 1 + (1 | org_id_fac),
shape family = brms::weibull())
<- bf(median_cycle_time_s ~ 1 +
cycle_time_full_intx_nlq_f +
within_quarter_month_num_c +
team_size_c s(q_unclosed_c, k = 5) +
s(month_num_c, k = 5) +
s(month_num_c, by = team_size_c, k = 5) +
s(month_num_c, by = q_unclosed_c, k = 5) +
+
wi_avg_coding_days_per_week +
yr_avg_avg_coding_days_per_week_c +
wi_total_merged_prs +
yr_avg_total_merged_prs_c +
wi_defect_tickets_pct_indiv +
yr_avg_defect_tickets_pct_indiv_c +
wi_degree_centrality_monthly_100 +
yr_avg_degree_centrality_monthly_100_c +
wi_comments_per_pr +
yr_avg_comments_per_pr_c s(month_num_c, by = yr_avg_avg_coding_days_per_week_c, k = 3) +
s(month_num_c, by = yr_avg_total_merged_prs_c, k = 3) +
s(month_num_c, by = yr_avg_defect_tickets_pct_indiv_c, k = 3) +
s(month_num_c, by = yr_avg_degree_centrality_monthly_100_c, k = 3) +
s(month_num_c, by = yr_avg_comments_per_pr_c, k = 3) +
| org_id_fac) + (month_num_c | org_id_fac:user_id_fac),
(month_num_c ~ 1 + (1 | org_id_fac),
shape family = brms::weibull())
<- bf(median_cycle_time_s ~ 1 +
cycle_time_full_intx_f +
within_quarter_month_num_c +
team_size_c +
q_unclosed_c s(month_num_c, k = 5) +
s(month_num_c, by = team_size_c, k = 5) +
s(month_num_c, by = q_unclosed_c, k = 5) +
+
wi_avg_coding_days_per_week +
yr_avg_avg_coding_days_per_week_c +
wi_total_merged_prs +
yr_avg_total_merged_prs_c +
wi_defect_tickets_pct_indiv +
yr_avg_defect_tickets_pct_indiv_c +
wi_degree_centrality_monthly_100 +
yr_avg_degree_centrality_monthly_100_c +
wi_comments_per_pr +
yr_avg_comments_per_pr_c s(month_num_c, by = yr_avg_avg_coding_days_per_week_c, k = 3) +
s(month_num_c, by = yr_avg_total_merged_prs_c, k = 3) +
s(month_num_c, by = yr_avg_defect_tickets_pct_indiv_c, k = 3) +
s(month_num_c, by = yr_avg_degree_centrality_monthly_100_c, k = 3) +
s(month_num_c, by = yr_avg_comments_per_pr_c, k = 3) +
| org_id_fac) + (month_num_c | org_id_fac:user_id_fac),
(month_num_c ~ 1 + (1 | org_id_fac),
shape family = brms::weibull())
<- bf(median_cycle_time_s ~ 1 +
cycle_time_full_intx_ptyp_f +
within_quarter_month_num_c +
team_size_c +
q_unclosed_c s(month_num_c, k = 5) +
s(month_num_c, by = team_size_c, k = 5) +
s(month_num_c, by = q_unclosed_c, k = 5) +
+
wi_avg_coding_days_per_week +
yr_avg_avg_coding_days_per_week_c +
wi_total_merged_prs +
yr_avg_total_merged_prs_c +
wi_defect_tickets_pct_indiv +
yr_avg_defect_tickets_pct_indiv_c +
wi_degree_centrality_monthly_100 +
yr_avg_degree_centrality_monthly_100_c +
wi_comments_per_pr +
yr_avg_comments_per_pr_c s(month_num_c, by = yr_avg_avg_coding_days_per_week_c, k = 3) +
s(month_num_c, by = yr_avg_total_merged_prs_c, k = 3) +
s(month_num_c, by = yr_avg_defect_tickets_pct_indiv_c, k = 3) +
s(month_num_c, by = yr_avg_degree_centrality_monthly_100_c, k = 3) +
s(month_num_c, by = yr_avg_comments_per_pr_c, k = 3) +
1 | org_id_fac) + (1 | org_id_fac:user_id_fac),
(~ 1 + (1 | org_id_fac),
shape family = brms::weibull())
<- bf(median_cycle_time_s ~ 1 +
cycle_time_full_ptyp_f +
within_quarter_month_num_c +
team_size_c +
q_unclosed_c s(month_num_c, k = 5) +
s(month_num_c, by = team_size_c, k = 5) +
s(month_num_c, by = q_unclosed_c, k = 5) +
+
wi_avg_coding_days_per_week +
yr_avg_avg_coding_days_per_week_c +
wi_total_merged_prs +
yr_avg_total_merged_prs_c +
wi_defect_tickets_pct_indiv +
yr_avg_defect_tickets_pct_indiv_c +
wi_degree_centrality_monthly_100 +
yr_avg_degree_centrality_monthly_100_c +
wi_comments_per_pr +
yr_avg_comments_per_pr_c 1 | org_id_fac) + (1 | org_id_fac:user_id_fac),
(~ 1 + (1 | org_id_fac),
shape family = brms::weibull())
<- bf(median_cycle_time_s ~ 1 +
growth_ptyp_f +
team_size_c +
q_unclosed_c +
within_quarter_month_num_c s(month_num_c, k = 5) +
1 | org_id_fac) + (1 | org_id_fac:user_id_fac),
(~ 1 + (1 | org_id_fac),
shape family = brms::weibull())
<- bf(median_cycle_time_s ~ 1 +
growth_ptyp_shape_f 1 | org_id_fac) + (1 | org_id_fac:user_id_fac),
(~ 1 + (1 | org_id_fac),
shape family = brms::weibull())
Here I get a list of all the parameters and the default priors as a starting point (not evaluated here because the output is so long).
In [19]:
get_prior(cycle_time_full_intx_lin_remonth_yrpred_f, data = month_level_yrpred_dt)
get_prior(cycle_time_full_intx_lin_remonth_f, data = month_level_dt)
get_prior(cycle_time_full_intx_lin_f, data = month_level_dt)
get_prior(cycle_time_full_intx_nlq_f, data = month_level_dt)
get_prior(cycle_time_full_intx_f, data = month_level_dt)
get_prior(cycle_time_full_intx_ptyp_f, data = month_level_dt)
get_prior(cycle_time_full_ptyp_f, data = month_level_dt)
get_prior(growth_ptyp_f, data = month_level_dt)
get_prior(growth_ptyp_shape_f, data = month_level_dt)
In [20]:
<- c(prior('normal(0,.1)', class = 'b'),
cycle_time_full_intx_lin_remonth_yrpred_prior prior('normal(14, 2.5)', class = 'Intercept'),
prior('normal(.29, .25)', dpar = 'shape', class = 'Intercept'),
prior('normal(.25, .5)', class = 'sd', group = 'org_id_fac', coef = 'Intercept'),
prior('normal(.05, .1)', class = 'sd', group = 'org_id_fac', coef = 'month_num_c'),
prior('lkj(1.5)', class = 'cor', group = 'org_id_fac'),
prior('normal(.15, .25)', class = 'sd', group = 'org_id_fac:user_id_fac', coef = 'Intercept'),
prior('normal(.15, .25)', class = 'sd', group = 'org_id_fac:user_id_fac', coef = 'month_num_c'),
prior('lkj(1.5)', class = 'cor', group = 'org_id_fac:user_id_fac'),
prior('normal(.15, .15)', class = 'sd', group = 'org_id_fac', dpar = 'shape', coef = 'Intercept'),
prior('normal(0, 1)', class = 'sds', coef = 's(month_num_c, k = 5)'))
<- c(prior('normal(0,.1)', class = 'b'),
cycle_time_full_intx_lin_remonth_prior prior('normal(14, 2.5)', class = 'Intercept'),
prior('normal(.1, .75)', dpar = 'shape', class = 'Intercept'),
prior('normal(.25, .5)', class = 'sd', group = 'org_id_fac', coef = 'Intercept'),
prior('normal(.05, .1)', class = 'sd', group = 'org_id_fac', coef = 'month_num_c'),
prior('lkj(1.5)', class = 'cor', group = 'org_id_fac'),
prior('normal(.15, .25)', class = 'sd', group = 'org_id_fac:user_id_fac', coef = 'Intercept'),
prior('normal(.15, .25)', class = 'sd', group = 'org_id_fac:user_id_fac', coef = 'month_num_c'),
prior('lkj(1.5)', class = 'cor', group = 'org_id_fac:user_id_fac'),
prior('normal(.15, .15)', class = 'sd', group = 'org_id_fac', dpar = 'shape', coef = 'Intercept'),
prior('normal(0, .5)', class = 'sds', coef = 's(q_unclosed_c, k = 5)'),
prior('normal(0, .5)', class = 'sds', coef = 's(month_num_c, k = 5)'),
prior('normal(0, .25)', class = 'sds', coef = 's(month_num_c, by = q_unclosed_c, k = 5)'),
prior('normal(0, .25)', class = 'sds', coef = 's(month_num_c, by = team_size_c, k = 5)'))
<- c(prior('normal(0,.1)', class = 'b'),
cycle_time_full_intx_lin_prior prior('normal(14, 2.5)', class = 'Intercept'),
prior('normal(.29, .25)', dpar = 'shape', class = 'Intercept'),
prior('normal(.25, .5)', class = 'sd', group = 'org_id_fac', coef = 'Intercept'),
# prior('normal(.05, .1)', class = 'sd', group = 'org_id_fac', coef = 'month_num_c'),
prior('normal(.15, .25)', class = 'sd', group = 'org_id_fac:user_id_fac', coef = 'Intercept'),
# prior('normal(.15, .25)', class = 'sd', group = 'org_id_fac:user_id_fac', coef = 'month_num_c'),
# prior('normal(.15, .25)', class = 'sd', group = 'org_id_fac:team_id_fac', coef = 'Intercept'),
prior('normal(.15, .15)', class = 'sd', group = 'org_id_fac', dpar = 'shape', coef = 'Intercept'),
prior('normal(0, 1)', class = 'sds', coef = 's(month_num_c, k = 5)'))
<- c(prior('normal(0,.1)', class = 'b'),
cycle_time_full_intx_nlq_prior prior('normal(14, 2.5)', class = 'Intercept'),
prior('normal(.1, .75)', dpar = 'shape', class = 'Intercept'),
prior('normal(.25, .5)', class = 'sd', group = 'org_id_fac', coef = 'Intercept'),
prior('normal(.05, .1)', class = 'sd', group = 'org_id_fac', coef = 'month_num_c'),
prior('lkj(1.5)', class = 'cor', group = 'org_id_fac'),
prior('normal(.15, .25)', class = 'sd', group = 'org_id_fac:user_id_fac', coef = 'Intercept'),
prior('normal(.15, .25)', class = 'sd', group = 'org_id_fac:user_id_fac', coef = 'month_num_c'),
prior('lkj(1.5)', class = 'cor', group = 'org_id_fac:user_id_fac'),
prior('normal(.15, .15)', class = 'sd', group = 'org_id_fac', dpar = 'shape', coef = 'Intercept'),
prior('normal(0, .5)', class = 'sds', coef = 's(q_unclosed_c, k = 5)'),
prior('normal(0, .5)', class = 'sds', coef = 's(month_num_c, k = 5)'),
prior('normal(0, .25)', class = 'sds', coef = 's(month_num_c, by = q_unclosed_c, k = 5)'),
prior('normal(0, .25)', class = 'sds', coef = 's(month_num_c, by = team_size_c, k = 5)'),
prior('normal(0, .25)', class = 'sds', coef = 's(month_num_c, by = yr_avg_avg_coding_days_per_week_c, k = 3)'),
prior('normal(0, .25)', class = 'sds', coef = 's(month_num_c, by = yr_avg_total_merged_prs_c, k = 3)'),
prior('normal(0, .25)', class = 'sds', coef = 's(month_num_c, by = yr_avg_defect_tickets_pct_indiv_c, k = 3)'),
prior('normal(0, .25)', class = 'sds', coef = 's(month_num_c, by = yr_avg_degree_centrality_monthly_100_c, k = 3)'),
prior('normal(0, .25)', class = 'sds', coef = 's(month_num_c, by = yr_avg_comments_per_pr_c, k = 3)'))
<- c(prior('normal(0,.1)', class = 'b'),
cycle_time_full_intx_prior prior('normal(14, 2.5)', class = 'Intercept'),
prior('normal(.1, .75)', dpar = 'shape', class = 'Intercept'),
prior('normal(.25, .5)', class = 'sd', group = 'org_id_fac', coef = 'Intercept'),
prior('normal(.05, .1)', class = 'sd', group = 'org_id_fac', coef = 'month_num_c'),
prior('lkj(1.5)', class = 'cor', group = 'org_id_fac'),
prior('normal(.15, .25)', class = 'sd', group = 'org_id_fac:user_id_fac', coef = 'Intercept'),
prior('normal(.15, .25)', class = 'sd', group = 'org_id_fac:user_id_fac', coef = 'month_num_c'),
prior('lkj(1.5)', class = 'cor', group = 'org_id_fac:user_id_fac'),
prior('normal(.15, .15)', class = 'sd', group = 'org_id_fac', dpar = 'shape', coef = 'Intercept'),
prior('normal(0, .5)', class = 'sds', coef = 's(month_num_c, k = 5)'),
prior('normal(0, .25)', class = 'sds', coef = 's(month_num_c, by = q_unclosed_c, k = 5)'),
prior('normal(0, .25)', class = 'sds', coef = 's(month_num_c, by = team_size_c, k = 5)'),
prior('normal(0, .25)', class = 'sds', coef = 's(month_num_c, by = yr_avg_avg_coding_days_per_week_c, k = 3)'),
prior('normal(0, .25)', class = 'sds', coef = 's(month_num_c, by = yr_avg_total_merged_prs_c, k = 3)'),
prior('normal(0, .25)', class = 'sds', coef = 's(month_num_c, by = yr_avg_defect_tickets_pct_indiv_c, k = 3)'),
prior('normal(0, .25)', class = 'sds', coef = 's(month_num_c, by = yr_avg_degree_centrality_monthly_100_c, k = 3)'),
prior('normal(0, .25)', class = 'sds', coef = 's(month_num_c, by = yr_avg_comments_per_pr_c, k = 3)'))
<- c(prior('normal(0,.1)', class = 'b'),
cycle_time_full_intx_ptyp_prior prior('normal(14, 2.5)', class = 'Intercept'),
prior('normal(.1, .75)', dpar = 'shape', class = 'Intercept'),
prior('normal(.15, .25)', class = 'sd', group = 'org_id_fac', coef = 'Intercept'),
prior('normal(.15, .25)', class = 'sd', group = 'org_id_fac:user_id_fac', coef = 'Intercept'),
prior('normal(.15, .15)', class = 'sd', group = 'org_id_fac', dpar = 'shape', coef = 'Intercept'),
prior('normal(0, .5)', class = 'sds', coef = 's(month_num_c, k = 5)'),
prior('normal(0, .25)', class = 'sds', coef = 's(month_num_c, by = q_unclosed_c, k = 5)'),
prior('normal(0, .25)', class = 'sds', coef = 's(month_num_c, by = team_size_c, k = 5)'),
prior('normal(0, .25)', class = 'sds', coef = 's(month_num_c, by = yr_avg_avg_coding_days_per_week_c, k = 3)'),
prior('normal(0, .25)', class = 'sds', coef = 's(month_num_c, by = yr_avg_total_merged_prs_c, k = 3)'),
prior('normal(0, .25)', class = 'sds', coef = 's(month_num_c, by = yr_avg_defect_tickets_pct_indiv_c, k = 3)'),
prior('normal(0, .25)', class = 'sds', coef = 's(month_num_c, by = yr_avg_degree_centrality_monthly_100_c, k = 3)'),
prior('normal(0, .25)', class = 'sds', coef = 's(month_num_c, by = yr_avg_comments_per_pr_c, k = 3)'))
<- c(prior('normal(0,.1)', class = 'b'),
cycle_time_full_ptyp_prior prior('normal(14, 2.5)', class = 'Intercept'),
prior('normal(.1, .75)', dpar = 'shape', class = 'Intercept'),
prior('normal(.15, .25)', class = 'sd', group = 'org_id_fac', coef = 'Intercept'),
prior('normal(.15, .25)', class = 'sd', group = 'org_id_fac:user_id_fac', coef = 'Intercept'),
prior('normal(.15, .15)', class = 'sd', group = 'org_id_fac', dpar = 'shape', coef = 'Intercept'),
prior('normal(0, .5)', class = 'sds', coef = 's(month_num_c, k = 5)'),
prior('normal(0, .25)', class = 'sds', coef = 's(month_num_c, by = q_unclosed_c, k = 5)'),
prior('normal(0, .25)', class = 'sds', coef = 's(month_num_c, by = team_size_c, k = 5)'))
<- c(prior('normal(0,.1)', class = 'b'),
growth_ptyp_prior prior('normal(14, 2.5)', class = 'Intercept'),
prior('normal(.1, .75)', dpar = 'shape', class = 'Intercept'),
prior('normal(.15, .25)', class = 'sd', group = 'org_id_fac', coef = 'Intercept'),
prior('normal(.15, .25)', class = 'sd', group = 'org_id_fac:user_id_fac', coef = 'Intercept'),
prior('normal(.15, .15)', class = 'sd', group = 'org_id_fac', dpar = 'shape', coef = 'Intercept'),
prior('normal(0, .5)', class = 'sds', coef = 's(month_num_c, k = 5)'))
<- c(prior('normal(14, 2.5)', class = 'Intercept'),
growth_ptyp_shape_prior prior('normal(.1, .75)', dpar = 'shape', class = 'Intercept'),
prior('normal(.15, .25)', class = 'sd', group = 'org_id_fac', coef = 'Intercept'),
prior('normal(.15, .25)', class = 'sd', group = 'org_id_fac:user_id_fac', coef = 'Intercept'),
prior('normal(.15, .15)', class = 'sd', group = 'org_id_fac', dpar = 'shape', coef = 'Intercept'))
<- cycle_time_full_intx_ptyp_prior cycle_time_full_intx_prior
Non-linear Predictors interacting with non-linear unclosed control (Full data)
In [21]:
<- brm(formula = cycle_time_full_intx_nlq_f, data = month_level_dt, prior = cycle_time_full_intx_nlq_prior,
cycle_time_full_intx_nlq_prior_m file = 'cycle_time_full_intx_nlq_prior',
sample_prior = 'only',
chains = 4, iter = 750, warmup = 500,
cores = 4,
init = 0,
backend = 'cmdstanr', silent = 0, refresh = 500,
file_refit = 'never',
stan_model_args = list(cpp_options = list(
PRECOMPILED_HEADERS = FALSE, STAN_CPP_OPTIMS = TRUE,
"CXXFLAGS+= -O1 -march=native -mtune=native",
"CXXFLAGS+= -DEIGEN_USE_BLAS -DEIGEN_USE_LAPACKE",
"LDLIBS += -lblas -llapack -llapacke"),
force_recompile = TRUE))
summary(cycle_time_full_intx_nlq_prior_m)
<- posterior_predict(cycle_time_full_intx_nlq_prior_m)
cycle_time_full_intx_nlq_prior_pred <- as.data.table(t(cycle_time_full_intx_nlq_prior_pred))
cycle_time_full_intx_nlq_prior_dt <- melt(cycle_time_full_intx_nlq_prior_dt,
cycle_time_full_intx_nlq_prior_l measure.vars = names(cycle_time_full_intx_nlq_prior_dt),
variable.name = 'iter', value.name = 'y_hat')
ggplot(cycle_time_full_intx_nlq_prior_l, aes(x = y_hat)) +
geom_histogram(aes(y = after_stat(density)), bins = 200, fill = 'blue', alpha = 1) +
geom_histogram(aes(x = median_cycle_time_s, y = after_stat(density)), data = month_level_dt,
bins = 200, fill = 'gray', alpha = .8) +
scale_x_log10() +
scale_y_continuous(transform = scales::modulus_trans(p = -10)) +
theme_cleanget_prior(cycle_time_full_intx_nlq_prior_m)
# growth_ptyp_shape_prior_pp <- pp_check(growth_ptyp_shape_prior_m)
# growth_ptyp_shape_prior_pp + scale_y_log10()
In [22]:
# Get objects matching the pattern
<- ls(pattern = '_prior_(l|dt|pred)$', envir = .GlobalEnv)
matching_objects
# Create a logical vector indicating which objects inherit from 'data.frame'
<- sapply(matching_objects, function(x) inherits(get(x, envir = .GlobalEnv), 'data.frame') | inherits(get(x, envir = .GlobalEnv), 'array'))
is_data
if (length(is_data) > 0) rm(list = matching_objects[is_data])
rm(list = ls(pattern = '_m$'))
gc(verbose = TRUE, full = TRUE, reset = TRUE)
In [23]:
<- function(N_org, N_user){
make_init return(
function(){
return(
list(Intercept = 14,
b = rep(0, 12),
bs = rep(.1, 16),
zs_1_1 = rep(.1, 3),
sds_1 = .1,
zs_2_1 = rep(.1, 3),
sds_2 = .1,
zs_3_1 = rep(.1, 3),
sds_3 = .1,
zs_4_1 = rep(.1, 3),
sds_4 = .1,
zs_5_1 = rep(.1, 1),
sds_5 = .1,
zs_6_1 = rep(.1, 1),
sds_6 = .1,
zs_7_1 = rep(.1, 1),
sds_7 = .1,
zs_8_1 = rep(.1, 1),
sds_8 = .1,
zs_9_1 = rep(.1, 1),
sds_9 = .1,
Intercept_shape = .29,
#org
sd_1 = rep(.1, 2),
z_1 = matrix(rep(0, N_org*2), nrow = 2),
L_1 = t(chol(matrix(c(1, .2, .2, 1), nrow = 2, byrow = TRUE))),
#user
sd_2 = rep(.1, 2),
z_2 = matrix(rep(0, N_user*2), nrow = 2),
L_2 = t(chol(matrix(c(1, .2, .2, 1), nrow = 2, byrow = TRUE))),
#org
sd_3 = .1,
z_3 = matrix(rep(0, N_org), nrow = 1))
)
}
)
}<- make_init(length(unique(month_level_dt$org_id_fac)),
init length(unique(month_level_dt$user_id_fac)))
<- brm(formula = cycle_time_full_intx_nlq_f, data = month_level_dt, prior = cycle_time_full_intx_nlq_prior,
cycle_time_full_intx_nlq_m file = 'cycle_time_full_intx_nlq',
# control = list(adapt_delta = .99, max_treedepth = 20),
sample_prior = 'no',
chains = 4, iter = 2000, warmup = 1000,
cores = 1, threads = 19,
init = init,
backend = 'cmdstanr', silent = 0, refresh = 1,
file_refit = 'never',
stan_model_args = list(cpp_options = list(
PRECOMPILED_HEADERS = FALSE, STAN_CPP_OPTIMS = TRUE,
"CXXFLAGS+= -O1 -march=native -mtune=native",
"CXXFLAGS+= -DEIGEN_USE_BLAS -DEIGEN_USE_LAPACKE",
"LDLIBS += -lblas -llapack -llapacke"),
force_recompile = TRUE))
#summary(cycle_time_full_intx_nlq_m)
In [24]:
traceplot_manyvars(cycle_time_full_intx_nlq_m, variables = variables(cycle_time_full_intx_nlq_m)[1:46])
In [25]:
<- function(x) {
apa |>
x tab_options(
table.border.top.color = "white",
heading.title.font.size = px(16),
column_labels.border.top.width = 3,
column_labels.border.top.color = "black",
column_labels.border.bottom.width = 3,
column_labels.border.bottom.color = "black",
table_body.border.bottom.color = "black",
table.border.bottom.color = "white",
table.width = pct(100),
table.background.color = "white"
|>
) cols_align(align="auto") %>%
tab_style(
style = list(
cell_borders(
sides = c("top", "bottom"),
color = "white",
weight = px(1)
),cell_fill(color = "white", alpha = NULL)
),locations = cells_body(
columns = everything(),
rows = everything()
)
)
}
<- brms_to_draws(cycle_time_full_intx_nlq_m,
cycle_time_full_intx_nlq_m_draws variables = variables(cycle_time_full_intx_nlq_m)[1:30])
<- parameters::model_parameters(cycle_time_full_intx_nlq_m_draws,
cycle_time_full_intx_nlq_m_pars centrality = 'median', ci = .95, ci_method = 'hdi')
gt(cycle_time_full_intx_nlq_m_pars) |>
tab_header(title = "Model parameters") |>
fmt_number(decimals = 2, columns = c('Median', 'CI_low', 'CI_high')) |>
fmt_number(columns = 'pd', scale_by = 100, decimals = 0, pattern = '{x}%') |>
cols_label(Median = 'Posterior Median',
CI_low = '2.5% HDI',
CI_high = '97.5% HDI',
pd = md('P(Sign)')) |>
apa()
In [26]:
<- pp_check(cycle_time_full_intx_nlq_m)
cycle_time_full_intx_nlq_ppcheck +
cycle_time_full_intx_nlq_ppcheck scale_x_log10()
+
cycle_time_full_intx_nlq_ppcheck coord_cartesian(x = c(0, 7.5e6))
In [27]:
plot_marginal_effect(cycle_time_full_intx_nlq_m, 'month_num_c', binwidth = list(c(1, 10)))
plot_marginal_effect(cycle_time_full_intx_nlq_m, 'within_quarter_month_num_c', binwidth = list(c(.25, 10)))
<- unique(round(month_level_dt$q_unclosed - month_level_dt$q_unclosed_c, 5))[[1]]
q_unclosed_center plot_marginal_effect(cycle_time_full_intx_nlq_m, list('month_num_c' = c(1, seq(3, 12, 3)) - 7,
'q_unclosed_c' = round(
quantile(cycle_time_full_intx_nlq_m$data$q_unclosed_c,
c(.1, .5, .9)),
1
translabels.x = list(create_label_transform(7),
)), create_label_transform(q_unclosed_center,
inverse_logit = TRUE)))
plot_marginal_effect(cycle_time_full_intx_nlq_m, c('month_num_c', 'team_size_c'))
<- get_center(month_level_dt, 'yr_avg_avg_coding_days_per_week')
yr_avg_avg_coding_days_per_week_center plot_marginal_effect(cycle_time_full_intx_nlq_m,
'wi_avg_coding_days_per_week',
binwidth = list(c(.5, 10)),
plot_labels = list(labs(x = 'Coding days per week\n(within-person deviation)',
y = 'Median cycle time (days)'))) +
plot_marginal_effect(cycle_time_full_intx_nlq_m,
list('month_num_c' = c(1, 3, 6, 9, 12) - 7,
'yr_avg_avg_coding_days_per_week_c' = c(1, 3, 5, 7) - yr_avg_avg_coding_days_per_week_center),
binwidth = list(c(1, 10), c(.5, 10)),
translabels.x = list(create_label_transform(7),
create_label_transform(yr_avg_avg_coding_days_per_week_center)),
plot_labels = list(labs(x = 'Month',
y = 'Median cycle time (days)',
fill = 'Avg coding\ndays/week',
color = 'Avg coding\ndays/week'),
labs(y = 'Median cycle time (days)',
x = 'Average coding days per week',
fill = 'Month',
color = 'Month')),
patchwork_args = list(nrow = 2)) +
plot_layout(design = 'AB')
<- get_center(month_level_dt, 'yr_avg_total_merged_prs')
yr_avg_total_merged_prs_center plot_marginal_effect(cycle_time_full_intx_nlq_m,
'wi_total_merged_prs',
binwidth = list(c(.5, 10)),
plot_labels = list(labs(x = 'Total merged PRs\n(within-person deviation)',
y = 'Median cycle time (days)'))) +
plot_marginal_effect(cycle_time_full_intx_nlq_m,
list('month_num_c' = c(1, 3, 6, 9, 12) - 7,
'yr_avg_total_merged_prs_c' = c(1:4, 100) - yr_avg_total_merged_prs_center),
translabels.x = list(create_label_transform(7),
create_label_transform(yr_avg_total_merged_prs_center)),
plot_labels = list(labs(x = 'Month',
y = 'Median cycle time (days)',
fill = 'Total merged PRs',
color = 'Total merged PRs'),
labs(y = 'Median cycle time (days)',
x = 'Total merged PRs',
fill = 'Month',
color = 'Month')),
patchwork_args = list(nrow = 2)) +
plot_layout(design = 'AB')
<- get_center(month_level_dt, 'yr_avg_defect_tickets_pct_indiv')
yr_avg_defect_tickets_pct_indiv_center plot_marginal_effect(cycle_time_full_intx_nlq_m,
'wi_defect_tickets_pct_indiv',
binwidth = list(c(10, 10)),
plot_labels = list(labs(x = 'Defect ticket percent\n(within-person deviation)',
y = 'Median cycle time (days)'))) +
plot_marginal_effect(cycle_time_full_intx_nlq_m,
list('month_num_c' = c(1, 3, 6, 9, 12) - 7,
'yr_avg_defect_tickets_pct_indiv_c' = c(0, 25, 50, 75, 100) - yr_avg_defect_tickets_pct_indiv_center),
translabels.x = list(create_label_transform(7),
create_label_transform(yr_avg_defect_tickets_pct_indiv_center)),
plot_labels = list(labs(x = 'Month',
y = 'Median cycle time (days)',
fill = 'Defect ticket percent',
color = 'Defect ticket percent'),
labs(y = 'Median cycle time (days)',
x = 'Defect ticket percent',
fill = 'Month',
color = 'Month')),
patchwork_args = list(nrow = 2)) +
plot_layout(design = 'AB')
<- get_center(month_level_dt, 'yr_avg_degree_centrality_monthly_100')
yr_avg_degree_centrality_monthly_100_center plot_marginal_effect(cycle_time_full_intx_nlq_m,
'wi_degree_centrality_monthly_100',
binwidth = list(c(10,10)),
plot_labels = list(labs(x = 'Degree centrality\n(within-person deviation)',
y = 'Median cycle time (days)'))) +
plot_marginal_effect(cycle_time_full_intx_nlq_m,
list('month_num_c' = c(1, 3, 6, 9, 12) - 7,
'yr_avg_degree_centrality_monthly_100_c' = c(0, 25, 50, 75, 100) - yr_avg_degree_centrality_monthly_100_center),
translabels.x = list(create_label_transform(7),
create_label_transform(yr_avg_degree_centrality_monthly_100_center)),
plot_labels = list(labs(x = 'Month',
y = 'Median cycle time (days)',
fill = 'Degree centrality',
color = 'Degree centrality'),
labs(y = 'Median cycle time (days)',
x = 'Degree centrality',
fill = 'Month',
color = 'Month')),
patchwork_args = list(nrow = 2)) +
plot_layout(design = 'AB')
<- get_center(month_level_dt, 'yr_avg_comments_per_pr')
yr_avg_comments_per_pr_center plot_marginal_effect(cycle_time_full_intx_nlq_m,
'wi_comments_per_pr',
binwidth = list(c(40, 10)),
plot_labels = list(labs(x = 'Comments per PR\n(within-person deviation)',
y = 'Median cycle time (days)'))) +
plot_marginal_effect(cycle_time_full_intx_nlq_m,
list('month_num_c' = c(1, 3, 6, 9, 12) - 7,
'yr_avg_comments_per_pr_c' = c(1, 5, 7, 12, 18) - yr_avg_comments_per_pr_center),
translabels.x = list(create_label_transform(7),
create_label_transform(yr_avg_comments_per_pr_center)),
plot_labels = list(labs(x = 'Month',
y = 'Median cycle time (days)',
fill = 'Comments per PR',
color = 'Comments per PR'),
labs(y = 'Median cycle time (days)',
x = 'Comments per PR',
fill = 'Month',
color = 'Month')),
patchwork_args = list(nrow = 2)) +
plot_layout(design = 'AB')
In [28]:
<- posterior_predict(cycle_time_full_intx_nlq_m, re_formula = NULL, ndraws = 500)
cycle_time_full_intx_nlq_pp_draws <- cbind(as.data.table(cycle_time_full_intx_nlq_m$data),
cycle_time_full_intx_nlq_pp_draws_w_data as.data.table(t(cycle_time_full_intx_nlq_pp_draws)))
<- melt(cycle_time_full_intx_nlq_pp_draws_w_data,
cycle_time_full_intx_nlq_pp_draws_l measure.vars = c(paste0('V', 1:500)))
:= .N, by = c('org_id_fac', 'variable')]
cycle_time_full_intx_nlq_pp_draws_l[, N
<- cycle_time_full_intx_nlq_pp_draws_l[N > 10]
cycle_time_full_intx_nlq_pp_draws_l_filtered <- scales::transform_modulus(p = .325)
mod_trans := mod_trans$transform(value)]
cycle_time_full_intx_nlq_pp_draws_l_filtered[, transformed_value <- cycle_time_full_intx_nlq_pp_draws_l_filtered[
cycle_time_full_intx_nlq_pp_draws_l_density > 10,
N
{<- density(transformed_value, n = 50)
d list(x = d$x, y = d$y, median_x = suppressWarnings(approx(cumsum(d$y)/sum(d$y), d$x, xout=0.5)$y), N = unique(N))
}, = c('org_id_fac', 'variable')
by
]:= median(median_x), by = c('org_id_fac')] cycle_time_full_intx_nlq_pp_draws_l_density[, median_median_x
In [29]:
<- 4
N_col
<- function(ranks, n_rows, n_cols) {
get_col <- floor(n_rows/n_cols) # base rows per column
base <- n_rows %% n_cols # how many columns need an extra row
extra
<- c(0, cumsum(c(rep(base + 1, extra), rep(base, n_cols - extra))))
cutoffs
# Map to columns based on ranks
findInterval(ranks - 1, cutoffs)
}
<- unique(cycle_time_full_intx_nlq_pp_draws_l_density[, c('org_id_fac', 'N', 'median_median_x')])
unique_orders := frankv(.SD, ties.method = 'dense'), .SDcols = c('N', 'org_id_fac')]
unique_orders[, N_rank := get_col(N_rank, .N, 4) - 1]
unique_orders[, col_num setorder(unique_orders, col_num, median_median_x, org_id_fac)
:= .I]
unique_orders[, I := .N:1, by = col_num]
unique_orders[, row_num
<- cycle_time_full_intx_nlq_pp_draws_l_density[
cycle_time_full_intx_nlq_pp_draws_l_density_orders c('org_id_fac', 'col_num', 'row_num', 'I')],
unique_orders[, = 'org_id_fac']
on := reorder(org_id_fac, I, decreasing = FALSE)]
cycle_time_full_intx_nlq_pp_draws_l_density_orders[, org_id_fac
<- max(cycle_time_full_intx_nlq_pp_draws_l_density_orders$y)/4
y_separation <- 3
y_mag := (y*y_mag + y_separation*row_num)]
cycle_time_full_intx_nlq_pp_draws_l_density_orders[, y_ridges := y_separation*row_num]
cycle_time_full_intx_nlq_pp_draws_l_density_orders[, y_ridges_min
<- mod_trans$transform(c(60*60*12*1, 60*60*24*7, 60*60*24*7*4, 60*60*24*1*7*13, 60*60*24*1*7*52))
breaks_seconds_to_days names(breaks_seconds_to_days) <- c("12h", "1w", "4w", "1q", "1y")
<- cycle_time_full_intx_nlq_pp_draws_l_density_orders[variable %in% paste0('V', 1:50)]
cycle_time_full_intx_nlq_pp_draws_l_density_orders_filtered
ggplot(cycle_time_full_intx_nlq_pp_draws_l_density_orders_filtered, aes(x = x, y = y_ridges, group = interaction(org_id_fac, variable))) +
geom_segment(x = breaks_seconds_to_days['4w'], xend = breaks_seconds_to_days['4w'],
y = 0, yend = max(cycle_time_full_intx_nlq_pp_draws_l_density_orders_filtered$y_ridges),
color = viridis::mako(1, begin = .2),
linewidth = .25) +
geom_ribbon(aes(ymin = y_ridges_min, ymax = y_ridges, fill = N), alpha = 0.25) +
geom_line(linewidth = .05, alpha = .25, color = viridis::mako(1)) +
scale_x_continuous(breaks = breaks_seconds_to_days, labels = names(breaks_seconds_to_days)) +
coord_cartesian(x = c(0, quantile(cycle_time_full_intx_nlq_pp_draws_l_density_orders_filtered$x, .975))) +
facet_grid(~col_num) +
theme_minimal() +
theme(
strip.text.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.x = element_line(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.y = element_blank(),
strip.text = element_blank(),
axis.text.x = element_text(angle = 0, hjust = .5, family = font_name, size = 6)
+
) scale_fill_viridis_c(option = 'mako', transform = scales::transform_modulus(p = .1),
breaks = c(min(cycle_time_full_intx_nlq_pp_draws_l_density_orders_filtered$N),
100, 1000,
max(cycle_time_full_intx_nlq_pp_draws_l_density_orders_filtered$N)),
labels = format(c(min(cycle_time_full_intx_nlq_pp_draws_l_density_orders_filtered$N),
100, 1000,
max(cycle_time_full_intx_nlq_pp_draws_l_density_orders_filtered$N)), big.mark = ","),
begin = .2,
guide = guide_colorbar(
barwidth = .5,
barheight = 5
+
)) labs(x = "Cycle time", y = "Posterior prediction probability density")
ggsave('cycle_time_full_intx_nlq_pp_check.png', width = 6, height = 4, dpi = 300, units = 'in')
In [30]:
<- cycle_time_full_intx_nlq_pp_draws_l_filtered[
cycle_time_full_intx_nlq_pp_draws_l_summary_density > 10,
N
{<- density(transformed_value, n = 500)
d list(x = d$x, y = d$y)
}, = c('variable')
by
]
<- as.data.table(cycle_time_full_intx_nlq_m$data)
cycle_time_full_intx_nlq_pp_draws_l_summary_data := .N, by = org_id_fac]
cycle_time_full_intx_nlq_pp_draws_l_summary_data[, N <- cycle_time_full_intx_nlq_pp_draws_l_summary_data[N > 10]
cycle_time_full_intx_nlq_pp_draws_l_summary_data := mod_trans$transform(median_cycle_time_s)]
cycle_time_full_intx_nlq_pp_draws_l_summary_data[, median_cycle_time_s_modulus
<- seq(min(cycle_time_full_intx_nlq_pp_draws_l_summary_data$median_cycle_time_s_modulus),
breakpoints max(cycle_time_full_intx_nlq_pp_draws_l_summary_data$median_cycle_time_s_modulus), length.out = 50)
:= findInterval(median_cycle_time_s_modulus, breakpoints)]
cycle_time_full_intx_nlq_pp_draws_l_summary_data[, intervals
<- diff(breakpoints)[1] # Width of each bin
bin_width <- nrow(cycle_time_full_intx_nlq_pp_draws_l_summary_data)
total_n
<- cycle_time_full_intx_nlq_pp_draws_l_summary_data[, .(density = .N / (bin_width*total_n)), by = intervals]
cycle_time_full_intx_nlq_pp_draws_l_summary_data_hist setorder(cycle_time_full_intx_nlq_pp_draws_l_summary_data_hist, intervals)
<- data.table(bin_value = breakpoints[-1], intervals = 1:length(breakpoints[-1]))
breakpoints_dt <- cycle_time_full_intx_nlq_pp_draws_l_summary_data_hist[breakpoints_dt, on = 'intervals']
cycle_time_full_intx_nlq_pp_draws_l_summary_data_hist := fifelse(is.na(density), 0, density)]
cycle_time_full_intx_nlq_pp_draws_l_summary_data_hist[, density
ggplot(cycle_time_full_intx_nlq_pp_draws_l_summary_density, aes(x = x, y = y, group = variable)) +
geom_col(data = cycle_time_full_intx_nlq_pp_draws_l_summary_data_hist, aes(x = bin_value, y = density, group = NULL),
fill = viridis::mako(1, begin = .35)) +
geom_ribbon(aes(ymax = y), ymin = 0, fill = viridis::mako(1, begin = .65), alpha = 0.025) +
geom_line(linewidth = .015, alpha = .5, color = viridis::mako(1)) +
scale_x_continuous(breaks = breaks_seconds_to_days, labels = names(breaks_seconds_to_days)) +
coord_cartesian(x = c(0, quantile(cycle_time_full_intx_nlq_pp_draws_l_summary_density$x, .60))) +
theme_minimal() +
theme(
strip.text.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.x = element_line(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.y = element_blank(),
strip.text = element_blank(),
axis.text.x = element_text(angle = 0, hjust = .5, family = font_name, size = 6)
+
) labs(x = "Cycle time", y = "Posterior prediction\nprobability density")
ggsave('cycle_time_full_intx_nlq_pp_check_summary.png', width = 4, height = 3, dpi = 300, units = 'in')
In [31]:
<- 'cycle_time_full_intx_nlq_draws_re_sum.rds'
re_draws_org_fn <- 'cycle_time_full_intx_nlq_draws_uidre_sum.rds'
re_draws_id_fn
if(!file.exists(re_draws_org_fn)){
<- spread_draws(cycle_time_full_intx_nlq_m, r_org_id_fac[org_id_fac,par])
cycle_time_full_intx_nlq_draws_re <- posterior::summarize_draws(cycle_time_full_intx_nlq_draws_re, median, ~posterior::quantile2(.x, probs = c(1-.8, (1+.8))/2), ~posterior::quantile2(.x, probs = c(1-.5, (1+.5))/2))
cycle_time_full_intx_nlq_draws_re_sum saveRDS(cycle_time_full_intx_nlq_draws_re_sum, re_draws_org_fn)
else {
} <- readRDS(re_draws_org_fn)
cycle_time_full_intx_nlq_draws_re_sum
}
if(!file.exists(re_draws_id_fn)){
<- spread_draws(cycle_time_full_intx_nlq_m, `r_.*user_id_fac`[org_id_fac,par], regex = TRUE)
cycle_time_full_intx_nlq_draws_uidre <- posterior::summarize_draws(cycle_time_full_intx_nlq_draws_uidre, median, ~posterior::quantile2(.x, probs = c(1-.8, (1+.8))/2), ~posterior::quantile2(.x, probs = c(1-.5, (1+.5))/2))
cycle_time_full_intx_nlq_draws_uidre_sum <- as.data.table(cycle_time_full_intx_nlq_draws_uidre_sum)
cycle_time_full_intx_nlq_draws_uidre_sum saveRDS(cycle_time_full_intx_nlq_draws_uidre_sum, re_draws_id_fn)
else {
} <- readRDS(re_draws_id_fn)
cycle_time_full_intx_nlq_draws_uidre_sum }
In [32]:
:= rank(median), by = 'par']
cycle_time_full_intx_nlq_draws_uidre_sum[, order := order / (.N + 1), by = 'par']
cycle_time_full_intx_nlq_draws_uidre_sum[, pp := qnorm(pp, mean = 0, sd = sd(median)), by = 'par']
cycle_time_full_intx_nlq_draws_uidre_sum[, qq ggplot(cycle_time_full_intx_nlq_draws_uidre_sum, aes(x = order)) +
geom_interval(aes(ymin = q10, ymax = q90), size = .125/4, alpha = .25, orientation = 'vertical') +
geom_interval(aes(ymin = q25, ymax = q75), size = .25/4, alpha = .25, orientation = 'vertical') +
geom_line(aes(y = qq), color = 'white', linewidth = 1, alpha = .5) +
facet_wrap(par~., scales = 'free_y') +
+
theme_clean theme(axis.text.x = element_blank(),
axis.title.x = element_blank(),
strip.text.x = element_text(size = 12)) +
labs(y = 'Individual deviation from population parameter')
In [33]:
<- melt(cycle_time_full_intx_nlq_draws_uidre_sum[, c('median', 'q10', 'q25', 'q75', 'q90', 'par', 'org_id_fac')],
cycle_time_full_intx_nlq_draws_uidre_sum_l measure.vars = c('median', 'q10', 'q25', 'q75', 'q90'),
variable.name = 'quantile', value.name = 'y_hat')
:= paste0(par, '_', quantile)]
cycle_time_full_intx_nlq_draws_uidre_sum_l[, var <- dcast(cycle_time_full_intx_nlq_draws_uidre_sum_l[, -'par'], org_id_fac ~ var, value.var = 'y_hat')
cycle_time_full_intx_nlq_draws_uidre_sum_w ggplot(cycle_time_full_intx_nlq_draws_uidre_sum_w, aes(x = Intercept_median, y = month_num_c_median)) +
geom_segment(aes(x = Intercept_q25, xend = Intercept_q75), size = .125, alpha = .35) +
geom_segment(aes(y = month_num_c_q25, yend = month_num_c_q75), size = .125, alpha = .35) +
geom_hex(bins = 120, alpha = 1) +
scale_fill_viridis_c(aesthetics = c('fill', 'color'), option = 'plasma', transform = scales::modulus_trans(p = -.3),
breaks = c(2, 10, 100)) +
+
theme_clean theme(axis.text.x = element_text())
In [34]:
set.seed(34232)
<- as.data.table(cycle_time_full_intx_nlq_m$data)
cycle_time_full_intx_nlq_m_data #using frank (fast rank)
<- function(x, n_quantiles = 20){
quantile_labeler <- length(x)
N 1 + (frank(x, ties.method = "min")-1) %/% (N/n_quantiles)) / n_quantiles
(
}
:= quantile_labeler(yr_avg_avg_coding_days_per_week_c)]
cycle_time_full_intx_nlq_m_data[, yr_avg_avg_coding_days_per_week_c_quantile := quantile_labeler(wi_avg_coding_days_per_week)]
cycle_time_full_intx_nlq_m_data[, wi_avg_coding_days_per_week_quantile := .N, by = 'user_id_fac']
cycle_time_full_intx_nlq_m_data[, N_obs <- unique(cycle_time_full_intx_nlq_m_data[N_obs >= 4, c('user_id_fac', 'yr_avg_avg_coding_days_per_week_c_quantile')])
cycle_time_full_intx_nlq_m_data_subsample_ids
<- 20
nsamples :=
cycle_time_full_intx_nlq_m_data_subsample_ids[, yr_avg_avg_coding_days_per_week_c_subsample sample(c(rep(1, nsamples), rep(0, .N-nsamples)), .N),
= yr_avg_avg_coding_days_per_week_c_quantile]
by
table(cycle_time_full_intx_nlq_m_data_subsample_ids$yr_avg_avg_coding_days_per_week_c_subsample,
$yr_avg_avg_coding_days_per_week_c_quantile)
cycle_time_full_intx_nlq_m_data_subsample_ids
<- cycle_time_full_intx_nlq_m_data_subsample_ids[
cycle_time_full_intx_nlq_m_data_subsample_ids == 1]
yr_avg_avg_coding_days_per_week_c_subsample
<- cycle_time_full_intx_nlq_m_data[cycle_time_full_intx_nlq_m_data_subsample_ids, on = 'user_id_fac'] cycle_time_full_intx_nlq_m_data_subsample
In [35]:
<- cycle_time_full_intx_nlq_m_data_subsample[, CJ(user_id_fac = unique(user_id_fac),
raw_data_ct_example_for_plot month_num_c = unique(month_num_c))]
<- merge(unique(cycle_time_full_intx_nlq_m_data_subsample[, c('user_id_fac', 'yr_avg_avg_coding_days_per_week_c_quantile')]),
raw_data_ct_example_for_plot
raw_data_ct_example_for_plot, on = c('user_id_fac'))
<- merge(unique(cycle_time_full_intx_nlq_m_data_subsample[, c('user_id_fac',
raw_data_ct_example_for_plot 'month_num_c',
'median_cycle_time_s',
'yr_avg_avg_coding_days_per_week_c')]),
raw_data_ct_example_for_plot, all = TRUE)
<- median(raw_data_ct_example_for_plot$median_cycle_time_s, na.rm = TRUE)
med_med_ct
ggplot(raw_data_ct_example_for_plot[yr_avg_avg_coding_days_per_week_c_quantile %in% seq(.05, 1, .25)],
aes(x = month_num_c, y = median_cycle_time_s)) +
geom_hline(yintercept = med_med_ct) +
geom_line(aes(group = user_id_fac, color = yr_avg_avg_coding_days_per_week_c), linewidth = .125, alpha = 1) +
scale_y_continuous(transform = scales::modulus_trans(p = .25)) +
scale_color_viridis_c(option = 'plasma', end = .8) +
facet_grid(~yr_avg_avg_coding_days_per_week_c_quantile) +
theme_minimal() +
theme(
strip.text.x = element_text(), # Adjust text size for readability
axis.text = element_blank(), # Remove axis text for clarity
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank()
)
In [36]:
<- as.data.table(predictions(cycle_time_full_intx_nlq_m,
subsample_predictions newdata = unique(cycle_time_full_intx_nlq_m_data_subsample),
re_formula = NULL))
<- as.data.table(cycle_time_full_intx_nlq_m_data_subsample)
cycle_time_full_intx_nlq_m_data_subsample <- cycle_time_full_intx_nlq_m_data_subsample[, CJ(user_id_fac = unique(user_id_fac),
pred_data_ct_example_for_plot month_num_c = unique(month_num_c))]
<- merge(
pred_data_ct_example_for_plot unique(cycle_time_full_intx_nlq_m_data_subsample[, c('user_id_fac', 'yr_avg_avg_coding_days_per_week_c_quantile')]),
pred_data_ct_example_for_plot, on = c('user_id_fac'))
<- merge(unique(subsample_predictions[, c('user_id_fac',
pred_data_ct_example_for_plot 'month_num_c',
'estimate',
'conf.low', 'conf.high',
'yr_avg_avg_coding_days_per_week_c',
'wi_avg_coding_days_per_week')]),
pred_data_ct_example_for_plot, all = TRUE)
<- median(pred_data_ct_example_for_plot$estimate, na.rm = TRUE)
med_med_ct
<- scales::transform_modulus(p = .325)
mod_trans <- c(60*60*12*1, 60*60*24*7, 60*60*24*7*4, 60*60*24*1*7*13, 60*60*24*1*7*26)
breaks_seconds_to_days names(breaks_seconds_to_days) <- c("12h", "1w", "4w", "1q", "6m")
:= reorder(sprintf('%.0f%% - %.0f%%',
pred_data_ct_example_for_plot[, quantile_name *100 - 5,
yr_avg_avg_coding_days_per_week_c_quantile*100),
yr_avg_avg_coding_days_per_week_c_quantile
yr_avg_avg_coding_days_per_week_c_quantile)]
ggplot(pred_data_ct_example_for_plot[yr_avg_avg_coding_days_per_week_c_quantile %in% c(.1, .2, .5, .85, .95)],
aes(x = month_num_c, y = estimate)) +
geom_hline(yintercept = med_med_ct) +
geom_ribbon(aes(group = user_id_fac, ymin = conf.low, ymax = conf.high), alpha = .025) +
geom_line(aes(group = user_id_fac), linewidth = .125, alpha = 1) +
scale_y_continuous(transform = mod_trans, breaks = breaks_seconds_to_days, labels = names(breaks_seconds_to_days)) +
facet_grid(~quantile_name) +
theme_minimal() +
theme(
strip.text.x = element_text(size = 8), # Adjust text size for readability
axis.text.x = element_blank(), # Remove axis text for clarity
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank()
+
) labs(x = 'Calendar time', y = 'Cycle time')
ggsave('cycle_time_full_intx_nlq_post_pred_ids.png', width = 4, height = 3, dpi = 300, units = 'in')
gc()
In [37]:
<- copy(pred_data_ct_example_for_plot)
pred_data_ct_example_for_plot_wi := median(estimate, na.rm = TRUE), by = user_id_fac]
pred_data_ct_example_for_plot_wi[, median_estimate c('wi_estimate', 'wi_conf.high', 'wi_conf.low') :=
pred_data_ct_example_for_plot_wi[, - median_estimate, conf.high - median_estimate, conf.low - median_estimate)]
.(estimate
:= reorder(sprintf('%.0f%% - %.0f%%',
pred_data_ct_example_for_plot_wi[, quantile_name *100 - 5,
yr_avg_avg_coding_days_per_week_c_quantile*100),
yr_avg_avg_coding_days_per_week_c_quantile
yr_avg_avg_coding_days_per_week_c_quantile)]ggplot(pred_data_ct_example_for_plot_wi[yr_avg_avg_coding_days_per_week_c_quantile %in% c(.1, .2, .5, .85, .95)],
aes(x = month_num_c, y = wi_estimate)) +
geom_hline(yintercept = 0) +
geom_ribbon(aes(group = user_id_fac, ymin = wi_conf.low, ymax = wi_conf.high), alpha = .025) +
geom_line(aes(group = user_id_fac, color = wi_avg_coding_days_per_week), linewidth = .125, alpha = 1) +
geom_point(aes(color = wi_avg_coding_days_per_week), size = .75) +
scale_color_viridis_c(option = 'plasma', transform = scales::modulus_trans(p = -.5), breaks = c(-1, 0, 1),
guide = guide_colorbar(direction = "horizontal", title.position = "top",
position = 'bottom', title.hjust = 0.5)) +
scale_y_continuous(transform = mod_trans, breaks = c(-rev(breaks_seconds_to_days), breaks_seconds_to_days),
labels = c(paste0('-', rev(names(breaks_seconds_to_days))), names(breaks_seconds_to_days))) +
facet_grid(~quantile_name) +
theme_minimal() +
theme(
strip.text.x = element_text(), # Adjust text size for readability
axis.text.x = element_blank(), # Remove axis text for clarity
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
+
) labs(x = 'Calendar time', y = 'Deviation in cycle time', color = 'Within-person deviation in average coding days per week')
ggsave('cycle_time_full_intx_nlq_post_pred_ids_win.png', width = 6, height = 3, dpi = 300, units = 'in')
In [38]:
set.seed(7331)
<- unique(cycle_time_full_intx_nlq_m_data[N_obs == 12, c('user_id_fac', 'yr_avg_avg_coding_days_per_week_c_quantile')])
cycle_time_full_intx_nlq_m_data_subsample_ids
#hist(cycle_time_full_intx_nlq_m_data_subsample_ids$yr_avg_avg_coding_days_per_week_c_quantile)
<- 1
nsamples :=
cycle_time_full_intx_nlq_m_data_subsample_ids[, yr_avg_avg_coding_days_per_week_c_subsample sample(c(rep(1, nsamples), rep(0, .N-nsamples)), .N),
= yr_avg_avg_coding_days_per_week_c_quantile]
by
<- cycle_time_full_intx_nlq_m_data_subsample_ids[
cycle_time_full_intx_nlq_m_data_subsample_ids %in% c(1, .9, .8, .7, .6, .5, .4, .3, .2, .1)
yr_avg_avg_coding_days_per_week_c_quantile
]
<- cycle_time_full_intx_nlq_m_data_subsample_ids[
cycle_time_full_intx_nlq_m_data_subsample_ids == 1]
yr_avg_avg_coding_days_per_week_c_subsample
<- cycle_time_full_intx_nlq_m_data[cycle_time_full_intx_nlq_m_data_subsample_ids, on = 'user_id_fac']
cycle_time_full_intx_nlq_m_data_subsample
ggplot(cycle_time_full_intx_nlq_m_data_subsample,
aes(x = month_num_c, y = median_cycle_time_s)) +
geom_line(aes(group = user_id_fac, color = yr_avg_avg_coding_days_per_week_c_quantile), linewidth = .5, alpha = 1) +
scale_y_continuous(transform = scales::modulus_trans(p = .25)) +
theme_minimal() +
scale_color_viridis_c(end = .8) +
theme(
strip.text.x = element_text(), # Adjust text size for readability
axis.text = element_blank(), # Remove axis text for clarity
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank()
)
In [39]:
<- posterior_summary(posterior_epred(cycle_time_full_intx_nlq_m,
cycle_time_full_intx_nlq_m_data_subsample_posterior_summary newdata = cycle_time_full_intx_nlq_m_data_subsample,
re_formula = NULL))
<- posterior_predict(cycle_time_full_intx_nlq_m,
cycle_time_full_intx_nlq_m_data_subsample_posterior_preds newdata = cycle_time_full_intx_nlq_m_data_subsample,
re_formula = NULL, ndraws = 100)
<- cbind(cycle_time_full_intx_nlq_m_data_subsample,
cycle_time_full_intx_nlq_m_data_subsample_posterior_preds_data as.data.table(t(cycle_time_full_intx_nlq_m_data_subsample_posterior_preds)))
<- melt(cycle_time_full_intx_nlq_m_data_subsample_posterior_preds_data,
cycle_time_full_intx_nlq_m_data_subsample_posterior_preds_data_l measure.vars = paste0('V', 1:100))
<- cbind(cycle_time_full_intx_nlq_m_data_subsample,
cycle_time_full_intx_nlq_m_data_subsample_posterior_summary as.data.table(cycle_time_full_intx_nlq_m_data_subsample_posterior_summary))
<- c(60*60*12*1, 60*60*24*7, 60*60*24*7*4, 60*60*24*1*7*13, 60*60*24*1*7*26)
breaks_seconds_to_days names(breaks_seconds_to_days) <- c("12h", "1w", "4w", "1q", "6m")
<- month_level_dt[
cycle_time_full_intx_nlq_m_data_subsample_posterior_summary
, c('yr_avg_avg_coding_days_per_week', 'month_num_c', 'user_id_fac', 'org_id_fac')
][
cycle_time_full_intx_nlq_m_data_subsample_posterior_summary, = c('month_num_c', 'user_id_fac', 'org_id_fac')
on
]
:= reorder(user_id_fac, yr_avg_avg_coding_days_per_week)]
cycle_time_full_intx_nlq_m_data_subsample_posterior_summary[, user_id_fac_sorted := reorder(user_id_fac, yr_avg_avg_coding_days_per_week_c)] cycle_time_full_intx_nlq_m_data_subsample_posterior_preds_data_l[, user_id_fac_sorted
In [40]:
<- ggplot(cycle_time_full_intx_nlq_m_data_subsample_posterior_summary,
real_data_and_prediction_plot aes(x = month_num_c, y = median_cycle_time_s)) +
geom_line(aes(y = median_cycle_time_s), linewidth = .125, alpha = .5) +
geom_point(aes(y = median_cycle_time_s), size = .125, alpha = .5) +
geom_line(data = cycle_time_full_intx_nlq_m_data_subsample_posterior_preds_data_l,
aes(group = interaction(user_id_fac_sorted, variable), y = value),
linewidth = .15, alpha = .0625) +
geom_ribbon(aes(ymin = Q2.5, ymax = Q97.5, group = user_id_fac, fill = yr_avg_avg_coding_days_per_week),
linewidth = .0625, alpha = .5, color = 'white') +
geom_line(aes(y = Estimate, group = user_id_fac), color = 'white', linewidth = .65, alpha = 1) +
geom_line(aes(y = Estimate, group = user_id_fac, color = yr_avg_avg_coding_days_per_week), linewidth = .5, alpha = 1) +
scale_y_continuous(transform = mod_trans,
breaks = breaks_seconds_to_days, labels = names(breaks_seconds_to_days)) +
theme_minimal() +
facet_wrap(~ user_id_fac_sorted, nrow = 2) +
scale_color_viridis_c(end = .7, aesthetics = c('fill', 'color'), breaks = c(2, 3, 4),
name = 'Year-average coding-days-per-week',
guide = guide_colorbar(direction = "horizontal", title.position = "top",
position = 'bottom', title.hjust = 0.5)) +
theme(
axis.text.x = element_blank(), # Remove axis text for clarity
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
panel.spacing = unit(0, units = 'mm'),
strip.text = element_blank(),
legend.title = element_text(size = 9),
legend.text = element_text(size = 8),
legend.key.size = unit(0.8, "cm"),
#legend.key.width = unit(0.4, "cm"),
legend.key.height = unit(0.3, "cm")
+
) labs(x = 'Calendar time', y = 'Cycle time') +
coord_cartesian(y = c(0, 60*60*24*7*25))
print(real_data_and_prediction_plot)
ggsave(real_data_and_prediction_plot, file = 'cycle_time_full_intx_nlq_post_pred_samples.png', width = 6, height = 3, dpi = 300, units = 'in')
In [41]:
<- quantile(
yr_avg_avg_coding_days_per_week_c_quantiles unique(
$data[
cycle_time_full_intx_nlq_mc('user_id_fac', 'yr_avg_avg_coding_days_per_week_c')]
, 'yr_avg_avg_coding_days_per_week_c']],
)[[seq(.1, .9, length.out = 3))
<- rbindlist(
cf_data lapply(names(yr_avg_avg_coding_days_per_week_c_quantiles),
\(x){<- copy(cycle_time_full_intx_nlq_m_data_subsample)
a_dt := rep(yr_avg_avg_coding_days_per_week_c_quantiles[x], .N)]
a_dt[, yr_avg_avg_coding_days_per_week_c := rep(x, .N)]
a_dt[, quantiles return(a_dt)
})
)
= 100
ndraws <- posterior_predict(cycle_time_full_intx_nlq_m,
cf_post_pred newdata = cf_data,
re_formula = NULL,
ndraws = ndraws)
<- posterior_summary(posterior_epred(cycle_time_full_intx_nlq_m,
cf_post_epred_sum newdata = cf_data,
re_formula = NULL))
<- cbind(cf_data,
cf_data_post_pred as.data.table(t(cf_post_pred)))
<- cbind(cf_data,
cf_data_post_epred as.data.table(cf_post_epred_sum))
:= reorder(user_id_fac, yr_avg_avg_coding_days_per_week_c_quantile)]
cf_data_post_pred[, user_id_fac_sorted := reorder(user_id_fac, yr_avg_avg_coding_days_per_week_c_quantile)]
cf_data_post_epred[, user_id_fac_sorted
<- melt(cf_data_post_pred, measure.vars = paste0('V', 1:ndraws))
cf_data_post_pred_l
unique(cycle_time_full_intx_nlq_m_data_subsample_posterior_summary[, .(diff = -yr_avg_avg_coding_days_per_week_c + yr_avg_avg_coding_days_per_week)])[[1]] + yr_avg_avg_coding_days_per_week_c_quantiles
diff = Estimate[quantiles == '50%'] - Estimate[quantiles == '90%']),
cf_data_post_epred[, .(= c('user_id_fac', 'month_num_c')][, .(med_diff = median(diff)), by = c('user_id_fac')][
by med_diff = median(med_diff) / (60*60*24))
, .( ]
In [42]:
<- ggplot(cf_data_post_pred_l,
real_data_and_cf_prediction_plot aes(x = month_num_c, y = value)) +
# geom_point(aes(group = interaction(variable, quantiles), color = quantiles), size = .25, alpha = .25) +
geom_bin2d(aes(group = interaction(quantiles, month_num_c), fill = quantiles, alpha = after_stat(count)), binwidth = c(1, 25),
position = position_dodge(width = 1, preserve = 'total')) +
#geom_line(aes(group = interaction(variable, quantiles), color = quantiles), linewidth = .25, alpha = .0625) +
geom_line(data = cf_data_post_epred, aes(y = Estimate, group = quantiles), color = 'white', linewidth = 1) +
geom_line(data = cf_data_post_epred, aes(y = Estimate, group = quantiles, color = quantiles), linewidth = .75) +
scale_y_continuous(transform = mod_trans,
breaks = breaks_seconds_to_days, labels = names(breaks_seconds_to_days)) +
scale_alpha_continuous(range = c(.4, 1), breaks = c(1, 5, 20),
guide = 'none', transform = scales::transform_modulus(p = 1)) +
scale_color_viridis_d(aesthetics = c('fill', 'color'), option = 'cividis', end = .7,
name = 'Counterfactual year-average-coding-days quantile',
guide = guide_legend(direction = "horizontal", title.position = "top",
position = 'bottom', title.hjust = 0.5)) +
# scale_color_manual(aesthetics = c('fill', 'color'),
# breaks = c("10%", "50%", "90%"),
# values = c("#29483A", "#759C44", "#DF3383"),
# name = 'Counterfactual Year Average Coding Days quantiles',
# guide = guide_legend(direction = "horizontal", title.position = "top",
# position = 'bottom', title.hjust = 0.5)) +
theme_minimal() +
facet_wrap( ~ user_id_fac_sorted, nrow = 2) +
theme(
axis.text.x = element_blank(), # Remove axis text for clarity
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
panel.spacing = unit(0, units = 'mm'),
strip.text = element_blank(),
legend.title = element_text(size = 9),
legend.text = element_text(size = 8),
legend.key.size = unit(0.8, "cm"),
legend.key.width = unit(0.3, "cm"),
legend.key.height = unit(0.3, "cm")
+
) labs(x = 'Calendar time', y = 'Cycle time') +
coord_cartesian(y = c(0, 60*60*24*7*25))
print(real_data_and_cf_prediction_plot)
ggsave(real_data_and_cf_prediction_plot, file = 'cycle_time_full_intx_nlq_post_pred_counterfactual.png', width = 6, height = 4, dpi = 300, units = 'in')
In [43]:
<- real_data_and_prediction_plot + real_data_and_cf_prediction_plot + plot_layout(axis_titles = 'collect')
combined_prediction_plot print(combined_prediction_plot)
ggsave(combined_prediction_plot, file = 'cycle_time_full_intx_nlq_combined_post_pred_counterfactual.png', width = 7, height = 4, dpi = 300, units = 'in')
In [44]:
options("marginaleffects_posterior_interval" = "hdi")
options("marginaleffects_posterior_center" = "median")
<- marginaleffects::avg_comparisons(
coding_days_comparisons_90_50
cycle_time_full_intx_nlq_m,variables = list(yr_avg_avg_coding_days_per_week_c = yr_avg_avg_coding_days_per_week_c_quantiles[3:2]),
newdata = cycle_time_full_intx_nlq_m$data,
ndraws = 250,
re_formula = NULL)
<- marginaleffects::avg_comparisons(
coding_days_comparisons_50_10
cycle_time_full_intx_nlq_m,variables = list(yr_avg_avg_coding_days_per_week_c = yr_avg_avg_coding_days_per_week_c_quantiles[2:1]),
newdata = cycle_time_full_intx_nlq_m$data,
ndraws = 250,
re_formula = NULL)
<- 1/(60*60*24)
convert_s_to_d <- as.data.table(cycle_time_full_intx_nlq_m$data)[, .(median_users = median(median_cycle_time_s)), by = 'user_id_fac'][, .(median = median(median_users))]*convert_s_to_d
raw_median_cycle_time sprintf("A difference between the 50th and 90th quantile of coding days is associated with a difference of %.2f (95%% CI = [%.2f, %.2f]) in cycle time (refer to overall median cycle time: %.2f",
$estimate*convert_s_to_d,
coding_days_comparisons_90_50$conf.low*convert_s_to_d,
coding_days_comparisons_90_50$conf.high*convert_s_to_d,
coding_days_comparisons_90_50
raw_median_cycle_time)
sprintf("A difference between the 10th and 50th quantile of coding days is associated with a difference of %.2f (95%% CI = [%.2f, %.2f]) in cycle time (refer to overall median cycle time: %.2f",
$estimate*convert_s_to_d,
coding_days_comparisons_50_10$conf.low*convert_s_to_d,
coding_days_comparisons_50_10$conf.high*convert_s_to_d,
coding_days_comparisons_50_10 raw_median_cycle_time)
[1] "A difference between the 50th and 90th quantile of coding days is associated with a difference of -2.10 (95% CI = [-2.74, -1.43]) in cycle time (refer to overall median cycle time: 13.01"
[1] "A difference between the 10th and 50th quantile of coding days is associated with a difference of -1.83 (95% CI = [-2.40, -1.14]) in cycle time (refer to overall median cycle time: 13.01"
In [45]:
<- marginaleffects::avg_comparisons(
coding_days_comparisons_90_50_by_month
cycle_time_full_intx_nlq_m,variables = list(yr_avg_avg_coding_days_per_week_c = yr_avg_avg_coding_days_per_week_c_quantiles[3:2]),
by = c('org_id_fac', 'month_num_c'),
newdata = cycle_time_full_intx_nlq_m$data,
ndraws = 250,
re_formula = NULL)
In [46]:
$org_id_fac_reorder <- reorder(coding_days_comparisons_90_50_by_month$org_id_fac, coding_days_comparisons_90_50_by_month$estimate, FUN = mean)
coding_days_comparisons_90_50_by_monthggplot(coding_days_comparisons_90_50_by_month, aes(x = month_num_c, y = org_id_fac_reorder)) +
geom_raster(aes(fill = estimate*convert_s_to_d)) +
scale_fill_viridis_c(transform = scales::transform_modulus(p = -.1), breaks = c(-.5, -1, -2, -4, -8)) +
+
theme_clean theme(
axis.text = element_blank(), # Remove axis text for clarity
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
panel.spacing = unit(0, units = 'mm'),
strip.text = element_blank(),
legend.title = element_text(size = 9),
legend.text = element_text(size = 8),
legend.key.size = unit(0.8, "cm"),
legend.key.width = unit(0.3, "cm"),
legend.key.height = unit(0.8, "cm")
+
) labs(x = 'Calendar time', y = 'Organization', fill = 'Change in\ncycle time (days)')
In [47]:
rm(list = ls(pattern = '_m$'))
gc()
Linear predictors interacting with “month” random effect (full data)
In [48]:
<- brm(formula = cycle_time_full_intx_lin_remonth_f, data = month_level_dt, prior = cycle_time_full_intx_lin_remonth_prior,
cycle_time_full_intx_lin_remonth_prior_m file = 'cycle_time_full_intx_lin_remonth_prior',
sample_prior = 'only',
chains = 4, iter = 750, warmup = 500,
cores = 4,
init = 0,
backend = 'cmdstanr', silent = 0, refresh = 500,
file_refit = 'never',
stan_model_args = list(cpp_options = list(
PRECOMPILED_HEADERS = FALSE, STAN_CPP_OPTIMS = TRUE,
"CXXFLAGS+= -O1 -march=native -mtune=native",
"CXXFLAGS+= -DEIGEN_USE_BLAS -DEIGEN_USE_LAPACKE",
"LDLIBS += -lblas -llapack -llapacke"),
force_recompile = TRUE))
summary(cycle_time_full_intx_lin_remonth_prior_m)
<- posterior_predict(cycle_time_full_intx_lin_remonth_prior_m)
cycle_time_full_intx_lin_remonth_prior_pred <- as.data.table(t(cycle_time_full_intx_lin_remonth_prior_pred))
cycle_time_full_intx_lin_remonth_prior_dt <- melt(cycle_time_full_intx_lin_remonth_prior_dt,
cycle_time_full_intx_lin_remonth_prior_l measure.vars = names(cycle_time_full_intx_lin_remonth_prior_dt),
variable.name = 'iter', value.name = 'y_hat')
ggplot(cycle_time_full_intx_lin_remonth_prior_l, aes(x = y_hat)) +
geom_histogram(aes(y = after_stat(density)), bins = 200, fill = 'blue', alpha = 1) +
geom_histogram(aes(x = median_cycle_time_s, y = after_stat(density)), data = month_level_dt,
bins = 200, fill = 'gray', alpha = .8) +
scale_x_log10() +
theme_minimal()
get_prior(cycle_time_full_intx_lin_remonth_prior_m)
# growth_ptyp_shape_prior_pp <- pp_check(growth_ptyp_shape_prior_m)
# growth_ptyp_shape_prior_pp + scale_y_log10()
In [49]:
<- function(N_org, N_user){
make_init return(
function(){
return(
list(Intercept = 14,
b = rep(0, 17), #runif(n = 15, min = -.05, max = .05),
bs = rep(.1, 6), #runif(n = 1, min = -.1, max = .1),
zs_1_1 = rep(.1, 3), #runif(n = 3, min = -.1, max = .1),
sds_1 = .1, #runif(n = 1, min = 0, max = .1),
zs_2_1 = rep(.1, 3), #runif(n = 3, min = -.1, max = .1),
sds_2 = .1, #runif(n = 1, min = 0, max = .1),
zs_3_1 = rep(.1, 3), #runif(n = 3, min = -.1, max = .1),
sds_3 = .1, #runif(n = 1, min = 0, max = .1),
zs_4_1 = rep(.1, 3), #runif(n = 3, min = -.1, max = .1),
sds_4 = .1, #runif(n = 1, min = 0, max = .1),
Intercept_shape = .29,
#org
sd_1 = rep(.1, 2), #runif(n = 1, min = 0, max = .1),
z_1 = matrix(rep(0, N_org*2), nrow = 2),
L_1 = t(chol(matrix(c(1, .2, .2, 1), nrow = 2, byrow = TRUE))),
#user
sd_2 = rep(.1, 2), #runif(n = 1, min = 0, max = .1),
z_2 = matrix(rep(0, N_user*2), nrow = 2),
L_2 = t(chol(matrix(c(1, .2, .2, 1), nrow = 2, byrow = TRUE))),
#org
sd_3 = .1,
z_3 = matrix(rep(0, N_org), nrow = 1))
)
}
)
}<- make_init(length(unique(month_level_dt$org_id_fac)),
init length(unique(month_level_dt$user_id_fac)))
<- brm(formula = cycle_time_full_intx_lin_remonth_f, data = month_level_dt,
cycle_time_full_intx_lin_remonth_m prior = cycle_time_full_intx_lin_remonth_prior,
save_model = 'cycle_time_full_intx_lin_remonth.stan',
#control = list(adapt_delta = .99, max_treedepth = 20),
file = 'cycle_time_full_intx_lin_remonth',
sample_prior = 'no',
chains = 4, iter = 2000, warmup = 1000,
cores = 1, threads = 19,
init = init,
backend = 'cmdstanr', silent = 0, refresh = 1,
file_refit = 'never',
stan_model_args = list(cpp_options = list(
PRECOMPILED_HEADERS = FALSE, STAN_CPP_OPTIMS = TRUE,
"CXXFLAGS+= -O1 -march=native -mtune=native",
"CXXFLAGS+= -DEIGEN_USE_BLAS -DEIGEN_USE_LAPACKE",
"LDLIBS += -lblas -llapack -llapacke"),
force_recompile = TRUE))
#summary(cycle_time_full_intx_lin_remonth_m)
#traceplot_manyvars(cycle_time_full_intx_lin_remonth_m, variables = variables(cycle_time_full_intx_lin_remonth_m)[1:26])
In [50]:
<- brms_to_draws(cycle_time_full_intx_lin_remonth_m,
cycle_time_full_intx_lin_remonth_m_draws variables = variables(cycle_time_full_intx_lin_remonth_m)[1:25])
<- parameters::model_parameters(cycle_time_full_intx_lin_remonth_m_draws,
cycle_time_full_intx_lin_remonth_m_pars centrality = 'median', ci = .95, ci_method = 'hdi')
<- kableExtra::kbl(cycle_time_full_intx_lin_remonth_m_pars,
cycle_time_full_intx_lin_remonth_m_param_table digits = 2, format = 'pipe', booktabs = TRUE, tabular = TRUE,
caption = "Model parameter estimates")
if(knitr::is_html_output()){
::knit_print(kableExtra::kable_styling(cycle_time_full_intx_lin_remonth_m_param_table,
knitrbootstrap_options = c('striped', 'hover')))
else {
} ::knit_print(cycle_time_full_intx_lin_remonth_m_param_table)
knitr }
In [51]:
::mcmc_pairs(cycle_time_full_intx_lin_remonth_m, pars = variables(cycle_time_full_intx_lin_remonth_m)[1:26], off_diag_fun = 'hex') bayesplot
In [52]:
<- plot_marginal_effect(
cycle_time_full_intx_lin_remonth_quarter
cycle_time_full_intx_lin_remonth_m, 'within_quarter_month_num_c',
binwidth = list(c(.1, 10)), translabels.x = list(create_label_transform(2)),
plot_labels = list(labs(x = 'Within-quarter month number',
y = 'Median cycle time (days)'))) +
+
theme_clean theme(axis.text.x = element_text())
ggsave('cycle_time_full_intx_lin_remonth_quarter.png',
cycle_time_full_intx_lin_remonth_quarter,units = 'in', width = 6, height = 4, dpi = 300)
print(cycle_time_full_intx_lin_remonth_quarter)
<- plot_marginal_effect(
cycle_time_full_intx_lin_remonth_month_num_c
cycle_time_full_intx_lin_remonth_m, 'month_num_c',
binwidth = list(c(.25, 10)), translabels.x = list(create_label_transform(7)),
plot_labels = list(labs(x = 'Month',
y = 'Median cycle time (days)'))) +
+
theme_clean theme(axis.text.x = element_text())
ggsave('cycle_time_full_intx_lin_remonth_month_num.png',
cycle_time_full_intx_lin_remonth_month_num_c,units = 'in', width = 6, height = 4, dpi = 300)
print(cycle_time_full_intx_lin_remonth_month_num_c)
<- get_center(month_level_dt, 'yr_avg_avg_coding_days_per_week')
yr_avg_avg_coding_days_per_week_center <- plot_marginal_effect(
cycle_time_full_intx_lin_remonth_coding_days
cycle_time_full_intx_lin_remonth_m, 'wi_avg_coding_days_per_week',
binwidth = list(c(.5, 10)),
plot_labels = list(labs(x = 'Coding days per week\n(within-person deviation)',
y = 'Median cycle time (days)'))) +
+
theme_clean plot_marginal_effect(cycle_time_full_intx_lin_remonth_m,
list('yr_avg_avg_coding_days_per_week_c' = c(1, 3, 5, 7) - yr_avg_avg_coding_days_per_week_center,
'month_num_c' = c(1, 3, 6, 9, 12) - 7),
binwidth = list(c(.5, 10)),
translabels.x = list(create_label_transform(yr_avg_avg_coding_days_per_week_center)),
translabels.fill = list(create_label_transform(7)),
plot_labels = list(labs(y = 'Median cycle time (days)',
x = 'Average coding days per week\n(year average)',
fill = 'Month',
color = 'Month'))) +
+
theme_clean plot_layout(design = 'AB')
ggsave('cycle_time_full_intx_lin_remonth_coding_days.png',
cycle_time_full_intx_lin_remonth_coding_days,units = 'in', width = 6, height = 4, dpi = 300)
print(cycle_time_full_intx_lin_remonth_coding_days)
<- get_center(month_level_dt, 'yr_avg_total_merged_prs')
yr_avg_total_merged_prs_center <- plot_marginal_effect(
cycle_time_full_intx_lin_remonth_merged_prs
cycle_time_full_intx_lin_remonth_m, 'wi_total_merged_prs',
binwidth = list(c(10, 10)),
plot_labels = list(labs(x = 'Total merged PRs\n(within-person deviation)',
y = 'Median cycle time (days)'))) +
+
theme_clean plot_marginal_effect(cycle_time_full_intx_lin_remonth_m,
list('yr_avg_total_merged_prs_c' = c(1:4, 100) - yr_avg_total_merged_prs_center,
'month_num_c' = c(1, 3, 6, 9, 12) - 7),
translabels.x = list(create_label_transform(yr_avg_total_merged_prs_center)),
translabels.fill = list(create_label_transform(7)),
plot_labels = list(labs(y = 'Median cycle time (days)\n(year average)',
x = 'Total merged PRs',
fill = 'Month',
color = 'Month'))) +
+
theme_clean plot_layout(design = 'AB')
ggsave('cycle_time_full_intx_lin_remonth_merged_prs.png',
cycle_time_full_intx_lin_remonth_merged_prs,units = 'in', width = 6, height = 4, dpi = 300)
print(cycle_time_full_intx_lin_remonth_merged_prs)
<- get_center(month_level_dt, 'yr_avg_defect_tickets_pct_indiv')
yr_avg_defect_tickets_pct_indiv_center <- plot_marginal_effect(
cycle_time_full_intx_lin_remonth_defect_tickets
cycle_time_full_intx_lin_remonth_m, 'wi_defect_tickets_pct_indiv',
binwidth = list(c(10, 10)),
plot_labels = list(labs(x = 'Defect ticket percent\n(within-person deviation)',
y = 'Median cycle time (days)'))) +
plot_marginal_effect(cycle_time_full_intx_lin_remonth_m,
list('yr_avg_defect_tickets_pct_indiv_c' = c(0, 25, 50, 75, 100) - yr_avg_defect_tickets_pct_indiv_center,
'month_num_c' = c(1, 3, 6, 9, 12) - 7),
translabels.x = list(create_label_transform(yr_avg_defect_tickets_pct_indiv_center)),
translabels.fill = list(create_label_transform(7)),
plot_labels = list(labs(y = 'Median cycle time (days)',
x = 'Defect ticket percent\n(year average)',
fill = 'Month',
color = 'Month'))) +
plot_layout(design = 'AB')
ggsave('cycle_time_full_intx_lin_remonth_defect_tickets.png',
cycle_time_full_intx_lin_remonth_defect_tickets,units = 'in', width = 6, height = 4, dpi = 300)
print(cycle_time_full_intx_lin_remonth_defect_tickets)
<- get_center(month_level_dt, 'yr_avg_degree_centrality_monthly_100')
yr_avg_degree_centrality_monthly_100_center <- plot_marginal_effect(
cycle_time_full_intx_lin_remonth_degree_cent
cycle_time_full_intx_lin_remonth_m, 'wi_degree_centrality_monthly_100',
binwidth = list(c(10,10)),
plot_labels = list(labs(x = 'Degree centrality\n(within-person deviation)',
y = 'Median cycle time (days)'))) +
plot_marginal_effect(cycle_time_full_intx_lin_remonth_m,
list('yr_avg_degree_centrality_monthly_100_c' = c(0, 25, 50, 75, 100) - yr_avg_degree_centrality_monthly_100_center,
'month_num_c' = c(1, 3, 6, 9, 12) - 7),
translabels.x = list(create_label_transform(yr_avg_degree_centrality_monthly_100_center)),
translabels.fill = list(create_label_transform(7)),
plot_labels = list(labs(y = 'Median cycle time (days)',
x = 'Degree centrality\n(year average)',
fill = 'Month',
color = 'Month'))) +
plot_layout(design = 'AB')
ggsave('cycle_time_full_intx_lin_remonth_degree_cent.png',
cycle_time_full_intx_lin_remonth_degree_cent,units = 'in', width = 6, height = 4, dpi = 300)
print(cycle_time_full_intx_lin_remonth_degree_cent)
<- get_center(month_level_dt, 'yr_avg_comments_per_pr')
yr_avg_comments_per_pr_center <- plot_marginal_effect(cycle_time_full_intx_lin_remonth_m,
cycle_time_full_intx_lin_remonth_pr_comments 'wi_comments_per_pr',
binwidth = list(c(40, 10)),
plot_labels = list(labs(x = 'Comments per PR\n(within-person deviation)',
y = 'Median cycle time (days)'))) +
plot_marginal_effect(cycle_time_full_intx_lin_remonth_m,
list('yr_avg_comments_per_pr_c' = c(1, 5, 7, 12, 18) - yr_avg_comments_per_pr_center,
'month_num_c' = c(1, 3, 6, 9, 12) - 7),
translabels.x = list(create_label_transform(yr_avg_comments_per_pr_center)),
translabels.fill = list(create_label_transform(7)),
plot_labels = list(labs(y = 'Median cycle time (days)',
x = 'Comments per PR\n(year average)',
fill = 'Month',
color = 'Month'))) +
plot_layout(design = 'AB')
ggsave('cycle_time_full_intx_lin_remonth_pr_comments.png',
cycle_time_full_intx_lin_remonth_pr_comments,units = 'in', width = 6, height = 4, dpi = 300)
print(cycle_time_full_intx_lin_remonth_pr_comments)
In [53]:
<- posterior_predict(cycle_time_full_intx_lin_remonth_m, re_formula = NULL, ndraws = 500)
cycle_time_full_intx_lin_remonth_pp_draws <- cbind(as.data.table(cycle_time_full_intx_lin_remonth_m$data),
cycle_time_full_intx_lin_remonth_pp_draws_w_data as.data.table(t(cycle_time_full_intx_lin_remonth_pp_draws)))
<- melt(cycle_time_full_intx_lin_remonth_pp_draws_w_data,
cycle_time_full_intx_lin_remonth_pp_draws_l measure.vars = c(paste0('V', 1:500)))
:= .N, by = c('org_id_fac', 'variable')]
cycle_time_full_intx_lin_remonth_pp_draws_l[, N
<- cycle_time_full_intx_lin_remonth_pp_draws_l[N > 10]
cycle_time_full_intx_lin_remonth_pp_draws_l_filtered <- scales::transform_modulus(p = .325)
mod_trans := mod_trans$transform(value)]
cycle_time_full_intx_lin_remonth_pp_draws_l_filtered[, transformed_value <- cycle_time_full_intx_lin_remonth_pp_draws_l_filtered[
cycle_time_full_intx_lin_remonth_pp_draws_l_density > 10,
N
{<- density(transformed_value, n = 50)
d list(x = d$x, y = d$y, median_x = suppressWarnings(approx(cumsum(d$y)/sum(d$y), d$x, xout=0.5)$y), N = unique(N))
}, = c('org_id_fac', 'variable')
by
]:= median(median_x), by = c('org_id_fac')] cycle_time_full_intx_lin_remonth_pp_draws_l_density[, median_median_x
In [54]:
<- 4
N_col
<- function(ranks, n_rows, n_cols) {
get_col <- floor(n_rows/n_cols) # base rows per column
base <- n_rows %% n_cols # how many columns need an extra row
extra
<- c(0, cumsum(c(rep(base + 1, extra), rep(base, n_cols - extra))))
cutoffs
# Map to columns based on ranks
findInterval(ranks - 1, cutoffs)
}
<- unique(cycle_time_full_intx_lin_remonth_pp_draws_l_density[, c('org_id_fac', 'N', 'median_median_x')])
unique_orders := frankv(.SD, ties.method = 'dense'), .SDcols = c('N', 'org_id_fac')]
unique_orders[, N_rank := get_col(N_rank, .N, 4) - 1]
unique_orders[, col_num setorder(unique_orders, col_num, median_median_x, org_id_fac)
:= .I]
unique_orders[, I := .N:1, by = col_num]
unique_orders[, row_num
<- cycle_time_full_intx_lin_remonth_pp_draws_l_density[
cycle_time_full_intx_lin_remonth_pp_draws_l_density_orders c('org_id_fac', 'col_num', 'row_num', 'I')],
unique_orders[, = 'org_id_fac']
on := reorder(org_id_fac, I, decreasing = FALSE)]
cycle_time_full_intx_lin_remonth_pp_draws_l_density_orders[, org_id_fac
<- max(cycle_time_full_intx_lin_remonth_pp_draws_l_density_orders$y)/4
y_separation <- 3
y_mag := (y*y_mag + y_separation*row_num)]
cycle_time_full_intx_lin_remonth_pp_draws_l_density_orders[, y_ridges := y_separation*row_num]
cycle_time_full_intx_lin_remonth_pp_draws_l_density_orders[, y_ridges_min
<- mod_trans$transform(c(60*60*12*1, 60*60*24*7, 60*60*24*7*4, 60*60*24*1*7*13, 60*60*24*1*7*52))
breaks_seconds_to_days names(breaks_seconds_to_days) <- c("12h", "1w", "4w", "1q", "1y")
<- cycle_time_full_intx_lin_remonth_pp_draws_l_density_orders[variable %in% paste0('V', 1:50)]
cycle_time_full_intx_lin_remonth_pp_draws_l_density_orders_filtered
ggplot(cycle_time_full_intx_lin_remonth_pp_draws_l_density_orders_filtered, aes(x = x, y = y_ridges, group = interaction(org_id_fac, variable))) +
geom_segment(x = breaks_seconds_to_days['4w'], xend = breaks_seconds_to_days['4w'],
y = 0, yend = max(cycle_time_full_intx_lin_remonth_pp_draws_l_density_orders_filtered$y_ridges),
color = viridis::mako(1, begin = .2),
linewidth = .25) +
geom_ribbon(aes(ymin = y_ridges_min, ymax = y_ridges, fill = N), alpha = 0.25) +
geom_line(linewidth = .05, alpha = .25, color = viridis::mako(1)) +
scale_x_continuous(breaks = breaks_seconds_to_days, labels = names(breaks_seconds_to_days)) +
coord_cartesian(x = c(0, quantile(cycle_time_full_intx_lin_remonth_pp_draws_l_density_orders_filtered$x, .975))) +
facet_grid(~col_num) +
theme_minimal() +
theme(
strip.text.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.x = element_line(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.y = element_blank(),
strip.text = element_blank(),
axis.text.x = element_text(angle = 0, hjust = .5, family = font_name, size = 6)
+
) scale_fill_viridis_c(option = 'mako', transform = scales::transform_modulus(p = .1),
breaks = c(min(cycle_time_full_intx_lin_remonth_pp_draws_l_density_orders_filtered$N),
100, 1000,
max(cycle_time_full_intx_lin_remonth_pp_draws_l_density_orders_filtered$N)),
labels = format(c(min(cycle_time_full_intx_lin_remonth_pp_draws_l_density_orders_filtered$N),
100, 1000,
max(cycle_time_full_intx_lin_remonth_pp_draws_l_density_orders_filtered$N)), big.mark = ","),
begin = .2,
guide = guide_colorbar(
barwidth = .5,
barheight = 5
+
)) labs(x = "Cycle time", y = "Posterior prediction probability density")
ggsave('cycle_time_full_intx_lin_remonth_pp_check.png', width = 6, height = 4, dpi = 300, units = 'in')
In [55]:
<- cycle_time_full_intx_lin_remonth_pp_draws_l_filtered[
cycle_time_full_intx_lin_remonth_pp_draws_l_summary_density > 10,
N
{<- density(transformed_value, n = 500)
d list(x = d$x, y = d$y)
}, = c('variable')
by
]
<- as.data.table(cycle_time_full_intx_lin_remonth_m$data)
cycle_time_full_intx_lin_remonth_pp_draws_l_summary_data := .N, by = org_id_fac]
cycle_time_full_intx_lin_remonth_pp_draws_l_summary_data[, N <- cycle_time_full_intx_lin_remonth_pp_draws_l_summary_data[N > 10]
cycle_time_full_intx_lin_remonth_pp_draws_l_summary_data := mod_trans$transform(median_cycle_time_s)]
cycle_time_full_intx_lin_remonth_pp_draws_l_summary_data[, median_cycle_time_s_modulus
<- seq(min(cycle_time_full_intx_lin_remonth_pp_draws_l_summary_data$median_cycle_time_s_modulus),
breakpoints max(cycle_time_full_intx_lin_remonth_pp_draws_l_summary_data$median_cycle_time_s_modulus), length.out = 50)
:= findInterval(median_cycle_time_s_modulus, breakpoints)]
cycle_time_full_intx_lin_remonth_pp_draws_l_summary_data[, intervals
<- diff(breakpoints)[1] # Width of each bin
bin_width <- nrow(cycle_time_full_intx_lin_remonth_pp_draws_l_summary_data)
total_n
<- cycle_time_full_intx_lin_remonth_pp_draws_l_summary_data[, .(density = .N / (bin_width*total_n)), by = intervals]
cycle_time_full_intx_lin_remonth_pp_draws_l_summary_data_hist setorder(cycle_time_full_intx_lin_remonth_pp_draws_l_summary_data_hist, intervals)
<- data.table(bin_value = breakpoints[-1], intervals = 1:length(breakpoints[-1]))
breakpoints_dt <- cycle_time_full_intx_lin_remonth_pp_draws_l_summary_data_hist[breakpoints_dt, on = 'intervals']
cycle_time_full_intx_lin_remonth_pp_draws_l_summary_data_hist := fifelse(is.na(density), 0, density)]
cycle_time_full_intx_lin_remonth_pp_draws_l_summary_data_hist[, density
ggplot(cycle_time_full_intx_lin_remonth_pp_draws_l_summary_density, aes(x = x, y = y, group = variable)) +
geom_col(data = cycle_time_full_intx_lin_remonth_pp_draws_l_summary_data_hist, aes(x = bin_value, y = density, group = NULL),
fill = viridis::mako(1, begin = .35)) +
geom_ribbon(aes(ymax = y), ymin = 0, fill = viridis::mako(1, begin = .65), alpha = 0.025) +
geom_line(linewidth = .015, alpha = .5, color = viridis::mako(1)) +
scale_x_continuous(breaks = breaks_seconds_to_days, labels = names(breaks_seconds_to_days)) +
coord_cartesian(x = c(0, quantile(cycle_time_full_intx_lin_remonth_pp_draws_l_summary_density$x, .60))) +
theme_minimal() +
theme(
strip.text.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.x = element_line(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.y = element_blank(),
strip.text = element_blank(),
axis.text.x = element_text(angle = 0, hjust = .5, family = font_name, size = 6)
+
) labs(x = "Cycle time", y = "Posterior prediction\nprobability density")
ggsave('cycle_time_full_intx_lin_remonth_pp_check_summary.png', width = 4, height = 3, dpi = 300, units = 'in')
In [56]:
<- 'cycle_time_full_intx_lin_remonth_draws_re_sum.rds'
re_draws_org_fn <- 'cycle_time_full_intx_lin_remonth_draws_uidre_sum.rds'
re_draws_id_fn
if(!file.exists(re_draws_org_fn)){
<- spread_draws(cycle_time_full_intx_lin_remonth_m, r_org_id_fac[org_id_fac,par])
cycle_time_full_intx_lin_remonth_draws_re <- posterior::summarize_draws(cycle_time_full_intx_lin_remonth_draws_re, median, ~posterior::quantile2(.x, probs = c(1-.8, (1+.8))/2), ~posterior::quantile2(.x, probs = c(1-.5, (1+.5))/2))
cycle_time_full_intx_lin_remonth_draws_re_sum saveRDS(cycle_time_full_intx_lin_remonth_draws_re_sum, re_draws_org_fn)
else {
} <- readRDS(re_draws_org_fn)
cycle_time_full_intx_lin_remonth_draws_re_sum
}
if(!file.exists(re_draws_id_fn)){
<- spread_draws(cycle_time_full_intx_lin_remonth_m, `r_.*user_id_fac`[org_id_fac,par], regex = TRUE)
cycle_time_full_intx_lin_remonth_draws_uidre <- posterior::summarize_draws(cycle_time_full_intx_lin_remonth_draws_uidre, median, ~posterior::quantile2(.x, probs = c(1-.8, (1+.8))/2), ~posterior::quantile2(.x, probs = c(1-.5, (1+.5))/2))
cycle_time_full_intx_lin_remonth_draws_uidre_sum <- as.data.table(cycle_time_full_intx_lin_remonth_draws_uidre_sum)
cycle_time_full_intx_lin_remonth_draws_uidre_sum saveRDS(cycle_time_full_intx_lin_remonth_draws_uidre_sum, re_draws_id_fn)
else {
} <- readRDS(re_draws_id_fn)
cycle_time_full_intx_lin_remonth_draws_uidre_sum }
In [57]:
:= rank(median), by = 'par']
cycle_time_full_intx_lin_remonth_draws_uidre_sum[, order := order / (.N + 1), by = 'par']
cycle_time_full_intx_lin_remonth_draws_uidre_sum[, pp := qnorm(pp, mean = 0, sd = sd(median)), by = 'par']
cycle_time_full_intx_lin_remonth_draws_uidre_sum[, qq ggplot(cycle_time_full_intx_lin_remonth_draws_uidre_sum, aes(x = order)) +
geom_interval(aes(ymin = q10, ymax = q90), size = .125/4, alpha = .25, orientation = 'vertical') +
geom_interval(aes(ymin = q25, ymax = q75), size = .25/4, alpha = .25, orientation = 'vertical') +
geom_line(aes(y = qq), color = 'white', linewidth = 1, alpha = .5) +
facet_wrap(par~., scales = 'free_y') +
+
theme_clean theme(axis.text.x = element_blank(),
axis.title.x = element_blank(),
strip.text.x = element_text(size = 12)) +
labs(y = 'Individual deviation from population parameter')
In [58]:
<- melt(cycle_time_full_intx_lin_remonth_draws_uidre_sum[, c('median', 'q10', 'q25', 'q75', 'q90', 'par', 'org_id_fac')],
cycle_time_full_intx_lin_remonth_draws_uidre_sum_l measure.vars = c('median', 'q10', 'q25', 'q75', 'q90'),
variable.name = 'quantile', value.name = 'y_hat')
:= paste0(par, '_', quantile)]
cycle_time_full_intx_lin_remonth_draws_uidre_sum_l[, var <- dcast(cycle_time_full_intx_lin_remonth_draws_uidre_sum_l[, -'par'], org_id_fac ~ var, value.var = 'y_hat')
cycle_time_full_intx_lin_remonth_draws_uidre_sum_w ggplot(cycle_time_full_intx_lin_remonth_draws_uidre_sum_w, aes(x = Intercept_median, y = month_num_c_median)) +
geom_segment(aes(x = Intercept_q25, xend = Intercept_q75), size = .125, alpha = .35) +
geom_segment(aes(y = month_num_c_q25, yend = month_num_c_q75), size = .125, alpha = .35) +
geom_hex(bins = 120, alpha = 1) +
scale_fill_viridis_c(aesthetics = c('fill', 'color'), option = 'plasma', transform = scales::modulus_trans(p = -.3),
breaks = c(2, 10, 100)) +
+
theme_clean theme(axis.text.x = element_text())
In [59]:
set.seed(34232)
<- as.data.table(cycle_time_full_intx_lin_remonth_m$data)
cycle_time_full_intx_lin_remonth_m_data #using frank (fast rank)
<- function(x, n_quantiles = 20){
quantile_labeler <- length(x)
N 1 + (frank(x, ties.method = "min")-1) %/% (N/n_quantiles)) / n_quantiles
(
}
:= quantile_labeler(yr_avg_avg_coding_days_per_week_c)]
cycle_time_full_intx_lin_remonth_m_data[, yr_avg_avg_coding_days_per_week_c_quantile := quantile_labeler(wi_avg_coding_days_per_week)]
cycle_time_full_intx_lin_remonth_m_data[, wi_avg_coding_days_per_week_quantile := .N, by = 'user_id_fac']
cycle_time_full_intx_lin_remonth_m_data[, N_obs <- unique(cycle_time_full_intx_lin_remonth_m_data[N_obs >= 4, c('user_id_fac', 'yr_avg_avg_coding_days_per_week_c_quantile')])
cycle_time_full_intx_lin_remonth_m_data_subsample_ids
<- 20
nsamples :=
cycle_time_full_intx_lin_remonth_m_data_subsample_ids[, yr_avg_avg_coding_days_per_week_c_subsample sample(c(rep(1, nsamples), rep(0, .N-nsamples)), .N),
= yr_avg_avg_coding_days_per_week_c_quantile]
by
table(cycle_time_full_intx_lin_remonth_m_data_subsample_ids$yr_avg_avg_coding_days_per_week_c_subsample,
$yr_avg_avg_coding_days_per_week_c_quantile)
cycle_time_full_intx_lin_remonth_m_data_subsample_ids
<- cycle_time_full_intx_lin_remonth_m_data_subsample_ids[
cycle_time_full_intx_lin_remonth_m_data_subsample_ids == 1]
yr_avg_avg_coding_days_per_week_c_subsample
<- cycle_time_full_intx_lin_remonth_m_data[cycle_time_full_intx_lin_remonth_m_data_subsample_ids, on = 'user_id_fac'] cycle_time_full_intx_lin_remonth_m_data_subsample
In [60]:
<- cycle_time_full_intx_lin_remonth_m_data_subsample[, CJ(user_id_fac = unique(user_id_fac),
raw_data_ct_example_for_plot month_num_c = unique(month_num_c))]
<- merge(unique(cycle_time_full_intx_lin_remonth_m_data_subsample[, c('user_id_fac', 'yr_avg_avg_coding_days_per_week_c_quantile')]),
raw_data_ct_example_for_plot
raw_data_ct_example_for_plot, on = c('user_id_fac'))
<- merge(unique(cycle_time_full_intx_lin_remonth_m_data_subsample[, c('user_id_fac',
raw_data_ct_example_for_plot 'month_num_c',
'median_cycle_time_s',
'yr_avg_avg_coding_days_per_week_c')]),
raw_data_ct_example_for_plot, all = TRUE)
<- median(raw_data_ct_example_for_plot$median_cycle_time_s, na.rm = TRUE)
med_med_ct
ggplot(raw_data_ct_example_for_plot[yr_avg_avg_coding_days_per_week_c_quantile %in% seq(.05, 1, .25)],
aes(x = month_num_c, y = median_cycle_time_s)) +
geom_hline(yintercept = med_med_ct) +
geom_line(aes(group = user_id_fac, color = yr_avg_avg_coding_days_per_week_c), linewidth = .125, alpha = 1) +
scale_y_continuous(transform = scales::modulus_trans(p = .25)) +
scale_color_viridis_c(option = 'plasma', end = .8) +
facet_grid(~yr_avg_avg_coding_days_per_week_c_quantile) +
theme_minimal() +
theme(
strip.text.x = element_text(), # Adjust text size for readability
axis.text = element_blank(), # Remove axis text for clarity
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank()
)
In [61]:
<- as.data.table(predictions(cycle_time_full_intx_lin_remonth_m,
subsample_predictions newdata = unique(cycle_time_full_intx_lin_remonth_m_data_subsample),
re_formula = NULL))
<- as.data.table(cycle_time_full_intx_lin_remonth_m_data_subsample)
cycle_time_full_intx_lin_remonth_m_data_subsample <- cycle_time_full_intx_lin_remonth_m_data_subsample[, CJ(user_id_fac = unique(user_id_fac),
pred_data_ct_example_for_plot month_num_c = unique(month_num_c))]
<- merge(
pred_data_ct_example_for_plot unique(cycle_time_full_intx_lin_remonth_m_data_subsample[, c('user_id_fac', 'yr_avg_avg_coding_days_per_week_c_quantile')]),
pred_data_ct_example_for_plot, on = c('user_id_fac'))
<- merge(unique(subsample_predictions[, c('user_id_fac',
pred_data_ct_example_for_plot 'month_num_c',
'estimate',
'conf.low', 'conf.high',
'yr_avg_avg_coding_days_per_week_c',
'wi_avg_coding_days_per_week')]),
pred_data_ct_example_for_plot, all = TRUE)
<- median(pred_data_ct_example_for_plot$estimate, na.rm = TRUE)
med_med_ct
<- scales::transform_modulus(p = .325)
mod_trans <- c(60*60*12*1, 60*60*24*7, 60*60*24*7*4, 60*60*24*1*7*13, 60*60*24*1*7*26)
breaks_seconds_to_days names(breaks_seconds_to_days) <- c("12h", "1w", "4w", "1q", "6m")
:= reorder(sprintf('%.0f%% - %.0f%%',
pred_data_ct_example_for_plot[, quantile_name *100 - 5,
yr_avg_avg_coding_days_per_week_c_quantile*100),
yr_avg_avg_coding_days_per_week_c_quantile
yr_avg_avg_coding_days_per_week_c_quantile)]
ggplot(pred_data_ct_example_for_plot[yr_avg_avg_coding_days_per_week_c_quantile %in% c(.1, .2, .5, .85, .95)],
aes(x = month_num_c, y = estimate)) +
geom_hline(yintercept = med_med_ct) +
geom_ribbon(aes(group = user_id_fac, ymin = conf.low, ymax = conf.high), alpha = .025) +
geom_line(aes(group = user_id_fac), linewidth = .125, alpha = 1) +
scale_y_continuous(transform = mod_trans, breaks = breaks_seconds_to_days, labels = names(breaks_seconds_to_days)) +
facet_grid(~quantile_name) +
theme_minimal() +
theme(
strip.text.x = element_text(size = 8), # Adjust text size for readability
axis.text.x = element_blank(), # Remove axis text for clarity
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank()
+
) labs(x = 'Calendar time', y = 'Cycle time')
ggsave('cycle_time_full_intx_lin_remonth_post_pred_ids.png', width = 4, height = 3, dpi = 300, units = 'in')
gc()
In [62]:
<- copy(pred_data_ct_example_for_plot)
pred_data_ct_example_for_plot_wi := median(estimate, na.rm = TRUE), by = user_id_fac]
pred_data_ct_example_for_plot_wi[, median_estimate c('wi_estimate', 'wi_conf.high', 'wi_conf.low') :=
pred_data_ct_example_for_plot_wi[, - median_estimate, conf.high - median_estimate, conf.low - median_estimate)]
.(estimate
:= reorder(sprintf('%.0f%% - %.0f%%',
pred_data_ct_example_for_plot_wi[, quantile_name *100 - 5,
yr_avg_avg_coding_days_per_week_c_quantile*100),
yr_avg_avg_coding_days_per_week_c_quantile
yr_avg_avg_coding_days_per_week_c_quantile)]ggplot(pred_data_ct_example_for_plot_wi[yr_avg_avg_coding_days_per_week_c_quantile %in% c(.1, .2, .5, .85, .95)],
aes(x = month_num_c, y = wi_estimate)) +
geom_hline(yintercept = 0) +
geom_ribbon(aes(group = user_id_fac, ymin = wi_conf.low, ymax = wi_conf.high), alpha = .025) +
geom_line(aes(group = user_id_fac, color = wi_avg_coding_days_per_week), linewidth = .125, alpha = 1) +
geom_point(aes(color = wi_avg_coding_days_per_week), size = .75) +
scale_color_viridis_c(option = 'plasma', transform = scales::modulus_trans(p = -.5), breaks = c(-1, 0, 1),
guide = guide_colorbar(direction = "horizontal", title.position = "top",
position = 'bottom', title.hjust = 0.5)) +
scale_y_continuous(transform = mod_trans, breaks = c(-rev(breaks_seconds_to_days), breaks_seconds_to_days),
labels = c(paste0('-', rev(names(breaks_seconds_to_days))), names(breaks_seconds_to_days))) +
facet_grid(~quantile_name) +
theme_minimal() +
theme(
strip.text.x = element_text(), # Adjust text size for readability
axis.text.x = element_blank(), # Remove axis text for clarity
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
+
) labs(x = 'Calendar time', y = 'Deviation in cycle time', color = 'Within-person deviation in average coding days per week')
ggsave('cycle_time_full_intx_lin_remonth_post_pred_ids_win.png', width = 6, height = 3, dpi = 300, units = 'in')
In [63]:
set.seed(7331)
<- unique(cycle_time_full_intx_lin_remonth_m_data[N_obs == 12, c('user_id_fac', 'yr_avg_avg_coding_days_per_week_c_quantile')])
cycle_time_full_intx_lin_remonth_m_data_subsample_ids
#hist(cycle_time_full_intx_lin_remonth_m_data_subsample_ids$yr_avg_avg_coding_days_per_week_c_quantile)
<- 1
nsamples :=
cycle_time_full_intx_lin_remonth_m_data_subsample_ids[, yr_avg_avg_coding_days_per_week_c_subsample sample(c(rep(1, nsamples), rep(0, .N-nsamples)), .N),
= yr_avg_avg_coding_days_per_week_c_quantile]
by
<- cycle_time_full_intx_lin_remonth_m_data_subsample_ids[
cycle_time_full_intx_lin_remonth_m_data_subsample_ids %in% c(1, .9, .8, .7, .6, .5, .4, .3, .2, .1)
yr_avg_avg_coding_days_per_week_c_quantile
]
<- cycle_time_full_intx_lin_remonth_m_data_subsample_ids[
cycle_time_full_intx_lin_remonth_m_data_subsample_ids == 1]
yr_avg_avg_coding_days_per_week_c_subsample
<- cycle_time_full_intx_lin_remonth_m_data[cycle_time_full_intx_lin_remonth_m_data_subsample_ids, on = 'user_id_fac']
cycle_time_full_intx_lin_remonth_m_data_subsample
ggplot(cycle_time_full_intx_lin_remonth_m_data_subsample,
aes(x = month_num_c, y = median_cycle_time_s)) +
geom_line(aes(group = user_id_fac, color = yr_avg_avg_coding_days_per_week_c_quantile), linewidth = .5, alpha = 1) +
scale_y_continuous(transform = scales::modulus_trans(p = .25)) +
theme_minimal() +
scale_color_viridis_c(end = .8) +
theme(
strip.text.x = element_text(), # Adjust text size for readability
axis.text = element_blank(), # Remove axis text for clarity
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank()
)
In [64]:
<- posterior_summary(posterior_epred(cycle_time_full_intx_lin_remonth_m,
cycle_time_full_intx_lin_remonth_m_data_subsample_posterior_summary newdata = cycle_time_full_intx_lin_remonth_m_data_subsample,
re_formula = NULL))
<- posterior_predict(cycle_time_full_intx_lin_remonth_m,
cycle_time_full_intx_lin_remonth_m_data_subsample_posterior_preds newdata = cycle_time_full_intx_lin_remonth_m_data_subsample,
re_formula = NULL, ndraws = 100)
<- cbind(cycle_time_full_intx_lin_remonth_m_data_subsample,
cycle_time_full_intx_lin_remonth_m_data_subsample_posterior_preds_data as.data.table(t(cycle_time_full_intx_lin_remonth_m_data_subsample_posterior_preds)))
<- melt(cycle_time_full_intx_lin_remonth_m_data_subsample_posterior_preds_data,
cycle_time_full_intx_lin_remonth_m_data_subsample_posterior_preds_data_l measure.vars = paste0('V', 1:100))
<- cbind(cycle_time_full_intx_lin_remonth_m_data_subsample,
cycle_time_full_intx_lin_remonth_m_data_subsample_posterior_summary as.data.table(cycle_time_full_intx_lin_remonth_m_data_subsample_posterior_summary))
<- c(60*60*12*1, 60*60*24*7, 60*60*24*7*4, 60*60*24*1*7*13, 60*60*24*1*7*26)
breaks_seconds_to_days names(breaks_seconds_to_days) <- c("12h", "1w", "4w", "1q", "6m")
<- month_level_dt[
cycle_time_full_intx_lin_remonth_m_data_subsample_posterior_summary
, c('yr_avg_avg_coding_days_per_week', 'month_num_c', 'user_id_fac', 'org_id_fac')
][
cycle_time_full_intx_lin_remonth_m_data_subsample_posterior_summary, = c('month_num_c', 'user_id_fac', 'org_id_fac')
on
]
:= reorder(user_id_fac, yr_avg_avg_coding_days_per_week)]
cycle_time_full_intx_lin_remonth_m_data_subsample_posterior_summary[, user_id_fac_sorted := reorder(user_id_fac, yr_avg_avg_coding_days_per_week_c)] cycle_time_full_intx_lin_remonth_m_data_subsample_posterior_preds_data_l[, user_id_fac_sorted
In [65]:
<- ggplot(cycle_time_full_intx_lin_remonth_m_data_subsample_posterior_summary,
real_data_and_prediction_plot aes(x = month_num_c, y = median_cycle_time_s)) +
geom_line(aes(y = median_cycle_time_s), linewidth = .125, alpha = .5) +
geom_point(aes(y = median_cycle_time_s), size = .125, alpha = .5) +
geom_line(data = cycle_time_full_intx_lin_remonth_m_data_subsample_posterior_preds_data_l,
aes(group = interaction(user_id_fac_sorted, variable), y = value),
linewidth = .15, alpha = .0625) +
geom_ribbon(aes(ymin = Q2.5, ymax = Q97.5, group = user_id_fac, fill = yr_avg_avg_coding_days_per_week),
linewidth = .0625, alpha = .5, color = 'white') +
geom_line(aes(y = Estimate, group = user_id_fac), color = 'white', linewidth = .65, alpha = 1) +
geom_line(aes(y = Estimate, group = user_id_fac, color = yr_avg_avg_coding_days_per_week), linewidth = .5, alpha = 1) +
scale_y_continuous(transform = mod_trans,
breaks = breaks_seconds_to_days, labels = names(breaks_seconds_to_days)) +
theme_minimal() +
facet_wrap(~ user_id_fac_sorted, nrow = 2) +
scale_color_viridis_c(end = .7, aesthetics = c('fill', 'color'), breaks = c(2, 3, 4),
name = 'Year-average coding-days-per-week',
guide = guide_colorbar(direction = "horizontal", title.position = "top",
position = 'bottom', title.hjust = 0.5)) +
theme(
axis.text.x = element_blank(), # Remove axis text for clarity
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
panel.spacing = unit(0, units = 'mm'),
strip.text = element_blank(),
legend.title = element_text(size = 9),
legend.text = element_text(size = 8),
legend.key.size = unit(0.8, "cm"),
#legend.key.width = unit(0.4, "cm"),
legend.key.height = unit(0.3, "cm")
+
) labs(x = 'Calendar time', y = 'Cycle time') +
coord_cartesian(y = c(0, 60*60*24*7*25))
print(real_data_and_prediction_plot)
ggsave(real_data_and_prediction_plot, file = 'cycle_time_full_intx_lin_remonth_post_pred_samples.png', width = 6, height = 3, dpi = 300, units = 'in')
In [66]:
<- quantile(
yr_avg_avg_coding_days_per_week_c_quantiles unique(
$data[
cycle_time_full_intx_lin_remonth_mc('user_id_fac', 'yr_avg_avg_coding_days_per_week_c')]
, 'yr_avg_avg_coding_days_per_week_c']],
)[[seq(.1, .9, length.out = 3))
<- rbindlist(
cf_data lapply(names(yr_avg_avg_coding_days_per_week_c_quantiles),
\(x){<- copy(cycle_time_full_intx_lin_remonth_m_data_subsample)
a_dt := rep(yr_avg_avg_coding_days_per_week_c_quantiles[x], .N)]
a_dt[, yr_avg_avg_coding_days_per_week_c := rep(x, .N)]
a_dt[, quantiles return(a_dt)
})
)
= 100
ndraws <- posterior_predict(cycle_time_full_intx_lin_remonth_m,
cf_post_pred newdata = cf_data,
re_formula = NULL,
ndraws = ndraws)
<- posterior_summary(posterior_epred(cycle_time_full_intx_lin_remonth_m,
cf_post_epred_sum newdata = cf_data,
re_formula = NULL))
<- cbind(cf_data,
cf_data_post_pred as.data.table(t(cf_post_pred)))
<- cbind(cf_data,
cf_data_post_epred as.data.table(cf_post_epred_sum))
:= reorder(user_id_fac, yr_avg_avg_coding_days_per_week_c_quantile)]
cf_data_post_pred[, user_id_fac_sorted := reorder(user_id_fac, yr_avg_avg_coding_days_per_week_c_quantile)]
cf_data_post_epred[, user_id_fac_sorted
<- melt(cf_data_post_pred, measure.vars = paste0('V', 1:ndraws))
cf_data_post_pred_l
unique(cycle_time_full_intx_lin_remonth_m_data_subsample_posterior_summary[, .(diff = -yr_avg_avg_coding_days_per_week_c + yr_avg_avg_coding_days_per_week)])[[1]] + yr_avg_avg_coding_days_per_week_c_quantiles
diff = Estimate[quantiles == '50%'] - Estimate[quantiles == '90%']),
cf_data_post_epred[, .(= c('user_id_fac', 'month_num_c')][, .(med_diff = median(diff)), by = c('user_id_fac')][
by med_diff = median(med_diff) / (60*60*24))
, .( ]
In [67]:
<- ggplot(cf_data_post_pred_l,
real_data_and_cf_prediction_plot aes(x = month_num_c, y = value)) +
# geom_point(aes(group = interaction(variable, quantiles), color = quantiles), size = .25, alpha = .25) +
geom_bin2d(aes(group = interaction(quantiles, month_num_c), fill = quantiles, alpha = after_stat(count)), binwidth = c(1, 25),
position = position_dodge(width = 1, preserve = 'total')) +
#geom_line(aes(group = interaction(variable, quantiles), color = quantiles), linewidth = .25, alpha = .0625) +
geom_line(data = cf_data_post_epred, aes(y = Estimate, group = quantiles), color = 'white', linewidth = 1) +
geom_line(data = cf_data_post_epred, aes(y = Estimate, group = quantiles, color = quantiles), linewidth = .75) +
scale_y_continuous(transform = mod_trans,
breaks = breaks_seconds_to_days, labels = names(breaks_seconds_to_days)) +
scale_alpha_continuous(range = c(.4, 1), breaks = c(1, 5, 20),
guide = 'none', transform = scales::transform_modulus(p = 1)) +
scale_color_viridis_d(aesthetics = c('fill', 'color'), option = 'cividis', end = .7,
name = 'Counterfactual year-average-coding-days quantile',
guide = guide_legend(direction = "horizontal", title.position = "top",
position = 'bottom', title.hjust = 0.5)) +
# scale_color_manual(aesthetics = c('fill', 'color'),
# breaks = c("10%", "50%", "90%"),
# values = c("#29483A", "#759C44", "#DF3383"),
# name = 'Counterfactual Year Average Coding Days quantiles',
# guide = guide_legend(direction = "horizontal", title.position = "top",
# position = 'bottom', title.hjust = 0.5)) +
theme_minimal() +
facet_wrap( ~ user_id_fac_sorted, nrow = 2) +
theme(
axis.text.x = element_blank(), # Remove axis text for clarity
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
panel.spacing = unit(0, units = 'mm'),
strip.text = element_blank(),
legend.title = element_text(size = 9),
legend.text = element_text(size = 8),
legend.key.size = unit(0.8, "cm"),
legend.key.width = unit(0.3, "cm"),
legend.key.height = unit(0.3, "cm")
+
) labs(x = 'Calendar time', y = 'Cycle time') +
coord_cartesian(y = c(0, 60*60*24*7*25))
print(real_data_and_cf_prediction_plot)
ggsave(real_data_and_cf_prediction_plot, file = 'cycle_time_full_intx_lin_remonth_post_pred_counterfactual.png', width = 6, height = 4, dpi = 300, units = 'in')
In [68]:
<- real_data_and_prediction_plot + real_data_and_cf_prediction_plot + plot_layout(axis_titles = 'collect')
combined_prediction_plot print(combined_prediction_plot)
ggsave(combined_prediction_plot, file = 'cycle_time_full_intx_lin_remonth_combined_post_pred_counterfactual.png', width = 7, height = 4, dpi = 300, units = 'in')
In [69]:
options("marginaleffects_posterior_interval" = "hdi")
options("marginaleffects_posterior_center" = "median")
<- marginaleffects::avg_comparisons(
coding_days_comparisons_90_50
cycle_time_full_intx_lin_remonth_m,variables = list(yr_avg_avg_coding_days_per_week_c = yr_avg_avg_coding_days_per_week_c_quantiles[3:2]),
newdata = cycle_time_full_intx_lin_remonth_m$data,
ndraws = 250,
re_formula = NULL)
<- marginaleffects::avg_comparisons(
coding_days_comparisons_50_10
cycle_time_full_intx_lin_remonth_m,variables = list(yr_avg_avg_coding_days_per_week_c = yr_avg_avg_coding_days_per_week_c_quantiles[2:1]),
newdata = cycle_time_full_intx_lin_remonth_m$data,
ndraws = 250,
re_formula = NULL)
<- 1/(60*60*24)
convert_s_to_d <- as.data.table(cycle_time_full_intx_lin_remonth_m$data)[, .(median_users = median(median_cycle_time_s)), by = 'user_id_fac'][, .(median = median(median_users))]*convert_s_to_d
raw_median_cycle_time sprintf("A difference between the 50th and 90th quantile of coding days is associated with a difference of %.2f (95%% CI = [%.2f, %.2f]) in cycle time (refer to overall median cycle time: %.2f)",
$estimate*convert_s_to_d,
coding_days_comparisons_90_50$conf.low*convert_s_to_d,
coding_days_comparisons_90_50$conf.high*convert_s_to_d,
coding_days_comparisons_90_50
raw_median_cycle_time)
sprintf("A difference between the 10th and 50th quantile of coding days is associated with a difference of %.2f (95%% CI = [%.2f, %.2f]) in cycle time (refer to overall median cycle time: %.2f)",
$estimate*convert_s_to_d,
coding_days_comparisons_50_10$conf.low*convert_s_to_d,
coding_days_comparisons_50_10$conf.high*convert_s_to_d,
coding_days_comparisons_50_10 raw_median_cycle_time)
[1] "A difference between the 50th and 90th quantile of coding days is associated with a difference of -2.05 (95% CI = [-2.59, -1.43]) in cycle time (refer to overall median cycle time: 13.01)"
[1] "A difference between the 10th and 50th quantile of coding days is associated with a difference of -1.78 (95% CI = [-2.35, -1.20]) in cycle time (refer to overall median cycle time: 13.01)"
In [70]:
<- marginaleffects::avg_comparisons(
coding_days_comparisons_90_50_by_month
cycle_time_full_intx_lin_remonth_m,variables = list(yr_avg_avg_coding_days_per_week_c = yr_avg_avg_coding_days_per_week_c_quantiles[3:2]),
by = c('org_id_fac', 'month_num_c'),
newdata = cycle_time_full_intx_lin_remonth_m$data,
ndraws = 250,
re_formula = NULL)
In [71]:
$org_id_fac_reorder <- reorder(coding_days_comparisons_90_50_by_month$org_id_fac, coding_days_comparisons_90_50_by_month$estimate, FUN = mean)
coding_days_comparisons_90_50_by_monthggplot(coding_days_comparisons_90_50_by_month, aes(x = month_num_c, y = org_id_fac_reorder)) +
geom_raster(aes(fill = estimate*convert_s_to_d)) +
scale_fill_viridis_c(transform = scales::transform_modulus(p = -.1), breaks = c(-.5, -1, -2, -4, -8)) +
+
theme_clean theme(
axis.text = element_blank(), # Remove axis text for clarity
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(), # Remove grid lines for simplicity
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
panel.spacing = unit(0, units = 'mm'),
strip.text = element_blank(),
legend.title = element_text(size = 9),
legend.text = element_text(size = 8),
legend.key.size = unit(0.8, "cm"),
legend.key.width = unit(0.3, "cm"),
legend.key.height = unit(0.8, "cm")
+
) labs(x = 'Calendar time', y = 'Organization', fill = 'Change in\ncycle time (days)')