# Clear work space and load necessary packages
rm(list = ls())
library(tidyverse) # General data handling and manipulation
library(mirt) # Estimate item response model
# Load item parameters from file
<- rio::import("choi_grm.xlsx")
item_parameter <- item_parameter$a
a_i <- item_parameter[, c("b1", "b2", "b3")] %>% as.matrix()
b_i <- item_parameter$Item item_names
Objective
The example demonstrates how to identify the sample size required to estimate the conditional reliability of a test using the graded response model (GRM; Samejima, 1969) with a given precision. It is assumed that respondents are randomly administered two out of three depression instruments, that is, the 21-item Beck’s Depression Inventory-II (BDI-II), the 20-item Center for Epidemiological Studies Depression Scale (CES-D), and 9-item the Patient Health Questionnaire (PHQ). These instruments are intended to screen patients for clinically relevant levels of depression. Therefore, the focus is on the measurement precision, that is conditional reliability, at two standard deviations above the mean.
I. Determining the data generation for the complete dataset
- Number and distribution of factors (unidimensional vs. multidimensional)
- Number of items and item parameters (discriminations, difficulties)
- Item type (dichotomous, polytomous)
A total of 50 items are included in the three measurement instruments. All items are answered on a four-point, ordered response scale, for example, CESD-1 “I was bothered by things that usually don’t bother me” from 0 (Rarely or none of the time) to 3 (Most or all of the time). The values for item discrimination (\(a\)) and item difficulty (\(b\)) are taken from Table 3 in Choi et al. (2014).
The function generate_grm_data
uses mirt::simdata
to simulate polytomous data given the item discriminations (\(a\)), the item difficulties (\(b\)), and the sample size (\(n\)). For polytomous items (i.e., multiple ordered categories) in the GRM, the probability of obtaining a category score is given by:
\[ P(X_{pi} \geq k | a_i, b_{ik}, \theta_p) = \frac{1}{1 + \exp(-a_i(\theta_p - b_{ik}))} \]
where \(b_{ik}\) is the threshold parameter for modeling the probability of scoring at or above category \(k\) on item \(i\). As a reminder, the mirt
package uses a matrix of intercepts (\(d\)) as input, so that the item difficulties have to be transformed (\(d = -a*b\)).
# Simulate graded item responses for n respondents to all items
# - 'a' denotes the item discriminations
# - 'b' denotes and the category thresholds
# - 'n' denotes the number of observations
<- function(a, b, n) {
generate_grm_data <-
resp simdata(a = a,
d = -a*b,
N = n,
itemtype = "graded") %>%
as.data.frame()
colnames(resp) <- item_parameter$Item
return(resp)
}
II. Defining the test design and the process of missing values
- Pattern of missingness (e.g., MCAR, linking design)
- Amount of missing data
In the present case, we assume a simple test design in which each test taker completes exactly two measurement instruments. Therefore, the function data_lomo_design
leaves one measure out (i.e., select the other two).
# Select two out of three measures
# - 'resp' denotes the complete data set
<- function(resp) {
data_lomo_design <- floor(nrow(resp) / 3)
n_per_test
<- resp %>%
resp mutate(across(!starts_with("CESD"), ~replace(., 1:n_per_test, NA))) %>%
mutate(across(!starts_with("PHQ"), ~replace(., (n_per_test+1):(2 * n_per_test), NA))) %>%
mutate(across(!starts_with("BDI"), ~replace(., (2 * n_per_test+1):n(), NA)))
return(resp)
}
III. Selecting the IRT model and the parameter of interest
- Underlying IRT model (e.g., 1PL, 2PL)
- IRT modeling software and estimation method
- Parameters to extract
# Estimate item response model
# - 'resp' denotes the data set
<- function(resp) {
estimate_irt
# Estimate a graded response model (GRM) using try-catch to handle errors
<- tryCatch(
mod mirt(data = resp, 1,
itemtype = "graded", # unidimensional GRM model
verbose = FALSE),
error = function(e) NULL
)
# Calculate the conditional reliabilities if the model is successfully estimated
<- if (!is.null(mod)) {
est <- testinfo(mod, Theta = seq(-4, 4, 0.1)) # get the test information
tinfo = tinfo / (tinfo + 1) # conditional reliability
rel else {
} = NA
rel
}<- data.frame(theta = seq(-4, 4, 0.1), rel = rel)
est return(est)
}
IV. Setting up the Monte Carlo Simulation
- Number of iterations
- Sample sizes to evaluate
The Monte Carlo simulation runs n_iterations
times, including the previous steps of (i) determining the data generation for the complete dataset, (ii) defining the test design and the process of missing values, (iii) selecting the IRT model and the parameter of interest. The simulation is run for different total sample sizes between 300 and 1050 (in increments of 150). Due to the chosen test design the sample size for each individual measure is 2/3 of the total sample size.
Using an estimated variance for the reliability (\(\sigma^2 = 0.01\)), a specified level of accuracy (\(\delta = .01\)), and a significance level (\(\alpha = .05\)), the number of iterations needed is approximately 385.
# Number of iterations
<- 385
n_iterations
# Set the seed for random number generation to ensure reproducibility
set.seed(2024)
# Create data frame for results (res)
<- data.frame()
res
# Loop over different sample sizes (from 300 to 1050, in steps of 150)
for (n_persons in seq(300, 1050, 150)) {
# Nested loop, running 'n_iterations' times
for (iter in 1:n_iterations) {
<- generate_grm_data(a_i, b_i, n_persons) %>%
dat data_lomo_design()
<- dat %>% dplyr::select(starts_with("PHQ")) %>%
rel1 %>% estimate_irt %>% filter(theta==2.0) %>% pull(rel)
na.omit <- dat %>% dplyr::select(starts_with("BDI")) %>%
rel2 %>% estimate_irt %>% filter(theta==2.0) %>% pull(rel)
na.omit <- dat %>% dplyr::select(starts_with("CESD")) %>%
rel3 %>% estimate_irt %>% filter(theta==2.0) %>% pull(rel)
na.omit
# Create result set with iter, N, and the estimated reliabilities
<- bind_rows(res,
res data.frame(iteration = iter,
n_persons = n_persons,
rel_phq = rel1,
rel_bdi = rel2,
rel_cesd = rel3))
} }
Results
The figure shows the accuracy of the conditional reliability estimate at the boundary between moderate and severe symptom severity for all three depression measures. We use the root mean square error of the reliability, \(\mathrm{RMSE}(\rho) = \sqrt{M((\rho_{\text{est}} - \rho_{\text{true}})^2)}\), at a given theta value (\(\theta = 2.0\)).
# The function estimates the conditional reliability at a given point in the
# trait distribution (theta). Specifically, a response matrix 'resp' is
# generated with columns representing items, then a parameter object 'pars'
# for a graded response model is initiated using the mirt The discrimination
# parameters 'a' and the threshold parameters 'b' are assigned to 'pars'.
# The parameters are fixed - not estimated - by setting 'pars$est' to FALSE.
<- function(a, b, theta) {
rel_true # Fixed item parameters
<- matrix(rep(seq_len(ncol(b) + 1), length(a)), ncol = length(a))
resp colnames(resp) <- paste0("i", seq_len(ncol(resp)))
<- mirt(data = resp, 1, itemtype = "graded", pars = "values")
pars $value[pars$name == "a1"] <- a
pars$value[grepl("^d", pars$name)] <- c(t(-a*b))
pars$est <- FALSE
pars
# Model with fixed item parameters
<- mirt(data = resp, itemtype = "graded",
mod pars = pars, verbose = FALSE)
# Conditional reliability
<- testinfo(mod, Theta = theta)
tinf <- tinf / (1 + tinf)
rel return(rel)
}
# Extract item parameters for the depression measures
<- item_parameter %>% filter(str_starts(Item, "PHQ"))
ipar_phq <- item_parameter %>% filter(str_starts(Item, "BDI"))
ipar_bdi <- item_parameter %>% filter(str_starts(Item, "CESD"))
ipar_cesd
# Calculate true reliabilities of the depression measures
<- rel_true(ipar_phq$a, ipar_phq[, c("b1", "b2", "b3")], 2.0)
rel_true_phq <- rel_true(ipar_bdi$a, ipar_bdi[, c("b1", "b2", "b3")], 2.0)
rel_true_bdi <- rel_true(ipar_cesd$a, ipar_cesd[, c("b1", "b2", "b3")], 2.0)
rel_true_cesd
# Preparation and aggregation of results
<- res %>%
res_plot group_by(n_persons) %>%
summarise(
RMSE_rel_phq = sqrt(mean((rel_phq - rel_true_phq)^2)), # RMSE rel(PHQ)
RMSE_rel_bdi = sqrt(mean((rel_bdi - rel_true_bdi)^2)), # RMSE rel(BDI)
RMSE_rel_cesd = sqrt(mean((rel_cesd - rel_true_cesd)^2)), # RMSE rel(CESD)
.groups = 'drop')
# Plot the RMSE for the reliability estimates across measures
# n_persons/3*2 = sample size per measure
ggplot(data=res_plot, aes(x=n_persons/3*2)) +
geom_line(aes(y = RMSE_rel_phq, color = "PHQ"), linewidth = 0.8, alpha = 0.7) +
geom_point(aes(y = RMSE_rel_phq, color = "PHQ"), size = 1.5, alpha = 0.7) +
geom_line(aes(y = RMSE_rel_bdi, color = "BDI"), linewidth = 0.8, alpha = 0.7) +
geom_point(aes(y = RMSE_rel_bdi, color = "BDI"), size = 1.5, alpha = 0.7) +
geom_line(aes(y = RMSE_rel_cesd, color = "CESD"), linewidth = 0.8, alpha = 0.7) +
geom_point(aes(y = RMSE_rel_cesd, color = "CESD"), size = 1.5, alpha = 0.7) +
scale_color_manual(
values = c("PHQ" = "gold",
"BDI" = "darkorange",
"CESD" = "darkorange3"),
labels = c("PHQ" = "PHQ",
"BDI" = "BDI",
"CESD" = "CESD")
+
) geom_abline(intercept = .01, slope = 0, col="red", lty = "twodash") +
labs(
x = "Sample size (per measure)",
y = "RMSE(reliability)",
color = "Measure"
+
) ylim(0, 0.03) +
xlim(150, 750) +
theme_bw() +
theme(
axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12),
legend.position = "inside",
legend.position.inside = c(.90, .85)
)