library(SamplingDesignTools)
library(survival)
library(Epi) # To draw (non-counter-matched) nested case-control sample
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(knitr)
data("cohort_2")
<- 2
n_per_case <- ccwc(exit = t, fail = y, controls = n_per_case,
ncc_2 match = list(age_cat, gender), include = list(x, age, z),
data = cohort_2, silent = TRUE)
names(ncc_2)[-(1:4)] <- c("age_cat", "gender", "x", "age", "z")
head(ncc_2, 12)
## Set Map Time Fail age_cat gender x age z
## 1 1 36 0.2016047 1 (35,45] 1 1 -16 0
## 2 1 14705 0.2016047 0 (35,45] 1 1 -14 1
## 3 1 8990 0.2016047 0 (35,45] 1 1 -12 0
## 4 2 888 4.9218685 1 (35,45] 1 1 -20 1
## 5 2 49875 4.9218685 0 (35,45] 1 1 -15 0
## 6 2 17845 4.9218685 0 (35,45] 1 1 -14 0
## 7 3 1037 21.4651890 1 (35,45] 1 1 -15 1
## 8 3 17756 21.4651890 0 (35,45] 1 1 -11 0
## 9 3 93612 21.4651890 0 (35,45] 1 0 -11 0
## 10 4 1200 15.7674384 1 (35,45] 1 1 -15 1
## 11 4 91281 15.7674384 0 (35,45] 1 0 -11 1
## 12 4 51485 15.7674384 0 (35,45] 1 1 -15 0
3 Compute weights for NCC without full cohort
KM-type weights can be computed for the NCC sample as long the time of event/censoring for each subject is available, and the number of subjects at risk can be obtained (or approximated) elsewhere.
Examples in this section use cohort_2
as the underlying cohort, but consider a more realistic scenario where the cohort is no longer available, and subjects at risk are approximated using the number of subjects at risk in a year.
3.1 Load packages and data
3.2 Coarsen time
To match NCC sample to the coarsened time frame, create a variable t_yr
by rounding the exact event/censoring time, t
to the next integer, and use this coarsened time to compute the number at risk:
$t <- cohort_2$t[ncc_2$Map]
ncc_2$t_yr <- ceiling(ncc_2$t)
ncc_2$t_yr <- ceiling(cohort_2$t)
cohort_2<- compute_risk_table(cohort = cohort_2, t_name = "t_yr",
risk_table_coarse y_name = "y",
match_var_names = c("age_cat", "gender"))
## Start time is 0 for all subjects. Event/censoring time is given by variable t_yr.
## Joining, by = c("age_cat", "gender")
## Joining, by = "strata"
head(risk_table_coarse)
## t_event n_event n_at_risk age_cat gender
## 1 1 3 1165 (-Inf,35] 0
## 2 3 2 1045 (-Inf,35] 0
## 3 4 2 983 (-Inf,35] 0
## 4 5 1 928 (-Inf,35] 0
## 5 8 1 806 (-Inf,35] 0
## 6 9 3 772 (-Inf,35] 0
3.3 1:2 NCC based on coarsened time
In reality, number at risk at each event time may be approximated by, e.g., size of the relevant sub-population at mid-year. In such case, user may use the following function to generate a template for risk_table_coarse
to fill in:
<- prepare_risk_table(ncc = ncc_2, t_match_name = "t_yr",
risk_table_template y_name = "Fail",
match_var_names = c("gender", "age_cat"),
csv_file = NULL)
head(risk_table_template)
## t_event gender age_cat n_at_risk
## 1 1 0 (-Inf,35] NA
## 2 1 0 (35,45] NA
## 3 1 0 (45,55] NA
## 4 1 0 (55,65] NA
## 5 1 0 (65,75] NA
## 6 1 0 (75, Inf] NA
This template will be written to a csv
if specified by csv_file
, making it easier to supply information regarding the cohort that is required for computing the KM-type weights.
Assuming that risk_table_coarse
is the approximated risk table obtained from external sources, the KM-type weights for ncc_2
generated in the previous section can be computed using the same compute_km_weights()
function and subsequently analyzed using a weighted Cox approach:
<- compute_km_weights(ncc = ncc_2[, -1],
ncc_nodup2 risk_table_manual = risk_table_coarse,
t_name = "t", y_name = "Fail",
t_match_name = "t_yr",
id_name = "Map",
match_var_names = c("age_cat", "gender"),
n_per_case = 5)
## Make sure input ncc does not include ID of matched sets. Failing to do so results in Errors.
## Joining, by = c("age_cat", "gender")
## Joining, by = c(".t_event", "age_cat", "gender", "n_event")
## Joining, by = c("age_cat", "gender")
## There are 8003 unique subjects (identified by Map) in the input ncc with 8319
## rows, therefore the return data only has 8003 rows.
<- coxph(Surv(t, Fail) ~ x * z + age + gender,
m_cox_ncc_2_v2 data = ncc_nodup2, weights = km_weight, robust = TRUE)
3.4 Compare results
Compare with results when the full cohort is available:
<- rbind(summary(m_cox_cohort_2)$coef,
results_3 summary(m_cox_ncc_2)$coef[, -3],
summary(m_cox_ncc_2_v2)$coef[, -3])
<- data.frame(Variable = rownames(results_3), results_3,
results_3 check.names = FALSE)
rownames(results_3) <- NULL
kable(data.frame(
Data = c("Full cohort", rep("", 4),
"NCC (weighted Cox)", rep("", 4),
"NCC (weighted Cox, approximated weights)", rep("", 4)),
Variable = results_3$Variable,
`True HR` = rep(c(1.5, 4, 1.01, 1.01, 2), 3),
`Estimated HR` = results_3[, "exp(coef)"],
`SE of log(HR)` = results_3[, "se(coef)"],
`p-value` = results_3[, "Pr(>|z|)"], check.names = FALSE
digits = c(0, 0, 2, 2, 3, 3)) ),
Data | Variable | True HR | Estimated HR | SE of log(HR) | p-value |
---|---|---|---|---|---|
Full cohort | x | 1.50 | 1.47 | 0.110 | 0.001 |
z | 4.00 | 4.46 | 0.126 | 0.000 | |
age | 1.01 | 1.01 | 0.002 | 0.000 | |
gender | 1.01 | 0.92 | 0.038 | 0.024 | |
x:z | 2.00 | 1.90 | 0.135 | 0.000 | |
NCC (weighted Cox) | x | 1.50 | 1.36 | 0.117 | 0.009 |
z | 4.00 | 4.68 | 0.143 | 0.000 | |
gender | 1.01 | 0.91 | 0.054 | 0.081 | |
age | 1.01 | 1.01 | 0.003 | 0.003 | |
x:z | 2.00 | 1.80 | 0.154 | 0.000 | |
NCC (weighted Cox, approximated weights) | x | 1.50 | 1.31 | 0.110 | 0.014 |
z | 4.00 | 3.85 | 0.127 | 0.000 | |
age | 1.01 | 1.00 | 0.002 | 0.698 | |
gender | 1.01 | 1.00 | 0.038 | 0.979 | |
x:z | 2.00 | 1.41 | 0.136 | 0.012 |