2  Compute KM-type weights for NCC sample

This section describes how to use compute_km_weights() to draw NCC samples and compute Kaplan-Meier type (KM-type) weights, and subsequently how to analyze the sample using a weighted Cox approach.

Examples in this section use cohort_2 as the underlying cohort.

References:

2.1 Load packages and data

library(SamplingDesignTools)
library(survival)
library(Epi) # To draw (non-counter-matched) nested case-control sample
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(knitr)
data("cohort_2")

2.2 1:2 NCC matched on age and gender

Consider an idealistic scenario, where the full cohort is available to calculate the KM-type weights. A NCC sample was draw (using Epi::ccwc()) with 2 controls per case, matched on age group and gender.

n_per_case <- 2
ncc_2 <- ccwc(exit = t, fail = y, controls = n_per_case, 
              match = list(age_cat, gender), include = list(x, age, z), 
              data = cohort_2, silent = TRUE)
names(ncc_2)[-(1:4)] <- c("age_cat", "gender", "x", "age", "z")
head(ncc_2, 12)
##    Set   Map       Time Fail age_cat gender x age z
## 1    1    36  0.2016047    1 (35,45]      1 1 -16 0
## 2    1 14705  0.2016047    0 (35,45]      1 1 -14 1
## 3    1  8990  0.2016047    0 (35,45]      1 1 -12 0
## 4    2   888  4.9218685    1 (35,45]      1 1 -20 1
## 5    2 49875  4.9218685    0 (35,45]      1 1 -15 0
## 6    2 17845  4.9218685    0 (35,45]      1 1 -14 0
## 7    3  1037 21.4651890    1 (35,45]      1 1 -15 1
## 8    3 17756 21.4651890    0 (35,45]      1 1 -11 0
## 9    3 93612 21.4651890    0 (35,45]      1 0 -11 0
## 10   4  1200 15.7674384    1 (35,45]      1 1 -15 1
## 11   4 91281 15.7674384    0 (35,45]      1 0 -11 1
## 12   4 51485 15.7674384    0 (35,45]      1 1 -15 0
# Create the sampling and status indicator
sample_stat <- numeric(nrow(cohort_2))
sample_stat[unique(ncc_2$Map[ncc_2$Fail == 0])] <- 1
sample_stat[ncc_2$Map[ncc_2$Fail == 1]] <- 2
table(sample_stat)
## sample_stat
##     0     1     2 
## 91997  5230  2773
ncc_2_nodup <- compute_km_weights(cohort = cohort_2, t_name = "t", y_name = "y",
                                  sample_stat = sample_stat, 
                                  match_var_names = c("age_cat", "gender"), 
                                  n_per_case = n_per_case)
## Start time is 0 for all subjects. Event/censoring time is given by variable t.
## Joining, by = c("age_cat", "gender")
## Joining, by = "strata"
head(ncc_2_nodup)
##    id y          t x age age_cat gender z    km_prob km_weight
## 18 18 0 23.0020275 1 -11 (35,45]      0 1 0.07969598  12.54769
## 36 36 1  0.2016047 1 -16 (35,45]      1 0 1.00000000   1.00000
## 40 40 0 25.0000000 1  17 (65,75]      1 0 0.09368591  10.67396
## 44 44 0 20.1856606 0  -1 (45,55]      1 0 0.07193216  13.90199
## 61 61 1 14.8098399 1   7 (55,65]      0 0 1.00000000   1.00000
## 68 68 0  9.3384792 1  10 (65,75]      0 0 0.04448197  22.48102
summary(ncc_2_nodup$km_weight)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    1.00   10.64   12.21   13.32  696.20
m_cox_ncc_2 <- coxph(Surv(t, y) ~ x * z + gender + age, data = ncc_2_nodup,
                     weights = km_weight, robust = TRUE)
m_clogit_ncc_2 <- clogit(Fail ~ x * z + strata(Set), data = ncc_2)

By breaking the matching in the NCC and performing a weighted analysis, users are also able to estimate the absolute risk that is not available from NCC in conventional conditional approach. This is not illustrated here but have been described in following papers:

2.3 Compare results

Compared to conventional conditional logistic regression analysis of matched sets, weighted Cox analysis of sampled subjects produces smaller SE, and provides estimates for matching factors.

results_2 <- rbind(summary(m_cox_cohort_2)$coef, 
                   summary(m_clogit_ncc_2)$coef, 
                   summary(m_cox_ncc_2)$coef[, -3])
results_2 <- data.frame(Variable = rownames(results_2), results_2, 
                        check.names = FALSE)
rownames(results_2) <- NULL
kable(data.frame(
  Data = c("Full cohort", rep("", 4), 
           "NCC (clogit)", rep("", 2), 
           "NCC (weighted Cox)", rep("", 4)), 
  Variable = results_2$Variable, 
  `True HR` = c(c(1.5, 4, 1.01, 1.01, 2), 
                c(1.5, 4, 2), c(1.5, 4, 1.01, 1.01, 2)),
  `Estimated HR` = results_2[, "exp(coef)"], 
  `SE of log(HR)` = results_2[, "se(coef)"], 
  `p-value` = results_2[, "Pr(>|z|)"], check.names = FALSE
), digits = c(0, 0, 2, 2, 3, 3))
Data Variable True HR Estimated HR SE of log(HR) p-value
Full cohort x 1.50 1.47 0.110 0.001
z 4.00 4.46 0.126 0.000
age 1.01 1.01 0.002 0.000
gender 1.01 0.92 0.038 0.024
x:z 2.00 1.90 0.135 0.000
NCC (clogit) x 1.50 1.35 0.122 0.014
z 4.00 5.05 0.153 0.000
x:z 2.00 1.67 0.163 0.002
NCC (weighted Cox) x 1.50 1.36 0.117 0.009
z 4.00 4.68 0.143 0.000
gender 1.01 0.91 0.054 0.081
age 1.01 1.01 0.003 0.003
x:z 2.00 1.80 0.154 0.000