2 Compute KM-type weights for NCC sample

This section describes how to use compute_km_weights() to draw NCC samples and compute Kaplan-Meier type (KM-type) weights, and subsequently how to analyze the sample using a weighted Cox approach.

Examples in this section use cohort_2 as the underlying cohort.

References:

Samuelsen SO. A psudolikelihood approach to analysis of nested case-control studies. Biometrika. 1997 Jun 1;84(2):379-94.
Borgan Ø, Samuelsen SO. A review of cohort sampling designs for Cox’s regression model: potentials in epidemiology. Norsk Epidemiologi. 2003;13(2).

2.1 Load packages and data

library(SamplingDesignTools)
library(survival)
library(Epi) # To draw (non-counter-matched) nested case-control sample
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(knitr)
data("cohort_2")

2.2 1:2 NCC matched on age and gender

Consider an idealistic scenario, where the full cohort is available to calculate the KM-type weights. A NCC sample was draw (using Epi::ccwc()) with 2 controls per case, matched on age group and gender.

n_per_case <- 2
ncc_2 <- ccwc(exit = t, fail = y, controls = n_per_case, 
              match = list(age_cat, gender), include = list(x, age, z), 
              data = cohort_2, silent = TRUE)
names(ncc_2)[-(1:4)] <- c("age_cat", "gender", "x", "age", "z")
head(ncc_2, 12)
##    Set   Map       Time Fail age_cat gender x age z
## 1    1    36  0.2016047    1 (35,45]      1 1 -16 0
## 2    1 14705  0.2016047    0 (35,45]      1 1 -14 1
## 3    1  8990  0.2016047    0 (35,45]      1 1 -12 0
## 4    2   888  4.9218685    1 (35,45]      1 1 -20 1
## 5    2 49875  4.9218685    0 (35,45]      1 1 -15 0
## 6    2 17845  4.9218685    0 (35,45]      1 1 -14 0
## 7    3  1037 21.4651890    1 (35,45]      1 1 -15 1
## 8    3 17756 21.4651890    0 (35,45]      1 1 -11 0
## 9    3 93612 21.4651890    0 (35,45]      1 0 -11 0
## 10   4  1200 15.7674384    1 (35,45]      1 1 -15 1
## 11   4 91281 15.7674384    0 (35,45]      1 0 -11 1
## 12   4 51485 15.7674384    0 (35,45]      1 1 -15 0
# Create the sampling and status indicator
sample_stat <- numeric(nrow(cohort_2))
sample_stat[unique(ncc_2$Map[ncc_2$Fail == 0])] <- 1
sample_stat[ncc_2$Map[ncc_2$Fail == 1]] <- 2
table(sample_stat)
## sample_stat
##     0     1     2 
## 91997  5230  2773
ncc_2_nodup <- compute_km_weights(cohort = cohort_2, t_name = "t", y_name = "y",
                                  sample_stat = sample_stat, 
                                  match_var_names = c("age_cat", "gender"), 
                                  n_per_case = n_per_case)
## Start time is 0 for all subjects. Event/censoring time is given by variable t.
## Joining, by = c("age_cat", "gender")
## Joining, by = "strata"
head(ncc_2_nodup)
##    id y          t x age age_cat gender z    km_prob km_weight
## 18 18 0 23.0020275 1 -11 (35,45]      0 1 0.07969598  12.54769
## 36 36 1  0.2016047 1 -16 (35,45]      1 0 1.00000000   1.00000
## 40 40 0 25.0000000 1  17 (65,75]      1 0 0.09368591  10.67396
## 44 44 0 20.1856606 0  -1 (45,55]      1 0 0.07193216  13.90199
## 61 61 1 14.8098399 1   7 (55,65]      0 0 1.00000000   1.00000
## 68 68 0  9.3384792 1  10 (65,75]      0 0 0.04448197  22.48102
summary(ncc_2_nodup$km_weight)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    1.00   10.64   12.21   13.32  696.20
m_cox_ncc_2 <- coxph(Surv(t, y) ~ x * z + gender + age, data = ncc_2_nodup,
                     weights = km_weight, robust = TRUE)
m_clogit_ncc_2 <- clogit(Fail ~ x * z + strata(Set), data = ncc_2)

By breaking the matching in the NCC and performing a weighted analysis, users are also able to estimate the absolute risk that is not available from NCC in conventional conditional approach. This is not illustrated here but have been described in following papers:

Salim A, Delcoigne B, Villaflores K, Koh WP, Yuan JM, van Dam RM, Reilly M. Comparisons of risk prediction methods using nested case‐control data. Statistics in medicine. 2017 Feb 10;36(3):455-65.
Delcoigne B, Colzani E, Prochazka M, Gagliardi G, Hall P, Abrahamowicz M, Czene K, Reilly M. Breaking the matching in nested case–control data offered several advantages for risk estimation. Journal of clinical epidemiology. 2017 Feb 1;82:79-86.

2.3 Compare results

Compared to conventional conditional logistic regression analysis of matched sets, weighted Cox analysis of sampled subjects produces smaller SE, and provides estimates for matching factors.

results_2 <- rbind(summary(m_cox_cohort_2)$coef, 
                   summary(m_clogit_ncc_2)$coef, 
                   summary(m_cox_ncc_2)$coef[, -3])
results_2 <- data.frame(Variable = rownames(results_2), results_2, 
                        check.names = FALSE)
rownames(results_2) <- NULL
kable(data.frame(
  Data = c("Full cohort", rep("", 4), 
           "NCC (clogit)", rep("", 2), 
           "NCC (weighted Cox)", rep("", 4)), 
  Variable = results_2$Variable, 
  `True HR` = c(c(1.5, 4, 1.01, 1.01, 2), 
                c(1.5, 4, 2), c(1.5, 4, 1.01, 1.01, 2)),
  `Estimated HR` = results_2[, "exp(coef)"], 
  `SE of log(HR)` = results_2[, "se(coef)"], 
  `p-value` = results_2[, "Pr(>|z|)"], check.names = FALSE
), digits = c(0, 0, 2, 2, 3, 3))

Data	Variable	True HR	Estimated HR	SE of log(HR)	p-value
Full cohort	x	1.50	1.47	0.110	0.001
	z	4.00	4.46	0.126	0.000
	age	1.01	1.01	0.002	0.000
	gender	1.01	0.92	0.038	0.024
	x:z	2.00	1.90	0.135	0.000
NCC (clogit)	x	1.50	1.35	0.122	0.014
	z	4.00	5.05	0.153	0.000
	x:z	2.00	1.67	0.163	0.002
NCC (weighted Cox)	x	1.50	1.36	0.117	0.009
	z	4.00	4.68	0.143	0.000
	gender	1.01	0.91	0.054	0.081
	age	1.01	1.01	0.003	0.003
	x:z	2.00	1.80	0.154	0.000