Data analysis summary day 1

Author

KB Christensen

Data example: five symptoms in RA patients

Without Any Difficulty? (0) With Some Difficulty? (1) With Much Difficulty? (2) Unable To Do (3)

Latent (unobservable) variable \(\Theta\), items \((X_i)_{i\in I}\), categorical exogenous variable \(Y\) (gender, treatment group, age group)

dat <- read.csv("https://biostat.ku.dk/PROMS/exampledata.csv")

tabulate items

table(dat$it1, useNA = "ifany")

   0    1    2    3 <NA> 
  71  163   79   80    9 
table(dat$it2, useNA = "ifany")

   0    1    2    3 <NA> 
 123  117   87   64   11 
table(dat$it3, useNA = "ifany")

   0    1    2    3 <NA> 
  45  115   97  136    9 
table(dat$it4, useNA = "ifany")

   0    1    2    3 <NA> 
  41  100  104  150    7 
table(dat$it5, useNA = "ifany")

   0    1    2    3 <NA> 
 114  131   88   64    5 

visualize items

count <- c(table(dat$it1), table(dat$it2), table(dat$it3), table(dat$it4), table(dat$it5))
item <- c(rep("it1", 4), rep("it2", 4), rep("it3", 4), rep("it4", 4), rep("it5", 4))
resp <- c(rep(c(0:3), 5))
data <- data.frame(count, item, resp)
## install.packages("ggplot2", repos="http://cran.rstudio.com/")
library(ggplot2)
ggplot(data, aes(fill=resp, y=count, x=item)) + 
  geom_bar(position="fill", stat="identity")

Comparing men and women using the total score

dat$score <- dat$it1 + dat$it2 + dat$it3 + dat$it4 + dat$it5
M <- dat$score[(dat$gender==1)]
W <- dat$score[(dat$gender==2)]
wilcox.test(M,W)

    Wilcoxon rank sum test with continuity correction

data:  M and W
W = 12891, p-value = 0.03381
alternative hypothesis: true location shift is not equal to 0

Graphical illustration of monotonicity

dat$R1 <- dat$it2 + dat$it3 + dat$it4 + dat$it5
qplot(dat$R1, dat$it1, geom = 'smooth', span = 0.95) + 
  labs(x = 'rest score', y = 'Item 1') +
  ylim(0, 4)

dat$R2 <- dat$it1 + dat$it3 + dat$it4 + dat$it5
qplot(dat$R2, dat$it2, geom = 'smooth', span = 0.95) + 
  labs(x = 'rest score', y = 'Item 2') + 
  ylim(0, 3)

dat$R3 <- dat$it1 + dat$it2 + dat$it4 + dat$it5
qplot(dat$R3, dat$it3, geom = 'smooth', span = 0.95) + 
  labs(x = 'rest score', y = 'Item 3') + 
  ylim(0, 3)

dat$R4 <- dat$it1 + dat$it2 + dat$it3 + dat$it5
qplot(dat$R4, dat$it4, geom = 'smooth', span = 0.95) + 
  labs(x = 'rest score', y = 'Item 4') +
  ylim(0, 3)

dat$R5 <- dat$it1 + dat$it2 + dat$it3 + dat$it4
qplot(dat$R5, dat$it5, geom = 'smooth', span = 0.95)  + 
  labs(x = 'rest score', y = 'Item 5') + 
  ylim(0, 3)

Another graphical illustration of monotonicity

dat$scoregroup <- cut(dat$score, 4)
means <- aggregate(it1 ~ scoregroup, dat, mean)
ggplot(data = means, aes(x = scoregroup, y = it1)) +
  geom_point(size = 3) +
  theme(legend.position = "none") +
  ylim(0, 3)

means <- aggregate(it2 ~ scoregroup, dat, mean)
ggplot(data = means, aes(x = scoregroup, y = it2)) +
  geom_point(size = 3) +
  theme(legend.position = "none") +
  ylim(0, 3)

means <- aggregate(it3 ~ scoregroup, dat, mean)
ggplot(data = means, aes(x = scoregroup, y = it3)) +
  geom_point(size = 3) +
  theme(legend.position = "none") +
  ylim(0, 3)

means <- aggregate(it4 ~ scoregroup, dat, mean)
ggplot(data = means, aes(x = scoregroup, y = it4)) +
  geom_point(size = 3) +
  theme(legend.position = "none") +
  ylim(0, 3)

means <- aggregate(it5 ~ scoregroup, dat, mean)
ggplot(data = means, aes(x = scoregroup, y = it5)) +
  geom_point(size = 3) +
  theme(legend.position = "none") +
  ylim(0, 3)

compute inter-item correlations

it <- c("it1", "it2", "it3", "it4", "it5")
correl <- cor(dat[, it], use = "pairwise.complete.obs")
round(correl, 2)
     it1  it2  it3  it4  it5
it1 1.00 0.73 0.77 0.74 0.74
it2 0.73 1.00 0.72 0.69 0.73
it3 0.77 0.72 1.00 0.90 0.72
it4 0.74 0.69 0.90 1.00 0.70
it5 0.74 0.73 0.72 0.70 1.00

Compute \(\alpha\)

## install.packages("multilevel", repos="http://cran.rstudio.com/")
library(multilevel)
cr <- cronbach(dat[, it])
cr$Alpha
[1] 0.9355742

CTT data analysis

Assume that

\[ \begin{bmatrix} x_1\\x_2\\x_3\\x_4\\x_5 \end{bmatrix} \sim N(\mu, \Sigma), \;\;\;\; \mu= \begin{bmatrix} \mu_1\\\mu_2\\\mu_3\\\mu_4\\\mu_5 \end{bmatrix}, \;\;\;\; \Sigma= \begin{bmatrix} \sigma^2_1&\sigma_{12}&\sigma_{13}&\sigma_{14}&\sigma_{15}\\ &\sigma^2_2&\sigma_{23}&\sigma_{24}&\sigma_{25}\\ &&\sigma^2_3&\sigma_{34}&\sigma_{35}\\ &&&\sigma^2_4&\sigma_{45}\\ &&&&\sigma^2_5 \end{bmatrix} \]

X <- dat[, c("it1", "it2", "it3", "it4", "it5")]
mu <- colMeans(X , na.rm = TRUE)
Sigma <- cov(X, use = "pairwise.complete.obs")

Scree plot

X.comp <- X[complete.cases(X), ]
pca <- prcomp(X.comp, scale = TRUE)
eigenvalues <- pca$sdev^2
plot(eigenvalues, 
     type = "b",
     xlab = "Principal Component",
     ylab = "Eigenvalue")

One “factor” explains most of the variation

X.comp <- X[complete.cases(X), ]
pca <- prcomp(X.comp, scale = TRUE)
eigenvalues <- pca$sdev^2
plot(eigenvalues/sum(eigenvalues), 
     type = "b",
     xlab = "Principal Component",
     ylab = "Percentage of Variance Explained")

EFA - two factors

\[ \begin{eqnarray} x_{1i} & = & \lambda_{11} f_{1i} + \lambda_{12} f_{2i} + u_{1i} \\ x_{2i} & = & \lambda_{21} f_{1i} + \lambda_{22} f_{2i} + u_{2i} \\ x_{3i} & = & \lambda_{31} f_{1i} + \lambda_{32} f_{2i} + u_{3i} \\ x_{4i} & = & \lambda_{41} f_{1i} + \lambda_{42} f_{2i} + u_{4i} \\ x_{5i} & = & \lambda_{51} f_{1i} + \lambda_{52} f_{2i} + u_{5i} \\ \end{eqnarray} \]

factanal(~ ., data = X, factors = 2, rotation = "none")

Call:
factanal(x = ~., factors = 2, data = X, rotation = "none")

Uniquenesses:
  it1   it2   it3   it4   it5 
0.250 0.285 0.017 0.166 0.244 

Loadings:
    Factor1 Factor2
it1  0.800   0.331 
it2  0.750   0.389 
it3  0.990         
it4  0.913         
it5  0.753   0.435 

               Factor1 Factor2
SS loadings      3.583   0.455
Proportion Var   0.717   0.091
Cumulative Var   0.717   0.808

Test of the hypothesis that 2 factors are sufficient.
The chi square statistic is 0.51 on 1 degree of freedom.
The p-value is 0.475 

EFA - one factor

\[ \begin{eqnarray} x_{1i} & = & \lambda_{11} f_{1i} + u_{1i} \\ x_{2i} & = & \lambda_{21} f_{1i} + u_{2i} \\ x_{3i} & = & \lambda_{31} f_{1i} + u_{3i} \\ x_{4i} & = & \lambda_{41} f_{1i} + u_{4i} \\ x_{5i} & = & \lambda_{51} f_{1i} + u_{5i} \\ \end{eqnarray} \]

factanal(~ ., data = X, factors = 1, rotation = "none")

Call:
factanal(x = ~., factors = 1, data = X, rotation = "none")

Uniquenesses:
  it1   it2   it3   it4   it5 
0.310 0.388 0.099 0.141 0.379 

Loadings:
    Factor1
it1 0.831  
it2 0.782  
it3 0.949  
it4 0.927  
it5 0.788  

               Factor1
SS loadings      3.682
Proportion Var   0.736

Test of the hypothesis that 1 factor is sufficient.
The chi square statistic is 113.21 on 5 degrees of freedom.
The p-value is 8.58e-23 

CFA better here because there is a strong theory about the structure:

\[ x_{k,i} = \lambda_{k} f_{1} + u_{k,i}, \;\;\;\;\; (k = 1,2,3,4,5) \]

# install.packages("lavaan")
library(lavaan)
model1 <- 'F1 =~ it1 + it2 + it3 + it4 + it5'
fit <- lavaan::cfa(model = model1, data = X)

This model imposes a structure on the correlation matrix. CFA tests this structure against the observed correlation matrix

library(lavaan)
summary(fit)
lavaan 0.6.17 ended normally after 21 iterations

  Estimator                                         ML
  Optimization method                           NLMINB
  Number of model parameters                        10

                                                  Used       Total
  Number of observations                           379         402

Model Test User Model:
                                                      
  Test statistic                               114.471
  Degrees of freedom                                 5
  P-value (Chi-square)                           0.000

Parameter Estimates:

  Standard errors                             Standard
  Information                                 Expected
  Information saturated (h1) model          Structured

Latent Variables:
                   Estimate  Std.Err  z-value  P(>|z|)
  F1 =~                                               
    it1               1.000                           
    it2               1.002    0.055   18.165    0.000
    it3               1.170    0.047   24.888    0.000
    it4               1.136    0.047   23.942    0.000
    it5               0.988    0.054   18.367    0.000

Variances:
                   Estimate  Std.Err  z-value  P(>|z|)
   .it1               0.315    0.026   12.219    0.000
   .it2               0.447    0.035   12.683    0.000
   .it3               0.106    0.015    7.218    0.000
   .it4               0.149    0.016    9.159    0.000
   .it5               0.418    0.033   12.640    0.000
    F1                0.701    0.071    9.849    0.000