roseetal2019nothingstakeknowledge
/data/papers/roseetal2019nothingstakeknowledge/analysis/effect_sizes.qmd
---
title: "Effect size computations: roseetal2019nothingstakeknowledge"
format:
  html:
    toc: true
execute:
  echo: true
  warning: true
  message: false
---

Computes standardized mean differences (`d`) and sampling variances (`v`) for the
extraction YAML `papers/roseetal2019nothingstakeknowledge/roseetal2019nothingstakeknowledge.yaml`.

The paper reports site-level `N` and Cramer's `V` for 2×2 (stakes × binary outcome) χ² tests.
Because the 2×2 cell counts are not reported, we use the *phi coefficient* conversion formulas
from the Campbell Collaboration Effect Size Calculator equations (Correlation / phi coefficient, §1.29).
Campbell notes that if 2×2 cell counts (or per-condition proportions) are available, logit/probit-based approximations are preferable; this is the best available option given the reporting in this paper.

Sign convention in extraction: `d = mean(low) - mean(high)`.

## Inputs

Tables (extracted from the paper):
- Knowledge Attribution: `../out/tables/tabula_stream_p10_t2.csv`
- Strict Knowledge Attribution: `../out/tables/tabula_stream_p14_t5.csv`
- Strict Knowledge Attribution (conditional on Knowledge Attribution): `../out/tables/tabula_stream_p18_t7.csv`

Effect direction (sign of `d`) was taken from the paper’s difference plots (difference = Low − High):
- Knowledge Attribution: Fig. 2
- Strict Knowledge Attribution: Fig. 5
- Strict Knowledge Attribution (conditional): Fig. 8

## Formula (Campbell, §1.29)

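Let `r` be the phi correlation coefficient. For a 2×2 table, Cramer's `V = |phi|`, so we take `|r| = V`, compute `|d|` from it, and then attach the sign read from the figures to `d` (the variance depends only on `r^2`, so it needs no sign).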

- `d = 2r / sqrt(1 - r^2)`
- `v_d = d^2 / chi^2`
- `chi^2 = r^2 * (n1 + n2) = r^2 * N`

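Substituting `d^2 = 4r^2 / (1 - r^2)` and `chi^2 = r^2 * N` cancels the `r^2` terms and yields an equivalent expression (used below; unlike `d^2 / chi^2`, it remains defined when `r = 0`):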

- `v_d = 4 / (N * (1 - r^2))`

## Compute d and v

```{r}
# Identifier of the paper whose effects are computed in this document.
paper_key <- "roseetal2019nothingstakeknowledge"

# Direction of every d reported below.
sign_convention <- "d = mean(low) - mean(high)"

# Site labels. The position of a label in this vector is used as its
# study_id and fixes the row order of every table read below.
sites <- c(
  "Mexico",
  "USA",
  "Brazil",
  "Bulgaria",
  "France",
  "Germany",
  "Italy",
  "Portugal",
  "Spain",
  "Switzerland",
  "UK",
  "Iran",
  "China",
  "Hong Kong",
  "Guangzhou China",
  "Mainland China",
  "Mongolia",
  "Japan",
  "India"
)

# Read one extracted results table and keep only the per-site rows.
#
# Arguments:
#   path  Path to a CSV with (at least) the columns `Sample`, `N` and
#         `Cramer's V` (a curly apostrophe in the header is accepted).
#
# Returns a data.frame with columns `site` (character), `n_total` (integer)
# and `v_cramers` (numeric), restricted to rows whose `Sample` is listed in
# the global `sites` vector and whose `N` is present, ordered as in `sites`.
#
# Stops with an explicit message when a required column is absent, instead of
# silently returning zero rows or failing later inside data.frame().
read_table <- function(path) {
  df <- read.csv(path, na.strings = c("", " "), check.names = FALSE)
  # Normalize the curly apostrophe in the "Cramer's V" header to a straight one.
  names(df) <- gsub("\u2019", "'", names(df))

  required <- c("Sample", "N", "Cramer's V")
  missing_cols <- setdiff(required, names(df))
  if (length(missing_cols) > 0) {
    stop(
      "read_table: required column(s) missing from ", path, ": ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # `[[` matches column names exactly; `$` would partial-match (e.g. `N` could
  # pick up a column named `N total`).
  # Extracted tables can carry stray padding around labels, so trim before
  # comparing against `sites`.
  sample <- trimws(as.character(df[["Sample"]]))
  keep <- !is.na(df[["N"]]) & sample %in% sites
  df <- df[keep, , drop = FALSE]
  sample <- sample[keep]

  # Put the rows in the canonical site order.
  row_order <- order(match(sample, sites))
  df <- df[row_order, , drop = FALSE]

  data.frame(
    site = sample[row_order],
    n_total = as.integer(df[["N"]]),
    v_cramers = as.numeric(df[["Cramer's V"]]),
    stringsAsFactors = FALSE
  )
}

# Convert an absolute phi correlation into an absolute standardized mean
# difference: |d| = 2|r| / sqrt(1 - r^2). Vectorized over `r_abs`.
phi_to_d_abs <- function(r_abs) {
  one_minus_r2 <- 1 - r_abs^2
  numerator <- 2 * r_abs
  numerator / sqrt(one_minus_r2)
}

# Sampling variance of d obtained from a phi correlation.
#
# Campbell, §1.29 gives v_d = d^2 / chi^2 with chi^2 = r^2 * N; substituting
# d = 2r / sqrt(1 - r^2) reduces this to v_d = 4 / (N * (1 - r^2)), which is
# the form evaluated here. Vectorized over both arguments.
var_d_from_phi <- function(n_total, r_abs) {
  one_minus_r2 <- 1 - r_abs^2
  denominator <- n_total * one_minus_r2
  4 / denominator
}

# Direction of each site's effect (+1 or -1), listed in the same order as
# `sites` and named by site so it can be looked up by site label.
sign_e1 <- setNames(
  c(+1, +1, -1, +1, +1, +1, -1, -1, +1, -1, +1, -1, -1, +1, -1, -1, +1, +1, +1),
  sites
)
sign_e2 <- setNames(
  c(+1, +1, -1, +1, +1, +1, +1, +1, +1, +1, +1, -1, +1, +1, -1, -1, -1, +1, +1),
  sites
)
sign_e3 <- setNames(
  c(+1, +1, -1, +1, +1, +1, +1, +1, +1, +1, +1, -1, +1, +1, +1, -1, -1, +1, -1),
  sites
)

# Build the per-site effect-size table for one outcome.
#
# Arguments:
#   effect_suffix    Suffix appended to each effect id (e.g. "e1").
#   subgroup         Outcome label stored in the `subgroup` column.
#   fig_sign_source  Figure number recorded in the `sign_source` column.
#   table_path       CSV file handed to read_table().
#   sign_map         Numeric vector named by site; its value multiplies |d|.
#
# Returns a data.frame with one row per site returned by read_table(),
# carrying identifiers, the inputs (N, Cramer's V), the applied sign, and the
# resulting d and v. Row names are the default integer sequence.
#
# Stops if any site in the table has no entry in `sign_map`.
compute_effects <- function(effect_suffix, subgroup, fig_sign_source, table_path, sign_map) {
  df <- read_table(table_path)

  # Indexing a named vector by an unknown name yields NA, which would silently
  # turn d into NA; fail here with the offending site(s) instead.
  unknown <- setdiff(df$site, names(sign_map))
  if (length(unknown) > 0) {
    stop(
      "compute_effects: no sign recorded for site(s): ",
      paste(unknown, collapse = ", "),
      call. = FALSE
    )
  }

  # unname(): a named vector placed in data.frame() becomes the row names, and
  # rbind() of several such tables then rewrites them to "Mexico1", "Mexico2".
  site_sign <- unname(sign_map[df$site])
  study_idx <- match(df$site, sites)

  r_abs <- abs(df$v_cramers)
  d <- site_sign * phi_to_d_abs(r_abs)
  v <- var_d_from_phi(df$n_total, r_abs)

  data.frame(
    paper_key = paper_key,
    study_id = study_idx,
    effect_id = paste0("s", study_idx, "_", effect_suffix),
    site = df$site,
    subgroup = subgroup,
    sign_convention = sign_convention,
    sign_source = paste0("Fig. ", fig_sign_source, " (difference Low - High)"),
    n_total = df$n_total,
    cramers_v = df$v_cramers,
    r_abs = r_abs,
    sign = site_sign,
    d = d,
    v = v,
    stringsAsFactors = FALSE
  )
}

# Knowledge Attribution; signs read from Fig. 2.
e1 <- compute_effects(
  "e1",
  "Knowledge Attribution",
  2,
  "../out/tables/tabula_stream_p10_t2.csv",
  sign_e1
)

# Strict Knowledge Attribution; signs read from Fig. 5.
e2 <- compute_effects(
  "e2",
  "Strict Knowledge Attribution",
  5,
  "../out/tables/tabula_stream_p14_t5.csv",
  sign_e2
)

# Strict Knowledge Attribution, conditional; signs read from Fig. 8.
e3 <- compute_effects(
  "e3",
  "Strict Knowledge Attribution (conditional)",
  8,
  "../out/tables/tabula_stream_p18_t7.csv",
  sign_e3
)

# Stack the three outcome tables (e1 rows, then e2, then e3) and display them.
audit <- do.call(rbind, list(e1, e2, e3))
audit
```

## Quick sanity checks

```{r}
# Every outcome table must contribute one row for each of the 19 sites.
stopifnot(nrow(e1) == 19, nrow(e2) == 19, nrow(e3) == 19)
# A row count of 19 alone would not notice one site appearing twice while
# another is missing, so also require each site at most once per outcome.
stopifnot(
  anyDuplicated(e1$site) == 0,
  anyDuplicated(e2$site) == 0,
  anyDuplicated(e3$site) == 0
)
# is.finite() rejects NA/NaN and also the Inf produced when Cramer's V is 1;
# a sampling variance must be strictly positive.
stopifnot(
  all(is.finite(audit$d)),
  all(is.finite(audit$v)),
  all(audit$v > 0)
)

# Ensure ordering matches the extraction YAML (study_id 1..19, then e1/e2/e3 per study).
audit$effect_order <- as.integer(sub(".*_e", "", audit$effect_id))
audit <- audit[order(audit$study_id, audit$effect_order), ]

audit[c("effect_id", "site", "subgroup", "n_total", "cramers_v", "sign", "d", "v")]
```