porteretalndpuzzleaboutknowledge
/data/papers/porteretalndpuzzleaboutknowledge/analysis/effect_sizes.qmd
---
title: "Effect size computations: porteretalndpuzzleaboutknowledge"
format:
  html:
    toc: true
    embed-resources: true
execute:
  echo: true
  warning: true
  message: false
---

Computes stakes effects separately for each evidence condition (`weak` vs `strong`) from the raw OSF data,
for use in the extraction YAML `papers/porteretalndpuzzleaboutknowledge/porteretalndpuzzleaboutknowledge.yaml`.

- Evidence-fixed (binary knowledge attribution): exact 2x2 counts with `esc::esc_2x2`.
- Evidence-seeking (numeric checks required): group means/SDs with `esc::esc_mean_sd`.

Sign convention throughout:

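- `d` is oriented as low stakes minus high stakes (`mean(low stakes) - mean(high stakes)` in the numerator), so `d > 0` means the low-stakes group scored higher.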

```{r}
suppressPackageStartupMessages({
  library(dplyr)
  library(esc)
})
```

## Data source and filtering

```{r}
# Key identifying this paper; written into every output row below.
paper_key <- "porteretalndpuzzleaboutknowledge"

# Local copies of the raw data, tried in this order before falling back to a
# fresh download.
raw_candidates <- c(
  "../data/GPP_study3_nlp.csv",
  "../data/GPP study 3 nlp.csv",
  "/tmp/GPP_study3_nlp.csv"
)

# First candidate that exists on disk, or NA when none does.
raw_path <- raw_candidates[file.exists(raw_candidates)][1]
if (is.na(raw_path) || !nzchar(raw_path)) {
  raw_path <- tempfile(fileext = ".csv")
  download_status <- download.file(
    "https://osf.io/download/59qd6/",
    destfile = raw_path,
    mode = "wb",
    quiet = TRUE
  )
  # download.file() signals some failures only through a non-zero return
  # code, so check it (and the file itself) instead of letting read.csv()
  # fail later with a less helpful message.
  if (download_status != 0 || !file.exists(raw_path) || file.size(raw_path) == 0) {
    stop(
      "Could not download raw data from https://osf.io/download/59qd6/ ",
      "(download.file status ", download_status, ").",
      call. = FALSE
    )
  }
}

# check.names = FALSE keeps the column names exactly as they appear in the CSV.
raw <- read.csv(raw_path, check.names = FALSE)

# Sample label -> population code. The compute functions below filter the
# `population` column on these codes, and the position of each entry gives
# the study number used when the result rows are assembled.
population_map <- c(
  "China" = "china",
  "Russia" = "russia",
  "Slovakia" = "europe_slovak",
  "Ecuador - Spanish" = "ecuador_spanish",
  "India - Hindi" = "india_hindi",
  "India - Meitei" = "india_meitei",
  "Japan" = "japan",
  "Morocco" = "morocco",
  "Peru - Shipibo" = "peru_shipibo",
  "Peru - Spanish" = "peru_spanish",
  "South Africa - Afrikaans" = "safrica_afrikaans",
  "South Africa - Sepedi" = "safrica_sepedi",
  "South Africa - isiZulu" = "safrica_isizulu",
  "South Korea" = "korea",
  "United States" = "usa_mturk"
)

# Sample labels in study order, read off the lookup so the two cannot drift.
sample_order <- names(population_map)

# Sample restrictions, following the paper's OSF analysis:
#   1. keep rows with q1_importance below 3;
#   2. recode the 1/2 answers of q1_importance and q2_knowledge to 1/0;
#   3. pool the three Russian sub-sites under "russia";
#   4. keep respondents aged 18 or older whose importance answer agrees with
#      `stakes` (comprehension check) and whose evidence condition is known.

# Maps 1 -> 1 and 2 -> 0; any other value is passed through unchanged.
one_two_to_binary <- function(x) {
  ifelse(x == 1, 1, ifelse(x == 2, 0, x))
}

russia_sites <- c(
  "europe_russian_syktyvkar",
  "europe_russian_moscow",
  "europe_russian_stpbg"
)

base <- raw %>%
  filter(q1_importance < 3) %>%
  mutate(
    q1_importance = one_two_to_binary(q1_importance),
    q2_knowledge = one_two_to_binary(q2_knowledge)
  ) %>%
  # Letter-coded importance, comparable with the `stakes` column.
  mutate(
    importance = ifelse(q1_importance == 1, "H", ifelse(q1_importance == 0, "L", NA_character_))
  ) %>%
  mutate(
    population = case_when(
      population %in% russia_sites ~ "russia",
      TRUE ~ population
    ),
    # Non-numeric ages become NA (the coercion warning is expected) and are
    # dropped by the age filter below.
    age_numeric = suppressWarnings(as.numeric(as.character(age))),
    evidence_strength = case_when(
      num_checks == "O" ~ "weak",   # checked once
      num_checks == "F" ~ "strong", # checked several times
      TRUE ~ NA_character_
    )
  ) %>%
  filter(age_numeric >= 18) %>%
  filter(stakes == importance) %>%
  filter(!is.na(evidence_strength))

# Cell sizes per population x evidence condition x stakes, printed for review.
base %>%
  count(population, evidence_strength, stakes) %>%
  arrange(population, evidence_strength, stakes)
```

## Evidence-fixed split effects (2x2 counts -> d)

```{r}
# Stakes effect on the binary knowledge answer for one population x evidence
# stratum of `base`.
#
# pop: population code to match against `base$population`.
# ev:  evidence condition to match against `base$evidence_strength`.
#
# Returns a list with group sizes, proportions and SDs of the 0/1 answer, the
# four 2x2 cell counts, whether a continuity correction was applied, the
# effect size `d` and variance `v` from esc::esc_2x2 (group 1 = low stakes),
# a `can_compute` flag and an explanatory `note`.
compute_fixed_split <- function(pop, ev) {
  stratum <- filter(
    base,
    population == pop,
    evidence_strength == ev,
    !is.na(q2_knowledge)
  )

  # Number of rows with the given stakes letter and knowledge answer.
  cell_count <- function(stake, answer) {
    sum(stratum$stakes == stake & stratum$q2_knowledge == answer)
  }
  # Share of "yes" answers, NA for an empty group.
  share_yes <- function(n_yes, n_total) {
    if (n_total > 0) n_yes / n_total else NA_real_
  }
  # SD of a 0/1 vector rebuilt from its counts, NA with fewer than two rows.
  binary_sd <- function(n_yes, n_no) {
    if (n_yes + n_no > 1) sd(c(rep(1, n_yes), rep(0, n_no))) else NA_real_
  }

  low_yes <- cell_count("L", 1)
  low_no <- cell_count("L", 0)
  high_yes <- cell_count("H", 1)
  high_no <- cell_count("H", 0)

  n_low <- low_yes + low_no
  n_high <- high_yes + high_no

  result <- list(
    n_low = n_low,
    n_high = n_high,
    mean_low = share_yes(low_yes, n_low),
    mean_high = share_yes(high_yes, n_high),
    sd_low = binary_sd(low_yes, low_no),
    sd_high = binary_sd(high_yes, high_no),
    low_yes = low_yes,
    low_no = low_no,
    high_yes = high_yes,
    high_no = high_no,
    cc_applied = FALSE,
    d = NA_real_,
    v = NA_real_,
    can_compute = FALSE,
    note = NA_character_
  )

  # Nothing to compare when either stakes group is empty.
  if (n_low == 0 || n_high == 0) {
    result$note <- "Missing one stakes group in this evidence stratum."
    return(result)
  }

  # Add 0.5 to every cell, but only when at least one cell is empty.
  has_empty_cell <- any(c(low_yes, low_no, high_yes, high_no) == 0)
  cc <- if (has_empty_cell) 0.5 else 0
  result$cc_applied <- cc > 0

  fit <- tryCatch(
    esc::esc_2x2(
      grp1yes = low_yes + cc,
      grp1no = low_no + cc,
      grp2yes = high_yes + cc,
      grp2no = high_no + cc,
      es.type = "d"
    ),
    error = function(e) e
  )

  # Report an esc failure in the note instead of aborting the whole render.
  if (inherits(fit, "error")) {
    result$note <- paste("esc_2x2 failed:", fit$message)
    return(result)
  }

  result$d <- as.numeric(fit$es)
  result$v <- as.numeric(fit$var)
  result$can_compute <- is.finite(result$d) && is.finite(result$v)
  if (result$cc_applied) {
    result$note <- "Computed with esc::esc_2x2 with 0.5 continuity correction."
  } else {
    result$note <- "Computed with esc::esc_2x2 from exact 2x2 counts."
  }
  result
}

# Builds the one-row data frame for study `sid` (index into `sample_order`)
# and evidence condition `ev`. Weak evidence gets effect id suffix e1, strong
# gets e2.
build_fixed_row <- function(sid, ev) {
  sample_label <- sample_order[sid]
  res <- compute_fixed_split(population_map[[sample_label]], ev)
  data.frame(
    paper_key = paper_key,
    study_id = sid,
    sample = sample_label,
    effect_id = sprintf("s%d_%s", sid, if (ev == "weak") "e1" else "e2"),
    domain = "evidence_fixed",
    evidence_strength = ev,
    # The statistics are spliced in as columns, in this order.
    res[c(
      "n_low", "n_high", "mean_low", "mean_high", "sd_low", "sd_high",
      "low_yes", "low_no", "high_yes", "high_no", "cc_applied",
      "d", "v", "can_compute", "note"
    )],
    stringsAsFactors = FALSE
  )
}

# One row per sample x evidence condition: samples in `sample_order`, and
# within each sample weak before strong.
fixed_rows <- Map(
  build_fixed_row,
  rep(seq_along(sample_order), each = 2),
  rep(c("weak", "strong"), times = length(sample_order))
)

fixed_results <- bind_rows(fixed_rows)
fixed_results
```

## Evidence-seeking split effects (means/SDs -> d)

```{r}
# Stakes effect on the numeric `nlp` outcome for one population x evidence
# stratum of `base`.
#
# pop: population code to match against `base$population`.
# ev:  evidence condition to match against `base$evidence_strength`.
#
# Returns a list with per-group n, mean and SD of `nlp` (low vs high stakes),
# the effect size `d` and variance `v` from esc::esc_mean_sd (group 1 = low
# stakes), a `can_compute` flag and an explanatory `note`. When the effect
# cannot be computed, `can_compute` is FALSE and `note` says why.
compute_seeking_split <- function(pop, ev) {
  s <- base %>%
    filter(population == pop, evidence_strength == ev, !is.na(nlp), is.finite(nlp))
  low <- s %>% filter(stakes == "L") %>% pull(nlp)
  high <- s %>% filter(stakes == "H") %>% pull(nlp)

  n_low <- length(low)
  n_high <- length(high)

  # Means need at least one observation, SDs at least two.
  mean_low <- if (n_low > 0) mean(low) else NA_real_
  mean_high <- if (n_high > 0) mean(high) else NA_real_
  sd_low <- if (n_low > 1) sd(low) else NA_real_
  sd_high <- if (n_high > 1) sd(high) else NA_real_

  out <- list(
    n_low = n_low,
    n_high = n_high,
    mean_low = mean_low,
    mean_high = mean_high,
    sd_low = sd_low,
    sd_high = sd_high,
    d = NA_real_,
    v = NA_real_,
    can_compute = FALSE,
    note = NA_character_
  )

  if (n_low < 2 || n_high < 2 || !is.finite(sd_low) || !is.finite(sd_high)) {
    out$note <- "Insufficient per-group data for esc_mean_sd (need n>=2 and finite SD in both stakes groups)."
    return(out)
  }

  fit <- tryCatch(
    esc::esc_mean_sd(
      grp1m = mean_low,
      grp1sd = sd_low,
      grp1n = n_low,
      grp2m = mean_high,
      grp2sd = sd_high,
      grp2n = n_high,
      es.type = "d"
    ),
    error = function(e) e
  )

  # Report an esc failure in the note instead of aborting the whole render.
  if (inherits(fit, "error")) {
    out$note <- paste("esc_mean_sd failed:", fit$message)
    return(out)
  }

  out$d <- as.numeric(fit$es)
  out$v <- as.numeric(fit$var)
  out$can_compute <- is.finite(out$d) && is.finite(out$v)
  # Keep the note consistent with can_compute: a non-finite result
  # (presumably possible when both group SDs are zero) must not be labelled
  # as successfully computed.
  out$note <- if (out$can_compute) {
    "Computed with esc::esc_mean_sd from raw group means/SDs."
  } else {
    "esc_mean_sd returned a non-finite d or variance (e.g. zero SD in both stakes groups)."
  }
  out
}

# Builds the one-row data frame for study `sid` (index into `sample_order`)
# and evidence condition `ev`. Weak evidence gets effect id suffix e3, strong
# gets e4.
build_seeking_row <- function(sid, ev) {
  sample_label <- sample_order[sid]
  res <- compute_seeking_split(population_map[[sample_label]], ev)
  data.frame(
    paper_key = paper_key,
    study_id = sid,
    sample = sample_label,
    effect_id = sprintf("s%d_%s", sid, if (ev == "weak") "e3" else "e4"),
    domain = "evidence_seeking",
    evidence_strength = ev,
    # The statistics are spliced in as columns, in this order.
    res[c(
      "n_low", "n_high", "mean_low", "mean_high", "sd_low", "sd_high",
      "d", "v", "can_compute", "note"
    )],
    stringsAsFactors = FALSE
  )
}

# One row per sample x evidence condition: samples in `sample_order`, and
# within each sample weak before strong.
seeking_rows <- Map(
  build_seeking_row,
  rep(seq_along(sample_order), each = 2),
  rep(c("weak", "strong"), times = length(sample_order))
)

seeking_results <- bind_rows(seeking_rows)
seeking_results
```

## Combined split effects

```{r}
# Stack both domains and order the rows by study, then by effect id.
all_split <- arrange(
  bind_rows(fixed_results, seeking_results),
  study_id,
  effect_id
)

# Per domain: number of effects (k) and how many of them could be computed.
summarise(
  group_by(all_split, domain),
  k = n(),
  computable = sum(can_compute),
  .groups = "drop"
)

all_split
```

## Save machine-readable split results

```{r}
out_csv <- "../scratch/split_effects_from_raw.csv"
# write.csv() does not create missing directories, so make sure the target
# folder exists first (no-op, without a warning, when it already does).
dir.create(dirname(out_csv), recursive = TRUE, showWarnings = FALSE)
write.csv(all_split, out_csv, row.names = FALSE)
# Echo the path so it shows up in the rendered document.
out_csv
```

## YAML copy/paste lines (`effect_size` only)

```{r}
# Only effects that were actually computed are printed.
ok <- filter(all_split, can_compute)

# sprintf() is vectorised over the columns; each formatted line already ends
# in a newline, so the lines are concatenated with an empty separator.
cat(
  sprintf(
    "%s (%s; %s): d=%.12f v=%.12f\n",
    ok$effect_id,
    ok$sample,
    ok$evidence_strength,
    ok$d,
    ok$v
  ),
  sep = ""
)
```