porteretalndpuzzleaboutknowledge
/data/papers/porteretalndpuzzleaboutknowledge/analysis/effect_sizes.qmd---
title: "Effect size computations: porteretalndpuzzleaboutknowledge"
format:
  html:
    toc: true
    embed-resources: true
execute:
  echo: true
  warning: true
  message: false
---
Computes stakes effects separately for each evidence condition (`weak` vs `strong`) from the raw OSF data,
for the extraction YAML `papers/porteretalndpuzzleaboutknowledge/porteretalndpuzzleaboutknowledge.yaml`.
- Evidence-fixed (binary knowledge attribution): 2x2 counts with `esc::esc_2x2`; exact counts are used unless a cell is zero, in which case 0.5 is added to every cell.
- Evidence-seeking (numeric checks required): group means/SDs with `esc::esc_mean_sd`.
Sign convention throughout:
- `d` is the standardized difference `mean(low stakes) - mean(high stakes)`, so `d > 0` means the low-stakes group scores higher.
```{r}
suppressPackageStartupMessages({
library(dplyr)
library(esc)
})
```
## Data source and filtering
```{r}
# Key of the paper these effect sizes belong to; copied into every result row.
paper_key <- "porteretalndpuzzleaboutknowledge"

# Local copies of the raw file, tried in this order.
raw_candidates <- c(
  "../data/GPP_study3_nlp.csv",
  "../data/GPP study 3 nlp.csv",
  "/tmp/GPP_study3_nlp.csv"
)

# Take the first candidate that exists on disk; when none does, download the
# file from osf.io into a temporary file and read that instead.
raw_path <- Find(file.exists, raw_candidates)
if (is.null(raw_path)) {
  raw_path <- tempfile(fileext = ".csv")
  download.file(
    "https://osf.io/download/59qd6/",
    destfile = raw_path,
    mode = "wb",
    quiet = TRUE
  )
}

# check.names = FALSE keeps the column names exactly as written in the file.
raw <- read.csv(raw_path, check.names = FALSE)
# Lookup from sample display label to the code used in the `population`
# column. The order of the entries is meaningful: it fixes the study ids that
# the result tables assign by position.
population_map <- c(
  "China"                    = "china",
  "Russia"                   = "russia",
  "Slovakia"                 = "europe_slovak",
  "Ecuador - Spanish"        = "ecuador_spanish",
  "India - Hindi"            = "india_hindi",
  "India - Meitei"           = "india_meitei",
  "Japan"                    = "japan",
  "Morocco"                  = "morocco",
  "Peru - Shipibo"           = "peru_shipibo",
  "Peru - Spanish"           = "peru_spanish",
  "South Africa - Afrikaans" = "safrica_afrikaans",
  "South Africa - Sepedi"    = "safrica_sepedi",
  "South Africa - isiZulu"   = "safrica_isizulu",
  "South Korea"              = "korea",
  "United States"            = "usa_mturk"
)

# Sample labels in study order; taken from the map so the two cannot drift.
sample_order <- names(population_map)
# Paper filtering logic (as implemented in the OSF analysis):
#   1. keep rows with q1_importance < 3,
#   2. merge the Russian sub-sites into a single "russia" population,
#   3. keep participants aged 18 or older,
#   4. keep rows passing the comprehension check (stakes == importance),
#   5. keep rows with a recognised evidence condition.
base <- raw %>%
  filter(q1_importance < 3) %>%
  # Recode 1/2 answers to 1/0; any other value is passed through unchanged.
  mutate(
    q1_importance = ifelse(q1_importance == 1, 1, ifelse(q1_importance == 2, 0, q1_importance)),
    q2_knowledge = ifelse(q2_knowledge == 1, 1, ifelse(q2_knowledge == 2, 0, q2_knowledge))
  ) %>%
  # Stakes label implied by the recoded importance answer.
  mutate(
    importance = ifelse(q1_importance == 1, "H", ifelse(q1_importance == 0, "L", NA_character_))
  ) %>%
  mutate(
    population = case_when(
      population %in% c(
        "europe_russian_syktyvkar",
        "europe_russian_moscow",
        "europe_russian_stpbg"
      ) ~ "russia",
      TRUE ~ population
    )
  ) %>%
  mutate(
    # Non-numeric ages become NA (warning suppressed) and are dropped below.
    age_numeric = suppressWarnings(as.numeric(as.character(age))),
    evidence_strength = if_else(
      num_checks == "O",
      "weak", # checked once
      if_else(
        num_checks == "F",
        "strong", # checked several times
        NA_character_
      )
    )
  ) %>%
  filter(age_numeric >= 18) %>%
  filter(stakes == importance) %>%
  filter(!is.na(evidence_strength))

# Cell sizes per population x evidence condition x stakes.
arrange(
  count(base, population, evidence_strength, stakes),
  population, evidence_strength, stakes
)
```
## Evidence-fixed split effects (2x2 counts -> d)
```{r}
#' Stakes effect on binary knowledge attribution within one evidence stratum.
#'
#' Builds the 2x2 table (stakes L/H by q2_knowledge 1/0) for one population and
#' evidence condition and converts it to Cohen's d with `esc::esc_2x2`, using
#' the low-stakes group as group 1.
#'
#' @param pop Population code matched against the `population` column.
#' @param ev Evidence condition matched against `evidence_strength`.
#' @param data Filtered participant-level data; defaults to the global `base`.
#' @return A list with `n_low`, `n_high`, `mean_low`, `mean_high`, `sd_low`,
#'   `sd_high`, the four cell counts (`low_yes`, `low_no`, `high_yes`,
#'   `high_no`), `cc_applied`, `d`, `v`, `can_compute` and `note`. `d` and `v`
#'   stay `NA` when a stakes group is empty or `esc_2x2` fails; `note` says why.
compute_fixed_split <- function(pop, ev, data = base) {
  s <- data %>%
    filter(population == pop, evidence_strength == ev, !is.na(q2_knowledge))

  low_yes <- sum(s$stakes == "L" & s$q2_knowledge == 1)
  low_no <- sum(s$stakes == "L" & s$q2_knowledge == 0)
  high_yes <- sum(s$stakes == "H" & s$q2_knowledge == 1)
  high_no <- sum(s$stakes == "H" & s$q2_knowledge == 0)
  n_low <- low_yes + low_no
  n_high <- high_yes + high_no

  out <- list(
    n_low = n_low,
    n_high = n_high,
    mean_low = if (n_low > 0) low_yes / n_low else NA_real_,
    mean_high = if (n_high > 0) high_yes / n_high else NA_real_,
    # SD of the 0/1 responses, rebuilt from the cell counts.
    sd_low = if (n_low > 1) sd(c(rep(1, low_yes), rep(0, low_no))) else NA_real_,
    sd_high = if (n_high > 1) sd(c(rep(1, high_yes), rep(0, high_no))) else NA_real_,
    low_yes = low_yes,
    low_no = low_no,
    high_yes = high_yes,
    high_no = high_no,
    cc_applied = FALSE,
    d = NA_real_,
    v = NA_real_,
    can_compute = FALSE,
    note = NA_character_
  )

  if (n_low == 0 || n_high == 0) {
    out$note <- "Missing one stakes group in this evidence stratum."
    return(out)
  }

  # Use continuity correction only when needed for zero cells; it is added to
  # every cell of the table.
  cc <- if (any(c(low_yes, low_no, high_yes, high_no) == 0)) 0.5 else 0
  out$cc_applied <- cc > 0

  fit <- tryCatch(
    esc::esc_2x2(
      grp1yes = low_yes + cc,
      grp1no = low_no + cc,
      grp2yes = high_yes + cc,
      grp2no = high_no + cc,
      es.type = "d"
    ),
    error = function(e) e
  )
  if (inherits(fit, "error")) {
    out$note <- paste("esc_2x2 failed:", conditionMessage(fit))
    return(out)
  }

  out$d <- as.numeric(fit$es)
  out$v <- as.numeric(fit$var)
  out$can_compute <- is.finite(out$d) && is.finite(out$v)

  # Do not report the effect as computed when the result is unusable.
  out$note <- if (!out$can_compute) {
    "esc_2x2 returned a non-finite effect size or variance."
  } else if (out$cc_applied) {
    "Computed with esc::esc_2x2 with 0.5 continuity correction."
  } else {
    "Computed with esc::esc_2x2 from exact 2x2 counts."
  }
  out
}
# One row per sample x evidence condition for the evidence-fixed domain.
# Samples are visited in `sample_order` (their position is the study id) and,
# within each sample, "weak" (effect e1) comes before "strong" (effect e2).
fixed_rows <- unlist(
  lapply(seq_along(sample_order), function(sid) {
    label <- sample_order[sid]
    pop_code <- population_map[[label]]
    lapply(c("weak", "strong"), function(ev) {
      res <- compute_fixed_split(pop_code, ev)
      data.frame(
        paper_key = paper_key,
        study_id = sid,
        sample = label,
        effect_id = sprintf("s%d_e%d", sid, if (ev == "weak") 1L else 2L),
        domain = "evidence_fixed",
        evidence_strength = ev,
        n_low = res$n_low,
        n_high = res$n_high,
        mean_low = res$mean_low,
        mean_high = res$mean_high,
        sd_low = res$sd_low,
        sd_high = res$sd_high,
        low_yes = res$low_yes,
        low_no = res$low_no,
        high_yes = res$high_yes,
        high_no = res$high_no,
        cc_applied = res$cc_applied,
        d = res$d,
        v = res$v,
        can_compute = res$can_compute,
        note = res$note,
        stringsAsFactors = FALSE
      )
    })
  }),
  recursive = FALSE # flatten the per-sample lists, keep the data frames intact
)

fixed_results <- bind_rows(fixed_rows)
fixed_results
```
## Evidence-seeking split effects (means/SDs -> d)
```{r}
#' Stakes effect on the numeric `nlp` outcome within one evidence stratum.
#'
#' Summarises `nlp` by stakes group (L/H) for one population and evidence
#' condition and converts the group means/SDs to Cohen's d with
#' `esc::esc_mean_sd`, using the low-stakes group as group 1.
#'
#' @param pop Population code matched against the `population` column.
#' @param ev Evidence condition matched against `evidence_strength`.
#' @param data Filtered participant-level data; defaults to the global `base`.
#' @return A list with `n_low`, `n_high`, `mean_low`, `mean_high`, `sd_low`,
#'   `sd_high`, `d`, `v`, `can_compute` and `note`. `d` and `v` stay `NA` when
#'   either group has fewer than two usable values or `esc_mean_sd` fails;
#'   `note` says why.
compute_seeking_split <- function(pop, ev, data = base) {
  # is.finite() is FALSE for NA, NaN and +/-Inf, so it covers the NA check too.
  s <- data %>%
    filter(population == pop, evidence_strength == ev, is.finite(nlp))

  low <- s %>% filter(stakes == "L") %>% pull(nlp)
  high <- s %>% filter(stakes == "H") %>% pull(nlp)
  n_low <- length(low)
  n_high <- length(high)
  mean_low <- if (n_low > 0) mean(low) else NA_real_
  mean_high <- if (n_high > 0) mean(high) else NA_real_
  sd_low <- if (n_low > 1) sd(low) else NA_real_
  sd_high <- if (n_high > 1) sd(high) else NA_real_

  out <- list(
    n_low = n_low,
    n_high = n_high,
    mean_low = mean_low,
    mean_high = mean_high,
    sd_low = sd_low,
    sd_high = sd_high,
    d = NA_real_,
    v = NA_real_,
    can_compute = FALSE,
    note = NA_character_
  )

  if (n_low < 2 || n_high < 2 || !is.finite(sd_low) || !is.finite(sd_high)) {
    out$note <- "Insufficient per-group data for esc_mean_sd (need n>=2 and finite SD in both stakes groups)."
    return(out)
  }

  fit <- tryCatch(
    esc::esc_mean_sd(
      grp1m = mean_low,
      grp1sd = sd_low,
      grp1n = n_low,
      grp2m = mean_high,
      grp2sd = sd_high,
      grp2n = n_high,
      es.type = "d"
    ),
    error = function(e) e
  )
  if (inherits(fit, "error")) {
    out$note <- paste("esc_mean_sd failed:", conditionMessage(fit))
    return(out)
  }

  out$d <- as.numeric(fit$es)
  out$v <- as.numeric(fit$var)
  out$can_compute <- is.finite(out$d) && is.finite(out$v)

  # Do not report the effect as computed when the result is unusable
  # (e.g. both groups have zero SD).
  out$note <- if (out$can_compute) {
    "Computed with esc::esc_mean_sd from raw group means/SDs."
  } else {
    "esc_mean_sd returned a non-finite effect size or variance."
  }
  out
}
# One row per sample x evidence condition for the evidence-seeking domain.
# Samples are visited in `sample_order` (their position is the study id) and,
# within each sample, "weak" (effect e3) comes before "strong" (effect e4).
seeking_rows <- unlist(
  lapply(seq_along(sample_order), function(sid) {
    label <- sample_order[sid]
    pop_code <- population_map[[label]]
    lapply(c("weak", "strong"), function(ev) {
      res <- compute_seeking_split(pop_code, ev)
      data.frame(
        paper_key = paper_key,
        study_id = sid,
        sample = label,
        effect_id = sprintf("s%d_e%d", sid, if (ev == "weak") 3L else 4L),
        domain = "evidence_seeking",
        evidence_strength = ev,
        n_low = res$n_low,
        n_high = res$n_high,
        mean_low = res$mean_low,
        mean_high = res$mean_high,
        sd_low = res$sd_low,
        sd_high = res$sd_high,
        d = res$d,
        v = res$v,
        can_compute = res$can_compute,
        note = res$note,
        stringsAsFactors = FALSE
      )
    })
  }),
  recursive = FALSE # flatten the per-sample lists, keep the data frames intact
)

seeking_results <- bind_rows(seeking_rows)
seeking_results
```
## Combined split effects
```{r}
# Stack both domains and order the rows by study, then by effect id.
all_split <- arrange(
  bind_rows(fixed_results, seeking_results),
  study_id, effect_id
)

# Per domain: number of effects and how many of them could be computed.
summarise(
  group_by(all_split, domain),
  k = n(),
  computable = sum(can_compute),
  .groups = "drop"
)

all_split
```
## Save machine-readable split results
```{r}
out_csv <- "../scratch/split_effects_from_raw.csv"
# write.csv() cannot create missing directories, so make sure the target
# folder exists first (a no-op when it is already there).
dir.create(dirname(out_csv), recursive = TRUE, showWarnings = FALSE)
write.csv(all_split, out_csv, row.names = FALSE)
out_csv
```
## YAML copy/paste lines (`effect_size` only)
```{r}
# Keep only the effects with a usable d and variance.
ok <- all_split %>% filter(can_compute)

# One line per effect; sprintf() is vectorised over the rows and every line
# already ends in "\n", so the lines are emitted with an empty separator.
cat(
  sprintf(
    "%s (%s; %s): d=%.12f v=%.12f\n",
    ok$effect_id,
    ok$sample,
    ok$evidence_strength,
    ok$d,
    ok$v
  ),
  sep = ""
)
```