koncewicz2019ocenasilyswiadectw
/data/papers/koncewicz2019ocenasilyswiadectw/analysis/effect_sizes.qmd
---
title: "Effect size computation: koncewicz2019ocenasilyswiadectw"
format:
  html:
    toc: true
execute:
  echo: true
  warning: true
  message: false
---

```{r}
library(dplyr)
library(tidyr)
library(data.table)
library(esc)
library(metafor)
```

## Shared Helpers

```{r}
stop_if_missing <- function(x, name) {
  if (is.na(x)) stop(sprintf("Missing required input: %s", name), call. = FALSE)
}

compute_effect_size <- function(
    paper_key,
    study_id,
    effect_id,
    method_used,
    sign_convention = "d = mean(low) - mean(high)",
    n_high = NA_integer_,
    n_low = NA_integer_,
    n_total = NA_integer_,
    mean_high = NA_real_,
    mean_low = NA_real_,
    sd_high = NA_real_,
    sd_low = NA_real_,
    r_within = NA_real_,
    notes_on_assumptions = "",
    imputed_flag = FALSE,
    needs_sensitivity = TRUE
) {
  d <- NA_real_
  v <- NA_real_
  g <- NA_real_
  v_g <- NA_real_
  computed_from_suggested <- NA_character_
  design_used <- if (startsWith(method_used, "between_")) {
    "Between-Subjects"
  } else if (startsWith(method_used, "within_")) {
    "Within-Subjects"
  } else {
    NA_character_
  }

  if (method_used == "between_groups") {
    computed_from_suggested <- "groups"
    stop_if_missing(n_high, "n_high")
    stop_if_missing(n_low, "n_low")
    stop_if_missing(mean_high, "mean_high")
    stop_if_missing(mean_low, "mean_low")
    stop_if_missing(sd_high, "sd_high")
    stop_if_missing(sd_low, "sd_low")

    es_d <- esc::esc_mean_sd(
      grp1m = mean_low, grp1sd = sd_low, grp1n = n_low,
      grp2m = mean_high, grp2sd = sd_high, grp2n = n_high,
      es.type = "d"
    )
    es_g <- esc::esc_mean_sd(
      grp1m = mean_low, grp1sd = sd_low, grp1n = n_low,
      grp2m = mean_high, grp2sd = sd_high, grp2n = n_high,
      es.type = "g"
    )

    d <- as.numeric(es_d$es)
    v <- as.numeric(es_d$var)
    g <- as.numeric(es_g$es)
    v_g <- as.numeric(es_g$var)
  } else if (method_used == "within_smcrp_r") {
    computed_from_suggested <- "groups"
    stop_if_missing(n_total, "n_total")
    stop_if_missing(mean_high, "mean_high")
    stop_if_missing(mean_low, "mean_low")
    stop_if_missing(sd_high, "sd_high")
    stop_if_missing(sd_low, "sd_low")
    stop_if_missing(r_within, "r_within")
    if (abs(r_within) > 1) stop("r_within must be between -1 and 1", call. = FALSE)

    es_d <- metafor::escalc(
      measure = "SMCRP",
      m1i = mean_low, m2i = mean_high,
      sd1i = sd_low, sd2i = sd_high,
      ri = r_within, ni = n_total,
      correct = FALSE
    )
    es_g <- metafor::escalc(
      measure = "SMCRP",
      m1i = mean_low, m2i = mean_high,
      sd1i = sd_low, sd2i = sd_high,
      ri = r_within, ni = n_total,
      correct = TRUE
    )

    d <- as.numeric(es_d$yi)
    v <- as.numeric(es_d$vi)
    g <- as.numeric(es_g$yi)
    v_g <- as.numeric(es_g$vi)
  } else {
    stop(sprintf("Unknown method_used: %s", method_used), call. = FALSE)
  }

  inputs_used <- paste(
    c(
      sprintf("method=%s", method_used),
      sprintf("sign_convention=%s", sign_convention),
      if (!is.na(n_low)) sprintf("n_low=%s", n_low) else NULL,
      if (!is.na(n_high)) sprintf("n_high=%s", n_high) else NULL,
      if (!is.na(n_total)) sprintf("n_total=%s", n_total) else NULL,
      if (!is.na(mean_low)) sprintf("mean_low=%s", mean_low) else NULL,
      if (!is.na(mean_high)) sprintf("mean_high=%s", mean_high) else NULL,
      if (!is.na(sd_low)) sprintf("sd_low=%s", sd_low) else NULL,
      if (!is.na(sd_high)) sprintf("sd_high=%s", sd_high) else NULL,
      if (!is.na(r_within)) sprintf("r_within=%s", r_within) else NULL
    ),
    collapse = ", "
  )

  audit <- data.frame(
    paper_key = paper_key,
    study_id = study_id,
    effect_id = effect_id,
    design = design_used,
    method_used = method_used,
    computed_from_suggested = computed_from_suggested,
    inputs_used = inputs_used,
    d = d,
    v = v,
    g = g,
    v_g = v_g,
    notes_on_assumptions = notes_on_assumptions,
    imputed_flag = imputed_flag,
    needs_sensitivity = needs_sensitivity
  )

  yaml_snippet <- sprintf(
    "effect_size:\n  metric: SMD\n  d: %.12f\n  v: %.12f\n  computed_from: %s\n  needs_review: false\n  notes: \"%s\"\n",
    d, v, computed_from_suggested, gsub(pattern = "\"", replacement = "'", x = inputs_used)
  )

  list(audit = audit, yaml_snippet = yaml_snippet)
}
```

## Source Data

The thesis is used for the study description and materials. Numerical extraction follows
`data/raport_natka.rmd`, with the shared demographic/end files copied from the same combined
survey export.

```{r}
recode_likert <- function(x) {
  as.numeric(factor(x, levels = c("A1", "A2", "A3", "A4", "A5", "A6", "A7"), ordered = TRUE))
}

read_survey <- function(file) read.csv(file.path("../data", file), na.strings = "")

demografia <- read.csv("../data/demo/625672.csv", na.strings = "")
end <- distinct(read.csv("../data/end/255249.csv", na.strings = ""), uid, step)
demografia <- demografia[!(duplicated(demografia$uid) | duplicated(demografia$uid, fromLast = TRUE)), ]
demografia <- demografia[demografia$uid %in% end$uid, ]
```

## Between-Subjects Inputs

```{r}
single_files <- list(
  przechodzien = list(wazne = "475219.csv", niewazne = "734756.csv", col_w = "PrzechodzenWazne.SA001.", col_n = "PrzechodzenNiewazne.SA001."),
  pijany = list(wazne = "447751.csv", niewazne = "948948.csv", col_w = "PijanyWazne.SA001.", col_n = "PijanyNiewazne.SA001."),
  policjant = list(wazne = "964598.csv", niewazne = "168342.csv", col_w = "PolicjantWazne.SA001.", col_n = "PolicjantNiewazne.SA001."),
  dziecko = list(wazne = "852883.csv", niewazne = "915168.csv", col_w = "DzieckoWazne.SA001.", col_n = "DzieckoNiewazne.SA001."),
  znaki = list(wazne = "583976.csv", niewazne = "968689.csv", col_w = "ZnakiWazne.SA001.", col_n = "ZnakiNiewazne.SA001.")
)

read_single <- function(history, spec) {
  high <- read_survey(spec$wazne)
  names(high)[names(high) == spec$col_w] <- "Response"
  high$Historyjka <- history
  high$Stawka <- "ważne"

  low <- read_survey(spec$niewazne)
  names(low)[names(low) == spec$col_n] <- "Response"
  low$Historyjka <- history
  low$Stawka <- "nieważne"

  rbindlist(list(high, low), fill = TRUE)
}

single <- rbindlist(Map(read_single, names(single_files), single_files), fill = TRUE)
single$Response <- recode_likert(single$Response)
single <- single[!(duplicated(uid) | duplicated(uid, fromLast = TRUE)) & !is.na(uid) & !is.na(shorturl)]
single <- left_join(as.data.frame(single), demografia, by = "uid")

between_inputs <- single %>%
  group_by(Historyjka, Stawka) %>%
  summarise(M = mean(Response), SD = sd(Response), N = n(), .groups = "drop") %>%
  pivot_wider(names_from = Stawka, values_from = c(M, SD, N))

between_inputs
```

## Between-Subjects Effects

```{r}
paper_key <- "koncewicz2019ocenasilyswiadectw"
study_id <- 1

compute_between_from_history <- function(history, effect_id) {
  inp <- between_inputs[between_inputs$Historyjka == history, ]
  compute_effect_size(
    paper_key = paper_key,
    study_id = study_id,
    effect_id = effect_id,
    method_used = "between_groups",
    n_high = inp$`N_ważne`,
    n_low = inp$`N_nieważne`,
    mean_high = inp$`M_ważne`,
    mean_low = inp$`M_nieważne`,
    sd_high = inp$`SD_ważne`,
    sd_low = inp$`SD_nieważne`,
    notes_on_assumptions = "Single-scenario group summaries computed from raw survey files following raport_natka.rmd; effect size computed with esc::esc_mean_sd."
  )
}

res_s1_e1 <- compute_between_from_history("przechodzien", "s1_e1")
res_s1_e2 <- compute_between_from_history("znaki", "s1_e2")
res_s1_e3 <- compute_between_from_history("policjant", "s1_e3")
res_s1_e4 <- compute_between_from_history("dziecko", "s1_e4")
res_s1_e5 <- compute_between_from_history("pijany", "s1_e5")
```

### Effect s1_e1: Przechodzień, Single Scenario

```{r}
res_s1_e1$audit
cat(res_s1_e1$yaml_snippet)
```

### Effect s1_e2: Znaki, Single Scenario

```{r}
res_s1_e2$audit
cat(res_s1_e2$yaml_snippet)
```

### Effect s1_e3: Policjant, Single Scenario

```{r}
res_s1_e3$audit
cat(res_s1_e3$yaml_snippet)
```

### Effect s1_e4: Dziecko, Single Scenario

```{r}
res_s1_e4$audit
cat(res_s1_e4$yaml_snippet)
```

### Effect s1_e5: Pijany Kierowca, Single Scenario

```{r}
res_s1_e5$audit
cat(res_s1_e5$yaml_snippet)
```

## Within-Subjects Inputs

```{r}
read_both <- function(history, file, low_col, high_col) {
  dat <- read_survey(file)
  names(dat)[names(dat) == low_col] <- "Niewazne"
  names(dat)[names(dat) == high_col] <- "Wazne"
  dat$Historyjka <- history
  dat
}

both <- rbindlist(list(
  read_both("przechodzien", "853878.csv", "PrzechodzenOba.SA001.", "PrzechodzenOba.SA002."),
  read_both("pijany", "823783.csv", "PrzechodzenOba.SA001.", "PrzechodzenOba.SA002."),
  read_both("policjant", "986256.csv", "PolicjantOba.SA001.", "PolicjantOba.SA002."),
  read_both("dziecko", "411636.csv", "DzieckoOba.SA001.", "DzieckoOba.SA002."),
  read_both("znaki", "572351.csv", "ZnakiOba.SA001.", "ZnakiOba.SA002.")
), fill = TRUE)

both$Wazne <- recode_likert(both$Wazne)
both$Niewazne <- recode_likert(both$Niewazne)
both <- both[!(duplicated(uid) | duplicated(uid, fromLast = TRUE)) & !is.na(uid) & !is.na(shorturl)]
both <- left_join(as.data.frame(both), demografia, by = "uid")

make_within_inputs <- function(dat) {
  dat <- dat[complete.cases(dat[, c("Niewazne", "Wazne")]), ]
  test <- t.test(dat$Wazne, dat$Niewazne, paired = TRUE)

  data.frame(
    Historyjka = unique(dat$Historyjka),
    N = nrow(dat),
    M_low = mean(dat$Niewazne),
    M_high = mean(dat$Wazne),
    SD_low = sd(dat$Niewazne),
    SD_high = sd(dat$Wazne),
    r = cor(dat$Niewazne, dat$Wazne),
    t_high_minus_low = as.numeric(test$statistic),
    df = as.numeric(test$parameter),
    p = test$p.value
  )
}

within_inputs <- bind_rows(lapply(split(both, both$Historyjka), make_within_inputs))
within_inputs
```

## Within-Subjects Effects

```{r}
compute_within_from_history <- function(history, effect_id) {
  inp <- within_inputs[within_inputs$Historyjka == history, ]
  compute_effect_size(
    paper_key = paper_key,
    study_id = study_id,
    effect_id = effect_id,
    method_used = "within_smcrp_r",
    n_total = inp$N,
    mean_high = inp$M_high,
    mean_low = inp$M_low,
    sd_high = inp$SD_high,
    sd_low = inp$SD_low,
    r_within = inp$r,
    notes_on_assumptions = "Two-scenario within-person summaries computed from raw survey files following raport_natka.rmd; effect size computed with metafor::escalc(measure='SMCRP', correct=FALSE)."
  )
}

res_s1_e6 <- compute_within_from_history("przechodzien", "s1_e6")
res_s1_e7 <- compute_within_from_history("znaki", "s1_e7")
res_s1_e8 <- compute_within_from_history("policjant", "s1_e8")
res_s1_e9 <- compute_within_from_history("dziecko", "s1_e9")
res_s1_e10 <- compute_within_from_history("pijany", "s1_e10")
```

### Effect s1_e6: Przechodzień, Paired Scenarios

```{r}
res_s1_e6$audit
cat(res_s1_e6$yaml_snippet)
```

### Effect s1_e7: Znaki, Paired Scenarios

```{r}
res_s1_e7$audit
cat(res_s1_e7$yaml_snippet)
```

### Effect s1_e8: Policjant, Paired Scenarios

```{r}
res_s1_e8$audit
cat(res_s1_e8$yaml_snippet)
```

### Effect s1_e9: Dziecko, Paired Scenarios

```{r}
res_s1_e9$audit
cat(res_s1_e9$yaml_snippet)
```

### Effect s1_e10: Pijany Kierowca, Paired Scenarios

```{r}
res_s1_e10$audit
cat(res_s1_e10$yaml_snippet)
```

## Audit Table

```{r}
audits <- rbind(
  res_s1_e1$audit,
  res_s1_e2$audit,
  res_s1_e3$audit,
  res_s1_e4$audit,
  res_s1_e5$audit,
  res_s1_e6$audit,
  res_s1_e7$audit,
  res_s1_e8$audit,
  res_s1_e9$audit,
  res_s1_e10$audit
)

audits
```