rr_validation_mapping_data = arrow::read_feather(
file = "https://github.com/synth-science/surveybot3000/raw/refs/heads/main/validation_study/mapping.feather"
)
items <- rio::import("https://docs.google.com/spreadsheets/d/16QcRLP5BUn1Cmtr0e_XRdjr1Wg-EHSMSGmgZO1M3tNM/edit?gid=0#gid=0", which = 2)
# compare <- rr_validation_mapping_data %>% full_join(items %>% filter(in_survey) %>% select(id, scale, subscale, item), by = c("variable" = "id")) %>% mutate(subscale = if_else(subscale == "#N/A", NA_character_, subscale))
# compare %>% filter(item_text != item) %>% View()
# compare %>% filter((scale0 != scale | scale1 != subscale)) %>% View()
# compare %>% filter((scale0 != scale | scale1 != subscale) & !(scale1 == scale & scale0 == subscale)) %>% View()
# rr_validation_mapping_data %>% anti_join(items, by = c("scale0" = "scale", "scale1" = "subscale")) %>% View
scales <- rr_validation_mapping_data %>%
select(-scale0, -scale1) %>%
left_join(items %>%
filter(in_survey) %>%
mutate(keyed = if_else(reversed, -1, 1)) %>%
select(id, keyed, scale, subscale), by = c("variable" = "id")) %>%
rename(scale_0 = scale, scale_1 = subscale) %>%
mutate(scale_1 = if_else(scale_1 == "#N/A", "", scale_1))
rr_validation_mapping_data <- scales %>%
rename(scale0 = scale_0, scale1 = scale_1)
scales <- bind_rows(
scales %>%
mutate(
scale_1 = "",
scale = str_replace_all(str_trim(paste0(instrument, "_", scale_0, scale_1)), "[^a-zA-Z_0-9]", "_")
) %>%
group_by(scale) %>%
mutate(number_of_items = n()),
scales %>%
mutate(
scale_1 = coalesce(scale_1, ""),
scale = str_replace_all(str_trim(paste0(instrument, "_", scale_0, scale_1)), "[^a-zA-Z_0-9]", "_")) %>%
group_by(scale) %>%
mutate(number_of_items = n())
) %>%
group_by(scale) %>%
filter(row_number() == 1) %>%
ungroup() %>%
select(-variable, -item_text) %>%
filter(number_of_items > 1)
arrow::write_feather(scales, sink = file.path(data_path, glue("{model_name}.raw.validation-study-2024-11-01.scales.feather"))
)
arrow::write_feather(rr_validation_mapping_data, sink = file.path(data_path, glue("{model_name}.raw.validation-study-2024-11-01.mapping2.feather"))
)
# pre-trained model
pt_rr_validation_machine_data = rio::import("https://github.com/synth-science/surveybot3000/raw/refs/heads/main/validation_study/embeddings_all-mpnet-base-v2.feather")
pt_rr_validation_machine_data <- pt_rr_validation_machine_data %>%
rowwise() %>%
mutate(embed_id = list(1:length(embeddings))) %>%
unnest(cols = c(embeddings, embed_id)) %>%
select(-item) %>%
pivot_wider(names_from = id, values_from = embeddings) %>%
select(-embed_id)
# fine-tuned model
rr_validation_machine_data = rio::import("https://github.com/synth-science/surveybot3000/raw/refs/heads/main/validation_study/embeddings_surveybot3000.feather")
rr_validation_machine_data <- rr_validation_machine_data %>%
rowwise() %>%
mutate(embed_id = list(1:length(embeddings))) %>%
unnest(cols = c(embeddings, embed_id)) %>%
select(-item) %>%
pivot_wider(names_from = id, values_from = embeddings) %>%
select(-embed_id)
main_qs <- c("AAID", "PANAS", "PAQ", "PSS", "NEPS", "ULS", "FCV", "DAQ", "CESD", "HEXACO", "OCIR", "PTQ", "RAAS", "KSA", "SAS", "MFQ", "CQ", "OLBI", "UWES", "WGS")
rr_validation_human_data = rio::import("data/processed/sosci_labelled_with_exclusion_criteria.rds") %>% filter(included) %>%
select(starts_with(main_qs)) %>%
select(-ends_with("_R"))
## Warning: Missing `trust` will be set to FALSE by default for RDS in 2.0.0.
rio::export(rr_validation_human_data, file = file.path(data_path, glue("{model_name}.raw.validation-study-2024-11-01.human.feather"))
)
setdiff(colnames(rr_validation_human_data), rr_validation_mapping_data$variable)
## character(0)
## character(0)
## [1] 387
## [1] 246
## [1] 768
## [1] 246
## [1] 20
## [1] 40
## [1] 40
n_distinct(str_c(rr_validation_mapping_data$instrument, rr_validation_mapping_data$scale0, rr_validation_mapping_data$scale1)) # subscales
## [1] 70
## # A tibble: 1 × 1
## n
## <int>
## 1 80
## # A tibble: 1 × 1
## n
## <int>
## 1 57
rr_validation_item_pairs = join_pairwise_correlation(rr_validation_human_data, rr_validation_machine_data)
arrow::write_feather(rr_validation_item_pairs, sink = file.path(data_path, glue("data/intermediate/{model_name}.raw.validation-study-2024-11-01.item_correlations.feather")))
pt_rr_validation_item_pairs = join_pairwise_correlation(rr_validation_human_data, pt_rr_validation_machine_data)
arrow::write_feather(pt_rr_validation_item_pairs, sink = file.path(data_path, glue("data/intermediate/{pretrained_model_name}.raw.validation-study-2024-11-01.item_correlations.feather")))
manifest_scores = predict_manifest_scores(rr_validation_human_data, rr_validation_machine_data, rr_validation_mapping_data, scales)
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
arrow::write_feather(manifest_scores, sink = file.path(data_path, glue("data/intermediate/{model_name}.raw.validation-study-2024-11-01.scale_correlations.feather")))
pt_manifest_scores = predict_manifest_scores(rr_validation_human_data, pt_rr_validation_machine_data, rr_validation_mapping_data, scales)
arrow::write_feather(pt_manifest_scores, sink = file.path(data_path, glue("data/intermediate/{pretrained_model_name}.raw.validation-study-2024-11-01.scale_correlations.feather")))
mapping_data <- rr_validation_mapping_data %>%
rename(scale_0 = scale0,
scale_1 = scale1)
items_by_scale <- bind_rows(
scales %>% select(-keyed) %>% filter(scale_1 == "") %>% left_join(mapping_data %>% select(-scale_1), by = c("instrument", "scale_0")),
scales %>% select(-keyed) %>% filter(scale_1 != "") %>% left_join(mapping_data, by = c("instrument", "scale_0", "scale_1"))
)
random_scales <- list()
for(i in 1:200) {
n_items <- rpois(1, 6)
n_items <- if_else(n_items < 3, 3, n_items)
random_scales[[i]] <- rr_validation_mapping_data %>%
sample_n(n_items) %>%
mutate(scale = paste0("random", i)) %>%
group_by(scale) %>%
summarise(
items = list(variable),
number_of_items = n_distinct(variable),
lvn = paste(first(scale), " =~ ", paste(variable, collapse = " + "))) %>%
drop_na() %>%
mutate(keyed = 1)
}
random_scales <- bind_rows(random_scales) %>%
distinct(items, .keep_all = TRUE) %>%
rowwise() %>%
mutate(
reverse_items = list(randomly_choose_items_for_reversion(items))
) %>%
ungroup()
nrow(random_scales)
## [1] 200
write_rds(random_scales, file = file.path(data_path, glue("data/intermediate/random_scales_rr.rds")))
rr_validation_llm <- rr_validation_item_pairs %>%
left_join(mapping_data %>% select(variable_1 = variable, InstrumentA = instrument, ScaleA = scale_0, SubscaleA = scale_1)) %>%
left_join(mapping_data %>% select(variable_2 = variable, InstrumentB = instrument, ScaleB = scale_0, SubscaleB = scale_1))
## Joining with `by = join_by(variable_1)`
## Joining with `by = join_by(variable_2)`
pt_rr_validation_llm <- pt_rr_validation_item_pairs %>%
left_join(mapping_data %>% select(variable_1 = variable, InstrumentA = instrument, ScaleA = scale_0, SubscaleA = scale_1)) %>%
left_join(mapping_data %>% select(variable_2 = variable, InstrumentB = instrument, ScaleB = scale_0, SubscaleB = scale_1))
## Joining with `by = join_by(variable_1)`
## Joining with `by = join_by(variable_2)`
cors_llm <- rr_validation_item_pairs %>%
select(x = variable_1, y = variable_2, r = synthetic_r) %>%
as.data.frame() |>
igraph::graph_from_data_frame(directed = FALSE) |>
igraph::as_adjacency_matrix(attr = "r", sparse = FALSE)
diag(cors_llm) <- 1
pt_cors_llm <- pt_rr_validation_item_pairs %>%
select(x = variable_1, y = variable_2, r = synthetic_r) %>%
as.data.frame() |>
igraph::graph_from_data_frame(directed = FALSE) |>
igraph::as_adjacency_matrix(attr = "r", sparse = FALSE)
diag(pt_cors_llm) <- 1
cors_real <- rr_validation_llm %>%
select(x = variable_1, y = variable_2, r = empirical_r) %>%
as.data.frame() |>
igraph::graph_from_data_frame(directed = FALSE) |>
igraph::as_adjacency_matrix(attr = "r", sparse = FALSE)
diag(cors_real) <- 1
real_scales <- items_by_scale %>%
group_by(scale) %>%
summarise(
items = list(variable),
reverse_items = list(variable[keyed == -1]),
number_of_items = n_distinct(variable),
keyed = first(keyed),
lvn = paste(first(scale), " =~ ", paste(variable, collapse = " + "))) %>%
drop_na() %>%
ungroup()
scales <- bind_rows(real = real_scales, random = random_scales, .id = "type")
rr_validation_human_data <- rr_validation_human_data %>% haven::zap_labels()
scales <- scales %>%
rowwise() %>%
mutate(pt_r_llm = list(pt_cors_llm[items, items]),
r_llm = list(cors_llm[items, items]),
r_real = list(cors_real[items, items]),
reverse_items_by_1st = list(find_reverse_items_by_first_item(r_real, keyed)),
N_real = rr_validation_item_pairs %>%
filter(variable_1 %in% items, variable_2 %in% items) %>%
summarise(min_n=min(pairwise_n)) %>% pull(min_n)) %>%
mutate(
rel_real = list(psych::alpha(rr_validation_human_data[, items], keys = reverse_items, n.iter = 1000)),
rel_llm = list(psych::alpha(r_llm, keys = reverse_items, n.obs = N_real)$feldt),
rel_pt_llm = list(psych::alpha(pt_r_llm, keys = reverse_items, n.obs = N_real)$feldt)) %>%
mutate(empirical_alpha = rel_real$feldt$alpha$raw_alpha,
synthetic_alpha = rel_llm$alpha$raw_alpha,
pt_synthetic_alpha = rel_pt_llm$alpha$raw_alpha) %>%
mutate(
empirical_alpha_se = sd(rel_real$boot[,"raw_alpha"]),
)
## Warning: There were 330 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `rel_real = list(...)`.
## ℹ In row 81.
## Caused by warning in `sqrt()`:
## ! NaNs produced
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 329 remaining warnings.