This vignette is here just to show you the code used to agglomerate and save the splt_dev_and_demog.rda file that comes with this package.

library(dplyr)
library(tidyr)

credentials <- scorequaltrics::creds_from_file(qualtrics_api_token_file)

survey_names_all <- scorequaltrics::get_surveys(credentials)
survey_names_splt <- dplyr::filter(survey_names_all, grepl('.*(TDS1|TDS2|TDS3|YADS).*', SurveyName))

TDS

#Check that dropped values weren't ambiguous
tds2_wave2_long_nodupes %>% 
    filter(dropped) %>%
    group_by(SID, item) %>%
    filter(!all(length(unlist(old.value)) < 1)) %>%
    mutate(old.value = paste(old.value, collaps = ' ')) %>%
    knitr::kable(caption = "Questionnaire dupes")
#Check that dropped values weren't ambiguous
tds1_wave2_long_nodupes %>% 
    filter(dropped) %>%
    group_by(SID, item) %>%
    filter(!all(length(unlist(old.value)) < 1)) %>%
    mutate(old.value = paste(old.value, collaps = ' ')) %>%
    knitr::kable(caption = "Questionnaire dupes")

YADS-V

Version 1:

  • SID
  • PAL-2_Age
  • PAL-2_Race
  • PDS_Gender
  • Relationship Q176

yadsv_v1_surveys <- survey_names_splt %>%
    filter(grepl('YADS-V$', SurveyName))
yadsv_v1_qid_sid <- scorequaltrics::get_survey_data(yadsv_v1_surveys, 
                                                   credentials, 
                                                   pid_col = 'SID') %>%
    dplyr::filter(grepl('qid', item)) %>% 
    dplyr::left_join(readr::read_csv(id_recode_filename), by = c('value' = 'response_id')) %>%
    tidyr::spread(item, value) %>%
    dplyr::mutate(SID = ifelse(!is.na(id), id, SID)) %>%
    dplyr::select(-survey_name)
yadsv_v1_long <- scorequaltrics::get_survey_data(yadsv_v1_surveys, 
                                                   credentials, 
                                                   pid_col = 'qid') %>%
    dplyr::filter(
        grepl('(PDS|PAL_2_Age|PAL_2_Race|Q176|StartTime|Q173_1_TEXT|StartDate)', 
              item)) %>% 
    dplyr::left_join(yadsv_v1_qid_sid, by = 'qid') %>%
    dplyr::select(-qid, -id)

yadsv_v1_pds_rubrics <- data.frame(file = dir(file.path(tds2_wave2_rubric_dir), 
                                              pattern = 'PDS_scoring_rubric_TDS2_S3.csv',
                                              full.names = TRUE))
yadsv_v1_scoring_data_long <- scorequaltrics::get_rubrics(yadsv_v1_pds_rubrics, 
                                                          type = 'scoring')

#Clean and de-dupe parent and child data
yadsv_v1_long_nodupes <- yadsv_v1_long %>%
    filter(grepl('[1234]\\d\\d', SID)) %>%
    scorequaltrics::get_items_in_rubric(yadsv_v1_scoring_data_long) %>%
    scorequaltrics::clean_dupes(pid_col = 'SID')

#demographics only
yadsv_v1_pal_2_demo <- yadsv_v1_long %>%
    dplyr::filter(grepl('[1234]\\d\\d', SID),
           grepl('(PAL_2_Age|PAL_2_Race|Q173_1_TEXT|StartDate)', item)) %>%
    tidyr::spread(item, value) %>%
    dplyr::rename(age = PAL_2_Age, first_name = Q173_1_TEXT, occur_date = StartDate) %>%
    dplyr::mutate(occur_date = as.character(lubridate::date(occur_date)))

sona_kapp_demo_cols <- readr::cols(
  first_name = readr::col_character(),
  occur_date = readr::col_character(),
  study = readr::col_character(),
  age = readr::col_integer(),
  gender = readr::col_character(),
  ethnicity = readr::col_character())

sona_kapp_demo <- dplyr::filter(
    readr::read_csv(file.path(data_dir, 'sona_data.csv'),
                    col_types = sona_kapp_demo_cols),
    study == 'kapp')

yadsv_v1_pal_2_demo <- dplyr::select(
    dplyr::left_join(
        yadsv_v1_pal_2_demo,
        sona_kapp_demo,
        by = c('first_name', 'occur_date')),
    -first_name, -occur_date, -study)

#inf relations only
yadsv_v1_inf_rel <- yadsv_v1_long %>%
    dplyr::filter(grepl('[1234]\\d\\d', SID),
           grepl('Q176', item))
#Check that dropped values weren't ambiguous
yadsv_v1_long_nodupes %>% 
    filter(dropped) %>%
    group_by(SID, item) %>%
    filter(!all(length(unlist(old.value)) < 1)) %>%
    mutate(old.value = paste(old.value, collaps = ' ')) %>%
    knitr::kable(caption = "Questionnaire dupes")

Version 2.0

  • SID
  • Q327 how old
  • Q329 Which racial ethnic group is most like you
  • PDS_Gender
  • Q251 biological gender
  • inf_rel
yadsv_v2_surveys <- survey_names_splt %>%
    filter(grepl('YADS-V v2.0$', SurveyName))
yadsv_v2_qid_sid <- scorequaltrics::get_survey_data(yadsv_v2_surveys, 
                                                   credentials, 
                                                   pid_col = 'SID') %>%
    dplyr::filter(grepl('qid', item)) %>% 
    dplyr::left_join(readr::read_csv(id_recode_filename), by = c('value' = 'response_id')) %>%
    dplyr::filter(SID != 999) %>%
    dplyr::mutate(SID = ifelse(!is.na(id), id, SID)) %>%
    tidyr::spread(item, value) %>%
    dplyr::select(-survey_name)
yadsv_v2_long <- scorequaltrics::get_survey_data(yadsv_v2_surveys, 
                                                   credentials, 
                                                   pid_col = 'qid') %>%
    dplyr::filter(grepl('(PDS|Q327|Q329|inf_rel|px_info_1_TEXT|StartDate)', item)) %>% 
    dplyr::left_join(yadsv_v2_qid_sid, by = 'qid') %>%
    dplyr::select(-qid, -id)

yadsv_v2_pds_rubrics <- data.frame(file = dir(file.path(tds2_wave2_rubric_dir), 
                                              pattern = 'PDS_scoring_rubric_TDS2_S3.csv',
                                              full.names = TRUE))
yadsv_v2_scoring_data_long <- scorequaltrics::get_rubrics(yadsv_v2_pds_rubrics, 
                                                          type = 'scoring')
SID_filter <- '([1234]\\d\\d|99386)'

#Clean and de-dupe parent and child data
yadsv_v2_long_nodupes <- yadsv_v2_long %>%
    filter(grepl(SID_filter, SID)) %>%
    scorequaltrics::get_items_in_rubric(yadsv_v2_scoring_data_long) %>%
    scorequaltrics::clean_dupes(pid_col = 'SID')

#demographics only
yadsv_v2_pal_2_demo <- yadsv_v2_long %>%
    dplyr::filter(grepl(SID_filter, SID),
           grepl('(Q327|Q329|px_info_1_TEXT|StartDate)', item)) %>%
    tidyr::spread(item, value) %>%
    dplyr::rename('age' = Q327,
                  'PAL_2_Race' = Q329,
                  'PAL_2_Race_TEXT' = Q329_TEXT,
                   first_name = px_info_1_TEXT, occur_date = StartDate) %>%
    dplyr::mutate(occur_date = as.character(lubridate::date(occur_date)))

yadsv_v2_pal_2_demo <- dplyr::select(
    dplyr::left_join(
        yadsv_v2_pal_2_demo,
        sona_kapp_demo,
        by = c('first_name', 'occur_date')),
    -first_name, -occur_date, -study)

#inf relations only
yadsv_v2_inf_rel <- yadsv_v2_long %>%
    dplyr::filter(grepl(SID_filter, SID),
           grepl('inf_rel', item))
#Check that dropped values weren't ambiguous
yadsv_v2_long_nodupes %>% 
    filter(dropped) %>%
    group_by(SID, item) %>%
    filter(!all(length(unlist(old.value)) < 1)) %>%
    mutate(old.value = paste(old.value, collaps = ' ')) %>%
    knitr::kable(caption = "Questionnaire dupes")

Version 2.0 online

  • participantid
  • Q327 how old are you
  • Q329 race
  • PDS_Gender
  • inf_rel
yadsv_v2o_surveys <- survey_names_splt %>%
    filter(grepl('YADS-V v2.0 - Online$', SurveyName))
yadsv_v2o_qid_sid <- scorequaltrics::get_survey_data(yadsv_v2o_surveys, 
                                                   credentials, 
                                                   pid_col = 'participantid') %>%
    dplyr::rename(SID = participantid) %>%
    dplyr::filter(grepl('qid', item)) %>% 
    dplyr::left_join(readr::read_csv(id_recode_filename), by = c('value' = 'response_id')) %>%
    dplyr::filter(SID != 99999) %>%
    dplyr::mutate(SID = ifelse(!is.na(id), id, SID)) %>%
    tidyr::spread(item, value) %>%
    dplyr::select(-survey_name)
yadsv_v2o_long <- scorequaltrics::get_survey_data(yadsv_v2o_surveys, 
                                                   credentials, 
                                                   pid_col = 'qid') %>%
    dplyr::filter(grepl('(PDS|Q327|Q329|inf_rel)', item)) %>% 
    dplyr::left_join(yadsv_v2o_qid_sid, by = 'qid') %>%
    dplyr::select(-qid, -id)

yadsv_v2o_pds_rubrics <- data.frame(file = dir(file.path(tds2_wave2_rubric_dir), 
                                              pattern = 'PDS_scoring_rubric_TDS2_S3.csv',
                                              full.names = TRUE))
yadsv_v2o_scoring_data_long <- scorequaltrics::get_rubrics(yadsv_v2o_pds_rubrics, 
                                                          type = 'scoring')
SID_filter <- '([34]\\d{4}$)'

yadsv_v2o_long_nodupes <- yadsv_v2o_long %>%
    filter(grepl(SID_filter, SID)) %>%
    scorequaltrics::get_items_in_rubric(yadsv_v2o_scoring_data_long) %>%
    scorequaltrics::clean_dupes(pid_col = 'SID')

#demographics only
yadsv_v2o_pal_2_demo <- yadsv_v2o_long %>%
    dplyr::filter(grepl(SID_filter, SID),
           grepl('(Q327|Q329)', item)) %>%
    tidyr::spread(item, value) %>%
    dplyr::rename('PAL_2_Age' = Q327,
                  'PAL_2_Race' = Q329,
                  'PAL_2_Race_TEXT' = Q329_TEXT)
#we can fill in missing ages with ages given on the consent form
yadsv_v2o_surveys_cons <- survey_names_splt %>%
    filter(grepl('YADS-V Consent - Online', SurveyName))
yadsv_v2o_long_cons_age <- scorequaltrics::get_survey_data(yadsv_v2o_surveys_cons, 
                                                   credentials, 
                                                   pid_col = 'participantid') %>%
    dplyr::filter(grepl('check_age', item), !is.na(as.numeric(value))) %>%
    dplyr::distinct(participantid, item, value) %>%
    dplyr::group_by(participantid) %>%
    dplyr::mutate(n = n()) %>%
    dplyr::filter(n == 1) %>%
    dplyr::select(-n) %>%
    tidyr::spread(item, value)
yadsv_v2o_pal_2_demo_cons_age <- dplyr::left_join(yadsv_v2o_pal_2_demo,
                                         yadsv_v2o_long_cons_age, 
                                         by = c('SID' = 'participantid')) %>%
    dplyr::mutate(age = ifelse(!is.na(as.numeric(check_age)),
                               check_age,
                               PAL_2_Age))
#finally, we can use general survey data
sona_bartok_demo <- dplyr::filter(
    readr::read_csv(file.path(data_dir, 'sona_data.csv'),
                    col_types = sona_kapp_demo_cols),
    study == 'bartok')
sona_bartok_roster <- readr::read_csv(
    file.path(data_dir, 'bartok_fname_ids.csv'),
    col_types = readr::cols(first_name = readr::col_character(),
                            survey_id = readr::col_character(),
                            occur_date = readr::col_character()))
sona_bartok_demo <- left_join(sona_bartok_demo, sona_bartok_roster)

yadsv_v2o_pal_2_demo_cons_age <- dplyr::select(
    dplyr::left_join(
        yadsv_v2o_pal_2_demo_cons_age,
        sona_bartok_demo,
        by = c('SID' = 'survey_id')),
    -first_name, -occur_date, -study)

#inf relations only
yadsv_v2o_inf_rel <- yadsv_v2o_long %>%
    dplyr::filter(grepl(SID_filter, SID),
           grepl('inf_rel', item))
#Check that dropped values weren't ambiguous
yadsv_v2o_long_nodupes %>% 
    filter(dropped) %>%
    group_by(SID, item) %>%
    filter(!all(length(unlist(old.value)) < 1)) %>%
    mutate(old.value = paste(old.value, collaps = ' ')) %>%
    knitr::kable(caption = "Questionnaire dupes")

Combine all data

demo_col_types <- readr::cols(
    sid = readr::col_character(),
    s2_date = readr::col_date(format = ""),
    s2_age = readr::col_double(),
    s3_date = readr::col_date(format = ""),
    s3_age = readr::col_double(),
    gender = readr::col_integer(),
    ethnicity = readr::col_character(),
    hispanic_yn = readr::col_integer(),
    fsiq2 = readr::col_integer()
)
anon_id_cols <- readr::cols(
  id = readr::col_character(),
  sample = readr::col_character(),
  anon_id = readr::col_integer(),
  anon_sample = readr::col_character(),
  exclude = readr::col_integer()
)

pal_2_race_eth_mapping <- c(
    '1' = 'European American/White',
    '2' = 'Native American/American Indian/Alaskan Native',
    '3' = 'African American/Black',
    '4' = 'Hispanic/Latino',
    '5' = 'Asian American',
    '6' = 'Pacific Islander/Hawaiian',
    '7' = 'Other'
)

tds_wave2_scored_puberty <- dplyr::bind_rows(tds1_wave2_scored_puberty, tds2_wave2_scored_puberty) %>%
    dplyr::select(-n_items, -n_missing, -method) %>%
    tidyr::unite(scale, scale_name, scored_scale) %>%
    tidyr::spread(scale, score) %>%
    dplyr::rowwise() %>%
    dplyr::mutate_all(funs(ifelse(. == "NaN", NA, .))) %>%
    dplyr::mutate(
        PDS_mean_score = ifelse(!is.na(PDS_female_mean_score) && is.na(PDS_male_mean_score),
                                PDS_female_mean_score,
                                ifelse(is.na(PDS_female_mean_score) && !is.na(PDS_male_mean_score),
                                       PDS_male_mean_score,
                                       NA))) %>%
    dplyr::select(-PDS_female_mean_score, -PDS_male_mean_score)
tds_wave2_dev_demo <- dplyr::full_join(tds_wave2_scored_puberty,
                                       readr::read_csv(redcap_demographics_file,
                                                       col_types = demo_col_types),
                                       by = c('SID' = 'sid')) 
tds_wave2_dev_demo_deid <- dplyr::full_join(
    tds_wave2_dev_demo,
    dplyr::filter(
            readr::read_csv(anon_id_filename, col_types = anon_id_cols),
            !sample %in% c('yads', 'yads_online')),
    by = c('SID' = 'id'))
#Check to make sure everyone has an anon_id, and if they do, fully anonymize the data
if(!all(
    dim(filter(tds_wave2_dev_demo_deid, is.na(anon_id), !is.na(s3_date)))[1] == 0,
    dim(filter(tds_wave2_dev_demo_deid, 
               !is.na(anon_id), is.na(s3_date), is.na(exclude), 
               anon_sample %in% c('TDS1', 'TDS2')))[1] == 0)) {
    stop("Some participants not accounted for, or not anonymized")
} else {
    tds_wave2_dev_demo_deid <- dplyr::left_join(
        tds_wave2_dev_demo,
        dplyr::filter(
            readr::read_csv(anon_id_filename, col_types = anon_id_cols),
            !sample %in% c('yads', 'yads_online')),
        by = c('SID' = 'id')) %>%
        dplyr::filter(is.na(exclude), !is.na(s3_date)) %>%
        dplyr::mutate(age = round(s3_age + runif(n(), -.1, .1), 2)) %>%
        dplyr::select(-SID, -exclude, -sample, -s2_date, -s3_date, -s2_age, -s3_age) %>%
        dplyr::rename(SID = anon_id, sample = anon_sample)
}
yads_scored_puberty <- dplyr::bind_rows(
    yadsv_v1_scored_puberty, yadsv_v2_scored_puberty, yadsv_v2o_scored_puberty) %>%
    dplyr::select(-n_items, -n_missing, -method) %>%
    tidyr::unite(scale, scale_name, scored_scale) %>%
    tidyr::spread(scale, score) %>%
    dplyr::rowwise() %>%
    dplyr::mutate_all(funs(ifelse(. == "NaN", NA, .))) %>%
    dplyr::mutate(
        PDS_mean_score = ifelse(!is.na(PDS_female_mean_score) && is.na(PDS_male_mean_score),
                                PDS_female_mean_score,
                                ifelse(is.na(PDS_female_mean_score) && !is.na(PDS_male_mean_score),
                                       PDS_male_mean_score,
                                       NA))) %>%
    dplyr::select(-PDS_female_mean_score, -PDS_male_mean_score)
yads_dev_demo <- dplyr::full_join(yads_scored_puberty,
                                  dplyr::select(
                                      dplyr::bind_rows(yadsv_v1_pal_2_demo,
                                                       yadsv_v2_pal_2_demo,
                                                       yadsv_v2o_pal_2_demo_cons_age),
                                      -PAL_2_Age, -check_age, -survey_name),
                                  by = 'SID')
yads_dev_demo <- dplyr::mutate(
    yads_dev_demo,
    age = ifelse(is.na(age) || age == '',
                 as.character(sona_age),
                 age),
    PDSS_gender = ifelse(is.na(PDSS_gender) || PDSS_gender == '',
                         c('Male' = '0', 'Female' = '1')[sona_gender],
                         PDSS_gender),
    ethnicity = ifelse(PAL_2_Race == 7,
                       PAL_2_Race_TEXT,
                       pal_2_race_eth_mapping[PAL_2_Race]),
    ethnicity = ifelse(is.na(ethnicity) || ethnicity == '',
                         sona_ethnicity,
                         ethnicity)) %>%
    dplyr::select(-sona_age, -sona_gender, -sona_ethnicity)
yads_dev_demo_deid <- dplyr::full_join(
    yads_dev_demo,
    dplyr::filter(
        readr::read_csv(anon_id_filename, col_types = anon_id_cols),
        sample %in% c('yads', 'yads_online')),
    by = c('SID' = 'id'))

if(!all(
    dim(filter(yads_dev_demo_deid, is.na(anon_id), !is.na(SID), is.na(exclude)))[1] == 0,
    dim(filter(yads_dev_demo_deid, 
               !is.na(anon_id), is.na(SID), is.na(exclude), anon_sample %in% c('yads', 'yads_online')))[1] == 0)) {
    stop("Some participants not accounted for, or not anonymized")
} else {
    yads_dev_demo_deid <- dplyr::left_join(
        yads_dev_demo,
        dplyr::filter(
            readr::read_csv(anon_id_filename, col_types = anon_id_cols),
            sample %in% c('yads', 'yads_online')),
        by = c('SID' = 'id')) %>%
        dplyr::filter(is.na(exclude)) %>%
        dplyr::select(-SID, -exclude, -sample, -PAL_2_Race, -PAL_2_Race_TEXT) %>%
        dplyr::rename(SID = anon_id, sample = anon_sample) %>%
        dplyr::mutate(gender = as.numeric(PDSS_gender), age = as.numeric(age))
}

splt_dev_and_demog <- dplyr::bind_rows(tds_wave2_dev_demo_deid, yads_dev_demo_deid) %>%
    dplyr::left_join(readr::read_csv(ethnicity_recode)) %>%
    dplyr::select(-ethnicity) %>%
    dplyr::rename(ethnicity = ethnicity_recode)