Modify preconsume script to work on one cohort at a time (knowledgesy…
callachennault authored and sheridancbio committed Feb 9, 2024
1 parent f39b12f commit 3a9ff4b
Showing 2 changed files with 51 additions and 31 deletions.
11 changes: 8 additions & 3 deletions import-scripts/fetch-dmp-data-for-import.sh
@@ -102,9 +102,6 @@ MY_FLOCK_FILEPATH="/data/portal-cron/cron-lock/fetch-dmp-data-for-import.lock"
fi
fi

# pre-consume any samples missing values for normal_ad/tumor_dp/ad fields, or having UNKNOWN Archer gene-panel
bash $PORTAL_HOME/scripts/preconsume_problematic_samples.sh

# fetch clinical data from data repository
echo "fetching updates from dmp repository..."
$JAVA_BINARY $JAVA_IMPORTER_ARGS --fetch-data --data-source dmp --run-date latest
@@ -188,6 +185,8 @@ MY_FLOCK_FILEPATH="/data/portal-cron/cron-lock/fetch-dmp-data-for-import.lock"
# fetch new/updated IMPACT samples using CVR Web service (must come after git fetching)
drop_dead_instant_step=$(date --date="+3hours" -Iseconds) # nearly 3 hours from now
drop_dead_instant_string=$(find_earlier_instant "$drop_dead_instant_step" "$DROP_DEAD_INSTANT_END_TO_END")
# pre-consume any samples missing values for normal_ad/tumor_dp/ad fields, or having UNKNOWN gene-panel
bash $PORTAL_HOME/scripts/preconsume_problematic_samples.sh mskimpact
printTimeStampedDataProcessingStepMessage "CVR fetch for mskimpact"
$JAVA_BINARY $JAVA_CVR_FETCHER_ARGS -d $MSK_IMPACT_DATA_HOME -p $MSK_IMPACT_PRIVATE_DATA_HOME -n data_clinical_mskimpact_data_clinical_cvr.txt -i mskimpact -r 150 $CVR_TEST_MODE_ARGS -z $drop_dead_instant_string
if [ $? -gt 0 ] ; then
@@ -265,6 +264,8 @@ MY_FLOCK_FILEPATH="/data/portal-cron/cron-lock/fetch-dmp-data-for-import.lock"
# fetch new/updated heme samples using CVR Web service (must come after git fetching). Threshold is set to 50 since heme contains only 190 samples (07/12/2017)
drop_dead_instant_step=$(date --date="+3hours" -Iseconds) # nearly 3 hours from now
drop_dead_instant_string=$(find_earlier_instant "$drop_dead_instant_step" "$DROP_DEAD_INSTANT_END_TO_END")
# pre-consume any samples missing values for normal_ad/tumor_dp/ad fields, or having UNKNOWN gene-panel
bash $PORTAL_HOME/scripts/preconsume_problematic_samples.sh mskimpact_heme
printTimeStampedDataProcessingStepMessage "CVR fetch for hemepact"
$JAVA_BINARY $JAVA_CVR_FETCHER_ARGS -d $MSK_HEMEPACT_DATA_HOME -p $MSK_HEMEPACT_PRIVATE_DATA_HOME -n data_clinical_hemepact_data_clinical.txt -i mskimpact_heme -r 50 $CVR_TEST_MODE_ARGS -z $drop_dead_instant_string
if [ $? -gt 0 ] ; then
@@ -329,6 +330,8 @@ MY_FLOCK_FILEPATH="/data/portal-cron/cron-lock/fetch-dmp-data-for-import.lock"
# fetch new/updated archer samples using CVR Web service (must come after git fetching).
drop_dead_instant_step=$(date --date="+3hours" -Iseconds) # nearly 3 hours from now
drop_dead_instant_string=$(find_earlier_instant "$drop_dead_instant_step" "$DROP_DEAD_INSTANT_END_TO_END")
# pre-consume any samples missing values for normal_ad/tumor_dp/ad fields, or having UNKNOWN gene-panel
bash $PORTAL_HOME/scripts/preconsume_problematic_samples.sh mskarcher
printTimeStampedDataProcessingStepMessage "CVR fetch for archer"
# archer has -b option to block warnings for samples with zero variants (all samples will have zero variants)
$JAVA_BINARY $JAVA_CVR_FETCHER_ARGS -d $MSK_ARCHER_UNFILTERED_DATA_HOME -p $MSK_ARCHER_UNFILTERED_PRIVATE_DATA_HOME -n data_clinical_mskarcher_data_clinical.txt -i mskarcher -s -b -r 50 $CVR_TEST_MODE_ARGS -z $drop_dead_instant_string
@@ -364,6 +367,8 @@ MY_FLOCK_FILEPATH="/data/portal-cron/cron-lock/fetch-dmp-data-for-import.lock"
# fetch new/updated access samples using CVR Web service (must come after git fetching).
drop_dead_instant_step=$(date --date="+3hours" -Iseconds) # nearly 3 hours from now
drop_dead_instant_string=$(find_earlier_instant "$drop_dead_instant_step" "$DROP_DEAD_INSTANT_END_TO_END")
# pre-consume any samples missing values for normal_ad/tumor_dp/ad fields, or having UNKNOWN gene-panel
bash $PORTAL_HOME/scripts/preconsume_problematic_samples.sh mskaccess
printTimeStampedDataProcessingStepMessage "CVR fetch for access"
# access has -b option to block warnings for samples with zero variants (all samples will have zero variants)
$JAVA_BINARY $JAVA_CVR_FETCHER_ARGS -d $MSK_ACCESS_DATA_HOME -p $MSK_ACCESS_PRIVATE_DATA_HOME -n data_clinical_mskaccess_data_clinical.txt -i mskaccess -s -b -r 50 $CVR_TEST_MODE_ARGS -z $drop_dead_instant_string
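In the fetch script above, the single preconsume call that used to run once near the top is replaced by a per-cohort call placed immediately before each cohort's CVR fetch. A minimal sketch of running that step by hand for one cohort, assuming $PORTAL_HOME resolves to /data/portal-cron as the hard-coded helper paths in the script suggest (illustrative only, not part of the commit):

    # preconsume problematic samples for just the ARCHER queue
    bash /data/portal-cron/scripts/preconsume_problematic_samples.sh mskarcher
    # the per-cohort output files are suffixed with the cohort name
    cat /data/portal-cron/tmp/preconsume_problematic_samples/problematic_metadata_consume_mskarcher.ids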
71 changes: 43 additions & 28 deletions import-scripts/preconsume_problematic_samples.sh
@@ -1,27 +1,23 @@
#!/usr/bin/env bash

COHORT=$1
TMP_DIR="/data/portal-cron/tmp/preconsume_problematic_samples"
CVR_FETCH_PROPERTIES_FILEPATH="/data/portal-cron/git-repos/pipelines-configuration/properties/fetch-cvr/application.properties"
CVR_USERNAME=$(grep 'dmp.user_name' ${CVR_FETCH_PROPERTIES_FILEPATH} | head -n 1 | sed -E s/[^=][^=]*=//)
CVR_PASSWORD=$(grep 'dmp.password' ${CVR_FETCH_PROPERTIES_FILEPATH} | head -n 1 | sed -E s/[^=][^=]*=//)
CVR_TUMOR_SERVER=$(grep 'dmp.server_name' ${CVR_FETCH_PROPERTIES_FILEPATH} | head -n 1 | sed -E s/[^=][^=]*=//)
CVR_CREATE_SESSION_URL="${CVR_TUMOR_SERVER}create_session/${CVR_USERNAME}/${CVR_PASSWORD}/0"
CVR_FETCH_URL_PREFIX=""
CVR_IMPACT_FETCH_URL_PREFIX="${CVR_TUMOR_SERVER}cbio_retrieve_variants"
CVR_HEME_FETCH_URL_PREFIX="${CVR_TUMOR_SERVER}cbio_retrieve_heme_variants"
CVR_ARCHER_FETCH_URL_PREFIX="${CVR_TUMOR_SERVER}cbio_archer_retrieve_variants"
CVR_ACCESS_FETCH_URL_PREFIX="${CVR_TUMOR_SERVER}cbio_retrieve_access_variants"
CVR_CONSUME_SAMPLE_URL_PREFIX="${CVR_TUMOR_SERVER}cbio_consume_sample"
IMPACT_FETCH_OUTPUT_FILEPATH="$TMP_DIR/cvr_data_impact.json"
HEME_FETCH_OUTPUT_FILEPATH="$TMP_DIR/cvr_data_heme.json"
ARCHER_FETCH_OUTPUT_FILEPATH="$TMP_DIR/cvr_data_archer.json"
ACCESS_FETCH_OUTPUT_FILEPATH="$TMP_DIR/cvr_data_access.json"
IMPACT_CONSUME_IDS_FILEPATH="$TMP_DIR/impact_consume.ids"
HEME_CONSUME_IDS_FILEPATH="$TMP_DIR/heme_consume.ids"
ARCHER_CONSUME_IDS_FILEPATH="$TMP_DIR/archer_consume.ids"
ACCESS_CONSUME_IDS_FILEPATH="$TMP_DIR/access_consume.ids"
PROBLEMATIC_EVENT_CONSUME_IDS_FILEPATH="$TMP_DIR/problematic_event_consume.ids"
PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH="$TMP_DIR/problematic_metadata_consume.ids"
CONSUME_ATTEMPT_OUTPUT_FILEPATH="$TMP_DIR/consume_attempt_output.json"
FETCH_OUTPUT_FILEPATH="$TMP_DIR/cvr_data_${COHORT}.json"
CONSUME_IDS_FILEPATH="$TMP_DIR/${COHORT}_consume.ids"
PROBLEMATIC_EVENT_CONSUME_IDS_FILEPATH="$TMP_DIR/problematic_event_consume_${COHORT}.ids"
PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH="$TMP_DIR/problematic_metadata_consume_${COHORT}.ids"
CONSUME_ATTEMPT_OUTPUT_FILEPATH="$TMP_DIR/consume_attempt_output_${COHORT}.json"
DETECT_SAMPLES_WITH_NULL_DP_AD_FIELDS_SCRIPT_FILEPATH=/data/portal-cron/scripts/detect_samples_with_null_dp_ad_fields.py
DETECT_SAMPLES_WITH_PROBLEMATIC_METADATA_SCRIPT_FILEPATH=/data/portal-cron/scripts/detect_samples_with_problematic_metadata.py
CVR_MONITOR_SLACK_URI_FILE="/data/portal-cron/pipelines-credentials/cvr-monitor-webhook-uri"
@@ -36,27 +32,41 @@ function make_tmp_dir_if_necessary() {
fi
}

function check_args() {
if [[ -z $COHORT ]] || [[ "$COHORT" != "mskimpact" && "$COHORT" != "mskimpact_heme" && "$COHORT" != "mskaccess" && "$COHORT" != "mskarcher" ]]; then
usage
exit 1
fi
}

function usage {
echo "preconsume_problematic_samples.sh \$COHORT_ID"
echo -e "\t\$COHORT_ID one of: ['mskimpact', 'mskimpact_heme', 'mskaccess', 'mskarcher']"
}
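# Illustrative usage sketch (not part of the commit): the script now requires a cohort id
# and refuses anything outside the list above.
#   bash preconsume_problematic_samples.sh mskimpact_heme   # valid: runs the HEMEPACT preconsume pass
#   bash preconsume_problematic_samples.sh foo              # invalid: prints the usage text and exits 1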

function set_cvr_fetch_url_prefix() {
if [ "$COHORT" == "mskimpact" ] ; then
CVR_FETCH_URL_PREFIX=$CVR_IMPACT_FETCH_URL_PREFIX
elif [ "$COHORT" == "mskimpact_heme" ] ; then
CVR_FETCH_URL_PREFIX=$CVR_HEME_FETCH_URL_PREFIX
elif [ "$COHORT" == "mskarcher" ] ; then
CVR_FETCH_URL_PREFIX=$CVR_ARCHER_FETCH_URL_PREFIX
elif [ "$COHORT" == "mskaccess" ] ; then
CVR_FETCH_URL_PREFIX=$CVR_ACCESS_FETCH_URL_PREFIX
fi
}
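# For reference, an equivalent case statement (a sketch only, not part of the commit; it
# selects the same prefix as the if/elif chain above):
#   case "$COHORT" in
#       mskimpact)      CVR_FETCH_URL_PREFIX=$CVR_IMPACT_FETCH_URL_PREFIX ;;
#       mskimpact_heme) CVR_FETCH_URL_PREFIX=$CVR_HEME_FETCH_URL_PREFIX ;;
#       mskarcher)      CVR_FETCH_URL_PREFIX=$CVR_ARCHER_FETCH_URL_PREFIX ;;
#       mskaccess)      CVR_FETCH_URL_PREFIX=$CVR_ACCESS_FETCH_URL_PREFIX ;;
#   esac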

function fetch_currently_queued_samples() {
dmp_token=$(curl $CVR_CREATE_SESSION_URL | grep session_id | sed -E 's/",[[:space:]]*$//' | sed -E 's/.*"//')
curl "${CVR_IMPACT_FETCH_URL_PREFIX}/${dmp_token}/0" > ${IMPACT_FETCH_OUTPUT_FILEPATH}
curl "${CVR_HEME_FETCH_URL_PREFIX}/${dmp_token}/0" > ${HEME_FETCH_OUTPUT_FILEPATH}
curl "${CVR_ARCHER_FETCH_URL_PREFIX}/${dmp_token}/0" > ${ARCHER_FETCH_OUTPUT_FILEPATH}
curl "${CVR_ACCESS_FETCH_URL_PREFIX}/${dmp_token}/0" > ${ACCESS_FETCH_OUTPUT_FILEPATH}
curl "${CVR_FETCH_URL_PREFIX}/${dmp_token}/0" > ${FETCH_OUTPUT_FILEPATH}
}
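# Sketch of the token extraction above, assuming create_session returns JSON containing a
# "session_id" field (the grep implies this; the value below is hypothetical):
#   echo '  "session_id": "abc123",' | sed -E 's/",[[:space:]]*$//' | sed -E 's/.*"//'
#   # prints: abc123
# The single per-cohort fetch then hits ${CVR_FETCH_URL_PREFIX}/<token>/0 and writes the
# response to ${FETCH_OUTPUT_FILEPATH} for the selected cohort.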

function detect_samples_with_problematic_events() {
$DETECT_SAMPLES_WITH_NULL_DP_AD_FIELDS_SCRIPT_FILEPATH ${IMPACT_FETCH_OUTPUT_FILEPATH} ${IMPACT_CONSUME_IDS_FILEPATH}
$DETECT_SAMPLES_WITH_NULL_DP_AD_FIELDS_SCRIPT_FILEPATH ${HEME_FETCH_OUTPUT_FILEPATH} ${HEME_CONSUME_IDS_FILEPATH}
$DETECT_SAMPLES_WITH_NULL_DP_AD_FIELDS_SCRIPT_FILEPATH ${ACCESS_FETCH_OUTPUT_FILEPATH} ${ACCESS_CONSUME_IDS_FILEPATH}
$DETECT_SAMPLES_WITH_NULL_DP_AD_FIELDS_SCRIPT_FILEPATH ${FETCH_OUTPUT_FILEPATH} ${PROBLEMATIC_EVENT_CONSUME_IDS_FILEPATH}
}

function detect_samples_with_problematic_metadata() {
$DETECT_SAMPLES_WITH_PROBLEMATIC_METADATA_SCRIPT_FILEPATH ${ARCHER_FETCH_OUTPUT_FILEPATH} ${ARCHER_CONSUME_IDS_FILEPATH}
}

function combine_problematic_samples_for_consumption() {
cat ${IMPACT_CONSUME_IDS_FILEPATH} ${HEME_CONSUME_IDS_FILEPATH} ${ACCESS_CONSUME_IDS_FILEPATH} > ${PROBLEMATIC_EVENT_CONSUME_IDS_FILEPATH}
cat ${ARCHER_CONSUME_IDS_FILEPATH} > ${PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH}
$DETECT_SAMPLES_WITH_PROBLEMATIC_METADATA_SCRIPT_FILEPATH ${FETCH_OUTPUT_FILEPATH} ${PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH}
}
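# The detect helpers above are expected to write one problematic sample id per line into
# the per-cohort consume-ids files; hypothetical example contents (ids invented for
# illustration, following the id pattern seen in consume_hardcoded_samples below):
#   P-0000001-T01-IM6
#   P-0000002-N01-IM6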

function exit_if_no_problems_detected() {
@@ -124,14 +134,18 @@ function consume_hardcoded_samples() {
rm -f ${PROBLEMATIC_EVENT_CONSUME_IDS_FILEPATH} ${PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH}
touch ${PROBLEMATIC_EVENT_CONSUME_IDS_FILEPATH}
touch ${PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH}
echo "P-0025907-N01-IM6" >> "${PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH}"

if [ "$COHORT" == "mskimpact" ] ; then
echo "P-0025907-N01-IM6" >> "${PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH}"
fi
if [ -f "${PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH}" ] ; then
attempt_to_consume_problematic_samples
fi
}

function log_actions() {
date
echo -e "${COHORT^^} Problematic Samples"
echo -e "consumed the following samples with problematic events:\n${succeeded_to_consume_problematic_events_sample_list[*]}"
echo -e "attempted but failed to consume the following samples with problematic events:\n${failed_to_consume_problematic_events_sample_list[*]}"
echo -e "consumed the following samples with problematic metadata:\n${succeeded_to_consume_problematic_metadata_sample_list[*]}"
@@ -140,28 +154,29 @@ function log_actions() {
}

function post_slack_message() {
MESSAGE="<@U02D0Q0RWUE> <@U03FERRJ6SE> Warning : the following samples have been preemptively consumed before fetch because they contained events which required a value for fields {normal_dp, normal_ad, tumor_dp, tumor_ad} but contained no value in one or more of these fields.\nSuccessfully Consumed :\n${succeeded_to_consume_problematic_events_sample_list[*]}"
MESSAGE="<@U02D0Q0RWUE> <@U03FERRJ6SE> ${COHORT^^} Problematic Samples \nWarning : \nThe following samples have been preemptively consumed before fetch because they contained events which required a value for fields {normal_dp, normal_ad, tumor_dp, tumor_ad} but contained no value in one or more of these fields.\nSuccessfully Consumed :\n${succeeded_to_consume_problematic_events_sample_list[*]}"
if [ ${#failed_to_consume_problematic_events_sample_list[@]} -gt 0 ]; then
MESSAGE="${MESSAGE} Attempted Unsuccessfully To Consume :\n${failed_to_consume_problematic_events_sample_list[*]}"
fi
MESSAGE="${MESSAGE}Warning : the following samples have been reemptively consumed before fetch because they contained problematic metadata where the gene-panel property was unset or had value UNKNOWN.\nSuccessfully Consumed :\n${succeeded_to_consume_problematic_metadata_sample_list[*]}"
MESSAGE="${MESSAGE}Warning : the following samples have been preemptively consumed before fetch because they contained problematic metadata where the gene-panel property was unset or had value UNKNOWN.\nSuccessfully Consumed :\n${succeeded_to_consume_problematic_metadata_sample_list[*]}"
if [ ${#failed_to_consume_problematic_metadata_sample_list[@]} -gt 0 ]; then
MESSAGE="${MESSAGE} Attempted Unsuccessfully To Consume :\n${failed_to_consume_problematic_metadata_sample_list[*]}"
fi
curl -X POST -H 'Content-type: application/json' --data '{"blocks": [ { "type": "section", "text": { "type": "mrkdwn", "text": "'"$MESSAGE"'" } } ] }' "$CVR_MONITOR_URI"
}

date
check_args
make_tmp_dir_if_necessary
failed_to_consume_problematic_events_sample_list=() # temporary code
succeeded_to_consume_problematic_events_sample_list=() # temporary code
failed_to_consume_problematic_metadata_sample_list=() # temporary code
succeeded_to_consume_problematic_metadata_sample_list=() # temporary code
set_cvr_fetch_url_prefix
consume_hardcoded_samples # temporary code
fetch_currently_queued_samples
detect_samples_with_problematic_events
detect_samples_with_problematic_metadata
combine_problematic_samples_for_consumption
exit_if_no_problems_detected
failed_to_consume_problematic_events_sample_list=()
succeeded_to_consume_problematic_events_sample_list=()