Preconsume archer-solid-cv4 and add fetch loop (#1129)
* Handle archer-solid-cv4 samples
* Add fetch loop
* Move each cohort to its own dir and fix filename
callachennault authored and sheridancbio committed Feb 9, 2024
1 parent d554663 commit f360c62
Showing 2 changed files with 58 additions and 30 deletions.
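For orientation, a minimal sketch of how the reworked script is driven and where it now keeps its working files. The invocation path and cohort name are illustrative assumptions, not part of this commit; the file names come from the variables defined in the diff below.

    # hypothetical run for one cohort (the script takes the cohort name as its only argument)
    /data/portal-cron/scripts/preconsume_problematic_samples.sh mskimpact

    # each cohort now gets its own working directory, cleared at the start of a run, e.g.:
    #   /data/portal-cron/tmp/preconsume_problematic_samples/mskimpact/consume.ids
    #   /data/portal-cron/tmp/preconsume_problematic_samples/mskimpact/problematic_event_consume.ids
    #   /data/portal-cron/tmp/preconsume_problematic_samples/mskimpact/problematic_metadata_consume.ids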
import-scripts/detect_samples_with_problematic_metadata.py (2 changes: 1 addition & 1 deletion)
@@ -39,7 +39,7 @@ def sample_missing_required_metadata(metadata, problematic_metadata):
if metadata[required_field] is None:
problematic_metadata["gene-panel"] = None
return True
if required_field == "gene-panel" and metadata[required_field].casefold() == "unknown":
if required_field == "gene-panel" and metadata[required_field].casefold() in {"unknown", "archer-solid-cv4"}:
problematic_metadata["gene-panel"] = metadata[required_field]
return True
return False
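A rough illustration of how the updated check is exercised from the preconsume script; the file names here are placeholders, but the two-argument form matches the call site in preconsume_problematic_samples.sh below.

    # placeholder file names; the script reads the fetched CVR JSON and writes one sample id per line
    /data/portal-cron/scripts/detect_samples_with_problematic_metadata.py cvr_data_1.json problematic_metadata_consume.ids
    # any sample whose gene-panel is unset, UNKNOWN, or ARCHER-SOLID-CV4 (compared case-insensitively
    # via casefold) now lands in the ids file and is preemptively consumed before the regular fetch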
import-scripts/preconsume_problematic_samples.sh (86 changes: 57 additions & 29 deletions)
@@ -1,7 +1,8 @@
#!/usr/bin/env bash

COHORT=$1
TMP_DIR="/data/portal-cron/tmp/preconsume_problematic_samples"
FETCH_NUM=1
TMP_DIR="/data/portal-cron/tmp/preconsume_problematic_samples/${COHORT}"
CVR_FETCH_PROPERTIES_FILEPATH="/data/portal-cron/git-repos/pipelines-configuration/properties/fetch-cvr/application.properties"
CVR_USERNAME=$(grep 'dmp.user_name' ${CVR_FETCH_PROPERTIES_FILEPATH} | head -n 1 | sed -E s/[^=][^=]*=//)
CVR_PASSWORD=$(grep 'dmp.password' ${CVR_FETCH_PROPERTIES_FILEPATH} | head -n 1 | sed -E s/[^=][^=]*=//)
@@ -13,11 +14,11 @@ CVR_HEME_FETCH_URL_PREFIX="${CVR_TUMOR_SERVER}cbio_retrieve_heme_variants"
CVR_ARCHER_FETCH_URL_PREFIX="${CVR_TUMOR_SERVER}cbio_archer_retrieve_variants"
CVR_ACCESS_FETCH_URL_PREFIX="${CVR_TUMOR_SERVER}cbio_retrieve_access_variants"
CVR_CONSUME_SAMPLE_URL_PREFIX="${CVR_TUMOR_SERVER}cbio_consume_sample"
FETCH_OUTPUT_FILEPATH="$TMP_DIR/cvr_data_${COHORT}.json"
CONSUME_IDS_FILEPATH="$TMP_DIR/${COHORT}_consume.ids"
PROBLEMATIC_EVENT_CONSUME_IDS_FILEPATH="$TMP_DIR/problematic_event_consume_${COHORT}.ids"
PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH="$TMP_DIR/problematic_metadata_consume_${COHORT}.ids"
CONSUME_ATTEMPT_OUTPUT_FILEPATH="$TMP_DIR/consume_attempt_output_${COHORT}.json"
FETCH_OUTPUT_FILEPATH=""
CONSUME_IDS_FILEPATH="$TMP_DIR/consume.ids"
PROBLEMATIC_EVENT_CONSUME_IDS_FILEPATH="$TMP_DIR/problematic_event_consume.ids"
PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH="$TMP_DIR/problematic_metadata_consume.ids"
CONSUME_ATTEMPT_OUTPUT_FILEPATH="$TMP_DIR/consume_attempt_output.json"
DETECT_SAMPLES_WITH_NULL_DP_AD_FIELDS_SCRIPT_FILEPATH=/data/portal-cron/scripts/detect_samples_with_null_dp_ad_fields.py
DETECT_SAMPLES_WITH_PROBLEMATIC_METADATA_SCRIPT_FILEPATH=/data/portal-cron/scripts/detect_samples_with_problematic_metadata.py
CVR_MONITOR_SLACK_URI_FILE="/data/portal-cron/pipelines-credentials/cvr-monitor-webhook-uri"
@@ -29,6 +30,9 @@ function make_tmp_dir_if_necessary() {
echo "Error : could not create tmp directory '$TMP_DIR'" >&2
exit 1
fi
else
# Remove files from last fetch
rm $TMP_DIR/*
fi
}

@@ -57,6 +61,7 @@ function set_cvr_fetch_url_prefix() {
}

function fetch_currently_queued_samples() {
FETCH_OUTPUT_FILEPATH="$TMP_DIR/cvr_data_${FETCH_NUM}.json"
dmp_token=$(curl $CVR_CREATE_SESSION_URL | grep session_id | sed -E 's/",[[:space:]]*$//' | sed -E 's/.*"//')
curl "${CVR_FETCH_URL_PREFIX}/${dmp_token}/0" > ${FETCH_OUTPUT_FILEPATH}
}
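Because FETCH_OUTPUT_FILEPATH is now assigned per fetch rather than once up front, each pass of the new loop leaves its own snapshot in the cohort directory, along the lines of:

    # illustrative names; FETCH_NUM starts at 1 and is incremented after every pass
    #   ${TMP_DIR}/cvr_data_1.json    first fetch
    #   ${TMP_DIR}/cvr_data_2.json    second fetch, run only if the first pass found problematic samples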
@@ -69,10 +74,13 @@ function detect_samples_with_problematic_metadata() {
$DETECT_SAMPLES_WITH_PROBLEMATIC_METADATA_SCRIPT_FILEPATH ${FETCH_OUTPUT_FILEPATH} ${PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH}
}

function exit_if_no_problems_detected() {
function problems_were_detected() {
if [ ! -s ${PROBLEMATIC_EVENT_CONSUME_IDS_FILEPATH} ] && [ ! -s ${PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH} ] ; then
echo "no problematic samples detected .. exiting"
exit 0
echo "no problematic samples detected"
return 1
else
echo "problematic samples were detected"
return 0
fi
}
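Where exit_if_no_problems_detected terminated the script outright, problems_were_detected follows the shell convention that a return code of 0 means "true", so the caller can test it directly; the fetch loop at the bottom of the script uses it exactly this way:

    if ! problems_were_detected ; then
        break    # nothing left to preconsume, leave the fetch loop
    fi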

@@ -108,41 +116,57 @@ function attempt_to_consume_problematic_sample() {
dmp_token="$1"
sample_id="$2"
type_of_problem="$3" # pass 'e' for event problems and 'm' for metadata problems
register_attempt="$4"
HTTP_STATUS=$(curl -sSL -w '%{http_code}' -o "$CONSUME_ATTEMPT_OUTPUT_FILEPATH" "${CVR_CONSUME_SAMPLE_URL_PREFIX}/${dmp_token}/${sample_id}")
if [[ $HTTP_STATUS =~ ^2 ]] ; then
if ! grep '"error": "' "$CONSUME_ATTEMPT_OUTPUT_FILEPATH" ; then
if grep --silent 'affectedRows": 1' "$CONSUME_ATTEMPT_OUTPUT_FILEPATH" ; then
register_successful_consumption "${sample_id}" "$type_of_problem"
continue
if [ "$register_attempt" == true ] ; then
register_successful_consumption "${sample_id}" "$type_of_problem"
continue
fi
fi
fi
fi
register_failed_consumption "${sample_id}" "$type_of_problem"
if [ "$register_attempt" == true ] ; then
register_failed_consumption "${sample_id}" "$type_of_problem"
fi
}

function attempt_to_consume_problematic_samples() {
register_attempt=${1:-true}
dmp_token=$(curl $CVR_CREATE_SESSION_URL | grep session_id | sed -E 's/",[[:space:]]*$//' | sed -E 's/.*"//')
while read sample_id ; do
attempt_to_consume_problematic_sample "$dmp_token" "$sample_id" "e"
attempt_to_consume_problematic_sample "$dmp_token" "$sample_id" "e" "$register_attempt"
done < ${PROBLEMATIC_EVENT_CONSUME_IDS_FILEPATH}
while read sample_id ; do
attempt_to_consume_problematic_sample "$dmp_token" "$sample_id" "m"
attempt_to_consume_problematic_sample "$dmp_token" "$sample_id" "m" "$register_attempt"
done < ${PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH}
}

function consume_hardcoded_samples() {
rm -f ${PROBLEMATIC_EVENT_CONSUME_IDS_FILEPATH} ${PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH}
touch ${PROBLEMATIC_EVENT_CONSUME_IDS_FILEPATH}
touch ${PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH}

if [ "$COHORT" == "mskimpact" ] ; then
echo "P-0025907-N01-IM6" >> "${PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH}"
fi
if [ -f "${PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH}" ] ; then
attempt_to_consume_problematic_samples
# Won't register attempt (so it doesn't show up in logs every night)
attempt_to_consume_problematic_samples false
fi
}

function need_to_log_actions {
if [ ${#succeeded_to_consume_problematic_events_sample_list[@]} -gt 0 ] || \
[ ${#failed_to_consume_problematic_events_sample_list[@]} -gt 0 ] || \
[ ${#succeeded_to_consume_problematic_metadata_sample_list[@]} -gt 0 ] || \
[ ${#failed_to_consume_problematic_metadata_sample_list[@]} -gt 0 ] ; then
return 0
fi
return 1
}

function log_actions() {
date
echo -e "${COHORT^^} Problematic Samples"
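The new register_attempt flag gives attempt_to_consume_problematic_samples two calling modes, both of which appear in this diff:

    attempt_to_consume_problematic_samples          # default (register_attempt=true): record successes and failures for log_actions and Slack
    attempt_to_consume_problematic_samples false    # used for the hardcoded samples, so they are consumed quietly instead of showing up in the logs every night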
@@ -158,7 +182,7 @@ function post_slack_message() {
if [ ${#failed_to_consume_problematic_events_sample_list[@]} -gt 0 ]; then
MESSAGE="${MESSAGE} Attempted Unsuccessfully To Consume :\n${failed_to_consume_problematic_events_sample_list[*]}"
fi
MESSAGE="${MESSAGE}Warning : the following samples have been preemptively consumed before fetch because they contained problematic metadata where the gene-panel property was unset or had value UNKNOWN.\nSuccessfully Consumed :\n${succeeded_to_consume_problematic_metadata_sample_list[*]}"
MESSAGE="${MESSAGE}Warning : the following samples have been preemptively consumed before fetch because they contained problematic metadata where the gene-panel property was unset, invalid, or had value UNKNOWN.\nSuccessfully Consumed :\n${succeeded_to_consume_problematic_metadata_sample_list[*]}"
if [ ${#failed_to_consume_problematic_metadata_sample_list[@]} -gt 0 ]; then
MESSAGE="${MESSAGE} Attempted Unsuccessfully To Consume :\n${failed_to_consume_problematic_metadata_sample_list[*]}"
fi
@@ -168,20 +192,24 @@ function post_slack_message() {
date
check_args
make_tmp_dir_if_necessary
failed_to_consume_problematic_events_sample_list=() # temporary code
succeeded_to_consume_problematic_events_sample_list=() # temporary code
failed_to_consume_problematic_metadata_sample_list=() # temporary code
succeeded_to_consume_problematic_metadata_sample_list=() # temporary code
set_cvr_fetch_url_prefix
consume_hardcoded_samples # temporary code
fetch_currently_queued_samples
detect_samples_with_problematic_events
detect_samples_with_problematic_metadata
exit_if_no_problems_detected
failed_to_consume_problematic_events_sample_list=()
succeeded_to_consume_problematic_events_sample_list=()
failed_to_consume_problematic_metadata_sample_list=()
succeeded_to_consume_problematic_metadata_sample_list=()
attempt_to_consume_problematic_samples
log_actions
post_slack_message
while :
do
consume_hardcoded_samples # temporary code
fetch_currently_queued_samples
detect_samples_with_problematic_events
detect_samples_with_problematic_metadata
if ! problems_were_detected ; then
break
fi
attempt_to_consume_problematic_samples
((FETCH_NUM++))
done
if need_to_log_actions ; then
log_actions
post_slack_message
fi
