diff --git a/import-scripts/detect_samples_with_problematic_metadata.py b/import-scripts/detect_samples_with_problematic_metadata.py
index af6c3050f..a5b57e636 100755
--- a/import-scripts/detect_samples_with_problematic_metadata.py
+++ b/import-scripts/detect_samples_with_problematic_metadata.py
@@ -39,7 +39,7 @@ def sample_missing_required_metadata(metadata, problematic_metadata):
         if metadata[required_field] is None:
             problematic_metadata["gene-panel"] = None
             return True
-        if required_field == "gene-panel" and metadata[required_field].casefold() == "unknown":
+        if required_field == "gene-panel" and metadata[required_field].casefold() in {"unknown", "archer-solid-cv4"}:
             problematic_metadata["gene-panel"] = metadata[required_field]
             return True
     return False
diff --git a/import-scripts/preconsume_problematic_samples.sh b/import-scripts/preconsume_problematic_samples.sh
index 6b748f7c4..fffb63fb7 100755
--- a/import-scripts/preconsume_problematic_samples.sh
+++ b/import-scripts/preconsume_problematic_samples.sh
@@ -1,7 +1,8 @@
 #!/usr/bin/env bash
 
 COHORT=$1
-TMP_DIR="/data/portal-cron/tmp/preconsume_problematic_samples"
+FETCH_NUM=1
+TMP_DIR="/data/portal-cron/tmp/preconsume_problematic_samples/${COHORT}"
 CVR_FETCH_PROPERTIES_FILEPATH="/data/portal-cron/git-repos/pipelines-configuration/properties/fetch-cvr/application.properties"
 CVR_USERNAME=$(grep 'dmp.user_name' ${CVR_FETCH_PROPERTIES_FILEPATH} | head -n 1 | sed -E s/[^=][^=]*=//)
 CVR_PASSWORD=$(grep 'dmp.password' ${CVR_FETCH_PROPERTIES_FILEPATH} | head -n 1 | sed -E s/[^=][^=]*=//)
@@ -13,11 +14,11 @@ CVR_HEME_FETCH_URL_PREFIX="${CVR_TUMOR_SERVER}cbio_retrieve_heme_variants"
 CVR_ARCHER_FETCH_URL_PREFIX="${CVR_TUMOR_SERVER}cbio_archer_retrieve_variants"
 CVR_ACCESS_FETCH_URL_PREFIX="${CVR_TUMOR_SERVER}cbio_retrieve_access_variants"
 CVR_CONSUME_SAMPLE_URL_PREFIX="${CVR_TUMOR_SERVER}cbio_consume_sample"
-FETCH_OUTPUT_FILEPATH="$TMP_DIR/cvr_data_${COHORT}.json"
-CONSUME_IDS_FILEPATH="$TMP_DIR/${COHORT}_consume.ids"
-PROBLEMATIC_EVENT_CONSUME_IDS_FILEPATH="$TMP_DIR/problematic_event_consume_${COHORT}.ids"
-PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH="$TMP_DIR/problematic_metadata_consume_${COHORT}.ids"
-CONSUME_ATTEMPT_OUTPUT_FILEPATH="$TMP_DIR/consume_attempt_output_${COHORT}.json"
+FETCH_OUTPUT_FILEPATH=""
+CONSUME_IDS_FILEPATH="$TMP_DIR/consume.ids"
+PROBLEMATIC_EVENT_CONSUME_IDS_FILEPATH="$TMP_DIR/problematic_event_consume.ids"
+PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH="$TMP_DIR/problematic_metadata_consume.ids"
+CONSUME_ATTEMPT_OUTPUT_FILEPATH="$TMP_DIR/consume_attempt_output.json"
 DETECT_SAMPLES_WITH_NULL_DP_AD_FIELDS_SCRIPT_FILEPATH=/data/portal-cron/scripts/detect_samples_with_null_dp_ad_fields.py
 DETECT_SAMPLES_WITH_PROBLEMATIC_METADATA_SCRIPT_FILEPATH=/data/portal-cron/scripts/detect_samples_with_problematic_metadata.py
 CVR_MONITOR_SLACK_URI_FILE="/data/portal-cron/pipelines-credentials/cvr-monitor-webhook-uri"
@@ -29,6 +30,9 @@ function make_tmp_dir_if_necessary() {
             echo "Error : could not create tmp directory '$TMP_DIR'" >&2
             exit 1
         fi
+    else
+        # Remove files from last fetch
+        rm $TMP_DIR/*
     fi
 }
 
@@ -57,6 +61,7 @@ function set_cvr_fetch_url_prefix() {
 }
 
 function fetch_currently_queued_samples() {
+    FETCH_OUTPUT_FILEPATH="$TMP_DIR/cvr_data_${FETCH_NUM}.json"
     dmp_token=$(curl $CVR_CREATE_SESSION_URL | grep session_id | sed -E 's/",[[:space:]]*$//' | sed -E 's/.*"//')
     curl "${CVR_FETCH_URL_PREFIX}/${dmp_token}/0" > ${FETCH_OUTPUT_FILEPATH}
 }
@@ -69,10 +74,13 @@ function detect_samples_with_problematic_metadata() {
     $DETECT_SAMPLES_WITH_PROBLEMATIC_METADATA_SCRIPT_FILEPATH ${FETCH_OUTPUT_FILEPATH} ${PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH}
 }
 
-function exit_if_no_problems_detected() {
+function problems_were_detected() {
     if [ ! -s ${PROBLEMATIC_EVENT_CONSUME_IDS_FILEPATH} ] && [ ! -s ${PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH} ] ; then
-        echo "no problematic samples detected .. exiting"
-        exit 0
+        echo "no problematic samples detected"
+        return 1
+    else
+        echo "problematic samples were detected"
+        return 0
     fi
 }
 
@@ -108,25 +116,31 @@ function attempt_to_consume_problematic_sample() {
     dmp_token="$1"
     sample_id="$2"
     type_of_problem="$3" # pass 'e' for event problems and 'm' for metadata problems
+    register_attempt="$4"
     HTTP_STATUS=$(curl -sSL -w '%{http_code}' -o "$CONSUME_ATTEMPT_OUTPUT_FILEPATH" "${CVR_CONSUME_SAMPLE_URL_PREFIX}/${dmp_token}/${sample_id}")
     if [[ $HTTP_STATUS =~ ^2 ]] ; then
         if ! grep '"error": "' "$CONSUME_ATTEMPT_OUTPUT_FILEPATH" ; then
             if grep --silent 'affectedRows": 1' "$CONSUME_ATTEMPT_OUTPUT_FILEPATH" ; then
-                register_successful_consumption "${sample_id}" "$type_of_problem"
-                continue
+                if [ "$register_attempt" == true ] ; then
+                    register_successful_consumption "${sample_id}" "$type_of_problem"
+                    continue
+                fi
             fi
         fi
     fi
-    register_failed_consumption "${sample_id}" "$type_of_problem"
+    if [ "$register_attempt" == true ] ; then
+        register_failed_consumption "${sample_id}" "$type_of_problem"
+    fi
 }
 
 function attempt_to_consume_problematic_samples() {
+    register_attempt=${1:-true}
     dmp_token=$(curl $CVR_CREATE_SESSION_URL | grep session_id | sed -E 's/",[[:space:]]*$//' | sed -E 's/.*"//')
     while read sample_id ; do
-        attempt_to_consume_problematic_sample "$dmp_token" "$sample_id" "e"
+        attempt_to_consume_problematic_sample "$dmp_token" "$sample_id" "e" "$register_attempt"
     done < ${PROBLEMATIC_EVENT_CONSUME_IDS_FILEPATH}
     while read sample_id ; do
-        attempt_to_consume_problematic_sample "$dmp_token" "$sample_id" "m"
+        attempt_to_consume_problematic_sample "$dmp_token" "$sample_id" "m" "$register_attempt"
     done < ${PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH}
 }
 
@@ -134,15 +148,25 @@ function consume_hardcoded_samples() {
     rm -f ${PROBLEMATIC_EVENT_CONSUME_IDS_FILEPATH} ${PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH}
     touch ${PROBLEMATIC_EVENT_CONSUME_IDS_FILEPATH}
     touch ${PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH}
-
     if [ "$COHORT" == "mskimpact" ] ; then
         echo "P-0025907-N01-IM6" >> "${PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH}"
     fi
     if [ -f "${PROBLEMATIC_METADATA_CONSUME_IDS_FILEPATH}" ] ; then
-        attempt_to_consume_problematic_samples
+        # Won't register attempt (so it doesn't show up in logs every night)
+        attempt_to_consume_problematic_samples false
     fi
 }
 
+function need_to_log_actions {
+    if [ ${#succeeded_to_consume_problematic_events_sample_list[@]} -gt 0 ] || \
+       [ ${#failed_to_consume_problematic_events_sample_list[@]} -gt 0 ] || \
+       [ ${#succeeded_to_consume_problematic_metadata_sample_list[@]} -gt 0 ] || \
+       [ ${#failed_to_consume_problematic_metadata_sample_list[@]} -gt 0 ] ; then
+        return 0
+    fi
+    return 1
+}
+
 function log_actions() {
     date
     echo -e "${COHORT^^} Problematic Samples"
@@ -158,7 +182,7 @@ function post_slack_message() {
     if [ ${#failed_to_consume_problematic_events_sample_list[@]} -gt 0 ]; then
         MESSAGE="${MESSAGE} Attempted Unsuccessfully To Consume :\n${failed_to_consume_problematic_events_sample_list[*]}"
     fi
-    MESSAGE="${MESSAGE}Warning : the following samples have been preemptively consumed before fetch because they contained problematic metadata where the gene-panel property was unset or had value UNKNOWN.\nSuccessfully Consumed :\n${succeeded_to_consume_problematic_metadata_sample_list[*]}"
+    MESSAGE="${MESSAGE}Warning : the following samples have been preemptively consumed before fetch because they contained problematic metadata where the gene-panel property was unset, invalid, or had value UNKNOWN.\nSuccessfully Consumed :\n${succeeded_to_consume_problematic_metadata_sample_list[*]}"
     if [ ${#failed_to_consume_problematic_metadata_sample_list[@]} -gt 0 ]; then
         MESSAGE="${MESSAGE} Attempted Unsuccessfully To Consume :\n${failed_to_consume_problematic_metadata_sample_list[*]}"
     fi
@@ -168,20 +192,24 @@
 date
 check_args
 make_tmp_dir_if_necessary
-failed_to_consume_problematic_events_sample_list=() # temporary code
-succeeded_to_consume_problematic_events_sample_list=() # temporary code
-failed_to_consume_problematic_metadata_sample_list=() # temporary code
-succeeded_to_consume_problematic_metadata_sample_list=() # temporary code
 set_cvr_fetch_url_prefix
-consume_hardcoded_samples # temporary code
-fetch_currently_queued_samples
-detect_samples_with_problematic_events
-detect_samples_with_problematic_metadata
-exit_if_no_problems_detected
 failed_to_consume_problematic_events_sample_list=()
 succeeded_to_consume_problematic_events_sample_list=()
 failed_to_consume_problematic_metadata_sample_list=()
 succeeded_to_consume_problematic_metadata_sample_list=()
-attempt_to_consume_problematic_samples
-log_actions
-post_slack_message
+while :
+do
+    consume_hardcoded_samples # temporary code
+    fetch_currently_queued_samples
+    detect_samples_with_problematic_events
+    detect_samples_with_problematic_metadata
+    if ! problems_were_detected ; then
+        break
+    fi
+    attempt_to_consume_problematic_samples
+    ((FETCH_NUM++))
+done
+if need_to_log_actions ; then
+    log_actions
+    post_slack_message
+fi
\ No newline at end of file
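Note on the metadata check: the Python hunk above changes the gene-panel test from an equality comparison against "unknown" into a case-insensitive membership test over {"unknown", "archer-solid-cv4"}. Below is a minimal standalone sketch of that predicate for illustration only; the helper name gene_panel_is_problematic and the example values are hypothetical and are not part of the patched scripts.

#!/usr/bin/env python3
# Illustrative sketch only: mirrors the casefold() membership test added in
# detect_samples_with_problematic_metadata.py. Names and example values here
# are hypothetical, not taken from the patched script.

PROBLEMATIC_GENE_PANEL_VALUES = {"unknown", "archer-solid-cv4"}

def gene_panel_is_problematic(gene_panel):
    """Return True when the gene-panel value is missing or matches a known-bad value."""
    if gene_panel is None:
        return True
    return gene_panel.casefold() in PROBLEMATIC_GENE_PANEL_VALUES

if __name__ == "__main__":
    # "UNKNOWN" and "Archer-Solid-CV4" match case-insensitively; other panel ids do not.
    for value in (None, "UNKNOWN", "Archer-Solid-CV4", "IMPACT505"):
        print(value, gene_panel_is_problematic(value))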