-
Notifications
You must be signed in to change notification settings - Fork 1
/
rrun-z-document-text-export-batch.sh
executable file
·78 lines (67 loc) · 3.17 KB
/
rrun-z-document-text-export-batch.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/bin/bash
# Running this script results in the export of the actionable text documents
# for one collection to a Google Cloud Storage bucket.
# This step is used to export the text so that it can be processed outside of
# Dataflow, by AI Platform for example.
#
# Usage: run from the directory that contains rrun.env.sh.
# Output of the export script (stdout and stderr) is written to
# ./logs/text-export-<COLLECTION>.log.
#
# PROJECT_ID, WORK_BUCKET, STAGE_LOCATION and TEMP_LOCATION are not assigned in
# this file unless the commented lines below are enabled.
# NOTE(review): they are presumably provided by rrun.env.sh -- confirm.

# Abort if the environment file cannot be loaded; continuing would launch the
# export with empty settings.
source ./rrun.env.sh || {
  echo "ERROR: unable to source ./rrun.env.sh" >&2
  exit 1
}

########### CHOOSE STAGE OR PROD
## STAGE
# PROJECT_ID=lithe-vault-265816
# WORK_BUCKET=gs://translator-tm-provider-datastore-staging-stage
## PROD
# PROJECT_ID=translator-text-workflow-dev
# WORK_BUCKET=gs://translator-text-workflow-dev_work

########### CHOOSE MEDLINE OR PMCOA
#### MEDLINE
TEXT_PIPELINE_KEY=MEDLINE_XML_TO_TEXT
TEXT_PIPELINE_VERSION='0.1.0'
SUBSET_PREFIX=PUBMED_SUB_
MAX_SUBSET_INDEX=37
#### PMCOA
# TEXT_PIPELINE_KEY=BIOC_TO_TEXT
# TEXT_PIPELINE_VERSION=
# SUBSET_PREFIX=PMC_SUBSET_
# MAX_SUBSET_INDEX=36

SCRIPT=./scripts/pipelines/export/run_document_text_export.sh
# STAGE_LOCATION=$WORK_BUCKET/staging
# TEMP_LOCATION=$WORK_BUCKET/temp
OVERWRITE_FLAG=YES
JAR_VERSION='0.2.1'

# Fail fast on missing settings: the export script receives its arguments
# positionally, so an empty value must never be passed along silently.
: "${PROJECT_ID:?PROJECT_ID must be set (rrun.env.sh or the STAGE/PROD section)}"
: "${WORK_BUCKET:?WORK_BUCKET must be set (rrun.env.sh or the STAGE/PROD section)}"
: "${STAGE_LOCATION:?STAGE_LOCATION must be set (e.g. \$WORK_BUCKET/staging)}"
: "${TEMP_LOCATION:?TEMP_LOCATION must be set (e.g. \$WORK_BUCKET/temp)}"
: "${TEXT_PIPELINE_VERSION:?TEXT_PIPELINE_VERSION must be set}"

echo "Starting document text export"

COLLECTION=PUBMED_SUB_30
#COLLECTION=TEST

OUTPUT_BUCKET="$WORK_BUCKET/output/text-export/$COLLECTION/text-export"
LOG_FILE="./logs/text-export-${COLLECTION}.log"

# The redirection below fails if the log directory does not exist yet.
mkdir -p ./logs || {
  echo "ERROR: unable to create ./logs" >&2
  exit 1
}

# Every argument is quoted so that each one keeps its positional slot.
"$SCRIPT" "$PROJECT_ID" "$COLLECTION" "$TEXT_PIPELINE_KEY" "$TEXT_PIPELINE_VERSION" \
  "$OVERWRITE_FLAG" "$OUTPUT_BUCKET" "$STAGE_LOCATION" "$TEMP_LOCATION" "$JAR_VERSION" \
  &> "$LOG_FILE" || {
  status=$?
  echo "ERROR: text export for ${COLLECTION} failed with status ${status}; see ${LOG_FILE}" >&2
  exit "$status"
}
# Batch mode (disabled): launches up to four subsets in parallel, staggered by
# 120 seconds, then waits for the whole batch before starting the next one.
# The step of 4 matches the four launches per iteration.
# for INDEX in $(seq 0 4 $MAX_SUBSET_INDEX)
# do
# ind=$(($INDEX + 0))
# if (( ind <= $MAX_SUBSET_INDEX)); then
# echo "Starting document text export pipeline... ${ind} $(date)"
# COLLECTION=${SUBSET_PREFIX}${ind}
# OUTPUT_BUCKET="$WORK_BUCKET/output/text-export/$COLLECTION/text-export"
# $SCRIPT $PROJECT_ID $COLLECTION $TEXT_PIPELINE_KEY $TEXT_PIPELINE_VERSION $OVERWRITE_FLAG $OUTPUT_BUCKET ${STAGE_LOCATION} ${TEMP_LOCATION} ${JAR_VERSION} &> ./logs/text-export-${ind}.log &
# sleep 120
# fi
# ind=$(($INDEX + 1))
# if (( ind <= $MAX_SUBSET_INDEX)); then
# echo "Starting document text export pipeline... ${ind} $(date)"
# COLLECTION=${SUBSET_PREFIX}${ind}
# OUTPUT_BUCKET="$WORK_BUCKET/output/text-export/$COLLECTION/text-export"
# $SCRIPT $PROJECT_ID $COLLECTION $TEXT_PIPELINE_KEY $TEXT_PIPELINE_VERSION $OVERWRITE_FLAG $OUTPUT_BUCKET ${STAGE_LOCATION} ${TEMP_LOCATION} ${JAR_VERSION} &> ./logs/text-export-${ind}.log &
# sleep 120
# fi
# ind=$(($INDEX + 2))
# if (( ind <= $MAX_SUBSET_INDEX)); then
# echo "Starting document text export pipeline... ${ind} $(date)"
# COLLECTION=${SUBSET_PREFIX}${ind}
# OUTPUT_BUCKET="$WORK_BUCKET/output/text-export/$COLLECTION/text-export"
# $SCRIPT $PROJECT_ID $COLLECTION $TEXT_PIPELINE_KEY $TEXT_PIPELINE_VERSION $OVERWRITE_FLAG $OUTPUT_BUCKET ${STAGE_LOCATION} ${TEMP_LOCATION} ${JAR_VERSION} &> ./logs/text-export-${ind}.log &
# sleep 120
# fi
# ind=$(($INDEX + 3))
# if (( ind <= $MAX_SUBSET_INDEX)); then
# echo "Starting document text export pipeline... ${ind} $(date)"
# COLLECTION=${SUBSET_PREFIX}${ind}
# OUTPUT_BUCKET="$WORK_BUCKET/output/text-export/$COLLECTION/text-export"
# $SCRIPT $PROJECT_ID $COLLECTION $TEXT_PIPELINE_KEY $TEXT_PIPELINE_VERSION $OVERWRITE_FLAG $OUTPUT_BUCKET ${STAGE_LOCATION} ${TEMP_LOCATION} ${JAR_VERSION} &> ./logs/text-export-${ind}.log &
# fi
# wait
# done