Skip to content

Commit

Permalink
BigQuery dataset: Added vwallsignals view, aggregating all signals in…
Browse files Browse the repository at this point in the history
…to one view

BigQuery dataset: Added tables for emoticons, modifiers, negators, patterns, prefixes, substitutions
csdict: fixed data bug in custom-idioms.csv, added ADJP_ADVP_chunk signal to modifiers file, renamed substitutions file
  • Loading branch information
datancoffee committed Nov 4, 2017
1 parent 4b4c399 commit c8c882d
Show file tree
Hide file tree
Showing 23 changed files with 341 additions and 25 deletions.
2 changes: 1 addition & 1 deletion bigquery/abbreviationsSchema.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
},
{
"mode": "NULLABLE",
"name": "keyhash",
"name": "shortkey",
"type": "STRING",
"description" : "First 8 characters of TO_BASE64(SHA256()) of all key columns (concatenated with '/'), made URL-safe by replacing '/' with '_' and '+' with '-'"
}
Expand Down
23 changes: 23 additions & 0 deletions bigquery/bfoverridesSchema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
[
{
"mode": "REQUIRED",
"name": "lemma_key",
"type": "STRING"
},
{
"mode": "REQUIRED",
"name": "pos_key",
"type": "STRING"
},
{
"mode": "REQUIRED",
"name": "baseform",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "shortkey",
"type": "STRING",
"description" : "Url-safe (/ and + replaced with _ and -) variation of base64 algo. Takes first 8 chars of TO_BASE64(SHA256()) of all key columns, concatenated with '/'. Equivalent to org.apache.commons.codec.binary.Base64.encodeBase64URLSafeString"
}
]
4 changes: 4 additions & 0 deletions bigquery/build_dataset.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,7 @@ echo "Finished loading data"
# Fill the shortkey column of the dictionary tables. Stop on failure so the
# log never reports a step as finished when it was not.
echo "Beginning to update hashes"
if ! ./build_update_hashes.sh; then
  echo "Updating hashes failed" >&2
  exit 1
fi
echo "Finished updating hashes"

# Create the views defined on top of the dictionary tables.
echo "Beginning to build views"
if ! ./build_views.sh mk; then
  echo "Building views failed" >&2
  exit 1
fi
echo "Finished building views"
28 changes: 28 additions & 0 deletions bigquery/build_reload_metadata.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,31 @@ bq load --skip_leading_rows=1 --allow_jagged_rows=1 $DATASET_ID.profanityidioms
gsutil cp ../src/main/resources/csdict/qualities-en.csv gs://$GCS_BUCKET/temp/
bq load --skip_leading_rows=1 --allow_jagged_rows=1 $DATASET_ID.qualities gs://$GCS_BUCKET/temp/qualities-en.csv qualitiesSchema.json


# Uploads one dictionary CSV to the temp/ folder of the bucket and loads it
# into the table of the same name.
# The table name is the file name without the ".csv" extension and without the
# "-en" language suffix; the schema file is <table>Schema.json.
# Globals:   GCS_BUCKET (read), DATASET_ID (read)
# Arguments: $1 - CSV file name inside ../src/main/resources/csdict
load_dictionary_csv() {
  dict_table=${1%.csv}
  dict_table=${dict_table%-en}

  gsutil cp "../src/main/resources/csdict/$1" "gs://${GCS_BUCKET}/temp/"
  bq load --skip_leading_rows=1 --allow_jagged_rows=1 \
    "${DATASET_ID}.${dict_table}" \
    "gs://${GCS_BUCKET}/temp/$1" \
    "${dict_table}Schema.json"
}

# emoticons.csv and patterns.csv carry no "-en" suffix, so the list holds file
# names rather than table names.
for dict_csv in \
  bfoverrides-en.csv \
  emoticons.csv \
  modifiers-en.csv \
  negators-en.csv \
  patterns.csv \
  prefixes-en.csv \
  substitutions-en.csv; do
  load_dictionary_csv "$dict_csv"
done







8 changes: 8 additions & 0 deletions bigquery/build_tables.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,5 +38,13 @@ bq $1 --schema=interjectionsSchema.json -t $DATASET_ID.interjections
bq $1 --schema=profanityIdiomsSchema.json -t $DATASET_ID.profanityidioms
bq $1 --schema=qualitiesSchema.json -t $DATASET_ID.qualities

# Run the bq sub-command passed as $1 for each of the tables below, using the
# schema file <table>Schema.json.
# NOTE(review): $1 is presumably "mk" or "update", as validated in
# build_views.sh — confirm against the top of this script.
for table_name in \
  bfoverrides \
  emoticons \
  modifiers \
  negators \
  patterns \
  prefixes \
  substitutions; do
  bq "$1" --schema="${table_name}Schema.json" -t "${DATASET_ID}.${table_name}"
done



54 changes: 46 additions & 8 deletions bigquery/build_update_hashes.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,41 +18,79 @@ DATASET_ID=${DATASET_ID:-"sirocco_dict_en"}

bq query --use_legacy_sql=false \
'UPDATE `'$PROJECT_ID'.'$DATASET_ID'.abbreviations`
SET keyhash = SUBSTR(TO_BASE64(SHA256(abbrev_key)),1,8)
SET shortkey = REPLACE(REPLACE(SUBSTR(TO_BASE64(SHA256(abbrev_key)),1,8),"/","_"),"+","-")
WHERE true';

bq query --use_legacy_sql=false \
'UPDATE `'$PROJECT_ID'.'$DATASET_ID'.degreeadverbs`
SET keyhash = SUBSTR(TO_BASE64(SHA256(CONCAT(lemma_key,"/",pos_key))),1,8)
SET shortkey = REPLACE(REPLACE(SUBSTR(TO_BASE64(SHA256(CONCAT(lemma_key,"/",pos_key))),1,8),"/","_"),"+","-")
WHERE true';

bq query --use_legacy_sql=false \
'UPDATE `'$PROJECT_ID'.'$DATASET_ID'.emotions`
SET keyhash = SUBSTR(TO_BASE64(SHA256(CONCAT(lemma_key,"/",pos_key))),1,8)
SET shortkey = REPLACE(REPLACE(SUBSTR(TO_BASE64(SHA256(CONCAT(lemma_key,"/",pos_key))),1,8),"/","_"),"+","-")
WHERE true';

bq query --use_legacy_sql=false \
'UPDATE `'$PROJECT_ID'.'$DATASET_ID'.idioms`
SET keyhash = SUBSTR(TO_BASE64(SHA256(poslist_key)),1,8)
SET shortkey = REPLACE(REPLACE(SUBSTR(TO_BASE64(SHA256(poslist_key)),1,8),"/","_"),"+","-")
WHERE true';

bq query --use_legacy_sql=false \
'UPDATE `'$PROJECT_ID'.'$DATASET_ID'.customidioms`
SET keyhash = SUBSTR(TO_BASE64(SHA256(poslist_key)),1,8)
SET shortkey = REPLACE(REPLACE(SUBSTR(TO_BASE64(SHA256(poslist_key)),1,8),"/","_"),"+","-")
WHERE true';

bq query --use_legacy_sql=false \
'UPDATE `'$PROJECT_ID'.'$DATASET_ID'.interjections`
SET keyhash = SUBSTR(TO_BASE64(SHA256(interjection_key)),1,8)
SET shortkey = REPLACE(REPLACE(SUBSTR(TO_BASE64(SHA256(interjection_key)),1,8),"/","_"),"+","-")
WHERE true';

bq query --use_legacy_sql=false \
'UPDATE `'$PROJECT_ID'.'$DATASET_ID'.profanityidioms`
SET keyhash = SUBSTR(TO_BASE64(SHA256(poslist_key)),1,8)
SET shortkey = REPLACE(REPLACE(SUBSTR(TO_BASE64(SHA256(poslist_key)),1,8),"/","_"),"+","-")
WHERE true';

bq query --use_legacy_sql=false \
'UPDATE `'$PROJECT_ID'.'$DATASET_ID'.qualities`
SET keyhash = SUBSTR(TO_BASE64(SHA256(CONCAT(lemma_key,"/",pos_key))),1,8)
SET shortkey = REPLACE(REPLACE(SUBSTR(TO_BASE64(SHA256(CONCAT(lemma_key,"/",pos_key))),1,8),"/","_"),"+","-")
WHERE true';



# Fills the shortkey column of every row of one table.
# shortkey is the first 8 characters of TO_BASE64(SHA256(<key expression>)),
# with "/" replaced by "_" and "+" replaced by "-".
# Globals:   PROJECT_ID (read), DATASET_ID (read)
# Arguments: $1 - table name inside the dataset
#            $2 - SQL expression that yields the full key of a row
update_shortkey() {
  # Single-quoted format: backticks and double quotes are literal SQL here.
  shortkey_sql=$(printf 'UPDATE `%s.%s.%s`\nSET shortkey = REPLACE(REPLACE(SUBSTR(TO_BASE64(SHA256(%s)),1,8),"/","_"),"+","-")\nWHERE true' \
    "$PROJECT_ID" "$DATASET_ID" "$1" "$2")

  bq query --use_legacy_sql=false "$shortkey_sql"
}

update_shortkey bfoverrides 'CONCAT(lemma_key,"/",pos_key)'
update_shortkey emoticons emoticon_key
update_shortkey modifiers parsetype_key
update_shortkey negators 'CONCAT(lemma_key,"/",pos_key)'
update_shortkey patterns pattern_key
update_shortkey prefixes prefix_key
update_shortkey substitutions abbrev_key


66 changes: 66 additions & 0 deletions bigquery/build_views.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@

# Creates (mk) or updates (update) the vwallsignals view, which unions the
# shortkey / full key pairs of the signal tables of the dataset into one view.
#
# Usage:       build_views.sh {mk | update}
# Environment: DATASET_ID - target dataset (default: sirocco_dict_en)

if [ "$#" -eq 0 ]; then
  echo "No arguments supplied" >&2
  exit 1
fi

case "$1" in
  mk | update) ;;
  *)
    echo "Invalid parameter value. Use one of {mk | update}" >&2
    exit 1
    ;;
esac

PROJECT_ID=$(gcloud config list --format 'value(core.project)' 2>/dev/null)
DATASET_ID=${DATASET_ID:-"sirocco_dict_en"}

# Without a project id the table references in the view would be malformed.
if [ -z "$PROJECT_ID" ]; then
  echo "Could not determine the gcloud project (core.project)" >&2
  exit 1
fi

# Prints the view query: one SELECT per signal table, joined by UNION ALL.
# Each row of the list below is "<table>|<SQL expression for the full key>";
# the table name doubles as the signaltype literal.
# Globals: PROJECT_ID (read), DATASET_ID (read)
build_allsignals_query() {
  first_select=yes
  while IFS='|' read -r signal_table key_expr; do
    if [ "$first_select" = yes ]; then
      first_select=no
    else
      printf '\nUNION ALL\n'
    fi
    printf 'SELECT shortkey, %s AS fullkey, "%s" AS signaltype\nFROM `%s.%s.%s`' \
      "$key_expr" "$signal_table" "$PROJECT_ID" "$DATASET_ID" "$signal_table"
  done <<'EOF'
abbreviations|abbrev_key
degreeadverbs|CONCAT(lemma_key,"/",pos_key)
emotions|CONCAT(lemma_key,"/",pos_key)
idioms|poslist_key
customidioms|poslist_key
interjections|interjection_key
profanityidioms|poslist_key
qualities|CONCAT(lemma_key,"/",pos_key)
bfoverrides|CONCAT(lemma_key,"/",pos_key)
emoticons|emoticon_key
modifiers|parsetype_key
negators|CONCAT(lemma_key,"/",pos_key)
patterns|pattern_key
prefixes|prefix_key
substitutions|abbrev_key
EOF
}

VIEW_QUERY=$(build_allsignals_query)

bq "$1" --use_legacy_sql=false --view="$VIEW_QUERY" "${DATASET_ID}.vwallsignals"


2 changes: 1 addition & 1 deletion bigquery/degreeAdverbsSchema.json
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
},
{
"mode": "NULLABLE",
"name": "keyhash",
"name": "shortkey",
"type": "STRING",
"description" : "First 8 characters of TO_BASE64(SHA256()) of all key columns (concatenated with '/'), made URL-safe by replacing '/' with '_' and '+' with '-'"
}
Expand Down
38 changes: 38 additions & 0 deletions bigquery/emoticonsSchema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
[
{
"mode": "REQUIRED",
"name": "emoticon_key",
"type": "STRING"
},
{
"mode": "REQUIRED",
"name": "positive",
"type": "FLOAT"
},
{
"mode": "REQUIRED",
"name": "negative",
"type": "FLOAT"
},
{
"mode": "REQUIRED",
"name": "sis",
"type": "FLOAT"
},
{
"mode": "REQUIRED",
"name": "surprise",
"type": "FLOAT"
},
{
"mode": "REQUIRED",
"name": "requires_closing_tokenizer_flag",
"type": "FLOAT"
},
{
"mode": "NULLABLE",
"name": "shortkey",
"type": "STRING",
"description" : "Url-safe (/ and + replaced with _ and -) variation of base64 algo. Takes first 8 chars of TO_BASE64(SHA256()) of all key columns, concatenated with '/'. Equivalent to org.apache.commons.codec.binary.Base64.encodeBase64URLSafeString"
}
]
2 changes: 1 addition & 1 deletion bigquery/emotionsSchema.json
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@
},
{
"mode": "NULLABLE",
"name": "keyhash",
"name": "shortkey",
"type": "STRING",
"description" : "First 8 characters of TO_BASE64(SHA256()) of all key columns (concatenated with '/'), made URL-safe by replacing '/' with '_' and '+' with '-'"
}
Expand Down
2 changes: 1 addition & 1 deletion bigquery/idiomsSchema.json
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@
},
{
"mode": "NULLABLE",
"name": "keyhash",
"name": "shortkey",
"type": "STRING",
"description" : "First 8 characters of TO_BASE64(SHA256()) of all key columns (concatenated with '/'), made URL-safe by replacing '/' with '_' and '+' with '-'"
}
Expand Down
2 changes: 1 addition & 1 deletion bigquery/interjectionsSchema.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
},
{
"mode": "NULLABLE",
"name": "keyhash",
"name": "shortkey",
"type": "STRING",
"description" : "First 8 characters of TO_BASE64(SHA256()) of all key columns (concatenated with '/'), made URL-safe by replacing '/' with '_' and '+' with '-'"
}
Expand Down
18 changes: 18 additions & 0 deletions bigquery/modifiersSchema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
[
{
"mode": "REQUIRED",
"name": "parsetype_key",
"type": "STRING"
},
{
"mode": "REQUIRED",
"name": "score",
"type": "FLOAT"
},
{
"mode": "NULLABLE",
"name": "shortkey",
"type": "STRING",
"description" : "Url-safe (/ and + replaced with _ and -) variation of base64 algo. Takes first 8 chars of TO_BASE64(SHA256()) of all key columns, concatenated with '/'. Equivalent to org.apache.commons.codec.binary.Base64.encodeBase64URLSafeString"
}
]
23 changes: 23 additions & 0 deletions bigquery/negatorsSchema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
[
{
"mode": "REQUIRED",
"name": "lemma_key",
"type": "STRING"
},
{
"mode": "REQUIRED",
"name": "pos_key",
"type": "STRING"
},
{
"mode": "REQUIRED",
"name": "negation",
"type": "FLOAT"
},
{
"mode": "NULLABLE",
"name": "shortkey",
"type": "STRING",
"description" : "Url-safe (/ and + replaced with _ and -) variation of base64 algo. Takes first 8 chars of TO_BASE64(SHA256()) of all key columns, concatenated with '/'. Equivalent to org.apache.commons.codec.binary.Base64.encodeBase64URLSafeString"
}
]
28 changes: 28 additions & 0 deletions bigquery/patternsSchema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
[
{
"mode": "REQUIRED",
"name": "pattern_key",
"type": "STRING"
},
{
"mode": "REQUIRED",
"name": "regexoption_flag",
"type": "FLOAT"
},
{
"mode": "REQUIRED",
"name": "ishashtag",
"type": "FLOAT"
},
{
"mode": "REQUIRED",
"name": "islink",
"type": "FLOAT"
},
{
"mode": "NULLABLE",
"name": "shortkey",
"type": "STRING",
"description" : "Url-safe (/ and + replaced with _ and -) variation of base64 algo. Takes first 8 chars of TO_BASE64(SHA256()) of all key columns, concatenated with '/'. Equivalent to org.apache.commons.codec.binary.Base64.encodeBase64URLSafeString"
}
]
Loading

0 comments on commit c8c882d

Please sign in to comment.