-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
BigQuery dataset: Added vwallsignals view, aggregating all signals into one view. BigQuery dataset: Added tables for emoticons, modifiers, negators, patterns, prefixes, substitutions. csdict: fixed data bug in custom-idioms.csv, added ADJP_ADVP_chunk signal to modifiers file, renamed substitutions file.
- Loading branch information
1 parent
4b4c399
commit c8c882d
Showing
23 changed files
with
341 additions
and
25 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
[ | ||
{ | ||
"mode": "REQUIRED", | ||
"name": "lemma_key", | ||
"type": "STRING" | ||
}, | ||
{ | ||
"mode": "REQUIRED", | ||
"name": "pos_key", | ||
"type": "STRING" | ||
}, | ||
{ | ||
"mode": "REQUIRED", | ||
"name": "baseform", | ||
"type": "STRING" | ||
}, | ||
{ | ||
"mode": "NULLABLE", | ||
"name": "shortkey", | ||
"type": "STRING", | ||
"description" : "Url-safe (/ and + replaced with _ and -) variation of base64 algo. Takes first 8 chars of TO_BASE64(SHA256()) of all key columns, concatenated with '/'. Equivalent to org.apache.commons.codec.binary.Base64.encodeBase64URLSafeString" | ||
} | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
|
||
# Creates or updates the vwallsignals view, which unions the shortkey and the
# full key of every signal table in the dictionary dataset into one view,
# tagging each row with the name of the table it came from (signaltype).
#
# Usage: pass exactly one of {mk | update} as the first argument; it is
# forwarded to bq as the sub-command (mk creates the view, update modifies an
# existing one).
#
# Environment:
#   DATASET_ID  dataset holding the signal tables and the view
#               (default: sirocco_dict_en)

if [ $# -eq 0 ]
then
  echo "No arguments supplied" >&2
  exit 1
fi

if [ "$1" != "mk" ] && [ "$1" != "update" ]
then
  echo "Invalid parameter value. Use one of {mk | update}" >&2
  exit 1
fi

PROJECT_ID=$(gcloud config list --format 'value(core.project)' 2>/dev/null)
DATASET_ID=${DATASET_ID:-"sirocco_dict_en"}

# Without a project the fully qualified table names below would start with a
# bare dot and bq would fail with a hard-to-read query error, so stop early.
if [ -z "$PROJECT_ID" ]
then
  echo "No default gcloud project configured. Run: gcloud config set project <PROJECT_ID>" >&2
  exit 1
fi

# Common `project.dataset` prefix of every table referenced by the view.
TABLE_PREFIX="$PROJECT_ID.$DATASET_ID"

# The SQL is kept in single quotes so the backticks around table names are not
# treated as command substitution; only TABLE_PREFIX is spliced in (quoted).
# Tables with a composite key expose it as "lemma_key/pos_key".
# NOTE(review): substitutions selects abbrev_key, the same key column as
# abbreviations -- confirm against the substitutions table schema.
VIEW_QUERY='SELECT shortkey, abbrev_key AS fullkey, "abbreviations" AS signaltype
FROM `'"$TABLE_PREFIX"'.abbreviations`
UNION ALL
SELECT shortkey, CONCAT(lemma_key,"/",pos_key) AS fullkey, "degreeadverbs" AS signaltype
FROM `'"$TABLE_PREFIX"'.degreeadverbs`
UNION ALL
SELECT shortkey, CONCAT(lemma_key,"/",pos_key) AS fullkey, "emotions" AS signaltype
FROM `'"$TABLE_PREFIX"'.emotions`
UNION ALL
SELECT shortkey, poslist_key AS fullkey, "idioms" AS signaltype
FROM `'"$TABLE_PREFIX"'.idioms`
UNION ALL
SELECT shortkey, poslist_key AS fullkey, "customidioms" AS signaltype
FROM `'"$TABLE_PREFIX"'.customidioms`
UNION ALL
SELECT shortkey, interjection_key AS fullkey, "interjections" AS signaltype
FROM `'"$TABLE_PREFIX"'.interjections`
UNION ALL
SELECT shortkey, poslist_key AS fullkey, "profanityidioms" AS signaltype
FROM `'"$TABLE_PREFIX"'.profanityidioms`
UNION ALL
SELECT shortkey, CONCAT(lemma_key,"/",pos_key) AS fullkey, "qualities" AS signaltype
FROM `'"$TABLE_PREFIX"'.qualities`
UNION ALL
SELECT shortkey, CONCAT(lemma_key,"/",pos_key) AS fullkey, "bfoverrides" AS signaltype
FROM `'"$TABLE_PREFIX"'.bfoverrides`
UNION ALL
SELECT shortkey, emoticon_key AS fullkey, "emoticons" AS signaltype
FROM `'"$TABLE_PREFIX"'.emoticons`
UNION ALL
SELECT shortkey, parsetype_key AS fullkey, "modifiers" AS signaltype
FROM `'"$TABLE_PREFIX"'.modifiers`
UNION ALL
SELECT shortkey, CONCAT(lemma_key,"/",pos_key) AS fullkey, "negators" AS signaltype
FROM `'"$TABLE_PREFIX"'.negators`
UNION ALL
SELECT shortkey, pattern_key AS fullkey, "patterns" AS signaltype
FROM `'"$TABLE_PREFIX"'.patterns`
UNION ALL
SELECT shortkey, prefix_key AS fullkey, "prefixes" AS signaltype
FROM `'"$TABLE_PREFIX"'.prefixes`
UNION ALL
SELECT shortkey, abbrev_key AS fullkey, "substitutions" AS signaltype
FROM `'"$TABLE_PREFIX"'.substitutions`'

bq "$1" --use_legacy_sql=false --view="$VIEW_QUERY" "$DATASET_ID.vwallsignals"
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
[ | ||
{ | ||
"mode": "REQUIRED", | ||
"name": "emoticon_key", | ||
"type": "STRING" | ||
}, | ||
{ | ||
"mode": "REQUIRED", | ||
"name": "positive", | ||
"type": "FLOAT" | ||
}, | ||
{ | ||
"mode": "REQUIRED", | ||
"name": "negative", | ||
"type": "FLOAT" | ||
}, | ||
{ | ||
"mode": "REQUIRED", | ||
"name": "sis", | ||
"type": "FLOAT" | ||
}, | ||
{ | ||
"mode": "REQUIRED", | ||
"name": "surprise", | ||
"type": "FLOAT" | ||
}, | ||
{ | ||
"mode": "REQUIRED", | ||
"name": "requires_closing_tokenizer_flag", | ||
"type": "FLOAT" | ||
}, | ||
{ | ||
"mode": "NULLABLE", | ||
"name": "shortkey", | ||
"type": "STRING", | ||
"description" : "Url-safe (/ and + replaced with _ and -) variation of base64 algo. Takes first 8 chars of TO_BASE64(SHA256()) of all key columns, concatenated with '/'. Equivalent to org.apache.commons.codec.binary.Base64.encodeBase64URLSafeString" | ||
} | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
[ | ||
{ | ||
"mode": "REQUIRED", | ||
"name": "parsetype_key", | ||
"type": "STRING" | ||
}, | ||
{ | ||
"mode": "REQUIRED", | ||
"name": "score", | ||
"type": "FLOAT" | ||
}, | ||
{ | ||
"mode": "NULLABLE", | ||
"name": "shortkey", | ||
"type": "STRING", | ||
"description" : "Url-safe (/ and + replaced with _ and -) variation of base64 algo. Takes first 8 chars of TO_BASE64(SHA256()) of all key columns, concatenated with '/'. Equivalent to org.apache.commons.codec.binary.Base64.encodeBase64URLSafeString" | ||
} | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
[ | ||
{ | ||
"mode": "REQUIRED", | ||
"name": "lemma_key", | ||
"type": "STRING" | ||
}, | ||
{ | ||
"mode": "REQUIRED", | ||
"name": "pos_key", | ||
"type": "STRING" | ||
}, | ||
{ | ||
"mode": "REQUIRED", | ||
"name": "negation", | ||
"type": "FLOAT" | ||
}, | ||
{ | ||
"mode": "NULLABLE", | ||
"name": "shortkey", | ||
"type": "STRING", | ||
"description" : "Url-safe (/ and + replaced with _ and -) variation of base64 algo. Takes first 8 chars of TO_BASE64(SHA256()) of all key columns, concatenated with '/'. Equivalent to org.apache.commons.codec.binary.Base64.encodeBase64URLSafeString" | ||
} | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
[ | ||
{ | ||
"mode": "REQUIRED", | ||
"name": "pattern_key", | ||
"type": "STRING" | ||
}, | ||
{ | ||
"mode": "REQUIRED", | ||
"name": "regexoption_flag", | ||
"type": "FLOAT" | ||
}, | ||
{ | ||
"mode": "REQUIRED", | ||
"name": "ishashtag", | ||
"type": "FLOAT" | ||
}, | ||
{ | ||
"mode": "REQUIRED", | ||
"name": "islink", | ||
"type": "FLOAT" | ||
}, | ||
{ | ||
"mode": "NULLABLE", | ||
"name": "shortkey", | ||
"type": "STRING", | ||
"description" : "Url-safe (/ and + replaced with _ and -) variation of base64 algo. Takes first 8 chars of TO_BASE64(SHA256()) of all key columns, concatenated with '/'. Equivalent to org.apache.commons.codec.binary.Base64.encodeBase64URLSafeString" | ||
} | ||
] |
Oops, something went wrong.