From 025798c62d768404fb5453ef6e48bf4d8cb17b87 Mon Sep 17 00:00:00 2001 From: Joshua Send Date: Tue, 16 Jan 2024 11:31:46 +0000 Subject: [PATCH] Implement non-ascii variables in Java and Rust (#310) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What is the goal of this PR? We update to TypeQL with Unicode support in both value and concept variables. This makes the following valid TypeQL: ``` match $人 isa person, has name "Liu"; get $人; ``` ``` match $אדם isa person, has name "Solomon"; get $אדם; ``` We now require all Labels and Variables are valid unicode identifiers not starting with `_`. This change is fully backwards compatible. We also validate that Type Labels and Variables created using the TypeQL language builders in both Rust and Java are conforming to our Unicode specification. ## What are the changes implemented in this PR? - Refactor variable names to allow the same range of characters as Labels - which is unicode identifier excluding a leading `_` - Add language builder validation for the strings provided to Labels or Variable names - Update typedb-behaviour --- dependencies/vaticle/repositories.bzl | 2 +- grammar/TypeQL.g4 | 10 ++-- java/TypeQL.java | 6 +-- java/common/Reference.java | 36 +++++++++++-- java/common/exception/ErrorMessage.java | 6 +-- java/parser/test/ParserTest.java | 4 +- java/pattern/constraint/ThingConstraint.java | 6 ++- java/query/TypeQLFetch.java | 2 +- java/query/test/TypeQLQueryTest.java | 2 +- java/test/behaviour/typeql/TypeQLSteps.java | 1 + rust/common/error/mod.rs | 6 +-- rust/common/identifier.rs | 55 ++++++++++++++++++++ rust/common/mod.rs | 1 + rust/parser/mod.rs | 2 +- rust/parser/test/mod.rs | 28 ++++++++++ rust/parser/typeql.pest | 9 ++-- rust/pattern/label.rs | 20 ++++++- rust/query/typeql_fetch.rs | 6 +-- rust/tests/behaviour/steps.rs | 1 + rust/variable/type_reference.rs | 2 +- rust/variable/variable.rs | 10 +--- 21 files changed, 174 insertions(+), 41 deletions(-) create mode 100644 rust/common/identifier.rs diff --git a/dependencies/vaticle/repositories.bzl b/dependencies/vaticle/repositories.bzl index de063310d..28f35c5b1 100644 --- a/dependencies/vaticle/repositories.bzl +++ b/dependencies/vaticle/repositories.bzl @@ -35,5 +35,5 @@ def vaticle_typedb_behaviour(): git_repository( name = "vaticle_typedb_behaviour", remote = "https://github.com/vaticle/typedb-behaviour", - commit = "c96725f416eabd6ba9e8bf5a6218ad867e17d302", # sync-marker: do not remove this comment, this is used for sync-dependencies by @vaticle_typedb_behaviour + commit = "e770b696b04138488894b636c61f08eaf030b45d", # sync-marker: do not remove this comment, this is used for sync-dependencies by @vaticle_typedb_behaviour ) diff --git a/grammar/TypeQL.g4 b/grammar/TypeQL.g4 index 94bfd917c..35b731ef7 100644 --- a/grammar/TypeQL.g4 +++ b/grammar/TypeQL.g4 @@ -332,11 +332,11 @@ DATETIME_ : DATE_FRAGMENT_ 'T' TIME_ ; VAR_CONCEPT_ : VAR_CONCEPT_ANONYMOUS_ | VAR_CONCEPT_NAMED_ ; VAR_CONCEPT_ANONYMOUS_ : '$_' ; -VAR_CONCEPT_NAMED_ : '$' [a-zA-Z0-9][a-zA-Z0-9_-]* ; -VAR_VALUE_ : '?' [a-zA-Z0-9][a-zA-Z0-9_-]* ; -IID_ : '0x' [0-9a-f]+ ; -LABEL_ : TYPE_CHAR_H_ TYPE_CHAR_T_* ; -LABEL_SCOPED_ : LABEL_ ':' LABEL_ ; +VAR_CONCEPT_NAMED_ : '$' TYPE_CHAR_H_ TYPE_CHAR_T_* ; +VAR_VALUE_ : '?' TYPE_CHAR_H_ TYPE_CHAR_T_* ; +IID_ : '0x' [0-9a-f]+ ; +LABEL_ : TYPE_CHAR_H_ TYPE_CHAR_T_* ; +LABEL_SCOPED_ : LABEL_ ':' LABEL_ ; // FRAGMENTS OF KEYWORDS ======================================================= diff --git a/java/TypeQL.java b/java/TypeQL.java index bb16a87df..4085a6f0f 100644 --- a/java/TypeQL.java +++ b/java/TypeQL.java @@ -58,7 +58,7 @@ import static com.vaticle.typeql.lang.common.TypeQLToken.Predicate.Equality.NEQ; import static com.vaticle.typeql.lang.common.TypeQLToken.Predicate.SubString.CONTAINS; import static com.vaticle.typeql.lang.common.TypeQLToken.Predicate.SubString.LIKE; -import static com.vaticle.typeql.lang.common.exception.ErrorMessage.ILLEGAL_CHAR_IN_LABEL; +import static com.vaticle.typeql.lang.common.exception.ErrorMessage.INVALID_TYPE_LABEL; public class TypeQL { @@ -97,10 +97,10 @@ public static String parseLabel(String label) { try { parsedLabel = parser.parseLabelEOF(label); } catch (TypeQLException e) { - throw TypeQLException.of(ILLEGAL_CHAR_IN_LABEL.message(label)); + throw TypeQLException.of(INVALID_TYPE_LABEL.message(label)); } if (!parsedLabel.equals(label)) - throw TypeQLException.of(ILLEGAL_CHAR_IN_LABEL.message(label)); // e.g: 'abc#123' + throw TypeQLException.of(INVALID_TYPE_LABEL.message(label)); // e.g: 'abc#123' return parsedLabel; } diff --git a/java/common/Reference.java b/java/common/Reference.java index dbb29efef..e02a4226c 100644 --- a/java/common/Reference.java +++ b/java/common/Reference.java @@ -31,10 +31,35 @@ import static com.vaticle.typedb.common.util.Objects.className; import static com.vaticle.typeql.lang.common.TypeQLToken.Char.COLON; import static com.vaticle.typeql.lang.common.exception.ErrorMessage.INVALID_CASTING; +import static com.vaticle.typeql.lang.common.exception.ErrorMessage.INVALID_TYPE_LABEL; import static com.vaticle.typeql.lang.common.exception.ErrorMessage.INVALID_VARIABLE_NAME; public abstract class Reference { + private static final String IDENTIFIER_START = "A-Za-z" + + "\\u00C0-\\u00D6" + + "\\u00D8-\\u00F6" + + "\\u00F8-\\u02FF" + + "\\u0370-\\u037D" + + "\\u037F-\\u1FFF" + + "\\u200C-\\u200D" + + "\\u2070-\\u218F" + + "\\u2C00-\\u2FEF" + + "\\u3001-\\uD7FF" + + "\\uF900-\\uFDCF" + + "\\uFDF0-\\uFFFD"; + private static final String IDENTIFIER_TAIL = IDENTIFIER_START + + "0-9" + + "_" + + "\\-" + + "\\u00B7" + + "\\u0300-\\u036F" + + "\\u203F-\\u2040"; + + public static final Pattern IDENTIFIER_REGEX = Pattern.compile( + "^[" + IDENTIFIER_START + "][" + IDENTIFIER_TAIL + "]*$" + ); + final Type type; final boolean isVisible; @@ -125,14 +150,14 @@ public String toString() { public static abstract class Name extends Reference { - public static final Pattern REGEX = Pattern.compile("[a-zA-Z0-9][a-zA-Z0-9_-]*"); final String name; private final int hash; Name(Type type, String name, boolean isVisible) { super(type, isVisible); - if (!REGEX.matcher(name).matches()) - throw TypeQLException.of(INVALID_VARIABLE_NAME.message(name, REGEX.pattern())); + if (!IDENTIFIER_REGEX.matcher(name).matches()) { + throw TypeQLException.of(INVALID_VARIABLE_NAME.message(name)); + } this.name = name; this.hash = Objects.hash(this.type, this.isVisible, this.name); } @@ -216,6 +241,11 @@ public static class Label extends Reference { Label(String label, @Nullable String scope) { super(Type.LABEL, false); + if (!IDENTIFIER_REGEX.matcher(label).matches()) { + throw TypeQLException.of(INVALID_TYPE_LABEL.message(label)); + } else if (scope != null && !IDENTIFIER_REGEX.matcher(scope).matches()) { + throw TypeQLException.of(INVALID_TYPE_LABEL.message(scope)); + } this.label = label; this.scope = scope; this.hash = Objects.hash(this.type, this.isVisible, this.label, this.scope); diff --git a/java/common/exception/ErrorMessage.java b/java/common/exception/ErrorMessage.java index f42d984be..af3f2b3ad 100644 --- a/java/common/exception/ErrorMessage.java +++ b/java/common/exception/ErrorMessage.java @@ -62,7 +62,7 @@ public class ErrorMessage extends com.vaticle.typedb.common.exception.ErrorMessa public static final ErrorMessage FILTER_VARIABLE_ANONYMOUS = new ErrorMessage(19, "Anonymous variable encountered in the query filter."); public static final ErrorMessage INVALID_VARIABLE_NAME = - new ErrorMessage(20, "The variable name '%s' is invalid; variables must match the following regular expression: '%s'."); + new ErrorMessage(20, "Invalid variable name '%s'. Variables must be valid unicode identifiers."); public static final ErrorMessage ILLEGAL_CONSTRAINT_REPETITION = new ErrorMessage(21, "The variable '%s' contains illegally repeating constraints: '%s' and '%s'."); public static final ErrorMessage MISSING_CONSTRAINT_RELATION_PLAYER = @@ -103,8 +103,8 @@ public class ErrorMessage extends com.vaticle.typedb.common.exception.ErrorMessa new ErrorMessage(40, "Aggregate COUNT does not accept a Variable."); public static final ErrorMessage ILLEGAL_GRAMMAR = new ErrorMessage(41, "Illegal grammar: '%s'"); - public static final ErrorMessage ILLEGAL_CHAR_IN_LABEL = - new ErrorMessage(42, "'%s' is not a valid Type label. Type labels must start with a letter, and may contain only letters, numbers, '-' and '_'."); + public static final ErrorMessage INVALID_TYPE_LABEL = + new ErrorMessage(42, "The type label '%s' is invalid. Type labels must be valid unicode identifiers."); public static final ErrorMessage INVALID_ANNOTATION = new ErrorMessage(43, "Invalid annotation '%s' on '%s' constraint"); diff --git a/java/parser/test/ParserTest.java b/java/parser/test/ParserTest.java index 74c730947..e91d7bef3 100644 --- a/java/parser/test/ParserTest.java +++ b/java/parser/test/ParserTest.java @@ -32,8 +32,8 @@ import com.vaticle.typeql.lang.query.TypeQLDefine; import com.vaticle.typeql.lang.query.TypeQLDelete; import com.vaticle.typeql.lang.query.TypeQLFetch; -import com.vaticle.typeql.lang.query.TypeQLInsert; import com.vaticle.typeql.lang.query.TypeQLGet; +import com.vaticle.typeql.lang.query.TypeQLInsert; import com.vaticle.typeql.lang.query.TypeQLQuery; import com.vaticle.typeql.lang.query.TypeQLUndefine; import com.vaticle.typeql.lang.query.TypeQLUpdate; @@ -52,9 +52,9 @@ import static com.vaticle.typeql.lang.TypeQL.and; import static com.vaticle.typeql.lang.TypeQL.cVar; import static com.vaticle.typeql.lang.TypeQL.define; -import static com.vaticle.typeql.lang.TypeQL.label; import static com.vaticle.typeql.lang.TypeQL.gte; import static com.vaticle.typeql.lang.TypeQL.insert; +import static com.vaticle.typeql.lang.TypeQL.label; import static com.vaticle.typeql.lang.TypeQL.lt; import static com.vaticle.typeql.lang.TypeQL.lte; import static com.vaticle.typeql.lang.TypeQL.match; diff --git a/java/pattern/constraint/ThingConstraint.java b/java/pattern/constraint/ThingConstraint.java index e4cc1ee81..728810402 100644 --- a/java/pattern/constraint/ThingConstraint.java +++ b/java/pattern/constraint/ThingConstraint.java @@ -468,10 +468,14 @@ public String toString() { return HAS.toString() + SPACE + (type != null ? type + SPACE : "") + attribute.first(); + } else if (attribute.second().predicate().isPresent()) { + return HAS.toString() + SPACE + + (type != null ? type + SPACE : "") + + attribute.second().predicate().get(); } else { return HAS.toString() + SPACE + (type != null ? type + SPACE : "") + - (attribute.second().headVariable().isNamed() ? attribute.second().headVariable() : attribute.second().predicate().get()); + attribute.second().headVariable(); } } diff --git a/java/query/TypeQLFetch.java b/java/query/TypeQLFetch.java index 6a094c56a..614e53f53 100644 --- a/java/query/TypeQLFetch.java +++ b/java/query/TypeQLFetch.java @@ -330,7 +330,7 @@ private Label(Either quotedOrUnquoted) { } public static Label of(String label) { - if (Reference.Name.REGEX.matcher(label).matches()) { + if (Reference.IDENTIFIER_REGEX.matcher(label).matches()) { return unquoted(label); } else { return quoted(label); diff --git a/java/query/test/TypeQLQueryTest.java b/java/query/test/TypeQLQueryTest.java index b22a1958d..c598d5ee9 100644 --- a/java/query/test/TypeQLQueryTest.java +++ b/java/query/test/TypeQLQueryTest.java @@ -199,7 +199,7 @@ public void testFetchBuilder() { cVar("x").map("name").map("age").map("email"), label("children").map( match( - rel(cVar("c")).rel(cVar("x")).isa("parenthood)") + rel(cVar("c")).rel(cVar("x")).isa("parenthood") ).fetch( cVar("c").map("name") ) diff --git a/java/test/behaviour/typeql/TypeQLSteps.java b/java/test/behaviour/typeql/TypeQLSteps.java index d75e2047b..45a1f5423 100644 --- a/java/test/behaviour/typeql/TypeQLSteps.java +++ b/java/test/behaviour/typeql/TypeQLSteps.java @@ -118,6 +118,7 @@ public void do_nothing_with_throws(String query) { } @Given("typeql get; throws exception containing {string}") + @Given("typeql delete; throws exception containing {string}") public void do_nothing_with_throws_exception_containing(String exception, String query) { } diff --git a/rust/common/error/mod.rs b/rust/common/error/mod.rs index c5546d7e9..7aba7f25e 100644 --- a/rust/common/error/mod.rs +++ b/rust/common/error/mod.rs @@ -137,7 +137,7 @@ error_messages! { TypeQLError VariableNotNamed = 21: "Anonymous variable encountered in a match query filter.", InvalidVariableName { name: String } = - 22: "The variable name '{name}' is invalid; variables must match the following regular expression: '^[a-zA-Z0-9][a-zA-Z0-9_-]+$'.", + 22: "The variable name '{name}' is invalid. Variables must be valid utf-8 identifiers without a leading underscore.", MissingConstraintRelationPlayer = 23: "A relation variable has not been provided with role players.", InvalidConstraintPredicate { predicate: token::Predicate, value: Value } = @@ -168,6 +168,6 @@ error_messages! { TypeQLError 36: "Aggregate COUNT does not accept a Variable.", IllegalGrammar { input: String } = 37: "Illegal grammar: '{input}'", - IllegalCharInLabel { input: String } = - 38: "'{input}' is not a valid Type label. Type labels must start with a letter, and may contain only letters, numbers, '-' and '_'.", + InvalidTypeLabel { label: String } = + 38: "The type label '{label}' is invalid. Type labels must be valid utf-8 identifiers without a leading underscore.", } diff --git a/rust/common/identifier.rs b/rust/common/identifier.rs new file mode 100644 index 000000000..bfef540b4 --- /dev/null +++ b/rust/common/identifier.rs @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2022 Vaticle + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +use std::sync::OnceLock; + +use regex::{Regex, RegexBuilder}; + +pub fn is_valid_identifier(identifier: &str) -> bool { + static REGEX: OnceLock = OnceLock::new(); + let regex = REGEX.get_or_init(|| { + let identifier_start = "A-Za-z\ + \\u00C0-\\u00D6\ + \\u00D8-\\u00F6\ + \\u00F8-\\u02FF\ + \\u0370-\\u037D\ + \\u037F-\\u1FFF\ + \\u200C-\\u200D\ + \\u2070-\\u218F\ + \\u2C00-\\u2FEF\ + \\u3001-\\uD7FF\ + \\uF900-\\uFDCF\ + \\uFDF0-\\uFFFD"; + let identifier_tail = format!( + "{}\ + 0-9\ + _\ + \\-\ + \\u00B7\ + \\u0300-\\u036F\ + \\u203F-\\u2040", + identifier_start + ); + let identifier_pattern = format!("^[{}][{}]*$", identifier_start, identifier_tail); + RegexBuilder::new(&identifier_pattern).build().unwrap() + }); + regex.is_match(identifier) +} diff --git a/rust/common/mod.rs b/rust/common/mod.rs index 78c365fa8..d83422749 100644 --- a/rust/common/mod.rs +++ b/rust/common/mod.rs @@ -22,6 +22,7 @@ pub mod date_time; pub mod error; +pub(crate) mod identifier; pub mod string; pub mod token; pub mod validatable; diff --git a/rust/parser/mod.rs b/rust/parser/mod.rs index dd108109e..9fed63cd2 100644 --- a/rust/parser/mod.rs +++ b/rust/parser/mod.rs @@ -173,7 +173,7 @@ pub(crate) fn visit_eof_label(label: &str) -> Result