diff --git a/lang/check-grammar.sh b/lang/check-grammar.sh index aec91e35..6d38b477 100644 --- a/lang/check-grammar.sh +++ b/lang/check-grammar.sh @@ -24,7 +24,7 @@ sed -e 's//\n/g' $b/grammar.xml \ join -a 2 -v 2 $b/defs.txt $b/used.txt >$b/undef.txt test -s $b/undef.txt && echo Undefined references found: `cat $b/undef.txt` 1>&2 join -a 1 -v 1 $b/defs.txt $b/used.txt \ - | grep -v '^TokenWhiteSpace\|module-part$' >$b/unused.txt + | grep -v '^TokenWhiteSpace\|module-part\|RegExp$' >$b/unused.txt test -s $b/unused.txt && echo Unused definitions found: `cat $b/unused.txt` 1>&2 exit 0 diff --git a/lang/lib/regexp.bal b/lang/lib/regexp.bal new file mode 100644 index 00000000..d8f7e00b --- /dev/null +++ b/lang/lib/regexp.bal @@ -0,0 +1,54 @@ +// Copyright (c) 2022 WSO2 Inc. (http://www.wso2.org) All Rights Reserved. +// +// WSO2 Inc. licenses this file to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file except +// in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +# The type RegExp refers to the tagged data basic type with tag `re`. +@builtinSubtype +public type RegExp any; + +public type Span readonly & object { + public int startIndex; + public int endIndex; + // This avoids constructing a potentially long string unless and until it is needed + isolated function substring() returns string; +}; + +public type Groups readonly & [Span, Span?...]; + +# Returns the span of the first match that starts at or after startIndex. +public isolated function find(RegExp re, string str, int startIndex = 0) returns Span? 
= external; + +public isolated function findGroups(RegExp re, string str, int startIndex = 0) returns Groups? = external; + +# Return all non-overlapping matches +public isolated function findAll(RegExp re, string str, int startIndex = 0) returns Span[] = external; +public isolated function findAllGroups(RegExp re, string str, int startIndex = 0) returns Groups[] = external; + +public isolated function matchAt(RegExp re, string str, int startIndex = 0) returns Span? = external; +public isolated function matchGroupsAt(RegExp re, string str, int startIndex = 0) returns Groups? = external; + +# Says whether there is a match of the RegExp that starts at the beginning of the string and ends at the end of the string. +public isolated function isFullMatch(RegExp re, string str) returns boolean = external; +public isolated function fullMatchGroups(RegExp re, string str) returns Groups? = external; + +public type ReplacerFunction function(Groups groups) returns string; +public type Replacement ReplacerFunction|string; + +# Replaces the first occurrence of a regular expression. +public isolated function replace(RegExp re, string str, @isolatedParam Replacement replacement, int startIndex = 0) returns string = external; +# Replaces all occurrences of a regular expression. +public isolated function replaceAll(RegExp re, string str, @isolatedParam Replacement replacement, int startIndex = 0) returns string = external; + +public isolated function fromString(string str) returns RegExp|error = external; diff --git a/lang/spec.html b/lang/spec.html index 5560e9a8..09e48c58 100644 --- a/lang/spec.html +++ b/lang/spec.html @@ -58,7 +58,8 @@

Table of contents

7. Actions and statements

8. Module-level declarations

9. Metadata

-

10. Lang library

+

10. Data tags

+

11. Lang library

A. References

B. Changes since previous versions

C. Planned future functionality

@@ -349,17 +350,18 @@

Type system fundamentals

There is a fundamental distinction between values that have a storage identity and values that do not. A value that has storage identity has an identity that comes from the location where the value is stored. All structural -and behavioural values have a storage identity, whereas all simple values do -not. Storage identity for sequence values is more complicated and will be +and behavioural values have a storage identity, whereas all simple values and tagged data values +do not. Storage identity for sequence values is more complicated and will be explained in the section on sequence values.

@@ -397,13 +399,15 @@

Type system fundamentals

mathematical value being represented.

-A value is plain data if it is a simple value, a sequence value or a +A value is plain data if it is a simple value, a sequence value, +a tagged data value or a structured value that does not contain a behavioral value at any depth. More precisely, a value is defined to be plain data if it is

@@ -744,6 +748,7 @@

Type descriptors

simple-type-descriptor | sequence-type-descriptor | structured-type-descriptor + | tagged-data-type-descriptor | behavioral-type-descriptor | other-type-descriptor @@ -1864,6 +1869,55 @@

Tables

+ +
+

Tagged data values

+ +

+Tagged data types are used for well-known data types that are not +application-specific, but are widely supported across multiple protocols and +programming languages. Each of these data types has its own conventional string +syntax. The use of tagged data types allows programs to work with these data +types using their normal string syntax, while distinguishing the values as +belonging to a specific tagged data type. +

+

+Data tags are defined either by this specification or the Ballerina platform. For +each such data tag, there is a basic type. The definition of a data tag +specifies the set of values belonging to the corresponding basic type. A tagged data +value is a value belonging to one of these basic types. Every tagged data value +is plain data, immutable and has no storage identity. It is also atomic, in the +sense that it contains no references to other values. +

+

+The definition of a data tag defines: +

+
    +
  • the tag; this is an unqualified identifier that is used with the +tagged-data-template-expr to construct a value of the corresponding basic type
  • +
  • the set of values that its corresponding basic type consists of
  • +
  • an abstract function mapping from strings to values
  • +
  • an abstract function mapping from values to strings
  • +
  • a library module; method call expressions on values of the basic type will +call functions in this module
  • +
+

+Most of the functionality of a tagged data type is provided by the library +module. +

+ +
tagged-data-type-descriptor := qualified-identifier
+
+

+The qualified-identifier refers to a type that is defined in a module that is +part of the language library or standard library. +

+

+In this version of the Ballerina language, only the data tag re +is defined, with associated lang library module lang.regexp. +

+

Behavioral values

@@ -2896,9 +2950,17 @@

Anydata type

  () | boolean | int | float | decimal
-    | string | xml
+    | string | xml | regexp:RegExp
     | anydata[] | map<anydata> | table<map<anydata>>
 
+ +

+In the above, regexp refers to the lang library module +lang.regexp. Since there is only one tagged data type defined in +this version of Ballerina, all tagged data values belong to +regexp:RegExp. +

+

JSON types

@@ -3518,6 +3580,16 @@

ToString

<d>t</d> + + tagged data type + + re`xyz` + + `xyz` + + xyz + + array @@ -3940,9 +4012,7 @@

6. Expressions

expr-only := 
    literal
-   | string-template-expr
-   | xml-template-expr
-   | raw-template-expr
+   | template-expr
    | structural-constructor-expr
    | object-constructor-expr
    | new-expr
@@ -4337,8 +4407,7 @@ 

Isolated expressions

  • list-constructor-expr
  • table-constructor-expr
  • mapping-constructor-expr
  • -
  • xml-template-expr
  • -
  • raw-template-expr
  • +
  • template-expr
  • type-cast-expr
  • checking-expr
  • trap-expr
  • @@ -4377,9 +4446,7 @@

    Constant expressions

    const-expr := 
        literal
    -   | string-template-expr
    -   | xml-template-expr
    -   | raw-template-expr
    +   | template-expr
        | structural-constructor-expr
        | constant-reference-expr
        | type-cast-expr
    @@ -4517,6 +4584,14 @@ 

    Template expressions

    Dollar := $
    +
    template-expr := 
    +   string-template-expr
    +   | xml-template-expr
    +   | tagged-data-template-expr
    +   | raw-template-expr
    +
    +

    String template expression

    @@ -4594,6 +4669,19 @@

    XML template expression

    +
    +

    Tagged data template expression

    + +
    tagged-data-template-expr := identifier BacktickString
    +
    +

    +The identifier must be defined as a data tag by this specification or the Ballerina +platform. +

    +
    + +

    Raw template expression

    @@ -10736,8 +10824,123 @@

    Ballerina Flavored Markdown

    + +
    +

    10. Data tags

    -

    10. Lang library

    +

    Regular expressions

    + +

    +The re tag is used for regular expressions. The lang library module +for the corresponding basic type is lang.regexp. +

    + +

    +The syntax of regular expressions is defined by the production RegExp. +

    + +
    RegExp := ReDisjunction
    +
    +ReDisjunction := ReSequence (| ReSequence)*
    +ReSequence := ReTerm*
    +ReTerm :=
    +   ReAtom [ReQuantifier]
    +   | ReAssertion
    +ReAssertion := ^ | $
    +
    +ReQuantifier := ReBaseQuantifier [?]
    +
    +ReBaseQuantifier :=
    +   *
    +  | +
    +  | ?
    +  | { Digit+ [, Digit*] }
    +
    +ReAtom :=
    +   ReLiteralChar
    +   | ReEscape
    +   | .
    +   | [ [^] [ReCharSet] ] 
    +   | ( [? ReFlagsOnOff :] ReDisjunction )
    +
    +ReLiteralChar := ^ ReSyntaxChar
    +
    +ReSyntaxChar :=
    +  ^ | $ | \ | . | * | + | ?
    +  | ( | ) | [ | ] | { | } | |
    +
    +ReEscape :=
    +   NumericEscape
    +   | ControlEscape
    +   | ReQuoteEscape
    +   | ReUnicodePropertyEscape
    +   | ReSimpleCharClassEscape
    +
    +ReQuoteEscape := \ ReSyntaxChar
    +ControlEscape := \r | \n | \t
    +
    +ReSimpleCharClassEscape := \ ReSimpleCharClassCode
    +
    +ReSimpleCharClassCode := d | D | s | S | w | W
    +
    +ReUnicodePropertyEscape := \ (p | P) {ReUnicodeProperty }
    +
    +ReUnicodeProperty :=  ReUnicodeScript | ReUnicodeGeneralCategory
    +
    +ReUnicodeScript := sc= ReUnicodePropertyValue
    +
    +ReUnicodePropertyValue := ReUnicodePropertyValueChar+
    +
    +ReUnicodePropertyValueChar := AsciiLetter | Digit | _
    +
    +ReUnicodeGeneralCategory := [gc=] ReUnicodeGeneralCategoryName
    +
    +ReCharSetAtom :=
    +  ReCharSetAtomNoDash
    +  | -
    +
    +ReCharSetAtomNoDash :=
    +  ReCharSetLiteralChar
    +  | ReEscape
    +  | \-
    +
    +ReCharSetLiteralChar := ^ (\ | ] | -)
    +
    +ReCharSet :=
    +  ReCharSetAtom
    +  | ReCharSetRange [ReCharSet]
    +  | ReCharSetAtom ReCharSetNoDash
    +
    +ReCharSetRange := ReCharSetAtom - ReCharSetAtom
    +
    +ReCharSetNoDash :=
    +  ReCharSetAtom
    +  | ReCharSetRangeNoDash [ReCharSet]
    +  | ReCharSetAtomNoDash ReCharSetNoDash
    +  
    +ReCharSetRangeNoDash := ReCharSetAtomNoDash - ReCharSetAtom
    +
    +ReFlagsOnOff := ReFlags [- ReFlags]
    +ReFlags := ReFlag*
    +ReFlag :=
    +  ReMultilineFlag
    +  | ReDotAllFlag
    +  | ReIgnoreCaseFlag
    +  | ReCommentFlag
    +
    +ReMultilineFlag := m
    +ReDotAllFlag := s
    +ReIgnoreCaseFlag := i
    +ReCommentFlag := x
    +
    + +
    + +
    + +
    +

    11. Lang library

    Modules in the ballerina organization with a module name starting @@ -10970,6 +11173,16 @@

    lang.object module

    +
    +

    lang.regexp module

    + +

    The lang.regexp module corresponds to the tagged data type with tag +re.

    + +

    regexp.bal

    + +
    +

    lang.stream module