diff --git a/lang/check-grammar.sh b/lang/check-grammar.sh
index aec91e35..6d38b477 100644
--- a/lang/check-grammar.sh
+++ b/lang/check-grammar.sh
@@ -24,7 +24,7 @@ sed -e 's//\n/g' $b/grammar.xml \
join -a 2 -v 2 $b/defs.txt $b/used.txt >$b/undef.txt
test -s $b/undef.txt && echo Undefined references found: `cat $b/undef.txt` 1>&2
join -a 1 -v 1 $b/defs.txt $b/used.txt \
- | grep -v '^TokenWhiteSpace\|module-part$' >$b/unused.txt
+ | grep -v '^TokenWhiteSpace\|module-part$\|^RegExp$' >$b/unused.txt
test -s $b/unused.txt && echo Unused definitions found: `cat $b/unused.txt` 1>&2
exit 0
diff --git a/lang/lib/regexp.bal b/lang/lib/regexp.bal
new file mode 100644
index 00000000..d8f7e00b
--- /dev/null
+++ b/lang/lib/regexp.bal
@@ -0,0 +1,54 @@
+// Copyright (c) 2022 WSO2 Inc. (http://www.wso2.org) All Rights Reserved.
+//
+// WSO2 Inc. licenses this file to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file except
+// in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+# The type RegExp refers to the tagged data basic type with tag `re`.
+@builtinSubtype
+public type RegExp any;
+
+public type Span readonly & object {
+ public int startIndex;
+ public int endIndex;
+ // Exposed as a method, not a field: this avoids constructing a potentially long string unless and until it is needed
+ public isolated function substring() returns string;
+};
+
+public type Groups readonly & [Span, Span?...];
+
+# Returns the span of the first match that starts at or after startIndex.
+public isolated function find(RegExp re, string str, int startIndex = 0) returns Span? = external;
+
+public isolated function findGroups(RegExp re, string str, int startIndex = 0) returns Groups? = external;
+
+# Returns all non-overlapping matches.
+public isolated function findAll(RegExp re, string str, int startIndex = 0) returns Span[] = external;
+public isolated function findAllGroups(RegExp re, string str, int startIndex = 0) returns Groups[] = external;
+
+public isolated function matchAt(RegExp re, string str, int startIndex = 0) returns Span? = external;
+public isolated function matchGroupsAt(RegExp re, string str, int startIndex = 0) returns Groups? = external;
+
+# Says whether there is a match of the RegExp that starts at the beginning of the string and ends at the end of the string.
+public isolated function isFullMatch(RegExp re, string str) returns boolean = external;
+public isolated function fullMatchGroups(RegExp re, string str) returns Groups? = external;
+
+public type ReplacerFunction function(Groups groups) returns string;
+public type Replacement ReplacerFunction|string;
+
+# Replaces the first occurrence of a regular expression.
+public isolated function replace(RegExp re, string str, @isolatedParam Replacement replacement, int startIndex = 0) returns string = external;
+# Replaces all occurrences of a regular expression.
+public isolated function replaceAll(RegExp re, string str, @isolatedParam Replacement replacement, int startIndex = 0) returns string = external;
+
+public isolated function fromString(string str) returns RegExp|error = external;
diff --git a/lang/spec.html b/lang/spec.html
index 5560e9a8..09e48c58 100644
--- a/lang/spec.html
+++ b/lang/spec.html
@@ -58,7 +58,8 @@ B. Changes since previous versions C. Planned future functionality
There is a fundamental distinction between values that have a storage
identity and values that do not. A value that has storage identity has an
identity that comes from the location where the value is stored. All structural
-and behavioural values have a storage identity, whereas all simple values do
-not. Storage identity for sequence values is more complicated and will be
+and behavioural values have a storage identity, whereas all simple values and tagged data values
+do not. Storage identity for sequence values is more complicated and will be
explained in the section on sequence values.
@@ -397,13 +399,15 @@ Table of contents
-
+
+
Type system fundamentals
Type system fundamentals
mathematical value being represented.
-A value is plain data if it is a simple value, a sequence value or a +A value is plain data if it is a simple value, a sequence value, +a tagged data value or a structured value that does not contain a behavioral value at any depth. More precisely, a value is defined to be plain data if it is
@@ -744,6 +748,7 @@
+Tagged data types are used for well-known data types that are not +application-specific, but are widely supported across multiple protocols and +programming languages. Each of these data types has its own conventional string +syntax. The use of tagged data types allows programs to work with these data +types using their normal string syntax, while distinguishing the values as +belonging to a specific tagged data type. +
++Data tags are defined either by this specification or the Ballerina platform. For +each such data tag, there is a basic type. The definition of a data tag +specifies the set of values belonging to the corresponding basic type. A tagged data +value is a value belonging to one of these basic types. Every tagged data value +is plain data, immutable and has no storage identity. It is also atomic, in the +sense that it contains no references to other values. +
++The definition of a data tag defines: +
++Most of the functionality of a tagged data type is provided by the library +module. +
+ +tagged-data-type-descriptor := qualified-identifier ++
+The qualified-identifier refers to a type that is defined in a module that is +part of the language library or standard library. +
+
+In this version of the Ballerina language, only the data tag re
+is defined, with associated lang library module lang.regexp
.
+
() | boolean | int | float | decimal - | string | xml + | string | xml | regexp:RegExp | anydata[] | map<anydata> | table<map<anydata>>+ +
+In the above, regexp
refers to the lang library module
+lang.regexp
. Since there is only one tagged data type defined in
+this version of Ballerina, all tagged data values belong to
+regexp:RegExp
.
+
<d>t</d>
re`xyz`
+ `xyz`
+ xyz
+ expr-only := literal - | string-template-expr - | xml-template-expr - | raw-template-expr + | template-expr | structural-constructor-expr | object-constructor-expr | new-expr @@ -4337,8 +4407,7 @@Isolated expressions
list-constructor-expr
table-constructor-expr
mapping-constructor-expr
xml-template-expr
raw-template-expr
template-expr
type-cast-expr
checking-expr
trap-expr
const-expr := literal - | string-template-expr - | xml-template-expr - | raw-template-expr + | template-expr | structural-constructor-expr | constant-reference-expr | type-cast-expr @@ -4517,6 +4584,14 @@+Template expressions
Dollar :=$
template-expr := + string-template-expr + | xml-template-expr + | tagged-data-template-expr + | raw-template-expr ++
tagged-data-template-expr := identifier BacktickString ++
+The identifier must be defined as a data tag by this specification or the Ballerina +platform. +
+
+The re
tag is used for regular expressions. The lang library module
+for the corresponding basic type is lang.regexp
.
+
+The syntax of regular expressions is defined by the production RegExp
.
+
RegExp := ReDisjunction + +ReDisjunction := ReSequence (+ +|
ReSequence)* +ReSequence := ReTerm* +ReTerm := + ReAtom [ReQuantifier] + | ReAssertion +ReAssertion :=^
|$
+ +ReQuantifier := ReBaseQuantifier [?
] + +ReBaseQuantifier := +*
+ |+
+ |?
+ |{
Digit+ [,
Digit*]}
+ +ReAtom := + ReLiteralChar + | ReEscape + |.
+ |[
[^
] [ReCharSet]]
+ |(
[?
ReFlagsOnOff:
] ReDisjunction)
+ +ReLiteralChar := ^ ReSyntaxChar + +ReSyntaxChar := +^
|$
|\
|.
|*
|+
|?
+ |(
|)
|[
|]
|{
|}
||
+ +ReEscape := + NumericEscape + | ControlEscape + | ReQuoteEscape + | ReUnicodePropertyEscape + | ReSimpleCharClassEscape + +ReQuoteEscape :=\
ReSyntaxChar +ControlEscape :=\r
|\n
|\t
+ +ReSimpleCharClassEscape :=\
ReSimpleCharClassCode + +ReSimpleCharClassCode :=d
|D
|s
|S
|w
|W
+ +ReUnicodePropertyEscape :=\
(p
|P
){
ReUnicodeProperty}
+ +ReUnicodeProperty := ReUnicodeScript | ReUnicodeGeneralCategory + +ReUnicodeScript :=sc=
ReUnicodePropertyValue + +ReUnicodePropertyValue := ReUnicodePropertyValueChar+ + +ReUnicodePropertyValueChar := AsciiLetter | Digit |_
+ +ReUnicodeGeneralCategory := [gc=
] ReUnicodeGeneralCategoryName + +ReCharSetAtom := + ReCharSetAtomNoDash + |-
+ +ReCharSetAtomNoDash := + ReCharSetLiteralChar + | ReEscape + |\-
+ +ReCharSetLiteralChar := ^ (\
|]
|-
) + +ReCharSet := + ReCharSetAtom + | ReCharSetRange [ReCharSet] + | ReCharSetAtom ReCharSetNoDash + +ReCharSetRange := ReCharSetAtom-
ReCharSetAtom + +ReCharSetNoDash := + ReCharSetAtom + | ReCharSetRangeNoDash [ReCharSet] + | ReCharSetAtomNoDash ReCharSetNoDash + +ReCharSetRangeNoDash := ReCharSetAtomNoDash-
ReCharSetAtom + +ReFlagsOnOff := ReFlags [-
ReFlags] +ReFlags := ReFlag* +ReFlag := + ReMultilineFlag + | ReDotAllFlag + | ReIgnoreCaseFlag + | ReCommentFlag + +ReMultilineFlag :=m
+ReDotAllFlag :=s
+ReIgnoreCaseFlag :=i
+ReCommentFlag :=x
+
Modules in the ballerina
organization with a module name starting
@@ -10970,6 +11173,16 @@
lang.object
modulelang.regexp
moduleThe lang.regexp
module corresponds to the tagged data type with tag
+re
.
lang.stream
module