diff --git a/doc/spec.md b/doc/spec.md index 2129d375..6929440e 100644 --- a/doc/spec.md +++ b/doc/spec.md @@ -321,6 +321,133 @@ hex_digit = '0' … '9' | 'A' … 'F' | 'a' … 'f' . binary_digit = '0' | '1' . ``` +### String literals + +A Starlark string literal denotes a string value. +In its simplest form, it consists of the desired text +surrounded by matching single- or double-quotation marks: + +```python +"abc" +'abc' +``` + +Literal occurrences of the chosen quotation mark character must be +escaped by a preceding backslash. So, if a string contains several +of one kind of quotation mark, it may be convenient to quote the string +using the other kind, as in these examples: + +```python +'Have you read "To Kill a Mockingbird?"' +"Yes, it's a classic." + +"Have you read \"To Kill a Mockingbird?\"" +'Yes, it\'s a classic.' +``` + +#### String escapes + +Within a string literal, the backslash character `\` indicates the +start of an _escape sequence_, a notation for expressing things that +are impossible or awkward to write directly. + +The following *traditional escape sequences* represent the ASCII control +codes 7-13: + +``` +\a \x07 alert or bell +\b \x08 backspace +\f \x0C form feed +\n \x0A line feed +\r \x0D carriage return +\t \x09 horizontal tab +\v \x0B vertical tab +``` + +A *literal backslash* is written using the escape `\\`. + +An *escaped newline*---that is, a backslash at the end of a line---is +ignored, allowing string literals to be conveniently split across +multiple lines (though multi-line string literals, described below, make +this even more convenient). + +```python +"abc\ +def" # "abcdef" +``` + +An *octal escape* encodes a single byte using its octal value. +It consists of a backslash followed by one, two, or three octal digits [0-7]. +It is an error if the value is greater than decimal 255. 
+ +```python +'\0' # "\x00" a string containing a single NUL byte +'\12' # "\n" octal 12 = decimal 10 +'\101-\132' # "A-Z" +'\119' # "\t9" = "\11" + "9" +``` + +Implementation note: +The Java implementation encodes strings using UTF-16, +so an octal escape encodes a single UTF-16 code unit. +Octal escapes for values above 127 are therefore not portable across implementations. +There is little reason to use octal escapes in new code. + +A *hex escape* encodes a single byte using its hexadecimal value. +It consists of `\x` followed by exactly two hexadecimal digits [0-9A-Fa-f]. + +```python +"\x00" # "\x00" a string containing a single NUL byte +"(\x20)" # "( )" ASCII 0x20 = 32 = space + +red, reset = "\x1b[31m", "\x1b[0m" # ANSI terminal control codes for color +"(" + red + "hello" + reset + ")" # "(hello)" with red text, if on a terminal +``` + +Implementation note: +The Java implementation does not support hex escapes. + +An ordinary string literal may not contain an unescaped newline, +but a *multiline string literal* may spread over multiple source lines. +It is denoted using three quotation marks at start and end. +Within it, unescaped newlines and quotation marks (or even pairs of +quotation marks) have their literal meaning, but three quotation marks +end the literal. This makes it easy to quote large blocks of text with +few escapes. + +``` +haiku = ''' +Yesterday it worked. +Today it is not working. +That's computers. Sigh. +''' +``` + +Regardless of the platform's convention for text line endings---for +example, a linefeed (\n) on UNIX, or a carriage return followed by a +linefeed (\r\n) on Microsoft Windows---an unescaped line ending in a +multiline string literal always denotes a line feed (\n). + +Starlark also supports *raw string literals*, which look like an +ordinary single- or double-quotation preceded by `r`. 
Within a raw +string literal, there is no special processing of backslash escapes, +other than an escaped quotation mark (which denotes a literal +quotation mark), or an escaped newline (which denotes a backslash +followed by a newline). This form of quotation is typically used when +writing strings that contain many quotation marks or backslashes (such +as regular expressions or shell commands) to reduce the burden of +escaping: + +```python +"a\nb" # "a\nb" = 'a' + '\n' + 'b' +r"a\nb" # "a\\nb" = 'a' + '\\' + 'n' + 'b' + +"a\ +b" # "ab" +r"a\ +b" # "a\\\nb" +``` + TODO: define string_lit, indent, outdent, semicolon, newline, eof ## Data types @@ -4106,6 +4233,7 @@ See [Starlark spec issue 20](https://github.com/bazelbuild/starlark/issues/20). * `lambda` expressions are supported (option: `-lambda`). * String elements are bytes. * Non-ASCII strings are encoded using UTF-8. +* Strings support octal and hex byte escapes. * Strings have the additional methods `elem_ords`, `codepoint_ords`, and `codepoints`. * The `chr` and `ord` built-in functions are supported. * The `set` built-in function is provided (option: `-set`). diff --git a/starlark/testdata/dict.star b/starlark/testdata/dict.star index 383b7b52..9864be73 100644 --- a/starlark/testdata/dict.star +++ b/starlark/testdata/dict.star @@ -13,7 +13,7 @@ assert.true({False: False}) assert.true(not {}) # dict + dict is no longer supported. 
-assert.fails(lambda: {"a": 1} + {"b": 2}, 'unknown binary op: dict \+ dict') +assert.fails(lambda: {"a": 1} + {"b": 2}, 'unknown binary op: dict \\+ dict') # dict comprehension assert.eq({x: x*x for x in range(3)}, {0: 0, 1: 1, 2: 4}) diff --git a/starlark/testdata/function.star b/starlark/testdata/function.star index 4b419197..a1096844 100644 --- a/starlark/testdata/function.star +++ b/starlark/testdata/function.star @@ -173,7 +173,7 @@ assert.fails(lambda: f( 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, - 57, 58, 59, 60, 61, 62, 63, 64), "missing 1 argument \(mm\)") + 57, 58, 59, 60, 61, 62, 63, 64), "missing 1 argument \\(mm\\)") assert.fails(lambda: f( 1, 2, 3, 4, 5, 6, 7, 8, diff --git a/starlark/testdata/int.star b/starlark/testdata/int.star index 10fe059f..c987fd17 100644 --- a/starlark/testdata/int.star +++ b/starlark/testdata/int.star @@ -176,8 +176,8 @@ assert.fails(lambda: int("-0123", 0), "invalid literal.*base 0") assert.fails(lambda: int("0Oxa", 8), "invalid literal with base 8: 0Oxa") # follow-on bugs to issue 108 assert.fails(lambda: int("--4"), "invalid literal with base 10: --4") -assert.fails(lambda: int("++4"), "invalid literal with base 10: \+\+4") -assert.fails(lambda: int("+-4"), "invalid literal with base 10: \+-4") +assert.fails(lambda: int("++4"), "invalid literal with base 10: \\+\\+4") +assert.fails(lambda: int("+-4"), "invalid literal with base 10: \\+-4") assert.fails(lambda: int("0x-4", 16), "invalid literal with base 16: 0x-4") # bitwise union (int|int), intersection (int&int), XOR (int^int), unary not (~int), diff --git a/starlark/testdata/list.star b/starlark/testdata/list.star index cd020843..ff98b968 100644 --- a/starlark/testdata/list.star +++ b/starlark/testdata/list.star @@ -16,14 +16,14 @@ assert.true(not []) # indexing, x[i] abc = list("abc".elems()) -assert.fails(lambda : abc[-4], "list index -4 out of range \[-3:2]") +assert.fails(lambda : abc[-4], "list index -4 out of 
range \\[-3:2]") assert.eq(abc[-3], "a") assert.eq(abc[-2], "b") assert.eq(abc[-1], "c") assert.eq(abc[0], "a") assert.eq(abc[1], "b") assert.eq(abc[2], "c") -assert.fails(lambda : abc[3], "list index 3 out of range \[-3:2]") +assert.fails(lambda : abc[3], "list index 3 out of range \\[-3:2]") # x[i] = ... x3 = [0, 1, 2] @@ -45,8 +45,8 @@ assert.fails(x3.clear, "cannot clear frozen list") # list + list assert.eq([1, 2, 3] + [3, 4, 5], [1, 2, 3, 3, 4, 5]) -assert.fails(lambda : [1, 2] + (3, 4), "unknown.*list \+ tuple") -assert.fails(lambda : (1, 2) + [3, 4], "unknown.*tuple \+ list") +assert.fails(lambda : [1, 2] + (3, 4), "unknown.*list \\+ tuple") +assert.fails(lambda : (1, 2) + [3, 4], "unknown.*tuple \\+ list") # list * int, int * list assert.eq(abc * 0, []) @@ -98,8 +98,8 @@ listcompblock() # list.pop x4 = [1, 2, 3, 4, 5] -assert.fails(lambda : x4.pop(-6), "index -6 out of range \[-5:4]") -assert.fails(lambda : x4.pop(6), "index 6 out of range \[-5:4]") +assert.fails(lambda : x4.pop(-6), "index -6 out of range \\[-5:4]") +assert.fails(lambda : x4.pop(6), "index 6 out of range \\[-5:4]") assert.eq(x4.pop(), 5) assert.eq(x4, [1, 2, 3, 4]) assert.eq(x4.pop(1), 2) diff --git a/starlark/testdata/module.star b/starlark/testdata/module.star index 8edff1f5..6aac2e2d 100644 --- a/starlark/testdata/module.star +++ b/starlark/testdata/module.star @@ -14,4 +14,4 @@ assert.fails(assignfield, "can't assign to .foo field of module") # no such field assert.fails(lambda : assert.nonesuch, "module has no .nonesuch field or method$") -assert.fails(lambda : assert.falls, "module has no .falls field or method .did you mean .fails\?") +assert.fails(lambda : assert.falls, "module has no .falls field or method .did you mean .fails\\?") diff --git a/starlark/testdata/set.star b/starlark/testdata/set.star index 9e5250b6..bca41448 100644 --- a/starlark/testdata/set.star +++ b/starlark/testdata/set.star @@ -33,9 +33,9 @@ assert.eq(list(set([1, 3, 2, 3])), [1, 3, 2]) 
assert.eq(type(set("hello".elems())), "set") assert.eq(list(set("hello".elems())), ["h", "e", "l", "o"]) assert.eq(list(set(range(3))), [0, 1, 2]) -assert.fails(lambda: set(1), "got int, want iterable") -assert.fails(lambda: set(1, 2, 3), "got 3 arguments") -assert.fails(lambda: set([1, 2, {}]), "unhashable type: dict") +assert.fails(lambda : set(1), "got int, want iterable") +assert.fails(lambda : set(1, 2, 3), "got 3 arguments") +assert.fails(lambda : set([1, 2, {}]), "unhashable type: dict") # truth assert.true(not set()) @@ -46,12 +46,12 @@ x = set([1, 2, 3]) y = set([3, 4, 5]) # set + any is not defined -assert.fails(lambda: x + y, "unknown.*: set \+ set") +assert.fails(lambda : x + y, "unknown.*: set \\+ set") # set | set (use resolve.AllowBitwise to enable it) assert.eq(list(set("a".elems()) | set("b".elems())), ["a", "b"]) assert.eq(list(set("ab".elems()) | set("bc".elems())), ["a", "b", "c"]) -assert.fails(lambda: set() | [], "unknown binary op: set | list") +assert.fails(lambda : set() | [], "unknown binary op: set | list") assert.eq(type(x | y), "set") assert.eq(list(x | y), [1, 2, 3, 4, 5]) assert.eq(list(x | set([5, 1])), [1, 2, 3, 5]) @@ -65,7 +65,7 @@ assert.eq(type(x.union(y)), "set") assert.eq(list(x.union(y)), [1, 2, 3, 4, 5]) assert.eq(list(x.union([5, 1])), [1, 2, 3, 5]) assert.eq(list(x.union((6, 5, 4))), [1, 2, 3, 6, 5, 4]) -assert.fails(lambda: x.union([1, 2, {}]), "unhashable type: dict") +assert.fails(lambda : x.union([1, 2, {}]), "unhashable type: dict") # intersection, set & set (use resolve.AllowBitwise to enable it) assert.eq(list(set("a".elems()) & set("b".elems())), []) @@ -75,13 +75,14 @@ assert.eq(list(set("ab".elems()) & set("bc".elems())), ["b"]) assert.eq(set([1, 2, 3]) ^ set([4, 5, 3]), set([1, 2, 4, 5])) def test_set_augmented_assign(): - x = set([1, 2, 3]) - x &= set([2, 3]) - assert.eq(x, set([2, 3])) - x |= set([1]) - assert.eq(x, set([1, 2, 3])) - x ^= set([4, 5, 3]) - assert.eq(x, set([1, 2, 4, 5])) + x = set([1, 2, 3]) + 
x &= set([2, 3]) + assert.eq(x, set([2, 3])) + x |= set([1]) + assert.eq(x, set([1, 2, 3])) + x ^= set([4, 5, 3]) + assert.eq(x, set([1, 2, 4, 5])) + test_set_augmented_assign() # len @@ -99,17 +100,19 @@ assert.eq(x, x) assert.eq(y, y) assert.true(x != y) assert.eq(set([1, 2, 3]), set([3, 2, 1])) -assert.fails(lambda: x < y, "set < set not implemented") +assert.fails(lambda : x < y, "set < set not implemented") # iteration assert.true(type([elem for elem in x]), "list") assert.true(list([elem for elem in x]), [1, 2, 3]) + def iter(): - list = [] - for elem in x: - list.append(elem) - return list + list = [] + for elem in x: + list.append(elem) + return list + assert.eq(iter(), [1, 2, 3]) # sets are not indexable -assert.fails(lambda: x[0], "unhandled.*operation") +assert.fails(lambda : x[0], "unhandled.*operation") diff --git a/starlark/testdata/string.star b/starlark/testdata/string.star index 605e2030..8390cd5e 100644 --- a/starlark/testdata/string.star +++ b/starlark/testdata/string.star @@ -8,7 +8,7 @@ assert.eq(r'a\bc', "a\\bc") # truth assert.true("abc") -assert.true("\0") +assert.true(chr(0)) assert.true(not "") # str + str @@ -200,7 +200,7 @@ assert.eq("a{x!r}c".format(x='b'), r'a"b"c') assert.fails(lambda: "{x!}".format(x=1), "unknown conversion") assert.fails(lambda: "{x!:}".format(x=1), "unknown conversion") assert.fails(lambda: '{a.b}'.format(1), "syntax x.y is not supported") -assert.fails(lambda: '{a[0]}'.format(1), "syntax a\[i\] is not supported") +assert.fails(lambda: '{a[0]}'.format(1), "syntax a\\[i\\] is not supported") assert.fails(lambda: '{ {} }'.format(1), "nested replacement fields not supported") assert.fails(lambda: '{{}'.format(1), "single '}' in format") assert.fails(lambda: '{}}'.format(1), "single '}' in format") diff --git a/starlarkstruct/testdata/struct.star b/starlarkstruct/testdata/struct.star index 841ab411..e54fe046 100644 --- a/starlarkstruct/testdata/struct.star +++ b/starlarkstruct/testdata/struct.star @@ -58,6 +58,6 @@ 
assert.eq(getattr(alice, "city"), "NYC") assert.eq(bob + bob, bob) assert.eq(bob + alice, person(age = 50, city = "NYC", name = "alice")) assert.eq(alice + bob, person(age = 50, city = "NYC", name = "bob")) # not commutative! a misfeature -assert.fails(lambda : alice + 1, "struct \+ int") +assert.fails(lambda : alice + 1, "struct \\+ int") assert.eq(http + http, http) -assert.fails(lambda : http + bob, "different constructors: hostport \+ person") +assert.fails(lambda : http + bob, "different constructors: hostport \\+ person") diff --git a/syntax/quote.go b/syntax/quote.go index cc9a8d0a..45c0c760 100644 --- a/syntax/quote.go +++ b/syntax/quote.go @@ -40,13 +40,6 @@ var esc = [256]byte{ '"': '"', } -// notEsc is a list of characters that can follow a \ in a string value -// without having to escape the \. That is, since ( is in this list, we -// quote the Go string "foo\\(bar" as the Python literal "foo\(bar". -// This really does happen in BUILD files, especially in strings -// being used as shell arguments containing regular expressions. -const notEsc = " !#$%&()*+,-./:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ{|}~" - // unquote unquotes the quoted string, returning the actual // string value, whether the original was triple-quoted, and // an error describing invalid input. @@ -127,17 +120,19 @@ func unquote(quoted string) (s string, triple bool, err error) { switch quoted[1] { default: - // In Python, if \z (for some byte z) is not a known escape sequence - // then it appears as literal text in the string. - buf.WriteString(quoted[:2]) - quoted = quoted[2:] + // In Starlark, like Go, a backslash must escape something. + // (Python still treats unnecessary backslashes literally, + // but since 3.6 has emitted a deprecation warning.) + err = fmt.Errorf("invalid escape sequence \\%c", quoted[1]) + return case '\n': // Ignore the escape and the line break. 
quoted = quoted[2:] - case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\'', '"': - // One-char escape + case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: + // One-char escape. + // We escape only the kind of quotation mark in use. buf.WriteByte(unesc[quoted[1]]) quoted = quoted[2:] @@ -227,18 +222,6 @@ func quote(unquoted string, triple bool) string { buf.WriteByte(c) continue } - if c == '\\' { - if i+1 < len(unquoted) && indexByte(notEsc, unquoted[i+1]) >= 0 { - // Can pass \ through when followed by a byte that - // known not to be a valid escape sequence and also - // that does not trigger an escape sequence of its own. - // Use this, because various BUILD files do. - buf.WriteByte('\\') - buf.WriteByte(unquoted[i+1]) - i++ - continue - } - } if esc[c] != 0 { buf.WriteByte('\\') buf.WriteByte(esc[c]) diff --git a/syntax/quote_test.go b/syntax/quote_test.go index a39d217f..f9068eee 100644 --- a/syntax/quote_test.go +++ b/syntax/quote_test.go @@ -19,13 +19,10 @@ var quoteTests = []struct { {`"hello"`, `hello`, true}, {`'hello'`, `hello`, false}, {`"quote\"here"`, `quote"here`, true}, - {`'quote\"here'`, `quote"here`, false}, {`'quote"here'`, `quote"here`, false}, {`"quote'here"`, `quote'here`, true}, - {`"quote\'here"`, `quote'here`, false}, {`'quote\'here'`, `quote'here`, false}, {`"""hello " ' world "" asdf ''' foo"""`, `hello " ' world "" asdf ''' foo`, true}, - {`"foo\(bar"`, `foo\(bar`, true}, {`"""hello world"""`, "hello\nworld", true}, @@ -35,14 +32,14 @@ world"""`, "hello\nworld", true}, {`"\a\b\f\n\r\t\v\000\377\"'\\\003\200"`, "\a\b\f\n\r\t\v\x00\xFF\"'\\\x03\x80", true}, {`"\a\b\f\n\r\t\v\x00\xff\"'\\\x03\x80"`, "\a\b\f\n\r\t\v\x00\xFF\"'\\\x03\x80", false}, {`"\a\b\f\n\r\t\v\000\xFF\"'\\\x03\x80"`, "\a\b\f\n\r\t\v\x00\xFF\"'\\\x03\x80", false}, - {`"\a\b\f\n\r\t\v\000\xFF\"\'\\\x03\x80"`, "\a\b\f\n\r\t\v\x00\xFF\"'\\\x03\x80", false}, + {`"\a\b\f\n\r\t\v\000\xFF\"\\\x03\x80"`, "\a\b\f\n\r\t\v\x00\xFF\"\\\x03\x80", false}, { - `"cat $(SRCS) | grep 
'\s*ip_block:' | sed -e 's/\s*ip_block: \"\([^ ]*\)\"/ \x27\\1\x27,/g' >> $@; "`, + `"cat $(SRCS) | grep '\\s*ip_block:' | sed -e 's/\\s*ip_block: \"\\([^ ]*\\)\"/ \x27\\1\x27,/g' >> $@; "`, "cat $(SRCS) | grep '\\s*ip_block:' | sed -e 's/\\s*ip_block: \"\\([^ ]*\\)\"/ '\\1',/g' >> $@; ", false, }, { - `"cat $(SRCS) | grep '\\s*ip_block:' | sed -e 's/\\s*ip_block: \"\([^ ]*\)\"/ '\\1',/g' >> $@; "`, + `"cat $(SRCS) | grep '\\s*ip_block:' | sed -e 's/\\s*ip_block: \"\\([^ ]*\\)\"/ '\\1',/g' >> $@; "`, "cat $(SRCS) | grep '\\s*ip_block:' | sed -e 's/\\s*ip_block: \"\\([^ ]*\\)\"/ '\\1',/g' >> $@; ", true, }, diff --git a/syntax/scan_test.go b/syntax/scan_test.go index 005b64ea..a63ec81c 100644 --- a/syntax/scan_test.go +++ b/syntax/scan_test.go @@ -118,7 +118,6 @@ pass`, "pass newline pass EOF"}, // consecutive newlines are consolidated {`x = 1 + \ 2`, `x = 1 + 2 EOF`}, {`x = 'a\nb'`, `x = "a\nb" EOF`}, - {`x = 'a\zb'`, `x = "a\\zb" EOF`}, {`x = r'a\nb'`, `x = "a\\nb" EOF`}, {`x = '\''`, `x = "'" EOF`}, {`x = "\""`, `x = "\"" EOF`}, @@ -192,10 +191,32 @@ pass`, "pass newline pass EOF"}, // consecutive newlines are consolidated {`"\377"`, `"\xff" EOF`}, {`"\378"`, `"\x1f8" EOF`}, // = '\37' + '8' {`"\400"`, `foo.star:1:1: invalid escape sequence \400`}, // unlike Python 2 and 3 - // Backslashes that are not part of escapes are treated literally, - // but this behavior will change; see b/34519173. - {`"\+"`, `"\\+" EOF`}, - {`"\o123"`, `"\\o123" EOF`}, + + // backslash escapes + // As in Go, a backslash must escape something. + // (Python started issuing a deprecation warning in 3.6.) 
+ {`"foo\(bar"`, `foo.star:1:1: invalid escape sequence \(`}, + {`"\+"`, `foo.star:1:1: invalid escape sequence \+`}, + {`"\w"`, `foo.star:1:1: invalid escape sequence \w`}, + {`"\""`, `"\"" EOF`}, + {`"\'"`, `foo.star:1:1: invalid escape sequence \'`}, + {`'\w'`, `foo.star:1:1: invalid escape sequence \w`}, + {`'\''`, `"'" EOF`}, + {`'\"'`, `foo.star:1:1: invalid escape sequence \"`}, + {`"""\w"""`, `foo.star:1:1: invalid escape sequence \w`}, + {`"""\""""`, `"\"" EOF`}, + {`"""\'"""`, `foo.star:1:1: invalid escape sequence \'`}, + {`'''\w'''`, `foo.star:1:1: invalid escape sequence \w`}, + {`'''\''''`, `"'" EOF`}, + {`'''\"'''`, `foo.star:1:1: invalid escape sequence \"`}, // error + {`r"\w"`, `"\\w" EOF`}, + {`r"\""`, `"\\\"" EOF`}, + {`r"\'"`, `"\\'" EOF`}, + {`r'\w'`, `"\\w" EOF`}, + {`r'\''`, `"\\'" EOF`}, + {`r'\"'`, `"\\\"" EOF`}, + {`'a\zb'`, `foo.star:1:1: invalid escape sequence \z`}, + {`"\o123"`, `foo.star:1:1: invalid escape sequence \o`}, // floats starting with octal digits {"012934.", `1.293400e+04 EOF`}, {"012934.1", `1.293410e+04 EOF`},