Skip to content

Commit

Permalink
Take encoding into account in string functions. (#4111)
Browse files Browse the repository at this point in the history
  • Loading branch information
toots authored Aug 25, 2024
1 parent 893690d commit 3b4b9a7
Show file tree
Hide file tree
Showing 15 changed files with 268 additions and 62 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ New:
- Add `string.of_int` and `string.spaces`.
- Add `list.assoc.nullable`.
- Add `source.cue` (#3620).
- Add `string.chars` (#4111)
- Added atomic file write operations.

Changed:
Expand All @@ -64,6 +65,7 @@ Changed:
- Changed internal metadata format to be immutable (#3297).
- Allow a getter for the offset of `on_offset` and dropped the metadata
mechanism for updating it (#3355).
- `string.length` and `string.sub` now default to `utf8` encoding (#4109)
- Disable output paging when `TERM` environment variable is not set.
- Allow running as `root` user inside `docker` container by default (#3406).
- Run `check_next` before playlist's requests resolutions (#3625)
Expand Down
8 changes: 8 additions & 0 deletions doc/content/migrating.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,14 @@ end
However, EBU R128 data is now extracted directly from metadata when available.
So `replaygain` cannot control the gain type via this parameter anymore.

### String functions

Some string functions have been updated to account for string encoding. In particular, `string.length` and `string.sub` now assume that their
given string is in `utf8` by default.

While this is what most user expect, this can lead to backward incompatibilities and new exceptions. You can change back to the old default by
passing `encoding="ascii"` to these functions or using the `settings.string.default_encoding` settings.

### `check_next`

`check_next` in playlist operators is now called _before_ the request is resolved, to make it possible to cut out
Expand Down
26 changes: 26 additions & 0 deletions src/core/builtins/builtins_string_extra.ml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,32 @@
*****************************************************************************)

let log = Log.make ["lang"; "string"]

let conf_string =
Dtools.Conf.void ~p:(Configure.conf#plug "string") "String settings"

let () =
let conf_default_encoding =
Dtools.Conf.string
~p:(conf_string#plug "default_encoding")
~d:"utf8"
"Default encoding for `string.length`, `string.chars` and `string.sub`"
in
conf_default_encoding#on_change (fun v ->
let enc =
match v with
| "ascii" -> `Ascii
| "utf8" -> `Utf8
| _ ->
log#important
"Invalid value %s for `settings.string.default_encoding`! \
Should be one of: \"ascii\" or \"utf8\"."
v;
`Utf8
in
Liquidsoap_lang.Builtins_string.default_encoding := enc)

let string = Liquidsoap_lang.Builtins_string.string
let string_annotate = Lang.add_module ~base:string "annotate"

Expand Down
2 changes: 1 addition & 1 deletion src/lang/builtins_regexp.ml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ let escape_regex_descr =
else Lang_string.utf8_special_char s pos len)
~escape_char:(fun s pos len ->
if s.[pos] = '/' && len = 1 then "\\/"
else Lang_string.escape_utf8_char s pos len)
else Lang_string.escape_utf8_char ~strict:false s pos len)
~next:Lang_string.utf8_next
in
Lang_string.escape_string escape_regex_formatter
Expand Down
114 changes: 101 additions & 13 deletions src/lang/builtins_string.ml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,86 @@ let _ =
let l = List.map Lang.to_string l in
Lang.string (String.concat sep l))

let split ~encoding s =
let buf = Buffer.create 1 in
let to_string add c =
Buffer.clear buf;
add buf c;
Buffer.contents buf
in
let get =
match encoding with
| `Ascii -> fun pos -> (to_string Buffer.add_char (String.get s pos), 1)
| `Utf8 ->
fun pos ->
let d = String.get_utf_8_uchar s pos in
if not (Uchar.utf_decode_is_valid d) then
failwith "Decoding failed!";
( to_string Buffer.add_utf_8_uchar (Uchar.utf_decode_uchar d),
Uchar.utf_decode_length d )
in
let len = String.length s in
let rec f chars pos =
if pos = len then List.rev chars
else (
let char, len = get pos in
f (char :: chars) (pos + len))
in
f [] 0

let default_encoding = ref `Utf8

let encoding_option =
( "encoding",
Lang.nullable_t Lang.string_t,
Some Lang.null,
Some
"Encoding used to split characters. Should be one of: `\"utf8\"` or \
`\"ascii\"`" )

let get_encoding p =
match Lang.to_valued_option Lang.to_string (List.assoc "encoding" p) with
| None -> ("utf8", !default_encoding)
| Some "utf8" -> ("utf8", `Utf8)
| Some "ascii" -> ("ascii", `Ascii)
| _ ->
Runtime_error.raise ~pos:(Lang.pos p) ~message:"Invalid encoding!"
"invalid"

let _ =
Lang.add_builtin ~base:string "chars" ~category:`String
~descr:"Split string into characters. Raises `error.invalid` on errors."
[encoding_option; ("", Lang.string_t, None, None)]
(Lang.list_t Lang.string_t)
(fun p ->
let enc, encoding = get_encoding p in
let s = Lang.to_string (List.assoc "" p) in
try Lang.list (List.map Lang.string (split ~encoding s))
with _ ->
Runtime_error.raise ~pos:(Lang.pos p)
~message:
(Printf.sprintf "String cannot be split using encoding `\"%s\"`!"
enc)
"invalid")

let _ =
Lang.add_builtin ~base:string "length" ~category:`String
~descr:
"Return the string's length using the given encoding. Raises \
`error.invalid` on errors."
[encoding_option; ("", Lang.string_t, None, None)]
Lang.int_t
(fun p ->
let enc, encoding = get_encoding p in
let s = Lang.to_string (List.assoc "" p) in
try Lang.int (List.length (split ~encoding s))
with _ ->
Runtime_error.raise ~pos:(Lang.pos p)
~message:
(Printf.sprintf "String cannot be split using encoding `\"%s\"`!"
enc)
"invalid")

let _ =
Lang.add_builtin ~base:string "nth" ~category:`String
~descr:
Expand Down Expand Up @@ -165,7 +245,7 @@ let string_escape =
("", Lang.string (String.sub s ofs len));
])
| None, `Ascii -> Lang_string.escape_hex_char
| None, `Utf8 -> Lang_string.escape_utf8_char
| None, `Utf8 -> Lang_string.escape_utf8_char ~strict:false
in
let next =
match encoding with
Expand Down Expand Up @@ -213,7 +293,8 @@ let _ =
match Lang.to_string format with
| "octal" -> (Lang_string.escape_octal_char, Lang_string.ascii_next)
| "hex" -> (Lang_string.escape_hex_char, Lang_string.ascii_next)
| "utf8" -> (Lang_string.escape_utf8_char, Lang_string.utf8_next)
| "utf8" ->
(Lang_string.escape_utf8_char ~strict:false, Lang_string.utf8_next)
| _ ->
raise
(Error.Invalid_value
Expand Down Expand Up @@ -264,15 +345,6 @@ let _ =
let s = Lang.to_string (List.assoc "" p) in
Lang.string (Lang_string.unescape_string s))

let _ =
Lang.add_builtin ~base:string "length" ~category:`String
~descr:"Get the length of a string."
[("", Lang.string_t, None, None)]
Lang.int_t
(fun p ->
let string = Lang.to_string (List.assoc "" p) in
Lang.int (String.length string))

let _ =
Lang.add_builtin ~base:string "sub" ~category:`String
~descr:
Expand All @@ -285,6 +357,7 @@ let _ =
Some
"Return a sub string starting at this position. First position is 0."
);
encoding_option;
( "length",
Lang.int_t,
None,
Expand All @@ -294,9 +367,24 @@ let _ =
(fun p ->
let start = Lang.to_int (List.assoc "start" p) in
let len = Lang.to_int (List.assoc "length" p) in
let _, encoding = get_encoding p in
let string = Lang.to_string (List.assoc "" p) in
Lang.string
(try String.sub string start len with Invalid_argument _ -> ""))
let s =
match encoding with
| `Ascii -> (
try String.sub string start len with Invalid_argument _ -> "")
| `Utf8 -> (
try
let chars = split ~encoding string in
if List.length chars < len + start then ""
else
String.concat ""
(List.filteri
(fun pos _ -> start <= pos && pos < start + len)
chars)
with _ -> "")
in
Lang.string s)

let _ =
Lang.add_builtin ~base:string "index" ~category:`String
Expand Down
22 changes: 13 additions & 9 deletions src/lang/lang_string.ml
Original file line number Diff line number Diff line change
Expand Up @@ -105,15 +105,17 @@ let escape_char ~escape_fun s pos len =
| '\'', 1 -> "\\'"
| _ -> escape_fun s pos len

let escape_utf8_char =
let escape_utf8_char ~strict =
let utf8_char_code s pos len =
try utf8_char_code s pos len with _ -> Uchar.to_int Uchar.rep
try utf8_char_code s pos len
with _ when not strict -> Uchar.to_int Uchar.rep
in
escape_char ~escape_fun:(fun s pos len ->
Printf.sprintf "\\u%04X" (utf8_char_code s pos len))

let escape_utf8_formatter ?(special_char = utf8_special_char) =
escape ~special_char ~escape_char:escape_utf8_char ~next:utf8_next
let escape_utf8_formatter ?(strict = false) ?(special_char = utf8_special_char)
=
escape ~special_char ~escape_char:(escape_utf8_char ~strict) ~next:utf8_next

let escape_hex_char =
escape_char ~escape_fun:(fun s pos len ->
Expand Down Expand Up @@ -153,15 +155,15 @@ let escape_string escape s =
len segments);
Bytes.unsafe_to_string b)

let escape_utf8_string ?special_char =
escape_string (escape_utf8_formatter ?special_char)
let escape_utf8_string ?strict ?special_char =
escape_string (escape_utf8_formatter ?strict ?special_char)

let escape_ascii_string ?special_char =
escape_string (escape_ascii_formatter ?special_char)

let quote_utf8_string s =
let quote_utf8_string ?strict s =
Printf.sprintf "\"%s\""
(escape_utf8_string
(escape_utf8_string ?strict
~special_char:(fun s pos len ->
if s.[pos] = '\'' && len = 1 then false
else utf8_special_char s pos len)
Expand All @@ -175,7 +177,9 @@ let quote_ascii_string s =
else ascii_special_char s pos len)
s)

let quote_string s = try quote_utf8_string s with _ -> quote_ascii_string s
let quote_string s =
try quote_utf8_string ~strict:true s with _ -> quote_ascii_string s

let unescape_utf8_pattern = "\\\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]"
let unescape_hex_pattern = "\\\\x[0-9a-fA-F][0-9a-fA-F]"
let unescape_octal_pattern = "\\\\[0-9][0-9][0-9]"
Expand Down
10 changes: 7 additions & 3 deletions src/lang/lang_string.mli
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,10 @@ val ascii_next : 'a -> int -> int
val escape_char :
escape_fun:(string -> int -> int -> string) -> string -> int -> int -> string

val escape_utf8_char : string -> int -> int -> string
val escape_utf8_char : strict:bool -> string -> int -> int -> string

val escape_utf8_formatter :
?strict:bool ->
?special_char:(string -> int -> int -> bool) ->
string ->
[> `Orig of int * int | `Subst of string * int ] list * int
Expand All @@ -39,12 +40,15 @@ val escape_string :
string

val escape_utf8_string :
?special_char:(string -> int -> int -> bool) -> string -> string
?strict:bool ->
?special_char:(string -> int -> int -> bool) ->
string ->
string

val escape_ascii_string :
?special_char:(string -> int -> int -> bool) -> string -> string

val quote_utf8_string : string -> string
val quote_utf8_string : ?strict:bool -> string -> string
val quote_ascii_string : string -> string
val quote_string : string -> string
val unescape_utf8_pattern : string
Expand Down
4 changes: 2 additions & 2 deletions src/libs/extra/audio.liq
Original file line number Diff line number Diff line change
Expand Up @@ -303,9 +303,9 @@ end
def replaces dtmf(~duration=0.1, ~delay=0.05, dtmf) =
l = ref([])
for i = 0 to
string.length(dtmf) - 1
string.length(encoding="ascii", dtmf) - 1
do
c = string.sub(dtmf, start=i, length=1)
c = string.sub(encoding="ascii", dtmf, start=i, length=1)
let (row, col) =
if
c == "1"
Expand Down
13 changes: 7 additions & 6 deletions src/libs/file.liq
Original file line number Diff line number Diff line change
Expand Up @@ -260,15 +260,16 @@ def file.metadata.flac.cover.decode(s) =
def read_int() =
ret =
string.binary.to_int(
little_endian=false, string.sub(s, start=i(), length=4)
little_endian=false,
string.sub(encoding="ascii", s, start=i(), length=4)
)

i := i() + 4
ret
end

def read_string(len) =
ret = string.sub(s, start=i(), length=len)
ret = string.sub(encoding="ascii", s, start=i(), length=len)
i := i() + len
(ret : string)
end
Expand All @@ -284,7 +285,7 @@ def file.metadata.flac.cover.decode(s) =
number_of_colors = read_int()
number_of_colors = number_of_colors > 0 ? null(number_of_colors) : null()
data_len = read_int()
data = string.sub(s, start=i(), length=data_len)
data = string.sub(encoding="ascii", s, start=i(), length=data_len)
if
data == ""
then
Expand Down Expand Up @@ -320,18 +321,18 @@ def file.metadata.flac.cover.encode(
data
) =
def encode_string(s) =
len = 1 + (string.length(s) / 8)
len = 1 + (string.length(encoding="ascii", s) / 8)
str_len = string.binary.of_int(little_endian=false, pad=4, len)
if
string.length(str_len) > 4
string.length(encoding="ascii", str_len) > 4
then
error.raise(
error.invalid,
"Data length too long for APIC format!"
)
end

pad = string.make(char_code=0, len * 8 - string.length(s))
pad = string.make(char_code=0, len * 8 - string.length(encoding="ascii", s))
(str_len, "#{s}#{pad}")
end

Expand Down
Loading

0 comments on commit 3b4b9a7

Please sign in to comment.