Skip to content

Commit

Permalink
Choose a utf-8 encoding that will not break threads.
Browse files Browse the repository at this point in the history
...or fork() on Windows (implemented with threads).

I've deliberately avoided tests for which particular utf-8 encoding is
used in what scenario because that behavior is deliberately left
undefined for future-proofing. All utf8::all guarantees is threads
still work.
  • Loading branch information
schwern committed Aug 9, 2016
1 parent 244c50b commit 83bbd04
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 10 deletions.
33 changes: 32 additions & 1 deletion lib/utf8/all.pm
Original file line number Diff line number Diff line change
Expand Up @@ -110,15 +110,21 @@ L<Cwd::utf8> for fully utf-8 aware Cwd functions.
use Import::Into;
use parent qw(Encode charnames utf8 open warnings feature);
use Symbol qw(qualify_to_ref);
use Config;

# Holds the pointers to the original version of redefined functions
state %_orig_functions;

sub import {
my $class = shift;

# Enable features/pragmas in calling package
my $target = caller;

my $utf8_encoding = $class->_choose_utf8_encoding;

'utf8'->import::into($target);
'open'->import::into($target, qw{:encoding(UTF-8) :std});
'open'->import::into($target => $utf8_encoding, ':std');
'charnames'->import::into($target, qw{:full :short});
'warnings'->import::into($target, qw{FATAL utf8});
'feature'->import::into($target, qw{unicode_strings}) if $^V >= v5.11.0;
Expand Down Expand Up @@ -208,6 +214,31 @@ sub _utf8_glob {
}
}

sub _choose_utf8_encoding {
# No threads? No problem.
return ':encoding(UTF-8)' if !$Config{usethreads} && !$Config{useithreads};

# 5.24.0 seems to have fixed the major utf8 issues.
return ':encoding(UTF-8)' if $^V ge 5.24.0;

# A safe default.
return ':utf8';
}

=head1 WHICH UTF-8 ENCODING?
I<TL;DR>. Perl's unicode has bugs. utf8::all will try to work around them.
As of this writing, Perl has several ways to do utf-8. It has to do
with whether "unassigned" code points are considered errors or
not. The details are in L<perlunicode: Noncharacter code points|http://perldoc.perl.org/perlunicode.html#Noncharacter-code-points>. Perl also has lots of Unicode bugs, particularly with threads and
strict utf-8 encoding (ie. C<:encoding(UTF-8)>).
utf8::all will prefer the strictest encoding available, but it may
choose a less strict utf-8 encoding if it detects your Perl is
vulnerable to Unicode bugs. This should have no effect on how valid
utf-8 is handled.
=head1 INTERACTION WITH AUTODIE
If you use L<autodie>, which is a great idea, you need to use at least version
Expand Down
5 changes: 2 additions & 3 deletions t/lexical-again.t
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# no utf8::all should disable its effects lexically
# Note: Changes to @ARGV, STDIN, STDOU, and STDERR are always global!

use Test::More tests => 17;
use Test::More;
use PerlIO;

my $expected_unicode = "\x{30c6}\x{30b9}\x{30c8}"; # Unicode characters
Expand Down Expand Up @@ -52,7 +52,6 @@ for my $fh (keys %handles) {
my @layers = PerlIO::get_layers($handles{$fh});
ok(grep(m/utf8/, @layers), "$fh: utf8 does appear in the perlio layers")
or diag explain { $fh => \@layers };
ok(grep(m/utf-8-strict/, @layers), "$fh: utf-8-strict does appear in the perlio layers")
or diag explain { $fh => \@layers };
}

done_testing;
6 changes: 3 additions & 3 deletions t/open.t
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@
# Test opening an actual file
use utf8::all;
use PerlIO;
use Test::More tests => 4;
use Test::More;

ok open my $in, '<', 'corpus/testfile';
my @layers = PerlIO::get_layers($in);
ok(grep(m/utf8/, @layers), 'utf8 appears in the perlio layers')
or diag explain { $fh => \@layers };
ok(grep(m/utf-8-strict/, @layers), 'utf-8-strict appears in the perlio layers')
or diag explain { $fh => \@layers };

my $contents = do { local $/; <$in>};
is $contents, "\x{30c6}\x{30b9}\x{30c8}\n", 'unicode retrieved OK';

done_testing;
33 changes: 33 additions & 0 deletions t/threads.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!perl

# Test that utf8::all is choosing the right encoding to not
# tickle thread bugs.

use strict;
use warnings;

# This is loaded before threads. It will not be aware of tests run in
# a thread.
use Test::More 0.96;
use Config;

BEGIN {
plan skip_all => "Requires threads"
if !$Config{usethreads};
}

# Deliberately before loading threads so we don't cheat and check
# if threads are loaded, that would be brittle.
use utf8::all;

use threads;
use threads::shared;

note "basic utf8 + threads bug"; {
my $ok :shared = 0;
my $t = threads->create(sub { $ok = 1; });
$t->join();
ok $ok, "threads ok with utf8::all";
}

done_testing;
4 changes: 1 addition & 3 deletions t/utf8.t
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@ use Test::More;
my @layers = PerlIO::get_layers($fh);
ok(grep(m/utf8/, @layers), 'utf8 appears in the perlio layers')
or diag explain { $fh => \@layers };
ok(grep(m/utf-8-strict/, @layers), 'utf-8-strict appears in the perlio layers')
or diag explain { $fh => \@layers };
}
}

Expand All @@ -33,7 +31,7 @@ use Test::More;
END { unlink "perlio_test2" }

my @layers = PerlIO::get_layers($test_fh);
SKIP: {
SKIP: {
# If we have the Perl Unicode flag set that adds the UTF-8 layer,
# we need to skip this test.
skip 'Perl Unicode flag set that always adds UTF-8 layer to output', 1 if (${^UNICODE} & 16);
Expand Down

0 comments on commit 83bbd04

Please sign in to comment.