-
Notifications
You must be signed in to change notification settings - Fork 20
/
Twig_pm.slow
executable file
·14008 lines (10890 loc) · 457 KB
/
Twig_pm.slow
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
use strict;
use warnings; # > perl 5.5
# This is created in the caller's space
# I realize (now!) that it's not clean, but it's been there for 10+ years...
BEGIN
{ sub ::PCDATA { '#PCDATA' } ## no critic (Subroutines::ProhibitNestedSubs);
sub ::CDATA { '#CDATA' } ## no critic (Subroutines::ProhibitNestedSubs);
}
use UNIVERSAL();
## if a sub returns a scalar, it better not bloody disappear in list context
## no critic (Subroutines::ProhibitExplicitReturnUndef);
my $perl_version;
my $parser_version;
######################################################################
package XML::Twig;
######################################################################
require 5.004;
use utf8; # > perl 5.5
use vars qw($VERSION @ISA %valid_option);
use Carp;
use File::Spec;
use File::Basename;
*isa= *UNIVERSAL::isa;
# flag, set to true if the weaken sub is available
use vars qw( $weakrefs);
# flag set to true if the version of expat seems to be 1.95.2, which has annoying bugs
# wrt doctype handling. This is global for performance reasons.
my $expat_1_95_2=0;
# a slight non-xml mod: # is allowed as a first character
my $REG_TAG_FIRST_LETTER;
$REG_TAG_FIRST_LETTER= q{(?:[^\W\d]|[:#_])}; # < perl 5.6 - does not work for leading non-ascii letters
$REG_TAG_FIRST_LETTER= q{(?:[[:alpha:]:#_])}; # >= perl 5.6
my $REG_TAG_LETTER= q{(?:[\w_.-]*)};
# a simple name (no colon)
my $REG_NAME_TOKEN= qq{(?:$REG_TAG_FIRST_LETTER$REG_TAG_LETTER*)};
# a tag name, possibly including namespace
my $REG_NAME= qq{(?:(?:$REG_NAME_TOKEN:)?$REG_NAME_TOKEN)};
# tag name (leading # allowed)
# first line is for perl 5.005, second line for modern perl, that accept character classes
my $REG_TAG_NAME=$REG_NAME;
# name or wildcard (* or '') (leading # allowed)
my $REG_NAME_W = qq{(?:$REG_NAME|[*])};
# class and ids are deliberately permissive
my $REG_NTOKEN_FIRST_LETTER;
$REG_NTOKEN_FIRST_LETTER= q{(?:[^\W\d]|[:_])}; # < perl 5.6 - does not work for leading non-ascii letters
$REG_NTOKEN_FIRST_LETTER= q{(?:[[:alpha:]:_])}; # >= perl 5.6
my $REG_NTOKEN_LETTER= q{(?:[\w_:.-]*)};
my $REG_NTOKEN= qq{(?:$REG_NTOKEN_FIRST_LETTER$REG_NTOKEN_LETTER*)};
my $REG_CLASS = $REG_NTOKEN;
my $REG_ID = $REG_NTOKEN;
# allow <tag> #<tag> (private elt) * <tag>.<class> *.<class> <tag>#<id> *#<id>
my $REG_TAG_PART= qq{(?:$REG_NAME_W(?:[.]$REG_CLASS|[#]$REG_ID)?|[.]$REG_CLASS)};
my $REG_REGEXP = q{(?:/(?:[^\\/]|\\.)*/[eimsox]*)}; # regexp
my $REG_MATCH = q{[!=]~}; # match (or not)
my $REG_STRING = q{(?:"(?:[^\\"]|\\.)*"|'(?:[^\\']|\\.)*')}; # string (simple or double quoted)
my $REG_NUMBER = q{(?:\d+(?:\.\d*)?|\.\d+)}; # number
my $REG_VALUE = qq{(?:$REG_STRING|$REG_NUMBER)}; # value
my $REG_OP = q{==|!=|>|<|>=|<=|eq|ne|lt|gt|le|ge|=}; # op
my $REG_FUNCTION = q{(?:string|text)\(\s*\)};
my $REG_STRING_ARG = qq{(?:string|text)\\(\\s*$REG_NAME_W\\s*\\)};
my $REG_COMP = q{(?:>=|<=|!=|<|>|=)};
my $REG_TAG_IN_PREDICATE= $REG_NAME_W . q{(?=\s*(?i:and\b|or\b|\]|$))};
# keys in the context stack, chosen not to interfere with att names, even private (#-prefixed) ones
my $ST_TAG = '##tag';
my $ST_ELT = '##elt';
my $ST_NS = '##ns' ;
# used in the handler trigger code
my $REG_NAKED_PREDICATE= qq{((?:"[^"]*"|'[^']*'|$REG_STRING_ARG|$REG_FUNCTION|\@$REG_NAME_W|$REG_MATCH\\s*$REG_REGEXP|[\\s\\d><=!()+.-]|(?i:and)|(?i:or)|$REG_TAG_IN_PREDICATE)*)};
my $REG_PREDICATE= qq{\\[$REG_NAKED_PREDICATE\\]};
# not all axis, only supported ones (in get_xpath)
my @supported_axis= ( 'ancestor', 'ancestor-or-self', 'child', 'descendant', 'descendant-or-self',
'following', 'following-sibling', 'parent', 'preceding', 'preceding-sibling', 'self'
);
my $REG_AXIS = "(?:" . join( '|', @supported_axis) .")";
# only used in the "xpath"engine (for get_xpath/findnodes) for now
my $REG_PREDICATE_ALT = qr{\[(?:(?:string\(\s*\)|\@$REG_TAG_NAME)\s*$REG_MATCH\s*$REG_REGEXP\s*|[^\]]*)\]};
# used to convert XPath tests on strings to the perl equivalent
my %PERL_ALPHA_TEST= ( '=' => ' eq ', '!=' => ' ne ', '>' => ' gt ', '>=' => ' ge ', '<' => ' lt ', '<=' => ' le ');
my( $FB_HTMLCREF, $FB_XMLCREF);
my $NO_WARNINGS= $perl_version >= 5.006 ? 'no warnings' : 'local $^W=0';
# default namespaces, both ways
my %DEFAULT_NS= ( xml => "http://www.w3.org/XML/1998/namespace",
xmlns => "http://www.w3.org/2000/xmlns/",
);
my %DEFAULT_URI2NS= map { $DEFAULT_NS{$_} => $_ } keys %DEFAULT_NS;
# constants
my( $PCDATA, $CDATA, $PI, $COMMENT, $ENT, $ELT, $TEXT, $ASIS, $EMPTY, $BUFSIZE);
# used when an HTML doc only has a PUBLIC declaration, to generate the SYSTEM one
# this should really be done by HTML::TreeBuilder, but as of HTML::TreeBuilder 4.2 it isn't
# the various declarations are taken from http://en.wikipedia.org/wiki/Document_Type_Declaration
my %HTML_DECL= ( "-//W3C//DTD HTML 4.0 Transitional//EN" => "http://www.w3.org/TR/REC-html40/loose.dtd",
"-//W3C//DTD HTML 4.01//EN" => "http://www.w3.org/TR/html4/strict.dtd",
"-//W3C//DTD HTML 4.01 Transitional//EN" => "http://www.w3.org/TR/html4/loose.dtd",
"-//W3C//DTD HTML 4.01 Frameset//EN" => "http://www.w3.org/TR/html4/frameset.dtd",
"-//W3C//DTD XHTML 1.0 Strict//EN" => "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd",
"-//W3C//DTD XHTML 1.0 Transitional//EN" => "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd",
"-//W3C//DTD XHTML 1.0 Frameset//EN" => "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd",
"-//W3C//DTD XHTML 1.1//EN" => "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd",
"-//W3C//DTD XHTML Basic 1.0//EN" => "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd",
"-//W3C//DTD XHTML Basic 1.1//EN" => "http://www.w3.org/TR/xhtml-basic/xhtml-basic11.dtd",
"-//WAPFORUM//DTD XHTML Mobile 1.0//EN" => "http://www.wapforum.org/DTD/xhtml-mobile10.dtd",
"-//WAPFORUM//DTD XHTML Mobile 1.1//EN" => "http://www.openmobilealliance.org/tech/DTD/xhtml-mobile11.dtd",
"-//WAPFORUM//DTD XHTML Mobile 1.2//EN" => "http://www.openmobilealliance.org/tech/DTD/xhtml-mobile12.dtd",
"-//W3C//DTD XHTML+RDFa 1.0//EN" => "http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd",
);
my $DEFAULT_HTML_TYPE= "-//W3C//DTD HTML 4.0 Transitional//EN";
my $SEP= qr/\s*(?:$|\|)/;
BEGIN
{
$VERSION = '3.49';
use XML::Parser;
my $needVersion = '2.23';
($parser_version= $XML::Parser::VERSION)=~ s{_\d+}{}; # remove _<n> from version so numeric tests do not warn
croak "need at least XML::Parser version $needVersion" unless $parser_version >= $needVersion;
($perl_version= $])=~ s{_\d+}{};
if( $perl_version >= 5.008)
{ eval "use Encode qw( :all)"; ## no critic ProhibitStringyEval
$FB_XMLCREF = 0x0400; # Encode::FB_XMLCREF;
$FB_HTMLCREF = 0x0200; # Encode::FB_HTMLCREF;
}
# test whether we can use weak references
# set local empty signal handler to trap error messages
{ local $SIG{__DIE__};
if( eval( 'require Scalar::Util') && defined( \&Scalar::Util::weaken))
{ import Scalar::Util( 'weaken'); $weakrefs= 1; }
elsif( eval( 'require WeakRef'))
{ import WeakRef; $weakrefs= 1; }
else
{ $weakrefs= 0; }
}
import XML::Twig::Elt;
import XML::Twig::Entity;
import XML::Twig::Entity_list;
# used to store the gi's
# should be set for each twig really, at least when there are several
# the init ensures that special gi's are always the same
# constants: element types
$PCDATA = '#PCDATA';
$CDATA = '#CDATA';
$PI = '#PI';
$COMMENT = '#COMMENT';
$ENT = '#ENT';
# element classes
$ELT = '#ELT';
$TEXT = '#TEXT';
# element properties
$ASIS = '#ASIS';
$EMPTY = '#EMPTY';
# used in parseurl to set the buffer size to the same size as in XML::Parser::Expat
$BUFSIZE = 32768;
# gi => index
%XML::Twig::gi2index=( '', 0, $PCDATA => 1, $CDATA => 2, $PI => 3, $COMMENT => 4, $ENT => 5);
# list of gi's
@XML::Twig::index2gi=( '', $PCDATA, $CDATA, $PI, $COMMENT, $ENT);
# gi's under this value are special
$XML::Twig::SPECIAL_GI= @XML::Twig::index2gi;
%XML::Twig::base_ent= ( '>' => '>', '<' => '<', '&' => '&', "'" => ''', '"' => '"',);
foreach my $c ( "\n", "\r", "\t") { $XML::Twig::base_ent{$c}= sprintf( "&#x%02x;", ord( $c)); }
# now set some aliases
*find_nodes = *get_xpath; # same as XML::XPath
*findnodes = *get_xpath; # same as XML::LibXML
*getElementsByTagName = *descendants;
*descendants_or_self = *descendants; # valid in XML::Twig, not in XML::Twig::Elt
*find_by_tag_name = *descendants;
*getElementById = *elt_id;
*getEltById = *elt_id;
*toString = *sprint;
*create_accessors = *att_accessors;
}
@ISA = qw(XML::Parser);
# fake gi's used in twig_handlers and start_tag_handlers
my $ALL = '_all_'; # the associated function is always called
my $DEFAULT= '_default_'; # the function is called if no other handler has been
# some defaults
my $COMMENTS_DEFAULT= 'keep';
my $PI_DEFAULT = 'keep';
# handlers used in regular mode
my %twig_handlers=( Start => \&_twig_start,
End => \&_twig_end,
Char => \&_twig_char,
Entity => \&_twig_entity,
XMLDecl => \&_twig_xmldecl,
Doctype => \&_twig_doctype,
Element => \&_twig_element,
Attlist => \&_twig_attlist,
CdataStart => \&_twig_cdatastart,
CdataEnd => \&_twig_cdataend,
Proc => \&_twig_pi,
Comment => \&_twig_comment,
Default => \&_twig_default,
ExternEnt => \&_twig_extern_ent,
);
# handlers used when twig_roots is used and we are outside of the roots
my %twig_handlers_roots=
( Start => \&_twig_start_check_roots,
End => \&_twig_end_check_roots,
Doctype => \&_twig_doctype,
Char => undef, Entity => undef, XMLDecl => \&_twig_xmldecl,
Element => undef, Attlist => undef, CdataStart => undef,
CdataEnd => undef, Proc => undef, Comment => undef,
Proc => \&_twig_pi_check_roots,
Default => sub {}, # hack needed for XML::Parser 2.27
ExternEnt => \&_twig_extern_ent,
);
# handlers used when twig_roots and print_outside_roots are used and we are
# outside of the roots
my %twig_handlers_roots_print_2_30=
( Start => \&_twig_start_check_roots,
End => \&_twig_end_check_roots,
Char => \&_twig_print,
Entity => \&_twig_print_entity,
ExternEnt => \&_twig_print_entity,
DoctypeFin => \&_twig_doctype_fin_print,
XMLDecl => sub { _twig_xmldecl( @_); _twig_print( @_); },
Doctype => \&_twig_print_doctype, # because recognized_string is broken here
# Element => \&_twig_print, Attlist => \&_twig_print,
CdataStart => \&_twig_print, CdataEnd => \&_twig_print,
Proc => \&_twig_pi_check_roots, Comment => \&_twig_print,
Default => \&_twig_print_check_doctype,
ExternEnt => \&_twig_extern_ent,
);
# handlers used when twig_roots, print_outside_roots and keep_encoding are used
# and we are outside of the roots
my %twig_handlers_roots_print_original_2_30=
( Start => \&_twig_start_check_roots,
End => \&_twig_end_check_roots,
Char => \&_twig_print_original,
# I have no idea why I should not be using this handler!
Entity => \&_twig_print_entity,
ExternEnt => \&_twig_print_entity,
DoctypeFin => \&_twig_doctype_fin_print,
XMLDecl => sub { _twig_xmldecl( @_); _twig_print_original( @_) },
Doctype => \&_twig_print_original_doctype, # because original_string is broken here
Element => \&_twig_print_original, Attlist => \&_twig_print_original,
CdataStart => \&_twig_print_original, CdataEnd => \&_twig_print_original,
Proc => \&_twig_pi_check_roots, Comment => \&_twig_print_original,
Default => \&_twig_print_original_check_doctype,
);
# handlers used when twig_roots and print_outside_roots are used and we are
# outside of the roots
my %twig_handlers_roots_print_2_27=
( Start => \&_twig_start_check_roots,
End => \&_twig_end_check_roots,
Char => \&_twig_print,
# if the Entity handler is set then it prints the entity declaration
# before the entire internal subset (including the declaration!) is output
Entity => sub {},
XMLDecl => \&_twig_print, Doctype => \&_twig_print,
CdataStart => \&_twig_print, CdataEnd => \&_twig_print,
Proc => \&_twig_pi_check_roots, Comment => \&_twig_print,
Default => \&_twig_print,
ExternEnt => \&_twig_extern_ent,
);
# handlers used when twig_roots, print_outside_roots and keep_encoding are used
# and we are outside of the roots
my %twig_handlers_roots_print_original_2_27=
( Start => \&_twig_start_check_roots,
End => \&_twig_end_check_roots,
Char => \&_twig_print_original,
# for some reason original_string is wrong here
# this can be a problem if the doctype includes non ascii characters
XMLDecl => \&_twig_print, Doctype => \&_twig_print,
# if the Entity handler is set then it prints the entity declaration
# before the entire internal subset (including the declaration!) is output
Entity => sub {},
#Element => undef, Attlist => undef,
CdataStart => \&_twig_print_original, CdataEnd => \&_twig_print_original,
Proc => \&_twig_pi_check_roots, Comment => \&_twig_print_original,
Default => \&_twig_print, # _twig_print_original does not work
ExternEnt => \&_twig_extern_ent,
);
my %twig_handlers_roots_print= $parser_version > 2.27
? %twig_handlers_roots_print_2_30
: %twig_handlers_roots_print_2_27;
my %twig_handlers_roots_print_original= $parser_version > 2.27
? %twig_handlers_roots_print_original_2_30
: %twig_handlers_roots_print_original_2_27;
# handlers used when the finish_print method has been called
my %twig_handlers_finish_print=
( Start => \&_twig_print,
End => \&_twig_print, Char => \&_twig_print,
Entity => \&_twig_print, XMLDecl => \&_twig_print,
Doctype => \&_twig_print, Element => \&_twig_print,
Attlist => \&_twig_print, CdataStart => \&_twig_print,
CdataEnd => \&_twig_print, Proc => \&_twig_print,
Comment => \&_twig_print, Default => \&_twig_print,
ExternEnt => \&_twig_extern_ent,
);
# handlers used when the finish_print method has been called and the keep_encoding
# option is used
my %twig_handlers_finish_print_original=
( Start => \&_twig_print_original, End => \&_twig_print_end_original,
Char => \&_twig_print_original, Entity => \&_twig_print_original,
XMLDecl => \&_twig_print_original, Doctype => \&_twig_print_original,
Element => \&_twig_print_original, Attlist => \&_twig_print_original,
CdataStart => \&_twig_print_original, CdataEnd => \&_twig_print_original,
Proc => \&_twig_print_original, Comment => \&_twig_print_original,
Default => \&_twig_print_original,
);
# handlers used within ignored elements
my %twig_handlers_ignore=
( Start => \&_twig_ignore_start,
End => \&_twig_ignore_end,
Char => undef, Entity => undef, XMLDecl => undef,
Doctype => undef, Element => undef, Attlist => undef,
CdataStart => undef, CdataEnd => undef, Proc => undef,
Comment => undef, Default => undef,
ExternEnt => undef,
);
# those handlers are only used if the entities are NOT to be expanded
my %twig_noexpand_handlers= ( ExternEnt => undef, Default => \&_twig_default );
my @saved_default_handler;
my $ID= 'id'; # default value, set by the Id argument
my $css_sel=0; # set through the css_sel option to allow .class selectors in triggers
# all allowed options
%valid_option=
( # XML::Twig options
TwigHandlers => 1, Id => 1,
TwigRoots => 1, TwigPrintOutsideRoots => 1,
StartTagHandlers => 1, EndTagHandlers => 1,
ForceEndTagHandlersUsage => 1,
DoNotChainHandlers => 1,
IgnoreElts => 1,
Index => 1,
AttAccessors => 1,
EltAccessors => 1,
FieldAccessors => 1,
CharHandler => 1,
TopDownHandlers => 1,
KeepEncoding => 1, DoNotEscapeAmpInAtts => 1,
ParseStartTag => 1, KeepAttsOrder => 1,
LoadDTD => 1, DTDHandler => 1, DTDBase => 1,
DoNotOutputDTD => 1, NoProlog => 1,
ExpandExternalEnts => 1,
DiscardSpaces => 1, KeepSpaces => 1, DiscardAllSpaces => 1,
DiscardSpacesIn => 1, KeepSpacesIn => 1,
PrettyPrint => 1, EmptyTags => 1,
EscapeGt => 1,
Quote => 1,
Comments => 1, Pi => 1,
OutputFilter => 1, InputFilter => 1,
OutputTextFilter => 1,
OutputEncoding => 1,
RemoveCdata => 1,
EltClass => 1,
MapXmlns => 1, KeepOriginalPrefix => 1,
SkipMissingEnts => 1,
# XML::Parser options
ErrorContext => 1, ProtocolEncoding => 1,
Namespaces => 1, NoExpand => 1,
Stream_Delimiter => 1, ParseParamEnt => 1,
NoLWP => 1, Non_Expat_Options => 1,
Xmlns => 1, CssSel => 1,
UseTidy => 1, TidyOptions => 1,
OutputHtmlDoctype => 1,
);
my $active_twig; # last active twig,for XML::Twig::s
# predefined input and output filters
use vars qw( %filter);
%filter= ( html => \&html_encode,
safe => \&safe_encode,
safe_hex => \&safe_encode_hex,
);
# trigger types (used to sort them)
my ($LEVEL_TRIGGER, $REGEXP_TRIGGER, $XPATH_TRIGGER)=(1..3);
sub new
{ my ($class, %args) = @_;
my $handlers;
# change all nice_perlish_names into nicePerlishNames
%args= _normalize_args( %args);
# check options
unless( $args{MoreOptions})
{ foreach my $arg (keys %args)
{ carp "invalid option $arg" unless $valid_option{$arg}; }
}
# a twig is really an XML::Parser
# my $self= XML::Parser->new(%args);
my $self;
$self= XML::Parser->new(%args);
bless $self, $class;
$self->{_twig_context_stack}= [];
# allow tag.class selectors in handler triggers
$css_sel= $args{CssSel} || 0;
if( exists $args{TwigHandlers})
{ $handlers= $args{TwigHandlers};
$self->setTwigHandlers( $handlers);
delete $args{TwigHandlers};
}
# take care of twig-specific arguments
if( exists $args{StartTagHandlers})
{ $self->setStartTagHandlers( $args{StartTagHandlers});
delete $args{StartTagHandlers};
}
if( exists $args{DoNotChainHandlers})
{ $self->{twig_do_not_chain_handlers}= $args{DoNotChainHandlers}; }
if( exists $args{IgnoreElts})
{ # change array to hash so you can write ignore_elts => [ qw(foo bar baz)]
if( isa( $args{IgnoreElts}, 'ARRAY')) { $args{IgnoreElts}= { map { $_ => 1 } @{$args{IgnoreElts}} }; }
$self->setIgnoreEltsHandlers( $args{IgnoreElts});
delete $args{IgnoreElts};
}
if( exists $args{Index})
{ my $index= $args{Index};
# we really want a hash name => path, we turn an array into a hash if necessary
if( ref( $index) eq 'ARRAY')
{ my %index= map { $_ => $_ } @$index;
$index= \%index;
}
while( my( $name, $exp)= each %$index)
{ $self->setTwigHandler( $exp, sub { push @{$_[0]->{_twig_index}->{$name}}, $_; 1; }); }
}
$self->{twig_elt_class}= $args{EltClass} || 'XML::Twig::Elt';
if( defined( $args{EltClass}) && $args{EltClass} ne 'XML::Twig::Elt') { $self->{twig_alt_elt_class}=1; }
if( exists( $args{EltClass})) { delete $args{EltClass}; }
if( exists( $args{MapXmlns}))
{ $self->{twig_map_xmlns}= $args{MapXmlns};
$self->{Namespaces}=1;
delete $args{MapXmlns};
}
if( exists( $args{KeepOriginalPrefix}))
{ $self->{twig_keep_original_prefix}= $args{KeepOriginalPrefix};
delete $args{KeepOriginalPrefix};
}
$self->{twig_dtd_handler}= $args{DTDHandler};
delete $args{DTDHandler};
if( $args{ExpandExternalEnts})
{ $self->set_expand_external_entities( 1);
$self->{twig_expand_external_ents}= $args{ExpandExternalEnts};
$self->{twig_read_external_dtd}= 1; # implied by ExpandExternalEnts
if( $args{ExpandExternalEnts} == -1)
{ $self->{twig_extern_ent_nofail}= 1;
$self->setHandlers( ExternEnt => \&_twig_extern_ent_nofail);
}
delete $args{LoadDTD};
delete $args{ExpandExternalEnts};
}
else
{ $self->set_expand_external_entities( 0); }
if( !$args{NoLWP} && ! _use( 'URI') && ! _use( 'URI::File') && ! _use( 'LWP'))
{ $self->{twig_ext_ent_handler}= \&XML::Parser::initial_ext_ent_handler }
else
{ $self->{twig_ext_ent_handler}= \&XML::Parser::file_ext_ent_handler }
if( $args{DoNotEscapeAmpInAtts})
{ $self->set_do_not_escape_amp_in_atts( 1);
$self->{twig_do_not_escape_amp_in_atts}=1;
}
else
{ $self->set_do_not_escape_amp_in_atts( 0);
$self->{twig_do_not_escape_amp_in_atts}=0;
}
# deal with TwigRoots argument, a hash of elements for which
# subtrees will be built (and associated handlers)
if( $args{TwigRoots})
{ $self->setTwigRoots( $args{TwigRoots});
delete $args{TwigRoots};
}
if( $args{EndTagHandlers})
{ unless ($self->{twig_roots} || $args{ForceEndTagHandlersUsage})
{ croak "you should not use EndTagHandlers without TwigRoots\n",
"if you want to use it anyway, normally because you have ",
"a start_tag_handlers that calls 'ignore' and you want to ",
"call an ent_tag_handlers at the end of the element, then ",
"pass 'force_end_tag_handlers_usage => 1' as an argument ",
"to new";
}
$self->setEndTagHandlers( $args{EndTagHandlers});
delete $args{EndTagHandlers};
}
if( $args{TwigPrintOutsideRoots})
{ croak "cannot use twig_print_outside_roots without twig_roots"
unless( $self->{twig_roots});
# if the arg is a filehandle then store it
if( _is_fh( $args{TwigPrintOutsideRoots}) )
{ $self->{twig_output_fh}= $args{TwigPrintOutsideRoots}; }
$self->{twig_default_print}= $args{TwigPrintOutsideRoots};
}
# space policy
if( $args{KeepSpaces})
{ croak "cannot use both keep_spaces and discard_spaces" if( $args{DiscardSpaces});
croak "cannot use both keep_spaces and discard_all_spaces" if( $args{DiscardAllSpaces});
croak "cannot use both keep_spaces and keep_spaces_in" if( $args{KeepSpacesIn});
$self->{twig_keep_spaces}=1;
delete $args{KeepSpaces};
}
if( $args{DiscardSpaces})
{
croak "cannot use both discard_spaces and keep_spaces_in" if( $args{KeepSpacesIn});
croak "cannot use both discard_spaces and discard_all_spaces" if( $args{DiscardAllSpaces});
croak "cannot use both discard_spaces and discard_spaces_in" if( $args{DiscardSpacesIn});
$self->{twig_discard_spaces}=1;
delete $args{DiscardSpaces};
}
if( $args{KeepSpacesIn})
{ croak "cannot use both keep_spaces_in and discard_spaces_in" if( $args{DiscardSpacesIn});
croak "cannot use both keep_spaces_in and discard_all_spaces" if( $args{DiscardAllSpaces});
$self->{twig_discard_spaces}=1;
$self->{twig_keep_spaces_in}={};
my @tags= @{$args{KeepSpacesIn}};
foreach my $tag (@tags) { $self->{twig_keep_spaces_in}->{$tag}=1; }
delete $args{KeepSpacesIn};
}
if( $args{DiscardAllSpaces})
{
croak "cannot use both discard_all_spaces and discard_spaces_in" if( $args{DiscardSpacesIn});
$self->{twig_discard_all_spaces}=1;
delete $args{DiscardAllSpaces};
}
if( $args{DiscardSpacesIn})
{ $self->{twig_keep_spaces}=1;
$self->{twig_discard_spaces_in}={};
my @tags= @{$args{DiscardSpacesIn}};
foreach my $tag (@tags) { $self->{twig_discard_spaces_in}->{$tag}=1; }
delete $args{DiscardSpacesIn};
}
# discard spaces by default
$self->{twig_discard_spaces}= 1 unless( $self->{twig_keep_spaces});
$args{Comments}||= $COMMENTS_DEFAULT;
if( $args{Comments} eq 'drop') { $self->{twig_keep_comments}= 0; }
elsif( $args{Comments} eq 'keep') { $self->{twig_keep_comments}= 1; }
elsif( $args{Comments} eq 'process') { $self->{twig_process_comments}= 1; }
else { croak "wrong value for comments argument: '$args{Comments}' (should be 'drop', 'keep' or 'process')"; }
delete $args{Comments};
$args{Pi}||= $PI_DEFAULT;
if( $args{Pi} eq 'drop') { $self->{twig_keep_pi}= 0; }
elsif( $args{Pi} eq 'keep') { $self->{twig_keep_pi}= 1; }
elsif( $args{Pi} eq 'process') { $self->{twig_process_pi}= 1; }
else { croak "wrong value for pi argument: '$args{Pi}' (should be 'drop', 'keep' or 'process')"; }
delete $args{Pi};
if( $args{KeepEncoding})
{
# set it in XML::Twig::Elt so print functions know what to do
$self->set_keep_encoding( 1);
$self->{parse_start_tag}= $args{ParseStartTag} || \&_parse_start_tag;
delete $args{ParseStartTag} if defined( $args{ParseStartTag}) ;
delete $args{KeepEncoding};
}
else
{ $self->set_keep_encoding( 0);
if( $args{ParseStartTag})
{ $self->{parse_start_tag}= $args{ParseStartTag}; }
else
{ delete $self->{parse_start_tag}; }
delete $args{ParseStartTag};
}
if( $args{OutputFilter})
{ $self->set_output_filter( $args{OutputFilter});
delete $args{OutputFilter};
}
else
{ $self->set_output_filter( 0); }
if( $args{RemoveCdata})
{ $self->set_remove_cdata( $args{RemoveCdata});
delete $args{RemoveCdata};
}
else
{ $self->set_remove_cdata( 0); }
if( $args{OutputTextFilter})
{ $self->set_output_text_filter( $args{OutputTextFilter});
delete $args{OutputTextFilter};
}
else
{ $self->set_output_text_filter( 0); }
if( $args{KeepAttsOrder})
{ $self->{keep_atts_order}= $args{KeepAttsOrder};
if( _use( 'Tie::IxHash'))
{ $self->set_keep_atts_order( $self->{keep_atts_order}); }
else
{ croak "Tie::IxHash not available, option keep_atts_order not allowed"; }
}
else
{ $self->set_keep_atts_order( 0); }
if( $args{PrettyPrint}) { $self->set_pretty_print( $args{PrettyPrint}); }
if( $args{EscapeGt}) { $self->escape_gt( $args{EscapeGt}); }
if( $args{EmptyTags}) { $self->set_empty_tag_style( $args{EmptyTags}) }
if( exists $args{Id}) { $ID= $args{Id}; delete $args{ID}; }
if( $args{NoProlog}) { $self->{no_prolog}= 1; delete $args{NoProlog}; }
if( $args{DoNotOutputDTD}) { $self->{no_dtd_output}= 1; delete $args{DoNotOutputDTD}; }
if( $args{LoadDTD}) { $self->{twig_read_external_dtd}= 1; delete $args{LoadDTD}; }
if( $args{CharHandler}) { $self->setCharHandler( $args{CharHandler}); delete $args{CharHandler}; }
if( $args{InputFilter}) { $self->set_input_filter( $args{InputFilter}); delete $args{InputFilter}; }
if( $args{NoExpand}) { $self->setHandlers( %twig_noexpand_handlers); $self->{twig_no_expand}=1; }
if( my $output_encoding= $args{OutputEncoding}) { $self->set_output_encoding( $output_encoding); delete $args{OutputFilter}; }
if( my $tdh= $args{TopDownHandlers}) { $self->{twig_tdh}=1; delete $args{TopDownHandlers}; }
if( my $acc_a= $args{AttAccessors}) { $self->att_accessors( @$acc_a); }
if( my $acc_e= $args{EltAccessors}) { $self->elt_accessors( isa( $acc_e, 'ARRAY') ? @$acc_e : $acc_e); }
if( my $acc_f= $args{FieldAccessors}) { $self->field_accessors( isa( $acc_f, 'ARRAY') ? @$acc_f : $acc_f); }
if( $args{UseTidy}) { $self->{use_tidy}= 1; }
$self->{tidy_options}= $args{TidyOptions} || {};
if( $args{OutputHtmlDoctype}) { $self->{html_doctype}= 1; }
$self->set_quote( $args{Quote} || 'double');
# set handlers
if( $self->{twig_roots})
{ if( $self->{twig_default_print})
{ if( $self->{twig_keep_encoding})
{ $self->setHandlers( %twig_handlers_roots_print_original); }
else
{ $self->setHandlers( %twig_handlers_roots_print); }
}
else
{ $self->setHandlers( %twig_handlers_roots); }
}
else
{ $self->setHandlers( %twig_handlers); }
# XML::Parser::Expat does not like these handler to be set. So in order to
# use the various sets of handlers on XML::Parser or XML::Parser::Expat
# objects when needed, these ones have to be set only once, here, at
# XML::Parser level
$self->setHandlers( Init => \&_twig_init, Final => \&_twig_final);
$self->{twig_entity_list}= XML::Twig::Entity_list->new;
$self->{twig_id}= $ID;
$self->{twig_stored_spaces}='';
$self->{twig_autoflush}= 1; # auto flush by default
$self->{twig}= $self;
if( $weakrefs) { weaken( $self->{twig}); }
return $self;
}
sub parse
{
my $t= shift;
# if called as a class method, calls nparse, which creates the twig then parses it
if( !ref( $t) || !isa( $t, 'XML::Twig')) { return $t->nparse( @_); }
# requires 5.006 at least (or the ${^UNICODE} causes a problem) # > perl 5.5
# trap underlying bug in IO::Handle (see RT #17500) # > perl 5.5
# croak if perl 5.8+, -CD (or PERL_UNICODE set to D) and parsing a pipe # > perl 5.5
if( $perl_version>=5.008 && ${^UNICODE} && (${^UNICODE} & 24) && isa( $_[0], 'GLOB') && -p $_[0] ) # > perl 5.5
{ croak "cannot parse the output of a pipe when perl is set to use the UTF8 perlIO layer\n" # > perl 5.5
. "set the environment variable PERL_UNICODE or use the -C option (see perldoc perlrun)\n" # > perl 5.5
. "not to include 'D'"; # > perl 5.5
} # > perl 5.5
$t= eval { $t->SUPER::parse( @_); };
if( !$t
&& $@=~m{(syntax error at line 1, column 0, byte 0|not well-formed \(invalid token\) at line 1, column 1, byte 1)}
&& -f $_[0]
&& ( ! ref( $_[0]) || ref( $_[0])) ne 'GLOB' # -f works on a filehandle, so this make sure $_[0] is a real file
)
{ croak "you seem to have used the parse method on a filename ($_[0]), you probably want parsefile instead"; }
return _checked_parse_result( $t, $@);
}
sub parsefile
{ my $t= shift;
if( -f $_[0] && ! -s $_[0]) { return _checked_parse_result( undef, "empty file '$_[0]'"); }
$t= eval { $t->SUPER::parsefile( @_); };
return _checked_parse_result( $t, $@);
}
sub _checked_parse_result
{ my( $t, $returned)= @_;
if( !$t)
{ if( isa( $returned, 'XML::Twig') && $returned->{twig_finish_now})
{ $t= $returned;
delete $t->{twig_finish_now};
return $t->_twig_final;
}
else
{ _croak( $returned, 0); }
}
$active_twig= $t;
return $t;
}
sub active_twig { return $active_twig; }
sub finish_now
{ my $t= shift;
$t->{twig_finish_now}=1;
die $t;
}
sub parsefile_inplace { shift->_parse_inplace( parsefile => @_); }
sub parsefile_html_inplace { shift->_parse_inplace( parsefile_html => @_); }
sub _parse_inplace
{ my( $t, $method, $file, $suffix)= @_;
_use( 'File::Temp') || croak "need File::Temp to use inplace methods\n";
_use( 'File::Basename');
my $tmpdir= dirname( $file);
my( $tmpfh, $tmpfile)= File::Temp::tempfile( DIR => $tmpdir);
my $original_fh= select $tmpfh;
unless( $t->{twig_keep_encoding} || $perl_version < 5.006)
{ if( grep /useperlio=define/, `$^X -V`) # we can only use binmode :utf8 if perl was compiled with useperlio
{ binmode( $tmpfh, ":utf8" ); }
}
$t->$method( $file);
select $original_fh;
close $tmpfh;
my $mode= (stat( $file))[2] & oct(7777);
chmod $mode, $tmpfile or croak "cannot change temp file mode to $mode: $!";
if( $suffix)
{ my $backup;
if( $suffix=~ m{\*}) { ($backup = $suffix) =~ s/\*/$file/g; }
else { $backup= $file . $suffix; }
rename( $file, $backup) or croak "cannot backup initial file ($file) to $backup: $!";
}
rename( $tmpfile, $file) or croak "cannot rename temp file ($tmpfile) to initial file ($file): $!";
return $t;
}
sub parseurl
{ my $t= shift;
$t->_parseurl( 0, @_);
}
sub safe_parseurl
{ my $t= shift;
$t->_parseurl( 1, @_);
}
sub safe_parsefile_html
{ my $t= shift;
eval { $t->parsefile_html( @_); };
return $@ ? $t->_reset_twig_after_error : $t;
}
sub safe_parseurl_html
{ my $t= shift;
_use( 'LWP::Simple') or croak "missing LWP::Simple";
eval { $t->parse_html( LWP::Simple::get( shift()), @_); } ;
return $@ ? $t->_reset_twig_after_error : $t;
}
sub parseurl_html
{ my $t= shift;
_use( 'LWP::Simple') or croak "missing LWP::Simple";
$t->parse_html( LWP::Simple::get( shift()), @_);
}
# uses eval to catch the parser's death
sub safe_parse_html
{ my $t= shift;
eval { $t->parse_html( @_); } ;
return $@ ? $t->_reset_twig_after_error : $t;
}
sub parsefile_html
{ my $t= shift;
my $file= shift;
my $indent= $t->{ErrorContext} ? 1 : 0;
$t->set_empty_tag_style( 'html');
my $html2xml= $t->{use_tidy} ? \&_tidy_html : \&_html2xml;
my $options= $t->{use_tidy} ? $t->{tidy_options} || {} : { indent => $indent, html_doctype => $t->{html_doctype} };
$t->parse( $html2xml->( _slurp( $file), $options), @_);
return $t;
}
sub parse_html
{ my $t= shift;
my $options= ref $_[0] && ref $_[0] eq 'HASH' ? shift() : {};
my $use_tidy= exists $options->{use_tidy} ? $options->{use_tidy} : $t->{use_tidy};
my $content= shift;
my $indent= $t->{ErrorContext} ? 1 : 0;
$t->set_empty_tag_style( 'html');
my $html2xml= $use_tidy ? \&_tidy_html : \&_html2xml;
my $conv_options= $use_tidy ? $t->{tidy_options} || {} : { indent => $indent, html_doctype => $t->{html_doctype} };
$t->parse( $html2xml->( isa( $content, 'GLOB') ? _slurp_fh( $content) : $content, $conv_options), @_);
return $t;
}
sub xparse
{ my $t= shift;
my $to_parse= $_[0];
if( isa( $to_parse, 'GLOB')) { $t->parse( @_); }
elsif( $to_parse=~ m{^\s*<}) { $to_parse=~ m{<html}i ? $t->_parse_as_xml_or_html( @_)
: $t->parse( @_);
}
elsif( $to_parse=~ m{^\w+://.*\.html?$}) { _use( 'LWP::Simple') or croak "missing LWP::Simple";
$t->_parse_as_xml_or_html( LWP::Simple::get( shift()), @_);
}
elsif( $to_parse=~ m{^\w+://}) { _use( 'LWP::Simple') or croak "missing LWP::Simple";
my $doc= LWP::Simple::get( shift);
if( ! defined $doc) { $doc=''; }
my $xml_parse_ok= $t->safe_parse( $doc, @_);
if( $xml_parse_ok)
{ return $xml_parse_ok; }
else
{ my $diag= $@;
if( $doc=~ m{<html}i)
{ $t->parse_html( $doc, @_); }
else
{ croak $diag; }
}
}
elsif( $to_parse=~ m{\.html?$}) { my $content= _slurp( shift);
$t->_parse_as_xml_or_html( $content, @_);
}
else { $t->parsefile( @_); }
}
sub _parse_as_xml_or_html
{ my $t= shift;
if( _is_well_formed_xml( $_[0]))
{ $t->parse( @_) }
else
{ my $html2xml= $t->{use_tidy} ? \&_tidy_html : \&_html2xml;
my $options= $t->{use_tidy} ? $t->{tidy_options} || {} : { indent => 0, html_doctype => $t->{html_doctype} };
my $html= $html2xml->( $_[0], $options, @_);
if( _is_well_formed_xml( $html))
{ $t->parse( $html); }
else
{ croak $@; } # can't really test this because HTML::Parser or HTML::Tidy may change how they deal with bas HTML between versions
}
}
{ my $parser;
sub _is_well_formed_xml
{ $parser ||= XML::Parser->new;
eval { $parser->parse( $_[0]); };
return $@ ? 0 : 1;
}
}
sub nparse
{ my $class= shift;
my $to_parse= pop;
$class->new( @_)->xparse( $to_parse);
}
sub nparse_pp { shift()->nparse( pretty_print => 'indented', @_); }
sub nparse_e { shift()->nparse( error_context => 1, @_); }
sub nparse_ppe { shift()->nparse( pretty_print => 'indented', error_context => 1, @_); }
sub _html2xml
{ my( $html, $options)= @_;
_use( 'HTML::TreeBuilder', '3.13') or croak "cannot parse HTML: missing HTML::TreeBuilder v >= 3.13\n";
my $tree= HTML::TreeBuilder->new;
$tree->ignore_ignorable_whitespace( 0);
$tree->ignore_unknown( 0);
$tree->no_space_compacting( 1);
$tree->store_comments( 1);
$tree->store_pis(1);
$tree->parse( $html);
$tree->eof;
my $xml='';
if( $options->{html_doctype} && exists $tree->{_decl} )
{ my $decl= $tree->{_decl}->as_XML;
# first try to fix declarations that are missing the SYSTEM part
$decl =~ s{^\s*<!DOCTYPE \s+ ((?i)html) \s+ PUBLIC \s+ "([^"]*)" \s* >}
{ my $system= $HTML_DECL{$2} || $HTML_DECL{$DEFAULT_HTML_TYPE};
qq{<!DOCTYPE $1 PUBLIC "$2" "$system">}
}xe;
# then check that the declaration looks OK (so it parses), if not remove it,
# better to parse without the declaration than to die stupidly
if( $decl =~ m{<!DOCTYPE \s+ (?i:HTML) (\s+ PUBLIC \s+ "[^"]*" \s+ (SYSTEM \s+)? "[^"]*")? \s*>}x # PUBLIC then SYSTEM
|| $decl =~ m{<!DOCTYPE \s+ (?i:HTML) \s+ SYSTEM \s+ "[^"]*" \s*>}x # just SYSTEM
)
{ $xml= $decl; }
}
$xml.= _as_XML( $tree);