From 5cc39544e28ff6b7c8dcfe08ad329a5841df15cc Mon Sep 17 00:00:00 2001 From: Joseph Kempster Date: Mon, 9 Oct 2023 14:38:24 +0100 Subject: [PATCH] Parse legislative list as part of the main conversion Kramdown handles abbreviations and footnotes natively. Call to action and legislative list are custom Govspeak components built as extensions to Kramdown. These sections are parsed separately in preprocessing, they are ignored in any subsequent parsing because Kramdown ignores HTML. Because of this, we have added custom handling of abbreviations and footnotes specifically for these components [1]. This custom implementation means that abbreviations defined anywhere in the document will be applied to call to action and legislative list components. Otherwise, only abbreviations defined within these components would be applied. This custom implementation has been shown to contain bugs through several Zendesk tickets. The main issues reported are: 1. Acronyms are inserted into the produced HTML, even if undesirable. For example, a link `` with the acronym `*[email]: Electronic mail is a method of transmitting and receiving messages` would produce `

href="mailto:email@example.com"email@example.com

` rather than the expected link. 2. Because the `add_acronym_alt_text` method runs through the text for each acronym, acronyms can be inserted into other acronyms creating invalid HTML. If we change tack, and instead allow Kramdown to parse these components as part of the main conversion to HTML (instead of in preprocessing), Kramdown will handle abbreviations and footnotes correctly, fixing both of these issues (as well as some other undocumented issues). The legislative list component currently works by overriding Kramdowns `list` extension [2] and disabling ordered lists [3]. So that we can parse this component as part of the main conversion, this applies two options: `parse_block_html` [4], so that markdown inside the `legislative-list-wrapper` `div` is parsed by Kramdown, and a new custom option of `ordered_lists_disabled`, so that we can control flow inside our `Parser::Govuk` class (subclass of the Kramdown parser). It is possible to override `parse_list` instead of `parse_block_html`, but this means that the outermost list will not be parsed (by the time `parse_list` is called the outermost list is already being parsed) The only difference between this new iteration of the legislative list and the previous iteration, is that we now wrap the component in a `legislative-list-wrapper` `div`. We could remove this in postprocessing, but this requires additional modification of the Kramdown produced HTML which seems unnecessary. This also allows us to remove all remaining code which replicates the footnote and acronym functions of Kramdown. Some of the tests can likely be removed as we are now testing the functionality of the library, however, for now these represent that there are no regressions. [1]: https://github.com/alphagov/govspeak/pull/285 [2]: https://github.com/gettalong/kramdown/blob/bd678ecb59f70778fdb3b08bdcd39e2ab7379b45/lib/kramdown/parser/kramdown/list.rb#L54 [3]: https://github.com/alphagov/govspeak/pull/25 [4]: https://kramdown.gettalong.org/quickref.html#html-elements --- lib/govspeak.rb | 80 +------ lib/govspeak/post_processor.rb | 9 + lib/kramdown/parser/govuk.rb | 19 +- test/govspeak_test.rb | 414 ++++++++++++++++++--------------- 4 files changed, 258 insertions(+), 264 deletions(-) diff --git a/lib/govspeak.rb b/lib/govspeak.rb index f6e113f3..982facb1 100644 --- a/lib/govspeak.rb +++ b/lib/govspeak.rb @@ -63,8 +63,6 @@ def initialize(source, options = {}) sanitize: true, syntax_highlighter: nil }.merge(options) @options[:entity_output] = :symbolic - @footnote_definition_html = nil - @acronyms = [] end def to_html @@ -76,16 +74,6 @@ def to_html kramdown_doc.to_html end - unless @footnote_definition_html.nil? - regex = /
]/m - - if html.scan(regex).empty? - html << @footnote_definition_html - else - html.gsub!(regex, @footnote_definition_html) - end - end - Govspeak::PostProcessor.process(html, self) end end @@ -125,8 +113,6 @@ def preprocess(source) source = Govspeak::BlockquoteExtraQuoteRemover.remove(source) source = remove_forbidden_characters(source) - footnote_definitions(source) - self.class.extensions.each do |_, regexp, block| source.gsub!(regexp) do instance_exec(*Regexp.last_match.captures, &block) @@ -135,36 +121,6 @@ def preprocess(source) source end - def footnote_definitions(source) - is_legislative_list = source.scan(/\$LegislativeList.*?\[\^\d\]*.*?\$EndLegislativeList/m).size.positive? - footnotes = source.scan(/^\s*\[\^(\d+)\]:(.*)/) - @acronyms.concat(source.scan(/(?<=\*)\[(.*)\]:(.*)/)) - if is_legislative_list && footnotes.size.positive? - list_items = footnotes.map do |footnote| - number = footnote[0] - text = footnote[1].strip - footnote_definition = Govspeak::Document.new(text).to_html[/(?<=

).*(?=<\/p>)/] - footnote_definition = add_acronym_alt_text(footnote_definition) - - <<~HTML_SNIPPET -

  • -

    - #{footnote_definition} -

    -
  • - HTML_SNIPPET - end - - @footnote_definition_html = <<~HTML_CONTAINER -
    -
      - #{list_items.join.strip} -
    -
    - HTML_CONTAINER - end - end - def remove_forbidden_characters(source) # These are characters that are not deemed not suitable for # markup: https://www.w3.org/TR/unicode-xml/#Charlist @@ -344,25 +300,13 @@ def render_image(image) end extension("legislative list", /#{NEW_PARAGRAPH_LOOKBEHIND}\$LegislativeList\s*$(.*?)\$EndLegislativeList/m) do |body| - Govspeak::KramdownOverrides.with_kramdown_ordered_lists_disabled do - body = add_acronym_alt_text(body.strip) - - Kramdown::Document.new(body.strip).to_html.tap do |doc| - doc.gsub!("
      ", "
        ") - doc.gsub!("
    ", "") - doc.sub!("
      ", '
        ') - - footnotes = body.scan(/\[\^(\d+)\]/).flatten - - footnotes.each do |footnote| - html = "" \ - "" \ - "[footnote #{footnote}]" - - doc.sub!(/(\[\^#{footnote}\])/, html) - end - end - end + # The surrounding div is neccessary to control flow in `parse_block_html` and + # maintain the same functionality as a previous version of this extension. + <<~BODY + {::options parse_block_html=\"true\" ordered_lists_disabled=\"true\" /} +
        #{body}
        + {::options parse_block_html=\"false\" ordered_lists_disabled=\"false\" /} + BODY end extension("numbered list", /^[ \t]*((s\d+\.\s.*(?:\n|$))+)/) do |body| @@ -439,16 +383,6 @@ def kramdown_doc def encode(text) HTMLEntities.new.encode(text) end - - def add_acronym_alt_text(html) - return unless html - - @acronyms.each do |acronym| - html.gsub!(acronym[0], "#{acronym[0]}") - end - - html - end end end diff --git a/lib/govspeak/post_processor.rb b/lib/govspeak/post_processor.rb index d752ee82..2e477d9c 100644 --- a/lib/govspeak/post_processor.rb +++ b/lib/govspeak/post_processor.rb @@ -20,6 +20,15 @@ def self.extension(title, &block) end end + extension("covert legislative list ul to ol") do |document| + document.css(".legislative-list-wrapper").map do |el| + el.inner_html = el.inner_html + .sub("
          ", "
            ") + .gsub("
        ", "
      ") + .gsub("
        ", "
          ") + end + end + # This "fix" here is tied into the rendering of images as one of the # pre-processor tasks. As images can be created inside block level elements # it's possible that their block level elements can be HTML entity escaped diff --git a/lib/kramdown/parser/govuk.rb b/lib/kramdown/parser/govuk.rb index 70a19b20..0a572075 100644 --- a/lib/kramdown/parser/govuk.rb +++ b/lib/kramdown/parser/govuk.rb @@ -17,6 +17,13 @@ def ==(_other) DESCRIPTION simple_array_validator(val, :document_domains, AlwaysEqual.new) end + + define(:ordered_lists_disabled, Boolean, false, <<~DESCRIPTION) + Disables ordered lists + + Default: false + Used by: KramdownWithAutomaticExternalLinks + DESCRIPTION end module Parser @@ -46,7 +53,17 @@ def add_link(element, href, title, alt_text = nil, ial = nil) def parse_block_html return false if CUSTOM_INLINE_ELEMENTS.include?(@src[1].downcase) - super + return super unless @options[:ordered_lists_disabled] + + # Kramdown loads parsers into the instance from `options` which are scoped to + # the class. Because we are changing these options inside an instance, we must + # reconfigure the parser before and after disbaling ordered lists. + Govspeak::KramdownOverrides.with_kramdown_ordered_lists_disabled do + configure_parser + super + end + + configure_parser end end end diff --git a/test/govspeak_test.rb b/test/govspeak_test.rb index 47476124..7accde6c 100644 --- a/test/govspeak_test.rb +++ b/test/govspeak_test.rb @@ -753,13 +753,14 @@ class GovspeakTest < Minitest::Test $EndLegislativeList " do assert_html_output %{ +
          1. 1.0 Lorem ipsum dolor sit amet, consectetur adipiscing elit. - Fusce felis ante, lobortis non quam sit amet, tempus interdum justo.

            + Fusce felis ante, lobortis non quam sit amet, tempus interdum justo.

            Pellentesque quam enim, egestas sit amet congue sit amet, ultrices vitae arcu. - fringilla, metus dui scelerisque est.

            + fringilla, metus dui scelerisque est.

            1. @@ -772,10 +773,11 @@ class GovspeakTest < Minitest::Test
            2. 1.1 Second entry - Curabitur pretium pharetra sapien, a feugiat arcu euismod eget. - Nunc luctus ornare varius. Nulla scelerisque, justo dictum dapibus

              + Curabitur pretium pharetra sapien, a feugiat arcu euismod eget. + Nunc luctus ornare varius. Nulla scelerisque, justo dictum dapibus

            3. -
            } +
          +
          } end test_given_govspeak " @@ -788,6 +790,7 @@ class GovspeakTest < Minitest::Test $EndLegislativeList " do assert_html_output %{ +
          1. 1. The quick
          2. 2. Brown fox @@ -798,6 +801,7 @@ class GovspeakTest < Minitest::Test
          3. 3. Dog
          +
          } end @@ -812,28 +816,26 @@ class GovspeakTest < Minitest::Test [^2]: Footnote definition two " do assert_html_output %( +
          1. 1. Item 1[footnote 1] -
          2. +
          3. 2. Item 2[footnote 2] -
          4. +
          5. 3. Item 3
          +
          -
          -
            -
          1. -

            - Footnote definition one -

            -
          2. -
          3. -

            - Footnote definition two -

            -
          4. -
          -
          +
          +
            +
          1. +

            Footnote definition one 

            +
          2. +
          3. +

            Footnote definition two 

            +
          4. +
          +
          ) end @@ -857,41 +859,39 @@ class GovspeakTest < Minitest::Test [^3]: Footnote definition two " do assert_html_output %( +
          1. 1. Item 1[footnote 1] -
          2. +
          3. 2. Item 2
          4. 3. Item 3
          +
          -

          This is a paragraph with a footnote[footnote 2].

          +

          This is a paragraph with a footnote[footnote 2].

          +
          1. 1. Item 1
          2. 2. Item 2[footnote 3] -
          3. +
          4. 3. Item 3
          +
          -
          -
            -
          1. -

            - Footnote definition one -

            -
          2. -
          3. -

            - Footnote definition two -

            -
          4. -
          5. -

            - Footnote definition two -

            -
          6. -
          -
          +
          +
            +
          1. +

            Footnote definition one 

            +
          2. +
          3. +

            Footnote definition two 

            +
          4. +
          5. +

            Footnote definition two 

            +
          6. +
          +
          ) end @@ -934,101 +934,83 @@ class GovspeakTest < Minitest::Test [^12]: Footnote definition 12 " do assert_html_output %( -
            -
          1. 1. Item 1[footnote 1] +
            +
              +
            1. 1. Item 1[footnote 1]
            2. -
            3. 2. Item 2[footnote 2] +
            4. 2. Item 2[footnote 2]
            5. -
            6. 3. Item 3[footnote 3] +
            7. 3. Item 3[footnote 3]
            8. -
            +
          +

    This is a paragraph with a footnote[footnote 4].

    -
      -
    1. 1. Item 1[footnote 5] +
      +
        +
      1. 1. Item 1[footnote 5]
      2. -
      3. 2. Item 2[footnote 6] +
      4. 2. Item 2[footnote 6]
      5. -
      6. 3. Item 3[footnote 7] +
      7. 3. Item 3[footnote 7]
      8. -
      +
    +

    This is a paragraph with a footnote[footnote 8].

    -
      -
    1. 1. Item 1[footnote 9] +
      +
        +
      1. 1. Item 1[footnote 9]
      2. -
      3. 2. Item 2[footnote 10] +
      4. 2. Item 2[footnote 10]
      5. -
      6. 3. Item 3[footnote 11] +
      7. 3. Item 3[footnote 11]
      8. -
      +
    +

    This is a paragraph with a footnote[footnote 12].

    1. -

      - Footnote definition 1 -

      -
    2. -
    3. -

      - Footnote definition 2 -

      -
    4. -
    5. -

      - Footnote definition 3 -

      -
    6. -
    7. -

      - Footnote definition 4 -

      -
    8. -
    9. -

      - Footnote definition 5 -

      -
    10. -
    11. -

      - Footnote definition 6 -

      -
    12. -
    13. -

      - Footnote definition 7 -

      -
    14. -
    15. -

      - Footnote definition 8 -

      -
    16. -
    17. -

      - Footnote definition 9 -

      -
    18. -
    19. -

      - Footnote definition 10 -

      -
    20. -
    21. -

      - Footnote definition 11 -

      -
    22. -
    23. -

      - Footnote definition 12 -

      -
    24. +

      Footnote definition 1 

      + +
    25. +

      Footnote definition 2 

      +
    26. +
    27. +

      Footnote definition 3 

      +
    28. +
    29. +

      Footnote definition 4 

      +
    30. +
    31. +

      Footnote definition 5 

      +
    32. +
    33. +

      Footnote definition 6 

      +
    34. +
    35. +

      Footnote definition 7 

      +
    36. +
    37. +

      Footnote definition 8 

      +
    38. +
    39. +

      Footnote definition 9 

      +
    40. +
    41. +

      Footnote definition 10 

      +
    42. +
    43. +

      Footnote definition 11 

      +
    44. +
    45. +

      Footnote definition 12 

      +
    ) @@ -1047,29 +1029,27 @@ class GovspeakTest < Minitest::Test [^2]: Footnote definition two " do assert_html_output %( +
    1. 1. Item 1[footnote 1] with a link -
    2. +
    3. 2. Item 2
    4. 3. Item 3
    +
    -

    This is a paragraph with a footnote[footnote 2]

    +

    This is a paragraph with a footnote[footnote 2]

    -
    -
      -
    1. -

      - Footnote definition one -

      -
    2. -
    3. -

      - Footnote definition two -

      -
    4. -
    -
    +
    +
      +
    1. +

      Footnote definition one 

      +
    2. +
    3. +

      Footnote definition two 

      +
    4. +
    +
    ) end @@ -1084,28 +1064,26 @@ class GovspeakTest < Minitest::Test [^2]: Footnote definition two with an external [link](http://www.google.com) " do assert_html_output %( +
    1. 1. Item 1[footnote 1] with a link -
    2. +
    3. 2. Item 2
    4. 3. Item 3[footnote 2] -
    5. +
    +
    -
    -
      -
    1. -

      - Footnote definition one with a link included -

      -
    2. -
    3. -

      - Footnote definition two with an external link -

      -
    4. -
    -
    +
    +
      +
    1. +

      Footnote definition one with a link included 

      +
    2. +
    3. +

      Footnote definition two with an external link 

      +
    4. +
    +
    ) end @@ -1116,17 +1094,17 @@ class GovspeakTest < Minitest::Test [^1]: footnote text " do assert_html_output %( -

    1. some text[footnote 1]:

    - -
    -
      -
    1. -

      - footnote text -

      -
    2. -
    -
    +
    +

    1. some text[footnote 1]:

    +
    + +
    +
      +
    1. +

      footnote text 

      +
    2. +
    +
    ) end @@ -1137,15 +1115,15 @@ class GovspeakTest < Minitest::Test [^1]: footnote text " do assert_html_output %( -

    1. some text[footnote 1]: extra

    +
    +

    1. some text[footnote 1]: extra

    +
    1. -

      - footnote text -

      -
    2. +

      footnote text 

      +
    ) @@ -1166,32 +1144,28 @@ class GovspeakTest < Minitest::Test *[class]: Testing HTML matching " do assert_html_output %( -
      -
    1. 1. Item 1[footnote 1] with an ACRONYM +
      +
        +
      1. 1. Item 1[footnote 1] with an ACRONYM
      2. -
      3. 2. Item 2[footnote 2] +
      4. 2. Item 2[footnote 2]
      5. -
      6. 3. Item 3[footnote 3] +
      7. 3. Item 3[footnote 3]
      8. -
      +
    +
    1. -

      - Footnote definition one -

      -
    2. -
    3. -

      - Footnote definition two with an ACRONYM -

      -
    4. -
    5. -

      - Footnote definition three with an acronym that matches an HTML tag class -

      -
    6. +

      Footnote definition one 

      + +
    7. +

      Footnote definition two with an ACRONYM 

      +
    8. +
    9. +

      Footnote definition three with an acronym that matches an HTML tag class 

      +
    ) @@ -1218,9 +1192,11 @@ class GovspeakTest < Minitest::Test assert_html_output %(

    The quick brown fox

    -
      -
    1. 1. jumps over the lazy dog
    2. -
    +
    +
      +
    1. 1. jumps over the lazy dog
    2. +
    +
    ) end @@ -1228,9 +1204,67 @@ class GovspeakTest < Minitest::Test assert_html_output %(

    This bit of text

    -
      -
    1. 1. should be turned into a list
    2. -
    +
    +
      +
    1. 1. should be turned into a list
    2. +
    +
    + ) + end + + test_given_govspeak " + $LegislativeList + Welcome to the GOV.UK website + $EndLegislativeList + + *[GOV.UK]: The official UK government website + *[website]: A collection of web pages, such as GOV.UK + " do + assert_html_output %( +
    +

    Welcome to the GOV.UK website

    +
    + ) + end + + test_given_govspeak " + $LegislativeList + Please email + $EndLegislativeList + + *[GOV.UK]: The official UK government website + " do + assert_html_output %( + + ) + end + + test_given_govspeak " + $LegislativeList + Welcome to the GOV.UK[^1] + $EndLegislativeList + + [^1]: GOV.UK is the official UK government website + + *[GOV.UK]: The official UK government website + *[website]: A collection of web pages, such as GOV.UK + + *[GOV.UK]: The official UK government website + " do + assert_html_output %( +
    +

    Welcome to the GOV.UK[footnote 1]

    +
    + +
    +
      +
    1. +

      GOV.UK is the official UK government website 

      +
    2. +
    +
    ) end