From 86d039dbd1c76794b80fc5c983805d76b895253a Mon Sep 17 00:00:00 2001 From: Alexey Nikitin Date: Fri, 19 Nov 2021 12:54:49 +0300 Subject: [PATCH] Correct error handling (#86) From github/tank-bohr * Introduces custom exceptions New API is - SweetXml.stream_tags!/2,3 - SweetXml.stream!/2 * Fixes CR comments --- lib/sweet_xml.ex | 101 ++++++++++++++++++++++++++++++++- lib/sweet_xml/options.ex | 30 ++++++---- test/files/invalid.xml | 44 ++++++++++++++ test/sweet_xml_stream_test.exs | 46 +++++++++++++++ 4 files changed, 210 insertions(+), 11 deletions(-) create mode 100644 test/files/invalid.xml diff --git a/lib/sweet_xml.ex b/lib/sweet_xml.ex index b4c2c1b..122d0f5 100644 --- a/lib/sweet_xml.ex +++ b/lib/sweet_xml.ex @@ -17,6 +17,20 @@ defmodule SweetXpath do namespaces: [] end +defmodule SweetXml.XmerlFatal do + defexception [:message, :reason, :file, :line, :col] + + @impl Exception + def exception({reason, {:file, file}, {:line, line}, {:col ,col}}) do + %__MODULE__{reason: reason, file: file, line: line, col: col, message: inspect(reason)} + end +end + +defmodule SweetXml.DTDError do + defexception [:message] +end + + defmodule SweetXml do @moduledoc ~S""" `SweetXml` is a thin wrapper around `:xmerl`. 
@doc """
Streams the elements named in `tags` out of `doc` through `stream!/2`.

`tags` is either a single tag name (an atom) or a list of tag names; an
empty list matches every element. Each matching element is emitted as a
`{name, element}` tuple, where `element` is the `:xmlElement` record that
was handed to the parser hook.

## Options

  * `:discard` - list of tag names. Elements with one of these names are
    not added to the accumulator returned to the parser.

Every other option is appended, untouched, to the option list handed back
to `stream!/2`. Because parsing goes through `stream!/2`, failures surface
as the exceptions raised there (for example `SweetXml.XmerlFatal`).
"""
def stream_tags!(doc, tags, options \\ []) do
  tags = if is_atom(tags), do: [tags], else: tags

  # `:discard` is our own option; strip it before the rest goes to the parser.
  {discard_tags, xmerl_options} =
    case :proplists.lookup(:discard, options) do
      {:discard, discard} -> {discard, :proplists.delete(:discard, options)}
      :none -> {[], options}
    end

  # Decided once here instead of walking the list with `length/1` on every
  # element the parser reports.
  match_all? = tags == []

  stream!(doc, fn emit ->
    [
      hook_fun: fn
        entity, xstate when Record.is_record(entity, :xmlElement) ->
          name = xmlElement(entity, :name)
          if match_all? or name in tags, do: emit.({name, entity})
          {entity, xstate}

        entity, xstate ->
          {entity, xstate}
      end,
      acc_fun: fn
        entity, acc, xstate when Record.is_record(entity, :xmlElement) ->
          if xmlElement(entity, :name) in discard_tags do
            # Discarded elements are dropped from the accumulator.
            {acc, xstate}
          else
            {[entity | acc], xstate}
          end

        entity, acc, xstate ->
          {[entity | acc], xstate}
      end
    ] ++ xmerl_options
  end)
end
@doc """
Builds a stream of parser events from the XML `doc`, raising on failure.

`doc` may be a binary, a charlist, or an enumerable of chunks. A binary is
wrapped in a one-element list; a charlist is first converted to a binary
and then wrapped the same way.

`options_callback` receives an `emit` function and must return the list of
`:xmerl_scan` options. Whatever the parser hooks pass to `emit` becomes an
element of the resulting stream. A `:dtd` entry in the returned options
(default `:all`) is removed and translated by
`SweetXml.Options.handle_dtd/2`, using `SweetXml.DTDError` as the exception
module.

The scan runs in a separate monitored process. When that process stops:

  * `:normal` ends the stream;
  * `{:fatal, error}` raises `SweetXml.XmerlFatal`;
  * `{exception, stacktrace}` is re-raised in the consuming process;
  * any other reason is propagated with `exit/1`.

The ETS table created for the run is deleted in every one of these cases,
and also when the consumer stops early.
"""
def stream!(doc, options_callback) when is_binary(doc) do
  stream!([doc], options_callback)
end

def stream!([c | _] = doc, options_callback) when is_integer(c) do
  # Must stay on the bang variant: delegating to `stream/2` here would make
  # charlist input skip the raising error handling below.
  stream!([IO.iodata_to_binary(doc)], options_callback)
end

def stream!(doc, options_callback) do
  Stream.resource(
    fn ->
      {parent, ref} = waiter = {self(), make_ref()}
      opts = options_callback.(fn e -> send(parent, {:event, ref, e}) end)

      ets = :ets.new(nil, [:public])
      dtd_arg = :proplists.get_value(:dtd, opts, :all)
      opts = :proplists.delete(:dtd, opts)
      opts = SweetXml.Options.handle_dtd(dtd_arg, SweetXml.DTDError).(ets) ++ opts

      # The scanner starts on an empty string; the actual input is supplied
      # through the options built by `continuation_opts/2`.
      {pid, monref} =
        spawn_monitor(fn ->
          :xmerl_scan.string(~c"", opts ++ continuation_opts(doc, waiter))
        end)

      {ref, pid, monref, ets}
    end,
    fn {ref, pid, monref, ets} = acc ->
      receive do
        {:DOWN, ^monref, :process, ^pid, :normal} ->
          {:halt, {:parse_ended, ets}}

        {:DOWN, ^monref, :process, ^pid, {:fatal, error}} ->
          {:halt, {:fatal, error, ets}}

        {:DOWN, ^monref, :process, ^pid, error} ->
          {:halt, {:error, error, ets}}

        {:event, ^ref, event} ->
          {[event], acc}

        {:wait, ^ref} ->
          send(pid, {:continue, ref})
          {[], acc}
      end
    end,
    fn
      {:parse_ended, ets} ->
        _ = :ets.delete(ets)
        :ok

      {:fatal, error, ets} ->
        _ = :ets.delete(ets)
        raise SweetXml.XmerlFatal, error

      {:error, {exception, stacktrace}, ets} when is_list(stacktrace) ->
        _ = :ets.delete(ets)
        reraise(exception, stacktrace)

      {:error, reason, ets} ->
        # Exit reasons that are not `{exception, stacktrace}` used to fall
        # through every clause, leaking the table behind a
        # FunctionClauseError. Clean up and pass the reason on instead.
        _ = :ets.delete(ets)
        exit(reason)

      {ref, pid, monref, ets} ->
        # The consumer halted before the parser process finished.
        Process.demonitor(monref)
        _ = :ets.delete(ets)
        flush_halt(pid, ref)
    end
  )
end
diff --git a/lib/sweet_xml/options.ex b/lib/sweet_xml/options.ex index 434bfd0..02776f4 100644 --- a/lib/sweet_xml/options.ex +++ b/lib/sweet_xml/options.ex @@ -1,23 +1,32 @@ defmodule SweetXml.Options do @moduledoc false - def handle_dtd(:all) do + def handle_dtd(dtd_option, exception_module \\ RuntimeError) + + def handle_dtd(:all, _exception_module) do fn _ -> [] end end - def handle_dtd(:none) do + def handle_dtd(:none, exception_module) do fn ets -> - handle_dtd(:internal_only).(ets) ++ handle_dtd(only: []).(ets) + handle_dtd(:internal_only, exception_module).(ets) ++ handle_dtd([only: []], exception_module).(ets) end end - def handle_dtd(:internal_only) do - fn _ -> - [fetch_fun: fn _, _ -> {:error, "no external entity allowed"} end] + def handle_dtd(:internal_only, exception_module) do + case exception_module do + SweetXml.DTDError -> + fn _ -> + [fetch_fun: fn _, _ -> raise SweetXml.DTDError, message: "no external entity allowed" end] + end + _ -> + fn _ -> + [fetch_fun: fn _, _ -> {:error, "no external entity allowed"} end] + end end end - def handle_dtd(only: entity) when is_atom(entity) do - handle_dtd(only: [entity]) + def handle_dtd([only: entity], exception_module) when is_atom(entity) do + handle_dtd([only: [entity]], exception_module) end - def handle_dtd(only: entities) when is_list(entities) do + def handle_dtd([only: entities], exception_module) when is_list(entities) do fn ets -> read = fn context, name, state -> @@ -37,7 +46,8 @@ defmodule SweetXml.Options do [] -> :ets.insert(ets, {{context, name}, value}) _ -> :ok end - false -> raise("DTD not allowed: #{name}") + false -> + raise exception_module, message: "DTD not allowed: #{name}" end state diff --git a/test/files/invalid.xml b/test/files/invalid.xml new file mode 100644 index 0000000..91a16b1 --- /dev/null +++ b/test/files/invalid.xml @@ -0,0 +1,44 @@ + + + + + Match One + + + 1 + Team One + + + 2 + Team Two + + + + + Match Two + + + 2 + Team Two + + + 3 + Team Three + + + + + Match Three 
+ + + 1 + Team One + + + 3 + Team & Three + + + + + diff --git a/test/sweet_xml_stream_test.exs b/test/sweet_xml_stream_test.exs index 4d616da..ce8751c 100644 --- a/test/sweet_xml_stream_test.exs +++ b/test/sweet_xml_stream_test.exs @@ -70,4 +70,50 @@ defmodule SweetXmlStreamTest do assert result == ['Nested Head', 'XML Parsing'] end + describe "stream_tags!/2" do + test "streaming tags", %{simple_stream: simple_stream} do + result = + simple_stream + |> stream_tags([:li, :special_match_key], discard: [:li, :special_match_key]) + |> Stream.map(fn {_, doc} -> xpath(doc, ~x"./text()") end) + |> Enum.to_list + + assert result == ['\n First', 'Second\n ', 'Third', 'Forth', 'first star'] + + result = + simple_stream + |> stream_tags(:head) + |> Stream.map(fn {_, doc} -> xpath(doc, ~x"./title/text()") end) + |> Enum.to_list + + assert result == ['Nested Head', 'XML Parsing'] + end + + test "invalid xml" do + assert_raise SweetXml.XmerlFatal, ":error_scanning_entity_ref", fn -> + "test/files/invalid.xml" + |> File.stream!() + |> SweetXml.stream_tags!(:matchup, quiet: true) + |> Stream.run() + end + end + + test "DTD error" do + assert_raise SweetXml.DTDError, "DTD not allowed: lol1", fn -> + "test/files/billion_laugh.xml" + |> File.stream!() + |> SweetXml.stream_tags!(:banana, dtd: :none, quiet: true) + |> Stream.run() + end + end + + test "internal only" do + assert_raise SweetXml.DTDError, "no external entity allowed", fn -> + "test/files/xxe.xml" + |> File.stream!() + |> SweetXml.stream_tags!(:result, dtd: :internal_only) + |> Stream.run + end + end + end end