From 3111842d97bb2ed4e55ea624b90848f87e3597c4 Mon Sep 17 00:00:00 2001 From: Linus De Meyere Date: Sun, 8 Sep 2024 00:20:27 +0200 Subject: [PATCH 1/3] Only consider the first character as the diff symbol. Previously, the first non-space character of each line was interpreted as a diff symbol (one of '>', '+', etc.). This is probably too lax, and can create ambiguous interpretations when the content of the diffed file itself begins with one of those characters. Only the first character of each line should be checked for a diff symbol. Anything else (like a space or any other character) should signal the whole line as plain text. --- lib/makeup/lexers/diff_lexer.ex | 56 +++++++------------------- lib/makeup/lexers/diff_lexer/helper.ex | 33 +++++++++++++++ test/makeup/lexers/diff_lexer_test.exs | 18 ++++----- 3 files changed, 57 insertions(+), 50 deletions(-) create mode 100644 lib/makeup/lexers/diff_lexer/helper.ex diff --git a/lib/makeup/lexers/diff_lexer.ex b/lib/makeup/lexers/diff_lexer.ex index 3ffbd2a..d80e8c1 100644 --- a/lib/makeup/lexers/diff_lexer.ex +++ b/lib/makeup/lexers/diff_lexer.ex @@ -6,56 +6,30 @@ defmodule Makeup.Lexers.DiffLexer do @behaviour Makeup.Lexer import NimbleParsec - import Makeup.Lexer.Combinators + import Makeup.Lexers.DiffLexer.Helper - whitespace = - [?\r, ?\s, ?\n, ?\f] - |> ascii_string(min: 1) - |> token(:whitespace) + heading = line_starting_with(["diff", "index"], :generic_heading) + inserted = line_starting_with(["+", ">"], :generic_inserted) + deleted = line_starting_with(["-", "<"], :generic_deleted) + strong = line_starting_with("!", :generic_strong) - line = utf8_string([{:not, ?\n}, {:not, ?\r}], min: 1) + root_element_combinator = + choice([heading, inserted, deleted, strong, text_line()]) + |> map(:add_meta_diff_language) - inserted = - [string("+"), string(">")] - |> choice() - |> concat(line) - |> token(:generic_inserted) - - deleted = - [string("-"), string("<")] - |> choice() - |> concat(line) - |> token(:generic_deleted) - - strong = - "!" - |> string() - |> concat(line) - |> token(:generic_strong) - - heading = - [string("diff"), string("index")] - |> choice() - |> concat(line) - |> token(:generic_heading) - - text = token(line, :text) - - root_element_combinator = choice([whitespace, heading, inserted, deleted, strong, text]) - - @doc false - def __as_diff_language__({type, meta, value}) do + defp add_meta_diff_language({type, meta, value}) do {type, Map.put(meta, :language, :diff), value} end @impl Makeup.Lexer - defparsec( - :root_element, - root_element_combinator |> map({__MODULE__, :__as_diff_language__, []}) - ) + defparsec(:root_element, root_element_combinator) @impl Makeup.Lexer - defparsec(:root, repeat(parsec(:root_element))) + defparsec( + :root, + repeat(parsec(:root_element) |> concat(newline())) + |> choice([ignore(eos()), parsec(:root_element)]) + ) @impl Makeup.Lexer def postprocess(tokens, _opts \\ []), do: tokens diff --git a/lib/makeup/lexers/diff_lexer/helper.ex b/lib/makeup/lexers/diff_lexer/helper.ex new file mode 100644 index 0000000..27550bb --- /dev/null +++ b/lib/makeup/lexers/diff_lexer/helper.ex @@ -0,0 +1,33 @@ +defmodule Makeup.Lexers.DiffLexer.Helper do + @moduledoc false + + import NimbleParsec + import Makeup.Lexer.Combinators + + def line_starting_with(start, token_type) when is_binary(start) do + string(start) + |> rest_of_line() + |> token(token_type) + end + + def line_starting_with([_, _ | _] = start, token_type) do + List.wrap(start) + |> Enum.map(&string/1) + |> choice() + |> rest_of_line() + |> token(token_type) + end + + def text_line() do + rest_of_line() |> token(:text) + end + + defp rest_of_line(combinator \\ empty()) do + repeat(combinator, utf8_char([{:not, ?\n}, {:not, ?\r}])) + end + + def newline() do + times(utf8_char([?\n, ?\r]), min: 1) + |> token(:whitespace) + end +end diff --git a/test/makeup/lexers/diff_lexer_test.exs b/test/makeup/lexers/diff_lexer_test.exs index aa8dbb3..72c9a88 100644 --- a/test/makeup/lexers/diff_lexer_test.exs +++ b/test/makeup/lexers/diff_lexer_test.exs @@ -72,10 +72,10 @@ defmodule Makeup.Lexers.DiffLexerTest do +++ b/setup @@ -11,16 +11,22 @@ context line unchanged - + inserted - - deleted - > inserted - < deleted + +inserted + -deleted + >inserted + inserted"}, - {:generic_deleted, %{}, "< deleted"} + {:text, %{}, " unchanged"}, + {:generic_inserted, %{}, "+inserted"}, + {:generic_deleted, %{}, "-deleted"}, + {:generic_inserted, %{}, ">inserted"}, + {:generic_deleted, %{}, " Date: Sun, 15 Sep 2024 10:14:34 +0200 Subject: [PATCH 2/3] add unit test to express that marker is expected in first position of each line --- lib/makeup/lexers/diff_lexer.ex | 10 ++----- lib/makeup/lexers/diff_lexer/helper.ex | 9 +++--- test/makeup/lexers/diff_lexer_test.exs | 38 +++++++++++++++++++++----- 3 files changed, 38 insertions(+), 19 deletions(-) diff --git a/lib/makeup/lexers/diff_lexer.ex b/lib/makeup/lexers/diff_lexer.ex index d80e8c1..8127890 100644 --- a/lib/makeup/lexers/diff_lexer.ex +++ b/lib/makeup/lexers/diff_lexer.ex @@ -13,7 +13,7 @@ defmodule Makeup.Lexers.DiffLexer do deleted = line_starting_with(["-", "<"], :generic_deleted) strong = line_starting_with("!", :generic_strong) - root_element_combinator = + line = choice([heading, inserted, deleted, strong, text_line()]) |> map(:add_meta_diff_language) @@ -22,14 +22,10 @@ defmodule Makeup.Lexers.DiffLexer do end @impl Makeup.Lexer - defparsec(:root_element, root_element_combinator) + defparsec(:root_element, line |> optional(newline())) @impl Makeup.Lexer - defparsec( - :root, - repeat(parsec(:root_element) |> concat(newline())) - |> choice([ignore(eos()), parsec(:root_element)]) - ) + defparsec(:root, repeat(line |> newline()) |> choice([eos(), line])) @impl Makeup.Lexer def postprocess(tokens, _opts \\ []), do: tokens diff --git a/lib/makeup/lexers/diff_lexer/helper.ex b/lib/makeup/lexers/diff_lexer/helper.ex index 27550bb..0e6aadb 100644 --- a/lib/makeup/lexers/diff_lexer/helper.ex +++ b/lib/makeup/lexers/diff_lexer/helper.ex @@ -11,7 +11,7 @@ defmodule Makeup.Lexers.DiffLexer.Helper do end def line_starting_with([_, _ | _] = start, token_type) do - List.wrap(start) + start |> Enum.map(&string/1) |> choice() |> rest_of_line() @@ -23,11 +23,10 @@ defmodule Makeup.Lexers.DiffLexer.Helper do end defp rest_of_line(combinator \\ empty()) do - repeat(combinator, utf8_char([{:not, ?\n}, {:not, ?\r}])) + utf8_string(combinator, [not: 10, not: 13], min: 0) end - def newline() do - times(utf8_char([?\n, ?\r]), min: 1) - |> token(:whitespace) + def newline(combinator \\ empty()) do + concat(combinator, ascii_string([?\n, ?\r], min: 1) |> token(:whitespace)) end end diff --git a/test/makeup/lexers/diff_lexer_test.exs b/test/makeup/lexers/diff_lexer_test.exs index 72c9a88..cb59f6f 100644 --- a/test/makeup/lexers/diff_lexer_test.exs +++ b/test/makeup/lexers/diff_lexer_test.exs @@ -29,13 +29,13 @@ defmodule Makeup.Lexers.DiffLexerTest do end end - property "lexting a string with an insertion" do + property "lexing a string with an insertion" do check all text <- inserted() do assert [{:generic_inserted, %{}, ^text} | _] = lex(text) end end - property "lexting a string with a deletion" do + property "lexing a string with a deletion" do check all text <- deleted() do assert [{:generic_deleted, %{}, ^text}] = lex(text) end @@ -78,10 +78,7 @@ defmodule Makeup.Lexers.DiffLexerTest do lex() - |> Enum.reject(fn {type, _, _} -> type == :whitespace end) + lexed = lex(text, omit_whitespaces: true) assert [ {:generic_heading, %{}, "diff --git a/setup"}, @@ -96,13 +93,40 @@ defmodule Makeup.Lexers.DiffLexerTest do {:generic_deleted, %{}, "deleted + + """ + + lexed = lex(text, omit_whitespaces: true) + + assert [ + {:text, %{}, " +text"}, + {:text, %{}, " -text"}, + {:generic_inserted, %{}, "+-inserted"}, + {:generic_deleted, %{}, "<>deleted"}, + {:text, %{}, " "} + ] = lexed + end end - defp lex(text) do + defp lex(text, opts \\ []) do text |> DiffLexer.lex(group_prefix: "group") |> Postprocess.token_values_to_binaries() |> Enum.map(fn {type, meta, value} -> {type, Map.delete(meta, :language), value} end) + |> then(fn tokens -> + if Keyword.get(opts, :omit_whitespaces, false) do + Enum.reject(tokens, fn {type, _, _} -> type == :whitespace end) + else + tokens + end + end) end # Properties From ee1083d790dc5a7981a85b454fe8cc33c30ed9cb Mon Sep 17 00:00:00 2001 From: Linus De Meyere Date: Sun, 15 Sep 2024 10:57:35 +0200 Subject: [PATCH 3/3] revert to character literals for \n and \r --- lib/makeup/lexers/diff_lexer/helper.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/makeup/lexers/diff_lexer/helper.ex b/lib/makeup/lexers/diff_lexer/helper.ex index 0e6aadb..6fb6451 100644 --- a/lib/makeup/lexers/diff_lexer/helper.ex +++ b/lib/makeup/lexers/diff_lexer/helper.ex @@ -23,7 +23,7 @@ defmodule Makeup.Lexers.DiffLexer.Helper do end defp rest_of_line(combinator \\ empty()) do - utf8_string(combinator, [not: 10, not: 13], min: 0) + utf8_string(combinator, [not: ?\n, not: ?\r], min: 0) end def newline(combinator \\ empty()) do