external_libs/editscript.h

// Copyright 2018 The diff-match-patch Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// This code (especially GetBisectSplitPoints) derive in part from
// diff_match_patch.cpp under the Apache 2.0 license and attribution:
//
// Author: fraser@google.com (Neil Fraser)
// Author: mikeslemmer@gmail.com (Mike Slemmer)
//
// Diff Match and Patch
// https://github.com/google/diff-match-patch

// Returns the minimal number of edit operations (copy, delete, insert)
// needed to tranform one container of tokens into another (cf. `diff -d -e`).
// Requires random-access iterators of token streams.

#ifndef EDITSCRIPT_H_
#define EDITSCRIPT_H_

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <map>
#include <tuple>
#include <utility>
#include <vector>

namespace diff {

enum class Operation { EQUALS, DELETE, INSERT };

/**
 * The data structure of edit operations to transform tokens1 into tokens2.
 * Indices for EQUALS and DELETE point into tokens1, INSERT into tokens2.
 * Concatenating EQUALS and DELETE tokens will return tokens1.
 * Concatenating EQUALS and INSERT tokens will return tokens2.
 * [start, end) is a semiopen interval.
 */
struct Edit {
  Operation operation;  // One of: EQUALS, DELETE, or INSERT.
  int64_t start;        // Start offset into tokens1 (tokens2 for INSERT).
  int64_t end;          // End   offset into tokens1 (tokens2 for INSERT).
  bool operator==(const Edit &rhs) const {
    return (std::tie(operation, start, end) ==
            std::tie(rhs.operation, rhs.start, rhs.end));
  }
  bool operator<(const Edit &rhs) const {
    return (std::tie(operation, start, end) <
            std::tie(rhs.operation, rhs.start, rhs.end));
  }
};
using Edits = std::vector<Edit>;

/**
 * Finds the differences between two vectors of tokens, returning edits
 * required to transform tokens1 into tokens2.
 * Every token in the combined document belongs to exactly one edit.
 *
 * For example, for tokens1 = "Hello world." and tokens2 = "Goodbye world.":
 * {Edit(Operation::DELETE, 0, 5),   // token1[0,5)   == "Hello"
 *  Edit(Operation::INSERT, 0, 7),   // token2[0,7)   == "Goodbye"
 *  Edit(Operation::EQUALS, 5, 12)}  // token1[5, 12) == " world."
 *
 * @param tokens1_begin Beginning index into tokens1.
 * @param tokens1_end   Ending index into tokens1.
 * @param tokens2_begin Beginning index into tokens2.
 * @param tokens2_end   Ending index into tokens2.
 * @return Cumulative edits to transform tokens1 into tokens2.
 */
template <typename TokenIter>
Edits GetTokenDiffs(TokenIter tokens1_begin, TokenIter tokens1_end,
                    TokenIter tokens2_begin, TokenIter tokens2_end);

//////////////////////////////////////////////////////////////////////////////
// IMPLEMENTATION
//////////////////////////////////////////////////////////////////////////////

namespace diff_impl {
/**
 * Appends an edit operation to the cumulative edits.
 * Fuses with previous edit if possible.
 * @param op    Edit operation.
 * @param start Start offset into tokens1 (tokens2 for INSERT).
 * @param end   End offset into tokens1 (tokens2 for INSERT).
 * @param edits Cumulative vector of edits (inout).
 */
inline void AppendEdit(Operation op, int64_t start, int64_t end, Edits *edits) {
  if (!edits->empty() && edits->back().operation == op &&
      edits->back().end == start) {
    // Merge into previous edit operation.
    edits->back().end = end;
  } else {
    // Add a new edit operation.
    edits->emplace_back(Edit{op, start, end});
  }
}

/**
 * Inserts an edit operation into the cumulative edits at given index.
 * Fuses with neighboring edit if possible.
 * @param index Index of insertion point.
 * @param op    Edit operation.
 * @param start Start offset into tokens1 (tokens2 for INSERT).
 * @param end   End offset into tokens1 (tokens2 for INSERT).
 * @param edits Cumulative vector of edits (inout).
 */
inline void InsertEditAt(int64_t index, Operation op, int64_t start,
                         int64_t end, Edits *edits) {
  if (index > 0) {
    Edit &prev_edit = (*edits)[index - 1];
    if (prev_edit.operation == op && prev_edit.end == start) {
      // Merge into previous edit operation.
      prev_edit.end = end;
      return;
    }
  }
  if (index < static_cast<int64_t>(edits->size())) {
    Edit &next_edit = (*edits)[index];
    if (next_edit.operation == op && next_edit.end == start) {
      // Merge into subsequent edit operation.
      next_edit.end = end;
      return;
    }
  }

  // Cannot merge into existing? Insert a new edit operation.
  edits->emplace(edits->begin() + index, Edit{op, start, end});
}

/**
 * Reverse direction of iterator (without moving its location).
 * Returned iter will now point to one elem to left of input iter.
 * @param iter Original bidirectional or random iterator.
 */
template <typename TokenIter>
inline std::reverse_iterator<TokenIter> Reverse(TokenIter iter) {
  return std::reverse_iterator<TokenIter>(iter);
}

/**
 * Determine the common affix of two tokens.
 * For common prefix, pass normal iterators.
 * For common suffix, pass reverse iterators.
 * @param span1_begin Beginning iterator into tokens1.
 * @param span1_end   Ending iterator into tokens1.
 * @param span2_begin Beginning iterator into tokens2.
 * @param span2_end   Ending iterator into tokens2.
 * @return The number of tokens common to the start of spans.
 */
template <typename TokenIter>
inline int64_t CommonAffix(TokenIter span1_begin, TokenIter span1_end,
                           TokenIter span2_begin, TokenIter span2_end) {
  // std::mismatch wants the shorter container first.
  if (span1_end - span1_begin > span2_end - span2_begin) {
    using std::swap;
    swap(span1_begin, span2_begin);
    swap(span1_end, span2_end);
  }

  auto affix_ends = std::mismatch(span1_begin, span1_end, span2_begin);
  const int64_t affix_length = affix_ends.first - span1_begin;
  return affix_length;
}

/**
 * Finds the differences between two vectors of tokens.
 * Invoke using GetTokenDiffs.
 */
template <typename TokenIter>
class Diff {
 private:
  friend Edits GetTokenDiffs<>(TokenIter tokens1_begin, TokenIter tokens1_end,
                               TokenIter tokens2_begin, TokenIter tokens2_end);

  /**
   * Finds the differences between two vectors of tokens, returning edits
   * required to transform tokens1 into tokens2.
   * Every token in the combined document belongs to exactly one edit.
   * @param tokens1_begin Iterator pointing to start of tokens1.
   * @param tokens2_begin Iterator pointing to start of tokens2.
   * @param edits Cumulative edits to transform tokens1 into tokens2 (inout).
   */
  Diff(TokenIter tokens1_begin, TokenIter tokens2_begin)
      : tokens1_begin_(tokens1_begin), tokens2_begin_(tokens2_begin) {}

  /**
   * Find the differences between two vectors of tokens.
   * @param b1 Beginning index into tokens1.
   * @param e1 Ending index into tokens1.
   * @param b2 Beginning index into tokens2.
   * @param e2 Ending index into tokens2.
   * @param edits Cumulative edits to transform tokens1 into tokens2 (inout).
   * @param edits Vector of Edit objects
   */
  void Generate(int64_t b1, int64_t e1, int64_t b2, int64_t e2,
                Edits *edits) const {
    // Avoid growing stack through recursion more than necessary.
    int64_t prefix_size = 0;
    int64_t suffix_size = 0;
    int64_t edits_size = 0;
    {
      // The diff strategy is to recursively:
      // - Peel off common prefix and suffix
      // - Split what's left in two parts
      // - Generate diffs for each and combine.
      // The diffs could be done in parallel (with a fresh edits) and
      // later combined if speed up is needed.
      auto span1_begin = tokens1_begin_ + b1;
      auto span2_begin = tokens2_begin_ + b2;
      auto span1_end = tokens1_begin_ + e1;
      auto span2_end = tokens2_begin_ + e2;

      // Check for equality (speedup).
      if ((e1 - b1) == (e2 - b2) &&
          std::equal(span1_begin, span1_end, span2_begin)) {
        if (span1_begin != span1_end) {
          AppendEdit(Operation::EQUALS, b1, e1, edits);
        }
        return;
      }

      // Find longest common prefix and suffix.
      prefix_size = CommonAffix(span1_begin, span1_end, span2_begin, span2_end);
      suffix_size =
          CommonAffix(Reverse(span1_end), Reverse(span1_begin + prefix_size),
                      Reverse(span2_end), Reverse(span2_begin + prefix_size));

      // Remember the current location so we can insert a common prefix.
      edits_size = edits->size();
    }

    // Compute the diff on the middle block.
    Compute(b1 + prefix_size, e1 - suffix_size, b2 + prefix_size,
            e2 - suffix_size, edits);

    // Restore the prefix and suffix.
    if (prefix_size != 0) {
      InsertEditAt(edits_size, Operation::EQUALS, b1, b1 + prefix_size, edits);
    }
    if (suffix_size != 0) {
      AppendEdit(Operation::EQUALS, e1 - suffix_size, e1, edits);
    }
  }

  /**
   * Find the differences between two vectors of tokens.
   * @pre Tokens are not equal nor share a common prefix or suffix.
   * @param b1 Beginning index into tokens1.
   * @param e1 Ending index into tokens1.
   * @param b2 Beginning index into tokens2.
   * @param e2 Ending index into tokens2.
   * @param edits Cumulative edits to transform tokens1 into tokens2 (inout).
   */
  void Compute(int64_t b1, int64_t e1, int64_t b2, int64_t e2,
               Edits *edits) const {
    // Try various speedups first.
    // Enclose in a scope to save stack space before recursing.
    {
      const int64_t length1 = e1 - b1;
      const int64_t length2 = e2 - b2;
      if (length1 == 0 && length2 != 0) {
        // tokens1 empty, but tokens2 isn't: Just insert tokens2
        AppendEdit(Operation::INSERT, b2, e2, edits);
        return;
      }
      if (length2 == 0 && length1 != 0) {
        // tokens2 empty, but tokens1 isn't: Just delete tokens1
        AppendEdit(Operation::DELETE, b1, e1, edits);
        return;
      }
      if (length1 > length2) {
        const int64_t offset =
            std::search(tokens1_begin_ + b1, tokens1_begin_ + e1,
                        tokens2_begin_ + b2, tokens2_begin_ + e2) -
            tokens1_begin_;
        if (offset != e1) {
          // tokens2 is a proper substring of tokens1: delete the rest.
          const int64_t offset_end = offset + length2;
          AppendEdit(Operation::DELETE, b1, offset, edits);
          AppendEdit(Operation::EQUALS, offset, offset_end, edits);
          AppendEdit(Operation::DELETE, offset_end, e1, edits);
          return;
        }
        if (length2 == 1) {
          // Single-token span.
          // After the previous speedup, the operation can't be EQUALS.
          AppendEdit(Operation::DELETE, b1, e1, edits);
          AppendEdit(Operation::INSERT, b2, e2, edits);
          return;
        }
      } else if (length2 > length1) {
        const int64_t offset =
            std::search(tokens2_begin_ + b2, tokens2_begin_ + e2,
                        tokens1_begin_ + b1, tokens1_begin_ + e1) -
            tokens2_begin_;
        if (offset != e2) {
          // tokens1 is a proper substring of tokens2: insert the rest.
          AppendEdit(Operation::INSERT, b2, offset, edits);
          AppendEdit(Operation::EQUALS, b1, e1, edits);  // Index into tokens1!
          AppendEdit(Operation::INSERT, offset + length1, e2, edits);
          return;
        }
        if (length1 == 1) {
          // Single-token span.
          // After the previous speedup, the operation can't be EQUALS.
          AppendEdit(Operation::DELETE, b1, e1, edits);
          AppendEdit(Operation::INSERT, b2, e2, edits);
          return;
        }
      }
    }

    // No speedups apply? Bisect and diff each half, then combine results.
    Bisect(b1, e1, b2, e2, edits);
  }

  /**
   * Find the 'middle snake' of a diff, returning split points.
   * See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
   * @param b1    Beginning index into tokens1.
   * @param e1    Ending index into tokens1.
   * @param b2    Beginning index into tokens2.
   * @param e2    Ending index into tokens2.
   * @return Index of split points in tokens1 and tokens2, respectively.
   *     Returns {-1, -1} if there is no need to bisect.
   */
  std::pair<int64_t, int64_t> GetBisectSplitPoints(int64_t b1, int64_t e1,
                                                   int64_t b2,
                                                   int64_t e2) const {
    std::pair<int64_t, int64_t> splits;
    int64_t &x1 = splits.first;
    int64_t &y1 = splits.second;

    const int64_t length1 = e1 - b1;
    const int64_t length2 = e2 - b2;
    const int64_t max_d = (length1 + length2 + 1) / 2;
    const int64_t v_offset = max_d;
    const int64_t v_size = 2 * max_d;
    const int64_t w_size = 2 * v_size;
    std::vector<int64_t> w(w_size + 4,
                           -1);  // interleave for cache friendliness

    w[2 * v_offset + 2] = 0;
    w[2 * v_offset + 3] = 0;
    const int64_t delta = length1 - length2;

    // If the total number of tokens is odd, then the front path will
    // collide with the reverse path.
    const bool front = (delta % 2 != 0);

    // Offsets for start and end of k loop.
    // Prevents mapping of space beyond the grid.
    int64_t k1start = 0;
    int64_t k1end = 0;
    int64_t k2start = 0;
    int64_t k2end = 0;
    for (int64_t d = 0; d < max_d; d++) {
      // Walk the front path one step.
      for (int64_t k1 = -d + k1start; k1 <= d - k1end; k1 += 2) {
        const int64_t k1_offset = v_offset + k1;
        x1 = 0;
        if (k1 == -d ||
            (k1 != d && w[2 * k1_offset - 2] < w[2 * k1_offset + 2])) {
          x1 = w[2 * k1_offset + 2];
        } else {
          x1 = w[2 * k1_offset - 2] + 1;
        }
        y1 = x1 - k1;
        while (x1 < length1 && y1 < length2 &&
               *(tokens1_begin_ + (b1 + x1)) == *(tokens2_begin_ + (b2 + y1))) {
          x1++;
          y1++;
        }
        w[2 * k1_offset] = x1;
        if (x1 > length1) {
          // Ran off the right of the graph.
          k1end += 2;
        } else if (y1 > length2) {
          // Ran off the bottom of the graph.
          k1start += 2;
        } else if (front) {
          int64_t k2_offset = v_offset + delta - k1;
          if (k2_offset >= 0 && k2_offset < v_size &&
              w[2 * k2_offset + 1] != -1) {
            // Mirror x2 onto top-left coordinate system.
            int64_t x2 = length1 - w[2 * k2_offset + 1];
            if (x1 >= x2) {
              // Overlap detected.
              return splits;
            }
          }
        }
      }

      // Walk the reverse path one step.
      for (int64_t k2 = -d + k2start; k2 <= d - k2end; k2 += 2) {
        const int64_t k2_offset = v_offset + k2;
        int64_t x2;
        if (k2 == -d ||
            (k2 != d && w[2 * k2_offset - 1] < w[2 * k2_offset + 3])) {
          x2 = w[2 * k2_offset + 3];
        } else {
          x2 = w[2 * k2_offset - 1] + 1;
        }
        int64_t y2 = x2 - k2;
        while (x2 < length1 && y2 < length2 &&
               *(tokens1_begin_ + (b1 + length1 - x2 - 1)) ==
                   *(tokens2_begin_ + (b2 + length2 - y2 - 1))) {
          x2++;
          y2++;
        }
        w[2 * k2_offset + 1] = x2;
        if (x2 > length1) {
          // Ran off the left of the graph.
          k2end += 2;
        } else if (y2 > length2) {
          // Ran off the top of the graph.
          k2start += 2;
        } else if (!front) {
          int64_t k1_offset = v_offset + delta - k2;
          if (k1_offset >= 0 && k1_offset < v_size && w[2 * k1_offset] != -1) {
            x1 = w[2 * k1_offset];
            y1 = v_offset + x1 - k1_offset;

            // Mirror x2 onto top-left coordinate system.
            x2 = length1 - x2;
            if (x1 >= x2) {
              // Overlap detected.
              return splits;
            }
          }
        }
      }
    }
    x1 = -1;
    y1 = -1;
    return splits;
  }

  /**
   * Bisect and recurse if necessary.
   * @param b1    Beginning index into tokens1.
   * @param e1    Ending index into tokens1.
   * @param b2    Beginning index into tokens2.
   * @param e2    Ending index into tokens2.
   * @param edits Cumulative edits to transform tokens1 into tokens2 (inout).
   */
  void Bisect(int64_t b1, int64_t e1, int64_t b2, int64_t e2,
              Edits *edits) const {
    int64_t x1, x2;
    std::tie(x1, x2) = GetBisectSplitPoints(b1, e1, b2, e2);
    if (x1 >= 0) {
      // Some commonality, so bisect and recurse.
      Generate(b1, b1 + x1, b2, b2 + x2, edits);
      Generate(b1 + x1, e1, b2 + x2, e2, edits);
    } else {
      // No commonality at all (number of edits equals number of tokens),
      // so just delete the old and insert the new.
      AppendEdit(Operation::DELETE, b1, e1, edits);
      AppendEdit(Operation::INSERT, b2, e2, edits);
    }
  }

 private:
  TokenIter tokens1_begin_;
  TokenIter tokens2_begin_;
};  // class Diff
}  // namespace diff_impl

template <typename TokenIter>
inline Edits GetTokenDiffs(TokenIter tokens1_begin, TokenIter tokens1_end,
                           TokenIter tokens2_begin, TokenIter tokens2_end) {
  Edits token_edits;
  diff_impl::Diff<TokenIter>(tokens1_begin, tokens2_begin)
      .Generate(0, std::distance(tokens1_begin, tokens1_end), 0,
                std::distance(tokens2_begin, tokens2_end), &token_edits);
  return token_edits;  // efficient: uses named return value optimization
}

}  // namespace diff

#endif  // EDITSCRIPT_H_