From 0abf997e75bd3a8b76d920d33cc64d5e6c2d380f Mon Sep 17 00:00:00 2001 From: pulkin Date: Sun, 19 May 2024 23:46:37 +0200 Subject: [PATCH] gh-119105: difflib: improve recursion for degenerate cases (#119131) Code from https://github.com/pulkin, in PR https://github.com/python/cpython/pull/119131 Greatly speeds `Differ` when there are many identically scoring pairs, by splitting the recursion near the inputs' midpoints instead of degenerating (as now) into just peeling off the first two lines. Co-authored-by: Tim Peters --- Lib/difflib.py | 24 +++++++++++++++---- ...-05-19-12-25-36.gh-issue-119105.VcR4ig.rst | 1 + 2 files changed, 20 insertions(+), 5 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst diff --git a/Lib/difflib.py b/Lib/difflib.py index ba0b256969ebff..54ca33d5615f8d 100644 --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -911,16 +911,26 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi): # don't synch up unless the lines have a similarity score of at # least cutoff; best_ratio tracks the best score seen so far - best_ratio, cutoff = 0.74, 0.75 + # best_ratio is a tuple storing the best .ratio() seen so far, and + # a measure of how far the indices are from their index range + # midpoints. The latter is used to resolve ratio ties. Favoring + # indices near the midpoints tends to cut the ranges in half. Else, + # if there are many pairs with the best ratio, recursion can grow + # very deep, and runtime becomes cubic. See: + # https://github.com/python/cpython/issues/119105 + best_ratio, cutoff = (0.74, 0), 0.75 cruncher = SequenceMatcher(self.charjunk) eqi, eqj = None, None # 1st indices of equal lines (if any) # search for the pair that matches best without being identical # (identical lines must be junk lines, & we don't want to synch up # on junk -- unless we have to) + amid = (alo + ahi - 1) / 2 + bmid = (blo + bhi - 1) / 2 for j in range(blo, bhi): bj = b[j] cruncher.set_seq2(bj) + weight_j = - abs(j - bmid) for i in range(alo, ahi): ai = a[i] if ai == bj: @@ -928,16 +938,20 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi): eqi, eqj = i, j continue cruncher.set_seq1(ai) + # weight is used to balance the recursion by prioritizing + # i and j in the middle of their ranges + weight = weight_j - abs(i - amid) # computing similarity is expensive, so use the quick # upper bounds first -- have seen this speed up messy # compares by a factor of 3. # note that ratio() is only expensive to compute the first # time it's called on a sequence pair; the expensive part # of the computation is cached by cruncher - if cruncher.real_quick_ratio() > best_ratio and \ - cruncher.quick_ratio() > best_ratio and \ - cruncher.ratio() > best_ratio: - best_ratio, best_i, best_j = cruncher.ratio(), i, j + if (cruncher.real_quick_ratio(), weight) > best_ratio and \ + (cruncher.quick_ratio(), weight) > best_ratio and \ + (cruncher.ratio(), weight) > best_ratio: + best_ratio, best_i, best_j = (cruncher.ratio(), weight), i, j + best_ratio, _ = best_ratio if best_ratio < cutoff: # no non-identical "pretty close" pair if eqi is None: diff --git a/Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst b/Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst new file mode 100644 index 00000000000000..30b5f97b8059f9 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst @@ -0,0 +1 @@ +``difflib.Differ`` is much faster for some cases of diffs where many pairs of lines are equally similar.