Apply dist_thresh to all search backends

This commit introduces a distance threshold mechanism to the Genius backend and unifies its implementation across the rest of backends that perform searching and matching artists and titles. - Create a new `SearchBackend` base class with a method `check_match` that performs checking. - Start using undocumented `dist_thresh` configuration option for good, and mention it in the docs. This controls the maximum allowable distance for matching artist and title names. These changes aim to improve the accuracy of lyrics matching, especially when there are slight variations in artist or title names, see #4791.
beetbox · Oct 8, 2024 · 52745dc · 52745dc
1 parent 03f1205
commit 52745dc
Show file tree

Hide file tree

Showing 4 changed files with 109 additions and 61 deletions.
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
@@ -14,7 +14,6 @@
 
 """Fetches, embeds, and displays lyrics."""
 
-import difflib
 import errno
 import itertools
 import json
@@ -24,6 +23,7 @@
 import unicodedata
 import urllib
 import warnings
+from functools import cached_property, partial
 
 import requests
 from unidecode import unidecode
@@ -356,15 +356,43 @@ def fetch(self, artist, title, album=None, length=None):
         return lyrics
 
 
-class Genius(Backend):
+class SearchBackend(Backend):
+    REQUIRES_BS = True
+
+    @cached_property
+    def dist_thresh(self) -> float:
+        return self.config["dist_thresh"].get(float)
+
+    def check_match(
+        self, target_artist: str, target_title: str, artist: str, title: str
+    ) -> bool:
+        """Check if the given artist and title are 'good enough' match."""
+        max_dist = max(
+            string_dist(target_artist, artist),
+            string_dist(target_title, title),
+        )
+        if round(max_dist, 2) <= self.dist_thresh:
+            return True
+
+        self._log.warning(
+            "({}, {}) pair does not match ({}, {}), dist {:.2f} > expected {:.2f}",
+            artist,
+            title,
+            target_artist,
+            target_title,
+            max_dist,
+            self.dist_thresh,
+        )
+        return False
+
+
+class Genius(SearchBackend):
     """Fetch lyrics from Genius via genius-api.
 
     Simply adapted from
     bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
     """
 
-    REQUIRES_BS = True
-
     base_url = "https://api.genius.com"
 
     def __init__(self, config, log):
@@ -388,11 +416,11 @@ def fetch(self, artist, title, album=None, length=None):
             return None
 
         # find a matching artist in the json
+        check = partial(self.check_match, artist, title)
         for hit in json["response"]["hits"]:
-            hit_artist = hit["result"]["primary_artist"]["name"]
-
-            if slug(hit_artist) == slug(artist):
-                html = self.fetch_url(hit["result"]["url"])
+            result = hit["result"]
+            if check(result["primary_artist"]["name"], result["title"]):
+                html = self.fetch_url(result["url"])
                 if not html:
                     return None
                 return self._scrape_lyrics_from_html(html)
@@ -494,10 +522,8 @@ def _try_extracting_lyrics_from_non_data_lyrics_container(self, soup):
         return lyrics_div.get_text()
 
 
-class Tekstowo(Backend):
+class Tekstowo(SearchBackend):
     # Fetch lyrics from Tekstowo.pl.
-    REQUIRES_BS = True
-
     BASE_URL = "http://www.tekstowo.pl"
     URL_PATTERN = BASE_URL + "/wyszukaj.html?search-title=%s&search-artist=%s"
 
@@ -563,14 +589,9 @@ def extract_lyrics(self, html, artist, title):
         if not info_elements:
             return None
 
-        html_title = info_elements[-1].get_text()
         html_artist = info_elements[-2].get_text()
-
-        title_dist = string_dist(html_title, title)
-        artist_dist = string_dist(html_artist, artist)
-
-        thresh = self.config["dist_thresh"].get(float)
-        if title_dist > thresh or artist_dist > thresh:
+        html_title = info_elements[-1].get_text()
+        if not self.check_match(artist, title, html_artist, html_title):
             return None
 
         lyrics_div = soup.select("div.song-text > div.inner-text")
@@ -649,11 +670,9 @@ def is_text_notcode(text):
         return None
 
 
-class Google(Backend):
+class Google(SearchBackend):
     """Fetch lyrics from Google search results."""
 
-    REQUIRES_BS = True
-
     def __init__(self, config, log):
         super().__init__(config, log)
         self.api_key = config["google_API_key"].as_str()
@@ -704,20 +723,21 @@ def slugify(self, text):
     BY_TRANS = ["by", "par", "de", "von"]
     LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
 
-    def is_page_candidate(self, url_link, url_title, title, artist):
+    def is_page_candidate(
+        self, artist: str, title: str, url_link: str, url_title: str
+    ) -> bool:
         """Return True if the URL title makes it a good candidate to be a
         page that contains lyrics of title by artist.
         """
-        title = self.slugify(title.lower())
+        title_slug = self.slugify(title.lower())
+        url_title_slug = self.slugify(url_title.lower())
+        if title_slug in url_title_slug:
+            return True
+
         artist = self.slugify(artist.lower())
         sitename = re.search(
             "//([^/]+)/.*", self.slugify(url_link.lower())
         ).group(1)
-        url_title = self.slugify(url_title.lower())
-
-        # Check if URL title contains song title (exact match)
-        if url_title.find(title) != -1:
-            return True
 
         # or try extracting song title from URL title and check if
         # they are close enough
@@ -727,12 +747,9 @@ def is_page_candidate(self, url_link, url_title, title, artist):
             + self.LYRICS_TRANS
         )
         tokens = [re.escape(t) for t in tokens]
-        song_title = re.sub("(%s)" % "|".join(tokens), "", url_title)
+        song_title = re.sub("(%s)" % "|".join(tokens), "", url_title_slug)
 
-        song_title = song_title.strip("_|")
-        typo_ratio = 0.9
-        ratio = difflib.SequenceMatcher(None, song_title, title).ratio()
-        return ratio >= typo_ratio
+        return self.check_match(artist, title_slug, artist, song_title)
 
     def fetch(self, artist, title, album=None, length=None):
         query = f"{artist} {title}"
@@ -755,24 +772,21 @@ def fetch(self, artist, title, album=None, length=None):
             self._log.debug("google backend error: {0}", reason)
             return None
 
-        if "items" in data.keys():
-            for item in data["items"]:
-                url_link = item["link"]
-                url_title = item.get("title", "")
-                if not self.is_page_candidate(
-                    url_link, url_title, title, artist
-                ):
-                    continue
-                html = self.fetch_url(url_link)
-                if not html:
-                    continue
-                lyrics = scrape_lyrics_from_html(html)
-                if not lyrics:
-                    continue
-
-                if self.is_lyrics(lyrics, artist):
-                    self._log.debug("got lyrics from {0}", item["displayLink"])
-                    return lyrics
+        check_candidate = partial(self.is_page_candidate, artist, title)
+        for item in data["items"]:
+            url_link = item["link"]
+            if not check_candidate(url_link, item.get("title", "")):
+                continue
+            html = self.fetch_url(url_link)
+            if not html:
+                continue
+            lyrics = scrape_lyrics_from_html(html)
+            if not lyrics:
+                continue
+
+            if self.is_lyrics(lyrics, artist):
+                self._log.debug("got lyrics from {0}", item["displayLink"])
+                return lyrics
 
         return None
 
@@ -796,6 +810,7 @@ def __init__(self):
                 "bing_client_secret": None,
                 "bing_lang_from": [],
                 "bing_lang_to": None,
+                "dist_thresh": 0.15,
                 "google_API_key": None,
                 "google_engine_ID": "009217259823014548361:lndtuqkycfu",
                 "genius_api_key": "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W"
@@ -807,7 +822,6 @@ def __init__(self):
                 # Musixmatch is disabled by default as they are currently blocking
                 # requests with the beets user agent.
                 "sources": [s for s in self.SOURCES if s != "musixmatch"],
-                "dist_thresh": 0.1,
             }
         )
         self.config["bing_client_secret"].redact = True

diff --git a/docs/changelog.rst b/docs/changelog.rst
@@ -44,6 +44,9 @@ Bug fixes:
 * :doc:`plugins/discogs`: Fix the ``TypeError`` when there is no description.
 * Remove single quotes from all SQL queries
   :bug:`4709`
+* :doc:`plugins/lyrics`: Fix the issue with ``genius`` backend not being able
+  to match lyrics when there was a slight variation in the artist name.
+  :bug:`4791`
 
 For packagers:
 

diff --git a/docs/plugins/lyrics.rst b/docs/plugins/lyrics.rst
@@ -42,6 +42,12 @@ configuration file. The available options are:
   Default: ``[]``
 - **bing_lang_to**: Language to translate lyrics into.
   Default: None.
+- **dist_thresh**: The maximum distance between the artist and title
+  combination of the music file and lyrics candidate to consider them a match.
+  Lower values will make the plugin more strict, higher values will make it
+  more lenient. This does not apply to the ``lrclib`` backend as it matches
+  durations.
+  Default: ``0.15``.
 - **fallback**: By default, the file will be left unchanged when no lyrics are
   found. Use the empty string ``''`` to reset the lyrics in such a case.
   Default: None.

diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
@@ -31,11 +31,12 @@
 from beetsplug import lyrics
 
 log = logging.getLogger("beets.test_lyrics")
-raw_backend = lyrics.Backend({}, log)
-google = lyrics.Google(MagicMock(), log)
-genius = lyrics.Genius(MagicMock(), log)
-tekstowo = lyrics.Tekstowo(MagicMock(), log)
-lrclib = lyrics.LRCLib(MagicMock(), log)
+plugin = lyrics.LyricsPlugin()
+raw_backend = lyrics.Backend(plugin.config, log)
+google = lyrics.Google(plugin.config, log)
+genius = lyrics.Genius(plugin.config, log)
+tekstowo = lyrics.Tekstowo(plugin.config, log)
+lrclib = lyrics.LRCLib(plugin.config, log)
 
 
 class LyricsPluginTest(unittest.TestCase):
@@ -244,6 +245,28 @@ def setUp(self):
             self.skipTest("Beautiful Soup 4 not available")
 
 
+class TestSearchBackend:
+    @pytest.fixture(scope="class")
+    def backend(self):
+        plugin = lyrics.LyricsPlugin()
+        return lyrics.SearchBackend(plugin.config, plugin._log)
+
+    @pytest.mark.parametrize(
+        "target_artist, artist, should_match",
+        [
+            ("Target Artist", "Target Artist", True),
+            ("Target Artist", "Target Artis", True),
+            ("Target Artist", "Target Arti", False),
+            ("Psychonaut", "Psychonaut (BEL)", True),
+            ("beets song", "beats song", True),
+        ],
+    )
+    def test_check_match(self, backend, target_artist, artist, should_match):
+        assert (
+            backend.check_match(target_artist, "", artist, "") == should_match
+        )
+
+
 class LyricsPluginSourcesTest(LyricsGoogleBaseTest, LyricsAssertions):
     """Check that beets google custom search engine sources are correctly
     scraped.
@@ -400,7 +423,7 @@ def test_is_page_candidate_exact_match(self):
             html, "html.parser", parse_only=SoupStrainer("title")
         )
         assert google.is_page_candidate(
-            url, soup.title.string, s["title"], s["artist"]
+            s["artist"], s["title"], url, soup.title.string
         ), url
 
     def test_is_page_candidate_fuzzy_match(self):
@@ -413,12 +436,12 @@ def test_is_page_candidate_fuzzy_match(self):
 
         # very small diffs (typo) are ok eg 'beats' vs 'beets' with same artist
         assert google.is_page_candidate(
-            url, url_title, s["title"], s["artist"]
+            s["artist"], s["title"], url, url_title
         ), url
         # reject different title
         url_title = "example.com | seets bong lyrics by John doe"
         assert not google.is_page_candidate(
-            url, url_title, s["title"], s["artist"]
+            s["artist"], s["title"], url, url_title
         ), url
 
     def test_is_page_candidate_special_chars(self):
@@ -430,7 +453,7 @@ def test_is_page_candidate_special_chars(self):
         url = s["url"] + s["path"]
         url_title = "foo"
 
-        google.is_page_candidate(url, url_title, s["title"], "Sunn O)))")
+        google.is_page_candidate("Sunn O)))", s["title"], url, url_title)
 
 
 # test Genius backend
@@ -506,12 +529,14 @@ def test_json(self, mock_fetch_url, mock_scrape):
                                     "name": "\u200bblackbear",
                                 },
                                 "url": "blackbear_url",
+                                "title": "Idfc",
                             }
                         },
                         {
                             "result": {
                                 "primary_artist": {"name": "El\u002dp"},
                                 "url": "El-p_url",
+                                "title": "Idfc",
                             }
                         },
                     ]