Skip to content

Commit

Permalink
Improve the fuzzy matching system to get the correct licence (sagemath#470)
Browse files Browse the repository at this point in the history

* Improve the fuzzy matching system to get the correct licence

* Constrain semver in environment.yaml as well

* Workaround for MIT licence
  • Loading branch information
marcelotrevisani authored May 18, 2023
1 parent b926438 commit 92cc05c
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 14 deletions.
4 changes: 2 additions & 2 deletions environment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ dependencies:
- stdlib-list
- pip
- setuptools >=30.3.0
- rapidfuzz >=1.9.1
- rapidfuzz >=3.0.0
- progressbar2 >=3.53.0
- colorama
- mock
Expand All @@ -28,4 +28,4 @@ dependencies:
- tomli-w
- libcblas
- beautifulsoup4
- semver
- semver >=3.0.0,<4.0.0
21 changes: 16 additions & 5 deletions grayskull/license/discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
import requests
from colorama import Fore
from rapidfuzz import process
from rapidfuzz.fuzz import token_set_ratio, token_sort_ratio
from rapidfuzz.distance import OSA
from rapidfuzz.fuzz import partial_ratio, token_set_ratio, token_sort_ratio

from grayskull.cli.stdout import print_msg
from grayskull.license.data import get_all_licenses # noqa
Expand Down Expand Up @@ -72,26 +73,36 @@ def match_license(name: str) -> dict:
return {}
name = re.sub(r"\s+license\s*", "", name.strip(), flags=re.IGNORECASE)

best_matches = process.extract(name, _get_all_license_choice(all_licenses))
best_matches = process.extract(
name, _get_all_license_choice(all_licenses), scorer=partial_ratio
)
best_matches = process.extract(name, [lc for lc, *_ in best_matches])
spdx_license = best_matches[0]
if spdx_license[1] != 100:

if spdx_license[1] < 100:
best_matches = [lic[0] for lic in best_matches if not lic[0].endswith("-only")]

if best_matches:
best_matches = process.extract(name, best_matches, scorer=token_set_ratio)
best_matches = process.extract(
name, best_matches, scorer=OSA.normalized_similarity
)
spdx_license = best_matches[0]
best_matches = [lic[0] for lic in best_matches if lic[1] >= spdx_license[1]]
if len(best_matches) > 1:
spdx_license = process.extractOne(
name, best_matches, scorer=token_sort_ratio
)
if spdx_license[1] != 100 and spdx_license[0].startswith("MIT"):
spdx_license = "MIT"
else:
spdx_license = spdx_license[0]

log.info(
f"Best match for license {name} was {spdx_license}.\n"
f"Best matches: {best_matches}"
)

return _get_license(spdx_license[0], all_licenses)
return _get_license(spdx_license, all_licenses)


def get_short_license_id(name: str) -> str:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ dependencies = [
"pip",
"pkginfo",
"progressbar2 >=3.53.0",
"rapidfuzz >=1.7.1",
"rapidfuzz >=3.0.0",
"requests",
"ruamel.yaml >=0.16.10",
"ruamel.yaml.jinja2",
Expand Down
18 changes: 12 additions & 6 deletions tests/license/test_discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,18 @@ def test_get_opensource_license_data():
assert len(get_opensource_license_data()) >= 50


def test_short_license_id():
assert get_short_license_id("MIT License") == "MIT"
assert get_short_license_id("Expat") == "MIT"
assert get_short_license_id("GPL 2.0") == "GPL-2.0-or-later"
assert get_short_license_id("2-Clause BSD License") == "BSD-2-Clause"
assert get_short_license_id("3-Clause BSD License") == "BSD-3-Clause"
@pytest.mark.parametrize(
"licence_name, short_licence",
[
("MIT License", "MIT"),
("Expat", "MIT"),
("GPL 2.0", "GPL-2.0-or-later"),
("2-Clause BSD License", "BSD-2-Clause"),
("3-Clause BSD License", "BSD-3-Clause"),
],
)
def test_short_license_id(licence_name, short_licence):
assert get_short_license_id(licence_name) == short_licence


def test_get_other_names_from_opensource():
Expand Down

0 comments on commit 92cc05c

Please sign in to comment.