Skip to content

Commit

Permalink
Merge pull request #2828 from nexB/license-improvement-winter-2022
Browse files Browse the repository at this point in the history
License improvement winter 2022
  • Loading branch information
pombredanne authored Feb 14, 2022
2 parents a0e576a + 18e9bc5 commit 24aae22
Show file tree
Hide file tree
Showing 1,067 changed files with 10,624 additions and 3,972 deletions.
21 changes: 21 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,14 @@ Important API changes:
column to "path". The "copyright_holder" has been renamed to "holder"


Development environment changes:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

- The license cache consistency is not checked anymore when you are using a Git
checkout. The SCANCODE_DEV_MODE tag file has been removed entirely. Use
the --reindex-licenses option instead to rebuild the license index.


Copyright detection:
~~~~~~~~~~~~~~~~~~~~

Expand Down Expand Up @@ -107,6 +115,19 @@ License detection:
by the word "license" and assimilated are now filtered as false matches.


- The new --licenses-reference option adds a new "licenses_reference" top
level attribute to a scan when using the JSON and YAML outputs. This contains
all the details and the full text of every license seen in a file or
package license expression of a scan. This can be added after the fact
using the --from-json option.

- New experimental support for non-English licenses. Use the command
./scancode --reindex-licenses-for-all-languages to index all known non-English
licenses and rules. From that point on, they will be detected. Because of this,
some licenses that were not tagged with their languages are now correctly
tagged, and they may not be detected unless you activate this new indexing
feature.

Package detection:
~~~~~~~~~~~~~~~~~~

Expand Down
11 changes: 6 additions & 5 deletions configure
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,12 @@ CFG_BIN_DIR=$CFG_ROOT_DIR/$VIRTUALENV_DIR/bin

# Find packages from the local thirdparty directory or from thirdparty.aboutcode.org
if [ -f "$CFG_ROOT_DIR/thirdparty" ]; then
PIP_EXTRA_ARGS="--find-links $CFG_ROOT_DIR/thirdparty "
# offline mode
PIP_EXTRA_ARGS="--no-index --find-links $CFG_ROOT_DIR/thirdparty "
else
# online mode
PIP_EXTRA_ARGS="$PIP_EXTRA_ARGS --index https://thirdparty.aboutcode.org/pypi/simple"
fi
PIP_EXTRA_ARGS="$PIP_EXTRA_ARGS --find-links https://thirdparty.aboutcode.org/pypi"


################################
Expand Down Expand Up @@ -163,9 +166,7 @@ install_packages() {

################################
# Main command line entry point
CFG_DEV_MODE=0
CFG_REQUIREMENTS=$REQUIREMENTS
NO_INDEX="--no-index"

# We are using getopts to parse option arguments that start with "-"
while getopts :-: optchar; do
Expand All @@ -175,7 +176,7 @@ while getopts :-: optchar; do
help ) cli_help;;
clean ) clean;;
dev ) CFG_REQUIREMENTS="$DEV_REQUIREMENTS" && CFG_DEV_MODE=1;;
init ) NO_INDEX="";;
init ) PIP_EXTRA_ARGS="$PIP_EXTRA_ARGS --extra-index-url https://pypi.org/simple/";;
esac;;
esac
done
Expand Down
2 changes: 1 addition & 1 deletion etc/scripts/fix_thirdparty.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
@click.option(
"--strip-classifiers",
is_flag=True,
help="Remove danglingf classifiers",
help="Remove dangling PyPI classifiers",
)
@click.help_option("-h", "--help")
def fix_thirdparty_dir(
Expand Down
83 changes: 53 additions & 30 deletions etc/scripts/gen_pypi_simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,22 +69,22 @@ def get_package_name_from_filename(filename, normalize=True):
Optionally ``normalize`` the name according to distribution name rules.
Raise an ``InvalidDistributionFilename`` if the ``filename`` is invalid::
>>> get_package_name_from_filename("aboutcode_toolkit-5.1.0-py2.py3-none-any.whl")
'aboutcode-toolkit'
>>> get_package_name_from_filename("boolean.py-3.7-py2.py3-none-any.whl")
'boolean-py'
>>> get_package_name_from_filename("boolean.py-3.7.tar.gz")
'boolean-py'
>>> get_package_name_from_filename("foo-1.2.3_rc1.tar.gz")
'foo'
>>> get_package_name_from_filename("foo-bar-1.2-py27-none-any.whl")
>>> get_package_name_from_filename("foo_bar-1.2-py27-none-any.whl")
'foo-bar'
>>> get_package_name_from_filename("foo.py-1.2-py27-none-any.whl")
'foo-py'
>>> get_package_name_from_filename("Cython-0.17.2-cp26-none-linux_x86_64.whl")
'cython'
>>> get_package_name_from_filename("python_ldap-2.4.19-cp27-none-macosx_10_10_x86_64.whl")
'python-ldap'
>>> get_package_name_from_filename("foo.whl")
Traceback (most recent call last):
...
InvalidDistributionFilename: ...
>>> get_package_name_from_filename("foo.png")
Traceback (most recent call last):
...
InvalidFilePackageName: ...
"""
if not filename or not filename.endswith(dist_exts):
raise InvalidDistributionFilename(filename)
Expand Down Expand Up @@ -133,15 +133,30 @@ def get_package_name_from_filename(filename, normalize=True):
raise InvalidDistributionFilename(filename)

if normalize:
name = name.lower().replace("_", "-")
name = normalize_name(name)
return name


def build_pypi_index(directory, write_index=False):
def normalize_name(name):
"""
Using a ``directory`` directory of wheels and sdists, create the a PyPI simple
directory index at ``directory``/simple/ populated with the proper PyPI simple
index directory structure crafted using symlinks.
Return a normalized package name per PEP503, and copied from
https://www.python.org/dev/peps/pep-0503/#id4
"""
return name and re.sub(r"[-_.]+", "-", name).lower() or name


def normalize_name_plain(name):
    """
    Return a normalized package ``name``, but do not replace dots.

    Runs of dashes and underscores are collapsed to a single dash and the
    result is lowercased. A falsy ``name`` (empty string or None) is returned
    unchanged.
    """
    # Guard clause instead of the error-prone ``x and y or x`` pseudo-ternary.
    if not name:
        return name
    return re.sub(r"[-_]+", "-", name).lower()


def build_pypi_index(directory):
"""
Using a ``directory`` directory of wheels and sdists, create the a PyPI
simple directory index at ``directory``/simple/ populated with the proper
PyPI simple index directory structure crafted using symlinks.
WARNING: The ``directory``/simple/ directory is removed if it exists.
"""
Expand All @@ -154,11 +169,15 @@ def build_pypi_index(directory, write_index=False):

index_dir.mkdir(parents=True)

if write_index:
simple_html_index = [
"<html><head><title>PyPI Simple Index</title>",
"<meta name='api-version' value='2' /></head><body>",
]
simple_html_index = [
"<html>"
"<head>"
"<title>PyPI Simple Index</title>",
'<meta charset="UTF-8">'
'<meta name="api-version" value="2" />'
"</head>"
"<body>",
]

package_names = set()
for pkg_file in directory.iterdir():
Expand All @@ -172,26 +191,30 @@ def build_pypi_index(directory, write_index=False):
):
continue

pkg_name = get_package_name_from_filename(pkg_filename)
pkg_index_dir = index_dir / pkg_name
original_name = get_package_name_from_filename(pkg_filename, normalize=False)
pkg_dir_name = normalize_name(original_name)
pkg_link_name = normalize_name_plain(original_name)

pkg_index_dir = index_dir / pkg_dir_name
pkg_index_dir.mkdir(parents=True, exist_ok=True)
pkg_indexed_file = pkg_index_dir / pkg_filename
link_target = Path("../..") / pkg_filename
pkg_indexed_file.symlink_to(link_target)

if write_index and pkg_name not in package_names:
esc_name = escape(pkg_name)
simple_html_index.append(f'<a href="{esc_name}/">{esc_name}</a><br/>')
package_names.add(pkg_name)
if pkg_link_name not in package_names:
esc_dir = escape(pkg_dir_name)
esc_link = escape(pkg_link_name)

simple_html_index.append(f'<a href="{esc_dir}/">{esc_link}</a><br/>')
package_names.add(pkg_link_name)

if write_index:
simple_html_index.append("</body></html>")
index_html = index_dir / "index.html"
index_html.write_text("\n".join(simple_html_index))
simple_html_index.append("</body></html>")
index_html = index_dir / "index.html"
index_html.write_text("\n".join(simple_html_index))


if __name__ == "__main__":
import sys

pkg_dir = sys.argv[1]
build_pypi_index(pkg_dir, True)
build_pypi_index(pkg_dir)
8 changes: 4 additions & 4 deletions etc/scripts/licenses/buildrules.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,11 +156,11 @@ def all_rule_by_tokens():
try:
rule_tokens[tuple(rule.tokens())] = rule.identifier
except Exception as e:
df=(' file://' + rule.data_file)
tf=(' file://' + rule.text_file)
df = f" file://{rule.data_file}"
tf = f" file://{rule.text_file}"
raise Exception(
f'Failed to to get tokens from rule:: {rule.identifier}\n'
f'{df}\n{tf}'
f"Failed to to get tokens from rule:: {rule.identifier}\n"
f"{df}\n{tf}"
) from e
return rule_tokens

Expand Down
147 changes: 147 additions & 0 deletions etc/scripts/licenses/gen_spdx_lists.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# ScanCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/scancode-toolkit for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import click

from licensedcode.cache import get_licenses_by_spdx_key

import synclic

"""
A script to generate license detection rules from lists of SPDX
licenses for their name or id/name combos.
It is common to see SPDX license names and ids used for licensing documentation.
Here we fetch the latest SPDX licenses list and generate rules for each
license id/name, name and a few other related combinations.
"""

# Debug tracing flag. NOTE(review): not referenced in the visible part of
# this script.
TRACE = False

# Text of a single generated rule entry, filled in by build_rules() with
# str.format(): ``key`` is the license expression, ``is_license`` is the name
# of the rule flag to set to "yes" and ``text`` is the rule text proper.
# Each entry starts with a dashed separator line.
template = """----------------------------------------
license_expression: {key}
relevance: 100
{is_license}: yes
minimum_coverage: 100
is_continuous: yes
notes: Rule based on an SPDX license identifier and name
---
{text}
"""


@click.command()
@click.argument(
    # FILE is the buildrules-formatted file where generated rules are saved.
    "output",
    type=click.Path(),
    metavar="FILE",
)
@click.help_option("-h", "--help")
def cli(output):
    """
    Generate ScanCode license detection rules from a list of SPDX
    license. Save these in FILE for use with buildrules.
    The `spdx` directory is used as a temp store for fetched SPDX licenses.
    """
    # Lookup of known ScanCode licenses keyed by their SPDX key, with the
    # original key case preserved so it can be matched against the ids
    # returned by the SPDX source below.
    licenses_by_spdx_key = get_licenses_by_spdx_key(
        licenses=None,
        include_deprecated=False,
        lowercase_keys=False,
        include_other_spdx_license_keys=True,
    )

    # Fetch everything before opening (and truncating) the output file.
    spdx_source = synclic.SpdxSource(external_base_dir=None)
    spdx_data = list(spdx_source.fetch_spdx_licenses())

    # SPDX license names may contain non-ASCII characters: always write UTF-8
    # rather than depending on the platform default encoding.
    with open(output, "w", encoding="utf-8") as o:
        for spdx in spdx_data:
            # Exceptions carry a "licenseExceptionId" instead of a "licenseId".
            is_exception = "licenseExceptionId" in spdx
            spdx_key = spdx.get("licenseId") or spdx.get("licenseExceptionId")
            name = spdx["name"]
            lic = licenses_by_spdx_key.get(spdx_key)
            if not lic:
                print(
                    "--> Skipping SPDX license unknown in ScanCode:",
                    spdx_key,
                )
                continue
            for rule in build_rules(lic.key, spdx_key, name, is_exception):
                o.write(rule)

        # Each rule starts with a separator line: close the last one too.
        o.write("----------------------------------------\n")


def build_rules(key, spdx_key, name, is_exception=False):
    """
    Yield rule entries as strings for the license ``key``, built from
    its ``spdx_key`` SPDX identifier and its SPDX ``name``.

    Each entry is the module-level ``template`` formatted with one text
    variant and one rule flag. The last entry uses a "licenseExceptionId"
    prefix when ``is_exception`` is True and a "licenseId" prefix otherwise.
    """
    reference = "is_license_reference"
    tag = "is_license_tag"

    if is_exception:
        id_text = f"licenseExceptionId: {spdx_key}"
    else:
        id_text = f"licenseId: {spdx_key}"

    # (rule flag, rule text) pairs, in the order they are emitted.
    variants = (
        (reference, name),
        (reference, f"name: {name}"),
        (reference, f"{spdx_key} {name}"),
        (reference, f"{name} {spdx_key}"),
        (tag, f"{spdx_key} {name}"),
        (tag, f"license: {spdx_key}"),
        (tag, f"license: {name}"),
        (tag, id_text),
    )

    for flag, text in variants:
        yield template.format(key=key, is_license=flag, text=text)


# Run the command line interface when this file is executed as a script.
if __name__ == "__main__":
    cli()
Loading

0 comments on commit 24aae22

Please sign in to comment.