diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index 2ee4362a384..534457e0992 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,6 +1,6 @@
[bumpversion]
current_version = 2.2.1
-files = setup.py src/scancode/__init__.py
+files = setup.py src/scancode_config.py
commit = False
tag = False
diff --git a/.gitignore b/.gitignore
index 8d35930fb9f..a39bc6d78ee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -67,3 +67,4 @@ docs/_build
# pyenv
/.python-version
+/man/
diff --git a/configure.bat b/configure.bat
index 437c7d77983..8ba282a0275 100644
--- a/configure.bat
+++ b/configure.bat
@@ -1,6 +1,6 @@
@echo OFF
-@rem Copyright (c) 2015 nexB Inc. http://www.nexb.com/ - All rights reserved.
+@rem Copyright (c) 2018 nexB Inc. http://www.nexb.com/ - All rights reserved.
@rem ################################
@rem # change these variables to customize this script locally
@@ -44,7 +44,7 @@ if not exist "c:\python27\python.exe" (
echo Do NOT install Python v3 or any 64 bits edition.
echo Instead download Python from this url and see the README.rst file for more details:
echo(
- echo https://www.python.org/ftp/python/2.7.10/python-2.7.10.msi
+ echo https://www.python.org/ftp/python/2.7.14/python-2.7.14.msi
echo(
exit /b 1
)
diff --git a/etc/conf/base.py b/etc/conf/base.py
index 6f1b431d30a..d5643fb4246 100644
--- a/etc/conf/base.py
+++ b/etc/conf/base.py
@@ -5,17 +5,18 @@
import sys
-
"""
Check that we run a supported OS and architecture.
"""
+
def unsupported(platform):
print('Unsupported OS/platform %r.' % platform)
print('See https://github.com/nexB/scancode-toolkit/ for supported OS/platforms.')
print('Enter a ticket https://github.com/nexB/scancode-toolkit/issues asking for support of your OS/platform combo.')
sys.exit(1)
+
if sys.maxsize > 2 ** 32:
arch = '64'
else:
@@ -31,29 +32,12 @@ def unsupported(platform):
else:
unsupported(sys_platform)
-
supported_combos = {
'linux': ['32', '64'],
- 'win': ['32',],
- 'mac': ['64',],
+ 'win': ['32', ],
+ 'mac': ['64', ],
}
arches = supported_combos[os]
if arch not in arches:
unsupported(os + arch)
-
-
-"""
-Re/build the license cache on every configure run.
-"""
-
-def build_license_cache():
- """
- Force a rebuild of the license cache on configure.
- """
- from licensedcode import cache
- print('* Building license index...')
- cache.reindex()
-
-
-build_license_cache()
diff --git a/etc/conf/dev/base.py b/etc/conf/dev/base.py
index 588a475c0c2..e78bdd149f3 100644
--- a/etc/conf/dev/base.py
+++ b/etc/conf/dev/base.py
@@ -12,8 +12,8 @@ def setup_dev_mode():
not rely on license data to remain untouched and will always check the
license index cache for consistency, rebuilding it if necessary.
"""
- from scancode import root_dir
- with open(os.path.join(root_dir, 'SCANCODE_DEV_MODE'), 'wb') as sdm:
+ from scancode_config import scancode_root_dir
+ with open(os.path.join(scancode_root_dir, 'SCANCODE_DEV_MODE'), 'wb') as sdm:
sdm.write('This is a tag file to notify that ScanCode is used in development mode.')
@@ -21,14 +21,14 @@ def setup_vscode():
"""
Add base settings for .vscode
"""
- from scancode import root_dir
+ from scancode_config import scancode_root_dir
from commoncode.fileutils import create_dir
from commoncode.fileutils import copyfile
- settings = os.path.join(root_dir, 'etc', 'vscode', 'settings.json')
+ settings = os.path.join(scancode_root_dir, 'etc', 'vscode', 'settings.json')
if os.path.exists(settings):
- vscode = os.path.join(root_dir, '.vscode')
+ vscode = os.path.join(scancode_root_dir, '.vscode')
create_dir(vscode)
copyfile(settings, vscode)
diff --git a/etc/configure.py b/etc/configure.py
index 44cada3448a..207e7988f25 100644
--- a/etc/configure.py
+++ b/etc/configure.py
@@ -64,7 +64,6 @@
import shutil
import subprocess
-
# platform-specific file base names
sys_platform = str(sys.platform).lower()
on_win = False
@@ -79,7 +78,6 @@
raise Exception('Unsupported OS/platform %r' % sys_platform)
platform_names = tuple()
-
# common file basenames for requirements and scripts
base = ('base',)
@@ -213,7 +211,7 @@ def create_virtualenv(std_python, root_dir, tpp_dirs, quiet=False):
def activate(root_dir):
""" Activate a virtualenv in the current process."""
- print("* Activating ...")
+ # print("* Activating...")
bin_dir = os.path.join(root_dir, 'bin')
activate_this = os.path.join(bin_dir, 'activate_this.py')
with open(activate_this) as f:
diff --git a/etc/release/release.sh b/etc/release/release.sh
index 2e8684d6fe7..a4f03caf795 100755
--- a/etc/release/release.sh
+++ b/etc/release/release.sh
@@ -1,6 +1,6 @@
#!/bin/bash
#
-# Copyright (c) 2017 nexB Inc. http://www.nexb.com/ - All rights reserved.
+# Copyright (c) 2018 nexB Inc. http://www.nexb.com/ - All rights reserved.
#
# ScanCode release script
@@ -52,17 +52,36 @@ function test_scan {
# this is needed for the zip
chmod o+x scancode extractcode
- # minimal test: update when new scans are available
- ./scancode --quiet -lcip apache-2.0.LICENSE test_scan.json
- echo "TEST JSON passed: ./scancode --quiet -lcip apache-2.0.LICENSE test_scan.json"
- ./scancode --quiet -lcip --format json-pp apache-2.0.LICENSE test_scan.json
- echo "TEST JSON-PP passed: ./scancode --quiet -lcip --format json-pp apache-2.0.LICENSE test_scan.json"
- ./scancode --quiet -lcip --format html apache-2.0.LICENSE test_scan.html
- echo "TEST HTML passed: ./scancode --quiet -lcip --format html apache-2.0.LICENSE test_scan.html"
- ./scancode --quiet -lcip --format html-app apache-2.0.LICENSE test_scan_app.html
- echo "TEST HTML-APP passed: ./scancode --quiet -lcip --format html-app apache-2.0.LICENSE test_scan_app.html"
- ./extractcode --quiet samples/arch
- echo "TEST EXTRACTCODE passed: ./extractcode --quiet samples/arch"
+ # minimal tests: update when new scans are available
+ cmd="./scancode --quiet -lcip apache-2.0.LICENSE --json test_scan.json"
+ echo "RUNNING TEST: $cmd"
+ $cmd
+ echo "TEST PASSED"
+
+ cmd="./scancode --quiet -lcip apache-2.0.LICENSE --json-pp test_scan.json"
+ echo "RUNNING TEST: $cmd"
+ $cmd
+ echo "TEST PASSED"
+
+ cmd="./scancode --quiet -lcip apache-2.0.LICENSE --output-html test_scan.html"
+ echo "RUNNING TEST: $cmd"
+ $cmd
+ echo "TEST PASSED"
+
+ cmd="./scancode --quiet -lcip apache-2.0.LICENSE --output-html-app test_scan_app.html"
+ echo "RUNNING TEST: $cmd"
+ $cmd
+ echo "TEST PASSED"
+
+ cmd="./scancode --quiet -lcip apache-2.0.LICENSE --output-spdx-tv test_scan.spdx"
+ echo "RUNNING TEST: $cmd"
+ $cmd
+ echo "TEST PASSED"
+
+ cmd="./extractcode --quiet samples/arch"
+ echo "RUNNING TEST: $cmd"
+ $cmd
+ echo "TEST PASSED"
# cleanup
cd ..
diff --git a/etc/scripts/sch2js/sch2js.py b/etc/scripts/sch2js/sch2js.py
index e99527a0b2e..812dd2ab4c9 100644
--- a/etc/scripts/sch2js/sch2js.py
+++ b/etc/scripts/sch2js/sch2js.py
@@ -46,10 +46,8 @@
from schematics.types.compound import ListType
from schematics.types.compound import ModelType
-
__version__ = '1.0.1.patch'
-
SCHEMATIC_TYPE_TO_JSON_TYPE = {
'NumberType': 'number',
'IntType': 'integer',
diff --git a/etc/scripts/synclic.py b/etc/scripts/synclic.py
index 7bc7ae63a30..6fdd2902b74 100644
--- a/etc/scripts/synclic.py
+++ b/etc/scripts/synclic.py
@@ -31,22 +31,25 @@
from collections import OrderedDict
import json
import os
+from os import mkdir
+from os.path import exists
+from os.path import join
import zipfile
import click
+from os.path import realpath
click.disable_unicode_literals_warning = True
import requests
-from commoncode import fileutils
from commoncode import fetch
+from commoncode import fileutils
import licensedcode
-from licensedcode.cache import get_licenses_db
from licensedcode.cache import get_index
+from licensedcode.cache import get_licenses_db
from licensedcode.models import load_licenses
from licensedcode.models import License
-
"""
Sync and update the ScanCode licenses against:
- the SPDX license list
@@ -59,6 +62,7 @@
TRACE_DEEP = False
TRACE_FETCH = False
+
class ExternalLicensesSource(object):
"""
Base class to provide (including possibly fetch) licenses from an
@@ -80,30 +84,30 @@ def __init__(self, src_dir, match_text=False, match_approx=False):
"""
`src_dir` is where the License objects are dumped.
"""
- src_dir = os.path.realpath(src_dir)
+ src_dir = realpath(src_dir)
self.src_dir = src_dir
self.match_text = match_text
self.match_approx = match_approx
self.fetched = False
- if os.path.exists(src_dir):
+ if exists(src_dir):
# fetch ONLY if the directory is empty
self.fetched = True
else:
- os.mkdir(src_dir)
+ mkdir(src_dir)
self.update_dir = self.src_dir.rstrip('\\/') + '-update'
- if not os.path.exists(self.update_dir):
- os.mkdir(self.update_dir)
+ if not exists(self.update_dir):
+ mkdir(self.update_dir)
self.new_dir = self.src_dir.rstrip('\\/') + '-new'
- if not os.path.exists(self.new_dir):
- os.mkdir(self.new_dir)
+ if not exists(self.new_dir):
+ mkdir(self.new_dir)
self.del_dir = self.src_dir.rstrip('\\/') + '-del'
- if not os.path.exists(self.del_dir):
- os.mkdir(self.del_dir)
+ if not exists(self.del_dir):
+ mkdir(self.del_dir)
self.scancodes_by_key = get_licenses_db()
@@ -111,13 +115,15 @@ def __init__(self, src_dir, match_text=False, match_approx=False):
for l in self.scancodes_by_key.values()
if l.spdx_license_key}
- composites_dir = os.path.join(licensedcode.data_dir, 'composites', 'licenses')
+ composites_dir = join(
+ licensedcode.models.data_dir, 'composites', 'licenses')
self.composites_by_key = load_licenses(composites_dir, with_deprecated=True)
self.composites_by_spdx_key = {l.spdx_license_key.lower(): l
for l in self.composites_by_key.values()
if l.spdx_license_key}
- foreign_dir = os.path.join(licensedcode.data_dir, 'non-english', 'licenses')
+ foreign_dir = join(
+ licensedcode.models.data_dir, 'non-english', 'licenses')
self.non_english_by_key = load_licenses(foreign_dir, with_deprecated=True)
self.non_english_by_spdx_key = {l.spdx_license_key.lower(): l
for l in self.non_english_by_key.values()
@@ -449,8 +455,8 @@ def __init__(self, src_dir, match_text=False, match_approx=False,
api_base_url=None, api_key=None):
super(DejaSource, self).__init__(src_dir, match_text, match_approx)
- self.api_base_url = api_base_url or os.environ.get('DEJACODE_API_URL', None)
- self.api_key = api_key or os.environ.get('DEJACODE_API_KEY', None)
+ self.api_base_url = api_base_url or os.getenv('DEJACODE_API_URL')
+ self.api_key = api_key or os.getenv('DEJACODE_API_KEY')
assert (self.api_key and self.api_base_url), (
'You must set the DEJACODE_API_URL and DEJACODE_API_KEY ' +
@@ -608,11 +614,13 @@ def merge_licenses(scancode_license, other_license, updatable_attributes):
(attribute name, value before, value after)
"""
scancode_updated = []
+
def update_sc(_attrib, _sc_val, _o_val):
setattr(scancode_license, _attrib, _o_val)
scancode_updated.append((_attrib, _sc_val, _o_val))
other_updated = []
+
def update_ot(_attrib, _sc_val, _o_val):
setattr(other_license, _attrib, _sc_val)
other_updated.append((_attrib, _o_val, _sc_val))
@@ -781,7 +789,7 @@ def synchronize_licenses(external_source):
if not TRACE:print('.', end='')
# Create a new ScanCode license
- sc_license = ot_license.relocate(licensedcode.licenses_data_dir, o_key)
+ sc_license = ot_license.relocate(licensedcode.models.data_dir, o_key)
scancodes_added.add(sc_license.key)
scancodes_by_key[sc_license.key] = sc_license
if TRACE: print('Other license key not in ScanCode:', ot_license.key, 'created in ScanCode.')
@@ -793,7 +801,6 @@ def synchronize_licenses(external_source):
for k in others_changed | others_added:
others_by_key[k].dump()
-
# TODO: at last: print report of incorrect OTHER licenses to submit
# updates eg. make API calls to DejaCode to create or update
# licenses and submit review request e.g. submit requests to SPDX
diff --git a/etc/scripts/test_json2csv.py b/etc/scripts/test_json2csv.py
index 8fcb96f8851..75b17e9cb2f 100644
--- a/etc/scripts/test_json2csv.py
+++ b/etc/scripts/test_json2csv.py
@@ -208,16 +208,17 @@ class TestJson2CSVWithLiveScans(FileBasedTesting):
test_data_dir = os.path.join(os.path.dirname(__file__), 'testdata')
def test_can_process_scan_from_json_scan(self):
- import scancode
+ from scancode_config import scancode_root_dir
from commoncode.command import execute
test_dir = self.get_test_loc('livescan/scan')
json_file = self.get_temp_file('json')
- scan_cmd = os.path.join(scancode.root_dir, 'scancode')
+ scan_cmd = os.path.join(scancode_root_dir, 'scancode')
rc, _stdout, _stderr = execute(scan_cmd,
- ['-clip', '--email', '--url', '--strip-root', '--format', 'json', test_dir, json_file])
- assert rc == 0
+ ['-clip', '--email', '--url', '--strip-root', test_dir,
+ '--json', json_file])
result_file = self.get_temp_file('.csv')
with open(result_file, 'wb') as rf:
json2csv.json_scan_to_csv(json_file, rf)
expected_file = self.get_test_loc('livescan/expected.csv')
check_csvs(result_file, expected_file, regen=False)
+ assert rc == 0
diff --git a/etc/scripts/testdata/livescan/expected.csv b/etc/scripts/testdata/livescan/expected.csv
index 1e1003996a2..6950b3d7305 100644
--- a/etc/scripts/testdata/livescan/expected.csv
+++ b/etc/scripts/testdata/livescan/expected.csv
@@ -1,20 +1,20 @@
-Resource,type,name,base_name,extension,date,size,sha1,md5,files_count,mime_type,file_type,programming_language,is_binary,is_text,is_archive,is_media,is_source,is_script,scan_errors,license__key,license__score,license__short_name,license__category,license__owner,license__homepage_url,license__text_url,license__reference_url,license__spdx_license_key,license__spdx_url,start_line,end_line,matched_rule__identifier,matched_rule__license_choice,matched_rule__licenses,copyright,copyright_holder,email,url,package__type,package__name,package__version,package__primary_language,package__summary,package__description,package__size,package__release_date,package__authors,package__homepage_url,package__notes,package__download_urls,package__bug_tracking_url,package__vcs_repository,package__copyright_top_level
-/package.json,file,package.json,package,.json,2017-10-03,2200,918376afce796ef90eeda1d6695f2289c90491ac,1f66239a9b850c5e60a9382dbe2162d2,,text/plain,"ASCII text, with very long lines",JSON,False,True,False,False,True,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
-/package.json,,,,,,,,,,,,,,,,,,,,mit,15.00,MIT License,Permissive,MIT,http://opensource.org/licenses/mit-license.php,http://opensource.org/licenses/mit-license.php,https://enterprise.dejacode.com/urn/urn:dje:license:mit,MIT,https://spdx.org/licenses/MIT,24,24,mit_27.RULE,False,[u'mit'],,,,,,,,,,,,,,,,,,,
-/package.json,,,,,,,,,,,,,,,,,,,,mit,100.00,MIT License,Permissive,MIT,http://opensource.org/licenses/mit-license.php,http://opensource.org/licenses/mit-license.php,https://enterprise.dejacode.com/urn/urn:dje:license:mit,MIT,https://spdx.org/licenses/MIT,24,24,mit.LICENSE,False,[u'mit'],,,,,,,,,,,,,,,,,,,
-/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,23,26,,,,Copyright (c) 2012 LearnBoost < tj@learnboost.com>,,,,,,,,,,,,,,,,,,
-/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,23,26,,,,,LearnBoost <,,,,,,,,,,,,,,,,,
-/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,12,12,,,,,,tj@learnboost.com,,,,,,,,,,,,,,,,
-/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,16,16,,,,,,,https://github.com/visionmedia/node-cookie-signature.git,,,,,,,,,,,,,,,
-/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,27,27,,,,,,,https://github.com/visionmedia/node-cookie-signature/issues,,,,,,,,,,,,,,,
-/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,TJ Holowaychuk,,,https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.0.3.tgz,,,
-/json2csv.rb,file,json2csv.rb,json2csv,.rb,2017-10-03,1599,6cfb0bd0fb0b784f57164d15bdfca2b734ad87a6,f18e519b77bc7f3e4213215033db3857,,text/x-python,"Python script, ASCII text executable",Ruby,False,True,False,False,True,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
-/json2csv.rb,,,,,,,,,,,,,,,,,,,,apache-2.0,98.45,Apache 2.0,Permissive,Apache Software Foundation,http://www.apache.org/licenses/,http://www.apache.org/licenses/LICENSE-2.0,https://enterprise.dejacode.com/urn/urn:dje:license:apache-2.0,Apache-2.0,https://spdx.org/licenses/Apache-2.0,5,24,apache-2.0_scancode.RULE,False,"[u'apache-2.0', u'scancode-acknowledgment']",,,,,,,,,,,,,,,,,,,
-/json2csv.rb,,,,,,,,,,,,,,,,,,,,scancode-acknowledgment,98.45,ScanCode acknowledgment,Permissive,nexB,https://github.com/nexB/scancode-toolkit/,,https://enterprise.dejacode.com/urn/urn:dje:license:scancode-acknowledgment,,,5,24,apache-2.0_scancode.RULE,False,"[u'apache-2.0', u'scancode-acknowledgment']",,,,,,,,,,,,,,,,,,,
-/json2csv.rb,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3,3,,,,Copyright (c) 2017 nexB Inc. and others.,,,,,,,,,,,,,,,,,,
-/json2csv.rb,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3,3,,,,,nexB Inc. and others.,,,,,,,,,,,,,,,,,
-/json2csv.rb,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,4,,,,,,,http://nexb.com/,,,,,,,,,,,,,,,
-/json2csv.rb,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,4,,,,,,,https://github.com/nexB/scancode-toolkit/,,,,,,,,,,,,,,,
-/json2csv.rb,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,10,10,,,,,,,http://apache.org/licenses/LICENSE-2.0,,,,,,,,,,,,,,,
-/license,file,license,license,,2017-10-03,679,75c5490a718ddd45e40e0cc7ce0c756abc373123,b965a762efb9421cf1bf4405f336e278,,text/plain,ASCII text,,False,True,False,False,False,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
-/license,,,,,,,,,,,,,,,,,,,,gpl-2.0-plus,100.00,GPL 2.0 or later,Copyleft,Free Software Foundation (FSF),http://www.gnu.org/licenses/old-licenses/gpl-2.0-standalone.html,http://www.gnu.org/licenses/old-licenses/gpl-2.0-standalone.html,https://enterprise.dejacode.com/urn/urn:dje:license:gpl-2.0-plus,GPL-2.0+,https://spdx.org/licenses/GPL-2.0,1,12,gpl-2.0-plus.LICENSE,False,[u'gpl-2.0-plus'],,,,,,,,,,,,,,,,,,,
+Resource,type,name,base_name,extension,size,date,sha1,md5,mime_type,file_type,programming_language,is_binary,is_text,is_archive,is_media,is_source,is_script,files_count,dirs_count,size_count,scan_errors,license__key,license__score,license__short_name,license__category,license__owner,license__homepage_url,license__text_url,license__reference_url,license__spdx_license_key,license__spdx_url,start_line,end_line,matched_rule__identifier,matched_rule__license_choice,matched_rule__licenses,copyright,copyright_holder,email,url,package__type,package__name,package__version,package__primary_language,package__summary,package__description,package__size,package__release_date,package__authors,package__homepage_url,package__notes,package__download_urls,package__bug_tracking_url,package__vcs_repository,package__copyright_top_level
+/json2csv.rb,file,json2csv.rb,json2csv,.rb,1599,2017-10-03,6cfb0bd0fb0b784f57164d15bdfca2b734ad87a6,f18e519b77bc7f3e4213215033db3857,text/x-python,"Python script, ASCII text executable",Ruby,False,True,False,False,True,True,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
+/json2csv.rb,,,,,,,,,,,,,,,,,,,,,,apache-2.0,98.45,Apache 2.0,Permissive,Apache Software Foundation,http://www.apache.org/licenses/,http://www.apache.org/licenses/LICENSE-2.0,https://enterprise.dejacode.com/urn/urn:dje:license:apache-2.0,Apache-2.0,https://spdx.org/licenses/Apache-2.0,5,24,apache-2.0_scancode.RULE,False,"[u'apache-2.0', u'scancode-acknowledgment']",,,,,,,,,,,,,,,,,,,
+/json2csv.rb,,,,,,,,,,,,,,,,,,,,,,scancode-acknowledgment,98.45,ScanCode acknowledgment,Permissive,nexB,https://github.com/nexB/scancode-toolkit/,,https://enterprise.dejacode.com/urn/urn:dje:license:scancode-acknowledgment,,,5,24,apache-2.0_scancode.RULE,False,"[u'apache-2.0', u'scancode-acknowledgment']",,,,,,,,,,,,,,,,,,,
+/json2csv.rb,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3,3,,,,Copyright (c) 2017 nexB Inc. and others.,,,,,,,,,,,,,,,,,,
+/json2csv.rb,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3,3,,,,,nexB Inc. and others.,,,,,,,,,,,,,,,,,
+/json2csv.rb,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,4,,,,,,,http://nexb.com/,,,,,,,,,,,,,,,
+/json2csv.rb,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,4,,,,,,,https://github.com/nexB/scancode-toolkit/,,,,,,,,,,,,,,,
+/json2csv.rb,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,10,10,,,,,,,http://apache.org/licenses/LICENSE-2.0,,,,,,,,,,,,,,,
+/license,file,license,license,,679,2017-10-03,75c5490a718ddd45e40e0cc7ce0c756abc373123,b965a762efb9421cf1bf4405f336e278,text/plain,ASCII text,,False,True,False,False,False,False,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
+/license,,,,,,,,,,,,,,,,,,,,,,gpl-2.0-plus,100.00,GPL 2.0 or later,Copyleft,Free Software Foundation (FSF),http://www.gnu.org/licenses/old-licenses/gpl-2.0-standalone.html,http://www.gnu.org/licenses/old-licenses/gpl-2.0-standalone.html,https://enterprise.dejacode.com/urn/urn:dje:license:gpl-2.0-plus,GPL-2.0+,https://spdx.org/licenses/GPL-2.0,1,12,gpl-2.0-plus.LICENSE,False,[u'gpl-2.0-plus'],,,,,,,,,,,,,,,,,,,
+/package.json,file,package.json,package,.json,2200,2017-10-03,918376afce796ef90eeda1d6695f2289c90491ac,1f66239a9b850c5e60a9382dbe2162d2,text/plain,"ASCII text, with very long lines",JSON,False,True,False,False,True,False,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
+/package.json,,,,,,,,,,,,,,,,,,,,,,mit,15.00,MIT License,Permissive,MIT,http://opensource.org/licenses/mit-license.php,http://opensource.org/licenses/mit-license.php,https://enterprise.dejacode.com/urn/urn:dje:license:mit,MIT,https://spdx.org/licenses/MIT,24,24,mit_27.RULE,False,[u'mit'],,,,,,,,,,,,,,,,,,,
+/package.json,,,,,,,,,,,,,,,,,,,,,,mit,100.00,MIT License,Permissive,MIT,http://opensource.org/licenses/mit-license.php,http://opensource.org/licenses/mit-license.php,https://enterprise.dejacode.com/urn/urn:dje:license:mit,MIT,https://spdx.org/licenses/MIT,24,24,mit.LICENSE,False,[u'mit'],,,,,,,,,,,,,,,,,,,
+/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,23,26,,,,Copyright (c) 2012 LearnBoost < tj@learnboost.com>,,,,,,,,,,,,,,,,,,
+/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,23,26,,,,,LearnBoost <,,,,,,,,,,,,,,,,,
+/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,12,12,,,,,,tj@learnboost.com,,,,,,,,,,,,,,,,
+/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,16,16,,,,,,,https://github.com/visionmedia/node-cookie-signature.git,,,,,,,,,,,,,,,
+/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,27,27,,,,,,,https://github.com/visionmedia/node-cookie-signature/issues,,,,,,,,,,,,,,,
+/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,TJ Holowaychuk,,,https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.0.3.tgz,,,
diff --git a/setup.py b/setup.py
index b3d63e9c7eb..4ccbc0ec9ff 100644
--- a/setup.py
+++ b/setup.py
@@ -161,7 +161,6 @@ def read(*names, **kwargs):
'pygments >= 2.0.1, <3.0.0',
# packagedcode
- 'attrs >=16.0, < 17.0',
'pymaven-patch >= 0.2.4',
'requests >= 2.7.0, < 3.0.0',
'schematics_patched',
@@ -170,6 +169,8 @@ def read(*names, **kwargs):
'click >= 6.0.0, < 7.0.0',
'colorama >= 0.3.9',
'pluggy >= 0.4.0, < 1.0',
+ 'attrs >=17.0, < 18.0',
+ 'typing >=3.6, < 3.7',
# scancode outputs
'jinja2 >= 2.7.0, < 3.0.0',
@@ -199,39 +200,81 @@ def read(*names, **kwargs):
'extractcode = scancode.extract_cli:extractcode',
],
- # scancode_output_writers is an entry point to define plugins
- # that write a scan output in a given format.
- # See the plugincode.output module for details and doc.
- # note: the "name" of the entrypoint (e.g "html") becomes the
- # ScanCode command line --format option used to enable a given
- # format plugin
- 'scancode_output_writers': [
- 'html = formattedcode.format_templated:write_html',
- 'html-app = formattedcode.format_templated:write_html_app',
- 'json = formattedcode.format_json:write_json_compact',
- 'json-pp = formattedcode.format_json:write_json_pretty_printed',
- 'spdx-tv = formattedcode.format_spdx:write_spdx_tag_value',
- 'spdx-rdf = formattedcode.format_spdx:write_spdx_rdf',
- 'csv = formattedcode.format_csv:write_csv',
- 'jsonlines = formattedcode.format_jsonlines:write_jsonlines',
+ # scancode_pre_scan is the entry point for pre_scan plugins executed
+ # before the scans.
+ #
+      #   Each entry has this form:
+ # plugin-name = fully.qualified.module:PluginClass
+ # where plugin-name must be a unique name for this entrypoint.
+ #
+ # See also plugincode.pre_scan module for details and doc.
+ 'scancode_pre_scan': [
+ 'ignore = scancode.plugin_ignore:ProcessIgnore',
],
- # scancode_post_scan is an entry point for post_scan_plugins.
- # See plugincode.post_scan module for details and doc.
- # note: the "name" of the entrypoint (e.g only-findings)
- # becomes the ScanCode CLI boolean flag used to enable a
- # given post_scan plugin
+ # scancode_scan is the entry point for scan plugins that run a scan
+ # after the pre_scan plugins and before the post_scan plugins.
+ #
+      #   Each entry has this form:
+ # plugin-name = fully.qualified.module:PluginClass
+ # where plugin-name must be a unique name for this entrypoint.
+ #
+ # IMPORTANT: The plugin-name is also the "scan key" used in scan results
+ # for this scanner.
+ #
+ # See also plugincode.scan module for details and doc.
+ 'scancode_scan': [
+ 'info = scancode.plugin_info:InfoScanner',
+ 'licenses = scancode.plugin_license:LicenseScanner',
+ 'copyrights = scancode.plugin_copyright:CopyrightScanner',
+ 'packages = scancode.plugin_package:PackageScanner',
+ 'emails = scancode.plugin_email:EmailScanner',
+ 'urls = scancode.plugin_url:UrlScanner',
+ ],
+
+ # scancode_post_scan is the entry point for post_scan plugins executed
+ # after the scan plugins and before the output plugins.
+ #
+      #   Each entry has this form:
+ # plugin-name = fully.qualified.module:PluginClass
+ # where plugin-name must be a unique name for this entrypoint.
+ #
+ # See also plugincode.post_scan module for details and doc.
'scancode_post_scan': [
- 'only-findings = scancode.plugin_only_findings:process_only_findings',
- 'mark-source = scancode.plugin_mark_source:process_mark_source',
+ 'mark-source = scancode.plugin_mark_source:MarkSource',
],
- # scancode_pre_scan is an entry point to define pre_scan plugins.
- # See plugincode.pre_scan module for details and doc.
- # note: the "name" of the entrypoint (e.g ignore) will be used for
- # the option name which passes the input to the given pre_scan plugin
- 'scancode_pre_scan': [
- 'ignore = scancode.plugin_ignore:ProcessIgnore',
- ]
+ # scancode_output_filter is the entry point for filter plugins executed
+ # after the post-scan plugins and used by the output plugins to
+ # exclude/filter certain files or directories from the codebase.
+ #
+      #   Each entry has this form:
+ # plugin-name = fully.qualified.module:PluginClass
+ # where plugin-name must be a unique name for this entrypoint.
+ #
+      # See also plugincode.output_filter module for details and doc.
+ 'scancode_output_filter': [
+ 'only-findings = scancode.plugin_only_findings:OnlyFindings',
+ ],
+
+      # scancode_output is the entry point for output plugins that write a scan
+ # output in a given format at the end of a scan.
+ #
+      #   Each entry has this form:
+ # plugin-name = fully.qualified.module:PluginClass
+ # where plugin-name must be a unique name for this entrypoint.
+ #
+      # See also plugincode.output module for details and doc.
+ 'scancode_output': [
+ 'html = formattedcode.output_html:HtmlOutput',
+ 'html-app = formattedcode.output_html:HtmlAppOutput',
+ 'json = formattedcode.output_json:JsonCompactOutput',
+ 'json-pp = formattedcode.output_json:JsonPrettyOutput',
+ 'spdx-tv = formattedcode.output_spdx:SpdxTvOutput',
+ 'spdx-rdf = formattedcode.output_spdx:SpdxRdfOutput',
+ 'csv = formattedcode.output_csv:CsvOutput',
+ 'jsonlines = formattedcode.output_jsonlines:JsonLinesOutput',
+ 'template = formattedcode.output_html:CustomTemplateOutput',
+ ],
},
)
diff --git a/src/cluecode/copyrights.py b/src/cluecode/copyrights.py
index 28bc3db1c1c..a7bce5f9f99 100644
--- a/src/cluecode/copyrights.py
+++ b/src/cluecode/copyrights.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -30,19 +30,19 @@
import os
import re
-
+# important: this sets re._MAXCACHE
import commoncode
+
from textcode import analysis
from cluecode import copyrights_hint
-
-COPYRIGHT_TRACE = 0
+TRACE = 0
logger = logging.getLogger(__name__)
-if os.environ.get('SCANCODE_COPYRIGHT_DEBUG'):
+if os.environ.get('SCANCODE_DEBUG_COPYRIGHT'):
import sys
logging.basicConfig(stream=sys.stdout)
logger.setLevel(logging.DEBUG)
- COPYRIGHT_TRACE = 0
+ TRACE = 1
"""
Detect and collect copyright statements.
@@ -132,7 +132,6 @@ def detect(location):
'\ ' # html entity sometimes are double escaped
')*') # repeated 0 or more times
-
_YEAR_PUNCT = _YEAR + _PUNCT
_YEAR_YEAR_PUNCT = _YEAR_YEAR + _PUNCT
_YEAR_SHORT_PUNCT = _YEAR_SHORT + _PUNCT
@@ -1068,11 +1067,12 @@ class CopyrightDetector(object):
"""
Class to detect copyrights and authorship.
"""
+
def __init__(self):
from nltk import RegexpTagger
from nltk import RegexpParser
self.tagger = RegexpTagger(patterns)
- self.chunker = RegexpParser(grammar, trace=COPYRIGHT_TRACE)
+ self.chunker = RegexpParser(grammar, trace=0)
@classmethod
def as_str(cls, node, ignores=()):
@@ -1385,24 +1385,29 @@ def lowercase_well_known_word(text):
lines_append(' '.join(words))
return '\n'.join(lines)
-
# FIXME: instead of using functions, use plain re and let the re cache do its work
+
def IGNORED_PUNCTUATION_RE():
return re.compile(r'[*#"%\[\]\{\}`]+', re.I | re.M | re.U)
+
def ASCII_LINE_DECO_RE():
return re.compile(r'[-_=!\\*]{2,}')
+
def ASCII_LINE_DECO2_RE():
return re.compile(r'/{3,}')
+
def WHITESPACE_RE():
return re.compile(r' +')
+
def MULTIQUOTES_RE():
return re.compile(r"\'{2,}")
+
# TODO: add debian POS name taggings
def DEBIAN_COPYRIGHT_TAGS_RE():
return re.compile(r"(\|\)")
@@ -1417,7 +1422,7 @@ def prepare_text_line(line):
# strip whitespace
line = line.strip()
- #FIXME: how did we get line returns in this????
+ # FIXME: how did we get line returns in this????
line = line.replace('\n', ' ')
# remove some junk in man pages: \(co
diff --git a/src/cluecode/copyrights_hint.py b/src/cluecode/copyrights_hint.py
index 90f2be1be0f..33739cfb368 100644
--- a/src/cluecode/copyrights_hint.py
+++ b/src/cluecode/copyrights_hint.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
-# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -23,7 +23,6 @@
# ScanCode is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
-
from __future__ import absolute_import
from datetime import datetime
@@ -37,7 +36,6 @@
years = r'[\(\.,\-\)\s]+(' + '|'.join(years) + r')[\(\.,\-\)\s]+'
years = re.compile(years).findall
-
statement_markers = u'''
©
cop
@@ -54,7 +52,6 @@
devel
'''.split()
-
# (various copyright/copyleft signs tm, r etc) http://en.wikipedia.org/wiki/Copyright_symbol
# ™ U+2122 TRADE MARK SIGN, decimal: 8482, HTML: ™, UTF-8: 0xE2 0x84 0xA2, block: Letterlike Symbols, decomposition: U+0054 U+004D
@@ -63,7 +60,6 @@
# � U+00AE (174)
# � U+2122 (8482)
-
'''HTML Entity (decimal) ©
HTML Entity (hex) ©
HTML Entity (named) ©
@@ -79,13 +75,11 @@
Python source code u"\u00A9"
'''
-
end_of_statement = '''
rights reserve
right reserve
'''.split()
-
# others stuffs
'''
®
diff --git a/src/cluecode/finder.py b/src/cluecode/finder.py
index a1bc5aee525..5ba20343b63 100644
--- a/src/cluecode/finder.py
+++ b/src/cluecode/finder.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -22,22 +22,36 @@
# ScanCode is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
-from __future__ import absolute_import, print_function
+from __future__ import absolute_import
+from __future__ import print_function
-import logging
import string
import re
import url as urlpy
import ipaddress
-from textcode import analysis
from cluecode import finder_data
+from textcode import analysis
+
+# Tracing flags
+TRACE = False
-LOG = logging.getLogger(__name__)
+def logger_debug(*args):
+ pass
-DEBUG = False
+
+if TRACE:
+ import logging
+ import sys
+ logger = logging.getLogger(__name__)
+ # logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
+ logging.basicConfig(stream=sys.stdout)
+ logger.setLevel(logging.DEBUG)
+
+ def logger_debug(*args):
+ return logger.debug(' '.join(isinstance(a, basestring) and a or repr(a) for a in args))
"""
Find patterns in text lines such as a emails and URLs.
@@ -53,18 +67,18 @@ def find(location, patterns):
Note: the location can be a list of lines for testing convenience.
"""
- if DEBUG:
+ if TRACE:
from pprint import pformat
loc = pformat(location)
- print('find(location=%(loc)r,\n patterns=%(patterns)r)' % locals())
+ logger_debug('find(location=%(loc)r,\n patterns=%(patterns)r)' % locals())
for i, line in enumerate(analysis.text_lines(location)):
lineno = i + 1
for key, pattern in patterns:
for match in pattern.findall(line):
- if DEBUG:
- print('find: yielding match: key=%(key)r, '
+ if TRACE:
+ logger_debug('find: yielding match: key=%(key)r, '
'match=%(match)r,\n line=%(line)r' % locals())
yield key, unicode(match), line, lineno
@@ -110,11 +124,12 @@ def build_regex_filter(pattern):
Return a filter function using regex pattern, filtering out matches
matching this regex. The pattern should be text, not a compiled re.
"""
+
def re_filt(matches):
for key, match, line, lineno in matches:
if re.match(regex, match):
- if DEBUG:
- print('build_regex_filter(pattern=%(pattern)r: '
+ if TRACE:
+ logger_debug('build_regex_filter(pattern=%(pattern)r: '
'filtering match: %(match)r' % locals())
continue
yield key, match, line, lineno
@@ -122,7 +137,6 @@ def re_filt(matches):
regex = re.compile(pattern, re.UNICODE | re.I)
return re_filt
-
# A good reference page of email address regex is:
# http://fightingforalostcause.net/misc/2006/compare-email-regex.php email
# regex from http://www.regular-expressions.info/regexbuddy/email.html
@@ -172,7 +186,6 @@ def uninteresting_emails_filter(matches):
continue
yield key, email, line, lineno
-
# TODO: consider: http://www.regexguru.com/2008/11/detecting-urls-in-a-block-of-text/
# TODO: consider: http://blog.codinghorror.com/the-problem-with-urls/
@@ -180,6 +193,7 @@ def uninteresting_emails_filter(matches):
schemes = 'https?|ftps?|sftp|rsync|ssh|svn|git|hg|https?\+git|https?\+svn|https?\+hg'
url_body = '[^\s<>\[\]"]'
+
def urls_regex():
# no space, no < >, no [ ] and no double quote
return re.compile(r'''
@@ -237,8 +251,8 @@ def empty_urls_filter(matches):
for key, match, line, lineno in matches:
junk = match.lower().strip(string.punctuation).strip()
if not junk or junk in EMPTY_URLS:
- if DEBUG:
- print('empty_urls_filter: filtering match: %(match)r'
+ if TRACE:
+ logger_debug('empty_urls_filter: filtering match: %(match)r'
% locals())
continue
yield key, match, line, lineno
@@ -328,8 +342,8 @@ def user_pass_cleaning_filter(matches):
if is_filterable(match):
host, _domain = url_host_domain(match)
if not host:
- if DEBUG:
- print('user_pass_cleaning_filter: '
+ if TRACE:
+ logger_debug('user_pass_cleaning_filter: '
'filtering match(no host): %(match)r' % locals())
continue
if '@' in host:
@@ -362,14 +376,15 @@ def canonical_url_cleaner(matches):
for key, match, line, lineno in matches:
if is_filterable(match):
match = canonical_url(match)
- if DEBUG:
- print('canonical_url_cleaner: '
+ if TRACE:
+ logger_debug('canonical_url_cleaner: '
'match=%(match)r, canonic=%(canonic)r' % locals())
yield key, match , line, lineno
IP_V4_RE = r'^(\d{1,3}\.){0,3}\d{1,3}$'
+
def is_ip_v4(s):
return re.compile(IP_V4_RE).match(s)
@@ -449,7 +464,6 @@ def is_good_host(host):
return False
return finder_data.classify_ip(host)
-
# at this stage we have a host name, not an IP
if '.' not in host:
@@ -484,14 +498,14 @@ def junk_url_hosts_filter(matches):
if is_filterable(match):
host, domain = url_host_domain(match)
if not is_good_host(host):
- if DEBUG:
- print('junk_url_hosts_filter: '
+ if TRACE:
+ logger_debug('junk_url_hosts_filter: '
'!is_good_host:%(host)r): %(match)r' % locals())
continue
if not is_good_host(domain) and not is_ip(host):
- if DEBUG:
- print('junk_url_hosts_filter: ''!is_good_host:%(domain)r '
+ if TRACE:
+ logger_debug('junk_url_hosts_filter: ''!is_good_host:%(domain)r '
'and !is_ip:%(host)r: %(match)r' % locals())
continue
yield key, match, line, lineno
@@ -506,8 +520,8 @@ def junk_urls_filter(matches):
for key, match, line, lineno in matches:
good_url = finder_data.classify_url(match)
if not good_url:
- if DEBUG:
- print('junk_url_filter: %(match)r' % locals())
+ if TRACE:
+ logger_debug('junk_url_filter: %(match)r' % locals())
continue
yield key, match, line, lineno
diff --git a/src/cluecode/finder_data.py b/src/cluecode/finder_data.py
index cbecb6533ce..3baf0a8fef2 100644
--- a/src/cluecode/finder_data.py
+++ b/src/cluecode/finder_data.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -40,7 +40,6 @@ def set_from_text(text):
test.com
''')
-
JUNK_HOSTS_AND_DOMAINS = set_from_text(u'''
exmaple.com
example.com
@@ -56,12 +55,10 @@ def set_from_text(text):
hostname
''')
-
JUNK_IPS = set_from_text(u'''
1.2.3.4
''')
-
JUNK_URLS = set_from_text(u'''
http://www.adobe.com/2006/mxml
http://www.w3.org/1999/XSL/Transform
@@ -134,7 +131,6 @@ def set_from_text(text):
http://gcc.gnu.org/bugs.html
''')
-
JUNK_URL_PREFIXES = tuple(set_from_text('''
http://www.springframework.org/dtd/
http://www.slickedit.com/dtd/
@@ -175,7 +171,6 @@ def set_from_text(text):
http://www.oasis-open.org/docbook/xml/
'''))
-
JUNK_URL_SUFFIXES = tuple(set_from_text('''
.png
.jpg
diff --git a/src/commoncode/__init__.py b/src/commoncode/__init__.py
index 096c946c0db..702495e4b65 100644
--- a/src/commoncode/__init__.py
+++ b/src/commoncode/__init__.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -26,19 +26,22 @@
from __future__ import print_function
from __future__ import unicode_literals
-# set re and fnmatch _MAXCACHE to 1M to cache regex compiled aggressively
-# their default is 100 and many utilities and libraries use a lot of regex
-import re
-remax = getattr(re, '_MAXCACHE', 0)
-if remax < 1000000:
- setattr(re, '_MAXCACHE', 1000000)
-del remax
+def set_re_max_cache(max_cache=1000000):
+ """
+ Set re and fnmatch _MAXCACHE to 1M to cache regex compiled aggressively
+ their default is 100 and many utilities and libraries use a lot of regex
+ """
+ import re
+ import fnmatch
-import fnmatch
+ remax = getattr(re, '_MAXCACHE', 0)
+ if remax < max_cache:
+ setattr(re, '_MAXCACHE', max_cache)
-fnmatchmax = getattr(fnmatch, '_MAXCACHE', 0)
-if fnmatchmax < 1000000:
- setattr(fnmatch, '_MAXCACHE', 1000000)
-del fnmatchmax
-del re
+ fnmatchmax = getattr(fnmatch, '_MAXCACHE', 0)
+ if fnmatchmax < max_cache:
+ setattr(fnmatch, '_MAXCACHE', max_cache)
+
+
+set_re_max_cache()
diff --git a/src/commoncode/command.py b/src/commoncode/command.py
index ba1b6fd8e53..4594091cabb 100644
--- a/src/commoncode/command.py
+++ b/src/commoncode/command.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -27,14 +27,21 @@
from __future__ import unicode_literals
import ctypes
-import os
+import os as _os_module
+from os.path import abspath
+from os.path import exists
+from os.path import dirname
+from os.path import join
+
import logging
import signal
import subprocess
-from commoncode import fileutils
-from commoncode.fileutils import path_to_bytes
-from commoncode.fileutils import path_to_unicode
+from commoncode.fileutils import chmod
+from commoncode.fileutils import fsencode
+from commoncode.fileutils import fsdecode
+from commoncode.fileutils import get_temp_dir
+from commoncode.fileutils import RX
from commoncode import text
from commoncode import system
from commoncode.system import current_os_arch
@@ -43,21 +50,14 @@
from commoncode.system import on_windows
from commoncode.system import on_linux
-
# Python 2 and 3 support
try:
# Python 2
unicode
- str = unicode
+ str = unicode # NOQA
except NameError:
# Python 3
- unicode = str
-
-try:
- from os import fsencode
-except ImportError:
- from backports.os import fsencode
-
+ unicode = str # NOQA
"""
Minimal wrapper for executing external commands in sub-processes. The approach
@@ -81,7 +81,7 @@
# logger.setLevel(logging.DEBUG)
# current directory is the root dir of this library
-curr_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+curr_dir = dirname(dirname(abspath(__file__)))
def execute(cmd, args, root_dir=None, cwd=None, env=None, to_files=False):
@@ -108,9 +108,9 @@ def execute(cmd, args, root_dir=None, cwd=None, env=None, to_files=False):
cwd = cwd or curr_dir
# temp files for stderr and stdout
- tmp_dir = fileutils.get_temp_dir(base_dir='cmd')
- sop = os.path.join(tmp_dir, 'stdout')
- sep = os.path.join(tmp_dir, 'stderr')
+ tmp_dir = get_temp_dir(prefix='scancode-cmd-')
+ sop = join(tmp_dir, 'stdout')
+ sep = join(tmp_dir, 'stderr')
# shell==True is DANGEROUS but we are not running arbitrary commands
# though we can execute command that just happen to be in the path
@@ -144,7 +144,7 @@ def os_arch_dir(root_dir, _os_arch=current_os_arch):
Return a sub-directory of `root_dir` tailored for the current OS and
current processor architecture.
"""
- return os.path.join(root_dir, _os_arch)
+ return join(root_dir, _os_arch)
def os_noarch_dir(root_dir, _os_noarch=current_os_noarch):
@@ -152,7 +152,7 @@ def os_noarch_dir(root_dir, _os_noarch=current_os_noarch):
Return a sub-directory of `root_dir` tailored for the current OS and NOT
specific to a processor architecture.
"""
- return os.path.join(root_dir, _os_noarch)
+ return join(root_dir, _os_noarch)
def noarch_dir(root_dir, _noarch=noarch):
@@ -160,7 +160,7 @@ def noarch_dir(root_dir, _noarch=noarch):
Return a sub-directory of `root_dir` that is NOT specific to an OS or
processor architecture.
"""
- return os.path.join(root_dir, _noarch)
+ return join(root_dir, _noarch)
def get_base_dirs(root_dir,
@@ -185,14 +185,14 @@ def get_base_dirs(root_dir,
binary of any given binary. This function resolves to an actual OS/arch
location in this context.
"""
- if not root_dir or not os.path.exists(root_dir):
+ if not root_dir or not exists(root_dir):
return []
dirs = []
def find_loc(fun, arg):
loc = fun(root_dir, arg)
- if os.path.exists(loc):
+ if exists(loc):
dirs.append(loc)
if _os_arch:
@@ -217,17 +217,17 @@ def get_bin_lib_dirs(base_dir):
if not base_dir:
return None, None
- bin_dir = os.path.join(base_dir, 'bin')
+ bin_dir = join(base_dir, 'bin')
- if os.path.exists(bin_dir):
- fileutils.chmod(bin_dir, fileutils.RX, recurse=True)
+ if exists(bin_dir):
+ chmod(bin_dir, RX, recurse=True)
else:
bin_dir = None
- lib_dir = os.path.join(base_dir, 'lib')
+ lib_dir = join(base_dir, 'lib')
- if os.path.exists(lib_dir):
- fileutils.chmod(bin_dir, fileutils.RX, recurse=True)
+ if exists(lib_dir):
+ chmod(bin_dir, RX, recurse=True)
else:
# default to bin for lib if it exists
lib_dir = bin_dir or None
@@ -291,9 +291,9 @@ def get_locations(cmd, root_dir,
for base_dir in get_base_dirs(root_dir, _os_arch, _os_noarch, _noarch):
bin_dir, lib_dir = get_bin_lib_dirs(base_dir)
- cmd_loc = os.path.join(bin_dir, cmd)
- if os.path.exists(cmd_loc):
- fileutils.chmod(cmd_loc, fileutils.RX, recurse=False)
+ cmd_loc = join(bin_dir, cmd)
+ if exists(cmd_loc):
+ chmod(cmd_loc, RX, recurse=False)
return cmd_loc, bin_dir, lib_dir
else:
# we just care for getting the dirs and grab the first one
@@ -326,7 +326,7 @@ def close_pipe(p):
try:
# Ensure process death otherwise proc.wait may hang in some cases
# NB: this will run only on POSIX OSes supporting signals
- os.kill(proc.pid, signal.SIGKILL) # @UndefinedVariable
+ os.kill(proc.pid, signal.SIGKILL) # NOQA
except:
pass
@@ -341,12 +341,12 @@ def load_lib(libname, root_dir):
"""
os_dir = get_base_dirs(root_dir)[0]
_bin_dir, lib_dir = get_bin_lib_dirs(os_dir)
- so = os.path.join(lib_dir, libname + system.lib_ext)
+ so = join(lib_dir, libname + system.lib_ext)
# add lib path to the front of the PATH env var
update_path_environment(lib_dir)
- if os.path.exists(so):
+ if exists(so):
if not isinstance(so, bytes):
# ensure that the path is not Unicode...
so = fsencode(so)
@@ -356,7 +356,7 @@ def load_lib(libname, root_dir):
raise ImportError('Failed to load %(libname)s from %(so)r' % locals())
-def update_path_environment(new_path, _os_module=os):
+def update_path_environment(new_path, _os_module=_os_module):
"""
Update the PATH environment variable by adding `new_path` to the front
of PATH if `new_path` is not alreday in the PATH.
@@ -379,12 +379,12 @@ def update_path_environment(new_path, _os_module=os):
# ensure we use unicode or bytes depending on OSes
if on_linux:
- new_path = path_to_bytes(new_path)
- path_env = path_to_bytes(path_env)
+ new_path = fsencode(new_path)
+ path_env = fsencode(path_env)
sep = _os_module.pathsep
else:
- new_path = path_to_unicode(new_path)
- path_env = path_to_unicode(path_env)
+ new_path = fsdecode(new_path)
+ path_env = fsdecode(path_env)
sep = unicode(_os_module.pathsep)
path_segments = path_env.split(sep)
@@ -399,6 +399,6 @@ def update_path_environment(new_path, _os_module=os):
if not on_linux:
# recode to bytes using FS encoding
- new_path_env = path_to_bytes(new_path_env)
+ new_path_env = fsencode(new_path_env)
# ... and set the variable back as bytes
_os_module.environ[b'PATH'] = new_path_env
diff --git a/src/commoncode/dict_utils.py b/src/commoncode/dict_utils.py
index d9df72d7af8..5c71159e0af 100644
--- a/src/commoncode/dict_utils.py
+++ b/src/commoncode/dict_utils.py
@@ -36,13 +36,11 @@
import collections
import itertools
-
# Placeholder constants
FREE = -1
DUMMY = -2
-
class Dict(collections.MutableMapping):
"""
Space efficient dictionary with fast iteration and cheap resizes.
diff --git a/src/commoncode/fetch.py b/src/commoncode/fetch.py
index 6f4c34ae0a3..0a1e656a8ab 100644
--- a/src/commoncode/fetch.py
+++ b/src/commoncode/fetch.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -35,7 +35,6 @@
from commoncode import fileutils
import os
-
logger = logging.getLogger(__name__)
# import sys
# logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
@@ -65,7 +64,7 @@ def download_url(url, file_name=None, verify=True, timeout=10):
logger.error(msg)
raise Exception(msg)
- tmp_dir = fileutils.get_temp_dir(base_dir='fetch')
+ tmp_dir = fileutils.get_temp_dir(prefix='scancode-fetch-')
output_file = os.path.join(tmp_dir, file_name)
with open(output_file, 'wb') as out:
out.write(response.content)
diff --git a/src/commoncode/fileset.py b/src/commoncode/fileset.py
index 589feb88922..b36e2d51e07 100644
--- a/src/commoncode/fileset.py
+++ b/src/commoncode/fileset.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -34,18 +34,15 @@
from commoncode import paths
from commoncode.system import on_linux
-
DEBUG = False
logger = logging.getLogger(__name__)
# import sys
# logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
# logger.setLevel(logging.DEBUG)
-
POSIX_PATH_SEP = b'/' if on_linux else '/'
EMPTY_STRING = b'' if on_linux else ''
-
"""
Match files and directories paths based on inclusion and exclusion glob-style
patterns.
diff --git a/src/commoncode/filetype.py b/src/commoncode/filetype.py
index 9e24e00b12d..ca4db6f3117 100644
--- a/src/commoncode/filetype.py
+++ b/src/commoncode/filetype.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -33,11 +33,11 @@
from commoncode.system import on_posix
from commoncode.functional import memoize
-
"""
Low level file type utilities, essentially a wrapper around os.path and stat.
"""
+
def is_link(location):
"""
Return True if `location` is a symbolic link.
@@ -192,6 +192,7 @@ def get_last_modified_date(location):
'file_size': os.path.getsize,
}
+
@memoize
def counter(location, counting_function):
"""
diff --git a/src/commoncode/fileutils.py b/src/commoncode/fileutils.py
index a0c236212e1..9f45aee8b4a 100644
--- a/src/commoncode/fileutils.py
+++ b/src/commoncode/fileutils.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -23,24 +23,24 @@
# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
from __future__ import absolute_import
-from __future__ import unicode_literals
from __future__ import print_function
+from __future__ import unicode_literals
# Python 2 and 3 support
try:
# Python 2
unicode
- str = unicode
+ str = unicode # NOQA
except NameError:
# Python 3
- unicode = str
+ unicode = str # NOQA
try:
from os import fsencode
+ from os import fsdecode
except ImportError:
from backports.os import fsencode
- from backports.os import fsdecode
-
+ from backports.os import fsdecode # NOQA
import codecs
import errno
@@ -52,19 +52,21 @@
import sys
import tempfile
+try:
+ from scancode_config import scancode_temp_dir
+except ImportError:
+ scancode_temp_dir = None
from commoncode import filetype
from commoncode.filetype import is_rwx
-from commoncode import system
from commoncode.system import on_linux
from commoncode import text
# this exception is not available on posix
try:
- WindowsError # @UndefinedVariable
+ WindowsError # NOQA
except NameError:
- WindowsError = None # @ReservedAssignment
-
+ WindowsError = None # NOQA
TRACE = False
@@ -72,9 +74,11 @@
logger = logging.getLogger(__name__)
+
def logger_debug(*args):
pass
+
if TRACE:
logging.basicConfig(stream=sys.stdout)
logger.setLevel(logging.DEBUG)
@@ -82,7 +86,6 @@ def logger_debug(*args):
def logger_debug(*args):
return logger.debug(' '.join(isinstance(a, basestring) and a or repr(a) for a in args))
-
# Paths can only be sanely handled as raw bytes on Linux
PATH_TYPE = bytes if on_linux else unicode
POSIX_PATH_SEP = b'/' if on_linux else '/'
@@ -98,6 +101,7 @@ def logger_debug(*args):
# DIRECTORIES
#
+
def create_dir(location):
"""
Create directory and all sub-directories recursively at location ensuring these
@@ -115,7 +119,7 @@ def create_dir(location):
# FIXME: consider using UNC ?\\ paths
if on_linux:
- location = path_to_bytes(location)
+ location = fsencode(location)
try:
os.makedirs(location)
chmod(location, RW, recurse=False)
@@ -137,37 +141,39 @@ def create_dir(location):
raise
-def system_temp_dir():
+def get_temp_dir(base_dir=scancode_temp_dir, prefix=''):
"""
- Return the global temp directory for the current user.
+ Return the path to a new existing unique temporary directory, created under
+ the `base_dir` base directory using the `prefix` prefix.
+ If `base_dir` is not provided, use the 'SCANCODE_TMP' env var or the system
+ temp directory.
+
+ WARNING: do not change this code without changing scancode_config.py too
"""
- temp_dir = os.getenv('SCANCODE_TMP')
- if not temp_dir:
- sc = text.python_safe_name('scancode_' + system.username)
- temp_dir = os.path.join(tempfile.gettempdir(), sc)
- if on_linux:
- temp_dir = path_to_bytes(temp_dir)
- create_dir(temp_dir)
- return temp_dir
+ has_base = bool(base_dir)
+ if not has_base:
+ base_dir = os.getenv('SCANCODE_TMP')
+ if not base_dir:
+ base_dir = tempfile.gettempdir()
+ else:
+ if on_linux:
+ base_dir = fsencode(base_dir)
+ create_dir(base_dir)
+
+ if not has_base:
+ prefix = 'scancode-tk-'
-def get_temp_dir(base_dir, prefix=''):
- """
- Return the path to a new unique temporary directory, created under
- the system-wide `system_temp_dir` temp directory as a subdir of the
- base_dir path (a path relative to the `system_temp_dir`).
- """
if on_linux:
- base_dir = path_to_bytes(base_dir)
- prefix = path_to_bytes(prefix)
- base = os.path.join(system_temp_dir(), base_dir)
- create_dir(base)
- return tempfile.mkdtemp(prefix=prefix, dir=base)
+ prefix = fsencode(prefix)
+
+ return tempfile.mkdtemp(prefix=prefix, dir=base_dir)
#
# FILE READING
#
+
def file_chunks(file_object, chunk_size=1024):
"""
Yield a file piece by piece. Default chunk size: 1k.
@@ -190,7 +196,7 @@ def _text(location, encoding, universal_new_lines=True):
Python2.6 see http://bugs.python.org/issue691291
"""
if on_linux:
- location = path_to_bytes(location)
+ location = fsencode(location)
with codecs.open(location, 'r', encoding) as f:
text = f.read()
if universal_new_lines:
@@ -215,25 +221,6 @@ def read_text_file(location, universal_new_lines=True):
# TODO: move these functions to paths.py or codecs.py
-def path_to_unicode(path):
- """
- Return a path string `path` as a unicode string.
- """
- if isinstance(path, unicode):
- return path
- if TRACE: logger_debug('path_to_unicode:', fsdecode(path))
- return fsdecode(path)
-
-
-def path_to_bytes(path):
- """
- Return a `path` string as a byte string using the filesystem encoding.
- """
- if isinstance(path, bytes):
- return path
- if TRACE: logger_debug('path_to_bytes:' , repr(fsencode(path)))
- return fsencode(path)
-
def is_posixpath(location):
"""
@@ -328,6 +315,53 @@ def file_extension(path, force_posix=False):
return splitext(path, force_posix)[1]
+def splitext_name(file_name, is_file=True):
+ """
+    Return a tuple of Unicode strings (basename, extension) for a file name. The
+    basename is the file name minus its extension. Return an empty extension
+    string for a directory. Not the same as os.path.splitext.
+
+ For example:
+ >>> expected = 'path', '.ext'
+ >>> assert expected == splitext_name('path.ext')
+
+ Directories even with dotted names have no extension:
+ >>> expected = 'path.ext', ''
+ >>> assert expected == splitext_name('path.ext', is_file=False)
+
+ >>> expected = 'file', '.txt'
+ >>> assert expected == splitext_name('file.txt')
+
+ Composite extensions for tarballs are properly handled:
+ >>> expected = 'archive', '.tar.gz'
+ >>> assert expected == splitext_name('archive.tar.gz')
+
+    dotfiles are properly handled:
+ >>> expected = '.dotfile', ''
+ >>> assert expected == splitext_name('.dotfile')
+ >>> expected = '.dotfile', '.this'
+ >>> assert expected == splitext_name('.dotfile.this')
+ """
+
+ if not file_name:
+ return '', ''
+ file_name = fsdecode(file_name)
+
+ if not is_file:
+ return file_name, ''
+
+ if file_name.startswith('.') and '.' not in file_name[1:]:
+ # .dot files base name is the full name and they do not have an extension
+ return file_name, ''
+
+ base_name, extension = posixpath.splitext(file_name)
+    # handle composed extensions of tar.gz, tar.bz, tar.xz, etc.
+ if base_name.endswith('.tar'):
+ base_name, extension2 = posixpath.splitext(base_name)
+ extension = extension2 + extension
+ return base_name, extension
+
+# TODO: FIXME: this is badly broken!!!!
def splitext(path, force_posix=False):
"""
Return a tuple of strings (basename, extension) for a path. The basename is
@@ -382,6 +416,7 @@ def splitext(path, force_posix=False):
# DIRECTORY AND FILES WALKING/ITERATION
#
+
ignore_nothing = lambda _: False
@@ -397,7 +432,7 @@ def walk(location, ignored=ignore_nothing):
- location is a directory or a file: for a file, the file is returned.
"""
if on_linux:
- location = path_to_bytes(location)
+ location = fsencode(location)
# TODO: consider using the new "scandir" module for some speed-up.
if TRACE:
@@ -432,60 +467,28 @@ def walk(location, ignored=ignore_nothing):
yield tripple
-def file_iter(location, ignored=ignore_nothing):
- """
- Return an iterable of files at `location` recursively.
-
- :param location: a file or a directory.
- :param ignored: a callable accepting a location argument and returning True
- if the location should be ignored.
- :return: an iterable of file locations.
- """
- if on_linux:
- location = path_to_bytes(location)
-
- return resource_iter(location, ignored, with_dirs=False)
-
-
-def dir_iter(location, ignored=ignore_nothing):
+def resource_iter(location, ignored=ignore_nothing, with_dirs=True):
"""
- Return an iterable of directories at `location` recursively.
-
- :param location: a directory.
- :param ignored: a callable accepting a location argument and returning True
- if the location should be ignored.
- :return: an iterable of directory locations.
- """
- if on_linux:
- location = path_to_bytes(location)
- return resource_iter(location, ignored, with_files=False)
-
-
-def resource_iter(location, ignored=ignore_nothing, with_files=True, with_dirs=True):
- """
- Return an iterable of resources at `location` recursively.
+ Return an iterable of paths at `location` recursively.
:param location: a file or a directory.
:param ignored: a callable accepting a location argument and returning True
if the location should be ignored.
- :param with_dirs: If True, include the directories.
- :param with_files: If True, include the files.
:return: an iterable of file and directory locations.
"""
- assert with_dirs or with_files, "fileutils.resource_iter: One or both of 'with_dirs' and 'with_files' is required"
if on_linux:
- location = path_to_bytes(location)
+ location = fsencode(location)
for top, dirs, files in walk(location, ignored):
- if with_files:
- for f in files:
- yield os.path.join(top, f)
if with_dirs:
for d in dirs:
yield os.path.join(top, d)
+ for f in files:
+ yield os.path.join(top, f)
#
# COPY
#
+
def copytree(src, dst):
"""
Copy recursively the `src` directory to the `dst` directory. If `dst` is an
@@ -501,8 +504,8 @@ def copytree(src, dst):
function. See fileutils.py.ABOUT for details.
"""
if on_linux:
- src = path_to_bytes(src)
- dst = path_to_bytes(dst)
+ src = fsencode(src)
+ dst = fsencode(dst)
if not filetype.is_readable(src):
chmod(src, R, recurse=False)
@@ -550,8 +553,8 @@ def copyfile(src, dst):
for details.
"""
if on_linux:
- src = path_to_bytes(src)
- dst = path_to_bytes(dst)
+ src = fsencode(src)
+ dst = fsencode(dst)
if not filetype.is_regular(src):
return
@@ -571,8 +574,8 @@ def copytime(src, dst):
for details.
"""
if on_linux:
- src = path_to_bytes(src)
- dst = path_to_bytes(dst)
+ src = fsencode(src)
+ dst = fsencode(dst)
errors = []
st = os.stat(src)
@@ -591,6 +594,7 @@ def copytime(src, dst):
# PERMISSIONS
#
+
# modes: read, write, executable
R = stat.S_IRUSR
RW = stat.S_IRUSR | stat.S_IWUSR
@@ -608,7 +612,7 @@ def chmod(location, flags, recurse=False):
if not location or not os.path.exists(location):
return
if on_linux:
- location = path_to_bytes(location)
+ location = fsencode(location)
location = os.path.abspath(location)
@@ -638,7 +642,7 @@ def chmod_tree(location, flags):
Update permissions recursively in a directory tree `location`.
"""
if on_linux:
- location = path_to_bytes(location)
+ location = fsencode(location)
if filetype.is_dir(location):
for top, dirs, files in walk(location):
for d in dirs:
@@ -650,13 +654,14 @@ def chmod_tree(location, flags):
# DELETION
#
-def _rm_handler(function, path, excinfo): # @UnusedVariable
+
+def _rm_handler(function, path, excinfo): # NOQA
"""
shutil.rmtree handler invoked on error when deleting a directory tree.
This retries deleting once before giving up.
"""
if on_linux:
- path = path_to_bytes(path)
+ path = fsencode(path)
if function == os.rmdir:
try:
chmod(path, RW, recurse=True)
@@ -686,7 +691,7 @@ def delete(location, _err_handler=_rm_handler):
return
if on_linux:
- location = path_to_bytes(location)
+ location = fsencode(location)
if os.path.exists(location) or filetype.is_broken_link(location):
chmod(os.path.dirname(location), RW, recurse=False)
diff --git a/src/commoncode/functional.py b/src/commoncode/functional.py
index 1175f98fd6c..93049018a06 100644
--- a/src/commoncode/functional.py
+++ b/src/commoncode/functional.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -37,6 +37,7 @@ def flatten(seq):
flat list of elements.
For example::
+
>>> flatten([7, (6, [5, [4, ['a'], 3]], 3), 2, 1])
[7, 6, 5, 4, 'a', 3, 3, 2, 1]
>>> def gen():
@@ -68,6 +69,7 @@ def pair_chunks(iterable):
must contain an even number of elements or it will truncated.
For example::
+
>>> list(pair_chunks([1, 2, 3, 4, 5, 6]))
[(1, 2), (3, 4), (5, 6)]
>>> list(pair_chunks([1, 2, 3, 4, 5, 6, 7]))
@@ -78,10 +80,11 @@ def pair_chunks(iterable):
def memoize(fun):
"""
- Decorate fun function and cache return values. Arguments must be
- hashable. kwargs are not handled. Used to speed up some often executed
- functions.
- Usage example::
+ Decorate `fun` function and cache return values. Arguments must be hashable.
+ Only args are supported, kwargs are not handled. Used to speed up some often
+ executed functions.
+
+ For example::
>>> @memoize
... def expensive(*args, **kwargs):
@@ -114,7 +117,7 @@ def memoized(*args, **kwargs):
# calls with kwargs are not handled and not cached
if kwargs:
return fun(*args, **kwargs)
- # convert any list arg to a tuple
+ # convert any list args to a tuple
args = tuple(tuple(arg) if isinstance(arg, (ListType, tuple, array)) else arg
for arg in args)
try:
@@ -128,10 +131,11 @@ def memoized(*args, **kwargs):
def memoize_to_attribute(attr_name, _test=False):
"""
- Decorate a method and cache return values in attr_name of the parent object.
+ Decorate a method and cache return values in `attr_name` of the parent object.
Used to speed up some often called methods that cache their values in
instance variables.
- Usage example::
+
+ For example::
>>> class Obj(object):
... def __init__(self):
@@ -153,7 +157,9 @@ def memoize_to_attribute(attr_name, _test=False):
The Obj().expensive property value will be cached to attr_name
self._expensive and computed only once in the life of the Obj instance.
"""
+
def memoized_to_attr(meth):
+
@functools.wraps(meth)
def wrapper(self, *args, **kwargs):
if getattr(self, attr_name) is None:
@@ -162,6 +168,7 @@ def wrapper(self, *args, **kwargs):
else:
res = getattr(self, attr_name)
return res
+
return wrapper
return memoized_to_attr
@@ -169,10 +176,11 @@ def wrapper(self, *args, **kwargs):
def memoize_gen(fun):
"""
- Decorate fun generator function and cache return values. Arguments must be
+ Decorate `fun` generator function and cache return values. Arguments must be
hashable. kwargs are not handled. Used to speed up some often executed
functions.
- Usage example::
+
+ For example::
>>> @memoize
... def expensive(*args, **kwargs):
@@ -215,3 +223,34 @@ def memoized(*args, **kwargs):
return memos[args]
return functools.update_wrapper(memoized, fun)
+
+
+def iter_skip(iterable, skip_first=False, skip_last=False):
+ """
+ Given an iterable, return an iterable skipping the first item if skip_first
+ is True or the last item if skip_last is True.
+ For example::
+ >>> a = iter(range(10))
+ >>> list(iter_skip(a, skip_first=True, skip_last=False))
+ [1, 2, 3, 4, 5, 6, 7, 8, 9]
+ >>> a = iter(range(10))
+ >>> list(iter_skip(a, skip_first=False, skip_last=True))
+ [0, 1, 2, 3, 4, 5, 6, 7, 8]
+ >>> a = iter(range(10))
+ >>> list(iter_skip(a, skip_first=True, skip_last=True))
+ [1, 2, 3, 4, 5, 6, 7, 8]
+ >>> a = iter(range(10))
+ >>> list(iter_skip(a, skip_first=False, skip_last=False))
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+ >>> a = iter(range(10))
+ >>> list(iter_skip(a))
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+ """
+ current = next(iterable)
+ if skip_first:
+ current = next(iterable)
+ for item in iterable:
+ yield current
+ current = item
+ if not skip_last:
+ yield current
diff --git a/src/commoncode/hash.py b/src/commoncode/hash.py
index d8f9ab94feb..d7b3f48ec40 100644
--- a/src/commoncode/hash.py
+++ b/src/commoncode/hash.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -33,7 +33,6 @@
from commoncode.codec import urlsafe_b64encode
from commoncode import filetype
-
"""
Hashes and checksums.
@@ -44,12 +43,15 @@
Checksums are operating on files.
"""
+
def _hash_mod(bitsize, hmodule):
"""
Return a hashing class returning hashes with a `bitsize` bit length. The
interface of this class is similar to the hash module API.
"""
+
class hasher(object):
+
def __init__(self, msg=None):
self.digest_size = bitsize // 8
self.h = msg and hmodule(msg).digest()[:self.digest_size] or None
@@ -94,6 +96,7 @@ class sha1_git_hasher(object):
"""
Hash content using the git blob SHA1 convention.
"""
+
def __init__(self, msg=None):
self.digest_size = 160 // 8
self.h = msg and self._compute(msg) or None
@@ -148,18 +151,23 @@ def checksum(location, name, base64=False):
def md5(location):
return checksum(location, name='md5', base64=False)
+
def sha1(location):
return checksum(location, name='sha1', base64=False)
+
def b64sha1(location):
return checksum(location, name='sha1', base64=True)
+
def sha256(location):
return checksum(location, name='sha256', base64=False)
+
def sha512(location):
return checksum(location, name='sha512', base64=False)
+
def sha1_git(location):
return checksum(location, name='sha1_git', base64=False)
diff --git a/src/commoncode/ignore.py b/src/commoncode/ignore.py
index c4a86be930b..d04e4892342 100644
--- a/src/commoncode/ignore.py
+++ b/src/commoncode/ignore.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -36,7 +36,7 @@
"""
-def is_ignored(location, ignores, unignores, skip_special=True):
+def is_ignored(location, ignores, unignores=None, skip_special=True):
"""
Return a tuple of (pattern , message) if a file at location is ignored
or False otherwise.
@@ -74,6 +74,7 @@ def get_ignores(location, include_defaults=True):
# Default ignores
#
+
ignores_MacOSX = {
'.DS_Store': 'Default ignore: MacOSX artifact',
'._.DS_Store': 'Default ignore: MacOSX artifact',
@@ -293,7 +294,6 @@ def get_ignores(location, include_defaults=True):
'/.ssh': 'Default ignore: SSH configuration',
}
-
default_ignores = {}
default_ignores.update(chain(*[d.items() for d in [
diff --git a/src/commoncode/misc.ABOUT b/src/commoncode/misc.ABOUT
deleted file mode 100644
index 5cd542f93b1..00000000000
--- a/src/commoncode/misc.ABOUT
+++ /dev/null
@@ -1,8 +0,0 @@
-about_resource: misc.py
-download_url:
- - http://code.activestate.com/recipes/578433-mixin-for-pickling-objects-with-__slots__/
-
-dje_license: mit
-license_text_file: misc.LICENSE
-copyright: Copyright (c) 2013 Oren Tirosh
-owner: Oren Tirosh
diff --git a/src/commoncode/misc.LICENSE b/src/commoncode/misc.LICENSE
deleted file mode 100644
index 4a72b80190d..00000000000
--- a/src/commoncode/misc.LICENSE
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright (c) 2013 Oren Tirosh
-#
-# Permission is hereby granted, free of charge, to any person
-# obtaining a copy of this software and associated documentation files
-# (the "Software"), to deal in the Software without restriction,
-# including without limitation the rights to use, copy, modify, merge,
-# publish, distribute, sublicense, and/or sell copies of the Software,
-# and to permit persons to whom the Software is furnished to do so,
-# subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be
-# included in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-# OTHER DEALINGS IN THE SOFTWARE.
\ No newline at end of file
diff --git a/src/commoncode/misc.py b/src/commoncode/misc.py
deleted file mode 100644
index be957dfdaed..00000000000
--- a/src/commoncode/misc.py
+++ /dev/null
@@ -1,57 +0,0 @@
-#
-# Copyright (c) 2015 nexB Inc. and others. All rights reserved.
-# http://nexb.com and https://github.com/nexB/scancode-toolkit/
-# The ScanCode software is licensed under the Apache License version 2.0.
-# Data generated with ScanCode require an acknowledgment.
-# ScanCode is a trademark of nexB Inc.
-#
-# You may not use this software except in compliance with the License.
-# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
-# Unless required by applicable law or agreed to in writing, software distributed
-# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-#
-# When you publish or redistribute any data created with ScanCode or any ScanCode
-# derivative work, you must accompany this data with the following acknowledgment:
-#
-# Generated with ScanCode and provided on an "AS IS" BASIS, WITHOUT WARRANTIES
-# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
-# ScanCode should be considered or used as legal advice. Consult an Attorney
-# for any legal advice.
-# ScanCode is a free software code scanning tool from nexB Inc. and others.
-# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
-
-from __future__ import absolute_import, print_function
-
-
-class SlotPickleMixin(object):
- # SlotPickelMixin is originally from:
- # http://code.activestate.com/recipes/578433-mixin-for-pickling-objects-with-__slots__/
- # Copyright (c) 2013 Created by Oren Tirosh
- #
- # Permission is hereby granted, free of charge, to any person
- # obtaining a copy of this software and associated documentation files
- # (the "Software"), to deal in the Software without restriction,
- # including without limitation the rights to use, copy, modify, merge,
- # publish, distribute, sublicense, and/or sell copies of the Software,
- # and to permit persons to whom the Software is furnished to do so,
- # subject to the following conditions:
- #
- # The above copyright notice and this permission notice shall be
- # included in all copies or substantial portions of the Software.
- #
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- # OTHER DEALINGS IN THE SOFTWARE.
- def __getstate__(self):
- return {slot: getattr(self, slot) for slot in self.__slots__ if hasattr(self, slot)}
-
- def __setstate__(self, state):
- for slot, value in state.items():
- setattr(self, slot, value)
diff --git a/src/commoncode/paths.py b/src/commoncode/paths.py
index 903eba1e224..17defd15a52 100644
--- a/src/commoncode/paths.py
+++ b/src/commoncode/paths.py
@@ -38,13 +38,11 @@
from commoncode.fileutils import is_posixpath
from commoncode.system import on_linux
-
"""
Various path utilities such as common prefix and suffix functions, conversion
to OS-safe paths and to POSIX paths.
"""
-
POSIX_PATH_SEP = b'/' if on_linux else '/'
WIN_PATH_SEP = b'\\' if on_linux else '\\'
EMPTY_STRING = b'' if on_linux else ''
@@ -52,6 +50,7 @@
#
# Build OS-portable and safer paths
+
def safe_path(path, posix=False):
"""
Convert `path` to a safe and portable POSIX path usable on multiple OSes. The
@@ -78,8 +77,6 @@ def safe_path(path, posix=False):
segments = [s.strip() for s in path.split(path_sep) if s.strip()]
segments = [portable_filename(s) for s in segments]
- # print('safe_path: orig:', orig_path, 'segments:', segments)
-
if not segments:
return '_'
@@ -89,7 +86,6 @@ def safe_path(path, posix=False):
return as_posixpath(path)
-
def path_handlers(path, posix=True):
"""
Return a path module and path separator to use for handling (e.g. split and join)
@@ -223,7 +219,6 @@ def portable_filename(filename):
if basename.lower() in windows_illegal_names:
filename = ''.join([basename, '_', dot, extension])
-
# no name made only of dots.
if set(filename) == set(['.']):
filename = 'dot' * len(filename)
@@ -239,6 +234,7 @@ def portable_filename(filename):
# paths comparisons, common prefix and suffix extraction
#
+
def common_prefix(s1, s2):
"""
Return the common leading subsequence of two sequences and its length.
diff --git a/src/commoncode/saneyaml.py b/src/commoncode/saneyaml.py
index 2de2aa78d74..17634a6f39e 100644
--- a/src/commoncode/saneyaml.py
+++ b/src/commoncode/saneyaml.py
@@ -22,7 +22,6 @@
# ScanCode is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
-
from __future__ import absolute_import
from __future__ import print_function
@@ -38,7 +37,6 @@
from yaml import SafeLoader
from yaml import SafeDumper
-
"""
Wrapper around PyYAML to provide sane defaults ensuring that dump/load does not
damage content, keeps ordering, use always block-style and use four spaces
@@ -57,6 +55,7 @@
# https://pypi.python.org/pypi/ruamel.yaml/0.9.1
# https://pypi.python.org/pypi/yaml2rst/0.2
+
def load(s):
"""
Return an object safely loaded from YAML string `s`. `s` must be unicode
@@ -90,6 +89,7 @@ class SaneLoader(SafeLoader):
"""
A safe loader configured with many sane defaults.
"""
+
def ignore_aliases(self, data):
return True
@@ -120,6 +120,7 @@ def string_loader(loader, node):
# keep boolean conversion
# SaneLoader.add_constructor(u'tag:yaml.org,2002:boolean', string_loader)
+
def ordered_loader(loader, node):
"""
Ensure that YAML maps ordered is preserved and loaded in an OrderedDict.
@@ -143,6 +144,7 @@ def ordered_loader(loader, node):
class SaneDumper(SafeDumper):
+
def increase_indent(self, flow=False, indentless=False):
"""
Ensure that lists items are always indented.
@@ -162,6 +164,7 @@ def ordered_dumper(dumper, data):
"""
return dumper.represent_mapping(u'tag:yaml.org,2002:map', data.items())
+
SaneDumper.add_representer(OrderedDict, ordered_dumper)
@@ -171,6 +174,7 @@ def null_dumper(dumper, value):
"""
return dumper.represent_scalar(u'tag:yaml.org,2002:null', u'')
+
SafeDumper.add_representer(type(None), null_dumper)
@@ -210,4 +214,5 @@ def boolean_dumper(dumper, value):
style = None
return dumper.represent_scalar(u'tag:yaml.org,2002:bool', value, style=style)
+
SaneDumper.add_representer(bool, boolean_dumper)
diff --git a/src/commoncode/system.py b/src/commoncode/system.py
index 4cfc520e726..250d1f4b0e6 100644
--- a/src/commoncode/system.py
+++ b/src/commoncode/system.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -50,8 +50,9 @@ def os_arch():
raise Exception('Unsupported OS/platform %r' % sys_platform)
return os, arch
-
# FIXME use these for architectures
+
+
'''
darwin/386
darwin/amd64
@@ -85,12 +86,10 @@ def os_arch():
on_linux = current_os == 'linux'
on_posix = not on_windows and (on_mac or on_linux)
-
current_os_arch = '%(current_os)s-%(current_arch)s' % locals()
noarch = 'noarch'
current_os_noarch = '%(current_os)s-%(noarch)s' % locals()
-
#
# Shared library file extensions
#
@@ -101,25 +100,19 @@ def os_arch():
if on_linux:
lib_ext = '.so'
-
#
# Python versions
#
-py27 = (sys.version_info[0] == 2 and sys.version_info[1] == 7)
-py34 = (sys.version_info[0] == 3 and sys.version_info[1] == 4)
-py35 = (sys.version_info[0] == 3 and sys.version_info[1] == 5)
-py35 = (sys.version_info[0] == 3 and sys.version_info[1] == 6)
-#
-# User related
-#
-if on_windows:
- user_home = os.path.join(os.path.expandvars('$HOMEDRIVE'),
- os.path.expandvars('$HOMEPATH'))
-else:
- user_home = os.path.expanduser('~')
-
-username = getpass.getuser()
-
+_sys_v0 = sys.version_info[0]
+py2 = _sys_v0 == 2
+py3 = _sys_v0 == 3
+
+_sys_v1 = sys.version_info[1]
+py27 = py2 and _sys_v1 == 7
+py34 = py3 and _sys_v1 == 4
+py35 = py3 and _sys_v1 == 5
+py36 = py3 and _sys_v1 == 6
+py37 = py3 and _sys_v1 == 7
# Do not let Windows error pop up messages with default SetErrorMode
# See http://msdn.microsoft.com/en-us/library/ms680621(VS100).aspx
diff --git a/src/commoncode/testcase.py b/src/commoncode/testcase.py
index 780fd74a29d..d341e4b7173 100644
--- a/src/commoncode/testcase.py
+++ b/src/commoncode/testcase.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -22,7 +22,6 @@
# ScanCode is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
-
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
@@ -39,7 +38,7 @@
import zipfile
from commoncode import fileutils
-from commoncode.fileutils import path_to_bytes
+from commoncode.fileutils import fsencode
from commoncode import filetype
from commoncode.system import on_linux
from commoncode.system import on_posix
@@ -53,7 +52,6 @@ class EnhancedAssertions(TestCaseClass):
# always show full diff
maxDiff = None
-
def failUnlessRaisesInstance(self, excInstance, callableObj,
*args, **kwargs):
"""
@@ -79,11 +77,9 @@ def failUnlessRaisesInstance(self, excInstance, callableObj,
# to ensure that multiple tests run can be launched in parallel
test_run_temp_dir = None
-
# set to 1 to see the slow tests
timing_threshold = sys.maxint
-
POSIX_PATH_SEP = b'/' if on_linux else '/'
WIN_PATH_SEP = b'\\' if on_linux else '\\'
EMPTY_STRING = b'' if on_linux else ''
@@ -100,7 +96,7 @@ def to_os_native_path(path):
Normalize a path to use the native OS path separator.
"""
if on_linux:
- path = path_to_bytes(path)
+ path = fsencode(path)
path = path.replace(POSIX_PATH_SEP, OS_PATH_SEP)
path = path.replace(WIN_PATH_SEP, OS_PATH_SEP)
path = path.rstrip(OS_PATH_SEP)
@@ -113,8 +109,8 @@ def get_test_loc(test_path, test_data_dir, debug=False, exists=True):
location to a test file or directory for this path. No copy is done.
"""
if on_linux:
- test_path = path_to_bytes(test_path)
- test_data_dir = path_to_bytes(test_data_dir)
+ test_path = fsencode(test_path)
+ test_data_dir = fsencode(test_data_dir)
if debug:
import inspect
@@ -154,8 +150,8 @@ def get_test_loc(self, test_path, copy=False, debug=False):
"""
test_data_dir = self.test_data_dir
if on_linux:
- test_path = path_to_bytes(test_path)
- test_data_dir = path_to_bytes(test_data_dir)
+ test_path = fsencode(test_path)
+ test_data_dir = fsencode(test_data_dir)
if debug:
import inspect
@@ -189,9 +185,9 @@ def get_temp_file(self, extension=None, dir_name='td', file_name='tf'):
extension = '.txt'
if on_linux:
- extension = path_to_bytes(extension)
- dir_name = path_to_bytes(dir_name)
- file_name = path_to_bytes(file_name)
+ extension = fsencode(extension)
+ dir_name = fsencode(dir_name)
+ file_name = fsencode(file_name)
if extension and not extension.startswith(DOT):
extension = DOT + extension
@@ -211,11 +207,12 @@ def get_temp_dir(self, sub_dir_path=None):
# ensure that we have a new unique temp directory for each test run
global test_run_temp_dir
if not test_run_temp_dir:
- test_run_temp_dir = fileutils.get_temp_dir(base_dir='tst', prefix=' ')
+ # note: we add a space in the path for testing paths with spaces
+ test_run_temp_dir = fileutils.get_temp_dir(prefix='scancode-tests -')
if on_linux:
- test_run_temp_dir = path_to_bytes(test_run_temp_dir)
+ test_run_temp_dir = fsencode(test_run_temp_dir)
- new_temp_dir = fileutils.get_temp_dir(base_dir=test_run_temp_dir)
+ new_temp_dir = fileutils.get_temp_dir(base_dir=test_run_temp_dir, prefix='')
if sub_dir_path:
# create a sub directory hierarchy if requested
@@ -230,8 +227,8 @@ def remove_vcs(self, test_dir):
"""
vcses = ('CVS', '.svn', '.git', '.hg')
if on_linux:
- vcses = tuple(path_to_bytes(p) for p in vcses)
- test_dir = path_to_bytes(test_dir)
+ vcses = tuple(fsencode(p) for p in vcses)
+ test_dir = fsencode(test_dir)
for root, dirs, files in os.walk(test_dir):
for vcs_dir in vcses:
@@ -247,7 +244,6 @@ def remove_vcs(self, test_dir):
map(os.remove, [os.path.join(root, file_loc)
for file_loc in files if file_loc.endswith(tilde)])
-
def __extract(self, test_path, extract_func=None, verbatim=False):
"""
Given an archive file identified by test_path relative
@@ -257,14 +253,14 @@ def __extract(self, test_path, extract_func=None, verbatim=False):
"""
assert test_path and test_path != ''
if on_linux:
- test_path = path_to_bytes(test_path)
+ test_path = fsencode(test_path)
test_path = to_os_native_path(test_path)
target_path = os.path.basename(test_path)
target_dir = self.get_temp_dir(target_path)
original_archive = self.get_test_loc(test_path)
if on_linux:
- target_dir = path_to_bytes(target_dir)
- original_archive = path_to_bytes(original_archive)
+ target_dir = fsencode(target_dir)
+ original_archive = fsencode(original_archive)
extract_func(original_archive, target_dir,
verbatim=verbatim)
return target_dir
@@ -272,6 +268,9 @@ def __extract(self, test_path, extract_func=None, verbatim=False):
def extract_test_zip(self, test_path, *args, **kwargs):
return self.__extract(test_path, extract_zip)
+ def extract_test_zip_raw(self, test_path, *args, **kwargs):
+ return self.__extract(test_path, extract_zip_raw)
+
def extract_test_tar(self, test_path, verbatim=False):
return self.__extract(test_path, extract_tar, verbatim)
@@ -289,12 +288,13 @@ def _extract_tar_raw(test_path, target_dir, to_bytes, *args, **kwargs):
"""
if to_bytes:
# use bytes for paths on ALL OSes (though this may fail on macOS)
- target_dir = path_to_bytes(target_dir)
- test_path = path_to_bytes(test_path)
+ target_dir = fsencode(target_dir)
+ test_path = fsencode(test_path)
tar = tarfile.open(test_path)
tar.extractall(path=target_dir)
tar.close()
+
extract_tar_raw = partial(_extract_tar_raw, to_bytes=True)
extract_tar_uni = partial(_extract_tar_raw, to_bytes=False)
@@ -307,8 +307,8 @@ def extract_tar(location, target_dir, verbatim=False, *args, **kwargs):
"""
# always for using bytes for paths on all OSses... tar seems to use bytes internally
# and get confused otherwise
- location = path_to_bytes(location)
- target_dir = path_to_bytes(target_dir)
+ location = fsencode(location)
+ target_dir = fsencode(target_dir)
with open(location, 'rb') as input_tar:
tar = None
@@ -335,8 +335,8 @@ def extract_zip(location, target_dir, *args, **kwargs):
raise Exception('Incorrect zip file %(location)r' % locals())
if on_linux:
- location = path_to_bytes(location)
- target_dir = path_to_bytes(target_dir)
+ location = fsencode(location)
+ target_dir = fsencode(target_dir)
with zipfile.ZipFile(location) as zipf:
for info in zipf.infolist():
@@ -353,6 +353,22 @@ def extract_zip(location, target_dir, *args, **kwargs):
f.write(content)
+def extract_zip_raw(location, target_dir, *args, **kwargs):
+ """
+ Extract a zip archive file at location in the target_dir directory.
+ Use the built-in ZipFile.extractall function.
+ """
+ if not os.path.isfile(location) or not zipfile.is_zipfile(location):
+ raise Exception('Incorrect zip file %(location)r' % locals())
+
+ if on_linux:
+ location = fsencode(location)
+ target_dir = fsencode(target_dir)
+
+ with zipfile.ZipFile(location) as zipf:
+ zipf.extractall(path=target_dir)
+
+
def tar_can_extract(tarinfo, verbatim):
"""
Return True if a tar member can be extracted to handle OS specifics.
diff --git a/src/commoncode/text.py b/src/commoncode/text.py
index 5be67b83fba..b613df485ca 100644
--- a/src/commoncode/text.py
+++ b/src/commoncode/text.py
@@ -35,15 +35,13 @@
import chardet
from text_unidecode import unidecode
-
# Python 2 and 3 support
try:
# Python 2
unicode
except NameError:
# Python 3
- unicode = str
-
+ unicode = str # NOQA
"""
A text processing module providing functions to process and prepare text
@@ -54,7 +52,6 @@
- line separator stripping and conversion
"""
-
LOG = logging.getLogger(__name__)
diff --git a/src/commoncode/timeutils.py b/src/commoncode/timeutils.py
index 9db6613508d..99cc33db260 100644
--- a/src/commoncode/timeutils.py
+++ b/src/commoncode/timeutils.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -24,23 +24,28 @@
from __future__ import absolute_import, print_function
-
-from datetime import datetime, tzinfo
+from datetime import datetime
+from datetime import tzinfo
+from functools import update_wrapper
+from functools import wraps
+from time import time
"""
Time is of the essence: path safe time stamps creation and conversion to
datetime objects.
"""
+
class UTC(tzinfo):
"""UTC timezone"""
- def utcoffset(self, dt): # @UnusedVariable
+
+ def utcoffset(self, dt): # NOQA
return None
- def tzname(self, dt): # @UnusedVariable
+ def tzname(self, dt): # NOQA
return 'UTC'
- def dst(self, dt): # @UnusedVariable
+ def dst(self, dt): # NOQA
return None
@@ -60,7 +65,8 @@ def time2tstamp(dt=None):
For times, the ISO 8601 format specifies either a colon : (extended format)
or nothing as a separator (basic format). Here Python defaults to using a
- colon. We therefore remove all the colons to be file system safe.
+ colon. We therefore remove all the colons to be safe across filesystems. (a
+ colon is not a valid path char on Windows)
Another character may show up in the ISO representation such as / for time
intervals. We could replace the forward slash with a double hyphen (--) as
@@ -99,3 +105,22 @@ def tstamp2time(stamp):
if 0 <= microsec <= 999999:
datim = datim.replace(microsecond=microsec)
return datim
+
+
+def timed(fun):
+ """
+ Decorate `fun` callable to return a tuple of (timing, result) where timing
+ is a function execution time in seconds as a float and result is the value
+ returned by calling `fun`.
+
+ Note: this decorator will not work as expected for functions that return
+ generators.
+ """
+
+ @wraps(fun)
+ def _timed(*args, **kwargs):
+ start = time()
+ result = fun(*args, **kwargs)
+ return time() - start, result
+
+ return update_wrapper(_timed, fun)
diff --git a/src/commoncode/version.py b/src/commoncode/version.py
index c980ee04f87..61323746a8a 100644
--- a/src/commoncode/version.py
+++ b/src/commoncode/version.py
@@ -24,7 +24,6 @@
from __future__ import absolute_import, print_function
-
import re
from commoncode.system import on_linux
diff --git a/src/extractcode/__init__.py b/src/extractcode/__init__.py
index 6c6ed472ac9..8b70e72c64c 100644
--- a/src/extractcode/__init__.py
+++ b/src/extractcode/__init__.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -33,12 +33,16 @@
import shutil
import sys
-from commoncode import fileutils
+from commoncode.fileutils import as_posixpath
+from commoncode.fileutils import create_dir
+from commoncode.fileutils import file_name
+from commoncode.fileutils import fsencode
+from commoncode.fileutils import parent_directory
from commoncode.text import toascii
from commoncode.system import on_linux
-from commoncode.fileutils import path_to_bytes
-from commoncode.system import on_linux
-
+from os.path import dirname
+from os.path import join
+from os.path import exists
logger = logging.getLogger(__name__)
DEBUG = False
@@ -46,9 +50,7 @@
# logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
# logger.setLevel(logging.DEBUG)
-
-root_dir = os.path.join(os.path.dirname(__file__), 'bin')
-
+root_dir = join(dirname(__file__), 'bin')
POSIX_PATH_SEP = b'/' if on_linux else '/'
WIN_PATH_SEP = b'\\' if on_linux else '\\'
@@ -61,7 +63,6 @@
# Suffix added to extracted target_dir paths
EXTRACT_SUFFIX = b'-extract' if on_linux else r'-extract'
-
# high level archive "kinds"
docs = 1
regular = 2
@@ -71,7 +72,6 @@
patches = 6
special_package = 7
-
kind_labels = {
1: 'docs',
2: 'regular',
@@ -103,7 +103,7 @@ def is_extraction_path(path):
Return True is the path points to an extraction path.
"""
if on_linux:
- path = path_to_bytes(path)
+ path = fsencode(path)
return path and path.rstrip(PATHS_SEPS).endswith(EXTRACT_SUFFIX)
@@ -114,8 +114,8 @@ def is_extracted(location):
extraction location.
"""
if on_linux:
- location = path_to_bytes(location)
- return location and os.path.exists(get_extraction_path(location))
+ location = fsencode(location)
+ return location and exists(get_extraction_path(location))
def get_extraction_path(path):
@@ -123,7 +123,7 @@ def get_extraction_path(path):
Return a path where to extract.
"""
if on_linux:
- path = path_to_bytes(path)
+ path = fsencode(path)
return path.rstrip(PATHS_SEPS) + EXTRACT_SUFFIX
@@ -132,7 +132,7 @@ def remove_archive_suffix(path):
Remove all the extracted suffix from a path.
"""
if on_linux:
- path = path_to_bytes(path)
+ path = fsencode(path)
return re.sub(EXTRACT_SUFFIX, EMPTY_STRING, path)
@@ -142,25 +142,25 @@ def remove_backslashes_and_dotdots(directory):
Return a list of errors if any.
"""
if on_linux:
- directory = path_to_bytes(directory)
+ directory = fsencode(directory)
errors = []
for top, _, files in os.walk(directory):
for filename in files:
if not (WIN_PATH_SEP in filename or DOTDOT in filename):
continue
try:
- new_path = fileutils.as_posixpath(filename)
+ new_path = as_posixpath(filename)
new_path = new_path.strip(POSIX_PATH_SEP)
new_path = posixpath.normpath(new_path)
new_path = new_path.replace(DOTDOT, POSIX_PATH_SEP)
new_path = new_path.strip(POSIX_PATH_SEP)
new_path = posixpath.normpath(new_path)
segments = new_path.split(POSIX_PATH_SEP)
- directory = os.path.join(top, *segments[:-1])
- fileutils.create_dir(directory)
- shutil.move(os.path.join(top, filename), os.path.join(top, *segments))
+ directory = join(top, *segments[:-1])
+ create_dir(directory)
+ shutil.move(join(top, filename), join(top, *segments))
except Exception:
- errors.append(os.path.join(top, filename))
+ errors.append(join(top, filename))
return errors
@@ -180,16 +180,16 @@ def new_name(location, is_dir=False):
"""
assert location
if on_linux:
- location = path_to_bytes(location)
+ location = fsencode(location)
location = location.rstrip(PATHS_SEPS)
assert location
- parent = fileutils.parent_directory(location)
+ parent = parent_directory(location)
# all existing files or directory as lower case
siblings_lower = set(s.lower() for s in os.listdir(parent))
- filename = fileutils.file_name(location)
+ filename = file_name(location)
# corner case
if filename in (DOT, DOT):
@@ -197,7 +197,7 @@ def new_name(location, is_dir=False):
# if unique, return this
if filename.lower() not in siblings_lower:
- return os.path.join(parent, filename)
+ return join(parent, filename)
# otherwise seek a unique name
if is_dir:
@@ -219,7 +219,7 @@ def new_name(location, is_dir=False):
if filename.lower() not in siblings_lower:
break
counter += 1
- return os.path.join(parent, filename)
+ return join(parent, filename)
# TODO: use attrs and slots
@@ -289,14 +289,18 @@ def to_dict(self):
class ExtractError(Exception):
pass
+
class ExtractErrorPasswordProtected(ExtractError):
pass
+
class ExtractErrorFailedToExtract(ExtractError):
pass
+
class ExtractWarningIncorrectEntry(ExtractError):
pass
+
class ExtractWarningTrailingGarbage(ExtractError):
pass
diff --git a/src/extractcode/archive.py b/src/extractcode/archive.py
index 673bcd1199b..555b33b5a0c 100644
--- a/src/extractcode/archive.py
+++ b/src/extractcode/archive.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2016 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -33,6 +33,7 @@
from commoncode import fileutils
from commoncode import filetype
+from commoncode.system import on_linux
import typecode
from extractcode import all_kinds
@@ -49,9 +50,6 @@
from extractcode import libarchive2
from extractcode.uncompress import uncompress_gzip
from extractcode.uncompress import uncompress_bzip2
-from commoncode.system import on_linux
-from commoncode.fileutils import path_to_bytes
-
logger = logging.getLogger(__name__)
TRACE = False
@@ -62,8 +60,6 @@
logging.basicConfig(stream=sys.stdout)
logger.setLevel(logging.DEBUG)
-
-
"""
Archive formats handling. The purpose of this module is to select an extractor
suitable for the accurate extraction of a given kind of archive. An extractor is
@@ -150,7 +146,7 @@ def get_best_handler(location, kinds=all_kinds):
Return the best handler of None for the file at location.
"""
if on_linux:
- location = path_to_bytes(location)
+ location = fileutils.fsencode(location)
location = os.path.abspath(os.path.expanduser(location))
if not filetype.is_file(location):
return
@@ -166,7 +162,7 @@ def get_handlers(location):
extension_matched,) for this `location`.
"""
if on_linux:
- location = path_to_bytes(location)
+ location = fileutils.fsencode(location)
if filetype.is_file(location):
T = typecode.contenttype.get_type(location)
@@ -187,7 +183,7 @@ def get_handlers(location):
exts = handler.extensions
if exts:
if on_linux:
- exts = tuple(path_to_bytes(e) for e in exts)
+ exts = tuple(fileutils.fsencode(e) for e in exts)
extension_matched = exts and location.lower().endswith(exts)
if TRACE_DEEP:
@@ -311,19 +307,19 @@ def extract_twice(location, target_dir, extractor1, extractor2):
covers most common cases.
"""
if on_linux:
- location = path_to_bytes(location)
- target_dir = path_to_bytes(target_dir)
+ location = fileutils.fsencode(location)
+ target_dir = fileutils.fsencode(target_dir)
abs_location = os.path.abspath(os.path.expanduser(location))
abs_target_dir = unicode(os.path.abspath(os.path.expanduser(target_dir)))
# extract first the intermediate payload to a temp dir
- temp_target = unicode(fileutils.get_temp_dir('extract'))
+ temp_target = unicode(fileutils.get_temp_dir(prefix='scancode-extract-'))
warnings = extractor1(abs_location, temp_target)
if TRACE:
logger.debug('extract_twice: temp_target: %(temp_target)r' % locals())
# extract this intermediate payload to the final target_dir
try:
- inner_archives = list(fileutils.file_iter(temp_target))
+ inner_archives = list(fileutils.resource_iter(temp_target, with_dirs=False))
if not inner_archives:
warnings.append(location + ': No files found in archive.')
else:
@@ -349,7 +345,7 @@ def extract_with_fallback(location, target_dir, extractor1, extractor2):
abs_location = os.path.abspath(os.path.expanduser(location))
abs_target_dir = unicode(os.path.abspath(os.path.expanduser(target_dir)))
# attempt extract first to a temp dir
- temp_target1 = unicode(fileutils.get_temp_dir('extract1'))
+ temp_target1 = unicode(fileutils.get_temp_dir(prefix='scancode-extract1-'))
try:
warnings = extractor1(abs_location, temp_target1)
if TRACE:
@@ -357,7 +353,7 @@ def extract_with_fallback(location, target_dir, extractor1, extractor2):
fileutils.copytree(temp_target1, abs_target_dir)
except:
try:
- temp_target2 = unicode(fileutils.get_temp_dir('extract2'))
+ temp_target2 = unicode(fileutils.get_temp_dir(prefix='scancode-extract2-'))
warnings = extractor2(abs_location, temp_target2)
if TRACE:
logger.debug('extract_with_fallback: temp_target2: %(temp_target2)r' % locals())
@@ -379,7 +375,7 @@ def try_to_extract(location, target_dir, extractor):
"""
abs_location = os.path.abspath(os.path.expanduser(location))
abs_target_dir = unicode(os.path.abspath(os.path.expanduser(target_dir)))
- temp_target = unicode(fileutils.get_temp_dir('extract1'))
+ temp_target = unicode(fileutils.get_temp_dir(prefix='scancode-extract1-'))
warnings = []
try:
warnings = extractor(abs_location, temp_target)
@@ -392,10 +388,10 @@ def try_to_extract(location, target_dir, extractor):
fileutils.delete(temp_target)
return warnings
-
# High level aliases to lower level extraction functions
########################################################
+
extract_tar = libarchive2.extract
extract_patch = patch.extract
@@ -412,7 +408,6 @@ def try_to_extract(location, target_dir, extractor):
extract_springboot = functools.partial(try_to_extract, extractor=extract_zip)
-
extract_iso = sevenzip.extract
extract_rar = sevenzip.extract
extract_rpm = sevenzip.extract
@@ -425,7 +420,6 @@ def try_to_extract(location, target_dir, extractor):
extract_Z = sevenzip.extract
extract_xarpkg = sevenzip.extract
-
# Archive handlers.
####################
diff --git a/src/extractcode/extract.py b/src/extractcode/extract.py
index 2e3c8103b86..619a71ddb47 100644
--- a/src/extractcode/extract.py
+++ b/src/extractcode/extract.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -32,6 +32,7 @@
from os.path import abspath
from os.path import expanduser
from os.path import join
+import traceback
from commoncode import fileutils
from commoncode import ignore
@@ -46,7 +47,6 @@
logging.basicConfig(stream=sys.stdout)
logger.setLevel(logging.DEBUG)
-
"""
Extract archives and compressed files recursively to get the file content available for
further processing. This the high level extraction entry point.
@@ -86,7 +86,6 @@
the original archive.
"""
-
"""
An ExtractEvent contains data about an archive extraction progress:
- `source` is the location of the archive being extracted
@@ -166,7 +165,7 @@ def extract(location, kinds=extractcode.default_kinds, recurse=False):
yield xevent
-def extract_file(location, target, kinds=extractcode.default_kinds):
+def extract_file(location, target, kinds=extractcode.default_kinds, verbose=False):
"""
Extract a single archive at `location` in the `target` directory if it is
of a kind supported in the `kinds` kind tuple.
@@ -181,17 +180,21 @@ def extract_file(location, target, kinds=extractcode.default_kinds):
if extractor:
yield ExtractEvent(location, target, done=False, warnings=[], errors=[])
try:
- # extract first to a temp directory.
- # if there is an error, the extracted files will not be moved
- # to target
- tmp_tgt = fileutils.get_temp_dir('extract')
+ # extract first to a temp directory: if there is an error, the
+ # extracted files will not be moved to target
+ tmp_tgt = fileutils.get_temp_dir(prefix='scancode-extract-')
abs_location = abspath(expanduser(location))
- warnings.extend(extractor(abs_location, tmp_tgt))
+ warns = extractor(abs_location, tmp_tgt) or []
+ warnings.extend(warns)
fileutils.copytree(tmp_tgt, target)
fileutils.delete(tmp_tgt)
except Exception, e:
- if TRACE:
- logger.debug('extract_file: ERROR: %(location)r: %(errors)r, %(e)r.\n' % locals())
errors = [str(e).strip(' \'"')]
+ if verbose:
+ errors.append(traceback.format_exc())
+ if TRACE:
+ tb = traceback.format_exc()
+ logger.debug('extract_file: ERROR: %(location)r: %(errors)r\n%(e)r\n%(tb)s' % locals())
+
finally:
yield ExtractEvent(location, target, done=True, warnings=warnings, errors=errors)
diff --git a/src/extractcode/libarchive2.py b/src/extractcode/libarchive2.py
index 40e0011e460..aeb480fc7b4 100644
--- a/src/extractcode/libarchive2.py
+++ b/src/extractcode/libarchive2.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -22,7 +22,6 @@
# ScanCode is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
@@ -50,19 +49,16 @@
from extractcode import ExtractError
from extractcode import ExtractErrorPasswordProtected
-
# Python 2 and 3 support
try:
from os import fsencode
except ImportError:
from backports.os import fsencode
-
logger = logging.getLogger(__name__)
DEBUG = False
# logging.basicConfig(level=logging.DEBUG)
-
"""
libarchive2 is a minimal and specialized wrapper around a vendored libarchive archive
extraction library. It only deals with archive extraction and does not know how to
@@ -142,17 +138,25 @@ def extract(location, target_dir):
warnings = []
for entry in list_entries(abs_location):
- if not (entry.isdir or entry.isfile):
- # skip special files and links
- continue
-
- _target_path = entry.write(abs_target_dir, transform_path=paths.safe_path)
- if entry.warnings:
- msgs = [w.strip('"\' ') for w in entry.warnings if w and w.strip('"\' ')]
- msgs = msgs or ['No message provided']
- formatted = entry.path + ': ' + '\n'.join(msgs)
- if formatted not in warnings:
- warnings.append(formatted)
+
+ if entry and entry.warnings:
+ if not entry.is_empty():
+ entry_path = entry.path
+ msgs = ['%(entry_path)r: ' % locals()]
+ else:
+ msgs = ['No path available: ']
+
+ msgs.extend([w.strip('"\' ') for w in entry.warnings if w and w.strip('"\' ')])
+ msgs = '\n'.join(msgs) or 'No message provided'
+
+ if msgs not in warnings:
+ warnings.append(msgs)
+
+ if not entry.is_empty():
+ if not (entry.isdir or entry.isfile):
+ # skip special files and links
+ continue
+ _target_path = entry.write(abs_target_dir, transform_path=paths.safe_path)
return warnings
@@ -179,6 +183,7 @@ class Archive(object):
for entry in archive:
# dome something with entry
"""
+
def __init__(self, location, uncompress=True, extract=True, block_size=10240):
"""
Build an Archive object from file at `location`.
@@ -231,23 +236,37 @@ def close(self):
free_archive(self.archive_struct)
self.archive_struct = None
- def iter(self):
+ def iter(self, verbose=False):
"""
Yield Entry for this archive.
"""
assert self.archive_struct, 'Archive must be used as a context manager.'
entry_struct = new_entry()
try:
- while 1:
+ while True:
+ entry = None
+ warnings = []
try:
r = next_entry(self.archive_struct, entry_struct)
if r == ARCHIVE_EOF:
return
- e = Entry(self, entry_struct)
+ entry = Entry(self, entry_struct)
except ArchiveWarning, aw:
- if aw.msg and aw.msg not in e.warnings:
- e.warnings.append(aw.msg)
- yield e
+ if not entry:
+ entry = Entry(self, entry_struct)
+ if aw.msg and aw.msg not in entry.warnings:
+ entry.warnings.append(aw.msg)
+
+# msg = 'WARNING: '
+# if aw.msg and aw.msg not in entry.warnings:
+# msg += repr(aw.msg) + '\n'
+# if verbose:
+# msg += traceback.format_exc()
+# warnings.append(msg % locals())
+ finally:
+ if entry:
+ entry.warnings.extend(warnings)
+ yield entry
finally:
if entry_struct:
free_entry(entry_struct)
@@ -277,32 +296,55 @@ def __init__(self, archive, entry_struct):
self.archive = archive
self.entry_struct = entry_struct
- self.filetype = entry_type(self.entry_struct)
- self.isfile = self.filetype & AE_IFMT == AE_IFREG
- self.isdir = self.filetype & AE_IFMT == AE_IFDIR
- self.isblk = self.filetype & AE_IFMT == AE_IFBLK
- self.ischr = self.filetype & AE_IFMT == AE_IFCHR
- self.isfifo = self.filetype & AE_IFMT == AE_IFIFO
- self.issock = self.filetype & AE_IFMT == AE_IFSOCK
- self.isspecial = self.ischr or self.isblk or self.isfifo or self.issock
+ self.filetype = None
+ self.isfile = None
+ self.isdir = None
+ self.isblk = None
+ self.ischr = None
+ self.isfifo = None
+ self.issock = None
+ self.isspecial = None
# bytes
- self.size = entry_size(self.entry_struct) or 0
+ self.size = None
# sec since epoch
- self.time = entry_time(self.entry_struct) or 0
+ self.time = None
# all paths are byte strings not unicode
- self.path = self._path_bytes(entry_path, entry_path_w)
- self.issym = self.filetype & AE_IFMT == AE_IFLNK
- # FIXME: could there be cases with link path and symlink is False?
- if self.issym:
- self.symlink_path = self._path_bytes(symlink_path, symlink_path_w)
- self.hardlink_path = self._path_bytes(hardlink_path, hardlink_path_w)
- # hardlinks do not have a filetype: we test the path instead
- self.islnk = bool(self.hardlink_path)
+ self.path = None
+
+ self.issym = None
+ self.symlink_path = None
+ self.islnk = None
+ self.hardlink_path = None
+
+ # list of strings
self.warnings = []
+ if self.entry_struct:
+ self.filetype = entry_type(self.entry_struct)
+ self.isfile = self.filetype & AE_IFMT == AE_IFREG
+ self.isdir = self.filetype & AE_IFMT == AE_IFDIR
+ self.isblk = self.filetype & AE_IFMT == AE_IFBLK
+ self.ischr = self.filetype & AE_IFMT == AE_IFCHR
+ self.isfifo = self.filetype & AE_IFMT == AE_IFIFO
+ self.issock = self.filetype & AE_IFMT == AE_IFSOCK
+ self.isspecial = self.ischr or self.isblk or self.isfifo or self.issock
+ self.size = entry_size(self.entry_struct) or 0
+ self.time = entry_time(self.entry_struct) or 0
+ self.path = self._path_bytes(entry_path, entry_path_w)
+ self.issym = self.filetype & AE_IFMT == AE_IFLNK
+ # FIXME: could there be cases with link path and symlink is False?
+ if self.issym:
+ self.symlink_path = self._path_bytes(symlink_path, symlink_path_w)
+ self.hardlink_path = self._path_bytes(hardlink_path, hardlink_path_w)
+ # hardlinks do not have a filetype: we test the path instead
+ self.islnk = bool(self.hardlink_path)
+
+ def is_empty(self):
+ return not self.archive or not self.entry_struct
+
def _path_bytes(self, func, func_w):
"""
Return a path as a byte string converted to UTF-8-encoded bytes if this is
@@ -381,6 +423,7 @@ def __repr__(self):
class ArchiveException(ExtractError):
+
def __init__(self, rc=None, archive_struct=None, archive_func=None, root_ex=None):
self.root_ex = root_ex
if root_ex and isinstance(root_ex, ArchiveException):
@@ -405,29 +448,35 @@ def __str__(self):
class ArchiveWarning(ArchiveException):
pass
+
class ArchiveErrorRetryable(ArchiveException):
pass
+
class ArchiveError(ArchiveException):
pass
+
class ArchiveErrorFatal(ArchiveException):
pass
+
class ArchiveErrorFailedToWriteEntry(ArchiveException):
pass
+
class ArchiveErrorPasswordProtected(ArchiveException, ExtractErrorPasswordProtected):
pass
+
class ArchiveErrorIllegalOperationOnClosedArchive(ArchiveException):
pass
-
#################################################
# ctypes defintion of the interface to libarchive
#################################################
+
def errcheck(rc, archive_func, args, null=False):
"""
ctypes error check handler for functions returning int, or null if null is True.
@@ -455,7 +504,6 @@ def errcheck(rc, archive_func, args, null=False):
errcheck_null = partial(errcheck, null=True)
-
# libarchive return codes
ARCHIVE_EOF = 1
ARCHIVE_OK = 0
@@ -464,7 +512,6 @@ def errcheck(rc, archive_func, args, null=False):
ARCHIVE_FAILED = -25
ARCHIVE_FATAL = -30
-
# libarchive stat/file types
AE_IFREG = 0o0100000 # Regular file
AE_IFLNK = 0o0120000 # Symbolic link
@@ -476,7 +523,6 @@ def errcheck(rc, archive_func, args, null=False):
AE_IFMT = 0o0170000 # Format mask
-
#####################################
# libarchive C functions declarations
#####################################
@@ -492,7 +538,6 @@ def errcheck(rc, archive_func, args, null=False):
# wide string and then store a narrow string for the same data, the previously-set
# wide string will be discarded in favor of the new data.
-
"""
To read an archive, you must first obtain an initialized struct archive object
from archive_read_new()
@@ -506,7 +551,6 @@ def errcheck(rc, archive_func, args, null=False):
archive_reader.restype = c_void_p
archive_reader.errcheck = errcheck_null
-
"""
Given a struct archive object, you can enable support for formats and filters.
Enables support for all available formats except the "raw" format.
@@ -522,7 +566,6 @@ def errcheck(rc, archive_func, args, null=False):
use_all_formats.restype = c_int
use_all_formats.errcheck = errcheck
-
"""
Given a struct archive object, you can enable support for formats and filters.
@@ -539,7 +582,6 @@ def errcheck(rc, archive_func, args, null=False):
use_raw_formats.restype = c_int
use_raw_formats.errcheck = errcheck
-
"""
Given a struct archive object, you can enable support for formats and filters.
@@ -555,7 +597,6 @@ def errcheck(rc, archive_func, args, null=False):
use_all_filters.restype = c_int
use_all_filters.errcheck = errcheck
-
"""
Once formats and filters have been set, you open an archive filename for
actual reading.
@@ -575,7 +616,6 @@ def errcheck(rc, archive_func, args, null=False):
open_file.restype = c_int
open_file.errcheck = errcheck
-
"""
Wide char version of archive_read_open_filename.
"""
@@ -585,7 +625,6 @@ def errcheck(rc, archive_func, args, null=False):
open_file_w.restype = c_int
open_file_w.errcheck = errcheck
-
"""
When done with reading an archive you must free its resources.
@@ -618,7 +657,6 @@ def errcheck(rc, archive_func, args, null=False):
new_entry.restype = c_void_p
new_entry.errcheck = errcheck_null
-
"""
Given an opened archive struct object, you can iterate through the archive
entries. An entry has a header with various data and usually a payload that is
@@ -639,7 +677,6 @@ def errcheck(rc, archive_func, args, null=False):
next_entry.restype = c_int
next_entry.errcheck = errcheck
-
"""
Read data associated with the header just read. Internally, this is a
convenience function that calls archive_read_data_block() and fills any gaps
@@ -651,7 +688,6 @@ def errcheck(rc, archive_func, args, null=False):
read_entry_data.restype = c_ssize_t
read_entry_data.errcheck = errcheck
-
"""
Return the next available block of data for this entry. Unlike
archive_read_data(), the archive_read_data_block() function avoids copying
@@ -667,7 +703,6 @@ def errcheck(rc, archive_func, args, null=False):
read_entry_data_block.restype = c_int
read_entry_data_block.errcheck = errcheck
-
"""
Releases the struct archive_entry object.
The struct entry object must be freed when no longer needed.
@@ -677,7 +712,6 @@ def errcheck(rc, archive_func, args, null=False):
free_entry.argtypes = [c_void_p]
free_entry.restype = None
-
#
# Entry attributes: path, type, size, etc. are collected with these functions:
#
@@ -704,7 +738,6 @@ def errcheck(rc, archive_func, args, null=False):
entry_type.argtypes = [c_void_p]
entry_type.restype = c_int
-
"""
This function retrieves the mtime field in an archive_entry. (modification
time).
@@ -718,7 +751,6 @@ def errcheck(rc, archive_func, args, null=False):
entry_time.argtypes = [c_void_p]
entry_time.restype = c_int
-
"""
Path in the archive.
@@ -737,14 +769,12 @@ def errcheck(rc, archive_func, args, null=False):
entry_path_w.argtypes = [c_void_p]
entry_path_w.restype = c_wchar_p
-
# int64_t archive_entry_size(struct archive_entry *a);
entry_size = libarchive.archive_entry_size
entry_size.argtypes = [c_void_p]
entry_size.restype = c_longlong
entry_size.errcheck = errcheck
-
"""
Destination of the hardlink.
"""
@@ -753,13 +783,11 @@ def errcheck(rc, archive_func, args, null=False):
hardlink_path.argtypes = [c_void_p]
hardlink_path.restype = c_char_p
-
# const wchar_t * archive_entry_hardlink_w(struct archive_entry *a);
hardlink_path_w = libarchive.archive_entry_hardlink_w
hardlink_path_w.argtypes = [c_void_p]
hardlink_path_w.restype = c_wchar_p
-
"""
The number of references (hardlinks) can be obtained by calling
archive_entry_nlinks()
@@ -769,7 +797,6 @@ def errcheck(rc, archive_func, args, null=False):
hardlink_count.argtypes = [c_void_p]
hardlink_count.restype = c_int
-
"""
The functions archive_entry_dev() and archive_entry_ino64() are used by
ManPageArchiveEntryLinkify3 to find hardlinks. The pair of device and inode is
@@ -779,7 +806,6 @@ def errcheck(rc, archive_func, args, null=False):
# dev_t archive_entry_dev(struct archive_entry *a);
# int archive_entry_dev_is_set(struct archive_entry *a);
-
"""
Destination of the symbolic link.
"""
@@ -789,14 +815,12 @@ def errcheck(rc, archive_func, args, null=False):
symlink_path.restype = c_char_p
symlink_path.errcheck = errcheck_null
-
# const wchar_t * archive_entry_symlink_w(struct archive_entry *);
symlink_path_w = libarchive.archive_entry_symlink_w
symlink_path_w.argtypes = [c_void_p]
symlink_path_w.restype = c_wchar_p
symlink_path_w.errcheck = errcheck_null
-
#
# Utilities and error handling: not all are defined for now
#
@@ -812,7 +836,6 @@ def errcheck(rc, archive_func, args, null=False):
errno.argtypes = [c_void_p]
errno.restype = c_int
-
"""
Returns a textual error message suitable for display. The error message here
is usually more specific than that obtained from passing the result of
@@ -823,7 +846,6 @@ def errcheck(rc, archive_func, args, null=False):
err_msg.argtypes = [c_void_p]
err_msg.restype = c_char_p
-
"""
Returns a count of the number of files processed by this archive object. The
count is incremented by calls to ManPageArchiveWriteHeader3 or
@@ -844,13 +866,11 @@ def errcheck(rc, archive_func, args, null=False):
"""
# int archive_filter_count(struct archive *, int);
-
"""
Synonym for archive_filter_code(a,(0)).
"""
# int archive_compression(struct archive *);
-
"""
Returns a textual name identifying the indicated filter. See
archive_filter_count() for details of the numbering.
diff --git a/src/extractcode/patch.py b/src/extractcode/patch.py
index 765295ab731..9882a5bce92 100644
--- a/src/extractcode/patch.py
+++ b/src/extractcode/patch.py
@@ -48,7 +48,6 @@
more conveniently.
"""
-
LOG = logging.getLogger(__name__)
diff --git a/src/extractcode/sevenzip.py b/src/extractcode/sevenzip.py
index c626fba2699..c21e26d0c10 100644
--- a/src/extractcode/sevenzip.py
+++ b/src/extractcode/sevenzip.py
@@ -43,12 +43,10 @@
root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'bin'))
-
"""
Low level support for p/7zip-based archive extraction.
"""
-
sevenzip_errors = [
('unsupported method', 'Unsupported archive or broken archive'),
('wrong password', 'Password protected archive, unable to extract'),
@@ -222,7 +220,6 @@ def list_entries(location, arch_type='*'):
if rc != 0:
# FIXME: this test is useless
_error = get_7z_errors(stdout) or UNKNOWN_ERROR
- # print(_error)
# the listing was produced as UTF on windows to avoid damaging binary
# paths in console outputs
diff --git a/src/extractcode/tar.py b/src/extractcode/tar.py
index f7c5b0a628d..fa3f3e24c6a 100644
--- a/src/extractcode/tar.py
+++ b/src/extractcode/tar.py
@@ -50,7 +50,6 @@
#
# Credits: Gustavo Niemeyer, Niels Gustabel, Richard Townsend.
-
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
@@ -71,7 +70,6 @@
logger = logging.getLogger('extractcode')
# logging.basicConfig(level=logging.DEBUG)
-
"""
Low level support for tar-based archive extraction using Python built-in tar
support.
diff --git a/src/extractcode/tarfile_patch/tarfile.py b/src/extractcode/tarfile_patch/tarfile.py
index 1b0b2f7d2d3..f826401dd2c 100644
--- a/src/extractcode/tarfile_patch/tarfile.py
+++ b/src/extractcode/tarfile_patch/tarfile.py
@@ -1,4 +1,5 @@
# -*- coding: iso-8859-1 -*-
+# flake8: noqa
#-------------------------------------------------------------------
# tarfile.py
#-------------------------------------------------------------------
@@ -2650,7 +2651,7 @@ def writestr(self, zinfo, bytes):
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
- import calendar # @UnresolvedImport
+ import calendar # NOQA
tinfo = TarInfo(zinfo.filename)
tinfo.size = len(bytes)
tinfo.mtime = calendar.timegm(zinfo.date_time)
diff --git a/src/extractcode/uncompress.py b/src/extractcode/uncompress.py
index d6469e6906d..df6dfc03d39 100644
--- a/src/extractcode/uncompress.py
+++ b/src/extractcode/uncompress.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -79,7 +79,7 @@ def uncompress_file(location, decompressor):
warnings = []
base_name = fileutils.file_base_name(location)
- target_location = os.path.join(fileutils.get_temp_dir(base_dir='extract'), base_name)
+ target_location = os.path.join(fileutils.get_temp_dir(prefix='scancode-extract-'), base_name)
with decompressor(location, 'rb') as compressed:
with open(target_location, 'wb') as uncompressed:
buffer_size = 32 * 1024 * 1024
diff --git a/src/formattedcode/format_json.py b/src/formattedcode/format_json.py
deleted file mode 100644
index 7eb8d272235..00000000000
--- a/src/formattedcode/format_json.py
+++ /dev/null
@@ -1,71 +0,0 @@
-#
-# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
-# http://nexb.com and https://github.com/nexB/scancode-toolkit/
-# The ScanCode software is licensed under the Apache License version 2.0.
-# Data generated with ScanCode require an acknowledgment.
-# ScanCode is a trademark of nexB Inc.
-#
-# You may not use this software except in compliance with the License.
-# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
-# Unless required by applicable law or agreed to in writing, software distributed
-# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-#
-# When you publish or redistribute any data created with ScanCode or any ScanCode
-# derivative work, you must accompany this data with the following acknowledgment:
-#
-# Generated with ScanCode and provided on an "AS IS" BASIS, WITHOUT WARRANTIES
-# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
-# ScanCode should be considered or used as legal advice. Consult an Attorney
-# for any legal advice.
-# ScanCode is a free software code scanning tool from nexB Inc. and others.
-# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
-
-from __future__ import absolute_import
-from __future__ import unicode_literals
-
-from collections import OrderedDict
-
-import simplejson
-
-from plugincode.output import scan_output_writer
-
-
-"""
-Output plugins to write scan results as JSON.
-"""
-
-@scan_output_writer
-def write_json_compact(files_count, version, notice, scanned_files, options, output_file, *args, **kwargs):
- """
- Write scan output formatted as compact JSON.
- """
- _write_json(files_count, version, notice, scanned_files, options, output_file, pretty=False)
-
-
-@scan_output_writer
-def write_json_pretty_printed(files_count, version, notice, scanned_files, options, output_file, *args, **kwargs):
- """
- Write scan output formatted as pretty-printed JSON.
- """
- _write_json(files_count, version, notice, scanned_files, options, output_file, pretty=True)
-
-
-def _write_json(files_count, version, notice, scanned_files, options, output_file, pretty=False):
- scan = OrderedDict([
- ('scancode_notice', notice),
- ('scancode_version', version),
- ('scancode_options', options),
- ('files_count', files_count),
- ('files', scanned_files),
- ])
- kwargs = dict(iterable_as_array=True, encoding='utf-8')
- if pretty:
- kwargs['indent'] = 2 * ' '
- else:
- kwargs['separators'] = (',', ':',)
-
- # FIXME: Why do we wrap the output in unicode? Test output when we do not wrap the output in unicode
- output_file.write(unicode(simplejson.dumps(scan, **kwargs)))
- output_file.write('\n')
diff --git a/src/formattedcode/format_csv.py b/src/formattedcode/output_csv.py
similarity index 85%
rename from src/formattedcode/format_csv.py
rename to src/formattedcode/output_csv.py
index a5782919a24..f9d6f4a5aa3 100644
--- a/src/formattedcode/format_csv.py
+++ b/src/formattedcode/output_csv.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -31,20 +31,36 @@
import unicodecsv
-from plugincode.output import scan_output_writer
+from plugincode.output import output_impl
+from plugincode.output import OutputPlugin
+from scancode import CommandLineOption
+from scancode import FileOptionType
+from scancode import OUTPUT_GROUP
-"""
-Output plugin to write scan results as CSV.
-"""
+@output_impl
+class CsvOutput(OutputPlugin):
+ options = [
+ CommandLineOption(('--output-csv',),
+ type=FileOptionType(mode='wb', lazy=False),
+ metavar='FILE',
+ help='Write scan output as CSV to FILE.',
+ help_group=OUTPUT_GROUP,
+ sort_order=30),
+ ]
+
+ def is_enabled(self, output_csv, **kwargs):
+ return output_csv
+
+ def process_codebase(self, codebase, output_csv, **kwargs):
+ results = self.get_results(codebase, **kwargs)
+ write_csv(results, output_csv)
-@scan_output_writer
-def write_csv(scanned_files, output_file, *args, **kwargs):
- """
- Write scan output formatted as CSV.
- """
- scan_results = list(scanned_files)
+
+def write_csv(results, output_file):
+ # FIXMe: this is reading all in memory
+ results = list(results)
headers = OrderedDict([
('info', []),
@@ -56,7 +72,7 @@ def write_csv(scanned_files, output_file, *args, **kwargs):
])
# note: FIXME: headers are collected as a side effect and this is not great
- rows = list(flatten_scan(scan_results, headers))
+ rows = list(flatten_scan(results, headers))
ordered_headers = []
for key_group in headers.values():
@@ -112,25 +128,28 @@ def collect_keys(mapping, key_group):
# do not include matched text for now.
if k == 'matched_text':
continue
+
if k == 'matched_rule':
+ is_choice = val.get('license_choice', False)
for mrk, mrv in val.items():
- mrk = 'matched_rule__' + mrk
if mrk == 'license_choice':
mrv = 'y' if mrv else ''
if mrk == 'licenses':
- mrv = ' '.join(mrv)
+ sep = ' OR ' if is_choice else ' AND '
+ mrv = sep.join(mrv)
if mrk in ('match_coverage', 'rule_relevance'):
# normalize the string representation of this number
mrv = '{:.2f}'.format(mrv)
+ mrk = 'matched_rule__' + mrk
lic[mrk] = mrv
continue
if k == 'score':
- # normalize the string representation of this number
+ # normalize score with two decimal values
val = '{:.2f}'.format(val)
- # lines are present in multiple scans: keep their column name as not scan-specific
- # Prefix othe columns with license__
+ # lines are present in multiple scans: keep their column name as
+ # not scan-specific. Prefix othe columns with license__
if k not in ('start_line', 'end_line',):
k = 'license__' + k
lic[k] = val
diff --git a/src/formattedcode/format_templated.py b/src/formattedcode/output_html.py
similarity index 54%
rename from src/formattedcode/format_templated.py
rename to src/formattedcode/output_html.py
index 9774dcdacef..5cee9fab016 100644
--- a/src/formattedcode/format_templated.py
+++ b/src/formattedcode/output_html.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -23,20 +23,38 @@
# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
from __future__ import absolute_import
-from __future__ import print_function
from __future__ import division
+from __future__ import print_function
from __future__ import unicode_literals
from collections import OrderedDict
import codecs
from operator import itemgetter
-import os
-
-import simplejson as json
-
-from commoncode import fileutils
-from plugincode.output import scan_output_writer
-
+from os.path import abspath
+from os.path import basename
+from os.path import dirname
+from os.path import exists
+from os.path import expanduser
+from os.path import isfile
+from os.path import join
+
+import click
+import simplejson
+
+from commoncode.fileutils import PATH_TYPE
+from commoncode.fileutils import as_posixpath
+from commoncode.fileutils import copytree
+from commoncode.fileutils import delete
+from commoncode.fileutils import file_name
+from commoncode.fileutils import file_base_name
+from commoncode.fileutils import fsencode
+from commoncode.fileutils import parent_directory
+from commoncode.system import on_linux
+from plugincode.output import output_impl
+from plugincode.output import OutputPlugin
+from scancode import CommandLineOption
+from scancode import FileOptionType
+from scancode import OUTPUT_GROUP
"""
Output plugins to write scan results using templates such as HTML.
@@ -46,152 +64,111 @@
"""
-@scan_output_writer
-def write_html(scanned_files, output_file, _echo, version, *args, **kwargs):
+@output_impl
+class HtmlOutput(OutputPlugin):
+
+ options = [
+ CommandLineOption(('--output-html',),
+ type=FileOptionType(mode='wb', lazy=False),
+ metavar='FILE',
+ help='Write scan output as HTML to FILE.',
+ help_group=OUTPUT_GROUP,
+ sort_order=50),
+ ]
+
+ def is_enabled(self, output_html, **kwargs):
+ return output_html
+
+ def process_codebase(self, codebase, output_html, scancode_version, **kwargs):
+ results = self.get_results(codebase, **kwargs)
+ write_templated(output_html, results, scancode_version,
+ template_or_format='html')
+
+
+@output_impl
+class CustomTemplateOutput(OutputPlugin):
+
+ options = [
+ CommandLineOption(('--output-custom',),
+ type=FileOptionType(mode='wb', lazy=False),
+ requires=['custom_template'],
+ metavar='FILE',
+ help='Write scan output to FILE formatted with '
+ 'the custom Jinja template file.',
+ help_group=OUTPUT_GROUP,
+ sort_order=60),
+
+ CommandLineOption(('--custom-template',),
+ type=click.Path(
+ exists=True, file_okay=True, dir_okay=False,
+ readable=True, path_type=PATH_TYPE),
+ requires=['output_custom'],
+ metavar='FILE',
+ help='Use this Jinja template FILE as a custom template.',
+ help_group=OUTPUT_GROUP,
+ sort_order=65),
+ ]
+
+ def is_enabled(self, output_custom, custom_template, **kwargs):
+ return output_custom and custom_template
+
+ def process_codebase(self, codebase, output_custom, custom_template,
+ scancode_version, **kwargs):
+
+ results = self.get_results(codebase, **kwargs)
+ if on_linux:
+ custom_template = fsencode(custom_template)
+ write_templated(output_custom, results, scancode_version,
+ template_or_format=custom_template)
+
+
+@output_impl
+class HtmlAppOutput(OutputPlugin):
"""
- Write scan output formatted as plain HTML page.
+ Write scan output as a mini HTML application.
"""
- _write_templated(scanned_files, output_file, _echo, version, template_or_format='html', raise_ex=False)
+ options = [
+ CommandLineOption(('--output-html-app',),
+ type=FileOptionType(mode='wb', lazy=False),
+ metavar='FILE',
+ help='Write scan output as a mini HTML application to FILE.',
+ help_group=OUTPUT_GROUP,
+ sort_order=70),
+ ]
+ def is_enabled(self, output_html_app, **kwargs):
+ return output_html_app
-def write_custom(scanned_files, output_file, _echo, version, template_path):
- """
- Write scan output formatted with a custom template.
- NOTE: this is NOT a plugin, but a built-in
- """
- _write_templated(scanned_files, output_file, _echo, version, template_or_format=template_path, raise_ex=True)
+ def process_codebase(self, codebase,
+ input, # NOQA
+ output_html_app,
+ scancode_version, **kwargs):
+
+ results = self.get_results(codebase, **kwargs)
+ output_html_app.write(as_html_app(output_html_app, input, scancode_version))
+ create_html_app_assets(results, output_html_app)
-def _write_templated(scanned_files, output_file, _echo, version, template_or_format, raise_ex=False):
+def write_templated(output_file, results, version, template_or_format):
"""
Write scan output using a template or a format.
Optionally raise an exception on errors.
"""
- for template_chunk in as_template(scanned_files, version, template=template_or_format):
+ for template_chunk in as_template(results, version, template_or_format=template_or_format):
try:
output_file.write(template_chunk)
except Exception:
import traceback
- extra_context = 'ERROR: Failed to write output for: ' + repr(template_chunk)
- extra_context += '\n' + traceback.format_exc()
- _echo(extra_context, fg='red')
- if raise_ex:
- # NOTE: this is a tad brutal to raise here, but helps
- # the template authors
- raise
-
-
-@scan_output_writer
-def write_html_app(scanned_files, input, output_file, _echo, version, *args, **kwargs):
- """
- Write scan output formatted as a mini HTML application.
- """
- output_file.write(as_html_app(input, version, output_file))
- try:
- create_html_app_assets(scanned_files, output_file)
- except HtmlAppAssetCopyWarning:
- _echo('\nHTML app creation skipped when printing to stdout.', fg='yellow')
- except HtmlAppAssetCopyError:
- _echo('\nFailed to create HTML app.', fg='red')
-
-
-def create_html_app_assets(results, output_file):
- """
- Given an html-app output_file, create the corresponding `_files`
- directory and copy the assets to this directory. The target
- directory is deleted if it exists.
-
- Raise HtmlAppAssetCopyWarning if the output_file is or
- HtmlAppAssetCopyError if the copy was not possible.
- """
- try:
- if is_stdout(output_file):
- raise HtmlAppAssetCopyWarning()
- assets_dir = os.path.join(get_template_dir('html-app'), 'assets')
-
- # delete old assets
- tgt_dirs = get_html_app_files_dirs(output_file)
- target_dir = os.path.join(*tgt_dirs)
- if os.path.exists(target_dir):
- fileutils.delete(target_dir)
-
- # copy assets
- fileutils.copytree(assets_dir, target_dir)
-
- # write json data
- root_path, assets_dir = get_html_app_files_dirs(output_file)
- with codecs.open(os.path.join(root_path, assets_dir, 'data.json'), 'wb', encoding='utf-8') as f:
- f.write('data=')
- json.dump(results, f, iterable_as_array=True)
-
- # create help file
- with codecs.open(os.path.join(root_path, assets_dir, 'help.html'), 'wb', encoding='utf-8') as f:
- f.write(get_html_app_help(os.path.basename(output_file.name)))
- except HtmlAppAssetCopyWarning, w:
- raise w
- except Exception, e:
- raise HtmlAppAssetCopyError(e)
-
-
-def as_html_app(scanned_path, version, output_file):
- """
- Return an HTML string built from a list of results and the html-app template.
- """
- template = get_template(get_template_dir('html-app'))
- _, assets_dir = get_html_app_files_dirs(output_file)
-
- return template.render(assets_dir=assets_dir, scanned_path=scanned_path, version=version)
-
-
-def get_html_app_help(output_filename):
- """
- Return an HTML string containing the html-app help page with a
- reference back to the main app page.
- """
- template = get_template(get_template_dir('html-app'),
- template_name='help_template.html')
-
- return template.render(main_app=output_filename)
-
-
-class HtmlAppAssetCopyWarning(Exception):
- pass
-
-
-class HtmlAppAssetCopyError(Exception):
- pass
-
-
-def is_stdout(output_file):
- return output_file.name == ''
-
-
-def get_html_app_files_dirs(output_file):
- """
- Return a tuple of (parent_dir, dir_name) directory named after the
- `output_file` file object file_base_name (stripped from extension) and a
- `_files` suffix Return empty strings if output is to stdout.
- """
- if is_stdout(output_file):
- return '', ''
-
- file_name = output_file.name
- parent_dir = os.path.dirname(file_name)
- dir_name = fileutils.file_base_name(file_name) + '_files'
- return parent_dir, dir_name
+ msg = 'ERROR: Failed to write output for: ' + repr(template_chunk)
+ msg += '\n' + traceback.format_exc()
+ raise Exception(msg)
-#
-# Common utilities for templated scans outputs: html, html-app and
-# custom templates.
-#
-
-# FIXME: no HTML default!
def get_template(templates_dir, template_name='template.html'):
"""
- Given a template directory, load and return the template file in the template_name
- file found in that directory.
+ Given a `templates_dir` template directory, load and return the template
+ file for the `template_name` file found in that directory.
"""
from jinja2 import Environment, FileSystemLoader
env = Environment(loader=FileSystemLoader(templates_dir))
@@ -199,20 +176,19 @@ def get_template(templates_dir, template_name='template.html'):
return template
-def get_template_dir(format):
+def get_template_dir(format_code):
"""
- Given a format string return the corresponding standard template
- directory.
+ Return the template directory of a built-in template for a `format_code`
+ string.
"""
- return os.path.join(os.path.dirname(__file__), 'templates', format)
+ return join(dirname(__file__), 'templates', format_code)
-# FIXME: no HTML default!
-def as_template(scanned_files, version, template):
+def as_template(results, version, template_or_format):
"""
- Return an string built from a list of `scanned_files` results and
- the provided `template` identifier. The template defaults to the standard HTML
- template format or can point to the path of a custom template file.
+    Return a string built from a list of `results` and the provided `template`
+    identifier. The template_or_format is either a built-in template format code
+    (e.g. "html") or the path of a custom template file.
"""
# FIXME: This code is highly coupled with actual scans and may not
# support adding new scans at all
@@ -220,14 +196,14 @@ def as_template(scanned_files, version, template):
from licensedcode.cache import get_licenses_db
# FIXME: factor out the html vs custom from this function: we should get a template path
- if template == 'html':
+ if template_or_format == 'html':
template = get_template(get_template_dir('html'))
else:
# load a custom template
- tpath = fileutils.as_posixpath(os.path.abspath(os.path.expanduser(template)))
- assert os.path.isfile(tpath)
- tdir = fileutils.parent_directory(tpath)
- tfile = fileutils.file_name(tpath)
+ tpath = as_posixpath(abspath(expanduser(template_or_format)))
+ assert isfile(tpath)
+ tdir = parent_directory(tpath)
+ tfile = file_name(tpath)
template = get_template(tdir, tfile)
converted = OrderedDict()
@@ -242,7 +218,7 @@ def as_template(scanned_files, version, template):
EMAILS = 'emails'
# Create a flattened data dict keyed by path
- for scanned_file in scanned_files:
+ for scanned_file in results:
path = scanned_file['path']
results = []
if COPYRIGHTS in scanned_file:
@@ -292,3 +268,91 @@ def as_template(scanned_files, version, template):
}
return template.generate(files=files, licenses=licenses, version=version)
+
+
+def create_html_app_assets(results, output_file):
+ """
+ Given an html-app output_file, create the corresponding `_files`
+ directory and copy the assets to this directory. The target
+ directory is deleted if it exists.
+
+    Raise HtmlAppAssetCopyWarning if the output_file is stdout, or
+    HtmlAppAssetCopyError if the copy was not possible.
+ """
+ try:
+ if is_stdout(output_file):
+ raise HtmlAppAssetCopyWarning()
+ assets_dir = join(get_template_dir('html-app'), 'assets')
+
+ # delete old assets
+ tgt_dirs = get_html_app_files_dirs(output_file)
+ target_dir = join(*tgt_dirs)
+ if exists(target_dir):
+ delete(target_dir)
+
+ # copy assets
+ copytree(assets_dir, target_dir)
+
+ # write json data
+ # FIXME: this should a regular JSON scan format
+ root_path, assets_dir = get_html_app_files_dirs(output_file)
+ with codecs.open(join(root_path, assets_dir, 'data.json'), 'wb', encoding='utf-8') as f:
+ f.write('data=')
+ simplejson.dump(results, f, iterable_as_array=True)
+
+ # create help file
+ with codecs.open(join(root_path, assets_dir, 'help.html'), 'wb', encoding='utf-8') as f:
+ f.write(get_html_app_help(basename(output_file.name)))
+ except HtmlAppAssetCopyWarning, w:
+ raise w
+ except Exception, e:
+ raise HtmlAppAssetCopyError(e)
+
+
+def as_html_app(output_file, scanned_path, version,):
+ """
+ Return an HTML string built from a list of results and the html-app template.
+ """
+ template = get_template(get_template_dir('html-app'))
+ _, assets_dir = get_html_app_files_dirs(output_file)
+
+ return template.render(assets_dir=assets_dir, scanned_path=scanned_path, version=version)
+
+
+def get_html_app_help(output_filename):
+ """
+ Return an HTML string containing the html-app help page with a
+ reference back to the main app page.
+ """
+ template = get_template(get_template_dir('html-app'),
+ template_name='help_template.html')
+
+ return template.render(main_app=output_filename)
+
+
+class HtmlAppAssetCopyWarning(Exception):
+ pass
+
+
+class HtmlAppAssetCopyError(Exception):
+ pass
+
+
+def is_stdout(output_file):
+ return output_file.name == ''
+
+
+def get_html_app_files_dirs(output_file):
+ """
+ Return a tuple of (parent_dir, dir_name) directory named after the
+ `output_file` file-like object file_base_name (stripped from extension) and
+    a `_files` suffix. Return empty strings if output is to stdout.
+ """
+ if is_stdout(output_file):
+ return '', ''
+
+ # FIXME: what if there is no name attribute??
+ file_name = output_file.name
+ parent_dir = dirname(file_name)
+ dir_name = file_base_name(file_name) + '_files'
+ return parent_dir, dir_name
diff --git a/src/formattedcode/output_json.py b/src/formattedcode/output_json.py
new file mode 100644
index 00000000000..23751d107de
--- /dev/null
+++ b/src/formattedcode/output_json.py
@@ -0,0 +1,118 @@
+#
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
+# http://nexb.com and https://github.com/nexB/scancode-toolkit/
+# The ScanCode software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode require an acknowledgment.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# When you publish or redistribute any data created with ScanCode or any ScanCode
+# derivative work, you must accompany this data with the following acknowledgment:
+#
+# Generated with ScanCode and provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+# ScanCode is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
+
+from __future__ import absolute_import
+from __future__ import unicode_literals
+
+from collections import OrderedDict
+
+import simplejson
+
+from plugincode.output import output_impl
+from plugincode.output import OutputPlugin
+from scancode import CommandLineOption
+from scancode import FileOptionType
+from scancode import OUTPUT_GROUP
+
+"""
+Output plugins to write scan results as JSON.
+"""
+
+
+@output_impl
+class JsonCompactOutput(OutputPlugin):
+
+ options = [
+ CommandLineOption(('--json', 'output_json',),
+ type=FileOptionType(mode='wb', lazy=False),
+ metavar='FILE',
+ help='Write scan output as compact JSON to FILE.',
+ help_group=OUTPUT_GROUP,
+ sort_order=10),
+ ]
+
+ def is_enabled(self, output_json, **kwargs):
+ return output_json
+
+ def process_codebase(self, codebase, output_json, files_count,
+ scancode_version, scancode_notice, pretty_options,
+ **kwargs):
+
+ results = self.get_results(codebase, **kwargs)
+ write_json(results=results, output_file=output_json,
+ files_count=files_count,
+ scancode_version=scancode_version,
+ scancode_notice=scancode_notice,
+ pretty_options=pretty_options,
+ pretty=False)
+
+
+@output_impl
+class JsonPrettyOutput(OutputPlugin):
+
+ options = [
+ CommandLineOption(('--json-pp', 'output_json_pp',),
+ type=FileOptionType(mode='wb', lazy=False),
+ metavar='FILE',
+ help='Write scan output as pretty-printed JSON to FILE.',
+ help_group=OUTPUT_GROUP,
+ sort_order=10),
+ ]
+
+ def is_enabled(self, output_json_pp, **kwargs):
+ return output_json_pp
+
+ def process_codebase(self, codebase, output_json_pp, files_count,
+ scancode_version, scancode_notice, pretty_options,
+ **kwargs):
+
+ results = self.get_results(codebase, **kwargs)
+ write_json(results=results, output_file=output_json_pp,
+ files_count=files_count,
+ scancode_version=scancode_version,
+ scancode_notice=scancode_notice,
+ pretty_options=pretty_options,
+ pretty=True)
+
+
+def write_json(results, output_file, files_count,
+ scancode_version, scancode_notice,
+ pretty_options, pretty=False):
+
+ scan = OrderedDict([
+ ('scancode_notice', scancode_notice),
+ ('scancode_version', scancode_version),
+ ('scancode_options', pretty_options),
+ ('files_count', files_count),
+ ('files', results),
+ ])
+
+ kwargs = dict(iterable_as_array=True, encoding='utf-8')
+ if pretty:
+ kwargs.update(dict(indent=2 * b' '))
+ else:
+ kwargs.update(dict(separators=(b',', b':',)))
+
+ output_file.write(simplejson.dumps(scan, **kwargs))
+ output_file.write(b'\n')
diff --git a/src/formattedcode/output_jsonlines.py b/src/formattedcode/output_jsonlines.py
new file mode 100644
index 00000000000..38f15235fb8
--- /dev/null
+++ b/src/formattedcode/output_jsonlines.py
@@ -0,0 +1,75 @@
+#
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
+# http://nexb.com and https://github.com/nexB/scancode-toolkit/
+# The ScanCode software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode require an acknowledgment.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# When you publish or redistribute any data created with ScanCode or any ScanCode
+# derivative work, you must accompany this data with the following acknowledgment:
+#
+# Generated with ScanCode and provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+# ScanCode is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
+
+from __future__ import absolute_import
+from __future__ import unicode_literals
+
+from collections import OrderedDict
+
+import simplejson
+
+from plugincode.output import output_impl
+from plugincode.output import OutputPlugin
+from scancode import CommandLineOption
+from scancode import FileOptionType
+from scancode import OUTPUT_GROUP
+
+
+@output_impl
+class JsonLinesOutput(OutputPlugin):
+
+ options = [
+ CommandLineOption(('--json-lines', 'output_json_lines',),
+ type=FileOptionType(mode='wb', lazy=False),
+ metavar='FILE',
+ help='Write scan output as JSON Lines to FILE.',
+ help_group=OUTPUT_GROUP,
+ sort_order=15),
+ ]
+
+ def is_enabled(self, output_json_lines, **kwargs):
+ return output_json_lines
+
+ def process_codebase(self, codebase, output_json_lines, files_count,
+ scancode_version, scancode_notice, pretty_options,
+ **kwargs):
+
+ results = self.get_results(codebase, **kwargs)
+
+ header = dict(header=OrderedDict([
+ ('scancode_notice', scancode_notice),
+ ('scancode_version', scancode_version),
+ ('scancode_options', pretty_options),
+ ('files_count', files_count)
+ ]))
+
+ kwargs = dict(
+ iterable_as_array=True, encoding='utf-8', separators=(',', ':',))
+ output_json_lines.write(simplejson.dumps(header, **kwargs))
+ output_json_lines.write('\n')
+
+ for scanned_file in results:
+ scanned_file_line = {'files': [scanned_file]}
+ output_json_lines.write(simplejson.dumps(scanned_file_line, **kwargs))
+ output_json_lines.write('\n')
diff --git a/src/formattedcode/format_spdx.py b/src/formattedcode/output_spdx.py
similarity index 67%
rename from src/formattedcode/format_spdx.py
rename to src/formattedcode/output_spdx.py
index 622a051f2f9..e910116e17d 100644
--- a/src/formattedcode/format_spdx.py
+++ b/src/formattedcode/output_spdx.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -27,8 +27,13 @@
from __future__ import division
from __future__ import unicode_literals
-import os
from os.path import abspath
+from os.path import basename
+from os.path import dirname
+from os.path import isdir
+from os.path import isfile
+from os.path import join
+import sys
from spdx.checksum import Algorithm
from spdx.creationinfo import Tool
@@ -41,48 +46,118 @@
from spdx.utils import SPDXNone
from spdx.version import Version
-from plugincode.output import scan_output_writer
+from plugincode.output import output_impl
+from plugincode.output import OutputPlugin
+from scancode import CommandLineOption
+from scancode import FileOptionType
+from scancode import OUTPUT_GROUP
+# Python 2 and 3 support
+try:
+ # Python 2
+ unicode
+ str_orig = str
+ bytes = str # NOQA
+ str = unicode # NOQA
+except NameError:
+ # Python 3
+ unicode = str # NOQA
+
+# Tracing flags
+TRACE = False
+TRACE_DEEP = False
+
+
+def logger_debug(*args):
+ pass
+
+
+if TRACE or TRACE_DEEP:
+ import logging
+
+ logger = logging.getLogger(__name__)
+ logging.basicConfig(stream=sys.stdout)
+ logger.setLevel(logging.DEBUG)
+
+ def logger_debug(*args):
+ return logger.debug(' '.join(isinstance(a, unicode)
+ and a or repr(a) for a in args))
"""
Output plugins to write scan results in SPDX format.
"""
-@scan_output_writer
-def write_spdx_tag_value(files_count, version, notice, scanned_files, input, output_file, *args, **kwargs):
- """
- Write scan output formatted as SPDX Tag/Value.
- """
- write_spdx(version, notice, scanned_files, input, output_file, as_tagvalue=True)
+@output_impl
+class SpdxTvOutput(OutputPlugin):
-@scan_output_writer
-def write_spdx_rdf(files_count, version, notice, scanned_files, input, output_file, *args, **kwargs):
- """
- Write scan output formatted as SPDX RDF.
- """
- write_spdx(version, notice, scanned_files, input, output_file, as_tagvalue=False)
+ options = [
+ CommandLineOption(('--output-spdx-tv',),
+ type=FileOptionType(mode='wb', lazy=False),
+ metavar='FILE',
+ requires=['info'],
+ help='Write scan output as SPDX Tag/Value to FILE.',
+ help_group=OUTPUT_GROUP)
+ ]
+ def is_enabled(self, output_spdx_tv, info, **kwargs):
+ return output_spdx_tv and info
-def write_spdx(version, notice, scanned_files, input, output_file, as_tagvalue=True):
+ def process_codebase(self, codebase,
+ input, # NOQA
+ output_spdx_tv,
+ scancode_version, scancode_notice, **kwargs):
+
+ results = self.get_results(codebase, **kwargs)
+ write_spdx(output_spdx_tv, results, scancode_version, scancode_notice,
+ input, as_tagvalue=True)
+
+
+@output_impl
+class SpdxRdfOutput(OutputPlugin):
+
+ options = [
+ CommandLineOption(('--output-spdx-rdf',),
+ type=FileOptionType(mode='wb', lazy=False),
+ metavar='FILE',
+ requires=['info'],
+ help='Write scan output as SPDX RDF to FILE.',
+ help_group=OUTPUT_GROUP)
+ ]
+
+ def is_enabled(self, output_spdx_rdf, info, **kwargs):
+ return output_spdx_rdf and info
+
+ def process_codebase(self, codebase,
+ input, # NOQA
+ output_spdx_rdf,
+ scancode_version, scancode_notice, **kwargs):
+
+ results = self.get_results(codebase, **kwargs)
+ write_spdx(output_spdx_rdf, results, scancode_version, scancode_notice,
+ input, as_tagvalue=False)
+
+
+def write_spdx(output_file, results, scancode_version, scancode_notice,
+ input_file, as_tagvalue=True):
"""
- Write scan output formatted as SPDX Tag/value or RDF.
+ Write scan output as SPDX Tag/value or RDF.
"""
- absinput = abspath(input)
+ absinput = abspath(input_file)
- if os.path.isdir(absinput):
+ if isdir(absinput):
input_path = absinput
else:
- input_path = os.path.dirname(absinput)
+ input_path = dirname(absinput)
doc = Document(Version(2, 1), License.from_identifier('CC0-1.0'))
- doc.comment = notice
+ doc.comment = scancode_notice
- doc.creation_info.add_creator(Tool('ScanCode ' + version))
+ doc.creation_info.add_creator(Tool('ScanCode ' + scancode_version))
doc.creation_info.set_created_now()
package = doc.package = Package(
- name=os.path.basename(input_path),
+ name=basename(input_path),
download_location=NoAssert()
)
@@ -92,14 +167,15 @@ def write_spdx(version, notice, scanned_files, input, output_file, as_tagvalue=T
all_files_have_no_license = True
all_files_have_no_copyright = True
- for file_data in scanned_files:
+ # FIXME: this should walk the codebase instead!!!
+ for file_data in results:
# Construct the absolute path in case we need to access the file
# to calculate its SHA1.
- file_entry = File(os.path.join(input_path, file_data.get('path')))
+ file_entry = File(join(input_path, file_data.get('path')))
file_sha1 = file_data.get('sha1')
if not file_sha1:
- if os.path.isfile(file_entry.name):
+ if isfile(file_entry.name):
# Calculate the SHA1 in case it is missing, e.g. for empty files.
file_sha1 = file_entry.calc_chksum()
else:
@@ -125,7 +201,8 @@ def write_spdx(version, notice, scanned_files, input, output_file, as_tagvalue=T
licenseref_id = 'LicenseRef-' + license_key
spdx_license = ExtractedLicense(licenseref_id)
spdx_license.name = file_license.get('short_name')
- comment = 'See details at https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/licenses/%s.yml\n' % license_key
+ comment = ('See details at https://github.com/nexB/scancode-toolkit'
+ '/blob/develop/src/licensedcode/data/licenses/%s.yml\n' % license_key)
spdx_license.comment = comment
text = file_license.get('matched_text')
# always set some text, even if we did not extract the matched text
@@ -168,7 +245,6 @@ def write_spdx(version, notice, scanned_files, input, output_file, as_tagvalue=T
else:
file_entry.copyright = SPDXNone()
-
package.add_file(file_entry)
if len(package.files) == 0:
@@ -203,9 +279,9 @@ def write_spdx(version, notice, scanned_files, input, output_file, as_tagvalue=T
package.conc_lics = NoAssert()
if as_tagvalue:
- from spdx.writers.tagvalue import write_document
+ from spdx.writers.tagvalue import write_document # NOQA
else:
- from spdx.writers.rdf import write_document
+ from spdx.writers.rdf import write_document # NOQA
# The spdx-tools write_document returns either:
# - unicode for tag values
diff --git a/src/licensedcode/__init__.py b/src/licensedcode/__init__.py
index 972c5bde04b..ecfc6d7ffd3 100644
--- a/src/licensedcode/__init__.py
+++ b/src/licensedcode/__init__.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2016 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -22,31 +22,8 @@
# ScanCode is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
-
from __future__ import absolute_import
-from os.path import dirname
-from os.path import abspath
-from os.path import getsize
-from os.path import getmtime
-from os.path import join
-from os.path import exists
-
-from commoncode import fileutils
-
-
-lic_src_dir = abspath(dirname(__file__))
-src_dir = dirname(lic_src_dir)
-data_dir = join(lic_src_dir, 'data')
-licenses_data_dir = join(data_dir, 'licenses')
-rules_data_dir = join(data_dir, 'rules')
-root_dir = dirname(src_dir)
-cache_dir = join(root_dir, '.cache')
-license_index_cache_dir = join(cache_dir, 'license_index')
-
-if not exists(license_index_cache_dir):
- fileutils.create_dir(license_index_cache_dir)
-
# minimum number of tokens a match should have to be considered as worthy keeping
MIN_MATCH_LENGTH = 4
MIN_MATCH_HIGH_LENGTH = 3
@@ -55,4 +32,3 @@
# eventually this should be skipped early right during the matching too
# maximum distance between two matches to merge
MAX_DIST = 120
-
diff --git a/src/licensedcode/cache.py b/src/licensedcode/cache.py
index 6f155e97ec4..873ba344bbd 100644
--- a/src/licensedcode/cache.py
+++ b/src/licensedcode/cache.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -26,21 +26,20 @@
from functools import partial
from hashlib import md5
-import os
from os.path import exists
from os.path import getmtime
from os.path import getsize
from os.path import join
-import yg.lockfile # @UnresolvedImport
+import yg.lockfile # NOQA
-from commoncode.fileutils import file_iter
+from commoncode.fileutils import resource_iter
+from commoncode.fileutils import create_dir
from commoncode import ignore
-from licensedcode import root_dir
-from licensedcode import src_dir
-from licensedcode import license_index_cache_dir
-
+from scancode_config import scancode_cache_dir
+from scancode_config import scancode_src_dir
+from scancode_config import SCANCODE_DEV_MODE
"""
An on-disk persistent cache of LicenseIndex. The index is pickled and invalidated if
@@ -48,160 +47,160 @@
cached index is safe to use across multiple processes using lock files.
"""
-index_lock_file = join(license_index_cache_dir, 'lockfile')
-tree_checksum_file = join(license_index_cache_dir, 'tree_checksums')
-index_cache_file = join(license_index_cache_dir, 'index_cache')
-
+LICENSE_INDEX_LOCK_TIMEOUT = 60 * 3
-_ignored_from_hash = partial(
- ignore.is_ignored,
- ignores={'*.pyc': 'pyc files', '*~': 'temp gedit files', '*.swp': 'vi swap files'},
- unignores={}
-)
+# global in-memory cache of the main license index instance
+_LICENSES_INDEX = None
-def tree_checksum(tree_base_dir=src_dir, _ignored=_ignored_from_hash):
+def get_index(cache_dir=scancode_cache_dir, check_consistency=SCANCODE_DEV_MODE,
+ return_value=True):
"""
- Return a checksum computed from a file tree using the file paths,
- size and last modified time stamps.
- The purpose is to detect is there has been any modification to
- source code or data files and use this as a proxy to verify the
- cache consistency.
-
- NOTE: this is not 100% fool proof but good enough in practice.
+    Return an index built from the built-in rules dataset, caching it in
+    memory on first use.
"""
- hashable = (pth + str(getmtime(pth)) + str(getsize(pth))
- for pth in file_iter(tree_base_dir, ignored=_ignored))
- return md5(''.join(sorted(hashable))).hexdigest()
-
-
-LICENSE_INDEX_LOCK_TIMEOUT = 60 * 3
+ global _LICENSES_INDEX
+ if not _LICENSES_INDEX:
+ _LICENSES_INDEX = get_cached_index(cache_dir, check_consistency)
+ if return_value:
+ return _LICENSES_INDEX
-# If this file exists at the root, the cache is always checked for consistency
-DEV_MODE = os.path.exists(os.path.join(root_dir, 'SCANCODE_DEV_MODE'))
+# global in-memory cache of a mapping of key -> license instance
+_LICENSES = {}
-def get_or_build_index_through_cache(
- check_consistency=DEV_MODE,
- return_index=True,
- # used for testing only
- _tree_base_dir=src_dir,
- _tree_checksum_file=tree_checksum_file,
- _index_lock_file=index_lock_file,
- _index_cache_file=index_cache_file,
- _licenses_data_dir=None,
- _rules_data_dir=None,
- _timeout=LICENSE_INDEX_LOCK_TIMEOUT,
- ):
+def get_licenses_db(licenses_data_dir=None):
"""
- Check and build or rebuild the LicenseIndex cache.
- If the cache does not exist, a new index is built an cached.
- Return the LicenseIndex if return_index is True.
+ Return a mapping of license key -> license object.
+ """
+ global _LICENSES
+ if not _LICENSES :
+ from licensedcode.models import load_licenses
+ if not licenses_data_dir:
+ from licensedcode.models import licenses_data_dir as ldd
+ licenses_data_dir = ldd
+ _LICENSES = load_licenses(licenses_data_dir)
+ return _LICENSES
- If `check_consistency` is True, the cache is checked for consistency
- and rebuilt if inconsistent or stale.
- If `check_consistency` is False, the cache is NOT checked for consistency
- If the cache files exist but stale, the cache WILL NOT be rebuilt
+def get_cached_index(cache_dir=scancode_cache_dir,
+ check_consistency=SCANCODE_DEV_MODE,
+ # used for testing only
+ timeout=LICENSE_INDEX_LOCK_TIMEOUT,
+ tree_base_dir=scancode_src_dir,
+ licenses_data_dir=None, rules_data_dir=None,):
+ """
+ Return a LicenseIndex: either load a cached index or build and cache the
+ index.
+    - If the cache does not exist, a new index is built and cached.
+ - If `check_consistency` is True, the cache is checked for consistency and
+ rebuilt if inconsistent or stale.
+ - If `check_consistency` is False, the cache is NOT checked for consistency
+ If the cache files exist but ARE stale, the cache WILL NOT be rebuilt
"""
from licensedcode.index import LicenseIndex
+ from licensedcode.models import licenses_data_dir as ldd
+ from licensedcode.models import rules_data_dir as rdd
from licensedcode.models import get_rules
- from licensedcode.models import licenses_data_dir
- from licensedcode.models import rules_data_dir
- _licenses_data_dir = _licenses_data_dir or licenses_data_dir
- _rules_data_dir = _rules_data_dir or rules_data_dir
- has_cache = exists(_index_cache_file)
- has_tree_checksum = exists(_tree_checksum_file)
+ licenses_data_dir = licenses_data_dir or ldd
+ rules_data_dir = rules_data_dir or rdd
+
+ lock_file, checksum_file, cache_file = get_license_cache_paths(cache_dir)
+
+ has_cache = exists(cache_file)
+ has_tree_checksum = exists(checksum_file)
# bypass check if no consistency check is needed
if has_cache and has_tree_checksum and not check_consistency:
- return return_index and _load_index(_index_cache_file)
+ return load_index(cache_file)
# here, we have no cache or we want a validity check: lock, check
# and build or rebuild as needed
try:
# acquire lock and wait until timeout to get a lock or die
- with yg.lockfile.FileLock(_index_lock_file, timeout=_timeout):
+ with yg.lockfile.FileLock(lock_file, timeout=timeout):
current_checksum = None
# is the current cache consistent or stale?
if has_cache and has_tree_checksum:
# if we have a saved cached index
# load saved tree_checksum and compare with current tree_checksum
- with open(_tree_checksum_file, 'rb') as etcs:
+ with open(checksum_file, 'rb') as etcs:
existing_checksum = etcs.read()
- current_checksum = tree_checksum(tree_base_dir=_tree_base_dir)
+ current_checksum = tree_checksum(tree_base_dir=tree_base_dir)
if current_checksum == existing_checksum:
# The cache is consistent with the latest code and data
# load and return
- return return_index and _load_index(_index_cache_file)
+ return load_index(cache_file)
# Here, the cache is not consistent with the latest code and
# data: It is either stale or non-existing: we need to
# rebuild the index and cache it
rules = get_rules(
- licenses_data_dir=_licenses_data_dir,
- rules_data_dir=_rules_data_dir)
+ licenses_data_dir=licenses_data_dir,
+ rules_data_dir=rules_data_dir)
+
idx = LicenseIndex(rules)
- with open(_index_cache_file, 'wb') as ifc:
+
+ with open(cache_file, 'wb') as ifc:
ifc.write(idx.dumps())
# save the new checksums tree
- with open(_tree_checksum_file, 'wb') as ctcs:
- ctcs.write(current_checksum or tree_checksum(tree_base_dir=_tree_base_dir))
+ with open(checksum_file, 'wb') as ctcs:
+ ctcs.write(current_checksum
+ or tree_checksum(tree_base_dir=tree_base_dir))
- return return_index and idx
+ return idx
except yg.lockfile.FileLockTimeout:
# TODO: handle unable to lock in a nicer way
raise
-def _load_index(_index_cache_file=index_cache_file):
+def load_index(cache_file):
"""
Return a LicenseIndex loaded from cache.
"""
from licensedcode.index import LicenseIndex
-
- with open(_index_cache_file, 'rb') as ifc:
+ with open(cache_file, 'rb') as ifc:
# Note: weird but read() + loads() is much (twice++???) faster than load()
- idx = LicenseIndex.loads(ifc.read())
- return idx
-
-
-"""Check the license index and reindex if needed."""
-reindex = partial(get_or_build_index_through_cache, check_consistency=True, return_index=False)
+ return LicenseIndex.loads(ifc.read())
-# global in-memory cache of the main license index instance
-_LICENSES_INDEX = None
+_ignored_from_hash = partial(
+ ignore.is_ignored,
+ ignores={'*.pyc': 'pyc files',
+ '*~': 'temp gedit files',
+ '*.swp': 'vi swap files'},
+ unignores={}
+)
-def get_index(_return_index=True):
+def tree_checksum(tree_base_dir=scancode_src_dir, _ignored=_ignored_from_hash):
"""
- Return and eventually cache an index built from an iterable of rules.
- Build the index from the built-in rules dataset.
- """
- global _LICENSES_INDEX
- if not _LICENSES_INDEX:
- _LICENSES_INDEX = get_or_build_index_through_cache()
- return _return_index and _LICENSES_INDEX
-
+ Return a checksum computed from a file tree using the file paths,
+ size and last modified time stamps.
+    The purpose is to detect if there has been any modification to
+ source code or data files and use this as a proxy to verify the
+ cache consistency.
-# global in-memory cache of a mapping of key -> license instance
-_LICENSES = {}
+ NOTE: this is not 100% fool proof but good enough in practice.
+ """
+ resources = resource_iter(tree_base_dir, ignored=_ignored, with_dirs=False)
+ hashable = (pth + str(getmtime(pth)) + str(getsize(pth)) for pth in resources)
+ return md5(''.join(sorted(hashable))).hexdigest()
-def get_licenses_db(licenses_data_dir=None):
+def get_license_cache_paths(cache_dir=scancode_cache_dir):
"""
- Return a mapping of license key -> license object.
+    Return a tuple of index cache file paths given a master `cache_dir`
"""
- global _LICENSES
- if not _LICENSES :
- from licensedcode.models import load_licenses
- if not licenses_data_dir:
- from licensedcode.models import licenses_data_dir as ldd
- licenses_data_dir = ldd
- _LICENSES = load_licenses(licenses_data_dir)
- return _LICENSES
+ idx_cache_dir = join(cache_dir, 'license_index')
+ create_dir(idx_cache_dir)
+
+ lock_file = join(idx_cache_dir, 'lockfile')
+ checksum_file = join(idx_cache_dir, 'tree_checksums')
+ cache_file = join(idx_cache_dir, 'index_cache')
+
+ return lock_file, checksum_file, cache_file
diff --git a/src/licensedcode/index.py b/src/licensedcode/index.py
index 049f36fe00b..2c298dbc2af 100644
--- a/src/licensedcode/index.py
+++ b/src/licensedcode/index.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
-# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -34,16 +34,15 @@
from functools import partial
from itertools import izip
from operator import itemgetter
-import os
import sys
from time import time
-from commoncode.dict_utils import sparsify
+# import early
+from scancode_config import scancode_cache_dir
+from commoncode.dict_utils import sparsify
from licensedcode import MAX_DIST
-from licensedcode.cache import get_index
from licensedcode.frequent_tokens import global_tokens_by_ranks
-
from licensedcode import match
from licensedcode import match_aho
from licensedcode import match_hash
@@ -83,8 +82,8 @@
def logger_debug(*args):
pass
-if (TRACE or TRACE_INDEXING_PERF or TRACE_QUERY_RUN_SIMPLE
- or os.environ.get('SCANCODE_LICENSE_DEBUG') or TRACE_NEGATIVE):
+
+if TRACE or TRACE_INDEXING_PERF or TRACE_QUERY_RUN_SIMPLE or TRACE_NEGATIVE:
import logging
logger = logging.getLogger(__name__)
@@ -96,7 +95,8 @@ def logger_debug(*args):
return logger.debug(' '.join(isinstance(a, basestring) and a or repr(a) for a in args))
-def get_license_matches(location=None, query_string=None, min_score=0):
+def get_license_matches(location=None, query_string=None, min_score=0,
+ cache_dir=scancode_cache_dir):
"""
Yield detected license matches in the file at `location` or the
`query_string` string.
@@ -108,7 +108,8 @@ def get_license_matches(location=None, query_string=None, min_score=0):
The minimum length for an approximate match is four tokens.
Spurrious matched are always filtered.
"""
- return get_index().match(location=location, query_string=query_string, min_score=min_score)
+ from licensedcode.cache import get_index
+ return get_index(cache_dir).match(location=location, query_string=query_string, min_score=min_score)
# Feature switch to enable or not ngram fragments detection
@@ -565,7 +566,7 @@ def negative_match(self, query_run):
from the query run.
"""
matches = match_aho.exact_match(self, query_run, self.negative_automaton)
-
+
if TRACE_NEGATIVE and matches: logger_debug(' ##final _negative_matches:....', len(matches))
return matches
diff --git a/src/licensedcode/legal.py b/src/licensedcode/legal.py
index 7a7e0933af8..e7af15faa9d 100644
--- a/src/licensedcode/legal.py
+++ b/src/licensedcode/legal.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -28,17 +28,14 @@
from commoncode import fileutils
-
"""
Recognition of typical "legal" files such as "LICENSE", "COPYING", etc.
"""
-
special_names = (
'COPYING', 'COPYRIGHT', 'NOTICE', 'LICENSE', 'LICENCE',
'LEGAL', 'EULA', 'AGREEMENT', 'ABOUT', 'COPYLEFT', 'LICENSING')
-
special_names_lower = tuple(x.lower() for x in special_names)
diff --git a/src/licensedcode/match.py b/src/licensedcode/match.py
index 440357e5ff4..d3e35dae75d 100644
--- a/src/licensedcode/match.py
+++ b/src/licensedcode/match.py
@@ -53,6 +53,7 @@
def logger_debug(*args): pass
+
if (TRACE or TRACE_FILTER_CONTAINS or TRACE_MERGE
or TRACE_REFINE_RULE_MIN_COVERAGE or TRACE_REFINE_SINGLE
or TRACE_REFINE_SMALL):
@@ -430,7 +431,6 @@ def merge_matches(matches, max_dist=MAX_DIST):
returned as-is.
For being merged two matches must also be in increasing query and index positions.
"""
- from licensedcode.match_seq import MATCH_SEQ
# shortcut for single matches
if len(matches) < 2:
@@ -474,7 +474,6 @@ def merge_matches(matches, max_dist=MAX_DIST):
if TRACE_MERGE: logger_debug(' ---> ###merge_matches: MAX_DIST reached, breaking')
break
-
# keep one of equal matches
# with same qspan: FIXME: is this ever possible?
if current_match.qspan == next_match.qspan and current_match.ispan == next_match.ispan:
@@ -563,10 +562,10 @@ def merge_matches(matches, max_dist=MAX_DIST):
merged.extend(rule_matches)
return merged
-
# FIXME we should consider the length and distance between matches to break
# early from the loops: trying to check containment on wildly separated matches does not make sense
+
def filter_contained_matches(matches):
"""
Return a filtered list of LicenseMatch given a `matches` list of LicenseMatch by
@@ -1067,6 +1066,7 @@ def get_full_matched_text(
dictionary_get = idx.dictionary.get
import attr
+
@attr.s(slots=True)
class Token(object):
value = attr.ib()
diff --git a/src/licensedcode/match_aho.py b/src/licensedcode/match_aho.py
index 8c7b090775f..d5c706877dd 100644
--- a/src/licensedcode/match_aho.py
+++ b/src/licensedcode/match_aho.py
@@ -51,6 +51,7 @@ def logger_debug(*args):
logging.basicConfig(stream=sys.stdout)
logger.setLevel(logging.DEBUG)
else:
+
def logger_debug(*args):
pass
diff --git a/src/licensedcode/match_hash.py b/src/licensedcode/match_hash.py
index 76bfc5de15c..512d2a1e2e3 100644
--- a/src/licensedcode/match_hash.py
+++ b/src/licensedcode/match_hash.py
@@ -30,7 +30,6 @@
from licensedcode.spans import Span
from licensedcode.match import LicenseMatch
-
"""
Matching strategy using hashes to match a whole text chunk at once.
"""
@@ -51,10 +50,10 @@ def logger_debug(*args):
logging.basicConfig(stream=sys.stdout)
logger.setLevel(logging.DEBUG)
else:
+
def logger_debug(*args):
pass
-
MATCH_HASH = '1-hash'
diff --git a/src/licensedcode/match_seq.py b/src/licensedcode/match_seq.py
index 90d5d870f9e..8f70143555e 100644
--- a/src/licensedcode/match_seq.py
+++ b/src/licensedcode/match_seq.py
@@ -24,7 +24,6 @@
from __future__ import absolute_import, division, print_function
-
from licensedcode.match import get_texts
from licensedcode.match import LicenseMatch
from licensedcode.seq import match_blocks
@@ -33,8 +32,10 @@
TRACE = False
TRACE2 = False
+
def logger_debug(*args): pass
+
if TRACE:
import logging
import sys
@@ -54,6 +55,7 @@ def logger_debug(*args):
MATCH_SEQ = '3-seq'
+
def match_sequence(idx, candidate, query_run, start_offset=0):
"""
Return a list of LicenseMatch by matching the `query_run` tokens sequence
diff --git a/src/licensedcode/match_set.py b/src/licensedcode/match_set.py
index b96a95fee44..e7da310e75e 100644
--- a/src/licensedcode/match_set.py
+++ b/src/licensedcode/match_set.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -35,7 +35,6 @@
from licensedcode.models import Rule
-
"""
Approximate matching strategies using token sets and multisets.
@@ -123,6 +122,7 @@
def logger_debug(*args): pass
+
if TRACE:
import logging
import sys
@@ -134,10 +134,10 @@ def logger_debug(*args): pass
def logger_debug(*args):
return logger.debug(' '.join(isinstance(a, basestring) and a or repr(a) for a in args))
-
# TODO: add bigrams sets and multisets
# TODO: see also https://github.com/bolo1729/python-memopt/blob/master/memopt/memopt.py for multisets
+
def tids_sets_intersector(qset, iset):
"""
Return the intersection of a query and index token ids sets.
@@ -225,6 +225,7 @@ def index_token_sets(token_ids, len_junk, len_good):
# would discard when we compute candaites to eventually discard many or all candidates
# we compute too many candidates that may waste time in seq matching for no reason
+
# FIXME: Also we should remove any weak and or small rules from the top candidates
# and anything that cannot be seq matched at all. (e.g. no high match)
def compute_candidates(query_run, idx, rules_subset, top=30):
@@ -270,7 +271,10 @@ def compute_candidates(query_run, idx, rules_subset, top=30):
logger_debug('candidate: ihigh:', [(idx.tokens_by_tid[tid], val) for tid, val in enumerate(ihigh, idx.len_junk)])
thresholds = thresholds_getter(rule)
- compared = compare_sets(qhigh, qlow, ihigh, ilow, thresholds, intersector, counter)
+ if TRACE_DEEP:
+ compared = compare_sets(qhigh, qlow, ihigh, ilow, thresholds, intersector, counter, rule, idx)
+ else:
+ compared = compare_sets(qhigh, qlow, ihigh, ilow, thresholds, intersector, counter)
if compared:
sort_order, intersection = compared
sortable_candidates.append((sort_order, rid, rule, intersection))
@@ -309,7 +313,7 @@ def compute_candidates(query_run, idx, rules_subset, top=30):
return candidates
-def compare_sets(qhigh, qlow, ihigh, ilow, thresholds, intersector, counter):
+def compare_sets(qhigh, qlow, ihigh, ilow, thresholds, intersector, counter, _rule=None, _idx=None):
"""
Compare a query qhigh and qlow sets with an index rule ihigh and ilow sets.
Return a tuple suitable for sorting and the computed sets intersection or None if
@@ -383,4 +387,9 @@ def compare_sets(qhigh, qlow, ihigh, ilow, thresholds, intersector, counter):
inter = low_inter
low_inter.update(high_inter)
+ if TRACE_DEEP:
+ logger_debug('compare_sets: intersected rule:', _rule.identifier)
+ logger_debug(' compare_sets: thresholds:', thresholds)
+ logger_debug(' compare_sets: high_inter:', ' '.join(_idx.tokens_by_tid[tid] for tid in high_inter))
+
return sort_order, inter
diff --git a/src/licensedcode/models.py b/src/licensedcode/models.py
index 6057761d43e..99791aa7a52 100644
--- a/src/licensedcode/models.py
+++ b/src/licensedcode/models.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -23,9 +23,9 @@
# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
from __future__ import absolute_import
-from __future__ import unicode_literals
-from __future__ import print_function
from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
import codecs
from collections import Counter
@@ -34,23 +34,27 @@
from collections import OrderedDict
from itertools import chain
from operator import itemgetter
+from os.path import abspath
+from os.path import dirname
from os.path import exists
from os.path import join
+from commoncode.fileutils import copyfile
from commoncode.fileutils import file_base_name
from commoncode.fileutils import file_name
-from commoncode.fileutils import file_iter
+from commoncode.fileutils import resource_iter
from commoncode import saneyaml
from textcode.analysis import text_lines
from licensedcode import MIN_MATCH_LENGTH
from licensedcode import MIN_MATCH_HIGH_LENGTH
-from licensedcode import licenses_data_dir
-from licensedcode import rules_data_dir
from licensedcode.tokenize import rule_tokenizer
from licensedcode.tokenize import query_tokenizer
-from commoncode import fileutils
+# these are globals but always side-by-side with the code so not moving
+data_dir = join(abspath(dirname(__file__)), 'data')
+licenses_data_dir = join(data_dir, 'licenses')
+rules_data_dir = join(data_dir, 'rules')
"""
Reference License and license Rule structures persisted as a combo of a YAML
@@ -180,7 +184,7 @@ def relocate(self, target_dir, new_key=None):
# save it all to files
if self.text:
- fileutils.copyfile(self.text_file, newl.text_file)
+ copyfile(self.text_file, newl.text_file)
newl.dump()
return newl
@@ -389,7 +393,6 @@ def validate(licenses, verbose=False, no_dupe_urls=False):
# for global dedupe
by_text[license_qtokens].append(key + ': TEXT')
-
# SPDX consistency
if lic.spdx_license_key:
by_spdx_key[lic.spdx_license_key].append(key)
@@ -431,7 +434,7 @@ def load_licenses(licenses_data_dir=licenses_data_dir , with_deprecated=False):
Return a mapping of key -> license objects, loaded from license files.
"""
licenses = {}
- for data_file in file_iter(licenses_data_dir):
+ for data_file in resource_iter(licenses_data_dir, with_dirs=False):
if not data_file.endswith('.yml'):
continue
key = file_base_name(data_file)
@@ -511,7 +514,7 @@ def load_rules(rules_data_dir=rules_data_dir, load_notes=False):
processed_files = set()
lower_case_files = set()
case_problems = set()
- for data_file in file_iter(rules_data_dir):
+ for data_file in resource_iter(rules_data_dir, with_dirs=False):
if data_file.endswith('.yml'):
base_name = file_base_name(data_file)
rule_file = join(rules_data_dir, base_name + '.RULE')
@@ -740,30 +743,37 @@ def thresholds(self):
Return a Thresholds tuple considering the occurrence of all tokens.
"""
if not self._thresholds:
- min_high = min([self.high_length, MIN_MATCH_HIGH_LENGTH])
- min_len = MIN_MATCH_LENGTH
+ length = self.length
+ high_length = self.high_length
+ if length > 200:
+ min_high = high_length // 10
+ min_len = length // 10
+ else:
+ min_high = min([high_length, MIN_MATCH_HIGH_LENGTH])
+ min_len = MIN_MATCH_LENGTH
# note: we cascade ifs from largest to smallest lengths
# FIXME: this is not efficient
+
if self.length < 30:
- min_len = self.length // 2
+ min_len = length // 2
if self.length < 10:
- min_high = self.high_length
- min_len = self.length
+ min_high = high_length
+ min_len = length
self.minimum_coverage = 80
if self.length < 3:
- min_high = self.high_length
- min_len = self.length
+ min_high = high_length
+ min_len = length
self.minimum_coverage = 100
if self.minimum_coverage == 100:
- min_high = self.high_length
- min_len = self.length
+ min_high = high_length
+ min_len = length
self._thresholds = Thresholds(
- self.high_length, self.low_length, self.length,
+ high_length, self.low_length, length,
self.small(), min_high, min_len
)
return self._thresholds
@@ -773,31 +783,40 @@ def thresholds_unique(self):
Return a Thresholds tuple considering the occurrence of only unique tokens.
"""
if not self._thresholds_unique:
- highu = (int(self.high_unique // 2)) or self.high_unique
- min_high = min([highu, MIN_MATCH_HIGH_LENGTH])
- min_len = MIN_MATCH_LENGTH
+ length = self.length
+ high_unique = self.high_unique
+ length_unique = self.length_unique
+
+ if length > 200:
+ min_high = high_unique // 10
+ min_len = length // 10
+ else:
+ highu = (int(high_unique // 2)) or high_unique
+ min_high = min([highu, MIN_MATCH_HIGH_LENGTH])
+ min_len = MIN_MATCH_LENGTH
+
# note: we cascade IFs from largest to smallest lengths
- if self.length < 20:
- min_high = self.high_unique
+ if length < 20:
+ min_high = high_unique
min_len = min_high
- if self.length < 10:
- min_high = self.high_unique
- if self.length_unique < 2:
- min_len = self.length_unique
+ if length < 10:
+ min_high = high_unique
+ if length_unique < 2:
+ min_len = length_unique
else:
- min_len = self.length_unique - 1
+ min_len = length_unique - 1
- if self.length < 5:
- min_high = self.high_unique
- min_len = self.length_unique
+ if length < 5:
+ min_high = high_unique
+ min_len = length_unique
if self.minimum_coverage == 100:
- min_high = self.high_unique
- min_len = self.length_unique
+ min_high = high_unique
+ min_len = length_unique
self._thresholds_unique = Thresholds(
- self.high_unique, self.low_unique, self.length_unique,
+ high_unique, self.low_unique, length_unique,
self.small(), min_high, min_len)
return self._thresholds_unique
diff --git a/src/licensedcode/query.py b/src/licensedcode/query.py
index baa94c827a1..2f76bd16226 100644
--- a/src/licensedcode/query.py
+++ b/src/licensedcode/query.py
@@ -23,7 +23,8 @@
# ScanCode is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
-from __future__ import print_function, absolute_import
+from __future__ import absolute_import
+from __future__ import print_function
from collections import defaultdict
@@ -35,7 +36,6 @@
from licensedcode.tokenize import query_lines
from licensedcode.tokenize import query_tokenizer
-
"""
Build license queries from scanned files to feed the detection pipeline.
@@ -84,6 +84,7 @@
def logger_debug(*args):
pass
+
if TRACE:
import logging
import sys
diff --git a/src/licensedcode/seq.py b/src/licensedcode/seq.py
index cf23d0dd9ea..68555c4f58e 100644
--- a/src/licensedcode/seq.py
+++ b/src/licensedcode/seq.py
@@ -3,7 +3,6 @@
from collections import namedtuple as _namedtuple
-
"""
Token sequences alignement and diffing based on the longest common substrings of
"high tokens". This essentially a non-optimal and reasonably fast single local
@@ -15,7 +14,6 @@
license: PSF. See seq.ABOUT file for details.
"""
-
Match = _namedtuple('Match', 'a b size')
diff --git a/src/licensedcode/spans.py b/src/licensedcode/spans.py
index bb258ad36f2..4b60d0cf155 100644
--- a/src/licensedcode/spans.py
+++ b/src/licensedcode/spans.py
@@ -37,7 +37,6 @@
from intbitset import intbitset
-
"""
Ranges and intervals of integers using bitmaps.
Used as a compact and faster data structure for token and position sets.
@@ -51,6 +50,7 @@ class Span(Set):
It is equivalent to a sparse closed interval.
Originally derived and heavily modified from Whoosh Span.
"""
+
def __init__(self, *args):
"""
Create a new Span from a start and end ints or an iterable of ints.
diff --git a/src/licensedcode/tokenize.py b/src/licensedcode/tokenize.py
index ef53e9094ca..4549621bc7e 100644
--- a/src/licensedcode/tokenize.py
+++ b/src/licensedcode/tokenize.py
@@ -34,12 +34,12 @@
from textcode.analysis import text_lines
-
"""
Utilities to break texts in lines and tokens (aka. words) with specialized version
for queries and rules texts.
"""
+
def query_lines(location=None, query_string=None, strip=True):
"""
Return an iterable of text lines given a file at `location` or a
@@ -70,6 +70,7 @@ def query_lines(location=None, query_string=None, strip=True):
query_pattern = '[^\W]+\+?[^\W]*'
word_splitter = re.compile(query_pattern, re.UNICODE).findall
+
def query_tokenizer(text, lower=True):
"""
Return an iterable of tokens from a unicode query text.
@@ -84,11 +85,11 @@ def query_tokenizer(text, lower=True):
# matched text collection
not_query_pattern = '[\W\s\+]+[\W\s]?'
-
# collect tokens and non-token texts in two different groups
_text_capture_pattern = '(?P' + query_pattern + ')' + '|' + '(?P' + not_query_pattern + ')'
tokens_and_non_tokens = re.compile(_text_capture_pattern, re.UNICODE).finditer
+
def matched_query_text_tokenizer(text):
"""
Return an iterable of tokens and non-tokens from a unicode query text keeping
@@ -118,6 +119,7 @@ def matched_query_text_tokenizer(text):
rule_pattern = '%s|%s+' % (query_pattern, template_pattern,)
template_splitter = re.compile(rule_pattern , re.UNICODE).findall
+
def rule_tokenizer(text, lower=True):
"""
Return an iterable of tokens from a unicode rule text, skipping templated
diff --git a/src/packagedcode/__init__.py b/src/packagedcode/__init__.py
index 47aab4d7a92..68fd453243e 100644
--- a/src/packagedcode/__init__.py
+++ b/src/packagedcode/__init__.py
@@ -22,6 +22,8 @@
# ScanCode is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
+from __future__ import absolute_import
+
from packagedcode import models
from packagedcode import maven
from packagedcode import npm
@@ -29,7 +31,6 @@
from packagedcode import phpcomposer
from packagedcode import rpm
-
# Note: the order matters: from the most to the least specific
# Package classes MUST be added to this list to be active
PACKAGE_TYPES = [
diff --git a/src/packagedcode/maven.py b/src/packagedcode/maven.py
index 4ceb12fe7dd..cb8e5fe9832 100644
--- a/src/packagedcode/maven.py
+++ b/src/packagedcode/maven.py
@@ -45,7 +45,6 @@
from typecode import contenttype
from textcode import analysis
-
logger = logging.getLogger(__name__)
TRACE = False
@@ -54,12 +53,12 @@
logging.basicConfig(stream=sys.stdout)
logger.setLevel(logging.DEBUG)
-
"""
Support Maven2 POMs.
Attempts to resolve Maven properties when possible.
"""
+
class MavenPomPackage(models.Package):
metafiles = ('.pom', 'pom.xml',)
extensions = ('.pom', '.xml',)
@@ -118,6 +117,7 @@ def to_dict(self):
class MavenPom(pom.Pom):
+
def __init__(self, location=None, text=None):
"""
Build a POM from a location or unicode text.
@@ -217,6 +217,7 @@ def _extra_properties(self):
def _replace_props(cls, text, properties):
if not text:
return text
+
def subfunc(matchobj):
"""Return the replacement value for a matched property key."""
key = matchobj.group(1)
@@ -775,6 +776,7 @@ class MavenRecognizer(object):
"""
A package recognizer for Maven-based packages.
"""
+
def __init__(self):
return NotImplementedError()
diff --git a/src/packagedcode/models.py b/src/packagedcode/models.py
index ed1f0b07368..ce6e039b779 100644
--- a/src/packagedcode/models.py
+++ b/src/packagedcode/models.py
@@ -54,7 +54,6 @@
from schematics.types.compound import ModelType
from schematics.transforms import blacklist
-
"""
Common data model for package information and dependencies, abstracting the
many small differences existing between package management formats and tools.
@@ -127,6 +126,7 @@ class BaseListType(ListType):
"""
ListType with a default of an empty list.
"""
+
def __init__(self, field, **kwargs):
super(BaseListType, self).__init__(field=field, default=[], **kwargs)
@@ -138,6 +138,7 @@ class PackageIndentifierType(BaseType):
"""
Global identifier for a package
"""
+
def __init__(self, **kwargs):
super(PackageIndentifierType, self).__init__(**kwargs)
@@ -298,6 +299,7 @@ class BaseModel(Model):
"""
Base class for all schematics models.
"""
+
def __init__(self, **kwargs):
super(BaseModel, self).__init__(raw_data=kwargs)
@@ -514,7 +516,6 @@ def resolve(self):
payload_doc = 'doc'
PAYLOADS = (payload_src, payload_bin, payload_doc)
-
# Packaging types
#################################
as_archive = 'archive'
@@ -946,7 +947,6 @@ def identifier(self):
"""
return PackageId(self.type, self.name, self.version)
-
#
# Package sub types
# NOTE: this is somewhat redundant with extractcode archive handlers
@@ -1212,7 +1212,6 @@ class SquashfsPackage(Package):
type = StringType(default='squashfs image')
packaging = StringType(default=as_archive)
-
#
# these very generic archive packages must come last in recogniztion order
#
diff --git a/src/packagedcode/nevra.py b/src/packagedcode/nevra.py
index a840ff9f651..8d6718a72ca 100644
--- a/src/packagedcode/nevra.py
+++ b/src/packagedcode/nevra.py
@@ -22,13 +22,13 @@
# ScanCode is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
-from __future__ import absolute_import, print_function
+from __future__ import absolute_import
+from __future__ import print_function
import re
from commoncode import fileutils
-
"""
Utilities to handle RPM NEVRA (name, epoch, version, release, architecture)
"""
@@ -50,6 +50,7 @@
# modified and originally from:
# https://raw.githubusercontent.com/sassoftware/conary/c26507001b62b0839539908cc5bf28893c45c0b4/conary/rpmhelper.py
+
def from_name(filename):
"""
Return an (E, N, V, R, A) tuple given a file name, by splitting
diff --git a/src/packagedcode/npm.py b/src/packagedcode/npm.py
index 756d1806222..655ded0b885 100644
--- a/src/packagedcode/npm.py
+++ b/src/packagedcode/npm.py
@@ -48,7 +48,6 @@
https://github.com/pombredanne/normalize-package-data
"""
-
logger = logging.getLogger(__name__)
# import sys
# logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
@@ -129,7 +128,6 @@ def build_package(package_data, base_dir=None, metafile_name='package.json'):
('repository', repository_mapper),
])
-
if not package_data.get('name') or not package_data.get('version'):
# a package.json without name and version is not a usable NPM package
return
@@ -404,7 +402,6 @@ def deps_mapper(deps, package, field_name):
peer_dependencies_mapper = partial(deps_mapper, field_name='peerDependencies')
optional_dependencies_mapper = partial(deps_mapper, field_name='optionalDependencies')
-
person_parser = re.compile(
r'^(?P[^\(<]+)'
r'\s?'
diff --git a/src/packagedcode/phpcomposer.py b/src/packagedcode/phpcomposer.py
index 7877cc58048..909cedb2c81 100644
--- a/src/packagedcode/phpcomposer.py
+++ b/src/packagedcode/phpcomposer.py
@@ -31,7 +31,6 @@
from collections import OrderedDict
from functools import partial
-
from commoncode import filetype
from commoncode import fileutils
@@ -42,7 +41,6 @@
Handle PHP composer packages, refer to https://getcomposer.org/
"""
-
logger = logging.getLogger(__name__)
# import sys
# logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
@@ -84,7 +82,7 @@ def parse(location):
return build_package(package_data, base_dir, metafile_name)
-def build_package(package_data, base_dir =None, metafile_name='composer.json'):
+def build_package(package_data, base_dir=None, metafile_name='composer.json'):
"""
Return a composer Package object from a package data mapping or
None.
@@ -112,11 +110,10 @@ def build_package(package_data, base_dir =None, metafile_name='composer.json'):
('support', support_mapper),
])
-
# A composer.json without name and description is not a usable PHP
# composer package. Name and description fields are required but
# only for published packages:
- # https://getcomposer.org/doc/04-schema.md#name
+ # https://getcomposer.org/doc/04-schema.md#name
# We want to catch both published and non-published packages here.
package = PHPComposerPackage()
@@ -141,7 +138,7 @@ def build_package(package_data, base_dir =None, metafile_name='composer.json'):
if value:
func(value, package)
# Parse vendor from name value
- vendor_mapper(package)
+ vendor_mapper(package)
return package
diff --git a/src/packagedcode/pypi.py b/src/packagedcode/pypi.py
index c99368fcb16..8b76246122a 100644
--- a/src/packagedcode/pypi.py
+++ b/src/packagedcode/pypi.py
@@ -34,12 +34,10 @@
from packagedcode.models import PythonPackage
from packagedcode import models
-
"""
Detect and collect Python packages information.
"""
-
PKG_INFO_ATTRIBUTES = [
'Name',
'Version',
@@ -129,8 +127,8 @@ def parse_metadata(location):
for fname in ('METADATA', 'DESCRIPTION.rst')):
return
# FIXME: wrap in a with statement
+ # FIXME: use ordereddict
infos = json.loads(open(location, 'rb').read())
- print(infos)
homepage_url = None
authors = []
if infos['extensions']:
diff --git a/src/packagedcode/pyrpm/rpm.py b/src/packagedcode/pyrpm/rpm.py
index 9502c270821..371f69db558 100644
--- a/src/packagedcode/pyrpm/rpm.py
+++ b/src/packagedcode/pyrpm/rpm.py
@@ -35,7 +35,6 @@
from __future__ import absolute_import
-
from StringIO import StringIO
import struct
import re
@@ -63,6 +62,7 @@ def find_magic_number(regexp, data):
class Entry(object):
''' RPM Header Entry
'''
+
def __init__(self, entry, store):
self.entry = entry
self.store = store
@@ -155,6 +155,7 @@ def __readbin(self):
class Header(object):
''' RPM Header Structure
'''
+
def __init__(self, header, entries, store):
self.header = header
self.entries = entries
diff --git a/src/packagedcode/pyrpm/rpmdefs.py b/src/packagedcode/pyrpm/rpmdefs.py
index bd416ad68b0..f1077874503 100644
--- a/src/packagedcode/pyrpm/rpmdefs.py
+++ b/src/packagedcode/pyrpm/rpmdefs.py
@@ -27,9 +27,10 @@
'''
rpm definitions
-
'''
+from __future__ import absolute_import
+
RPM_LEAD_MAGIC_NUMBER = '\xed\xab\xee\xdb'
RPM_HEADER_MAGIC_NUMBER = '\x8e\xad\xe8'
@@ -45,11 +46,9 @@
RPMSIGTAG_GPG = 1005
RPMSIGTAG_PGP5 = 1006
-
MD5_SIZE = 16 # 16 bytes long
PGP_SIZE = 152 # 152 bytes long
-
# data types definition
RPM_DATA_TYPE_NULL = 0
RPM_DATA_TYPE_CHAR = 1
@@ -102,7 +101,6 @@
RPMTAG_SOURCEPACKAGE = 1106
RPMTAG_DISTURL = 1123
-
RPMTAGS = {
RPMTAG_NAME: 'name',
RPMTAG_EPOCH: 'epoch',
@@ -124,7 +122,6 @@
RPMTAG_DISTURL: 'dist_url',
}
-
"""
from rpm.org lib/rpmtag.h
See also: http://refspecs.linuxfoundation.org/LSB_5.0.0/LSB-Core-generic/LSB-Core-generic/pkgformat.html
diff --git a/src/packagedcode/recognize.py b/src/packagedcode/recognize.py
index a8a78f265e2..2a6fe4260db 100644
--- a/src/packagedcode/recognize.py
+++ b/src/packagedcode/recognize.py
@@ -29,17 +29,18 @@
import sys
from commoncode import filetype
+from commoncode.fileutils import fsencode
from commoncode.system import on_linux
-from commoncode.fileutils import path_to_bytes
from packagedcode import PACKAGE_TYPES
from typecode import contenttype
-
TRACE = False
+
def logger_debug(*args):
pass
+
logger = logging.getLogger(__name__)
if TRACE:
@@ -49,7 +50,6 @@ def logger_debug(*args):
def logger_debug(*args):
return logger.debug(' '.join(isinstance(a, basestring) and a or repr(a) for a in args))
-
"""
Recognize packages in files or directories.
"""
@@ -67,12 +67,11 @@ def recognize_package(location):
ftype = T.filetype_file.lower()
mtype = T.mimetype_file
-
for package_type in PACKAGE_TYPES:
# Note: default to True if there is nothing to match against
metafiles = package_type.metafiles
if on_linux:
- metafiles = (path_to_bytes(m) for m in metafiles)
+ metafiles = (fsencode(m) for m in metafiles)
if location.endswith(tuple(metafiles)):
logger_debug('metafile matching: package_type is of type:', package_type)
return package_type.recognize(location)
@@ -89,7 +88,7 @@ def recognize_package(location):
extensions = package_type.extensions
if extensions:
if on_linux:
- extensions = tuple(path_to_bytes(e) for e in extensions)
+ extensions = tuple(fsencode(e) for e in extensions)
extension_matched = location.lower().endswith(extensions)
else:
extension_matched = False
diff --git a/src/packagedcode/rpm.py b/src/packagedcode/rpm.py
index b715bb16952..f0c9ee3d7bb 100644
--- a/src/packagedcode/rpm.py
+++ b/src/packagedcode/rpm.py
@@ -30,19 +30,19 @@
import string
import sys
-
from packagedcode import models
from packagedcode import nevra
from packagedcode.pyrpm.rpm import RPM
import typecode.contenttype
-
TRACE = False
+
def logger_debug(*args):
pass
+
logger = logging.getLogger(__name__)
if TRACE:
@@ -52,7 +52,6 @@ def logger_debug(*args):
def logger_debug(*args):
return logger.debug(' '.join(isinstance(a, basestring) and a or repr(a) for a in args))
-
# TODO: retrieve dependencies
# TODO: parse spec files see:
@@ -82,7 +81,6 @@ def logger_debug(*args):
'bin_or_src',
)
-
RPMInfo = namedtuple('RPMInfo', list(RPM_TAGS))
@@ -118,7 +116,6 @@ def info(location, include_desc=False):
the long RPM description value if include_desc is True.
"""
tgs = tags(location, include_desc)
- print(tgs)
return tgs and RPMInfo(**tgs) or None
@@ -126,6 +123,7 @@ class EVR(namedtuple('EVR', 'epoch version release')):
"""
The RPM Epoch, Version, Release tuple.
"""
+
# note: the order of the named tuple is the sort order.
# But for creation we put the rarely used epoch last
def __new__(self, version, release, epoch=None):
diff --git a/src/packagedcode/utils.py b/src/packagedcode/utils.py
index 423033c739f..69ce4aeb8a2 100644
--- a/src/packagedcode/utils.py
+++ b/src/packagedcode/utils.py
@@ -22,7 +22,8 @@
# ScanCode is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
-from __future__ import print_function, absolute_import
+from __future__ import absolute_import
+from __future__ import print_function
VCS_URLS = (
diff --git a/src/packagedcode/xmlutils.py b/src/packagedcode/xmlutils.py
index 75b3c5af730..3cd018103f0 100644
--- a/src/packagedcode/xmlutils.py
+++ b/src/packagedcode/xmlutils.py
@@ -26,13 +26,11 @@
from __future__ import print_function
from __future__ import unicode_literals
-
import chardet
from lxml import etree
from textcode import analysis
-
"""
Utility functions for dealing with XML.
"""
@@ -61,7 +59,7 @@ def parse(location, handler):
except:
parser = etree.XMLParser(recover=True, remove_blank_text=True, resolve_entities=False)
text = analysis.unicode_text(location)
- xdoc= etree.fromstring(_as_unicode_bytes(text), parser)
+ xdoc = etree.fromstring(_as_unicode_bytes(text), parser)
return handler(xdoc)
diff --git a/src/plugincode/__init__.py b/src/plugincode/__init__.py
index 7ebe6b01cb0..1ef5f285eb2 100644
--- a/src/plugincode/__init__.py
+++ b/src/plugincode/__init__.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2016 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -22,5 +22,209 @@
# ScanCode is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
-
from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from collections import OrderedDict
+import sys
+
+from pluggy import HookimplMarker
+from pluggy import HookspecMarker
+from pluggy import PluginManager as PluggyPluginManager
+from scancode import CommandLineOption
+
+
+class BasePlugin(object):
+ """
+ A base class for all ScanCode plugins.
+ """
+ # List of CommandLineOption CLI options for this plugin.
+ # Subclasses should set this as needed
+ options = []
+
+ # flag set to True once this plugin class has been initialized by calling it
+ # setup() class method.
+ # This is set automatically when a plugin class is loaded in its manager.
+ # Subclasses must not set this.
+ initialized = False
+
+ # stage string for this plugin.
+ # This is set automatically when a plugin class is loaded in its manager.
+ # Subclasses must not set this.
+ stage = None
+
+ # name string under which this plugin is registered.
+ # This is set automatically when a plugin class is loaded in its manager.
+ # Subclasses must not set this.
+ name = None
+
+ # An ordered mapping of attr attributes that specifies the data returned by
+ # this plugin. These attributes will be added to a Resource subclass. The
+ # position of these attributes in the returned serialized data is determined
+ # by the sort_order then the plugin name
+ attributes = OrderedDict()
+
+ # a relative sort order number (integer or float). In scan results, results
+ # from scanners are sorted by this sorted_order then by "keys".
+ # This is also used in the CLI UI to sort the SCAN_GROUP option help group.
+ sort_order = 100
+
+ def __init__(self, *args, **kwargs):
+ """
+ Initialize a new plugin with a user kwargs.
+ Plugins can override as needed (still calling super).
+ """
+ self.options_by_name = {o.name: o for o in self.options}
+
+ self.kwargs = kwargs
+
+ # mapping of scan summary data and statistics.
+ # This is populated automatically on the plugin instance.
+ # Subclasses must not set this.
+ self.summary = OrderedDict()
+
+ # TODO: pass own command options name/values as concrete kwargs
+ def is_enabled(self, **kwargs):
+ """
+        Return True if this plugin is enabled by user-selected options.
+ Subclasses must override.
+ This receives all the ScanCode call arguments as kwargs.
+ """
+ raise NotImplementedError
+
+ # TODO: pass own command options name/values as concrete kwargs
+ def setup(self, **kwargs):
+ """
+ Execute some setup for this plugin. This is guaranteed to be called
+ exactly one time at initialization if this plugin is enabled.
+ Must raise an Exception on failure.
+ Subclasses can override as needed.
+ This receives all the ScanCode call arguments as kwargs.
+ """
+ pass
+
+    # NOTE: Other methods below should NOT be overridden.
+
+ @property
+ def qname(self):
+ """
+ Return the qualified name of this plugin.
+ """
+ return '{self.stage}:{self.name}'.format(self=self)
+
+ def get_option(self, name):
+ """
+ Return the CommandLineOption of this plugin with `name` or None.
+ """
+ return self.options_by_name.get(name)
+
+
+class CodebasePlugin(BasePlugin):
+ """
+ Base class for plugins that process a whole codebase at once.
+ """
+
+ def process_codebase(self, codebase, **kwargs):
+ """
+        Process a `codebase` Codebase object updating its Resource as needed.
+ Subclasses should override.
+ This receives all the ScanCode call arguments as kwargs.
+ """
+ raise NotImplementedError
+
+
+class PluginManager(object):
+ """
+ A PluginManager class for plugins.
+ """
+
+ # a global managers cache as a mapping of {stage: manager instance}
+ managers = {}
+
+ def __init__(self, stage, module_qname, entrypoint, plugin_base_class):
+ """
+ Initialize this plugin manager for the `stage` specified in the fully
+ qualified Python module name `module_qname` with plugins loaded from the
+ setuptools `entrypoint` that must subclass `plugin_base_class`.
+ """
+ self.manager = PluggyPluginManager(project_name=stage)
+ self.managers[stage] = self
+
+ self.stage = stage
+ self.entrypoint = entrypoint
+ self.plugin_base_class = plugin_base_class
+ self.manager.add_hookspecs(sys.modules[module_qname])
+
+ # set to True once this manager is initialized by running its setup()
+ self.initialized = False
+
+ # mapping of {plugin.name: plugin_class} for all the plugins of this
+ # manager
+ self.plugin_classes = OrderedDict()
+
+ @classmethod
+ def load_plugins(cls):
+ """
+        Setup the plugins environment.
+ Must be called once to initialize all the plugins of all managers.
+ """
+ plugin_classes = []
+ plugin_options = []
+ for stage, manager in cls.managers.items():
+ mgr_setup = manager.setup()
+ if not mgr_setup:
+ msg = 'Cannot load ScanCode plugins for stage: %(stage)s' % locals()
+ raise Exception(msg)
+ mplugin_classes, mplugin_options = mgr_setup
+ plugin_classes.extend(mplugin_classes)
+ plugin_options.extend(mplugin_options)
+ return plugin_classes, plugin_options
+
+ def setup(self):
+ """
+ Return a tuple of (list of all plugin classes, list of all options of
+ all plugin classes).
+
+ Load and validate available plugins for this PluginManager from its
+ assigned `entrypoint`. Raise an Exception if a plugin is not valid such
+        that when it does not subclass the manager `plugin_base_class`.
+        Must be called once to setup the plugins of this manager.
+ """
+ if self.initialized:
+ return
+
+ entrypoint = self.entrypoint
+ try:
+ self.manager.load_setuptools_entrypoints(entrypoint)
+ except ImportError, e:
+ raise e
+ stage = self.stage
+
+ plugin_options = []
+ for name, plugin_class in self.manager.list_name_plugin():
+
+ if not issubclass(plugin_class, self.plugin_base_class):
+ qname = '%(stage)s:%(name)s' % locals()
+ raise Exception(
+ 'Invalid plugin: %(qname)r: %(plugin_class)r '
+ 'must extend %(plugin_base_class)r.' % locals())
+
+ for option in plugin_class.options:
+ if not isinstance(option, CommandLineOption):
+ qname = '%(stage)s:%(name)s' % locals()
+ oname = option.name
+ clin = CommandLineOption
+ raise Exception(
+ 'Invalid plugin: %(qname)r: option %(oname)r '
+ 'must extend %(clin)r.' % locals())
+ plugin_options.append(option)
+
+ plugin_class.stage = stage
+ plugin_class.name = name
+
+ self.plugin_classes[name] = plugin_class
+
+ self.initialized = True
+ return self.plugin_classes.values(), plugin_options
+
diff --git a/src/plugincode/output.py b/src/plugincode/output.py
index 824911e406a..987e6b04774 100644
--- a/src/plugincode/output.py
+++ b/src/plugincode/output.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -23,56 +23,85 @@
# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
from __future__ import absolute_import
+from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
-from collections import OrderedDict
-import sys
+from functools import partial
+from itertools import imap
-from pluggy import HookimplMarker
-from pluggy import HookspecMarker
-from pluggy import PluginManager
+from plugincode import CodebasePlugin
+from plugincode import PluginManager
+from plugincode import HookimplMarker
+from plugincode import HookspecMarker
+from scancode.resource import Resource
+# Python 2 and 3 support
+try:
+ # Python 2
+ unicode
+ str_orig = str
+ bytes = str # NOQA
+ str = unicode # NOQA
+except NameError:
+ # Python 3
+ unicode = str # NOQA
-scan_output_spec = HookspecMarker('scan_output_writer')
-scan_output_writer = HookimplMarker('scan_output_writer')
+# Tracing flags
+TRACE = False
+TRACE_DEEP = False
-# FIXME: simplify the hooskpec
-@scan_output_spec
-def write_output(files_count, version, notice, scanned_files, options, input, output_file, _echo):
- """
- Write the `scanned_files` scan results in the format supplied by
- the --format command line option.
- Parameters:
- - `file_count`: the number of files and directories scanned.
- - `version`: ScanCode version
- - `notice`: ScanCode notice
- - `scanned_files`: an iterable of scan results for each file
- - `options`: a mapping of key by command line option to a flag True
- if this option was enabled.
- - `input`: the original input path scanned.
- - `output_file`: an opened, file-like object to write the output to.
- - `_echo`: a funtion to echo strings to stderr. This will be removedd in the future.
- """
+def logger_debug(*args):
pass
-output_plugins = PluginManager('scan_output_writer')
-output_plugins.add_hookspecs(sys.modules[__name__])
+if TRACE or TRACE_DEEP:
+ import logging
+ import sys
+ logger = logging.getLogger(__name__)
+ logging.basicConfig(stream=sys.stdout)
+ logger.setLevel(logging.DEBUG)
-def initialize():
- """
- NOTE: this defines the entry points for use in setup.py
- """
- output_plugins.load_setuptools_entrypoints('scancode_output_writers')
+ def logger_debug(*args):
+ return logger.debug(' '.join(isinstance(a, unicode)
+ and a or repr(a) for a in args))
+
+stage = 'output'
+entrypoint = 'scancode_output'
+output_spec = HookspecMarker(project_name=stage)
+output_impl = HookimplMarker(project_name=stage)
-def get_format_plugins():
+
+@output_spec
+class OutputPlugin(CodebasePlugin):
"""
- Return an ordered mapping of format name --> plugin callable for all
- the output plugins. The mapping is ordered by sorted key.
- This is the main API for other code to access format plugins.
+    Base plugin class for scan output formatters that all output plugins must extend.
"""
- return OrderedDict(sorted(output_plugins.list_name_plugin()))
+
+ def process_codebase(self, codebase, **kwargs):
+ """
+ Write scan output for the `codebase`.
+ """
+ raise NotImplementedError
+
+ @classmethod
+ def get_results(cls, codebase, info, full_root, strip_root, timing, **kwargs):
+ """
+ Return an iterable of serialized scan results from a codebase.
+ """
+ # FIXME: serialization SHOULD NOT be needed: only some format need it
+ # (e.g. JSON) and only these should serialize
+ serializer = partial(Resource.to_dict, with_info=info, with_timing=timing)
+ resources = codebase.walk_filtered(topdown=True, skip_root=strip_root)
+ return imap(serializer, resources)
+
+
+output_plugins = PluginManager(
+ stage=stage,
+ module_qname=__name__,
+ entrypoint=entrypoint,
+ plugin_base_class=OutputPlugin
+)
diff --git a/src/plugincode/output_filter.py b/src/plugincode/output_filter.py
new file mode 100644
index 00000000000..9c3f4a2e768
--- /dev/null
+++ b/src/plugincode/output_filter.py
@@ -0,0 +1,59 @@
+#
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
+# http://nexb.com and https://github.com/nexB/scancode-toolkit/
+# The ScanCode software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode require an acknowledgment.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# When you publish or redistribute any data created with ScanCode or any ScanCode
+# derivative work, you must accompany this data with the following acknowledgment:
+#
+# Generated with ScanCode and provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+# ScanCode is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from plugincode import CodebasePlugin
+from plugincode import PluginManager
+from plugincode import HookimplMarker
+from plugincode import HookspecMarker
+
+stage = 'output_filter'
+entrypoint = 'scancode_output_filter'
+
+output_filter_spec = HookspecMarker(project_name=stage)
+output_filter_impl = HookimplMarker(project_name=stage)
+
+
+@output_filter_spec
+class OutputFilterPlugin(CodebasePlugin):
+ """
+ Base plugin class for Resource output filter plugins that all output filter
+ plugins must extend.
+
+ Filter plugins MUST NOT modify the codebase beyond setting the
+ Resource.is_filtered flag on resources.
+ """
+ pass
+
+
+output_filter_plugins = PluginManager(
+ stage=stage,
+ module_qname=__name__,
+ entrypoint=entrypoint,
+ plugin_base_class=OutputFilterPlugin
+)
diff --git a/src/plugincode/post_scan.py b/src/plugincode/post_scan.py
index 4f1aee9ce79..2281f759fb8 100644
--- a/src/plugincode/post_scan.py
+++ b/src/plugincode/post_scan.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -25,45 +25,29 @@
from __future__ import absolute_import
from __future__ import unicode_literals
-from collections import OrderedDict
-import sys
+from plugincode import CodebasePlugin
+from plugincode import PluginManager
+from plugincode import HookimplMarker
+from plugincode import HookspecMarker
-from pluggy import HookimplMarker
-from pluggy import HookspecMarker
-from pluggy import PluginManager
+stage = 'post_scan'
+entrypoint = 'scancode_post_scan'
-
-post_scan_spec = HookspecMarker('post_scan')
-post_scan_impl = HookimplMarker('post_scan')
+post_scan_spec = HookspecMarker(project_name=stage)
+post_scan_impl = HookimplMarker(project_name=stage)
@post_scan_spec
-def post_scan_handler(active_scans, results):
+class PostScanPlugin(CodebasePlugin):
"""
- Process the scanned files and yield the modified results.
- Parameters:
- - `active_scans`: a list of scanners names requested in the current run.
- - `results`: an iterable of scan results for each file or directory.
+ A post-scan plugin base class that all post-scan plugins must extend.
"""
pass
-post_scan_plugins = PluginManager('post_scan')
-post_scan_plugins.add_hookspecs(sys.modules[__name__])
-
-
-def initialize():
- """
- NOTE: this defines the entry points for use in setup.py
- """
- post_scan_plugins.load_setuptools_entrypoints('scancode_post_scan')
-
-
-def get_post_scan_plugins():
- """
- Return an ordered mapping of
- "command line option name" --> "plugin callable"
- for all the post_scan plugins. The mapping is sorted by option name.
- This is the main API for other code to access post_scan plugins.
- """
- return OrderedDict(sorted(post_scan_plugins.list_name_plugin()))
+post_scan_plugins = PluginManager(
+ stage=stage,
+ module_qname=__name__,
+ entrypoint=entrypoint,
+ plugin_base_class=PostScanPlugin
+)
diff --git a/src/plugincode/pre_scan.py b/src/plugincode/pre_scan.py
index c9ba789bdad..a44026c7135 100644
--- a/src/plugincode/pre_scan.py
+++ b/src/plugincode/pre_scan.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -25,60 +25,83 @@
from __future__ import absolute_import
from __future__ import unicode_literals
-from collections import OrderedDict
-import sys
+from plugincode import CodebasePlugin
+from plugincode import PluginManager
+from plugincode import HookimplMarker
+from plugincode import HookspecMarker
-from pluggy import HookimplMarker
-from pluggy import HookspecMarker
-from pluggy import PluginManager
+stage = 'pre_scan'
+entrypoint = 'scancode_pre_scan'
+pre_scan_spec = HookspecMarker(stage)
+pre_scan_impl = HookimplMarker(stage)
-pre_scan_spec = HookspecMarker('pre_scan')
-pre_scan_impl = HookimplMarker('pre_scan')
@pre_scan_spec
-class PreScanPlugin(object):
+class PreScanPlugin(CodebasePlugin):
"""
- A pre-scan plugin layout class to be extended by the pre_scan plugins.
- Docstring of a plugin class will be used as the plugin option's help text
+ A pre-scan plugin base class that all pre-scan plugins must extend.
"""
- # attributes to be used while creating the option for this plugin.
- option_attrs = {}
+ # List of scanner name strings that this plugin requires to run first
+    # before this pre-scan plugin runs.
+ # Subclasses should set this as needed
+ requires = []
- def __init__(self, user_input):
- self.user_input = user_input
-
- def process_resource(self, resource):
- """
- Process a resource prior to scan.
- :param resource: instance of Resource to process
- :return: resource or None to ignore the resource
+ def get_required(self, scanner_plugins):
"""
- return resource
+ Return a list of required scanner plugin instances that are direct
+ requirements of self.
- def get_ignores(self):
- """
- Return a dict of ignores to be used when processing resources
+ `scanner_plugins` is a {name: plugin} mapping of enabled scanner
+ plugins.
"""
- return {}
+ required = []
+ for name in self.requires:
+ required_plugin = scanner_plugins.get(name)
-pre_scan_plugins = PluginManager('pre_scan')
-pre_scan_plugins.add_hookspecs(sys.modules[__name__])
+ if not required_plugin:
+ qname = self.qname
+ raise Exception(
+ 'Missing required scan plugin: %(name)r '
+ 'for plugin: %(qname)r.' % locals())
+ required.append(required_plugin)
-def initialize():
- # NOTE: this defines the entry points for use in setup.py
- pre_scan_plugins.load_setuptools_entrypoints('scancode_pre_scan')
- for name, plugin in get_pre_scan_plugins().items():
- if not issubclass(plugin, PreScanPlugin):
- raise Exception('Invalid pre-scan plugin "%(name)s": does not extend "plugincode.pre_scan.PreScanPlugin".' % locals())
+ return unique(required)
-def get_pre_scan_plugins():
+ @classmethod
+ def get_all_required(self, prescan_plugins, scanner_plugins):
+ """
+ Return a list of unique required scanner plugin instances that are direct
+ requirements of any of the `prescan_plugins` pre-scan plugin instances.
+ `prescan_plugins` is a list of enabled pre-scan plugins.
+ `scanner_plugins` is a {name: plugin} mapping of enabled scanner
+ plugins.
+ """
+ required = []
+ for plugin in prescan_plugins:
+ required.extend(plugin.get_required(scanner_plugins))
+ return unique(required)
+
+
+def unique(iterable):
"""
- Return an ordered mapping of CLI option name --> plugin callable
- for all the pre_scan plugins. The mapping is ordered by sorted key.
- This is the main API for other code to access pre_scan plugins.
+ Return a sequence of unique items in `iterable` keeping their
+ original order.
+ Note: this can be very slow for large sequences as this is using lists.
"""
- return OrderedDict(sorted(pre_scan_plugins.list_name_plugin()))
+ uniques = []
+ for item in iterable:
+ if item not in uniques:
+ uniques.append(item)
+ return uniques
+
+
+pre_scan_plugins = PluginManager(
+ stage=stage,
+ module_qname=__name__,
+ entrypoint=entrypoint,
+ plugin_base_class=PreScanPlugin
+)
diff --git a/src/plugincode/scan.py b/src/plugincode/scan.py
new file mode 100644
index 00000000000..77f12ac57e4
--- /dev/null
+++ b/src/plugincode/scan.py
@@ -0,0 +1,91 @@
+#
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
+# http://nexb.com and https://github.com/nexB/scancode-toolkit/
+# The ScanCode software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode require an acknowledgment.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# When you publish or redistribute any data created with ScanCode or any ScanCode
+# derivative work, you must accompany this data with the following acknowledgment:
+#
+# Generated with ScanCode and provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+# ScanCode is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
+
+from __future__ import absolute_import
+from __future__ import unicode_literals
+
+from plugincode import BasePlugin
+from plugincode import PluginManager
+from plugincode import HookimplMarker
+from plugincode import HookspecMarker
+
+stage = 'scan'
+entrypoint = 'scancode_scan'
+
+scan_spec = HookspecMarker(stage)
+scan_impl = HookimplMarker(stage)
+
+
+@scan_spec
+class ScanPlugin(BasePlugin):
+ """
+ A scan plugin base class that all scan plugins must extend. A scan plugin
+ provides a single `get_scanner()` method that returns a scanner function.
+ The key under which scan results are returned for a scanner is the plugin
+ "name" attribute. This attribute is set automatically as the "entrypoint"
+ name used for this plugin.
+ """
+
+ def get_scanner(self, **kwargs):
+ """
+ Return a scanner callable, receiving all the scancode call arguments as
+ kwargs.
+
+ The returned callable MUST be a top-level module importable function
+    (e.g. that is picklable and it can be possibly closed on arguments with
+ functools.partial) and accept these arguments:
+
+ - a first `location` argument that is always an absolute path string to
+ a file. This string is using the filesystem encoding (e.g. bytes on
+ Linux and Unicode elsewhere).
+
+ - other **kwargs that will be all the scancode call arguments.
+
+ The returned callable MUST RETURN an ordered mapping of key/values that
+ must be serializable to JSON.
+
+ All mapping keys must be strings, including for any nested mappings.
+
+ Any value must be one of:
+    - None, unicode or str, int, float, long.
+    str if not unicode WILL be converted to unicode with UTF-8.
+    - iterable/list/tuple/generator or dict/mapping preferably ordered.
+    - any object beyond these above that has an asdict() or to_dict() method
+    that returns an ordered mapping of key/values of the same style as the
+ top-level mapping defined here.
+
+ This callable (typically a bare function) should carry as little state
+ as possible as it may be executed through multiprocessing.
+
+ Subclasses must override.
+ """
+ raise NotImplementedError
+
+
+scan_plugins = PluginManager(
+ stage=stage,
+ module_qname=__name__,
+ entrypoint=entrypoint,
+ plugin_base_class=ScanPlugin
+)
diff --git a/src/scancode/__init__.py b/src/scancode/__init__.py
index 46affd53e35..8d96cabaa22 100644
--- a/src/scancode/__init__.py
+++ b/src/scancode/__init__.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2016 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -22,32 +22,291 @@
# ScanCode is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
-from __future__ import print_function
from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+from collections import namedtuple
+from itertools import chain
from os.path import dirname
from os.path import abspath
from os.path import getsize
from os.path import getmtime
from os.path import join
from os.path import exists
+from types import BooleanType
+
+import click
+from click.types import BoolParamType
from commoncode import fileutils
+# Python 2 and 3 support
+try:
+ # Python 2
+ unicode
+ str_orig = str
+ bytes = str # NOQA
+ str = unicode # NOQA
+except NameError:
+ # Python 3
+ unicode = str # NOQA
-scan_src_dir = abspath(dirname(__file__))
-src_dir = dirname(scan_src_dir)
-root_dir = dirname(src_dir)
-cache_dir = join(root_dir, '.cache')
-scans_cache_dir = join(cache_dir, 'scan_results_caches')
+# Tracing flags
+TRACE = False
-if not exists(scans_cache_dir):
- fileutils.create_dir(scans_cache_dir)
+def logger_debug(*args):
+ pass
+
+
+if TRACE:
+ import logging
+ import sys
+ logger = logging.getLogger(__name__)
+ logging.basicConfig(stream=sys.stdout)
+ logger.setLevel(logging.DEBUG)
+
+ def logger_debug(*args):
+ return logger.debug(' '.join(isinstance(a, (unicode, str))
+ and a or repr(a) for a in args))
+
+# CLI help groups
+SCAN_GROUP = 'primary scans'
+SCAN_OPTIONS_GROUP = 'scan options'
+OTHER_SCAN_GROUP = 'other scans'
+OUTPUT_GROUP = 'output formats'
+OUTPUT_FILTER_GROUP = 'output filters'
+OUTPUT_CONTROL_GROUP = 'output control'
+PRE_SCAN_GROUP = 'pre-scan'
+POST_SCAN_GROUP = 'post-scan'
+MISC_GROUP = 'miscellaneous'
+DOC_GROUP = 'documentation'
+CORE_GROUP = 'core'
+
+# Holds a scan plugin result key and the corresponding function.
+# click.Parameter instance
+Scanner = namedtuple('Scanner', 'name function')
+
+
+class CommandLineOption(click.Option):
+ """
+ An option with extra args and attributes to control CLI help options
+ grouping, co-required and conflicting options (e.g. mutually exclusive).
+ """
+
+ # args are from Click 6.7
+ def __init__(self, param_decls=None, show_default=False,
+ prompt=False, confirmation_prompt=False,
+ hide_input=False, is_flag=None, flag_value=None,
+ multiple=False, count=False, allow_from_autoenv=True,
+ type=None, help=None, # NOQA
+ # custom additions #
+ # a string that set the CLI help group for this option
+ help_group=MISC_GROUP,
+ # a relative sort order number (integer or float) for this
+ # option within a help group: the sort is by increasing
+ # sort_order then by option declaration.
+ sort_order=100,
+ # a sequence of other option name strings that this option
+ # requires to be set
+ requires=(),
+ # a sequence of other option name strings that this option
+ # conflicts with if they are set
+ conflicts=(),
+ # a flag set to True if this option should be hidden from the CLI help
+ hidden=False,
+ **attrs):
+
+ super(CommandLineOption, self).__init__(param_decls, show_default,
+ prompt, confirmation_prompt,
+ hide_input, is_flag, flag_value,
+ multiple, count, allow_from_autoenv,
+ type, help, **attrs)
+
+ self.help_group = help_group
+ self.sort_order = sort_order
+ self.requires = requires
+ self.conflicts = conflicts
+ self.hidden = hidden
+
+ def __repr__(self, *args, **kwargs):
+ name = self.name
+ opt = self.opts[-1]
+ help_group = self.help_group
+ requires = self.requires
+ conflicts = self.conflicts
+
+ return ('CommandLineOption' % locals())
+
+ def validate_dependencies(self, ctx, value):
+ """
+ Validate `value` against declared `requires` or `conflicts` dependencies.
+ """
+ _validate_option_dependencies(ctx, self, value, self.requires, required=True)
+ _validate_option_dependencies(ctx, self, value, self.conflicts, required=False)
+
+
+def validate_option_dependencies(ctx):
+ """
+ Validate all CommandLineOption dependencies in the `ctx` Click context.
+ Ignore eager flags.
+ """
+ values = ctx.params
+ if TRACE:
+ logger_debug('validate_option_dependencies: values:')
+ for va in sorted(values.items()):
+ logger_debug(' ', va)
+
+ for param in ctx.command.params:
+ if param.is_eager:
+ continue
+ if not isinstance(param, CommandLineOption):
+ if TRACE:
+ logger_debug(' validate_option_dependencies: skip param:', param)
+ continue
+ value = values.get(param.name)
+ if TRACE:
+ logger_debug(' validate_option_dependencies: param:', param, 'value:', value)
+ param.validate_dependencies(ctx, value)
+
+
+def _validate_option_dependencies(ctx, param, value,
+ other_option_names, required=False):
+ """
+ Validate the `other_option_names` option dependencies and return a
+ UsageError if the `param` `value` is set to a not-None non-default value and
+ if:
+ - `required` is True and the `other_option_names` options are not set with a
+ not-None value in the `ctx` context.
+ - `required` is False and any of the `other_option_names` options are set
+ with a not-None, non-default value in the `ctx` context.
+ """
+ if not other_option_names:
+ return
+
+ def _is_set(_value, _default, typ):
+ if type in (BooleanType, BoolParamType):
+ return _value
+ return bool(_value is not None and _value != _default)
+
+ is_set = _is_set(value, param.default, param.type)
+
+ if TRACE:
+ logger_debug()
+ logger_debug('Checking param:', param)
+ logger_debug(' value:', value, 'is_set:' , is_set)
+
+ if not is_set:
+ return
+
+ oparams_by_name = {oparam.name: oparam for oparam in ctx.command.params}
+ oparams = []
+ missing_onames = []
+
+ for oname in other_option_names:
+ oparam = oparams_by_name.get(oname)
+ if not oparam:
+ missing_onames.append(oparam)
+ else:
+ oparams.append(oparam)
+
+ if TRACE:
+ logger_debug()
+ logger_debug(' Available other params:')
+ for oparam in oparams:
+ logger_debug(' other param:', oparam)
+ logger_debug(' value:', ctx.params.get(oparam.name))
+ if required:
+ logger_debug(' missing names:', missing_onames)
+
+ if required and missing_onames:
+ opt = param.opts[-1]
+ oopts = [oparam.opts[-1] for oparam in oparams]
+ omopts = ['--' + oname.replace('_', '-') for oname in missing_onames]
+ oopts.extend(omopts)
+ oopts = ', '.join(oopts)
+ msg = ('The option %(opt)s requires the option(s) %(all_opts)s.'
+ 'and is missing %(omopts)s. '
+ 'You must set all of these options if you use this option.' % locals())
+ raise click.UsageError(msg)
+
+ if TRACE:
+ logger_debug()
+ logger_debug(' Checking other params:')
+
+ opt = param.opts[-1]
+
+ for oparam in oparams:
+ ovalue = ctx.params.get(oparam.name)
+ ois_set = _is_set(ovalue, oparam.default, oparam.type)
+
+ if TRACE:
+ logger_debug(' Checking oparam:', oparam)
+ logger_debug(' value:', ovalue, 'ois_set:' , ois_set)
+
+ # by convention the last opt is the long form
+ oopt = oparam.opts[-1]
+ oopts = ', '.join(oparam.opts[-1] for oparam in oparams)
+ all_opts = '%(opt)s and %(oopts)s' % locals()
+ if required and not ois_set:
+ msg = ('The option %(opt)s requires the option(s) %(oopts)s '
+ 'and is missing %(oopt)s. '
+ 'You must set all of these options if you use this option.' % locals())
+ raise click.UsageError(msg)
+
+ if not required and ois_set:
+ msg = ('The option %(opt)s cannot be used together with the %(oopts)s option(s) '
+ 'and %(oopt)s is used. '
+ 'You can set only one of these options at a time.' % locals())
+ raise click.UsageError(msg)
+
+
+class FileOptionType(click.File):
+ """
+ A click.File subclass that ensures that a file name is not set to an
+ existing option parameter to avoid mistakes.
+ """
+
+ def convert(self, value, param, ctx):
+ known_opts = set(chain.from_iterable(p.opts for p in ctx.command.params
+ if isinstance(p, click.Option)))
+ if value in known_opts:
+ self.fail('Illegal file name conflicting with an option name: %s. '
+ 'Use the special "-" file name to print results on screen/stdout.'
+ % (click.types.filename_to_ui(value),
+ ), param, ctx)
+ return click.File.convert(self, value, param, ctx)
+
+
+info_text = '''
+ScanCode scans code and other files for origin and license.
+Visit https://github.com/nexB/scancode-toolkit/ for support and download.
+
+'''
+
+notice_path = join(abspath(dirname(__file__)), 'NOTICE')
+notice_text = open(notice_path).read()
+
+delimiter = '\n\n\n'
+[notice_text, extra_notice_text] = notice_text.split(delimiter, 1)
+extra_notice_text = delimiter + extra_notice_text
+
+delimiter = '\n\n '
+[notice_text, acknowledgment_text] = notice_text.split(delimiter, 1)
+acknowledgment_text = delimiter + acknowledgment_text
+
+notice = acknowledgment_text.strip().replace(' ', '')
+
+
+def print_about(ctx, param, value):
+ """
+ Click callback to print a notice.
+ """
+ if not value or ctx.resilient_parsing:
+ return
+ click.echo(info_text + notice_text + acknowledgment_text + extra_notice_text)
+ ctx.exit()
-from pkg_resources import get_distribution, DistributionNotFound
-try:
- __version__ = get_distribution('scancode-toolkit').version
-except DistributionNotFound:
- # package is not installed ??
- __version__ = '2.2.1'
diff --git a/src/scancode/api.py b/src/scancode/api.py
index 86d0eac8ad9..3675807fc2f 100644
--- a/src/scancode/api.py
+++ b/src/scancode/api.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -22,146 +22,123 @@
# ScanCode is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
-from __future__ import print_function
from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
from __future__ import unicode_literals
from collections import OrderedDict
+from os.path import getsize
-from commoncode.fileutils import as_posixpath
-from commoncode.fileutils import path_to_bytes
-from commoncode.fileutils import path_to_unicode
-from commoncode.system import on_linux
-from scancode.utils import get_relative_path
-
+from commoncode.filetype import get_last_modified_date
+from commoncode.hash import multi_checksums
+from typecode.contenttype import get_type
"""
Main scanning functions.
-Note: this API is unstable and still evolving.
-"""
-
-class Resource(object):
- """
- Store scanned details for a single resource (file or a directory)
- such as infos and path
- """
-
- def __init__(self, scan_cache_class, abs_path, base_is_dir, len_base_path):
- self.scan_cache_class = scan_cache_class()
- self.is_cached = False
- self.abs_path = abs_path
- self.base_is_dir = base_is_dir
- posix_path = as_posixpath(abs_path)
- # fix paths: keep the path as relative to the original
- # base_path. This is always Unicode
- self.rel_path = get_relative_path(posix_path, len_base_path, base_is_dir)
- self.infos = OrderedDict()
- self.infos['path'] = self.rel_path
-
- def put_info(self, infos):
- """
- Cache file info and set `is_cached` to True if already cached or false otherwise.
- """
- self.infos.update(infos)
- self.is_cached = self.scan_cache_class.put_info(self.rel_path, self.infos)
-
- def get_info(self):
- """
- Retrieve info from cache.
- """
- return self.scan_cache_class.get_info(self.rel_path)
+Each scanner is a function that accepts a location and returns a sequence of
+mappings as results.
-def extract_archives(location, recurse=True):
- """
- Extract any archives found at `location` and yield ExtractEvents. If
- `recurse` is True, extracts nested archives-in- archives
- recursively.
- """
- from extractcode.extract import extract
- from extractcode import default_kinds
- for xevent in extract(location, kinds=default_kinds, recurse=recurse):
- yield xevent
+Note: this API is unstable and still evolving.
+"""
-def get_copyrights(location):
+def get_copyrights(location, **kwargs):
"""
- Yield mappings of copyright data detected in the file at `location`.
+ Return a mapping with a single 'copyrights' key with a value that is a list
+ of mappings for copyright detected in the file at `location`.
"""
from cluecode.copyrights import detect_copyrights
-
+ results = []
for copyrights, authors, _years, holders, start_line, end_line in detect_copyrights(location):
result = OrderedDict()
+ results.append(result)
# FIXME: we should call this copyright instead, and yield one item per statement
result['statements'] = copyrights
result['holders'] = holders
result['authors'] = authors
result['start_line'] = start_line
result['end_line'] = end_line
- yield result
+ return dict(copyrights=results)
-def get_emails(location):
+def get_emails(location, **kwargs):
"""
- Yield mappings of emails detected in the file at `location`.
+ Return a mapping with a single 'emails' key with a value that is a list of
+ mappings for emails detected in the file at `location`.
"""
from cluecode.finder import find_emails
+ results = []
for email, line_num in find_emails(location):
if not email:
continue
- misc = OrderedDict()
- misc['email'] = email
- misc['start_line'] = line_num
- misc['end_line'] = line_num
- yield misc
+ result = OrderedDict()
+ results.append(result)
+ result['email'] = email
+ result['start_line'] = line_num
+ result['end_line'] = line_num
+ return dict(emails=results)
-def get_urls(location):
+def get_urls(location, **kwargs):
"""
- Yield mappings of urls detected in the file at `location`.
+ Return a mapping with a single 'urls' key with a value that is a list of
+ mappings for urls detected in the file at `location`.
"""
from cluecode.finder import find_urls
+ results = []
for urls, line_num in find_urls(location):
if not urls:
continue
- misc = OrderedDict()
- misc['url'] = urls
- misc['start_line'] = line_num
- misc['end_line'] = line_num
- yield misc
+ result = OrderedDict()
+ results.append(result)
+ result['url'] = urls
+ result['start_line'] = line_num
+ result['end_line'] = line_num
+ return dict(urls=results)
DEJACODE_LICENSE_URL = 'https://enterprise.dejacode.com/urn/urn:dje:license:{}'
SPDX_LICENSE_URL = 'https://spdx.org/licenses/{}'
-def get_licenses(location, min_score=0, include_text=False, diag=False, license_url_template=DEJACODE_LICENSE_URL):
+def get_licenses(location, min_score=0, include_text=False, diag=False,
+ license_url_template=DEJACODE_LICENSE_URL,
+ cache_dir=None,
+ **kwargs):
"""
- Yield mappings of license data detected in the file at `location`.
+ Return a mapping with a single 'licenses' key with a value that is list of
+ mappings for licenses detected in the file at `location`.
- `minimum_score` is a minimum score threshold from 0 to 100. The
- default is 0 means that all license matches will be returned. With
- any other value matches that have a score below minimum score with
- not be returned.
+ `minimum_score` is a minimum score threshold from 0 to 100. The default is 0
+ means that all license matches are returned. Otherwise, matches with a score
+ below `minimum_score` are returned.
- if `include_text` is True, the matched text is included in the
- returned data.
+ if `include_text` is True, matched text is included in the returned data.
- If `diag` is True, additional match details are returned with the
+ If `diag` is True, additional license match details are returned with the
matched_rule key of the returned mapping.
"""
+ from scancode_config import SCANCODE_DEV_MODE
+ if not cache_dir:
+ from scancode_config import scancode_cache_dir as cache_dir
+
from licensedcode.cache import get_index
from licensedcode.cache import get_licenses_db
- idx = get_index()
+ idx = get_index(cache_dir, SCANCODE_DEV_MODE)
licenses = get_licenses_db()
+ results = []
for match in idx.match(location=location, min_score=min_score):
if include_text:
matched_text = match.matched_text(whole_lines=False)
+
for license_key in match.rule.licenses:
lic = licenses.get(license_key)
result = OrderedDict()
+ results.append(result)
result['key'] = lic.key
result['score'] = match.score()
result['short_name'] = lic.short_name
@@ -194,97 +171,58 @@ def get_licenses(location, min_score=0, include_text=False, diag=False, license_
# FIXME: for sanity this should always be included?????
if include_text:
result['matched_text'] = matched_text
- yield result
+ return dict(licenses=results)
-def get_file_infos(location):
+
+def get_package_info(location, **kwargs):
"""
- Return a mapping of file information collected from the file or
- directory at `location`.
+ Return a mapping with a single 'packages' key with a value that is a list of mappings for package information detected in the file at `location`.
"""
- from commoncode import fileutils
- from commoncode import filetype
- from commoncode.hash import multi_checksums
- from typecode import contenttype
-
- if on_linux:
- location = path_to_bytes(location)
- else:
- location = path_to_unicode(location)
-
- infos = OrderedDict()
- is_file = filetype.is_file(location)
- is_dir = filetype.is_dir(location)
-
- T = contenttype.get_type(location)
-
- infos['type'] = filetype.get_type(location, short=False)
- name = fileutils.file_name(location)
- if is_file:
- base_name, extension = fileutils.splitext(location)
- else:
- base_name = name
- extension = ''
-
- if on_linux:
- infos['name'] = path_to_unicode(name)
- infos['base_name'] = path_to_unicode(base_name)
- infos['extension'] = path_to_unicode(extension)
- else:
- infos['name'] = name
- infos['base_name'] = base_name
- infos['extension'] = extension
-
- infos['date'] = is_file and filetype.get_last_modified_date(location) or None
- infos['size'] = T.size
- infos.update(multi_checksums(location, ('sha1', 'md5',)))
- infos['files_count'] = is_dir and filetype.get_file_count(location) or None
- infos['mime_type'] = is_file and T.mimetype_file or None
- infos['file_type'] = is_file and T.filetype_file or None
- infos['programming_language'] = is_file and T.programming_language or None
- infos['is_binary'] = bool(is_file and T.is_binary)
- infos['is_text'] = bool(is_file and T.is_text)
- infos['is_archive'] = bool(is_file and T.is_archive)
- infos['is_media'] = bool(is_file and T.is_media)
- infos['is_source'] = bool(is_file and T.is_source)
- infos['is_script'] = bool(is_file and T.is_script)
-
- return infos
-
-
-# FIXME: this smells bad
-def _empty_file_infos():
+ from packagedcode.recognize import recognize_package
+ package = recognize_package(location)
+ if package:
+ return dict(packages=[package.to_dict()])
+ return dict(packages=[])
+
+
+def get_file_info(location, **kwargs):
"""
- Return an empty mapping of file info, used in case of failure.
+ Return a mapping of file information collected for the file at `location`.
"""
- infos = OrderedDict()
- infos['type'] = None
- infos['name'] = None
- infos['extension'] = None
- infos['date'] = None
- infos['size'] = None
- infos['sha1'] = None
- infos['md5'] = None
- infos['files_count'] = None
- infos['mime_type'] = None
- infos['file_type'] = None
- infos['programming_language'] = None
- infos['is_binary'] = False
- infos['is_text'] = False
- infos['is_archive'] = False
- infos['is_media'] = False
- infos['is_source'] = False
- infos['is_script'] = False
- return infos
-
-
-def get_package_infos(location):
+ result = OrderedDict()
+
+ # TODO: move date and size these to the inventory collection step???
+ result['date'] = get_last_modified_date(location) or None
+ result['size'] = getsize(location) or 0
+
+ sha1, md5 = multi_checksums(location, ('sha1', 'md5',)).values()
+ result['sha1'] = sha1
+ result['md5'] = md5
+
+ collector = get_type(location)
+ result['mime_type'] = collector.mimetype_file or None
+ result['file_type'] = collector.filetype_file or None
+ result['programming_language'] = collector.programming_language or None
+ result['is_binary'] = bool(collector.is_binary)
+ result['is_text'] = bool(collector.is_text)
+ result['is_archive'] = bool(collector.is_archive)
+ result['is_media'] = bool(collector.is_media)
+ result['is_source'] = bool(collector.is_source)
+ result['is_script'] = bool(collector.is_script)
+ return result
+
+
+def extract_archives(location, recurse=True):
"""
- Return a list of mappings of package information collected from the
- `location` or an empty list.
+ Yield ExtractEvent while extracting archive(s) and compressed files at
+ `location`. If `recurse` is True, extract nested archives-in-archives
+ recursively.
+ Archives and compressed files are extracted in a directory named
+ "-extract" created in the same directory as the archive.
+ Note: this API is returning an iterable and NOT a sequence.
"""
- from packagedcode.recognize import recognize_package
- package = recognize_package(location)
- if not package:
- return []
- return [package.to_dict()]
+ from extractcode.extract import extract
+ from extractcode import default_kinds
+ for xevent in extract(location, kinds=default_kinds, recurse=recurse):
+ yield xevent
diff --git a/src/scancode/cache.py b/src/scancode/cache.py
deleted file mode 100644
index 1621dc42798..00000000000
--- a/src/scancode/cache.py
+++ /dev/null
@@ -1,374 +0,0 @@
-#
-# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
-# http://nexb.com and https://github.com/nexB/scancode-toolkit/
-# The ScanCode software is licensed under the Apache License version 2.0.
-# Data generated with ScanCode require an acknowledgment.
-# ScanCode is a trademark of nexB Inc.
-#
-# You may not use this software except in compliance with the License.
-# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
-# Unless required by applicable law or agreed to in writing, software distributed
-# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-#
-# When you publish or redistribute any data created with ScanCode or any ScanCode
-# derivative work, you must accompany this data with the following acknowledgment:
-#
-# Generated with ScanCode and provided on an "AS IS" BASIS, WITHOUT WARRANTIES
-# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
-# ScanCode should be considered or used as legal advice. Consult an Attorney
-# for any legal advice.
-# ScanCode is a free software code scanning tool from nexB Inc. and others.
-# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
-
-from __future__ import absolute_import
-from __future__ import print_function
-from __future__ import unicode_literals
-
-import codecs
-from collections import OrderedDict
-from functools import partial
-import json
-from hashlib import sha1
-import os
-import posixpath
-import sys
-
-from commoncode import fileutils
-from commoncode.fileutils import as_posixpath
-from commoncode.fileutils import path_to_bytes
-from commoncode.fileutils import path_to_unicode
-from commoncode.system import on_linux
-from commoncode import timeutils
-
-from scancode import scans_cache_dir
-
-
-"""
-Cache scan results for a file or directory disk using a file-based cache.
-
-The approach is to cache the scan of a file using these files:
- - one "global" file contains a log of all the paths scanned.
- - for each file being scanned, we store a file that contains the corresponding file
- info data as JSON. This file is named after the hash of the path of a scanned file.
- - for each unique file being scanned (e.g. based on its content SHA1), we store a
- another JSON file that contains the corresponding scan data. This file is named
- after the hash of the scanned file content.
-
-Once a scan is completed, we iterate the cache to output the final scan results:
-First iterate the global log file to get the paths, from there collect the cached
-file info for that file and from the path and file info collect the cached scanned
-result. This iterator is then streamed to the final JSON output.
-
-Finally once a scan is completed the cache is destroyed to free up disk space.
-
-Internally the cache is organized as a tree of directories named after the first few
-characters or a path hash or file hash. This is to avoid having having too many files
-per directory that can make some filesystems choke as well as having directories that
-are too deep or having file paths that are too long which problematic on some OS.
-"""
-
-# Tracing flags
-TRACE = False
-
-def logger_debug(*args):
- pass
-
-if TRACE:
- import logging
-
- logger = logging.getLogger(__name__)
- # logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
- logging.basicConfig(stream=sys.stdout)
- logger.setLevel(logging.DEBUG)
-
- def logger_debug(*args):
- return logger.debug(' '.join(isinstance(a, unicode) and a or repr(a) for a in args))
-
-
-def get_scans_cache_class(cache_dir=scans_cache_dir):
- """
- Return a new persistent cache class configured with a unique storage directory.
- """
- # create a unique temp directory in cache_dir
- fileutils.create_dir(cache_dir)
- prefix = timeutils.time2tstamp() + u'-'
- cache_dir = fileutils.get_temp_dir(cache_dir, prefix=prefix)
- if on_linux:
- cache_dir = path_to_bytes(cache_dir)
- sc = ScanFileCache(cache_dir)
- sc.setup()
- return partial(ScanFileCache, cache_dir)
-
-
-def info_keys(path, seed=None):
- """
- Return a file info cache "keys" tripple for a path composed of three
- paths segments derived from a checksum.
-
- For example:
- >>> expected = 'fb87db2bb28e9501ac7fdc4812782118f4c94a0f'
- >>> assert expected == sha1('/w421/scancode-toolkit2').hexdigest()
- >>> expected = ('f', 'b', '87db2bb28e9501ac7fdc4812782118f4c94a0f')
- >>> assert expected == info_keys('/w421/scancode-toolkit2')
- """
- # ensure that we always pass bytes to the hash function
- if isinstance(path, unicode):
- path = path_to_bytes(path)
- if seed:
- if isinstance(seed, unicode):
- seed = path_to_bytes(seed)
- path = seed + path
- return keys_from_hash(sha1(path).hexdigest())
-
-
-def scan_keys(path, file_info):
- """
- Return a scan cache keys tripple for a path and file_info. If the file_info
- sha1 is empty (e.g. such as a directory), return a key based on the path instead.
- """
- # we "get" because in some off cases getting file info may have failed
- # or there may be none for a directory
- sha1_digest = file_info.get('sha1')
- if sha1_digest:
- return keys_from_hash(sha1_digest)
- else:
- # we may eventually store directories, in which case we use the
- # path as a key with some extra seed
- return info_keys(path, seed=b'empty hash')
-
-
-def keys_from_hash(hexdigest):
- """
- Return a cache keys triple for a hash hexdigest string.
-
- NOTE: since we use the first character and next two characters as directories, we
- create at most 16 dir at the first level and 16 dir at the second level for each
- first level directory for a maximum total of 16*16 = 256 directories. For a
- million files we would have about 4000 files per directory on average with this
- scheme which should keep most file systems happy and avoid some performance
- issues when there are too many files in a single directory.
-
- For example:
- >>> expected = ('f', 'b', '87db2bb28e9501ac7fdc4812782118f4c94a0f')
- >>> assert expected == keys_from_hash('fb87db2bb28e9501ac7fdc4812782118f4c94a0f')
- """
- if on_linux:
- hexdigest = bytes(hexdigest)
- return hexdigest[0], hexdigest[1], hexdigest[2:]
-
-
-def paths_from_keys(base_path, keys):
- """
- Return a tuple of (parent dir path, filename) for a cache entry built from a cache
- keys triple and a base_directory. Ensure that the parent directory exist.
- """
- if on_linux:
- keys = [path_to_bytes(k) for k in keys]
- base_path = path_to_bytes(base_path)
- else:
- keys = [path_to_unicode(k) for k in keys]
- base_path = path_to_unicode(base_path)
-
- dir1, dir2, file_name = keys
- parent = os.path.join(base_path, dir1, dir2)
- fileutils.create_dir(parent)
- return parent, file_name
-
-
-class ScanFileCache(object):
- """
- A file-based cache for scan results saving results in files and using no locking.
- This is NOT thread-safe and NOT multi-process safe but works OK in our context:
- we cache the scan for a given file once and read it only a few times.
- """
- def __init__(self, cache_dir):
- # subdirs for info and scans_dir caches
- if on_linux:
- infos_dir = b'infos_dir/'
- scans_dir = b'scans_dir/'
- files_log = b'files_log'
- self.cache_base_dir = path_to_bytes(cache_dir)
-
- else:
- infos_dir = u'infos_dir/'
- scans_dir = u'scans_dir/'
- files_log = u'files_log'
- self.cache_base_dir = cache_dir
-
- self.cache_infos_dir = as_posixpath(os.path.join(self.cache_base_dir, infos_dir))
- self.cache_scans_dir = as_posixpath(os.path.join(self.cache_base_dir, scans_dir))
- self.cache_files_log = as_posixpath(os.path.join(self.cache_base_dir, files_log))
-
- def setup(self):
- """
- Setup the cache: must be called at least once globally after cache
- initialization.
- """
- fileutils.create_dir(self.cache_infos_dir)
- fileutils.create_dir(self.cache_scans_dir)
-
- @classmethod
- def log_file_path(cls, logfile_fd, path):
- """
- Log file path in the cache logfile_fd **opened** file descriptor.
- """
- # we dump one path per line written as bytes or unicode
- if on_linux:
- path = path_to_bytes(path) + b'\n'
- else:
- path = path_to_unicode(path) + '\n'
- logfile_fd.write(path)
-
- def get_cached_info_path(self, path):
- """
- Return the path where to store a file info in the cache given a path.
- """
- keys = info_keys(path)
- paths = paths_from_keys(self.cache_infos_dir, keys)
- return posixpath.join(*paths)
-
- def put_info(self, path, file_info):
- """
- Put file_info for path in the cache and return True if the file referenced
- in file_info has already been scanned or False otherwise.
- """
- info_path = self.get_cached_info_path(path)
- with codecs.open(info_path, 'wb', encoding='utf-8') as cached_infos:
- json.dump(file_info, cached_infos, check_circular=False)
- scan_path = self.get_cached_scan_path(path, file_info)
- is_scan_cached = os.path.exists(scan_path)
- if TRACE:
- logger_debug('put_infos:', 'path:', path, 'is_scan_cached:', is_scan_cached, 'file_info:', file_info, '\n')
- return is_scan_cached
-
- def get_info(self, path):
- """
- Return file info from the cache for a path.
- Return None on failure to find the info in the cache.
- """
- info_path = self.get_cached_info_path(path)
- if os.path.exists(info_path):
- with codecs.open(info_path, 'r', encoding='utf-8') as ci:
- return json.load(ci, object_pairs_hook=OrderedDict)
-
- def get_cached_scan_path(self, path, file_info):
- """
- Return the path where to store a scan in the cache given a path and file_info.
- """
- keys = scan_keys(path, file_info)
- paths = paths_from_keys(self.cache_scans_dir, keys)
- return posixpath.join(*paths)
-
- def put_scan(self, path, file_info, scan_result):
- """
- Put scan_result in the cache if not already cached.
- """
- scan_path = self.get_cached_scan_path(path, file_info)
- if not os.path.exists(scan_path):
- with codecs.open(scan_path, 'wb', encoding='utf-8') as cached_scan:
- json.dump(scan_result, cached_scan, check_circular=False)
- if TRACE:
- logger_debug('put_scan:', 'scan_path:', scan_path, 'file_info:', file_info, 'scan_result:', scan_result, '\n')
-
- def get_scan(self, path, file_info):
- """
- Return scan results from the cache for a path and file_info.
- Return None on failure to find the scan results in the cache.
- """
- scan_path = self.get_cached_scan_path(path, file_info)
- if os.path.exists(scan_path):
- with codecs.open(scan_path, 'r', encoding='utf-8') as cached_scan:
- return json.load(cached_scan, object_pairs_hook=OrderedDict)
-
- def iterate(self, scan_names, root_dir=None, paths_subset=tuple()):
- """
- Yield scan data for all cached scans e.g. the whole cache given
- a list of scan names.
- If a `paths_subset` sequence of paths is provided, then only
- these paths are iterated.
-
- The logfile MUST have been closed before calling this method.
- """
- if on_linux:
- paths_subset = set(path_to_bytes(p) for p in paths_subset)
- else:
- paths_subset = set(path_to_unicode(p) for p in paths_subset)
-
- if on_linux:
- log_opener = partial(open, self.cache_files_log, 'rb')
- else:
- log_opener = partial(codecs.open, self.cache_files_log, 'rb', encoding='utf-8')
- EOL = b'\n' if on_linux else '\n'
-
- with log_opener() as cached_files:
- # iterate paths, one by line
- for file_log in cached_files:
- # must be unicode
- path = file_log.rstrip(EOL)
- if paths_subset and path not in paths_subset:
- continue
- file_info = self.get_info(path)
-
- if on_linux:
- unicode_path = path_to_unicode(path)
- else:
- unicode_path = path
-
- if root_dir:
- # must be unicode
- if on_linux:
- root_dir = path_to_unicode(root_dir)
- rooted_path = posixpath.join(root_dir, unicode_path)
- else:
- rooted_path = unicode_path
- rooted_path = fileutils.as_posixpath(rooted_path)
- logger_debug('iterate:', 'rooted_path:', rooted_path)
-
- # rare but possible corner case
- if file_info is None:
- no_info = ('ERROR: file info unavailable in cache: '
- 'This is either a bug or processing was aborted with CTRL-C.')
- scan_result = OrderedDict(path=rooted_path)
- scan_result['scan_errors'] = [no_info]
- if TRACE:
- logger_debug('iterate:', 'scan_result:', scan_result, 'for path:', rooted_path, '\n')
- yield scan_result
- continue
-
- _unicode_path_from_file_info = file_info.pop('path')
- scan_result = OrderedDict(path=rooted_path)
-
- if 'infos' in scan_names:
- # info are always collected but only returned if requested
- # we flatten these as direct attributes of a file object
- scan_result.update(file_info.items())
-
- if not scan_result.get('scan_errors'):
- scan_result['scan_errors'] = []
-
- # check if we have more than just infos
- if ['infos'] != scan_names:
- errors = scan_result['scan_errors']
- scan_details = self.get_scan(path, file_info)
- if scan_details is None:
- no_scan_details = (
- 'ERROR: scan details unavailable in cache: '
- 'This is either a bug or processing was aborted with CTRL-C.')
- errors.append(no_scan_details)
- else:
- # append errors to other top level errors if any
- scan_errors = scan_details.pop('scan_errors', [])
- errors.extend(scan_errors)
- scan_result.update(scan_details)
-
- if TRACE:
- logger_debug('iterate:', 'scan_result:', scan_result, 'for path:', rooted_path, '\n')
- yield scan_result
-
- def clear(self, *args):
- """
- Purge the cache by deleting the corresponding cached data files.
- """
- fileutils.delete(self.cache_base_dir)
diff --git a/src/scancode/cli.py b/src/scancode/cli.py
index 304bf2f7a16..161f15f92cd 100644
--- a/src/scancode/cli.py
+++ b/src/scancode/cli.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
@@ -23,181 +23,164 @@
# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
from __future__ import absolute_import
-from __future__ import print_function
from __future__ import division
+from __future__ import print_function
from __future__ import unicode_literals
-# Import early because this import has monkey-patching side effects
+# Import first because this import has monkey-patching side effects
from scancode.pool import get_pool
-import codecs
from collections import OrderedDict
from functools import partial
from itertools import imap
-import os
-from os.path import expanduser
-from os.path import abspath
import sys
from time import time
import traceback
-from types import GeneratorType
+import attr
import click
click.disable_unicode_literals_warning = True
-from click.termui import style
-
-from commoncode import filetype
-from commoncode import fileutils
-from commoncode.fileutils import path_to_bytes
-from commoncode.fileutils import path_to_unicode
-from commoncode import ignore
-from commoncode.system import on_linux
-from commoncode.text import toascii
-
-import plugincode.output
-import plugincode.post_scan
-import plugincode.pre_scan
-
-from scancode import __version__ as version
-
-from scancode.api import DEJACODE_LICENSE_URL
-from scancode.api import _empty_file_infos
-from scancode.api import get_copyrights
-from scancode.api import get_emails
-from scancode.api import get_file_infos
-from scancode.api import get_licenses
-from scancode.api import get_package_infos
-from scancode.api import get_urls
-from scancode.api import Resource
-
-from scancode.cache import get_scans_cache_class
-from scancode.cache import ScanFileCache
+# import early
+from scancode_config import __version__ as scancode_version
+from scancode_config import scancode_cache_dir
+from scancode_config import scancode_temp_dir
+
+from commoncode.fileutils import PATH_TYPE
+from commoncode.timeutils import time2tstamp
+
+from plugincode import CommandLineOption
+from plugincode import PluginManager
+
+# these are important to register plugin managers
+from plugincode import pre_scan
+from plugincode import scan
+from plugincode import post_scan
+from plugincode import output_filter
+from plugincode import output
+
+from scancode import CORE_GROUP
+from scancode import DOC_GROUP
+from scancode import MISC_GROUP
+from scancode import OTHER_SCAN_GROUP
+from scancode import OUTPUT_GROUP
+from scancode import OUTPUT_FILTER_GROUP
+from scancode import OUTPUT_CONTROL_GROUP
+from scancode import POST_SCAN_GROUP
+from scancode import PRE_SCAN_GROUP
+from scancode import SCAN_GROUP
+from scancode import SCAN_OPTIONS_GROUP
+from scancode import notice
+from scancode import print_about
+from scancode import Scanner
+from scancode import validate_option_dependencies
from scancode.interrupt import DEFAULT_TIMEOUT
from scancode.interrupt import fake_interruptible
from scancode.interrupt import interruptible
-from scancode.interrupt import TimeoutError
-
+from scancode.resource import Codebase
+from scancode.resource import Resource
from scancode.utils import BaseCommand
-from scancode.utils import compute_fn_max_len
-from scancode.utils import fixed_width_file_name
+from scancode.utils import path_progress_message
from scancode.utils import progressmanager
-
-echo_stderr = partial(click.secho, err=True)
-
-
# Python 2 and 3 support
try:
# Python 2
unicode
str_orig = str
- bytes = str
- str = unicode
+ bytes = str # NOQA
+ str = unicode # NOQA
except NameError:
# Python 3
- unicode = str
-
-
-# this will init the plugins
-plugincode.pre_scan.initialize()
-plugincode.output.initialize()
-plugincode.post_scan.initialize()
-
-
-info_text = '''
-ScanCode scans code and other files for origin and license.
-Visit https://github.com/nexB/scancode-toolkit/ for support and download.
-
-'''
+ unicode = str # NOQA
-notice_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'NOTICE')
-notice_text = open(notice_path).read()
+# Tracing flags
+TRACE = False
+TRACE_DEEP = False
-delimiter = '\n\n\n'
-[notice_text, extra_notice_text] = notice_text.split(delimiter, 1)
-extra_notice_text = delimiter + extra_notice_text
-delimiter = '\n\n '
-[notice_text, acknowledgment_text] = notice_text.split(delimiter, 1)
-acknowledgment_text = delimiter + acknowledgment_text
+def logger_debug(*args):
+ pass
-notice = acknowledgment_text.strip().replace(' ', '')
-# CLI help groups
-SCANS = 'scans'
-OUTPUT = 'output'
-PRE_SCAN = 'pre-scan'
-POST_SCAN = 'post-scan'
-MISC = 'misc'
-CORE = 'core'
+if TRACE or TRACE_DEEP:
+ import logging
+ logger = logging.getLogger(__name__)
+ logging.basicConfig(stream=sys.stdout)
+ logger.setLevel(logging.DEBUG)
-def print_about(ctx, param, value):
- if not value or ctx.resilient_parsing:
- return
- click.echo(info_text + notice_text + acknowledgment_text + extra_notice_text)
- ctx.exit()
+ def logger_debug(*args):
+ return logger.debug(' '.join(isinstance(a, unicode)
+ and a or repr(a) for a in args))
+echo_stderr = partial(click.secho, err=True)
+# FIXME: this should be pushed out in some external help or pushed down in plugins.
+# FIXME: the glob story is very weird!!!
examples_text = '''
Scancode command lines examples:
(Note for Windows: use '\\' back slash instead of '/' forward slash for paths.)
-Scan the 'samples' directory for licenses and copyrights. Save scan results to
-an HTML app file for interactive scan results navigation. When the scan is done,
-open 'scancode_result.html' in your web browser. Note that additional app files
-are saved in a directory named 'scancode_result_files':
-
- scancode --format html-app samples/ scancode_result.html
-
-Scan a directory for licenses and copyrights. Save scan results to an
-HTML file:
-
- scancode --format html samples/zlib scancode_result.html
-
Scan a single file for copyrights. Print scan results to stdout as JSON:
- scancode --copyright samples/zlib/zlib.h
+ scancode --copyright samples/zlib/zlib.h --json
Scan a single file for licenses, print verbose progress to stderr as each
file is scanned. Save scan to a JSON file:
- scancode --license --verbose samples/zlib/zlib.h licenses.json
+ scancode --license --verbose samples/zlib/zlib.h --json licenses.json
Scan a directory explicitly for licenses and copyrights. Redirect JSON scan
results to a file:
- scancode -f json -l -c samples/zlib/ > scan.json
+ scancode --json -l -c samples/zlib/ > scan.json
-Scan a directory while ignoring a single file. Print scan results to stdout as JSON:
+Scan a directory while ignoring a single file.
+Print scan results to stdout as JSON:
- scancode --ignore README samples/
+ scancode --json --ignore README samples/
-Scan a directory while ignoring all files with txt extension. Print scan results to
-stdout as JSON (It is recommended to use quoted glob patterns to prevent pattern
-expansion by the shell):
+Scan a directory while ignoring all files with .txt extension.
+Print scan results to stdout as JSON.
+It is recommended to use quotes around glob patterns to prevent pattern
+expansion by the shell:
- scancode --ignore "*.txt" samples/
+ scancode --json --ignore "*.txt" samples/
Special characters supported in GLOB pattern:
-* matches everything
-? matches any single character
-[seq] matches any character in seq
-[!seq] matches any character not in seq
+- * matches everything
+- ? matches any single character
+- [seq] matches any character in seq
+- [!seq] matches any character not in seq
+
+For a literal match, wrap the meta-characters in brackets.
+For example, '[?]' matches the character '?'.
+For details on GLOB patterns see https://en.wikipedia.org/wiki/Glob_(programming).
+
+Note: Glob patterns cannot be applied to path as strings.
+For example, this will not ignore "samples/JGroups/licenses".
+
+ scancode --json --ignore "samples*licenses" samples/
-For a literal match, wrap the meta-characters in brackets. For example, '[?]' matches the character '?'.
-For glob see https://en.wikipedia.org/wiki/Glob_(programming).
-Note: Glob patterns cannot be applied to path as strings, for e.g.
- scancode --ignore "samples*licenses" samples/
-will not ignore "samples/JGroups/licenses".
+Scan a directory while ignoring multiple files (or glob patterns).
+Print the scan results to stdout as JSON:
+
+ scancode --json --ignore README --ignore "*.txt" samples/
+
+Scan the 'samples' directory for licenses and copyrights. Save scan results to
+an HTML app file for interactive scan results navigation. When the scan is done,
+open 'scancode_result.html' in your web browser. Note that additional app files
+are saved in a directory named 'scancode_result_files':
+
+ scancode --output-html-app scancode_result.html samples/
-Scan a directory while ignoring multiple files (or glob patterns). Print the scan
-results to stdout as JSON:
+Scan a directory for licenses and copyrights. Save scan results to an
+HTML file:
- scancode --ignore README --ignore "*.txt" samples/
+ scancode --output-html scancode_result.html samples/zlib
To extract archives, see the 'extractcode' command instead.
'''
@@ -213,728 +196,1128 @@ def print_examples(ctx, param, value):
def print_version(ctx, param, value):
if not value or ctx.resilient_parsing:
return
- click.echo('ScanCode version ' + version)
- ctx.exit()
-
-
-def reindex_licenses(ctx, param, value):
- if not value or ctx.resilient_parsing:
- return
- from licensedcode import cache
- click.echo('Checking and rebuilding the license index...')
- cache.reindex()
- click.echo('Done.')
+ click.echo('ScanCode version ' + scancode_version)
ctx.exit()
+# FIXME: this should be pushed out in some external help or pushed down in plugins.
epilog_text = '''Examples (use --examples for more):
\b
Scan the 'samples' directory for licenses and copyrights.
-Save scan results to a JSON file:
+Save scan results to the 'scancode_result.json' JSON file:
- scancode --format json samples scancode_result.json
+ scancode --license --copyright --json=scancode_result.json samples
\b
-Scan the 'samples' directory for licenses and copyrights. Save scan results to
-an HTML app file for interactive web browser results navigation. Additional app
-files are saved to the 'myscan_files' directory:
+Scan the 'samples' directory for licenses and package manifests. Print scan
+results on screen as pretty-formatted JSON (using the special '-' FILE to print
+to on screen/to stdout):
- scancode --format html-app samples myscan.html
+ scancode --json-pp - --license --package samples
Note: when you run scancode, a progress bar is displayed with a counter of the
number of files processed. Use --verbose to display file-by-file progress.
'''
+
class ScanCommand(BaseCommand):
+ """
+ A command class that is aware of ScanCode options that provides enhanced
+ help where each option is grouped by group.
+ """
+
short_usage_help = '''
Try 'scancode --help' for help on options and arguments.'''
- def __init__(self, name, context_settings=None, callback=None,
- params=None, help=None, epilog=None, short_help=None,
- options_metavar='[OPTIONS]', add_help_option=True):
+ def __init__(self, name, context_settings=None, callback=None, params=None,
+ help=None, # NOQA
+ epilog=None, short_help=None,
+ options_metavar='[OPTIONS]', add_help_option=True,
+ plugin_options=()):
+ """
+ Create a new ScanCommand using the `plugin_options` list of
+ CommandLineOption instances.
+ """
+
super(ScanCommand, self).__init__(name, context_settings, callback,
- params, help, epilog, short_help, options_metavar, add_help_option)
-
- for name, callback in plugincode.post_scan.get_post_scan_plugins().items():
- # normalize white spaces in help.
- help_text = ' '.join(callback.__doc__.split())
- option = ScanOption(('--' + name,), is_flag=True, help=help_text, group=POST_SCAN)
- self.params.append(option)
- for name, plugin in plugincode.pre_scan.get_pre_scan_plugins().items():
- attrs = plugin.option_attrs
- attrs['default'] = None
- attrs['group'] = PRE_SCAN
- attrs['help'] = ' '.join(plugin.__doc__.split())
- option = ScanOption(('--' + name,), **attrs)
- self.params.append(option)
+ params, help, epilog, short_help, options_metavar, add_help_option)
+
+ # this makes the options "known" to the command
+ self.params.extend(plugin_options)
def format_options(self, ctx, formatter):
"""
- Overridden from click.Command to write all options into the formatter in groups
- they belong to. If a group is not specified, add the option to MISC group.
+ Overridden from click.Command to write all options into the formatter in
+ help_groups they belong to. If a group is not specified, add the option
+ to MISC_GROUP group.
"""
- groups = OrderedDict([
- (SCANS, []),
- (OUTPUT, []),
- (PRE_SCAN, []),
- (POST_SCAN, []),
- (MISC, []),
- (CORE, []),
+ # this mapping defines the CLI help presentation order
+ help_groups = OrderedDict([
+ (SCAN_GROUP, []),
+ (OTHER_SCAN_GROUP, []),
+ (SCAN_OPTIONS_GROUP, []),
+ (OUTPUT_GROUP, []),
+ (OUTPUT_FILTER_GROUP, []),
+ (OUTPUT_CONTROL_GROUP, []),
+ (PRE_SCAN_GROUP, []),
+ (POST_SCAN_GROUP, []),
+ (CORE_GROUP, []),
+ (MISC_GROUP, []),
+ (DOC_GROUP, []),
])
for param in self.get_params(ctx):
# Get the list of option's name and help text
help_record = param.get_help_record(ctx)
- if help_record:
- if getattr(param, 'group', None):
- groups[param.group].append(help_record)
- else:
- groups['misc'].append(help_record)
+ if not help_record:
+ continue
+ # organize options by group
+ help_group = getattr(param, 'help_group', MISC_GROUP)
+ sort_order = getattr(param, 'sort_order', 100)
+ help_groups[help_group].append((sort_order, help_record))
with formatter.section('Options'):
- for group, option in groups.items():
- if option:
- with formatter.section(group):
- formatter.write_dl(option)
+ for group, help_records in help_groups.items():
+ if not help_records:
+ continue
+ with formatter.section(group):
+ sorted_records = [help_record for _, help_record in sorted(help_records)]
+ formatter.write_dl(sorted_records)
+
+
+try:
+ # IMPORTANT: this discovers, loads and validates all available plugins
+ plugin_classes, plugin_options = PluginManager.load_plugins()
+except ImportError as e:
+ echo_stderr('========================================================================')
+ echo_stderr('ERROR: Unable to import ScanCode plugins.'.upper())
+ echo_stderr('Check your installation configuration (setup.py) or re-install/re-configure ScanCode.')
+ echo_stderr('The following plugin(s) are referenced and cannot be loaded/imported:')
+ echo_stderr(str(e), color='red')
+ echo_stderr('========================================================================')
+    raise
+
+
+def print_plugins(ctx, param, value):
+ if not value or ctx.resilient_parsing:
+ return
+ for plugin_cls in sorted(plugin_classes, key=lambda pc: (pc.stage, pc.name)):
+ click.echo('--------------------------------------------')
+ click.echo('Plugin: scancode_{self.stage}:{self.name}'.format(self=plugin_cls), nl=False)
+ click.echo(' class: {self.__module__}:{self.__name__}'.format(self=plugin_cls))
+ if hasattr(plugin_cls, 'requires'):
+ requires = ', '.join(plugin_cls.requires)
+ click.echo(' requires: {}'.format(requires), nl=False)
+ click.echo(' doc: {self.__doc__}'.format(self=plugin_cls))
+        click.echo('  options:')
+ for option in plugin_cls.options:
+ name = option.name
+ opts = ', '.join(option.opts)
+ help_group = option.help_group
+ help_txt = option.help # noqa
+ click.echo(' help_group: {help_group!s}, name: {name!s}: {opts}\n help: {help_txt!s}'.format(**locals()))
+ click.echo('')
+ ctx.exit()
-class ScanOption(click.Option):
- """
- Allow an extra param `group` to be set which can be used
- to determine to which group the option belongs.
- """
- def __init__(self, param_decls=None, show_default=False,
- prompt=False, confirmation_prompt=False,
- hide_input=False, is_flag=None, flag_value=None,
- multiple=False, count=False, allow_from_autoenv=True,
- type=None, help=None, group=None, **attrs):
- super(ScanOption, self).__init__(param_decls, show_default,
- prompt, confirmation_prompt,
- hide_input, is_flag, flag_value,
- multiple, count, allow_from_autoenv, type, help, **attrs)
- self.group = group
+@click.command(name='scancode',
+ epilog=epilog_text,
+ cls=ScanCommand,
+ plugin_options=plugin_options)
+@click.pass_context
-def validate_formats(ctx, param, value):
+# ensure that the input path is bytes on Linux, unicode elsewhere
+@click.argument('input', metavar='