Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

analytics: refactor into a module #2826

Merged
merged 46 commits into from Dec 9, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
ef1f038
analytics: refactor into a module
Nov 20, 2019
b618d03
analytics: return OS when system info not supported
Nov 26, 2019
7d3cbd7
analytics: collect system info under the same module
Nov 26, 2019
64d102c
analytics: move user_id module to a method
Nov 26, 2019
2e7a6d3
:nail_care: sort methods
Nov 26, 2019
5cb437b
analytics: move to single file
Nov 26, 2019
610d1ee
analytics: go back to daemon implementation
Nov 26, 2019
d958870
analytics: exit_code -> return_code
Nov 26, 2019
34d3004
:face_palm: fix wrong module
Nov 26, 2019
5261ed7
tests: analytics.find_or_create_user_id
Nov 26, 2019
467b110
py2: FileNotFoundError -> IOError
Nov 27, 2019
48b4cba
:nail_care: change naming and docstring
Nov 27, 2019
6bc2403
:nail_care: black
Nov 27, 2019
f995204
tests: functional and unit tests for analytics
Nov 27, 2019
2a5cac5
:nail_care: deepsource
Nov 27, 2019
e9d56a9
:nail_care: sort imports
Nov 27, 2019
787ea9c
tests: set temporary global config
Nov 27, 2019
e69fffa
py2 compat issues :shrug:
Nov 28, 2019
656f986
:nail_care: correct wording "disenabled" -> "disabled"
Nov 28, 2019
946dfc1
analytics: send report without loading it to memory
Dec 2, 2019
5596015
analytics: document why tmp_global_config is needed
Dec 2, 2019
2c8e149
analytics: use fspath instead of str
Dec 3, 2019
ca485db
analytics: define report schema and use it on tests
Dec 3, 2019
e0d47a1
:nail_care: formatting
Dec 3, 2019
8b5ca25
analytics: move report schema to tests
Dec 3, 2019
8a32b28
analytics: collect and send on daemon
Dec 3, 2019
624cbb2
:nail_care: more specific comment about analytics
Dec 3, 2019
285620d
tests: mock analytics.collect while testing daemon
Dec 3, 2019
02eaa9a
py35: support for fspath(pathlib.Path())
Dec 4, 2019
ede0103
py2: use convert_to_unicode instead of str
Dec 5, 2019
7a3aae1
tests: add unit test for analytics.system_info
Dec 5, 2019
05a6868
tests: isolate global config from analytics tests
Dec 5, 2019
71ead3f
analytics: use a tempfile for inter-process communication
Dec 5, 2019
7e8bdbd
remove pathlib / fspath changes related to the patch
Dec 6, 2019
bc2471c
tests: adjust scm_class schema
Dec 6, 2019
c82d9dc
compat: bring back unicode literals
Dec 6, 2019
f704e54
tests: stringify tmp_global_config since it doesnt return a pathlike …
Dec 6, 2019
da064e2
analytics: remove the report after sending it
Dec 6, 2019
5a80334
tests: use str, builtin_str in schema
Dec 6, 2019
2145daa
analytics: define private methods
Dec 6, 2019
c50bd17
analytics: collect execution info only when available
Dec 8, 2019
146e75b
analytics: raise error when collecting a not supported os
Dec 8, 2019
c9f958d
analytics: AttributeError -> KeyError
Dec 8, 2019
3244f10
:nail_care: add dot to the end of the comment
Dec 8, 2019
6e4c3d6
tests: require keys on analytics report schema
Dec 8, 2019
5dd6300
:nail_care: black
Dec 8, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
327 changes: 119 additions & 208 deletions dvc/analytics.py
Original file line number Diff line number Diff line change
@@ -1,256 +1,167 @@
"""Collect and send usage analytics"""
from __future__ import unicode_literals
efiop marked this conversation as resolved.
Show resolved Hide resolved

import errno
import json
import logging
import os
import platform
import requests
import sys
import tempfile
import uuid

import distro

from dvc import __version__
from dvc.utils import env2bool
from dvc.utils.compat import str
from dvc.config import Config, to_bool
from dvc.daemon import daemon
from dvc.exceptions import NotDvcRepoError
from dvc.lock import Lock, LockError
from dvc.repo import Repo
from dvc.scm import SCM
from dvc.utils import env2bool, is_binary, makedirs
efiop marked this conversation as resolved.
Show resolved Hide resolved
from dvc.utils.compat import str, FileNotFoundError


logger = logging.getLogger(__name__)


class Analytics(object):
"""Class for collecting and sending usage analytics.

Args:
info (dict): optional existing analytics report.
def collect_and_send_report(args=None, return_code=None):
"""
Collect information from the runtime/environment and the command
being executed into a report and send it over the network.

URL = "https://analytics.dvc.org"
TIMEOUT_POST = 5
To prevent analytics from blocking the execution of the main thread,
sending the report is done in a separate process.

USER_ID_FILE = "user_id"
The inter-process communication happens through a file containing the
report as a JSON, where the _collector_ generates it and the _sender_
removes it after sending it.
"""
report = _runtime_info()

PARAM_DVC_VERSION = "dvc_version"
PARAM_USER_ID = "user_id"
PARAM_SYSTEM_INFO = "system_info"
# Include command execution information on the report only when available.
if args and hasattr(args, "func"):
report.update({"cmd_class": args.func.__name__})

PARAM_OS = "os"
if return_code is not None:
report.update({"cmd_return_code": return_code})

PARAM_WINDOWS_VERSION_MAJOR = "windows_version_major"
PARAM_WINDOWS_VERSION_MINOR = "windows_version_minor"
PARAM_WINDOWS_VERSION_BUILD = "windows_version_build"
PARAM_WINDOWS_VERSION_SERVICE_PACK = "windows_version_service_pack"
with tempfile.NamedTemporaryFile(delete=False, mode="w") as fobj:
json.dump(report, fobj)
daemon(["analytics", fobj.name])

PARAM_MAC_VERSION = "mac_version"

PARAM_LINUX_DISTRO = "linux_distro"
PARAM_LINUX_DISTRO_VERSION = "linux_distro_version"
PARAM_LINUX_DISTRO_LIKE = "linux_distro_like"
def is_enabled():
if env2bool("DVC_TEST"):
return False

PARAM_SCM_CLASS = "scm_class"
PARAM_IS_BINARY = "is_binary"
PARAM_CMD_CLASS = "cmd_class"
PARAM_CMD_RETURN_CODE = "cmd_return_code"
enabled = to_bool(
Config(validate=False)
.config.get(Config.SECTION_CORE, {})
.get(Config.SECTION_CORE_ANALYTICS, "true")
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems ridiculous that we need to make this so hard. Why can't we just do:

enabled = Config().config['core']['analytics']

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIRC, in the current form we can only rely on `core` being present if the config goes through the validator, since the validator is what sets the default values.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't get what's wrong with my approach. It uses the validator, which will create the `core` section and a default for `analytics`.


def __init__(self, info=None):
from dvc.config import Config
from dvc.lock import Lock
logger.debug("Analytics is {}abled.".format("en" if enabled else "dis"))

if info is None:
info = {}
return enabled

self.info = info

cdir = Config.get_global_config_dir()
try:
os.makedirs(cdir)
except OSError as exc:
if exc.errno != errno.EEXIST:
raise
def send(report):
"""
Side effect: Removes the report after sending it.

self.user_id_file = os.path.join(cdir, self.USER_ID_FILE)
self.user_id_file_lock = Lock(self.user_id_file + ".lock")
The report is generated and stored in a temporary file, see:
`collect_and_send_report`. Sending happens on another process,
thus, the need of removing such file afterwards.
"""
url = "https://analytics.dvc.org"
headers = {"content-type": "application/json"}

@staticmethod
def load(path):
"""Loads analytics report from json file specified by path.
with open(report, "rb") as fobj:
requests.post(url, data=fobj, headers=headers, timeout=5)

Args:
path (str): path to json file with analytics report.
"""
with open(path, "r") as fobj:
analytics = Analytics(info=json.load(fobj))
os.unlink(path)
return analytics
os.remove(report)

def _write_user_id(self):
import uuid

with open(self.user_id_file, "w+") as fobj:
user_id = str(uuid.uuid4())
info = {self.PARAM_USER_ID: user_id}
json.dump(info, fobj)
return user_id
def _scm_in_use():
    """
    Name of the SCM class for the repository located by `Repo.find_root`.

    Returns None when `NotDvcRepoError` is raised while locating the
    repository root or instantiating the SCM.
    """
    try:
        root_dir = Repo.find_root()
        scm_class = type(SCM(root_dir=root_dir))
    except NotDvcRepoError:
        # No SCM to report in this case; the caller receives None.
        return None

    return scm_class.__name__

def _read_user_id(self):
if not os.path.exists(self.user_id_file):
return None

with open(self.user_id_file, "r") as fobj:
try:
info = json.load(fobj)
except ValueError as exc:
logger.debug("Failed to load user_id: {}".format(exc))
return None

return info[self.PARAM_USER_ID]

def _get_user_id(self):
from dvc.lock import LockError
def _runtime_info():
    """
    Build the environment part of the analytics report.

    Returns a dict with the DVC version, the result of `is_binary()`,
    the SCM class name in use, the system information and the user ID.
    """
    # Values are gathered in the same order in which the report lists them.
    version = __version__
    binary = is_binary()
    scm_class = _scm_in_use()
    system = _system_info()
    user_id = _find_or_create_user_id()

    return {
        "dvc_version": version,
        "is_binary": binary,
        "scm_class": scm_class,
        "system_info": system,
        "user_id": user_id,
    }

try:
with self.user_id_file_lock:
user_id = self._read_user_id()
if user_id is None:
user_id = self._write_user_id()
return user_id
except LockError:
msg = "Failed to acquire '{}'"
logger.debug(msg.format(self.user_id_file_lock.lockfile))

def _collect_windows(self):
import sys

version = sys.getwindowsversion() # pylint: disable=no-member
info = {}
info[self.PARAM_OS] = "windows"
info[self.PARAM_WINDOWS_VERSION_MAJOR] = version.major
info[self.PARAM_WINDOWS_VERSION_MINOR] = version.minor
info[self.PARAM_WINDOWS_VERSION_BUILD] = version.build
info[self.PARAM_WINDOWS_VERSION_SERVICE_PACK] = version.service_pack
return info

def _collect_darwin(self):
import platform

info = {}
info[self.PARAM_OS] = "mac"
info[self.PARAM_MAC_VERSION] = platform.mac_ver()[0]
return info

def _collect_linux(self):
import distro

info = {}
info[self.PARAM_OS] = "linux"
info[self.PARAM_LINUX_DISTRO] = distro.id()
info[self.PARAM_LINUX_DISTRO_VERSION] = distro.version()
info[self.PARAM_LINUX_DISTRO_LIKE] = distro.like()
return info

def _collect_system_info(self):
import platform

system = platform.system()
def _system_info():
system = platform.system()

if system == "Windows":
return self._collect_windows()
if system == "Windows":
version = sys.getwindowsversion()

if system == "Darwin":
return self._collect_darwin()

if system == "Linux":
return self._collect_linux()

raise NotImplementedError

def collect(self):
"""Collect analytics report."""
from dvc.scm import SCM
from dvc.utils import is_binary
from dvc.repo import Repo
from dvc.exceptions import NotDvcRepoError

self.info[self.PARAM_DVC_VERSION] = __version__
self.info[self.PARAM_IS_BINARY] = is_binary()
self.info[self.PARAM_USER_ID] = self._get_user_id()

self.info[self.PARAM_SYSTEM_INFO] = self._collect_system_info()

try:
scm = SCM(root_dir=Repo.find_root())
self.info[self.PARAM_SCM_CLASS] = type(scm).__name__
except NotDvcRepoError:
pass

def collect_cmd(self, args, ret):
"""Collect analytics info from a CLI command."""
from dvc.command.daemon import CmdDaemonAnalytics

assert isinstance(ret, int) or ret is None

if ret is not None:
self.info[self.PARAM_CMD_RETURN_CODE] = ret

if args is not None and hasattr(args, "func"):
assert args.func != CmdDaemonAnalytics
self.info[self.PARAM_CMD_CLASS] = args.func.__name__

def dump(self):
"""Save analytics report to a temporary file.

Returns:
str: path to the temporary file that contains the analytics report.
"""
import tempfile
return {
"os": "windows",
"windows_version_build": version.build,
"windows_version_major": version.major,
"windows_version_minor": version.minor,
"windows_version_service_pack": version.service_pack,
}

with tempfile.NamedTemporaryFile(delete=False, mode="w") as fobj:
json.dump(self.info, fobj)
return fobj.name
if system == "Darwin":
return {"os": "mac", "mac_version": platform.mac_ver()[0]}

@staticmethod
def is_enabled(cmd=None):
from dvc.config import Config, to_bool
from dvc.command.daemon import CmdDaemonBase
if system == "Linux":
return {
"os": "linux",
"linux_distro": distro.id(),
"linux_distro_like": distro.like(),
"linux_distro_version": distro.version(),
}

if env2bool("DVC_TEST"):
return False
# We don't collect data for any other system.
raise NotImplementedError

if isinstance(cmd, CmdDaemonBase):
return False

core = Config(validate=False).config.get(Config.SECTION_CORE, {})
enabled = to_bool(core.get(Config.SECTION_CORE_ANALYTICS, "true"))
logger.debug(
"Analytics is {}.".format("enabled" if enabled else "disabled")
)
return enabled
def _find_or_create_user_id():
"""
The user's ID is stored on a file under the global config directory.

@staticmethod
def send_cmd(cmd, args, ret):
"""Collect and send analytics for CLI command.
The file should contain a JSON with a "user_id" key:

Args:
args (list): parsed args for the CLI command.
ret (int): return value of the CLI command.
"""
from dvc.daemon import daemon
{"user_id": "16fd2706-8baf-433b-82eb-8c7fada847da"}

if not Analytics.is_enabled(cmd):
return
IDs are generated randomly with UUID.
"""
config_dir = Config.get_global_config_dir()
fname = os.path.join(config_dir, "user_id")
lockfile = os.path.join(config_dir, "user_id.lock")

analytics = Analytics()
analytics.collect_cmd(args, ret)
daemon(["analytics", analytics.dump()])
# Since the `fname` and `lockfile` are under the global config,
# we need to make sure such directory exist already.
makedirs(config_dir, exist_ok=True)

def send(self):
"""Collect and send analytics."""
import requests
try:
with Lock(lockfile):
try:
with open(fname, "r") as fobj:
user_id = json.load(fobj)["user_id"]

if not self.is_enabled():
return
except (FileNotFoundError, ValueError, KeyError):
user_id = str(uuid.uuid4())

self.collect()
with open(fname, "w") as fobj:
json.dump({"user_id": user_id}, fobj)

logger.debug("Sending analytics: {}".format(self.info))
return user_id

try:
requests.post(self.URL, json=self.info, timeout=self.TIMEOUT_POST)
except requests.exceptions.RequestException as exc:
logger.debug("Failed to send analytics: {}".format(str(exc)))
except LockError:
logger.debug("Failed to acquire {lockfile}".format(lockfile=lockfile))
5 changes: 2 additions & 3 deletions dvc/command/daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,9 @@ def run(self):

class CmdDaemonAnalytics(CmdDaemonBase):
def run(self):
from dvc.analytics import Analytics
from dvc import analytics

analytics = Analytics.load(self.args.target)
analytics.send()
analytics.send(self.args.target)

return 0

Expand Down
3 changes: 1 addition & 2 deletions dvc/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,7 @@

from dvc.exceptions import DvcException
from dvc.exceptions import NotDvcRepoError
from dvc.utils.compat import open
from dvc.utils.compat import str
from dvc.utils.compat import open, str

logger = logging.getLogger(__name__)

Expand Down
Loading