diff --git a/.github/workflows/check-test-release.yml b/.github/workflows/check-test-release.yml index e82b3663..f0fff764 100644 --- a/.github/workflows/check-test-release.yml +++ b/.github/workflows/check-test-release.yml @@ -88,6 +88,8 @@ jobs: HEROKU_TEAM: iterative-sandbox GITHUB_MATRIX_OS: ${{ matrix.os }} GITHUB_MATRIX_PYTHON: ${{ matrix.python }} + BITBUCKET_USERNAME: ${{ secrets.BITBUCKET_USERNAME }} + BITBUCKET_PASSWORD: ${{ secrets.BITBUCKET_PASSWORD }} - name: "Upload coverage to Codecov" uses: codecov/codecov-action@v1 with: diff --git a/mlem/config.py b/mlem/config.py index f546af15..51afee87 100644 --- a/mlem/config.py +++ b/mlem/config.py @@ -53,6 +53,9 @@ def inner(settings: BaseSettings) -> Dict[str, Any]: return inner +T = TypeVar("T", bound="MlemConfigBase") + + class MlemConfigBase(BaseSettings): """Special base for mlem settings to be able to read them from files""" @@ -93,6 +96,10 @@ def ignore_case( new_value[key] = val return new_value + @classmethod + def local(cls: Type[T]) -> T: + return project_config("", section=cls) + class MlemConfig(MlemConfigBase): """Base Mlem Config""" @@ -149,9 +156,6 @@ def get_config_cls(section: str) -> Type[MlemConfigBase]: raise UnknownConfigSection(section) from e -T = TypeVar("T", bound=MlemConfigBase) - - @overload def project_config( project: Optional[str], diff --git a/mlem/contrib/bitbucketfs.py b/mlem/contrib/bitbucketfs.py new file mode 100644 index 00000000..3e5ad75f --- /dev/null +++ b/mlem/contrib/bitbucketfs.py @@ -0,0 +1,272 @@ +import posixpath +from typing import ClassVar, List, Optional +from urllib.parse import quote_plus, urljoin, urlparse, urlsplit + +import requests +from fsspec import AbstractFileSystem +from fsspec.implementations.memory import MemoryFile +from fsspec.registry import known_implementations +from pydantic import Field +from requests import HTTPError + +from mlem.config import MlemConfigBase +from mlem.core.meta_io import CloudGitResolver + +BITBUCKET_ORG = "https://bitbucket.org" + + +class BitbucketWrapper: + + tree_endpoint = "/api/internal/repositories/{repo}/tree/{rev}/{path}" + repo_endpoint = "/api/2.0/repositories/{repo}" + refs_endpoint = "/api/2.0/repositories/{repo}/refs" + file_endpoint = "/api/2.0/repositories/{repo}/src/{rev}/{path}" + + def __init__( + self, url: str, username: Optional[str], password: Optional[str] + ): + self.username = username + self.password = password + self.url = url + + @property + def auth(self): + if self.username is not None and self.password is not None: + return self.username, self.password + return None + + def tree(self, path: str, repo: str, rev: str): + r = requests.get( + urljoin( + self.url, + self.tree_endpoint.format(path=path or "", repo=repo, rev=rev), + ), + auth=self.auth, + ) + r.raise_for_status() + return r.json()[0]["contents"] + + def get_default_branch(self, repo: str): + r = requests.get( + urljoin(self.url, self.repo_endpoint.format(repo=repo)), + auth=self.auth, + ) + r.raise_for_status() + return r.json()["mainbranch"]["name"] + + def open(self, path: str, repo: str, rev: str): + r = requests.get( + urljoin( + self.url, + self.file_endpoint.format(path=path, repo=repo, rev=rev), + ), + auth=self.auth, + ) + r.raise_for_status() + return r.content + + def get_refs(self, repo: str) -> List[str]: + r = requests.get( + urljoin(self.url, self.refs_endpoint.format(repo=repo)), + auth=self.auth, + ) + r.raise_for_status() + return [v["name"] for v in r.json()["values"]] + + def check_rev(self, repo: str, rev: str) -> bool: + r = requests.head( + urljoin( + self.url, + self.file_endpoint.format(path="", repo=repo, rev=rev), + ) + ) + return r.status_code == 200 + + +class BitbucketConfig(MlemConfigBase): + class Config: + section = "bitbucket" + + USERNAME: Optional[str] = Field(default=None, env="BITBUCKET_USERNAME") + PASSWORD: Optional[str] = Field(default=None, env="BITBUCKET_PASSWORD") + + +class BitBucketFileSystem( + AbstractFileSystem +): # pylint: disable=abstract-method + def __init__( + self, + repo: str, + sha: str = None, + host: str = BITBUCKET_ORG, + username: str = None, + password: str = None, + **kwargs, + ): + super().__init__(**kwargs) + conf = BitbucketConfig.local() + self.password = password or conf.PASSWORD + self.username = username or conf.USERNAME + self.repo = repo + self.host = host + + self.bb = BitbucketWrapper(host, self.username, self.password) + if sha is None: + sha = self.bb.get_default_branch(repo) + self.root = sha + self.ls("") + + def invalidate_cache(self, path=None): + super().invalidate_cache(path) + self.dircache.clear() + + def ls(self, path, detail=False, sha=None, **kwargs): + path = self._strip_protocol(path) + if path not in self.dircache or sha not in [self.root, None]: + try: + r = self.bb.tree( + path=path, repo=self.repo, rev=sha or self.root + ) + except HTTPError as e: + if e.response.status_code == 404: + raise FileNotFoundError() from e + raise + out = [ + { + "name": posixpath.join(path, f["name"]), + "mode": None, + "type": f["type"], + "size": f.get("size", 0), + "sha": sha, + } + for f in r + ] + if sha in [self.root, None]: + self.dircache[path] = out + else: + out = self.dircache[path] + + if detail: + return out + return sorted([f["name"] for f in out]) + + @classmethod + def _strip_protocol(cls, path): + if "@" in path: + return cls._get_kwargs_from_urls(path)["path"] + return super()._strip_protocol(path) + + @classmethod + def _get_kwargs_from_urls(cls, path): + parsed_path = urlsplit(path) + protocol = parsed_path.scheme + if protocol != "bitbucket": + return {"path": path} + repo, path = super()._strip_protocol(path).split("@", maxsplit=2) + sha, path = _mathch_path_with_ref(repo, path) + return { + "path": path, + "sha": sha, + "protocol": protocol, + "repo": repo, + } + + def _open( + self, + path, + mode="rb", + block_size=None, + autocommit=True, + cache_options=None, + sha=None, + **kwargs, + ): + if mode != "rb": + raise NotImplementedError + return MemoryFile( + None, + None, + self.bb.open(path, self.repo, rev=sha or self.root), + ) + + +known_implementations["bitbucket"] = { + "class": f"{BitBucketFileSystem.__module__}.{BitBucketFileSystem.__name__}" +} + + +def ls_bb_refs(repo): + conf = BitbucketConfig.local() + password = conf.PASSWORD + username = conf.USERNAME + return BitbucketWrapper( + BITBUCKET_ORG, username=username, password=password + ).get_refs(repo) + + +def _mathch_path_with_ref(repo, path): + path = path.split("/") + sha = path[0] + refs = ls_bb_refs(repo) + branches = {quote_plus(k) for k in refs} + # match beginning of path with one of existing branches + # "" is hack for cases with empty path (like 'github.com/org/rep/tree/branch/') + for i, part in enumerate(path[1:] + [""], start=1): + if sha in branches: + path = path[i:] + break + sha = f"{sha}%2F{part}" + else: + raise ValueError(f'Could not resolve branch from path "{path}"') + return sha, posixpath.join(*path) + + +class BitBucketResolver(CloudGitResolver): + type: ClassVar = "bitbucket" + FS = BitBucketFileSystem + PROTOCOL = "bitbucket" + + # TODO: support on-prem gitlab (other hosts) + PREFIXES = [BITBUCKET_ORG, PROTOCOL + "://"] + versioning_support = True + + @classmethod + def get_kwargs(cls, uri): + sha: Optional[str] + parsed = urlparse(uri) + repo, *path = parsed.path.strip("/").split("/src/") + if not path: + return {"repo": repo, "path": ""} + sha, path = _mathch_path_with_ref(repo, path[0]) + return {"repo": repo, "sha": sha, "path": path} + + @classmethod + def check_rev(cls, options): + conf = BitbucketConfig.local() + password = conf.PASSWORD + username = conf.USERNAME + return BitbucketWrapper( + BITBUCKET_ORG, username=username, password=password + ).check_rev(options["repo"], options["sha"]) + + @classmethod + def get_uri( + cls, + path: str, + project: Optional[str], + rev: Optional[str], + fs: BitBucketFileSystem, + ): + fullpath = posixpath.join(project or "", path) + return f"{BITBUCKET_ORG}/{fs.repo}/src/{fs.root}/{fullpath}" + + @classmethod + def get_project_uri( # pylint: disable=unused-argument + cls, + path: str, + project: Optional[str], + rev: Optional[str], + fs: BitBucketFileSystem, + uri: str, + ): + return f"{BITBUCKET_ORG}/{fs.repo}/src/{fs.root}/{project or ''}" diff --git a/mlem/contrib/github.py b/mlem/contrib/github.py new file mode 100644 index 00000000..b77caa00 --- /dev/null +++ b/mlem/contrib/github.py @@ -0,0 +1,139 @@ +import pathlib +import posixpath +from typing import ClassVar, Dict, Optional +from urllib.parse import quote_plus, urlparse + +import requests +from fsspec.implementations.github import GithubFileSystem + +from mlem.config import LOCAL_CONFIG +from mlem.core.meta_io import CloudGitResolver + + +def ls_branches(repo_url: str) -> Dict[str, str]: + """List branches in remote git repo""" + import git + + git.cmd.Git().ls_remote(repo_url) + g = git.cmd.Git() + remote_refs: Dict[str, str] = dict( + tuple(reversed(ref.split("\t")[:2])) + for ref in g.ls_remote(repo_url).split("\n") + ) + + return {"/".join(k.split("/")[2:]): v for k, v in remote_refs.items()} + + +def ls_github_branches(org: str, repo: str): + """List branches in github repo""" + return _ls_github_refs(org, repo, "branches") + + +def ls_github_tags(org: str, repo: str): + """List tags in github repo""" + return _ls_github_refs(org, repo, "tags") + + +def github_check_rev(org: str, repo: str, rev: str): + """Check that rev exists in a github repo""" + res = requests.head( + f"https://api.github.com/repos/{org}/{repo}/commits/{rev}", + auth=(LOCAL_CONFIG.GITHUB_USERNAME, LOCAL_CONFIG.GITHUB_TOKEN), # type: ignore + ) + return res.status_code == 200 + + +def _ls_github_refs(org: str, repo: str, endpoint: str): + result = requests.get( + f"https://api.github.com/repos/{org}/{repo}/{endpoint}", + auth=(LOCAL_CONFIG.GITHUB_USERNAME, LOCAL_CONFIG.GITHUB_TOKEN), # type: ignore + ) + if result.status_code == 200: + return {b["name"]: b["commit"]["sha"] for b in result.json()} + result.raise_for_status() + return None + + +class GithubResolver(CloudGitResolver): + """Resolve https://github.com URLs""" + + type: ClassVar = "github" + FS: ClassVar = GithubFileSystem + PROTOCOL = "github" + GITHUB_COM = "https://github.com" + + # TODO: support on-prem github (other hosts) + PREFIXES = [GITHUB_COM, PROTOCOL + "://"] + versioning_support = True + + @classmethod + def get_envs(cls): + kwargs = {} + if LOCAL_CONFIG.GITHUB_TOKEN is not None: + kwargs["username"] = LOCAL_CONFIG.GITHUB_USERNAME + kwargs["token"] = LOCAL_CONFIG.GITHUB_TOKEN + return kwargs + + @classmethod + def get_kwargs(cls, uri): + """Parse URI to git repo to get dict with all URI parts""" + # TODO: do we lose URL to the site, like https://github.com? + # should be resolved as part of https://github.com/iterative/mlem/issues/4 + sha: Optional[str] + parsed = urlparse(uri) + parts = pathlib.Path(parsed.path).parts + org, repo, *path = parts[1:] + if not path: + return {"org": org, "repo": repo, "path": ""} + if path[0] == "tree": + sha = path[1] + refs = ls_github_branches(org, repo) + refs.update(ls_github_tags(org, repo)) + branches = {quote_plus(k) for k in refs} + # match beginning of path with one of existing branches + # "" is hack for cases with empty path (like 'github.com/org/rep/tree/branch/') + for i, part in enumerate(path[2:] + [""], start=2): + if sha in branches: + path = path[i:] + break + sha = f"{sha}%2F{part}" + else: + raise ValueError(f'Could not resolve branch from uri "{uri}"') + else: + sha = None + return { + "org": org, + "repo": repo, + "sha": sha, + "path": posixpath.join(*path) if path else "", + } + + @classmethod + def check_rev(cls, options): + return github_check_rev( + options["org"], options["repo"], options["sha"] + ) + + @classmethod + def get_uri( + cls, + path: str, + project: Optional[str], + rev: Optional[str], + fs: GithubFileSystem, + ): + fullpath = posixpath.join(project or "", path) + return ( + f"https://github.com/{fs.org}/{fs.repo}/tree/{fs.root}/{fullpath}" + ) + + @classmethod + def get_project_uri( + cls, + path: str, + project: Optional[str], + rev: Optional[str], + fs: GithubFileSystem, + uri: str, + ): + return f"https://github.com/{fs.org}/{fs.repo}/{project or ''}" diff --git a/mlem/utils/gitlabfs.py b/mlem/contrib/gitlabfs.py similarity index 72% rename from mlem/utils/gitlabfs.py rename to mlem/contrib/gitlabfs.py index 05b0faf1..14899688 100644 --- a/mlem/utils/gitlabfs.py +++ b/mlem/contrib/gitlabfs.py @@ -1,5 +1,5 @@ import posixpath -from typing import Optional +from typing import ClassVar, Optional from urllib.parse import quote_plus, urlparse, urlsplit import gitlab @@ -8,6 +8,8 @@ from fsspec.registry import known_implementations from gitlab import Gitlab, GitlabGetError +from mlem.core.meta_io import CloudGitResolver + GL_TYPES = {"blob": "file", "tree": "directory"} @@ -125,11 +127,6 @@ def _open( ) -known_implementations["gitlab"] = { - "class": f"{GitlabFileSystem.__module__}.{GitlabFileSystem.__name__}" -} - - def ls_gitlab_refs(project_id): gl = Gitlab() project = gl.projects.get(project_id) @@ -154,23 +151,63 @@ def _mathch_path_with_ref(project_id, path): return sha, posixpath.join(*path) -def gitlab_check_rev(project_id: str, rev: str): - gl = gitlab.Gitlab() - try: - gl.projects.get(project_id).branches.get(rev) - return True - except GitlabGetError: - return False - - -def get_gitlab_kwargs(uri: str): - """Parse URI to git repo to get dict with all URI parts""" - # TODO: do we lose URL to the site, like https://github.com? - # should be resolved as part of https://github.com/iterative/mlem/issues/4 - sha: Optional[str] - parsed = urlparse(uri) - project_id, *path = parsed.path.strip("/").split("/-/blob/") - if not path: - return {"project_id": project_id, "path": ""} - sha, path = _mathch_path_with_ref(project_id, path[0]) - return {"project_id": project_id, "sha": sha, "path": path} +known_implementations["gitlab"] = { + "class": f"{GitlabFileSystem.__module__}.{GitlabFileSystem.__name__}" +} + + +class GitlabResolver(CloudGitResolver): + type: ClassVar = "gitlab" + FS = GitlabFileSystem + PROTOCOL = "gitlab" + GITLAB_COM = "https://gitlab.com" + + # TODO: support on-prem gitlab (other hosts) + PREFIXES = [GITLAB_COM, PROTOCOL + "://"] + versioning_support = True + + @classmethod + def get_kwargs(cls, uri): + """Parse URI to git repo to get dict with all URI parts""" + # TODO: do we lose URL to the site, like https://github.com? + # should be resolved as part of https://github.com/iterative/mlem/issues/4 + sha: Optional[str] + parsed = urlparse(uri) + project_id, *path = parsed.path.strip("/").split("/-/blob/") + if not path: + return {"project_id": project_id, "path": ""} + sha, path = _mathch_path_with_ref(project_id, path[0]) + return {"project_id": project_id, "sha": sha, "path": path} + + @classmethod + def check_rev(cls, options): + gl = gitlab.Gitlab() + try: + gl.projects.get(options["project_id"]).branches.get(options["sha"]) + return True + except GitlabGetError: + return False + + @classmethod + def get_uri( + cls, + path: str, + project: Optional[str], + rev: Optional[str], + fs: GitlabFileSystem, + ): + fullpath = posixpath.join(project or "", path) + return ( + f"https://gitlab.com/{fs.project_id}/-/blob/{fs.root}/{fullpath}" + ) + + @classmethod + def get_project_uri( # pylint: disable=unused-argument + cls, + path: str, + project: Optional[str], + rev: Optional[str], + fs: GitlabFileSystem, + uri: str, + ): + return f"https://gitlab.com/{fs.project_id}/-/tree/{fs.root}/{project or ''}" diff --git a/mlem/core/meta_io.py b/mlem/core/meta_io.py index 62ede307..06da3905 100644 --- a/mlem/core/meta_io.py +++ b/mlem/core/meta_io.py @@ -5,13 +5,13 @@ import posixpath from abc import ABC, abstractmethod from inspect import isabstract -from typing import List, Optional, Tuple, Type +from typing import ClassVar, List, Optional, Tuple, Type from fsspec import AbstractFileSystem, get_fs_token_paths -from fsspec.implementations.github import GithubFileSystem from fsspec.implementations.local import LocalFileSystem from pydantic import BaseModel +from mlem.core.base import MlemABC from mlem.core.errors import ( HookNotFound, InvalidArgumentError, @@ -19,16 +19,6 @@ MlemObjectNotFound, RevisionNotFound, ) -from mlem.utils.github import ( - get_github_envs, - get_github_kwargs, - github_check_rev, -) -from mlem.utils.gitlabfs import ( - GitlabFileSystem, - get_gitlab_kwargs, - gitlab_check_rev, -) from mlem.utils.root import MLEM_DIR, find_project_root MLEM_EXT = ".mlem" @@ -86,16 +76,25 @@ def uri_repr(self): return self.uri -class UriResolver(ABC): +class UriResolver(MlemABC): """Base class for resolving location. Turns (path, project, rev, fs) tuple into a normalized `Location` instance""" - impls: List[Type["UriResolver"]] = [] - versioning_support: bool = False + abs_name: ClassVar = "resolver" + + class Config: + type_root = True + + impls: ClassVar[List[Type["UriResolver"]]] = [] + low_priority: ClassVar[bool] = False + versioning_support: ClassVar[bool] = False def __init_subclass__(cls, *args, **kwargs): if not isabstract(cls) and cls not in cls.impls: - cls.impls.append(cls) + if cls.low_priority: + cls.impls.append(cls) + else: + cls.impls.insert(0, cls) super(UriResolver, cls).__init_subclass__(*args, **kwargs) @classmethod @@ -119,6 +118,7 @@ def find_resolver( rev: Optional[str], fs: Optional[AbstractFileSystem], ) -> Type["UriResolver"]: + for i in cls.impls: if i.check(path, project, rev, fs): return i @@ -219,15 +219,11 @@ def get_project_uri( # pylint: disable=unused-argument return uri[: -len(path)] -class GithubResolver(UriResolver): - """Resolve https://github.com URLs""" - - PROTOCOL = "github://" - GITHUB_COM = "https://github.com" - - # TODO: support on-prem github (other hosts) - PREFIXES = [GITHUB_COM, PROTOCOL] - versioning_support = True +class CloudGitResolver(UriResolver, ABC): + FS: ClassVar[Type[AbstractFileSystem]] + PROTOCOL: ClassVar[str] + PREFIXES: ClassVar[List[str]] + versioning_support: ClassVar = True @classmethod def check( @@ -238,33 +234,31 @@ def check( fs: Optional[AbstractFileSystem], ) -> bool: fullpath = posixpath.join(project or "", path) - return isinstance(fs, GithubFileSystem) or any( + return isinstance(fs, cls.FS) or any( fullpath.startswith(h) for h in cls.PREFIXES ) @classmethod def get_fs( cls, uri: str, rev: Optional[str] - ) -> Tuple[GithubFileSystem, str]: - options = get_github_envs() + ) -> Tuple[AbstractFileSystem, str]: + options = cls.get_envs() if not uri.startswith(cls.PROTOCOL): try: - github_kwargs = get_github_kwargs(uri) + kwargs = cls.get_kwargs(uri) except ValueError as e: raise LocationNotFound(*e.args) from e - options.update(github_kwargs) + options.update(kwargs) path = options.pop("path") options["sha"] = rev or options.get("sha", None) else: path = uri try: fs, _, (path,) = get_fs_token_paths( - path, protocol="github", storage_options=options + path, protocol=cls.PROTOCOL, storage_options=options ) except FileNotFoundError as e: # TODO catch HTTPError for wrong orgrepo - if options["sha"] is not None and not github_check_rev( - options["org"], options["repo"], options["sha"] - ): + if options["sha"] is not None and not cls.check_rev(options): raise RevisionNotFound(options["sha"], uri) from e raise LocationNotFound( f"Could not resolve github location {uri}" @@ -272,114 +266,16 @@ def get_fs( return fs, path @classmethod - def get_uri( - cls, - path: str, - project: Optional[str], - rev: Optional[str], - fs: GithubFileSystem, - ): - fullpath = posixpath.join(project or "", path) - return ( - f"https://github.com/{fs.org}/{fs.repo}/tree/{fs.root}/{fullpath}" - ) - - @classmethod - def pre_process( - cls, - path: str, - project: Optional[str], - rev: Optional[str], - fs: Optional[AbstractFileSystem], - ): - if fs is not None and not isinstance(fs, GithubFileSystem): - raise TypeError( - f"{path, project, rev, fs} cannot be resolved by {cls}: fs should be GithubFileSystem, not {fs.__class__}" - ) - if ( - isinstance(fs, GithubFileSystem) - and rev is not None - and fs.root != rev - ): - fs.root = rev - fs.invalidate_cache() - - return path, project, rev, fs - - @classmethod - def get_project_uri( - cls, - path: str, - project: Optional[str], - rev: Optional[str], - fs: GithubFileSystem, - uri: str, - ): - return f"https://github.com/{fs.org}/{fs.repo}/{project or ''}" - - -class GitlabResolver(UriResolver): - PROTOCOL = "gitlab://" - GITLAB_COM = "https://gitlab.com" - - # TODO: support on-prem gitlab (other hosts) - PREFIXES = [GITLAB_COM, PROTOCOL] - versioning_support = True + def get_envs(cls): + return {} @classmethod - def check( - cls, - path: str, - project: Optional[str], - rev: Optional[str], - fs: Optional[AbstractFileSystem], - ) -> bool: - fullpath = posixpath.join(project or "", path) - return isinstance(fs, GitlabFileSystem) or any( - fullpath.startswith(h) for h in cls.PREFIXES - ) - - @classmethod - def get_fs( - cls, uri: str, rev: Optional[str] - ) -> Tuple[AbstractFileSystem, str]: - options = {} # get_github_envs() - if not uri.startswith(cls.PROTOCOL): - try: - gitlab_kwargs = get_gitlab_kwargs(uri) - except ValueError as e: - raise LocationNotFound(*e.args) from e - options.update(gitlab_kwargs) - path = options.pop("path") - options["sha"] = rev or options.get("sha", None) - else: - path = uri - try: - fs, _, (path,) = get_fs_token_paths( - path, protocol="gitlab", storage_options=options - ) - except FileNotFoundError as e: # TODO catch HTTPError for wrong org/repo - if options["sha"] is not None and not gitlab_check_rev( - options["project_id"], options["sha"] - ): - raise RevisionNotFound(options["sha"], uri) from e - raise LocationNotFound( - f"Could not resolve github location {uri}" - ) from e - return fs, path + def get_kwargs(cls, uri): + raise NotImplementedError @classmethod - def get_uri( - cls, - path: str, - project: Optional[str], - rev: Optional[str], - fs: GitlabFileSystem, - ): - fullpath = posixpath.join(project or "", path) - return ( - f"https://gitlab.com/{fs.project_id}/-/blob/{fs.root}/{fullpath}" - ) + def check_rev(cls, options): + raise NotImplementedError @classmethod def pre_process( @@ -389,35 +285,22 @@ def pre_process( rev: Optional[str], fs: Optional[AbstractFileSystem], ): - if fs is not None and not isinstance(fs, GitlabFileSystem): + if fs is not None and not isinstance(fs, cls.FS): raise TypeError( - f"{path, project, rev, fs} cannot be resolved by {cls}: fs should be GithubFileSystem, not {fs.__class__}" + f"{path, project, rev, fs} cannot be resolved by {cls}: fs should be {cls.FS.__class__}, not {fs.__class__}" ) - if ( - isinstance(fs, GitlabFileSystem) - and rev is not None - and fs.root != rev - ): + if isinstance(fs, cls.FS) and rev is not None and fs.root != rev: fs.root = rev fs.invalidate_cache() return path, project, rev, fs - @classmethod - def get_project_uri( # pylint: disable=unused-argument - cls, - path: str, - project: Optional[str], - rev: Optional[str], - fs: GitlabFileSystem, - uri: str, - ): - return f"https://gitlab.com/{fs.project_id}/-/tree/{fs.root}/{project or ''}" - class FSSpecResolver(UriResolver): """Resolve different fsspec URIs""" + low_priority: ClassVar = True + @classmethod def check( cls, diff --git a/mlem/ext.py b/mlem/ext.py index 6097cae4..f14da44d 100644 --- a/mlem/ext.py +++ b/mlem/ext.py @@ -105,6 +105,9 @@ class ExtensionLoader: Extension("mlem.contrib.fastapi", ["fastapi", "uvicorn"], False), Extension("mlem.contrib.callable", [], True), Extension("mlem.contrib.rabbitmq", ["pika"], False, extra="rmq"), + Extension("mlem.contrib.github", [], True), + Extension("mlem.contrib.gitlabfs", [], True), + Extension("mlem.contrib.bitbucketfs", [], True), ) _loaded_extensions: Dict[Extension, ModuleType] = {} diff --git a/mlem/utils/github.py b/mlem/utils/github.py deleted file mode 100644 index e579255b..00000000 --- a/mlem/utils/github.py +++ /dev/null @@ -1,95 +0,0 @@ -import pathlib -import posixpath -from typing import Dict, Optional -from urllib.parse import quote_plus, urlparse - -import requests - -from mlem.config import LOCAL_CONFIG - - -def get_github_kwargs(uri: str): - """Parse URI to git repo to get dict with all URI parts""" - # TODO: do we lose URL to the site, like https://github.com? - # should be resolved as part of https://github.com/iterative/mlem/issues/4 - sha: Optional[str] - parsed = urlparse(uri) - parts = pathlib.Path(parsed.path).parts - org, repo, *path = parts[1:] - if not path: - return {"org": org, "repo": repo, "path": ""} - if path[0] == "tree": - sha = path[1] - refs = ls_github_branches(org, repo) - refs.update(ls_github_tags(org, repo)) - branches = {quote_plus(k) for k in refs} - # match beginning of path with one of existing branches - # "" is hack for cases with empty path (like 'github.com/org/rep/tree/branch/') - for i, part in enumerate(path[2:] + [""], start=2): - if sha in branches: - path = path[i:] - break - sha = f"{sha}%2F{part}" - else: - raise ValueError(f'Could not resolve branch from uri "{uri}"') - else: - sha = None - return { - "org": org, - "repo": repo, - "sha": sha, - "path": posixpath.join(*path) if path else "", - } - - -def get_github_envs() -> Dict: - """Get authentification envs""" - kwargs = {} - if LOCAL_CONFIG.GITHUB_TOKEN is not None: - kwargs["username"] = LOCAL_CONFIG.GITHUB_USERNAME - kwargs["token"] = LOCAL_CONFIG.GITHUB_TOKEN - return kwargs - - -def ls_branches(repo_url: str) -> Dict[str, str]: - """List branches in remote git repo""" - import git - - git.cmd.Git().ls_remote(repo_url) - g = git.cmd.Git() - remote_refs: Dict[str, str] = dict( - tuple(reversed(ref.split("\t")[:2])) - for ref in g.ls_remote(repo_url).split("\n") - ) - - return {"/".join(k.split("/")[2:]): v for k, v in remote_refs.items()} - - -def ls_github_branches(org: str, repo: str): - """List branches in github repo""" - return _ls_github_refs(org, repo, "branches") - - -def ls_github_tags(org: str, repo: str): - """List tags in github repo""" - return _ls_github_refs(org, repo, "tags") - - -def github_check_rev(org: str, repo: str, rev: str): - """Check that rev exists in a github repo""" - res = requests.head( - f"https://api.github.com/repos/{org}/{repo}/commits/{rev}", - auth=(LOCAL_CONFIG.GITHUB_USERNAME, LOCAL_CONFIG.GITHUB_TOKEN), # type: ignore - ) - return res.status_code == 200 - - -def _ls_github_refs(org: str, repo: str, endpoint: str): - result = requests.get( - f"https://api.github.com/repos/{org}/{repo}/{endpoint}", - auth=(LOCAL_CONFIG.GITHUB_USERNAME, LOCAL_CONFIG.GITHUB_TOKEN), # type: ignore - ) - if result.status_code == 200: - return {b["name"]: b["commit"]["sha"] for b in result.json()} - result.raise_for_status() - return None diff --git a/setup.py b/setup.py index cc13cfb8..dddb61ea 100644 --- a/setup.py +++ b/setup.py @@ -163,6 +163,9 @@ "server.heroku = mlem.contrib.heroku.server:HerokuServer", "server.rmq = mlem.contrib.rabbitmq:RabbitMQServer", "storage.dvc = mlem.contrib.dvc:DVCStorage", + "resolver.bitbucket = mlem.contrib.bitbucketfs:BitBucketResolver", + "resolver.github = mlem.contrib.github:GithubResolver", + "resolver.gitlab = mlem.contrib.gitlabfs:GitlabResolver", ], "mlem.config": [ "core = mlem.config:MlemConfig", diff --git a/tests/conftest.py b/tests/conftest.py index 8bae24ab..58d38aee 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -18,6 +18,7 @@ from mlem.api import init, save from mlem.constants import PREDICT_ARG_NAME, PREDICT_METHOD_NAME from mlem.contrib.fastapi import FastAPIServer +from mlem.contrib.github import ls_github_branches from mlem.contrib.sklearn import SklearnModel from mlem.core.artifacts import LOCAL_STORAGE, FSSpecStorage, LocalArtifact from mlem.core.data_type import DataReader, DataType, DataWriter @@ -27,7 +28,6 @@ from mlem.core.objects import MlemData, MlemModel from mlem.core.requirements import Requirements from mlem.runtime.interface import ModelInterface -from mlem.utils.github import ls_github_branches RESOURCES = "resources" diff --git a/tests/contrib/test_bitbucket.py b/tests/contrib/test_bitbucket.py new file mode 100644 index 00000000..1f9bd916 --- /dev/null +++ b/tests/contrib/test_bitbucket.py @@ -0,0 +1,91 @@ +import os + +import pytest +from pytest_lazyfixture import lazy_fixture + +from mlem.contrib.bitbucketfs import BitBucketFileSystem +from mlem.core.errors import RevisionNotFound +from mlem.core.meta_io import UriResolver, get_fs +from mlem.core.metadata import load_meta +from mlem.core.objects import MlemModel +from tests.conftest import long + +MLEM_TEST_REPO_PROJECT = "iterative-ai/mlem-test" + +MLEM_TEST_REPO_URI = f"https://bitbucket.org/{MLEM_TEST_REPO_PROJECT}" + + +@pytest.fixture() +def fs_no_auth(): + username = os.environ.get("BITBUCKET_USERNAME", None) + try: + del os.environ["BITBUCKET_USERNAME"] + yield BitBucketFileSystem(MLEM_TEST_REPO_PROJECT) + finally: + if username: + os.environ["BITBUCKET_USERNAME"] = username + + +@pytest.fixture() +def fs_auth(): + return BitBucketFileSystem(MLEM_TEST_REPO_PROJECT) + + +@long +@pytest.mark.parametrize( + "fs", + [lazy_fixture("fs_auth"), lazy_fixture("fs_no_auth")], +) +def test_ls(fs): + assert "README.md" in fs.ls("") + + +@long +@pytest.mark.parametrize( + "fs", + [lazy_fixture("fs_auth"), lazy_fixture("fs_no_auth")], +) +def test_open(fs): + with fs.open("README.md", "r") as f: + assert f.read().startswith("### Fixture for mlem tests") + + +@long +@pytest.mark.parametrize( + "uri", + [ + MLEM_TEST_REPO_URI + "/src/main/path", + f"bitbucket://{MLEM_TEST_REPO_PROJECT}@main/path", + ], +) +def test_uri_resolver(uri): + fs, path = get_fs(uri) + + assert isinstance(fs, BitBucketFileSystem) + assert path == "path" + + +@long +@pytest.mark.parametrize( + "rev", + ["main", "branch", "tag", "3897d2ab"], +) +def test_uri_resolver_rev(rev): + location = UriResolver.resolve(MLEM_TEST_REPO_URI, None, rev=rev, fs=None) + assert isinstance(location.fs, BitBucketFileSystem) + assert location.fs.root == rev + assert "README.md" in location.fs.ls("") + + +@long +def test_uri_resolver_wrong_rev(): + with pytest.raises(RevisionNotFound): + UriResolver.resolve( + MLEM_TEST_REPO_URI, None, rev="__not_exists__", fs=None + ) + + +@long +def test_loading_object(): + meta = load_meta("latest", project=MLEM_TEST_REPO_URI + "/src/main/simple") + assert isinstance(meta, MlemModel) diff --git a/tests/utils/test_github.py b/tests/contrib/test_github.py similarity index 87% rename from tests/utils/test_github.py rename to tests/contrib/test_github.py index faa9544b..cb2e7253 100644 --- a/tests/utils/test_github.py +++ b/tests/contrib/test_github.py @@ -2,8 +2,8 @@ import pytest -from mlem.utils.github import ( - get_github_kwargs, +from mlem.contrib.github import ( + GithubResolver, github_check_rev, ls_branches, ls_github_branches, @@ -36,7 +36,7 @@ def test_ls_github_branches(): def set_mock_refs(mocker): def set(rev): mocker.patch( - "mlem.utils.github._ls_github_refs", + "mlem.contrib.github._ls_github_refs", return_value={rev: ""}, ) @@ -55,7 +55,7 @@ def set(rev): ) def test_get_github_kwargs(set_mock_refs, uri, rev): set_mock_refs(rev) - assert get_github_kwargs(uri) == { + assert GithubResolver.get_kwargs(uri) == { "org": "org", "repo": "repo", "path": "path", @@ -65,7 +65,9 @@ def test_get_github_kwargs(set_mock_refs, uri, rev): def test_get_github_kwargs__empty_path(set_mock_refs): set_mock_refs("ref") - assert get_github_kwargs("https://github.com/org/repo/tree/ref/") == { + assert GithubResolver.get_kwargs( + "https://github.com/org/repo/tree/ref/" + ) == { "org": "org", "repo": "repo", "path": "", diff --git a/tests/utils/test_gitlab.py b/tests/contrib/test_gitlab.py similarity index 96% rename from tests/utils/test_gitlab.py rename to tests/contrib/test_gitlab.py index 64ed193b..a889d59a 100644 --- a/tests/utils/test_gitlab.py +++ b/tests/contrib/test_gitlab.py @@ -1,10 +1,10 @@ import pytest +from mlem.contrib.gitlabfs import GitlabFileSystem from mlem.core.errors import RevisionNotFound from mlem.core.meta_io import UriResolver, get_fs from mlem.core.metadata import load_meta from mlem.core.objects import MlemModel -from mlem.utils.gitlabfs import GitlabFileSystem from tests.conftest import long MLEM_TEST_REPO_PROJECT = "iterative.ai/mlem-test"