diff --git a/docs/content/api/modules.json b/docs/content/api/modules.json index a25ad7704ce94..ad0d4e8485fec 100644 --- a/docs/content/api/modules.json +++ b/docs/content/api/modules.json @@ -1 +1 @@ -{"": {"index": {"alabaster_version": "0.7.12", "body": "

All modules for which code is available

\n", "current_page_name": "_modules/index", "customsidebar": null, "sidebars": ["globaltoc.html", "searchbox.html"], "title": "Overview: module code"}}, "dagster": {"config": {"config_schema": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.config.config_schema

\n
[docs]class ConfigSchema:\n """This is a placeholder type. Any time that it appears in documentation, it means that any of\n the following types are acceptable:\n\n #. A Python scalar type that resolves to a Dagster config type\n (:py:class:`~python:int`, :py:class:`~python:float`, :py:class:`~python:bool`,\n or :py:class:`~python:str`). For example:\n\n * ``@op(config_schema=int)``\n * ``@op(config_schema=str)``\n\n #. A built-in python collection (:py:class:`~python:list`, or :py:class:`~python:dict`).\n :py:class:`~python:list` is exactly equivalent to :py:class:`~dagster.Array` [\n :py:class:`~dagster.Any` ] and :py:class:`~python:dict` is equivalent to\n :py:class:`~dagster.Permissive`. For example:\n\n * ``@op(config_schema=list)``\n * ``@op(config_schema=dict)``\n\n #. A Dagster config type:\n\n * :py:data:`~dagster.Any`\n * :py:class:`~dagster.Array`\n * :py:data:`~dagster.Bool`\n * :py:data:`~dagster.Enum`\n * :py:data:`~dagster.Float`\n * :py:data:`~dagster.Int`\n * :py:data:`~dagster.IntSource`\n * :py:data:`~dagster.Noneable`\n * :py:class:`~dagster.Permissive`\n * :py:class:`~dagster.Map`\n * :py:class:`~dagster.ScalarUnion`\n * :py:class:`~dagster.Selector`\n * :py:class:`~dagster.Shape`\n * :py:data:`~dagster.String`\n * :py:data:`~dagster.StringSource`\n\n\n #. A bare python dictionary, which will be automatically wrapped in\n :py:class:`~dagster.Shape`. Values of the dictionary are resolved recursively\n according to the same rules. For example:\n\n * ``{'some_config': str}`` is equivalent to ``Shape({'some_config: str})``.\n\n * ``{'some_config1': {'some_config2': str}}`` is equivalent to\n ``Shape({'some_config1: Shape({'some_config2: str})})``.\n\n #. A bare python list of length one, whose single element will be wrapped in a\n :py:class:`~dagster.Array` is resolved recursively according to the same\n rules. For example:\n\n * ``[str]`` is equivalent to ``Array[str]``.\n\n * ``[[str]]`` is equivalent to ``Array[Array[str]]``.\n\n * ``[{'some_config': str}]`` is equivalent to ``Array(Shape({'some_config: str}))``.\n\n #. An instance of :py:class:`~dagster.Field`.\n """\n\n def __init__(self):\n raise NotImplementedError(\n "ConfigSchema is a placeholder type and should not be instantiated."\n )
\n
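To make the resolution rules above concrete, here is a minimal sketch (hypothetical op names) showing a few of the accepted ``config_schema`` forms side by side:

.. code-block:: python

    from dagster import Field, Shape, op

    # A bare dict is wrapped in Shape: equivalent to Shape({"limit": Field(int)}).
    @op(config_schema={"limit": int})
    def load_rows(context):
        return context.op_config["limit"]

    # The same schema written with explicit Dagster config types.
    @op(config_schema=Shape({"limit": Field(int)}))
    def load_rows_explicit(context):
        return context.op_config["limit"]

    # A bare one-element list is wrapped in Array: equivalent to Array(str).
    @op(config_schema=[str])
    def collect_tags(context):
        return context.op_config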
", "current_page_name": "_modules/dagster/config/config_schema", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.config.config_schema"}, "config_type": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.config.config_type

\nimport typing\nfrom enum import Enum as PythonEnum\nfrom typing import Dict, List, Optional\n\nfrom dagster import check\nfrom dagster.builtins import BuiltinEnum\nfrom dagster.serdes import whitelist_for_serdes\n\n\n@whitelist_for_serdes\nclass ConfigTypeKind(PythonEnum):\n    ANY = "ANY"\n    SCALAR = "SCALAR"\n    ENUM = "ENUM"\n\n    SELECTOR = "SELECTOR"\n    STRICT_SHAPE = "STRICT_SHAPE"\n    PERMISSIVE_SHAPE = "PERMISSIVE_SHAPE"\n    SCALAR_UNION = "SCALAR_UNION"\n\n    MAP = "MAP"\n\n    # Closed generic types\n    ARRAY = "ARRAY"\n    NONEABLE = "NONEABLE"\n\n    @staticmethod\n    def has_fields(kind: "ConfigTypeKind") -> bool:\n        check.inst_param(kind, "kind", ConfigTypeKind)\n        return kind == ConfigTypeKind.SELECTOR or ConfigTypeKind.is_shape(kind)\n\n    @staticmethod\n    def is_closed_generic(kind: "ConfigTypeKind") -> bool:\n        check.inst_param(kind, "kind", ConfigTypeKind)\n        return (\n            kind == ConfigTypeKind.ARRAY\n            or kind == ConfigTypeKind.NONEABLE\n            or kind == ConfigTypeKind.SCALAR_UNION\n            or kind == ConfigTypeKind.MAP\n        )\n\n    @staticmethod\n    def is_shape(kind: "ConfigTypeKind") -> bool:\n        check.inst_param(kind, "kind", ConfigTypeKind)\n        return kind == ConfigTypeKind.STRICT_SHAPE or kind == ConfigTypeKind.PERMISSIVE_SHAPE\n\n    @staticmethod\n    def is_selector(kind: "ConfigTypeKind") -> bool:\n        check.inst_param(kind, "kind", ConfigTypeKind)\n        return kind == ConfigTypeKind.SELECTOR\n\n\nclass ConfigType:\n    """\n    The class backing DagsterTypes as they are used processing configuration data.\n    """\n\n    def __init__(\n        self,\n        key: str,\n        kind: ConfigTypeKind,\n        given_name: Optional[str] = None,\n        description: Optional[str] = None,\n        type_params: Optional[List["ConfigType"]] = None,\n    ):\n\n        self.key: str = check.str_param(key, "key")\n        self.kind: ConfigTypeKind = check.inst_param(kind, "kind", ConfigTypeKind)\n        self.given_name: Optional[str] = check.opt_str_param(given_name, "given_name")\n        self._description: Optional[str] = check.opt_str_param(description, "description")\n        self.type_params: Optional[List[ConfigType]] = (\n            check.list_param(type_params, "type_params", of_type=ConfigType)\n            if type_params\n            else None\n        )\n\n    @property\n    def description(self) -> Optional[str]:\n        return self._description\n\n    @staticmethod\n    def from_builtin_enum(builtin_enum: typing.Any) -> "ConfigType":\n        check.invariant(BuiltinEnum.contains(builtin_enum), "param must be member of BuiltinEnum")\n        return _CONFIG_MAP[builtin_enum]\n\n    def post_process(self, value):\n        """\n        Implement this in order to take a value provided by the user\n        and perform computation on it. This can be done to coerce data types,\n        fetch things from the environment (e.g. environment variables), or\n        to do custom validation. If the value is not valid, throw a\n        PostProcessingError. 
Otherwise return the coerced value.\n        """\n        return value\n\n\n@whitelist_for_serdes\nclass ConfigScalarKind(PythonEnum):\n    INT = "INT"\n    STRING = "STRING"\n    FLOAT = "FLOAT"\n    BOOL = "BOOL"\n\n\n# Scalars, Composites, Selectors, Lists, Optional, Any\n\n\nclass ConfigScalar(ConfigType):\n    def __init__(\n        self, key: str, given_name: Optional[str], scalar_kind: ConfigScalarKind, **kwargs: object\n    ):\n        self.scalar_kind = check.inst_param(scalar_kind, "scalar_kind", ConfigScalarKind)\n        super(ConfigScalar, self).__init__(\n            key, kind=ConfigTypeKind.SCALAR, given_name=given_name, **kwargs  # type: ignore\n        )\n\n\nclass BuiltinConfigScalar(ConfigScalar):\n    def __init__(self, scalar_kind, description=None):\n        super(BuiltinConfigScalar, self).__init__(\n            key=type(self).__name__,\n            given_name=type(self).__name__,\n            scalar_kind=scalar_kind,\n            description=description,\n        )\n\n\nclass Int(BuiltinConfigScalar):\n    def __init__(self):\n        super(Int, self).__init__(scalar_kind=ConfigScalarKind.INT, description="")\n\n\nclass String(BuiltinConfigScalar):\n    def __init__(self):\n        super(String, self).__init__(scalar_kind=ConfigScalarKind.STRING, description="")\n\n\nclass Bool(BuiltinConfigScalar):\n    def __init__(self):\n        super(Bool, self).__init__(scalar_kind=ConfigScalarKind.BOOL, description="")\n\n\nclass Float(BuiltinConfigScalar):\n    def __init__(self):\n        super(Float, self).__init__(scalar_kind=ConfigScalarKind.FLOAT, description="")\n\n    def post_process(self, value):\n        return float(value)\n\n\nclass Any(ConfigType):\n    def __init__(self):\n        super(Any, self).__init__(\n            key="Any",\n            given_name="Any",\n            kind=ConfigTypeKind.ANY,\n        )\n\n\n
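Because ``post_process`` is the hook that turns an already-validated config value into the value handed to user code, a small direct illustration may help. This exercises the classes defined in this module (internal API, called directly here only to make the contract concrete):

.. code-block:: python

    from dagster import Enum, EnumValue
    from dagster.config.config_type import Float

    # Float.post_process coerces the validated value to a python float.
    assert Float().post_process(3) == 3.0

    # Enum.post_process (defined below) maps the validated config string back
    # to the declared python_value.
    color = Enum("Color", [EnumValue("RED", python_value=(255, 0, 0))])
    assert color.post_process("RED") == (255, 0, 0)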
[docs]class Noneable(ConfigType):\n """Defines a configuration type that is the union of ``NoneType`` and the type ``inner_type``.\n\n Args:\n inner_type (type):\n The type of the values that this configuration type can contain.\n\n **Examples:**\n\n .. code-block:: python\n\n config_schema={"name": Noneable(str)}\n\n config={"name": "Hello"} # Ok\n config={"name": None} # Ok\n config={} # Error\n """\n\n def __init__(self, inner_type: ConfigType):\n from .field import resolve_to_config_type\n\n self.inner_type = resolve_to_config_type(inner_type)\n super(Noneable, self).__init__(\n key="Noneable.{inner_type}".format(inner_type=self.inner_type.key),\n kind=ConfigTypeKind.NONEABLE,\n type_params=[self.inner_type],\n )
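A runnable version of the example above (hypothetical ``greet`` op), showing both accepted shapes of config:

.. code-block:: python

    from dagster import Noneable, job, op

    @op(config_schema={"name": Noneable(str)})
    def greet(context):
        name = context.op_config["name"]
        return "Hello, anonymous" if name is None else f"Hello, {name}"

    @job
    def greet_job():
        greet()

    # Both validate; omitting "name" entirely would be a config error.
    greet_job.execute_in_process(run_config={"ops": {"greet": {"config": {"name": "Hello"}}}})
    greet_job.execute_in_process(run_config={"ops": {"greet": {"config": {"name": None}}}})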
\n\n\n
[docs]class Array(ConfigType):\n """Defines an array (list) configuration type that contains values of type ``inner_type``.\n\n Args:\n inner_type (type):\n The type of the values that this configuration type can contain.\n """\n\n def __init__(self, inner_type):\n from .field import resolve_to_config_type\n\n self.inner_type = resolve_to_config_type(inner_type)\n super(Array, self).__init__(\n key="Array.{inner_type}".format(inner_type=self.inner_type.key),\n type_params=[self.inner_type],\n kind=ConfigTypeKind.ARRAY,\n )\n\n @property\n def description(self):\n return "List of {inner_type}".format(inner_type=self.key)
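A short sketch (hypothetical op name) showing ``Array`` together with the run config that satisfies it; the shorthand ``{"chunk_sizes": [int]}`` resolves to the same schema:

.. code-block:: python

    from dagster import Array, job, op

    @op(config_schema={"chunk_sizes": Array(int)})
    def total_size(context):
        return sum(context.op_config["chunk_sizes"])

    @job
    def sizing_job():
        total_size()

    sizing_job.execute_in_process(
        run_config={"ops": {"total_size": {"config": {"chunk_sizes": [128, 256, 512]}}}}
    )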
\n\n\n
[docs]class EnumValue:\n """Define an entry in a :py:class:`Enum`.\n\n Args:\n config_value (str):\n The string representation of the config to accept when passed.\n python_value (Optional[Any]):\n The python value to convert the enum entry in to. Defaults to the ``config_value``.\n description (Optional[str]):\n A human-readable description of the enum entry.\n\n """\n\n def __init__(\n self,\n config_value: str,\n python_value: Optional[object] = None,\n description: Optional[str] = None,\n ):\n self.config_value = check.str_param(config_value, "config_value")\n self.python_value = config_value if python_value is None else python_value\n self.description = check.opt_str_param(description, "description")
\n\n\n
[docs]class Enum(ConfigType):\n """Defines a enum configuration type that allows one of a defined set of possible values.\n\n Args:\n name (str):\n The name of the enum configuration type.\n enum_values (List[EnumValue]):\n The set of possible values for the enum configuration type.\n\n **Examples:**\n\n .. code-block:: python\n\n @op(\n config_schema=Field(\n Enum(\n 'CowboyType',\n [\n EnumValue('good'),\n EnumValue('bad'),\n EnumValue('ugly'),\n ]\n )\n )\n )\n def resolve_standoff(context):\n # ...\n """\n\n def __init__(self, name: str, enum_values: List[EnumValue]):\n check.str_param(name, "name")\n super(Enum, self).__init__(key=name, given_name=name, kind=ConfigTypeKind.ENUM)\n self.enum_values = check.list_param(enum_values, "enum_values", of_type=EnumValue)\n self._valid_python_values = {ev.python_value for ev in enum_values}\n check.invariant(len(self._valid_python_values) == len(enum_values))\n self._valid_config_values = {ev.config_value for ev in enum_values}\n check.invariant(len(self._valid_config_values) == len(enum_values))\n\n @property\n def config_values(self):\n return [ev.config_value for ev in self.enum_values]\n\n def is_valid_config_enum_value(self, config_value):\n return config_value in self._valid_config_values\n\n def post_process(self, value: typing.Any) -> typing.Any:\n if isinstance(value, PythonEnum):\n value = value.name\n\n for ev in self.enum_values:\n if ev.config_value == value:\n return ev.python_value\n\n check.failed(\n (\n "Should never reach this. config_value should be pre-validated. "\n "Got {config_value}"\n ).format(config_value=value)\n )\n\n
[docs] @classmethod\n def from_python_enum(cls, enum, name=None):\n """\n Create a Dagster enum corresponding to an existing Python enum.\n\n Args:\n enum (enum.EnumMeta):\n The class representing the enum.\n name (Optional[str]):\n The name for the enum. If not present, `enum.__name__` will be used.\n\n Example:\n\n .. code-block:: python\n\n class Color(enum.Enum):\n RED = enum.auto()\n GREEN = enum.auto()\n BLUE = enum.auto()\n\n @op(\n config_schema={"color": Field(Enum.from_python_enum(Color))}\n )\n def select_color(context):\n # ...\n """\n if name is None:\n name = enum.__name__\n return cls(name, [EnumValue(v.name, python_value=v) for v in enum])
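To tie the two examples above to run config, here is a minimal sketch (hypothetical names). Note that the config value is the enum entry's *name*; ``post_process`` hands the corresponding Python value to the op body:

.. code-block:: python

    import enum

    from dagster import Enum, Field, job, op

    class Color(enum.Enum):
        RED = enum.auto()
        GREEN = enum.auto()

    DagsterColor = Enum.from_python_enum(Color)

    @op(config_schema={"color": Field(DagsterColor)})
    def select_color(context):
        # Already converted back to the Python enum member.
        assert isinstance(context.op_config["color"], Color)
        return context.op_config["color"]

    @job
    def color_job():
        select_color()

    color_job.execute_in_process(
        run_config={"ops": {"select_color": {"config": {"color": "RED"}}}}
    )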
\n\n\n
[docs]class ScalarUnion(ConfigType):\n """Defines a configuration type that accepts a scalar value OR a non-scalar value like a\n :py:class:`~dagster.List`, :py:class:`~dagster.Dict`, or :py:class:`~dagster.Selector`.\n\n This allows runtime scalars to be configured without a dictionary with the key ``value`` and\n instead just use the scalar value directly. However this still leaves the option to\n load scalars from a json or pickle file.\n\n Args:\n scalar_type (type):\n The scalar type of values that this configuration type can hold. For example,\n :py:class:`~python:int`, :py:class:`~python:float`, :py:class:`~python:bool`,\n or :py:class:`~python:str`.\n non_scalar_schema (ConfigSchema):\n The schema of a non-scalar Dagster configuration type. For example, :py:class:`List`,\n :py:class:`Dict`, or :py:class:`~dagster.Selector`.\n key (Optional[str]):\n The configuation type's unique key. If not set, then the key will be set to\n ``ScalarUnion.{scalar_type}-{non_scalar_schema}``.\n\n **Examples:**\n\n .. code-block:: yaml\n\n graph:\n transform_word:\n inputs:\n word:\n value: foobar\n\n\n becomes, optionally,\n\n\n .. code-block:: yaml\n\n graph:\n transform_word:\n inputs:\n word: foobar\n """\n\n def __init__(\n self, scalar_type: typing.Any, non_scalar_schema: typing.Any, _key: Optional[str] = None\n ):\n from .field import resolve_to_config_type\n\n self.scalar_type = resolve_to_config_type(scalar_type)\n self.non_scalar_type = resolve_to_config_type(non_scalar_schema)\n\n check.param_invariant(self.scalar_type.kind == ConfigTypeKind.SCALAR, "scalar_type")\n check.param_invariant(\n self.non_scalar_type.kind\n in {ConfigTypeKind.STRICT_SHAPE, ConfigTypeKind.SELECTOR, ConfigTypeKind.ARRAY},\n "non_scalar_type",\n )\n\n # https://github.com/dagster-io/dagster/issues/2133\n key = check.opt_str_param(\n _key, "_key", "ScalarUnion.{}-{}".format(self.scalar_type.key, self.non_scalar_type.key)\n )\n\n super(ScalarUnion, self).__init__(\n key=key,\n kind=ConfigTypeKind.SCALAR_UNION,\n type_params=[self.scalar_type, self.non_scalar_type],\n )
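The YAML example above can be reproduced end to end with a small sketch (the ``word_schema`` name and the op are hypothetical):

.. code-block:: python

    from dagster import ScalarUnion, job, op

    # "word" may be given either as a bare string or as a {"value": ...} dict.
    word_schema = ScalarUnion(scalar_type=str, non_scalar_schema={"value": str})

    @op(config_schema={"word": word_schema})
    def transform_word(context):
        word = context.op_config["word"]
        return word if isinstance(word, str) else word["value"]

    @job
    def word_job():
        transform_word()

    # Both forms validate against the same schema.
    word_job.execute_in_process(
        run_config={"ops": {"transform_word": {"config": {"word": "foobar"}}}}
    )
    word_job.execute_in_process(
        run_config={"ops": {"transform_word": {"config": {"word": {"value": "foobar"}}}}}
    )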
\n\n\nConfigAnyInstance = Any()\nConfigBoolInstance = Bool()\nConfigFloatInstance = Float()\nConfigIntInstance = Int()\nConfigStringInstance = String()\n\n_CONFIG_MAP: Dict[check.TypeOrTupleOfTypes, ConfigType] = {\n BuiltinEnum.ANY: ConfigAnyInstance,\n BuiltinEnum.BOOL: ConfigBoolInstance,\n BuiltinEnum.FLOAT: ConfigFloatInstance,\n BuiltinEnum.INT: ConfigIntInstance,\n BuiltinEnum.STRING: ConfigStringInstance,\n}\n\n\n_CONFIG_MAP_BY_NAME: Dict[str, ConfigType] = {\n "Any": ConfigAnyInstance,\n "Bool": ConfigBoolInstance,\n "Float": ConfigFloatInstance,\n "Int": ConfigIntInstance,\n "String": ConfigStringInstance,\n}\n\nALL_CONFIG_BUILTINS = set(_CONFIG_MAP.values())\n\n\ndef get_builtin_scalar_by_name(type_name: str):\n if type_name not in _CONFIG_MAP_BY_NAME:\n check.failed("Scalar {} is not supported".format(type_name))\n return _CONFIG_MAP_BY_NAME[type_name]\n
", "current_page_name": "_modules/dagster/config/config_type", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.config.config_type"}, "field": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.config.field

\nfrom typing import Any, Union, overload\n\nfrom dagster import check\nfrom dagster.builtins import BuiltinEnum\nfrom dagster.core.errors import DagsterInvalidConfigError, DagsterInvalidDefinitionError\nfrom dagster.serdes import serialize_value\nfrom dagster.utils import is_enum_value\nfrom dagster.utils.typing_api import is_closed_python_optional_type, is_typing_type\n\nfrom .config_type import Array, ConfigAnyInstance, ConfigType, ConfigTypeKind\nfrom .field_utils import FIELD_NO_DEFAULT_PROVIDED, Map, all_optional_type\n\n\ndef _is_config_type_class(obj):\n    return isinstance(obj, type) and issubclass(obj, ConfigType)\n\n\ndef helpful_list_error_string():\n    return "Please use a python list (e.g. [int]) or dagster.Array (e.g. Array(int)) instead."\n\n\nVALID_CONFIG_DESC = """\n1. A Python primitive type that resolve to dagster config\n   types: int, float, bool, str.\n\n2. A dagster config type: Int, Float, Bool, String, StringSource, Path, Any,\n   Array, Noneable, Selector, Shape, Permissive, etc.\n\n3. A bare python dictionary, which is wrapped in Shape. Any\n   values in the dictionary get resolved by the same rules, recursively.\n\n4. A bare python list of length one which itself is config type.\n   Becomes Array with list element as an argument.\n"""\n\n\n@overload\ndef resolve_to_config_type(dagster_type: ConfigType) -> ConfigType:\n    pass\n\n\n@overload\ndef resolve_to_config_type(dagster_type: object) -> Union[ConfigType, bool]:\n    pass\n\n\ndef resolve_to_config_type(dagster_type) -> Union[ConfigType, bool]:\n    from .field_utils import convert_fields_to_dict_type\n\n    # Short circuit if it's already a Config Type\n    if isinstance(dagster_type, ConfigType):\n        return dagster_type\n\n    if isinstance(dagster_type, dict):\n        # Dicts of the special form {type: value} are treated as Maps\n        # mapping from the type to value type, otherwise treat as dict type\n        if len(dagster_type) == 1:\n            key = list(dagster_type.keys())[0]\n            key_type = resolve_to_config_type(key)\n            if not isinstance(key, str):\n                if not key_type:\n                    raise DagsterInvalidDefinitionError(\n                        "Invalid key in map specification: {key} in map {collection}".format(\n                            key=repr(key), collection=dagster_type\n                        )\n                    )\n\n                if not key_type.kind == ConfigTypeKind.SCALAR:\n                    raise DagsterInvalidDefinitionError(\n                        "Non-scalar key in map specification: {key} in map {collection}".format(\n                            key=repr(key), collection=dagster_type\n                        )\n                    )\n\n                inner_type = resolve_to_config_type(dagster_type[key])\n\n                if not inner_type:\n                    raise DagsterInvalidDefinitionError(\n                        "Invalid value in map specification: {value} in map {collection}".format(\n                            value=repr(dagster_type[str]), collection=dagster_type\n                        )\n                    )\n                return Map(key_type, inner_type)\n        return convert_fields_to_dict_type(dagster_type)\n\n    if isinstance(dagster_type, list):\n        if len(dagster_type) != 1:\n            raise DagsterInvalidDefinitionError("Array specifications must only be of length 1")\n\n        inner_type = resolve_to_config_type(dagster_type[0])\n\n        if not inner_type:\n            
raise DagsterInvalidDefinitionError(\n                "Invalid member of array specification: {value} in list {the_list}".format(\n                    value=repr(dagster_type[0]), the_list=dagster_type\n                )\n            )\n        return Array(inner_type)\n\n    from dagster.core.types.dagster_type import DagsterType, List, ListType\n    from dagster.core.types.python_set import Set, _TypedPythonSet\n    from dagster.core.types.python_tuple import Tuple, _TypedPythonTuple\n\n    if _is_config_type_class(dagster_type):\n        check.param_invariant(\n            False,\n            "dagster_type",\n            f"Cannot pass config type class {dagster_type} to resolve_to_config_type. "\n            "This error usually occurs when you pass a dagster config type class instead of a class instance into "\n            'another dagster config type. E.g. "Noneable(Permissive)" should instead be "Noneable(Permissive())".',\n        )\n\n    if isinstance(dagster_type, type) and issubclass(dagster_type, DagsterType):\n        raise DagsterInvalidDefinitionError(\n            "You have passed a DagsterType class {dagster_type} to the config system. "\n            "The DagsterType and config schema systems are separate. "\n            "Valid config values are:\\n{desc}".format(\n                dagster_type=repr(dagster_type),\n                desc=VALID_CONFIG_DESC,\n            )\n        )\n\n    if is_closed_python_optional_type(dagster_type):\n        raise DagsterInvalidDefinitionError(\n            "Cannot use typing.Optional as a config type. If you want this field to be "\n            "optional, please use Field(<type>, is_required=False), and if you want this field to "\n            "be required, but accept a value of None, use dagster.Noneable(<type>)."\n        )\n\n    if is_typing_type(dagster_type):\n        raise DagsterInvalidDefinitionError(\n            (\n                "You have passed in {dagster_type} to the config system. Types from "\n                "the typing module in python are not allowed in the config system. "\n                "You must use types that are imported from dagster or primitive types "\n                "such as bool, int, etc."\n            ).format(dagster_type=dagster_type)\n        )\n\n    if dagster_type is List or isinstance(dagster_type, ListType):\n        raise DagsterInvalidDefinitionError(\n            "Cannot use List in the context of config. " + helpful_list_error_string()\n        )\n\n    if dagster_type is Set or isinstance(dagster_type, _TypedPythonSet):\n        raise DagsterInvalidDefinitionError(\n            "Cannot use Set in the context of a config field. " + helpful_list_error_string()\n        )\n\n    if dagster_type is Tuple or isinstance(dagster_type, _TypedPythonTuple):\n        raise DagsterInvalidDefinitionError(\n            "Cannot use Tuple in the context of a config field. " + helpful_list_error_string()\n        )\n\n    if isinstance(dagster_type, DagsterType):\n        raise DagsterInvalidDefinitionError(\n            (\n                "You have passed an instance of DagsterType {type_name} to the config "\n                "system (Repr of type: {dagster_type}). "\n                "The DagsterType and config schema systems are separate. 
"\n                "Valid config values are:\\n{desc}"\n            ).format(\n                type_name=dagster_type.display_name,\n                dagster_type=repr(dagster_type),\n                desc=VALID_CONFIG_DESC,\n            ),\n        )\n\n    # If we are passed here either:\n    #  1) We have been passed a python builtin\n    #  2) We have been a dagster wrapping type that needs to be convert its config variant\n    #     e.g. dagster.List\n    #  2) We have been passed an invalid thing. We return False to signify this. It is\n    #     up to callers to report a reasonable error.\n\n    from dagster.primitive_mapping import (\n        is_supported_config_python_builtin,\n        remap_python_builtin_for_config,\n    )\n\n    if BuiltinEnum.contains(dagster_type):\n        return ConfigType.from_builtin_enum(dagster_type)\n\n    if is_supported_config_python_builtin(dagster_type):\n        return remap_python_builtin_for_config(dagster_type)\n\n    if dagster_type is None:\n        return ConfigAnyInstance\n\n    # This means that this is an error and we are return False to a callsite\n    # We do the error reporting there because those callsites have more context\n    return False\n\n\ndef has_implicit_default(config_type):\n    if config_type.kind == ConfigTypeKind.NONEABLE:\n        return True\n\n    return all_optional_type(config_type)\n\n\n
[docs]class Field:\n """Defines the schema for a configuration field.\n\n Fields are used in config schema instead of bare types when one wants to add a description,\n a default value, or to mark it as not required.\n\n Config fields are parsed according to their schemas in order to yield values available at\n job execution time through the config system. Config fields can be set on ops, on\n loaders and materializers for custom, and on other pluggable components of the system, such as\n resources, loggers, and executors.\n\n\n Args:\n config (Any): The schema for the config. This value can be any of:\n\n 1. A Python primitive type that resolves to a Dagster config type\n (:py:class:`~python:int`, :py:class:`~python:float`, :py:class:`~python:bool`,\n :py:class:`~python:str`, or :py:class:`~python:list`).\n\n 2. A Dagster config type:\n\n * :py:data:`~dagster.Any`\n * :py:class:`~dagster.Array`\n * :py:data:`~dagster.Bool`\n * :py:data:`~dagster.Enum`\n * :py:data:`~dagster.Float`\n * :py:data:`~dagster.Int`\n * :py:data:`~dagster.IntSource`\n * :py:data:`~dagster.Noneable`\n * :py:class:`~dagster.Permissive`\n * :py:class:`~dagster.ScalarUnion`\n * :py:class:`~dagster.Selector`\n * :py:class:`~dagster.Shape`\n * :py:data:`~dagster.String`\n * :py:data:`~dagster.StringSource`\n\n 3. A bare python dictionary, which will be automatically wrapped in\n :py:class:`~dagster.Shape`. Values of the dictionary are resolved recursively\n according to the same rules.\n\n 4. A bare python list of length one which itself is config type.\n Becomes :py:class:`Array` with list element as an argument.\n\n default_value (Any):\n A default value for this field, conformant to the schema set by the ``dagster_type``\n argument. If a default value is provided, ``is_required`` should be ``False``.\n\n Note: for config types that do post processing such as Enum, this value must be\n the pre processed version, ie use ``ExampleEnum.VALUE.name`` instead of\n ``ExampleEnum.VALUE``\n\n is_required (bool):\n Whether the presence of this field is required. Defaults to true. If ``is_required``\n is ``True``, no default value should be provided.\n\n description (str):\n A human-readable description of this config field.\n\n Examples:\n\n .. code-block:: python\n\n @op(\n config_schema={\n 'word': Field(str, description='I am a word.'),\n 'repeats': Field(Int, default_value=1, is_required=False),\n }\n )\n def repeat_word(context):\n return context.op_config['word'] * context.op_config['repeats']\n """\n\n def _resolve_config_arg(self, config):\n if isinstance(config, ConfigType):\n return config\n\n config_type = resolve_to_config_type(config)\n if not config_type:\n raise DagsterInvalidDefinitionError(\n (\n "Attempted to pass {value_repr} to a Field that expects a valid "\n "dagster type usable in config (e.g. 
Dict, Int, String et al)."\n ).format(value_repr=repr(config))\n )\n return config_type\n\n def __init__(\n self,\n config,\n default_value=FIELD_NO_DEFAULT_PROVIDED,\n is_required=None,\n description=None,\n ):\n from .post_process import resolve_defaults\n from .validate import validate_config\n\n self.config_type = check.inst(self._resolve_config_arg(config), ConfigType)\n\n self.description = check.opt_str_param(description, "description")\n\n check.opt_bool_param(is_required, "is_required")\n\n if default_value != FIELD_NO_DEFAULT_PROVIDED:\n check.param_invariant(\n not (callable(default_value)), "default_value", "default_value cannot be a callable"\n )\n\n if is_required is True:\n check.param_invariant(\n default_value == FIELD_NO_DEFAULT_PROVIDED,\n "default_value",\n "required arguments should not specify default values",\n )\n\n self._default_value = default_value\n\n # check explicit default value\n if self.default_provided:\n if self.config_type.kind == ConfigTypeKind.ENUM and is_enum_value(default_value):\n raise DagsterInvalidDefinitionError(\n (\n "You have passed into a python enum value as the default value "\n "into of a config enum type {name}. You must pass in the underlying "\n "string represention as the default value. One of {value_set}."\n ).format(\n value_set=[ev.config_value for ev in self.config_type.enum_values], # type: ignore\n name=self.config_type.given_name,\n )\n )\n\n evr = validate_config(self.config_type, default_value)\n if not evr.success:\n raise DagsterInvalidConfigError(\n "Invalid default_value for Field.",\n evr.errors,\n default_value,\n )\n\n if is_required is None:\n is_optional = has_implicit_default(self.config_type) or self.default_provided\n is_required = not is_optional\n\n # on implicitly optional - set the default value\n # by resolving the defaults of the type\n if is_optional and not self.default_provided:\n evr = resolve_defaults(self.config_type, None)\n if not evr.success:\n raise DagsterInvalidConfigError(\n "Unable to resolve implicit default_value for Field.",\n evr.errors,\n None,\n )\n self._default_value = evr.value\n self._is_required = is_required\n\n @property\n def is_required(self) -> bool:\n return self._is_required\n\n @property\n def default_provided(self) -> bool:\n """Was a default value provided\n\n Returns:\n bool: Yes or no\n """\n return self._default_value != FIELD_NO_DEFAULT_PROVIDED\n\n @property\n def default_value(self) -> Any:\n check.invariant(self.default_provided, "Asking for default value when none was provided")\n return self._default_value\n\n @property\n def default_value_as_json_str(self) -> str:\n check.invariant(self.default_provided, "Asking for default value when none was provided")\n return serialize_value(self.default_value)\n\n def __repr__(self):\n return ("Field({config_type}, default={default}, is_required={is_required})").format(\n config_type=self.config_type,\n default="@"\n if self._default_value == FIELD_NO_DEFAULT_PROVIDED\n else self._default_value,\n is_required=self.is_required,\n )
\n\n\ndef check_opt_field_param(obj, param_name):\n return check.opt_inst_param(obj, param_name, Field)\n
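Extending the docstring example above into something runnable (assuming in-process execution) shows the default being applied when the optional field is omitted:

.. code-block:: python

    from dagster import Field, job, op

    @op(
        config_schema={
            "word": Field(str, description="The word to repeat."),
            "repeats": Field(int, default_value=1, is_required=False),
        }
    )
    def repeat_word(context):
        return context.op_config["word"] * context.op_config["repeats"]

    @job
    def repeat_job():
        repeat_word()

    # "repeats" may be omitted because it is not required and has a default.
    result = repeat_job.execute_in_process(
        run_config={"ops": {"repeat_word": {"config": {"word": "ha"}}}}
    )
    assert result.output_for_node("repeat_word") == "ha"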
", "current_page_name": "_modules/dagster/config/field", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.config.field"}, "field_utils": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.config.field_utils

\n# encoding: utf-8\nimport hashlib\nfrom typing import TYPE_CHECKING, Any, Dict, List\n\nfrom dagster import check\nfrom dagster.core.errors import DagsterInvalidConfigDefinitionError\n\nfrom .config_type import Array, ConfigType, ConfigTypeKind\n\nif TYPE_CHECKING:\n    from dagster.config.field import Field\n\n\ndef all_optional_type(config_type: ConfigType) -> bool:\n    check.inst_param(config_type, "config_type", ConfigType)\n\n    if ConfigTypeKind.is_shape(config_type.kind):\n        for field in config_type.fields.values():  # type: ignore\n            if field.is_required:\n                return False\n        return True\n\n    if ConfigTypeKind.is_selector(config_type.kind):\n        if len(config_type.fields) == 1:  # type: ignore\n            for field in config_type.fields.values():  # type: ignore\n                if field.is_required:\n                    return False\n            return True\n\n    return False\n\n\nclass __FieldValueSentinel:\n    pass\n\n\nclass __InferOptionalCompositeFieldSentinel:\n    pass\n\n\nFIELD_NO_DEFAULT_PROVIDED = __FieldValueSentinel\n\nINFER_OPTIONAL_COMPOSITE_FIELD = __InferOptionalCompositeFieldSentinel\n\n\nclass _ConfigHasFields(ConfigType):\n    def __init__(self, fields, **kwargs):\n        self.fields = expand_fields_dict(fields)\n        super(_ConfigHasFields, self).__init__(**kwargs)\n\n\nFIELD_HASH_CACHE: Dict[str, Any] = {}\n\n\ndef _memoize_inst_in_field_cache(passed_cls, defined_cls, key):\n    if key in FIELD_HASH_CACHE:\n        return FIELD_HASH_CACHE[key]\n\n    defined_cls_inst = super(defined_cls, passed_cls).__new__(defined_cls)\n\n    FIELD_HASH_CACHE[key] = defined_cls_inst\n    return defined_cls_inst\n\n\ndef _add_hash(m, string):\n    m.update(string.encode("utf-8"))\n\n\ndef _compute_fields_hash(fields, description, field_aliases=None):\n\n    m = hashlib.sha1()  # so that hexdigest is 40, not 64 bytes\n    if description:\n        _add_hash(m, ":description: " + description)\n\n    for field_name in sorted(list(fields.keys())):\n        field = fields[field_name]\n        _add_hash(m, ":fieldname:" + field_name)\n        if field.default_provided:\n            _add_hash(m, ":default_value: " + field.default_value_as_json_str)\n        _add_hash(m, ":is_required: " + str(field.is_required))\n        _add_hash(m, ":type_key: " + field.config_type.key)\n        if field.description:\n            _add_hash(m, ":description: " + field.description)\n\n    field_aliases = check.opt_dict_param(\n        field_aliases, "field_aliases", key_type=str, value_type=str\n    )\n    for field_name in sorted(list(field_aliases.keys())):\n        field_alias = field_aliases[field_name]\n        _add_hash(m, ":fieldname: " + field_name)\n        _add_hash(m, ":fieldalias: " + field_alias)\n\n    return m.hexdigest()\n\n\ndef _define_shape_key_hash(fields, description, field_aliases):\n    return "Shape." + _compute_fields_hash(fields, description, field_aliases=field_aliases)\n\n\n
[docs]class Shape(_ConfigHasFields):\n """Schema for configuration data with string keys and typed values via :py:class:`Field`.\n\n Unlike :py:class:`Permissive`, unspecified fields are not allowed and will throw a\n :py:class:`~dagster.DagsterInvalidConfigError`.\n\n Args:\n fields (Dict[str, Field]):\n The specification of the config dict.\n field_aliases (Dict[str, str]):\n Maps a string key to an alias that can be used instead of the original key. For example,\n an entry {"solids": "ops"} means that someone could use "ops" instead of "solids" as a\n top level string key.\n """\n\n def __new__(\n cls,\n fields,\n description=None,\n field_aliases=None,\n ):\n return _memoize_inst_in_field_cache(\n cls,\n Shape,\n _define_shape_key_hash(expand_fields_dict(fields), description, field_aliases),\n )\n\n def __init__(\n self,\n fields,\n description=None,\n field_aliases=None,\n ):\n fields = expand_fields_dict(fields)\n super(Shape, self).__init__(\n kind=ConfigTypeKind.STRICT_SHAPE,\n key=_define_shape_key_hash(fields, description, field_aliases),\n description=description,\n fields=fields,\n )\n self.field_aliases = check.opt_dict_param(\n field_aliases, "field_aliases", key_type=str, value_type=str\n )
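A minimal sketch (hypothetical op) illustrating the strictness of ``Shape`` relative to ``Permissive``:

.. code-block:: python

    from dagster import Field, Shape, job, op

    @op(
        config_schema=Shape(
            {
                "host": Field(str),
                "port": Field(int, default_value=5432, is_required=False),
            }
        )
    )
    def connect(context):
        return f"{context.op_config['host']}:{context.op_config['port']}"

    @job
    def connect_job():
        connect()

    # Adding an unspecified key such as "username" here would raise
    # DagsterInvalidConfigError, because Shape rejects extra fields.
    connect_job.execute_in_process(
        run_config={"ops": {"connect": {"config": {"host": "localhost"}}}}
    )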
\n\n\n
[docs]class Map(ConfigType):\n """Defines a config dict with arbitrary scalar keys and typed values.\n\n A map can contrain arbitrary keys of the specified scalar type, each of which has\n type checked values. Unlike :py:class:`Shape` and :py:class:`Permissive`, scalar\n keys other than strings can be used, and unlike :py:class:`Permissive`, all\n values are type checked.\n Args:\n key_type (type):\n The type of keys this map can contain. Must be a scalar type.\n inner_type (type):\n The type of the values that this map type can contain.\n key_label_name (string):\n Optional name which describes the role of keys in the map.\n\n **Examples:**\n\n .. code-block:: python\n\n @op(config_schema=Field(Map({str: int})))\n def partially_specified_config(context) -> List:\n return sorted(list(context.op_config.items()))\n """\n\n def __init__(self, key_type, inner_type, key_label_name=None):\n from .field import resolve_to_config_type\n\n self.key_type = resolve_to_config_type(key_type)\n self.inner_type = resolve_to_config_type(inner_type)\n self.given_name = key_label_name\n\n check.inst_param(self.key_type, "key_type", ConfigType)\n check.inst_param(self.inner_type, "inner_type", ConfigType)\n check.param_invariant(\n self.key_type.kind == ConfigTypeKind.SCALAR, "key_type", "Key type must be a scalar"\n )\n check.opt_str_param(self.given_name, "name")\n\n super(Map, self).__init__(\n key="Map.{key_type}.{inner_type}{name_key}".format(\n key_type=self.key_type.key,\n inner_type=self.inner_type.key,\n name_key=f":name: {key_label_name}" if key_label_name else "",\n ),\n # We use the given name field to store the key label name\n # this is used elsewhere to give custom types names\n given_name=key_label_name,\n type_params=[self.key_type, self.inner_type],\n kind=ConfigTypeKind.MAP,\n )\n\n @property\n def key_label_name(self):\n return self.given_name
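A sketch using the two-argument form ``Map(key_type, inner_type)`` (hypothetical op); the ``{str: int}`` dict shorthand resolves to the same config type:

.. code-block:: python

    from dagster import Field, Map, job, op

    @op(config_schema=Field(Map(str, int)))
    def count_words(context):
        # Arbitrary string keys are accepted; every value must be an int.
        return sorted(context.op_config.items())

    @job
    def count_job():
        count_words()

    count_job.execute_in_process(
        run_config={"ops": {"count_words": {"config": {"foo": 1, "bar": 2}}}}
    )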
\n\n\ndef _define_permissive_dict_key(fields, description):\n return (\n "Permissive." + _compute_fields_hash(fields, description=description)\n if fields\n else "Permissive"\n )\n\n\n
[docs]class Permissive(_ConfigHasFields):\n """Defines a config dict with a partially specified schema.\n\n A permissive dict allows partial specification of the config schema. Any fields with a\n specified schema will be type checked. Other fields will be allowed, but will be ignored by\n the type checker.\n\n Args:\n fields (Dict[str, Field]): The partial specification of the config dict.\n\n **Examples:**\n\n .. code-block:: python\n\n @op(config_schema=Field(Permissive({'required': Field(String)})))\n def map_config_op(context) -> List:\n return sorted(list(context.op_config.items()))\n """\n\n def __new__(cls, fields=None, description=None):\n return _memoize_inst_in_field_cache(\n cls,\n Permissive,\n _define_permissive_dict_key(\n expand_fields_dict(fields) if fields else None, description\n ),\n )\n\n def __init__(self, fields=None, description=None):\n fields = expand_fields_dict(fields) if fields else None\n super(Permissive, self).__init__(\n key=_define_permissive_dict_key(fields, description),\n kind=ConfigTypeKind.PERMISSIVE_SHAPE,\n fields=fields or dict(),\n description=description,\n )
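The docstring example above, fleshed out with run config (hypothetical names), shows type-checked and pass-through keys side by side:

.. code-block:: python

    from dagster import Field, Permissive, String, job, op

    @op(config_schema=Field(Permissive({"required": Field(String)})))
    def partial_schema_op(context):
        return sorted(context.op_config.items())

    @job
    def partial_schema_job():
        partial_schema_op()

    # "required" is type checked; "anything_else" is allowed and passed through.
    partial_schema_job.execute_in_process(
        run_config={
            "ops": {
                "partial_schema_op": {
                    "config": {"required": "yes", "anything_else": 123}
                }
            }
        }
    )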
\n\n\ndef _define_selector_key(fields, description):\n return "Selector." + _compute_fields_hash(fields, description=description)\n\n\n
[docs]class Selector(_ConfigHasFields):\n """Define a config field requiring the user to select one option.\n\n Selectors are used when you want to be able to present several different options in config but\n allow only one to be selected. For example, a single input might be read in from either a csv\n file or a parquet file, but not both at once.\n\n Note that in some other type systems this might be called an 'input union'.\n\n Functionally, a selector is like a :py:class:`Dict`, except that only one key from the dict can\n be specified in valid config.\n\n Args:\n fields (Dict[str, Field]): The fields from which the user must select.\n\n **Examples:**\n\n .. code-block:: python\n\n @op(\n config_schema=Field(\n Selector(\n {\n 'haw': {'whom': Field(String, default_value='honua', is_required=False)},\n 'cn': {'whom': Field(String, default_value='\u4e16\u754c', is_required=False)},\n 'en': {'whom': Field(String, default_value='world', is_required=False)},\n }\n ),\n is_required=False,\n default_value={'en': {'whom': 'world'}},\n )\n )\n def hello_world_with_default(context):\n if 'haw' in context.op_config:\n return 'Aloha {whom}!'.format(whom=context.op_config['haw']['whom'])\n if 'cn' in context.op_config:\n return '\u4f60\u597d\uff0c{whom}!'.format(whom=context.op_config['cn']['whom'])\n if 'en' in context.op_config:\n return 'Hello, {whom}!'.format(whom=context.op_config['en']['whom'])\n """\n\n def __new__(cls, fields, description=None):\n return _memoize_inst_in_field_cache(\n cls,\n Selector,\n _define_selector_key(expand_fields_dict(fields), description),\n )\n\n def __init__(self, fields, description=None):\n fields = expand_fields_dict(fields)\n super(Selector, self).__init__(\n key=_define_selector_key(fields, description),\n kind=ConfigTypeKind.SELECTOR,\n fields=fields,\n description=description,\n )
\n\n\n# Config syntax expansion code below\n\n\ndef is_potential_field(potential_field: object) -> bool:\n from .field import Field, resolve_to_config_type\n\n return isinstance(potential_field, (Field, dict, list)) or bool(\n resolve_to_config_type(potential_field)\n )\n\n\ndef convert_fields_to_dict_type(fields: Dict[str, object]):\n return _convert_fields_to_dict_type(fields, fields, [])\n\n\ndef _convert_fields_to_dict_type(\n original_root: object, fields: Dict[str, object], stack: List[str]\n) -> Shape:\n return Shape(_expand_fields_dict(original_root, fields, stack))\n\n\ndef expand_fields_dict(fields: Dict[str, object]) -> Dict[str, "Field"]:\n return _expand_fields_dict(fields, fields, [])\n\n\ndef _expand_fields_dict(\n original_root: object, fields: Dict[str, object], stack: List[str]\n) -> Dict[str, "Field"]:\n check.dict_param(fields, "fields")\n return {\n name: _convert_potential_field(original_root, value, stack + [name])\n for name, value in fields.items()\n }\n\n\ndef expand_list(original_root: object, the_list: List[object], stack: List[str]) -> Array:\n\n if len(the_list) != 1:\n raise DagsterInvalidConfigDefinitionError(\n original_root, the_list, stack, "List must be of length 1"\n )\n\n inner_type = _convert_potential_type(original_root, the_list[0], stack)\n if not inner_type:\n raise DagsterInvalidConfigDefinitionError(\n original_root,\n the_list,\n stack,\n "List have a single item and contain a valid type i.e. [int]. Got item {}".format(\n repr(the_list[0])\n ),\n )\n\n return Array(inner_type)\n\n\ndef expand_map(original_root: object, the_dict: Dict[object, object], stack: List[str]) -> Map:\n\n if len(the_dict) != 1:\n raise DagsterInvalidConfigDefinitionError(\n original_root, the_dict, stack, "Map dict must be of length 1"\n )\n\n key = list(the_dict.keys())[0]\n key_type = _convert_potential_type(original_root, key, stack)\n if not key_type or not key_type.kind == ConfigTypeKind.SCALAR:\n raise DagsterInvalidConfigDefinitionError(\n original_root,\n the_dict,\n stack,\n "Map dict must have a scalar type as its only key. Got key {}".format(repr(key)),\n )\n\n inner_type = _convert_potential_type(original_root, the_dict[key], stack)\n if not inner_type:\n raise DagsterInvalidConfigDefinitionError(\n original_root,\n the_dict,\n stack,\n "Map must have a single value and contain a valid type i.e. {{str: int}}. 
Got item {}".format(\n repr(the_dict[key])\n ),\n )\n\n return Map(key_type, inner_type)\n\n\ndef convert_potential_field(potential_field: object) -> "Field":\n return _convert_potential_field(potential_field, potential_field, [])\n\n\ndef _convert_potential_type(original_root: object, potential_type, stack: List[str]):\n from .field import resolve_to_config_type\n\n if isinstance(potential_type, dict):\n # A dictionary, containing a single key which is a type (int, str, etc) and not a string is interpreted as a Map\n if len(potential_type) == 1:\n key = list(potential_type.keys())[0]\n if not isinstance(key, str) and _convert_potential_type(original_root, key, stack):\n return expand_map(original_root, potential_type, stack)\n\n # Otherwise, the dictionary is interpreted as a Shape\n return Shape(_expand_fields_dict(original_root, potential_type, stack))\n\n if isinstance(potential_type, list):\n return expand_list(original_root, potential_type, stack)\n\n return resolve_to_config_type(potential_type)\n\n\ndef _convert_potential_field(\n original_root: object, potential_field: object, stack: List[str]\n) -> "Field":\n from .field import Field\n\n if potential_field is None:\n raise DagsterInvalidConfigDefinitionError(\n original_root, potential_field, stack, reason="Fields cannot be None"\n )\n\n if not is_potential_field(potential_field):\n raise DagsterInvalidConfigDefinitionError(original_root, potential_field, stack)\n\n if isinstance(potential_field, Field):\n return potential_field\n\n return Field(_convert_potential_type(original_root, potential_field, stack))\n
", "current_page_name": "_modules/dagster/config/field_utils", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.config.field_utils"}}, "core": {"asset_defs": {"asset_group": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.asset_defs.asset_group

\nimport inspect\nimport os\nimport pkgutil\nimport re\nimport warnings\nfrom importlib import import_module\nfrom types import ModuleType\nfrom typing import (\n    Any,\n    Dict,\n    Generator,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nfrom dagster import check\nfrom dagster.core.definitions.events import AssetKey\nfrom dagster.core.storage.fs_asset_io_manager import fs_asset_io_manager\nfrom dagster.utils import merge_dicts\nfrom dagster.utils.backcompat import ExperimentalWarning\n\nfrom ..definitions.executor_definition import ExecutorDefinition\nfrom ..definitions.job_definition import JobDefinition\nfrom ..definitions.op_definition import OpDefinition\nfrom ..definitions.resource_definition import ResourceDefinition\nfrom ..errors import DagsterInvalidDefinitionError\nfrom .assets import AssetsDefinition\nfrom .assets_job import build_assets_job, build_root_manager, build_source_assets_by_key\nfrom .source_asset import SourceAsset\n\n\n
[docs]class AssetGroup(\n NamedTuple(\n "_AssetGroup",\n [\n ("assets", Sequence[AssetsDefinition]),\n ("source_assets", Sequence[SourceAsset]),\n ("resource_defs", Mapping[str, ResourceDefinition]),\n ("executor_def", Optional[ExecutorDefinition]),\n ],\n )\n):\n """Defines a group of assets, along with environment information in the\n form of resources and an executor.\n\n An AssetGroup can be provided to a :py:class:`RepositoryDefinition`. When\n provided to a repository, the constituent assets can be materialized from\n Dagit. The AssetGroup also provides an interface for creating jobs from\n subselections of assets, which can then be provided to a\n :py:class:`ScheduleDefinition` or :py:class:`SensorDefinition`.\n\n There can only be one AssetGroup per repository.\n\n Args:\n assets (Sequence[AssetsDefinition]): The set of software-defined assets\n to group.\n source_assets (Optional[Sequence[SourceAsset]]): The set of source\n assets that the software-defined may depend on.\n resource_defs (Optional[Mapping[str, ResourceDefinition]]): A\n dictionary of resource definitions. When the AssetGroup is\n constructed, if there are any unsatisfied resource requirements\n from the assets, it will result in an error. Note that the\n `root_manager` key is a reserved resource key, and will result in\n an error if provided by the user.\n executor_def (Optional[ExecutorDefinition]): The executor definition to\n use when re-materializing assets in this group.\n\n Examples:\n\n .. code-block:: python\n\n from dagster import AssetGroup, asset, AssetIn, AssetKey, SourceAsset, resource\n\n source_asset = SourceAsset("source")\n\n @asset(required_resource_keys={"foo"})\n def start_asset(context, source):\n ...\n\n @asset\n def next_asset(start_asset):\n ...\n\n @resource\n def foo_resource():\n ...\n\n asset_group = AssetGroup(\n assets=[start_asset, next_asset],\n source_assets=[source_asset],\n resource_defs={"foo": foo_resource},\n )\n ...\n\n """\n\n def __new__(\n cls,\n assets: Sequence[AssetsDefinition],\n source_assets: Optional[Sequence[SourceAsset]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n executor_def: Optional[ExecutorDefinition] = None,\n ):\n check.sequence_param(assets, "assets", of_type=AssetsDefinition)\n source_assets = check.opt_sequence_param(\n source_assets, "source_assets", of_type=SourceAsset\n )\n resource_defs = check.opt_mapping_param(\n resource_defs, "resource_defs", key_type=str, value_type=ResourceDefinition\n )\n executor_def = check.opt_inst_param(executor_def, "executor_def", ExecutorDefinition)\n\n source_assets_by_key = build_source_assets_by_key(source_assets)\n root_manager = build_root_manager(source_assets_by_key)\n\n if "root_manager" in resource_defs:\n raise DagsterInvalidDefinitionError(\n "Resource dictionary included resource with key 'root_manager', "\n "which is a reserved resource keyword in Dagster. 
Please change "\n "this key, and then change all places that require this key to "\n "a new value."\n )\n # In the case of collisions, merge_dicts takes values from the\n # dictionary latest in the list, so we place the user provided resource\n # defs after the defaults.\n resource_defs = merge_dicts(\n {"root_manager": root_manager, "io_manager": fs_asset_io_manager},\n resource_defs,\n )\n\n _validate_resource_reqs_for_asset_group(\n asset_list=assets, source_assets=source_assets, resource_defs=resource_defs\n )\n\n return super(AssetGroup, cls).__new__(\n cls,\n assets=assets,\n source_assets=source_assets,\n resource_defs=resource_defs,\n executor_def=executor_def,\n )\n\n
[docs] @staticmethod\n def all_assets_job_name() -> str:\n """The name of the mega-job that the provided list of assets is coerced into."""\n return "__ASSET_GROUP"
\n\n
[docs] def build_job(\n self,\n name: str,\n selection: Optional[Union[str, List[str]]] = None,\n executor_def: Optional[ExecutorDefinition] = None,\n tags: Optional[Dict[str, Any]] = None,\n description: Optional[str] = None,\n ) -> JobDefinition:\n """Defines an executable job from the provided assets, resources, and executor.\n\n Args:\n name (str): The name to give the job.\n selection (Union[str, List[str]]): A single selection query or list of selection queries\n to execute. For example:\n\n - ``['some_asset_key']`` select ``some_asset_key`` itself.\n - ``['*some_asset_key']`` select ``some_asset_key`` and all its ancestors (upstream dependencies).\n - ``['*some_asset_key+++']`` select ``some_asset_key``, all its ancestors, and its descendants (downstream dependencies) within 3 levels down.\n - ``['*some_asset_key', 'other_asset_key_a', 'other_asset_key_b+']`` select ``some_asset_key`` and all its ancestors, ``other_asset_key_a`` itself, and ``other_asset_key_b`` and its direct child asset keys. When subselecting into a multi-asset, all of the asset keys in that multi-asset must be selected.\n\n executor_def (Optional[ExecutorDefinition]): The executor\n definition to use when executing the job. Defaults to the\n executor on the AssetGroup. If no executor was provided on the\n AssetGroup, then it defaults to :py:class:`multi_or_in_process_executor`.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for any execution of the job.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. These tag values may be overwritten\n tag values provided at invocation time.\n description (Optional[str]): A description of the job.\n\n Examples:\n\n .. code-block:: python\n\n from dagster import AssetGroup\n\n the_asset_group = AssetGroup(...)\n\n job_with_all_assets = the_asset_group.build_job()\n\n job_with_one_selection = the_asset_group.build_job(selection="some_asset")\n\n job_with_multiple_selections = the_asset_group.build_job(selection=["*some_asset", "other_asset++"])\n """\n\n from dagster.core.selector.subset_selector import parse_op_selection\n\n check.str_param(name, "name")\n\n if not isinstance(selection, str):\n selection = check.opt_list_param(selection, "selection", of_type=str)\n executor_def = check.opt_inst_param(executor_def, "executor_def", ExecutorDefinition)\n description = check.opt_str_param(description, "description")\n\n with warnings.catch_warnings():\n warnings.simplefilter("ignore", category=ExperimentalWarning)\n mega_job_def = build_assets_job(\n name=name,\n assets=self.assets,\n source_assets=self.source_assets,\n resource_defs=self.resource_defs,\n executor_def=self.executor_def,\n )\n\n if selection:\n op_selection = self._parse_asset_selection(selection, job_name=name)\n # We currently re-use the logic from op selection to parse the\n # asset key selection, but this has disadvantages. Eventually we\n # will want to decouple these implementations.\n # https://github.com/dagster-io/dagster/issues/6647.\n resolved_op_selection_dict = parse_op_selection(mega_job_def, op_selection)\n\n included_assets = []\n excluded_assets: List[Union[AssetsDefinition, SourceAsset]] = list(self.source_assets)\n\n op_names = set(list(resolved_op_selection_dict.keys()))\n\n for asset in self.assets:\n if asset.op.name in op_names:\n included_assets.append(asset)\n else:\n excluded_assets.append(asset)\n else:\n included_assets = cast(List[AssetsDefinition], self.assets)\n # Call to list(...) 
serves as a copy constructor, so that we don't\n # accidentally add to the original list\n excluded_assets = list(self.source_assets)\n\n with warnings.catch_warnings():\n warnings.simplefilter("ignore", category=ExperimentalWarning)\n asset_job = build_assets_job(\n name=name,\n assets=included_assets,\n source_assets=excluded_assets,\n resource_defs=self.resource_defs,\n executor_def=self.executor_def,\n description=description,\n tags=tags,\n )\n return asset_job
\n\n def _parse_asset_selection(self, selection: Union[str, List[str]], job_name: str) -> List[str]:\n """Convert selection over asset keys to selection over ops"""\n\n asset_keys_to_ops: Dict[str, List[OpDefinition]] = {}\n op_names_to_asset_keys: Dict[str, Set[str]] = {}\n seen_asset_keys: Set[str] = set()\n\n if isinstance(selection, str):\n selection = [selection]\n\n if len(selection) == 1 and selection[0] == "*":\n return selection\n\n source_asset_keys = set()\n\n for asset in self.assets:\n if asset.op.name not in op_names_to_asset_keys:\n op_names_to_asset_keys[asset.op.name] = set()\n for asset_key in asset.asset_keys:\n asset_key_as_str = ".".join([piece for piece in asset_key.path])\n op_names_to_asset_keys[asset.op.name].add(asset_key_as_str)\n if not asset_key_as_str in asset_keys_to_ops:\n asset_keys_to_ops[asset_key_as_str] = []\n asset_keys_to_ops[asset_key_as_str].append(asset.op)\n\n for asset in self.source_assets:\n if isinstance(asset, SourceAsset):\n asset_key_as_str = ".".join([piece for piece in asset.key.path])\n source_asset_keys.add(asset_key_as_str)\n else:\n for asset_key in asset.asset_keys:\n asset_key_as_str = ".".join([piece for piece in asset_key.path])\n source_asset_keys.add(asset_key_as_str)\n\n op_selection = []\n\n for clause in selection:\n token_matching = re.compile(r"^(\\*?\\+*)?([.\\w\\d\\[\\]?_-]+)(\\+*\\*?)?$").search(\n clause.strip()\n )\n parts = token_matching.groups() if token_matching is not None else None\n if parts is None:\n raise DagsterInvalidDefinitionError(\n f"When attempting to create job '{job_name}', the clause "\n f"{clause} within the asset key selection was invalid. Please "\n "review the selection syntax here: "\n "https://docs.dagster.io/concepts/ops-jobs-graphs/job-execution#op-selection-syntax."\n )\n upstream_part, key_str, downstream_part = parts\n\n # Error if you express a clause in terms of a source asset key.\n # Eventually we will want to support selection over source asset\n # keys as a means of running downstream ops.\n # https://github.com/dagster-io/dagster/issues/6647\n if key_str in source_asset_keys:\n raise DagsterInvalidDefinitionError(\n f"When attempting to create job '{job_name}', the clause '"\n f"{clause}' selects asset_key '{key_str}', which comes from "\n "a source asset. Source assets can't be materialized, and "\n "therefore can't be subsetted into a job. Please choose a "\n "subset on asset keys that are materializable - that is, "\n f"included on assets within the group. Valid assets: {list(asset_keys_to_ops.keys())}"\n )\n if key_str not in asset_keys_to_ops:\n raise DagsterInvalidDefinitionError(\n f"When attempting to create job '{job_name}', the clause "\n f"'{clause}' within the asset key selection did not match "\n f"any asset keys. Present asset keys: {list(asset_keys_to_ops.keys())}"\n )\n\n seen_asset_keys.add(key_str)\n\n for op in asset_keys_to_ops[key_str]:\n\n op_clause = f"{upstream_part}{op.name}{downstream_part}"\n op_selection.append(op_clause)\n\n # Verify that for each selected asset key, the corresponding op had all\n # asset keys selected. 
Eventually, we will want to have specific syntax\n # that allows for selecting all asset keys for a given multi-asset\n # https://github.com/dagster-io/dagster/issues/6647.\n for op_name, asset_key_set in op_names_to_asset_keys.items():\n are_keys_in_set = [key in seen_asset_keys for key in asset_key_set]\n if any(are_keys_in_set) and not all(are_keys_in_set):\n raise DagsterInvalidDefinitionError(\n f"When building job '{job_name}', the asset '{op_name}' "\n f"contains asset keys {sorted(list(asset_key_set))}, but "\n f"attempted to select only {sorted(list(asset_key_set.intersection(seen_asset_keys)))}. "\n "Selecting only some of the asset keys for a particular "\n "asset is not yet supported behavior. Please select all "\n "asset keys produced by a given asset when subsetting."\n )\n return op_selection\n\n
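Pulling the pieces above together, a minimal sketch (hypothetical asset and job names; assumes the AssetGroup can be listed in a repository alongside the schedule) of building a job from a selection and putting it on a schedule:

.. code-block:: python

    from dagster import AssetGroup, ScheduleDefinition, asset, repository

    @asset
    def upstream():
        return 1

    @asset
    def downstream(upstream):
        return upstream + 1

    group = AssetGroup(assets=[upstream, downstream])

    # "*downstream" selects downstream and all of its ancestors.
    nightly_job = group.build_job(name="nightly_refresh", selection="*downstream")
    nightly_schedule = ScheduleDefinition(job=nightly_job, cron_schedule="0 2 * * *")

    @repository
    def my_repo():
        return [group, nightly_schedule]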
[docs] @staticmethod\n def from_package_module(\n package_module: ModuleType,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n executor_def: Optional[ExecutorDefinition] = None,\n ) -> "AssetGroup":\n """\n Constructs an AssetGroup that includes all asset definitions and source assets in all\n sub-modules of the given package module.\n\n A package module is the result of importing a package.\n\n Args:\n package_module (ModuleType): The package module to looks for assets inside.\n resource_defs (Optional[Mapping[str, ResourceDefinition]]): A dictionary of resource\n definitions to include on the returned asset group.\n executor_def (Optional[ExecutorDefinition]): An executor to include on the returned\n asset group.\n\n Returns:\n AssetGroup: An asset group with all the assets in the package.\n """\n return AssetGroup.from_modules(\n _find_modules_in_package(package_module),\n resource_defs=resource_defs,\n executor_def=executor_def,\n )
\n\n
[docs] @staticmethod\n def from_package_name(\n package_name: str,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n executor_def: Optional[ExecutorDefinition] = None,\n ) -> "AssetGroup":\n """\n Constructs an AssetGroup that includes all asset definitions and source assets in all\n sub-modules of the given package.\n\n Args:\n package_name (str): The name of a Python package to look for assets inside.\n resource_defs (Optional[Mapping[str, ResourceDefinition]]): A dictionary of resource\n definitions to include on the returned asset group.\n executor_def (Optional[ExecutorDefinition]): An executor to include on the returned\n asset group.\n\n Returns:\n AssetGroup: An asset group with all the assets in the package.\n """\n package_module = import_module(package_name)\n return AssetGroup.from_package_module(\n package_module, resource_defs=resource_defs, executor_def=executor_def\n )
\n\n
[docs] @staticmethod\n def from_modules(\n modules: Iterable[ModuleType],\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n executor_def: Optional[ExecutorDefinition] = None,\n ) -> "AssetGroup":\n """\n Constructs an AssetGroup that includes all asset definitions and source assets in the given\n modules.\n\n Args:\n modules (Iterable[ModuleType]): The Python modules to look for assets inside.\n resource_defs (Optional[Mapping[str, ResourceDefinition]]): A dictionary of resource\n definitions to include on the returned asset group.\n executor_def (Optional[ExecutorDefinition]): An executor to include on the returned\n asset group.\n\n Returns:\n AssetGroup: An asset group with all the assets defined in the given modules.\n """\n asset_ids: Set[int] = set()\n asset_keys: Dict[AssetKey, ModuleType] = dict()\n source_assets: List[SourceAsset] = []\n assets: List[AssetsDefinition] = []\n for module in modules:\n for asset in _find_assets_in_module(module):\n if id(asset) not in asset_ids:\n asset_ids.add(id(asset))\n keys = asset.asset_keys if isinstance(asset, AssetsDefinition) else [asset.key]\n for key in keys:\n if key in asset_keys:\n modules_str = ", ".join(\n set([asset_keys[key].__name__, module.__name__])\n )\n raise DagsterInvalidDefinitionError(\n f"Asset key {key} is defined multiple times. Definitions found in modules: {modules_str}."\n )\n else:\n asset_keys[key] = module\n if isinstance(asset, SourceAsset):\n source_assets.append(asset)\n else:\n assets.append(asset)\n\n return AssetGroup(\n assets=assets,\n source_assets=source_assets,\n resource_defs=resource_defs,\n executor_def=executor_def,\n )
\n\n
[docs] @staticmethod\n def from_current_module(\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n executor_def: Optional[ExecutorDefinition] = None,\n ) -> "AssetGroup":\n """\n Constructs an AssetGroup that includes all asset definitions and source assets in the module\n where this is called from.\n\n Args:\n resource_defs (Optional[Mapping[str, ResourceDefinition]]): A dictionary of resource\n definitions to include on the returned asset group.\n executor_def (Optional[ExecutorDefinition]): An executor to include on the returned\n asset group.\n\n Returns:\n AssetGroup: An asset group with all the assets defined in the module.\n """\n caller = inspect.stack()[1]\n module = inspect.getmodule(caller[0])\n if module is None:\n check.failed("Could not find a module for the caller")\n return AssetGroup.from_modules([module], resource_defs, executor_def)
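A short usage sketch of the ``from_*`` constructors above, assuming ``AssetGroup`` and ``asset`` are importable from the top-level ``dagster`` package (the package name in the commented-out line is illustrative):

.. code-block:: python

    from dagster import AssetGroup, asset

    @asset
    def my_asset():
        return 1

    # Collect every asset and source asset defined in the calling module ...
    group = AssetGroup.from_current_module()

    # ... or walk an installed package by name (illustrative package name):
    # group = AssetGroup.from_package_name("my_company.assets")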
\n\n\ndef _find_assets_in_module(\n module: ModuleType,\n) -> Generator[Union[AssetsDefinition, SourceAsset], None, None]:\n """\n Finds assets in the given module and adds them to the given sets of assets and source assets.\n """\n for attr in dir(module):\n value = getattr(module, attr)\n if isinstance(value, (AssetsDefinition, SourceAsset)):\n yield value\n elif isinstance(value, list) and all(\n isinstance(el, (AssetsDefinition, SourceAsset)) for el in value\n ):\n yield from value\n\n\ndef _find_modules_in_package(package_module: ModuleType) -> Iterable[ModuleType]:\n yield package_module\n package_path = package_module.__file__\n if package_path:\n for _, modname, is_pkg in pkgutil.walk_packages([os.path.dirname(package_path)]):\n submodule = import_module(f"{package_module.__name__}.{modname}")\n if is_pkg:\n yield from _find_modules_in_package(submodule)\n else:\n yield submodule\n else:\n raise ValueError(\n f"Tried find modules in package {package_module}, but its __file__ is None"\n )\n\n\ndef _validate_resource_reqs_for_asset_group(\n asset_list: Sequence[AssetsDefinition],\n source_assets: Sequence[SourceAsset],\n resource_defs: Mapping[str, ResourceDefinition],\n):\n present_resource_keys = set(resource_defs.keys())\n for asset_def in asset_list:\n resource_keys = set(asset_def.op.required_resource_keys or {})\n missing_resource_keys = list(set(resource_keys) - present_resource_keys)\n if missing_resource_keys:\n raise DagsterInvalidDefinitionError(\n f"AssetGroup is missing required resource keys for asset '{asset_def.op.name}'. Missing resource keys: {missing_resource_keys}"\n )\n\n for asset_key, output_def in asset_def.output_defs_by_asset_key.items():\n if output_def.io_manager_key and output_def.io_manager_key not in present_resource_keys:\n raise DagsterInvalidDefinitionError(\n f"Output '{output_def.name}' with AssetKey '{asset_key}' "\n f"requires io manager '{output_def.io_manager_key}' but was "\n f"not provided on asset group. Provided resources: {sorted(list(present_resource_keys))}"\n )\n\n for source_asset in source_assets:\n if source_asset.io_manager_key and source_asset.io_manager_key not in present_resource_keys:\n raise DagsterInvalidDefinitionError(\n f"SourceAsset with key {source_asset.key} requires io manager "\n f"with key '{source_asset.io_manager_key}', which was not "\n f"provided on AssetGroup. Provided keys: {sorted(list(present_resource_keys))}"\n )\n\n for resource_key, resource_def in resource_defs.items():\n resource_keys = set(resource_def.required_resource_keys)\n missing_resource_keys = sorted(list(set(resource_keys) - present_resource_keys))\n if missing_resource_keys:\n raise DagsterInvalidDefinitionError(\n "AssetGroup is missing required resource keys for resource '"\n f"{resource_key}'. Missing resource keys: {missing_resource_keys}"\n )\n
", "current_page_name": "_modules/dagster/core/asset_defs/asset_group", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.asset_defs.asset_group"}, "asset_in": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.asset_defs.asset_in

\nfrom typing import Any, Mapping, NamedTuple, Optional, Sequence\n\nfrom dagster import check\nfrom dagster.core.definitions.events import AssetKey\n\n\n
[docs]class AssetIn(\n NamedTuple(\n "_AssetIn",\n [\n ("asset_key", Optional[AssetKey]),\n ("metadata", Optional[Mapping[str, Any]]),\n ("namespace", Optional[Sequence[str]]),\n ],\n )\n):\n def __new__(\n cls,\n asset_key: Optional[AssetKey] = None,\n metadata: Optional[Mapping[str, Any]] = None,\n namespace: Optional[Sequence[str]] = None,\n ):\n check.invariant(\n not (asset_key and namespace),\n ("Asset key and namespace cannot both be set on AssetIn"),\n )\n\n # if user inputs a single string, coerce to list\n namespace = [namespace] if isinstance(namespace, str) else namespace\n\n return super(AssetIn, cls).__new__(\n cls,\n asset_key=check.opt_inst_param(asset_key, "asset_key", AssetKey),\n metadata=check.opt_inst_param(metadata, "metadata", Mapping),\n namespace=check.opt_list_param(namespace, "namespace", str),\n )
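``AssetIn`` is consumed through the ``ins`` argument of the ``@asset`` decorator documented elsewhere on this page; a minimal sketch, with illustrative asset and namespace names:

.. code-block:: python

    from dagster import AssetIn, asset

    @asset(ins={"events": AssetIn(namespace=["raw"])})
    def cleaned_events(events):
        # Depends on the asset with key ["raw", "events"] and itself
        # materializes the asset key ["cleaned_events"].
        return [e for e in events if e is not None]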
\n
", "current_page_name": "_modules/dagster/core/asset_defs/asset_in", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.asset_defs.asset_in"}, "assets_job": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.asset_defs.assets_job

\nimport itertools\nimport warnings\nfrom typing import AbstractSet, Any, Dict, Mapping, Optional, Sequence, Tuple, Union, cast\n\nfrom dagster import check\nfrom dagster.core.definitions.config import ConfigMapping\nfrom dagster.core.definitions.decorators.op_decorator import op\nfrom dagster.core.definitions.dependency import (\n    DependencyDefinition,\n    IDependencyDefinition,\n    NodeInvocation,\n)\nfrom dagster.core.definitions.events import AssetKey\nfrom dagster.core.definitions.executor_definition import ExecutorDefinition\nfrom dagster.core.definitions.graph_definition import GraphDefinition\nfrom dagster.core.definitions.job_definition import JobDefinition\nfrom dagster.core.definitions.op_definition import OpDefinition\nfrom dagster.core.definitions.output import Out, OutputDefinition\nfrom dagster.core.definitions.partition import PartitionedConfig, PartitionsDefinition\nfrom dagster.core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster.core.definitions.resource_definition import ResourceDefinition\nfrom dagster.core.errors import DagsterInvalidDefinitionError\nfrom dagster.core.execution.context.input import InputContext, build_input_context\nfrom dagster.core.execution.context.output import build_output_context\nfrom dagster.core.storage.fs_asset_io_manager import fs_asset_io_manager\nfrom dagster.core.storage.root_input_manager import RootInputManagerDefinition, root_input_manager\nfrom dagster.utils.backcompat import ExperimentalWarning, experimental\nfrom dagster.utils.merger import merge_dicts\n\nfrom .asset_partitions import get_upstream_partitions_for_partition_range\nfrom .assets import AssetsDefinition\nfrom .source_asset import SourceAsset\n\n\n
[docs]@experimental\ndef build_assets_job(\n name: str,\n assets: Sequence[AssetsDefinition],\n source_assets: Optional[Sequence[Union[SourceAsset, AssetsDefinition]]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n description: Optional[str] = None,\n config: Optional[Union[ConfigMapping, Dict[str, Any], PartitionedConfig]] = None,\n tags: Optional[Dict[str, Any]] = None,\n executor_def: Optional[ExecutorDefinition] = None,\n) -> JobDefinition:\n """Builds a job that materializes the given assets.\n\n The dependencies between the ops in the job are determined by the asset dependencies defined\n in the metadata on the provided asset nodes.\n\n Args:\n name (str): The name of the job.\n assets (List[AssetsDefinition]): A list of assets or\n multi-assets - usually constructed using the :py:func:`@asset` or :py:func:`@multi_asset`\n decorator.\n source_assets (Optional[Sequence[Union[SourceAsset, AssetsDefinition]]]): A list of\n assets that are not materialized by this job, but that assets in this job depend on.\n resource_defs (Optional[Dict[str, ResourceDefinition]]): Resource defs to be included in\n this job.\n description (Optional[str]): A description of the job.\n\n Examples:\n\n .. code-block:: python\n\n @asset\n def asset1():\n return 5\n\n @asset\n def asset2(asset1):\n return my_upstream_asset + 1\n\n my_assets_job = build_assets_job("my_assets_job", assets=[asset1, asset2])\n\n Returns:\n JobDefinition: A job that materializes the given assets.\n """\n check.str_param(name, "name")\n check.sequence_param(assets, "assets", of_type=AssetsDefinition)\n check.opt_sequence_param(\n source_assets, "source_assets", of_type=(SourceAsset, AssetsDefinition)\n )\n check.opt_str_param(description, "description")\n source_assets_by_key = build_source_assets_by_key(source_assets)\n\n op_defs = build_op_deps(assets, source_assets_by_key.keys())\n root_manager = build_root_manager(source_assets_by_key)\n partitioned_config = build_job_partitions_from_assets(assets, source_assets or [])\n\n return GraphDefinition(\n name=name,\n node_defs=[asset.op for asset in assets],\n dependencies=op_defs,\n description=description,\n input_mappings=None,\n output_mappings=None,\n config=None,\n ).to_job(\n resource_defs=merge_dicts(\n {"io_manager": fs_asset_io_manager}, resource_defs or {}, {"root_manager": root_manager}\n ),\n config=config or partitioned_config,\n tags=tags,\n executor_def=executor_def,\n )
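To complement the docstring example above, the sketch below also threads a ``SourceAsset`` through ``build_assets_job``; asset names are illustrative, and actually executing the job assumes the source asset's data is loadable by the configured IO manager.

.. code-block:: python

    from dagster import AssetKey, SourceAsset, asset, build_assets_job

    # Produced outside this job; it is loaded here, never materialized.
    raw_events = SourceAsset(key=AssetKey("raw_events"))

    @asset
    def event_counts(raw_events):
        return len(raw_events)

    events_job = build_assets_job(
        "events_job", assets=[event_counts], source_assets=[raw_events]
    )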
\n\n\ndef build_job_partitions_from_assets(\n assets: Sequence[AssetsDefinition],\n source_assets: Sequence[Union[SourceAsset, AssetsDefinition]],\n) -> Optional[PartitionedConfig]:\n assets_with_partitions_defs = [assets_def for assets_def in assets if assets_def.partitions_def]\n\n if len(assets_with_partitions_defs) == 0:\n return None\n\n first_assets_with_partitions_def: AssetsDefinition = assets_with_partitions_defs[0]\n for assets_def in assets_with_partitions_defs:\n if assets_def.partitions_def != first_assets_with_partitions_def.partitions_def:\n first_asset_key = next(iter(assets_def.asset_keys)).to_string()\n second_asset_key = next(iter(first_assets_with_partitions_def.asset_keys)).to_string()\n raise DagsterInvalidDefinitionError(\n "When an assets job contains multiple partitions assets, they must have the "\n f"same partitions definitions, but asset '{first_asset_key}' and asset "\n f"'{second_asset_key}' have different partitions definitions. "\n )\n\n partitions_defs_by_asset_key: Dict[AssetKey, PartitionsDefinition] = {}\n asset: Union[AssetsDefinition, SourceAsset]\n for asset in itertools.chain.from_iterable([assets, source_assets]):\n if isinstance(asset, AssetsDefinition) and asset.partitions_def is not None:\n for asset_key in asset.asset_keys:\n partitions_defs_by_asset_key[asset_key] = asset.partitions_def\n elif isinstance(asset, SourceAsset) and asset.partitions_def is not None:\n partitions_defs_by_asset_key[asset.key] = asset.partitions_def\n\n def asset_partitions_for_job_partition(\n job_partition_key: str,\n ) -> Mapping[AssetKey, PartitionKeyRange]:\n return {\n asset_key: PartitionKeyRange(job_partition_key, job_partition_key)\n for assets_def in assets\n for asset_key in assets_def.asset_keys\n if assets_def.partitions_def\n }\n\n def run_config_for_partition_fn(partition_key: str) -> Dict[str, Any]:\n ops_config: Dict[str, Any] = {}\n asset_partitions_by_asset_key = asset_partitions_for_job_partition(partition_key)\n\n for assets_def in assets:\n outputs_dict: Dict[str, Dict[str, Any]] = {}\n if assets_def.partitions_def is not None:\n for asset_key, output_def in assets_def.output_defs_by_asset_key.items():\n asset_partition_key_range = asset_partitions_by_asset_key[asset_key]\n outputs_dict[output_def.name] = {\n "start": asset_partition_key_range.start,\n "end": asset_partition_key_range.end,\n }\n\n inputs_dict: Dict[str, Dict[str, Any]] = {}\n for in_asset_key, input_def in assets_def.input_defs_by_asset_key.items():\n upstream_partitions_def = partitions_defs_by_asset_key.get(in_asset_key)\n if assets_def.partitions_def is not None and upstream_partitions_def is not None:\n upstream_partition_key_range = get_upstream_partitions_for_partition_range(\n assets_def, upstream_partitions_def, in_asset_key, asset_partition_key_range\n )\n inputs_dict[input_def.name] = {\n "start": upstream_partition_key_range.start,\n "end": upstream_partition_key_range.end,\n }\n\n ops_config[assets_def.op.name] = {\n "config": {\n "assets": {\n "input_partitions": inputs_dict,\n "output_partitions": outputs_dict,\n }\n }\n }\n\n return {"ops": ops_config}\n\n return PartitionedConfig(\n partitions_def=cast(PartitionsDefinition, first_assets_with_partitions_def.partitions_def),\n run_config_for_partition_fn=lambda p: run_config_for_partition_fn(p.name),\n )\n\n\ndef build_source_assets_by_key(\n source_assets: Optional[Sequence[Union[SourceAsset, AssetsDefinition]]]\n) -> Mapping[AssetKey, Union[SourceAsset, OutputDefinition]]:\n source_assets_by_key: Dict[AssetKey, 
Union[SourceAsset, OutputDefinition]] = {}\n for asset_source in source_assets or []:\n if isinstance(asset_source, SourceAsset):\n source_assets_by_key[asset_source.key] = asset_source\n elif isinstance(asset_source, AssetsDefinition):\n for asset_key, output_def in asset_source.output_defs_by_asset_key.items():\n if asset_key:\n source_assets_by_key[asset_key] = output_def\n\n return source_assets_by_key\n\n\ndef build_op_deps(\n multi_asset_defs: Sequence[AssetsDefinition], source_paths: AbstractSet[AssetKey]\n) -> Dict[Union[str, NodeInvocation], Dict[str, IDependencyDefinition]]:\n op_outputs_by_asset: Dict[AssetKey, Tuple[OpDefinition, str]] = {}\n for multi_asset_def in multi_asset_defs:\n for asset_key, output_def in multi_asset_def.output_defs_by_asset_key.items():\n if asset_key in op_outputs_by_asset:\n raise DagsterInvalidDefinitionError(\n f"The same asset key was included for two definitions: '{asset_key.to_string()}'"\n )\n\n op_outputs_by_asset[asset_key] = (multi_asset_def.op, output_def.name)\n\n op_deps: Dict[Union[str, NodeInvocation], Dict[str, IDependencyDefinition]] = {}\n for multi_asset_def in multi_asset_defs:\n op_name = multi_asset_def.op.name\n op_deps[op_name] = {}\n for asset_key, input_def in multi_asset_def.input_defs_by_asset_key.items():\n if asset_key in op_outputs_by_asset:\n op_def, output_name = op_outputs_by_asset[asset_key]\n op_deps[op_name][input_def.name] = DependencyDefinition(op_def.name, output_name)\n elif asset_key not in source_paths and not input_def.dagster_type.is_nothing:\n raise DagsterInvalidDefinitionError(\n f"Input asset '{asset_key.to_string()}' for asset '{op_name}' is not "\n "produced by any of the provided asset ops and is not one of the provided "\n "sources"\n )\n\n return op_deps\n\n\ndef build_root_manager(\n source_assets_by_key: Mapping[AssetKey, Union[SourceAsset, OutputDefinition]]\n) -> RootInputManagerDefinition:\n source_asset_io_manager_keys = {\n source_asset.io_manager_key for source_asset in source_assets_by_key.values()\n }\n with warnings.catch_warnings():\n warnings.simplefilter("ignore", category=ExperimentalWarning)\n\n @root_input_manager(required_resource_keys=source_asset_io_manager_keys)\n def _root_manager(input_context: InputContext) -> Any:\n source_asset_key = cast(AssetKey, input_context.asset_key)\n source_asset = source_assets_by_key[source_asset_key]\n with warnings.catch_warnings():\n warnings.simplefilter("ignore", category=ExperimentalWarning)\n\n @op(out={source_asset_key.path[-1]: Out(asset_key=source_asset_key)})\n def _op():\n pass\n\n resource_config = input_context.step_context.resolved_run_config.resources[\n source_asset.io_manager_key\n ].config\n\n output_context = build_output_context(\n name=source_asset_key.path[-1],\n step_key="none",\n solid_def=_op,\n metadata=cast(Dict[str, Any], source_asset.metadata),\n resource_config=resource_config,\n )\n input_context_with_upstream = build_input_context(\n name=input_context.name,\n metadata=input_context.metadata,\n config=input_context.config,\n dagster_type=input_context.dagster_type,\n upstream_output=output_context,\n op_def=input_context.op_def,\n step_context=input_context.step_context,\n resource_config=resource_config,\n )\n\n io_manager = getattr(cast(Any, input_context.resources), source_asset.io_manager_key)\n return io_manager.load_input(input_context_with_upstream)\n\n return _root_manager\n
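For orientation, ``run_config_for_partition_fn`` above emits per-op config of the following shape; the op name, input name, output name, and partition keys below are made up for illustration.

.. code-block:: python

    # Illustrative shape only; real values come from the assets'
    # PartitionsDefinitions and the selected job partition key.
    run_config = {
        "ops": {
            "daily_table": {
                "config": {
                    "assets": {
                        "input_partitions": {
                            "upstream_table": {"start": "2022-01-01", "end": "2022-01-01"}
                        },
                        "output_partitions": {
                            "result": {"start": "2022-01-01", "end": "2022-01-01"}
                        },
                    }
                }
            }
        }
    }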
", "current_page_name": "_modules/dagster/core/asset_defs/assets_job", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.asset_defs.assets_job"}, "decorators": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.asset_defs.decorators

\nimport warnings\nfrom typing import (\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n    overload,\n)\n\nfrom dagster import check\nfrom dagster.builtins import Nothing\nfrom dagster.config import Field\nfrom dagster.core.decorator_utils import get_function_params, get_valid_name_permutations\nfrom dagster.core.definitions.decorators.op_decorator import _Op\nfrom dagster.core.definitions.events import AssetKey\nfrom dagster.core.definitions.input import In\nfrom dagster.core.definitions.output import Out\nfrom dagster.core.definitions.partition import PartitionsDefinition\nfrom dagster.core.definitions.utils import NoValueSentinel\nfrom dagster.core.errors import DagsterInvalidDefinitionError\nfrom dagster.core.types.dagster_type import DagsterType\nfrom dagster.utils.backcompat import ExperimentalWarning, experimental_decorator\n\nfrom .asset_in import AssetIn\nfrom .assets import AssetsDefinition\nfrom .partition_mapping import PartitionMapping\n\nASSET_DEPENDENCY_METADATA_KEY = ".dagster/asset_deps"\n\n\n@overload\ndef asset(\n    name: Callable[..., Any],\n) -> AssetsDefinition:\n    ...\n\n\n@overload\ndef asset(\n    name: Optional[str] = ...,\n    namespace: Optional[Sequence[str]] = ...,\n    ins: Optional[Mapping[str, AssetIn]] = ...,\n    non_argument_deps: Optional[Set[AssetKey]] = ...,\n    metadata: Optional[Mapping[str, Any]] = ...,\n    description: Optional[str] = ...,\n    required_resource_keys: Optional[Set[str]] = ...,\n    io_manager_key: Optional[str] = ...,\n    compute_kind: Optional[str] = ...,\n    dagster_type: Optional[DagsterType] = ...,\n    partitions_def: Optional[PartitionsDefinition] = ...,\n    partition_mappings: Optional[Mapping[str, PartitionMapping]] = ...,\n) -> Callable[[Callable[..., Any]], AssetsDefinition]:\n    ...\n\n\n
[docs]@experimental_decorator\ndef asset(\n name: Optional[Union[Callable[..., Any], Optional[str]]] = None,\n namespace: Optional[Sequence[str]] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n non_argument_deps: Optional[Set[AssetKey]] = None,\n metadata: Optional[Mapping[str, Any]] = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[Set[str]] = None,\n io_manager_key: Optional[str] = None,\n compute_kind: Optional[str] = None,\n dagster_type: Optional[DagsterType] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_mappings: Optional[Mapping[str, PartitionMapping]] = None,\n) -> Union[AssetsDefinition, Callable[[Callable[..., Any]], AssetsDefinition]]:\n """Create a definition for how to compute an asset.\n\n A software-defined asset is the combination of:\n 1. An asset key, e.g. the name of a table.\n 2. A function, which can be run to compute the contents of the asset.\n 3. A set of upstream assets that are provided as inputs to the function when computing the asset.\n\n Unlike an op, whose dependencies are determined by the graph it lives inside, an asset knows\n about the upstream assets it depends on. The upstream assets are inferred from the arguments\n to the decorated function. The name of the argument designates the name of the upstream asset.\n\n Args:\n name (Optional[str]): The name of the asset. If not provided, defaults to the name of the\n decorated function.\n namespace (Optional[Sequence[str]]): The namespace that the asset resides in. The namespace + the\n name forms the asset key.\n ins (Optional[Mapping[str, AssetIn]]): A dictionary that maps input names to their metadata\n and namespaces.\n non_argument_deps (Optional[Set[AssetKey]]): Set of asset keys that are upstream dependencies,\n but do not pass an input to the asset.\n metadata (Optional[Dict[str, Any]]): A dict of metadata entries for the asset.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by the op.\n io_manager_key (Optional[str]): The resource key of the IOManager used for storing the\n output of the op as an asset, and for loading it in downstream ops\n (default: "io_manager").\n compute_kind (Optional[str]): A string to represent the kind of computation that produces\n the asset, e.g. "dbt" or "spark". It will be displayed in Dagit as a badge on the asset.\n dagster_type (Optional[DagsterType]): Allows specifying type validation functions that\n will be executed on the output of the decorated function after it runs.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the asset.\n partition_mappings (Optional[Mapping[str, PartitionMapping]]): Defines how to map partition\n keys for this asset to partition keys of upstream assets. Each key in the dictionary\n correponds to one of the input assets, and each value is a PartitionMapping.\n If no entry is provided for a particular asset dependency, the partition mapping defaults\n to the default partition mapping for the partitions definition, which is typically maps\n partition keys to the same partition keys in upstream assets.\n\n Examples:\n\n .. 
code-block:: python\n\n @asset\n def my_asset(my_upstream_asset: int) -> int:\n return my_upstream_asset + 1\n """\n if callable(name):\n return _Asset()(name)\n\n def inner(fn: Callable[..., Any]) -> AssetsDefinition:\n return _Asset(\n name=cast(Optional[str], name), # (mypy bug that it can't infer name is Optional[str])\n namespace=namespace,\n ins=ins,\n non_argument_deps=non_argument_deps,\n metadata=metadata,\n description=description,\n required_resource_keys=required_resource_keys,\n io_manager_key=io_manager_key,\n compute_kind=check.opt_str_param(compute_kind, "compute_kind"),\n dagster_type=dagster_type,\n partitions_def=partitions_def,\n partition_mappings=partition_mappings,\n )(fn)\n\n return inner
\n\n\nclass _Asset:\n def __init__(\n self,\n name: Optional[str] = None,\n namespace: Optional[Sequence[str]] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n non_argument_deps: Optional[Set[AssetKey]] = None,\n metadata: Optional[Mapping[str, Any]] = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[Set[str]] = None,\n io_manager_key: Optional[str] = None,\n compute_kind: Optional[str] = None,\n dagster_type: Optional[DagsterType] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_mappings: Optional[Mapping[str, PartitionMapping]] = None,\n ):\n self.name = name\n # if user inputs a single string, coerce to list\n self.namespace = [namespace] if isinstance(namespace, str) else namespace\n self.ins = ins or {}\n self.non_argument_deps = non_argument_deps\n self.metadata = metadata\n self.description = description\n self.required_resource_keys = required_resource_keys\n self.io_manager_key = io_manager_key\n self.compute_kind = compute_kind\n self.dagster_type = dagster_type\n self.partitions_def = partitions_def\n self.partition_mappings = partition_mappings\n\n def __call__(self, fn: Callable) -> AssetsDefinition:\n asset_name = self.name or fn.__name__\n\n asset_ins = build_asset_ins(fn, self.namespace, self.ins or {}, self.non_argument_deps)\n\n partition_fn: Optional[Callable] = None\n if self.partitions_def:\n\n def partition_fn(context): # pylint: disable=function-redefined\n return [context.partition_key]\n\n out_asset_key = AssetKey(list(filter(None, [*(self.namespace or []), asset_name])))\n out = Out(\n asset_key=out_asset_key,\n metadata=self.metadata or {},\n io_manager_key=self.io_manager_key,\n dagster_type=self.dagster_type if self.dagster_type else NoValueSentinel,\n asset_partitions_def=self.partitions_def,\n asset_partitions=partition_fn,\n )\n with warnings.catch_warnings():\n warnings.simplefilter("ignore", category=ExperimentalWarning)\n op = _Op(\n name="__".join(out_asset_key.path),\n description=self.description,\n ins=asset_ins,\n out=out,\n required_resource_keys=self.required_resource_keys,\n tags={"kind": self.compute_kind} if self.compute_kind else None,\n config_schema={\n "assets": {\n "input_partitions": Field(dict, is_required=False),\n "output_partitions": Field(dict, is_required=False),\n }\n },\n )(fn)\n\n # NOTE: we can `cast` below because we know the Ins returned by `build_asset_ins` always\n # have a plain AssetKey asset key. Dynamic asset keys will be deprecated in 0.15.0, when\n # they are gone we can remove this cast.\n return AssetsDefinition(\n input_names_by_asset_key={\n cast(AssetKey, in_def.asset_key): input_name\n for input_name, in_def in asset_ins.items()\n },\n output_names_by_asset_key={out_asset_key: "result"},\n op=op,\n partitions_def=self.partitions_def,\n partition_mappings={\n cast(AssetKey, asset_ins[input_name].asset_key): partition_mapping\n for input_name, partition_mapping in self.partition_mappings.items()\n }\n if self.partition_mappings\n else None,\n )\n\n\n
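Beyond the single-argument example in the ``@asset`` docstring, the decorator arguments compose as in this sketch (the namespace, dependency key, and metadata values are illustrative):

.. code-block:: python

    from dagster import AssetKey, asset

    @asset(
        namespace=["warehouse"],
        non_argument_deps={AssetKey("raw_users")},
        compute_kind="pandas",
        metadata={"owner": "data-eng"},
    )
    def users_cleaned():
        # Materializes the asset key ["warehouse", "users_cleaned"]; the
        # raw_users dependency is tracked but not passed in as an argument.
        return []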
[docs]@experimental_decorator\ndef multi_asset(\n outs: Dict[str, Out],\n name: Optional[str] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n non_argument_deps: Optional[Set[AssetKey]] = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[Set[str]] = None,\n compute_kind: Optional[str] = None,\n internal_asset_deps: Optional[Mapping[str, Set[AssetKey]]] = None,\n) -> Callable[[Callable[..., Any]], AssetsDefinition]:\n """Create a combined definition of multiple assets that are computed using the same op and same\n upstream assets.\n\n Each argument to the decorated function references an upstream asset that this asset depends on.\n The name of the argument designates the name of the upstream asset.\n\n Args:\n name (Optional[str]): The name of the op.\n outs: (Optional[Dict[str, Out]]): The Outs representing the produced assets.\n ins (Optional[Mapping[str, AssetIn]]): A dictionary that maps input names to their metadata\n and namespaces.\n non_argument_deps (Optional[Set[AssetKey]]): Set of asset keys that are upstream dependencies,\n but do not pass an input to the multi_asset.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by the op.\n io_manager_key (Optional[str]): The resource key of the IOManager used for storing the\n output of the op as an asset, and for loading it in downstream ops\n (default: "io_manager").\n compute_kind (Optional[str]): A string to represent the kind of computation that produces\n the asset, e.g. "dbt" or "spark". It will be displayed in Dagit as a badge on the asset.\n internal_asset_deps (Optional[Mapping[str, Set[AssetKey]]]): By default, it is assumed\n that all assets produced by a multi_asset depend on all assets that are consumed by that\n multi asset. If this default is not correct, you pass in a map of output names to a\n corrected set of AssetKeys that they depend on. Any AssetKeys in this list must be either\n used as input to the asset or produced within the op.\n """\n\n check.invariant(\n all(out.asset_key is None or isinstance(out.asset_key, AssetKey) for out in outs.values()),\n "The asset_key argument for Outs supplied to a multi_asset must be a constant or None, not a function. ",\n )\n internal_asset_deps = check.opt_dict_param(\n internal_asset_deps, "internal_asset_deps", key_type=str, value_type=set\n )\n\n def inner(fn: Callable[..., Any]) -> AssetsDefinition:\n op_name = name or fn.__name__\n asset_ins = build_asset_ins(fn, None, ins or {}, non_argument_deps)\n asset_outs = build_asset_outs(op_name, outs, asset_ins, internal_asset_deps or {})\n\n with warnings.catch_warnings():\n warnings.simplefilter("ignore", category=ExperimentalWarning)\n op = _Op(\n name=op_name,\n description=description,\n ins=asset_ins,\n out=asset_outs,\n required_resource_keys=required_resource_keys,\n tags={"kind": compute_kind} if compute_kind else None,\n )(fn)\n\n # NOTE: we can `cast` below because we know the Ins returned by `build_asset_ins` always\n # have a plain AssetKey asset key. Dynamic asset keys will be deprecated in 0.15.0, when\n # they are gone we can remove this cast.\n return AssetsDefinition(\n input_names_by_asset_key={\n cast(AssetKey, in_def.asset_key): input_name\n for input_name, in_def in asset_ins.items()\n },\n output_names_by_asset_key={\n cast(AssetKey, out_def.asset_key): output_name for output_name, out_def in asset_outs.items() # type: ignore\n },\n op=op,\n )\n\n return inner
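A minimal sketch of ``@multi_asset`` with ``internal_asset_deps``, using illustrative asset names; each output asset is declared to depend on only one of the two inputs:

.. code-block:: python

    from dagster import AssetKey, Out, multi_asset

    @multi_asset(
        outs={"orders": Out(), "users": Out()},
        internal_asset_deps={
            "orders": {AssetKey("raw_orders")},
            "users": {AssetKey("raw_users")},
        },
    )
    def split_raw_data(raw_orders, raw_users):
        # One op materializing two assets from two upstream assets.
        return raw_orders, raw_users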
\n\n\ndef build_asset_outs(\n op_name: str,\n outs: Mapping[str, Out],\n ins: Mapping[str, In],\n internal_asset_deps: Mapping[str, Set[AssetKey]],\n) -> Dict[str, Out]:\n\n # if an AssetKey is not supplied, create one based off of the out's name\n asset_keys_by_out_name = {\n out_name: out.asset_key if isinstance(out.asset_key, AssetKey) else AssetKey([out_name])\n for out_name, out in outs.items()\n }\n\n # update asset_key if necessary, add metadata indicating inter asset deps\n outs = {\n out_name: out._replace(\n asset_key=asset_keys_by_out_name[out_name],\n metadata=dict(\n **(out.metadata or {}),\n **(\n {ASSET_DEPENDENCY_METADATA_KEY: internal_asset_deps[out_name]}\n if out_name in internal_asset_deps\n else {}\n ),\n ),\n )\n for out_name, out in outs.items()\n }\n\n # validate that the internal_asset_deps make sense\n valid_asset_deps = set(in_def.asset_key for in_def in ins.values())\n valid_asset_deps.update(asset_keys_by_out_name.values())\n for out_name, asset_keys in internal_asset_deps.items():\n check.invariant(\n out_name in outs,\n f"Invalid out key '{out_name}' supplied to `internal_asset_deps` argument for multi-asset "\n f"{op_name}. Must be one of the outs for this multi-asset {list(outs.keys())}.",\n )\n invalid_asset_deps = asset_keys.difference(valid_asset_deps)\n check.invariant(\n not invalid_asset_deps,\n f"Invalid asset dependencies: {invalid_asset_deps} specified in `internal_asset_deps` "\n f"argument for multi-asset '{op_name}' on key '{out_name}'. Each specified asset key "\n "must be associated with an input to the asset or produced by this asset. Valid "\n f"keys: {valid_asset_deps}",\n )\n\n return outs\n\n\ndef build_asset_ins(\n fn: Callable,\n asset_namespace: Optional[Sequence[str]],\n asset_ins: Mapping[str, AssetIn],\n non_argument_deps: Optional[AbstractSet[AssetKey]],\n) -> Dict[str, In]:\n\n non_argument_deps = check.opt_set_param(non_argument_deps, "non_argument_deps", AssetKey)\n\n params = get_function_params(fn)\n is_context_provided = len(params) > 0 and params[0].name in get_valid_name_permutations(\n "context"\n )\n input_param_names = [\n input_param.name for input_param in (params[1:] if is_context_provided else params)\n ]\n\n all_input_names = set(input_param_names) | asset_ins.keys()\n\n for in_key in asset_ins.keys():\n if in_key not in input_param_names:\n raise DagsterInvalidDefinitionError(\n f"Key '{in_key}' in provided ins dict does not correspond to any of the names "\n "of the arguments to the decorated function"\n )\n\n ins: Dict[str, In] = {}\n for input_name in all_input_names:\n asset_key = None\n\n if input_name in asset_ins:\n asset_key = asset_ins[input_name].asset_key\n metadata = asset_ins[input_name].metadata or {}\n namespace = asset_ins[input_name].namespace\n else:\n metadata = {}\n namespace = None\n\n asset_key = asset_key or AssetKey(\n list(filter(None, [*(namespace or asset_namespace or []), input_name]))\n )\n\n ins[input_name] = In(\n metadata=metadata,\n root_manager_key="root_manager",\n asset_key=asset_key,\n )\n\n for asset_key in non_argument_deps:\n stringified_asset_key = "_".join(asset_key.path)\n if stringified_asset_key:\n # cast due to mypy bug-- doesn't understand Nothing is a type\n ins[stringified_asset_key] = In(dagster_type=cast(type, Nothing), asset_key=asset_key)\n\n return ins\n
", "current_page_name": "_modules/dagster/core/asset_defs/decorators", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.asset_defs.decorators"}, "source_asset": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.asset_defs.source_asset

\nfrom typing import NamedTuple, Optional, Sequence, Union\n\nimport dagster.check as check\nfrom dagster.core.definitions.events import AssetKey\nfrom dagster.core.definitions.metadata import (\n    MetadataEntry,\n    MetadataMapping,\n    MetadataUserInput,\n    PartitionMetadataEntry,\n    normalize_metadata,\n)\nfrom dagster.core.definitions.partition import PartitionsDefinition\n\n\n
[docs]class SourceAsset(\n NamedTuple(\n "_SourceAsset",\n [\n ("key", AssetKey),\n ("metadata_entries", Sequence[Union[MetadataEntry, PartitionMetadataEntry]]),\n ("io_manager_key", str),\n ("description", Optional[str]),\n ("partitions_def", Optional[PartitionsDefinition]),\n ],\n )\n):\n """A SourceAsset represents an asset that is not generated by any Dagster op in the repository\n that it's referenced from.\n\n Attributes:\n key (AssetKey): The key of the asset.\n metadata_entries (List[MetadataEntry]): Metadata associated with the asset.\n io_manager_key (str): The key for the IOManager that will be used to load the contents of\n the asset when it's used as an input to other assets inside a job.\n description (Optional[str]): The description of the asset.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the asset.\n """\n\n def __new__(\n cls,\n key: AssetKey,\n metadata: Optional[MetadataUserInput] = None,\n io_manager_key: str = "io_manager",\n description: Optional[str] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n ):\n\n metadata = check.opt_dict_param(metadata, "metadata", key_type=str)\n metadata_entries = normalize_metadata(metadata, [], allow_invalid=True)\n\n return super().__new__(\n cls,\n key=check.inst_param(key, "key", AssetKey),\n metadata_entries=metadata_entries,\n io_manager_key=check.str_param(io_manager_key, "io_manager_key"),\n description=check.opt_str_param(description, "description"),\n partitions_def=check.opt_inst_param(\n partitions_def, "partitions_def", PartitionsDefinition\n ),\n )\n\n @property\n def metadata(self) -> MetadataMapping:\n # PartitionMetadataEntry (unstable API) case is unhandled\n return {entry.label: entry.entry_data for entry in self.metadata_entries} # type: ignore
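A brief declaration sketch for ``SourceAsset``; the key, resource key, and metadata are illustrative, and the named IO manager is assumed to be provided wherever this asset is consumed.

.. code-block:: python

    from dagster import AssetKey, SourceAsset

    raw_orders = SourceAsset(
        key=AssetKey(["warehouse", "raw_orders"]),
        description="Orders table maintained by an external ETL process.",
        io_manager_key="warehouse_io_manager",  # illustrative resource key
        metadata={"owner": "data-eng"},
    )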
\n
", "current_page_name": "_modules/dagster/core/asset_defs/source_asset", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.asset_defs.source_asset"}}, "definitions": {"config": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.config

\nfrom typing import TYPE_CHECKING, Any, Callable, Dict, NamedTuple, Optional, Union, cast\n\nfrom dagster import check\nfrom dagster.builtins import BuiltinEnum\nfrom dagster.config import ConfigType\nfrom dagster.config.post_process import resolve_defaults\nfrom dagster.config.validate import process_config, validate_config\nfrom dagster.core.definitions.definition_config_schema import IDefinitionConfigSchema\nfrom dagster.core.errors import DagsterInvalidConfigError\nfrom dagster.primitive_mapping import is_supported_config_python_builtin\n\nfrom .definition_config_schema import convert_user_facing_definition_config_schema\n\nif TYPE_CHECKING:\n    from .pipeline_definition import PipelineDefinition\n\n\ndef is_callable_valid_config_arg(config: Union[Callable[..., Any], Dict[str, Any]]) -> bool:\n    return BuiltinEnum.contains(config) or is_supported_config_python_builtin(config)\n\n\n
[docs]class ConfigMapping(\n NamedTuple(\n "_ConfigMapping",\n [\n ("config_fn", Callable[[Any], Any]),\n ("config_schema", IDefinitionConfigSchema),\n ("receive_processed_config_values", Optional[bool]),\n ],\n )\n):\n """Defines a config mapping for a graph (or job).\n\n By specifying a config mapping function, you can override the configuration for the child\n ops and graphs contained within a graph.\n\n Config mappings require the configuration schema to be specified as ``config_schema``, which will\n be exposed as the configuration schema for the graph, as well as a configuration mapping\n function, ``config_fn``, which maps the config provided to the composite solid to the config\n that will be provided to the child nodes.\n\n Args:\n config_fn (Callable[[dict], dict]): The function that will be called\n to map the graph config to a config appropriate for the child nodes.\n config_schema (ConfigSchema): The schema of the graph config.\n receive_processed_config_values (Optional[bool]): If true, config values provided to the config_fn\n will be converted to their dagster types before being passed in. For example, if this\n value is true, enum config passed to config_fn will be actual enums, while if false,\n then enum config passed to config_fn will be strings.\n """\n\n def __new__(\n cls,\n config_fn: Callable[[Any], Any],\n config_schema: Optional[Any] = None,\n receive_processed_config_values: Optional[bool] = None,\n ):\n return super(ConfigMapping, cls).__new__(\n cls,\n config_fn=check.callable_param(config_fn, "config_fn"),\n config_schema=convert_user_facing_definition_config_schema(config_schema),\n receive_processed_config_values=check.opt_bool_param(\n receive_processed_config_values, "receive_processed_config_values"\n ),\n )\n\n def resolve_from_unvalidated_config(self, config: Any) -> Any:\n """Validates config against outer config schema, and calls mapping against validated config."""\n\n receive_processed_config_values = check.opt_bool_param(\n self.receive_processed_config_values, "receive_processed_config_values", default=True\n )\n if receive_processed_config_values:\n outer_evr = process_config(\n self.config_schema.config_type,\n config,\n )\n else:\n outer_evr = validate_config(\n self.config_schema.config_type,\n config,\n )\n if not outer_evr.success:\n raise DagsterInvalidConfigError(\n "Error in config mapping ",\n outer_evr.errors,\n config,\n )\n\n outer_config = outer_evr.value\n if not receive_processed_config_values:\n outer_config = resolve_defaults(\n cast(ConfigType, self.config_schema.config_type),\n outer_config,\n ).value\n\n return self.config_fn(outer_config)\n\n def resolve_from_validated_config(self, config: Any) -> Any:\n if self.receive_processed_config_values is not None:\n check.failed(\n "`receive_processed_config_values` parameter has been set, but only applies to "\n "unvalidated config."\n )\n\n return self.config_fn(config)
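A small sketch of constructing a ``ConfigMapping``; the op name under ``ops`` is illustrative, and the mapping can be supplied anywhere a job accepts a ``config`` argument (for example ``build_assets_job`` on this page set).

.. code-block:: python

    from dagster import ConfigMapping

    def simplified_config_fn(outer):
        # Expand one top-level value into per-op run config
        # (the "load_table" op name is illustrative).
        return {"ops": {"load_table": {"config": {"table": outer["dataset"]}}}}

    simplified_config = ConfigMapping(
        config_fn=simplified_config_fn,
        config_schema={"dataset": str},
    )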
\n
", "current_page_name": "_modules/dagster/core/definitions/config", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.config"}, "configurable": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.configurable

\nfrom abc import ABC, abstractmethod\nfrom typing import Any, Callable, Dict, Optional, Union\n\nfrom dagster import Field, check\nfrom dagster.config.evaluate_value_result import EvaluateValueResult\n\nfrom .definition_config_schema import (\n    ConfiguredDefinitionConfigSchema,\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\n\n\nclass ConfigurableDefinition(ABC):\n    @property\n    @abstractmethod\n    def config_schema(self) -> Optional[IDefinitionConfigSchema]:\n        raise NotImplementedError()\n\n    @property\n    def has_config_field(self) -> bool:\n        return self.config_schema is not None and bool(self.config_schema.as_field())\n\n    @property\n    def config_field(self) -> Optional[Field]:\n        return None if not self.config_schema else self.config_schema.as_field()\n\n    # getter for typed access\n    def get_config_field(self) -> Field:\n        field = self.config_field\n        if field is None:\n            check.failed("Must check has_config_Field before calling get_config_field")\n        return field\n\n    def apply_config_mapping(self, config: Any) -> EvaluateValueResult:\n        """\n        Applies user-provided config mapping functions to the given configuration and validates the\n        results against the respective config schema.\n\n        Expects incoming config to be validated and have fully-resolved values (StringSource values\n        resolved, Enum types hydrated, etc.) via process_config() during ResolvedRunConfig\n        construction and CompositeSolid config mapping.\n\n        Args:\n            config (Any): A validated and resolved configuration dictionary matching this object's\n            config_schema\n\n        Returns (EvaluateValueResult):\n            If successful, the value is a validated and resolved configuration dictionary for the\n            innermost wrapped object after applying the config mapping transformation function.\n        """\n        # If schema is on a mapped schema this is the innermost resource (base case),\n        # so we aren't responsible for validating against anything farther down.\n        # Returns an EVR for type consistency with config_mapping_fn.\n        return (\n            self.config_schema.resolve_config(config)\n            if isinstance(self.config_schema, ConfiguredDefinitionConfigSchema)\n            else EvaluateValueResult.for_value(config)\n        )\n\n\nclass AnonymousConfigurableDefinition(ConfigurableDefinition):\n    """An interface that makes the `configured` method not accept a name argument."""\n\n    def configured(\n        self,\n        config_or_config_fn: Any,\n        config_schema: Optional[Dict[str, Any]] = None,\n        description: Optional[str] = None,\n    ):\n        """\n        Wraps this object in an object of the same type that provides configuration to the inner\n        object.\n\n        Args:\n            config_or_config_fn (Union[Any, Callable[[Any], Any]]): Either (1) Run configuration\n                that fully satisfies this object's config schema or (2) A function that accepts run\n                configuration and returns run configuration that fully satisfies this object's\n                config schema.  In the latter case, config_schema must be specified.  
When\n                passing a function, it's easiest to use :py:func:`configured`.\n            config_schema (ConfigSchema): If config_or_config_fn is a function, the config schema\n                that its input must satisfy.\n            description (Optional[str]): Description of the new definition. If not specified,\n                inherits the description of the definition being configured.\n\n        Returns (ConfigurableDefinition): A configured version of this object.\n        """\n\n        new_config_schema = ConfiguredDefinitionConfigSchema(\n            self, convert_user_facing_definition_config_schema(config_schema), config_or_config_fn\n        )\n\n        return self.copy_for_configured(description, new_config_schema, config_or_config_fn)\n\n    @abstractmethod\n    def copy_for_configured(\n        self,\n        description: Optional[str],\n        config_schema: IDefinitionConfigSchema,\n        config_or_config_fn: Union[Any, Callable[[Any], Any]],\n    ):\n        raise NotImplementedError()\n\n\nclass NamedConfigurableDefinition(ConfigurableDefinition):\n    """An interface that makes the `configured` method require a positional `name` argument."""\n\n    def configured(\n        self,\n        config_or_config_fn: Any,\n        name: str,\n        config_schema: Optional[Dict[str, Any]] = None,\n        description: Optional[str] = None,\n    ):\n        """\n        Wraps this object in an object of the same type that provides configuration to the inner\n        object.\n\n        Args:\n            config_or_config_fn (Union[Any, Callable[[Any], Any]]): Either (1) Run configuration\n                that fully satisfies this object's config schema or (2) A function that accepts run\n                configuration and returns run configuration that fully satisfies this object's\n                config schema.  In the latter case, config_schema must be specified.  When\n                passing a function, it's easiest to use :py:func:`configured`.\n            name (str): Name of the new definition. This is a required argument, as this definition\n                type has a name uniqueness constraint.\n            config_schema (ConfigSchema): If config_or_config_fn is a function, the config schema\n                that its input must satisfy.\n            description (Optional[str]): Description of the new definition. 
If not specified,\n                inherits the description of the definition being configured.\n\n        Returns (ConfigurableDefinition): A configured version of this object.\n        """\n\n        name = check.str_param(name, "name")\n\n        new_config_schema = ConfiguredDefinitionConfigSchema(\n            self, convert_user_facing_definition_config_schema(config_schema), config_or_config_fn\n        )\n\n        return self.copy_for_configured(name, description, new_config_schema, config_or_config_fn)\n\n    @abstractmethod\n    def copy_for_configured(\n        self,\n        name: str,\n        description: Optional[str],\n        config_schema: IDefinitionConfigSchema,\n        config_or_config_fn: Union[Any, Callable[[Any], Any]],\n    ):\n        raise NotImplementedError()\n\n\ndef _check_configurable_param(configurable: ConfigurableDefinition) -> Any:\n    from dagster.core.definitions.composition import PendingNodeInvocation\n\n    check.param_invariant(\n        not isinstance(configurable, PendingNodeInvocation),\n        "configurable",\n        (\n            "You have invoked `configured` on a PendingNodeInvocation (an intermediate type), which is "\n            "produced by aliasing or tagging a solid definition. To configure a solid, you must "\n            "call `configured` on either a SolidDefinition and CompositeSolidDefinition. To fix "\n            "this error, make sure to call `configured` on the definition object *before* using "\n            "the `tag` or `alias` methods. For usage examples, see "\n            "https://docs.dagster.io/concepts/configuration/configured"\n        ),\n    )\n    check.inst_param(\n        configurable,\n        "configurable",\n        ConfigurableDefinition,\n        (\n            "Only the following types can be used with the `configured` method: ResourceDefinition, "\n            "ExecutorDefinition, CompositeSolidDefinition, SolidDefinition, and LoggerDefinition. "\n            "For usage examples of `configured`, see "\n            "https://docs.dagster.io/concepts/configuration/configured"\n        ),\n    )\n\n\ndef _is_named_configurable_param(configurable: ConfigurableDefinition) -> bool:\n    return isinstance(configurable, NamedConfigurableDefinition)\n\n\n
[docs]def configured(\n configurable: ConfigurableDefinition,\n config_schema: Optional[Dict[str, Any]] = None,\n **kwargs: Any,\n):\n """\n A decorator that makes it easy to create a function-configured version of an object.\n The following definition types can be configured using this function:\n\n * :py:class:`CompositeSolidDefinition`\n * :py:class:`ExecutorDefinition`\n * :py:class:`LoggerDefinition`\n * :py:class:`ResourceDefinition`\n * :py:class:`SolidDefinition`\n\n If the config that will be supplied to the object is constant, you may alternatively invoke this\n and call the result with a dict of config values to be curried. Examples of both strategies\n below.\n\n Args:\n configurable (ConfigurableDefinition): An object that can be configured.\n config_schema (ConfigSchema): The config schema that the inputs to the decorated function\n must satisfy.\n **kwargs: Arbitrary keyword arguments that will be passed to the initializer of the returned\n object.\n\n Returns:\n (Callable[[Union[Any, Callable[[Any], Any]]], ConfigurableDefinition])\n\n **Examples:**\n\n .. code-block:: python\n\n dev_s3 = configured(s3_resource, name="dev_s3")({'bucket': 'dev'})\n\n @configured(s3_resource)\n def dev_s3(_):\n return {'bucket': 'dev'}\n\n @configured(s3_resource, {'bucket_prefix', str})\n def dev_s3(config):\n return {'bucket': config['bucket_prefix'] + 'dev'}\n """\n _check_configurable_param(configurable)\n\n if _is_named_configurable_param(configurable):\n\n def _configured(config_or_config_fn):\n fn_name = config_or_config_fn.__name__ if callable(config_or_config_fn) else None\n name = kwargs.get("name") or fn_name\n return configurable.configured(\n config_or_config_fn=config_or_config_fn,\n name=name,\n config_schema=config_schema,\n **{k: v for k, v in kwargs.items() if k != "name"},\n )\n\n return _configured\n else:\n\n def _configured(config_or_config_fn):\n return configurable.configured(\n config_schema=config_schema, config_or_config_fn=config_or_config_fn, **kwargs\n )\n\n return _configured
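A self-contained sketch of ``configured`` applied to a resource defined inline (note that a config schema passed to ``configured`` is a dict such as ``{'bucket_prefix': str}``); all names are illustrative.

.. code-block:: python

    from dagster import configured, resource

    @resource(config_schema={"bucket": str})
    def bucket_resource(init_context):
        return init_context.resource_config["bucket"]

    @configured(bucket_resource, {"bucket_prefix": str})
    def dev_bucket_resource(config):
        # Maps the simplified outer config onto the inner resource's schema.
        return {"bucket": config["bucket_prefix"] + "-dev"}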
\n
", "current_page_name": "_modules/dagster/core/definitions/configurable", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.configurable"}, "decorators": {"composite_solid_decorator": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.decorators.composite_solid_decorator

\nfrom functools import update_wrapper\nfrom typing import Any, Callable, Dict, List, Optional, Union, overload\n\nfrom dagster import check\nfrom dagster.core.decorator_utils import format_docstring_for_description\n\nfrom ..composition import do_composition, get_validated_config_mapping\nfrom ..input import InputDefinition\nfrom ..output import OutputDefinition\nfrom ..solid_definition import CompositeSolidDefinition\n\n\nclass _CompositeSolid:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        input_defs: Optional[List[InputDefinition]] = None,\n        output_defs: Optional[List[OutputDefinition]] = None,\n        description: Optional[str] = None,\n        config_schema: Optional[Any] = None,\n        config_fn: Optional[Callable[[dict], dict]] = None,\n    ):\n        self.name = check.opt_str_param(name, "name")\n        self.input_defs = check.opt_list_param(input_defs, "input_defs", InputDefinition)\n        self.output_defs = check.opt_nullable_list_param(output_defs, "output", OutputDefinition)\n        self.description = check.opt_str_param(description, "description")\n\n        self.config_schema = config_schema  # gets validated in do_composition\n        self.config_fn = check.opt_callable_param(config_fn, "config_fn")\n\n    def __call__(self, fn: Callable[..., Any]):\n        check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        config_mapping = get_validated_config_mapping(\n            self.name, self.config_schema, self.config_fn, decorator_name="composite_solid"\n        )\n\n        (\n            input_mappings,\n            output_mappings,\n            dependencies,\n            solid_defs,\n            config_mapping,\n            positional_inputs,\n        ) = do_composition(\n            "@composite_solid",\n            self.name,\n            fn,\n            self.input_defs,\n            self.output_defs,\n            config_mapping,\n            ignore_output_from_composition_fn=False,\n        )\n\n        composite_def = CompositeSolidDefinition(\n            name=self.name,\n            input_mappings=input_mappings,\n            output_mappings=output_mappings,\n            dependencies=dependencies,\n            solid_defs=solid_defs,\n            description=self.description or format_docstring_for_description(fn),\n            config_mapping=config_mapping,\n            positional_inputs=positional_inputs,\n        )\n        update_wrapper(composite_def, fn)\n        return composite_def\n\n\n@overload\ndef composite_solid(\n    name: Callable[..., Any],\n) -> CompositeSolidDefinition:\n    ...\n\n\n@overload\ndef composite_solid(\n    name: Optional[str] = ...,\n    input_defs: Optional[List[InputDefinition]] = ...,\n    output_defs: Optional[List[OutputDefinition]] = ...,\n    description: Optional[str] = ...,\n    config_schema: Optional[Dict[str, Any]] = ...,\n    config_fn: Optional[Callable[[dict], dict]] = ...,\n) -> _CompositeSolid:\n    ...\n\n\n
[docs]def composite_solid(\n name: Optional[Union[Callable[..., Any], str]] = None,\n input_defs: Optional[List[InputDefinition]] = None,\n output_defs: Optional[List[OutputDefinition]] = None,\n description: Optional[str] = None,\n config_schema: Optional[Dict[str, Any]] = None,\n config_fn: Optional[Callable[[dict], dict]] = None,\n) -> Union[CompositeSolidDefinition, _CompositeSolid]:\n """Create a composite solid with the specified parameters from the decorated composition\n function.\n\n Using this decorator allows you to build up the dependency graph of the composite by writing a\n function that invokes solids and passes the output to other solids. This is similar to the use\n of the :py:func:`@pipeline <pipeline>` decorator, with the additional ability to remap inputs,\n outputs, and config across the composite boundary.\n\n Args:\n name (Optional[str]): Name for the new composite solid. Must be unique within any\n :py:class:`PipelineDefinition` using the solid.\n description (Optional[str]): Human-readable description of the new composite solid.\n input_defs (Optional[List[InputDefinition]]):\n Information about the inputs that this composite solid maps. Information provided here\n will be combined with what can be inferred from the function signature, with these\n explicit InputDefinitions taking precedence.\n\n Uses of inputs in the body of the decorated composition function will determine\n the :py:class:`InputMappings <InputMapping>` passed to the underlying\n :py:class:`CompositeSolidDefinition`.\n output_defs (Optional[List[OutputDefinition]]):\n Information about the outputs this composite solid maps. Information provided here\n will be combined with what can be inferred from the return type signature if there\n is only one OutputDefinition.\n\n Uses of these outputs in the body of the decorated composition function, as well as the\n return value of the decorated function, will be used to infer the appropriate set of\n :py:class:`OutputMappings <OutputMapping>` for the underlying\n :py:class:`CompositeSolidDefinition`.\n\n To map multiple outputs, return a dictionary from the composition function.\n config_schema (Optional[ConfigSchema]): If the `config_fn` argument is provided, this\n argument can be provided to set the schema for outer config that is passed to the\n `config_fn`. If `config_fn` is provided, but this argument is not provided, any config\n will be accepted.\n config_fn (Callable[[dict], dict]): By specifying a config mapping\n function, you can override the configuration for the child solids contained within this\n composite solid. ``config_fn``, maps the config provided to the\n composite solid to the config that will be provided to the child solids.\n\n If this argument is provided, the `config_schema` argument can also be provided to limit\n what config values can be passed to the composite solid.\n\n Examples:\n\n .. 
code-block:: python\n\n @lambda_solid\n def add_one(num: int) -> int:\n return num + 1\n\n @composite_solid\n def add_two(num: int) -> int:\n adder_1 = add_one.alias('adder_1')\n adder_2 = add_one.alias('adder_2')\n\n return adder_2(adder_1(num))\n\n """\n if callable(name):\n check.invariant(input_defs is None)\n check.invariant(output_defs is None)\n check.invariant(description is None)\n check.invariant(config_schema is None)\n check.invariant(config_fn is None)\n return _CompositeSolid()(name)\n\n return _CompositeSolid(\n name=name,\n input_defs=input_defs,\n output_defs=output_defs,\n description=description,\n config_schema=config_schema,\n config_fn=config_fn,\n )
\n
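The ``config_fn`` behavior described in the docstring above is not exercised by its example. A minimal sketch of a composite solid with a config mapping, assuming an illustrative child solid named ``multiply`` and an outer config field named ``factor`` (both hypothetical, not part of the source above):

.. code-block:: python

    from dagster import composite_solid, solid

    @solid(config_schema={"multiplier": int})
    def multiply(context, num: int) -> int:
        return num * context.solid_config["multiplier"]

    def _scale_config(outer: dict) -> dict:
        # Map the composite's outer config onto the child solid's config,
        # keyed by the child solid's name.
        return {"multiply": {"config": {"multiplier": outer["factor"]}}}

    @composite_solid(config_schema={"factor": int}, config_fn=_scale_config)
    def scaled(num: int) -> int:
        return multiply(num)

With this mapping, callers configure ``scaled`` with a single ``factor`` value rather than configuring ``multiply`` directly.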
", "current_page_name": "_modules/dagster/core/definitions/decorators/composite_solid_decorator", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.decorators.composite_solid_decorator"}, "graph_decorator": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.decorators.graph_decorator

\nfrom functools import update_wrapper\nfrom typing import Any, Callable, Dict, List, Optional, Union, overload\n\nfrom dagster import check\nfrom dagster.core.decorator_utils import format_docstring_for_description\n\nfrom ..config import ConfigMapping\nfrom ..graph_definition import GraphDefinition\nfrom ..input import GraphIn, InputDefinition\nfrom ..output import GraphOut, OutputDefinition\n\n\nclass _Graph:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        description: Optional[str] = None,\n        input_defs: Optional[List[InputDefinition]] = None,\n        output_defs: Optional[List[OutputDefinition]] = None,\n        ins: Optional[Dict[str, GraphIn]] = None,\n        out: Optional[Union[GraphOut, Dict[str, GraphOut]]] = None,\n        tags: Optional[Dict[str, Any]] = None,\n        config_mapping: Optional[ConfigMapping] = None,\n    ):\n        self.name = check.opt_str_param(name, "name")\n        self.description = check.opt_str_param(description, "description")\n        self.input_defs = check.opt_list_param(input_defs, "input_defs", of_type=InputDefinition)\n        self.did_pass_outputs = output_defs is not None or out is not None\n        self.output_defs = check.opt_nullable_list_param(\n            output_defs, "output_defs", of_type=OutputDefinition\n        )\n        self.ins = ins\n        self.out = out\n        self.tags = tags\n        self.config_mapping = check.opt_inst_param(config_mapping, "config_mapping", ConfigMapping)\n\n    def __call__(self, fn: Callable[..., Any]) -> GraphDefinition:\n        check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        if self.ins is not None:\n            input_defs = [inp.to_definition(name) for name, inp in self.ins.items()]\n        else:\n            input_defs = check.opt_list_param(\n                self.input_defs, "input_defs", of_type=InputDefinition\n            )\n\n        if self.out is None:\n            output_defs = self.output_defs\n        elif isinstance(self.out, GraphOut):\n            output_defs = [self.out.to_definition(name=None)]\n        else:\n            check.dict_param(self.out, "out", key_type=str, value_type=GraphOut)\n            output_defs = [out.to_definition(name=name) for name, out in self.out.items()]\n\n        from dagster.core.definitions.decorators.composite_solid_decorator import do_composition\n\n        (\n            input_mappings,\n            output_mappings,\n            dependencies,\n            solid_defs,\n            config_mapping,\n            positional_inputs,\n        ) = do_composition(\n            decorator_name="@graph",\n            graph_name=self.name,\n            fn=fn,\n            provided_input_defs=input_defs,\n            provided_output_defs=output_defs,\n            ignore_output_from_composition_fn=False,\n            config_mapping=self.config_mapping,\n        )\n\n        graph_def = GraphDefinition(\n            name=self.name,\n            dependencies=dependencies,\n            node_defs=solid_defs,\n            description=self.description or format_docstring_for_description(fn),\n            input_mappings=input_mappings,\n            output_mappings=output_mappings,\n            config=config_mapping,\n            positional_inputs=positional_inputs,\n            tags=self.tags,\n        )\n        update_wrapper(graph_def, fn)\n        return graph_def\n\n\n@overload\ndef graph(name: Callable[..., Any]) -> GraphDefinition:\n    ...\n\n\n@overload\ndef 
graph(\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    input_defs: Optional[List[InputDefinition]] = ...,\n    output_defs: Optional[List[OutputDefinition]] = ...,\n    ins: Optional[Dict[str, GraphIn]] = ...,\n    out: Optional[Union[GraphOut, Dict[str, GraphOut]]] = ...,\n    tags: Optional[Dict[str, Any]] = ...,\n    config: Optional[Union[ConfigMapping, Dict[str, Any]]] = ...,\n) -> _Graph:\n    ...\n\n\n
[docs]def graph(\n name: Optional[Union[Callable[..., Any], str]] = None,\n description: Optional[str] = None,\n input_defs: Optional[List[InputDefinition]] = None,\n output_defs: Optional[List[OutputDefinition]] = None,\n ins: Optional[Dict[str, GraphIn]] = None,\n out: Optional[Union[GraphOut, Dict[str, GraphOut]]] = None,\n tags: Optional[Dict[str, Any]] = None,\n config: Optional[Union[ConfigMapping, Dict[str, Any]]] = None,\n) -> Union[GraphDefinition, _Graph]:\n """Create a graph with the specified parameters from the decorated composition function.\n\n Using this decorator allows you to build up a dependency graph by writing a\n function that invokes ops (or other graphs) and passes the output to subsequent invocations.\n\n Args:\n name (Optional[str]):\n The name of the graph. Must be unique within any :py:class:`RepositoryDefinition` containing the graph.\n description (Optional[str]):\n A human-readable description of the graph.\n input_defs (Optional[List[InputDefinition]]):\n Information about the inputs that this graph maps. Information provided here\n will be combined with what can be inferred from the function signature, with these\n explicit InputDefinitions taking precedence.\n\n Uses of inputs in the body of the decorated composition function will determine\n the :py:class:`InputMappings <InputMapping>` passed to the underlying\n :py:class:`GraphDefinition`.\n output_defs (Optional[List[OutputDefinition]]):\n Output definitions for the graph. If not provided explicitly, these will be inferred from typehints.\n\n Uses of these outputs in the body of the decorated composition function, as well as the\n return value of the decorated function, will be used to infer the appropriate set of\n :py:class:`OutputMappings <OutputMapping>` for the underlying\n :py:class:`GraphDefinition`.\n\n To map multiple outputs, return a dictionary from the composition function.\n ins (Optional[Dict[str, GraphIn]]):\n Information about the inputs that this graph maps. Information provided here\n will be combined with what can be inferred from the function signature, with these\n explicit GraphIn taking precedence.\n out (Optional[Union[GraphOut, Dict[str, GraphOut]]]):\n Information about the outputs that this graph maps. Information provided here will be\n combined with what can be inferred from the return type signature if the function does\n not use yield.\n\n To map multiple outputs, return a dictionary from the composition function.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for any execution run of the graph.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag\n values provided at invocation time.\n """\n if callable(name):\n check.invariant(description is None)\n return _Graph()(name)\n\n config_mapping = None\n # Case 1: a dictionary of config is provided, convert to config mapping.\n if config is not None and not isinstance(config, ConfigMapping):\n config = check.dict_param(config, "config", key_type=str)\n config_mapping = ConfigMapping(config_fn=lambda _: config, config_schema=None)\n # Case 2: actual config mapping is provided.\n else:\n config_mapping = config\n\n return _Graph(\n name=name,\n description=description,\n input_defs=input_defs,\n output_defs=output_defs,\n ins=ins,\n out=out,\n tags=tags,\n config_mapping=config_mapping,\n )
\n
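The ``@graph`` docstring above has no Examples section. A minimal sketch of composing ops into a graph and binding it into an executable job via ``GraphDefinition.to_job`` (op and graph names are illustrative):

.. code-block:: python

    from dagster import graph, op

    @op
    def return_one() -> int:
        return 1

    @op
    def add_one(num: int) -> int:
        return num + 1

    @graph
    def one_plus_one():
        # The body wires op invocations together at definition time;
        # it is not executed directly.
        return add_one(return_one())

    # A graph becomes executable by binding resources and config into a job.
    one_plus_one_job = one_plus_one.to_job()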
", "current_page_name": "_modules/dagster/core/definitions/decorators/graph_decorator", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.decorators.graph_decorator"}, "hook_decorator": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.decorators.hook_decorator

\nfrom functools import update_wrapper\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Callable, List, Optional, Union, cast, overload\n\nfrom dagster import check\nfrom dagster.core.errors import DagsterInvalidDefinitionError\n\nfrom ...decorator_utils import get_function_params, validate_expected_params\nfrom ..events import HookExecutionResult\nfrom ..hook_definition import HookDefinition\n\nif TYPE_CHECKING:\n    from dagster.core.events import DagsterEvent\n    from dagster.core.execution.context.hook import HookContext\n\n\ndef _validate_hook_fn_params(fn, expected_positionals):\n    params = get_function_params(fn)\n    missing_positional = validate_expected_params(params, expected_positionals)\n    if missing_positional:\n        raise DagsterInvalidDefinitionError(\n            "'{hook_name}' decorated function does not have required positional "\n            "parameter '{missing_param}'. Hook functions should only have keyword arguments "\n            "that match input names and a first positional parameter named 'context' and "\n            "a second positional parameter named 'event_list'.".format(\n                hook_name=fn.__name__, missing_param=missing_positional\n            )\n        )\n\n\nclass _Hook:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        required_resource_keys: Optional[AbstractSet[str]] = None,\n        decorated_fn: Optional[Callable[..., Any]] = None,\n    ):\n        self.name = check.opt_str_param(name, "name")\n        self.required_resource_keys = check.opt_set_param(\n            required_resource_keys, "required_resource_keys"\n        )\n        self.decorated_fn = check.opt_callable_param(decorated_fn, "decorated_fn")\n\n    def __call__(self, fn) -> HookDefinition:\n\n        check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        expected_positionals = ["context", "event_list"]\n\n        _validate_hook_fn_params(fn, expected_positionals)\n\n        hook_def = HookDefinition(\n            name=self.name or "",\n            hook_fn=fn,\n            required_resource_keys=self.required_resource_keys,\n            decorated_fn=self.decorated_fn or fn,\n        )\n        update_wrapper(cast(Callable[..., Any], hook_def), fn)\n        return hook_def\n\n\n@overload\ndef event_list_hook(\n    name: Callable[..., Any],\n) -> HookDefinition:\n    pass\n\n\n@overload\ndef event_list_hook(\n    name: Optional[str] = ...,\n    required_resource_keys: Optional[AbstractSet[str]] = ...,\n    decorated_fn: Optional[Callable[..., Any]] = ...,\n) -> _Hook:\n    pass\n\n\ndef event_list_hook(\n    name: Optional[Union[Callable[..., Any], str]] = None,\n    required_resource_keys: Optional[AbstractSet[str]] = None,\n    decorated_fn: Optional[Callable[..., Any]] = None,\n) -> Union[HookDefinition, _Hook]:\n    """Create a generic hook with the specified parameters from the decorated function.\n\n    This decorator is currently used internally by Dagster machinery to support success_hook and\n    failure_hook.\n\n    The user-defined hook function requires two parameters:\n    - A `context` object is passed as the first parameter. 
The context is an instance of\n        :py:class:`context <HookContext>`, and provides access to system\n        information, such as loggers (context.log), resources (context.resources), the solid\n        (context.solid) and its execution step (context.step) which triggers this hook.\n    - An `event_list` object is passed as the second paramter. It provides the full event list of the\n        associated execution step.\n\n    Args:\n        name (Optional[str]): The name of this hook.\n        required_resource_keys (Optional[AbstractSet[str]]): Keys for the resources required by the\n            hook.\n\n    Examples:\n\n        .. code-block:: python\n\n            @event_list_hook(required_resource_keys={'slack'})\n            def slack_on_materializations(context, event_list):\n                for event in event_list:\n                    if event.event_type == DagsterEventType.ASSET_MATERIALIZATION:\n                        message = '{solid} has materialized an asset {key}.'.format(\n                            solid=context.solid.name,\n                            key=event.asset_key\n                        )\n                        # send a slack message every time a materialization event occurs\n                        context.resources.slack.send_message(message)\n\n\n    """\n    # This case is for when decorator is used bare, without arguments.\n    # e.g. @event_list_hook versus @event_list_hook()\n    if callable(name):\n        check.invariant(required_resource_keys is None)\n        return _Hook()(name)\n\n    return _Hook(\n        name=name, required_resource_keys=required_resource_keys, decorated_fn=decorated_fn\n    )\n\n\nSuccessOrFailureHookFn = Callable[["HookContext"], Any]\n\n\n@overload\ndef success_hook(name: SuccessOrFailureHookFn) -> Union[HookDefinition, _Hook]:\n    ...\n\n\n@overload\ndef success_hook(\n    name: Optional[str] = ...,\n    required_resource_keys: Optional[AbstractSet[str]] = ...,\n) -> Callable[[SuccessOrFailureHookFn], Union[HookDefinition, _Hook]]:\n    ...\n\n\n
[docs]def success_hook(\n name: Optional[Union[SuccessOrFailureHookFn, str]] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n) -> Union[HookDefinition, _Hook, Callable[[SuccessOrFailureHookFn], Union[HookDefinition, _Hook]]]:\n """Create a hook on step success events with the specified parameters from the decorated function.\n\n Args:\n name (Optional[str]): The name of this hook.\n required_resource_keys (Optional[AbstractSet[str]]): Keys for the resources required by the\n hook.\n\n Examples:\n\n .. code-block:: python\n\n @success_hook(required_resource_keys={'slack'})\n def slack_message_on_success(context):\n message = 'op {} succeeded'.format(context.op.name)\n context.resources.slack.send_message(message)\n\n @success_hook\n def do_something_on_success(context):\n do_something()\n\n\n """\n\n def wrapper(fn: Callable[["HookContext"], Any]) -> Union[HookDefinition, _Hook]:\n\n check.callable_param(fn, "fn")\n\n expected_positionals = ["context"]\n _validate_hook_fn_params(fn, expected_positionals)\n\n if name is None or callable(name):\n _name = fn.__name__\n else:\n _name = name\n\n @event_list_hook(_name, required_resource_keys, decorated_fn=fn)\n def _success_hook(\n context: "HookContext", event_list: List["DagsterEvent"]\n ) -> HookExecutionResult:\n for event in event_list:\n if event.is_step_success:\n fn(context)\n return HookExecutionResult(hook_name=_name, is_skipped=False)\n\n # hook is skipped when fn didn't run\n return HookExecutionResult(hook_name=_name, is_skipped=True)\n\n return _success_hook\n\n # This case is for when decorator is used bare, without arguments, i.e. @success_hook\n if callable(name):\n check.invariant(required_resource_keys is None)\n return wrapper(name)\n\n return wrapper
\n\n\n@overload\ndef failure_hook(name: SuccessOrFailureHookFn) -> Union[HookDefinition, _Hook]:\n ...\n\n\n@overload\ndef failure_hook(\n name: Optional[str] = ...,\n required_resource_keys: Optional[AbstractSet[str]] = ...,\n) -> Callable[[SuccessOrFailureHookFn], Union[HookDefinition, _Hook]]:\n ...\n\n\n
[docs]def failure_hook(\n name: Optional[Union[SuccessOrFailureHookFn, str]] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n) -> Union[HookDefinition, _Hook, Callable[[SuccessOrFailureHookFn], Union[HookDefinition, _Hook]]]:\n """Create a hook on step failure events with the specified parameters from the decorated function.\n\n Args:\n name (Optional[str]): The name of this hook.\n required_resource_keys (Optional[AbstractSet[str]]): Keys for the resources required by the\n hook.\n\n Examples:\n\n .. code-block:: python\n\n @failure_hook(required_resource_keys={'slack'})\n def slack_message_on_failure(context):\n message = 'op {} failed'.format(context.op.name)\n context.resources.slack.send_message(message)\n\n @failure_hook\n def do_something_on_failure(context):\n do_something()\n\n\n """\n\n def wrapper(fn: Callable[["HookContext"], Any]) -> Union[HookDefinition, _Hook]:\n check.callable_param(fn, "fn")\n\n expected_positionals = ["context"]\n _validate_hook_fn_params(fn, expected_positionals)\n\n if name is None or callable(name):\n _name = fn.__name__\n else:\n _name = name\n\n @event_list_hook(_name, required_resource_keys, decorated_fn=fn)\n def _failure_hook(\n context: "HookContext", event_list: List["DagsterEvent"]\n ) -> HookExecutionResult:\n for event in event_list:\n if event.is_step_failure:\n fn(context)\n return HookExecutionResult(hook_name=_name, is_skipped=False)\n\n # hook is skipped when fn didn't run\n return HookExecutionResult(hook_name=_name, is_skipped=True)\n\n return _failure_hook\n\n # This case is for when decorator is used bare, without arguments, i.e. @failure_hook\n if callable(name):\n check.invariant(required_resource_keys is None)\n return wrapper(name)\n\n return wrapper
\n
", "current_page_name": "_modules/dagster/core/definitions/decorators/hook_decorator", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.decorators.hook_decorator"}, "job_decorator": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.decorators.job_decorator

\nfrom functools import update_wrapper\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Callable, Dict, Optional, Union, overload\n\nfrom dagster import check\nfrom dagster.core.decorator_utils import format_docstring_for_description\n\nfrom ..config import ConfigMapping\nfrom ..graph_definition import GraphDefinition\nfrom ..hook_definition import HookDefinition\nfrom ..job_definition import JobDefinition\nfrom ..logger_definition import LoggerDefinition\nfrom ..policy import RetryPolicy\nfrom ..resource_definition import ResourceDefinition\nfrom ..version_strategy import VersionStrategy\n\nif TYPE_CHECKING:\n    from ..executor_definition import ExecutorDefinition\n    from ..partition import PartitionedConfig, PartitionsDefinition\n\n\nclass _Job:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        description: Optional[str] = None,\n        tags: Optional[Dict[str, Any]] = None,\n        resource_defs: Optional[Dict[str, ResourceDefinition]] = None,\n        config: Optional[Union[ConfigMapping, Dict[str, Any], "PartitionedConfig"]] = None,\n        logger_defs: Optional[Dict[str, LoggerDefinition]] = None,\n        executor_def: Optional["ExecutorDefinition"] = None,\n        hooks: Optional[AbstractSet[HookDefinition]] = None,\n        op_retry_policy: Optional[RetryPolicy] = None,\n        version_strategy: Optional[VersionStrategy] = None,\n        partitions_def: Optional["PartitionsDefinition"] = None,\n    ):\n        self.name = name\n        self.description = description\n        self.tags = tags\n        self.resource_defs = resource_defs\n        self.config = config\n        self.logger_defs = logger_defs\n        self.executor_def = executor_def\n        self.hooks = hooks\n        self.op_retry_policy = op_retry_policy\n        self.version_strategy = version_strategy\n        self.partitions_def = partitions_def\n\n    def __call__(self, fn: Callable[..., Any]) -> JobDefinition:\n        check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        from dagster.core.definitions.decorators.composite_solid_decorator import do_composition\n\n        (\n            input_mappings,\n            output_mappings,\n            dependencies,\n            solid_defs,\n            config_mapping,\n            positional_inputs,\n        ) = do_composition(\n            decorator_name="@job",\n            graph_name=self.name,\n            fn=fn,\n            provided_input_defs=[],\n            provided_output_defs=[],\n            ignore_output_from_composition_fn=False,\n            config_mapping=None,\n        )\n\n        graph_def = GraphDefinition(\n            name=self.name,\n            dependencies=dependencies,\n            node_defs=solid_defs,\n            description=self.description or format_docstring_for_description(fn),\n            input_mappings=input_mappings,\n            output_mappings=output_mappings,\n            config=config_mapping,\n            positional_inputs=positional_inputs,\n            tags=self.tags,\n        )\n\n        job_def = graph_def.to_job(\n            description=self.description or format_docstring_for_description(fn),\n            resource_defs=self.resource_defs,\n            config=self.config,\n            tags=self.tags,\n            logger_defs=self.logger_defs,\n            executor_def=self.executor_def,\n            hooks=self.hooks,\n            op_retry_policy=self.op_retry_policy,\n            version_strategy=self.version_strategy,\n        
    partitions_def=self.partitions_def,\n        )\n        update_wrapper(job_def, fn)\n        return job_def\n\n\n@overload\ndef job(name: Callable[..., Any]) -> JobDefinition:\n    ...\n\n\n@overload\ndef job(\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    resource_defs: Optional[Dict[str, ResourceDefinition]] = ...,\n    config: Union[ConfigMapping, Dict[str, Any], "PartitionedConfig"] = ...,\n    tags: Optional[Dict[str, Any]] = ...,\n    logger_defs: Optional[Dict[str, LoggerDefinition]] = ...,\n    executor_def: Optional["ExecutorDefinition"] = ...,\n    hooks: Optional[AbstractSet[HookDefinition]] = ...,\n    op_retry_policy: Optional[RetryPolicy] = ...,\n    version_strategy: Optional[VersionStrategy] = ...,\n) -> _Job:\n    ...\n\n\n
[docs]def job(\n name: Optional[Union[Callable[..., Any], str]] = None,\n description: Optional[str] = None,\n resource_defs: Optional[Dict[str, ResourceDefinition]] = None,\n config: Optional[Union[ConfigMapping, Dict[str, Any], "PartitionedConfig"]] = None,\n tags: Optional[Dict[str, Any]] = None,\n logger_defs: Optional[Dict[str, LoggerDefinition]] = None,\n executor_def: Optional["ExecutorDefinition"] = None,\n hooks: Optional[AbstractSet[HookDefinition]] = None,\n op_retry_policy: Optional[RetryPolicy] = None,\n version_strategy: Optional[VersionStrategy] = None,\n partitions_def: Optional["PartitionsDefinition"] = None,\n) -> Union[JobDefinition, _Job]:\n """Creates a job with the specified parameters from the decorated graph/op invocation function.\n\n Using this decorator allows you to build an executable job by writing a function that invokes\n ops (or graphs).\n\n Args:\n name (Optional[str]):\n The name for the Job. Defaults to the name of the this graph.\n resource_defs (Optional[Dict[str, ResourceDefinition]]):\n Resources that are required by this graph for execution.\n If not defined, `io_manager` will default to filesystem.\n config:\n Describes how the job is parameterized at runtime.\n\n If no value is provided, then the schema for the job's run config is a standard\n format based on its ops and resources.\n\n If a dictionary is provided, then it must conform to the standard config schema, and\n it will be used as the job's run config for the job whenever the job is executed.\n The values provided will be viewable and editable in the Dagit playground, so be\n careful with secrets.\n\n If a :py:class:`ConfigMapping` object is provided, then the schema for the job's run config is\n determined by the config mapping, and the ConfigMapping, which should return\n configuration in the standard format to configure the job.\n\n If a :py:class:`PartitionedConfig` object is provided, then it defines a discrete set of config\n values that can parameterize the pipeline, as well as a function for mapping those\n values to the base config. The values provided will be viewable and editable in the\n Dagit playground, so be careful with secrets.\n tags (Optional[Dict[str, Any]]):\n Arbitrary metadata for any execution of the Job.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag\n values provided at invocation time.\n logger_defs (Optional[Dict[str, LoggerDefinition]]):\n A dictionary of string logger identifiers to their implementations.\n executor_def (Optional[ExecutorDefinition]):\n How this Job will be executed. Defaults to :py:class:`multiprocess_executor` .\n op_retry_policy (Optional[RetryPolicy]): The default retry policy for all ops in this job.\n Only used if retry policy is not defined on the op definition or op invocation.\n version_strategy (Optional[VersionStrategy]):\n Defines how each op (and optionally, resource) in the job can be versioned. If\n provided, memoizaton will be enabled for this job.\n partitions_def (Optional[PartitionsDefinition]): Defines a discrete set of partition keys\n that can parameterize the job. 
If this argument is supplied, the config argument\n can't also be supplied.\n\n """\n if callable(name):\n check.invariant(description is None)\n return _Job()(name)\n\n return _Job(\n name=name,\n description=description,\n resource_defs=resource_defs,\n config=config,\n tags=tags,\n logger_defs=logger_defs,\n executor_def=executor_def,\n hooks=hooks,\n op_retry_policy=op_retry_policy,\n version_strategy=version_strategy,\n partitions_def=partitions_def,\n )
\n
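The ``config`` behaviors described in the ``@job`` docstring above (plain dict, ``ConfigMapping``, ``PartitionedConfig``) are easier to picture with an example. A minimal sketch using the plain-dictionary form as a default run config (op and job names are illustrative):

.. code-block:: python

    from dagster import job, op

    @op(config_schema={"greeting": str})
    def greet(context):
        context.log.info(context.op_config["greeting"])

    # The dict is validated against the standard run-config schema and used
    # as the job's default run config whenever the job is executed.
    @job(config={"ops": {"greet": {"config": {"greeting": "hello"}}}})
    def greeting_job():
        greet()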
", "current_page_name": "_modules/dagster/core/definitions/decorators/job_decorator", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.decorators.job_decorator"}, "op_decorator": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.decorators.op_decorator

\nfrom functools import update_wrapper\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Dict,\n    List,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    overload,\n)\n\nfrom dagster import check\nfrom dagster.core.decorator_utils import format_docstring_for_description\n\nfrom ....seven.typing import get_origin\nfrom ...errors import DagsterInvariantViolationError\nfrom ..inference import InferredOutputProps, infer_output_props\nfrom ..input import In, InputDefinition\nfrom ..output import Out, OutputDefinition\nfrom ..policy import RetryPolicy\nfrom ..solid_definition import SolidDefinition\nfrom .solid_decorator import (\n    DecoratedSolidFunction,\n    NoContextDecoratedSolidFunction,\n    resolve_checked_solid_fn_inputs,\n)\n\nif TYPE_CHECKING:\n    from ..op_definition import OpDefinition\n\n\nclass _Op:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        input_defs: Optional[Sequence[InputDefinition]] = None,\n        output_defs: Optional[Sequence[OutputDefinition]] = None,\n        description: Optional[str] = None,\n        required_resource_keys: Optional[Set[str]] = None,\n        config_schema: Optional[Union[Any, Dict[str, Any]]] = None,\n        tags: Optional[Dict[str, Any]] = None,\n        version: Optional[str] = None,\n        decorator_takes_context: Optional[bool] = True,\n        retry_policy: Optional[RetryPolicy] = None,\n        ins: Optional[Dict[str, In]] = None,\n        out: Optional[Union[Out, Dict[str, Out]]] = None,\n    ):\n        self.name = check.opt_str_param(name, "name")\n        self.input_defs = check.opt_nullable_sequence_param(\n            input_defs, "input_defs", of_type=InputDefinition\n        )\n        self.output_defs = output_defs\n        self.decorator_takes_context = check.bool_param(\n            decorator_takes_context, "decorator_takes_context"\n        )\n\n        self.description = check.opt_str_param(description, "description")\n\n        # these will be checked within SolidDefinition\n        self.required_resource_keys = required_resource_keys\n        self.tags = tags\n        self.version = version\n        self.retry_policy = retry_policy\n\n        # config will be checked within SolidDefinition\n        self.config_schema = config_schema\n\n        self.ins = check.opt_nullable_dict_param(ins, "ins", key_type=str, value_type=In)\n        self.out = out\n\n    def __call__(self, fn: Callable[..., Any]) -> "OpDefinition":\n        from ..op_definition import OpDefinition\n\n        if self.input_defs is not None and self.ins is not None:\n            check.failed("Values cannot be provided for both the 'input_defs' and 'ins' arguments")\n\n        if self.output_defs is not None and self.out is not None:\n            check.failed("Values cannot be provided for both the 'output_defs' and 'out' arguments")\n\n        inferred_out = infer_output_props(fn)\n\n        if self.ins is not None:\n            input_defs = [inp.to_definition(name) for name, inp in self.ins.items()]\n        else:\n            input_defs = check.opt_list_param(\n                self.input_defs, "input_defs", of_type=InputDefinition\n            )\n\n        output_defs_from_out = _resolve_output_defs_from_outs(\n            inferred_out=inferred_out, out=self.out\n        )\n        resolved_output_defs = (\n            output_defs_from_out if output_defs_from_out is not None else self.output_defs\n        )\n\n        if not self.name:\n            self.name = fn.__name__\n\n        if 
resolved_output_defs is None:\n            resolved_output_defs = [OutputDefinition.create_from_inferred(infer_output_props(fn))]\n        elif len(resolved_output_defs) == 1:\n            resolved_output_defs = [\n                resolved_output_defs[0].combine_with_inferred(infer_output_props(fn))\n            ]\n\n        compute_fn = (\n            DecoratedSolidFunction(decorated_fn=fn)\n            if self.decorator_takes_context\n            else NoContextDecoratedSolidFunction(decorated_fn=fn)\n        )\n\n        resolved_input_defs = resolve_checked_solid_fn_inputs(\n            decorator_name="@op",\n            fn_name=self.name,\n            compute_fn=compute_fn,\n            explicit_input_defs=input_defs,\n            exclude_nothing=True,\n        )\n\n        op_def = OpDefinition(\n            name=self.name,\n            input_defs=resolved_input_defs,\n            output_defs=resolved_output_defs,\n            compute_fn=compute_fn,\n            config_schema=self.config_schema,\n            description=self.description or format_docstring_for_description(fn),\n            required_resource_keys=self.required_resource_keys,\n            tags=self.tags,\n            version=self.version,\n            retry_policy=self.retry_policy,\n        )\n        update_wrapper(op_def, compute_fn.decorated_fn)\n        return op_def\n\n\ndef _resolve_output_defs_from_outs(\n    inferred_out: InferredOutputProps, out: Optional[Union[Out, dict]]\n) -> Optional[List[OutputDefinition]]:\n    if out is None:\n        return None\n    if isinstance(out, Out):\n        return [out.to_definition(inferred_out.annotation, name=None)]\n    else:\n        check.dict_param(out, "out", key_type=str, value_type=Out)\n\n        # If only a single entry has been provided to the out dict, then slurp the\n        # annotation into the entry.\n        if len(out) == 1:\n            name = list(out.keys())[0]\n            only_out = out[name]\n            return [only_out.to_definition(inferred_out.annotation, name)]\n\n        output_defs = []\n\n        # Introspection on type annotations is experimental, so checking\n        # metaclass is the best we can do.\n        if inferred_out.annotation and not get_origin(inferred_out.annotation) == tuple:\n            raise DagsterInvariantViolationError(\n                "Expected Tuple annotation for multiple outputs, but received non-tuple annotation."\n            )\n        if inferred_out.annotation and not len(inferred_out.annotation.__args__) == len(out):\n            raise DagsterInvariantViolationError(\n                "Expected Tuple annotation to have number of entries matching the "\n                f"number of outputs for more than one output. 
Expected {len(out)} "\n                f"outputs but annotation has {len(inferred_out.annotation.__args__)}."\n            )\n        for idx, (name, cur_out) in enumerate(out.items()):\n            annotation_type = (\n                inferred_out.annotation.__args__[idx] if inferred_out.annotation else None\n            )\n            output_defs.append(cur_out.to_definition(annotation_type, name=name))\n\n        return output_defs\n\n\n@overload\ndef op(name: Callable[..., Any]) -> SolidDefinition:\n    ...\n\n\n@overload\ndef op(\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    ins: Optional[Dict[str, In]] = ...,\n    out: Optional[Union[Out, Dict[str, Out]]] = ...,\n    config_schema: Optional[Union[Any, Dict[str, Any]]] = ...,\n    required_resource_keys: Optional[Set[str]] = ...,\n    tags: Optional[Dict[str, Any]] = ...,\n    version: Optional[str] = ...,\n    retry_policy: Optional[RetryPolicy] = ...,\n    input_defs: Optional[List[InputDefinition]] = ...,\n    output_defs: Optional[List[OutputDefinition]] = ...,\n) -> _Op:\n    ...\n\n\n
[docs]def op(\n name: Optional[Union[Callable[..., Any], str]] = None,\n description: Optional[str] = None,\n ins: Optional[Dict[str, In]] = None,\n out: Optional[Union[Out, Dict[str, Out]]] = None,\n config_schema: Optional[Union[Any, Dict[str, Any]]] = None,\n required_resource_keys: Optional[Set[str]] = None,\n tags: Optional[Dict[str, Any]] = None,\n version: Optional[str] = None,\n retry_policy: Optional[RetryPolicy] = None,\n input_defs: Optional[List[InputDefinition]] = None,\n output_defs: Optional[List[OutputDefinition]] = None,\n) -> Union[SolidDefinition, _Op]:\n """\n Create an op with the specified parameters from the decorated function.\n\n Ins and outs will be inferred from the type signature of the decorated function\n if not explicitly provided.\n\n The decorated function will be used as the op's compute function. The signature of the\n decorated function is more flexible than that of the ``compute_fn`` in the core API; it may:\n\n 1. Return a value. This value will be wrapped in an :py:class:`Output` and yielded by the compute function.\n 2. Return an :py:class:`Output`. This output will be yielded by the compute function.\n 3. Yield :py:class:`Output` or other :ref:`event objects <events>`. Same as default compute behavior.\n\n Note that options 1) and 2) are incompatible with yielding other events -- if you would like\n to decorate a function that yields events, it must also wrap its eventual output in an\n :py:class:`Output` and yield it.\n\n @op supports ``async def`` functions as well, including async generators when yielding multiple\n events or outputs. Note that async ops will generally be run on their own unless using a custom\n :py:class:`Executor` implementation that supports running them together.\n\n Args:\n name (Optional[str]): Name of op. Must be unique within any :py:class:`GraphDefinition`\n using the op.\n description (Optional[str]): Human-readable description of this op. If not provided, and\n the decorated function has docstring, that docstring will be used as the description.\n ins (Optional[Dict[str, In]]):\n Information about the inputs to the op. Information provided here will be combined\n with what can be inferred from the function signature.\n out (Optional[Union[Out, Dict[str, Out]]]):\n Information about the op outputs. Information provided here will be combined with\n what can be inferred from the return type signature if the function does not use yield.\n config_schema (Optional[ConfigSchema): The schema for the config. If set, Dagster will check\n that config provided for the op matches this schema and fail if it does not. If not\n set, Dagster will accept any config provided for the op.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by this op.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the op. Frameworks may\n expect and require certain metadata to be attached to a op. Values that are not strings\n will be json encoded and must meet the criteria that `json.loads(json.dumps(value)) == value`.\n version (Optional[str]): (Experimental) The version of the op's compute_fn. Two ops should have\n the same version if and only if they deterministically produce the same outputs when\n provided the same inputs.\n retry_policy (Optional[RetryPolicy]): The retry policy for this op.\n input_defs (Optional[List[InputDefinition]]):\n (legacy) Preserved to ease migration from :py:class:`solid`. 
Can be used in place of ins argument.\n output_defs (Optional[List[OutputDefinition]]):\n (legacy) Preserved to ease migration from :py:class:`solid`. Can be used in place of out argument.\n\n Examples:\n\n .. code-block:: python\n\n @op\n def hello_world():\n print('hello')\n\n @op\n def echo(msg: str) -> str:\n return msg\n\n @op(\n ins={'msg': In(str)},\n out=Out(str)\n )\n def echo_2(msg): # same as above\n return msg\n\n @op(\n out={'word': Out(), 'num': Out()}\n )\n def multi_out() -> Tuple[str, int]:\n return 'cool', 4\n """\n\n # This case is for when decorator is used bare, without arguments. e.g. @op versus @op()\n if callable(name):\n check.invariant(input_defs is None)\n check.invariant(output_defs is None)\n check.invariant(description is None)\n check.invariant(config_schema is None)\n check.invariant(required_resource_keys is None)\n check.invariant(tags is None)\n check.invariant(version is None)\n\n return _Op()(name)\n\n return _Op(\n name=name,\n description=description,\n input_defs=input_defs,\n output_defs=output_defs,\n config_schema=config_schema,\n required_resource_keys=required_resource_keys,\n tags=tags,\n version=version,\n retry_policy=retry_policy,\n ins=ins,\n out=out,\n )
\n
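The note in the ``@op`` docstring above about ``async def`` support is not covered by its examples. A minimal sketch of an asynchronous op that returns a single value (the function name is illustrative):

.. code-block:: python

    import asyncio

    from dagster import op

    @op
    async def fetch_value() -> int:
        # The coroutine is awaited by Dagster when the op executes.
        await asyncio.sleep(0.1)
        return 42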
", "current_page_name": "_modules/dagster/core/definitions/decorators/op_decorator", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.decorators.op_decorator"}, "pipeline_decorator": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.decorators.pipeline_decorator

\nfrom functools import update_wrapper\nfrom typing import Any, Callable, Dict, List, Optional, Set, Union, overload\n\nfrom dagster import check\nfrom dagster.core.decorator_utils import format_docstring_for_description\nfrom dagster.core.definitions.policy import RetryPolicy\nfrom dagster.utils.backcompat import experimental_arg_warning\n\nfrom ..graph_definition import GraphDefinition\nfrom ..hook_definition import HookDefinition\nfrom ..input import InputDefinition\nfrom ..mode import ModeDefinition\nfrom ..output import OutputDefinition\nfrom ..pipeline_definition import PipelineDefinition\nfrom ..preset import PresetDefinition\nfrom ..version_strategy import VersionStrategy\n\n\nclass _Pipeline:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        mode_defs: Optional[List[ModeDefinition]] = None,\n        preset_defs: Optional[List[PresetDefinition]] = None,\n        description: Optional[str] = None,\n        tags: Optional[Dict[str, Any]] = None,\n        hook_defs: Optional[Set[HookDefinition]] = None,\n        input_defs: Optional[List[InputDefinition]] = None,\n        output_defs: Optional[List[OutputDefinition]] = None,\n        config_schema: Optional[Dict[str, Any]] = None,\n        config_fn: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,\n        solid_retry_policy: Optional[RetryPolicy] = None,\n        version_strategy: Optional[VersionStrategy] = None,\n    ):\n        self.name = check.opt_str_param(name, "name")\n        self.mode_definitions = check.opt_list_param(mode_defs, "mode_defs", ModeDefinition)\n        self.preset_definitions = check.opt_list_param(preset_defs, "preset_defs", PresetDefinition)\n        self.description = check.opt_str_param(description, "description")\n        self.tags = check.opt_dict_param(tags, "tags")\n        self.hook_defs = check.opt_set_param(hook_defs, "hook_defs", of_type=HookDefinition)\n        self.input_defs = check.opt_list_param(input_defs, "input_defs", of_type=InputDefinition)\n        self.did_pass_outputs = output_defs is not None\n        self.output_defs = check.opt_nullable_list_param(\n            output_defs, "output_defs", of_type=OutputDefinition\n        )\n        self.config_schema = config_schema\n        self.config_fn = check.opt_callable_param(config_fn, "config_fn")\n        self.solid_retry_policy = check.opt_inst_param(\n            solid_retry_policy, "solid_retry_policy", RetryPolicy\n        )\n        self.version_strategy = check.opt_inst_param(\n            version_strategy, "version_strategy", VersionStrategy\n        )\n\n    def __call__(self, fn: Callable[..., Any]) -> PipelineDefinition:\n        check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        from dagster.core.definitions.decorators.composite_solid_decorator import (\n            do_composition,\n            get_validated_config_mapping,\n        )\n\n        config_mapping = get_validated_config_mapping(\n            self.name, self.config_schema, self.config_fn, decorator_name="pipeline"\n        )\n\n        (\n            input_mappings,\n            output_mappings,\n            dependencies,\n            solid_defs,\n            config_mapping,\n            positional_inputs,\n        ) = do_composition(\n            "@pipeline",\n            self.name,\n            fn,\n            self.input_defs,\n            self.output_defs,\n            config_mapping,\n            ignore_output_from_composition_fn=not self.did_pass_outputs,\n     
   )\n\n        pipeline_def = PipelineDefinition(\n            mode_defs=self.mode_definitions,\n            preset_defs=self.preset_definitions,\n            graph_def=GraphDefinition(\n                name=self.name,\n                description=None,  # put desc on the pipeline\n                dependencies=dependencies,\n                node_defs=solid_defs,\n                input_mappings=input_mappings,\n                output_mappings=output_mappings,\n                config=config_mapping,\n                positional_inputs=positional_inputs,\n            ),\n            tags=self.tags,\n            description=self.description or format_docstring_for_description(fn),\n            hook_defs=self.hook_defs,\n            solid_retry_policy=self.solid_retry_policy,\n            version_strategy=self.version_strategy,\n        )\n        update_wrapper(pipeline_def, fn)\n        return pipeline_def\n\n\n@overload\ndef pipeline(\n    name: Callable[..., Any],\n) -> PipelineDefinition:\n    ...\n\n\n@overload\ndef pipeline(\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    mode_defs: Optional[List[ModeDefinition]] = ...,\n    preset_defs: Optional[List[PresetDefinition]] = ...,\n    tags: Optional[Dict[str, Any]] = ...,\n    hook_defs: Optional[Set[HookDefinition]] = ...,\n    input_defs: Optional[List[InputDefinition]] = ...,\n    output_defs: Optional[List[OutputDefinition]] = ...,\n    config_schema: Optional[Dict[str, Any]] = ...,\n    config_fn: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = ...,\n    solid_retry_policy: Optional[RetryPolicy] = ...,\n    version_strategy: Optional[VersionStrategy] = ...,\n) -> _Pipeline:\n    pass\n\n\n
[docs]def pipeline(\n name: Optional[Union[Callable[..., Any], str]] = None,\n description: Optional[str] = None,\n mode_defs: Optional[List[ModeDefinition]] = None,\n preset_defs: Optional[List[PresetDefinition]] = None,\n tags: Optional[Dict[str, Any]] = None,\n hook_defs: Optional[Set[HookDefinition]] = None,\n input_defs: Optional[List[InputDefinition]] = None,\n output_defs: Optional[List[OutputDefinition]] = None,\n config_schema: Optional[Dict[str, Any]] = None,\n config_fn: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,\n solid_retry_policy: Optional[RetryPolicy] = None,\n version_strategy: Optional[VersionStrategy] = None,\n) -> Union[PipelineDefinition, _Pipeline]:\n """Create a pipeline with the specified parameters from the decorated composition function.\n\n Using this decorator allows you to build up the dependency graph of the pipeline by writing a\n function that invokes solids and passes the output to other solids.\n\n Args:\n name (Optional[str]): The name of the pipeline. Must be unique within any\n :py:class:`RepositoryDefinition` containing the pipeline.\n description (Optional[str]): A human-readable description of the pipeline.\n mode_defs (Optional[List[ModeDefinition]]): The set of modes in which this pipeline can\n operate. Modes are used to attach resources, custom loggers, custom system storage\n options, and custom executors to a pipeline. Modes can be used, e.g., to vary\n available resource and logging implementations between local test and production runs.\n preset_defs (Optional[List[PresetDefinition]]): A set of preset collections of configuration\n options that may be used to execute a pipeline. A preset consists of an environment\n dict, an optional subset of solids to execute, and a mode selection. Presets can be used\n to ship common combinations of options to pipeline end users in Python code, and can\n be selected by tools like Dagit.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for any execution run of the pipeline.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag\n values provided at invocation time.\n hook_defs (Optional[Set[HookDefinition]]): A set of hook definitions applied to the\n pipeline. When a hook is applied to a pipeline, it will be attached to all solid\n instances within the pipeline.\n solid_retry_policy (Optional[RetryPolicy]): The default retry policy for all solids in\n this pipeline. Only used if retry policy is not defined on the solid definition or\n solid invocation.\n version_strategy (Optional[VersionStrategy]): The version strategy to use with this\n pipeline. Providing a VersionStrategy will enable memoization on the pipeline.\n\n Example:\n\n .. 
code-block:: python\n\n @solid(output_defs=[OutputDefinition(int, "two"), OutputDefinition(int, "four")])\n def emit_two_four(_) -> int:\n yield Output(2, "two")\n yield Output(4, "four")\n\n\n @lambda_solid\n def add_one(num: int) -> int:\n return num + 1\n\n\n @lambda_solid\n def mult_two(num: int) -> int:\n return num * 2\n\n\n @pipeline\n def math_pipeline():\n two, four = emit_two_four()\n add_one(two)\n mult_two(four)\n """\n\n if input_defs is not None:\n experimental_arg_warning("input_defs", "pipeline")\n\n if output_defs is not None:\n experimental_arg_warning("output_defs", "pipeline")\n\n if config_schema is not None:\n experimental_arg_warning("config_schema", "pipeline")\n\n if config_fn is not None:\n experimental_arg_warning("config_fn", "pipeline")\n\n if callable(name):\n check.invariant(description is None)\n return _Pipeline()(name)\n\n return _Pipeline(\n name=name,\n mode_defs=mode_defs,\n preset_defs=preset_defs,\n description=description,\n tags=tags,\n hook_defs=hook_defs,\n input_defs=input_defs,\n output_defs=output_defs,\n config_schema=config_schema,\n config_fn=config_fn,\n solid_retry_policy=solid_retry_policy,\n version_strategy=version_strategy,\n )
\n
", "current_page_name": "_modules/dagster/core/definitions/decorators/pipeline_decorator", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.decorators.pipeline_decorator"}, "repository_decorator": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.decorators.repository_decorator

\nfrom functools import update_wrapper\nfrom typing import Any, Callable, Optional, Union, overload\n\nfrom dagster import check\nfrom dagster.core.errors import DagsterInvalidDefinitionError\n\nfrom ..graph_definition import GraphDefinition\nfrom ..partition import PartitionSetDefinition\nfrom ..pipeline_definition import PipelineDefinition\nfrom ..repository_definition import (\n    VALID_REPOSITORY_DATA_DICT_KEYS,\n    CachingRepositoryData,\n    RepositoryData,\n    RepositoryDefinition,\n)\nfrom ..schedule_definition import ScheduleDefinition\nfrom ..sensor_definition import SensorDefinition\n\n\nclass _Repository:\n    def __init__(self, name: Optional[str] = None, description: Optional[str] = None):\n        self.name = check.opt_str_param(name, "name")\n        self.description = check.opt_str_param(description, "description")\n\n    def __call__(self, fn: Callable[[], Any]) -> RepositoryDefinition:\n        from dagster.core.asset_defs import AssetGroup\n\n        check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        repository_definitions = fn()\n\n        repository_data: Union[CachingRepositoryData, RepositoryData]\n        if isinstance(repository_definitions, list):\n            bad_definitions = []\n            for i, definition in enumerate(repository_definitions):\n                if not (\n                    isinstance(definition, PipelineDefinition)\n                    or isinstance(definition, PartitionSetDefinition)\n                    or isinstance(definition, ScheduleDefinition)\n                    or isinstance(definition, SensorDefinition)\n                    or isinstance(definition, GraphDefinition)\n                    or isinstance(definition, AssetGroup)\n                ):\n                    bad_definitions.append((i, type(definition)))\n            if bad_definitions:\n                bad_definitions_str = ", ".join(\n                    [\n                        "value of type {type_} at index {i}".format(type_=type_, i=i)\n                        for i, type_ in bad_definitions\n                    ]\n                )\n                raise DagsterInvalidDefinitionError(\n                    "Bad return value from repository construction function: all elements of list "\n                    "must be of type JobDefinition, GraphDefinition, PipelineDefinition, "\n                    "PartitionSetDefinition, ScheduleDefinition, or SensorDefinition. 
"\n                    f"Got {bad_definitions_str}."\n                )\n            repository_data = CachingRepositoryData.from_list(repository_definitions)\n\n        elif isinstance(repository_definitions, dict):\n            if not set(repository_definitions.keys()).issubset(VALID_REPOSITORY_DATA_DICT_KEYS):\n                raise DagsterInvalidDefinitionError(\n                    "Bad return value from repository construction function: dict must not contain "\n                    "keys other than {{'pipelines', 'partition_sets', 'schedules', 'jobs'}}: found "\n                    "{bad_keys}".format(\n                        bad_keys=", ".join(\n                            [\n                                "'{key}'".format(key=key)\n                                for key in repository_definitions.keys()\n                                if key not in VALID_REPOSITORY_DATA_DICT_KEYS\n                            ]\n                        )\n                    )\n                )\n            repository_data = CachingRepositoryData.from_dict(repository_definitions)\n        elif isinstance(repository_definitions, RepositoryData):\n            repository_data = repository_definitions\n        else:\n            raise DagsterInvalidDefinitionError(\n                "Bad return value of type {type_} from repository construction function: must "\n                "return list, dict, or RepositoryData. See the @repository decorator docstring for "\n                "details and examples".format(type_=type(repository_definitions)),\n            )\n\n        repository_def = RepositoryDefinition(\n            name=self.name, description=self.description, repository_data=repository_data\n        )\n\n        update_wrapper(repository_def, fn)\n        return repository_def\n\n\n@overload\ndef repository(name: Callable[..., Any]) -> RepositoryDefinition:\n    ...\n\n\n@overload\ndef repository(name: Optional[str] = ..., description: Optional[str] = ...) -> _Repository:\n    ...\n\n\n
[docs]def repository(\n name: Optional[Union[str, Callable[..., Any]]] = None, description: Optional[str] = None\n) -> Union[RepositoryDefinition, _Repository]:\n """Create a repository from the decorated function.\n\n The decorated function should take no arguments and its return value should one of:\n\n 1. ``List[Union[JobDefinition, PipelineDefinition, PartitionSetDefinition, ScheduleDefinition, SensorDefinition]]``.\n Use this form when you have no need to lazy load pipelines or other definitions. This is the\n typical use case.\n\n 2. A dict of the form:\n\n .. code-block:: python\n\n {\n 'jobs': Dict[str, Callable[[], JobDefinition]],\n 'pipelines': Dict[str, Callable[[], PipelineDefinition]],\n 'partition_sets': Dict[str, Callable[[], PartitionSetDefinition]],\n 'schedules': Dict[str, Callable[[], ScheduleDefinition]]\n 'sensors': Dict[str, Callable[[], SensorDefinition]]\n }\n\n This form is intended to allow definitions to be created lazily when accessed by name,\n which can be helpful for performance when there are many definitions in a repository, or\n when constructing the definitions is costly.\n\n 3. A :py:class:`RepositoryData`. Return this object if you need fine-grained\n control over the construction and indexing of definitions within the repository, e.g., to\n create definitions dynamically from .yaml files in a directory.\n\n Args:\n name (Optional[str]): The name of the repository. Defaults to the name of the decorated\n function.\n description (Optional[str]): A string description of the repository.\n\n Example:\n\n .. code-block:: python\n\n ######################################################################\n # A simple repository using the first form of the decorated function\n ######################################################################\n\n @op(config_schema={n: Field(Int)})\n def return_n(context):\n return context.op_config['n']\n\n @job\n def simple_job():\n return_n()\n\n @job\n def some_job():\n ...\n\n @sensor(job=some_job)\n def some_sensor():\n if foo():\n yield RunRequest(\n run_key= ...,\n run_config={\n 'ops': {'return_n': {'config': {'n': bar()}}}\n }\n )\n\n @job\n def my_job():\n ...\n\n my_schedule = ScheduleDefinition(cron_schedule="0 0 * * *", job=my_job)\n\n @repository\n def simple_repository():\n return [simple_job, some_sensor, my_schedule]\n\n\n ######################################################################\n # A lazy-loaded repository\n ######################################################################\n\n def make_expensive_job():\n @job\n def expensive_job():\n for i in range(10000):\n return_n.alias(f'return_n_{i}')()\n\n return expensive_job\n\n def make_expensive_schedule():\n @job\n def other_expensive_job():\n for i in range(11000):\n return_n.alias(f'my_return_n_{i}')()\n\n return ScheduleDefinition(cron_schedule="0 0 * * *", job=other_expensive_job)\n\n @repository\n def lazy_loaded_repository():\n return {\n 'jobs': {'expensive_job': make_expensive_job},\n 'schedules': {'expensive_schedule: make_expensive_schedule}\n }\n\n\n ######################################################################\n # A complex repository that lazily constructs jobs from a directory\n # of files in a bespoke YAML format\n ######################################################################\n\n class ComplexRepositoryData(RepositoryData):\n def __init__(self, yaml_directory):\n self._yaml_directory = yaml_directory\n\n def get_all_pipelines(self):\n return [\n self._construct_job_def_from_yaml_file(\n 
self._yaml_file_for_job_name(file_name)\n )\n for file_name in os.listdir(self._yaml_directory)\n ]\n\n ...\n\n @repository\n def complex_repository():\n return ComplexRepositoryData('some_directory')\n\n """\n if callable(name):\n check.invariant(description is None)\n\n return _Repository()(name)\n\n return _Repository(name=name, description=description)
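# Illustrative usage (not part of the original module): a minimal sketch of the two
# invocation styles handled above -- the bare decorator and the parameterized form.
# The op, job, and repository names below are hypothetical.
from dagster import job, op, repository


@op
def do_nothing():
    pass


@job
def my_etl_job():
    do_nothing()


@repository  # bare form: the repository name defaults to the function name
def bare_repository():
    return [my_etl_job]


@repository(name="named_repo", description="A repository with an explicit name.")
def named_repository():
    return [my_etl_job]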
\n
", "current_page_name": "_modules/dagster/core/definitions/decorators/repository_decorator", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.decorators.repository_decorator"}, "schedule_decorator": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.decorators.schedule_decorator

\nimport copy\nimport datetime\nimport warnings\nfrom functools import update_wrapper\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Dict,\n    Generator,\n    List,\n    NamedTuple,\n    Optional,\n    Union,\n    cast,\n)\n\nfrom dagster import check\nfrom dagster.core.definitions.partition import (\n    PartitionScheduleDefinition,\n    PartitionSetDefinition,\n    ScheduleTimeBasedPartitionsDefinition,\n    ScheduleType,\n)\nfrom dagster.core.errors import (\n    DagsterInvalidDefinitionError,\n    ScheduleExecutionError,\n    user_code_error_boundary,\n)\nfrom dagster.utils import ensure_gen\nfrom dagster.utils.partitions import (\n    DEFAULT_DATE_FORMAT,\n    DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE,\n    DEFAULT_HOURLY_FORMAT_WITH_TIMEZONE,\n    DEFAULT_MONTHLY_FORMAT,\n    create_offset_partition_selector,\n)\n\nfrom ...decorator_utils import get_function_params\nfrom ...storage.tags import check_tags\nfrom ..graph_definition import GraphDefinition\nfrom ..mode import DEFAULT_MODE_NAME\nfrom ..pipeline_definition import PipelineDefinition\nfrom ..run_request import RunRequest, SkipReason\nfrom ..schedule_definition import DefaultScheduleStatus, ScheduleDefinition, is_context_provided\n\nif TYPE_CHECKING:\n    from dagster import Partition, ScheduleEvaluationContext\n\n# Error messages are long\n# pylint: disable=C0301\n\nRunConfig = Dict[str, Any]\nRunRequestGenerator = Generator[Union[RunRequest, SkipReason], None, None]\n\n\nclass DecoratedScheduleFunction(NamedTuple):\n    """Wrapper around the decorated schedule function.  Keeps track of both to better support the\n    optimal return value for direct invocation of the evaluation function"""\n\n    decorated_fn: Callable[..., Union[RunRequest, SkipReason, RunConfig, RunRequestGenerator]]\n    wrapped_fn: Callable[["ScheduleEvaluationContext"], RunRequestGenerator]\n    has_context_arg: bool\n\n\n
[docs]def schedule(\n cron_schedule: str,\n pipeline_name: Optional[str] = None,\n name: Optional[str] = None,\n tags: Optional[Dict[str, str]] = None,\n tags_fn: Optional[Callable[["ScheduleEvaluationContext"], Optional[Dict[str, str]]]] = None,\n solid_selection: Optional[List[str]] = None,\n mode: Optional[str] = "default",\n should_execute: Optional[Callable[["ScheduleEvaluationContext"], bool]] = None,\n environment_vars: Optional[Dict[str, str]] = None,\n execution_timezone: Optional[str] = None,\n description: Optional[str] = None,\n job: Optional[Union[PipelineDefinition, GraphDefinition]] = None,\n default_status: DefaultScheduleStatus = DefaultScheduleStatus.STOPPED,\n) -> Callable[\n [\n Callable[\n ...,\n Union[RunRequest, SkipReason, RunConfig, RunRequestGenerator],\n ]\n ],\n ScheduleDefinition,\n]:\n """\n Creates a schedule following the provided cron schedule and requests runs for the provided job.\n\n The decorated function takes in a :py:class:`~dagster.ScheduleEvaluationContext` as its only\n argument, and does one of the following:\n\n 1. Return a `RunRequest` object.\n 2. Yield multiple of `RunRequest` objects.\n 3. Return or yield a `SkipReason` object, providing a descriptive message of why no runs were\n requested.\n 4. Return or yield nothing (skipping without providing a reason)\n 5. Return a run config dictionary.\n\n Returns a :py:class:`~dagster.ScheduleDefinition`.\n\n Args:\n cron_schedule (str): A valid cron string specifying when the schedule will run, e.g.,\n ``'45 23 * * 6'`` for a schedule that runs at 11:45 PM every Saturday.\n pipeline_name (Optional[str]): (legacy) The name of the pipeline to execute when the\n schedule runs.\n name (Optional[str]): The name of the schedule to create.\n tags (Optional[Dict[str, str]]): A dictionary of tags (string key-value pairs) to attach\n to the scheduled runs.\n tags_fn (Optional[Callable[[ScheduleEvaluationContext], Optional[Dict[str, str]]]]): A function\n that generates tags to attach to the schedules runs. Takes a\n :py:class:`~dagster.ScheduleEvaluationContext` and returns a dictionary of tags (string\n key-value pairs). You may set only one of ``tags`` and ``tags_fn``.\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The pipeline mode in which to execute this schedule.\n (Default: 'default')\n should_execute (Optional[Callable[[ScheduleEvaluationContext], bool]]): A function that runs at\n schedule execution tie to determine whether a schedule should execute or skip. Takes a\n :py:class:`~dagster.ScheduleEvaluationContext` and returns a boolean (``True`` if the\n schedule should execute). Defaults to a function that always returns ``True``.\n environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing\n the schedule.\n execution_timezone (Optional[str]): Timezone in which the schedule should run.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n description (Optional[str]): A human-readable description of the schedule.\n job (Optional[Union[GraphDefinition, JobDefinition]]): The job that should execute when this\n schedule runs.\n default_status (DefaultScheduleStatus): Whether the schedule starts as running or not. 
The default\n status can be overridden from Dagit or via the GraphQL API.\n """\n\n def inner(\n fn: Callable[\n ...,\n Union[RunRequest, SkipReason, RunConfig, RunRequestGenerator],\n ]\n ) -> ScheduleDefinition:\n check.callable_param(fn, "fn")\n\n schedule_name = name or fn.__name__\n\n # perform upfront validation of schedule tags\n _tags_fn: Optional[Callable[["ScheduleEvaluationContext"], Dict[str, str]]] = None\n if tags_fn and tags:\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both tags_fn and tags as arguments"\n " to ScheduleDefinition. Must provide only one of the two."\n )\n elif tags:\n check_tags(tags, "tags")\n _tags_fn = cast(Callable[["ScheduleEvaluationContext"], Dict[str, str]], lambda _: tags)\n elif tags_fn:\n _tags_fn = cast(\n Callable[["ScheduleEvaluationContext"], Dict[str, str]],\n lambda context: tags_fn(context) or {},\n )\n\n def _wrapped_fn(context: "ScheduleEvaluationContext"):\n if should_execute:\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the execution of should_execute for schedule {schedule_name}",\n ):\n if not should_execute(context):\n yield SkipReason(\n f"should_execute function for {schedule_name} returned false."\n )\n return\n\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the evaluation of schedule {schedule_name}",\n ):\n result = fn(context) if has_context_arg else fn()\n if isinstance(result, dict):\n # this is the run-config based decorated function, wrap the evaluated run config\n # and tags in a RunRequest\n evaluated_run_config = copy.deepcopy(result)\n evaluated_tags = _tags_fn(context) if _tags_fn else None\n yield RunRequest(\n run_key=None,\n run_config=evaluated_run_config,\n tags=evaluated_tags,\n )\n else:\n # this is a run-request based decorated function\n yield from ensure_gen(result)\n\n has_context_arg = is_context_provided(get_function_params(fn))\n evaluation_fn = DecoratedScheduleFunction(\n decorated_fn=fn,\n wrapped_fn=_wrapped_fn,\n has_context_arg=has_context_arg,\n )\n\n schedule_def = ScheduleDefinition(\n name=schedule_name,\n cron_schedule=cron_schedule,\n pipeline_name=pipeline_name,\n solid_selection=solid_selection,\n mode=mode,\n environment_vars=environment_vars,\n execution_timezone=execution_timezone,\n description=description,\n execution_fn=evaluation_fn,\n job=job,\n default_status=default_status,\n )\n\n update_wrapper(schedule_def, wrapped=fn)\n\n return schedule_def\n\n return inner
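# Illustrative usage (not part of the original module): a minimal sketch of the two
# return styles handled by _wrapped_fn above -- a run-config dict, which the decorator
# wraps in a RunRequest, and explicitly yielded RunRequest / SkipReason objects.
# The op, job, and schedule names are hypothetical.
from dagster import RunRequest, SkipReason, job, op, schedule


@op(config_schema={"greeting": str})
def say_hello(context):
    context.log.info(context.op_config["greeting"])


@job
def greeting_job():
    say_hello()


@schedule(cron_schedule="0 6 * * *", job=greeting_job)
def run_config_schedule(context):
    # Returning a dict: the decorator wraps it in a RunRequest.
    return {"ops": {"say_hello": {"config": {"greeting": "good morning"}}}}


@schedule(cron_schedule="*/15 * * * *", job=greeting_job)
def run_request_schedule(context):
    # Yielding RunRequest / SkipReason directly.
    if context.scheduled_execution_time.weekday() < 5:
        yield RunRequest(
            run_key=None,
            run_config={"ops": {"say_hello": {"config": {"greeting": "weekday run"}}}},
        )
    else:
        yield SkipReason("No runs on weekends.")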
\n\n\ndef monthly_schedule(\n pipeline_name: Optional[str],\n start_date: datetime.datetime,\n name: Optional[str] = None,\n execution_day_of_month: int = 1,\n execution_time: datetime.time = datetime.time(0, 0),\n tags_fn_for_date: Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]] = None,\n solid_selection: Optional[List[str]] = None,\n mode: Optional[str] = "default",\n should_execute: Optional[Callable[["ScheduleEvaluationContext"], bool]] = None,\n environment_vars: Optional[Dict[str, str]] = None,\n end_date: Optional[datetime.datetime] = None,\n execution_timezone: Optional[str] = None,\n partition_months_offset: Optional[int] = 1,\n description: Optional[str] = None,\n default_status: DefaultScheduleStatus = DefaultScheduleStatus.STOPPED,\n) -> Callable[[Callable[[datetime.datetime], Dict[str, Any]]], PartitionScheduleDefinition]:\n """Create a partitioned schedule that runs monthly.\n\n The decorated function should accept a datetime object as its only argument. The datetime\n represents the date partition that it's meant to run on.\n\n The decorated function should return a run configuration dictionary, which will be used as\n configuration for the scheduled run.\n\n The decorator produces a :py:class:`~dagster.PartitionScheduleDefinition`.\n\n Args:\n pipeline_name (str): The name of the pipeline to execute when the schedule runs.\n start_date (datetime.datetime): The date from which to run the schedule.\n name (Optional[str]): The name of the schedule to create.\n execution_day_of_month (int): The day of the month on which to run the schedule (must be\n between 1 and 31).\n execution_time (datetime.time): The time at which to execute the schedule.\n tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A\n function that generates tags to attach to the schedules runs. Takes the date of the\n schedule run and returns a dictionary of tags (string key-value pairs).\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The pipeline mode in which to execute this schedule.\n (Default: 'default')\n should_execute (Optional[Callable[ScheduleEvaluationContext, bool]]): A function that runs at\n schedule execution tie to determine whether a schedule should execute or skip. Takes a\n :py:class:`~dagster.ScheduleEvaluationContext` and returns a boolean (``True`` if the\n schedule should execute). Defaults to a function that always returns ``True``.\n environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing\n the schedule.\n end_date (Optional[datetime.datetime]): The last time to run the schedule to, defaults to\n current time.\n execution_timezone (Optional[str]): Timezone in which the schedule should run.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n partition_months_offset (Optional[int]): How many months back to go when choosing the partition\n for a given schedule execution. For example, when partition_months_offset=1, the schedule\n that executes during month N will fill in the partition for month N-1.\n (Default: 1)\n description (Optional[str]): A human-readable description of the schedule.\n default_status (DefaultScheduleStatus): Whether the schedule starts as running or not. 
The default\n status can be overridden from Dagit or via the GraphQL API.\n """\n check.opt_str_param(name, "name")\n check.inst_param(start_date, "start_date", datetime.datetime)\n check.opt_inst_param(end_date, "end_date", datetime.datetime)\n check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")\n check.opt_nullable_list_param(solid_selection, "solid_selection", of_type=str)\n mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)\n check.opt_callable_param(should_execute, "should_execute")\n check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str)\n check.opt_str_param(pipeline_name, "pipeline_name")\n check.int_param(execution_day_of_month, "execution_day")\n check.inst_param(execution_time, "execution_time", datetime.time)\n check.opt_str_param(execution_timezone, "execution_timezone")\n check.opt_int_param(partition_months_offset, "partition_months_offset")\n check.opt_str_param(description, "description")\n check.inst_param(default_status, "default_status", DefaultScheduleStatus)\n\n if (\n start_date.day != 1\n or start_date.hour != 0\n or start_date.minute != 0\n or start_date.second != 0\n ):\n warnings.warn(\n "`start_date` must be at the beginning of the first day of the month for a monthly "\n "schedule. Use `execution_day_of_month` and `execution_time` to execute the schedule "\n "at a specific time within the month. For example, to run the schedule at 3AM on the "\n "23rd of each month starting in October, your schedule definition would look like:"\n """\n@monthly_schedule(\n start_date=datetime.datetime(2020, 10, 1),\n execution_day_of_month=23,\n execution_time=datetime.time(3, 0)\n):\ndef my_schedule_definition(_):\n ...\n"""\n )\n\n if execution_day_of_month <= 0 or execution_day_of_month > 31:\n raise DagsterInvalidDefinitionError(\n "`execution_day_of_month={}` is not valid for monthly schedule. 
Execution day must be "\n "between 1 and 31".format(execution_day_of_month)\n )\n\n def inner(fn: Callable[[datetime.datetime], Dict[str, Any]]) -> PartitionScheduleDefinition:\n check.callable_param(fn, "fn")\n\n schedule_name = name or fn.__name__\n\n tags_fn_for_partition_value: Callable[\n ["Partition"], Optional[Dict[str, str]]\n ] = lambda partition: {}\n if tags_fn_for_date:\n tags_fn = cast(\n Callable[[datetime.datetime], Optional[Dict[str, str]]], tags_fn_for_date\n )\n tags_fn_for_partition_value = lambda partition: tags_fn(partition.value)\n\n fmt = DEFAULT_MONTHLY_FORMAT\n\n partitions_def = ScheduleTimeBasedPartitionsDefinition(\n schedule_type=ScheduleType.MONTHLY,\n start=start_date,\n execution_day=execution_day_of_month,\n execution_time=execution_time,\n end=end_date,\n fmt=fmt,\n timezone=execution_timezone,\n offset=partition_months_offset,\n )\n\n partition_set = PartitionSetDefinition(\n name="{}_partitions".format(schedule_name),\n pipeline_name=pipeline_name, # type: ignore[arg-type]\n run_config_fn_for_partition=lambda partition: fn(partition.value),\n solid_selection=solid_selection,\n tags_fn_for_partition=tags_fn_for_partition_value,\n mode=mode,\n partitions_def=partitions_def,\n )\n\n schedule_def = partition_set.create_schedule_definition(\n schedule_name,\n partitions_def.get_cron_schedule(),\n should_execute=should_execute,\n environment_vars=environment_vars,\n partition_selector=create_offset_partition_selector(\n execution_time_to_partition_fn=partitions_def.get_execution_time_to_partition_fn()\n ),\n execution_timezone=execution_timezone,\n description=description,\n decorated_fn=fn,\n default_status=default_status,\n )\n update_wrapper(schedule_def, wrapped=fn)\n\n return schedule_def\n\n return inner\n\n\ndef weekly_schedule(\n pipeline_name: Optional[str],\n start_date: datetime.datetime,\n name: Optional[str] = None,\n execution_day_of_week: int = 0,\n execution_time: datetime.time = datetime.time(0, 0),\n tags_fn_for_date: Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]] = None,\n solid_selection: Optional[List[str]] = None,\n mode: Optional[str] = "default",\n should_execute: Optional[Callable[["ScheduleEvaluationContext"], bool]] = None,\n environment_vars: Optional[Dict[str, str]] = None,\n end_date: Optional[datetime.datetime] = None,\n execution_timezone: Optional[str] = None,\n partition_weeks_offset: Optional[int] = 1,\n description: Optional[str] = None,\n default_status: DefaultScheduleStatus = DefaultScheduleStatus.STOPPED,\n) -> Callable[[Callable[[datetime.datetime], Dict[str, Any]]], PartitionScheduleDefinition]:\n """Create a partitioned schedule that runs weekly.\n\n The decorated function should accept a datetime object as its only argument. The datetime\n represents the date partition that it's meant to run on.\n\n The decorated function should return a run configuration dictionary, which will be used as\n configuration for the scheduled run.\n\n The decorator produces a :py:class:`~dagster.PartitionScheduleDefinition`.\n\n Args:\n pipeline_name (str): The name of the pipeline to execute when the schedule runs.\n start_date (datetime.datetime): The date from which to run the schedule.\n name (Optional[str]): The name of the schedule to create.\n execution_day_of_week (int): The day of the week on which to run the schedule. 
Must be\n between 0 (Sunday) and 6 (Saturday).\n execution_time (datetime.time): The time at which to execute the schedule.\n tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A\n function that generates tags to attach to the schedules runs. Takes the date of the\n schedule run and returns a dictionary of tags (string key-value pairs).\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The pipeline mode in which to execute this schedule.\n (Default: 'default')\n should_execute (Optional[Callable[ScheduleEvaluationContext, bool]]): A function that runs at\n schedule execution tie to determine whether a schedule should execute or skip. Takes a\n :py:class:`~dagster.ScheduleEvaluationContext` and returns a boolean (``True`` if the\n schedule should execute). Defaults to a function that always returns ``True``.\n environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing\n the schedule.\n end_date (Optional[datetime.datetime]): The last time to run the schedule to, defaults to\n current time.\n execution_timezone (Optional[str]): Timezone in which the schedule should run.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n partition_weeks_offset (Optional[int]): How many weeks back to go when choosing the partition\n for a given schedule execution. For example, when partition_weeks_offset=1, the schedule\n that executes during week N will fill in the partition for week N-1.\n (Default: 1)\n description (Optional[str]): A human-readable description of the schedule.\n default_status (DefaultScheduleStatus): Whether the schedule starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n """\n check.opt_str_param(name, "name")\n check.inst_param(start_date, "start_date", datetime.datetime)\n check.opt_inst_param(end_date, "end_date", datetime.datetime)\n check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")\n check.opt_nullable_list_param(solid_selection, "solid_selection", of_type=str)\n mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)\n check.opt_callable_param(should_execute, "should_execute")\n check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str)\n check.opt_str_param(pipeline_name, "pipeline_name")\n check.int_param(execution_day_of_week, "execution_day_of_week")\n check.inst_param(execution_time, "execution_time", datetime.time)\n check.opt_str_param(execution_timezone, "execution_timezone")\n check.opt_int_param(partition_weeks_offset, "partition_weeks_offset")\n check.opt_str_param(description, "description")\n check.inst_param(default_status, "default_status", DefaultScheduleStatus)\n\n if start_date.hour != 0 or start_date.minute != 0 or start_date.second != 0:\n warnings.warn(\n "`start_date` must be at the beginning of a day for a weekly schedule. "\n "Use `execution_time` to execute the schedule at a specific time of day. 
For example, "\n "to run the schedule at 3AM each Tuesday starting on 10/20/2020, your schedule "\n "definition would look like:"\n """\n@weekly_schedule(\n start_date=datetime.datetime(2020, 10, 20),\n execution_day_of_week=1,\n execution_time=datetime.time(3, 0)\n):\ndef my_schedule_definition(_):\n ...\n"""\n )\n\n if execution_day_of_week < 0 or execution_day_of_week >= 7:\n raise DagsterInvalidDefinitionError(\n "`execution_day_of_week={}` is not valid for weekly schedule. Execution day must be "\n "between 0 [Sunday] and 6 [Saturday]".format(execution_day_of_week)\n )\n\n def inner(fn: Callable[[datetime.datetime], Dict[str, Any]]) -> PartitionScheduleDefinition:\n check.callable_param(fn, "fn")\n\n schedule_name = name or fn.__name__\n\n tags_fn_for_partition_value: Callable[\n ["Partition"], Optional[Dict[str, str]]\n ] = lambda partition: {}\n if tags_fn_for_date:\n tags_fn = cast(\n Callable[[datetime.datetime], Optional[Dict[str, str]]], tags_fn_for_date\n )\n tags_fn_for_partition_value = lambda partition: tags_fn(partition.value)\n\n fmt = DEFAULT_DATE_FORMAT\n\n partitions_def = ScheduleTimeBasedPartitionsDefinition(\n schedule_type=ScheduleType.WEEKLY,\n start=start_date,\n execution_time=execution_time,\n execution_day=execution_day_of_week,\n end=end_date,\n fmt=fmt,\n timezone=execution_timezone,\n offset=partition_weeks_offset,\n )\n\n partition_set = PartitionSetDefinition(\n name="{}_partitions".format(schedule_name),\n pipeline_name=pipeline_name, # type: ignore[arg-type]\n run_config_fn_for_partition=lambda partition: fn(partition.value),\n solid_selection=solid_selection,\n tags_fn_for_partition=tags_fn_for_partition_value,\n mode=mode,\n partitions_def=partitions_def,\n )\n\n schedule_def = partition_set.create_schedule_definition(\n schedule_name,\n partitions_def.get_cron_schedule(),\n should_execute=should_execute,\n environment_vars=environment_vars,\n partition_selector=create_offset_partition_selector(\n execution_time_to_partition_fn=partitions_def.get_execution_time_to_partition_fn(),\n ),\n execution_timezone=execution_timezone,\n description=description,\n decorated_fn=fn,\n default_status=default_status,\n )\n\n update_wrapper(schedule_def, wrapped=fn)\n return schedule_def\n\n return inner\n\n\ndef daily_schedule(\n pipeline_name: Optional[str],\n start_date: datetime.datetime,\n name: Optional[str] = None,\n execution_time: datetime.time = datetime.time(0, 0),\n tags_fn_for_date: Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]] = None,\n solid_selection: Optional[List[str]] = None,\n mode: Optional[str] = "default",\n should_execute: Optional[Callable[["ScheduleEvaluationContext"], bool]] = None,\n environment_vars: Optional[Dict[str, str]] = None,\n end_date: Optional[datetime.datetime] = None,\n execution_timezone: Optional[str] = None,\n partition_days_offset: Optional[int] = 1,\n description: Optional[str] = None,\n default_status: DefaultScheduleStatus = DefaultScheduleStatus.STOPPED,\n) -> Callable[[Callable[[datetime.datetime], Dict[str, Any]]], PartitionScheduleDefinition]:\n """Create a partitioned schedule that runs daily.\n\n The decorated function should accept a datetime object as its only argument. 
The datetime\n represents the date partition that it's meant to run on.\n\n The decorated function should return a run configuration dictionary, which will be used as\n configuration for the scheduled run.\n\n The decorator produces a :py:class:`~dagster.PartitionScheduleDefinition`.\n\n Args:\n pipeline_name (str): The name of the pipeline to execute when the schedule runs.\n start_date (datetime.datetime): The date from which to run the schedule.\n name (Optional[str]): The name of the schedule to create.\n execution_time (datetime.time): The time at which to execute the schedule.\n tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A\n function that generates tags to attach to the schedules runs. Takes the date of the\n schedule run and returns a dictionary of tags (string key-value pairs).\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The pipeline mode in which to execute this schedule.\n (Default: 'default')\n should_execute (Optional[Callable[ScheduleEvaluationContext, bool]]): A function that runs at\n schedule execution tie to determine whether a schedule should execute or skip. Takes a\n :py:class:`~dagster.ScheduleEvaluationContext` and returns a boolean (``True`` if the\n schedule should execute). Defaults to a function that always returns ``True``.\n environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing\n the schedule.\n end_date (Optional[datetime.datetime]): The last time to run the schedule to, defaults to\n current time.\n execution_timezone (Optional[str]): Timezone in which the schedule should run.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n partition_days_offset (Optional[int]): How many days back to go when choosing the partition\n for a given schedule execution. For example, when partition_days_offset=1, the schedule\n that executes during day N will fill in the partition for day N-1.\n (Default: 1)\n description (Optional[str]): A human-readable description of the schedule.\n default_status (DefaultScheduleStatus): Whether the schedule starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n """\n check.opt_str_param(pipeline_name, "pipeline_name")\n check.inst_param(start_date, "start_date", datetime.datetime)\n check.opt_str_param(name, "name")\n check.inst_param(execution_time, "execution_time", datetime.time)\n check.opt_inst_param(end_date, "end_date", datetime.datetime)\n check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")\n check.opt_nullable_list_param(solid_selection, "solid_selection", of_type=str)\n mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)\n check.opt_callable_param(should_execute, "should_execute")\n check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str)\n check.opt_str_param(execution_timezone, "execution_timezone")\n check.opt_int_param(partition_days_offset, "partition_days_offset")\n check.opt_str_param(description, "description")\n check.inst_param(default_status, "default_status", DefaultScheduleStatus)\n\n if start_date.hour != 0 or start_date.minute != 0 or start_date.second != 0:\n warnings.warn(\n "`start_date` must be at the beginning of a day for a daily schedule. 
"\n "Use `execution_time` to execute the schedule at a specific time of day. For example, "\n "to run the schedule at 3AM each day starting on 10/20/2020, your schedule "\n "definition would look like:"\n """\n@daily_schedule(\n start_date=datetime.datetime(2020, 10, 20),\n execution_time=datetime.time(3, 0)\n):\ndef my_schedule_definition(_):\n ...\n"""\n )\n\n fmt = DEFAULT_DATE_FORMAT\n\n def inner(fn: Callable[[datetime.datetime], Dict[str, Any]]) -> PartitionScheduleDefinition:\n check.callable_param(fn, "fn")\n\n schedule_name = name or fn.__name__\n\n tags_fn_for_partition_value: Callable[\n ["Partition"], Optional[Dict[str, str]]\n ] = lambda partition: {}\n if tags_fn_for_date:\n tags_fn = cast(\n Callable[[datetime.datetime], Optional[Dict[str, str]]], tags_fn_for_date\n )\n tags_fn_for_partition_value = lambda partition: tags_fn(partition.value)\n\n partitions_def = ScheduleTimeBasedPartitionsDefinition(\n schedule_type=ScheduleType.DAILY,\n start=start_date,\n execution_time=execution_time,\n end=end_date,\n fmt=fmt,\n timezone=execution_timezone,\n offset=partition_days_offset,\n )\n\n partition_set = PartitionSetDefinition(\n name="{}_partitions".format(schedule_name),\n pipeline_name=pipeline_name, # type: ignore[arg-type]\n run_config_fn_for_partition=lambda partition: fn(partition.value),\n solid_selection=solid_selection,\n tags_fn_for_partition=tags_fn_for_partition_value,\n mode=mode,\n partitions_def=partitions_def,\n )\n\n schedule_def = partition_set.create_schedule_definition(\n schedule_name,\n partitions_def.get_cron_schedule(),\n should_execute=should_execute,\n environment_vars=environment_vars,\n partition_selector=create_offset_partition_selector(\n execution_time_to_partition_fn=partitions_def.get_execution_time_to_partition_fn(),\n ),\n execution_timezone=execution_timezone,\n description=description,\n decorated_fn=fn,\n default_status=default_status,\n )\n update_wrapper(schedule_def, wrapped=fn)\n return schedule_def\n\n return inner\n\n\ndef hourly_schedule(\n pipeline_name: Optional[str],\n start_date: datetime.datetime,\n name: Optional[str] = None,\n execution_time: datetime.time = datetime.time(0, 0),\n tags_fn_for_date: Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]] = None,\n solid_selection: Optional[List[str]] = None,\n mode: Optional[str] = "default",\n should_execute: Optional[Callable[["ScheduleEvaluationContext"], bool]] = None,\n environment_vars: Optional[Dict[str, str]] = None,\n end_date: Optional[datetime.datetime] = None,\n execution_timezone: Optional[str] = None,\n partition_hours_offset: Optional[int] = 1,\n description: Optional[str] = None,\n default_status: DefaultScheduleStatus = DefaultScheduleStatus.STOPPED,\n) -> Callable[[Callable[[datetime.datetime], Dict[str, Any]]], PartitionScheduleDefinition]:\n """Create a partitioned schedule that runs hourly.\n\n The decorated function should accept a datetime object as its only argument. The datetime\n represents the date partition that it's meant to run on.\n\n The decorated function should return a run configuration dictionary, which will be used as\n configuration for the scheduled run.\n\n The decorator produces a :py:class:`~dagster.PartitionScheduleDefinition`.\n\n Args:\n pipeline_name (str): The name of the pipeline to execute when the schedule runs.\n start_date (datetime.datetime): The date from which to run the schedule.\n name (Optional[str]): The name of the schedule to create. 
By default, this will be the name\n of the decorated function.\n execution_time (datetime.time): The time at which to execute the schedule. Only the minutes\n component will be respected -- the hour should be 0, and will be ignored if it is not 0.\n tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A\n function that generates tags to attach to the schedules runs. Takes the date of the\n schedule run and returns a dictionary of tags (string key-value pairs).\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The pipeline mode in which to execute this schedule.\n (Default: 'default')\n should_execute (Optional[Callable[ScheduleEvaluationContext, bool]]): A function that runs at\n schedule execution tie to determine whether a schedule should execute or skip. Takes a\n :py:class:`~dagster.ScheduleEvaluationContext` and returns a boolean (``True`` if the\n schedule should execute). Defaults to a function that always returns ``True``.\n environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing\n the schedule.\n end_date (Optional[datetime.datetime]): The last time to run the schedule to, defaults to\n current time.\n execution_timezone (Optional[str]): Timezone in which the schedule should run.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n partition_hours_offset (Optional[int]): How many hours back to go when choosing the partition\n for a given schedule execution. For example, when partition_hours_offset=1, the schedule\n that executes during hour N will fill in the partition for hour N-1.\n (Default: 1)\n description (Optional[str]): A human-readable description of the schedule.\n default_status (DefaultScheduleStatus): Whether the schedule starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n """\n check.opt_str_param(name, "name")\n check.inst_param(start_date, "start_date", datetime.datetime)\n check.opt_inst_param(end_date, "end_date", datetime.datetime)\n check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")\n check.opt_nullable_list_param(solid_selection, "solid_selection", of_type=str)\n mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)\n check.opt_callable_param(should_execute, "should_execute")\n check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str)\n check.opt_str_param(pipeline_name, "pipeline_name")\n check.inst_param(execution_time, "execution_time", datetime.time)\n check.opt_str_param(execution_timezone, "execution_timezone")\n check.opt_int_param(partition_hours_offset, "partition_hours_offset")\n check.opt_str_param(description, "description")\n check.inst_param(default_status, "default_status", DefaultScheduleStatus)\n\n if start_date.minute != 0 or start_date.second != 0:\n warnings.warn(\n "`start_date` must be at the beginning of the hour for an hourly schedule. "\n "Use `execution_time` to execute the schedule at a specific time within the hour. 
For "\n "example, to run the schedule each hour at 15 minutes past the hour starting at 3AM "\n "on 10/20/2020, your schedule definition would look like:"\n """\n@hourly_schedule(\n start_date=datetime.datetime(2020, 10, 20, 3),\n execution_time=datetime.time(0, 15)\n):\ndef my_schedule_definition(_):\n ...\n"""\n )\n\n if execution_time.hour != 0:\n warnings.warn(\n "Hourly schedule {schedule_name} created with:\\n"\n "\\tschedule_time=datetime.time(hour={hour}, minute={minute}, ...)."\n "Since this is an hourly schedule, the hour parameter will be ignored and the schedule "\n "will run on the {minute} mark for the previous hour interval. Replace "\n "datetime.time(hour={hour}, minute={minute}, ...) with "\n "datetime.time(minute={minute}, ...) to fix this warning."\n )\n\n def inner(fn: Callable[[datetime.datetime], Dict[str, Any]]) -> PartitionScheduleDefinition:\n check.callable_param(fn, "fn")\n\n schedule_name = name or fn.__name__\n\n tags_fn_for_partition_value: Callable[\n ["Partition"], Optional[Dict[str, str]]\n ] = lambda partition: {}\n if tags_fn_for_date:\n tags_fn = cast(\n Callable[[datetime.datetime], Optional[Dict[str, str]]], tags_fn_for_date\n )\n tags_fn_for_partition_value = lambda partition: tags_fn(partition.value)\n\n fmt = (\n DEFAULT_HOURLY_FORMAT_WITH_TIMEZONE\n if execution_timezone\n else DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE\n )\n\n partitions_def = ScheduleTimeBasedPartitionsDefinition(\n schedule_type=ScheduleType.HOURLY,\n start=start_date,\n execution_time=execution_time,\n end=end_date,\n fmt=fmt,\n timezone=execution_timezone,\n offset=partition_hours_offset,\n )\n\n partition_set = PartitionSetDefinition(\n name="{}_partitions".format(schedule_name),\n pipeline_name=pipeline_name, # type: ignore[arg-type]\n run_config_fn_for_partition=lambda partition: fn(partition.value),\n solid_selection=solid_selection,\n tags_fn_for_partition=tags_fn_for_partition_value,\n mode=mode,\n partitions_def=partitions_def,\n )\n\n schedule_def = partition_set.create_schedule_definition(\n schedule_name,\n partitions_def.get_cron_schedule(),\n should_execute=should_execute,\n environment_vars=environment_vars,\n partition_selector=create_offset_partition_selector(\n execution_time_to_partition_fn=partitions_def.get_execution_time_to_partition_fn(),\n ),\n execution_timezone=execution_timezone,\n description=description,\n decorated_fn=fn,\n default_status=default_status,\n )\n\n update_wrapper(schedule_def, wrapped=fn)\n return schedule_def\n\n return inner\n
", "current_page_name": "_modules/dagster/core/definitions/decorators/schedule_decorator", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.decorators.schedule_decorator"}, "sensor_decorator": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.decorators.sensor_decorator

\nimport inspect\nfrom functools import update_wrapper\nfrom typing import TYPE_CHECKING, Callable, Generator, List, Optional, Sequence, Union\n\nfrom dagster import check\nfrom dagster.core.definitions.sensor_definition import (\n    DefaultSensorStatus,\n    RunRequest,\n    SensorDefinition,\n    SkipReason,\n)\nfrom dagster.core.errors import DagsterInvariantViolationError\n\nfrom ...errors import DagsterInvariantViolationError\nfrom ..events import AssetKey\nfrom ..graph_definition import GraphDefinition\nfrom ..job_definition import JobDefinition\nfrom ..sensor_definition import (\n    AssetSensorDefinition,\n    RunRequest,\n    SensorDefinition,\n    SensorEvaluationContext,\n    SkipReason,\n)\n\nif TYPE_CHECKING:\n    from ...events.log import EventLogEntry\n\n\n
[docs]def sensor(\n pipeline_name: Optional[str] = None,\n name: Optional[str] = None,\n solid_selection: Optional[List[str]] = None,\n mode: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[Union[GraphDefinition, JobDefinition]] = None,\n jobs: Optional[Sequence[Union[GraphDefinition, JobDefinition]]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n) -> Callable[\n [\n Callable[\n [SensorEvaluationContext],\n Union[Generator[Union[RunRequest, SkipReason], None, None], RunRequest, SkipReason],\n ]\n ],\n SensorDefinition,\n]:\n """\n Creates a sensor where the decorated function is used as the sensor's evaluation function. The\n decorated function may:\n\n 1. Return a `RunRequest` object.\n 2. Yield multiple of `RunRequest` objects.\n 3. Return or yield a `SkipReason` object, providing a descriptive message of why no runs were\n requested.\n 4. Return or yield nothing (skipping without providing a reason)\n\n Takes a :py:class:`~dagster.SensorEvaluationContext`.\n\n Args:\n pipeline_name (Optional[str]): (legacy) Name of the target pipeline. Cannot be used in\n conjunction with `job` or `jobs` parameters.\n name (Optional[str]): The name of the sensor. Defaults to the name of the decorated\n function.\n solid_selection (Optional[List[str]]): (legacy) A list of solid subselection (including single\n solid names) to execute for runs for this sensor e.g.\n ``['*some_solid+', 'other_solid']``.\n Cannot be used in conjunction with `job` or `jobs` parameters.\n mode (Optional[str]): (legacy) The mode to apply when executing runs for this sensor. Cannot be used\n in conjunction with `job` or `jobs` parameters.\n (default: 'default')\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[Union[GraphDefinition, JobDefinition]]): The job to be executed when the sensor fires.\n jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition]]]): (experimental) A list of jobs to be executed when the sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n """\n check.opt_str_param(name, "name")\n\n def inner(\n fn: Callable[\n ["SensorEvaluationContext"],\n Union[Generator[Union[SkipReason, RunRequest], None, None], SkipReason, RunRequest],\n ]\n ) -> SensorDefinition:\n check.callable_param(fn, "fn")\n\n sensor_def = SensorDefinition(\n name=name,\n pipeline_name=pipeline_name,\n evaluation_fn=fn,\n solid_selection=solid_selection,\n mode=mode,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n )\n\n update_wrapper(sensor_def, wrapped=fn)\n\n return sensor_def\n\n return inner
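# Illustrative usage (not part of the original module): a minimal sketch of a sensor
# whose evaluation function yields RunRequest or SkipReason, as described above.
# The watched path and the op/job names are hypothetical.
import os

from dagster import RunRequest, SkipReason, job, op, sensor


@op
def process_file(context):
    context.log.info("processing new file")


@job
def process_file_job():
    process_file()


@sensor(job=process_file_job, minimum_interval_seconds=60)
def new_file_sensor(_context):
    path = "/tmp/incoming/data.csv"  # hypothetical file to watch
    if os.path.exists(path):
        # run_key de-duplicates requests across evaluations for the same file
        yield RunRequest(run_key=path, run_config={})
    else:
        yield SkipReason(f"{path} does not exist yet.")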
\n\n\n
[docs]def asset_sensor(\n asset_key: AssetKey,\n pipeline_name: Optional[str] = None,\n name: Optional[str] = None,\n solid_selection: Optional[List[str]] = None,\n mode: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[Union[GraphDefinition, JobDefinition]] = None,\n jobs: Optional[Sequence[Union[GraphDefinition, JobDefinition]]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n) -> Callable[\n [\n Callable[\n [\n "SensorEvaluationContext",\n "EventLogEntry",\n ],\n Union[Generator[Union[RunRequest, SkipReason], None, None], RunRequest, SkipReason],\n ]\n ],\n AssetSensorDefinition,\n]:\n """\n Creates an asset sensor where the decorated function is used as the asset sensor's evaluation\n function. The decorated function may:\n\n 1. Return a `RunRequest` object.\n 2. Yield multiple of `RunRequest` objects.\n 3. Return or yield a `SkipReason` object, providing a descriptive message of why no runs were\n requested.\n 4. Return or yield nothing (skipping without providing a reason)\n\n Takes a :py:class:`~dagster.SensorEvaluationContext` and an EventLogEntry corresponding to an\n AssetMaterialization event.\n\n Args:\n asset_key (AssetKey): The asset_key this sensor monitors.\n pipeline_name (Optional[str]): (legacy) Name of the target pipeline. Cannot be used in conjunction with `job` or `jobs` parameters.\n name (Optional[str]): The name of the sensor. Defaults to the name of the decorated\n function.\n solid_selection (Optional[List[str]]): (legacy) A list of solid subselection (including single\n solid names) to execute for runs for this sensor e.g.\n ``['*some_solid+', 'other_solid']``. Cannot be used in conjunction with `job` or `jobs`\n parameters.\n mode (Optional[str]): (legacy) The mode to apply when executing runs for this sensor. Cannot be used\n in conjunction with `job` or `jobs` parameters.\n (default: 'default')\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[Union[GraphDefinition, JobDefinition]]): The job to be executed when the sensor fires.\n jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition]]]): (experimental) A list of jobs to be executed when the sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n """\n\n check.opt_str_param(name, "name")\n\n def inner(\n fn: Callable[\n [\n "SensorEvaluationContext",\n "EventLogEntry",\n ],\n Union[Generator[Union[SkipReason, RunRequest], None, None], SkipReason, RunRequest],\n ]\n ) -> AssetSensorDefinition:\n check.callable_param(fn, "fn")\n sensor_name = name or fn.__name__\n\n def _wrapped_fn(context, event):\n result = fn(context, event)\n\n if inspect.isgenerator(result):\n for item in result:\n yield item\n elif isinstance(result, (RunRequest, SkipReason)):\n yield result\n\n elif result is not None:\n raise DagsterInvariantViolationError(\n (\n "Error in sensor {sensor_name}: Sensor unexpectedly returned output "\n "{result} of type {type_}. 
Should only return SkipReason or "\n "RunRequest objects."\n ).format(sensor_name=sensor_name, result=result, type_=type(result))\n )\n\n return AssetSensorDefinition(\n name=sensor_name,\n asset_key=asset_key,\n pipeline_name=pipeline_name,\n asset_materialization_fn=_wrapped_fn,\n solid_selection=solid_selection,\n mode=mode,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n )\n\n return inner
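# Illustrative usage (not part of the original module): a minimal sketch of an asset
# sensor reacting to materializations of a (hypothetical) asset key, mirroring the
# _wrapped_fn handling above.
from dagster import AssetKey, RunRequest, asset_sensor, job, op


@op
def refresh_downstream(context):
    context.log.info("refreshing downstream data")


@job
def downstream_job():
    refresh_downstream()


@asset_sensor(asset_key=AssetKey("my_table"), job=downstream_job)
def my_table_sensor(context, asset_event):
    # asset_event is the EventLogEntry for the AssetMaterialization that fired the sensor.
    yield RunRequest(run_key=context.cursor, run_config={})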
\n
", "current_page_name": "_modules/dagster/core/definitions/decorators/sensor_decorator", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.decorators.sensor_decorator"}, "solid_decorator": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.decorators.solid_decorator

\nfrom functools import lru_cache, update_wrapper\nfrom typing import (\n    Any,\n    Callable,\n    Dict,\n    List,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n    overload,\n)\n\nfrom dagster import check\nfrom dagster.core.decorator_utils import format_docstring_for_description\nfrom dagster.core.errors import DagsterInvalidDefinitionError\nfrom dagster.core.types.dagster_type import DagsterTypeKind\nfrom dagster.seven import funcsigs\n\nfrom ...decorator_utils import (\n    get_function_params,\n    get_valid_name_permutations,\n    param_is_var_keyword,\n    positional_arg_name_list,\n)\nfrom ..inference import infer_input_props, infer_output_props\nfrom ..input import InputDefinition\nfrom ..output import OutputDefinition\nfrom ..policy import RetryPolicy\nfrom ..solid_definition import SolidDefinition\n\n\nclass DecoratedSolidFunction(NamedTuple):\n    """Wrapper around the decorated solid function to provide commonly used util methods"""\n\n    decorated_fn: Callable[..., Any]\n\n    @lru_cache(maxsize=1)\n    def has_context_arg(self) -> bool:\n        return is_context_provided(get_function_params(self.decorated_fn))\n\n    @lru_cache(maxsize=1)\n    def _get_function_params(self) -> List[funcsigs.Parameter]:\n        return get_function_params(self.decorated_fn)\n\n    def positional_inputs(self) -> List[str]:\n        params = self._get_function_params()\n        input_args = params[1:] if self.has_context_arg() else params\n        return positional_arg_name_list(input_args)\n\n    def has_var_kwargs(self) -> bool:\n        params = self._get_function_params()\n        # var keyword arg has to be the last argument\n        return len(params) > 0 and param_is_var_keyword(params[-1])\n\n\nclass NoContextDecoratedSolidFunction(DecoratedSolidFunction):\n    """Wrapper around a decorated solid function, when the decorator does not permit a context\n    parameter (such as lambda_solid).\n    """\n\n    @lru_cache(maxsize=1)\n    def has_context_arg(self) -> bool:\n        return False\n\n\nclass _Solid:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        input_defs: Optional[Sequence[InputDefinition]] = None,\n        output_defs: Optional[Sequence[OutputDefinition]] = None,\n        description: Optional[str] = None,\n        required_resource_keys: Optional[Set[str]] = None,\n        config_schema: Optional[Union[Any, Dict[str, Any]]] = None,\n        tags: Optional[Dict[str, Any]] = None,\n        version: Optional[str] = None,\n        decorator_takes_context: Optional[bool] = True,\n        retry_policy: Optional[RetryPolicy] = None,\n    ):\n        self.name = check.opt_str_param(name, "name")\n        self.input_defs = check.opt_list_param(input_defs, "input_defs", InputDefinition)\n        self.output_defs = check.opt_nullable_sequence_param(\n            output_defs, "output_defs", OutputDefinition\n        )\n        self.decorator_takes_context = check.bool_param(\n            decorator_takes_context, "decorator_takes_context"\n        )\n\n        self.description = check.opt_str_param(description, "description")\n\n        # these will be checked within SolidDefinition\n        self.required_resource_keys = required_resource_keys\n        self.tags = tags\n        self.version = version\n        self.retry_policy = retry_policy\n\n        # config will be checked within SolidDefinition\n        self.config_schema = config_schema\n\n    def __call__(self, fn: Callable[..., Any]) -> SolidDefinition:\n     
   check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        output_defs: Sequence[OutputDefinition]\n        if self.output_defs is None:\n            output_defs = [OutputDefinition.create_from_inferred(infer_output_props(fn))]\n        elif len(self.output_defs) == 1:\n            output_defs = [self.output_defs[0].combine_with_inferred(infer_output_props(fn))]\n        else:\n            output_defs = self.output_defs\n\n        compute_fn = (\n            DecoratedSolidFunction(decorated_fn=fn)\n            if self.decorator_takes_context\n            else NoContextDecoratedSolidFunction(decorated_fn=fn)\n        )\n\n        resolved_input_defs = resolve_checked_solid_fn_inputs(\n            decorator_name="@solid",\n            fn_name=self.name,\n            compute_fn=compute_fn,\n            explicit_input_defs=self.input_defs,\n            exclude_nothing=True,\n        )\n\n        solid_def = SolidDefinition(\n            name=self.name,\n            input_defs=resolved_input_defs,\n            output_defs=output_defs,\n            compute_fn=compute_fn,\n            config_schema=self.config_schema,\n            description=self.description or format_docstring_for_description(fn),\n            required_resource_keys=self.required_resource_keys,\n            tags=self.tags,\n            version=self.version,\n            retry_policy=self.retry_policy,\n        )\n        update_wrapper(solid_def, compute_fn.decorated_fn)\n        return solid_def\n\n\n@overload\ndef solid(name: Callable[..., Any]) -> SolidDefinition:\n    ...\n\n\n@overload\ndef solid(\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    input_defs: Optional[Sequence[InputDefinition]] = ...,\n    output_defs: Optional[Sequence[OutputDefinition]] = ...,\n    config_schema: Optional[Union[Any, Dict[str, Any]]] = ...,\n    required_resource_keys: Optional[Set[str]] = ...,\n    tags: Optional[Dict[str, Any]] = ...,\n    version: Optional[str] = ...,\n    retry_policy: Optional[RetryPolicy] = ...,\n) -> Union[_Solid, SolidDefinition]:\n    ...\n\n\n
[docs]def solid(\n name: Optional[Union[Callable[..., Any], str]] = None,\n description: Optional[str] = None,\n input_defs: Optional[Sequence[InputDefinition]] = None,\n output_defs: Optional[Sequence[OutputDefinition]] = None,\n config_schema: Optional[Union[Any, Dict[str, Any]]] = None,\n required_resource_keys: Optional[Set[str]] = None,\n tags: Optional[Dict[str, Any]] = None,\n version: Optional[str] = None,\n retry_policy: Optional[RetryPolicy] = None,\n) -> Union[_Solid, SolidDefinition]:\n """Create a solid with the specified parameters from the decorated function.\n\n This shortcut simplifies the core :class:`SolidDefinition` API by exploding arguments into\n kwargs of the decorated compute function and omitting additional parameters when they are not\n needed.\n\n Input and output definitions will be inferred from the type signature of the decorated function\n if not explicitly provided.\n\n The decorated function will be used as the solid's compute function. The signature of the\n decorated function is more flexible than that of the ``compute_fn`` in the core API; it may:\n\n 1. Return a value. This value will be wrapped in an :py:class:`Output` and yielded by the compute function.\n 2. Return an :py:class:`Output`. This output will be yielded by the compute function.\n 3. Yield :py:class:`Output` or other :ref:`event objects <events>`. Same as default compute behavior.\n\n Note that options 1) and 2) are incompatible with yielding other events -- if you would like\n to decorate a function that yields events, it must also wrap its eventual output in an\n :py:class:`Output` and yield it.\n\n @solid supports ``async def`` functions as well, including async generators when yielding multiple\n events or outputs. Note that async solids will generally be run on their own unless using a custom\n :py:class:`Executor` implementation that supports running them together.\n\n Args:\n name (Optional[str]): Name of solid. Must be unique within any :py:class:`PipelineDefinition`\n using the solid.\n description (Optional[str]): Human-readable description of this solid. If not provided, and\n the decorated function has docstring, that docstring will be used as the description.\n input_defs (Optional[List[InputDefinition]]):\n Information about the inputs to the solid. Information provided here will be combined\n with what can be inferred from the function signature, with these explicit InputDefinitions\n taking precedence.\n output_defs (Optional[List[OutputDefinition]]):\n Information about the solids outputs. Information provided here will be combined with\n what can be inferred from the return type signature if there is only one OutputDefinition\n and the function does not use yield.\n config_schema (Optional[ConfigSchema): The schema for the config. If set, Dagster will check\n that config provided for the solid matches this schema and fail if it does not. If not\n set, Dagster will accept any config provided for the solid.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by this solid.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the solid. Frameworks may\n expect and require certain metadata to be attached to a solid. Values that are not strings\n will be json encoded and must meet the criteria that `json.loads(json.dumps(value)) == value`.\n version (Optional[str]): (Experimental) The version of the solid's compute_fn. 
Two solids should have\n the same version if and only if they deterministically produce the same outputs when\n provided the same inputs.\n retry_policy (Optional[RetryPolicy]): The retry policy for this solid.\n\n\n Examples:\n\n .. code-block:: python\n\n @solid\n def hello_world():\n print('hello')\n\n @solid\n def hello_world():\n return {'foo': 'bar'}\n\n @solid\n def hello_world():\n return Output(value={'foo': 'bar'})\n\n @solid\n def hello_world():\n yield Output(value={'foo': 'bar'})\n\n @solid\n def hello_world(foo):\n return foo\n\n @solid(\n input_defs=[InputDefinition(name="foo", str)],\n output_defs=[OutputDefinition(str)]\n )\n def hello_world(foo):\n # explicitly type and name inputs and outputs\n return foo\n\n @solid\n def hello_world(foo: str) -> str:\n # same as above inferred from signature\n return foo\n\n @solid\n def hello_world(context, foo):\n context.log.info('log something')\n return foo\n\n @solid(\n config_schema={'str_value' : Field(str)}\n )\n def hello_world(context, foo):\n # context.solid_config is a dictionary with 'str_value' key\n return foo + context.solid_config['str_value']\n\n """\n # This case is for when decorator is used bare, without arguments. e.g. @solid versus @solid()\n if callable(name):\n check.invariant(input_defs is None)\n check.invariant(output_defs is None)\n check.invariant(description is None)\n check.invariant(config_schema is None)\n check.invariant(required_resource_keys is None)\n check.invariant(tags is None)\n check.invariant(version is None)\n\n return _Solid()(name)\n\n return _Solid(\n name=name,\n input_defs=input_defs,\n output_defs=output_defs,\n config_schema=config_schema,\n description=description,\n required_resource_keys=required_resource_keys,\n tags=tags,\n version=version,\n retry_policy=retry_policy,\n )
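# Illustrative usage (not part of the original module): a minimal sketch of the
# ``async def`` support mentioned in the docstring above. The sleep is a stand-in
# for real asynchronous I/O.
import asyncio

from dagster import solid


@solid
async def fetch_payload(context):
    await asyncio.sleep(0.1)  # placeholder for an async call
    context.log.info("payload fetched")
    return {"status": "ok"}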
\n\n\ndef resolve_checked_solid_fn_inputs(\n decorator_name: str,\n fn_name: str,\n compute_fn: DecoratedSolidFunction,\n explicit_input_defs: List[InputDefinition],\n exclude_nothing: bool,\n) -> List[InputDefinition]:\n """\n Validate provided input definitions and infer the remaining from the type signature of the compute_fn.\n Returns the resolved set of InputDefinitions.\n\n Args:\n decorator_name (str): Name of the decorator that is wrapping the op/solid function.\n fn_name (str): Name of the decorated function.\n compute_fn (DecoratedSolidFunction): The decorated function, wrapped in the\n DecoratedSolidFunction wrapper.\n explicit_input_defs (List[InputDefinition]): The input definitions that were explicitly\n provided in the decorator.\n exclude_nothing (bool): True if Nothing type inputs should be excluded from compute_fn\n arguments.\n """\n\n if exclude_nothing:\n explicit_names = set(\n inp.name\n for inp in explicit_input_defs\n if not inp.dagster_type.kind == DagsterTypeKind.NOTHING\n )\n nothing_names = set(\n inp.name\n for inp in explicit_input_defs\n if inp.dagster_type.kind == DagsterTypeKind.NOTHING\n )\n else:\n explicit_names = set(inp.name for inp in explicit_input_defs)\n nothing_names = set()\n\n params = get_function_params(compute_fn.decorated_fn)\n\n input_args = params[1:] if compute_fn.has_context_arg() else params\n\n # Validate input arguments\n used_inputs = set()\n inputs_to_infer = set()\n has_kwargs = False\n\n for param in cast(List[funcsigs.Parameter], input_args):\n if param.kind == funcsigs.Parameter.VAR_KEYWORD:\n has_kwargs = True\n elif param.kind == funcsigs.Parameter.VAR_POSITIONAL:\n raise DagsterInvalidDefinitionError(\n f"{decorator_name} '{fn_name}' decorated function has positional vararg parameter "\n f"'{param}'. {decorator_name} decorated functions should only have keyword "\n "arguments that match input names and, if system information is required, a first "\n "positional parameter named 'context'."\n )\n\n else:\n if param.name not in explicit_names:\n if param.name in nothing_names:\n raise DagsterInvalidDefinitionError(\n f"{decorator_name} '{fn_name}' decorated function has parameter '{param.name}' that is "\n "one of the input_defs of type 'Nothing' which should not be included since "\n "no data will be passed for it. "\n )\n else:\n inputs_to_infer.add(param.name)\n\n else:\n used_inputs.add(param.name)\n\n undeclared_inputs = explicit_names - used_inputs\n if not has_kwargs and undeclared_inputs:\n undeclared_inputs_printed = ", '".join(undeclared_inputs)\n raise DagsterInvalidDefinitionError(\n f"{decorator_name} '{fn_name}' decorated function does not have parameter(s) "\n f"'{undeclared_inputs_printed}', which are in provided input_defs. 
{decorator_name} "\n "decorated functions should only have keyword arguments that match input names and, if "\n "system information is required, a first positional parameter named 'context'."\n )\n\n inferred_props = {\n inferred.name: inferred\n for inferred in infer_input_props(compute_fn.decorated_fn, compute_fn.has_context_arg())\n }\n input_defs = []\n for input_def in explicit_input_defs:\n if input_def.name in inferred_props:\n # combine any information missing on the explicit def that can be inferred\n input_defs.append(\n input_def.combine_with_inferred(\n inferred_props[input_def.name], decorator_name=decorator_name\n )\n )\n else:\n # pass through those that don't have any inference info, such as Nothing type inputs\n input_defs.append(input_def)\n\n # build defs from the inferred props for those without explicit entries\n input_defs.extend(\n InputDefinition.create_from_inferred(inferred, decorator_name=decorator_name)\n for inferred in inferred_props.values()\n if inferred.name in inputs_to_infer\n )\n\n return input_defs\n\n\ndef is_context_provided(params: List[funcsigs.Parameter]) -> bool:\n if len(params) == 0:\n return False\n return params[0].name in get_valid_name_permutations("context")\n\n\ndef lambda_solid(\n name: Optional[Union[str, Callable[..., Any]]] = None,\n description: Optional[str] = None,\n input_defs: Optional[List[InputDefinition]] = None,\n output_def: Optional[OutputDefinition] = None,\n) -> Union[_Solid, SolidDefinition]:\n """Create a simple solid from the decorated function.\n\n This shortcut allows the creation of simple solids that do not require\n configuration and whose implementations do not require a\n :py:class:`context <SolidExecutionContext>`.\n\n Lambda solids take any number of inputs and produce a single output.\n\n Inputs can be defined using :class:`InputDefinition` and passed to the ``input_defs`` argument\n of this decorator, or inferred from the type signature of the decorated function.\n\n The single output can be defined using :class:`OutputDefinition` and passed as the\n ``output_def`` argument of this decorator, or its type can be inferred from the type signature\n of the decorated function.\n\n The body of the decorated function should return a single value, which will be yielded as the\n solid's output.\n\n Args:\n name (str): Name of solid.\n description (str): Solid description.\n input_defs (List[InputDefinition]): List of input_defs.\n output_def (OutputDefinition): The output of the solid. Defaults to\n :class:`OutputDefinition() <OutputDefinition>`.\n\n Examples:\n\n .. code-block:: python\n\n @lambda_solid\n def hello_world():\n return 'hello'\n\n @lambda_solid(\n input_defs=[InputDefinition(name='foo', str)],\n output_def=OutputDefinition(str)\n )\n def hello_world(foo):\n # explicitly type and name inputs and outputs\n return foo\n\n @lambda_solid\n def hello_world(foo: str) -> str:\n # same as above inferred from signature\n return foo\n\n """\n if callable(name):\n check.invariant(input_defs is None)\n check.invariant(description is None)\n return _Solid(\n output_defs=[output_def] if output_def else None, decorator_takes_context=False\n )(name)\n\n return _Solid(\n name=name,\n input_defs=input_defs,\n output_defs=[output_def] if output_def else None,\n description=description,\n decorator_takes_context=False,\n )\n
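Referring back to `resolve_checked_solid_fn_inputs` above, a brief illustration of the explicit/inferred merge it performs: an input declared in `input_defs` can still pick up its type from the function signature, while undeclared parameters are inferred entirely. The solid below is a hypothetical example, assuming the standard `@solid`/`InputDefinition` APIs.

.. code-block:: python

    from dagster import InputDefinition, solid

    @solid(input_defs=[InputDefinition("a", description="left operand")])
    def add(_, a: int, b: int) -> int:
        # "a" combines the explicit InputDefinition with the inferred int type;
        # "b" is inferred purely from the signature.
        return a + b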
", "current_page_name": "_modules/dagster/core/definitions/decorators/solid_decorator", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.decorators.solid_decorator"}}, "dependency": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.dependency

\nfrom abc import ABC, abstractmethod\nfrom collections import defaultdict\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    List,\n    NamedTuple,\n    Optional,\n    Tuple,\n    Type,\n    Union,\n    cast,\n)\n\nfrom dagster import check\nfrom dagster.core.definitions.policy import RetryPolicy\nfrom dagster.core.errors import DagsterInvalidDefinitionError\nfrom dagster.serdes.serdes import (\n    DefaultNamedTupleSerializer,\n    WhitelistMap,\n    register_serdes_tuple_fallbacks,\n    whitelist_for_serdes,\n)\nfrom dagster.utils import frozentags\n\nfrom .hook_definition import HookDefinition\nfrom .input import FanInInputPointer, InputDefinition, InputMapping, InputPointer\nfrom .output import OutputDefinition\nfrom .utils import DEFAULT_OUTPUT, struct_to_string, validate_tags\n\nif TYPE_CHECKING:\n    from .composition import MappedInputPlaceholder\n    from .graph_definition import GraphDefinition\n    from .node_definition import NodeDefinition\n\n\nclass NodeInvocation(\n    NamedTuple(\n        "Node",\n        [\n            ("name", str),\n            ("alias", Optional[str]),\n            ("tags", Dict[str, Any]),\n            ("hook_defs", AbstractSet[HookDefinition]),\n            ("retry_policy", Optional[RetryPolicy]),\n        ],\n    )\n):\n    """Identifies an instance of a node in a graph dependency structure.\n\n    Args:\n        name (str): Name of the solid of which this is an instance.\n        alias (Optional[str]): Name specific to this instance of the solid. Necessary when there are\n            multiple instances of the same solid.\n        tags (Optional[Dict[str, Any]]): Optional tags values to extend or override those\n            set on the solid definition.\n        hook_defs (Optional[AbstractSet[HookDefinition]]): A set of hook definitions applied to the\n            solid instance.\n\n    Examples:\n\n    In general, users should prefer not to construct this class directly or use the\n    :py:class:`JobDefinition` API that requires instances of this class. Instead, use the\n    :py:func:`@job <job>` API:\n\n    .. code-block:: python\n        from dagster import job\n\n        @job\n        def my_job():\n            other_name = some_op.alias('other_name')\n            some_graph(other_name(some_op))\n\n    """\n\n    def __new__(\n        cls,\n        name: str,\n        alias: Optional[str] = None,\n        tags: Optional[Dict[str, str]] = None,\n        hook_defs: Optional[AbstractSet[HookDefinition]] = None,\n        retry_policy: Optional[RetryPolicy] = None,\n    ):\n        return super().__new__(\n            cls,\n            name=check.str_param(name, "name"),\n            alias=check.opt_str_param(alias, "alias"),\n            tags=frozentags(check.opt_dict_param(tags, "tags", value_type=str, key_type=str)),\n            hook_defs=frozenset(\n                check.opt_set_param(hook_defs, "hook_defs", of_type=HookDefinition)\n            ),\n            retry_policy=check.opt_inst_param(retry_policy, "retry_policy", RetryPolicy),\n        )\n\n\nSolidInvocation = NodeInvocation\n\n\nclass Node:\n    """\n    Node invocation within a graph. 
Identified by its name inside the graph.\n    """\n\n    def __init__(\n        self,\n        name: str,\n        definition: "NodeDefinition",\n        graph_definition: "GraphDefinition",\n        tags: Optional[Dict[str, str]] = None,\n        hook_defs: Optional[AbstractSet[HookDefinition]] = None,\n        retry_policy: Optional[RetryPolicy] = None,\n    ):\n        from .graph_definition import GraphDefinition\n        from .solid_definition import NodeDefinition\n\n        self.name = check.str_param(name, "name")\n        self.definition = check.inst_param(definition, "definition", NodeDefinition)\n        self.graph_definition = check.inst_param(\n            graph_definition,\n            "graph_definition",\n            GraphDefinition,\n        )\n        self._additional_tags = validate_tags(tags)\n        self._hook_defs = check.opt_set_param(hook_defs, "hook_defs", of_type=HookDefinition)\n        self._retry_policy = check.opt_inst_param(retry_policy, "retry_policy", RetryPolicy)\n\n        input_handles = {}\n        for name, input_def in self.definition.input_dict.items():\n            input_handles[name] = SolidInputHandle(self, input_def)\n\n        self._input_handles = input_handles\n\n        output_handles = {}\n        for name, output_def in self.definition.output_dict.items():\n            output_handles[name] = SolidOutputHandle(self, output_def)\n\n        self._output_handles = output_handles\n\n    def input_handles(self):\n        return self._input_handles.values()\n\n    def output_handles(self):\n        return self._output_handles.values()\n\n    def input_handle(self, name: str) -> "SolidInputHandle":\n        check.str_param(name, "name")\n        return self._input_handles[name]\n\n    def output_handle(self, name: str) -> "SolidOutputHandle":\n        check.str_param(name, "name")\n        return self._output_handles[name]\n\n    def has_input(self, name: str) -> bool:\n        return self.definition.has_input(name)\n\n    def input_def_named(self, name: str) -> InputDefinition:\n        return self.definition.input_def_named(name)\n\n    def has_output(self, name: str) -> bool:\n        return self.definition.has_output(name)\n\n    def output_def_named(self, name: str) -> OutputDefinition:\n        return self.definition.output_def_named(name)\n\n    @property\n    def is_graph(self) -> bool:\n        from .graph_definition import GraphDefinition\n\n        return isinstance(self.definition, GraphDefinition)\n\n    def describe_node(self) -> str:\n        from .op_definition import OpDefinition\n        from .solid_definition import CompositeSolidDefinition, SolidDefinition\n\n        if isinstance(self.definition, CompositeSolidDefinition):\n            return f"composite solid '{self.name}'"\n        elif isinstance(self.definition, OpDefinition):\n            return f"op '{self.name}'"\n        elif isinstance(self.definition, SolidDefinition):\n            return f"solid '{self.name}'"\n        else:\n            return f"graph '{self.name}'"\n\n    @property\n    def input_dict(self):\n        return self.definition.input_dict\n\n    @property\n    def output_dict(self):\n        return self.definition.output_dict\n\n    @property\n    def tags(self) -> frozentags:\n        return self.definition.tags.updated_with(self._additional_tags)\n\n    def container_maps_input(self, input_name: str) -> bool:\n        return (\n            self.graph_definition.input_mapping_for_pointer(InputPointer(self.name, input_name))\n            is not None\n   
     )\n\n    def container_mapped_input(self, input_name: str) -> InputMapping:\n        mapping = self.graph_definition.input_mapping_for_pointer(\n            InputPointer(self.name, input_name)\n        )\n        if mapping is None:\n            check.failed(\n                f"container does not map input {input_name}, check container_maps_input first"\n            )\n        return mapping\n\n    def container_maps_fan_in_input(self, input_name: str, fan_in_index: int) -> bool:\n        return (\n            self.graph_definition.input_mapping_for_pointer(\n                FanInInputPointer(self.name, input_name, fan_in_index)\n            )\n            is not None\n        )\n\n    def container_mapped_fan_in_input(self, input_name: str, fan_in_index: int) -> InputMapping:\n        mapping = self.graph_definition.input_mapping_for_pointer(\n            FanInInputPointer(self.name, input_name, fan_in_index)\n        )\n        if mapping is None:\n            check.failed(\n                f"container does not map fan-in {input_name} idx {fan_in_index}, check "\n                "container_maps_fan_in_input first"\n            )\n\n        return mapping\n\n    @property\n    def hook_defs(self) -> AbstractSet[HookDefinition]:\n        return self._hook_defs\n\n    @property\n    def retry_policy(self) -> Optional[RetryPolicy]:\n        return self._retry_policy\n\n\nclass NodeHandleSerializer(DefaultNamedTupleSerializer):\n    @classmethod\n    def value_to_storage_dict(\n        cls,\n        value: NamedTuple,\n        whitelist_map: WhitelistMap,\n        descent_path: str,\n    ) -> Dict[str, Any]:\n        storage = super().value_to_storage_dict(\n            value,\n            whitelist_map,\n            descent_path,\n        )\n        # persist using legacy name SolidHandle\n        storage["__class__"] = "SolidHandle"\n        return storage\n\n\n@whitelist_for_serdes(serializer=NodeHandleSerializer)\nclass NodeHandle(\n    # mypy does not yet support recursive types\n    # NamedTuple("_NodeHandle", [("name", str), ("parent", Optional["NodeHandle"])])\n    NamedTuple("_NodeHandle", [("name", str), ("parent", Any)])\n):\n    """\n    A structured object to identify nodes in the potentially recursive graph structure.\n    """\n\n    def __new__(cls, name: str, parent: Optional["NodeHandle"]):\n        return super(NodeHandle, cls).__new__(\n            cls,\n            check.str_param(name, "name"),\n            check.opt_inst_param(parent, "parent", NodeHandle),\n        )\n\n    def __str__(self):\n        return self.to_string()\n\n    @property\n    def path(self) -> List[str]:\n        """Return a list representation of the handle.\n\n        Inverse of NodeHandle.from_path.\n\n        Returns:\n            List[str]:\n        """\n        path = []\n        cur = self\n        while cur:\n            path.append(cur.name)\n            cur = cur.parent\n        path.reverse()\n        return path\n\n    def to_string(self) -> str:\n        """Return a unique string representation of the handle.\n\n        Inverse of NodeHandle.from_string.\n        """\n        return self.parent.to_string() + "." 
+ self.name if self.parent else self.name\n\n    def is_or_descends_from(self, handle: "NodeHandle") -> bool:\n        """Check if the handle is or descends from another handle.\n\n        Args:\n            handle (NodeHandle): The handle to check against.\n\n        Returns:\n            bool:\n        """\n        check.inst_param(handle, "handle", NodeHandle)\n\n        for idx in range(len(handle.path)):\n            if idx >= len(self.path):\n                return False\n            if self.path[idx] != handle.path[idx]:\n                return False\n        return True\n\n    def pop(self, ancestor: "NodeHandle") -> Optional["NodeHandle"]:\n        """Return a copy of the handle with some of its ancestors pruned.\n\n        Args:\n            ancestor (NodeHandle): Handle to an ancestor of the current handle.\n\n        Returns:\n            NodeHandle:\n\n        Example:\n\n        .. code-block:: python\n\n            handle = NodeHandle('baz', NodeHandle('bar', NodeHandle('foo', None)))\n            ancestor = NodeHandle('bar', NodeHandle('foo', None))\n            assert handle.pop(ancestor) == NodeHandle('baz', None)\n        """\n\n        check.inst_param(ancestor, "ancestor", NodeHandle)\n        check.invariant(\n            self.is_or_descends_from(ancestor),\n            "Handle {handle} does not descend from {ancestor}".format(\n                handle=self.to_string(), ancestor=ancestor.to_string()\n            ),\n        )\n\n        return NodeHandle.from_path(self.path[len(ancestor.path) :])\n\n    def with_ancestor(self, ancestor: "NodeHandle") -> Optional["NodeHandle"]:\n        """Returns a copy of the handle with an ancestor grafted on.\n\n        Args:\n            ancestor (NodeHandle): Handle to the new ancestor.\n\n        Returns:\n            NodeHandle:\n\n        Example:\n\n        .. 
code-block:: python\n\n            handle = NodeHandle('baz', NodeHandle('bar', NodeHandle('foo', None)))\n            ancestor = NodeHandle('quux' None)\n            assert handle.with_ancestor(ancestor) == NodeHandle(\n                'baz', NodeHandle('bar', NodeHandle('foo', NodeHandle('quux', None)))\n            )\n        """\n        check.opt_inst_param(ancestor, "ancestor", NodeHandle)\n\n        return NodeHandle.from_path((ancestor.path if ancestor else []) + self.path)\n\n    @staticmethod\n    def from_path(path: List[str]) -> "NodeHandle":\n        check.list_param(path, "path", of_type=str)\n\n        cur: Optional["NodeHandle"] = None\n        while len(path) > 0:\n            cur = NodeHandle(name=path.pop(0), parent=cur)\n\n        if cur is None:\n            check.failed(f"Invalid handle path {path}")\n\n        return cur\n\n    @staticmethod\n    def from_string(handle_str: str) -> "NodeHandle":\n        check.str_param(handle_str, "handle_str")\n\n        path = handle_str.split(".")\n        return NodeHandle.from_path(path)\n\n    @classmethod\n    def from_dict(cls, dict_repr: Dict[str, Any]) -> Optional["NodeHandle"]:\n        """This method makes it possible to load a potentially nested NodeHandle after a\n        roundtrip through json.loads(json.dumps(NodeHandle._asdict()))"""\n\n        check.dict_param(dict_repr, "dict_repr", key_type=str)\n        check.invariant(\n            "name" in dict_repr, "Dict representation of NodeHandle must have a 'name' key"\n        )\n        check.invariant(\n            "parent" in dict_repr, "Dict representation of NodeHandle must have a 'parent' key"\n        )\n\n        if isinstance(dict_repr["parent"], (list, tuple)):\n            dict_repr["parent"] = NodeHandle.from_dict(\n                {\n                    "name": dict_repr["parent"][0],\n                    "parent": dict_repr["parent"][1],\n                }\n            )\n\n        return NodeHandle(**{k: dict_repr[k] for k in ["name", "parent"]})\n\n\n# previous name for NodeHandle was SolidHandle\nregister_serdes_tuple_fallbacks({"SolidHandle": NodeHandle})\n\n\nclass SolidInputHandle(\n    NamedTuple("_SolidInputHandle", [("solid", Node), ("input_def", InputDefinition)])\n):\n    def __new__(cls, solid: Node, input_def: InputDefinition):\n        return super(SolidInputHandle, cls).__new__(\n            cls,\n            check.inst_param(solid, "solid", Node),\n            check.inst_param(input_def, "input_def", InputDefinition),\n        )\n\n    def _inner_str(self) -> str:\n        return struct_to_string(\n            "SolidInputHandle",\n            solid_name=self.solid.name,\n            input_name=self.input_def.name,\n        )\n\n    def __str__(self):\n        return self._inner_str()\n\n    def __repr__(self):\n        return self._inner_str()\n\n    def __hash__(self):\n        return hash((self.solid.name, self.input_def.name))\n\n    def __eq__(self, other):\n        return self.solid.name == other.solid.name and self.input_def.name == other.input_def.name\n\n    @property\n    def solid_name(self) -> str:\n        return self.solid.name\n\n    @property\n    def input_name(self) -> str:\n        return self.input_def.name\n\n\nclass SolidOutputHandle(\n    NamedTuple("_SolidOutputHandle", [("solid", Node), ("output_def", OutputDefinition)])\n):\n    def __new__(cls, solid: Node, output_def: OutputDefinition):\n        return super(SolidOutputHandle, cls).__new__(\n            cls,\n            check.inst_param(solid, "solid", Node),\n  
          check.inst_param(output_def, "output_def", OutputDefinition),\n        )\n\n    def _inner_str(self) -> str:\n        return struct_to_string(\n            "SolidOutputHandle",\n            solid_name=self.solid.name,\n            output_name=self.output_def.name,\n        )\n\n    def __str__(self):\n        return self._inner_str()\n\n    def __repr__(self):\n        return self._inner_str()\n\n    def __hash__(self):\n        return hash((self.solid.name, self.output_def.name))\n\n    def __eq__(self, other: Any):\n        return self.solid.name == other.solid.name and self.output_def.name == other.output_def.name\n\n    def describe(self) -> str:\n        return f"{self.solid_name}:{self.output_def.name}"\n\n    @property\n    def solid_name(self) -> str:\n        return self.solid.name\n\n    @property\n    def is_dynamic(self) -> bool:\n        return self.output_def.is_dynamic\n\n\nclass DependencyType(Enum):\n    DIRECT = "DIRECT"\n    FAN_IN = "FAN_IN"\n    DYNAMIC_COLLECT = "DYNAMIC_COLLECT"\n\n\nclass IDependencyDefinition(ABC):  # pylint: disable=no-init\n    @abstractmethod\n    def get_solid_dependencies(self) -> List["DependencyDefinition"]:\n        pass\n\n    @abstractmethod\n    def is_fan_in(self) -> bool:\n        """The result passed to the corresponding input will be a List made from different solid outputs"""\n\n\n
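A short sketch of the `NodeHandle` helpers defined above. `NodeHandle` is an internal structure in this module rather than a public API, so this assumes direct access to `dagster.core.definitions.dependency`.

.. code-block:: python

    from dagster.core.definitions.dependency import NodeHandle

    # Build a nested handle and round-trip it through its string form.
    handle = NodeHandle("baz", NodeHandle("bar", NodeHandle("foo", None)))

    assert handle.path == ["foo", "bar", "baz"]
    assert handle.to_string() == "foo.bar.baz"
    assert NodeHandle.from_string("foo.bar.baz") == handle
    assert handle.pop(NodeHandle.from_string("foo.bar")) == NodeHandle("baz", None)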
[docs]class DependencyDefinition(\n NamedTuple(\n "_DependencyDefinition", [("node", str), ("output", str), ("description", Optional[str])]\n ),\n IDependencyDefinition,\n):\n """Represents an edge in the DAG of nodes (ops or graphs) forming a job.\n\n This object is used at the leaves of a dictionary structure that represents the complete\n dependency structure of a job whose keys represent the dependent node and dependent\n input, so this object only contains information about the dependee.\n\n Concretely, if the input named 'input' of op_b depends on the output named 'result' of\n op_a, and the output named 'other_result' of graph_a, the structure will look as follows:\n\n .. code-block:: python\n\n dependency_structure = {\n 'my_downstream_op': {\n 'input': DependencyDefinition('my_upstream_op', 'result')\n }\n 'my_downstream_op': {\n 'input': DependencyDefinition('my_upstream_graph', 'result')\n }\n }\n\n In general, users should prefer not to construct this class directly or use the\n :py:class:`JobDefinition` API that requires instances of this class. Instead, use the\n :py:func:`@job <job>` API:\n\n .. code-block:: python\n\n @job\n def the_job():\n node_b(node_a())\n\n\n Args:\n solid (str): (legacy) The name of the solid that is depended on, that is, from which the value\n passed between the two nodes originates.\n output (Optional[str]): The name of the output that is depended on. (default: "result")\n description (Optional[str]): Human-readable description of this dependency.\n node (str): The name of the node (op or graph) that is depended on, that is, from which the value\n passed between the two nodes originates.\n """\n\n def __new__(\n cls,\n solid: Optional[str] = None,\n output: str = DEFAULT_OUTPUT,\n description: Optional[str] = None,\n node: Optional[str] = None,\n ):\n if solid and node:\n raise DagsterInvalidDefinitionError(\n "Both ``node`` and legacy ``solid`` arguments provided to DependencyDefinition. Please use one or the other."\n )\n\n if not solid and not node:\n raise DagsterInvalidDefinitionError(\n "Expected node parameter to be str for DependencyDefinition"\n )\n\n node = node or solid\n return super(DependencyDefinition, cls).__new__(\n cls,\n check.str_param(node, "node"),\n check.str_param(output, "output"),\n check.opt_str_param(description, "description"),\n )\n\n def get_solid_dependencies(self) -> List["DependencyDefinition"]:\n return [self]\n\n def is_fan_in(self) -> bool:\n return False\n\n @property\n def solid(self) -> str:\n return self.node\n\n def get_op_dependencies(self) -> List["DependencyDefinition"]:\n return [self]
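A hedged, self-contained sketch of the wiring described above, assuming the `GraphDefinition` constructor accepts `node_defs` and `dependencies` keyword arguments; the op names are hypothetical.

.. code-block:: python

    from dagster import DependencyDefinition, GraphDefinition, op

    @op
    def my_upstream_op():
        return 1

    @op
    def my_downstream_op(num):
        return num + 1

    # Wire my_downstream_op's input "num" to the default "result" output upstream.
    graph_def = GraphDefinition(
        name="explicit_deps",
        node_defs=[my_upstream_op, my_downstream_op],
        dependencies={
            "my_downstream_op": {"num": DependencyDefinition("my_upstream_op", "result")}
        },
    )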
\n\n\n
[docs]class MultiDependencyDefinition(\n NamedTuple(\n "_MultiDependencyDefinition",\n [("dependencies", List[Union[DependencyDefinition, Type["MappedInputPlaceholder"]]])],\n ),\n IDependencyDefinition,\n):\n """Represents a fan-in edge in the DAG of op instances forming a job.\n\n This object is used only when an input of type ``List[T]`` is assembled by fanning-in multiple\n upstream outputs of type ``T``.\n\n This object is used at the leaves of a dictionary structure that represents the complete\n dependency structure of a job or pipeline whose keys represent the dependent ops or graphs and dependent\n input, so this object only contains information about the dependee.\n\n Concretely, if the input named 'input' of op_c depends on the outputs named 'result' of\n op_a and op_b, this structure will look as follows:\n\n .. code-block:: python\n\n dependency_structure = {\n 'op_c': {\n 'input': MultiDependencyDefinition(\n [\n DependencyDefinition('op_a', 'result'),\n DependencyDefinition('op_b', 'result')\n ]\n )\n }\n }\n\n In general, users should prefer not to construct this class directly or use the\n :py:class:`JobDefinition` API that requires instances of this class. Instead, use the\n :py:func:`@job <job>` API:\n\n .. code-block:: python\n\n @job\n def the_job():\n op_c(op_a(), op_b())\n\n Args:\n dependencies (List[Union[DependencyDefinition, Type[MappedInputPlaceHolder]]]): List of\n upstream dependencies fanned in to this input.\n """\n\n def __new__(\n cls,\n dependencies: List[Union[DependencyDefinition, Type["MappedInputPlaceholder"]]],\n ):\n from .composition import MappedInputPlaceholder\n\n deps = check.list_param(dependencies, "dependencies")\n seen = {}\n for dep in deps:\n if isinstance(dep, DependencyDefinition):\n key = dep.solid + ":" + dep.output\n if key in seen:\n raise DagsterInvalidDefinitionError(\n 'Duplicate dependencies on node "{dep.solid}" output "{dep.output}" '\n "used in the same MultiDependencyDefinition.".format(dep=dep)\n )\n seen[key] = True\n elif dep is MappedInputPlaceholder:\n pass\n else:\n check.failed("Unexpected dependencies entry {}".format(dep))\n\n return super(MultiDependencyDefinition, cls).__new__(cls, deps)\n\n def get_solid_dependencies(self) -> List[DependencyDefinition]:\n return [dep for dep in self.dependencies if isinstance(dep, DependencyDefinition)]\n\n def get_node_dependencies(self) -> List[DependencyDefinition]:\n return self.get_solid_dependencies()\n\n def is_fan_in(self) -> bool:\n return True\n\n def get_dependencies_and_mappings(self) -> List:\n return self.dependencies
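The same idea for fan-in, as a hedged sketch: two int-typed outputs are gathered into a single `List[int]` input. The op names are hypothetical, and the `GraphDefinition` constructor usage mirrors the previous sketch.

.. code-block:: python

    from typing import List

    from dagster import DependencyDefinition, GraphDefinition, MultiDependencyDefinition, op

    @op
    def op_a():
        return 1

    @op
    def op_b():
        return 2

    @op
    def op_c(nums: List[int]) -> int:
        return sum(nums)

    # Fan the "result" outputs of op_a and op_b into op_c's list-typed input.
    fan_in_graph = GraphDefinition(
        name="fan_in",
        node_defs=[op_a, op_b, op_c],
        dependencies={
            "op_c": {
                "nums": MultiDependencyDefinition(
                    [DependencyDefinition("op_a"), DependencyDefinition("op_b")]
                )
            }
        },
    )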
\n\n\nclass DynamicCollectDependencyDefinition(\n NamedTuple("_DynamicCollectDependencyDefinition", [("solid_name", str), ("output_name", str)]),\n IDependencyDefinition,\n):\n def get_solid_dependencies(self) -> List[DependencyDefinition]:\n return [DependencyDefinition(self.solid_name, self.output_name)]\n\n def is_fan_in(self) -> bool:\n return True\n\n\nDepTypeAndOutputHandles = Tuple[\n DependencyType,\n Union[SolidOutputHandle, List[Union[SolidOutputHandle, Type["MappedInputPlaceholder"]]]],\n]\n\nInputToOutputHandleDict = Dict[SolidInputHandle, DepTypeAndOutputHandles]\n\n\ndef _create_handle_dict(\n solid_dict: Dict[str, Node],\n dep_dict: Dict[str, Dict[str, IDependencyDefinition]],\n) -> InputToOutputHandleDict:\n from .composition import MappedInputPlaceholder\n\n check.dict_param(solid_dict, "solid_dict", key_type=str, value_type=Node)\n check.two_dim_dict_param(dep_dict, "dep_dict", value_type=IDependencyDefinition)\n\n handle_dict: InputToOutputHandleDict = {}\n\n for solid_name, input_dict in dep_dict.items():\n from_solid = solid_dict[solid_name]\n for input_name, dep_def in input_dict.items():\n if isinstance(dep_def, MultiDependencyDefinition):\n handles: List[Union[SolidOutputHandle, Type[MappedInputPlaceholder]]] = []\n for inner_dep in dep_def.get_dependencies_and_mappings():\n if isinstance(inner_dep, DependencyDefinition):\n handles.append(solid_dict[inner_dep.solid].output_handle(inner_dep.output))\n elif inner_dep is MappedInputPlaceholder:\n handles.append(inner_dep)\n else:\n check.failed(\n "Unexpected MultiDependencyDefinition dependencies type {}".format(\n inner_dep\n )\n )\n\n handle_dict[from_solid.input_handle(input_name)] = (DependencyType.FAN_IN, handles)\n\n elif isinstance(dep_def, DependencyDefinition):\n handle_dict[from_solid.input_handle(input_name)] = (\n DependencyType.DIRECT,\n solid_dict[dep_def.solid].output_handle(dep_def.output),\n )\n elif isinstance(dep_def, DynamicCollectDependencyDefinition):\n handle_dict[from_solid.input_handle(input_name)] = (\n DependencyType.DYNAMIC_COLLECT,\n solid_dict[dep_def.solid_name].output_handle(dep_def.output_name),\n )\n\n else:\n check.failed(f"Unknown dependency type {dep_def}")\n\n return handle_dict\n\n\nclass DependencyStructure:\n @staticmethod\n def from_definitions(solids: Dict[str, Node], dep_dict: Dict[str, Any]):\n return DependencyStructure(list(dep_dict.keys()), _create_handle_dict(solids, dep_dict))\n\n def __init__(self, solid_names: List[str], handle_dict: InputToOutputHandleDict):\n self._solid_names = solid_names\n self._handle_dict = handle_dict\n\n # Building up a couple indexes here so that one can look up all the upstream output handles\n # or downstream input handles in O(1). 
Without this, this can become O(N^2) where N is solid\n # count during the GraphQL query in particular\n\n # solid_name => input_handle => list[output_handle]\n self._solid_input_index: dict = defaultdict(dict)\n\n # solid_name => output_handle => list[input_handle]\n self._solid_output_index: dict = defaultdict(lambda: defaultdict(list))\n\n # solid_name => dynamic output_handle that this solid will dupe for\n self._dynamic_fan_out_index: dict = {}\n\n # solid_name => set of dynamic output_handle this collects over\n self._collect_index: Dict[str, set] = defaultdict(set)\n\n for input_handle, (dep_type, output_handle_or_list) in self._handle_dict.items():\n if dep_type == DependencyType.FAN_IN:\n output_handle_list = []\n for handle in output_handle_or_list:\n if not isinstance(handle, SolidOutputHandle):\n continue\n\n if handle.is_dynamic:\n raise DagsterInvalidDefinitionError(\n "Currently, items in a fan-in dependency cannot be downstream of dynamic outputs. "\n f'Problematic dependency on dynamic output "{handle.describe()}".'\n )\n if self._dynamic_fan_out_index.get(handle.solid_name):\n raise DagsterInvalidDefinitionError(\n "Currently, items in a fan-in dependency cannot be downstream of dynamic outputs. "\n f'Problematic dependency on output "{handle.describe()}", downstream of '\n f'"{self._dynamic_fan_out_index[handle.solid_name].describe()}".'\n )\n\n output_handle_list.append(handle)\n elif dep_type == DependencyType.DIRECT:\n output_handle = cast(SolidOutputHandle, output_handle_or_list)\n\n if output_handle.is_dynamic:\n self._validate_and_set_fan_out(input_handle, output_handle)\n\n if self._dynamic_fan_out_index.get(output_handle.solid_name):\n self._validate_and_set_fan_out(\n input_handle, self._dynamic_fan_out_index[output_handle.solid_name]\n )\n\n output_handle_list = [output_handle]\n elif dep_type == DependencyType.DYNAMIC_COLLECT:\n output_handle = cast(SolidOutputHandle, output_handle_or_list)\n\n if output_handle.is_dynamic:\n self._validate_and_set_collect(input_handle, output_handle)\n\n elif self._dynamic_fan_out_index.get(output_handle.solid_name):\n self._validate_and_set_collect(\n input_handle,\n self._dynamic_fan_out_index[output_handle.solid_name],\n )\n else:\n check.failed(\n f"Unexpected dynamic fan in dep created {output_handle} -> {input_handle}"\n )\n\n output_handle_list = [output_handle]\n else:\n check.failed(f"Unexpected dep type {dep_type}")\n\n self._solid_input_index[input_handle.solid.name][input_handle] = output_handle_list\n for output_handle in output_handle_list:\n self._solid_output_index[output_handle.solid.name][output_handle].append(\n input_handle\n )\n\n def _validate_and_set_fan_out(\n self, input_handle: SolidInputHandle, output_handle: SolidOutputHandle\n ) -> Any:\n """Helper function for populating _dynamic_fan_out_index"""\n\n if not input_handle.solid.definition.input_supports_dynamic_output_dep(\n input_handle.input_name\n ):\n raise DagsterInvalidDefinitionError(\n f"{input_handle.solid.describe_node()} cannot be downstream of dynamic output "\n f'"{output_handle.describe()}" since input "{input_handle.input_name}" maps to a node '\n "that is already downstream of another dynamic output. 
Nodes cannot be downstream of more "\n "than one dynamic output"\n )\n\n if self._collect_index.get(input_handle.solid_name):\n raise DagsterInvalidDefinitionError(\n f"{input_handle.solid.describe_node()} cannot be both downstream of dynamic output "\n f"{output_handle.describe()} and collect over dynamic output "\n f"{list(self._collect_index[input_handle.solid_name])[0].describe()}."\n )\n\n if self._dynamic_fan_out_index.get(input_handle.solid_name) is None:\n self._dynamic_fan_out_index[input_handle.solid_name] = output_handle\n return\n\n if self._dynamic_fan_out_index[input_handle.solid_name] != output_handle:\n raise DagsterInvalidDefinitionError(\n f"{input_handle.solid.describe_node()} cannot be downstream of more than one dynamic output. "\n f'It is downstream of both "{output_handle.describe()}" and '\n f'"{self._dynamic_fan_out_index[input_handle.solid_name].describe()}"'\n )\n\n def _validate_and_set_collect(\n self,\n input_handle: SolidInputHandle,\n output_handle: SolidOutputHandle,\n ) -> None:\n if self._dynamic_fan_out_index.get(input_handle.solid_name):\n raise DagsterInvalidDefinitionError(\n f"{input_handle.solid.describe_node()} cannot both collect over dynamic output "\n f"{output_handle.describe()} and be downstream of the dynamic output "\n f"{self._dynamic_fan_out_index[input_handle.solid_name].describe()}."\n )\n\n self._collect_index[input_handle.solid_name].add(output_handle)\n\n # if the output is already fanned out\n if self._dynamic_fan_out_index.get(output_handle.solid_name):\n raise DagsterInvalidDefinitionError(\n f"{input_handle.solid.describe_node()} cannot be downstream of more than one dynamic output. "\n f'It is downstream of both "{output_handle.describe()}" and '\n f'"{self._dynamic_fan_out_index[output_handle.solid_name].describe()}"'\n )\n\n def all_upstream_outputs_from_solid(self, solid_name: str) -> List[SolidOutputHandle]:\n check.str_param(solid_name, "solid_name")\n\n # flatten out all outputs that feed into the inputs of this solid\n return [\n output_handle\n for output_handle_list in self._solid_input_index[solid_name].values()\n for output_handle in output_handle_list\n ]\n\n def input_to_upstream_outputs_for_solid(self, solid_name: str) -> Any:\n """\n Returns a Dict[SolidInputHandle, List[SolidOutputHandle]] that encodes\n where all the the inputs are sourced from upstream. 
Usually the\n List[SolidOutputHandle] will be a list of one, except for the\n multi-dependency case.\n """\n check.str_param(solid_name, "solid_name")\n return self._solid_input_index[solid_name]\n\n def output_to_downstream_inputs_for_solid(self, solid_name: str) -> Any:\n """\n Returns a Dict[SolidOutputHandle, List[SolidInputHandle]] that\n represents all the downstream inputs for each output in the\n dictionary\n """\n check.str_param(solid_name, "solid_name")\n return self._solid_output_index[solid_name]\n\n def has_direct_dep(self, solid_input_handle: SolidInputHandle) -> bool:\n check.inst_param(solid_input_handle, "solid_input_handle", SolidInputHandle)\n if solid_input_handle not in self._handle_dict:\n return False\n dep_type, _ = self._handle_dict[solid_input_handle]\n return dep_type == DependencyType.DIRECT\n\n def get_direct_dep(self, solid_input_handle: SolidInputHandle) -> SolidOutputHandle:\n check.inst_param(solid_input_handle, "solid_input_handle", SolidInputHandle)\n dep_type, dep = self._handle_dict[solid_input_handle]\n check.invariant(\n dep_type == DependencyType.DIRECT,\n f"Cannot call get_direct_dep when dep is not singular, got {dep_type}",\n )\n return cast(SolidOutputHandle, dep)\n\n def has_fan_in_deps(self, solid_input_handle: SolidInputHandle) -> bool:\n check.inst_param(solid_input_handle, "solid_input_handle", SolidInputHandle)\n if solid_input_handle not in self._handle_dict:\n return False\n dep_type, _ = self._handle_dict[solid_input_handle]\n return dep_type == DependencyType.FAN_IN\n\n def get_fan_in_deps(\n self, solid_input_handle: SolidInputHandle\n ) -> List[Union[SolidOutputHandle, Type["MappedInputPlaceholder"]]]:\n check.inst_param(solid_input_handle, "solid_input_handle", SolidInputHandle)\n dep_type, deps = self._handle_dict[solid_input_handle]\n check.invariant(\n dep_type == DependencyType.FAN_IN,\n f"Cannot call get_multi_dep when dep is not fan in, got {dep_type}",\n )\n return cast(List[Union[SolidOutputHandle, Type["MappedInputPlaceholder"]]], deps)\n\n def has_dynamic_fan_in_dep(self, solid_input_handle: SolidInputHandle) -> bool:\n check.inst_param(solid_input_handle, "solid_input_handle", SolidInputHandle)\n if solid_input_handle not in self._handle_dict:\n return False\n dep_type, _ = self._handle_dict[solid_input_handle]\n return dep_type == DependencyType.DYNAMIC_COLLECT\n\n def get_dynamic_fan_in_dep(self, solid_input_handle: SolidInputHandle) -> SolidOutputHandle:\n check.inst_param(solid_input_handle, "solid_input_handle", SolidInputHandle)\n dep_type, dep = self._handle_dict[solid_input_handle]\n check.invariant(\n dep_type == DependencyType.DYNAMIC_COLLECT,\n f"Cannot call get_dynamic_fan_in_dep when dep is not, got {dep_type}",\n )\n return cast(SolidOutputHandle, dep)\n\n def has_deps(self, solid_input_handle: SolidInputHandle) -> bool:\n check.inst_param(solid_input_handle, "solid_input_handle", SolidInputHandle)\n return solid_input_handle in self._handle_dict\n\n def get_deps_list(self, solid_input_handle: SolidInputHandle) -> List[SolidOutputHandle]:\n check.inst_param(solid_input_handle, "solid_input_handle", SolidInputHandle)\n check.invariant(self.has_deps(solid_input_handle))\n dep_type, handle_or_list = self._handle_dict[solid_input_handle]\n if dep_type == DependencyType.DIRECT:\n return [cast(SolidOutputHandle, handle_or_list)]\n elif dep_type == DependencyType.DYNAMIC_COLLECT:\n return [cast(SolidOutputHandle, handle_or_list)]\n elif dep_type == DependencyType.FAN_IN:\n return [handle for handle in 
handle_or_list if isinstance(handle, SolidOutputHandle)]\n else:\n check.failed(f"Unexpected dep type {dep_type}")\n\n def input_handles(self) -> List[SolidInputHandle]:\n return list(self._handle_dict.keys())\n\n def get_upstream_dynamic_handle_for_solid(self, solid_name: str) -> Any:\n return self._dynamic_fan_out_index.get(solid_name)\n\n def get_dependency_type(self, solid_input_handle: SolidInputHandle) -> Optional[DependencyType]:\n result = self._handle_dict.get(solid_input_handle)\n if result is None:\n return None\n dep_type, _ = result\n return dep_type\n\n def is_dynamic_mapped(self, solid_name: str) -> bool:\n return solid_name in self._dynamic_fan_out_index\n\n def has_dynamic_downstreams(self, solid_name: str) -> bool:\n for upstream_handle in self._dynamic_fan_out_index.values():\n if upstream_handle.solid_name == solid_name:\n return True\n\n return False\n
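To make the indexing above concrete, a hedged sketch of querying a `DependencyStructure`, reusing the hypothetical `fan_in_graph` from the fan-in sketch earlier. This assumes `GraphDefinition` exposes the structure via a `dependency_structure` property and nodes via `solid_named`, both internal APIs in this codebase that may change.

.. code-block:: python

    # Assumes `fan_in_graph` from the MultiDependencyDefinition sketch above.
    dep_structure = fan_in_graph.dependency_structure
    input_handle = fan_in_graph.solid_named("op_c").input_handle("nums")

    assert dep_structure.has_fan_in_deps(input_handle)
    assert len(dep_structure.get_deps_list(input_handle)) == 2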
", "current_page_name": "_modules/dagster/core/definitions/dependency", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.dependency"}, "events": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.events

\nimport re\nimport warnings\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Union,\n    cast,\n)\n\nfrom dagster import check, seven\nfrom dagster.core.errors import DagsterInvalidAssetKey\nfrom dagster.serdes import DefaultNamedTupleSerializer, whitelist_for_serdes\nfrom dagster.utils.backcompat import experimental_class_param_warning\n\nfrom .metadata import (\n    MetadataEntry,\n    MetadataValue,\n    PartitionMetadataEntry,\n    RawMetadataValue,\n    last_file_comp,\n    normalize_metadata,\n)\nfrom .utils import DEFAULT_OUTPUT, check_valid_name\n\nif TYPE_CHECKING:\n    from dagster.core.execution.context.output import OutputContext\n\nASSET_KEY_REGEX = re.compile("^[a-zA-Z0-9_.-]+$")  # alphanumeric, _, -, .\nASSET_KEY_SPLIT_REGEX = re.compile("[^a-zA-Z0-9_]")\nASSET_KEY_STRUCTURED_DELIMITER = "."\n\n\ndef validate_asset_key_string(s: Optional[str]) -> str:\n    if not s or not ASSET_KEY_REGEX.match(s):\n        raise DagsterInvalidAssetKey()\n\n    return s\n\n\ndef parse_asset_key_string(s: str) -> List[str]:\n    return list(filter(lambda x: x, re.split(ASSET_KEY_SPLIT_REGEX, s)))\n\n\n
[docs]@whitelist_for_serdes\nclass AssetKey(NamedTuple("_AssetKey", [("path", List[str])])):\n """Object representing the structure of an asset key. Takes in a sanitized string, list of\n strings, or tuple of strings.\n\n Example usage:\n\n .. code-block:: python\n\n from dagster import op\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey('flat_asset_key'),\n metadata={"text_metadata": "Text-based metadata for this event"},\n )\n\n @op\n def structured_asset_key(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey(['parent', 'child', 'grandchild']),\n metadata={"text_metadata": "Text-based metadata for this event"},\n )\n\n @op\n def structured_asset_key_2(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey(('parent', 'child', 'grandchild')),\n metadata={"text_metadata": "Text-based metadata for this event"},\n )\n\n Args:\n path (Sequence[str]): String, list of strings, or tuple of strings. A list of strings\n represent the hierarchical structure of the asset_key.\n """\n\n def __new__(cls, path: Sequence[str]):\n if isinstance(path, str):\n path = [path]\n else:\n path = list(check.sequence_param(path, "path", of_type=str))\n\n return super(AssetKey, cls).__new__(cls, path=path)\n\n def __str__(self):\n return "AssetKey({})".format(self.path)\n\n def __repr__(self):\n return "AssetKey({})".format(self.path)\n\n def __hash__(self):\n return hash(tuple(self.path))\n\n def __eq__(self, other):\n if not isinstance(other, AssetKey):\n return False\n return self.to_string() == other.to_string()\n\n def to_string(self, legacy: Optional[bool] = False) -> Optional[str]:\n if not self.path:\n return None\n if legacy:\n return ASSET_KEY_STRUCTURED_DELIMITER.join(self.path)\n return seven.json.dumps(self.path)\n\n @staticmethod\n def from_db_string(asset_key_string: Optional[str]) -> Optional["AssetKey"]:\n if not asset_key_string:\n return None\n if asset_key_string[0] == "[":\n # is a json string\n try:\n path = seven.json.loads(asset_key_string)\n except seven.JSONDecodeError:\n path = parse_asset_key_string(asset_key_string)\n else:\n path = parse_asset_key_string(asset_key_string)\n return AssetKey(path)\n\n @staticmethod\n def get_db_prefix(path: List[str], legacy: Optional[bool] = False):\n check.list_param(path, "path", of_type=str)\n if legacy:\n return ASSET_KEY_STRUCTURED_DELIMITER.join(path)\n return seven.json.dumps(path)[:-2] # strip trailing '"]' from json string\n\n @staticmethod\n def from_graphql_input(asset_key: Mapping[str, List[str]]) -> Optional["AssetKey"]:\n if asset_key and asset_key.get("path"):\n return AssetKey(asset_key["path"])\n return None
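A short sketch of the key helpers above: structured keys serialize to a JSON list, while the legacy form joins path components with `"."`.

.. code-block:: python

    from dagster import AssetKey

    key = AssetKey(["warehouse", "orders", "daily"])

    assert key.path == ["warehouse", "orders", "daily"]
    assert key.to_string() == '["warehouse", "orders", "daily"]'
    assert key.to_string(legacy=True) == "warehouse.orders.daily"
    assert AssetKey.from_db_string(key.to_string()) == key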
\n\n\nDynamicAssetKey = Callable[["OutputContext"], Optional[AssetKey]]\n\n\n@whitelist_for_serdes\nclass AssetLineageInfo(\n NamedTuple("_AssetLineageInfo", [("asset_key", AssetKey), ("partitions", AbstractSet[str])])\n):\n def __new__(cls, asset_key, partitions=None):\n asset_key = check.inst_param(asset_key, "asset_key", AssetKey)\n partitions = check.opt_set_param(partitions, "partitions", str)\n return super(AssetLineageInfo, cls).__new__(cls, asset_key=asset_key, partitions=partitions)\n\n\n
[docs]class Output(\n NamedTuple(\n "_Output",\n [\n ("value", Any),\n ("output_name", str),\n ("metadata_entries", List[Union[PartitionMetadataEntry, MetadataEntry]]),\n ],\n )\n):\n """Event corresponding to one of a op's outputs.\n\n Op compute functions must explicitly yield events of this type when they have more than\n one output, or when they also yield events of other types, or when defining a op using the\n :py:class:`OpDefinition` API directly.\n\n Outputs are values produced by ops that will be consumed by downstream ops in a job.\n They are type-checked at op boundaries when their corresponding :py:class:`Out`\n or the downstream :py:class:`In` is typed.\n\n Args:\n value (Any): The value returned by the compute function.\n output_name (Optional[str]): Name of the corresponding out. (default:\n "result")\n metadata_entries (Optional[Union[MetadataEntry, PartitionMetadataEntry]]):\n (Experimental) A set of metadata entries to attach to events related to this Output.\n metadata (Optional[Dict[str, Union[str, float, int, Dict, MetadataValue]]]):\n Arbitrary metadata about the failure. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __new__(\n cls,\n value: Any,\n output_name: Optional[str] = DEFAULT_OUTPUT,\n metadata_entries: Optional[List[Union[MetadataEntry, PartitionMetadataEntry]]] = None,\n metadata: Optional[Dict[str, RawMetadataValue]] = None,\n ):\n\n metadata = check.opt_dict_param(metadata, "metadata", key_type=str)\n metadata_entries = check.opt_list_param(\n metadata_entries,\n "metadata_entries",\n of_type=(MetadataEntry, PartitionMetadataEntry),\n )\n\n return super(Output, cls).__new__(\n cls,\n value,\n check.str_param(output_name, "output_name"),\n normalize_metadata(metadata, metadata_entries),\n )
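A hedged usage sketch for `Output`: an op with multiple outputs must yield explicit `Output` events, and can attach metadata to them. The op and output names are hypothetical.

.. code-block:: python

    from dagster import Out, Output, op

    @op(out={"total": Out(int), "count": Out(int)})
    def summarize(numbers):
        yield Output(sum(numbers), output_name="total", metadata={"n": len(numbers)})
        yield Output(len(numbers), output_name="count")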
\n\n\n
[docs]class DynamicOutput(\n NamedTuple(\n "_DynamicOutput",\n [\n ("value", Any),\n ("mapping_key", str),\n ("output_name", str),\n ("metadata_entries", List[Union[PartitionMetadataEntry, MetadataEntry]]),\n ],\n )\n):\n """\n Variant of :py:class:`Output <dagster.Output>` used to support\n dynamic mapping & collect. Each ``DynamicOutput`` produced by an op represents\n one item in a set that can be processed individually with ``map`` or gathered\n with ``collect``.\n\n Each ``DynamicOutput`` must have a unique ``mapping_key`` to distinguish it with it's set.\n\n Args:\n value (Any):\n The value returned by the compute function.\n mapping_key (str):\n The key that uniquely identifies this dynamic value relative to its peers.\n This key will be used to identify the downstream ops when mapped, ie\n ``mapped_op[example_mapping_key]``\n output_name (Optional[str]):\n Name of the corresponding :py:class:`DynamicOut` defined on the op.\n (default: "result")\n metadata_entries (Optional[Union[MetadataEntry, PartitionMetadataEntry]]):\n (Experimental) A set of metadata entries to attach to events related to this output.\n metadata (Optional[Dict[str, Union[str, float, int, Dict, MetadataValue]]]):\n Arbitrary metadata about the failure. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __new__(\n cls,\n value: Any,\n mapping_key: str,\n output_name: Optional[str] = DEFAULT_OUTPUT,\n metadata_entries: Optional[List[Union[PartitionMetadataEntry, MetadataEntry]]] = None,\n metadata: Optional[Dict[str, RawMetadataValue]] = None,\n ):\n\n metadata = check.opt_dict_param(metadata, "metadata", key_type=str)\n metadata_entries = check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=MetadataEntry\n )\n\n return super(DynamicOutput, cls).__new__(\n cls,\n value=value,\n mapping_key=check_valid_name(check.str_param(mapping_key, "mapping_key")),\n output_name=check.str_param(output_name, "output_name"),\n metadata_entries=normalize_metadata(metadata, metadata_entries),\n )
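A hedged sketch of the dynamic mapping flow described above: each `DynamicOutput` carries a unique `mapping_key`, the downstream op is fanned out with `.map`, and the results are gathered with `.collect`. All op and job names are hypothetical.

.. code-block:: python

    from dagster import DynamicOut, DynamicOutput, job, op

    @op(out=DynamicOut(str))
    def list_files():
        for name in ["a.csv", "b.csv"]:
            yield DynamicOutput(name, mapping_key=name.replace(".", "_"))

    @op
    def process(path: str) -> str:
        return path.upper()

    @op
    def count_results(results):
        return len(results)

    @job
    def fan_out_job():
        count_results(list_files().map(process).collect())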
\n\n\n@whitelist_for_serdes\nclass AssetObservation(\n NamedTuple(\n "_AssetObservation",\n [\n ("asset_key", AssetKey),\n ("description", Optional[str]),\n ("metadata_entries", List[MetadataEntry]),\n ("partition", Optional[str]),\n ],\n )\n):\n """Event that captures metadata about an asset at a point in time.\n\n Args:\n asset_key (Union[str, List[str], AssetKey]): A key to identify the asset.\n metadata_entries (Optional[List[MetadataEntry]]): Arbitrary metadata about the asset.\n partition (Optional[str]): The name of a partition of the asset that the metadata\n corresponds to.\n metadata (Optional[Dict[str, Union[str, float, int, Dict, MetadataValue]]]):\n Arbitrary metadata about the asset. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __new__(\n cls,\n asset_key: Union[List[str], AssetKey, str],\n description: Optional[str] = None,\n metadata_entries: Optional[List[MetadataEntry]] = None,\n partition: Optional[str] = None,\n metadata: Optional[Dict[str, RawMetadataValue]] = None,\n ):\n if isinstance(asset_key, AssetKey):\n check.inst_param(asset_key, "asset_key", AssetKey)\n elif isinstance(asset_key, str):\n asset_key = AssetKey(parse_asset_key_string(asset_key))\n elif isinstance(asset_key, list):\n check.list_param(asset_key, "asset_key", of_type=str)\n asset_key = AssetKey(asset_key)\n else:\n check.tuple_param(asset_key, "asset_key", of_type=str)\n asset_key = AssetKey(asset_key)\n\n metadata = check.opt_dict_param(metadata, "metadata", key_type=str)\n metadata_entries = check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=MetadataEntry\n )\n\n return super(AssetObservation, cls).__new__(\n cls,\n asset_key=asset_key,\n description=check.opt_str_param(description, "description"),\n metadata_entries=cast(\n List[MetadataEntry], normalize_metadata(metadata, metadata_entries)\n ),\n partition=check.opt_str_param(partition, "partition"),\n )\n\n @property\n def label(self) -> str:\n return " ".join(self.asset_key.path)\n\n\n
[docs]@whitelist_for_serdes\nclass AssetMaterialization(\n NamedTuple(\n "_AssetMaterialization",\n [\n ("asset_key", AssetKey),\n ("description", Optional[str]),\n ("metadata_entries", List[Union[MetadataEntry, PartitionMetadataEntry]]),\n ("partition", Optional[str]),\n ("tags", Dict[str, str]),\n ],\n )\n):\n """Event indicating that an op has materialized an asset.\n\n Op compute functions may yield events of this type whenever they wish to indicate to the\n Dagster framework (and the end user) that they have produced a materialized value as a\n side effect of computation. Unlike outputs, asset materializations can not be passed to other\n ops, and their persistence is controlled by op logic, rather than by the Dagster\n framework.\n\n Op authors should use these events to organize metadata about the side effects of their\n computations, enabling tooling like the Assets dashboard in Dagit.\n\n Args:\n asset_key (Union[str, List[str], AssetKey]): A key to identify the materialized asset across job\n runs\n description (Optional[str]): A longer human-readable description of the materialized value.\n metadata_entries (Optional[List[Union[MetadataEntry, PartitionMetadataEntry]]]): Arbitrary metadata about the\n materialized value.\n partition (Optional[str]): The name of the partition that was materialized.\n tags (Optional[Dict[str, str]]): (Experimental) Tag metadata for a given asset\n materialization. Used for search and organization of the asset entry in the asset\n catalog in Dagit.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary metadata about the asset. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __new__(\n cls,\n asset_key: Union[List[str], AssetKey, str],\n description: Optional[str] = None,\n metadata_entries: Optional[List[Union[MetadataEntry, PartitionMetadataEntry]]] = None,\n partition: Optional[str] = None,\n tags: Optional[Dict[str, str]] = None,\n metadata: Optional[Dict[str, RawMetadataValue]] = None,\n ):\n if isinstance(asset_key, AssetKey):\n check.inst_param(asset_key, "asset_key", AssetKey)\n elif isinstance(asset_key, str):\n asset_key = AssetKey(parse_asset_key_string(asset_key))\n elif isinstance(asset_key, list):\n check.list_param(asset_key, "asset_key", of_type=str)\n asset_key = AssetKey(asset_key)\n else:\n check.tuple_param(asset_key, "asset_key", of_type=str)\n asset_key = AssetKey(asset_key)\n\n if tags:\n experimental_class_param_warning("tags", "AssetMaterialization")\n\n metadata = check.opt_dict_param(metadata, "metadata", key_type=str)\n metadata_entries = check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=MetadataEntry\n )\n\n return super(AssetMaterialization, cls).__new__(\n cls,\n asset_key=asset_key,\n description=check.opt_str_param(description, "description"),\n metadata_entries=normalize_metadata(metadata, metadata_entries),\n partition=check.opt_str_param(partition, "partition"),\n tags=check.opt_dict_param(tags, "tags", key_type=str, value_type=str),\n )\n\n @property\n def label(self) -> str:\n return " ".join(self.asset_key.path)\n\n
[docs] @staticmethod\n def file(\n path: str,\n description: Optional[str] = None,\n asset_key: Optional[Union[str, List[str], AssetKey]] = None,\n ) -> "AssetMaterialization":\n """Static constructor for standard materializations corresponding to files on disk.\n\n Args:\n path (str): The path to the file.\n description (Optional[str]): A human-readable description of the materialization.\n asset_key (Optional[Union[str, List[str], AssetKey]]): An optional key to identify the materialized asset; defaults to the file path.\n """\n if not asset_key:\n asset_key = path\n\n return AssetMaterialization(\n asset_key=cast(Union[str, AssetKey, List[str]], asset_key),\n description=description,\n metadata_entries=[MetadataEntry("path", value=MetadataValue.path(path))],\n )
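A hedged usage sketch: yielding an `AssetMaterialization` alongside an op's output so the run is associated with the asset in tooling such as Dagit. The asset key and metadata here are hypothetical.

.. code-block:: python

    from dagster import AssetMaterialization, Output, op

    @op
    def build_orders_table(context):
        # ... write the table somewhere, then report the side effect ...
        yield AssetMaterialization(
            asset_key=["warehouse", "orders"],
            description="Daily orders table",
            metadata={"row_count": 1234},
        )
        yield Output(None)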
\n\n\nclass MaterializationSerializer(DefaultNamedTupleSerializer):\n @classmethod\n def value_from_unpacked(cls, unpacked_dict, klass):\n # override the default `from_storage_dict` implementation in order to skip the deprecation\n # warning for historical Materialization events, loaded from event_log storage\n return Materialization(skip_deprecation_warning=True, **unpacked_dict)\n\n\n@whitelist_for_serdes(serializer=MaterializationSerializer)\nclass Materialization(\n NamedTuple(\n "_Materialization",\n [\n ("label", str),\n ("description", Optional[str]),\n ("metadata_entries", List[MetadataEntry]),\n ("asset_key", AssetKey),\n ("partition", Optional[str]),\n ("tags", Dict[str, str]),\n ],\n )\n):\n """Event indicating that an op has materialized a value.\n\n Solid compute functions may yield events of this type whenever they wish to indicate to the\n Dagster framework (and the end user) that they have produced a materialized value as a\n side effect of computation. Unlike outputs, materializations can not be passed to other ops,\n and their persistence is controlled by op logic, rather than by the Dagster framework.\n\n Solid authors should use these events to organize metadata about the side effects of their\n computations to enable downstream tooling like artifact catalogues and diff tools.\n\n Args:\n label (str): A short display name for the materialized value.\n description (Optional[str]): A longer human-radable description of the materialized value.\n metadata_entries (Optional[List[MetadataEntry]]): Arbitrary metadata about the\n materialized value.\n asset_key (Optional[Union[str, AssetKey]]): An optional parameter to identify the materialized asset\n across runs\n partition (Optional[str]): The name of the partition that was materialized.\n tags (Optional[Dict[str, str]]): (Experimental) Tag metadata for a given asset\n materialization. 
Used for search and organization of the asset entry in the asset\n catalog in Dagit.\n """\n\n def __new__(\n cls,\n label: Optional[str] = None,\n description: Optional[str] = None,\n metadata_entries: Optional[List[MetadataEntry]] = None,\n asset_key: Optional[Union[str, AssetKey]] = None,\n partition: Optional[str] = None,\n tags: Optional[Dict[str, str]] = None,\n skip_deprecation_warning: Optional[bool] = False,\n ):\n if asset_key and isinstance(asset_key, str):\n asset_key = AssetKey(parse_asset_key_string(asset_key))\n else:\n check.opt_inst_param(asset_key, "asset_key", AssetKey)\n\n asset_key = cast(AssetKey, asset_key)\n if not label:\n check.param_invariant(\n asset_key and asset_key.path,\n "label",\n "Either label or asset_key with a path must be provided",\n )\n label = asset_key.to_string()\n\n if not skip_deprecation_warning:\n warnings.warn("`Materialization` is deprecated; use `AssetMaterialization` instead.")\n\n metadata_entries = check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=MetadataEntry\n )\n\n return super(Materialization, cls).__new__(\n cls,\n label=check.str_param(label, "label"),\n description=check.opt_str_param(description, "description"),\n metadata_entries=check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=MetadataEntry\n ),\n asset_key=asset_key,\n partition=check.opt_str_param(partition, "partition"),\n tags=check.opt_dict_param(tags, "tags"),\n )\n\n @staticmethod\n def file(\n path: str,\n description: Optional[str] = None,\n asset_key: Optional[Union[str, AssetKey]] = None,\n ) -> "Materialization":\n """Static constructor for standard materializations corresponding to files on disk.\n\n Args:\n path (str): The path to the file.\n description (Optional[str]): A human-readable description of the materialization.\n """\n return Materialization(\n label=last_file_comp(path),\n description=description,\n metadata_entries=[MetadataEntry("path", value=MetadataValue.path(path))],\n asset_key=asset_key,\n )\n\n\n
[docs]@whitelist_for_serdes\nclass ExpectationResult(\n NamedTuple(\n "_ExpectationResult",\n [\n ("success", bool),\n ("label", Optional[str]),\n ("description", Optional[str]),\n ("metadata_entries", List[MetadataEntry]),\n ],\n )\n):\n """Event corresponding to a data quality test.\n\n Op compute functions may yield events of this type whenever they wish to indicate to the\n Dagster framework (and the end user) that a data quality test has produced a (positive or\n negative) result.\n\n Args:\n success (bool): Whether the expectation passed or not.\n label (Optional[str]): Short display name for expectation. Defaults to "result".\n description (Optional[str]): A longer human-readable description of the expectation.\n metadata_entries (Optional[List[MetadataEntry]]): Arbitrary metadata about the\n expectation.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary metadata about the expectation. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __new__(\n cls,\n success: bool,\n label: Optional[str] = None,\n description: Optional[str] = None,\n metadata_entries: Optional[List[MetadataEntry]] = None,\n metadata: Optional[Dict[str, RawMetadataValue]] = None,\n ):\n metadata_entries = check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=MetadataEntry\n )\n metadata = check.opt_dict_param(metadata, "metadata", key_type=str)\n\n return super(ExpectationResult, cls).__new__(\n cls,\n success=check.bool_param(success, "success"),\n label=check.opt_str_param(label, "label", "result"),\n description=check.opt_str_param(description, "description"),\n metadata_entries=cast(\n List[MetadataEntry], normalize_metadata(metadata, metadata_entries)\n ),\n )
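A sketch of yielding an ``ExpectationResult`` alongside an op's output; the row-count check and the ``df`` input are illustrative assumptions.

.. code-block:: python

    from dagster import ExpectationResult, Output, op

    @op
    def validate_rows(df):
        # attach a data quality result to the event log for this step
        yield ExpectationResult(
            success=len(df) > 0,
            label="rows_present",
            description="Table should contain at least one row",
            metadata={"row_count": len(df)},
        )
        yield Output(df)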
\n\n\n
[docs]@whitelist_for_serdes\nclass TypeCheck(\n NamedTuple(\n "_TypeCheck",\n [\n ("success", bool),\n ("description", Optional[str]),\n ("metadata_entries", List[MetadataEntry]),\n ],\n )\n):\n """Event corresponding to the result of a typecheck.\n\n Events of this type should be returned by user-defined type checks when they need to encapsulate\n additional metadata about a type check's success or failure. (i.e., when using\n :py:func:`as_dagster_type`, :py:func:`@usable_as_dagster_type <dagster_type>`, or the underlying\n :py:func:`PythonObjectDagsterType` API.)\n\n Solid compute functions should generally avoid yielding events of this type to avoid confusion.\n\n Args:\n success (bool): ``True`` if the type check succeeded, ``False`` otherwise.\n description (Optional[str]): A human-readable description of the type check.\n metadata_entries (Optional[List[MetadataEntry]]): Arbitrary metadata about the\n type check.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary metadata about the type check. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __new__(\n cls,\n success: bool,\n description: Optional[str] = None,\n metadata_entries: Optional[List[MetadataEntry]] = None,\n metadata: Optional[Dict[str, RawMetadataValue]] = None,\n ):\n\n metadata_entries = check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=MetadataEntry\n )\n metadata = check.opt_dict_param(metadata, "metadata", key_type=str)\n\n return super(TypeCheck, cls).__new__(\n cls,\n success=check.bool_param(success, "success"),\n description=check.opt_str_param(description, "description"),\n metadata_entries=cast(\n List[MetadataEntry], normalize_metadata(metadata, metadata_entries)\n ),\n )
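A sketch of a ``DagsterType`` whose type check function returns a ``TypeCheck`` so it can attach a description and metadata; the ``EvenInt`` type and its check are illustrative assumptions.

.. code-block:: python

    from dagster import DagsterType, TypeCheck

    def even_int_type_check(_context, value):
        # returning TypeCheck (rather than a bare bool) lets us attach metadata
        if not isinstance(value, int):
            return TypeCheck(success=False, description=f"Expected int, got {type(value).__name__}")
        return TypeCheck(
            success=value % 2 == 0,
            description="Value must be an even integer",
            metadata={"value": value},
        )

    EvenInt = DagsterType(name="EvenInt", type_check_fn=even_int_type_check)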
\n\n\n
[docs]class Failure(Exception):\n """Event indicating op failure.\n\n Raise events of this type from within op compute functions or custom type checks in order to\n indicate an unrecoverable failure in user code to the Dagster machinery and return\n structured metadata about the failure.\n\n Args:\n description (Optional[str]): A human-readable description of the failure.\n metadata_entries (Optional[List[MetadataEntry]]): Arbitrary metadata about the\n failure.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary metadata about the failure. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __init__(\n self,\n description: Optional[str] = None,\n metadata_entries: Optional[List[MetadataEntry]] = None,\n metadata: Optional[Dict[str, RawMetadataValue]] = None,\n ):\n metadata_entries = check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=MetadataEntry\n )\n metadata = check.opt_dict_param(metadata, "metadata", key_type=str)\n\n super(Failure, self).__init__(description)\n self.description = check.opt_str_param(description, "description")\n self.metadata_entries = normalize_metadata(metadata, metadata_entries)
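A sketch of raising ``Failure`` with structured metadata from an op; the settings-file check and path are illustrative assumptions.

.. code-block:: python

    import os

    from dagster import Failure, op

    @op
    def read_settings():
        settings_path = "settings.yaml"  # hypothetical file used for illustration
        if not os.path.exists(settings_path):
            # Failure surfaces the description and metadata as structured event data
            raise Failure(
                description="Settings file is missing",
                metadata={"expected_path": os.path.abspath(settings_path)},
            )
        with open(settings_path) as f:
            return f.read()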
\n\n\n
[docs]class RetryRequested(Exception):\n """\n An exception to raise from an op to indicate that it should be retried.\n\n Args:\n max_retries (Optional[int]):\n The maximum number of retries this step should attempt before failing.\n seconds_to_wait (Optional[Union[float,int]]):\n Seconds to wait before restarting the step after putting the step into the\n up_for_retry state.\n\n Example:\n\n .. code-block:: python\n\n @op\n def flakes():\n try:\n flakey_operation()\n except Exception as e:\n raise RetryRequested(max_retries=3) from e\n """\n\n def __init__(\n self, max_retries: Optional[int] = 1, seconds_to_wait: Optional[Union[float, int]] = None\n ):\n super(RetryRequested, self).__init__()\n self.max_retries = check.int_param(max_retries, "max_retries")\n self.seconds_to_wait = check.opt_numeric_param(seconds_to_wait, "seconds_to_wait")
\n\n\nclass ObjectStoreOperationType(Enum):\n SET_OBJECT = "SET_OBJECT"\n GET_OBJECT = "GET_OBJECT"\n RM_OBJECT = "RM_OBJECT"\n CP_OBJECT = "CP_OBJECT"\n\n\nclass ObjectStoreOperation(\n NamedTuple(\n "_ObjectStoreOperation",\n [\n ("op", ObjectStoreOperationType),\n ("key", str),\n ("dest_key", Optional[str]),\n ("obj", Any),\n ("serialization_strategy_name", Optional[str]),\n ("object_store_name", Optional[str]),\n ("value_name", Optional[str]),\n ("version", Optional[str]),\n ("mapping_key", Optional[str]),\n ],\n )\n):\n """This event is used internally by Dagster machinery when values are written to and read from\n an ObjectStore.\n\n Users should not import this class or yield events of this type from user code.\n\n Args:\n op (ObjectStoreOperationType): The type of the operation on the object store.\n key (str): The key of the object on which the operation was performed.\n dest_key (Optional[str]): The destination key, if any, to which the object was copied.\n obj (Any): The object, if any, retrieved by the operation.\n serialization_strategy_name (Optional[str]): The name of the serialization strategy, if any,\n employed by the operation.\n object_store_name (Optional[str]): The name of the object store that performed the\n operation.\n value_name (Optional[str]): The name of the input/output.\n version (Optional[str]): (Experimental) The version of the stored data.\n mapping_key (Optional[str]): The mapping key when a dynamic output is used.\n """\n\n def __new__(\n cls,\n op: ObjectStoreOperationType,\n key: str,\n dest_key: Optional[str] = None,\n obj: Any = None,\n serialization_strategy_name: Optional[str] = None,\n object_store_name: Optional[str] = None,\n value_name: Optional[str] = None,\n version: Optional[str] = None,\n mapping_key: Optional[str] = None,\n ):\n return super(ObjectStoreOperation, cls).__new__(\n cls,\n op=op,\n key=check.str_param(key, "key"),\n dest_key=check.opt_str_param(dest_key, "dest_key"),\n obj=obj,\n serialization_strategy_name=check.opt_str_param(\n serialization_strategy_name, "serialization_strategy_name"\n ),\n object_store_name=check.opt_str_param(object_store_name, "object_store_name"),\n value_name=check.opt_str_param(value_name, "value_name"),\n version=check.opt_str_param(version, "version"),\n mapping_key=check.opt_str_param(mapping_key, "mapping_key"),\n )\n\n @classmethod\n def serializable(cls, inst, **kwargs):\n return cls(\n **dict(\n {\n "op": inst.op.value,\n "key": inst.key,\n "dest_key": inst.dest_key,\n "obj": None,\n "serialization_strategy_name": inst.serialization_strategy_name,\n "object_store_name": inst.object_store_name,\n "value_name": inst.value_name,\n "version": inst.version,\n },\n **kwargs,\n )\n )\n\n\nclass HookExecutionResult(\n NamedTuple("_HookExecutionResult", [("hook_name", str), ("is_skipped", bool)])\n):\n """This event is used internally to indicate the execution result of a hook, e.g. whether the\n user-defined hook function is skipped.\n\n Args:\n hook_name (str): The name of the hook.\n is_skipped (bool): ``False`` if the hook_fn is executed, ``True`` otherwise.\n """\n\n def __new__(cls, hook_name: str, is_skipped: Optional[bool] = None):\n return super(HookExecutionResult, cls).__new__(\n cls,\n hook_name=check.str_param(hook_name, "hook_name"),\n is_skipped=cast(bool, check.opt_bool_param(is_skipped, "is_skipped", default=False)),\n )\n\n\nUserEvent = Union[Materialization, AssetMaterialization, AssetObservation, ExpectationResult]\n
", "current_page_name": "_modules/dagster/core/definitions/events", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.events"}, "executor_definition": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.executor_definition

\nfrom enum import Enum as PyEnum\nfrom functools import update_wrapper\nfrom typing import Any, Dict, Optional\n\nfrom dagster import check\nfrom dagster.builtins import Int\nfrom dagster.config import Field, Selector\nfrom dagster.core.definitions.configurable import (\n    ConfiguredDefinitionConfigSchema,\n    NamedConfigurableDefinition,\n)\nfrom dagster.core.definitions.reconstruct import ReconstructablePipeline\nfrom dagster.core.errors import DagsterUnmetExecutorRequirementsError\nfrom dagster.core.execution.retries import RetryMode, get_retries_config\n\nfrom .definition_config_schema import convert_user_facing_definition_config_schema\n\n\nclass ExecutorRequirement(PyEnum):\n    """\n    An ExecutorDefinition can include a list of requirements that the system uses to\n    check whether the executor will be able to work for a particular job/pipeline execution.\n    """\n\n    # The passed in IPipeline must be reconstructable across process boundaries\n    RECONSTRUCTABLE_PIPELINE = "RECONSTRUCTABLE_PIPELINE"  # This needs to still exist for folks who may have written their own executor\n    RECONSTRUCTABLE_JOB = "RECONSTRUCTABLE_PIPELINE"\n\n    # The DagsterInstance must be loadable in a different process\n    NON_EPHEMERAL_INSTANCE = "NON_EPHEMERAL_INSTANCE"\n\n    # Any solid outputs on the pipeline must be persisted\n    PERSISTENT_OUTPUTS = "PERSISTENT_OUTPUTS"\n\n\ndef multiple_process_executor_requirements():\n    return [\n        ExecutorRequirement.RECONSTRUCTABLE_JOB,\n        ExecutorRequirement.NON_EPHEMERAL_INSTANCE,\n        ExecutorRequirement.PERSISTENT_OUTPUTS,\n    ]\n\n\n
[docs]class ExecutorDefinition(NamedConfigurableDefinition):\n """\n Args:\n name (str): The name of the executor.\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data\n available in `init_context.executor_config`. If not set, Dagster will accept any config\n provided.\n requirements (Optional[List[ExecutorRequirement]]): Any requirements that must\n be met in order for the executor to be usable for a particular pipeline execution.\n executor_creation_fn(Optional[Callable]): Should accept an :py:class:`InitExecutorContext`\n and return an instance of :py:class:`Executor`\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by the\n executor.\n """\n\n def __init__(\n self,\n name,\n config_schema=None,\n requirements=None,\n executor_creation_fn=None,\n description=None,\n ):\n self._name = check.str_param(name, "name")\n if callable(requirements):\n self._requirements_fn = requirements\n else:\n requirements_lst = check.opt_list_param(\n requirements, "requirements", of_type=ExecutorRequirement\n )\n self._requirements_fn = lambda _: requirements_lst\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._executor_creation_fn = check.opt_callable_param(\n executor_creation_fn, "executor_creation_fn"\n )\n self._description = check.opt_str_param(description, "description")\n\n @property\n def name(self):\n return self._name\n\n @property\n def description(self):\n return self._description\n\n @property\n def config_schema(self):\n return self._config_schema\n\n def get_requirements(self, executor_config: Dict[str, Any]):\n return self._requirements_fn(executor_config)\n\n @property\n def executor_creation_fn(self):\n return self._executor_creation_fn\n\n def copy_for_configured(self, name, description, config_schema, _):\n return ExecutorDefinition(\n name=name,\n config_schema=config_schema,\n executor_creation_fn=self.executor_creation_fn,\n description=description or self.description,\n requirements=self._requirements_fn,\n )\n\n # Backcompat: Overrides configured method to provide name as a keyword argument.\n # If no name is provided, the name is pulled off of this ExecutorDefinition.\n
[docs] def configured(\n self,\n config_or_config_fn: Any,\n name: Optional[str] = None,\n config_schema: Optional[Dict[str, Any]] = None,\n description: Optional[str] = None,\n ):\n """\n Wraps this object in an object of the same type that provides configuration to the inner\n object.\n\n Args:\n config_or_config_fn (Union[Any, Callable[[Any], Any]]): Either (1) Run configuration\n that fully satisfies this object's config schema or (2) A function that accepts run\n configuration and returns run configuration that fully satisfies this object's\n config schema. In the latter case, config_schema must be specified. When\n passing a function, it's easiest to use :py:func:`configured`.\n name (Optional[str]): Name of the new definition. If not provided, the emitted\n definition will inherit the name of the `ExecutorDefinition` upon which this\n function is called.\n config_schema (Optional[ConfigSchema]): If config_or_config_fn is a function, the config\n schema that its input must satisfy. If not set, Dagster will accept any config\n provided.\n description (Optional[str]): Description of the new definition. If not specified,\n inherits the description of the definition being configured.\n\n Returns (ConfigurableDefinition): A configured version of this object.\n """\n\n name = check.opt_str_param(name, "name")\n\n new_config_schema = ConfiguredDefinitionConfigSchema(\n self, convert_user_facing_definition_config_schema(config_schema), config_or_config_fn\n )\n\n return self.copy_for_configured(\n name or self.name, description, new_config_schema, config_or_config_fn\n )
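For example, a built-in executor can be pre-configured through this method; the values below are illustrative and assume the ``multiprocess_executor`` defined later in this module.

.. code-block:: python

    from dagster import job, multiprocess_executor, op

    # pin max_concurrent ahead of time instead of supplying it in run config
    two_at_a_time = multiprocess_executor.configured(
        {"max_concurrent": 2}, name="two_at_a_time"
    )

    @op
    def crunch():
        ...

    @job(executor_def=two_at_a_time)
    def batch_job():
        crunch()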
\n\n\n
[docs]def executor(\n name=None,\n config_schema=None,\n requirements=None,\n):\n """Define an executor.\n\n The decorated function should accept an :py:class:`InitExecutorContext` and return an instance\n of :py:class:`Executor`.\n\n Args:\n name (Optional[str]): The name of the executor.\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.executor_config`. If not set, Dagster will accept any config provided.\n requirements (Optional[List[ExecutorRequirement]]): Any requirements that must\n be met in order for the executor to be usable for a particular pipeline execution.\n """\n if callable(name):\n check.invariant(config_schema is None)\n check.invariant(requirements is None)\n return _ExecutorDecoratorCallable()(name)\n\n return _ExecutorDecoratorCallable(\n name=name, config_schema=config_schema, requirements=requirements\n )
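A minimal sketch of the decorator's contract (an ``InitExecutorContext`` in, an ``Executor`` out), delegating to the built-in ``InProcessExecutor`` used elsewhere in this module; the executor name is illustrative.

.. code-block:: python

    from dagster import executor
    from dagster.core.execution.retries import RetryMode
    from dagster.core.executor.in_process import InProcessExecutor

    @executor(name="my_in_process")
    def my_in_process_executor(init_context):
        # init_context.executor_config would hold any config declared via config_schema
        return InProcessExecutor(retries=RetryMode.ENABLED, marker_to_close=None)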
\n\n\nclass _ExecutorDecoratorCallable:\n def __init__(self, name=None, config_schema=None, requirements=None):\n self.name = check.opt_str_param(name, "name")\n self.config_schema = config_schema # type check in definition\n self.requirements = requirements\n\n def __call__(self, fn):\n check.callable_param(fn, "fn")\n\n if not self.name:\n self.name = fn.__name__\n\n executor_def = ExecutorDefinition(\n name=self.name,\n config_schema=self.config_schema,\n executor_creation_fn=fn,\n requirements=self.requirements,\n )\n\n update_wrapper(executor_def, wrapped=fn)\n\n return executor_def\n\n\ndef _core_in_process_executor_creation(config: Dict[str, Any]):\n from dagster.core.executor.in_process import InProcessExecutor\n\n return InProcessExecutor(\n # shouldn't need to .get() here - issue with defaults in config setup\n retries=RetryMode.from_config(config["retries"]),\n marker_to_close=config.get("marker_to_close"),\n )\n\n\nIN_PROC_CONFIG = {\n "retries": get_retries_config(),\n "marker_to_close": Field(str, is_required=False),\n}\n\n\n
[docs]@executor(\n name="in_process",\n config_schema=IN_PROC_CONFIG,\n)\ndef in_process_executor(init_context):\n """The in-process executor executes all steps in a single process.\n\n For legacy pipelines, this will be the default executor. To select it explicitly,\n include the following top-level fragment in config:\n\n .. code-block:: yaml\n\n execution:\n in_process:\n\n Execution priority can be configured using the ``dagster/priority`` tag via solid/op metadata,\n where the higher the number the higher the priority. 0 is the default and both positive\n and negative numbers can be used.\n """\n return _core_in_process_executor_creation(init_context.executor_config)
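On a job (as opposed to a legacy pipeline), the executor is selected with ``executor_def`` rather than the ``execution:`` config fragment above; a brief illustrative sketch:

.. code-block:: python

    from dagster import in_process_executor, job, op

    @op
    def do_work():
        return 1

    # all steps of serial_job run in a single process
    @job(executor_def=in_process_executor)
    def serial_job():
        do_work()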
\n\n\n@executor(name="execute_in_process_executor")\ndef execute_in_process_executor(_):\n """Executor used by execute_in_process.\n\n Use of this executor triggers special behavior in the config system that ignores all incoming\n executor config. This is because someone might set executor config on a job, and when we foist\n this executor onto the job for `execute_in_process`, that config becomes nonsensical.\n """\n from dagster.core.executor.in_process import InProcessExecutor\n\n return InProcessExecutor(\n retries=RetryMode.ENABLED,\n marker_to_close=None,\n )\n\n\ndef _core_multiprocess_executor_creation(config: Dict[str, Any]):\n from dagster.core.executor.multiprocess import MultiprocessExecutor\n\n # unpack optional selector\n start_method = None\n start_cfg = {}\n start_selector = config.get("start_method")\n if start_selector:\n start_method, start_cfg = list(start_selector.items())[0]\n\n return MultiprocessExecutor(\n max_concurrent=config["max_concurrent"],\n retries=RetryMode.from_config(config["retries"]),\n start_method=start_method,\n explicit_forkserver_preload=start_cfg.get("preload_modules"),\n )\n\n\nMULTI_PROC_CONFIG = {\n "max_concurrent": Field(Int, is_required=False, default_value=0),\n "start_method": Field(\n Selector(\n {\n "spawn": {},\n "forkserver": {\n "preload_modules": Field(\n [str],\n is_required=False,\n description="Explicit modules to preload in the forkserver.",\n ),\n },\n # fork currently unsupported due to threads usage\n }\n ),\n is_required=False,\n description=(\n "Select how subprocesses are created. Defaults to spawn.\\n"\n "When forkserver is selected, set_forkserver_preload will be called with either:\\n"\n "* the preload_modules list if provided by config\\n"\n "* the module containing the Job if it was loaded from a module\\n"\n "* dagster\\n"\n "https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods"\n ),\n ),\n "retries": get_retries_config(),\n}\n\n\n
[docs]@executor(\n name="multiprocess",\n config_schema=MULTI_PROC_CONFIG,\n requirements=multiple_process_executor_requirements(),\n)\ndef multiprocess_executor(init_context):\n """The multiprocess executor executes each step in an individual process.\n\n Any job that does not specify custom executors will use the multiprocess_executor by default.\n For jobs or legacy pipelines, to configure the multiprocess executor, include a fragment such\n as the following in your config:\n\n .. code-block:: yaml\n\n execution:\n multiprocess:\n config:\n max_concurrent: 4\n\n The ``max_concurrent`` arg is optional and tells the execution engine how many processes may run\n concurrently. By default, or if you set ``max_concurrent`` to be 0, this is the return value of\n :py:func:`python:multiprocessing.cpu_count`.\n\n Execution priority can be configured using the ``dagster/priority`` tag via solid/op metadata,\n where the higher the number the higher the priority. 0 is the default and both positive\n and negative numbers can be used.\n """\n return _core_multiprocess_executor_creation(init_context.executor_config)
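The executor can also be attached when turning a graph into a job; the graph below is an illustrative sketch, and ``max_concurrent`` can then be tuned per run through the config fragment shown above.

.. code-block:: python

    from dagster import graph, multiprocess_executor, op

    @op
    def crunch_partition():
        ...

    @graph
    def crunching():
        crunch_partition()

    # each step of parallel_job runs in its own process
    parallel_job = crunching.to_job(executor_def=multiprocess_executor)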
\n\n\ndefault_executors = [in_process_executor, multiprocess_executor]\n\n\ndef check_cross_process_constraints(init_context):\n from dagster.core.executor.init import InitExecutorContext\n\n check.inst_param(init_context, "init_context", InitExecutorContext)\n requirements_lst = init_context.executor_def.get_requirements(init_context.executor_config)\n\n if ExecutorRequirement.RECONSTRUCTABLE_JOB in requirements_lst:\n _check_intra_process_pipeline(init_context.pipeline)\n\n if ExecutorRequirement.NON_EPHEMERAL_INSTANCE in requirements_lst:\n _check_non_ephemeral_instance(init_context.instance)\n\n\ndef _check_intra_process_pipeline(pipeline):\n from dagster.core.definitions import JobDefinition\n\n if not isinstance(pipeline, ReconstructablePipeline):\n target = "job" if isinstance(pipeline.get_definition(), JobDefinition) else "pipeline"\n raise DagsterUnmetExecutorRequirementsError(\n 'You have attempted to use an executor that uses multiple processes with the {target} "{name}" '\n "that is not reconstructable. {target_cap} must be loaded in a way that allows dagster to reconstruct "\n "them in a new process. This means: \\n"\n " * using the file, module, or repository.yaml arguments of dagit/dagster-graphql/dagster\\n"\n " * loading the {target} through the reconstructable() function\\n".format(\n target=target, name=pipeline.get_definition().name, target_cap=target.capitalize()\n )\n )\n\n\ndef _check_non_ephemeral_instance(instance):\n if instance.is_ephemeral:\n raise DagsterUnmetExecutorRequirementsError(\n "You have attempted to use an executor that uses multiple processes with an "\n "ephemeral DagsterInstance. A non-ephemeral instance is needed to coordinate "\n "execution between multiple processes. You can configure your default instance "\n "via $DAGSTER_HOME or ensure a valid one is passed when invoking the python APIs. "\n "You can learn more about setting up a persistent DagsterInstance from the "\n "DagsterInstance docs here: https://docs.dagster.io/deployment/dagster-instance#default-local-behavior"\n )\n\n\ndef _get_default_executor_requirements(executor_config):\n return multiple_process_executor_requirements() if "multiprocess" in executor_config else []\n\n\n@executor(\n name="multi_or_in_process_executor",\n config_schema=Field(\n Selector(\n {"multiprocess": MULTI_PROC_CONFIG, "in_process": IN_PROC_CONFIG},\n ),\n default_value={"multiprocess": {}},\n ),\n requirements=_get_default_executor_requirements,\n)\ndef multi_or_in_process_executor(init_context):\n """The default executor for a job.\n\n This is the executor available by default on a :py:class:`JobDefinition`\n that does not provide custom executors. This executor has a multiprocessing-enabled mode, and a\n single-process mode. By default, multiprocessing mode is enabled. Switching between multiprocess\n mode and in-process mode can be achieved via config.\n\n .. code-block:: yaml\n\n execution:\n config:\n multiprocess:\n\n\n execution:\n config:\n in_process:\n\n When using the multiprocess mode, ``max_concurrent`` and ``retries`` can also be configured.\n\n\n multiprocess:\n config:\n max_concurrent: 4\n retries:\n enabled:\n\n The ``max_concurrent`` arg is optional and tells the execution engine how many processes may run\n concurrently. 
By default, or if you set ``max_concurrent`` to be 0, this is the return value of\n :py:func:`python:multiprocessing.cpu_count`.\n\n When using the in_process mode, then only retries can be configured.\n\n Execution priority can be configured using the ``dagster/priority`` tag via solid metadata,\n where the higher the number the higher the priority. 0 is the default and both positive\n and negative numbers can be used.\n """\n if "multiprocess" in init_context.executor_config:\n return _core_multiprocess_executor_creation(init_context.executor_config["multiprocess"])\n else:\n return _core_in_process_executor_creation(init_context.executor_config["in_process"])\n
", "current_page_name": "_modules/dagster/core/definitions/executor_definition", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.executor_definition"}, "graph_definition": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.graph_definition

\nfrom collections import OrderedDict\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Iterable,\n    Iterator,\n    List,\n    Optional,\n    Set,\n    Tuple,\n    Union,\n    cast,\n)\n\nfrom toposort import CircularDependencyError, toposort_flatten\n\nfrom dagster import check\nfrom dagster.config import Field, Shape\nfrom dagster.config.config_type import ConfigType\nfrom dagster.config.validate import validate_config\nfrom dagster.core.definitions.config import ConfigMapping\nfrom dagster.core.definitions.definition_config_schema import IDefinitionConfigSchema\nfrom dagster.core.definitions.mode import ModeDefinition\nfrom dagster.core.definitions.policy import RetryPolicy\nfrom dagster.core.definitions.resource_definition import ResourceDefinition\nfrom dagster.core.definitions.utils import check_valid_name\nfrom dagster.core.errors import DagsterInvalidConfigError, DagsterInvalidDefinitionError\nfrom dagster.core.storage.io_manager import io_manager\nfrom dagster.core.types.dagster_type import (\n    DagsterType,\n    DagsterTypeKind,\n    construct_dagster_type_dictionary,\n)\nfrom dagster.utils import merge_dicts\n\nfrom .dependency import (\n    DependencyStructure,\n    IDependencyDefinition,\n    Node,\n    NodeHandle,\n    NodeInvocation,\n    SolidInputHandle,\n)\nfrom .hook_definition import HookDefinition\nfrom .input import FanInInputPointer, InputDefinition, InputMapping, InputPointer\nfrom .logger_definition import LoggerDefinition\nfrom .node_definition import NodeDefinition\nfrom .output import OutputDefinition, OutputMapping\nfrom .preset import PresetDefinition\nfrom .solid_container import create_execution_structure, validate_dependency_dict\nfrom .version_strategy import VersionStrategy\n\nif TYPE_CHECKING:\n    from dagster.core.execution.execute_in_process_result import ExecuteInProcessResult\n    from dagster.core.instance import DagsterInstance\n\n    from .executor_definition import ExecutorDefinition\n    from .job_definition import JobDefinition\n    from .partition import PartitionedConfig, PartitionsDefinition\n    from .solid_definition import SolidDefinition\n\n\ndef _check_node_defs_arg(graph_name: str, node_defs: Optional[List[NodeDefinition]]):\n    node_defs = node_defs or []\n\n    if not isinstance(node_defs, list):\n        raise DagsterInvalidDefinitionError(\n            '"nodes" arg to "{name}" is not a list. Got {val}.'.format(\n                name=graph_name, val=repr(node_defs)\n            )\n        )\n    for node_def in node_defs:\n        if isinstance(node_def, NodeDefinition):\n            continue\n        elif callable(node_def):\n            raise DagsterInvalidDefinitionError(\n                """You have passed a lambda or function {func} into {name} that is\n                not a node. 
You have likely forgotten to annotate this function with\n                the @op or @graph decorators.\n                """.format(\n                    name=graph_name, func=node_def.__name__\n                )\n            )\n        else:\n            raise DagsterInvalidDefinitionError(\n                "Invalid item in node list: {item}".format(item=repr(node_def))\n            )\n\n    return node_defs\n\n\ndef _create_adjacency_lists(\n    solids: List[Node],\n    dep_structure: DependencyStructure,\n) -> Tuple[Dict[str, Set[Node]], Dict[str, Set[Node]]]:\n    visit_dict = {s.name: False for s in solids}\n    forward_edges: Dict[str, Set[Node]] = {s.name: set() for s in solids}\n    backward_edges: Dict[str, Set[Node]] = {s.name: set() for s in solids}\n\n    def visit(solid_name):\n        if visit_dict[solid_name]:\n            return\n\n        visit_dict[solid_name] = True\n\n        for output_handle in dep_structure.all_upstream_outputs_from_solid(solid_name):\n            forward_node = output_handle.solid.name\n            backward_node = solid_name\n            if forward_node in forward_edges:\n                forward_edges[forward_node].add(backward_node)\n                backward_edges[backward_node].add(forward_node)\n                visit(forward_node)\n\n    for s in solids:\n        visit(s.name)\n\n    return (forward_edges, backward_edges)\n\n\n
[docs]class GraphDefinition(NodeDefinition):\n """Defines a Dagster graph.\n\n A graph is made up of\n\n - Nodes, which can either be an op (the functional unit of computation), or another graph.\n - Dependencies, which determine how the values produced by nodes as outputs flow from\n one node to another. This tells Dagster how to arrange nodes into a directed, acyclic graph\n (DAG) of compute.\n\n End users should prefer the :func:`@graph <graph>` decorator. GraphDefinition is generally\n intended to be used by framework authors or for programmatically generated graphs.\n\n Args:\n name (str): The name of the graph. Must be unique within any :py:class:`GraphDefinition`\n or :py:class:`JobDefinition` containing the graph.\n description (Optional[str]): A human-readable description of the graph.\n node_defs (Optional[List[NodeDefinition]]): The set of ops / graphs used in this graph.\n dependencies (Optional[Dict[Union[str, NodeInvocation], Dict[str, DependencyDefinition]]]):\n A structure that declares the dependencies of each op's inputs on the outputs of other\n ops in the graph. Keys of the top level dict are either the string names of ops in the\n graph or, in the case of aliased ops, :py:class:`NodeInvocations <NodeInvocation>`.\n Values of the top level dict are themselves dicts, which map input names belonging to\n the op or aliased op to :py:class:`DependencyDefinitions <DependencyDefinition>`.\n input_mappings (Optional[List[InputMapping]]): Defines the inputs to the nested graph, and\n how they map to the inputs of its constituent ops.\n output_mappings (Optional[List[OutputMapping]]): Defines the outputs of the nested graph,\n and how they map from the outputs of its constituent ops.\n config (Optional[ConfigMapping]): Defines the config of the graph, and how its schema maps\n to the config of its constituent ops.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for any execution of the graph.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag\n values provided at invocation time.\n\n Examples:\n\n .. 
code-block:: python\n\n @op\n def return_one():\n return 1\n\n @op\n def add_one(num):\n return num + 1\n\n graph_def = GraphDefinition(\n name='basic',\n node_defs=[return_one, add_one],\n dependencies={'add_one': {'num': DependencyDefinition('return_one')}},\n )\n """\n\n def __init__(\n self,\n name: str,\n description: Optional[str] = None,\n node_defs: Optional[List[NodeDefinition]] = None,\n dependencies: Optional[\n Dict[Union[str, NodeInvocation], Dict[str, IDependencyDefinition]]\n ] = None,\n input_mappings: Optional[List[InputMapping]] = None,\n output_mappings: Optional[List[OutputMapping]] = None,\n config: Optional[ConfigMapping] = None,\n tags: Optional[Dict[str, Any]] = None,\n **kwargs,\n ):\n self._node_defs = _check_node_defs_arg(name, node_defs)\n self._dagster_type_dict = construct_dagster_type_dictionary(self._node_defs)\n self._dependencies = validate_dependency_dict(dependencies)\n self._dependency_structure, self._node_dict = create_execution_structure(\n self._node_defs, self._dependencies, graph_definition=self\n )\n\n # List[InputMapping]\n self._input_mappings, input_defs = _validate_in_mappings(\n check.opt_list_param(input_mappings, "input_mappings"),\n self._node_dict,\n self._dependency_structure,\n name,\n class_name=type(self).__name__,\n )\n # List[OutputMapping]\n self._output_mappings = _validate_out_mappings(\n check.opt_list_param(output_mappings, "output_mappings"),\n self._node_dict,\n self._dependency_structure,\n name,\n class_name=type(self).__name__,\n )\n\n self._config_mapping = check.opt_inst_param(config, "config", ConfigMapping)\n\n super(GraphDefinition, self).__init__(\n name=name,\n description=description,\n input_defs=input_defs,\n output_defs=[output_mapping.definition for output_mapping in self._output_mappings],\n tags=tags,\n **kwargs,\n )\n\n # must happen after base class construction as properties are assumed to be there\n # eager computation to detect cycles\n self.solids_in_topological_order = self._solids_in_topological_order()\n\n def _solids_in_topological_order(self):\n\n _forward_edges, backward_edges = _create_adjacency_lists(\n self.solids, self.dependency_structure\n )\n\n try:\n order = toposort_flatten(backward_edges)\n except CircularDependencyError as err:\n raise DagsterInvalidDefinitionError(str(err)) from err\n\n return [self.solid_named(solid_name) for solid_name in order]\n\n @property\n def node_type_str(self) -> str:\n return "graph"\n\n @property\n def is_graph_job_op_node(self) -> bool:\n return True\n\n @property\n def solids(self) -> List[Node]:\n return list(set(self._node_dict.values()))\n\n @property\n def node_dict(self) -> Dict[str, Node]:\n return self._node_dict\n\n @property\n def node_defs(self) -> List[NodeDefinition]:\n return self._node_defs\n\n def has_solid_named(self, name: str) -> bool:\n check.str_param(name, "name")\n return name in self._node_dict\n\n def solid_named(self, name: str) -> Node:\n check.str_param(name, "name")\n check.invariant(\n name in self._node_dict,\n "{graph_name} has no solid named {name}.".format(graph_name=self._name, name=name),\n )\n\n return self._node_dict[name]\n\n def get_solid(self, handle: NodeHandle) -> Node:\n check.inst_param(handle, "handle", NodeHandle)\n current = handle\n lineage = []\n while current:\n lineage.append(current.name)\n current = current.parent\n\n name = lineage.pop()\n solid = self.solid_named(name)\n while lineage:\n name = lineage.pop()\n solid = solid.definition.solid_named(name)\n\n return solid\n\n def iterate_node_defs(self) 
-> Iterator[NodeDefinition]:\n yield self\n for outer_node_def in self._node_defs:\n yield from outer_node_def.iterate_node_defs()\n\n def iterate_solid_defs(self) -> Iterator["SolidDefinition"]:\n for outer_node_def in self._node_defs:\n yield from outer_node_def.iterate_solid_defs()\n\n @property\n def input_mappings(self) -> List[InputMapping]:\n return self._input_mappings\n\n @property\n def output_mappings(self) -> List[OutputMapping]:\n return self._output_mappings\n\n @property\n def config_mapping(self) -> Optional[ConfigMapping]:\n return self._config_mapping\n\n @property\n def has_config_mapping(self) -> bool:\n return self._config_mapping is not None\n\n def all_dagster_types(self) -> Iterable[DagsterType]:\n return self._dagster_type_dict.values()\n\n def has_dagster_type(self, name):\n check.str_param(name, "name")\n return name in self._dagster_type_dict\n\n def dagster_type_named(self, name):\n check.str_param(name, "name")\n return self._dagster_type_dict[name]\n\n def get_input_mapping(self, input_name: str) -> InputMapping:\n\n check.str_param(input_name, "input_name")\n for mapping in self._input_mappings:\n if mapping.definition.name == input_name:\n return mapping\n check.failed(f"Could not find input mapping {input_name}")\n\n def input_mapping_for_pointer(\n self, pointer: Union[InputPointer, FanInInputPointer]\n ) -> Optional[InputMapping]:\n check.inst_param(pointer, "pointer", (InputPointer, FanInInputPointer))\n\n for mapping in self._input_mappings:\n if mapping.maps_to == pointer:\n return mapping\n return None\n\n def get_output_mapping(self, output_name: str) -> OutputMapping:\n check.str_param(output_name, "output_name")\n for mapping in self._output_mappings:\n if mapping.definition.name == output_name:\n return mapping\n check.failed(f"Could not find output mapping {output_name}")\n\n def resolve_output_to_origin(\n self, output_name: str, handle: NodeHandle\n ) -> Tuple[OutputDefinition, NodeHandle]:\n check.str_param(output_name, "output_name")\n check.inst_param(handle, "handle", NodeHandle)\n\n mapping = self.get_output_mapping(output_name)\n check.invariant(mapping, "Can only resolve outputs for valid output names")\n mapped_solid = self.solid_named(mapping.maps_from.solid_name)\n return mapped_solid.definition.resolve_output_to_origin(\n mapping.maps_from.output_name,\n NodeHandle(mapped_solid.name, handle),\n )\n\n def default_value_for_input(self, input_name: str) -> Any:\n check.str_param(input_name, "input_name")\n\n # base case\n if self.input_def_named(input_name).has_default_value:\n return self.input_def_named(input_name).default_value\n\n mapping = self.get_input_mapping(input_name)\n check.invariant(mapping, "Can only resolve inputs for valid input names")\n mapped_solid = self.solid_named(mapping.maps_to.solid_name)\n\n return mapped_solid.definition.default_value_for_input(mapping.maps_to.input_name)\n\n def input_has_default(self, input_name: str) -> bool:\n check.str_param(input_name, "input_name")\n\n # base case\n if self.input_def_named(input_name).has_default_value:\n return True\n\n mapping = self.get_input_mapping(input_name)\n check.invariant(mapping, "Can only resolve inputs for valid input names")\n mapped_solid = self.solid_named(mapping.maps_to.solid_name)\n\n return mapped_solid.definition.input_has_default(mapping.maps_to.input_name)\n\n @property\n def dependencies(self) -> Dict[Union[str, NodeInvocation], Dict[str, IDependencyDefinition]]:\n return self._dependencies\n\n @property\n def dependency_structure(self) -> 
DependencyStructure:\n return self._dependency_structure\n\n @property\n def config_schema(self) -> Optional[IDefinitionConfigSchema]:\n return self.config_mapping.config_schema if self.config_mapping is not None else None\n\n def input_supports_dynamic_output_dep(self, input_name: str) -> bool:\n mapping = self.get_input_mapping(input_name)\n target_node = mapping.maps_to.solid_name\n # check if input mapped to solid which is downstream of another dynamic output within\n if self.dependency_structure.is_dynamic_mapped(target_node):\n return False\n\n # check if input mapped to solid which starts new dynamic downstream\n if self.dependency_structure.has_dynamic_downstreams(target_node):\n return False\n\n return self.solid_named(target_node).definition.input_supports_dynamic_output_dep(\n mapping.maps_to.input_name\n )\n\n def copy_for_configured(\n self,\n name: str,\n description: Optional[str],\n config_schema: Any,\n config_or_config_fn: Any,\n ):\n if not self.has_config_mapping:\n raise DagsterInvalidDefinitionError(\n "Only graphs utilizing config mapping can be pre-configured. The graph "\n '"{graph_name}" does not have a config mapping, and thus has nothing to be '\n "configured.".format(graph_name=self.name)\n )\n config_mapping = cast(ConfigMapping, self.config_mapping)\n return GraphDefinition(\n name=name,\n description=check.opt_str_param(description, "description", default=self.description),\n node_defs=self._node_defs,\n dependencies=self._dependencies,\n input_mappings=self._input_mappings,\n output_mappings=self._output_mappings,\n config=ConfigMapping(\n config_mapping.config_fn,\n config_schema=config_schema,\n receive_processed_config_values=config_mapping.receive_processed_config_values,\n ),\n )\n\n def node_names(self):\n return list(self._node_dict.keys())\n\n
[docs] def to_job(\n self,\n name: Optional[str] = None,\n description: Optional[str] = None,\n resource_defs: Optional[Dict[str, ResourceDefinition]] = None,\n config: Optional[Union[ConfigMapping, Dict[str, Any], "PartitionedConfig"]] = None,\n tags: Optional[Dict[str, Any]] = None,\n logger_defs: Optional[Dict[str, LoggerDefinition]] = None,\n executor_def: Optional["ExecutorDefinition"] = None,\n hooks: Optional[AbstractSet[HookDefinition]] = None,\n op_retry_policy: Optional[RetryPolicy] = None,\n version_strategy: Optional[VersionStrategy] = None,\n op_selection: Optional[List[str]] = None,\n partitions_def: Optional["PartitionsDefinition"] = None,\n ) -> "JobDefinition":\n """\n Make this graph into an executable Job by providing the remaining components required for execution.\n\n Args:\n name (Optional[str]):\n The name for the Job. Defaults to the name of this graph.\n resource_defs (Optional[Dict[str, ResourceDefinition]]):\n Resources that are required by this graph for execution.\n If not defined, `io_manager` will default to filesystem.\n config:\n Describes how the job is parameterized at runtime.\n\n If no value is provided, then the schema for the job's run config is a standard\n format based on its solids and resources.\n\n If a dictionary is provided, then it must conform to the standard config schema, and\n it will be used as the job's run config for the job whenever the job is executed.\n The values provided will be viewable and editable in the Dagit playground, so be\n careful with secrets.\n\n If a :py:class:`ConfigMapping` object is provided, then the schema for the job's run config is\n determined by the config mapping, which should return\n configuration in the standard format to configure the job.\n\n If a :py:class:`PartitionedConfig` object is provided, then it defines a discrete set of config\n values that can parameterize the job, as well as a function for mapping those\n values to the base config. The values provided will be viewable and editable in the\n Dagit playground, so be careful with secrets.\n tags (Optional[Dict[str, Any]]):\n Arbitrary metadata for any execution of the Job.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag\n values provided at invocation time.\n logger_defs (Optional[Dict[str, LoggerDefinition]]):\n A dictionary of string logger identifiers to their implementations.\n executor_def (Optional[ExecutorDefinition]):\n How this Job will be executed. Defaults to :py:class:`multi_or_in_process_executor`,\n which can be switched between multi-process and in-process modes of execution. The\n default mode of execution is multi-process.\n op_retry_policy (Optional[RetryPolicy]): The default retry policy for all ops in this job.\n Only used if retry policy is not defined on the op definition or op invocation.\n version_strategy (Optional[VersionStrategy]):\n Defines how each solid (and optionally, resource) in the job can be versioned. If\n provided, memoization will be enabled for this job.\n partitions_def (Optional[PartitionsDefinition]): Defines a discrete set of partition\n keys that can parameterize the job. 
If this argument is supplied, the config\n argument can't also be supplied.\n\n Returns:\n JobDefinition\n """\n from .executor_definition import ExecutorDefinition, multi_or_in_process_executor\n from .job_definition import JobDefinition\n from .partition import PartitionedConfig, PartitionsDefinition\n\n job_name = check_valid_name(name or self.name)\n\n tags = check.opt_dict_param(tags, "tags", key_type=str)\n executor_def = check.opt_inst_param(\n executor_def, "executor_def", ExecutorDefinition, default=multi_or_in_process_executor\n )\n\n if resource_defs and "io_manager" in resource_defs:\n resource_defs_with_defaults = resource_defs\n else:\n resource_defs_with_defaults = merge_dicts(\n {"io_manager": default_job_io_manager}, resource_defs or {}\n )\n\n hooks = check.opt_set_param(hooks, "hooks", of_type=HookDefinition)\n op_retry_policy = check.opt_inst_param(op_retry_policy, "op_retry_policy", RetryPolicy)\n op_selection = check.opt_list_param(op_selection, "op_selection", of_type=str)\n presets = []\n config_mapping = None\n partitioned_config = None\n\n if partitions_def:\n check.inst_param(partitions_def, "partitions_def", PartitionsDefinition)\n check.invariant(\n config is None, "Can't supply both the 'config' and 'partitions_def' arguments"\n )\n partitioned_config = PartitionedConfig(partitions_def, lambda _: {})\n\n if isinstance(config, ConfigMapping):\n config_mapping = config\n elif isinstance(config, PartitionedConfig):\n partitioned_config = config\n elif isinstance(config, dict):\n presets = [PresetDefinition(name="default", run_config=config)]\n # Using config mapping here is a trick to make it so that the preset will be used even\n # when no config is supplied for the job.\n config_mapping = _config_mapping_with_default_value(\n self._get_config_schema(resource_defs_with_defaults, executor_def, logger_defs),\n config,\n job_name,\n self.name,\n )\n elif config is not None:\n check.failed(\n f"config param must be a ConfigMapping, a PartitionedConfig, or a dictionary, but "\n f"is an object of type {type(config)}"\n )\n\n return JobDefinition(\n name=job_name,\n description=description or self.description,\n graph_def=self,\n mode_def=ModeDefinition(\n resource_defs=resource_defs_with_defaults,\n logger_defs=logger_defs,\n executor_defs=[executor_def],\n _config_mapping=config_mapping,\n _partitioned_config=partitioned_config,\n ),\n preset_defs=presets,\n tags=tags,\n hook_defs=hooks,\n version_strategy=version_strategy,\n op_retry_policy=op_retry_policy,\n ).get_job_def_for_op_selection(op_selection)
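A sketch of calling ``to_job`` on a graph; the ``warehouse`` resource key, its hardcoded value, and the tag are illustrative assumptions.

.. code-block:: python

    from dagster import ResourceDefinition, graph, op

    @op(required_resource_keys={"warehouse"})
    def ingest(context):
        # the "warehouse" resource stands in for whatever the op needs at runtime
        return context.resources.warehouse

    @graph
    def ingestion():
        ingest()

    ingestion_job = ingestion.to_job(
        name="nightly_ingestion",
        resource_defs={"warehouse": ResourceDefinition.hardcoded_resource("duckdb://local")},
        tags={"team": "data-platform"},
    )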
\n\n def coerce_to_job(self):\n # attempt to coerce a Graph into a Job, raising a useful error if it doesn't work\n try:\n return self.to_job()\n except DagsterInvalidDefinitionError as err:\n raise DagsterInvalidDefinitionError(\n f"Failed attempting to coerce Graph {self.name} into a Job. "\n "Use to_job instead, passing the required information."\n ) from err\n\n def _get_config_schema(\n self,\n resource_defs: Optional[Dict[str, ResourceDefinition]],\n executor_def: "ExecutorDefinition",\n logger_defs: Optional[Dict[str, LoggerDefinition]],\n ) -> ConfigType:\n from .job_definition import JobDefinition\n\n return (\n JobDefinition(\n name=self.name,\n graph_def=self,\n mode_def=ModeDefinition(\n resource_defs=resource_defs,\n executor_defs=[executor_def],\n logger_defs=logger_defs,\n ),\n )\n .get_run_config_schema("default")\n .run_config_schema_type\n )\n\n
[docs] def execute_in_process(\n self,\n run_config: Any = None,\n instance: Optional["DagsterInstance"] = None,\n resources: Optional[Dict[str, Any]] = None,\n raise_on_error: bool = True,\n op_selection: Optional[List[str]] = None,\n run_id: Optional[str] = None,\n ) -> "ExecuteInProcessResult":\n """\n Execute this graph in-process, collecting results in-memory.\n\n Args:\n run_config (Optional[Dict[str, Any]]):\n Run config to provide to execution. The configuration for the underlying graph\n should exist under the "ops" key.\n instance (Optional[DagsterInstance]):\n The instance to execute against, an ephemeral one will be used if none provided.\n resources (Optional[Dict[str, Any]]):\n The resources needed if any are required. Can provide resource instances directly,\n or resource definitions.\n raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n Defaults to ``True``.\n op_selection (Optional[List[str]]): A list of op selection queries (including single op\n names) to execute. For example:\n * ``['some_op']``: selects ``some_op`` itself.\n * ``['*some_op']``: select ``some_op`` and all its ancestors (upstream dependencies).\n * ``['*some_op+++']``: select ``some_op``, all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n * ``['*some_op', 'other_op_a', 'other_op_b+']``: select ``some_op`` and all its\n ancestors, ``other_op_a`` itself, and ``other_op_b`` and its direct child ops.\n\n Returns:\n :py:class:`~dagster.ExecuteInProcessResult`\n """\n from dagster.core.execution.build_resources import wrap_resources_for_execution\n from dagster.core.execution.execute_in_process import core_execute_in_process\n from dagster.core.instance import DagsterInstance\n\n from .executor_definition import execute_in_process_executor\n from .job_definition import JobDefinition\n\n instance = check.opt_inst_param(instance, "instance", DagsterInstance)\n resources = check.opt_dict_param(resources, "resources", key_type=str)\n\n resource_defs = wrap_resources_for_execution(resources)\n in_proc_mode = ModeDefinition(\n executor_defs=[execute_in_process_executor], resource_defs=resource_defs\n )\n ephemeral_job = JobDefinition(\n name=self._name, graph_def=self, mode_def=in_proc_mode\n ).get_job_def_for_op_selection(op_selection)\n\n run_config = run_config if run_config is not None else {}\n op_selection = check.opt_list_param(op_selection, "op_selection", str)\n\n return core_execute_in_process(\n node=self,\n ephemeral_pipeline=ephemeral_job,\n run_config=run_config,\n instance=instance,\n output_capturing_enabled=True,\n raise_on_error=raise_on_error,\n run_id=run_id,\n )
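A sketch of executing a graph in process with run config under the ``"ops"`` key, per the docstring above; the op, its config schema, and the assertions on the result are illustrative assumptions.

.. code-block:: python

    from dagster import graph, op

    @op(config_schema={"factor": int})
    def scale(context):
        return 2 * context.op_config["factor"]

    @graph
    def scaling():
        scale()

    # execute in memory; run config for constituent ops lives under "ops"
    result = scaling.execute_in_process(
        run_config={"ops": {"scale": {"config": {"factor": 3}}}},
    )
    assert result.success
    assert result.output_for_node("scale") == 6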
\n\n @property\n def parent_graph_def(self) -> Optional["GraphDefinition"]:\n return None\n\n @property\n def is_subselected(self) -> bool:\n return False
\n\n\nclass SubselectedGraphDefinition(GraphDefinition):\n """Defines a subselected graph.\n\n Args:\n parent_graph_def (GraphDefinition): The parent graph that this current graph is subselected\n from. This is used for tracking where the subselected graph originally comes from.\n Note that we allow subselecting a subselected graph, and this field refers to the direct\n parent graph of the current subselection, rather than the original root graph.\n node_defs (Optional[List[NodeDefinition]]): A list of all top level nodes in the graph. A\n node can be an op or a graph that contains other nodes.\n dependencies (Optional[Dict[Union[str, NodeInvocation], Dict[str, IDependencyDefinition]]]):\n A structure that declares the dependencies of each op's inputs on the outputs of other\n ops in the subselected graph. Keys of the top level dict are either the string names of\n ops in the graph or, in the case of aliased solids, :py:class:`NodeInvocations <NodeInvocation>`.\n Values of the top level dict are themselves dicts, which map input names belonging to\n the op or aliased op to :py:class:`DependencyDefinitions <DependencyDefinition>`.\n input_mappings (Optional[List[InputMapping]]): Define the inputs to the nested graph, and\n how they map to the inputs of its constituent ops.\n output_mappings (Optional[List[OutputMapping]]): Define the outputs of the nested graph, and\n how they map from the outputs of its constituent ops.\n """\n\n def __init__(\n self,\n parent_graph_def: GraphDefinition,\n node_defs: Optional[List[NodeDefinition]],\n dependencies: Optional[Dict[Union[str, NodeInvocation], Dict[str, IDependencyDefinition]]],\n input_mappings: Optional[List[InputMapping]],\n output_mappings: Optional[List[OutputMapping]],\n ):\n self._parent_graph_def = check.inst_param(\n parent_graph_def, "parent_graph_def", GraphDefinition\n )\n super(SubselectedGraphDefinition, self).__init__(\n name=parent_graph_def.name, # should we create special name for subselected graphs\n node_defs=node_defs,\n dependencies=dependencies,\n input_mappings=input_mappings,\n output_mappings=output_mappings,\n config=parent_graph_def.config_mapping,\n tags=parent_graph_def.tags,\n )\n\n @property\n def parent_graph_def(self) -> GraphDefinition:\n return self._parent_graph_def\n\n def get_top_level_omitted_nodes(self) -> List[Node]:\n return [\n solid for solid in self.parent_graph_def.solids if not self.has_solid_named(solid.name)\n ]\n\n @property\n def is_subselected(self) -> bool:\n return True\n\n\ndef _validate_in_mappings(\n input_mappings: List[InputMapping],\n solid_dict: Dict[str, Node],\n dependency_structure: DependencyStructure,\n name: str,\n class_name: str,\n) -> Tuple[List[InputMapping], Iterable[InputDefinition]]:\n from .composition import MappedInputPlaceholder\n\n input_def_dict: Dict[str, InputDefinition] = OrderedDict()\n mapping_keys = set()\n\n for mapping in input_mappings:\n # handle incorrect objects passed in as mappings\n if not isinstance(mapping, InputMapping):\n if isinstance(mapping, InputDefinition):\n raise DagsterInvalidDefinitionError(\n "In {class_name} '{name}' you passed an InputDefinition "\n "named '{input_name}' directly in to input_mappings. Return "\n "an InputMapping by calling mapping_to on the InputDefinition.".format(\n name=name, input_name=mapping.name, class_name=class_name\n )\n )\n else:\n raise DagsterInvalidDefinitionError(\n "In {class_name} '{name}' received unexpected type '{type}' in input_mappings. 
"\n "Provide an OutputMapping using InputDefinition(...).mapping_to(...)".format(\n type=type(mapping), name=name, class_name=class_name\n )\n )\n\n if input_def_dict.get(mapping.definition.name):\n if input_def_dict[mapping.definition.name] != mapping.definition:\n raise DagsterInvalidDefinitionError(\n "In {class_name} '{name}' multiple input mappings with same "\n "definition name but different definitions".format(\n name=name, class_name=class_name\n ),\n )\n else:\n input_def_dict[mapping.definition.name] = mapping.definition\n\n target_solid = solid_dict.get(mapping.maps_to.solid_name)\n if target_solid is None:\n raise DagsterInvalidDefinitionError(\n "In {class_name} '{name}' input mapping references solid "\n "'{solid_name}' which it does not contain.".format(\n name=name, solid_name=mapping.maps_to.solid_name, class_name=class_name\n )\n )\n if not target_solid.has_input(mapping.maps_to.input_name):\n raise DagsterInvalidDefinitionError(\n "In {class_name} '{name}' input mapping to solid '{mapping.maps_to.solid_name}' "\n "which contains no input named '{mapping.maps_to.input_name}'".format(\n name=name, mapping=mapping, class_name=class_name\n )\n )\n\n target_input = target_solid.input_def_named(mapping.maps_to.input_name)\n solid_input_handle = SolidInputHandle(target_solid, target_input)\n\n if mapping.maps_to_fan_in:\n maps_to = cast(FanInInputPointer, mapping.maps_to)\n if not dependency_structure.has_fan_in_deps(solid_input_handle):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' input mapping target "\n f'"{maps_to.solid_name}.{maps_to.input_name}" (index {maps_to.fan_in_index} of fan-in) '\n f"is not a MultiDependencyDefinition."\n )\n inner_deps = dependency_structure.get_fan_in_deps(solid_input_handle)\n if (maps_to.fan_in_index >= len(inner_deps)) or (\n inner_deps[maps_to.fan_in_index] is not MappedInputPlaceholder\n ):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' input mapping target "\n f'"{maps_to.solid_name}.{maps_to.input_name}" index {maps_to.fan_in_index} in '\n f"the MultiDependencyDefinition is not a MappedInputPlaceholder"\n )\n mapping_keys.add(f"{maps_to.solid_name}.{maps_to.input_name}.{maps_to.fan_in_index}")\n target_type = target_input.dagster_type.get_inner_type_for_fan_in()\n fan_in_msg = " (index {} of fan-in)".format(maps_to.fan_in_index)\n else:\n if dependency_structure.has_deps(solid_input_handle):\n raise DagsterInvalidDefinitionError(\n "In {class_name} '{name}' input mapping target "\n '"{mapping.maps_to.solid_name}.{mapping.maps_to.input_name}" '\n "is already satisfied by output".format(\n name=name, mapping=mapping, class_name=class_name\n )\n )\n\n mapping_keys.add(\n "{mapping.maps_to.solid_name}.{mapping.maps_to.input_name}".format(mapping=mapping)\n )\n target_type = target_input.dagster_type\n fan_in_msg = ""\n\n if (\n # no need to check mapping type for graphs because users can't specify ins/out type on graphs\n class_name not in (GraphDefinition.__name__, SubselectedGraphDefinition.__name__)\n and target_type != mapping.definition.dagster_type\n ):\n raise DagsterInvalidDefinitionError(\n "In {class_name} '{name}' input "\n "'{mapping.definition.name}' of type {mapping.definition.dagster_type.display_name} maps to "\n "{mapping.maps_to.solid_name}.{mapping.maps_to.input_name}{fan_in_msg} of different type "\n "{target_type.display_name}. 
InputMapping source and "\n "destination must have the same type.".format(\n mapping=mapping,\n name=name,\n target_type=target_type,\n class_name=class_name,\n fan_in_msg=fan_in_msg,\n )\n )\n\n for input_handle in dependency_structure.input_handles():\n if dependency_structure.has_fan_in_deps(input_handle):\n for idx, dep in enumerate(dependency_structure.get_fan_in_deps(input_handle)):\n if dep is MappedInputPlaceholder:\n mapping_str = (\n "{input_handle.solid_name}.{input_handle.input_name}.{idx}".format(\n input_handle=input_handle, idx=idx\n )\n )\n if mapping_str not in mapping_keys:\n raise DagsterInvalidDefinitionError(\n "Unsatisfied MappedInputPlaceholder at index {idx} in "\n "MultiDependencyDefinition for '{input_handle.solid_name}.{input_handle.input_name}'".format(\n input_handle=input_handle, idx=idx\n )\n )\n\n return input_mappings, input_def_dict.values()\n\n\ndef _validate_out_mappings(\n output_mappings: List[OutputMapping],\n solid_dict: Dict[str, Node],\n dependency_structure: DependencyStructure,\n name: str,\n class_name: str,\n) -> List[OutputMapping]:\n for mapping in output_mappings:\n if isinstance(mapping, OutputMapping):\n\n target_solid = solid_dict.get(mapping.maps_from.solid_name)\n if target_solid is None:\n raise DagsterInvalidDefinitionError(\n "In {class_name} '{name}' output mapping references node "\n "'{solid_name}' which it does not contain.".format(\n name=name, solid_name=mapping.maps_from.solid_name, class_name=class_name\n )\n )\n if not target_solid.has_output(mapping.maps_from.output_name):\n raise DagsterInvalidDefinitionError(\n "In {class_name} {name} output mapping from {described_node} "\n "which contains no output named '{mapping.maps_from.output_name}'".format(\n described_node=target_solid.describe_node(),\n name=name,\n mapping=mapping,\n class_name=class_name,\n )\n )\n\n target_output = target_solid.output_def_named(mapping.maps_from.output_name)\n\n if (\n mapping.definition.dagster_type.kind != DagsterTypeKind.ANY\n and (target_output.dagster_type != mapping.definition.dagster_type)\n and class_name != "GraphDefinition"\n ):\n raise DagsterInvalidDefinitionError(\n "In {class_name} '{name}' output "\n "'{mapping.definition.name}' of type {mapping.definition.dagster_type.display_name} "\n "maps from {mapping.maps_from.solid_name}.{mapping.maps_from.output_name} of different type "\n "{target_output.dagster_type.display_name}. OutputMapping source "\n "and destination must have the same type.".format(\n class_name=class_name,\n mapping=mapping,\n name=name,\n target_output=target_output,\n )\n )\n\n if target_output.is_dynamic and not mapping.definition.is_dynamic:\n raise DagsterInvalidDefinitionError(\n f'In {class_name} "{name}" can not map from {target_output.__class__.__name__} '\n f'"{target_output.name}" to {mapping.definition.__class__.__name__} '\n f'"{mapping.definition.name}". Definition types must align.'\n )\n\n dynamic_handle = dependency_structure.get_upstream_dynamic_handle_for_solid(\n target_solid.name\n )\n if dynamic_handle and not mapping.definition.is_dynamic:\n raise DagsterInvalidDefinitionError(\n f'In {class_name} "{name}" output "{mapping.definition.name}" mapping from '\n f"{target_solid.describe_node()} must be a DynamicOutputDefinition since it is "\n f'downstream of dynamic output "{dynamic_handle.describe()}".'\n )\n\n elif isinstance(mapping, OutputDefinition):\n raise DagsterInvalidDefinitionError(\n "You passed an OutputDefinition named '{output_name}' directly "\n "in to output_mappings. 
Return an OutputMapping by calling "\n "mapping_from on the OutputDefinition.".format(output_name=mapping.name)\n )\n else:\n raise DagsterInvalidDefinitionError(\n "Received unexpected type '{type}' in output_mappings. "\n "Provide an OutputMapping using OutputDefinition(...).mapping_from(...)".format(\n type=type(mapping)\n )\n )\n return output_mappings\n\n\ndef _config_mapping_with_default_value(\n inner_schema: ConfigType,\n default_config: Dict[str, Any],\n job_name: str,\n graph_name: str,\n) -> ConfigMapping:\n if not isinstance(inner_schema, Shape):\n check.failed("Only Shape (dictionary) config_schema allowed on Job ConfigMapping")\n\n def config_fn(x):\n return x\n\n updated_fields = {}\n field_aliases = inner_schema.field_aliases\n for name, field in inner_schema.fields.items():\n if name in default_config:\n updated_fields[name] = Field(\n config=field.config_type,\n default_value=default_config[name],\n description=field.description,\n )\n elif name in field_aliases and field_aliases[name] in default_config:\n updated_fields[name] = Field(\n config=field.config_type,\n default_value=default_config[field_aliases[name]],\n description=field.description,\n )\n else:\n updated_fields[name] = field\n\n config_schema = Shape(\n fields=updated_fields,\n description="run config schema with default values from default_config",\n field_aliases=inner_schema.field_aliases,\n )\n\n config_evr = validate_config(config_schema, default_config)\n if not config_evr.success:\n raise DagsterInvalidConfigError(\n f"Error in config when building job '{job_name}' from graph '{graph_name}' ",\n config_evr.errors,\n default_config,\n )\n\n return ConfigMapping(\n config_fn=config_fn, config_schema=config_schema, receive_processed_config_values=False\n )\n\n\n@io_manager(\n description="The default io manager for Jobs. Uses filesystem but switches to in-memory when invoked through execute_in_process."\n)\ndef default_job_io_manager(init_context):\n from dagster.core.storage.fs_io_manager import PickledObjectFilesystemIOManager\n\n return PickledObjectFilesystemIOManager(base_dir=init_context.instance.storage_directory())\n
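A hedged sketch of how `_config_mapping_with_default_value` above is exercised from user code: passing a plain run-config dictionary as the ``config`` argument of ``GraphDefinition.to_job`` installs those values as per-field defaults on the job's config schema. The op and graph names below are illustrative, not part of the module above.

.. code-block:: python

    from dagster import graph, op

    @op(config_schema={"multiplier": int})
    def scale(context, x):
        return x * context.op_config["multiplier"]

    @op
    def emit():
        return 3

    @graph
    def scaling():
        scale(emit())

    # the dict supplied here becomes the default for the corresponding fields of
    # the job's config schema; it can still be overridden at launch time
    scaled_job = scaling.to_job(
        name="scaled_job",
        config={"ops": {"scale": {"config": {"multiplier": 2}}}},
    )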
", "current_page_name": "_modules/dagster/core/definitions/graph_definition", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.graph_definition"}, "hook_definition": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.hook_definition

\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Callable, NamedTuple, Optional\n\nfrom dagster import check\n\nfrom ..decorator_utils import get_function_params\nfrom ..errors import DagsterInvalidInvocationError\nfrom .utils import check_valid_name\n\nif TYPE_CHECKING:\n    from ..events import DagsterEvent\n\n\n
[docs]class HookDefinition(\n NamedTuple(\n "_HookDefinition",\n [\n ("name", str),\n ("hook_fn", Callable),\n ("required_resource_keys", AbstractSet[str]),\n ("decorated_fn", Callable),\n ],\n )\n):\n """Define a hook which can be triggered during a op execution (e.g. a callback on the step\n execution failure event during a op execution).\n\n Args:\n name (str): The name of this hook.\n hook_fn (Callable): The callback function that will be triggered.\n required_resource_keys (Optional[AbstractSet[str]]): Keys for the resources required by the\n hook.\n """\n\n def __new__(\n cls,\n name: str,\n hook_fn: Callable[..., Any],\n required_resource_keys: Optional[AbstractSet[str]] = None,\n decorated_fn: Optional[Callable[..., Any]] = None,\n ):\n return super(HookDefinition, cls).__new__(\n cls,\n name=check_valid_name(name),\n hook_fn=check.callable_param(hook_fn, "hook_fn"),\n required_resource_keys=frozenset(\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n ),\n decorated_fn=check.callable_param(decorated_fn, "decorated_fn"),\n )\n\n def __call__(self, *args, **kwargs):\n """This is invoked when the hook is used as a decorator.\n\n We currently support hooks to decorate the following:\n\n - PipelineDefinition: when the hook decorates a job definition, it will be added to\n all the op invocations within the job.\n\n Example:\n .. code-block:: python\n\n @success_hook\n def slack_message_on_success(_):\n ...\n\n @slack_message_on_success\n @job\n def a_job():\n foo(bar())\n\n """\n from ..execution.context.hook import HookContext\n from .graph_definition import GraphDefinition\n from .hook_invocation import hook_invocation_result\n from .pipeline_definition import PipelineDefinition\n\n if len(args) > 0 and isinstance(args[0], (PipelineDefinition, GraphDefinition)):\n # when it decorates a pipeline, we apply this hook to all the solid invocations within\n # the pipeline.\n return args[0].with_hooks({self})\n else:\n if not self.decorated_fn:\n raise DagsterInvalidInvocationError(\n "Only hook definitions created using one of the hook decorators can be invoked."\n )\n fxn_args = get_function_params(self.decorated_fn)\n # If decorated fxn has two arguments, then this is an event list hook fxn, and parameter\n # names are always context and event_list\n if len(fxn_args) == 2:\n context_arg_name = fxn_args[0].name\n event_list_arg_name = fxn_args[1].name\n if len(args) + len(kwargs) != 2:\n raise DagsterInvalidInvocationError(\n "Decorated function expects two parameters, context and event_list, but "\n f"{len(args) + len(kwargs)} were provided."\n )\n if args:\n context = check.opt_inst_param(args[0], "context", HookContext)\n event_list = check.opt_list_param(\n args[1] if len(args) > 1 else kwargs[event_list_arg_name],\n event_list_arg_name,\n )\n else:\n if context_arg_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Could not find expected argument '{context_arg_name}'. Provided "\n f"kwargs: {list(kwargs.keys())}"\n )\n if event_list_arg_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Could not find expected argument '{event_list_arg_name}'. 
Provided "\n f"kwargs: {list(kwargs.keys())}"\n )\n context = check.opt_inst_param(\n kwargs[context_arg_name], context_arg_name, HookContext\n )\n event_list = check.opt_list_param(\n kwargs[event_list_arg_name], event_list_arg_name\n )\n return hook_invocation_result(self, context, event_list)\n else:\n context_arg_name = fxn_args[0].name\n if len(args) + len(kwargs) != 1:\n raise DagsterInvalidInvocationError(\n f"Decorated function expects one parameter, {context_arg_name}, but "\n f"{len(args) + len(kwargs)} were provided."\n )\n if args:\n context = check.opt_inst_param(args[0], context_arg_name, HookContext)\n else:\n if context_arg_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Could not find expected argument '{context_arg_name}'. Provided "\n f"kwargs: {list(kwargs.keys())}"\n )\n context = check.opt_inst_param(\n kwargs[context_arg_name], context_arg_name, HookContext\n )\n return hook_invocation_result(self, context)
\n
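A minimal sketch of the direct-invocation path implemented by ``HookDefinition.__call__``: a hook created with one of the hook decorators can be called like a function for testing, given a hook context. This assumes ``build_hook_context`` is importable from the top-level ``dagster`` package; the hook body is illustrative.

.. code-block:: python

    from dagster import build_hook_context, success_hook

    @success_hook
    def announce_success(context):
        context.log.info("step finished successfully")

    # invokes the decorated function directly, outside of a job run
    announce_success(build_hook_context())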
", "current_page_name": "_modules/dagster/core/definitions/hook_definition", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.hook_definition"}, "input": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.input

\nfrom types import FunctionType\nfrom typing import TYPE_CHECKING, Any, Callable, Mapping, NamedTuple, Optional, Set, Type, Union\n\nfrom dagster import check\nfrom dagster.core.definitions.events import AssetKey\nfrom dagster.core.definitions.metadata import MetadataEntry, normalize_metadata\nfrom dagster.core.errors import DagsterError, DagsterInvalidDefinitionError\nfrom dagster.core.types.dagster_type import (\n    BuiltinScalarDagsterType,\n    DagsterType,\n    resolve_dagster_type,\n)\nfrom dagster.utils.backcompat import experimental_arg_warning\n\nfrom .inference import InferredInputProps\nfrom .utils import NoValueSentinel, check_valid_name\n\nif TYPE_CHECKING:\n    from dagster.core.execution.context.input import InputContext\n\n\n# unfortunately since type_check functions need TypeCheckContext which is only available\n# at runtime, we can only check basic types before runtime\ndef _check_default_value(input_name, dagster_type, default_value):\n    if default_value is not NoValueSentinel:\n        if dagster_type.is_nothing:\n            raise DagsterInvalidDefinitionError(\n                "Setting a default_value is invalid on InputDefinitions of type Nothing"\n            )\n\n        if isinstance(dagster_type, BuiltinScalarDagsterType):\n            type_check = dagster_type.type_check_scalar_value(default_value)\n            if not type_check.success:\n                raise DagsterInvalidDefinitionError(\n                    (\n                        "Type check failed for the default_value of InputDefinition "\n                        "{input_name} of type {dagster_type}. "\n                        "Received value {value} of type {type}"\n                    ).format(\n                        input_name=input_name,\n                        dagster_type=dagster_type.display_name,\n                        value=default_value,\n                        type=type(default_value),\n                    ),\n                )\n\n    return default_value\n\n\n
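A small sketch of the pre-runtime check performed by ``_check_default_value`` above: for built-in scalar types the default is type-checked at definition time, so a mismatched default fails immediately rather than during execution. The input name is illustrative.

.. code-block:: python

    from dagster import DagsterInvalidDefinitionError, InputDefinition

    try:
        # "ten" fails the scalar type check against Int, so this raises
        InputDefinition("limit", dagster_type=int, default_value="ten")
    except DagsterInvalidDefinitionError as err:
        print(err)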
[docs]class InputDefinition:\n """Defines an argument to a solid's compute function.\n\n Inputs may flow from previous solids' outputs, or be stubbed using config. They may optionally\n be typed using the Dagster type system.\n\n Args:\n name (str): Name of the input.\n dagster_type (Optional[Union[Type, DagsterType]]]): The type of this input.\n Users should provide the Python type of the objects that they expect to be passed for\n this input, or a :py:class:`DagsterType` that defines a runtime check that they want\n to be run on this input. Defaults to :py:class:`Any`.\n description (Optional[str]): Human-readable description of the input.\n default_value (Optional[Any]): The default value to use if no input is provided.\n root_manager_key (Optional[str]): (Experimental) The resource key for the\n :py:class:`RootInputManager` used for loading this input when it is not connected to an\n upstream output.\n metadata (Optional[Dict[str, Any]]): A dict of metadata for the input.\n asset_key (Optional[Union[AssetKey, InputContext -> AssetKey]]): (Experimental) An AssetKey\n (or function that produces an AssetKey from the InputContext) which should be associated\n with this InputDefinition. Used for tracking lineage information through Dagster.\n asset_partitions (Optional[Union[Set[str], InputContext -> Set[str]]]): (Experimental) A\n set of partitions of the given asset_key (or a function that produces this list of\n partitions from the InputContext) which should be associated with this InputDefinition.\n """\n\n def __init__(\n self,\n name=None,\n dagster_type=None,\n description=None,\n default_value=NoValueSentinel,\n root_manager_key=None,\n metadata=None,\n asset_key=None,\n asset_partitions=None,\n # when adding new params, make sure to update combine_with_inferred below\n ):\n self._name = check_valid_name(name) if name else None\n\n self._type_not_set = dagster_type is None\n self._dagster_type = check.inst(resolve_dagster_type(dagster_type), DagsterType)\n\n self._description = check.opt_str_param(description, "description")\n\n self._default_value = _check_default_value(self._name, self._dagster_type, default_value)\n\n if root_manager_key:\n experimental_arg_warning("root_manager_key", "InputDefinition.__init__")\n\n self._root_manager_key = check.opt_str_param(root_manager_key, "root_manager_key")\n\n self._metadata = check.opt_dict_param(metadata, "metadata", key_type=str)\n self._metadata_entries = check.is_list(\n normalize_metadata(self._metadata, [], allow_invalid=True), MetadataEntry\n )\n\n if asset_key:\n experimental_arg_warning("asset_key", "InputDefinition.__init__")\n\n if not callable(asset_key):\n check.opt_inst_param(asset_key, "asset_key", AssetKey)\n\n self._asset_key = asset_key\n\n if asset_partitions:\n experimental_arg_warning("asset_partitions", "InputDefinition.__init__")\n check.param_invariant(\n asset_key is not None,\n "asset_partitions",\n 'Cannot specify "asset_partitions" argument without also specifying "asset_key"',\n )\n if callable(asset_partitions):\n self._asset_partitions_fn = asset_partitions\n elif asset_partitions is not None:\n asset_partitions = check.opt_set_param(asset_partitions, "asset_partitions", str)\n self._asset_partitions_fn = lambda _: asset_partitions\n else:\n self._asset_partitions_fn = None\n\n @property\n def name(self):\n return self._name\n\n @property\n def dagster_type(self):\n return self._dagster_type\n\n @property\n def description(self):\n return self._description\n\n @property\n def has_default_value(self):\n 
return self._default_value is not NoValueSentinel\n\n @property\n def default_value(self):\n check.invariant(self.has_default_value, "Can only fetch default_value if has_default_value")\n return self._default_value\n\n @property\n def root_manager_key(self):\n return self._root_manager_key\n\n @property\n def metadata(self):\n return self._metadata\n\n @property\n def is_asset(self):\n return self._asset_key is not None\n\n @property\n def metadata_entries(self):\n return self._metadata_entries\n\n @property\n def hardcoded_asset_key(self) -> Optional[AssetKey]:\n if not callable(self._asset_key):\n return self._asset_key\n else:\n return None\n\n def get_asset_key(self, context) -> Optional[AssetKey]:\n """Get the AssetKey associated with this InputDefinition for the given\n :py:class:`InputContext` (if any).\n\n Args:\n context (InputContext): The InputContext that this InputDefinition is being evaluated\n in\n """\n if callable(self._asset_key):\n return self._asset_key(context)\n else:\n return self.hardcoded_asset_key\n\n def get_asset_partitions(self, context) -> Optional[Set[str]]:\n """Get the set of partitions that this solid will read from this InputDefinition for the given\n :py:class:`InputContext` (if any).\n\n Args:\n context (InputContext): The InputContext that this InputDefinition is being evaluated\n in\n """\n if self._asset_partitions_fn is None:\n return None\n\n return self._asset_partitions_fn(context)\n\n def mapping_to(self, solid_name, input_name, fan_in_index=None):\n """Create an input mapping to an input of a child solid.\n\n In a CompositeSolidDefinition, you can use this helper function to construct\n an :py:class:`InputMapping` to the input of a child solid.\n\n Args:\n solid_name (str): The name of the child solid to which to map this input.\n input_name (str): The name of the child solid' input to which to map this input.\n fan_in_index (Optional[int]): The index in to a fanned in input, else None\n\n Examples:\n\n .. 
code-block:: python\n\n input_mapping = InputDefinition('composite_input', Int).mapping_to(\n 'child_solid', 'int_input'\n )\n """\n check.str_param(solid_name, "solid_name")\n check.str_param(input_name, "input_name")\n check.opt_int_param(fan_in_index, "fan_in_index")\n\n if fan_in_index is not None:\n maps_to = FanInInputPointer(solid_name, input_name, fan_in_index)\n else:\n maps_to = InputPointer(solid_name, input_name)\n return InputMapping(self, maps_to)\n\n @staticmethod\n def create_from_inferred(\n inferred: InferredInputProps, decorator_name: str\n ) -> "InputDefinition":\n return InputDefinition(\n name=inferred.name,\n dagster_type=_checked_inferred_type(inferred, decorator_name),\n description=inferred.description,\n default_value=inferred.default_value,\n )\n\n def combine_with_inferred(\n self, inferred: InferredInputProps, decorator_name: str\n ) -> "InputDefinition":\n """\n Return a new InputDefinition that merges this ones properties with those inferred from type signature.\n This can update: dagster_type, description, and default_value if they are not set.\n """\n\n check.invariant(\n self.name == inferred.name,\n f"InferredInputProps name {inferred.name} did not align with InputDefinition name {self.name}",\n )\n\n dagster_type = self._dagster_type\n if self._type_not_set:\n dagster_type = _checked_inferred_type(inferred, decorator_name=decorator_name)\n\n description = self._description\n if description is None and inferred.description is not None:\n description = inferred.description\n\n default_value = self._default_value\n if not self.has_default_value:\n default_value = inferred.default_value\n\n return InputDefinition(\n name=self.name,\n dagster_type=dagster_type,\n description=description,\n default_value=default_value,\n root_manager_key=self._root_manager_key,\n metadata=self._metadata,\n asset_key=self._asset_key,\n asset_partitions=self._asset_partitions_fn,\n )
\n\n\ndef _checked_inferred_type(inferred: InferredInputProps, decorator_name: str) -> DagsterType:\n try:\n resolved_type = resolve_dagster_type(inferred.annotation)\n except DagsterError as e:\n raise DagsterInvalidDefinitionError(\n f"Problem using type '{inferred.annotation}' from type annotation for argument "\n f"'{inferred.name}', correct the issue or explicitly set the dagster_type on "\n "your InputDefinition."\n ) from e\n\n if resolved_type.is_nothing:\n raise DagsterInvalidDefinitionError(\n f"Input parameter {inferred.name} is annotated with {resolved_type.display_name} "\n "which is a type that represents passing no data. This type must be used "\n f"via InputDefinition and no parameter should be included in the {decorator_name} decorated function."\n )\n return resolved_type\n\n\nclass InputPointer(NamedTuple("_InputPointer", [("solid_name", str), ("input_name", str)])):\n def __new__(cls, solid_name: str, input_name: str):\n return super(InputPointer, cls).__new__(\n cls,\n check.str_param(solid_name, "solid_name"),\n check.str_param(input_name, "input_name"),\n )\n\n\nclass FanInInputPointer(\n NamedTuple(\n "_FanInInputPointer", [("solid_name", str), ("input_name", str), ("fan_in_index", int)]\n )\n):\n def __new__(cls, solid_name: str, input_name: str, fan_in_index: int):\n return super(FanInInputPointer, cls).__new__(\n cls,\n check.str_param(solid_name, "solid_name"),\n check.str_param(input_name, "input_name"),\n check.int_param(fan_in_index, "fan_in_index"),\n )\n\n\n
[docs]class InputMapping(\n NamedTuple(\n "_InputMapping",\n [("definition", InputDefinition), ("maps_to", Union[InputPointer, FanInInputPointer])],\n )\n):\n """Defines an input mapping for a composite solid.\n\n Args:\n definition (InputDefinition): Defines the input to the composite solid.\n solid_name (str): The name of the child solid onto which to map the input.\n input_name (str): The name of the input to the child solid onto which to map the input.\n """\n\n def __new__(cls, definition: InputDefinition, maps_to: Union[InputPointer, FanInInputPointer]):\n return super(InputMapping, cls).__new__(\n cls,\n check.inst_param(definition, "definition", InputDefinition),\n check.inst_param(maps_to, "maps_to", (InputPointer, FanInInputPointer)),\n )\n\n @property\n def maps_to_fan_in(self):\n return isinstance(self.maps_to, FanInInputPointer)\n\n def describe(self) -> str:\n idx = self.maps_to.fan_in_index if isinstance(self.maps_to, FanInInputPointer) else ""\n return f"{self.definition.name} -> {self.maps_to.solid_name}:{self.maps_to.input_name}{idx}"
\n\n\n
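A sketch of how the two pointer types above are produced in practice: ``InputDefinition.mapping_to`` returns an ``InputMapping`` whose target is an ``InputPointer``, or a ``FanInInputPointer`` when ``fan_in_index`` is supplied. Node and input names are illustrative.

.. code-block:: python

    from dagster import InputDefinition

    graph_input = InputDefinition("numbers", list)

    # plain mapping: maps_to is an InputPointer
    plain = graph_input.mapping_to("sum_values", "values")

    # fan-in mapping: maps_to is a FanInInputPointer targeting index 1 of a
    # MultiDependencyDefinition input
    fanned = graph_input.mapping_to("collect", "items", fan_in_index=1)

    assert not plain.maps_to_fan_in
    assert fanned.maps_to_fan_in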
[docs]class In(\n NamedTuple(\n "_In",\n [\n ("dagster_type", Union[DagsterType, Type[NoValueSentinel]]),\n ("description", Optional[str]),\n ("default_value", Any),\n ("root_manager_key", Optional[str]),\n ("metadata", Optional[Mapping[str, Any]]),\n ("asset_key", Optional[Union[AssetKey, Callable[["InputContext"], AssetKey]]]),\n ("asset_partitions", Optional[Union[Set[str], Callable[["InputContext"], Set[str]]]]),\n ],\n )\n):\n """\n Defines an argument to an op's compute function.\n\n Inputs may flow from previous op's outputs, or be stubbed using config. They may optionally\n be typed using the Dagster type system.\n\n Args:\n dagster_type (Optional[Union[Type, DagsterType]]]):\n The type of this input. Should only be set if the correct type can not\n be inferred directly from the type signature of the decorated function.\n description (Optional[str]): Human-readable description of the input.\n default_value (Optional[Any]): The default value to use if no input is provided.\n root_manager_key (Optional[str]): (Experimental) The resource key for the\n :py:class:`RootInputManager` used for loading this input when it is not connected to an\n upstream output.\n metadata (Optional[Dict[str, Any]]): A dict of metadata for the input.\n asset_key (Optional[Union[AssetKey, InputContext -> AssetKey]]): (Experimental) An AssetKey\n (or function that produces an AssetKey from the InputContext) which should be associated\n with this In. Used for tracking lineage information through Dagster.\n asset_partitions (Optional[Union[Set[str], InputContext -> Set[str]]]): (Experimental) A\n set of partitions of the given asset_key (or a function that produces this list of\n partitions from the InputContext) which should be associated with this In.\n """\n\n def __new__(\n cls,\n dagster_type: Union[Type, DagsterType] = NoValueSentinel,\n description: Optional[str] = None,\n default_value: Any = NoValueSentinel,\n root_manager_key: Optional[str] = None,\n metadata: Optional[Mapping[str, Any]] = None,\n asset_key: Optional[Union[AssetKey, Callable[["InputContext"], AssetKey]]] = None,\n asset_partitions: Optional[Union[Set[str], Callable[["InputContext"], Set[str]]]] = None,\n ):\n return super(In, cls).__new__(\n cls,\n dagster_type=NoValueSentinel\n if dagster_type is NoValueSentinel\n else resolve_dagster_type(dagster_type),\n description=check.opt_str_param(description, "description"),\n default_value=default_value,\n root_manager_key=check.opt_str_param(root_manager_key, "root_manager_key"),\n metadata=check.opt_dict_param(metadata, "metadata", key_type=str),\n asset_key=check.opt_inst_param(asset_key, "asset_key", (AssetKey, FunctionType)),\n asset_partitions=check.opt_inst_param(\n asset_partitions, "asset_partitions", (Set[str], FunctionType)\n ),\n )\n\n @staticmethod\n def from_definition(input_def: InputDefinition):\n return In(\n dagster_type=input_def.dagster_type,\n description=input_def.description,\n default_value=input_def._default_value, # pylint: disable=protected-access\n root_manager_key=input_def.root_manager_key,\n metadata=input_def.metadata,\n asset_key=input_def._asset_key, # pylint: disable=protected-access\n asset_partitions=input_def._asset_partitions_fn, # pylint: disable=protected-access\n )\n\n def to_definition(self, name: str) -> InputDefinition:\n dagster_type = self.dagster_type if self.dagster_type is not NoValueSentinel else None\n return InputDefinition(\n name=name,\n dagster_type=dagster_type,\n description=self.description,\n default_value=self.default_value,\n 
root_manager_key=self.root_manager_key,\n metadata=self.metadata,\n asset_key=self.asset_key,\n asset_partitions=self.asset_partitions,\n )
\n\n\n
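A short sketch of ``In`` used through the ``ins`` argument of the ``@op`` decorator, which is the usual path by which an ``In`` is converted into an ``InputDefinition`` via ``to_definition``. The op is illustrative.

.. code-block:: python

    from dagster import In, op

    @op(ins={"count": In(int, description="number of repetitions", default_value=1)})
    def repeat_greeting(count):
        return "hello " * count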
[docs]class GraphIn(NamedTuple("_GraphIn", [("description", Optional[str])])):\n """\n Represents information about an input that a graph maps.\n\n Args:\n description (Optional[str]): Human-readable description of the input.\n """\n\n def __new__(cls, description=None):\n return super(GraphIn, cls).__new__(cls, description=description)\n\n def to_definition(self, name: str) -> InputDefinition:\n return InputDefinition(name=name, description=self.description)
\n
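A hedged sketch of ``GraphIn``, assuming the ``ins`` argument on the ``@graph`` decorator is available in this version: the graph-level input carries only a description, and typing is inferred from the ops it maps to. Names are illustrative.

.. code-block:: python

    from dagster import GraphIn, graph, op

    @op
    def double(x: int) -> int:
        return x * 2

    @graph(ins={"x": GraphIn(description="value fed into the graph")})
    def doubling(x):
        return double(x)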
", "current_page_name": "_modules/dagster/core/definitions/input", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.input"}, "job_definition": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.job_definition

\nfrom functools import update_wrapper\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    List,\n    Mapping,\n    Optional,\n    Tuple,\n    Type,\n    Union,\n    cast,\n)\n\nfrom dagster import check\nfrom dagster.core.definitions.composition import MappedInputPlaceholder\nfrom dagster.core.definitions.dependency import (\n    DependencyDefinition,\n    DynamicCollectDependencyDefinition,\n    IDependencyDefinition,\n    MultiDependencyDefinition,\n    Node,\n    NodeHandle,\n    NodeInvocation,\n    SolidOutputHandle,\n)\nfrom dagster.core.definitions.node_definition import NodeDefinition\nfrom dagster.core.definitions.policy import RetryPolicy\nfrom dagster.core.errors import DagsterInvalidDefinitionError, DagsterInvalidSubsetError\nfrom dagster.core.selector.subset_selector import (\n    LeafNodeSelection,\n    OpSelectionData,\n    parse_op_selection,\n)\nfrom dagster.core.storage.fs_asset_io_manager import fs_asset_io_manager\nfrom dagster.core.utils import str_format_set\n\nfrom .executor_definition import ExecutorDefinition\nfrom .graph_definition import GraphDefinition, SubselectedGraphDefinition\nfrom .hook_definition import HookDefinition\nfrom .mode import ModeDefinition\nfrom .partition import PartitionSetDefinition\nfrom .pipeline_definition import PipelineDefinition\nfrom .preset import PresetDefinition\nfrom .resource_definition import ResourceDefinition\nfrom .run_request import RunRequest\nfrom .version_strategy import VersionStrategy\n\nif TYPE_CHECKING:\n    from dagster.core.execution.execute_in_process_result import ExecuteInProcessResult\n    from dagster.core.instance import DagsterInstance\n    from dagster.core.snap import PipelineSnapshot\n\n\n
[docs]class JobDefinition(PipelineDefinition):\n def __init__(\n self,\n mode_def: ModeDefinition,\n graph_def: GraphDefinition,\n name: Optional[str] = None,\n description: Optional[str] = None,\n preset_defs: Optional[List[PresetDefinition]] = None,\n tags: Optional[Dict[str, Any]] = None,\n hook_defs: Optional[AbstractSet[HookDefinition]] = None,\n op_retry_policy: Optional[RetryPolicy] = None,\n version_strategy: Optional[VersionStrategy] = None,\n _op_selection_data: Optional[OpSelectionData] = None,\n ):\n\n self._cached_partition_set: Optional["PartitionSetDefinition"] = None\n self._op_selection_data = check.opt_inst_param(\n _op_selection_data, "_op_selection_data", OpSelectionData\n )\n\n super(JobDefinition, self).__init__(\n name=name,\n description=description,\n mode_defs=[mode_def],\n preset_defs=preset_defs,\n tags=tags,\n hook_defs=hook_defs,\n solid_retry_policy=op_retry_policy,\n graph_def=graph_def,\n version_strategy=version_strategy,\n )\n\n @property\n def target_type(self) -> str:\n return "job"\n\n @property\n def is_job(self) -> bool:\n return True\n\n def describe_target(self):\n return f"{self.target_type} '{self.name}'"\n\n @property\n def executor_def(self) -> ExecutorDefinition:\n return self.mode_definitions[0].executor_defs[0]\n\n @property\n def resource_defs(self) -> Mapping[str, ResourceDefinition]:\n return self.mode_definitions[0].resource_defs\n\n
[docs] def execute_in_process(\n self,\n run_config: Optional[Dict[str, Any]] = None,\n instance: Optional["DagsterInstance"] = None,\n partition_key: Optional[str] = None,\n raise_on_error: bool = True,\n op_selection: Optional[List[str]] = None,\n run_id: Optional[str] = None,\n ) -> "ExecuteInProcessResult":\n """\n Execute the Job in-process, gathering results in-memory.\n\n The `executor_def` on the Job will be ignored, and replaced with the in-process executor.\n If using the default `io_manager`, it will switch from filesystem to in-memory.\n\n\n Args:\n run_config (Optional[Dict[str, Any]]:\n The configuration for the run\n instance (Optional[DagsterInstance]):\n The instance to execute against, an ephemeral one will be used if none provided.\n partition_key: (Optional[str])\n The string partition key that specifies the run config to execute. Can only be used\n to select run config for jobs with partitioned config.\n raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n Defaults to ``True``.\n op_selection (Optional[List[str]]): A list of op selection queries (including single op\n names) to execute. For example:\n * ``['some_op']``: selects ``some_op`` itself.\n * ``['*some_op']``: select ``some_op`` and all its ancestors (upstream dependencies).\n * ``['*some_op+++']``: select ``some_op``, all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n * ``['*some_op', 'other_op_a', 'other_op_b+']``: select ``some_op`` and all its\n ancestors, ``other_op_a`` itself, and ``other_op_b`` and its direct child ops.\n Returns:\n :py:class:`~dagster.ExecuteInProcessResult`\n\n """\n from dagster.core.definitions.executor_definition import execute_in_process_executor\n from dagster.core.execution.execute_in_process import core_execute_in_process\n\n run_config = check.opt_dict_param(run_config, "run_config")\n op_selection = check.opt_list_param(op_selection, "op_selection", str)\n partition_key = check.opt_str_param(partition_key, "partition_key")\n\n check.invariant(\n len(self._mode_definitions) == 1,\n "execute_in_process only supported on job / single mode pipeline",\n )\n\n base_mode = self.get_mode_definition()\n # create an ephemeral in process mode by replacing the executor_def and\n # switching the default fs io_manager to in mem, if another was not set\n in_proc_mode = ModeDefinition(\n name="in_process",\n executor_defs=[execute_in_process_executor],\n resource_defs=_swap_default_io_man(base_mode.resource_defs, self),\n logger_defs=base_mode.loggers,\n _config_mapping=base_mode.config_mapping,\n _partitioned_config=base_mode.partitioned_config,\n )\n\n ephemeral_job = JobDefinition(\n name=self._name,\n graph_def=self._graph_def,\n mode_def=in_proc_mode,\n hook_defs=self.hook_defs,\n tags=self.tags,\n op_retry_policy=self._solid_retry_policy,\n version_strategy=self.version_strategy,\n ).get_job_def_for_op_selection(op_selection)\n\n tags = None\n if partition_key:\n if not base_mode.partitioned_config:\n check.failed(\n f"Provided partition key `{partition_key}` for job `{self._name}` without a partitioned config"\n )\n check.invariant(\n not run_config,\n "Cannot provide both run_config and partition_key arguments to `execute_in_process`",\n )\n partition_set = self.get_partition_set_def()\n if not partition_set:\n check.failed(\n f"Provided partition key `{partition_key}` for job `{self._name}` without a partitioned config"\n )\n\n partition = partition_set.get_partition(partition_key)\n run_config = 
partition_set.run_config_for_partition(partition)\n tags = partition_set.tags_for_partition(partition)\n\n return core_execute_in_process(\n node=self._graph_def,\n ephemeral_pipeline=ephemeral_job,\n run_config=run_config,\n instance=instance,\n output_capturing_enabled=True,\n raise_on_error=raise_on_error,\n run_tags=tags,\n run_id=run_id,\n )
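A minimal, self-contained sketch of ``execute_in_process``: the job runs with the in-process executor and the in-memory IO manager swap described above, and results are gathered in memory. Names are illustrative.

.. code-block:: python

    from dagster import job, op

    @op
    def return_five():
        return 5

    @op
    def add_one(number):
        return number + 1

    @job
    def arithmetic_job():
        add_one(return_five())

    result = arithmetic_job.execute_in_process()
    assert result.success
    assert result.output_for_node("add_one") == 6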
\n\n @property\n def op_selection_data(self) -> Optional[OpSelectionData]:\n return self._op_selection_data\n\n def get_job_def_for_op_selection(\n self,\n op_selection: Optional[List[str]] = None,\n ) -> "JobDefinition":\n if not op_selection:\n return self\n\n op_selection = check.opt_list_param(op_selection, "op_selection", str)\n\n resolved_op_selection_dict = parse_op_selection(self, op_selection)\n\n sub_graph = get_subselected_graph_definition(self.graph, resolved_op_selection_dict)\n\n return JobDefinition(\n name=self.name,\n description=self.description,\n mode_def=self.get_mode_definition(),\n preset_defs=self.preset_defs,\n tags=self.tags,\n hook_defs=self.hook_defs,\n op_retry_policy=self._solid_retry_policy,\n graph_def=sub_graph,\n version_strategy=self.version_strategy,\n _op_selection_data=OpSelectionData(\n op_selection=op_selection,\n resolved_op_selection=set(\n resolved_op_selection_dict.keys()\n ), # equivalent to solids_to_execute. currently only gets top level nodes.\n parent_job_def=self, # used by pipeline snapshot lineage\n ),\n )\n\n def get_partition_set_def(self) -> Optional["PartitionSetDefinition"]:\n if not self.is_single_mode:\n return None\n\n mode = self.get_mode_definition()\n if not mode.partitioned_config:\n return None\n\n if not self._cached_partition_set:\n\n tags_fn = mode.partitioned_config.tags_for_partition_fn\n if not tags_fn:\n tags_fn = lambda _: {}\n self._cached_partition_set = PartitionSetDefinition(\n job_name=self.name,\n name=f"{self.name}_partition_set",\n partitions_def=mode.partitioned_config.partitions_def,\n run_config_fn_for_partition=mode.partitioned_config.run_config_for_partition_fn,\n tags_fn_for_partition=tags_fn,\n mode=mode.name,\n )\n\n return self._cached_partition_set\n\n def run_request_for_partition(self, partition_key: str, run_key: Optional[str]) -> RunRequest:\n partition_set = self.get_partition_set_def()\n if not partition_set:\n check.failed("Called run_request_for_partition on a non-partitioned job")\n\n partition = partition_set.get_partition(partition_key)\n run_config = partition_set.run_config_for_partition(partition)\n tags = partition_set.tags_for_partition(partition)\n return RunRequest(run_key=run_key, run_config=run_config, tags=tags)\n\n
[docs] def with_hooks(self, hook_defs: AbstractSet[HookDefinition]) -> "JobDefinition":\n """Apply a set of hooks to all op instances within the job."""\n\n hook_defs = check.set_param(hook_defs, "hook_defs", of_type=HookDefinition)\n\n job_def = JobDefinition(\n name=self.name,\n graph_def=self._graph_def,\n mode_def=self.mode_definitions[0],\n preset_defs=self.preset_defs,\n tags=self.tags,\n hook_defs=hook_defs | self.hook_defs,\n description=self._description,\n op_retry_policy=self._solid_retry_policy,\n _op_selection_data=self._op_selection_data,\n )\n\n update_wrapper(job_def, self, updated=())\n\n return job_def
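A sketch of ``with_hooks`` applied after the fact to an existing job; the returned copy has the hook attached to every op invocation. The job, op, and hook names are illustrative.

.. code-block:: python

    from dagster import failure_hook, job, op

    @op
    def flaky():
        raise Exception("boom")

    @job
    def flaky_job():
        flaky()

    @failure_hook
    def notify_on_failure(context):
        context.log.error("an op in the job failed")

    # a new JobDefinition with the hook added; the original job is unchanged
    monitored_job = flaky_job.with_hooks({notify_on_failure})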
\n\n def get_parent_pipeline_snapshot(self) -> Optional["PipelineSnapshot"]:\n return (\n self.op_selection_data.parent_job_def.get_pipeline_snapshot()\n if self.op_selection_data\n else None\n )
\n\n\ndef _swap_default_io_man(resources: Dict[str, ResourceDefinition], job: PipelineDefinition):\n """\n Used to create the user facing experience of the default io_manager\n switching to in-memory when using execute_in_process.\n """\n from dagster.core.storage.mem_io_manager import mem_io_manager\n\n from .graph_definition import default_job_io_manager\n\n if (\n # pylint: disable=comparison-with-callable\n resources.get("io_manager") in [default_job_io_manager, fs_asset_io_manager]\n and job.version_strategy is None\n ):\n updated_resources = dict(resources)\n updated_resources["io_manager"] = mem_io_manager\n return updated_resources\n\n return resources\n\n\ndef _dep_key_of(node: Node) -> NodeInvocation:\n return NodeInvocation(\n name=node.definition.name,\n alias=node.name,\n tags=node.tags,\n hook_defs=node.hook_defs,\n retry_policy=node.retry_policy,\n )\n\n\ndef get_subselected_graph_definition(\n graph: GraphDefinition,\n resolved_op_selection_dict: Dict,\n parent_handle: Optional[NodeHandle] = None,\n) -> SubselectedGraphDefinition:\n deps: Dict[\n Union[str, NodeInvocation],\n Dict[str, IDependencyDefinition],\n ] = {}\n\n selected_nodes: List[Tuple[str, NodeDefinition]] = []\n\n for node in graph.solids_in_topological_order:\n node_handle = NodeHandle(node.name, parent=parent_handle)\n # skip if the node isn't selected\n if node.name not in resolved_op_selection_dict:\n continue\n\n # rebuild graph if any nodes inside the graph are selected\n if node.is_graph and resolved_op_selection_dict[node.name] is not LeafNodeSelection:\n definition = get_subselected_graph_definition(\n node.definition,\n resolved_op_selection_dict[node.name],\n parent_handle=node_handle,\n )\n # use definition if the node as a whole is selected. this includes selecting the entire graph\n else:\n definition = node.definition\n selected_nodes.append((node.name, definition))\n\n # build dependencies for the node. 
we do it for both cases because nested graphs can have\n # inputs and outputs too\n deps[_dep_key_of(node)] = {}\n for input_handle in node.input_handles():\n if graph.dependency_structure.has_direct_dep(input_handle):\n output_handle = graph.dependency_structure.get_direct_dep(input_handle)\n if output_handle.solid.name in resolved_op_selection_dict:\n deps[_dep_key_of(node)][input_handle.input_def.name] = DependencyDefinition(\n solid=output_handle.solid.name, output=output_handle.output_def.name\n )\n elif graph.dependency_structure.has_dynamic_fan_in_dep(input_handle):\n output_handle = graph.dependency_structure.get_dynamic_fan_in_dep(input_handle)\n if output_handle.solid.name in resolved_op_selection_dict:\n deps[_dep_key_of(node)][\n input_handle.input_def.name\n ] = DynamicCollectDependencyDefinition(\n solid_name=output_handle.solid.name,\n output_name=output_handle.output_def.name,\n )\n elif graph.dependency_structure.has_fan_in_deps(input_handle):\n output_handles = graph.dependency_structure.get_fan_in_deps(input_handle)\n multi_dependencies = [\n DependencyDefinition(\n solid=output_handle.solid.name, output=output_handle.output_def.name\n )\n for output_handle in output_handles\n if (\n isinstance(output_handle, SolidOutputHandle)\n and output_handle.solid.name in resolved_op_selection_dict\n )\n ]\n deps[_dep_key_of(node)][input_handle.input_def.name] = MultiDependencyDefinition(\n cast(\n List[Union[DependencyDefinition, Type[MappedInputPlaceholder]]],\n multi_dependencies,\n )\n )\n # else input is unconnected\n\n # filter out unselected input/output mapping\n new_input_mappings = list(\n filter(\n lambda input_mapping: input_mapping.maps_to.solid_name\n in [name for name, _ in selected_nodes],\n graph._input_mappings, # pylint: disable=protected-access\n )\n )\n new_output_mappings = list(\n filter(\n lambda output_mapping: output_mapping.maps_from.solid_name\n in [name for name, _ in selected_nodes],\n graph._output_mappings, # pylint: disable=protected-access\n )\n )\n\n try:\n return SubselectedGraphDefinition(\n parent_graph_def=graph,\n dependencies=deps,\n node_defs=[definition for _, definition in selected_nodes],\n input_mappings=new_input_mappings,\n output_mappings=new_output_mappings,\n )\n except DagsterInvalidDefinitionError as exc:\n # This handles the case when you construct a subset such that an unsatisfied\n # input cannot be loaded from config. Instead of throwing a DagsterInvalidDefinitionError,\n # we re-raise a DagsterInvalidSubsetError.\n raise DagsterInvalidSubsetError(\n f"The attempted subset {str_format_set(resolved_op_selection_dict)} for graph "\n f"{graph.name} results in an invalid graph."\n ) from exc\n
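A sketch of op subselection as driven by ``get_job_def_for_op_selection``: selecting ``"*transform"`` keeps ``transform`` and its ancestors, and the resulting job wraps a ``SubselectedGraphDefinition`` that omits ``load``. Names are illustrative.

.. code-block:: python

    from dagster import job, op

    @op
    def extract():
        return [1, 2, 3]

    @op
    def transform(rows):
        return [r * 10 for r in rows]

    @op
    def load(rows):
        return len(rows)

    @job
    def etl():
        load(transform(extract()))

    # only extract and transform execute; load is omitted from the subselected graph
    result = etl.execute_in_process(op_selection=["*transform"])
    assert result.success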
", "current_page_name": "_modules/dagster/core/definitions/job_definition", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.job_definition"}, "logger_definition": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.logger_definition

\nimport logging\nfrom typing import TYPE_CHECKING, Any, Callable, Optional, Union\n\nfrom dagster import check\nfrom dagster.core.definitions.config import is_callable_valid_config_arg\nfrom dagster.core.definitions.configurable import AnonymousConfigurableDefinition\nfrom dagster.core.errors import DagsterInvalidInvocationError\n\nfrom ..decorator_utils import get_function_params\nfrom .definition_config_schema import convert_user_facing_definition_config_schema\n\nif TYPE_CHECKING:\n    from dagster.core.definitions import JobDefinition, PipelineDefinition\n    from dagster.core.execution.context.logger import InitLoggerContext, UnboundInitLoggerContext\n\n    InitLoggerFunction = Callable[[InitLoggerContext], logging.Logger]\n\n\n
[docs]class LoggerDefinition(AnonymousConfigurableDefinition):\n """Core class for defining loggers.\n\n Loggers are job-scoped logging handlers, which will be automatically invoked whenever\n dagster messages are logged from within a job.\n\n Args:\n logger_fn (Callable[[InitLoggerContext], logging.Logger]): User-provided function to\n instantiate the logger. This logger will be automatically invoked whenever the methods\n on ``context.log`` are called from within job/pipeline compute logic.\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.logger_config`. If not set, Dagster will accept any config provided.\n description (Optional[str]): A human-readable description of this logger.\n """\n\n def __init__(\n self,\n logger_fn: "InitLoggerFunction",\n config_schema: Any = None,\n description: Optional[str] = None,\n ):\n self._logger_fn = check.callable_param(logger_fn, "logger_fn")\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._description = check.opt_str_param(description, "description")\n\n def __call__(self, *args, **kwargs):\n from dagster.core.execution.context.logger import UnboundInitLoggerContext\n\n from .logger_invocation import logger_invocation_result\n\n if len(args) == 0 and len(kwargs) == 0:\n raise DagsterInvalidInvocationError(\n "Logger initialization function has context argument, but no context argument was "\n "provided when invoking."\n )\n if len(args) + len(kwargs) > 1:\n raise DagsterInvalidInvocationError(\n "Initialization of logger received multiple arguments. Only a first "\n "positional context parameter should be provided when invoking."\n )\n\n context_param_name = get_function_params(self.logger_fn)[0].name\n\n if args:\n context = check.opt_inst_param(\n args[0],\n context_param_name,\n UnboundInitLoggerContext,\n default=UnboundInitLoggerContext(logger_config=None, pipeline_def=None),\n )\n return logger_invocation_result(self, context)\n else:\n if context_param_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Logger initialization expected argument '{context_param_name}'."\n )\n context = check.opt_inst_param(\n kwargs[context_param_name],\n context_param_name,\n UnboundInitLoggerContext,\n default=UnboundInitLoggerContext(logger_config=None, pipeline_def=None),\n )\n\n return logger_invocation_result(self, context)\n\n @property\n def logger_fn(self) -> "InitLoggerFunction":\n return self._logger_fn\n\n @property\n def config_schema(self) -> Any:\n return self._config_schema\n\n @property\n def description(self) -> Optional[str]:\n return self._description\n\n def copy_for_configured(\n self, description: Optional[str], config_schema: Any, _\n ) -> "LoggerDefinition":\n return LoggerDefinition(\n config_schema=config_schema,\n description=description or self.description,\n logger_fn=self.logger_fn,\n )
\n\n\n
[docs]def logger(\n config_schema: Any = None, description: Optional[str] = None\n) -> Union["LoggerDefinition", Callable[["InitLoggerFunction"], "LoggerDefinition"]]:\n """Define a logger.\n\n The decorated function should accept an :py:class:`InitLoggerContext` and return an instance of\n :py:class:`python:logging.Logger`. This function will become the ``logger_fn`` of an underlying\n :py:class:`LoggerDefinition`.\n\n Args:\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.logger_config`. If not set, Dagster will accept any config provided.\n description (Optional[str]): A human-readable description of the logger.\n """\n # This case is for when decorator is used bare, without arguments.\n # E.g. @logger versus @logger()\n if callable(config_schema) and not is_callable_valid_config_arg(config_schema):\n return LoggerDefinition(logger_fn=config_schema)\n\n def _wrap(logger_fn: "InitLoggerFunction") -> "LoggerDefinition":\n return LoggerDefinition(\n logger_fn=logger_fn,\n config_schema=config_schema,\n description=description,\n )\n\n return _wrap
\n\n\n
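A sketch of the decorator in use, following the pattern the docstring describes: the decorated function receives an ``InitLoggerContext`` and returns a standard ``logging.Logger``. The config schema and handler choice are illustrative.

.. code-block:: python

    import logging

    from dagster import Field, logger

    @logger(
        {"log_level": Field(str, is_required=False, default_value="INFO")},
        description="A plain console logger configured from run config.",
    )
    def console_logger(init_context):
        level = init_context.logger_config["log_level"]
        logger_ = logging.Logger("console_logger", level=level)
        logger_.addHandler(logging.StreamHandler())
        return logger_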
[docs]def build_init_logger_context(\n logger_config: Any = None,\n pipeline_def: Optional["PipelineDefinition"] = None,\n job_def: Optional["JobDefinition"] = None,\n) -> "UnboundInitLoggerContext":\n """Builds logger initialization context from provided parameters.\n\n This function can be used to provide the context argument to the invocation of a logger\n definition.\n\n Note that you may only specify one of pipeline_def and job_def.\n\n Args:\n logger_config (Any): The config to provide during initialization of logger.\n pipeline_def (Optional[PipelineDefinition]): The pipeline definition that the logger will be\n used with.\n job_def (Optional[JobDefinition]): The job definition that the logger will be used with.\n\n Examples:\n .. code-block:: python\n\n context = build_init_logger_context()\n logger_to_init(context)\n """\n from dagster.core.definitions import JobDefinition, PipelineDefinition\n from dagster.core.execution.context.logger import UnboundInitLoggerContext\n\n check.opt_inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n check.opt_inst_param(job_def, "job_def", JobDefinition)\n\n check.invariant(\n not (pipeline_def and job_def),\n "In build_init_logger_context, you may only specify one of the pipeline_def and job_def parameters, not both.",\n )\n\n return UnboundInitLoggerContext(\n logger_config=logger_config, pipeline_def=pipeline_def or job_def\n )
\n
", "current_page_name": "_modules/dagster/core/definitions/logger_definition", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.logger_definition"}, "metadata": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.metadata

\nimport functools\nimport os\nimport re\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Dict,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Union,\n    cast,\n)\n\nfrom dagster import check, seven\nfrom dagster.core.errors import DagsterInvalidMetadata\nfrom dagster.serdes import whitelist_for_serdes\nfrom dagster.utils.backcompat import (\n    canonicalize_backcompat_args,\n    deprecation_warning,\n    experimental,\n    experimental_class_warning,\n)\n\nfrom .table import TableColumn, TableColumnConstraints, TableConstraints, TableRecord, TableSchema\n\nif TYPE_CHECKING:\n    from dagster.core.definitions.events import AssetKey\n\nRawMetadataValue = Union[\n    "MetadataValue",\n    dict,\n    float,\n    int,\n    list,\n    str,\n]\n\nMetadataMapping = Mapping[str, "MetadataValue"]\nMetadataUserInput = Mapping[str, RawMetadataValue]\n\n\ndef last_file_comp(path: str) -> str:\n    return os.path.basename(os.path.normpath(path))\n\n\n# ########################\n# ##### NORMALIZATION\n# ########################\n\n\ndef normalize_metadata(\n    metadata: Mapping[str, RawMetadataValue],\n    metadata_entries: Sequence[Union["MetadataEntry", "PartitionMetadataEntry"]],\n    allow_invalid: bool = False,\n) -> List[Union["MetadataEntry", "PartitionMetadataEntry"]]:\n    if metadata and metadata_entries:\n        raise DagsterInvalidMetadata(\n            "Attempted to provide both `metadata` and `metadata_entries` arguments to an event. "\n            "Must provide only one of the two."\n        )\n\n    if metadata_entries:\n        deprecation_warning(\n            'Argument "metadata_entries"',\n            "0.15.0",\n            additional_warn_txt="Use argument `metadata` instead. The `MetadataEntry` `description` attribute is also deprecated-- argument `metadata` takes a label: value dictionary.",\n            stacklevel=4,  # to get the caller of `normalize_metadata`\n        )\n        return check.list_param(\n            metadata_entries, "metadata_entries", (MetadataEntry, PartitionMetadataEntry)\n        )\n\n    # This is a stopgap measure to deal with unsupported metadata values, which occur when we try\n    # to convert arbitrary metadata (on e.g. OutputDefinition) to a MetadataValue, which is required\n    # for serialization. 
This will cause unsupported values to be silently replaced with a\n    # string placeholder.\n    elif allow_invalid:\n        metadata_entries = []\n        for k, v in metadata.items():\n            try:\n                metadata_entries.append(package_metadata_value(k, v))\n            except DagsterInvalidMetadata:\n                deprecation_warning(\n                    "Support for arbitrary metadata values",\n                    "0.15.0",\n                    additional_warn_txt=f"In the future, all user-supplied metadata values must be one of {RawMetadataValue}",\n                    stacklevel=4,  # to get the caller of `normalize_metadata`\n                )\n                metadata_entries.append(\n                    MetadataEntry.text(f"[{v.__class__.__name__}] (unserializable)", k)\n                )\n        return metadata_entries\n\n    return [\n        package_metadata_value(k, v)\n        for k, v in check.opt_dict_param(metadata, "metadata", key_type=str).items()\n    ]\n\n\ndef normalize_metadata_value(raw_value: RawMetadataValue):\n    from dagster.core.definitions.events import AssetKey\n\n    if isinstance(raw_value, MetadataValue):\n        return raw_value\n    elif isinstance(raw_value, str):\n        return MetadataValue.text(raw_value)\n    elif isinstance(raw_value, float):\n        return MetadataValue.float(raw_value)\n    elif isinstance(raw_value, int):\n        return MetadataValue.int(raw_value)\n    elif isinstance(raw_value, dict):\n        return MetadataValue.json(raw_value)\n    elif isinstance(raw_value, os.PathLike):\n        return MetadataValue.path(raw_value)\n    elif isinstance(raw_value, AssetKey):\n        return MetadataValue.asset(raw_value)\n    elif isinstance(raw_value, TableSchema):\n        return MetadataValue.table_schema(raw_value)\n\n    raise DagsterInvalidMetadata(\n        f"Its type was {type(raw_value)}. Consider wrapping the value with the appropriate "\n        "MetadataValue type."\n    )\n\n\ndef package_metadata_value(label: str, raw_value: RawMetadataValue) -> "MetadataEntry":\n    check.str_param(label, "label")\n\n    if isinstance(raw_value, (MetadataEntry, PartitionMetadataEntry)):\n        raise DagsterInvalidMetadata(\n            f"Expected a metadata value, found an instance of {raw_value.__class__.__name__}. Consider "\n            "instead using a MetadataValue wrapper for the value."\n        )\n    try:\n        value = normalize_metadata_value(raw_value)\n    except DagsterInvalidMetadata as e:\n        raise DagsterInvalidMetadata(\n            f'Could not resolve the metadata value for "{label}" to a known type. {e}'\n        ) from None\n    return MetadataEntry(label=label, value=value)\n\n\n# ########################\n# ##### METADATA VALUE\n# ########################\n\n\n
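A sketch of the user-facing effect of ``normalize_metadata_value``: raw Python values passed in a ``metadata`` dict are wrapped in the corresponding ``MetadataValue`` types, so explicit wrappers are only needed for values whose type cannot be inferred. The op and asset key are illustrative.

.. code-block:: python

    from dagster import AssetMaterialization, MetadataValue, op

    @op
    def emit_metadata(context):
        yield AssetMaterialization(
            asset_key="my_table",
            metadata={
                "row_count": 500,                    # normalized to MetadataValue.int
                "size_mb": 1.5,                      # normalized to MetadataValue.float
                "owner": "data-eng",                 # normalized to MetadataValue.text
                "preview": {"columns": ["a", "b"]},  # normalized to MetadataValue.json
                "dashboard": MetadataValue.url("http://example.com/dash"),
            },
        )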
[docs]class MetadataValue:\n """Utility class to wrap metadata values passed into Dagster events so that they can be\n displayed in Dagit and other tooling.\n\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "my_text_label": "hello",\n "dashboard_url": MetadataValue.url("http://mycoolsite.com/my_dashboard"),\n "num_rows": 0,\n },\n )\n """\n\n
[docs] @staticmethod\n def text(text: str) -> "TextMetadataValue":\n """Static constructor for a metadata value wrapping text as\n :py:class:`TextMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "my_text_label": MetadataValue.text("hello")\n },\n )\n\n Args:\n text (str): The text string for a metadata entry.\n """\n return TextMetadataValue(text)
\n\n
[docs] @staticmethod\n def url(url: str) -> "UrlMetadataValue":\n """Static constructor for a metadata value wrapping a URL as\n :py:class:`UrlMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield AssetMaterialization(\n asset_key="my_dashboard",\n metadata={\n "dashboard_url": MetadataValue.url("http://mycoolsite.com/my_dashboard"),\n }\n )\n\n\n Args:\n url (str): The URL for a metadata entry.\n """\n return UrlMetadataValue(url)
\n\n
[docs] @staticmethod\n def path(path: Union[str, os.PathLike]) -> "PathMetadataValue":\n """Static constructor for a metadata value wrapping a path as\n :py:class:`PathMetadataValue`. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "filepath": MetadataValue.path("path/to/file"),\n }\n )\n\n Args:\n path (str): The path for a metadata entry.\n """\n return PathMetadataValue(path)
\n\n
[docs] @staticmethod\n def json(data: Dict[str, Any]) -> "JsonMetadataValue":\n """Static constructor for a metadata value wrapping a path as\n :py:class:`JsonMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield ExpectationResult(\n success=not missing_things,\n label="is_present",\n metadata={\n "about my dataset": MetadataValue.json({"missing_columns": missing_things})\n },\n )\n\n Args:\n data (Dict[str, Any]): The JSON data for a metadata entry.\n """\n return JsonMetadataValue(data)
\n\n
[docs] @staticmethod\n def md(data: str) -> "MarkdownMetadataValue":\n """Static constructor for a metadata value wrapping markdown data as\n :py:class:`MarkdownMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context, md_str):\n yield AssetMaterialization(\n asset_key="info",\n metadata={\n 'Details': MetadataValue.md(md_str)\n },\n )\n\n Args:\n data (str): The markdown for a metadata entry.\n """\n return MarkdownMetadataValue(data)
\n\n
[docs] @staticmethod\n def python_artifact(python_artifact: Callable) -> "PythonArtifactMetadataValue":\n """Static constructor for a metadata value wrapping a python artifact as\n :py:class:`PythonArtifactMetadataValue`. Can be used as the value type for the\n `metadata` parameter for supported events. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "class": MetadataValue.python_artifact(MyClass),\n "function": MetadataValue.python_artifact(my_function),\n }\n )\n\n Args:\n python_artifact (Callable): The python class or function for a metadata entry.\n """\n check.callable_param(python_artifact, "python_artifact")\n return PythonArtifactMetadataValue(python_artifact.__module__, python_artifact.__name__)
\n\n
[docs] @staticmethod\n def float(value: float) -> "FloatMetadataValue":\n """Static constructor for a metadata value wrapping a float as\n :py:class:`FloatMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "size (bytes)": MetadataValue.float(calculate_bytes(df)),\n }\n )\n\n Args:\n value (float): The float value for a metadata entry.\n """\n\n return FloatMetadataValue(value)
\n\n
[docs] @staticmethod\n def int(value: int) -> "IntMetadataValue":\n """Static constructor for a metadata value wrapping an int as\n :py:class:`IntMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "number of rows": MetadataValue.int(len(df)),\n },\n )\n\n Args:\n value (int): The int value for a metadata entry.\n """\n\n return IntMetadataValue(value)
\n\n @staticmethod\n def pipeline_run(run_id: str) -> "DagsterPipelineRunMetadataValue":\n check.str_param(run_id, "run_id")\n return DagsterPipelineRunMetadataValue(run_id)\n\n
[docs] @staticmethod\n def dagster_run(run_id: str) -> "DagsterPipelineRunMetadataValue":\n """Static constructor for a metadata value wrapping a reference to a Dagster run.\n\n Args:\n run_id (str): The ID of the run.\n """\n return MetadataValue.pipeline_run(run_id)
\n\n
[docs] @staticmethod\n def asset(asset_key: "AssetKey") -> "DagsterAssetMetadataValue":\n """Static constructor for a metadata value referencing a Dagster asset, by key.\n\n For example:\n\n .. code-block:: python\n\n @op\n def validate_table(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey("my_table"),\n metadata={\n "Related asset": MetadataValue.asset(AssetKey('my_other_table')),\n },\n )\n\n Args:\n asset_key (AssetKey): The asset key referencing the asset.\n """\n\n from dagster.core.definitions.events import AssetKey\n\n check.inst_param(asset_key, "asset_key", AssetKey)\n return DagsterAssetMetadataValue(asset_key)
\n\n
[docs] @staticmethod\n @experimental\n def table(\n records: List[TableRecord], schema: Optional[TableSchema] = None\n ) -> "TableMetadataValue":\n """Static constructor for a metadata value wrapping arbitrary tabular data as\n :py:class:`TableMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield ExpectationResult(\n success=not has_errors,\n label="is_valid",\n metadata={\n "errors": MetadataValue.table(\n records=[\n TableRecord(code="invalid-data-type", row=2, col="name"),\n ],\n schema=TableSchema(\n columns=[\n TableColumn(name="code", type="string"),\n TableColumn(name="row", type="int"),\n TableColumn(name="col", type="string"),\n ]\n )\n ),\n },\n )\n\n Args:\n records (List[TableRecord]): The data as a list of records (i.e. rows).\n schema (Optional[TableSchema]): A schema for the table.\n """\n return TableMetadataValue(records, schema)
\n\n
[docs] @staticmethod\n @experimental\n def table_schema(\n schema: TableSchema,\n ) -> "TableSchemaMetadataValue":\n """Static constructor for a metadata value wrapping a table schema as\n :py:class:`TableSchemaMetadataValue`. Can be used as the value type\n for the `metadata` parameter for supported events. For example:\n\n .. code-block:: python\n\n schema = TableSchema(\n columns = [\n TableColumn(name="id", type="int"),\n TableColumn(name="status", type="bool"),\n ]\n )\n\n DagsterType(\n type_check_fn=some_validation_fn,\n name='MyTable',\n metadata={\n 'my_table_schema': MetadataValue.table_schema(schema),\n }\n )\n\n Args:\n schema (TableSchema): The table schema for a metadata entry.\n """\n return TableSchemaMetadataValue(schema)
\n\n\n# ########################\n# ##### METADATA VALUE TYPES\n# ########################\n\n# NOTE: We have `type: ignore` in a few places below because mypy complains about an instance method\n# (e.g. `text`) overriding a static method on the superclass of the same name. This is not a concern\n# for us because these static methods should never be called on instances.\n\n# NOTE: `XMetadataValue` classes are serialized with a storage name of `XMetadataEntryData` to\n# maintain backward compatibility. See docstring of `whitelist_for_serdes` for more info.\n\n\n
[docs]@whitelist_for_serdes(storage_name="TextMetadataEntryData")\nclass TextMetadataValue( # type: ignore\n NamedTuple(\n "_TextMetadataValue",\n [\n ("text", Optional[str]),\n ],\n ),\n MetadataValue,\n):\n """Container class for text metadata entry data.\n\n Args:\n text (Optional[str]): The text data.\n """\n\n def __new__(cls, text: Optional[str]):\n return super(TextMetadataValue, cls).__new__(\n cls, check.opt_str_param(text, "text", default="")\n )
\n\n\n
[docs]@whitelist_for_serdes(storage_name="UrlMetadataEntryData")\nclass UrlMetadataValue( # type: ignore\n NamedTuple(\n "_UrlMetadataValue",\n [\n ("url", Optional[str]),\n ],\n ),\n MetadataValue,\n):\n """Container class for URL metadata entry data.\n\n Args:\n url (Optional[str]): The URL as a string.\n """\n\n def __new__(cls, url: Optional[str]):\n return super(UrlMetadataValue, cls).__new__(\n cls, check.opt_str_param(url, "url", default="")\n )
\n\n\n
[docs]@whitelist_for_serdes(storage_name="PathMetadataEntryData")\nclass PathMetadataValue( # type: ignore\n NamedTuple("_PathMetadataValue", [("path", Optional[str])]), MetadataValue\n):\n """Container class for path metadata entry data.\n\n Args:\n path (Optional[str]): The path as a string or conforming to os.PathLike.\n """\n\n def __new__(cls, path: Optional[Union[str, os.PathLike]]):\n return super(PathMetadataValue, cls).__new__(\n cls, check.opt_path_param(path, "path", default="")\n )
\n\n\n
[docs]@whitelist_for_serdes(storage_name="JsonMetadataEntryData")\nclass JsonMetadataValue(\n NamedTuple(\n "_JsonMetadataValue",\n [\n ("data", Dict[str, Any]),\n ],\n ),\n MetadataValue,\n):\n """Container class for JSON metadata entry data.\n\n Args:\n data (Dict[str, Any]): The JSON data.\n """\n\n def __new__(cls, data: Optional[Dict[str, Any]]):\n data = check.opt_dict_param(data, "data", key_type=str)\n try:\n # check that the value is JSON serializable\n seven.dumps(data)\n except TypeError:\n raise DagsterInvalidMetadata("Value is a dictionary but is not JSON serializable.")\n return super(JsonMetadataValue, cls).__new__(cls, data)
\n\n\n
[docs]@whitelist_for_serdes(storage_name="MarkdownMetadataEntryData")\nclass MarkdownMetadataValue(\n NamedTuple(\n "_MarkdownMetadataValue",\n [\n ("md_str", Optional[str]),\n ],\n ),\n MetadataValue,\n):\n """Container class for markdown metadata entry data.\n\n Args:\n md_str (Optional[str]): The markdown as a string.\n """\n\n def __new__(cls, md_str: Optional[str]):\n return super(MarkdownMetadataValue, cls).__new__(\n cls, check.opt_str_param(md_str, "md_str", default="")\n )
\n\n\n
[docs]@whitelist_for_serdes(storage_name="PythonArtifactMetadataEntryData")\nclass PythonArtifactMetadataValue(\n NamedTuple(\n "_PythonArtifactMetadataValue",\n [\n ("module", str),\n ("name", str),\n ],\n ),\n MetadataValue,\n):\n """Container class for python artifact metadata entry data.\n\n Args:\n module (str): The module where the python artifact can be found\n name (str): The name of the python artifact\n """\n\n def __new__(cls, module: str, name: str):\n return super(PythonArtifactMetadataValue, cls).__new__(\n cls, check.str_param(module, "module"), check.str_param(name, "name")\n )
\n\n\n
[docs]@whitelist_for_serdes(storage_name="FloatMetadataEntryData")\nclass FloatMetadataValue(\n NamedTuple(\n "_FloatMetadataValue",\n [\n ("value", Optional[float]),\n ],\n ),\n MetadataValue,\n):\n """Container class for float metadata entry data.\n\n Args:\n value (Optional[float]): The float value.\n """\n\n def __new__(cls, value: Optional[float]):\n return super(FloatMetadataValue, cls).__new__(cls, check.opt_float_param(value, "value"))
\n\n\n
[docs]@whitelist_for_serdes(storage_name="IntMetadataEntryData")\nclass IntMetadataValue(\n NamedTuple(\n "_IntMetadataValue",\n [\n ("value", Optional[int]),\n ],\n ),\n MetadataValue,\n):\n """Container class for int metadata entry data.\n\n Args:\n value (Optional[int]): The int value.\n """\n\n def __new__(cls, value: Optional[int]):\n return super(IntMetadataValue, cls).__new__(cls, check.opt_int_param(value, "value"))
\n\n\n
[docs]@whitelist_for_serdes(storage_name="DagsterPipelineRunMetadataEntryData")\nclass DagsterPipelineRunMetadataValue(\n NamedTuple(\n "_DagsterPipelineRunMetadataValue",\n [\n ("run_id", str),\n ],\n ),\n MetadataValue,\n):\n """Representation of a dagster pipeline run.\n\n Args:\n run_id (str): The pipeline run id\n """\n\n def __new__(cls, run_id: str):\n return super(DagsterPipelineRunMetadataValue, cls).__new__(\n cls, check.str_param(run_id, "run_id")\n )
\n\n\n
[docs]@whitelist_for_serdes(storage_name="DagsterAssetMetadataEntryData")\nclass DagsterAssetMetadataValue(\n NamedTuple("_DagsterAssetMetadataValue", [("asset_key", "AssetKey")]), MetadataValue\n):\n """Representation of a dagster asset.\n\n Args:\n asset_key (AssetKey): The dagster asset key\n """\n\n def __new__(cls, asset_key: "AssetKey"):\n from dagster.core.definitions.events import AssetKey\n\n return super(DagsterAssetMetadataValue, cls).__new__(\n cls, check.inst_param(asset_key, "asset_key", AssetKey)\n )
\n\n\n
[docs]@experimental\n@whitelist_for_serdes(storage_name="TableMetadataEntryData")\nclass TableMetadataValue(\n NamedTuple(\n "_TableMetadataValue",\n [\n ("records", List[TableRecord]),\n ("schema", TableSchema),\n ],\n ),\n MetadataValue,\n):\n """Container class for table metadata entry data.\n\n Args:\n records (TableRecord): The data as a list of records (i.e. rows).\n schema (Optional[TableSchema]): A schema for the table.\n """\n\n @staticmethod\n def infer_column_type(value):\n if isinstance(value, bool):\n return "bool"\n elif isinstance(value, int):\n return "int"\n elif isinstance(value, float):\n return "float"\n else:\n return "string"\n\n def __new__(cls, records: List[TableRecord], schema: Optional[TableSchema]):\n\n check.list_param(records, "records", of_type=TableRecord)\n check.opt_inst_param(schema, "schema", TableSchema)\n\n if len(records) == 0:\n schema = check.not_none(schema, "schema must be provided if records is empty")\n else:\n columns = set(records[0].data.keys())\n for record in records[1:]:\n check.invariant(\n set(record.data.keys()) == columns, "All records must have the same fields"\n )\n schema = schema or TableSchema(\n columns=[\n TableColumn(name=k, type=TableMetadataValue.infer_column_type(v))\n for k, v in records[0].data.items()\n ]\n )\n\n return super(TableMetadataValue, cls).__new__(\n cls,\n records,\n schema,\n )
\n\n\n
[docs]@experimental\n@whitelist_for_serdes(storage_name="TableSchemaMetadataEntryData")\nclass TableSchemaMetadataValue(\n NamedTuple("_TableSchemaMetadataValue", [("schema", TableSchema)]), MetadataValue\n):\n """Representation of a schema for arbitrary tabular data.\n\n Args:\n schema (TableSchema): The dictionary containing the schema representation.\n """\n\n def __new__(cls, schema: TableSchema):\n return super(TableSchemaMetadataValue, cls).__new__(\n cls, check.inst_param(schema, "schema", TableSchema)\n )
\n\n\n# ########################\n# ##### METADATA ENTRY\n# ########################\n\n\ndef deprecated_metadata_entry_constructor(fn):\n @functools.wraps(fn)\n def wrapper(*args, **kwargs):\n deprecation_warning(\n f"Function `MetadataEntry.{fn.__name__}`",\n "0.15.0",\n additional_warn_txt=re.sub(\n r"\\n\\s*",\n " ",\n """\n The recommended way to supply metadata is to pass a `Dict[str,\n MetadataValue]` to the `metadata` keyword argument. To construct `MetadataEntry`\n directly, call the constructor and pass a `MetadataValue`: `MetadataEntry(label="foo",\n value=MetadataValue.text("bar"))`.\n """,\n ),\n )\n return fn(*args, **kwargs)\n\n return wrapper\n\n\n# NOTE: This would better be implemented as a generic with `MetadataValue` set as a\n# typevar, but as of 2022-01-25 mypy does not support generics on NamedTuple.\n#\n# NOTE: This currently stores value in the `entry_data` NamedTuple attribute. In the next release,\n# we will change the name of the NamedTuple property to `value`, and need to implement custom\n# serialization so that it continues to be saved as `entry_data` for backcompat purposes.\n
[docs]@whitelist_for_serdes(storage_name="EventMetadataEntry")\nclass MetadataEntry(\n NamedTuple(\n "_MetadataEntry",\n [\n ("label", str),\n ("description", Optional[str]),\n ("entry_data", MetadataValue),\n ],\n ),\n):\n """The standard structure for describing metadata for Dagster events.\n\n Lists of objects of this type can be passed as arguments to Dagster events and will be displayed\n in Dagit and other tooling.\n\n Should be yielded from within an IO manager to append metadata for a given input/output event.\n For other event types, passing a dict with `MetadataValue` values to the `metadata` argument\n is preferred.\n\n Args:\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n value (MetadataValue): Typed metadata entry data. The different types allow\n for customized display in tools like dagit.\n """\n\n def __new__(\n cls,\n label: str,\n description: Optional[str] = None,\n entry_data: Optional["RawMetadataValue"] = None,\n value: Optional["RawMetadataValue"] = None,\n ):\n if description is not None:\n deprecation_warning(\n 'The "description" attribute on "MetadataEntry"',\n "0.15.0",\n )\n value = cast(\n RawMetadataValue,\n canonicalize_backcompat_args(\n new_val=value,\n new_arg="value",\n old_val=entry_data,\n old_arg="entry_data",\n breaking_version="0.15.0",\n ),\n )\n value = normalize_metadata_value(value)\n\n return super(MetadataEntry, cls).__new__(\n cls,\n check.str_param(label, "label"),\n check.opt_str_param(description, "description"),\n check.inst_param(value, "value", MetadataValue),\n )\n\n @property\n def value(self):\n """Alias of `entry_data`."""\n return self.entry_data\n\n
[docs] @staticmethod\n @deprecated_metadata_entry_constructor\n def text(text: Optional[str], label: str, description: Optional[str] = None) -> "MetadataEntry":\n """Static constructor for a metadata entry containing text as\n :py:class:`TextMetadataValue`. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata_entries=[\n MetadataEntry.text("Text-based metadata for this event", "text_metadata")\n ],\n )\n\n Args:\n text (Optional[str]): The text of this metadata entry.\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n """\n return MetadataEntry(label, description, TextMetadataValue(text))
\n\n
[docs] @staticmethod\n @deprecated_metadata_entry_constructor\n def url(url: Optional[str], label: str, description: Optional[str] = None) -> "MetadataEntry":\n """Static constructor for a metadata entry containing a URL as\n :py:class:`UrlMetadataValue`. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield AssetMaterialization(\n asset_key="my_dashboard",\n metadata_entries=[\n MetadataEntry.url(\n "http://mycoolsite.com/my_dashboard", label="dashboard_url"\n ),\n ],\n )\n\n Args:\n url (Optional[str]): The URL contained by this metadata entry.\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n """\n return MetadataEntry(label, description, UrlMetadataValue(url))
\n\n
[docs] @staticmethod\n @deprecated_metadata_entry_constructor\n def path(path: Optional[str], label: str, description: Optional[str] = None) -> "MetadataEntry":\n """Static constructor for a metadata entry containing a path as\n :py:class:`PathMetadataValue`. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata_entries=[MetadataEntry.path("path/to/file", label="filepath")],\n )\n\n Args:\n path (Optional[str]): The path contained by this metadata entry.\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n """\n return MetadataEntry(label, description, PathMetadataValue(path))
\n\n
[docs] @staticmethod\n @deprecated_metadata_entry_constructor\n def fspath(\n path: Optional[str], label: Optional[str] = None, description: Optional[str] = None\n ) -> "MetadataEntry":\n """Static constructor for a metadata entry containing a filesystem path as\n :py:class:`PathMetadataValue`. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata_entries=[MetadataEntry.fspath("path/to/file")],\n )\n\n Args:\n path (Optional[str]): The path contained by this metadata entry.\n label (Optional[str]): Short display label for this metadata entry. Defaults to the\n base name of the path.\n description (Optional[str]): A human-readable description of this metadata entry.\n """\n if not label:\n path = cast(str, check.str_param(path, "path"))\n label = last_file_comp(path)\n\n return MetadataEntry.path(path, label, description)
\n\n
[docs] @staticmethod\n @deprecated_metadata_entry_constructor\n def json(\n data: Optional[Dict[str, Any]],\n label: str,\n description: Optional[str] = None,\n ) -> "MetadataEntry":\n """Static constructor for a metadata entry containing JSON data as\n :py:class:`JsonMetadataValue`. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield ExpectationResult(\n success=not missing_things,\n label="is_present",\n metadata_entries=[\n MetadataEntry.json(\n label="metadata", data={"missing_columns": missing_things},\n )\n ],\n )\n\n Args:\n data (Optional[Dict[str, Any]]): The JSON data contained by this metadata entry.\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n """\n return MetadataEntry(label, description, JsonMetadataValue(data))
\n\n
[docs] @staticmethod\n @deprecated_metadata_entry_constructor\n def md(md_str: Optional[str], label: str, description: Optional[str] = None) -> "MetadataEntry":\n """Static constructor for a metadata entry containing markdown data as\n :py:class:`MarkdownMetadataValue`. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context, md_str):\n yield AssetMaterialization(\n asset_key="info",\n metadata_entries=[MetadataEntry.md(md_str=md_str)],\n )\n\n Args:\n md_str (Optional[str]): The markdown contained by this metadata entry.\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n """\n return MetadataEntry(label, description, MarkdownMetadataValue(md_str))
\n\n @staticmethod\n @deprecated_metadata_entry_constructor\n def python_artifact(\n python_artifact: Callable[..., Any], label: str, description: Optional[str] = None\n ) -> "MetadataEntry":\n check.callable_param(python_artifact, "python_artifact")\n return MetadataEntry(\n label,\n description,\n PythonArtifactMetadataValue(python_artifact.__module__, python_artifact.__name__),\n )\n\n
[docs] @staticmethod\n @deprecated_metadata_entry_constructor\n def float(\n value: Optional[float], label: str, description: Optional[str] = None\n ) -> "MetadataEntry":\n """Static constructor for a metadata entry containing float as\n :py:class:`FloatMetadataValue`. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata_entries=[MetadataEntry.float(calculate_bytes(df), "size (bytes)")],\n )\n\n Args:\n value (Optional[float]): The float value contained by this metadata entry.\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n """\n\n return MetadataEntry(label, description, FloatMetadataValue(value))
\n\n
[docs] @staticmethod\n @deprecated_metadata_entry_constructor\n def int(value: Optional[int], label: str, description: Optional[str] = None) -> "MetadataEntry":\n """Static constructor for a metadata entry containing int as\n :py:class:`IntMetadataValue`. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata_entries=[MetadataEntry.int(len(df), "number of rows")],\n )\n\n Args:\n value (Optional[int]): The int value contained by this metadata entry.\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n """\n\n return MetadataEntry(label, description, IntMetadataValue(value))
\n\n @staticmethod\n @deprecated_metadata_entry_constructor\n def pipeline_run(run_id: str, label: str, description: Optional[str] = None) -> "MetadataEntry":\n check.str_param(run_id, "run_id")\n return MetadataEntry(label, description, DagsterPipelineRunMetadataValue(run_id))\n\n
[docs] @staticmethod\n @deprecated_metadata_entry_constructor\n def asset(\n asset_key: "AssetKey", label: str, description: Optional[str] = None\n ) -> "MetadataEntry":\n """Static constructor for a metadata entry referencing a Dagster asset, by key.\n\n For example:\n\n .. code-block:: python\n\n @op\n def validate_table(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey("my_table"),\n metadata_entries=[\n MetadataEntry.asset(AssetKey('my_other_table'), "Related asset"),\n ],\n )\n\n Args:\n asset_key (AssetKey): The asset key referencing the asset.\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n """\n\n from dagster.core.definitions.events import AssetKey\n\n check.inst_param(asset_key, "asset_key", AssetKey)\n return MetadataEntry(label, description, DagsterAssetMetadataValue(asset_key))
\n\n
[docs] @staticmethod\n @deprecated_metadata_entry_constructor\n @experimental\n def table(\n records: List[TableRecord],\n label: str,\n description: Optional[str] = None,\n schema: Optional[TableSchema] = None,\n ) -> "MetadataEntry":\n """Static constructor for a metadata entry containing tabular data as\n :py:class:`TableMetadataValue`. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield ExpectationResult(\n success=not has_errors,\n label="is_valid",\n metadata_entries=[\n MetadataEntry.table(\n label="errors",\n records=[\n TableRecord(code="invalid-data-type", row=2, col="name"),\n ],\n schema=TableSchema(\n columns=[\n TableColumn(name="code", type="string"),\n TableColumn(name="row", type="int"),\n TableColumn(name="col", type="string"),\n ]\n )\n ),\n ],\n )\n\n Args:\n records (List[TableRecord]): The data as a list of records (i.e. rows).\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n schema (Optional[TableSchema]): A schema for the table. If none is provided, one will be\n automatically generated by examining the first record. The schema will include as columns all\n field names present in the first record, with a type of `"string"`, `"int"`,\n `"bool"` or `"float"` inferred from the first record's values. If a value does\n not directly match one of the above types, it will be treated as a string.\n """\n return MetadataEntry(label, description, TableMetadataValue(records, schema))
\n\n
[docs] @staticmethod\n @deprecated_metadata_entry_constructor\n @experimental\n def table_schema(\n schema: TableSchema, label: str, description: Optional[str] = None\n ) -> "MetadataEntry":\n """Static constructor for a metadata entry containing a table schema as\n :py:class:`TableSchemaMetadataValue`. For example:\n\n .. code-block:: python\n\n schema = TableSchema(\n columns = [\n TableColumn(name="id", type="int"),\n TableColumn(name="status", type="bool"),\n ]\n )\n\n DagsterType(\n type_check_fn=some_validation_fn,\n name='MyTable',\n metadata_entries=[\n MetadataEntry.table_schema(\n schema,\n label='schema',\n )\n ]\n )\n\n Args:\n schema (TableSchema): The table schema for a metadata entry.\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n """\n return MetadataEntry(\n label,\n description,\n TableSchemaMetadataValue(schema),\n )
\n\n\nclass PartitionMetadataEntry(\n NamedTuple(\n "_PartitionMetadataEntry",\n [\n ("partition", str),\n ("entry", "MetadataEntry"),\n ],\n )\n):\n """Event containing an :py:class:`MetadataEntry` and the name of a partition that the entry\n applies to.\n\n This can be yielded or returned in place of MetadataEntries for cases where you are trying\n to associate metadata more precisely.\n """\n\n def __new__(cls, partition: str, entry: MetadataEntry):\n experimental_class_warning("PartitionMetadataEntry")\n return super(PartitionMetadataEntry, cls).__new__(\n cls,\n check.str_param(partition, "partition"),\n check.inst_param(entry, "entry", MetadataEntry),\n )\n
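\n\n# Illustrative sketch (not part of the API defined above): how user code typically\n# supplies metadata that the helpers in this module normalize. Explicit wrappers pick\n# a display type, while plain values are coerced by `normalize_metadata_value`. The\n# asset key and values below are hypothetical examples.\ndef _example_typed_metadata_event():\n    from dagster import AssetMaterialization, MetadataValue\n\n    return AssetMaterialization(\n        asset_key="my_table",\n        metadata={\n            "row_count": MetadataValue.int(1000),  # explicit wrapper\n            "source_file": MetadataValue.path("/tmp/my_table.csv"),\n            "owner": "data-team",  # plain str is coerced to TextMetadataValue\n        },\n    )\n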
", "current_page_name": "_modules/dagster/core/definitions/metadata", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "table": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.metadata.table

\nimport warnings\nfrom typing import Any, Dict, List, NamedTuple, Optional, Type, Union, cast\n\nimport dagster.check as check\nfrom dagster.serdes.serdes import DefaultNamedTupleSerializer, whitelist_for_serdes\nfrom dagster.utils.backcompat import ExperimentalWarning, experimental\n\n# ########################\n# ##### TABLE RECORD\n# ########################\n\n\nclass _TableRecordSerializer(DefaultNamedTupleSerializer):\n    @classmethod\n    def value_from_unpacked(\n        cls,\n        unpacked_dict: Dict[str, Any],\n        klass: Type,\n    ):\n        return klass(**unpacked_dict["data"])\n\n\n
[docs]@experimental\n@whitelist_for_serdes(serializer=_TableRecordSerializer)\nclass TableRecord(NamedTuple("TableRecord", [("data", Dict[str, Union[str, int, float, bool]])])):\n """Represents one record in a table. All passed keyword arguments are treated as field key/value\n pairs in the record. Field keys are arbitrary strings-- field values must be strings, integers,\n floats, or bools.\n """\n\n def __new__(cls, **data):\n check.dict_param(\n data,\n "data",\n value_type=(str, float, int, bool, type(None)),\n additional_message="Record fields must be one of types: (str, float, int, bool)",\n )\n return super(TableRecord, cls).__new__(cls, data=data)
\n\n\n# ########################\n# ##### TABLE SCHEMA\n# ########################\n\n\n
[docs]@experimental\n@whitelist_for_serdes\nclass TableSchema(\n NamedTuple(\n "TableSchema",\n [\n ("columns", List["TableColumn"]),\n ("constraints", "TableConstraints"),\n ],\n )\n):\n """Representation of a schema for tabular data. Schema is composed of two parts:\n\n - A required list of columns (`TableColumn`). Each column specifies a\n `name`, `type`, set of `constraints`, and (optional) `description`. `type`\n defaults to `string` if unspecified. Column constraints\n (`TableColumnConstraints`) consist of boolean properties `unique` and\n `nullable`, as well as a list of strings `other` containing string\n descriptions of all additional constraints (e.g. `"<= 5"`).\n - An optional list of table-level constraints (`TableConstraints`). A\n table-level constraint cannot be expressed in terms of a single column,\n e.g. col a > col b. Presently, all table-level constraints must be\n expressed as strings under the `other` attribute of a `TableConstraints`\n object.\n\n .. code-block:: python\n\n # example schema\n TableSchema(\n constraints = TableConstraints(\n other = [\n "foo > bar",\n ],\n ),\n columns = [\n TableColumn(\n name = "foo",\n type = "string",\n description = "Foo description",\n constraints = TableColumnConstraints(\n required = True,\n other = [\n "starts with the letter 'a'",\n ],\n ),\n ),\n TableColumn(\n name = "bar",\n type = "string",\n ),\n TableColumn(\n name = "baz",\n type = "custom_type",\n constraints = TableColumnConstraints(\n unique = True,\n )\n ),\n ],\n )\n\n Args:\n columns (List[TableColumn]): The columns of the table.\n constraints (Optional[TableConstraints]): The constraints of the table.\n """\n\n def __new__(\n cls,\n columns: List["TableColumn"],\n constraints: Optional["TableConstraints"] = None,\n ):\n return super(TableSchema, cls).__new__(\n cls,\n columns=check.list_param(columns, "columns", of_type=TableColumn),\n constraints=check.opt_inst_param(\n constraints, "constraints", TableConstraints, default=_DEFAULT_TABLE_CONSTRAINTS\n ),\n )
\n\n\n# ########################\n# ##### TABLE CONSTRAINTS\n# ########################\n\n\n
[docs]@experimental\n@whitelist_for_serdes\nclass TableConstraints(\n NamedTuple(\n "TableConstraints",\n [\n ("other", List[str]),\n ],\n )\n):\n """Descriptor for "table-level" constraints. Presently only one property,\n `other` is supported. This contains strings describing arbitrary\n table-level constraints. A table-level constraint is a constraint defined\n in terms of multiple columns (e.g. col_A > col_B) or in terms of rows.\n\n Args:\n other (List[str]): Descriptions of arbitrary table-level constraints.\n """\n\n def __new__(\n cls,\n other: List[str],\n ):\n return super(TableConstraints, cls).__new__(\n cls,\n other=check.list_param(other, "other", of_type=str),\n )
\n\n\nwith warnings.catch_warnings():\n warnings.simplefilter("ignore", category=ExperimentalWarning)\n _DEFAULT_TABLE_CONSTRAINTS = TableConstraints(other=[])\n\n# ########################\n# ##### TABLE COLUMN\n# ########################\n\n\n
[docs]@experimental\n@whitelist_for_serdes\nclass TableColumn(\n NamedTuple(\n "TableColumn",\n [\n ("name", str),\n ("type", str),\n ("description", Optional[str]),\n ("constraints", "TableColumnConstraints"),\n ],\n )\n):\n """Descriptor for a table column. The only property that must be specified\n by the user is `name`. If no `type` is specified, `string` is assumed. If\n no `constraints` are specified, the column is assumed to be nullable\n (i.e. `nullable = True`) and have no other constraints beyond the data type.\n\n Args:\n name (str): The name of the column.\n type (Optional[str]): The type of the column. Can be an arbitrary\n string. Defaults to `"string"`.\n description (Optional[str]): Description of this column. Defaults to `None`.\n constraints (Optional[TableColumnConstraints]): Column-level constraints.\n If unspecified, column is nullable with no constraints.\n """\n\n def __new__(\n cls,\n name: str,\n type: str = "string", # pylint: disable=redefined-builtin\n description: Optional[str] = None,\n constraints: Optional["TableColumnConstraints"] = None,\n ):\n return super(TableColumn, cls).__new__(\n cls,\n name=check.str_param(name, "name"),\n type=check.str_param(type, "type"),\n description=check.opt_str_param(description, "description"),\n constraints=cast(\n "TableColumnConstraints",\n check.opt_inst_param(\n constraints,\n "constraints",\n TableColumnConstraints,\n default=_DEFAULT_TABLE_COLUMN_CONSTRAINTS,\n ),\n ),\n )
\n\n\n# ########################\n# ##### TABLE COLUMN CONSTRAINTS\n# ########################\n\n\n
[docs]@experimental\n@whitelist_for_serdes\nclass TableColumnConstraints(\n NamedTuple(\n "TableColumnConstraints",\n [\n ("nullable", bool),\n ("unique", bool),\n ("other", Optional[List[str]]),\n ],\n )\n):\n """Descriptor for a table column's constraints. Nullability and uniqueness are specified with\n boolean properties. All other constraints are described using arbitrary strings under the\n `other` property.\n\n Args:\n nullable (Optional[bool]): If true, this column can hold null values.\n unique (Optional[bool]): If true, all values in this column must be unique.\n other (List[str]): Descriptions of arbitrary column-level constraints\n not expressible by the predefined properties.\n """\n\n def __new__(\n cls,\n nullable: bool = True,\n unique: bool = False,\n other: Optional[List[str]] = None,\n ):\n return super(TableColumnConstraints, cls).__new__(\n cls,\n nullable=check.bool_param(nullable, "nullable"),\n unique=check.bool_param(unique, "unique"),\n other=check.opt_list_param(other, "other"),\n )
\n\n\nwith warnings.catch_warnings():\n warnings.simplefilter("ignore", category=ExperimentalWarning)\n _DEFAULT_TABLE_COLUMN_CONSTRAINTS = TableColumnConstraints()\n
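\n\n\n# Illustrative sketch: combining the experimental table classes above into event\n# metadata. The column names, types, and record values are hypothetical examples,\n# and `MetadataValue.table` is assumed to be importable from the top-level package.\ndef _example_table_metadata():\n    from dagster import MetadataValue\n\n    schema = TableSchema(\n        columns=[\n            TableColumn(name="id", type="int", constraints=TableColumnConstraints(unique=True)),\n            TableColumn(name="status", type="string"),\n        ]\n    )\n    records = [\n        TableRecord(id=1, status="ok"),\n        TableRecord(id=2, status="failed"),\n    ]\n    return MetadataValue.table(records=records, schema=schema)\n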
", "current_page_name": "_modules/dagster/core/definitions/metadata/table", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}, {"link": "../", "title": "dagster.core.definitions.metadata"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.metadata.table"}, "title": "dagster.core.definitions.metadata"}, "mode": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.mode

\nfrom typing import TYPE_CHECKING, Dict, List, NamedTuple, Optional\n\nfrom dagster import check\nfrom dagster.core.definitions.executor_definition import ExecutorDefinition, default_executors\nfrom dagster.loggers import default_loggers\nfrom dagster.utils.merger import merge_dicts\n\nfrom .config import ConfigMapping\nfrom .logger_definition import LoggerDefinition\nfrom .resource_definition import ResourceDefinition\nfrom .utils import check_valid_name\n\nDEFAULT_MODE_NAME = "default"\n\nif TYPE_CHECKING:\n    from .partition import PartitionedConfig\n\n\n
[docs]class ModeDefinition(\n NamedTuple(\n "_ModeDefinition",\n [\n ("name", str),\n ("resource_defs", Dict[str, ResourceDefinition]),\n ("loggers", Dict[str, LoggerDefinition]),\n ("executor_defs", List[ExecutorDefinition]),\n ("description", Optional[str]),\n ("config_mapping", Optional[ConfigMapping]),\n ("partitioned_config", Optional["PartitionedConfig"]),\n ],\n )\n):\n """Define a mode in which a pipeline can operate.\n\n A mode provides pipelines with a set of resource implementations, loggers, system storages,\n and executors.\n\n Args:\n name (Optional[str]): The name of the mode. Must be unique within the\n :py:class:`PipelineDefinition` to which the mode is attached. (default: "default").\n resource_defs (Optional[Dict[str, ResourceDefinition]]): A dictionary of string resource\n keys to their implementations. Individual solids may require resources to be present by\n these keys.\n logger_defs (Optional[Dict[str, LoggerDefinition]]): A dictionary of string logger\n identifiers to their implementations.\n executor_defs (Optional[List[ExecutorDefinition]]): The set of executors available when\n executing in this mode. By default, this will be the 'in_process' and 'multiprocess'\n executors (:py:data:`~dagster.default_executors`).\n description (Optional[str]): A human-readable description of the mode.\n _config_mapping (Optional[ConfigMapping]): Only for internal use.\n _partitions (Optional[PartitionedConfig]): Only for internal use.\n """\n\n def __new__(\n cls,\n name: Optional[str] = None,\n resource_defs: Optional[Dict[str, ResourceDefinition]] = None,\n logger_defs: Optional[Dict[str, LoggerDefinition]] = None,\n executor_defs: Optional[List[ExecutorDefinition]] = None,\n description: Optional[str] = None,\n _config_mapping: Optional[ConfigMapping] = None,\n _partitioned_config: Optional["PartitionedConfig"] = None,\n ):\n\n from .partition import PartitionedConfig\n\n resource_defs = check.opt_dict_param(\n resource_defs, "resource_defs", key_type=str, value_type=ResourceDefinition\n )\n\n for key in resource_defs:\n if not key.isidentifier():\n check.failed(f"Resource key '{key}' must be a valid Python identifier.")\n\n if resource_defs and "io_manager" in resource_defs:\n resource_defs_with_defaults = resource_defs\n else:\n from dagster.core.storage.mem_io_manager import mem_io_manager\n\n resource_defs_with_defaults = merge_dicts(\n {"io_manager": mem_io_manager}, resource_defs or {}\n )\n\n return super(ModeDefinition, cls).__new__(\n cls,\n name=check_valid_name(name) if name else DEFAULT_MODE_NAME,\n resource_defs=resource_defs_with_defaults,\n loggers=(\n check.opt_dict_param(\n logger_defs, "logger_defs", key_type=str, value_type=LoggerDefinition\n )\n or default_loggers()\n ),\n executor_defs=check.list_param(\n executor_defs if executor_defs else default_executors,\n "executor_defs",\n of_type=ExecutorDefinition,\n ),\n description=check.opt_str_param(description, "description"),\n config_mapping=check.opt_inst_param(_config_mapping, "_config_mapping", ConfigMapping),\n partitioned_config=check.opt_inst_param(\n _partitioned_config, "_partitioned_config", PartitionedConfig\n ),\n )\n\n @property\n def resource_key_set(self):\n return frozenset(self.resource_defs.keys())\n\n @staticmethod\n def from_resources(resources, name=None):\n check.dict_param(resources, "resources", key_type=str)\n\n return ModeDefinition(\n name=name,\n resource_defs={\n resource_name: ResourceDefinition.hardcoded_resource(resource)\n for resource_name, resource in resources.items()\n 
},\n )
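\n\n# Illustrative sketch: wiring a hardcoded resource into a legacy pipeline via a mode.\n# The pipeline, solid, and resource names are hypothetical examples.\ndef _example_mode_usage():\n    from dagster import pipeline, solid\n\n    @solid(required_resource_keys={"greeting"})\n    def say_hello(context):\n        return context.resources.greeting\n\n    dev_mode = ModeDefinition(\n        name="dev",\n        resource_defs={"greeting": ResourceDefinition.hardcoded_resource("hello")},\n    )\n\n    @pipeline(mode_defs=[dev_mode])\n    def greeting_pipeline():\n        say_hello()\n\n    return greeting_pipeline\n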
\n
", "current_page_name": "_modules/dagster/core/definitions/mode", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.mode"}, "op_definition": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.op_definition

\nfrom typing import Dict\n\nfrom .input import In\nfrom .output import Out\nfrom .solid_definition import SolidDefinition\n\n\n
[docs]class OpDefinition(SolidDefinition):\n """\n Defines an op, the functional unit of user-defined computation.\n\n For more details on what an op is, refer to the\n `Ops Overview <../../concepts/ops-jobs-graphs/ops>`_ .\n\n End users should prefer the :func:`@op <op>` decorator. OpDefinition is generally intended to be\n used by framework authors or for programmatically generated ops.\n\n Args:\n name (str): Name of the op. Must be unique within any :py:class:`GraphDefinition` or\n :py:class:`JobDefinition` that contains the op.\n input_defs (List[InputDefinition]): Inputs of the op.\n compute_fn (Callable): The core of the op, the function that performs the actual\n computation. The signature of this function is determined by ``input_defs``, and\n optionally, an injected first argument, ``context``, a collection of information\n provided by the system.\n\n This function will be coerced into a generator or an async generator, which must yield\n one :py:class:`Output` for each of the op's ``output_defs``, and additionally may\n yield other types of Dagster events, including :py:class:`AssetMaterialization` and\n :py:class:`ExpectationResult`.\n output_defs (List[OutputDefinition]): Outputs of the op.\n config_schema (Optional[ConfigSchema]): The schema for the config. If set, Dagster will check\n that the config provided for the op matches this schema and will fail if it does not. If\n not set, Dagster will accept any config provided for the op.\n description (Optional[str]): Human-readable description of the op.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the op. Frameworks may\n expect and require certain metadata to be attached to an op. Users should generally\n not set metadata directly. Values that are not strings will be json encoded and must meet\n the criteria that `json.loads(json.dumps(value)) == value`.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by this op.\n version (Optional[str]): (Experimental) The version of the op's compute_fn. Two ops should\n have the same version if and only if they deterministically produce the same outputs\n when provided the same inputs.\n retry_policy (Optional[RetryPolicy]): The retry policy for this op.\n\n\n Examples:\n .. code-block:: python\n\n def _add_one(_context, inputs):\n yield Output(inputs["num"] + 1)\n\n OpDefinition(\n name="add_one",\n input_defs=[InputDefinition("num", Int)],\n output_defs=[OutputDefinition(Int)], # default name ("result")\n compute_fn=_add_one,\n )\n """\n\n @property\n def node_type_str(self) -> str:\n return "op"\n\n @property\n def is_graph_job_op_node(self) -> bool:\n return True\n\n @property\n def ins(self) -> Dict[str, In]:\n return {input_def.name: In.from_definition(input_def) for input_def in self.input_defs}\n\n @property\n def outs(self) -> Dict[str, Out]:\n return {output_def.name: Out.from_definition(output_def) for output_def in self.output_defs}
\n
", "current_page_name": "_modules/dagster/core/definitions/op_definition", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.op_definition"}, "output": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.output

\nimport warnings\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    List,\n    NamedTuple,\n    Optional,\n    Type,\n    TypeVar,\n    Union,\n    cast,\n)\n\nfrom dagster import check\nfrom dagster.core.definitions.events import AssetKey, DynamicAssetKey\nfrom dagster.core.definitions.metadata import MetadataEntry, MetadataUserInput, normalize_metadata\nfrom dagster.core.errors import DagsterError, DagsterInvalidDefinitionError\nfrom dagster.core.types.dagster_type import DagsterType, resolve_dagster_type\nfrom dagster.utils.backcompat import experimental_arg_warning\n\nfrom .inference import InferredOutputProps\nfrom .input import NoValueSentinel\nfrom .utils import DEFAULT_OUTPUT, check_valid_name\n\nif TYPE_CHECKING:\n    from dagster.core.definitions.partition import PartitionsDefinition\n    from dagster.core.execution.context.output import OutputContext\n\nTOut = TypeVar("TOut", bound="OutputDefinition")\n\n\n
[docs]class OutputDefinition:\n """Defines an output from a solid's compute function.\n\n Solids can have multiple outputs, in which case outputs cannot be anonymous.\n\n Many solids have only one output, in which case the user can provide a single output definition\n that will be given the default name, "result".\n\n Output definitions may be typed using the Dagster type system.\n\n Args:\n dagster_type (Optional[Union[Type, DagsterType]]]): The type of this output.\n Users should provide the Python type of the objects that they expect the solid to yield\n for this output, or a :py:class:`DagsterType` that defines a runtime check that they\n want to be run on this output. Defaults to :py:class:`Any`.\n name (Optional[str]): Name of the output. (default: "result")\n description (Optional[str]): Human-readable description of the output.\n is_required (Optional[bool]): Whether the presence of this field is required. (default: True)\n io_manager_key (Optional[str]): The resource key of the IOManager used for storing this\n output and loading it in downstream steps (default: "io_manager").\n metadata (Optional[Dict[str, Any]]): A dict of the metadata for the output.\n For example, users can provide a file path if the data object will be stored in a\n filesystem, or provide information of a database table when it is going to load the data\n into the table.\n asset_key (Optional[AssetKey]]): (Experimental) An AssetKey which should be associated\n with this OutputDefinition. Used for tracking lineage information through Dagster.\n asset_partitions (Optional[Union[Set[str], OutputContext -> Set[str]]]): (Experimental) A\n set of partitions of the given asset_key (or a function that produces this list of\n partitions from the OutputContext) which should be associated with this OutputDefinition.\n """\n\n def __init__(\n self,\n dagster_type=None,\n name: Optional[str] = None,\n description: Optional[str] = None,\n is_required: bool = True,\n io_manager_key: Optional[str] = None,\n metadata: Optional[MetadataUserInput] = None,\n asset_key: Optional[Union[AssetKey, DynamicAssetKey]] = None,\n asset_partitions: Optional[\n Union[AbstractSet[str], Callable[["OutputContext"], AbstractSet[str]]]\n ] = None,\n asset_partitions_def: Optional["PartitionsDefinition"] = None\n # make sure new parameters are updated in combine_with_inferred below\n ):\n from dagster.core.definitions.partition import PartitionsDefinition\n\n self._name = check_valid_name(check.opt_str_param(name, "name", DEFAULT_OUTPUT))\n self._type_not_set = dagster_type is None\n self._dagster_type = resolve_dagster_type(dagster_type)\n self._description = check.opt_str_param(description, "description")\n self._is_required = check.bool_param(is_required, "is_required")\n self._io_manager_key = check.opt_str_param(\n io_manager_key,\n "io_manager_key",\n default="io_manager",\n )\n self._metadata = check.opt_dict_param(metadata, "metadata", key_type=str)\n self._metadata_entries = check.is_list(\n normalize_metadata(self._metadata, [], allow_invalid=True), MetadataEntry\n )\n\n if asset_key:\n experimental_arg_warning("asset_key", "OutputDefinition.__init__")\n\n if callable(asset_key):\n warnings.warn(\n "Passing a function as the `asset_key` argument to `Out` or `OutputDefinition` is "\n "deprecated behavior and will be removed in version 0.15.0."\n )\n else:\n check.opt_inst_param(asset_key, "asset_key", AssetKey)\n\n self._asset_key = asset_key\n\n if asset_partitions:\n experimental_arg_warning("asset_partitions", 
"OutputDefinition.__init__")\n check.param_invariant(\n asset_key is not None,\n "asset_partitions",\n 'Cannot specify "asset_partitions" argument without also specifying "asset_key"',\n )\n\n self._asset_partitions_fn: Optional[Callable[["OutputContext"], AbstractSet[str]]]\n if callable(asset_partitions):\n self._asset_partitions_fn = asset_partitions\n elif asset_partitions is not None:\n asset_partitions = check.opt_set_param(asset_partitions, "asset_partitions", str)\n\n def _fn(_context: "OutputContext") -> AbstractSet[str]:\n return cast(AbstractSet[str], asset_partitions) # mypy bug?\n\n self._asset_partitions_fn = _fn\n else:\n self._asset_partitions_fn = None\n\n if asset_partitions_def:\n experimental_arg_warning("asset_partitions_def", "OutputDefinition.__init__")\n self._asset_partitions_def = check.opt_inst_param(\n asset_partitions_def, "asset_partition_def", PartitionsDefinition\n )\n\n @property\n def name(self):\n return self._name\n\n @property\n def dagster_type(self) -> DagsterType:\n return self._dagster_type\n\n @property\n def description(self) -> Optional[str]:\n return self._description\n\n @property\n def is_required(self) -> bool:\n return self._is_required\n\n @property\n def io_manager_key(self) -> str:\n return self._io_manager_key\n\n @property\n def optional(self) -> bool:\n return not self.is_required\n\n @property\n def metadata(self) -> MetadataUserInput:\n return self._metadata\n\n @property\n def metadata_entries(self) -> List[MetadataEntry]:\n return self._metadata_entries\n\n @property\n def is_dynamic(self) -> bool:\n return False\n\n @property\n def is_asset(self) -> bool:\n return self._asset_key is not None\n\n @property\n def asset_partitions_def(self) -> Optional["PartitionsDefinition"]:\n return self._asset_partitions_def\n\n @property\n def hardcoded_asset_key(self) -> Optional[AssetKey]:\n if not callable(self._asset_key):\n return self._asset_key\n else:\n return None\n\n def get_asset_key(self, context: "OutputContext") -> Optional[AssetKey]:\n """Get the AssetKey associated with this OutputDefinition for the given\n :py:class:`OutputContext` (if any).\n\n Args:\n context (OutputContext): The OutputContext that this OutputDefinition is being evaluated\n in\n """\n if callable(self._asset_key):\n return self._asset_key(context)\n else:\n return self.hardcoded_asset_key\n\n def get_asset_partitions(self, context: "OutputContext") -> Optional[AbstractSet[str]]:\n """Get the set of partitions associated with this OutputDefinition for the given\n :py:class:`OutputContext` (if any).\n\n Args:\n context (OutputContext): The OutputContext that this OutputDefinition is being evaluated\n in\n """\n if self._asset_partitions_fn is None:\n return None\n\n return self._asset_partitions_fn(context)\n\n def mapping_from(self, solid_name: str, output_name: Optional[str] = None) -> "OutputMapping":\n """Create an output mapping from an output of a child solid.\n\n In a CompositeSolidDefinition, you can use this helper function to construct\n an :py:class:`OutputMapping` from the output of a child solid.\n\n Args:\n solid_name (str): The name of the child solid from which to map this output.\n output_name (str): The name of the child solid's output from which to map this output.\n\n Examples:\n\n .. 
code-block:: python\n\n output_mapping = OutputDefinition(Int).mapping_from('child_solid')\n """\n return OutputMapping(self, OutputPointer(solid_name, output_name))\n\n @staticmethod\n def create_from_inferred(inferred: InferredOutputProps) -> "OutputDefinition":\n return OutputDefinition(\n dagster_type=_checked_inferred_type(inferred.annotation),\n description=inferred.description,\n )\n\n def combine_with_inferred(self: TOut, inferred: InferredOutputProps) -> TOut:\n dagster_type = self.dagster_type\n if self._type_not_set:\n dagster_type = _checked_inferred_type(inferred.annotation)\n if self.description is None:\n description = inferred.description\n else:\n description = self.description\n\n return self.__class__(\n name=self.name,\n dagster_type=dagster_type,\n description=description,\n is_required=self.is_required,\n io_manager_key=self.io_manager_key,\n metadata=self._metadata,\n asset_key=self._asset_key,\n asset_partitions=self._asset_partitions_fn,\n asset_partitions_def=self.asset_partitions_def,\n )
\n\n\ndef _checked_inferred_type(inferred: Any) -> DagsterType:\n try:\n return resolve_dagster_type(inferred)\n except DagsterError as e:\n raise DagsterInvalidDefinitionError(\n f"Problem using type '{inferred}' from return type annotation, correct the issue "\n "or explicitly set the dagster_type on your OutputDefinition."\n ) from e\n\n\nclass DynamicOutputDefinition(OutputDefinition):\n """\n Variant of :py:class:`OutputDefinition <dagster.OutputDefinition>` for an\n output that will dynamically alter the graph at runtime.\n\n When using in a composition function such as :py:func:`@pipeline <dagster.pipeline>`,\n dynamic outputs must be used with either\n\n * ``map`` - clone downstream solids for each separate :py:class:`DynamicOutput`\n * ``collect`` - gather across all :py:class:`DynamicOutput` in to a list\n\n Uses the same constructor as :py:class:`OutputDefinition <dagster.OutputDefinition>`\n\n .. code-block:: python\n\n @solid(\n config_schema={\n "path": Field(str, default_value=file_relative_path(__file__, "sample"))\n },\n output_defs=[DynamicOutputDefinition(str)],\n )\n def files_in_directory(context):\n path = context.solid_config["path"]\n dirname, _, filenames = next(os.walk(path))\n for file in filenames:\n yield DynamicOutput(os.path.join(dirname, file), mapping_key=_clean(file))\n\n @pipeline\n def process_directory():\n files = files_in_directory()\n\n # use map to invoke a solid on each dynamic output\n file_results = files.map(process_file)\n\n # use collect to gather the results in to a list\n summarize_directory(file_results.collect())\n """\n\n @property\n def is_dynamic(self) -> bool:\n return True\n\n\nclass OutputPointer(NamedTuple("_OutputPointer", [("solid_name", str), ("output_name", str)])):\n def __new__(cls, solid_name: str, output_name: Optional[str] = None):\n return super(OutputPointer, cls).__new__(\n cls,\n check.str_param(solid_name, "solid_name"),\n check.opt_str_param(output_name, "output_name", DEFAULT_OUTPUT),\n )\n\n\n
[docs]class OutputMapping(\n NamedTuple("_OutputMapping", [("definition", OutputDefinition), ("maps_from", OutputPointer)])\n):\n """Defines an output mapping for a composite solid.\n\n Args:\n definition (OutputDefinition): Defines the output of the composite solid.\n solid_name (str): The name of the child solid from which to map the output.\n output_name (str): The name of the child solid's output from which to map the output.\n """\n\n def __new__(cls, definition: OutputDefinition, maps_from: OutputPointer):\n return super(OutputMapping, cls).__new__(\n cls,\n check.inst_param(definition, "definition", OutputDefinition),\n check.inst_param(maps_from, "maps_from", OutputPointer),\n )
\n\n\n
[docs]class Out(\n NamedTuple(\n "_Out",\n [\n ("dagster_type", Union[DagsterType, Type[NoValueSentinel]]),\n ("description", Optional[str]),\n ("is_required", bool),\n ("io_manager_key", str),\n ("metadata", Optional[MetadataUserInput]),\n ("asset_key", Optional[Union[AssetKey, DynamicAssetKey]]),\n (\n "asset_partitions",\n Optional[Union[AbstractSet[str], Callable[["OutputContext"], AbstractSet[str]]]],\n ),\n ("asset_partitions_def", Optional["PartitionsDefinition"]),\n ],\n )\n):\n """\n Defines an output from an op's compute function.\n\n Ops can have multiple outputs, in which case outputs cannot be anonymous.\n\n Many ops have only one output, in which case the user can provide a single output definition\n that will be given the default name, "result".\n\n Outs may be typed using the Dagster type system.\n\n Args:\n dagster_type (Optional[Union[Type, DagsterType]]]):\n The type of this output. Should only be set if the correct type can not\n be inferred directly from the type signature of the decorated function.\n description (Optional[str]): Human-readable description of the output.\n is_required (bool): Whether the presence of this field is required. (default: True)\n io_manager_key (Optional[str]): The resource key of the output manager used for this output.\n (default: "io_manager").\n metadata (Optional[Dict[str, Any]]): A dict of the metadata for the output.\n For example, users can provide a file path if the data object will be stored in a\n filesystem, or provide information of a database table when it is going to load the data\n into the table.\n asset_key (Optional[AssetKey]): (Experimental) An AssetKey which should be associated\n with this Out. Used for tracking lineage information through Dagster.\n asset_partitions (Optional[Union[Set[str], OutputContext -> Set[str]]]): (Experimental) A\n set of partitions of the given asset_key (or a function that produces this list of\n partitions from the OutputContext) which should be associated with this Out.\n """\n\n def __new__(\n cls,\n dagster_type: Union[Type, DagsterType] = NoValueSentinel,\n description: Optional[str] = None,\n is_required: bool = True,\n io_manager_key: Optional[str] = None,\n metadata: Optional[MetadataUserInput] = None,\n asset_key: Optional[AssetKey] = None,\n asset_partitions: Optional[\n Union[AbstractSet[str], Callable[["OutputContext"], AbstractSet[str]]]\n ] = None,\n asset_partitions_def: Optional["PartitionsDefinition"] = None,\n # make sure new parameters are updated in combine_with_inferred below\n ):\n if asset_partitions_def:\n experimental_arg_warning("assets_definition", "Out.__new__")\n return super(Out, cls).__new__(\n cls,\n dagster_type=NoValueSentinel\n if dagster_type is NoValueSentinel\n else resolve_dagster_type(dagster_type),\n description=description,\n is_required=check.bool_param(is_required, "is_required"),\n io_manager_key=check.opt_str_param(\n io_manager_key, "io_manager_key", default="io_manager"\n ),\n metadata=metadata,\n asset_key=asset_key,\n asset_partitions=asset_partitions,\n asset_partitions_def=asset_partitions_def,\n )\n\n @staticmethod\n def from_definition(output_def: "OutputDefinition"):\n return Out(\n dagster_type=output_def.dagster_type,\n description=output_def.description,\n is_required=output_def.is_required,\n io_manager_key=output_def.io_manager_key,\n metadata=output_def.metadata,\n asset_key=output_def._asset_key, # type: ignore # pylint: disable=protected-access\n asset_partitions=output_def._asset_partitions_fn, # pylint: 
disable=protected-access\n asset_partitions_def=output_def.asset_partitions_def, # pylint: disable=protected-access\n )\n\n def to_definition(self, annotation_type: type, name: Optional[str]) -> "OutputDefinition":\n dagster_type = (\n self.dagster_type if self.dagster_type is not NoValueSentinel else annotation_type\n )\n\n return OutputDefinition(\n dagster_type=dagster_type,\n name=name,\n description=self.description,\n is_required=self.is_required,\n io_manager_key=self.io_manager_key,\n metadata=self.metadata,\n asset_key=self.asset_key,\n asset_partitions=self.asset_partitions,\n asset_partitions_def=self.asset_partitions_def,\n )
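\n\n# A minimal usage sketch (standalone user code, not part of this module; the op and output\n# names below are hypothetical): an op that declares two named, typed outputs with Out and\n# yields them explicitly as Output events, leaving the optional output out when it is empty.\nfrom dagster import Out, Output, op\n\n\n@op(out={"evens": Out(list, description="Even inputs"), "odds": Out(list, is_required=False)})\ndef split_parity(numbers: list):\n    evens = [n for n in numbers if n % 2 == 0]\n    odds = [n for n in numbers if n % 2 == 1]\n    yield Output(evens, output_name="evens")\n    if odds:\n        yield Output(odds, output_name="odds")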
\n\n\n
[docs]class DynamicOut(Out):\n """\n Variant of :py:class:`Out <dagster.Out>` for an output that will dynamically alter the graph at\n runtime.\n\n When used in a composition function such as :py:func:`@graph <dagster.graph>`,\n dynamic outputs must be used with either\n\n * ``map`` - clone downstream ops for each separate :py:class:`DynamicOut`\n * ``collect`` - gather across all :py:class:`DynamicOut` into a list\n\n Uses the same constructor as :py:class:`Out <dagster.Out>`.\n\n .. code-block:: python\n\n @op(\n config_schema={\n "path": Field(str, default_value=file_relative_path(__file__, "sample"))\n },\n out=DynamicOut(str),\n )\n def files_in_directory(context):\n path = context.op_config["path"]\n dirname, _, filenames = next(os.walk(path))\n for file in filenames:\n yield DynamicOutput(os.path.join(dirname, file), mapping_key=_clean(file))\n\n @job\n def process_directory():\n files = files_in_directory()\n\n # use map to invoke an op on each dynamic output\n file_results = files.map(process_file)\n\n # use collect to gather the results into a list\n summarize_directory(file_results.collect())\n """\n\n def to_definition(self, annotation_type: type, name: Optional[str]) -> "OutputDefinition":\n dagster_type = (\n self.dagster_type if self.dagster_type is not NoValueSentinel else annotation_type\n )\n\n return DynamicOutputDefinition(\n dagster_type=dagster_type,\n name=name,\n description=self.description,\n is_required=self.is_required,\n io_manager_key=self.io_manager_key,\n metadata=self.metadata,\n asset_key=self.asset_key,\n asset_partitions=self.asset_partitions,\n )
\n\n\n
[docs]class GraphOut(NamedTuple("_GraphOut", [("description", Optional[str])])):\n """\n Represents information about the outputs that a graph maps.\n\n Args:\n description (Optional[str]): Human-readable description of the output.\n """\n\n def __new__(cls, description: Optional[str] = None):\n return super(GraphOut, cls).__new__(cls, description=description)\n\n def to_definition(self, name: Optional[str]) -> "OutputDefinition":\n return OutputDefinition(name=name, description=self.description)
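\n\n# A minimal usage sketch (standalone user code, not part of this module; the op and graph names\n# are hypothetical): mapping the outputs of ops inside a graph to named graph outputs with\n# GraphOut by returning a dict keyed by output name from the composition function.\nfrom dagster import GraphOut, graph, op\n\n\n@op\ndef return_one():\n    return 1\n\n\n@op\ndef return_two():\n    return 2\n\n\n@graph(out={"one": GraphOut(), "two": GraphOut(description="The second value")})\ndef one_and_two():\n    return {"one": return_one(), "two": return_two()}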
\n
", "current_page_name": "_modules/dagster/core/definitions/output", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.output"}, "partition": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.partition

\nimport copy\nimport inspect\nfrom abc import ABC, abstractmethod\nfrom datetime import datetime, time, timedelta\nfrom enum import Enum\nfrom typing import Any, Callable, Dict, Generic, List, NamedTuple, Optional, TypeVar, Union, cast\n\nimport pendulum\nfrom dateutil.relativedelta import relativedelta\n\nfrom dagster import check\nfrom dagster.serdes import whitelist_for_serdes\n\nfrom ...seven.compat.pendulum import PendulumDateTime, to_timezone\nfrom ...utils import frozenlist, merge_dicts\nfrom ...utils.schedules import schedule_execution_time_iterator\nfrom ..decorator_utils import get_function_params\nfrom ..errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n    DagsterUnknownPartitionError,\n    ScheduleExecutionError,\n    user_code_error_boundary,\n)\nfrom ..storage.pipeline_run import PipelineRun\nfrom ..storage.tags import check_tags\nfrom .mode import DEFAULT_MODE_NAME\nfrom .run_request import RunRequest, SkipReason\nfrom .schedule_definition import (\n    DefaultScheduleStatus,\n    ScheduleDefinition,\n    ScheduleEvaluationContext,\n)\nfrom .utils import check_valid_name\n\nDEFAULT_DATE_FORMAT = "%Y-%m-%d"\n\nT = TypeVar("T")\n\n\n
[docs]class Partition(Generic[T]):\n """\n A Partition represents a single slice of the entire set of a job's possible work. It consists\n of a value, which is an object that represents that partition, and an optional name, which is\n used to label the partition in a human-readable way.\n\n Args:\n value (Any): The object for this partition.\n name (Optional[str]): Name for this partition. Defaults to ``str(value)``.\n """\n\n def __init__(self, value: T, name: Optional[str] = None):\n self._value = value\n self._name = cast(str, check.opt_str_param(name, "name", str(value)))\n\n @property\n def value(self) -> T:\n return self._value\n\n @property\n def name(self) -> str:\n return self._name\n\n def __eq__(self, other) -> bool:\n return (\n isinstance(other, Partition) and self.value == other.value and self.name == other.name\n )
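\n\n# A minimal usage sketch (the values below are hypothetical): a Partition pairs an arbitrary\n# value with a human-readable name, and the name defaults to str(value) when omitted.\njan_first = Partition(value=datetime(2022, 1, 1), name="2022-01-01")\nregion = Partition("us-west")  # name defaults to str(value), here "us-west"\nassert region.name == "us-west" and jan_first.name == "2022-01-01"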
\n\n\ndef schedule_partition_range(\n start: datetime,\n end: Optional[datetime],\n cron_schedule: str,\n fmt: str,\n timezone: Optional[str],\n execution_time_to_partition_fn: Callable,\n current_time: Optional[datetime],\n) -> List[Partition[datetime]]:\n if end and start > end:\n raise DagsterInvariantViolationError(\n 'Selected date range start "{start}" is after date range end "{end}'.format(\n start=start.strftime(fmt),\n end=end.strftime(fmt),\n )\n )\n\n tz = timezone if timezone else "UTC"\n\n _current_time = current_time if current_time else pendulum.now(tz)\n\n # Coerce to the definition timezone\n _start = (\n to_timezone(start, tz)\n if isinstance(start, PendulumDateTime)\n else pendulum.instance(start, tz=tz)\n )\n _current_time = (\n to_timezone(_current_time, tz)\n if isinstance(_current_time, PendulumDateTime)\n else pendulum.instance(_current_time, tz=tz)\n )\n\n # The end partition time should be before the last partition that\n # executes before the current time\n end_partition_time = execution_time_to_partition_fn(_current_time)\n\n # The partition set has an explicit end time that represents the end of the partition range\n if end:\n _end = (\n to_timezone(end, tz)\n if isinstance(end, PendulumDateTime)\n else pendulum.instance(end, tz=tz)\n )\n\n # If the explicit end time is before the last partition time,\n # update the end partition time\n end_partition_time = min(_end, end_partition_time)\n\n end_timestamp = end_partition_time.timestamp()\n\n partitions: List[Partition[datetime]] = []\n for next_time in schedule_execution_time_iterator(_start.timestamp(), cron_schedule, tz):\n\n partition_time = execution_time_to_partition_fn(next_time)\n\n if partition_time.timestamp() > end_timestamp:\n break\n\n if partition_time.timestamp() < _start.timestamp():\n continue\n\n partitions.append(Partition(value=partition_time, name=partition_time.strftime(fmt)))\n\n return partitions\n\n\n@whitelist_for_serdes\nclass ScheduleType(Enum):\n HOURLY = "HOURLY"\n DAILY = "DAILY"\n WEEKLY = "WEEKLY"\n MONTHLY = "MONTHLY"\n\n @property\n def ordinal(self):\n return {"HOURLY": 1, "DAILY": 2, "WEEKLY": 3, "MONTHLY": 4}[self.value]\n\n @property\n def delta(self):\n if self == ScheduleType.HOURLY:\n return timedelta(hours=1)\n elif self == ScheduleType.DAILY:\n return timedelta(days=1)\n elif self == ScheduleType.WEEKLY:\n return timedelta(weeks=1)\n elif self == ScheduleType.MONTHLY:\n return relativedelta(months=1)\n else:\n check.failed(f"Unexpected ScheduleType {self}")\n\n def __gt__(self, other):\n return self.ordinal > other.ordinal\n\n def __lt__(self, other):\n return self.ordinal < other.ordinal\n\n\nclass PartitionsDefinition(ABC, Generic[T]):\n @abstractmethod\n def get_partitions(self, current_time: Optional[datetime] = None) -> List[Partition[T]]:\n ...\n\n def __str__(self) -> str:\n joined_keys = ", ".join([f"'{key}'" for key in self.get_partition_keys()])\n return joined_keys\n\n def get_partition_keys(self, current_time: Optional[datetime] = None) -> List[str]:\n return [partition.name for partition in self.get_partitions(current_time)]\n\n def get_default_partition_mapping(self):\n from dagster.core.asset_defs.partition_mapping import IdentityPartitionMapping\n\n return IdentityPartitionMapping()\n\n\nclass StaticPartitionsDefinition(\n PartitionsDefinition[str],\n): # pylint: disable=unsubscriptable-object\n def __init__(self, partition_keys: List[str]):\n check.list_param(partition_keys, "partition_keys", of_type=str)\n\n # Dagit selects partition ranges following 
the format '2022-01-13...2022-01-14'\n # "..." is an invalid substring in partition keys\n if any(["..." in partition_key for partition_key in partition_keys]):\n raise DagsterInvalidDefinitionError("'...' is an invalid substring in a partition key")\n\n self._partitions = [Partition(key) for key in partition_keys]\n\n def get_partitions(\n self, current_time: Optional[datetime] = None # pylint: disable=unused-argument\n ) -> List[Partition[str]]:\n return self._partitions\n\n def __eq__(self, other) -> bool:\n return (\n isinstance(other, StaticPartitionsDefinition)\n and self._partitions == other.get_partitions()\n )\n\n def __repr__(self) -> str:\n return f"{type(self).__name__}(partition_keys={[p.name for p in self._partitions]})"\n\n\nclass ScheduleTimeBasedPartitionsDefinition(\n PartitionsDefinition[datetime], # pylint: disable=unsubscriptable-object\n NamedTuple(\n "_ScheduleTimeBasedPartitionsDefinition",\n [\n ("schedule_type", ScheduleType),\n ("start", datetime),\n ("execution_time", time),\n ("execution_day", Optional[int]),\n ("end", Optional[datetime]),\n ("fmt", str),\n ("timezone", Optional[str]),\n ("offset", Optional[int]),\n ],\n ),\n):\n """Computes the partitions backwards from the scheduled execution times"""\n\n def __new__(\n cls,\n schedule_type: ScheduleType,\n start: datetime,\n execution_time: Optional[time] = None,\n execution_day: Optional[int] = None,\n end: Optional[datetime] = None,\n fmt: Optional[str] = None,\n timezone: Optional[str] = None,\n offset: Optional[int] = None,\n ):\n if end is not None:\n check.invariant(\n start <= end,\n f'Selected date range start "{start}" '\n f'is after date range end "{end}"'.format(\n start=start.strftime(fmt) if fmt is not None else start,\n end=cast(datetime, end).strftime(fmt) if fmt is not None else end,\n ),\n )\n if schedule_type in [ScheduleType.HOURLY, ScheduleType.DAILY]:\n check.invariant(\n not execution_day,\n f'Execution day should not be provided for schedule type "{schedule_type}"',\n )\n elif schedule_type is ScheduleType.WEEKLY:\n execution_day = execution_day if execution_day is not None else 0\n check.invariant(\n execution_day is not None and 0 <= execution_day <= 6,\n f'Execution day "{execution_day}" must be between 0 and 6 for '\n f'schedule type "{schedule_type}"',\n )\n elif schedule_type is ScheduleType.MONTHLY:\n execution_day = execution_day if execution_day is not None else 1\n check.invariant(\n execution_day is not None and 1 <= execution_day <= 31,\n f'Execution day "{execution_day}" must be between 1 and 31 for '\n f'schedule type "{schedule_type}"',\n )\n\n return super(ScheduleTimeBasedPartitionsDefinition, cls).__new__(\n cls,\n check.inst_param(schedule_type, "schedule_type", ScheduleType),\n check.inst_param(start, "start", datetime),\n check.opt_inst_param(execution_time, "execution_time", time, time(0, 0)),\n check.opt_int_param(\n execution_day,\n "execution_day",\n ),\n check.opt_inst_param(end, "end", datetime),\n cast(str, check.opt_str_param(fmt, "fmt", default=DEFAULT_DATE_FORMAT)),\n check.opt_str_param(timezone, "timezone", default="UTC"),\n check.opt_int_param(offset, "offset", default=1),\n )\n\n def get_partitions(self, current_time: Optional[datetime] = None) -> List[Partition[datetime]]:\n check.opt_inst_param(current_time, "current_time", datetime)\n\n return schedule_partition_range(\n start=self.start,\n end=self.end,\n cron_schedule=self.get_cron_schedule(),\n fmt=self.fmt,\n timezone=self.timezone,\n 
execution_time_to_partition_fn=self.get_execution_time_to_partition_fn(),\n current_time=current_time,\n )\n\n def get_cron_schedule(self) -> str:\n return get_cron_schedule(self.schedule_type, self.execution_time, self.execution_day)\n\n def get_execution_time_to_partition_fn(self) -> Callable[[datetime], datetime]:\n if self.schedule_type is ScheduleType.HOURLY:\n # Using subtract(minutes=d.minute) here instead of .replace(minute=0) because on\n # pendulum 1, replace(minute=0) sometimes changes the timezone:\n # >>> a = create_pendulum_time(2021, 11, 7, 0, 0, tz="US/Central")\n #\n # >>> a.add(hours=1)\n # <Pendulum [2021-11-07T01:00:00-05:00]>\n # >>> a.add(hours=1).replace(minute=0)\n # <Pendulum [2021-11-07T01:00:00-06:00]>\n return lambda d: pendulum.instance(d).subtract(hours=self.offset, minutes=d.minute)\n elif self.schedule_type is ScheduleType.DAILY:\n return (\n lambda d: pendulum.instance(d).replace(hour=0, minute=0).subtract(days=self.offset)\n )\n elif self.schedule_type is ScheduleType.WEEKLY:\n execution_day = cast(int, self.execution_day)\n day_difference = (execution_day - (self.start.weekday() + 1)) % 7\n return (\n lambda d: pendulum.instance(d)\n .replace(hour=0, minute=0)\n .subtract(\n weeks=self.offset,\n days=day_difference,\n )\n )\n elif self.schedule_type is ScheduleType.MONTHLY:\n execution_day = cast(int, self.execution_day)\n return (\n lambda d: pendulum.instance(d)\n .replace(hour=0, minute=0)\n .subtract(months=self.offset, days=execution_day - 1)\n )\n else:\n check.assert_never(self.schedule_type)\n\n\nclass DynamicPartitionsDefinition(\n PartitionsDefinition,\n NamedTuple(\n "_DynamicPartitionsDefinition",\n [("partition_fn", Callable[[Optional[datetime]], Union[List[Partition], List[str]]])],\n ),\n):\n def __new__(\n cls, partition_fn: Callable[[Optional[datetime]], Union[List[Partition], List[str]]]\n ):\n return super(DynamicPartitionsDefinition, cls).__new__(\n cls, check.callable_param(partition_fn, "partition_fn")\n )\n\n def get_partitions(self, current_time: Optional[datetime] = None) -> List[Partition]:\n partitions = self.partition_fn(current_time)\n if all(isinstance(partition, Partition) for partition in partitions):\n return cast(List[Partition], partitions)\n else:\n return [Partition(p) for p in partitions]\n\n\n
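# A minimal usage sketch (the partition keys are hypothetical): a StaticPartitionsDefinition\n# enumerates a fixed set of partition keys, while a DynamicPartitionsDefinition computes them\n# from a function each time partitions are requested.\nstatic_example = StaticPartitionsDefinition(["us", "eu", "apac"])\nassert static_example.get_partition_keys() == ["us", "eu", "apac"]\n\ndynamic_example = DynamicPartitionsDefinition(lambda _current_time: ["a", "b"])\nassert [p.name for p in dynamic_example.get_partitions()] == ["a", "b"]\n\n\n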
[docs]class PartitionSetDefinition(Generic[T]):\n """\n Defines a partition set, representing the set of slices making up an axis of a pipeline\n\n Args:\n name (str): Name for this partition set\n pipeline_name (str): The name of the pipeline definition\n partition_fn (Optional[Callable[void, List[Partition]]]): User-provided function to define\n the set of valid partition objects.\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute with this partition. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The mode to apply when executing this partition. (default: 'default')\n run_config_fn_for_partition (Callable[[Partition], Any]): A\n function that takes a :py:class:`~dagster.Partition` and returns the run\n configuration that parameterizes the execution for this partition.\n tags_fn_for_partition (Callable[[Partition], Optional[dict[str, str]]]): A function that\n takes a :py:class:`~dagster.Partition` and returns a list of key value pairs that will\n be added to the generated run for this partition.\n partitions_def (Optional[PartitionsDefinition]): A set of parameters used to construct the set\n of valid partition objects.\n """\n\n def __init__(\n self,\n name: str,\n pipeline_name: Optional[str] = None,\n partition_fn: Optional[Callable[..., Union[List[Partition[T]], List[str]]]] = None,\n solid_selection: Optional[List[str]] = None,\n mode: Optional[str] = None,\n run_config_fn_for_partition: Callable[[Partition[T]], Any] = lambda _partition: {},\n tags_fn_for_partition: Callable[\n [Partition[T]], Optional[Dict[str, str]]\n ] = lambda _partition: {},\n partitions_def: Optional[\n PartitionsDefinition[T] # pylint: disable=unsubscriptable-object\n ] = None,\n job_name: Optional[str] = None,\n ):\n check.invariant(\n partition_fn is not None or partitions_def is not None,\n "One of `partition_fn` or `partitions_def` must be supplied.",\n )\n check.invariant(\n not (partition_fn and partitions_def),\n "Only one of `partition_fn` or `partitions_def` must be supplied.",\n )\n check.invariant(\n (pipeline_name or job_name) and not (pipeline_name and job_name),\n "Exactly one one of `job_name` and `pipeline_name` must be supplied.",\n )\n\n _wrap_partition_fn = None\n\n if partition_fn is not None:\n partition_fn_param_count = len(inspect.signature(partition_fn).parameters)\n\n def _wrap_partition(x: Union[str, Partition]) -> Partition:\n if isinstance(x, Partition):\n return x\n if isinstance(x, str):\n return Partition(x)\n raise DagsterInvalidDefinitionError(\n "Expected <Partition> | <str>, received {type}".format(type=type(x))\n )\n\n def _wrap_partition_fn(current_time=None) -> List[Partition]:\n if not current_time:\n current_time = pendulum.now("UTC")\n\n check.callable_param(partition_fn, "partition_fn")\n\n if partition_fn_param_count == 1:\n obj_list = cast(\n Callable[..., List[Union[Partition[T], str]]],\n partition_fn,\n )(current_time)\n else:\n obj_list = partition_fn() # type: ignore\n\n return [_wrap_partition(obj) for obj in obj_list]\n\n self._name = check_valid_name(name)\n self._pipeline_name = check.opt_str_param(pipeline_name, "pipeline_name")\n self._job_name = check.opt_str_param(job_name, "job_name")\n self._partition_fn = _wrap_partition_fn\n self._solid_selection = check.opt_nullable_list_param(\n solid_selection, "solid_selection", of_type=str\n )\n self._mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)\n self._user_defined_run_config_fn_for_partition = 
check.callable_param(\n run_config_fn_for_partition, "run_config_fn_for_partition"\n )\n self._user_defined_tags_fn_for_partition = check.callable_param(\n tags_fn_for_partition, "tags_fn_for_partition"\n )\n check.opt_inst_param(partitions_def, "partitions_def", PartitionsDefinition)\n if partitions_def is not None:\n self._partitions_def = partitions_def\n else:\n if partition_fn is None:\n check.failed("One of `partition_fn` or `partitions_def` must be supplied.")\n self._partitions_def = DynamicPartitionsDefinition(partition_fn=_wrap_partition_fn)\n\n @property\n def name(self):\n return self._name\n\n @property\n def pipeline_name(self):\n return self._pipeline_name\n\n @property\n def job_name(self):\n return self._job_name\n\n @property\n def pipeline_or_job_name(self) -> str:\n # one is guaranteed to be set\n return cast(str, self._pipeline_name or self._job_name)\n\n @property\n def solid_selection(self):\n return self._solid_selection\n\n @property\n def mode(self):\n return self._mode\n\n def run_config_for_partition(self, partition: Partition[T]) -> Dict[str, Any]:\n return copy.deepcopy(self._user_defined_run_config_fn_for_partition(partition))\n\n def tags_for_partition(self, partition: Partition[T]) -> Dict[str, str]:\n user_tags = copy.deepcopy(self._user_defined_tags_fn_for_partition(partition))\n check_tags(user_tags, "user_tags")\n\n tags = merge_dicts(user_tags, PipelineRun.tags_for_partition_set(self, partition))\n\n return tags\n\n
[docs] def get_partitions(self, current_time: Optional[datetime] = None) -> List[Partition[T]]:\n """Return the set of known partitions.\n\n Arguments:\n current_time (Optional[datetime]): The evaluation time for the partition function, which\n is passed through to the ``partition_fn`` (if it accepts a parameter). Defaults to\n the current time in UTC.\n """\n return self._partitions_def.get_partitions(current_time)
\n\n def get_partition(self, name: str) -> Partition[T]:\n for partition in self.get_partitions():\n if partition.name == name:\n return partition\n\n raise DagsterUnknownPartitionError(f"Could not find a partition with key `{name}`")\n\n def get_partition_names(self, current_time: Optional[datetime] = None) -> List[str]:\n return [part.name for part in self.get_partitions(current_time)]\n\n
[docs] def create_schedule_definition(\n self,\n schedule_name,\n cron_schedule,\n partition_selector,\n should_execute=None,\n environment_vars=None,\n execution_timezone=None,\n description=None,\n decorated_fn=None,\n job=None,\n default_status=DefaultScheduleStatus.STOPPED,\n ):\n """Create a ScheduleDefinition from a PartitionSetDefinition.\n\n Arguments:\n schedule_name (str): The name of the schedule.\n cron_schedule (str): A valid cron string for the schedule\n partition_selector (Callable[ScheduleEvaluationContext, PartitionSetDefinition], Union[Partition, List[Partition]]):\n Function that determines the partition to use at a given execution time. Can return\n either a single Partition or a list of Partitions. For time-based partition sets,\n will likely be either `identity_partition_selector` or a selector returned by\n `create_offset_partition_selector`.\n should_execute (Optional[function]): Function that runs at schedule execution time that\n determines whether a schedule should execute. Defaults to a function that always returns\n ``True``.\n environment_vars (Optional[dict]): The environment variables to set for the schedule.\n execution_timezone (Optional[str]): Timezone in which the schedule should run.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n description (Optional[str]): A human-readable description of the schedule.\n default_status (DefaultScheduleStatus): Whether the schedule starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n\n Returns:\n PartitionScheduleDefinition: The generated PartitionScheduleDefinition for the partition\n selector\n """\n\n check.str_param(schedule_name, "schedule_name")\n check.str_param(cron_schedule, "cron_schedule")\n check.opt_callable_param(should_execute, "should_execute")\n check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str)\n check.callable_param(partition_selector, "partition_selector")\n check.opt_str_param(execution_timezone, "execution_timezone")\n check.opt_str_param(description, "description")\n check.inst_param(default_status, "default_status", DefaultScheduleStatus)\n\n def _execution_fn(context):\n check.inst_param(context, "context", ScheduleEvaluationContext)\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the execution of partition_selector for schedule {schedule_name}",\n ):\n selector_result = partition_selector(context, self)\n\n if isinstance(selector_result, SkipReason):\n yield selector_result\n return\n\n selected_partitions = (\n selector_result\n if isinstance(selector_result, (frozenlist, list))\n else [selector_result]\n )\n\n check.is_list(selected_partitions, of_type=Partition)\n\n if not selected_partitions:\n yield SkipReason("Partition selector returned an empty list of partitions.")\n return\n\n missing_partition_names = [\n partition.name\n for partition in selected_partitions\n if partition.name not in self.get_partition_names(context.scheduled_execution_time)\n ]\n\n if missing_partition_names:\n yield SkipReason(\n "Partition selector returned partition"\n + ("s" if len(missing_partition_names) > 1 else "")\n + f" not in the partition set: {', '.join(missing_partition_names)}."\n )\n return\n\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the execution of should_execute for schedule {schedule_name}",\n ):\n if 
should_execute and not should_execute(context):\n yield SkipReason(\n "should_execute function for {schedule_name} returned false.".format(\n schedule_name=schedule_name\n )\n )\n return\n\n for selected_partition in selected_partitions:\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the execution of run_config_fn for schedule {schedule_name}",\n ):\n run_config = self.run_config_for_partition(selected_partition)\n\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the execution of tags_fn for schedule {schedule_name}",\n ):\n tags = self.tags_for_partition(selected_partition)\n yield RunRequest(\n run_key=selected_partition.name if len(selected_partitions) > 0 else None,\n run_config=run_config,\n tags=tags,\n )\n\n return PartitionScheduleDefinition(\n name=schedule_name,\n cron_schedule=cron_schedule,\n pipeline_name=self._pipeline_name,\n tags_fn=None,\n solid_selection=self._solid_selection,\n mode=self._mode,\n should_execute=None,\n environment_vars=environment_vars,\n partition_set=self,\n execution_timezone=execution_timezone,\n execution_fn=_execution_fn,\n description=description,\n decorated_fn=decorated_fn,\n job=job,\n default_status=default_status,\n )
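\n\n# A minimal usage sketch (the pipeline name, partition keys, and config shape below are\n# hypothetical): a partition set that parameterizes runs of a pipeline by a date string.\nexample_partition_set = PartitionSetDefinition(\n    name="date_partitions",\n    pipeline_name="my_data_pipeline",\n    partition_fn=lambda: ["2022-01-01", "2022-01-02", "2022-01-03"],\n    run_config_fn_for_partition=lambda partition: {\n        "solids": {"ingest": {"config": {"date": partition.value}}}\n    },\n)\nassert example_partition_set.get_partition_names() == [\n    "2022-01-01",\n    "2022-01-02",\n    "2022-01-03",\n]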
\n\n\n
[docs]class PartitionScheduleDefinition(ScheduleDefinition):\n __slots__ = ["_partition_set"]\n\n def __init__(\n self,\n name,\n cron_schedule,\n pipeline_name,\n tags_fn,\n solid_selection,\n mode,\n should_execute,\n environment_vars,\n partition_set,\n run_config_fn=None,\n execution_timezone=None,\n execution_fn=None,\n description=None,\n decorated_fn=None,\n job=None,\n default_status=DefaultScheduleStatus.STOPPED,\n ):\n super(PartitionScheduleDefinition, self).__init__(\n name=check_valid_name(name),\n cron_schedule=cron_schedule,\n pipeline_name=pipeline_name,\n run_config_fn=run_config_fn,\n tags_fn=tags_fn,\n solid_selection=solid_selection,\n mode=mode,\n should_execute=should_execute,\n environment_vars=environment_vars,\n execution_timezone=execution_timezone,\n execution_fn=execution_fn,\n description=description,\n job=job,\n default_status=default_status,\n )\n self._partition_set = check.inst_param(\n partition_set, "partition_set", PartitionSetDefinition\n )\n self._decorated_fn = check.opt_callable_param(decorated_fn, "decorated_fn")\n\n def __call__(self, *args, **kwargs):\n if not self._decorated_fn:\n raise DagsterInvalidInvocationError(\n "Only partition schedules created using one of the partition schedule decorators "\n "can be directly invoked."\n )\n if len(args) == 0 and len(kwargs) == 0:\n raise DagsterInvalidInvocationError(\n "Schedule decorated function has date argument, but no date argument was "\n "provided when invoking."\n )\n if len(args) + len(kwargs) > 1:\n raise DagsterInvalidInvocationError(\n "Schedule invocation received multiple arguments. Only a first "\n "positional date parameter should be provided when invoking."\n )\n\n date_param_name = get_function_params(self._decorated_fn)[0].name\n\n if args:\n date = check.opt_inst_param(args[0], date_param_name, datetime)\n else:\n if date_param_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Schedule invocation expected argument '{date_param_name}'."\n )\n date = check.opt_inst_param(kwargs[date_param_name], date_param_name, datetime)\n\n return self._decorated_fn(date)\n\n def get_partition_set(self):\n return self._partition_set
\n\n\n
[docs]class PartitionedConfig(Generic[T]):\n """Defines a way of configuring a job where the job can be run on one of a discrete set of\n partitions, and each partition corresponds to run configuration for the job.\n\n Setting PartitionedConfig as the config for a job allows you to launch backfills for that job\n and view the run history across partitions.\n """\n\n def __init__(\n self,\n partitions_def: PartitionsDefinition[T], # pylint: disable=unsubscriptable-object\n run_config_for_partition_fn: Callable[[Partition[T]], Dict[str, Any]],\n decorated_fn: Optional[Callable[..., Dict[str, Any]]] = None,\n tags_for_partition_fn: Optional[Callable[[Partition[T]], Dict[str, str]]] = None,\n ):\n self._partitions = check.inst_param(partitions_def, "partitions_def", PartitionsDefinition)\n self._run_config_for_partition_fn = check.callable_param(\n run_config_for_partition_fn, "run_config_for_partition_fn"\n )\n self._decorated_fn = decorated_fn\n self._tags_for_partition_fn = check.opt_callable_param(\n tags_for_partition_fn, "tags_for_partition_fn"\n )\n\n @property\n def partitions_def(self) -> PartitionsDefinition[T]: # pylint: disable=unsubscriptable-object\n return self._partitions\n\n @property\n def run_config_for_partition_fn(self) -> Callable[[Partition[T]], Dict[str, Any]]:\n return self._run_config_for_partition_fn\n\n @property\n def tags_for_partition_fn(self) -> Optional[Callable[[Partition[T]], Dict[str, str]]]:\n return self._tags_for_partition_fn\n\n def get_partition_keys(self, current_time: Optional[datetime] = None) -> List[str]:\n return [partition.name for partition in self.partitions_def.get_partitions(current_time)]\n\n
[docs] def get_run_config_for_partition_key(self, partition_key: str) -> Dict[str, Any]:\n """Generates the run config corresponding to a partition key.\n\n Args:\n partition_key (str): the key for a partition that should be used to generate a run config.\n """\n partitions = self.partitions_def.get_partitions()\n partition = [p for p in partitions if p.name == partition_key]\n if len(partition) == 0:\n raise DagsterInvalidInvocationError(f"No partition for partition key {partition_key}.")\n return self.run_config_for_partition_fn(partition[0])
\n\n def __call__(self, *args, **kwargs):\n if self._decorated_fn is None:\n raise DagsterInvalidInvocationError(\n "Only PartitionedConfig objects created using one of the partitioned config "\n "decorators can be directly invoked."\n )\n else:\n return self._decorated_fn(*args, **kwargs)
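\n\n# A minimal usage sketch (the op name and config shape are hypothetical): a PartitionedConfig\n# pairs a partitions definition with a function that turns each partition into run config. In\n# user code this is usually built with the static_partitioned_config or\n# dynamic_partitioned_config decorators below rather than constructed directly.\nletters_config = PartitionedConfig(\n    partitions_def=StaticPartitionsDefinition(["a", "b", "c"]),\n    run_config_for_partition_fn=lambda partition: {\n        "ops": {"process_letter": {"config": {"letter": partition.value}}}\n    },\n)\nassert letters_config.get_partition_keys() == ["a", "b", "c"]\nassert letters_config.get_run_config_for_partition_key("b") == {\n    "ops": {"process_letter": {"config": {"letter": "b"}}}\n}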
\n\n\n
[docs]def static_partitioned_config(\n partition_keys: List[str],\n tags_for_partition_fn: Optional[Callable[[str], Dict[str, str]]] = None,\n) -> Callable[[Callable[[str], Dict[str, Any]]], PartitionedConfig]:\n """Creates a static partitioned config for a job.\n\n The provided partition_keys is a static list of strings identifying the set of partitions. The\n list of partitions is static, so while the run config returned by the decorated function may\n change over time, the list of valid partition keys does not.\n\n This has performance advantages over `dynamic_partitioned_config` in terms of loading different\n partition views in Dagit.\n\n The decorated function takes in a partition key and returns a valid run config for a particular\n target job.\n\n Args:\n partition_keys (List[str]): A list of valid partition keys, which serve as the range of\n values that can be provided to the decorated run config function.\n\n Returns:\n PartitionedConfig\n """\n check.list_param(partition_keys, "partition_keys", str)\n\n def inner(fn: Callable[[str], Dict[str, Any]]) -> PartitionedConfig:\n check.callable_param(fn, "fn")\n\n def _run_config_wrapper(partition: Partition[T]) -> Dict[str, Any]:\n return fn(partition.name)\n\n def _tag_wrapper(partition: Partition[T]) -> Dict[str, str]:\n return tags_for_partition_fn(partition.name) if tags_for_partition_fn else {}\n\n return PartitionedConfig(\n partitions_def=StaticPartitionsDefinition(partition_keys),\n run_config_for_partition_fn=_run_config_wrapper,\n decorated_fn=fn,\n tags_for_partition_fn=_tag_wrapper,\n )\n\n return inner
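\n\n# A minimal usage sketch (the partition keys, op name, and config shape are hypothetical): the\n# decorated function receives a partition key and returns the run config for that partition.\nCONTINENTS = ["africa", "asia", "europe"]\n\n\n@static_partitioned_config(partition_keys=CONTINENTS)\ndef continent_config(partition_key: str):\n    return {"ops": {"continent_op": {"config": {"continent_name": partition_key}}}}\n\n\nassert continent_config.get_partition_keys() == CONTINENTS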
\n\n\n
[docs]def dynamic_partitioned_config(\n partition_fn: Callable[[Optional[datetime]], List[str]],\n tags_for_partition_fn: Optional[Callable[[str], Dict[str, str]]] = None,\n) -> Callable[[Callable[[str], Dict[str, Any]]], PartitionedConfig]:\n """Creates a dynamic partitioned config for a job.\n\n The provided partition_fn returns a list of strings identifying the set of partitions, given\n an optional datetime argument (representing the current time). The list of partitions returned\n may change over time.\n\n The decorated function takes in a partition key and returns a valid run config for a particular\n target job.\n\n Args:\n partition_fn (Callable[[datetime.datetime], Sequence[str]]): A function that generates a\n list of valid partition keys, which serve as the range of values that can be provided\n to the decorated run config function.\n\n Returns:\n PartitionedConfig\n """\n check.callable_param(partition_fn, "partition_fn")\n\n def inner(fn: Callable[[str], Dict[str, Any]]) -> PartitionedConfig:\n def _run_config_wrapper(partition: Partition[T]) -> Dict[str, Any]:\n return fn(partition.name)\n\n def _tag_wrapper(partition: Partition[T]) -> Dict[str, str]:\n return tags_for_partition_fn(partition.name) if tags_for_partition_fn else {}\n\n return PartitionedConfig(\n partitions_def=DynamicPartitionsDefinition(partition_fn),\n run_config_for_partition_fn=_run_config_wrapper,\n decorated_fn=fn,\n tags_for_partition_fn=_tag_wrapper,\n )\n\n return inner
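\n\n# A minimal usage sketch (the partition-key function and config shape are hypothetical):\n# partition keys are recomputed each time they are requested, so the set of valid partitions\n# can change between evaluations.\ndef discover_dates(_current_time=None):\n    # In real usage this might list dates, customers, or files discovered at runtime.\n    return ["2022-01-01", "2022-01-02"]\n\n\n@dynamic_partitioned_config(partition_fn=discover_dates)\ndef date_config(partition_key: str):\n    return {"ops": {"ingest": {"config": {"date": partition_key}}}}\n\n\nassert date_config.get_partition_keys() == ["2022-01-01", "2022-01-02"]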
\n\n\ndef get_cron_schedule(\n schedule_type: ScheduleType,\n time_of_day: time = time(0, 0),\n execution_day: Optional[int] = None,\n) -> str:\n minute = time_of_day.minute\n hour = time_of_day.hour\n\n if schedule_type is ScheduleType.HOURLY:\n return f"{minute} * * * *"\n elif schedule_type is ScheduleType.DAILY:\n return f"{minute} {hour} * * *"\n elif schedule_type is ScheduleType.WEEKLY:\n return f"{minute} {hour} * * {execution_day if execution_day is not None else 0}"\n elif schedule_type is ScheduleType.MONTHLY:\n return f"{minute} {hour} {execution_day if execution_day is not None else 1} * *"\n else:\n check.assert_never(schedule_type)\n
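\n# Worked examples (illustrative) of the cron strings produced by get_cron_schedule above:\nassert get_cron_schedule(ScheduleType.HOURLY, time(minute=30)) == "30 * * * *"\nassert get_cron_schedule(ScheduleType.DAILY, time(hour=2, minute=15)) == "15 2 * * *"\nassert get_cron_schedule(ScheduleType.WEEKLY, time(hour=1), execution_day=3) == "0 1 * * 3"\nassert get_cron_schedule(ScheduleType.MONTHLY, time(hour=1), execution_day=5) == "0 1 5 * *"\n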
", "current_page_name": "_modules/dagster/core/definitions/partition", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.partition"}, "partitioned_schedule": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.partitioned_schedule

\nfrom datetime import time\nfrom typing import Optional, Union, cast\n\nfrom dagster import check\n\nfrom .job_definition import JobDefinition\nfrom .partition import (\n    Partition,\n    PartitionSetDefinition,\n    PartitionedConfig,\n    ScheduleType,\n    get_cron_schedule,\n)\nfrom .run_request import SkipReason\nfrom .schedule_definition import (\n    DefaultScheduleStatus,\n    ScheduleDefinition,\n    ScheduleEvaluationContext,\n)\nfrom .time_window_partitions import TimeWindow, TimeWindowPartitionsDefinition\n\n\n
[docs]def build_schedule_from_partitioned_job(\n job: JobDefinition,\n description: Optional[str] = None,\n name: Optional[str] = None,\n minute_of_hour: Optional[int] = None,\n hour_of_day: Optional[int] = None,\n day_of_week: Optional[int] = None,\n day_of_month: Optional[int] = None,\n default_status: DefaultScheduleStatus = DefaultScheduleStatus.STOPPED,\n) -> ScheduleDefinition:\n """\n Creates a schedule from a time window-partitioned job.\n\n The schedule executes at the cadence specified by the partitioning of the given job.\n """\n check.invariant(len(job.mode_definitions) == 1, "job must only have one mode")\n check.invariant(\n job.mode_definitions[0].partitioned_config is not None, "job must be a partitioned job"\n )\n check.invariant(\n not (day_of_week and day_of_month),\n "Cannot provide both day_of_month and day_of_week parameter to build_schedule_from_partitioned_job.",\n )\n\n partitioned_config = cast(PartitionedConfig, job.mode_definitions[0].partitioned_config)\n partition_set = cast(PartitionSetDefinition, job.get_partition_set_def())\n\n check.inst(partitioned_config.partitions_def, TimeWindowPartitionsDefinition)\n partitions_def = cast(TimeWindowPartitionsDefinition, partitioned_config.partitions_def)\n\n minute_of_hour = cast(int, check.opt_int_param(minute_of_hour, "minute_of_hour", default=0))\n\n if partitions_def.schedule_type == ScheduleType.HOURLY:\n check.invariant(hour_of_day is None, "Cannot set hour parameter with hourly partitions.")\n\n hour_of_day = cast(int, check.opt_int_param(hour_of_day, "hour_of_day", default=0))\n execution_time = time(minute=minute_of_hour, hour=hour_of_day)\n\n if partitions_def.schedule_type == ScheduleType.DAILY:\n check.invariant(\n day_of_week is None, "Cannot set day of week parameter with daily partitions."\n )\n check.invariant(\n day_of_month is None, "Cannot set day of month parameter with daily partitions."\n )\n\n if partitions_def.schedule_type == ScheduleType.MONTHLY:\n default = partitions_def.day_offset or 1\n execution_day = check.opt_int_param(day_of_month, "day_of_month", default=default)\n elif partitions_def.schedule_type == ScheduleType.WEEKLY:\n default = partitions_def.day_offset or 0\n execution_day = check.opt_int_param(day_of_week, "day_of_week", default=default)\n else:\n execution_day = 0\n\n cron_schedule = get_cron_schedule(partitions_def.schedule_type, execution_time, execution_day)\n\n schedule_def = partition_set.create_schedule_definition(\n schedule_name=check.opt_str_param(name, "name", f"{job.name}_schedule"),\n cron_schedule=cron_schedule,\n partition_selector=latest_window_partition_selector,\n execution_timezone=partitions_def.timezone,\n description=description,\n job=job,\n default_status=default_status,\n )\n\n return schedule_def
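\n\n# A minimal usage sketch (standalone user code; the op, job, and config shape are hypothetical):\n# a daily-partitioned job and a schedule whose cadence is derived from that partitioning.\nfrom dagster import daily_partitioned_config, job, op\n\n\n@op(config_schema={"date": str})\ndef ingest(context):\n    context.log.info(f"ingesting {context.op_config['date']}")\n\n\n@daily_partitioned_config(start_date="2022-01-01")\ndef ingest_config(start, _end):\n    return {"ops": {"ingest": {"config": {"date": start.strftime("%Y-%m-%d")}}}}\n\n\n@job(config=ingest_config)\ndef ingest_job():\n    ingest()\n\n\ningest_schedule = build_schedule_from_partitioned_job(ingest_job, hour_of_day=1)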
\n\n\nschedule_from_partitions = build_schedule_from_partitioned_job\n\n\ndef latest_window_partition_selector(\n context: ScheduleEvaluationContext, partition_set_def: PartitionSetDefinition[TimeWindow]\n) -> Union[SkipReason, Partition[TimeWindow]]:\n """Creates a selector for partitions that are time windows. Selects the latest partition that\n exists as of the schedule tick time.\n """\n partitions = partition_set_def.get_partitions(context.scheduled_execution_time)\n if len(partitions) == 0:\n return SkipReason()\n else:\n return partitions[-1]\n
", "current_page_name": "_modules/dagster/core/definitions/partitioned_schedule", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.partitioned_schedule"}, "pipeline_definition": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.pipeline_definition

\nfrom functools import update_wrapper\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Dict, FrozenSet, List, Optional, Set, Union\n\nfrom dagster import check\nfrom dagster.core.definitions.policy import RetryPolicy\nfrom dagster.core.definitions.resource_definition import ResourceDefinition\nfrom dagster.core.definitions.solid_definition import NodeDefinition\nfrom dagster.core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidSubsetError,\n    DagsterInvariantViolationError,\n)\nfrom dagster.core.storage.output_manager import IOutputManagerDefinition\nfrom dagster.core.storage.root_input_manager import (\n    IInputManagerDefinition,\n    RootInputManagerDefinition,\n)\nfrom dagster.core.storage.tags import MEMOIZED_RUN_TAG\nfrom dagster.core.types.dagster_type import DagsterType, DagsterTypeKind\nfrom dagster.core.utils import str_format_set\nfrom dagster.utils import frozentags, merge_dicts\nfrom dagster.utils.backcompat import experimental_class_warning\n\nfrom .dependency import (\n    DependencyDefinition,\n    DependencyStructure,\n    DynamicCollectDependencyDefinition,\n    IDependencyDefinition,\n    MultiDependencyDefinition,\n    Node,\n    NodeHandle,\n    NodeInvocation,\n    SolidInputHandle,\n)\nfrom .graph_definition import GraphDefinition, SubselectedGraphDefinition\nfrom .hook_definition import HookDefinition\nfrom .mode import ModeDefinition\nfrom .node_definition import NodeDefinition\nfrom .preset import PresetDefinition\nfrom .utils import validate_tags\nfrom .version_strategy import VersionStrategy\n\nif TYPE_CHECKING:\n    from dagster.core.definitions.partition import PartitionSetDefinition\n    from dagster.core.execution.execute_in_process_result import ExecuteInProcessResult\n    from dagster.core.host_representation import PipelineIndex\n    from dagster.core.instance import DagsterInstance\n    from dagster.core.snap import ConfigSchemaSnapshot, PipelineSnapshot\n\n    from .run_config_schema import RunConfigSchema\n\n\n
[docs]class PipelineDefinition:\n """Defines a Dagster pipeline.\n\n A pipeline is made up of\n\n - Solids, each of which is a single functional unit of data computation.\n - Dependencies, which determine how the values produced by solids as their outputs flow from\n one solid to another. This tells Dagster how to arrange solids, and potentially multiple\n aliased instances of solids, into a directed, acyclic graph (DAG) of compute.\n - Modes, which can be used to attach resources, custom loggers, custom system storage\n options, and custom executors to a pipeline, and to switch between them.\n - Presets, which can be used to ship common combinations of pipeline config options in Python\n code, and to switch between them.\n\n Args:\n solid_defs (List[SolidDefinition]): The set of solids used in this pipeline.\n name (str): The name of the pipeline. Must be unique within any\n :py:class:`RepositoryDefinition` containing the pipeline.\n description (Optional[str]): A human-readable description of the pipeline.\n dependencies (Optional[Dict[Union[str, NodeInvocation], Dict[str, DependencyDefinition]]]):\n A structure that declares the dependencies of each solid's inputs on the outputs of\n other solids in the pipeline. Keys of the top level dict are either the string names of\n solids in the pipeline or, in the case of aliased solids,\n :py:class:`NodeInvocations <NodeInvocation>`. Values of the top level dict are\n themselves dicts, which map input names belonging to the solid or aliased solid to\n :py:class:`DependencyDefinitions <DependencyDefinition>`.\n mode_defs (Optional[List[ModeDefinition]]): The set of modes in which this pipeline can\n operate. Modes are used to attach resources, custom loggers, custom system storage\n options, and custom executors to a pipeline. Modes can be used, e.g., to vary available\n resource and logging implementations between local test and production runs.\n preset_defs (Optional[List[PresetDefinition]]): A set of preset collections of configuration\n options that may be used to execute a pipeline. A preset consists of an environment\n dict, an optional subset of solids to execute, and a mode selection. Presets can be used\n to ship common combinations of options to pipeline end users in Python code, and can\n be selected by tools like Dagit.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for any execution run of the pipeline.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag\n values provided at invocation time.\n hook_defs (Optional[AbstractSet[HookDefinition]]): A set of hook definitions applied to the\n pipeline. When a hook is applied to a pipeline, it will be attached to all solid\n instances within the pipeline.\n solid_retry_policy (Optional[RetryPolicy]): The default retry policy for all solids in\n this pipeline. Only used if retry policy is not defined on the solid definition or\n solid invocation.\n\n\n _parent_pipeline_def (INTERNAL ONLY): Used for tracking pipelines created using solid subsets.\n\n Examples:\n\n .. 
code-block:: python\n\n @solid\n def return_one(_):\n return 1\n\n\n @solid(input_defs=[InputDefinition('num')], required_resource_keys={'op'})\n def apply_op(context, num):\n return context.resources.op(num)\n\n @resource(config_schema=Int)\n def adder_resource(init_context):\n return lambda x: x + init_context.resource_config\n\n\n add_mode = ModeDefinition(\n name='add_mode',\n resource_defs={'op': adder_resource},\n description='Mode that adds things',\n )\n\n\n add_three_preset = PresetDefinition(\n name='add_three_preset',\n run_config={'resources': {'op': {'config': 3}}},\n mode='add_mode',\n )\n\n\n pipeline_def = PipelineDefinition(\n name='basic',\n solid_defs=[return_one, apply_op],\n dependencies={'apply_op': {'num': DependencyDefinition('return_one')}},\n mode_defs=[add_mode],\n preset_defs=[add_three_preset],\n )\n """\n\n def __init__(\n self,\n solid_defs: Optional[List[NodeDefinition]] = None,\n name: Optional[str] = None,\n description: Optional[str] = None,\n dependencies: Optional[\n Dict[Union[str, NodeInvocation], Dict[str, IDependencyDefinition]]\n ] = None,\n mode_defs: Optional[List[ModeDefinition]] = None,\n preset_defs: Optional[List[PresetDefinition]] = None,\n tags: Optional[Dict[str, Any]] = None,\n hook_defs: Optional[AbstractSet[HookDefinition]] = None,\n solid_retry_policy: Optional[RetryPolicy] = None,\n graph_def=None,\n _parent_pipeline_def=None, # https://github.com/dagster-io/dagster/issues/2115\n version_strategy: Optional[VersionStrategy] = None,\n ):\n # If a graph is specificed directly use it\n if check.opt_inst_param(graph_def, "graph_def", GraphDefinition):\n self._graph_def = graph_def\n self._name = name or graph_def.name\n\n # Otherwise fallback to legacy construction\n else:\n if name is None:\n check.failed("name must be set provided")\n self._name = name\n\n if solid_defs is None:\n check.failed("solid_defs must be provided")\n\n self._graph_def = GraphDefinition(\n name=name,\n dependencies=dependencies,\n node_defs=solid_defs,\n input_mappings=None,\n output_mappings=None,\n config=None,\n description=None,\n )\n\n # tags and description can exist on graph as well, but since\n # same graph may be in multiple pipelines/jobs, keep separate layer\n self._description = check.opt_str_param(description, "description")\n self._tags = validate_tags(tags)\n\n self._current_level_node_defs = self._graph_def.node_defs\n\n mode_definitions = check.opt_list_param(mode_defs, "mode_defs", of_type=ModeDefinition)\n\n if not mode_definitions:\n mode_definitions = [ModeDefinition()]\n\n self._mode_definitions = mode_definitions\n\n seen_modes = set()\n for mode_def in mode_definitions:\n if mode_def.name in seen_modes:\n raise DagsterInvalidDefinitionError(\n (\n 'Two modes seen with the name "{mode_name}" in "{pipeline_name}". '\n "Modes must have unique names."\n ).format(mode_name=mode_def.name, pipeline_name=self.name)\n )\n seen_modes.add(mode_def.name)\n\n self._hook_defs = check.opt_set_param(hook_defs, "hook_defs", of_type=HookDefinition)\n self._solid_retry_policy = check.opt_inst_param(\n solid_retry_policy, "solid_retry_policy", RetryPolicy\n )\n\n self._preset_defs = check.opt_list_param(preset_defs, "preset_defs", PresetDefinition)\n self._preset_dict: Dict[str, PresetDefinition] = {}\n for preset in self._preset_defs:\n if preset.name in self._preset_dict:\n raise DagsterInvalidDefinitionError(\n (\n 'Two PresetDefinitions seen with the name "{name}" in "{pipeline_name}". 
'\n "PresetDefinitions must have unique names."\n ).format(name=preset.name, pipeline_name=self.name)\n )\n if preset.mode not in seen_modes:\n raise DagsterInvalidDefinitionError(\n (\n 'PresetDefinition "{name}" in "{pipeline_name}" '\n 'references mode "{mode}" which is not defined.'\n ).format(name=preset.name, pipeline_name=self.name, mode=preset.mode)\n )\n self._preset_dict[preset.name] = preset\n\n self._resource_requirements = {\n mode_def.name: _checked_resource_reqs_for_mode(\n mode_def,\n self._current_level_node_defs,\n self._graph_def._dagster_type_dict,\n self._graph_def._node_dict,\n self._hook_defs,\n self._graph_def._dependency_structure,\n )\n for mode_def in self._mode_definitions\n }\n\n # Recursively explore all nodes in the this pipeline\n self._all_node_defs = _build_all_node_defs(self._current_level_node_defs)\n self._parent_pipeline_def = check.opt_inst_param(\n _parent_pipeline_def, "_parent_pipeline_def", PipelineDefinition\n )\n self._cached_run_config_schemas: Dict[str, "RunConfigSchema"] = {}\n self._cached_external_pipeline = None\n\n self.version_strategy = check.opt_inst_param(\n version_strategy, "version_strategy", VersionStrategy\n )\n\n if self.version_strategy is not None:\n experimental_class_warning("VersionStrategy")\n\n @property\n def name(self):\n return self._name\n\n @property\n def target_type(self):\n return "pipeline"\n\n @property\n def is_job(self) -> bool:\n return False\n\n def describe_target(self):\n return f"{self.target_type} '{self.name}'"\n\n @property\n def tags(self):\n return frozentags(**merge_dicts(self._graph_def.tags, self._tags))\n\n @property\n def description(self):\n return self._description\n\n @property\n def graph(self):\n return self._graph_def\n\n @property\n def dependency_structure(self):\n return self._graph_def.dependency_structure\n\n @property\n def dependencies(self):\n return self._graph_def.dependencies\n\n def get_run_config_schema(self, mode: Optional[str] = None) -> "RunConfigSchema":\n check.str_param(mode, "mode")\n\n mode_def = self.get_mode_definition(mode)\n\n if mode_def.name in self._cached_run_config_schemas:\n return self._cached_run_config_schemas[mode_def.name]\n\n self._cached_run_config_schemas[mode_def.name] = _create_run_config_schema(\n self,\n mode_def,\n self._resource_requirements[mode_def.name],\n )\n return self._cached_run_config_schemas[mode_def.name]\n\n @property\n def mode_definitions(self) -> List[ModeDefinition]:\n return self._mode_definitions\n\n @property\n def preset_defs(self) -> List[PresetDefinition]:\n return self._preset_defs\n\n def _get_mode_definition(self, mode: str) -> Optional[ModeDefinition]:\n check.str_param(mode, "mode")\n for mode_definition in self._mode_definitions:\n if mode_definition.name == mode:\n return mode_definition\n\n return None\n\n def get_default_mode(self) -> ModeDefinition:\n return self._mode_definitions[0]\n\n @property\n def is_single_mode(self) -> bool:\n return len(self._mode_definitions) == 1\n\n @property\n def is_multi_mode(self) -> bool:\n return len(self._mode_definitions) > 1\n\n def is_using_memoization(self, run_tags: Dict[str, str]) -> bool:\n tags = merge_dicts(self.tags, run_tags)\n # If someone provides a false value for memoized run tag, then they are intentionally\n # switching off memoization.\n if tags.get(MEMOIZED_RUN_TAG) == "false":\n return False\n return (\n MEMOIZED_RUN_TAG in tags and tags.get(MEMOIZED_RUN_TAG) == "true"\n ) or self.version_strategy is not None\n\n def has_mode_definition(self, mode: str) -> 
bool:\n check.str_param(mode, "mode")\n return bool(self._get_mode_definition(mode))\n\n def get_default_mode_name(self) -> str:\n return self._mode_definitions[0].name\n\n def get_mode_definition(self, mode: Optional[str] = None) -> ModeDefinition:\n check.opt_str_param(mode, "mode")\n if mode is None:\n check.invariant(self.is_single_mode)\n return self.get_default_mode()\n\n mode_def = self._get_mode_definition(mode)\n\n if mode_def is None:\n check.failed(\n "Could not find mode {mode} in pipeline {name}".format(mode=mode, name=self.name),\n )\n\n return mode_def\n\n @property\n def available_modes(self) -> List[str]:\n return [mode_def.name for mode_def in self._mode_definitions]\n\n def get_required_resource_defs_for_mode(self, mode: str) -> Dict[str, ResourceDefinition]:\n return {\n resource_key: resource\n for resource_key, resource in self.get_mode_definition(mode).resource_defs.items()\n if resource_key in self._resource_requirements[mode]\n }\n\n @property\n def all_node_defs(self) -> List[NodeDefinition]:\n return list(self._all_node_defs.values())\n\n @property\n def top_level_solid_defs(self) -> List[NodeDefinition]:\n return self._current_level_node_defs\n\n def solid_def_named(self, name: str) -> NodeDefinition:\n check.str_param(name, "name")\n\n check.invariant(name in self._all_node_defs, "{} not found".format(name))\n return self._all_node_defs[name]\n\n def has_solid_def(self, name: str) -> bool:\n check.str_param(name, "name")\n return name in self._all_node_defs\n\n def get_solid(self, handle):\n return self._graph_def.get_solid(handle)\n\n def has_solid_named(self, name):\n return self._graph_def.has_solid_named(name)\n\n def solid_named(self, name):\n return self._graph_def.solid_named(name)\n\n @property\n def solids(self):\n return self._graph_def.solids\n\n @property\n def solids_in_topological_order(self):\n return self._graph_def.solids_in_topological_order\n\n def all_dagster_types(self):\n return self._graph_def.all_dagster_types()\n\n def has_dagster_type(self, name):\n return self._graph_def.has_dagster_type(name)\n\n def dagster_type_named(self, name):\n return self._graph_def.dagster_type_named(name)\n\n def get_pipeline_subset_def(\n self, solids_to_execute: Optional[AbstractSet[str]]\n ) -> "PipelineDefinition":\n return (\n self if solids_to_execute is None else _get_pipeline_subset_def(self, solids_to_execute)\n )\n\n def has_preset(self, name: str) -> bool:\n check.str_param(name, "name")\n return name in self._preset_dict\n\n def get_preset(self, name: str) -> PresetDefinition:\n check.str_param(name, "name")\n if name not in self._preset_dict:\n raise DagsterInvariantViolationError(\n (\n 'Could not find preset for "{name}". 
Available presets '\n 'for pipeline "{pipeline_name}" are {preset_names}.'\n ).format(\n name=name,\n preset_names=list(self._preset_dict.keys()),\n pipeline_name=self.name,\n )\n )\n\n return self._preset_dict[name]\n\n def get_pipeline_snapshot(self) -> "PipelineSnapshot":\n return self.get_pipeline_index().pipeline_snapshot\n\n def get_pipeline_snapshot_id(self) -> str:\n return self.get_pipeline_index().pipeline_snapshot_id\n\n def get_pipeline_index(self) -> "PipelineIndex":\n from dagster.core.host_representation import PipelineIndex\n from dagster.core.snap import PipelineSnapshot\n\n return PipelineIndex(\n PipelineSnapshot.from_pipeline_def(self), self.get_parent_pipeline_snapshot()\n )\n\n def get_config_schema_snapshot(self) -> "ConfigSchemaSnapshot":\n return self.get_pipeline_snapshot().config_schema_snapshot\n\n @property\n def is_subset_pipeline(self) -> bool:\n return False\n\n @property\n def parent_pipeline_def(self) -> Optional["PipelineDefinition"]:\n return None\n\n def get_parent_pipeline_snapshot(self) -> Optional["PipelineSnapshot"]:\n return None\n\n @property\n def solids_to_execute(self) -> Optional[FrozenSet[str]]:\n return None\n\n @property\n def hook_defs(self) -> AbstractSet[HookDefinition]:\n return self._hook_defs\n\n def get_all_hooks_for_handle(self, handle: NodeHandle) -> FrozenSet[HookDefinition]:\n """Gather all the hooks for the given solid from all places possibly attached with a hook.\n\n A hook can be attached to any of the following objects\n * Solid (solid invocation)\n * PipelineDefinition\n\n Args:\n handle (NodeHandle): The solid's handle\n\n Returns:\n FrozenSet[HookDefinition]\n """\n check.inst_param(handle, "handle", NodeHandle)\n hook_defs: AbstractSet[HookDefinition] = set()\n\n current = handle\n lineage = []\n while current:\n lineage.append(current.name)\n current = current.parent\n\n # hooks on top-level solid\n name = lineage.pop()\n solid = self._graph_def.solid_named(name)\n hook_defs = hook_defs.union(solid.hook_defs)\n\n # hooks on non-top-level solids\n while lineage:\n name = lineage.pop()\n solid = solid.definition.solid_named(name)\n hook_defs = hook_defs.union(solid.hook_defs)\n\n # hooks applied to a pipeline definition will run on every solid\n hook_defs = hook_defs.union(self.hook_defs)\n\n return frozenset(hook_defs)\n\n def get_retry_policy_for_handle(self, handle: NodeHandle) -> Optional[RetryPolicy]:\n solid = self.get_solid(handle)\n\n if solid.retry_policy:\n return solid.retry_policy\n elif solid.definition.retry_policy:\n return solid.definition.retry_policy\n\n # could be expanded to look in composite_solid / graph containers\n else:\n return self._solid_retry_policy\n\n def with_hooks(self, hook_defs: AbstractSet[HookDefinition]) -> "PipelineDefinition":\n """Apply a set of hooks to all solid instances within the pipeline."""\n\n hook_defs = check.set_param(hook_defs, "hook_defs", of_type=HookDefinition)\n\n pipeline_def = PipelineDefinition(\n name=self.name,\n graph_def=self._graph_def,\n mode_defs=self.mode_definitions,\n preset_defs=self.preset_defs,\n tags=self.tags,\n hook_defs=hook_defs | self.hook_defs,\n description=self._description,\n solid_retry_policy=self._solid_retry_policy,\n _parent_pipeline_def=self._parent_pipeline_def,\n )\n\n update_wrapper(pipeline_def, self, updated=())\n\n return pipeline_def\n\n # make Callable for decorator reference updates\n def __call__(self, *args, **kwargs):\n if self.is_job:\n msg = (\n f"Attempted to call job '{self.name}' directly. 
Jobs should be invoked by "\n "using an execution API function (e.g. `job.execute_in_process`)."\n )\n else:\n msg = (\n f"Attempted to call pipeline '{self.name}' directly. Pipelines should be invoked by "\n "using an execution API function (e.g. `execute_pipeline`)."\n )\n raise DagsterInvariantViolationError(msg)
\n\n\nclass PipelineSubsetDefinition(PipelineDefinition):\n @property\n def solids_to_execute(self):\n return frozenset(self._graph_def.node_names())\n\n @property\n def solid_selection(self) -> List[str]:\n # we currently don't pass the real solid_selection (the solid query list) down here.\n # so in the short-term, to make the call sites cleaner, we will convert the solids to execute\n # to a list\n return self._graph_def.node_names()\n\n @property\n def parent_pipeline_def(self) -> PipelineDefinition:\n return self._parent_pipeline_def\n\n def get_parent_pipeline_snapshot(self) -> Optional["PipelineSnapshot"]:\n return self._parent_pipeline_def.get_pipeline_snapshot()\n\n @property\n def is_subset_pipeline(self) -> bool:\n return True\n\n def get_pipeline_subset_def(\n self, solids_to_execute: Optional[AbstractSet[str]]\n ) -> "PipelineSubsetDefinition":\n raise DagsterInvariantViolationError("Pipeline subsets may not be subset again.")\n\n\ndef _dep_key_of(solid: Node) -> NodeInvocation:\n return NodeInvocation(\n name=solid.definition.name,\n alias=solid.name,\n tags=solid.tags,\n hook_defs=solid.hook_defs,\n retry_policy=solid.retry_policy,\n )\n\n\ndef _get_pipeline_subset_def(\n pipeline_def: PipelineDefinition,\n solids_to_execute: AbstractSet[str],\n) -> "PipelineSubsetDefinition":\n """\n Build a pipeline which is a subset of another pipeline.\n Only includes the solids which are in solids_to_execute.\n """\n\n check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n check.set_param(solids_to_execute, "solids_to_execute", of_type=str)\n graph = pipeline_def.graph\n for solid_name in solids_to_execute:\n if not graph.has_solid_named(solid_name):\n raise DagsterInvalidSubsetError(\n "{target_type} {pipeline_name} has no {node_type} named {name}.".format(\n target_type=pipeline_def.target_type,\n pipeline_name=pipeline_def.name,\n name=solid_name,\n node_type="ops" if pipeline_def.is_job else "solids",\n ),\n )\n\n # go in topo order to ensure deps dict is ordered\n solids = list(\n filter(lambda solid: solid.name in solids_to_execute, graph.solids_in_topological_order)\n )\n\n deps: Dict[\n Union[str, NodeInvocation],\n Dict[str, IDependencyDefinition],\n ] = {_dep_key_of(solid): {} for solid in solids}\n\n for solid in solids:\n for input_handle in solid.input_handles():\n if graph.dependency_structure.has_direct_dep(input_handle):\n output_handle = pipeline_def.dependency_structure.get_direct_dep(input_handle)\n if output_handle.solid.name in solids_to_execute:\n deps[_dep_key_of(solid)][input_handle.input_def.name] = DependencyDefinition(\n solid=output_handle.solid.name, output=output_handle.output_def.name\n )\n elif graph.dependency_structure.has_dynamic_fan_in_dep(input_handle):\n output_handle = graph.dependency_structure.get_dynamic_fan_in_dep(input_handle)\n if output_handle.solid.name in solids_to_execute:\n deps[_dep_key_of(solid)][\n input_handle.input_def.name\n ] = DynamicCollectDependencyDefinition(\n solid_name=output_handle.solid.name,\n output_name=output_handle.output_def.name,\n )\n elif graph.dependency_structure.has_fan_in_deps(input_handle):\n output_handles = graph.dependency_structure.get_fan_in_deps(input_handle)\n deps[_dep_key_of(solid)][input_handle.input_def.name] = MultiDependencyDefinition(\n [\n DependencyDefinition(\n solid=output_handle.solid.name, output=output_handle.output_def.name\n )\n for output_handle in output_handles\n if output_handle.solid.name in solids_to_execute\n ]\n )\n # else input is unconnected\n\n try:\n 
sub_pipeline_def = PipelineSubsetDefinition(\n name=pipeline_def.name, # should we change the name for subsetted pipeline?\n solid_defs=list({solid.definition for solid in solids}),\n mode_defs=pipeline_def.mode_definitions,\n dependencies=deps,\n _parent_pipeline_def=pipeline_def,\n tags=pipeline_def.tags,\n hook_defs=pipeline_def.hook_defs,\n )\n\n return sub_pipeline_def\n except DagsterInvalidDefinitionError as exc:\n # This handles the case when you construct a subset such that an unsatisfied\n # input cannot be loaded from config. Instead of throwing a DagsterInvalidDefinitionError,\n # we re-raise a DagsterInvalidSubsetError.\n raise DagsterInvalidSubsetError(\n f"The attempted subset {str_format_set(solids_to_execute)} for {pipeline_def.target_type} "\n f"{pipeline_def.name} results in an invalid {pipeline_def.target_type}"\n ) from exc\n\n\ndef _checked_resource_reqs_for_mode(\n mode_def: ModeDefinition,\n node_defs: List[NodeDefinition],\n dagster_type_dict: Dict[str, DagsterType],\n solid_dict: Dict[str, Node],\n pipeline_hook_defs: AbstractSet[HookDefinition],\n dependency_structure: DependencyStructure,\n) -> Set[str]:\n """\n Calculate the resource requirements for the pipeline in this mode and ensure they are\n provided by the mode.\n\n We combine these operations in to one traversal to allow for raising excpetions that provide\n as much context as possible about where the unsatisfied resource requirement came from.\n """\n resource_reqs: Set[str] = set()\n mode_output_managers = set(\n key\n for key, resource_def in mode_def.resource_defs.items()\n if isinstance(resource_def, IOutputManagerDefinition)\n )\n mode_resources = set(mode_def.resource_defs.keys())\n for node_def in node_defs:\n for solid_def in node_def.iterate_solid_defs():\n for required_resource in solid_def.required_resource_keys:\n resource_reqs.add(required_resource)\n if required_resource not in mode_resources:\n error_msg = _get_missing_resource_error_msg(\n resource_type="resource",\n resource_key=required_resource,\n descriptor=solid_def.describe_node(),\n mode_def=mode_def,\n resource_defs_of_type=mode_resources,\n )\n raise DagsterInvalidDefinitionError(error_msg)\n\n for output_def in solid_def.output_defs:\n resource_reqs.add(output_def.io_manager_key)\n if output_def.io_manager_key not in mode_resources:\n error_msg = _get_missing_resource_error_msg(\n resource_type="IO manager",\n resource_key=output_def.io_manager_key,\n descriptor=f"output '{output_def.name}' of {solid_def.describe_node()}",\n mode_def=mode_def,\n resource_defs_of_type=mode_output_managers,\n )\n raise DagsterInvalidDefinitionError(error_msg)\n\n resource_reqs.update(\n _checked_type_resource_reqs_for_mode(\n mode_def,\n dagster_type_dict,\n )\n )\n\n # Validate unsatisfied inputs can be materialized from config\n resource_reqs.update(\n _checked_input_resource_reqs_for_mode(dependency_structure, solid_dict, mode_def)\n )\n\n for solid in solid_dict.values():\n for hook_def in solid.hook_defs:\n for required_resource in hook_def.required_resource_keys:\n resource_reqs.add(required_resource)\n if required_resource not in mode_resources:\n error_msg = _get_missing_resource_error_msg(\n resource_type="resource",\n resource_key=required_resource,\n descriptor=f"hook '{hook_def.name}'",\n mode_def=mode_def,\n resource_defs_of_type=mode_resources,\n )\n raise DagsterInvalidDefinitionError(error_msg)\n\n for hook_def in pipeline_hook_defs:\n for required_resource in hook_def.required_resource_keys:\n 
resource_reqs.add(required_resource)\n if required_resource not in mode_resources:\n error_msg = _get_missing_resource_error_msg(\n resource_type="resource",\n resource_key=required_resource,\n descriptor=f"hook '{hook_def.name}'",\n mode_def=mode_def,\n resource_defs_of_type=mode_resources,\n )\n raise DagsterInvalidDefinitionError(error_msg)\n\n for resource_key, resource in mode_def.resource_defs.items():\n for required_resource in resource.required_resource_keys:\n if required_resource not in mode_resources:\n error_msg = _get_missing_resource_error_msg(\n resource_type="resource",\n resource_key=required_resource,\n descriptor=f"resource at key '{resource_key}'",\n mode_def=mode_def,\n resource_defs_of_type=mode_resources,\n )\n raise DagsterInvalidDefinitionError(error_msg)\n\n # Finally, recursively add any resources that the set of required resources require\n while True:\n new_resources: Set[str] = set()\n for resource_key in resource_reqs:\n resource = mode_def.resource_defs[resource_key]\n new_resources.update(resource.required_resource_keys - resource_reqs)\n\n if not len(new_resources):\n break\n\n resource_reqs.update(new_resources)\n\n return resource_reqs\n\n\ndef _checked_type_resource_reqs_for_mode(\n mode_def: ModeDefinition,\n dagster_type_dict: Dict[str, DagsterType],\n) -> Set[str]:\n """\n Calculate all the resource requirements related to DagsterTypes for this mode and ensure the\n mode provides those resources.\n """\n\n resource_reqs = set()\n mode_resources = set(mode_def.resource_defs.keys())\n for dagster_type in dagster_type_dict.values():\n for required_resource in dagster_type.required_resource_keys:\n resource_reqs.add(required_resource)\n if required_resource not in mode_resources:\n error_msg = _get_missing_resource_error_msg(\n resource_type="resource",\n resource_key=required_resource,\n descriptor=f"type '{dagster_type.display_name}'",\n mode_def=mode_def,\n resource_defs_of_type=mode_resources,\n )\n raise DagsterInvalidDefinitionError(error_msg)\n if dagster_type.loader:\n for required_resource in dagster_type.loader.required_resource_keys():\n resource_reqs.add(required_resource)\n if required_resource not in mode_resources:\n error_msg = _get_missing_resource_error_msg(\n resource_type="resource",\n resource_key=required_resource,\n descriptor=f"the loader on type '{dagster_type.display_name}'",\n mode_def=mode_def,\n resource_defs_of_type=mode_resources,\n )\n raise DagsterInvalidDefinitionError(error_msg)\n if dagster_type.materializer:\n for required_resource in dagster_type.materializer.required_resource_keys():\n resource_reqs.add(required_resource)\n if required_resource not in mode_resources:\n error_msg = _get_missing_resource_error_msg(\n resource_type="resource",\n resource_key=required_resource,\n descriptor=f"the materializer on type '{dagster_type.display_name}'",\n mode_def=mode_def,\n resource_defs_of_type=mode_resources,\n )\n raise DagsterInvalidDefinitionError(error_msg)\n\n return resource_reqs\n\n\ndef _checked_input_resource_reqs_for_mode(\n dependency_structure: DependencyStructure,\n node_dict: Dict[str, Node],\n mode_def: ModeDefinition,\n outer_dependency_structures: Optional[List[DependencyStructure]] = None,\n outer_solids: Optional[List[Node]] = None,\n) -> Set[str]:\n outer_dependency_structures = check.opt_list_param(\n outer_dependency_structures, "outer_dependency_structures", DependencyStructure\n )\n outer_solids = check.opt_list_param(outer_solids, "outer_solids", Node)\n\n resource_reqs = set()\n 
mode_root_input_managers = set(\n key\n for key, resource_def in mode_def.resource_defs.items()\n if isinstance(resource_def, RootInputManagerDefinition)\n )\n\n for node in node_dict.values():\n if node.is_graph:\n graph_def = node.definition.ensure_graph_def()\n # check inner solids\n resource_reqs.update(\n _checked_input_resource_reqs_for_mode(\n dependency_structure=graph_def.dependency_structure,\n node_dict=graph_def.node_dict,\n mode_def=mode_def,\n outer_dependency_structures=outer_dependency_structures\n + [dependency_structure],\n outer_solids=outer_solids + [node],\n )\n )\n for handle in node.input_handles():\n source_output_handles = None\n if dependency_structure.has_deps(handle):\n # input is connected to outputs from the same dependency structure\n source_output_handles = dependency_structure.get_deps_list(handle)\n else:\n # input is connected to outputs from outer dependency structure, e.g. first solids\n # in a composite\n curr_node = node\n curr_handle = handle\n curr_index = len(outer_solids) - 1\n\n # Checks to see if input is mapped to an outer dependency structure\n while curr_index >= 0 and curr_node.container_maps_input(curr_handle.input_name):\n curr_handle = SolidInputHandle(\n solid=outer_solids[curr_index],\n input_def=curr_node.container_mapped_input(\n curr_handle.input_name\n ).definition,\n )\n\n if outer_dependency_structures[curr_index].has_deps(curr_handle):\n source_output_handles = outer_dependency_structures[\n curr_index\n ].get_deps_list(curr_handle)\n break\n\n curr_node = outer_solids[curr_index]\n curr_index -= 1\n\n if source_output_handles:\n # input is connected to source output handles within the graph\n for source_output_handle in source_output_handles:\n output_manager_key = source_output_handle.output_def.io_manager_key\n output_manager_def = mode_def.resource_defs[output_manager_key]\n if not isinstance(output_manager_def, IInputManagerDefinition):\n raise DagsterInvalidDefinitionError(\n f'Input "{handle.input_def.name}" of {node.describe_node()} is '\n f'connected to output "{source_output_handle.output_def.name}" '\n f"of {source_output_handle.solid.describe_node()}. That output does not "\n "have an output "\n f"manager that knows how to load inputs, so we don't know how "\n f"to load the input. To address this, assign an IOManager to "\n f"the upstream output."\n )\n else:\n # input is unconnected\n input_def = handle.input_def\n if (\n not input_def.dagster_type.loader\n and not input_def.dagster_type.kind == DagsterTypeKind.NOTHING\n and not input_def.root_manager_key\n and not input_def.has_default_value\n ):\n raise DagsterInvalidDefinitionError(\n "Input '{input_name}' in {described_node} is not connected to "\n "the output of a previous node and can not be loaded from configuration, "\n "making it impossible to execute. "\n "Possible solutions are:\\n"\n " * add a dagster_type_loader for the type '{dagster_type}'\\n"\n " * connect '{input_name}' to the output of another node\\n".format(\n described_node=node.describe_node(),\n input_name=input_def.name,\n dagster_type=input_def.dagster_type.display_name,\n )\n )\n\n # If a root manager is provided, it's always used. I.e. 
it has priority over\n # the other ways of loading unsatisfied inputs - dagster type loaders and\n # default values.\n if input_def.root_manager_key:\n resource_reqs.add(input_def.root_manager_key)\n if input_def.root_manager_key not in mode_def.resource_defs:\n error_msg = _get_missing_resource_error_msg(\n resource_type="root input manager",\n resource_key=input_def.root_manager_key,\n descriptor=f"unsatisfied input '{input_def.name}' of {node.describe_node()}",\n mode_def=mode_def,\n resource_defs_of_type=mode_root_input_managers,\n )\n raise DagsterInvalidDefinitionError(error_msg)\n\n return resource_reqs\n\n\ndef _get_missing_resource_error_msg(\n resource_type, resource_key, descriptor, mode_def, resource_defs_of_type\n):\n if mode_def.name == "default":\n return (\n f"{resource_type} key '{resource_key}' is required by "\n f"{descriptor}, but is not provided. Provide a {resource_type} for key '{resource_key}', "\n f"or change '{resource_key}' to one of the provided {resource_type} keys: "\n f"{sorted(resource_defs_of_type)}."\n )\n else:\n return (\n f"{resource_type} key '{resource_key}' is required by "\n f"{descriptor}, but is not provided by mode '{mode_def.name}'. "\n f"In mode '{mode_def.name}', provide a {resource_type} for key '{resource_key}', "\n f"or change '{resource_key}' to one of the provided root input managers keys: {sorted(resource_defs_of_type)}."\n )\n\n\ndef _build_all_node_defs(node_defs: List[NodeDefinition]) -> Dict[str, NodeDefinition]:\n all_defs: Dict[str, NodeDefinition] = {}\n for current_level_node_def in node_defs:\n for node_def in current_level_node_def.iterate_node_defs():\n if node_def.name in all_defs:\n if all_defs[node_def.name] != node_def:\n raise DagsterInvalidDefinitionError(\n 'Detected conflicting node definitions with the same name "{name}"'.format(\n name=node_def.name\n )\n )\n else:\n all_defs[node_def.name] = node_def\n\n return all_defs\n\n\ndef _create_run_config_schema(\n pipeline_def: PipelineDefinition,\n mode_definition: ModeDefinition,\n required_resources: Set[str],\n) -> "RunConfigSchema":\n from .run_config import (\n RunConfigSchemaCreationData,\n construct_config_type_dictionary,\n define_run_config_schema_type,\n )\n from .run_config_schema import RunConfigSchema\n\n # When executing with a subset pipeline, include the missing solids\n # from the original pipeline as ignored to allow execution with\n # run config that is valid for the original\n if isinstance(pipeline_def.graph, SubselectedGraphDefinition):\n ignored_solids = pipeline_def.graph.get_top_level_omitted_nodes()\n elif pipeline_def.is_subset_pipeline:\n if pipeline_def.parent_pipeline_def is None:\n check.failed("Unexpected subset pipeline state")\n\n ignored_solids = [\n solid\n for solid in pipeline_def.parent_pipeline_def.graph.solids\n if not pipeline_def.has_solid_named(solid.name)\n ]\n else:\n ignored_solids = []\n\n run_config_schema_type = define_run_config_schema_type(\n RunConfigSchemaCreationData(\n pipeline_name=pipeline_def.name,\n solids=pipeline_def.graph.solids,\n graph_def=pipeline_def.graph,\n dependency_structure=pipeline_def.graph.dependency_structure,\n mode_definition=mode_definition,\n logger_defs=mode_definition.loggers,\n ignored_solids=ignored_solids,\n required_resources=required_resources,\n is_using_graph_job_op_apis=pipeline_def.is_job,\n )\n )\n\n if mode_definition.config_mapping:\n outer_config_type = mode_definition.config_mapping.config_schema.config_type\n else:\n outer_config_type = run_config_schema_type\n\n if 
outer_config_type is None:\n check.failed("Unexpected outer_config_type value of None")\n\n config_type_dict_by_name, config_type_dict_by_key = construct_config_type_dictionary(\n pipeline_def.all_node_defs,\n outer_config_type,\n )\n\n return RunConfigSchema(\n run_config_schema_type=run_config_schema_type,\n config_type_dict_by_name=config_type_dict_by_name,\n config_type_dict_by_key=config_type_dict_by_key,\n config_mapping=mode_definition.config_mapping,\n )\n
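``_checked_resource_reqs_for_mode`` and its helpers run at definition time: every resource key required by a solid, output IO manager, hook, dagster type, or other resource must appear in the mode's ``resource_defs``, otherwise a ``DagsterInvalidDefinitionError`` is raised with the message built by ``_get_missing_resource_error_msg``. A minimal sketch of that failure, assuming the legacy ``@solid``/``ModeDefinition`` APIs (the solid body and resource key are hypothetical):

.. code-block:: python

    from dagster import (
        DagsterInvalidDefinitionError,
        ModeDefinition,
        pipeline,
        solid,
    )

    @solid(required_resource_keys={"warehouse"})
    def load_rows(context):
        context.resources.warehouse.write([1, 2, 3])

    try:
        # The mode provides no "warehouse" resource, so validation fails
        # while the pipeline definition is being constructed.
        @pipeline(mode_defs=[ModeDefinition("default", resource_defs={})])
        def etl():
            load_rows()
    except DagsterInvalidDefinitionError as err:
        # e.g. "resource key 'warehouse' is required by ... but is not provided. ..."
        print(err)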
", "current_page_name": "_modules/dagster/core/definitions/pipeline_definition", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.pipeline_definition"}, "policy": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.policy

\nfrom enum import Enum\nfrom random import random\nfrom typing import NamedTuple, Optional\n\nfrom dagster import check\nfrom dagster.core.errors import DagsterInvalidDefinitionError\n\n\n
[docs]class Backoff(Enum):\n """\n A modifier for delay as a function of attempt number.\n\n LINEAR: `attempt_num * delay`\n EXPONENTIAL: `((2 ^ attempt_num) - 1) * delay`\n """\n\n LINEAR = "LINEAR"\n EXPONENTIAL = "EXPONENTIAL"
\n\n\n
[docs]class Jitter(Enum):\n """A randomizing modifier for delay, applied after backoff calculation.\n\n FULL: between 0 and the calculated delay based on backoff: `random() * backoff_delay`\n PLUS_MINUS: +/- the delay: `backoff_delay + ((2 * (random() * delay)) - delay)`\n """\n\n FULL = "FULL"\n PLUS_MINUS = "PLUS_MINUS"
\n\n\n
[docs]class RetryPolicy(\n NamedTuple(\n "_RetryPolicy",\n [\n ("max_retries", int),\n ("delay", Optional[check.Numeric]),\n # declarative time modulation to allow calc without running user function\n ("backoff", Optional[Backoff]),\n ("jitter", Optional[Jitter]),\n ],\n ),\n):\n """\n A declarative policy for when to request retries when an exception occurs during op execution.\n\n Args:\n max_retries (int):\n The maximum number of retries to attempt. Defaults to 1.\n delay (Optional[Union[int,float]]):\n The time in seconds to wait between the retry being requested and the next attempt\n being started. This unit of time can be modulated as a function of attempt number\n with backoff and randomly with jitter.\n backoff (Optional[Backoff]):\n A modifier for delay as a function of retry attempt number.\n jitter (Optional[Jitter]):\n A randomizing modifier for delay, applied after backoff calculation.\n """\n\n def __new__(\n cls,\n max_retries: int = 1,\n delay: Optional[check.Numeric] = None,\n backoff: Optional[Backoff] = None,\n jitter: Optional[Jitter] = None,\n ):\n if backoff is not None and delay is None:\n raise DagsterInvalidDefinitionError(\n "Can not set backoff on RetryPolicy without also setting delay"\n )\n\n if jitter is not None and delay is None:\n raise DagsterInvalidDefinitionError(\n "Can not set jitter on RetryPolicy without also setting delay"\n )\n\n return super().__new__(\n cls,\n max_retries=check.int_param(max_retries, "max_retries"),\n delay=check.opt_numeric_param(delay, "delay"),\n backoff=check.opt_inst_param(backoff, "backoff", Backoff),\n jitter=check.opt_inst_param(jitter, "jitter", Jitter),\n )\n\n def calculate_delay(self, attempt_num: int) -> check.Numeric:\n return calculate_delay(\n attempt_num=attempt_num,\n backoff=self.backoff,\n jitter=self.jitter,\n base_delay=self.delay or 0,\n )
\n\n\ndef calculate_delay(attempt_num, backoff, jitter, base_delay):\n if backoff is Backoff.EXPONENTIAL:\n calc_delay = ((2**attempt_num) - 1) * base_delay\n elif backoff is Backoff.LINEAR:\n calc_delay = base_delay * attempt_num\n elif backoff is None:\n calc_delay = base_delay\n else:\n check.assert_never(backoff)\n\n if jitter is Jitter.FULL:\n calc_delay = random() * calc_delay\n elif jitter is Jitter.PLUS_MINUS:\n calc_delay = calc_delay + ((2 * (random() * base_delay)) - base_delay)\n elif jitter is None:\n pass\n else:\n check.assert_never(jitter)\n\n return calc_delay\n
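``calculate_delay`` implements the arithmetic documented on ``Backoff`` and ``Jitter``: linear backoff multiplies the base delay by the attempt number, exponential backoff uses ``((2 ** attempt_num) - 1) * delay``, and jitter randomizes the result afterwards. A small illustration of the schedule, with hypothetical parameter values:

.. code-block:: python

    from dagster import Backoff, Jitter, RetryPolicy

    policy = RetryPolicy(
        max_retries=4,
        delay=2,
        backoff=Backoff.EXPONENTIAL,
        jitter=Jitter.FULL,
    )

    # Without jitter the exponential schedule would be ((2 ** n) - 1) * 2:
    # attempt 1 -> 2s, attempt 2 -> 6s, attempt 3 -> 14s. Jitter.FULL then
    # scales each value by random(), so e.g. attempt 2 lands in [0, 6).
    for attempt in range(1, 4):
        print(attempt, policy.calculate_delay(attempt))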
", "current_page_name": "_modules/dagster/core/definitions/policy", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.policy"}, "preset": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.preset

\nfrom typing import Dict, List, NamedTuple, Optional\n\nimport pkg_resources\nimport yaml\n\nfrom dagster import check\nfrom dagster.core.definitions.utils import config_from_files, config_from_yaml_strings\nfrom dagster.core.errors import DagsterInvariantViolationError\nfrom dagster.utils.merger import deep_merge_dicts\n\nfrom .mode import DEFAULT_MODE_NAME\nfrom .utils import check_valid_name\n\n\n
[docs]class PresetDefinition(\n NamedTuple(\n "_PresetDefinition",\n [\n ("name", str),\n ("run_config", Optional[Dict[str, object]]),\n ("solid_selection", Optional[List[str]]),\n ("mode", str),\n ("tags", Dict[str, str]),\n ],\n )\n):\n """Defines a preset configuration in which a pipeline can execute.\n\n Presets can be used in Dagit to load predefined configurations into the tool.\n\n Presets may also be used from the Python API (in a script, or in test) as follows:\n\n .. code-block:: python\n\n execute_pipeline(pipeline_def, preset='example_preset')\n\n Presets may also be used with the command line tools:\n\n .. code-block:: shell\n\n $ dagster pipeline execute example_pipeline --preset example_preset\n\n Args:\n name (str): The name of this preset. Must be unique in the presets defined on a given\n pipeline.\n run_config (Optional[dict]): A dict representing the config to set with the preset.\n This is equivalent to the ``run_config`` argument to :py:func:`execute_pipeline`.\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute with the preset. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The mode to apply when executing this preset. (default: 'default')\n tags (Optional[Dict[str, Any]]): The tags to apply when executing this preset.\n """\n\n def __new__(\n cls,\n name: str,\n run_config: Optional[Dict[str, object]] = None,\n solid_selection: Optional[List[str]] = None,\n mode: Optional[str] = None,\n tags: Optional[Dict[str, object]] = None,\n ):\n\n return super(PresetDefinition, cls).__new__(\n cls,\n name=check_valid_name(name),\n run_config=run_config,\n solid_selection=check.opt_nullable_list_param(\n solid_selection, "solid_selection", of_type=str\n ),\n mode=check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME),\n tags=check.opt_dict_param(tags, "tags", key_type=str),\n )\n\n
[docs] @staticmethod\n def from_files(name, config_files=None, solid_selection=None, mode=None, tags=None):\n """Static constructor for presets from YAML files.\n\n Args:\n name (str): The name of this preset. Must be unique in the presets defined on a given\n pipeline.\n config_files (Optional[List[str]]): List of paths or glob patterns for yaml files\n to load and parse as the run config for this preset.\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute with the preset. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The mode to apply when executing this preset. (default:\n 'default')\n tags (Optional[Dict[str, Any]]): The tags to apply when executing this preset.\n\n Returns:\n PresetDefinition: A PresetDefinition constructed from the provided YAML files.\n\n Raises:\n DagsterInvariantViolationError: When one of the YAML files is invalid and has a parse\n error.\n """\n check.str_param(name, "name")\n config_files = check.opt_list_param(config_files, "config_files")\n solid_selection = check.opt_nullable_list_param(\n solid_selection, "solid_selection", of_type=str\n )\n mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)\n\n merged = config_from_files(config_files)\n\n return PresetDefinition(name, merged, solid_selection, mode, tags)
\n\n
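``from_files`` parses each YAML file with ``config_from_files`` and merges the documents into a single ``run_config`` dict for the preset. A brief sketch, where ``environments/local.yaml`` is a hypothetical path:

.. code-block:: python

    from dagster import PresetDefinition

    # Files are parsed and merged into the preset's run_config; a parse
    # error raises DagsterInvariantViolationError as documented above.
    local_preset = PresetDefinition.from_files(
        name="local",
        config_files=["environments/local.yaml"],
        mode="local",
        tags={"team": "data-eng"},
    )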
[docs] @staticmethod\n def from_yaml_strings(name, yaml_strings=None, solid_selection=None, mode=None, tags=None):\n """Static constructor for presets from YAML strings.\n\n Args:\n name (str): The name of this preset. Must be unique in the presets defined on a given\n pipeline.\n yaml_strings (Optional[List[str]]): List of yaml strings to parse as the environment\n config for this preset.\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute with the preset. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The mode to apply when executing this preset. (default:\n 'default')\n tags (Optional[Dict[str, Any]]): The tags to apply when executing this preset.\n\n Returns:\n PresetDefinition: A PresetDefinition constructed from the provided YAML strings\n\n Raises:\n DagsterInvariantViolationError: When one of the YAML documents is invalid and has a\n parse error.\n """\n check.str_param(name, "name")\n yaml_strings = check.opt_list_param(yaml_strings, "yaml_strings", of_type=str)\n solid_selection = check.opt_nullable_list_param(\n solid_selection, "solid_selection", of_type=str\n )\n mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)\n\n merged = config_from_yaml_strings(yaml_strings)\n\n return PresetDefinition(name, merged, solid_selection, mode, tags)
\n\n
[docs] @staticmethod\n def from_pkg_resources(\n name, pkg_resource_defs=None, solid_selection=None, mode=None, tags=None\n ):\n """Load a preset from a package resource, using :py:func:`pkg_resources.resource_string`.\n\n Example:\n\n .. code-block:: python\n\n PresetDefinition.from_pkg_resources(\n name='local',\n mode='local',\n pkg_resource_defs=[\n ('dagster_examples.airline_demo.environments', 'local_base.yaml'),\n ('dagster_examples.airline_demo.environments', 'local_warehouse.yaml'),\n ],\n )\n\n\n Args:\n name (str): The name of this preset. Must be unique in the presets defined on a given\n pipeline.\n pkg_resource_defs (Optional[List[(str, str)]]): List of pkg_resource modules/files to\n load as the run config for this preset.\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute with this partition. e.g.\n ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The mode to apply when executing this preset. (default:\n 'default')\n tags (Optional[Dict[str, Any]]): The tags to apply when executing this preset.\n\n Returns:\n PresetDefinition: A PresetDefinition constructed from the provided YAML strings\n\n Raises:\n DagsterInvariantViolationError: When one of the YAML documents is invalid and has a\n parse error.\n """\n pkg_resource_defs = check.opt_list_param(\n pkg_resource_defs, "pkg_resource_defs", of_type=tuple\n )\n\n try:\n yaml_strings = [\n pkg_resources.resource_string(*pkg_resource_def).decode("utf-8")\n for pkg_resource_def in pkg_resource_defs\n ]\n except (ModuleNotFoundError, FileNotFoundError, UnicodeDecodeError) as err:\n raise DagsterInvariantViolationError(\n "Encountered error attempting to parse yaml. Loading YAMLs from "\n f"package resources {pkg_resource_defs} "\n f'on preset "{name}".'\n ) from err\n\n return PresetDefinition.from_yaml_strings(name, yaml_strings, solid_selection, mode, tags)
\n\n
[docs] def get_environment_yaml(self):\n """Get the environment dict set on a preset as YAML.\n\n Returns:\n str: The environment dict as YAML.\n """\n return yaml.dump(self.run_config or {}, default_flow_style=False)
\n\n
[docs] def with_additional_config(self, run_config):\n """Return a new PresetDefinition with additional config merged into the existing config."""\n\n check.opt_nullable_dict_param(run_config, "run_config")\n if run_config is None:\n return self\n else:\n initial_config = self.run_config or {}\n return PresetDefinition(\n name=self.name,\n solid_selection=self.solid_selection,\n mode=self.mode,\n tags=self.tags,\n run_config=deep_merge_dicts(initial_config, run_config),\n )
\n
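``with_additional_config`` deep-merges the supplied dict on top of the preset's existing ``run_config`` via ``deep_merge_dicts``, leaving the name, mode, tags, and solid selection untouched. A short sketch with hypothetical config keys:

.. code-block:: python

    from dagster import PresetDefinition

    base = PresetDefinition(
        name="example",
        run_config={"solids": {"my_solid": {"config": {"limit": 10}}}},
    )

    # Nested keys are merged rather than replaced, so "limit" survives
    # alongside the newly added "dry_run".
    extended = base.with_additional_config(
        {"solids": {"my_solid": {"config": {"dry_run": True}}}}
    )
    assert extended.run_config == {
        "solids": {"my_solid": {"config": {"limit": 10, "dry_run": True}}}
    }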
", "current_page_name": "_modules/dagster/core/definitions/preset", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.preset"}, "reconstruct": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.reconstruct

\nimport inspect\nimport os\nimport sys\nfrom functools import lru_cache\nfrom typing import TYPE_CHECKING, FrozenSet, List, NamedTuple, Optional, Union, overload\n\nfrom dagster import check, seven\nfrom dagster.core.code_pointer import (\n    CodePointer,\n    CustomPointer,\n    FileCodePointer,\n    ModuleCodePointer,\n    get_python_file_from_target,\n)\nfrom dagster.core.errors import DagsterInvariantViolationError\nfrom dagster.core.origin import (\n    DEFAULT_DAGSTER_ENTRY_POINT,\n    PipelinePythonOrigin,\n    RepositoryPythonOrigin,\n)\nfrom dagster.core.selector import parse_solid_selection\nfrom dagster.serdes import pack_value, unpack_value, whitelist_for_serdes\nfrom dagster.utils import frozenlist\nfrom dagster.utils.backcompat import experimental\n\nfrom .pipeline_base import IPipeline\n\nif TYPE_CHECKING:\n    from dagster.core.asset_defs.asset_group import AssetGroup\n\n    from .graph_definition import GraphDefinition\n    from .pipeline_definition import PipelineDefinition\n    from .repository_definition import RepositoryDefinition\n\n\ndef get_ephemeral_repository_name(pipeline_name: str) -> str:\n    check.str_param(pipeline_name, "pipeline_name")\n    return "__repository__{pipeline_name}".format(pipeline_name=pipeline_name)\n\n\n@whitelist_for_serdes\nclass ReconstructableRepository(\n    NamedTuple(\n        "_ReconstructableRepository",\n        [\n            ("pointer", CodePointer),\n            ("container_image", Optional[str]),\n            ("executable_path", Optional[str]),\n            ("entry_point", List[str]),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        pointer,\n        container_image=None,\n        executable_path=None,\n        entry_point=None,\n    ):\n        return super(ReconstructableRepository, cls).__new__(\n            cls,\n            pointer=check.inst_param(pointer, "pointer", CodePointer),\n            container_image=check.opt_str_param(container_image, "container_image"),\n            executable_path=check.opt_str_param(executable_path, "executable_path"),\n            entry_point=(\n                frozenlist(check.list_param(entry_point, "entry_point", of_type=str))\n                if entry_point != None\n                else DEFAULT_DAGSTER_ENTRY_POINT\n            ),\n        )\n\n    @lru_cache(maxsize=1)\n    def get_definition(self):\n        return repository_def_from_pointer(self.pointer)\n\n    def get_reconstructable_pipeline(self, name):\n        return ReconstructablePipeline(self, name)\n\n    @classmethod\n    def for_file(cls, file, fn_name, working_directory=None, container_image=None):\n        if not working_directory:\n            working_directory = os.getcwd()\n        return cls(FileCodePointer(file, fn_name, working_directory), container_image)\n\n    @classmethod\n    def for_module(cls, module, fn_name, working_directory=None, container_image=None):\n        return cls(ModuleCodePointer(module, fn_name, working_directory), container_image)\n\n    def get_python_origin(self):\n        return RepositoryPythonOrigin(\n            executable_path=self.executable_path if self.executable_path else sys.executable,\n            code_pointer=self.pointer,\n            container_image=self.container_image,\n            entry_point=self.entry_point,\n        )\n\n    def get_python_origin_id(self):\n        return self.get_python_origin().get_id()\n\n\n
[docs]@whitelist_for_serdes\nclass ReconstructablePipeline(\n NamedTuple(\n "_ReconstructablePipeline",\n [\n ("repository", ReconstructableRepository),\n ("pipeline_name", str),\n ("solid_selection_str", Optional[str]),\n ("solids_to_execute", Optional[FrozenSet[str]]),\n ],\n ),\n IPipeline,\n):\n """Defines a reconstructable pipeline. When your pipeline/job must cross process boundaries,\n Dagster must know how to reconstruct the pipeline/job on the other side of the process boundary.\n\n Args:\n repository (ReconstructableRepository): The reconstructable representation of the repository\n the pipeline/job belongs to.\n pipeline_name (str): The name of the pipeline/job.\n solid_selection_str (Optional[str]): The string value of a comma separated list of user-input\n solid/op selection. None if no selection is specified, i.e. the entire pipeline/job will\n be run.\n solids_to_execute (Optional[FrozenSet[str]]): A set of solid/op names to execute. None if no selection\n is specified, i.e. the entire pipeline/job will be run.\n """\n\n def __new__(\n cls,\n repository,\n pipeline_name,\n solid_selection_str=None,\n solids_to_execute=None,\n ):\n check.opt_set_param(solids_to_execute, "solids_to_execute", of_type=str)\n return super(ReconstructablePipeline, cls).__new__(\n cls,\n repository=check.inst_param(repository, "repository", ReconstructableRepository),\n pipeline_name=check.str_param(pipeline_name, "pipeline_name"),\n solid_selection_str=check.opt_str_param(solid_selection_str, "solid_selection_str"),\n solids_to_execute=solids_to_execute,\n )\n\n @property\n def solid_selection(self) -> Optional[List[str]]:\n return seven.json.loads(self.solid_selection_str) if self.solid_selection_str else None\n\n @lru_cache(maxsize=1)\n def get_definition(self):\n from dagster.core.definitions.job_definition import JobDefinition\n\n defn = self.repository.get_definition().get_pipeline(self.pipeline_name)\n\n if isinstance(defn, JobDefinition):\n return (\n self.repository.get_definition().get_pipeline(self.pipeline_name)\n # jobs use pre-resolved selection\n .get_job_def_for_op_selection(self.solid_selection)\n )\n return (\n self.repository.get_definition().get_pipeline(self.pipeline_name)\n # pipelines use post-resolved selection\n .get_pipeline_subset_def(self.solids_to_execute)\n )\n\n def get_reconstructable_repository(self):\n return self.repository\n\n def _subset_for_execution(\n self,\n solids_to_execute: Optional[Optional[FrozenSet[str]]],\n solid_selection: Optional[List[str]],\n ) -> "ReconstructablePipeline":\n # no selection\n if solid_selection is None and solids_to_execute is None:\n return ReconstructablePipeline(\n repository=self.repository,\n pipeline_name=self.pipeline_name,\n )\n\n from dagster.core.definitions import JobDefinition, PipelineDefinition\n\n pipeline_def = self.get_definition()\n if isinstance(pipeline_def, JobDefinition):\n # when subselecting a job\n # * job subselection depend on solid_selection rather than solids_to_execute\n # * we'll resolve the op selection later in the stack\n if solid_selection is None:\n # when the pre-resolution info is unavailable (e.g. 
subset from existing run),\n # we need to fill the solid_selection in order to pass the value down to deeper stack.\n solid_selection = list(solids_to_execute) if solids_to_execute else None\n return ReconstructablePipeline(\n repository=self.repository,\n pipeline_name=self.pipeline_name,\n solid_selection_str=seven.json.dumps(solid_selection),\n solids_to_execute=None,\n )\n elif isinstance(pipeline_def, PipelineDefinition):\n # when subselecting a pipeline\n # * pipeline subselection depend on solids_to_excute rather than solid_selection\n # * we resolve a list of solid selection queries to a frozenset of qualified solid names\n # e.g. ['foo_solid+'] to {'foo_solid', 'bar_solid'}\n if solid_selection and solids_to_execute is None:\n # when post-resolution query is unavailable, resolve the query\n solids_to_execute = parse_solid_selection(pipeline_def, solid_selection)\n return ReconstructablePipeline(\n repository=self.repository,\n pipeline_name=self.pipeline_name,\n solid_selection_str=seven.json.dumps(solid_selection) if solid_selection else None,\n solids_to_execute=frozenset(solids_to_execute) if solids_to_execute else None,\n )\n else:\n raise Exception(f"Unexpected pipeline/job type {pipeline_def.__class__.__name__}")\n\n def subset_for_execution(\n self, solid_selection: Optional[List[str]]\n ) -> "ReconstructablePipeline":\n # take a list of unresolved selection queries\n check.opt_list_param(solid_selection, "solid_selection", of_type=str)\n\n return self._subset_for_execution(solids_to_execute=None, solid_selection=solid_selection)\n\n def subset_for_execution_from_existing_pipeline(\n self, solids_to_execute: Optional[FrozenSet[str]]\n ) -> "ReconstructablePipeline":\n # take a frozenset of resolved solid names from an existing pipeline\n # so there's no need to parse the selection\n check.opt_set_param(solids_to_execute, "solids_to_execute", of_type=str)\n\n return self._subset_for_execution(solids_to_execute=solids_to_execute, solid_selection=None)\n\n def describe(self):\n return '"{name}" in repository ({repo})'.format(\n repo=self.repository.pointer.describe, name=self.pipeline_name\n )\n\n @staticmethod\n def for_file(python_file, fn_name):\n return bootstrap_standalone_recon_pipeline(\n FileCodePointer(python_file, fn_name, os.getcwd())\n )\n\n @staticmethod\n def for_module(module, fn_name):\n return bootstrap_standalone_recon_pipeline(ModuleCodePointer(module, fn_name, os.getcwd()))\n\n def to_dict(self):\n return pack_value(self)\n\n @staticmethod\n def from_dict(val):\n check.dict_param(val, "val")\n\n inst = unpack_value(val)\n check.invariant(\n isinstance(inst, ReconstructablePipeline),\n "Deserialized object is not instance of ReconstructablePipeline, got {type}".format(\n type=type(inst)\n ),\n )\n return inst\n\n def get_python_origin(self):\n return PipelinePythonOrigin(self.pipeline_name, self.repository.get_python_origin())\n\n def get_python_origin_id(self):\n return self.get_python_origin().get_id()\n\n
[docs] def get_module(self) -> Optional[str]:\n """Return the module the pipeline is found in, if the origin is a module code pointer."""\n pointer = self.get_python_origin().get_repo_pointer()\n if isinstance(pointer, ModuleCodePointer):\n return pointer.module\n\n return None
\n\n\n
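Because ``ReconstructablePipeline`` is a serdes-whitelisted ``NamedTuple``, it can be packed into a plain dict with ``to_dict`` and rebuilt on the other side of a process boundary with ``from_dict``. A minimal sketch, assuming a hypothetical module ``my_module`` that defines ``define_my_pipeline`` at module scope:

.. code-block:: python

    from dagster.core.definitions.reconstruct import ReconstructablePipeline

    # "my_module" / "define_my_pipeline" are hypothetical; the target must be
    # importable at module scope in the other process as well.
    recon = ReconstructablePipeline.for_module("my_module", "define_my_pipeline")

    payload = recon.to_dict()  # serializable description of the target
    rehydrated = ReconstructablePipeline.from_dict(payload)
    assert rehydrated.pipeline_name == recon.pipeline_name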
[docs]def reconstructable(target):\n """\n Create a :py:class:`~dagster.core.definitions.reconstructable.ReconstructablePipeline` from a\n function that returns a :py:class:`~dagster.PipelineDefinition`/:py:class:`~dagster.JobDefinition`,\n or a function decorated with :py:func:`@pipeline <dagster.pipeline>`/:py:func:`@job <dagster.job>`.\n\n When your pipeline/job must cross process boundaries, e.g., for execution on multiple nodes or\n in different systems (like ``dagstermill``), Dagster must know how to reconstruct the pipeline/job\n on the other side of the process boundary.\n\n Passing a job created with ``~dagster.GraphDefinition.to_job`` to ``reconstructable()``,\n requires you to wrap that job's definition in a module-scoped function, and pass that function\n instead:\n\n .. code-block:: python\n\n from dagster import graph, reconstructable\n\n @graph\n def my_graph():\n ...\n\n def define_my_job():\n return my_graph.to_job()\n\n reconstructable(define_my_job)\n\n This function implements a very conservative strategy for reconstruction, so that its behavior\n is easy to predict, but as a consequence it is not able to reconstruct certain kinds of pipelines\n or jobs, such as those defined by lambdas, in nested scopes (e.g., dynamically within a method\n call), or in interactive environments such as the Python REPL or Jupyter notebooks.\n\n If you need to reconstruct objects constructed in these ways, you should use\n :py:func:`~dagster.reconstructable.build_reconstructable_job` instead, which allows you to\n specify your own reconstruction strategy.\n\n Examples:\n\n .. code-block:: python\n\n from dagster import job, reconstructable\n\n @job\n def foo_job():\n ...\n\n reconstructable_foo_job = reconstructable(foo_job)\n\n\n @graph\n def foo():\n ...\n\n def make_bar_job():\n return foo.to_job()\n\n reconstructable_bar_job = reconstructable(make_bar_job)\n """\n from dagster.core.definitions import JobDefinition, PipelineDefinition\n\n if not seven.is_function_or_decorator_instance_of(target, PipelineDefinition):\n if isinstance(target, JobDefinition):\n raise DagsterInvariantViolationError(\n "Reconstructable target was not a function returning a job definition, or a job "\n "definition produced by a decorated function. If your job was constructed using "\n "``GraphDefinition.to_job``, you must wrap the ``to_job`` call in a function at "\n "module scope, ie not within any other functions. "\n "To learn more, check out the docs on ``reconstructable``: "\n "https://docs.dagster.io/_apidocs/execution#dagster.reconstructable"\n )\n raise DagsterInvariantViolationError(\n "Reconstructable target should be a function or definition produced "\n "by a decorated function, got {type}.".format(type=type(target)),\n )\n\n if seven.is_lambda(target):\n raise DagsterInvariantViolationError(\n "Reconstructable target can not be a lambda. Use a function or "\n "decorated function defined at module scope instead, or use "\n "build_reconstructable_job."\n )\n\n if seven.qualname_differs(target):\n raise DagsterInvariantViolationError(\n 'Reconstructable target "{target.__name__}" has a different '\n '__qualname__ "{target.__qualname__}" indicating it is not '\n "defined at module scope. 
Use a function or decorated function "\n "defined at module scope instead, or use build_reconstructable_job.".format(\n target=target\n )\n )\n\n try:\n if (\n hasattr(target, "__module__")\n and hasattr(target, "__name__")\n and inspect.getmodule(target).__name__ != "__main__"\n ):\n return ReconstructablePipeline.for_module(target.__module__, target.__name__)\n except:\n pass\n\n python_file = get_python_file_from_target(target)\n if not python_file:\n raise DagsterInvariantViolationError(\n "reconstructable() can not reconstruct jobs or pipelines defined in interactive "\n "environments like <stdin>, IPython, or Jupyter notebooks. "\n "Use a pipeline defined in a module or file instead, or use build_reconstructable_job."\n )\n\n pointer = FileCodePointer(\n python_file=python_file, fn_name=target.__name__, working_directory=os.getcwd()\n )\n\n return bootstrap_standalone_recon_pipeline(pointer)
\n\n\n
[docs]@experimental\ndef build_reconstructable_job(\n reconstructor_module_name,\n reconstructor_function_name,\n reconstructable_args=None,\n reconstructable_kwargs=None,\n reconstructor_working_directory=None,\n):\n """\n Create a :py:class:`dagster.core.definitions.reconstructable.ReconstructablePipeline`.\n\n When your job must cross process boundaries, e.g., for execution on multiple nodes or in\n different systems (like ``dagstermill``), Dagster must know how to reconstruct the job\n on the other side of the process boundary.\n\n This function allows you to use the strategy of your choice for reconstructing jobs, so\n that you can reconstruct certain kinds of jobs that are not supported by\n :py:func:`~dagster.reconstructable`, such as those defined by lambdas, in nested scopes (e.g.,\n dynamically within a method call), or in interactive environments such as the Python REPL or\n Jupyter notebooks.\n\n If you need to reconstruct jobs constructed in these ways, use this function instead of\n :py:func:`~dagster.reconstructable`.\n\n Args:\n reconstructor_module_name (str): The name of the module containing the function to use to\n reconstruct the job.\n reconstructor_function_name (str): The name of the function to use to reconstruct the\n job.\n reconstructable_args (Tuple): Args to the function to use to reconstruct the job.\n Values of the tuple must be JSON serializable.\n reconstructable_kwargs (Dict[str, Any]): Kwargs to the function to use to reconstruct the\n job. Values of the dict must be JSON serializable.\n\n Examples:\n\n .. code-block:: python\n\n # module: mymodule\n\n from dagster import JobDefinition, job, build_reconstructable_job\n\n class JobFactory:\n def make_job(*args, **kwargs):\n\n @job\n def _job(...):\n ...\n\n return _job\n\n def reconstruct_job(*args):\n factory = JobFactory()\n return factory.make_job(*args)\n\n factory = JobFactory()\n\n foo_job_args = (...,...)\n\n foo_job_kwargs = {...:...}\n\n foo_job = factory.make_job(*foo_job_args, **foo_job_kwargs)\n\n reconstructable_foo_job = build_reconstructable_job(\n 'mymodule',\n 'reconstruct_job',\n foo_job_args,\n foo_job_kwargs,\n )\n """\n check.str_param(reconstructor_module_name, "reconstructor_module_name")\n check.str_param(reconstructor_function_name, "reconstructor_function_name")\n check.opt_str_param(\n reconstructor_working_directory, "reconstructor_working_directory", os.getcwd()\n )\n\n reconstructable_args = list(check.opt_tuple_param(reconstructable_args, "reconstructable_args"))\n reconstructable_kwargs = list(\n (\n [key, value]\n for key, value in check.opt_dict_param(\n reconstructable_kwargs, "reconstructable_kwargs", key_type=str\n ).items()\n )\n )\n\n reconstructor_pointer = ModuleCodePointer(\n reconstructor_module_name,\n reconstructor_function_name,\n working_directory=reconstructor_working_directory,\n )\n\n pointer = CustomPointer(reconstructor_pointer, reconstructable_args, reconstructable_kwargs)\n\n pipeline_def = pipeline_def_from_pointer(pointer)\n\n return ReconstructablePipeline(\n repository=ReconstructableRepository(pointer), # creates ephemeral repo\n pipeline_name=pipeline_def.name,\n )
\n\n\n# back compat, in case users have imported these directly\nbuild_reconstructable_pipeline = build_reconstructable_job\nbuild_reconstructable_target = build_reconstructable_job\n\n\ndef bootstrap_standalone_recon_pipeline(pointer):\n # So this actually straps the the pipeline for the sole\n # purpose of getting the pipeline name. If we changed ReconstructablePipeline\n # to get the pipeline on demand in order to get name, we could avoid this.\n pipeline_def = pipeline_def_from_pointer(pointer)\n return ReconstructablePipeline(\n repository=ReconstructableRepository(pointer), # creates ephemeral repo\n pipeline_name=pipeline_def.name,\n )\n\n\ndef _check_is_loadable(definition):\n from dagster.core.asset_defs import AssetGroup\n\n from .graph_definition import GraphDefinition\n from .pipeline_definition import PipelineDefinition\n from .repository_definition import RepositoryDefinition\n\n if not isinstance(\n definition, (PipelineDefinition, RepositoryDefinition, GraphDefinition, AssetGroup)\n ):\n raise DagsterInvariantViolationError(\n (\n "Loadable attributes must be either a JobDefinition, GraphDefinition, "\n f"PipelineDefinition, AssetGroup, or RepositoryDefinition. Got {repr(definition)}."\n )\n )\n return definition\n\n\ndef load_def_in_module(module_name, attribute, working_directory):\n return def_from_pointer(CodePointer.from_module(module_name, attribute, working_directory))\n\n\ndef load_def_in_package(package_name, attribute, working_directory):\n return def_from_pointer(\n CodePointer.from_python_package(package_name, attribute, working_directory)\n )\n\n\ndef load_def_in_python_file(python_file, attribute, working_directory):\n return def_from_pointer(CodePointer.from_python_file(python_file, attribute, working_directory))\n\n\ndef def_from_pointer(\n pointer: CodePointer,\n) -> Union["PipelineDefinition", "RepositoryDefinition", "GraphDefinition"]:\n target = pointer.load_target()\n\n from dagster.core.asset_defs.asset_group import AssetGroup\n\n from .graph_definition import GraphDefinition\n from .pipeline_definition import PipelineDefinition\n from .repository_definition import RepositoryDefinition\n\n if isinstance(\n target, (PipelineDefinition, RepositoryDefinition, GraphDefinition, AssetGroup)\n ) or not callable(target):\n return _check_is_loadable(target)\n\n # if its a function invoke it - otherwise we are pointing to a\n # artifact in module scope, likely decorator output\n\n if seven.get_args(target):\n raise DagsterInvariantViolationError(\n "Error invoking function at {target} with no arguments. "\n "Reconstructable target must be callable with no arguments".format(\n target=pointer.describe()\n )\n )\n\n return _check_is_loadable(target())\n\n\ndef pipeline_def_from_pointer(pointer: CodePointer) -> "PipelineDefinition":\n from .pipeline_definition import PipelineDefinition\n\n target = def_from_pointer(pointer)\n\n if isinstance(target, PipelineDefinition):\n return target\n\n raise DagsterInvariantViolationError(\n "CodePointer ({str}) must resolve to a JobDefinition (or PipelineDefinition for legacy code). 
"\n "Received a {type}".format(str=pointer.describe(), type=type(target))\n )\n\n\n@overload\n# NOTE: mypy can't handle these overloads but pyright can\ndef repository_def_from_target_def( # type: ignore\n target: Union["RepositoryDefinition", "PipelineDefinition", "GraphDefinition", "AssetGroup"]\n) -> "RepositoryDefinition":\n ...\n\n\n@overload\ndef repository_def_from_target_def(target: object) -> None:\n ...\n\n\ndef repository_def_from_target_def(target: object) -> Optional["RepositoryDefinition"]:\n from dagster.core.asset_defs.asset_group import AssetGroup\n\n from .graph_definition import GraphDefinition\n from .pipeline_definition import PipelineDefinition\n from .repository_definition import CachingRepositoryData, RepositoryDefinition\n\n # special case - we can wrap a single pipeline in a repository\n if isinstance(target, (PipelineDefinition, GraphDefinition)):\n # consider including pipeline name in generated repo name\n return RepositoryDefinition(\n name=get_ephemeral_repository_name(target.name),\n repository_data=CachingRepositoryData.from_list([target]),\n )\n elif isinstance(target, AssetGroup):\n return RepositoryDefinition(\n name="__repository__", repository_data=CachingRepositoryData.from_list([target])\n )\n elif isinstance(target, RepositoryDefinition):\n return target\n else:\n return None\n\n\ndef repository_def_from_pointer(pointer: CodePointer) -> "RepositoryDefinition":\n target = def_from_pointer(pointer)\n repo_def = repository_def_from_target_def(target)\n if not repo_def:\n raise DagsterInvariantViolationError(\n "CodePointer ({str}) must resolve to a "\n "RepositoryDefinition, JobDefinition, or PipelineDefinition. "\n "Received a {type}".format(str=pointer.describe(), type=type(target))\n )\n return repo_def\n
", "current_page_name": "_modules/dagster/core/definitions/reconstruct", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.reconstruct"}, "repository_definition": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.repository_definition

\nimport warnings\nfrom abc import ABC, abstractmethod\nfrom types import FunctionType\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Dict,\n    Generic,\n    List,\n    Mapping,\n    Optional,\n    Type,\n    TypeVar,\n    Union,\n    cast,\n)\n\nfrom dagster import check\nfrom dagster.core.asset_defs.source_asset import SourceAsset\nfrom dagster.core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\nfrom dagster.utils import merge_dicts\nfrom dagster.utils.backcompat import ExperimentalWarning\n\nfrom .events import AssetKey\nfrom .graph_definition import GraphDefinition, SubselectedGraphDefinition\nfrom .job_definition import JobDefinition\nfrom .partition import PartitionScheduleDefinition, PartitionSetDefinition\nfrom .pipeline_definition import PipelineDefinition\nfrom .schedule_definition import ScheduleDefinition\nfrom .sensor_definition import SensorDefinition\nfrom .utils import check_valid_name\n\nif TYPE_CHECKING:\n    from dagster.core.asset_defs.asset_group import AssetGroup\n\nVALID_REPOSITORY_DATA_DICT_KEYS = {\n    "pipelines",\n    "partition_sets",\n    "schedules",\n    "sensors",\n    "jobs",\n}\n\nRepositoryLevelDefinition = TypeVar(\n    "RepositoryLevelDefinition",\n    PipelineDefinition,\n    JobDefinition,\n    PartitionSetDefinition,\n    ScheduleDefinition,\n    SensorDefinition,\n)\n\n\nclass _CacheingDefinitionIndex(Generic[RepositoryLevelDefinition]):\n    def __init__(\n        self,\n        definition_class: Type[RepositoryLevelDefinition],\n        definition_class_name: str,\n        definition_kind: str,\n        definitions: Mapping[\n            str, Union[RepositoryLevelDefinition, Callable[[], RepositoryLevelDefinition]]\n        ],\n        validation_fn: Callable[[RepositoryLevelDefinition], RepositoryLevelDefinition],\n        lazy_definitions_fn: Optional[Callable[[], List[RepositoryLevelDefinition]]] = None,\n    ):\n        """\n        Args:\n            definitions: A dictionary of definition names to definitions or functions that load\n                definitions.\n            lazy_definitions_fn: A function for loading a list of definitions whose names are not\n                even known until loaded.\n\n        """\n\n        for key, definition in definitions.items():\n            check.invariant(\n                isinstance(definition, definition_class) or callable(definition),\n                "Bad definition for {definition_kind} {key}: must be {definition_class_name} or "\n                "callable, got {type_}".format(\n                    definition_kind=definition_kind,\n                    key=key,\n                    definition_class_name=definition_class_name,\n                    type_=type(definition),\n                ),\n            )\n\n        self._definition_class: Type[RepositoryLevelDefinition] = definition_class\n        self._definition_class_name = definition_class_name\n        self._definition_kind = definition_kind\n        self._validation_fn: Callable[\n            [RepositoryLevelDefinition], RepositoryLevelDefinition\n        ] = validation_fn\n\n        self._definitions: Mapping[\n            str, Union[RepositoryLevelDefinition, Callable[[], RepositoryLevelDefinition]]\n        ] = definitions\n        self._definition_cache: Dict[str, RepositoryLevelDefinition] = {}\n        self._definition_names: Optional[List[str]] = None\n\n        self._lazy_definitions_fn: Callable[\n            [], List[RepositoryLevelDefinition]\n        ] = 
lazy_definitions_fn or (lambda: [])\n        self._lazy_definitions: Optional[List[RepositoryLevelDefinition]] = None\n\n        self._all_definitions: Optional[List[RepositoryLevelDefinition]] = None\n\n    def _get_lazy_definitions(self) -> List[RepositoryLevelDefinition]:\n        if self._lazy_definitions is None:\n            self._lazy_definitions = self._lazy_definitions_fn()\n            for definition in self._lazy_definitions:\n                self._validate_and_cache_definition(definition, definition.name)\n\n        return self._lazy_definitions\n\n    def get_definition_names(self) -> List[str]:\n        if self._definition_names:\n            return self._definition_names\n\n        lazy_names = []\n        for definition in self._get_lazy_definitions():\n            strict_definition = self._definitions.get(definition.name)\n            if strict_definition:\n                check.invariant(\n                    strict_definition == definition,\n                    f"Duplicate definition found for {definition.name}",\n                )\n            else:\n                lazy_names.append(definition.name)\n\n        self._definition_names = list(self._definitions.keys()) + lazy_names\n        return self._definition_names\n\n    def has_definition(self, definition_name: str) -> bool:\n        check.str_param(definition_name, "definition_name")\n\n        return definition_name in self.get_definition_names()\n\n    def get_all_definitions(self) -> List[RepositoryLevelDefinition]:\n        if self._all_definitions is not None:\n            return self._all_definitions\n\n        self._all_definitions = list(\n            sorted(\n                map(self.get_definition, self.get_definition_names()),\n                key=lambda definition: definition.name,\n            )\n        )\n        return self._all_definitions\n\n    def get_definition(self, definition_name: str) -> RepositoryLevelDefinition:\n        check.str_param(definition_name, "definition_name")\n\n        if not self.has_definition(definition_name):\n            raise DagsterInvariantViolationError(\n                "Could not find {definition_kind} '{definition_name}'. 
Found: "\n                "{found_names}.".format(\n                    definition_kind=self._definition_kind,\n                    definition_name=definition_name,\n                    found_names=", ".join(\n                        [\n                            "'{found_name}'".format(found_name=found_name)\n                            for found_name in self.get_definition_names()\n                        ]\n                    ),\n                )\n            )\n\n        if definition_name in self._definition_cache:\n            return self._definition_cache[definition_name]\n\n        definition_source = self._definitions[definition_name]\n\n        if isinstance(definition_source, self._definition_class):\n            self._definition_cache[definition_name] = self._validation_fn(definition_source)\n            return definition_source\n        else:\n            definition = cast(Callable, definition_source)()\n            self._validate_and_cache_definition(definition, definition_name)\n            return definition\n\n    def _validate_and_cache_definition(\n        self, definition: RepositoryLevelDefinition, definition_dict_key: str\n    ):\n        check.invariant(\n            isinstance(definition, self._definition_class),\n            "Bad constructor for {definition_kind} {definition_name}: must return "\n            "{definition_class_name}, got value of type {type_}".format(\n                definition_kind=self._definition_kind,\n                definition_name=definition_dict_key,\n                definition_class_name=self._definition_class_name,\n                type_=type(definition),\n            ),\n        )\n        check.invariant(\n            definition.name == definition_dict_key,\n            "Bad constructor for {definition_kind} '{definition_name}': name in "\n            "{definition_class_name} does not match: got '{definition_def_name}'".format(\n                definition_kind=self._definition_kind,\n                definition_name=definition_dict_key,\n                definition_class_name=self._definition_class_name,\n                definition_def_name=definition.name,\n            ),\n        )\n        self._definition_cache[definition_dict_key] = self._validation_fn(definition)\n\n\n
[docs]class RepositoryData(ABC):\n """\n Users should usually rely on the :py:func:`@repository <repository>` decorator to create new\n repositories, which will in turn call the static constructors on this class. However, users may\n subclass :py:class:`RepositoryData` for fine-grained control over access to and lazy creation\n of repository members.\n """\n\n @abstractmethod\n def get_all_pipelines(self) -> List[PipelineDefinition]:\n """Return all pipelines/jobs in the repository as a list.\n\n Returns:\n List[PipelineDefinition]: All pipelines/jobs in the repository.\n """\n\n def get_all_jobs(self) -> List[JobDefinition]:\n """Return all jobs in the repository as a list.\n\n Returns:\n List[JobDefinition]: All jobs in the repository.\n """\n return [job for job in self.get_all_pipelines() if isinstance(job, JobDefinition)]\n\n def get_pipeline_names(self) -> List[str]:\n """Get the names of all pipelines/jobs in the repository.\n\n Returns:\n List[str]\n """\n return [pipeline_def.name for pipeline_def in self.get_all_pipelines()]\n\n def get_job_names(self) -> List[str]:\n """Get the names of all jobs in the repository.\n\n Returns:\n List[str]\n """\n return [job_def.name for job_def in self.get_all_jobs()]\n\n def has_pipeline(self, pipeline_name: str) -> bool:\n """Check if a pipeline/job with a given name is present in the repository.\n\n Args:\n pipeline_name (str): The name of the pipeline/job.\n\n Returns:\n bool\n """\n return pipeline_name in self.get_pipeline_names()\n\n def has_job(self, job_name: str) -> bool:\n """Check if a job with a given name is present in the repository.\n\n Args:\n job_name (str): The name of the job.\n\n Returns:\n bool\n """\n return job_name in self.get_job_names()\n\n def get_pipeline(self, pipeline_name) -> PipelineDefinition:\n """Get a pipeline/job by name.\n\n Args:\n pipeline_name (str): Name of the pipeline/job to retrieve.\n\n Returns:\n PipelineDefinition: The pipeline/job definition corresponding to the given name.\n """\n pipelines_with_name = [\n pipeline for pipeline in self.get_all_pipelines() if pipeline.name == pipeline_name\n ]\n if not pipelines_with_name:\n raise DagsterInvariantViolationError(\n f"Could not find pipeline/job {pipeline_name} in repository"\n )\n return pipelines_with_name[0]\n\n def get_job(self, job_name: str) -> JobDefinition:\n """Get a job by name.\n\n Args:\n job_name (str): Name of the job to retrieve.\n\n Returns:\n JobDefinition: The job definition corresponding to the given name.\n """\n match = next(job for job in self.get_all_jobs() if job.name == job_name)\n if match is None:\n raise DagsterInvariantViolationError(f"Could not find job {job_name} in repository")\n return match\n\n def get_partition_set_names(self):\n """Get the names of all partition sets in the repository.\n\n Returns:\n List[str]\n """\n return [partition_set.name for partition_set in self.get_all_partition_sets()]\n\n def has_partition_set(self, partition_set_name: str) -> bool:\n """Check if a partition set with a given name is present in the repository.\n\n Args:\n partition_set_name (str): The name of the partition set.\n\n Returns:\n bool\n """\n return partition_set_name in self.get_partition_set_names()\n\n def get_all_partition_sets(self) -> List[PartitionSetDefinition]:\n """Return all partition sets in the repository as a list.\n\n Returns:\n List[PartitionSetDefinition]: All partition sets in the repository.\n """\n return []\n\n def get_partition_set(self, partition_set_name: str) -> PartitionSetDefinition:\n """Get a 
partition set by name.\n\n Args:\n partition_set_name (str): Name of the partition set to retrieve.\n\n Returns:\n PartitionSetDefinition: The partition set definition corresponding to the given name.\n """\n partition_sets_with_name = [\n partition_set\n for partition_set in self.get_all_partition_sets()\n if partition_set.name == partition_set_name\n ]\n if not partition_sets_with_name:\n raise DagsterInvariantViolationError(\n f"Could not find partition set {partition_set_name} in repository"\n )\n return partition_sets_with_name[0]\n\n def get_schedule_names(self) -> List[str]:\n """Get the names of all schedules in the repository.\n\n Returns:\n List[str]\n """\n return [schedule.name for schedule in self.get_all_schedules()]\n\n def get_all_schedules(self) -> List[ScheduleDefinition]:\n """Return all schedules in the repository as a list.\n\n Returns:\n List[ScheduleDefinition]: All pipelines in the repository.\n """\n return []\n\n def get_schedule(self, schedule_name: str) -> ScheduleDefinition:\n """Get a schedule by name.\n\n args:\n schedule_name (str): name of the schedule to retrieve.\n\n Returns:\n ScheduleDefinition: The schedule definition corresponding to the given name.\n """\n schedules_with_name = [\n schedule for schedule in self.get_all_schedules() if schedule.name == schedule_name\n ]\n if not schedules_with_name:\n raise DagsterInvariantViolationError(\n f"Could not find schedule {schedule_name} in repository"\n )\n return schedules_with_name[0]\n\n def has_schedule(self, schedule_name: str) -> bool:\n return schedule_name in self.get_schedule_names()\n\n def get_all_sensors(self) -> List[SensorDefinition]:\n return []\n\n def get_sensor_names(self) -> List[str]:\n return [sensor.name for sensor in self.get_all_sensors()]\n\n def get_sensor(self, sensor_name: str) -> SensorDefinition:\n sensors_with_name = [\n sensor for sensor in self.get_all_sensors() if sensor.name == sensor_name\n ]\n if not sensors_with_name:\n raise DagsterInvariantViolationError(\n f"Could not find sensor {sensor_name} in repository"\n )\n return sensors_with_name[0]\n\n def has_sensor(self, sensor_name: str) -> bool:\n return sensor_name in self.get_sensor_names()\n\n def get_source_assets_by_key(self) -> Mapping[AssetKey, SourceAsset]:\n return {}
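As the docstring above notes, ``RepositoryData`` can be subclassed for fine-grained control over how repository members are produced; only ``get_all_pipelines`` is abstract, and the other accessors fall back to the base-class implementations. A minimal sketch, assuming a hypothetical ``load_jobs_from_config`` helper that builds ``JobDefinition`` objects from some external source:

.. code-block:: python

    from typing import List

    from dagster import JobDefinition, job, op
    from dagster.core.definitions.repository_definition import RepositoryData

    @op
    def do_work():
        ...

    def load_jobs_from_config() -> List[JobDefinition]:
        # Hypothetical loader; in practice this might read a config file or a database.
        @job
        def configured_job():
            do_work()
        return [configured_job]

    class ConfigBackedRepositoryData(RepositoryData):
        def __init__(self):
            self._jobs = None

        def get_all_pipelines(self) -> List[JobDefinition]:
            # JobDefinition subclasses PipelineDefinition, so jobs returned here are also
            # visible through the base-class get_all_jobs()/get_job() helpers.
            if self._jobs is None:
                self._jobs = load_jobs_from_config()
            return self._jobs

    data = ConfigBackedRepositoryData()
    assert data.has_job("configured_job")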
\n\n\nT = TypeVar("T")\nResolvable = Callable[[], T]\n\n\nclass CachingRepositoryData(RepositoryData):\n """Default implementation of RepositoryData used by the :py:func:`@repository <repository>` decorator."""\n\n _all_jobs: Optional[List[JobDefinition]]\n _all_pipelines: Optional[List[PipelineDefinition]]\n\n def __init__(\n self,\n pipelines: Mapping[str, Union[PipelineDefinition, Resolvable[PipelineDefinition]]],\n jobs: Mapping[str, Union[JobDefinition, Resolvable[JobDefinition]]],\n partition_sets: Mapping[\n str, Union[PartitionSetDefinition, Resolvable[PartitionSetDefinition]]\n ],\n schedules: Mapping[str, Union[ScheduleDefinition, Resolvable[ScheduleDefinition]]],\n sensors: Mapping[str, Union[SensorDefinition, Resolvable[SensorDefinition]]],\n source_assets: Mapping[AssetKey, SourceAsset],\n ):\n """Constructs a new CachingRepositoryData object.\n\n You may pass pipeline, job, partition_set, and schedule definitions directly, or you may pass\n callables with no arguments that will be invoked to lazily construct definitions when\n accessed by name. This can be helpful for performance when there are many definitions in a\n repository, or when constructing the definitions is costly.\n\n Note that when lazily constructing a definition, the name of the definition must match its\n key in its dictionary index, or a :py:class:`DagsterInvariantViolationError` will be thrown\n at retrieval time.\n\n Args:\n pipelines (Mapping[str, Union[PipelineDefinition, Callable[[], PipelineDefinition]]]):\n The pipeline definitions belonging to the repository.\n jobs (Mapping[str, Union[JobDefinition, Callable[[], JobDefinition]]]):\n The job definitions belonging to the repository.\n partition_sets (Mapping[str, Union[PartitionSetDefinition, Callable[[], PartitionSetDefinition]]]):\n The partition sets belonging to the repository.\n schedules (Mapping[str, Union[ScheduleDefinition, Callable[[], ScheduleDefinition]]]):\n The schedules belonging to the repository.\n sensors (Mapping[str, Union[SensorDefinition, Callable[[], SensorDefinition]]]):\n The sensors belonging to a repository.\n source_assets (Mapping[AssetKey, SourceAsset]): The source assets belonging to a repository.\n """\n check.mapping_param(\n pipelines, "pipelines", key_type=str, value_type=(PipelineDefinition, FunctionType)\n )\n check.mapping_param(jobs, "jobs", key_type=str, value_type=(JobDefinition, FunctionType))\n check.mapping_param(\n partition_sets,\n "partition_sets",\n key_type=str,\n value_type=(PartitionSetDefinition, FunctionType),\n )\n check.mapping_param(\n schedules, "schedules", key_type=str, value_type=(ScheduleDefinition, FunctionType)\n )\n check.mapping_param(\n sensors, "sensors", key_type=str, value_type=(SensorDefinition, FunctionType)\n )\n check.mapping_param(\n source_assets, "source_assets", key_type=AssetKey, value_type=SourceAsset\n )\n\n self._pipelines = _CacheingDefinitionIndex(\n PipelineDefinition,\n "PipelineDefinition",\n "pipeline",\n pipelines,\n self._validate_pipeline,\n )\n\n self._jobs = _CacheingDefinitionIndex(\n JobDefinition,\n "JobDefinition",\n "job",\n jobs,\n self._validate_job,\n )\n\n self._schedules = _CacheingDefinitionIndex(\n ScheduleDefinition,\n "ScheduleDefinition",\n "schedule",\n schedules,\n self._validate_schedule,\n )\n schedule_partition_sets = [\n schedule.get_partition_set()\n for schedule in self._schedules.get_all_definitions()\n if isinstance(schedule, PartitionScheduleDefinition)\n ]\n self._source_assets = source_assets\n\n def 
load_partition_sets_from_pipelines() -> List[PartitionSetDefinition]:\n job_partition_sets = []\n for pipeline in self.get_all_pipelines():\n if isinstance(pipeline, JobDefinition):\n job_partition_set = pipeline.get_partition_set_def()\n\n if job_partition_set:\n # should only return a partition set if this was constructed using the job\n # API, with a partitioned config\n job_partition_sets.append(job_partition_set)\n\n return job_partition_sets\n\n self._partition_sets = _CacheingDefinitionIndex(\n PartitionSetDefinition,\n "PartitionSetDefinition",\n "partition set",\n merge_dicts(\n {partition_set.name: partition_set for partition_set in schedule_partition_sets},\n partition_sets,\n ),\n self._validate_partition_set,\n load_partition_sets_from_pipelines,\n )\n self._sensors = _CacheingDefinitionIndex(\n SensorDefinition,\n "SensorDefinition",\n "sensor",\n sensors,\n self._validate_sensor,\n )\n # load all sensors to force validation\n self._sensors.get_all_definitions()\n\n self._all_pipelines = None\n self._all_jobs = None\n\n @staticmethod\n def from_dict(repository_definitions: Dict[str, Dict[str, Any]]) -> "CachingRepositoryData":\n """Static constructor.\n\n Args:\n repository_definition (Dict[str, Dict[str, ...]]): A dict of the form:\n\n {\n 'pipelines': Dict[str, Callable[[], PipelineDefinition]],\n 'jobs': Dict[str, Callable[[], JobDefinition]],\n 'partition_sets': Dict[str, Callable[[], PartitionSetDefinition]],\n 'schedules': Dict[str, Callable[[], ScheduleDefinition]]\n }\n\n This form is intended to allow definitions to be created lazily when accessed by name,\n which can be helpful for performance when there are many definitions in a repository, or\n when constructing the definitions is costly.\n """\n check.dict_param(repository_definitions, "repository_definitions", key_type=str)\n check.invariant(\n set(repository_definitions.keys()).issubset(VALID_REPOSITORY_DATA_DICT_KEYS),\n "Bad dict: must not contain keys other than {{{valid_keys}}}: found {bad_keys}.".format(\n valid_keys=", ".join(\n ["'{key}'".format(key=key) for key in VALID_REPOSITORY_DATA_DICT_KEYS]\n ),\n bad_keys=", ".join(\n [\n "'{key}'"\n for key in repository_definitions.keys()\n if key not in VALID_REPOSITORY_DATA_DICT_KEYS\n ]\n ),\n ),\n )\n\n for key in VALID_REPOSITORY_DATA_DICT_KEYS:\n if key not in repository_definitions:\n repository_definitions[key] = {}\n\n duplicate_keys = set(repository_definitions["schedules"].keys()).intersection(\n set(repository_definitions["sensors"].keys())\n )\n if duplicate_keys:\n raise DagsterInvalidDefinitionError(\n f"Duplicate definitions between schedules and sensors found for keys: {', '.join(duplicate_keys)}"\n )\n\n # merge jobs in to pipelines while they are just implemented as pipelines\n for key, job in repository_definitions["jobs"].items():\n if key in repository_definitions["pipelines"]:\n raise DagsterInvalidDefinitionError(\n f'Conflicting entries for name {key} in "jobs" and "pipelines".'\n )\n\n if isinstance(job, GraphDefinition):\n repository_definitions["jobs"][key] = job.coerce_to_job()\n elif not isinstance(job, JobDefinition):\n raise DagsterInvalidDefinitionError(\n f"Object mapped to {key} is not an instance of JobDefinition or GraphDefinition."\n )\n\n return CachingRepositoryData(**repository_definitions, source_assets={})\n\n @classmethod\n def from_list(\n cls,\n repository_definitions: List[\n Union[\n PipelineDefinition,\n PartitionSetDefinition,\n ScheduleDefinition,\n SensorDefinition,\n "AssetGroup",\n GraphDefinition,\n ]\n 
],\n ) -> "CachingRepositoryData":\n """Static constructor.\n\n Args:\n repository_definitions (List[Union[PipelineDefinition, PartitionSetDefinition, ScheduleDefinition, SensorDefinition, AssetGroup, GraphDefinition]]):\n Use this constructor when you have no need to lazy load pipelines/jobs or other\n definitions.\n """\n from dagster.core.asset_defs import AssetGroup, build_assets_job\n\n pipelines_or_jobs: Dict[str, Union[PipelineDefinition, JobDefinition]] = {}\n partition_sets: Dict[str, PartitionSetDefinition] = {}\n schedules: Dict[str, ScheduleDefinition] = {}\n sensors: Dict[str, SensorDefinition] = {}\n source_assets: Dict[AssetKey, SourceAsset] = {}\n for definition in repository_definitions:\n if isinstance(definition, PipelineDefinition):\n if (\n definition.name in pipelines_or_jobs\n and pipelines_or_jobs[definition.name] != definition\n ):\n raise DagsterInvalidDefinitionError(\n "Duplicate {target_type} definition found for {target}".format(\n target_type=definition.target_type, target=definition.describe_target()\n )\n )\n if definition.name == AssetGroup.all_assets_job_name():\n raise DagsterInvalidDefinitionError(\n f"Attempted to provide job called {AssetGroup.all_assets_job_name()} to repository, which is a reserved name. Please rename the job."\n )\n pipelines_or_jobs[definition.name] = definition\n elif isinstance(definition, PartitionSetDefinition):\n if definition.name in partition_sets:\n raise DagsterInvalidDefinitionError(\n "Duplicate partition set definition found for partition set "\n "{partition_set_name}".format(partition_set_name=definition.name)\n )\n partition_sets[definition.name] = definition\n elif isinstance(definition, SensorDefinition):\n if definition.name in sensors or definition.name in schedules:\n raise DagsterInvalidDefinitionError(\n f"Duplicate definition found for {definition.name}"\n )\n sensors[definition.name] = definition\n if definition.has_loadable_targets():\n targets = definition.load_targets()\n for target in targets:\n pipelines_or_jobs[target.name] = target\n elif isinstance(definition, ScheduleDefinition):\n if definition.name in sensors or definition.name in schedules:\n raise DagsterInvalidDefinitionError(\n f"Duplicate definition found for {definition.name}"\n )\n schedules[definition.name] = definition\n if definition.has_loadable_target():\n target = definition.load_target()\n pipelines_or_jobs[target.name] = target\n if isinstance(definition, PartitionScheduleDefinition):\n partition_set_def = definition.get_partition_set()\n if (\n partition_set_def.name in partition_sets\n and partition_set_def != partition_sets[partition_set_def.name]\n ):\n raise DagsterInvalidDefinitionError(\n "Duplicate partition set definition found for partition set "\n "{partition_set_name}".format(partition_set_name=partition_set_def.name)\n )\n partition_sets[partition_set_def.name] = partition_set_def\n elif isinstance(definition, GraphDefinition):\n coerced = definition.coerce_to_job()\n if coerced.name in pipelines_or_jobs:\n raise DagsterInvalidDefinitionError(\n "Duplicate {target_type} definition found for graph '{name}'".format(\n target_type=coerced.target_type, name=coerced.name\n )\n )\n pipelines_or_jobs[coerced.name] = coerced\n\n elif isinstance(definition, AssetGroup):\n asset_group = definition\n\n if asset_group.all_assets_job_name() in pipelines_or_jobs:\n raise DagsterInvalidDefinitionError(\n "When constructing repository, attempted to pass multiple AssetGroups. 
There can only be one AssetGroup per repository."\n )\n with warnings.catch_warnings():\n warnings.simplefilter("ignore", category=ExperimentalWarning)\n pipelines_or_jobs[asset_group.all_assets_job_name()] = build_assets_job(\n asset_group.all_assets_job_name(),\n assets=asset_group.assets,\n source_assets=asset_group.source_assets,\n resource_defs=asset_group.resource_defs,\n executor_def=asset_group.executor_def,\n )\n source_assets = {\n source_asset.key: source_asset for source_asset in asset_group.source_assets\n }\n\n else:\n check.failed(f"Unexpected repository entry {definition}")\n\n pipelines: Dict[str, PipelineDefinition] = {}\n jobs: Dict[str, JobDefinition] = {}\n for name, pipeline_or_job in pipelines_or_jobs.items():\n if isinstance(pipeline_or_job, JobDefinition):\n jobs[name] = pipeline_or_job\n else:\n pipelines[name] = pipeline_or_job\n\n return CachingRepositoryData(\n pipelines=pipelines,\n jobs=jobs,\n partition_sets=partition_sets,\n schedules=schedules,\n sensors=sensors,\n source_assets=source_assets,\n )\n\n def get_pipeline_names(self) -> List[str]:\n """Get the names of all pipelines/jobs in the repository.\n\n Returns:\n List[str]\n """\n return self._pipelines.get_definition_names() + self.get_job_names()\n\n def get_job_names(self) -> List[str]:\n """Get the names of all jobs in the repository.\n\n Returns:\n List[str]\n """\n return self._jobs.get_definition_names()\n\n def has_pipeline(self, pipeline_name: str) -> bool:\n """Check if a pipeline/job with a given name is present in the repository.\n\n Args:\n pipeline_name (str): The name of the pipeline/job.\n\n Returns:\n bool\n """\n check.str_param(pipeline_name, "pipeline_name")\n\n return self._pipelines.has_definition(pipeline_name) or self._jobs.has_definition(\n pipeline_name\n )\n\n def has_job(self, job_name: str) -> bool:\n """Check if a job with a given name is present in the repository.\n\n Args:\n job_name (str): The name of the job.\n\n Returns:\n bool\n """\n check.str_param(job_name, "job_name")\n return self._jobs.has_definition(job_name)\n\n def get_all_pipelines(self) -> List[PipelineDefinition]:\n """Return all pipelines/jobs in the repository as a list.\n\n Note that this will construct any pipeline/job that has not yet been constructed.\n\n Returns:\n List[PipelineDefinition]: All pipelines/jobs in the repository.\n """\n if self._all_pipelines is not None:\n return self._all_pipelines\n\n self._all_jobs = self._jobs.get_all_definitions()\n pipelines: List[PipelineDefinition] = [\n *self._pipelines.get_all_definitions(),\n *self._all_jobs,\n ]\n self._check_solid_defs(pipelines)\n self._all_pipelines = pipelines\n return self._all_pipelines\n\n def get_all_jobs(self) -> List[JobDefinition]:\n """Return all jobs in the repository as a list.\n\n Note that this will construct any job that has not yet been constructed.\n\n Returns:\n List[JobDefinition]: All jobs in the repository.\n """\n if self._all_jobs is not None:\n return self._all_jobs\n\n # _check_solid_defs enforces that pipeline and graph definition names are\n # unique within a repository. 
Loads pipelines in the line below to enforce\n # pipeline/job/graph uniqueness.\n self.get_all_pipelines()\n\n # The `get_all_pipelines` call ensures _all_jobs is set.\n return cast(List[JobDefinition], self._all_jobs)\n\n def get_pipeline(self, pipeline_name: str) -> PipelineDefinition:\n """Get a pipeline/job by name.\n\n If this pipeline/job has not yet been constructed, only this pipeline/job is constructed, and will\n be cached for future calls.\n\n Args:\n pipeline_name (str): Name of the pipeline/job to retrieve.\n\n Returns:\n PipelineDefinition: The pipeline/job definition corresponding to the given name.\n """\n\n check.str_param(pipeline_name, "pipeline_name")\n\n if self._jobs.has_definition(pipeline_name):\n return self._jobs.get_definition(pipeline_name)\n else:\n return self._pipelines.get_definition(pipeline_name)\n\n def get_job(self, job_name: str) -> JobDefinition:\n """Get a job by name.\n\n If this job has not yet been constructed, only this job is constructed, and will\n be cached for future calls.\n\n Args:\n job_name (str): Name of the job to retrieve.\n\n Returns:\n JobDefinition: The job definition corresponding to the given name.\n """\n\n check.str_param(job_name, "job_name")\n return self._jobs.get_definition(job_name)\n\n def get_partition_set_names(self) -> List[str]:\n """Get the names of all partition sets in the repository.\n\n Returns:\n List[str]\n """\n return self._partition_sets.get_definition_names()\n\n def has_partition_set(self, partition_set_name: str) -> bool:\n """Check if a partition set with a given name is present in the repository.\n\n Args:\n partition_set_name (str): The name of the partition set.\n\n Returns:\n bool\n """\n check.str_param(partition_set_name, "partition_set_name")\n return self._partition_sets.has_definition(partition_set_name)\n\n def get_all_partition_sets(self) -> List[PartitionSetDefinition]:\n """Return all partition sets in the repository as a list.\n\n Note that this will construct any partition set that has not yet been constructed.\n\n Returns:\n List[PartitionSetDefinition]: All partition sets in the repository.\n """\n return self._partition_sets.get_all_definitions()\n\n def get_partition_set(self, partition_set_name: str) -> PartitionSetDefinition:\n """Get a partition set by name.\n\n If this partition set has not yet been constructed, only this partition set is constructed,\n and will be cached for future calls.\n\n Args:\n partition_set_name (str): Name of the partition set to retrieve.\n\n Returns:\n PartitionSetDefinition: The partition set definition corresponding to the given name.\n """\n\n check.str_param(partition_set_name, "partition_set_name")\n\n return self._partition_sets.get_definition(partition_set_name)\n\n def get_schedule_names(self) -> List[str]:\n """Get the names of all schedules in the repository.\n\n Returns:\n List[str]\n """\n return self._schedules.get_definition_names()\n\n def get_all_schedules(self) -> List[ScheduleDefinition]:\n """Return all schedules in the repository as a list.\n\n Note that this will construct any schedule that has not yet been constructed.\n\n Returns:\n List[ScheduleDefinition]: All schedules in the repository.\n """\n return self._schedules.get_all_definitions()\n\n def get_schedule(self, schedule_name: str) -> ScheduleDefinition:\n """Get a schedule by name.\n\n if this schedule has not yet been constructed, only this schedule is constructed, and will\n be cached for future calls.\n\n args:\n schedule_name (str): name of the schedule to retrieve.\n\n 
Returns:\n ScheduleDefinition: The schedule definition corresponding to the given name.\n """\n\n check.str_param(schedule_name, "schedule_name")\n\n return self._schedules.get_definition(schedule_name)\n\n def has_schedule(self, schedule_name: str) -> bool:\n check.str_param(schedule_name, "schedule_name")\n\n return self._schedules.has_definition(schedule_name)\n\n def get_all_sensors(self) -> List[SensorDefinition]:\n return self._sensors.get_all_definitions()\n\n def get_sensor_names(self) -> List[str]:\n return self._sensors.get_definition_names()\n\n def get_sensor(self, sensor_name: str) -> SensorDefinition:\n return self._sensors.get_definition(sensor_name)\n\n def has_sensor(self, sensor_name: str) -> bool:\n return self._sensors.has_definition(sensor_name)\n\n def get_source_assets_by_key(self) -> Mapping[AssetKey, SourceAsset]:\n return self._source_assets\n\n def _check_solid_defs(self, pipelines: List[PipelineDefinition]) -> None:\n solid_defs = {}\n solid_to_pipeline = {}\n for pipeline in pipelines:\n for solid_def in [*pipeline.all_node_defs, pipeline.graph]:\n # skip checks for subselected graphs because they don't have their own names\n if isinstance(solid_def, SubselectedGraphDefinition):\n break\n\n if solid_def.name not in solid_defs:\n solid_defs[solid_def.name] = solid_def\n solid_to_pipeline[solid_def.name] = pipeline.name\n\n if not solid_defs[solid_def.name] is solid_def:\n first_name, second_name = sorted(\n [solid_to_pipeline[solid_def.name], pipeline.name]\n )\n raise DagsterInvalidDefinitionError(\n (\n f"Conflicting definitions found in repository with name '{solid_def.name}'. "\n "Op/Graph/Solid definition names must be unique within a "\n f"repository. {solid_def.__class__.__name__} is defined in {pipeline.target_type} "\n f"'{first_name}' and in {pipeline.target_type} '{second_name}'."\n )\n )\n\n def _validate_pipeline(self, pipeline: PipelineDefinition) -> PipelineDefinition:\n return pipeline\n\n def _validate_job(self, job: JobDefinition) -> JobDefinition:\n return job\n\n def _validate_schedule(self, schedule: ScheduleDefinition) -> ScheduleDefinition:\n pipelines = self.get_pipeline_names()\n\n if schedule.pipeline_name not in pipelines:\n raise DagsterInvalidDefinitionError(\n f'ScheduleDefinition "{schedule.name}" targets job/pipeline "{schedule.pipeline_name}" '\n "which was not found in this repository."\n )\n\n return schedule\n\n def _validate_sensor(self, sensor: SensorDefinition) -> SensorDefinition:\n pipelines = self.get_pipeline_names()\n if len(sensor.targets) == 0:\n # skip validation when the sensor does not target a pipeline\n return sensor\n\n for target in sensor.targets:\n if target.pipeline_name not in pipelines:\n raise DagsterInvalidDefinitionError(\n f'SensorDefinition "{sensor.name}" targets job/pipeline "{sensor.pipeline_name}" '\n "which was not found in this repository."\n )\n\n return sensor\n\n def _validate_partition_set(\n self, partition_set: PartitionSetDefinition\n ) -> PartitionSetDefinition:\n return partition_set\n\n\n
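For reference, the lazy dictionary form that ``CachingRepositoryData.from_dict`` accepts (and that an ``@repository``-decorated function can return) maps each category to name-keyed, zero-argument callables, so a definition is only constructed when first requested by name. A small sketch with hypothetical ``make_expensive_job_*`` factories; each constructed job's name must match its key:

.. code-block:: python

    from dagster import job, op, repository

    @op
    def crunch():
        ...

    def make_expensive_job_1():
        @job(name="expensive_job_1")
        def expensive_job_1():
            crunch()
        return expensive_job_1

    def make_expensive_job_2():
        @job(name="expensive_job_2")
        def expensive_job_2():
            crunch()
        return expensive_job_2

    @repository
    def lazy_repository():
        # Routed through CachingRepositoryData.from_dict: neither factory runs until
        # its job is first requested by name.
        return {
            "jobs": {
                "expensive_job_1": make_expensive_job_1,
                "expensive_job_2": make_expensive_job_2,
            },
        }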
[docs]class RepositoryDefinition:\n """Define a repository that contains a group of definitions.\n\n Users should typically not create objects of this class directly. Instead, use the\n :py:func:`@repository` decorator.\n\n Args:\n name (str): The name of the repository.\n repository_data (RepositoryData): Contains the definitions making up the repository.\n description (Optional[str]): A string description of the repository.\n """\n\n def __init__(\n self,\n name,\n repository_data,\n description=None,\n ):\n self._name = check_valid_name(name)\n self._description = check.opt_str_param(description, "description")\n self._repository_data = check.inst_param(repository_data, "repository_data", RepositoryData)\n\n @property\n def name(self) -> str:\n return self._name\n\n @property\n def description(self) -> Optional[str]:\n return self._description\n\n @property\n def pipeline_names(self) -> List[str]:\n """List[str]: Names of all pipelines/jobs in the repository"""\n return self._repository_data.get_pipeline_names()\n\n @property\n def job_names(self) -> List[str]:\n """List[str]: Names of all jobs in the repository"""\n return self._repository_data.get_job_names()\n\n def has_pipeline(self, name: str) -> bool:\n """Check if a pipeline/job with a given name is present in the repository.\n\n Args:\n name (str): The name of the pipeline/job.\n\n Returns:\n bool\n """\n return self._repository_data.has_pipeline(name)\n\n def get_pipeline(self, name: str) -> PipelineDefinition:\n """Get a pipeline/job by name.\n\n If this pipeline/job is present in the lazily evaluated dictionary passed to the\n constructor, but has not yet been constructed, only this pipeline/job is constructed, and will\n be cached for future calls.\n\n Args:\n name (str): Name of the pipeline/job to retrieve.\n\n Returns:\n PipelineDefinition: The pipeline/job definition corresponding to the given name.\n """\n return self._repository_data.get_pipeline(name)\n\n def get_all_pipelines(self) -> List[PipelineDefinition]:\n """Return all pipelines/jobs in the repository as a list.\n\n Note that this will construct any pipeline/job in the lazily evaluated dictionary that\n has not yet been constructed.\n\n Returns:\n List[PipelineDefinition]: All pipelines/jobs in the repository.\n """\n return self._repository_data.get_all_pipelines()\n\n
[docs] def has_job(self, name: str) -> bool:\n """Check if a job with a given name is present in the repository.\n\n Args:\n name (str): The name of the job.\n\n Returns:\n bool\n """\n return self._repository_data.has_job(name)
\n\n
[docs] def get_job(self, name: str) -> JobDefinition:\n """Get a job by name.\n\n If this job is present in the lazily evaluated dictionary passed to the\n constructor, but has not yet been constructed, only this job is constructed, and\n will be cached for future calls.\n\n Args:\n name (str): Name of the job to retrieve.\n\n Returns:\n JobDefinition: The job definition corresponding to\n the given name.\n """\n return self._repository_data.get_job(name)
\n\n
[docs] def get_all_jobs(self) -> List[JobDefinition]:\n """Return all jobs in the repository as a list.\n\n Note that this will construct any job in the lazily evaluated dictionary that has\n not yet been constructed.\n\n Returns:\n List[JobDefinition]: All jobs in the repository.\n """\n return self._repository_data.get_all_jobs()
\n\n @property\n def partition_set_defs(self) -> List[PartitionSetDefinition]:\n return self._repository_data.get_all_partition_sets()\n\n def get_partition_set_def(self, name: str) -> PartitionSetDefinition:\n return self._repository_data.get_partition_set(name)\n\n @property\n def schedule_defs(self) -> List[ScheduleDefinition]:\n return self._repository_data.get_all_schedules()\n\n def get_schedule_def(self, name: str) -> ScheduleDefinition:\n return self._repository_data.get_schedule(name)\n\n def has_schedule_def(self, name: str) -> bool:\n return self._repository_data.has_schedule(name)\n\n @property\n def sensor_defs(self) -> List[SensorDefinition]:\n return self._repository_data.get_all_sensors()\n\n def get_sensor_def(self, name: str) -> SensorDefinition:\n return self._repository_data.get_sensor(name)\n\n def has_sensor_def(self, name: str) -> bool:\n return self._repository_data.has_sensor(name)\n\n @property\n def source_assets_by_key(self) -> Dict[AssetKey, SourceAsset]:\n return self._repository_data.get_source_assets_by_key()\n\n # If definition comes from the @repository decorator, then the __call__ method will be\n # overwritten. Therefore, we want to maintain the call-ability of repository definitions.\n def __call__(self, *args, **kwargs):\n return self
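In practice a ``RepositoryDefinition`` is usually produced by the ``@repository`` decorator rather than constructed directly; the accessors above then work as sketched here (the job and schedule names are illustrative):

.. code-block:: python

    from dagster import ScheduleDefinition, job, op, repository

    @op
    def say_hello():
        ...

    @job
    def hello_job():
        say_hello()

    hello_schedule = ScheduleDefinition(
        name="hello_schedule", job=hello_job, cron_schedule="0 * * * *"
    )

    @repository
    def my_repository():
        return [hello_job, hello_schedule]

    # my_repository is a RepositoryDefinition
    assert my_repository.has_job("hello_job")
    print(my_repository.get_job("hello_job").name)        # "hello_job"
    print(my_repository.job_names)                        # ["hello_job"]
    print([s.name for s in my_repository.schedule_defs])  # ["hello_schedule"]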
\n
", "current_page_name": "_modules/dagster/core/definitions/repository_definition", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.repository_definition"}, "resource_definition": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.resource_definition

\nfrom collections import namedtuple\nfrom functools import update_wrapper\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Union,\n    cast,\n    overload,\n)\n\nfrom dagster import check\nfrom dagster.core.definitions.config import is_callable_valid_config_arg\nfrom dagster.core.definitions.configurable import AnonymousConfigurableDefinition\nfrom dagster.core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterUnknownResourceError,\n)\nfrom dagster.seven import funcsigs\nfrom dagster.utils.backcompat import experimental_arg_warning\n\nfrom ..decorator_utils import (\n    get_function_params,\n    is_required_param,\n    positional_arg_name_list,\n    validate_expected_params,\n)\nfrom .definition_config_schema import (\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\nfrom .resource_invocation import resource_invocation_result\n\nif TYPE_CHECKING:\n    from dagster.core.execution.resources_init import InitResourceContext\n\n\ndef is_context_provided(params: List[funcsigs.Parameter]) -> bool:\n    return len(params) >= 1\n\n\n
[docs]class ResourceDefinition(AnonymousConfigurableDefinition):\n """Core class for defining resources.\n\n Resources are scoped ways to make external resources (like database connections) available to\n during job execution and to clean up after execution resolves.\n\n If resource_fn yields once rather than returning (in the manner of functions decorable with\n :py:func:`@contextlib.contextmanager <python:contextlib.contextmanager>`) then the body of the\n function after the yield will be run after execution resolves, allowing users to write their\n own teardown/cleanup logic.\n\n Depending on your executor, resources may be instantiated and cleaned up more than once in a\n job execution.\n\n Args:\n resource_fn (Callable[[InitResourceContext], Any]): User-provided function to instantiate\n the resource, which will be made available to executions keyed on the\n ``context.resources`` object.\n config_schema (Optional[ConfigSchema): The schema for the config. If set, Dagster will check\n that config provided for the resource matches this schema and fail if it does not. If\n not set, Dagster will accept any config provided for the resource.\n description (Optional[str]): A human-readable description of the resource.\n required_resource_keys: (Optional[Set[str]]) Keys for the resources required by this\n resource. A DagsterInvariantViolationError will be raised during initialization if\n dependencies are cyclic.\n version (Optional[str]): (Experimental) The version of the resource's definition fn. Two\n wrapped resource functions should only have the same version if they produce the same\n resource definition when provided with the same inputs.\n """\n\n def __init__(\n self,\n resource_fn: Callable[["InitResourceContext"], Any],\n config_schema: Optional[Union[Any, IDefinitionConfigSchema]] = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n ):\n self._resource_fn = check.callable_param(resource_fn, "resource_fn")\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._description = check.opt_str_param(description, "description")\n self._required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys"\n )\n self._version = check.opt_str_param(version, "version")\n if version:\n experimental_arg_warning("version", "ResourceDefinition.__init__")\n\n @property\n def resource_fn(self) -> Callable[..., Any]:\n return self._resource_fn\n\n @property\n def config_schema(self) -> IDefinitionConfigSchema:\n return self._config_schema\n\n @property\n def description(self) -> Optional[str]:\n return self._description\n\n @property\n def version(self) -> Optional[str]:\n return self._version\n\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n return self._required_resource_keys\n\n
[docs] @staticmethod\n def none_resource(description: Optional[str] = None) -> "ResourceDefinition":\n """A helper function that returns a resource whose value is ``None``.\n\n Args:\n description (Optional[str]): The description of the resource. Defaults to None.\n\n Returns:\n ResourceDefinition: A resource that does nothing.\n """\n return ResourceDefinition.hardcoded_resource(value=None, description=description)
\n\n
[docs] @staticmethod\n def hardcoded_resource(value: Any, description: Optional[str] = None) -> "ResourceDefinition":\n """A helper function that creates a ``ResourceDefinition`` with a hardcoded object.\n\n Args:\n value (Any): The value that will be accessible via context.resources.resource_name.\n description (Optional[str]): The description of the resource. Defaults to None.\n\n Returns:\n ResourceDefinition: A hardcoded resource.\n """\n return ResourceDefinition(resource_fn=lambda _init_context: value, description=description)
\n\n
[docs] @staticmethod\n def mock_resource(description: Optional[str] = None) -> "ResourceDefinition":\n """A helper function that creates a ``ResourceDefinition`` which wraps a ``mock.MagicMock``.\n\n Args:\n description (Optional[str]): The description of the resource. Defaults to None.\n\n Returns:\n ResourceDefinition: A resource that creates the magic methods automatically and helps\n you mock existing resources.\n """\n from unittest import mock\n\n return ResourceDefinition(\n resource_fn=lambda _init_context: mock.MagicMock(), description=description\n )
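The three helpers above are mainly useful for tests and for satisfying required resource keys that a particular job does not actually exercise; for example (the resource key names here are illustrative):

.. code-block:: python

    from dagster import ResourceDefinition, job, op

    @op(required_resource_keys={"warehouse", "feature_flag"})
    def summarize(context):
        context.resources.warehouse.append("row")
        return context.resources.feature_flag

    @job(
        resource_defs={
            # Stand in for a real database client during tests.
            "warehouse": ResourceDefinition.hardcoded_resource([]),
            # Satisfy a required key this job does not care about.
            "feature_flag": ResourceDefinition.none_resource(),
        }
    )
    def summarize_job():
        summarize()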
\n\n @staticmethod\n def string_resource(description: Optional[str] = None) -> "ResourceDefinition":\n return ResourceDefinition(\n resource_fn=lambda init_context: init_context.resource_config,\n config_schema=str,\n description=description,\n )\n\n def copy_for_configured(\n self, description: Optional[str], config_schema: IDefinitionConfigSchema, _\n ) -> "ResourceDefinition":\n return ResourceDefinition(\n config_schema=config_schema,\n description=description or self.description,\n resource_fn=self.resource_fn,\n required_resource_keys=self.required_resource_keys,\n version=self.version,\n )\n\n def __call__(self, *args, **kwargs):\n from dagster.core.execution.resources_init import InitResourceContext\n\n context_provided = is_context_provided(get_function_params(self.resource_fn))\n\n if context_provided:\n if len(args) + len(kwargs) == 0:\n raise DagsterInvalidInvocationError(\n "Resource initialization function has context argument, but no context was provided "\n "when invoking."\n )\n if len(args) + len(kwargs) > 1:\n raise DagsterInvalidInvocationError(\n "Initialization of resource received multiple arguments. Only a first "\n "positional context parameter should be provided when invoking."\n )\n\n context_param_name = get_function_params(self.resource_fn)[0].name\n\n if args:\n check.opt_inst_param(args[0], context_param_name, InitResourceContext)\n return resource_invocation_result(self, args[0])\n else:\n if context_param_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Resource initialization expected argument '{context_param_name}'."\n )\n check.opt_inst_param(\n kwargs[context_param_name], context_param_name, InitResourceContext\n )\n\n return resource_invocation_result(self, kwargs[context_param_name])\n else:\n return resource_invocation_result(self, None)
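As the class docstring notes, a ``resource_fn`` that yields once gets context-manager-style teardown: the code after the ``yield`` runs once execution resolves. A sketch using the constructor directly, with an in-memory ``sqlite3`` connection standing in for any external client:

.. code-block:: python

    import sqlite3

    from dagster import ResourceDefinition

    def _tmp_db(_init_context):
        conn = sqlite3.connect(":memory:")
        try:
            yield conn      # available to ops as context.resources.<resource key>
        finally:
            conn.close()    # teardown, run after execution resolves

    tmp_db_resource = ResourceDefinition(
        resource_fn=_tmp_db,
        description="In-memory SQLite connection, closed on teardown.",
    )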
\n\n\nclass _ResourceDecoratorCallable:\n def __init__(\n self,\n config_schema: Optional[Dict[str, Any]] = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n ):\n self.config_schema = config_schema # checked by underlying definition\n self.description = check.opt_str_param(description, "description")\n self.version = check.opt_str_param(version, "version")\n self.required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys"\n )\n\n def __call__(self, resource_fn: Callable[["InitResourceContext"], Any]):\n check.callable_param(resource_fn, "resource_fn")\n\n any_name = ["*"] if is_context_provided(get_function_params(resource_fn)) else []\n\n params = get_function_params(resource_fn)\n\n missing_positional = validate_expected_params(params, any_name)\n if missing_positional:\n raise DagsterInvalidDefinitionError(\n f"@resource decorated function '{resource_fn.__name__}' expects a single "\n "positional argument."\n )\n\n extras = params[len(any_name) :]\n\n required_extras = list(filter(is_required_param, extras))\n if required_extras:\n raise DagsterInvalidDefinitionError(\n f"@resource decorated function '{resource_fn.__name__}' expects only a single positional required argument. "\n f"Got required extra params {', '.join(positional_arg_name_list(required_extras))}"\n )\n\n resource_def = ResourceDefinition(\n resource_fn=resource_fn,\n config_schema=self.config_schema,\n description=self.description,\n version=self.version,\n required_resource_keys=self.required_resource_keys,\n )\n\n update_wrapper(resource_def, wrapped=resource_fn)\n\n return resource_def\n\n\n@overload\ndef resource(config_schema=Callable[["InitResourceContext"], Any]) -> ResourceDefinition:\n ...\n\n\n@overload\ndef resource(\n config_schema: Optional[Dict[str, Any]] = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version=None,\n) -> Callable[[Callable[["InitResourceContext"], Any]], "ResourceDefinition"]:\n ...\n\n\n
[docs]def resource(\n config_schema: Optional[\n Union[Callable[["InitResourceContext"], Any], IDefinitionConfigSchema, Dict[str, Any]]\n ] = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version=None,\n) -> Union[\n Callable[[Callable[["InitResourceContext"], Any]], "ResourceDefinition"], "ResourceDefinition"\n]:\n """Define a resource.\n\n The decorated function should accept an :py:class:`InitResourceContext` and return an instance of\n the resource. This function will become the ``resource_fn`` of an underlying\n :py:class:`ResourceDefinition`.\n\n If the decorated function yields once rather than returning (in the manner of functions\n decorable with :py:func:`@contextlib.contextmanager <python:contextlib.contextmanager>`) then\n the body of the function after the yield will be run after execution resolves, allowing users\n to write their own teardown/cleanup logic.\n\n Args:\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.resource_config`. If not set, Dagster will accept any config provided.\n description(Optional[str]): A human-readable description of the resource.\n version (Optional[str]): (Experimental) The version of a resource function. Two wrapped\n resource functions should only have the same version if they produce the same resource\n definition when provided with the same inputs.\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by this resource.\n """\n\n # This case is for when decorator is used bare, without arguments.\n # E.g. @resource versus @resource()\n if callable(config_schema) and not is_callable_valid_config_arg(config_schema):\n return _ResourceDecoratorCallable()(config_schema)\n\n def _wrap(resource_fn: Callable[["InitResourceContext"], Any]) -> "ResourceDefinition":\n return _ResourceDecoratorCallable(\n config_schema=cast(Optional[Dict[str, Any]], config_schema),\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n )(resource_fn)\n\n return _wrap
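A short usage sketch of the decorator form, with a config schema read back off ``init_context.resource_config``; the resource key and config field names are illustrative:

.. code-block:: python

    from dagster import job, op, resource

    @resource(config_schema={"base_url": str, "timeout": int})
    def http_client(init_context):
        # resource_config has been validated against the schema declared above.
        cfg = init_context.resource_config
        return {"base_url": cfg["base_url"], "timeout": cfg["timeout"]}

    @op(required_resource_keys={"http_client"})
    def fetch(context):
        context.log.info(f"GET {context.resources.http_client['base_url']}")

    @job(resource_defs={"http_client": http_client})
    def fetch_job():
        fetch()

    # At run time the config is supplied under resources.<key>.config, e.g.:
    # fetch_job.execute_in_process(
    #     run_config={
    #         "resources": {
    #             "http_client": {"config": {"base_url": "https://example.com", "timeout": 5}}
    #         }
    #     }
    # )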
\n\n\nclass Resources:\n """This class functions as a "tag" that we can use to type the namedtuple returned by\n ScopedResourcesBuilder.build(). The way that we create the namedtuple returned by build() is\n incompatible with type annotations on its own due to its dynamic attributes, so this tag class\n provides a workaround."""\n\n\nclass IContainsGenerator:\n """This class adds an additional tag to indicate that the resources object has at least one\n resource that has been yielded from a generator, and thus may require teardown."""\n\n\nclass ScopedResourcesBuilder(\n NamedTuple(\n "_ScopedResourcesBuilder",\n [("resource_instance_dict", Mapping[str, object]), ("contains_generator", bool)],\n )\n):\n """There are concepts in the codebase (e.g. ops, system storage) that receive\n only the resources that they have specified in required_resource_keys.\n ScopedResourcesBuilder is responsible for dynamically building a class with\n only those required resources and returning an instance of that class."""\n\n def __new__(\n cls,\n resource_instance_dict: Optional[Mapping[str, object]] = None,\n contains_generator: bool = False,\n ):\n return super(ScopedResourcesBuilder, cls).__new__(\n cls,\n resource_instance_dict=check.opt_dict_param(\n resource_instance_dict, "resource_instance_dict", key_type=str\n ),\n contains_generator=contains_generator,\n )\n\n def build(self, required_resource_keys: Optional[AbstractSet[str]]) -> Resources:\n\n """We dynamically create a type that has the resource keys as properties, to enable dotting into\n the resources from a context.\n\n For example, given:\n\n resources = {'foo': <some resource>, 'bar': <some other resource>}\n\n then this will create the type Resource(namedtuple('foo bar'))\n\n and then binds the specified resources into an instance of this object, which can be consumed\n as, e.g., context.resources.foo.\n """\n required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys", of_type=str\n )\n # it is possible that the surrounding context does NOT have the required resource keys\n # because we are building a context for steps that we are not going to execute (e.g. in the\n # resume/retry case, in order to generate copy intermediates events)\n resource_instance_dict = {\n key: self.resource_instance_dict[key]\n for key in required_resource_keys\n if key in self.resource_instance_dict\n }\n\n # If any of the resources are generators, add the IContainsGenerator subclass to flag that\n # this is the case.\n if self.contains_generator:\n\n class _ScopedResourcesContainsGenerator(\n namedtuple("_ScopedResourcesContainsGenerator", list(resource_instance_dict.keys())), # type: ignore[misc]\n Resources,\n IContainsGenerator,\n ):\n def __getattr__(self, attr):\n raise DagsterUnknownResourceError(attr)\n\n return _ScopedResourcesContainsGenerator(**resource_instance_dict) # type: ignore[call-arg]\n\n else:\n\n class _ScopedResources(\n namedtuple("_ScopedResources", list(resource_instance_dict.keys())), # type: ignore[misc]\n Resources,\n ):\n def __getattr__(self, attr):\n raise DagsterUnknownResourceError(attr)\n\n return _ScopedResources(**resource_instance_dict) # type: ignore[call-arg]\n\n\n
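``ScopedResourcesBuilder`` is internal, but the scoping behavior it implements is easy to see in isolation: only the requested keys become attributes, and anything else raises ``DagsterUnknownResourceError``. A small sketch, assuming the class is imported from this module:

.. code-block:: python

    from dagster.core.definitions.resource_definition import ScopedResourcesBuilder  # internal

    builder = ScopedResourcesBuilder(resource_instance_dict={"db": "conn", "cache": "redis"})
    scoped = builder.build(required_resource_keys={"db"})

    assert scoped.db == "conn"
    # scoped.cache was not requested, so accessing it raises DagsterUnknownResourceError.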
[docs]def make_values_resource(**kwargs: Any) -> ResourceDefinition:\n """A helper function that creates a ``ResourceDefinition`` to take in user-defined values.\n\n This is useful for sharing values between ops.\n\n Args:\n **kwargs: Arbitrary keyword arguments that will be passed to the config schema of the\n returned resource definition. If not set, Dagster will accept any config provided for\n the resource.\n\n For example:\n\n .. code-block:: python\n\n @op(required_resource_keys={"globals"})\n def my_op(context):\n print(context.resources.globals["my_str_var"])\n\n @job(resource_defs={"globals": make_values_resource(my_str_var=str, my_int_var=int)})\n def my_job():\n my_op()\n\n Returns:\n ResourceDefinition: A resource that passes in user-defined values.\n """\n\n return ResourceDefinition(\n resource_fn=lambda init_context: init_context.resource_config,\n config_schema=kwargs or Any,\n )
\n
", "current_page_name": "_modules/dagster/core/definitions/resource_definition", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.resource_definition"}, "run_request": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.run_request

\nfrom enum import Enum\nfrom typing import Any, Mapping, NamedTuple, Optional\n\nfrom dagster import check\nfrom dagster.core.storage.pipeline_run import PipelineRun, PipelineRunStatus\nfrom dagster.serdes.serdes import register_serdes_enum_fallbacks, whitelist_for_serdes\nfrom dagster.utils.error import SerializableErrorInfo\n\n\n@whitelist_for_serdes\nclass InstigatorType(Enum):\n    SCHEDULE = "SCHEDULE"\n    SENSOR = "SENSOR"\n\n\nregister_serdes_enum_fallbacks({"JobType": InstigatorType})\n# for internal backcompat\nJobType = InstigatorType\n\n\n
[docs]@whitelist_for_serdes\nclass SkipReason(NamedTuple("_SkipReason", [("skip_message", Optional[str])])):\n """\n Represents a skipped evaluation, where no runs are requested. May contain a message to indicate\n why no runs were requested.\n\n Attributes:\n skip_message (Optional[str]): A message displayed in dagit for why this evaluation resulted\n in no requested runs.\n """\n\n def __new__(cls, skip_message: Optional[str] = None):\n return super(SkipReason, cls).__new__(\n cls,\n skip_message=check.opt_str_param(skip_message, "skip_message"),\n )
\n\n\n
[docs]@whitelist_for_serdes\nclass RunRequest(\n NamedTuple(\n "_RunRequest",\n [\n ("run_key", Optional[str]),\n ("run_config", Mapping[str, Any]),\n ("tags", Mapping[str, str]),\n ("job_name", Optional[str]),\n ],\n )\n):\n """\n Represents all the information required to launch a single run. Must be returned by a\n SensorDefinition or ScheduleDefinition's evaluation function for a run to be launched.\n\n Attributes:\n run_key (str | None): A string key to identify this launched run. For sensors, ensures that\n only one run is created per run key across all sensor evaluations. For schedules,\n ensures that one run is created per tick, across failure recoveries. Passing in a `None`\n value means that a run will always be launched per evaluation.\n run_config (Optional[Dict]): The config that parameterizes the run execution to\n be launched, as a dict.\n tags (Optional[Dict[str, str]]): A dictionary of tags (string key-value pairs) to attach\n to the launched run.\n job_name (Optional[str]): (Experimental) The name of the job this run request will launch.\n Required for sensors that target multiple jobs.\n """\n\n def __new__(\n cls,\n run_key: Optional[str],\n run_config: Optional[Mapping[str, Any]] = None,\n tags: Optional[Mapping[str, str]] = None,\n job_name: Optional[str] = None,\n ):\n return super(RunRequest, cls).__new__(\n cls,\n run_key=check.opt_str_param(run_key, "run_key"),\n run_config=check.opt_dict_param(run_config, "run_config", key_type=str),\n tags=check.opt_dict_param(tags, "tags", key_type=str, value_type=str),\n job_name=check.opt_str_param(job_name, "job_name"),\n )
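``RunRequest`` and ``SkipReason`` are the values a sensor (or schedule) evaluation function hands back to the daemon. A minimal sensor sketch; the directory, job, and config field names are illustrative:

.. code-block:: python

    import os

    from dagster import RunRequest, SkipReason, job, op, sensor

    @op(config_schema={"path": str})
    def process_file(context):
        context.log.info(f"processing {context.op_config['path']}")

    @job
    def process_file_job():
        process_file()

    @sensor(job=process_file_job, minimum_interval_seconds=30)
    def incoming_file_sensor(_context):
        directory = "/tmp/incoming"  # illustrative location
        files = os.listdir(directory) if os.path.isdir(directory) else []
        if not files:
            yield SkipReason("No new files found.")
            return
        for filename in files:
            yield RunRequest(
                run_key=filename,  # at most one run per key across evaluations
                run_config={
                    "ops": {"process_file": {"config": {"path": os.path.join(directory, filename)}}}
                },
            )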
\n\n\n@whitelist_for_serdes\nclass PipelineRunReaction(\n NamedTuple(\n "_PipelineRunReaction",\n [\n ("pipeline_run", Optional[PipelineRun]),\n ("error", Optional[SerializableErrorInfo]),\n ("run_status", Optional[PipelineRunStatus]),\n ],\n )\n):\n """\n Represents a request that reacts to an existing pipeline run. If success, it will report logs\n back to the run.\n\n Attributes:\n pipeline_run (Optional[PipelineRun]): The pipeline run that originates this reaction.\n error (Optional[SerializableErrorInfo]): user code execution error.\n run_status: (Optional[PipelineRunStatus]): The run status that triggered the reaction.\n """\n\n def __new__(\n cls,\n pipeline_run: Optional[PipelineRun],\n error: Optional[SerializableErrorInfo] = None,\n run_status: Optional[PipelineRunStatus] = None,\n ):\n return super(PipelineRunReaction, cls).__new__(\n cls,\n pipeline_run=check.opt_inst_param(pipeline_run, "pipeline_run", PipelineRun),\n error=check.opt_inst_param(error, "error", SerializableErrorInfo),\n run_status=check.opt_inst_param(run_status, "run_status", PipelineRunStatus),\n )\n
", "current_page_name": "_modules/dagster/core/definitions/run_request", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.run_request"}, "run_status_sensor_definition": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.run_status_sensor_definition

\nimport warnings\nfrom datetime import datetime\nfrom typing import Any, Callable, List, NamedTuple, Optional, Union, cast\n\nimport pendulum\n\nfrom dagster import check\nfrom dagster.core.definitions import GraphDefinition, PipelineDefinition\nfrom dagster.core.definitions.sensor_definition import (\n    DefaultSensorStatus,\n    PipelineRunReaction,\n    SensorDefinition,\n    SensorEvaluationContext,\n    SkipReason,\n    is_context_provided,\n)\nfrom dagster.core.errors import (\n    DagsterInvalidInvocationError,\n    RunStatusSensorExecutionError,\n    user_code_error_boundary,\n)\nfrom dagster.core.events import PIPELINE_RUN_STATUS_TO_EVENT_TYPE, DagsterEvent\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.storage.pipeline_run import DagsterRun, PipelineRun, PipelineRunStatus, RunsFilter\nfrom dagster.serdes import (\n    deserialize_json_to_dagster_namedtuple,\n    serialize_dagster_namedtuple,\n    whitelist_for_serdes,\n)\nfrom dagster.serdes.errors import DeserializationError\nfrom dagster.serdes.serdes import register_serdes_tuple_fallbacks\nfrom dagster.seven import JSONDecodeError\nfrom dagster.utils import utc_datetime_from_timestamp\nfrom dagster.utils.error import serializable_error_info_from_exc_info\n\nfrom ..decorator_utils import get_function_params\n\n\n@whitelist_for_serdes\nclass RunStatusSensorCursor(\n    NamedTuple(\n        "_RunStatusSensorCursor",\n        [("record_id", int), ("update_timestamp", str)],\n    )\n):\n    def __new__(cls, record_id, update_timestamp):\n\n        return super(RunStatusSensorCursor, cls).__new__(\n            cls,\n            record_id=check.int_param(record_id, "record_id"),\n            update_timestamp=check.str_param(update_timestamp, "update_timestamp"),\n        )\n\n    @staticmethod\n    def is_valid(json_str: str) -> bool:\n        try:\n            obj = deserialize_json_to_dagster_namedtuple(json_str)\n            return isinstance(obj, RunStatusSensorCursor)\n        except (JSONDecodeError, DeserializationError):\n            return False\n\n    def to_json(self) -> str:\n        return serialize_dagster_namedtuple(cast(NamedTuple, self))\n\n    @staticmethod\n    def from_json(json_str: str) -> tuple:\n        return deserialize_json_to_dagster_namedtuple(json_str)\n\n\n# handle backcompat\nregister_serdes_tuple_fallbacks({"PipelineSensorCursor": RunStatusSensorCursor})\n\n\n
[docs]class RunStatusSensorContext(\n NamedTuple(\n "_RunStatusSensorContext",\n [\n ("sensor_name", str),\n ("dagster_run", DagsterRun),\n ("dagster_event", DagsterEvent),\n ("instance", DagsterInstance),\n ],\n )\n):\n """The ``context`` object available to a decorated function of ``run_status_sensor``.\n\n Attributes:\n sensor_name (str): the name of the sensor.\n dagster_run (DagsterRun): the run of the job or pipeline.\n dagster_event (DagsterEvent): the event associated with the job or pipeline run status.\n instance (DagsterInstance): the current instance.\n """\n\n def __new__(cls, sensor_name, dagster_run, dagster_event, instance):\n\n return super(RunStatusSensorContext, cls).__new__(\n cls,\n sensor_name=check.str_param(sensor_name, "sensor_name"),\n dagster_run=check.inst_param(dagster_run, "dagster_run", DagsterRun),\n dagster_event=check.inst_param(dagster_event, "dagster_event", DagsterEvent),\n instance=check.inst_param(instance, "instance", DagsterInstance),\n )\n\n def for_pipeline_failure(self):\n return PipelineFailureSensorContext(\n sensor_name=self.sensor_name,\n dagster_run=self.dagster_run,\n dagster_event=self.dagster_event,\n instance=self.instance,\n )\n\n
[docs] def for_run_failure(self):\n """Converts RunStatusSensorContext to RunFailureSensorContext."""\n return RunFailureSensorContext(\n sensor_name=self.sensor_name,\n dagster_run=self.dagster_run,\n dagster_event=self.dagster_event,\n instance=self.instance,\n )
\n\n @property\n def pipeline_run(self) -> PipelineRun:\n warnings.warn(\n "`RunStatusSensorContext.pipeline_run` is deprecated as of 0.13.0; use "\n "`RunStatusSensorContext.dagster_run` instead."\n )\n return self.dagster_run
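As the deprecation warning above indicates, sensor bodies should read the run from ``dagster_run`` rather than the deprecated ``pipeline_run`` property. A hypothetical sensor body, for illustration only:

.. code-block:: python

    def my_status_sensor_fn(context: RunStatusSensorContext):
        # Prefer `dagster_run`; `pipeline_run` still works but emits a deprecation warning.
        run_id = context.dagster_run.run_id
        event_type = context.dagster_event.event_type_value
        print(f"{context.sensor_name}: run {run_id} reached {event_type}")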
\n\n\n
[docs]class PipelineFailureSensorContext(RunStatusSensorContext):\n """The ``context`` object available to a decorated function of ``pipeline_failure_sensor``.\n\n Attributes:\n sensor_name (str): the name of the sensor.\n pipeline_run (PipelineRun): the failed pipeline run.\n failure_event (DagsterEvent): the pipeline failure event.\n """\n\n @property\n def failure_event(self):\n return self.dagster_event
\n\n\n
[docs]class RunFailureSensorContext(RunStatusSensorContext):\n """The ``context`` object available to a decorated function of ``run_failure_sensor``.\n\n Attributes:\n sensor_name (str): the name of the sensor.\n pipeline_run (PipelineRun): the failed pipeline run.\n failure_event (DagsterEvent): the pipeline failure event.\n """\n\n @property\n def failure_event(self):\n return self.dagster_event
\n\n\n
[docs]def build_run_status_sensor_context(\n sensor_name: str,\n dagster_event: DagsterEvent,\n dagster_instance: DagsterInstance,\n dagster_run: DagsterRun,\n) -> RunStatusSensorContext:\n """\n Builds run status sensor context from provided parameters.\n\n This function can be used to provide the context argument when directly invoking a function\n decorated with `@run_status_sensor` or `@run_failure_sensor`, such as when writing unit tests.\n\n Args:\n sensor_name (str): The name of the sensor the context is being constructed for.\n dagster_event (DagsterEvent): A DagsterEvent with the same event type as the one that\n triggers the run_status_sensor\n dagster_instance (DagsterInstance): The dagster instance configured for the context.\n dagster_run (DagsterRun): DagsterRun object from running a job\n\n Examples:\n .. code-block:: python\n\n instance = DagsterInstance.ephemeral()\n result = my_job.execute_in_process(instance=instance)\n\n dagster_run = result.dagster_run\n dagster_event = result.get_job_success_event() # or get_job_failure_event()\n\n context = build_run_status_sensor_context(\n sensor_name="run_status_sensor_to_invoke",\n dagster_instance=instance,\n dagster_run=dagster_run,\n dagster_event=dagster_event,\n )\n run_status_sensor_to_invoke(context)\n """\n\n return RunStatusSensorContext(\n sensor_name=sensor_name,\n instance=dagster_instance,\n dagster_run=dagster_run,\n dagster_event=dagster_event,\n )
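For completeness, the ``run_status_sensor_to_invoke`` referenced in the docstring example above might be defined as follows. This is a sketch: ``my_job`` is assumed to be a job defined elsewhere, and the imports assume the usual top-level ``dagster`` exports.

.. code-block:: python

    from dagster import (
        DagsterInstance,
        PipelineRunStatus,
        build_run_status_sensor_context,
        run_status_sensor,
    )

    @run_status_sensor(pipeline_run_status=PipelineRunStatus.SUCCESS)
    def run_status_sensor_to_invoke(context):
        assert context.dagster_run.pipeline_name == "my_job"

    instance = DagsterInstance.ephemeral()
    result = my_job.execute_in_process(instance=instance)

    context = build_run_status_sensor_context(
        sensor_name="run_status_sensor_to_invoke",
        dagster_instance=instance,
        dagster_run=result.dagster_run,
        dagster_event=result.get_job_success_event(),
    )
    run_status_sensor_to_invoke(context)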
\n\n\n
[docs]def pipeline_failure_sensor(\n name: Optional[Union[Callable[..., Any], str]] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n pipeline_selection: Optional[List[str]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n) -> Callable[\n [Callable[[PipelineFailureSensorContext], Union[SkipReason, PipelineRunReaction]]],\n SensorDefinition,\n]:\n """\n Creates a sensor that reacts to pipeline failure events, where the decorated function will be\n run when a pipeline run fails.\n\n Takes a :py:class:`~dagster.PipelineFailureSensorContext`.\n\n Args:\n name (Optional[str]): The name of the pipeline failure sensor. Defaults to the name of the\n decorated function.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n pipeline_selection (Optional[List[str]]): Names of the pipelines that will be monitored by\n this failure sensor. Defaults to None, which means the alert will be sent when any\n pipeline in the repository fails.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n """\n\n def inner(\n fn: Callable[[PipelineFailureSensorContext], Union[SkipReason, PipelineRunReaction]]\n ) -> SensorDefinition:\n check.callable_param(fn, "fn")\n if name is None or callable(name):\n sensor_name = fn.__name__\n else:\n sensor_name = name\n\n @run_status_sensor(\n pipeline_run_status=PipelineRunStatus.FAILURE,\n pipeline_selection=pipeline_selection,\n name=sensor_name,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n default_status=default_status,\n )\n def _pipeline_failure_sensor(context: RunStatusSensorContext):\n fn(context.for_pipeline_failure())\n\n return _pipeline_failure_sensor\n\n # This case is for when decorator is used bare, without arguments, i.e. @pipeline_failure_sensor\n if callable(name):\n return inner(name)\n\n return inner
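A usage sketch of the decorator above; ``send_email`` is a hypothetical alerting helper and ``my_pipeline`` a placeholder pipeline name.

.. code-block:: python

    @pipeline_failure_sensor(pipeline_selection=["my_pipeline"])
    def email_on_pipeline_failure(context: PipelineFailureSensorContext):
        send_email(  # hypothetical alerting helper
            subject=f"Pipeline {context.dagster_run.pipeline_name} failed",
            body=context.failure_event.message,
        )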
\n\n\n
[docs]def run_failure_sensor(\n name: Optional[Union[Callable[..., Any], str]] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job_selection: Optional[List[Union[PipelineDefinition, GraphDefinition]]] = None,\n pipeline_selection: Optional[List[str]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n) -> Callable[\n [Callable[[RunFailureSensorContext], Union[SkipReason, PipelineRunReaction]]],\n SensorDefinition,\n]:\n """\n Creates a sensor that reacts to job failure events, where the decorated function will be\n run when a run fails.\n\n Takes a :py:class:`~dagster.RunFailureSensorContext`.\n\n Args:\n name (Optional[str]): The name of the job failure sensor. Defaults to the name of the\n decorated function.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job_selection (Optional[List[Union[JobDefinition, GraphDefinition]]]): The jobs that\n will be monitored by this failure sensor. Defaults to None, which means the alert will\n be sent when any job in the repository fails.\n pipeline_selection (Optional[List[str]]): (legacy) Names of the pipelines that will be monitored by\n this sensor. Defaults to None, which means the alert will be sent when any pipeline in\n the repository fails.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n """\n\n def inner(\n fn: Callable[[RunFailureSensorContext], Union[SkipReason, PipelineRunReaction]]\n ) -> SensorDefinition:\n check.callable_param(fn, "fn")\n if name is None or callable(name):\n sensor_name = fn.__name__\n else:\n sensor_name = name\n\n @run_status_sensor(\n pipeline_run_status=PipelineRunStatus.FAILURE,\n name=sensor_name,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job_selection=job_selection,\n pipeline_selection=pipeline_selection,\n default_status=default_status,\n )\n def _run_failure_sensor(context: RunStatusSensorContext):\n fn(context.for_run_failure())\n\n return _run_failure_sensor\n\n # This case is for when decorator is used bare, without arguments, i.e. @pipeline_failure_sensor\n if callable(name):\n return inner(name)\n\n return inner
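And the job-based equivalent, again as a sketch: ``my_job`` and ``post_to_slack`` are placeholders, not part of this module.

.. code-block:: python

    @run_failure_sensor(job_selection=[my_job], minimum_interval_seconds=60)
    def slack_on_run_failure(context: RunFailureSensorContext):
        post_to_slack(  # hypothetical alerting helper
            channel="#alerts",
            message=(
                f"Job {context.dagster_run.pipeline_name} failed: "
                f"{context.failure_event.message}"
            ),
        )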
\n\n\n
[docs]class RunStatusSensorDefinition(SensorDefinition):\n """\n Define a sensor that reacts to a given status of pipeline execution, where the decorated\n function will be evaluated when a run is at the given status.\n\n Args:\n name (str): The name of the sensor. Defaults to the name of the decorated function.\n pipeline_run_status (PipelineRunStatus): The status of a run which will be\n monitored by the sensor.\n run_status_sensor_fn (Callable[[RunStatusSensorContext], Union[SkipReason, PipelineRunReaction]]): The core\n evaluation function for the sensor. Takes a :py:class:`~dagster.RunStatusSensorContext`.\n pipeline_selection (Optional[List[str]]): (legacy) Names of the pipelines that will be monitored by\n this sensor. Defaults to None, which means the alert will be sent when any pipeline in\n the repository fails.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job_selection (Optional[List[Union[JobDefinition, GraphDefinition]]]): The jobs that\n will be monitored by this sensor. Defaults to None, which means the alert will be sent\n when any job in the repository fails.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n """\n\n def __init__(\n self,\n name: str,\n pipeline_run_status: PipelineRunStatus,\n run_status_sensor_fn: Callable[\n [RunStatusSensorContext], Union[SkipReason, PipelineRunReaction]\n ],\n pipeline_selection: Optional[List[str]] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job_selection: Optional[List[Union[PipelineDefinition, GraphDefinition]]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n ):\n\n from dagster.core.storage.event_log.base import EventRecordsFilter, RunShardedEventsCursor\n\n check.str_param(name, "name")\n check.inst_param(pipeline_run_status, "pipeline_run_status", PipelineRunStatus)\n check.callable_param(run_status_sensor_fn, "run_status_sensor_fn")\n check.opt_list_param(pipeline_selection, "pipeline_selection", str)\n check.opt_int_param(minimum_interval_seconds, "minimum_interval_seconds")\n check.opt_str_param(description, "description")\n check.opt_list_param(job_selection, "job_selection", (PipelineDefinition, GraphDefinition))\n check.inst_param(default_status, "default_status", DefaultSensorStatus)\n\n self._run_status_sensor_fn = check.callable_param(\n run_status_sensor_fn, "run_status_sensor_fn"\n )\n\n def _wrapped_fn(context: SensorEvaluationContext):\n # initiate the cursor to (most recent event id, current timestamp) when:\n # * it's the first time starting the sensor\n # * or, the cursor isn't in valid format (backcompt)\n if context.cursor is None or not RunStatusSensorCursor.is_valid(context.cursor):\n most_recent_event_records = list(\n context.instance.get_event_records(ascending=False, limit=1)\n )\n most_recent_event_id = (\n most_recent_event_records[0].storage_id\n if len(most_recent_event_records) == 1\n else -1\n )\n\n new_cursor = RunStatusSensorCursor(\n update_timestamp=pendulum.now("UTC").isoformat(),\n record_id=most_recent_event_id,\n )\n context.update_cursor(new_cursor.to_json())\n yield SkipReason(f"Initiating {name}. 
Set cursor to {new_cursor}")\n return\n\n record_id, update_timestamp = RunStatusSensorCursor.from_json(context.cursor)\n\n # Fetch events after the cursor id\n # * we move the cursor forward to the latest visited event's id to avoid revisits\n # * when the daemon is down, bc we persist the cursor info, we can go back to where we\n # left and backfill alerts for the qualified events (up to 5 at a time) during the downtime\n # Note: this is a cross-run query which requires extra handling in sqlite, see details in SqliteEventLogStorage.\n event_records = context.instance.get_event_records(\n EventRecordsFilter(\n after_cursor=RunShardedEventsCursor(\n id=record_id,\n run_updated_after=cast(datetime, pendulum.parse(update_timestamp)),\n ),\n event_type=PIPELINE_RUN_STATUS_TO_EVENT_TYPE[pipeline_run_status],\n ),\n ascending=True,\n limit=5,\n )\n\n for event_record in event_records:\n event_log_entry = event_record.event_log_entry\n storage_id = event_record.storage_id\n\n # get run info\n run_records = context.instance.get_run_records(\n filters=RunsFilter(run_ids=[event_log_entry.run_id])\n )\n\n # skip if we couldn't find the right run\n if len(run_records) != 1:\n # bc we couldn't find the run, we use the event timestamp as the approximate\n # run update timestamp\n approximate_update_timestamp = utc_datetime_from_timestamp(\n event_log_entry.timestamp\n )\n context.update_cursor(\n RunStatusSensorCursor(\n record_id=storage_id,\n update_timestamp=approximate_update_timestamp.isoformat(),\n ).to_json()\n )\n continue\n\n pipeline_run = run_records[0].pipeline_run\n update_timestamp = run_records[0].update_timestamp\n\n # skip if any of of the followings happens:\n if (\n # the pipeline does not have a repository (manually executed)\n not pipeline_run.external_pipeline_origin\n or\n # the pipeline does not belong to the current repository\n pipeline_run.external_pipeline_origin.external_repository_origin.repository_name\n != context.repository_name\n or\n # if pipeline is not selected\n (pipeline_selection and pipeline_run.pipeline_name not in pipeline_selection)\n or\n # if job not selected\n (\n job_selection\n and pipeline_run.pipeline_name not in map(lambda x: x.name, job_selection)\n )\n ):\n context.update_cursor(\n RunStatusSensorCursor(\n record_id=storage_id, update_timestamp=update_timestamp.isoformat()\n ).to_json()\n )\n continue\n\n serializable_error = None\n\n try:\n with user_code_error_boundary(\n RunStatusSensorExecutionError,\n lambda: f'Error occurred during the execution sensor "{name}".',\n ):\n # one user code invocation maps to one failure event\n run_status_sensor_fn(\n RunStatusSensorContext(\n sensor_name=name,\n dagster_run=pipeline_run,\n dagster_event=event_log_entry.dagster_event,\n instance=context.instance,\n )\n )\n except RunStatusSensorExecutionError as run_status_sensor_execution_error:\n # When the user code errors, we report error to the sensor tick not the original run.\n serializable_error = serializable_error_info_from_exc_info(\n run_status_sensor_execution_error.original_exc_info\n )\n\n context.update_cursor(\n RunStatusSensorCursor(\n record_id=storage_id, update_timestamp=update_timestamp.isoformat()\n ).to_json()\n )\n\n # Yield PipelineRunReaction to indicate the execution success/failure.\n # The sensor machinery would\n # * report back to the original run if success\n # * update cursor and job state\n yield PipelineRunReaction(\n pipeline_run=pipeline_run,\n run_status=pipeline_run_status,\n error=serializable_error,\n )\n\n 
super(RunStatusSensorDefinition, self).__init__(\n name=name,\n evaluation_fn=_wrapped_fn,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n default_status=default_status,\n )\n\n def __call__(self, *args, **kwargs):\n context_provided = is_context_provided(get_function_params(self._run_status_sensor_fn))\n\n if context_provided:\n if len(args) + len(kwargs) == 0:\n raise DagsterInvalidInvocationError(\n "Run status sensor function expected context argument, but no context argument "\n "was provided when invoking."\n )\n if len(args) + len(kwargs) > 1:\n raise DagsterInvalidInvocationError(\n "Run status sensor invocation received multiple arguments. Only a first "\n "positional context parameter should be provided when invoking."\n )\n\n context_param_name = get_function_params(self._run_status_sensor_fn)[0].name\n\n if args:\n context = check.opt_inst_param(args[0], context_param_name, RunStatusSensorContext)\n else:\n if context_param_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Run status sensor invocation expected argument '{context_param_name}'."\n )\n context = check.opt_inst_param(\n kwargs[context_param_name], context_param_name, RunStatusSensorContext\n )\n\n if not context:\n raise DagsterInvalidInvocationError(\n "Context must be provided for direct invocation of run status sensor."\n )\n\n return self._run_status_sensor_fn(context)\n\n else:\n if len(args) + len(kwargs) > 0:\n raise DagsterInvalidInvocationError(\n "Run status sensor decorated function has no arguments, but arguments were "\n "provided to invocation."\n )\n\n return self._run_status_sensor_fn()
\n\n\n
[docs]def run_status_sensor(\n pipeline_run_status: PipelineRunStatus,\n pipeline_selection: Optional[List[str]] = None,\n name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job_selection: Optional[List[Union[PipelineDefinition, GraphDefinition]]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n) -> Callable[\n [Callable[[RunStatusSensorContext], Union[SkipReason, PipelineRunReaction]]],\n RunStatusSensorDefinition,\n]:\n """\n Creates a sensor that reacts to a given status of pipeline execution, where the decorated\n function will be run when a pipeline is at the given status.\n\n Takes a :py:class:`~dagster.RunStatusSensorContext`.\n\n Args:\n pipeline_run_status (PipelineRunStatus): The status of pipeline execution which will be\n monitored by the sensor.\n pipeline_selection (Optional[List[str]]): Names of the pipelines that will be monitored by\n this sensor. Defaults to None, which means the alert will be sent when any pipeline in\n the repository fails.\n name (Optional[str]): The name of the sensor. Defaults to the name of the decorated function.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job_selection (Optional[List[Union[PipelineDefinition, GraphDefinition]]]): Jobs that will\n be monitored by this sensor. Defaults to None, which means the alert will be sent when\n any job in the repository fails.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n """\n\n def inner(\n fn: Callable[["RunStatusSensorContext"], Union[SkipReason, PipelineRunReaction]]\n ) -> RunStatusSensorDefinition:\n\n check.callable_param(fn, "fn")\n sensor_name = name or fn.__name__\n\n def _wrapped_fn(context: RunStatusSensorContext):\n fn(context)\n\n return RunStatusSensorDefinition(\n name=sensor_name,\n pipeline_run_status=pipeline_run_status,\n run_status_sensor_fn=_wrapped_fn,\n pipeline_selection=pipeline_selection,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job_selection=job_selection,\n default_status=default_status,\n )\n\n return inner
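For example, a sensor that reacts to successful runs of a selected job might look like the sketch below; ``status_job`` is an assumed job object, and the tagging step is just one possible reaction.

.. code-block:: python

    @run_status_sensor(
        pipeline_run_status=PipelineRunStatus.SUCCESS,
        job_selection=[status_job],
        minimum_interval_seconds=30,
    )
    def tag_successful_runs(context: RunStatusSensorContext):
        # Tag the completed run so downstream tooling knows it has been handled.
        context.instance.add_run_tags(
            context.dagster_run.run_id, {"processed_by_sensor": "true"}
        )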
\n
", "current_page_name": "_modules/dagster/core/definitions/run_status_sensor_definition", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.run_status_sensor_definition"}, "schedule_definition": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.schedule_definition

\nimport copy\nfrom contextlib import ExitStack\nfrom datetime import datetime\nfrom enum import Enum\nfrom typing import TYPE_CHECKING, Any, Callable, Dict, List, NamedTuple, Optional, Union, cast\n\nimport pendulum\n\nfrom dagster import check\nfrom dagster.seven import funcsigs\n\nfrom ...serdes import whitelist_for_serdes\nfrom ...utils import ensure_gen, merge_dicts\nfrom ...utils.schedules import is_valid_cron_string\nfrom ..decorator_utils import get_function_params\nfrom ..errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n    ScheduleExecutionError,\n    user_code_error_boundary,\n)\nfrom ..instance import DagsterInstance\nfrom ..instance.ref import InstanceRef\nfrom ..storage.pipeline_run import PipelineRun\nfrom ..storage.tags import check_tags\nfrom .graph_definition import GraphDefinition\nfrom .mode import DEFAULT_MODE_NAME\nfrom .pipeline_definition import PipelineDefinition\nfrom .run_request import RunRequest, SkipReason\nfrom .target import DirectTarget, RepoRelativeTarget\nfrom .utils import check_valid_name\n\nif TYPE_CHECKING:\n    from .decorators.schedule_decorator import DecoratedScheduleFunction\n\n\n@whitelist_for_serdes\nclass DefaultScheduleStatus(Enum):\n    RUNNING = "RUNNING"\n    STOPPED = "STOPPED"\n\n\n
[docs]class ScheduleEvaluationContext:\n """Schedule-specific execution context.\n\n An instance of this class is made available as the first argument to various ScheduleDefinition\n functions. It is passed as the first argument to ``run_config_fn``, ``tags_fn``,\n and ``should_execute``.\n\n Attributes:\n instance_ref (Optional[InstanceRef]): The serialized instance configured to run the schedule\n scheduled_execution_time (datetime):\n The time in which the execution was scheduled to happen. May differ slightly\n from both the actual execution time and the time at which the run config is computed.\n Not available in all schedulers - currently only set in deployments using\n DagsterDaemonScheduler.\n """\n\n __slots__ = ["_instance_ref", "_scheduled_execution_time", "_exit_stack", "_instance"]\n\n def __init__(\n self, instance_ref: Optional[InstanceRef], scheduled_execution_time: Optional[datetime]\n ):\n self._exit_stack = ExitStack()\n self._instance = None\n\n self._instance_ref = check.opt_inst_param(instance_ref, "instance_ref", InstanceRef)\n self._scheduled_execution_time = check.opt_inst_param(\n scheduled_execution_time, "scheduled_execution_time", datetime\n )\n\n def __enter__(self):\n return self\n\n def __exit__(self, _exception_type, _exception_value, _traceback):\n self._exit_stack.close()\n\n @property\n def instance(self) -> "DagsterInstance":\n # self._instance_ref should only ever be None when this ScheduleEvaluationContext was\n # constructed under test.\n if not self._instance_ref:\n raise DagsterInvariantViolationError(\n "Attempted to initialize dagster instance, but no instance reference was provided."\n )\n if not self._instance:\n self._instance = self._exit_stack.enter_context(\n DagsterInstance.from_ref(self._instance_ref)\n )\n return cast(DagsterInstance, self._instance)\n\n @property\n def scheduled_execution_time(self) -> Optional[datetime]:\n return self._scheduled_execution_time
\n\n\n# Preserve ScheduleExecutionContext for backcompat so type annotations don't break.\nScheduleExecutionContext = ScheduleEvaluationContext\n\n\n
[docs]def build_schedule_context(\n instance: Optional[DagsterInstance] = None, scheduled_execution_time: Optional[datetime] = None\n) -> ScheduleEvaluationContext:\n """Builds schedule execution context using the provided parameters.\n\n The instance provided to ``build_schedule_context`` must be persistent;\n DagsterInstance.ephemeral() will result in an error.\n\n Args:\n instance (Optional[DagsterInstance]): The dagster instance configured to run the schedule.\n scheduled_execution_time (datetime): The time in which the execution was scheduled to\n happen. May differ slightly from both the actual execution time and the time at which\n the run config is computed.\n\n Examples:\n\n .. code-block:: python\n\n context = build_schedule_context(instance)\n daily_schedule.evaluate_tick(context)\n\n """\n\n check.opt_inst_param(instance, "instance", DagsterInstance)\n return ScheduleEvaluationContext(\n instance_ref=instance.get_ref() if instance and instance.is_persistent else None,\n scheduled_execution_time=check.opt_inst_param(\n scheduled_execution_time, "scheduled_execution_time", datetime\n ),\n )
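To exercise time-dependent schedule logic, a scheduled execution time can be supplied explicitly. In this sketch, ``daily_schedule`` is the schedule from the docstring example and ``my_persistent_instance`` is an assumed persistent ``DagsterInstance``.

.. code-block:: python

    from datetime import datetime

    context = build_schedule_context(
        instance=my_persistent_instance,  # optional; omit if the schedule never touches it
        scheduled_execution_time=datetime(2021, 1, 1),
    )
    result = daily_schedule.evaluate_tick(context)
    print(result.run_requests, result.skip_message)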
\n\n\n@whitelist_for_serdes\nclass ScheduleExecutionData(NamedTuple):\n run_requests: Optional[List[RunRequest]]\n skip_message: Optional[str]\n\n\n
[docs]class ScheduleDefinition:\n """Define a schedule that targets a job\n\n Args:\n name (Optional[str]): The name of the schedule to create. Defaults to the job name plus\n "_schedule".\n cron_schedule (str): A valid cron string specifying when the schedule will run, e.g.,\n '45 23 * * 6' for a schedule that runs at 11:45 PM every Saturday.\n pipeline_name (Optional[str]): (legacy) The name of the pipeline to execute when the schedule runs.\n execution_fn (Callable[ScheduleEvaluationContext]): The core evaluation function for the\n schedule, which is run at an interval to determine whether a run should be launched or\n not. Takes a :py:class:`~dagster.ScheduleEvaluationContext`.\n\n This function must return a generator, which must yield either a single SkipReason\n or one or more RunRequest objects.\n run_config (Optional[Dict]): The config that parameterizes this execution,\n as a dict.\n run_config_fn (Optional[Callable[[ScheduleEvaluationContext], [Dict]]]): A function that\n takes a ScheduleEvaluationContext object and returns the run configuration that\n parameterizes this execution, as a dict. You may set only one of ``run_config``,\n ``run_config_fn``, and ``execution_fn``.\n tags (Optional[Dict[str, str]]): A dictionary of tags (string key-value pairs) to attach\n to the scheduled runs.\n tags_fn (Optional[Callable[[ScheduleEvaluationContext], Optional[Dict[str, str]]]]): A\n function that generates tags to attach to the schedules runs. Takes a\n :py:class:`~dagster.ScheduleEvaluationContext` and returns a dictionary of tags (string\n key-value pairs). You may set only one of ``tags``, ``tags_fn``, and ``execution_fn``.\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): (legacy) The mode to apply when executing this schedule. (default: 'default')\n should_execute (Optional[Callable[[ScheduleEvaluationContext], bool]]): A function that runs\n at schedule execution time to determine whether a schedule should execute or skip. Takes\n a :py:class:`~dagster.ScheduleEvaluationContext` and returns a boolean (``True`` if the\n schedule should execute). Defaults to a function that always returns ``True``.\n environment_vars (Optional[dict[str, str]]): The environment variables to set for the\n schedule\n execution_timezone (Optional[str]): Timezone in which the schedule should run.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n description (Optional[str]): A human-readable description of the schedule.\n job (Optional[Union[GraphDefinition, JobDefinition]]): The job that should execute when this\n schedule runs.\n default_status (DefaultScheduleStatus): Whether the schedule starts as running or not. 
The default\n status can be overridden from Dagit or via the GraphQL API.\n """\n\n def __init__(\n self,\n name: Optional[str] = None,\n cron_schedule: Optional[str] = None,\n pipeline_name: Optional[str] = None,\n run_config: Optional[Any] = None,\n run_config_fn: Optional[Callable[..., Any]] = None,\n tags: Optional[Dict[str, str]] = None,\n tags_fn: Optional[Callable[..., Optional[Dict[str, str]]]] = None,\n solid_selection: Optional[List[Any]] = None,\n mode: Optional[str] = "default",\n should_execute: Optional[Callable[..., bool]] = None,\n environment_vars: Optional[Dict[str, str]] = None,\n execution_timezone: Optional[str] = None,\n execution_fn: Optional[\n Union[Callable[[ScheduleEvaluationContext], Any], "DecoratedScheduleFunction"]\n ] = None,\n description: Optional[str] = None,\n job: Optional[Union[GraphDefinition, PipelineDefinition]] = None,\n default_status: DefaultScheduleStatus = DefaultScheduleStatus.STOPPED,\n ):\n from .decorators.schedule_decorator import DecoratedScheduleFunction\n\n self._cron_schedule = check.str_param(cron_schedule, "cron_schedule")\n\n if not is_valid_cron_string(self._cron_schedule):\n raise DagsterInvalidDefinitionError(\n f"Found invalid cron schedule '{self._cron_schedule}' for schedule '{name}''. "\n "Dagster recognizes standard cron expressions consisting of 5 fields."\n )\n\n if job is not None:\n self._target: Union[DirectTarget, RepoRelativeTarget] = DirectTarget(job)\n else:\n self._target = RepoRelativeTarget(\n pipeline_name=check.str_param(pipeline_name, "pipeline_name"),\n mode=check.opt_str_param(mode, "mode") or DEFAULT_MODE_NAME,\n solid_selection=check.opt_nullable_list_param(\n solid_selection, "solid_selection", of_type=str\n ),\n )\n\n if name:\n self._name = check_valid_name(name)\n elif pipeline_name:\n self._name = pipeline_name + "_schedule"\n elif job:\n self._name = job.name + "_schedule"\n\n self._description = check.opt_str_param(description, "description")\n\n self._environment_vars = check.opt_dict_param(\n environment_vars, "environment_vars", key_type=str, value_type=str\n )\n self._execution_timezone = check.opt_str_param(execution_timezone, "execution_timezone")\n\n if execution_fn and (run_config_fn or tags_fn or should_execute or tags or run_config):\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both execution_fn and individual run_config/tags arguments "\n "to ScheduleDefinition. Must provide only one of the two."\n )\n elif execution_fn:\n self._execution_fn: Optional[\n Union[Callable[..., Any], DecoratedScheduleFunction]\n ] = None\n if isinstance(execution_fn, DecoratedScheduleFunction):\n self._execution_fn = execution_fn\n else:\n self._execution_fn = check.opt_callable_param(execution_fn, "execution_fn")\n self._run_config_fn = None\n else:\n\n if run_config_fn and run_config:\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both run_config_fn and run_config as arguments"\n " to ScheduleDefinition. Must provide only one of the two."\n )\n self._run_config_fn = check.opt_callable_param(\n run_config_fn,\n "run_config_fn",\n default=lambda _context: check.opt_dict_param(run_config, "run_config"),\n )\n\n if tags_fn and tags:\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both tags_fn and tags as arguments"\n " to ScheduleDefinition. 
Must provide only one of the two."\n )\n elif tags:\n check_tags(tags, "tags")\n tags_fn = lambda _context: tags\n else:\n tags_fn = check.opt_callable_param(tags_fn, "tags_fn", default=lambda _context: {})\n\n should_execute = check.opt_callable_param(\n should_execute, "should_execute", default=lambda _context: True\n )\n\n def _execution_fn(context):\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the execution of should_execute for schedule {name}",\n ):\n if not should_execute(context):\n yield SkipReason(\n "should_execute function for {schedule_name} returned false.".format(\n schedule_name=name\n )\n )\n return\n\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the execution of run_config_fn for schedule {name}",\n ):\n evaluated_run_config = copy.deepcopy(\n self._run_config_fn(context)\n if is_context_provided(get_function_params(self._run_config_fn))\n else self._run_config_fn()\n )\n\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the execution of tags_fn for schedule {name}",\n ):\n evaluated_tags = tags_fn(context)\n\n yield RunRequest(\n run_key=None,\n run_config=evaluated_run_config,\n tags=evaluated_tags,\n )\n\n self._execution_fn = _execution_fn\n\n if self._execution_timezone:\n try:\n # Verify that the timezone can be loaded\n pendulum.timezone(self._execution_timezone) # type: ignore\n except Exception:\n raise DagsterInvalidDefinitionError(\n "Invalid execution timezone {timezone} for {schedule_name}".format(\n schedule_name=name, timezone=self._execution_timezone\n )\n )\n\n self._default_status = check.inst_param(\n default_status, "default_status", DefaultScheduleStatus\n )\n\n def __call__(self, *args, **kwargs):\n from .decorators.schedule_decorator import DecoratedScheduleFunction\n\n if not isinstance(self._execution_fn, DecoratedScheduleFunction):\n raise DagsterInvalidInvocationError(\n "Schedule invocation is only supported for schedules created via the schedule "\n "decorators."\n )\n result = None\n if self._execution_fn.has_context_arg:\n if len(args) == 0 and len(kwargs) == 0:\n raise DagsterInvalidInvocationError(\n "Schedule decorated function has context argument, but no context argument was "\n "provided when invoking."\n )\n if len(args) + len(kwargs) > 1:\n raise DagsterInvalidInvocationError(\n "Schedule invocation received multiple arguments. 
Only a first "\n "positional context parameter should be provided when invoking."\n )\n\n context_param_name = get_function_params(self._execution_fn.decorated_fn)[0].name\n\n if args:\n context = check.opt_inst_param(\n args[0], context_param_name, ScheduleEvaluationContext\n )\n else:\n if context_param_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Schedule invocation expected argument '{context_param_name}'."\n )\n context = check.opt_inst_param(\n kwargs[context_param_name], context_param_name, ScheduleEvaluationContext\n )\n\n context = context if context else build_schedule_context()\n\n result = self._execution_fn.decorated_fn(context)\n else:\n if len(args) + len(kwargs) > 0:\n raise DagsterInvalidInvocationError(\n "Decorated schedule function takes no arguments, but arguments were provided."\n )\n result = self._execution_fn.decorated_fn()\n\n if isinstance(result, dict):\n return copy.deepcopy(result)\n else:\n return result\n\n @property\n def name(self) -> str:\n return self._name\n\n @property\n def pipeline_name(self) -> str:\n return self._target.pipeline_name\n\n @property\n def solid_selection(self) -> Optional[List[Any]]:\n return self._target.solid_selection\n\n @property\n def mode(self) -> str:\n return self._target.mode\n\n @property\n def description(self) -> Optional[str]:\n return self._description\n\n @property\n def cron_schedule(self) -> str:\n return self._cron_schedule\n\n @property\n def environment_vars(self) -> Dict[str, str]:\n return self._environment_vars\n\n @property\n def execution_timezone(self) -> Optional[str]:\n return self._execution_timezone\n\n @property\n def job(self) -> PipelineDefinition:\n if isinstance(self._target, DirectTarget):\n return self._target.pipeline\n raise DagsterInvalidDefinitionError("No job was provided to ScheduleDefinition.")\n\n def evaluate_tick(self, context: "ScheduleEvaluationContext") -> ScheduleExecutionData:\n """Evaluate schedule using the provided context.\n\n Args:\n context (ScheduleEvaluationContext): The context with which to evaluate this schedule.\n Returns:\n ScheduleExecutionData: Contains list of run requests, or skip message if present.\n\n """\n\n from .decorators.schedule_decorator import DecoratedScheduleFunction\n\n check.inst_param(context, "context", ScheduleEvaluationContext)\n if isinstance(self._execution_fn, DecoratedScheduleFunction):\n execution_fn = self._execution_fn.wrapped_fn\n else:\n execution_fn = cast(Callable[[ScheduleEvaluationContext], Any], self._execution_fn)\n result = list(ensure_gen(execution_fn(context)))\n\n skip_message: Optional[str] = None\n\n if not result or result == [None]:\n run_requests = []\n skip_message = "Schedule function returned an empty result"\n elif len(result) == 1:\n item = result[0]\n check.inst(item, (SkipReason, RunRequest))\n run_requests = [item] if isinstance(item, RunRequest) else []\n skip_message = item.skip_message if isinstance(item, SkipReason) else None\n else:\n # NOTE: mypy is not correctly reading this cast-- not sure why\n # (pyright reads it fine). 
Hence the type-ignores below.\n result = cast(List[RunRequest], check.is_list(result, of_type=RunRequest)) # type: ignore\n check.invariant(\n not any(not request.run_key for request in result), # type: ignore\n "Schedules that return multiple RunRequests must specify a run_key in each RunRequest",\n )\n run_requests = result # type: ignore\n skip_message = None\n\n # clone all the run requests with the required schedule tags\n run_requests_with_schedule_tags = [\n RunRequest(\n run_key=request.run_key,\n run_config=request.run_config,\n tags=merge_dicts(request.tags, PipelineRun.tags_for_schedule(self)),\n )\n for request in run_requests\n ]\n\n return ScheduleExecutionData(\n run_requests=run_requests_with_schedule_tags, skip_message=skip_message\n )\n\n def has_loadable_target(self):\n return isinstance(self._target, DirectTarget)\n\n def load_target(self):\n if isinstance(self._target, DirectTarget):\n return self._target.load()\n\n check.failed("Target is not loadable")\n\n @property\n def default_status(self) -> DefaultScheduleStatus:\n return self._default_status
\n\n\ndef is_context_provided(params: List[funcsigs.Parameter]) -> bool:\n return len(params) == 1\n
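A minimal sketch of constructing a ``ScheduleDefinition`` directly against a job and evaluating a tick, assuming ``my_job`` is a job or graph defined elsewhere:

.. code-block:: python

    my_schedule = ScheduleDefinition(
        name="my_daily_schedule",
        cron_schedule="0 6 * * *",
        job=my_job,
        run_config={"ops": {}},
        tags={"team": "data"},
        execution_timezone="America/Los_Angeles",
    )

    # build_schedule_context() returns a ScheduleEvaluationContext, which is a context manager.
    with build_schedule_context() as context:
        data = my_schedule.evaluate_tick(context)
        assert len(data.run_requests) == 1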
", "current_page_name": "_modules/dagster/core/definitions/schedule_definition", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.schedule_definition"}, "sensor_definition": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.sensor_definition

\nimport inspect\nfrom contextlib import ExitStack\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Generator,\n    List,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Union,\n    cast,\n)\n\nfrom dagster import check\nfrom dagster.core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n)\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.instance.ref import InstanceRef\nfrom dagster.serdes import whitelist_for_serdes\nfrom dagster.seven import funcsigs\nfrom dagster.utils import ensure_gen\n\nfrom ..decorator_utils import get_function_params\nfrom .events import AssetKey\nfrom .graph_definition import GraphDefinition\nfrom .job_definition import JobDefinition\nfrom .mode import DEFAULT_MODE_NAME\nfrom .pipeline_definition import PipelineDefinition\nfrom .run_request import PipelineRunReaction, RunRequest, SkipReason\nfrom .target import DirectTarget, RepoRelativeTarget\nfrom .utils import check_valid_name\n\nif TYPE_CHECKING:\n    from dagster.core.events.log import EventLogEntry\n\n\n@whitelist_for_serdes\nclass DefaultSensorStatus(Enum):\n    RUNNING = "RUNNING"\n    STOPPED = "STOPPED"\n\n\nDEFAULT_SENSOR_DAEMON_INTERVAL = 30\n\n\ndef is_context_provided(params: List[funcsigs.Parameter]) -> bool:\n    return len(params) == 1\n\n\n
[docs]class SensorEvaluationContext:\n """Sensor execution context.\n\n An instance of this class is made available as the first argument to the evaluation function\n on SensorDefinition.\n\n Attributes:\n instance_ref (Optional[InstanceRef]): The serialized instance configured to run the schedule\n cursor (Optional[str]): The cursor, passed back from the last sensor evaluation via\n the cursor attribute of SkipReason and RunRequest\n last_completion_time (float): DEPRECATED The last time that the sensor was evaluated (UTC).\n last_run_key (str): DEPRECATED The run key of the RunRequest most recently created by this\n sensor. Use the preferred `cursor` attribute instead.\n repository_name (Optional[str]): The name of the repository that the sensor belongs to.\n instance (Optional[DagsterInstance]): The deserialized instance can also be passed in\n directly (primarily useful in testing contexts).\n """\n\n def __init__(\n self,\n instance_ref: Optional[InstanceRef],\n last_completion_time: Optional[float],\n last_run_key: Optional[str],\n cursor: Optional[str],\n repository_name: Optional[str],\n instance: Optional[DagsterInstance] = None,\n ):\n self._exit_stack = ExitStack()\n self._instance_ref = check.opt_inst_param(instance_ref, "instance_ref", InstanceRef)\n self._last_completion_time = check.opt_float_param(\n last_completion_time, "last_completion_time"\n )\n self._last_run_key = check.opt_str_param(last_run_key, "last_run_key")\n self._cursor = check.opt_str_param(cursor, "cursor")\n self._repository_name = check.opt_str_param(repository_name, "repository_name")\n self._instance = check.opt_inst_param(instance, "instance", DagsterInstance)\n\n def __enter__(self):\n return self\n\n def __exit__(self, _exception_type, _exception_value, _traceback):\n self._exit_stack.close()\n\n @property\n def instance(self) -> DagsterInstance:\n # self._instance_ref should only ever be None when this SensorEvaluationContext was\n # constructed under test.\n if not self._instance:\n if not self._instance_ref:\n raise DagsterInvariantViolationError(\n "Attempted to initialize dagster instance, but no instance reference was provided."\n )\n self._instance = self._exit_stack.enter_context(\n DagsterInstance.from_ref(self._instance_ref)\n )\n return cast(DagsterInstance, self._instance)\n\n @property\n def last_completion_time(self) -> Optional[float]:\n return self._last_completion_time\n\n @property\n def last_run_key(self) -> Optional[str]:\n return self._last_run_key\n\n @property\n def cursor(self) -> Optional[str]:\n """The cursor value for this sensor, which was set in an earlier sensor evaluation."""\n return self._cursor\n\n def update_cursor(self, cursor: Optional[str]) -> None:\n """Updates the cursor value for this sensor, which will be provided on the context for the\n next sensor evaluation.\n\n This can be used to keep track of progress and avoid duplicate work across sensor\n evaluations.\n\n Args:\n cursor (Optional[str]):\n """\n self._cursor = check.opt_str_param(cursor, "cursor")\n\n @property\n def repository_name(self) -> Optional[str]:\n return self._repository_name
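A sketch of how an evaluation function might use the cursor to avoid duplicate work across evaluations; the directory-watching logic and path are illustrative only.

.. code-block:: python

    import os

    def my_directory_sensor_fn(context: SensorEvaluationContext):
        last_mtime = float(context.cursor) if context.cursor else 0.0
        max_mtime = last_mtime
        for filename in os.listdir("/tmp/inbox"):
            path = os.path.join("/tmp/inbox", filename)
            mtime = os.path.getmtime(path)
            if mtime > last_mtime:
                yield RunRequest(run_key=filename, run_config={})
            max_mtime = max(max_mtime, mtime)
        # Persist progress so the next evaluation only sees newer files.
        context.update_cursor(str(max_mtime))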
\n\n\n# Preserve SensorExecutionContext for backcompat so type annotations don't break.\nSensorExecutionContext = SensorEvaluationContext\n\n\n
[docs]class SensorDefinition:\n """Define a sensor that initiates a set of runs based on some external state\n\n Args:\n evaluation_fn (Callable[[SensorEvaluationContext]]): The core evaluation function for the\n sensor, which is run at an interval to determine whether a run should be launched or\n not. Takes a :py:class:`~dagster.SensorEvaluationContext`.\n\n This function must return a generator, which must yield either a single SkipReason\n or one or more RunRequest objects.\n name (Optional[str]): The name of the sensor to create. Defaults to name of evaluation_fn\n pipeline_name (Optional[str]): (legacy) The name of the pipeline to execute when the sensor\n fires. Cannot be used in conjunction with `job` or `jobs` parameters.\n solid_selection (Optional[List[str]]): (legacy) A list of solid subselection (including single\n solid names) to execute when the sensor runs. e.g. ``['*some_solid+', 'other_solid']``.\n Cannot be used in conjunction with `job` or `jobs` parameters.\n mode (Optional[str]): (legacy) The mode to apply when executing runs triggered by this\n sensor. Cannot be used in conjunction with `job` or `jobs` parameters. (default:\n 'default')\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[GraphDefinition, JobDefinition]): The job to execute when this sensor fires.\n jobs (Optional[Sequence[GraphDefinition, JobDefinition]]): (experimental) A list of jobs to execute when this sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n """\n\n def __init__(\n self,\n name: Optional[str] = None,\n evaluation_fn: Optional[\n Callable[\n ["SensorEvaluationContext"],\n Union[Generator[Union[RunRequest, SkipReason], None, None], RunRequest, SkipReason],\n ]\n ] = None,\n pipeline_name: Optional[str] = None,\n solid_selection: Optional[List[Any]] = None,\n mode: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[Union[GraphDefinition, JobDefinition]] = None,\n jobs: Optional[Sequence[Union[GraphDefinition, JobDefinition]]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n ):\n if evaluation_fn is None:\n raise DagsterInvalidDefinitionError("Must provide evaluation_fn to SensorDefinition.")\n\n if job and jobs:\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both job and jobs to SensorDefinition. Must provide only one "\n "of the two."\n )\n\n job_param_name = "job" if job else "jobs"\n jobs = jobs if jobs else [job] if job else None\n\n if pipeline_name and jobs:\n raise DagsterInvalidDefinitionError(\n f"Attempted to provide both pipeline_name and {job_param_name} to "\n "SensorDefinition. Must provide only one of the two."\n )\n if solid_selection and jobs:\n raise DagsterInvalidDefinitionError(\n f"Attempted to provide solid_selection and {job_param_name} to SensorDefinition. "\n "The solid_selection argument is incompatible with jobs."\n )\n if mode and jobs:\n raise DagsterInvalidDefinitionError(\n f"Attempted to provide mode and {job_param_name} to SensorDefinition. 
"\n "The mode argument is incompatible with jobs."\n )\n\n targets: Optional[List[Union[RepoRelativeTarget, DirectTarget]]] = None\n if pipeline_name:\n targets = [\n RepoRelativeTarget(\n pipeline_name=check.str_param(pipeline_name, "pipeline_name"),\n mode=check.opt_str_param(mode, "mode") or DEFAULT_MODE_NAME,\n solid_selection=check.opt_nullable_list_param(\n solid_selection, "solid_selection", of_type=str\n ),\n )\n ]\n elif job:\n targets = [DirectTarget(job)]\n elif jobs:\n targets = [DirectTarget(job) for job in jobs]\n\n if name:\n self._name = check_valid_name(name)\n else:\n self._name = evaluation_fn.__name__\n\n self._raw_fn = check.callable_param(evaluation_fn, "evaluation_fn")\n self._evaluation_fn: Callable[\n [SensorEvaluationContext], Generator[Union[RunRequest, SkipReason], None, None]\n ] = wrap_sensor_evaluation(self._name, evaluation_fn)\n self._min_interval = check.opt_int_param(\n minimum_interval_seconds, "minimum_interval_seconds", DEFAULT_SENSOR_DAEMON_INTERVAL\n )\n self._description = check.opt_str_param(description, "description")\n self._targets = check.opt_list_param(targets, "targets", (DirectTarget, RepoRelativeTarget))\n self._default_status = check.inst_param(\n default_status, "default_status", DefaultSensorStatus\n )\n\n def __call__(self, *args, **kwargs):\n context_provided = is_context_provided(get_function_params(self._raw_fn))\n\n if context_provided:\n if len(args) + len(kwargs) == 0:\n raise DagsterInvalidInvocationError(\n "Sensor evaluation function expected context argument, but no context argument "\n "was provided when invoking."\n )\n if len(args) + len(kwargs) > 1:\n raise DagsterInvalidInvocationError(\n "Sensor invocation received multiple arguments. Only a first "\n "positional context parameter should be provided when invoking."\n )\n\n context_param_name = get_function_params(self._raw_fn)[0].name\n\n if args:\n context = check.opt_inst_param(args[0], context_param_name, SensorEvaluationContext)\n else:\n if context_param_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Sensor invocation expected argument '{context_param_name}'."\n )\n context = check.opt_inst_param(\n kwargs[context_param_name], context_param_name, SensorEvaluationContext\n )\n\n context = context if context else build_sensor_context()\n\n return self._raw_fn(context)\n\n else:\n if len(args) + len(kwargs) > 0:\n raise DagsterInvalidInvocationError(\n "Sensor decorated function has no arguments, but arguments were provided to "\n "invocation."\n )\n\n return self._raw_fn()\n\n @property\n def name(self) -> str:\n return self._name\n\n @property\n def description(self) -> Optional[str]:\n return self._description\n\n @property\n def minimum_interval_seconds(self) -> Optional[int]:\n return self._min_interval\n\n @property\n def targets(self) -> List[Union[DirectTarget, RepoRelativeTarget]]:\n return self._targets\n\n @property\n def job(self) -> PipelineDefinition:\n if self._targets:\n if len(self._targets) == 1 and isinstance(self._targets[0], DirectTarget):\n return self._targets[0].pipeline\n elif len(self._targets) > 1:\n raise DagsterInvalidDefinitionError(\n "Job property not available when SensorDefinition has multiple jobs."\n )\n raise DagsterInvalidDefinitionError("No job was provided to SensorDefinition.")\n\n def evaluate_tick(self, context: "SensorEvaluationContext") -> "SensorExecutionData":\n """Evaluate sensor using the provided context.\n\n Args:\n context (SensorEvaluationContext): The context with which to evaluate this sensor.\n 
Returns:\n SensorExecutionData: Contains list of run requests, or skip message if present.\n\n """\n\n check.inst_param(context, "context", SensorEvaluationContext)\n result = list(ensure_gen(self._evaluation_fn(context)))\n\n skip_message: Optional[str] = None\n\n run_requests: List[RunRequest]\n pipeline_run_reactions: List[PipelineRunReaction]\n if not result or result == [None]:\n run_requests = []\n pipeline_run_reactions = []\n skip_message = "Sensor function returned an empty result"\n elif len(result) == 1:\n item = result[0]\n check.inst(item, (SkipReason, RunRequest, PipelineRunReaction))\n run_requests = [item] if isinstance(item, RunRequest) else []\n pipeline_run_reactions = (\n [cast(PipelineRunReaction, item)] if isinstance(item, PipelineRunReaction) else []\n )\n skip_message = item.skip_message if isinstance(item, SkipReason) else None\n else:\n check.is_list(result, (SkipReason, RunRequest, PipelineRunReaction))\n has_skip = any(map(lambda x: isinstance(x, SkipReason), result))\n has_run_request = any(map(lambda x: isinstance(x, RunRequest), result))\n has_run_reaction = any(map(lambda x: isinstance(x, PipelineRunReaction), result))\n\n if has_skip:\n if has_run_request:\n check.failed(\n "Expected a single SkipReason or one or more RunRequests: received both "\n "RunRequest and SkipReason"\n )\n elif has_run_reaction:\n check.failed(\n "Expected a single SkipReason or one or more PipelineRunReaction: "\n "received both PipelineRunReaction and SkipReason"\n )\n else:\n check.failed("Expected a single SkipReason: received multiple SkipReasons")\n\n if has_run_request:\n run_requests = cast(List[RunRequest], result)\n pipeline_run_reactions = []\n\n else:\n # only run reactions\n run_requests = []\n pipeline_run_reactions = cast(List[PipelineRunReaction], result)\n\n self.check_valid_run_requests(run_requests)\n\n return SensorExecutionData(\n run_requests,\n skip_message,\n context.cursor,\n pipeline_run_reactions,\n )\n\n def has_loadable_targets(self) -> bool:\n for target in self._targets:\n if isinstance(target, DirectTarget):\n return True\n return False\n\n def load_targets(self) -> List[PipelineDefinition]:\n targets = []\n for target in self._targets:\n if isinstance(target, DirectTarget):\n targets.append(target.load())\n return targets\n\n def check_valid_run_requests(self, run_requests: List[RunRequest]):\n has_multiple_targets = len(self._targets) > 1\n target_names = [target.pipeline_name for target in self._targets]\n\n if run_requests and not self._targets:\n raise Exception(\n f"Error in sensor {self._name}: Sensor evaluation function returned a RunRequest "\n "for a sensor lacking a specified target (pipeline_name, job, or jobs). Targets "\n "can be specified by providing job, jobs, or pipeline_name to the @sensor "\n "decorator."\n )\n\n for run_request in run_requests:\n if run_request.job_name is None and has_multiple_targets:\n raise Exception(\n f"Error in sensor {self._name}: Sensor returned a RunRequest that did not "\n f"specify job_name for the requested run. Expected one of: {target_names}"\n )\n elif run_request.job_name and run_request.job_name not in target_names:\n raise Exception(\n f"Error in sensor {self._name}: Sensor returned a RunRequest with job_name "\n f"{run_request.job_name}. 
Expected one of: {target_names}"\n )\n\n @property\n def _target(self) -> Optional[Union[DirectTarget, RepoRelativeTarget]]:\n return self._targets[0] if self._targets else None\n\n @property\n def pipeline_name(self) -> Optional[str]:\n return self._target.pipeline_name if self._target else None\n\n @property\n def solid_selection(self) -> Optional[List[Any]]:\n return self._target.solid_selection if self._target else None\n\n @property\n def mode(self) -> Optional[str]:\n return self._target.mode if self._target else None\n\n @property\n def default_status(self) -> DefaultSensorStatus:\n return self._default_status
\n\n\n@whitelist_for_serdes\nclass SensorExecutionData(\n NamedTuple(\n "_SensorExecutionData",\n [\n ("run_requests", Optional[List[RunRequest]]),\n ("skip_message", Optional[str]),\n ("cursor", Optional[str]),\n ("pipeline_run_reactions", Optional[List[PipelineRunReaction]]),\n ],\n )\n):\n def __new__(\n cls,\n run_requests: Optional[List[RunRequest]] = None,\n skip_message: Optional[str] = None,\n cursor: Optional[str] = None,\n pipeline_run_reactions: Optional[List[PipelineRunReaction]] = None,\n ):\n check.opt_list_param(run_requests, "run_requests", RunRequest)\n check.opt_str_param(skip_message, "skip_message")\n check.opt_str_param(cursor, "cursor")\n check.opt_list_param(pipeline_run_reactions, "pipeline_run_reactions", PipelineRunReaction)\n check.invariant(\n not (run_requests and skip_message), "Found both skip data and run request data"\n )\n return super(SensorExecutionData, cls).__new__(\n cls,\n run_requests=run_requests,\n skip_message=skip_message,\n cursor=cursor,\n pipeline_run_reactions=pipeline_run_reactions,\n )\n\n\ndef wrap_sensor_evaluation(\n sensor_name: str,\n fn: Callable[\n ["SensorEvaluationContext"],\n Union[Generator[Union[RunRequest, SkipReason], None, None], RunRequest, SkipReason],\n ],\n) -> Callable[["SensorEvaluationContext"], Generator[Union[SkipReason, RunRequest], None, None]]:\n def _wrapped_fn(context):\n result = fn(context) if is_context_provided(get_function_params(fn)) else fn()\n\n if inspect.isgenerator(result):\n for item in result:\n yield item\n elif isinstance(result, (SkipReason, RunRequest)):\n yield result\n\n elif result is not None:\n raise Exception(\n (\n "Error in sensor {sensor_name}: Sensor unexpectedly returned output "\n "{result} of type {type_}. Should only return SkipReason or "\n "RunRequest objects."\n ).format(sensor_name=sensor_name, result=result, type_=type(result))\n )\n\n return _wrapped_fn\n\n\n
[docs]def build_sensor_context(\n instance: Optional[DagsterInstance] = None,\n cursor: Optional[str] = None,\n repository_name: Optional[str] = None,\n) -> SensorEvaluationContext:\n """Builds sensor execution context using the provided parameters.\n\n This function can be used to provide a context to the invocation of a sensor definition.If\n provided, the dagster instance must be persistent; DagsterInstance.ephemeral() will result in an\n error.\n\n Args:\n instance (Optional[DagsterInstance]): The dagster instance configured to run the sensor.\n cursor (Optional[str]): A cursor value to provide to the evaluation of the sensor.\n repository_name (Optional[str]): The name of the repository that the sensor belongs to.\n\n Examples:\n\n .. code-block:: python\n\n context = build_sensor_context()\n my_sensor(context)\n\n """\n\n check.opt_inst_param(instance, "instance", DagsterInstance)\n check.opt_str_param(cursor, "cursor")\n check.opt_str_param(repository_name, "repository_name")\n return SensorEvaluationContext(\n instance_ref=None,\n last_completion_time=None,\n last_run_key=None,\n cursor=cursor,\n repository_name=repository_name,\n instance=instance,\n )
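Putting the pieces together, a sensor built directly from ``SensorDefinition`` can be evaluated in a test with ``build_sensor_context``. This is a sketch; ``my_job`` is an assumed job object.

.. code-block:: python

    def my_evaluation_fn(context: SensorEvaluationContext):
        if context.cursor == "done":
            return SkipReason("nothing new since the last evaluation")
        return RunRequest(run_key="only-once", run_config={})

    my_sensor = SensorDefinition(
        name="my_sensor",
        evaluation_fn=my_evaluation_fn,
        job=my_job,
        minimum_interval_seconds=30,
    )

    context = build_sensor_context(cursor=None, repository_name="my_repo")
    data = my_sensor.evaluate_tick(context)
    assert [request.run_key for request in data.run_requests] == ["only-once"]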
\n\n\n
[docs]class AssetSensorDefinition(SensorDefinition):\n """Define an asset sensor that initiates a set of runs based on the materialization of a given\n asset.\n\n Args:\n name (str): The name of the sensor to create.\n asset_key (AssetKey): The asset_key this sensor monitors.\n pipeline_name (Optional[str]): (legacy) The name of the pipeline to execute when the sensor\n fires. Cannot be used in conjunction with `job` or `jobs` parameters.\n asset_materialization_fn (Callable[[SensorEvaluationContext, EventLogEntry], Union[Generator[Union[RunRequest, SkipReason], None, None], RunRequest, SkipReason]]): The core\n evaluation function for the sensor, which is run at an interval to determine whether a\n run should be launched or not. Takes a :py:class:`~dagster.SensorEvaluationContext` and\n an EventLogEntry corresponding to an AssetMaterialization event.\n\n This function must return a generator, which must yield either a single SkipReason\n or one or more RunRequest objects.\n solid_selection (Optional[List[str]]): (legacy) A list of solid subselection (including single\n solid names) to execute when the sensor runs. e.g. ``['*some_solid+', 'other_solid']``.\n Cannot be used in conjunction with `job` or `jobs` parameters.\n mode (Optional[str]): (legacy) The mode to apply when executing runs triggered by this sensor.\n (default: 'default').\n Cannot be used in conjunction with `job` or `jobs` parameters.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[Union[GraphDefinition, JobDefinition]]): The job object to target with this sensor.\n jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition]]]): (experimental) A list of jobs to be executed when the sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. 
The default\n status can be overridden from Dagit or via the GraphQL API.\n """\n\n def __init__(\n self,\n name: str,\n asset_key: AssetKey,\n pipeline_name: Optional[str],\n asset_materialization_fn: Callable[\n ["SensorExecutionContext", "EventLogEntry"],\n Union[Generator[Union[RunRequest, SkipReason], None, None], RunRequest, SkipReason],\n ],\n solid_selection: Optional[List[str]] = None,\n mode: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[Union[GraphDefinition, JobDefinition]] = None,\n jobs: Optional[Sequence[Union[GraphDefinition, JobDefinition]]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n ):\n self._asset_key = check.inst_param(asset_key, "asset_key", AssetKey)\n\n from dagster.core.events import DagsterEventType\n from dagster.core.storage.event_log.base import EventRecordsFilter\n\n def _wrap_asset_fn(materialization_fn):\n def _fn(context):\n after_cursor = None\n if context.cursor:\n try:\n after_cursor = int(context.cursor)\n except ValueError:\n after_cursor = None\n\n event_records = context.instance.get_event_records(\n EventRecordsFilter(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n asset_key=self._asset_key,\n after_cursor=after_cursor,\n ),\n ascending=False,\n limit=1,\n )\n\n if not event_records:\n return\n\n event_record = event_records[0]\n yield from materialization_fn(context, event_record.event_log_entry)\n context.update_cursor(str(event_record.storage_id))\n\n return _fn\n\n super(AssetSensorDefinition, self).__init__(\n name=check_valid_name(name),\n pipeline_name=pipeline_name,\n evaluation_fn=_wrap_asset_fn(\n check.callable_param(asset_materialization_fn, "asset_materialization_fn"),\n ),\n solid_selection=solid_selection,\n mode=mode,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n )\n\n @property\n def asset_key(self):\n return self._asset_key
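# Illustrative sketch: constructing an AssetSensorDefinition directly (the
# @asset_sensor decorator is the more common entry point). The op, job, and
# asset key names below are hypothetical.

from dagster import AssetKey, RunRequest, job, op


@op
def process_table():
    ...


@job
def process_table_job():
    process_table()


def on_my_table_materialized(context, event_log_entry):
    # Launch a run for each new materialization of the monitored asset.
    yield RunRequest(run_key=context.cursor, run_config={})


my_table_sensor = AssetSensorDefinition(
    name="my_table_sensor",
    asset_key=AssetKey("my_table"),
    pipeline_name=None,  # legacy argument; a job target is provided instead
    asset_materialization_fn=on_my_table_materialized,
    job=process_table_job,
)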
\n
", "current_page_name": "_modules/dagster/core/definitions/sensor_definition", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.sensor_definition"}, "solid_definition": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.solid_definition

\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Dict,\n    FrozenSet,\n    Iterator,\n    List,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n)\n\nfrom dagster import check\nfrom dagster.core.definitions.dependency import NodeHandle\nfrom dagster.core.definitions.policy import RetryPolicy\nfrom dagster.core.errors import DagsterInvalidDefinitionError, DagsterInvalidInvocationError\nfrom dagster.core.types.dagster_type import DagsterType\nfrom dagster.utils.backcompat import experimental_arg_warning\n\nfrom ..decorator_utils import get_function_params\nfrom .config import ConfigMapping\nfrom .definition_config_schema import (\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\nfrom .dependency import IDependencyDefinition, NodeHandle, NodeInvocation\nfrom .graph_definition import GraphDefinition\nfrom .input import InputDefinition, InputMapping\nfrom .node_definition import NodeDefinition\nfrom .output import OutputDefinition, OutputMapping\nfrom .solid_invocation import solid_invocation_result\n\nif TYPE_CHECKING:\n    from .decorators.solid_decorator import DecoratedSolidFunction\n\n\n
[docs]class SolidDefinition(NodeDefinition):\n """\n The definition of a Solid that performs a user-defined computation.\n\n For more details on what a solid is, refer to the\n `Solid Overview <../../overview/solids-pipelines/solids>`_ .\n\n End users should prefer the :func:`@solid <solid>` and :func:`@lambda_solid <lambda_solid>`\n decorators. SolidDefinition is generally intended to be used by framework authors.\n\n Args:\n name (str): Name of the solid. Must be unique within any :py:class:`PipelineDefinition`\n using the solid.\n input_defs (List[InputDefinition]): Inputs of the solid.\n compute_fn (Callable): The core of the solid, the function that does the actual\n computation. The signature of this function is determined by ``input_defs``, and\n optionally, an injected first argument, ``context``, a collection of information provided\n by the system.\n\n This function will be coerced into a generator or an async generator, which must yield\n one :py:class:`Output` for each of the solid's ``output_defs``, and additionally may\n yield other types of Dagster events, including :py:class:`Materialization` and\n :py:class:`ExpectationResult`.\n output_defs (List[OutputDefinition]): Outputs of the solid.\n config_schema (Optional[ConfigSchema): The schema for the config. If set, Dagster will check\n that config provided for the solid matches this schema and fail if it does not. If not\n set, Dagster will accept any config provided for the solid.\n description (Optional[str]): Human-readable description of the solid.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the solid. Frameworks may\n expect and require certain metadata to be attached to a solid. Users should generally\n not set metadata directly. Values that are not strings will be json encoded and must meet\n the criteria that `json.loads(json.dumps(value)) == value`.\n required_resource_keys (Optional[Set[str]]): Set of resources handles required by this\n solid.\n version (Optional[str]): (Experimental) The version of the solid's compute_fn. Two solids should have\n the same version if and only if they deterministically produce the same outputs when\n provided the same inputs.\n retry_policy (Optional[RetryPolicy]): The retry policy for this solid.\n\n\n Examples:\n .. 
code-block:: python\n\n def _add_one(_context, inputs):\n yield Output(inputs["num"] + 1)\n\n SolidDefinition(\n name="add_one",\n input_defs=[InputDefinition("num", Int)],\n output_defs=[OutputDefinition(Int)], # default name ("result")\n compute_fn=_add_one,\n )\n """\n\n def __init__(\n self,\n name: str,\n input_defs: Sequence[InputDefinition],\n compute_fn: Union[Callable[..., Any], "DecoratedSolidFunction"],\n output_defs: Sequence[OutputDefinition],\n config_schema: Optional[Union[Dict[str, Any], IDefinitionConfigSchema]] = None,\n description: Optional[str] = None,\n tags: Optional[Dict[str, str]] = None,\n required_resource_keys: Optional[Union[Set[str], FrozenSet[str]]] = None,\n version: Optional[str] = None,\n retry_policy: Optional[RetryPolicy] = None,\n ):\n from .decorators.solid_decorator import DecoratedSolidFunction\n\n if isinstance(compute_fn, DecoratedSolidFunction):\n self._compute_fn: Union[Callable[..., Any], DecoratedSolidFunction] = compute_fn\n else:\n compute_fn = cast(Callable[..., Any], compute_fn)\n self._compute_fn = check.callable_param(compute_fn, "compute_fn")\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._required_resource_keys = frozenset(\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n )\n self._version = check.opt_str_param(version, "version")\n if version:\n experimental_arg_warning("version", "SolidDefinition.__init__")\n self._retry_policy = check.opt_inst_param(retry_policy, "retry_policy", RetryPolicy)\n\n positional_inputs = (\n self._compute_fn.positional_inputs()\n if isinstance(self._compute_fn, DecoratedSolidFunction)\n else None\n )\n\n super(SolidDefinition, self).__init__(\n name=name,\n input_defs=check.list_param(input_defs, "input_defs", InputDefinition),\n output_defs=check.list_param(output_defs, "output_defs", OutputDefinition),\n description=description,\n tags=check.opt_dict_param(tags, "tags", key_type=str),\n positional_inputs=positional_inputs,\n )\n\n def __call__(self, *args, **kwargs) -> Any:\n from ..execution.context.invocation import UnboundSolidExecutionContext\n from .composition import is_in_composition\n from .decorators.solid_decorator import DecoratedSolidFunction\n\n if is_in_composition():\n return super(SolidDefinition, self).__call__(*args, **kwargs)\n else:\n node_label = self.node_type_str # string "solid" for solids, "op" for ops\n\n if not isinstance(self.compute_fn, DecoratedSolidFunction):\n raise DagsterInvalidInvocationError(\n f"Attemped to invoke {node_label} that was not constructed using the `@{node_label}` "\n f"decorator. 
Only {node_label}s constructed using the `@{node_label}` decorator can be "\n "directly invoked."\n )\n if self.compute_fn.has_context_arg():\n if len(args) + len(kwargs) == 0:\n raise DagsterInvalidInvocationError(\n f"Compute function of {node_label} '{self.name}' has context argument, but no context "\n "was provided when invoking."\n )\n if len(args) > 0:\n if args[0] is not None and not isinstance(\n args[0], UnboundSolidExecutionContext\n ):\n raise DagsterInvalidInvocationError(\n f"Compute function of {node_label} '{self.name}' has context argument, "\n "but no context was provided when invoking."\n )\n context = args[0]\n return solid_invocation_result(self, context, *args[1:], **kwargs)\n # Context argument is provided under kwargs\n else:\n context_param_name = get_function_params(self.compute_fn.decorated_fn)[0].name\n if context_param_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Compute function of {node_label} '{self.name}' has context argument "\n f"'{context_param_name}', but no value for '{context_param_name}' was "\n f"found when invoking. Provided kwargs: {kwargs}"\n )\n context = kwargs[context_param_name]\n kwargs_sans_context = {\n kwarg: val\n for kwarg, val in kwargs.items()\n if not kwarg == context_param_name\n }\n return solid_invocation_result(self, context, *args, **kwargs_sans_context)\n\n else:\n if len(args) > 0 and isinstance(args[0], UnboundSolidExecutionContext):\n raise DagsterInvalidInvocationError(\n f"Compute function of {node_label} '{self.name}' has no context argument, but "\n "context was provided when invoking."\n )\n return solid_invocation_result(self, None, *args, **kwargs)\n\n @property\n def node_type_str(self) -> str:\n return "solid"\n\n @property\n def is_graph_job_op_node(self) -> bool:\n return False\n\n @property\n def compute_fn(self) -> Union[Callable[..., Any], "DecoratedSolidFunction"]:\n return self._compute_fn\n\n @property\n def config_schema(self) -> IDefinitionConfigSchema:\n return self._config_schema\n\n @property\n def required_resource_keys(self) -> Optional[FrozenSet[str]]:\n return frozenset(self._required_resource_keys)\n\n @property\n def version(self) -> Optional[str]:\n return self._version\n\n def all_dagster_types(self) -> Iterator[DagsterType]:\n yield from self.all_input_output_types()\n\n def iterate_node_defs(self) -> Iterator[NodeDefinition]:\n yield self\n\n def iterate_solid_defs(self) -> Iterator["SolidDefinition"]:\n yield self\n\n def resolve_output_to_origin(\n self, output_name: str, handle: NodeHandle\n ) -> Tuple[OutputDefinition, NodeHandle]:\n return self.output_def_named(output_name), handle\n\n def input_has_default(self, input_name: str) -> InputDefinition:\n return self.input_def_named(input_name).has_default_value\n\n def default_value_for_input(self, input_name: str) -> InputDefinition:\n return self.input_def_named(input_name).default_value\n\n def input_supports_dynamic_output_dep(self, input_name: str) -> bool:\n return True\n\n def copy_for_configured(\n self,\n name: str,\n description: Optional[str],\n config_schema: IDefinitionConfigSchema,\n config_or_config_fn: Any,\n ) -> "SolidDefinition":\n return SolidDefinition(\n name=name,\n input_defs=self.input_defs,\n compute_fn=self.compute_fn,\n output_defs=self.output_defs,\n config_schema=config_schema,\n description=description or self.description,\n tags=self.tags,\n required_resource_keys=self.required_resource_keys,\n version=self.version,\n retry_policy=self.retry_policy,\n )\n\n @property\n def retry_policy(self) 
-> Optional[RetryPolicy]:\n return self._retry_policy
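# Illustrative sketch: the direct-invocation paths handled by __call__ above
# for decorated ops/solids. build_op_context is dagster's public helper for
# constructing the context argument; the op names are hypothetical.

from dagster import build_op_context, op


@op
def add_one(x: int) -> int:
    return x + 1


@op
def logged_add_one(context, x: int) -> int:
    context.log.info("adding one")
    return x + 1


assert add_one(1) == 2                                       # no context argument expected
assert logged_add_one(build_op_context(), 1) == 2            # context passed positionally
assert logged_add_one(context=build_op_context(), x=1) == 2  # context passed by keyword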
\n\n\n
[docs]class CompositeSolidDefinition(GraphDefinition):\n """The core unit of composition and abstraction, composite solids allow you to\n define a solid from a graph of solids.\n\n In the same way you would refactor a block of code in to a function to deduplicate, organize,\n or manage complexity - you can refactor solids in a pipeline in to a composite solid.\n\n Args:\n name (str): The name of this composite solid. Must be unique within any\n :py:class:`PipelineDefinition` using the solid.\n solid_defs (List[Union[SolidDefinition, CompositeSolidDefinition]]): The set of solid\n definitions used in this composite solid. Composites may be arbitrarily nested.\n input_mappings (Optional[List[InputMapping]]): Define the inputs to the composite solid,\n and how they map to the inputs of its constituent solids.\n output_mappings (Optional[List[OutputMapping]]): Define the outputs of the composite solid,\n and how they map from the outputs of its constituent solids.\n config_mapping (Optional[ConfigMapping]): By specifying a config mapping, you can override\n the configuration for the child solids contained within this composite solid. Config\n mappings require both a configuration field to be specified, which is exposed as the\n configuration for the composite solid, and a configuration mapping function, which\n is called to map the configuration of the composite solid into the configuration that\n is applied to any child solids.\n dependencies (Optional[Dict[Union[str, NodeInvocation], Dict[str, DependencyDefinition]]]):\n A structure that declares where each solid gets its inputs. The keys at the top\n level dict are either string names of solids or NodeInvocations. The values\n are dicts that map input names to DependencyDefinitions.\n description (Optional[str]): Human readable description of this composite solid.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the solid. Frameworks may\n expect and require certain metadata to be attached to a solid. Users should generally\n not set metadata directly. Values that are not strings will be json encoded and must meet\n the criteria that `json.loads(json.dumps(value)) == value`.\n may expect and require certain metadata to be attached to a solid.\n positional_inputs (Optional[List[str]]): The positional order of the inputs if it\n differs from the order of the input mappings\n\n Examples:\n\n .. 
code-block:: python\n\n @lambda_solid\n def add_one(num: int) -> int:\n return num + 1\n\n add_two = CompositeSolidDefinition(\n 'add_two',\n solid_defs=[add_one],\n dependencies={\n NodeInvocation('add_one', 'adder_1'): {},\n NodeInvocation('add_one', 'adder_2'): {'num': DependencyDefinition('adder_1')},\n },\n input_mappings=[InputDefinition('num', Int).mapping_to('adder_1', 'num')],\n output_mappings=[OutputDefinition(Int).mapping_from('adder_2')],\n )\n """\n\n def __init__(\n self,\n name: str,\n solid_defs: List[NodeDefinition],\n input_mappings: Optional[List[InputMapping]] = None,\n output_mappings: Optional[List[OutputMapping]] = None,\n config_mapping: Optional[ConfigMapping] = None,\n dependencies: Optional[\n Dict[Union[str, NodeInvocation], Dict[str, IDependencyDefinition]]\n ] = None,\n description: Optional[str] = None,\n tags: Optional[Dict[str, str]] = None,\n positional_inputs: Optional[List[str]] = None,\n ):\n _check_io_managers_on_composite_solid(name, input_mappings, output_mappings)\n\n super(CompositeSolidDefinition, self).__init__(\n name=name,\n description=description,\n node_defs=solid_defs,\n dependencies=dependencies,\n tags=tags,\n positional_inputs=positional_inputs,\n input_mappings=input_mappings,\n output_mappings=output_mappings,\n config=config_mapping,\n )\n\n def all_dagster_types(self) -> Iterator[DagsterType]:\n yield from self.all_input_output_types()\n\n for node_def in self._node_defs:\n yield from node_def.all_dagster_types()\n\n def copy_for_configured(\n self,\n name: str,\n description: Optional[str],\n config_schema: IDefinitionConfigSchema,\n config_or_config_fn: Any,\n ) -> "CompositeSolidDefinition":\n config_mapping = self._config_mapping\n if config_mapping is None:\n raise DagsterInvalidDefinitionError(\n "Only composite solids utilizing config mapping can be pre-configured. The "\n 'composite solid "{graph_name}" does not have a config mapping, and thus has '\n "nothing to be configured.".format(graph_name=self.name)\n )\n\n return CompositeSolidDefinition(\n name=name,\n solid_defs=self._node_defs,\n input_mappings=self.input_mappings,\n output_mappings=self.output_mappings,\n config_mapping=ConfigMapping(\n config_mapping.config_fn,\n config_schema=config_schema,\n receive_processed_config_values=config_mapping.receive_processed_config_values,\n ),\n dependencies=self.dependencies,\n description=description or self.description,\n tags=self.tags,\n positional_inputs=self.positional_inputs,\n )\n\n @property\n def node_type_str(self):\n return "composite solid"\n\n @property\n def is_graph_job_op_node(self) -> bool:\n return False
\n\n\ndef _check_io_managers_on_composite_solid(\n name: str,\n input_mappings: Optional[List[InputMapping]],\n output_mappings: Optional[List[OutputMapping]],\n):\n # Ban root_manager_key on composite solids\n if input_mappings:\n for input_mapping in input_mappings:\n input_def = input_mapping.definition\n if input_def.root_manager_key:\n raise DagsterInvalidDefinitionError(\n "Root input manager cannot be set on a composite solid: "\n f'root_manager_key "{input_def.root_manager_key}" '\n f'is set on InputDefinition "{input_def.name}" of composite solid "{name}". '\n )\n # Ban io_manager_key on composite solids\n if output_mappings:\n for output_mapping in output_mappings:\n output_def = output_mapping.definition\n if output_def.io_manager_key != "io_manager":\n raise DagsterInvalidDefinitionError(\n "IO manager cannot be set on a composite solid: "\n f'io_manager_key "{output_def.io_manager_key}" '\n f'is set on OutputDefinition "{output_def.name}" of composite solid "{name}". '\n )\n
", "current_page_name": "_modules/dagster/core/definitions/solid_definition", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.solid_definition"}, "time_window_partitions": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.time_window_partitions

\nfrom datetime import datetime, time\nfrom typing import Any, Callable, Dict, List, NamedTuple, Optional, Union, cast\n\nimport pendulum\n\nfrom dagster import check\nfrom dagster.utils.partitions import DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE\nfrom dagster.utils.schedules import schedule_execution_time_iterator\n\nfrom .partition import (\n    DEFAULT_DATE_FORMAT,\n    Partition,\n    PartitionedConfig,\n    PartitionsDefinition,\n    ScheduleType,\n    get_cron_schedule,\n)\n\n\nclass TimeWindow(NamedTuple):\n    """An interval that is closed at the start and open at the end"""\n\n    start: datetime\n    end: datetime\n\n\nclass TimeWindowPartitionsDefinition(\n    PartitionsDefinition[TimeWindow],  # pylint: disable=unsubscriptable-object\n    NamedTuple(\n        "_TimeWindowPartitions",\n        [\n            ("schedule_type", ScheduleType),\n            ("start", datetime),\n            ("timezone", str),\n            ("fmt", str),\n            ("end_offset", int),\n            ("minute_offset", int),\n            ("hour_offset", int),\n            ("day_offset", Optional[int]),\n        ],\n    ),\n):\n    def __new__(\n        cls,\n        schedule_type: ScheduleType,\n        start: Union[datetime, str],\n        timezone: Optional[str],\n        fmt: str,\n        end_offset: int,\n        minute_offset: int = 0,\n        hour_offset: int = 0,\n        day_offset: Optional[int] = None,\n    ):\n        if isinstance(start, str):\n            start_dt = datetime.strptime(start, fmt)\n        else:\n            start_dt = start\n\n        return super(TimeWindowPartitionsDefinition, cls).__new__(\n            cls,\n            schedule_type,\n            start_dt,\n            timezone or "UTC",\n            fmt,\n            end_offset,\n            minute_offset,\n            hour_offset,\n            day_offset,\n        )\n\n    def get_partitions(\n        self, current_time: Optional[datetime] = None\n    ) -> List[Partition[TimeWindow]]:\n        current_timestamp = (\n            pendulum.instance(current_time, tz=self.timezone)\n            if current_time\n            else pendulum.now(self.timezone)\n        ).timestamp()\n\n        time_of_day = time(self.hour_offset, self.minute_offset)\n\n        start_timestamp = pendulum.instance(self.start, tz=self.timezone).timestamp()\n        iterator = schedule_execution_time_iterator(\n            start_timestamp=start_timestamp,\n            cron_schedule=get_cron_schedule(\n                schedule_type=self.schedule_type,\n                time_of_day=time_of_day,\n                execution_day=self.day_offset,\n            ),\n            execution_timezone=self.timezone,\n        )\n\n        partitions: List[Partition[TimeWindow]] = []\n        prev_time = next(iterator)\n        while prev_time.timestamp() < start_timestamp:\n            prev_time = next(iterator)\n\n        end_offset = self.end_offset\n        partitions_past_current_time = 0\n        while True:\n            next_time = next(iterator)\n            if (\n                next_time.timestamp() <= current_timestamp\n                or partitions_past_current_time < end_offset\n            ):\n                partitions.append(\n                    Partition(\n                        value=TimeWindow(prev_time, next_time),\n                        name=prev_time.strftime(self.fmt),\n                    )\n                )\n\n                if next_time.timestamp() > current_timestamp:\n                    partitions_past_current_time += 1\n            
else:\n                break\n\n            prev_time = next_time\n\n        if end_offset < 0:\n            partitions = partitions[:end_offset]\n\n        return partitions\n\n    def __str__(self) -> str:\n        partition_def_str = f"{self.schedule_type.value.capitalize()}, starting {self.start.strftime(self.fmt)} {self.timezone}."\n        if self.end_offset != 0:\n            partition_def_str += f" End offsetted by {self.end_offset} partition{'' if self.end_offset == 1 else 's'}."\n        return partition_def_str\n\n    def time_window_for_partition_key(self, partition_key: str) -> TimeWindow:\n        start = self.start_time_for_partition_key(partition_key)\n        time_of_day = time(self.hour_offset, self.minute_offset)\n        iterator = schedule_execution_time_iterator(\n            start_timestamp=start.timestamp(),\n            cron_schedule=get_cron_schedule(\n                schedule_type=self.schedule_type,\n                time_of_day=time_of_day,\n                execution_day=self.day_offset,\n            ),\n            execution_timezone=self.timezone,\n        )\n\n        return TimeWindow(next(iterator), next(iterator))\n\n    def start_time_for_partition_key(self, partition_key: str) -> datetime:\n        return pendulum.instance(datetime.strptime(partition_key, self.fmt), tz=self.timezone)\n\n    def get_default_partition_mapping(self):\n        from dagster.core.asset_defs.time_window_partition_mapping import TimeWindowPartitionMapping\n\n        return TimeWindowPartitionMapping()\n\n\nclass DailyPartitionsDefinition(TimeWindowPartitionsDefinition):\n    def __new__(\n        cls,\n        start_date: Union[datetime, str],\n        minute_offset: int = 0,\n        hour_offset: int = 0,\n        timezone: Optional[str] = None,\n        fmt: Optional[str] = None,\n        end_offset: int = 0,\n    ):\n        """A set of daily partitions.\n\n        The first partition in the set will start at the start_date at midnight. The last partition\n        in the set will end before the current time, unless the end_offset argument is set to a\n        positive number. If minute_offset and/or hour_offset are used, the start and end times of\n        each partition will be hour_offset:minute_offset of each day.\n\n        Args:\n            start_date (Union[datetime.datetime, str]): The first date in the set of partitions. Can\n                provide in either a datetime or string format.\n            minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n                to 0.\n            hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n            timezone (Optional[str]): The timezone in which each date should exist.\n                Supported strings for timezones are the ones provided by the\n                `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n            fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n            end_offset (int): Extends the partition set by a number of partitions equal to the value\n                passed. If end_offset is 0 (the default), the last partition ends before the current\n                time. If end_offset is 1, the second-to-last partition ends before the current time,\n                and so on.\n\n        .. 
code-block:: python\n            DailyPartitionsDefinition(start_date="2022-03-12")\n            # creates partitions (2022-03-12-00:00, 2022-03-13-00:00), (2022-03-13-00:00, 2022-03-14-00:00), ...\n\n            DailyPartitionsDefinition(start_date="2022-03-12", minute_offset=15, hour_offset=16)\n            # creates partitions (2022-03-12-16:15, 2022-03-13-16:15), (2022-03-13-16:15, 2022-03-14-16:15), ...\n        """\n        _fmt = fmt or DEFAULT_DATE_FORMAT\n\n        return super(DailyPartitionsDefinition, cls).__new__(\n            cls,\n            schedule_type=ScheduleType.DAILY,\n            start=start_date,\n            minute_offset=minute_offset,\n            hour_offset=hour_offset,\n            timezone=timezone,\n            fmt=_fmt,\n            end_offset=end_offset,\n        )\n\n\ndef wrap_time_window_tags_fn(\n    tags_fn: Optional[Callable[[datetime, datetime], Dict[str, str]]]\n) -> Callable[[Partition], Dict[str, str]]:\n    def _tag_wrapper(partition: Partition) -> Dict[str, str]:\n        if not tags_fn:\n            return {}\n        return tags_fn(cast(datetime, partition.value[0]), cast(datetime, partition.value[1]))\n\n    return _tag_wrapper\n\n\n
[docs]def daily_partitioned_config(\n start_date: Union[datetime, str],\n minute_offset: int = 0,\n hour_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n tags_for_partition_fn: Optional[Callable[[datetime, datetime], Dict[str, str]]] = None,\n) -> Callable[[Callable[[datetime, datetime], Dict[str, Any]]], PartitionedConfig]:\n """Defines run config over a set of daily partitions.\n\n The decorated function should accept a start datetime and end datetime, which represent the bounds\n of the date partition the config should delineate.\n\n The decorated function should return a run config dictionary.\n\n The resulting object created by this decorator can be provided to the config argument of a Job.\n The first partition in the set will start at the start_date at midnight. The last partition in\n the set will end before the current time, unless the end_offset argument is set to a positive\n number. If minute_offset and/or hour_offset are used, the start and end times of each partition\n will be hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions. Can\n provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n\n .. code-block:: python\n\n @daily_partitioned_config(start_date="2022-03-12")\n # creates partitions (2022-03-12-00:00, 2022-03-13-00:00), (2022-03-13-00:00, 2022-03-14-00:00), ...\n\n @daily_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=16)\n # creates partitions (2022-03-12-16:15, 2022-03-13-16:15), (2022-03-13-16:15, 2022-03-14-16:15), ...\n """\n\n def inner(fn: Callable[[datetime, datetime], Dict[str, Any]]) -> PartitionedConfig:\n check.callable_param(fn, "fn")\n\n return PartitionedConfig(\n run_config_for_partition_fn=lambda partition: fn(\n partition.value[0], partition.value[1]\n ),\n partitions_def=DailyPartitionsDefinition(\n start_date=start_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n timezone=timezone,\n fmt=fmt,\n end_offset=end_offset,\n ),\n decorated_fn=fn,\n tags_for_partition_fn=wrap_time_window_tags_fn(tags_for_partition_fn),\n )\n\n return inner
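# Illustrative sketch: attaching a daily partitioned config to a job. The op,
# job, and config field names are hypothetical.

from dagster import job, op


@op(config_schema={"date": str})
def process_day(context):
    context.log.info(f"processing {context.op_config['date']}")


@daily_partitioned_config(start_date="2022-03-12")
def my_daily_config(start, end):
    # start/end are the datetime bounds of the partition's time window
    return {"ops": {"process_day": {"config": {"date": start.strftime("%Y-%m-%d")}}}}


@job(config=my_daily_config)
def process_day_job():
    process_day()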
\n\n\nclass HourlyPartitionsDefinition(TimeWindowPartitionsDefinition):\n def __new__(\n cls,\n start_date: Union[datetime, str],\n minute_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n ):\n """A set of hourly partitions.\n\n The first partition in the set will start on the start_date at midnight. The last partition\n in the set will end before the current time, unless the end_offset argument is set to a\n positive number. If minute_offset is provided, the start and end times of each partition\n will be minute_offset past the hour.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions. Can\n provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n\n .. code-block:: python\n\n HourlyPartitionsDefinition(start_date=datetime(2022, 03, 12))\n # creates partitions (2022-03-12-00:00, 2022-03-12-01:00), (2022-03-12-01:00, 2022-03-12-02:00), ...\n\n HourlyPartitionsDefinition(start_date=datetime(2022, 03, 12), minute_offset=15)\n # creates partitions (2022-03-12-00:15, 2022-03-12-01:15), (2022-03-12-01:15, 2022-03-12-02:15), ...\n """\n _fmt = fmt or DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE\n\n return super(HourlyPartitionsDefinition, cls).__new__(\n cls,\n schedule_type=ScheduleType.HOURLY,\n start=start_date,\n minute_offset=minute_offset,\n timezone=timezone,\n fmt=_fmt,\n end_offset=end_offset,\n )\n\n\n
[docs]def hourly_partitioned_config(\n start_date: Union[datetime, str],\n minute_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n tags_for_partition_fn: Optional[Callable[[datetime, datetime], Dict[str, str]]] = None,\n) -> Callable[[Callable[[datetime, datetime], Dict[str, Any]]], PartitionedConfig]:\n """Defines run config over a set of hourly partitions.\n\n The decorated function should accept a start datetime and end datetime, which represent the date\n partition the config should delineate.\n\n The decorated function should return a run config dictionary.\n\n The resulting object created by this decorator can be provided to the config argument of a Job.\n The first partition in the set will start at the start_date at midnight. The last partition in\n the set will end before the current time, unless the end_offset argument is set to a positive\n number. If minute_offset is provided, the start and end times of each partition will be\n minute_offset past the hour.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions. Can\n provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n\n .. code-block:: python\n\n @hourly_partitioned_config(start_date=datetime(2022, 03, 12))\n # creates partitions (2022-03-12-00:00, 2022-03-12-01:00), (2022-03-12-01:00, 2022-03-12-02:00), ...\n\n @hourly_partitioned_config(start_date=datetime(2022, 03, 12), minute_offset=15)\n # creates partitions (2022-03-12-00:15, 2022-03-12-01:15), (2022-03-12-01:15, 2022-03-12-02:15), ...\n """\n\n def inner(fn: Callable[[datetime, datetime], Dict[str, Any]]) -> PartitionedConfig:\n check.callable_param(fn, "fn")\n\n return PartitionedConfig(\n run_config_for_partition_fn=lambda partition: fn(\n partition.value[0], partition.value[1]\n ),\n partitions_def=HourlyPartitionsDefinition(\n start_date=start_date,\n minute_offset=minute_offset,\n timezone=timezone,\n fmt=fmt,\n end_offset=end_offset,\n ),\n decorated_fn=fn,\n tags_for_partition_fn=wrap_time_window_tags_fn(tags_for_partition_fn),\n )\n\n return inner
\n\n\nclass MonthlyPartitionsDefinition(TimeWindowPartitionsDefinition):\n def __new__(\n cls,\n start_date: Union[datetime, str],\n minute_offset: int = 0,\n hour_offset: int = 0,\n day_offset: int = 1,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n ):\n """A set of monthly partitions.\n\n The first partition in the set will start at the soonest first of the month after start_date\n at midnight. The last partition in the set will end before the current time, unless the\n end_offset argument is set to a positive number. If day_offset is provided, the start and\n end date of each partition will be day_offset. If minute_offset and/or hour_offset are used,\n the start and end times of each partition will be hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions will be\n midnight the sonnest first of the month following start_date. Can provide in either a\n datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n day_offset (int): Day of the month to "split" the partition. Defaults to 1.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n\n .. code-block:: python\n\n MonthlyPartitionsDefinition(start_date="2022-03-12")\n # creates partitions (2022-04-01-00:00, 2022-05-01-00:00), (2022-05-01-00:00, 2022-06-01-00:00), ...\n\n MonthlyPartitionsDefinition(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=5)\n # creates partitions (2022-04-05-03:15, 2022-05-05-03:15), (2022-05-05-03:15, 2022-06-05-03:15), ...\n """\n _fmt = fmt or DEFAULT_DATE_FORMAT\n\n return super(MonthlyPartitionsDefinition, cls).__new__(\n cls,\n schedule_type=ScheduleType.MONTHLY,\n start=start_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n day_offset=day_offset,\n timezone=timezone,\n fmt=_fmt,\n end_offset=end_offset,\n )\n\n\n
[docs]def monthly_partitioned_config(\n start_date: Union[datetime, str],\n minute_offset: int = 0,\n hour_offset: int = 0,\n day_offset: int = 1,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n tags_for_partition_fn: Optional[Callable[[datetime, datetime], Dict[str, str]]] = None,\n) -> Callable[[Callable[[datetime, datetime], Dict[str, Any]]], PartitionedConfig]:\n """Defines run config over a set of monthly partitions.\n\n The decorated function should accept a start datetime and end datetime, which represent the date\n partition the config should delineate.\n\n The decorated function should return a run config dictionary.\n\n The resulting object created by this decorator can be provided to the config argument of a Job.\n The first partition in the set will start at midnight on the soonest first of the month after\n start_date. The last partition in the set will end before the current time, unless the\n end_offset argument is set to a positive number. If day_offset is provided, the start and end\n date of each partition will be day_offset. If minute_offset and/or hour_offset are used, the\n start and end times of each partition will be hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions will be\n midnight the sonnest first of the month following start_date. Can provide in either a\n datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n day_offset (int): Day of the month to "split" the partition. Defaults to 1.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n\n .. code-block:: python\n\n @monthly_partitioned_config(start_date="2022-03-12")\n # creates partitions (2022-04-01-00:00, 2022-05-01-00:00), (2022-05-01-00:00, 2022-06-01-00:00), ...\n\n @monthly_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=5)\n # creates partitions (2022-04-05-03:15, 2022-05-05-03:15), (2022-05-05-03:15, 2022-06-05-03:15), ...\n """\n\n def inner(fn: Callable[[datetime, datetime], Dict[str, Any]]) -> PartitionedConfig:\n check.callable_param(fn, "fn")\n\n return PartitionedConfig(\n run_config_for_partition_fn=lambda partition: fn(\n partition.value[0], partition.value[1]\n ),\n partitions_def=MonthlyPartitionsDefinition(\n start_date=start_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n day_offset=day_offset,\n timezone=timezone,\n fmt=fmt,\n end_offset=end_offset,\n ),\n decorated_fn=fn,\n tags_for_partition_fn=wrap_time_window_tags_fn(tags_for_partition_fn),\n )\n\n return inner
\n\n\nclass WeeklyPartitionsDefinition(TimeWindowPartitionsDefinition):\n def __new__(\n cls,\n start_date: Union[datetime, str],\n minute_offset: int = 0,\n hour_offset: int = 0,\n day_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n ):\n """Defines a set of weekly partitions.\n\n The first partition in the set will start at the start_date. The last partition in the set will\n end before the current time, unless the end_offset argument is set to a positive number. If\n day_offset is provided, the start and end date of each partition will be day of the week\n corresponding to day_offset (0 indexed with Sunday as the start of the week). If\n minute_offset and/or hour_offset are used, the start and end times of each partition will be\n hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions will\n Sunday at midnight following start_date. Can provide in either a datetime or string\n format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n day_offset (int): Day of the week to "split" the partition. Defaults to 0 (Sunday).\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n\n .. code-block:: python\n\n WeeklyPartitionsDefinition(start_date="2022-03-12")\n # creates partitions (2022-03-13-00:00, 2022-03-20-00:00), (2022-03-20-00:00, 2022-03-27-00:00), ...\n\n WeeklyPartitionsDefinition(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=6)\n # creates partitions (2022-03-12-03:15, 2022-03-19-03:15), (2022-03-19-03:15, 2022-03-26-03:15), ...\n """\n _fmt = fmt or DEFAULT_DATE_FORMAT\n\n return super(WeeklyPartitionsDefinition, cls).__new__(\n cls,\n schedule_type=ScheduleType.WEEKLY,\n start=start_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n day_offset=day_offset,\n timezone=timezone,\n fmt=_fmt,\n end_offset=end_offset,\n )\n\n\n
[docs]def weekly_partitioned_config(\n start_date: Union[datetime, str],\n minute_offset: int = 0,\n hour_offset: int = 0,\n day_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n tags_for_partition_fn: Optional[Callable[[datetime, datetime], Dict[str, str]]] = None,\n) -> Callable[[Callable[[datetime, datetime], Dict[str, Any]]], PartitionedConfig]:\n """Defines run config over a set of weekly partitions.\n\n The decorated function should accept a start datetime and end datetime, which represent the date\n partition the config should delineate.\n\n The decorated function should return a run config dictionary.\n\n The resulting object created by this decorator can be provided to the config argument of a Job.\n The first partition in the set will start at the start_date. The last partition in the set will\n end before the current time, unless the end_offset argument is set to a positive number. If\n day_offset is provided, the start and end date of each partition will be day of the week\n corresponding to day_offset (0 indexed with Sunday as the start of the week). If\n minute_offset and/or hour_offset are used, the start and end times of each partition will be\n hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions will\n Sunday at midnight following start_date. Can provide in either a datetime or string\n format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n day_offset (int): Day of the week to "split" the partition. Defaults to 0 (Sunday).\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n\n .. code-block:: python\n\n @weekly_partitioned_config(start_date="2022-03-12")\n # creates partitions (2022-03-13-00:00, 2022-03-20-00:00), (2022-03-20-00:00, 2022-03-27-00:00), ...\n\n @weekly_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=6)\n # creates partitions (2022-03-12-03:15, 2022-03-19-03:15), (2022-03-19-03:15, 2022-03-26-03:15), ...\n """\n\n def inner(fn: Callable[[datetime, datetime], Dict[str, Any]]) -> PartitionedConfig:\n check.callable_param(fn, "fn")\n\n return PartitionedConfig(\n run_config_for_partition_fn=lambda partition: fn(\n partition.value[0], partition.value[1]\n ),\n partitions_def=WeeklyPartitionsDefinition(\n start_date=start_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n day_offset=day_offset,\n timezone=timezone,\n fmt=fmt,\n end_offset=end_offset,\n ),\n decorated_fn=fn,\n tags_for_partition_fn=wrap_time_window_tags_fn(tags_for_partition_fn),\n )\n\n return inner
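# Illustrative sketch: a weekly partitioned config that also attaches
# per-partition tags through the tags_for_partition_fn argument accepted by
# this decorator. The tag key and op name are hypothetical.

@weekly_partitioned_config(
    start_date="2022-03-12",
    tags_for_partition_fn=lambda start, end: {"week_of": start.strftime("%Y-%m-%d")},
)
def my_weekly_config(start, end):
    return {
        "ops": {
            "process_week": {
                "config": {"start": start.isoformat(), "end": end.isoformat()}
            }
        }
    }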
\n
", "current_page_name": "_modules/dagster/core/definitions/time_window_partitions", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.time_window_partitions"}, "utils": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.utils

\nimport keyword\nimport os\nimport re\nfrom glob import glob\nfrom typing import Any, Dict, List, Optional, Tuple\n\nimport pkg_resources\nimport yaml\n\nfrom dagster import check, seven\nfrom dagster.core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\nfrom dagster.utils import frozentags\nfrom dagster.utils.yaml_utils import merge_yaml_strings, merge_yamls\n\nDEFAULT_OUTPUT = "result"\n\nDISALLOWED_NAMES = set(\n    [\n        "context",\n        "conf",\n        "config",\n        "meta",\n        "arg_dict",\n        "dict",\n        "input_arg_dict",\n        "output_arg_dict",\n        "int",\n        "str",\n        "float",\n        "bool",\n        "input",\n        "output",\n        "type",\n    ]\n    + list(keyword.kwlist)  # just disallow all python keywords\n)\n\nVALID_NAME_REGEX_STR = r"^[A-Za-z0-9_]+$"\nVALID_NAME_REGEX = re.compile(VALID_NAME_REGEX_STR)\n\n\nclass NoValueSentinel:\n    """Sentinel value to distinguish unset from None"""\n\n\ndef has_valid_name_chars(name):\n    return bool(VALID_NAME_REGEX.match(name))\n\n\ndef check_valid_name(name: str):\n    check.str_param(name, "name")\n    if name in DISALLOWED_NAMES:\n        raise DagsterInvalidDefinitionError(\n            f'"{name}" is not a valid name in Dagster. It conflicts with a Dagster or python reserved keyword.'\n        )\n\n    if not has_valid_name_chars(name):\n        raise DagsterInvalidDefinitionError(\n            f'"{name}" is not a valid name in Dagster. Names must be in regex {VALID_NAME_REGEX_STR}.'\n        )\n\n    check.invariant(is_valid_name(name))\n    return name\n\n\ndef is_valid_name(name):\n    check.str_param(name, "name")\n\n    return name not in DISALLOWED_NAMES and has_valid_name_chars(name)\n\n\ndef _kv_str(key, value):\n    return '{key}="{value}"'.format(key=key, value=repr(value))\n\n\ndef struct_to_string(name, **kwargs):\n    # Sort the kwargs to ensure consistent representations across Python versions\n    props_str = ", ".join([_kv_str(key, value) for key, value in sorted(kwargs.items())])\n    return "{name}({props_str})".format(name=name, props_str=props_str)\n\n\ndef validate_tags(tags: Optional[Dict[str, Any]]) -> Dict[str, Any]:\n    valid_tags = {}\n    for key, value in check.opt_dict_param(tags, "tags", key_type=str).items():\n        if not isinstance(value, str):\n            valid = False\n            err_reason = 'Could not JSON encode value "{}"'.format(value)\n            try:\n                str_val = seven.json.dumps(value)\n                err_reason = 'JSON encoding "{json}" of value "{val}" is not equivalent to original value'.format(\n                    json=str_val, val=value\n                )\n\n                valid = seven.json.loads(str_val) == value\n            except Exception:\n                pass\n\n            if not valid:\n                raise DagsterInvalidDefinitionError(\n                    'Invalid value for tag "{key}", {err_reason}. Tag values must be strings '\n                    "or meet the constraint that json.loads(json.dumps(value)) == value.".format(\n                        key=key, err_reason=err_reason\n                    )\n                )\n\n            valid_tags[key] = str_val\n        else:\n            valid_tags[key] = value\n\n    return frozentags(valid_tags)\n\n\n
[docs]def config_from_files(config_files: List[str]) -> Dict[str, Any]:\n """Constructs run config from YAML files.\n\n Args:\n config_files (List[str]): List of paths or glob patterns for yaml files\n to load and parse as the run config.\n\n Returns:\n Dict[str, Any]: A run config dictionary constructed from provided YAML files.\n\n Raises:\n FileNotFoundError: When a config file produces no results\n DagsterInvariantViolationError: When one of the YAML files is invalid and has a parse\n error.\n """\n config_files = check.opt_list_param(config_files, "config_files")\n\n filenames = []\n for file_glob in config_files or []:\n globbed_files = glob(file_glob)\n if not globbed_files:\n raise DagsterInvariantViolationError(\n 'File or glob pattern "{file_glob}" for "config_files" '\n "produced no results.".format(file_glob=file_glob)\n )\n\n filenames += [os.path.realpath(globbed_file) for globbed_file in globbed_files]\n\n try:\n run_config = merge_yamls(filenames)\n except yaml.YAMLError as err:\n raise DagsterInvariantViolationError(\n f"Encountered error attempting to parse yaml. Parsing files {filenames} "\n f"loaded by file/patterns {config_files}."\n ) from err\n\n return run_config
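# Illustrative sketch: merging YAML files and glob patterns into a single run
# config dict. The file paths (and the job the result is later passed to) are
# hypothetical and assumed to exist.

run_config = config_from_files(
    ["run_config/resources.yaml", "run_config/ops/*.yaml"]
)
# The merged dict can then be passed wherever run config is accepted, e.g.
# my_job.execute_in_process(run_config=run_config)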
\n\n\n
[docs]def config_from_yaml_strings(yaml_strings: List[str]) -> Dict[str, Any]:\n """Static constructor for run configs from YAML strings.\n\n Args:\n yaml_strings (List[str]): List of yaml strings to parse as the run config.\n\n Returns:\n Dict[Str, Any]: A run config dictionary constructed from the provided yaml strings\n\n Raises:\n DagsterInvariantViolationError: When one of the YAML documents is invalid and has a\n parse error.\n """\n yaml_strings = check.list_param(yaml_strings, "yaml_strings", of_type=str)\n\n try:\n run_config = merge_yaml_strings(yaml_strings)\n except yaml.YAMLError as err:\n raise DagsterInvariantViolationError(\n f"Encountered error attempting to parse yaml. Parsing YAMLs {yaml_strings} "\n ) from err\n\n return run_config
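# Illustrative sketch: merging YAML documents supplied as strings into one run
# config dict. The op and resource config shown here is hypothetical.

run_config = config_from_yaml_strings(
    [
        """
ops:
  greet:
    config:
      name: world
""",
        """
resources:
  io_manager:
    config:
      base_dir: /tmp/dagster
""",
    ]
)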
\n\n\n
[docs]def config_from_pkg_resources(pkg_resource_defs: List[Tuple[str, str]]) -> Dict[str, Any]:\n """Load a run config from a package resource, using :py:func:`pkg_resources.resource_string`.\n\n Example:\n\n .. code-block:: python\n\n config_from_pkg_resources(\n pkg_resource_defs=[\n ('dagster_examples.airline_demo.environments', 'local_base.yaml'),\n ('dagster_examples.airline_demo.environments', 'local_warehouse.yaml'),\n ],\n )\n\n\n Args:\n pkg_resource_defs (List[(str, str)]): List of pkg_resource modules/files to\n load as the run config.\n\n Returns:\n Dict[Str, Any]: A run config dictionary constructed from the provided yaml strings\n\n Raises:\n DagsterInvariantViolationError: When one of the YAML documents is invalid and has a\n parse error.\n """\n pkg_resource_defs = check.list_param(pkg_resource_defs, "pkg_resource_defs", of_type=tuple)\n\n try:\n yaml_strings = [\n pkg_resources.resource_string(*pkg_resource_def).decode("utf-8")\n for pkg_resource_def in pkg_resource_defs\n ]\n except (ModuleNotFoundError, FileNotFoundError, UnicodeDecodeError) as err:\n raise DagsterInvariantViolationError(\n "Encountered error attempting to parse yaml. Loading YAMLs from "\n f"package resources {pkg_resource_defs}."\n ) from err\n\n return config_from_yaml_strings(yaml_strings=yaml_strings)
\n
", "current_page_name": "_modules/dagster/core/definitions/utils", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.utils"}, "version_strategy": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.version_strategy

\nimport hashlib\nimport inspect\nfrom typing import TYPE_CHECKING, Any, NamedTuple, Optional\n\nif TYPE_CHECKING:\n    from .op_definition import OpDefinition\n    from .resource_definition import ResourceDefinition\n    from .solid_definition import SolidDefinition\n\n\nclass OpVersionContext(NamedTuple):\n    """Provides execution-time information for computing the version for an op.\n    Attributes:\n        op_def (OpDefinition): The definition of the op to compute a version for.\n        op_config (Any): The parsed config to be passed to the op during execution.\n    """\n\n    op_def: "OpDefinition"\n    op_config: Any\n\n    @property\n    def solid_def(self) -> "SolidDefinition":\n        return self.op_def\n\n    @property\n    def solid_config(self) -> Any:\n        return self.op_config\n\n\nSolidVersionContext = OpVersionContext\n\n\nclass ResourceVersionContext(NamedTuple):\n    """Version-specific resource context.\n\n    Attributes:\n        resource_def (ResourceDefinition): The definition of the resource whose version will be computed.\n        resource_config (Any): The parsed config to be passed to the resource during execution.\n    """\n\n    resource_def: "ResourceDefinition"\n    resource_config: Any\n\n\n
[docs]class VersionStrategy:\n """Abstract class for defining a strategy to version solids and resources.\n\n When subclassing, `get_solid_version` must be implemented, and `get_resource_version` can be\n optionally implemented.\n\n `get_solid_version` should ingest a SolidVersionContext, and `get_resource_version` should ingest a\n ResourceVersionContext. From that, each synthesizes a unique string called a `version`, which will\n be tagged to outputs of that solid in the pipeline. Providing a `VersionStrategy` instance to a\n job will enable memoization on that job, such that only steps whose outputs do not have an\n up-to-date version will run.\n """\n\n def get_solid_version(self, context: SolidVersionContext) -> str:\n pass\n\n def get_op_version(self, context: OpVersionContext) -> str:\n return self.get_solid_version(context)\n\n def get_resource_version(\n self, context: ResourceVersionContext # pylint: disable=unused-argument\n ) -> Optional[str]:\n return None
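# Illustrative sketch: a minimal VersionStrategy subclass that versions each
# solid/op by the hash of its source code plus its parsed config. The class
# name is hypothetical; attaching an instance to a job (e.g. via the job's
# version_strategy argument) is assumed to enable memoization as described
# in the docstring above.

import hashlib
import inspect
import json


class SourceAndConfigVersionStrategy(VersionStrategy):
    def get_solid_version(self, context: SolidVersionContext) -> str:
        compute_fn = context.solid_def.compute_fn
        # compute_fn may be a plain callable or a DecoratedSolidFunction wrapper.
        fn = compute_fn if callable(compute_fn) else compute_fn.decorated_fn
        payload = inspect.getsource(fn) + json.dumps(
            context.solid_config, sort_keys=True, default=str
        )
        return hashlib.sha1(payload.encode("utf-8")).hexdigest()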
\n\n\n
[docs]class SourceHashVersionStrategy(VersionStrategy):\n def _get_source_hash(self, fn):\n code_as_str = inspect.getsource(fn)\n return hashlib.sha1(code_as_str.encode("utf-8")).hexdigest()\n\n def get_op_version(self, context: OpVersionContext) -> str:\n compute_fn = context.op_def.compute_fn\n if callable(compute_fn):\n return self._get_source_hash(compute_fn)\n else:\n return self._get_source_hash(compute_fn.decorated_fn)\n\n def get_resource_version(self, context: ResourceVersionContext) -> Optional[str]:\n return self._get_source_hash(context.resource_def.resource_fn)
\n
", "current_page_name": "_modules/dagster/core/definitions/version_strategy", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.version_strategy"}}, "errors": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.errors

\n"""Core Dagster error classes.\n\nAll errors thrown by the Dagster framework inherit from :py:class:`~dagster.DagsterError`. Users\nshould not subclass this base class for their own exceptions.\n\nThere is another exception base class, :py:class:`~dagster.DagsterUserCodeExecutionError`, which is\nused by the framework in concert with the :py:func:`~dagster.core.errors.user_code_error_boundary`.\n\nDagster uses this construct to wrap user code into which it calls. User code can perform arbitrary\ncomputations and may itself throw exceptions. The error boundary catches these user code-generated\nexceptions, and then reraises them wrapped in a subclass of\n:py:class:`~dagster.DagsterUserCodeExecutionError`.\n\nThe wrapped exceptions include additional context for the original exceptions, injected by the\nDagster runtime.\n"""\n\nimport sys\nfrom contextlib import contextmanager\n\nfrom dagster import check\nfrom dagster.utils.interrupts import raise_interrupts_as\n\n\nclass DagsterExecutionInterruptedError(BaseException):\n    """\n    Pipeline execution was interrupted during the execution process.\n\n    Just like KeyboardInterrupt this inherits from BaseException\n    as to not be accidentally caught by code that catches Exception\n    and thus prevent the interpreter from exiting.\n    """\n\n\n
[docs]class DagsterError(Exception):\n """Base class for all errors thrown by the Dagster framework.\n\n Users should not subclass this base class for their own exceptions."""\n\n @property\n def is_user_code_error(self):\n """Returns true if this error is attributable to user code."""\n return False
\n\n\n
[docs]class DagsterInvalidDefinitionError(DagsterError):\n """Indicates that the rules for a definition have been violated by the user."""
\n\n\nclass DagsterInvalidSubsetError(DagsterError):\n """Indicates that a subset of a pipeline is invalid because either:\n - One or more ops in the specified subset do not exist on the job.\n - The subset produces an invalid job.\n """\n\n\nCONFIG_ERROR_VERBIAGE = """\nThis value can be a:\n - Field\n - Python primitive types that resolve to dagster config types\n - int, float, bool, str, list.\n - A dagster config type: Int, Float, Bool, Array, Optional, Selector, Shape, Permissive, Map\n - A bare python dictionary, which is wrapped in Field(Shape(...)). Any values\n in the dictionary get resolved by the same rules, recursively.\n - A python list with a single entry that can resolve to a type, e.g. [int]\n"""\n\n\n
[docs]class DagsterInvalidConfigDefinitionError(DagsterError):\n """Indicates that you have attempted to construct a config with an invalid value\n\n Acceptable values for config types are any of:\n 1. A Python primitive type that resolves to a Dagster config type\n (:py:class:`~python:int`, :py:class:`~python:float`, :py:class:`~python:bool`,\n :py:class:`~python:str`, or :py:class:`~python:list`).\n\n 2. A Dagster config type: :py:data:`~dagster.Int`, :py:data:`~dagster.Float`,\n :py:data:`~dagster.Bool`, :py:data:`~dagster.String`,\n :py:data:`~dagster.StringSource`, :py:data:`~dagster.Any`,\n :py:class:`~dagster.Array`, :py:data:`~dagster.Noneable`, :py:data:`~dagster.Enum`,\n :py:class:`~dagster.Selector`, :py:class:`~dagster.Shape`, or\n :py:class:`~dagster.Permissive`.\n\n 3. A bare python dictionary, which will be automatically wrapped in\n :py:class:`~dagster.Shape`. Values of the dictionary are resolved recursively\n according to the same rules.\n\n 4. A bare python list of length one which itself is config type.\n Becomes :py:class:`Array` with list element as an argument.\n\n 5. An instance of :py:class:`~dagster.Field`.\n """\n\n def __init__(self, original_root, current_value, stack, reason=None, **kwargs):\n self.original_root = original_root\n self.current_value = current_value\n self.stack = stack\n super(DagsterInvalidConfigDefinitionError, self).__init__(\n (\n "Error defining config. Original value passed: {original_root}. "\n "{stack_str}{current_value} "\n "cannot be resolved.{reason_str}" + CONFIG_ERROR_VERBIAGE\n ).format(\n original_root=repr(original_root),\n stack_str="Error at stack path :" + ":".join(stack) + ". " if stack else "",\n current_value=repr(current_value),\n reason_str=" Reason: {reason}.".format(reason=reason) if reason else "",\n ),\n **kwargs,\n )
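# --- Editor's illustrative sketch (not part of the dagster source above) ---
# A config schema value that matches none of the acceptable forms listed in the
# docstring above (here, an arbitrary user class) cannot be resolved, and the
# definition-time config resolution is expected to raise
# DagsterInvalidConfigDefinitionError. The op name and class are hypothetical.
from dagster import DagsterInvalidConfigDefinitionError, op


class NotAConfigType:
    """Placeholder class that is not a valid config type."""


try:

    @op(config_schema={"value": NotAConfigType})  # NotAConfigType cannot be resolved
    def misconfigured_op(context):
        return context.op_config["value"]

except DagsterInvalidConfigDefinitionError as err:
    # The error records where resolution failed and which value was rejected
    print(err.stack, err.current_value)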
\n\n\n
[docs]class DagsterInvariantViolationError(DagsterError):\n """Indicates the user has violated a well-defined invariant that can only be enforced\n at runtime."""
\n\n\n
[docs]class DagsterExecutionStepNotFoundError(DagsterError):\n """Thrown when the user specifies execution step keys that do not exist."""\n\n def __init__(self, *args, **kwargs):\n self.step_keys = check.list_param(kwargs.pop("step_keys"), "step_keys", str)\n super(DagsterExecutionStepNotFoundError, self).__init__(*args, **kwargs)
\n\n\nclass DagsterExecutionPlanSnapshotNotFoundError(DagsterError):\n """Thrown when an expected execution plan snapshot could not be found on a PipelineRun."""\n\n\n
[docs]class DagsterRunNotFoundError(DagsterError):\n """Thrown when a run cannot be found in run storage."""\n\n def __init__(self, *args, **kwargs):\n self.invalid_run_id = check.str_param(kwargs.pop("invalid_run_id"), "invalid_run_id")\n super(DagsterRunNotFoundError, self).__init__(*args, **kwargs)
\n\n\n
[docs]class DagsterStepOutputNotFoundError(DagsterError):\n """Indicates that previous step outputs required for an execution step to proceed are not\n available."""\n\n def __init__(self, *args, **kwargs):\n self.step_key = check.str_param(kwargs.pop("step_key"), "step_key")\n self.output_name = check.str_param(kwargs.pop("output_name"), "output_name")\n super(DagsterStepOutputNotFoundError, self).__init__(*args, **kwargs)
\n\n\n@contextmanager\ndef raise_execution_interrupts():\n with raise_interrupts_as(DagsterExecutionInterruptedError):\n yield\n\n\n
[docs]@contextmanager\ndef user_code_error_boundary(error_cls, msg_fn, log_manager=None, **kwargs):\n """\n Wraps the execution of user-space code in an error boundary. This places a uniform\n policy around any user code invoked by the framework. This ensures that all user\n errors are wrapped in an exception derived from DagsterUserCodeExecutionError,\n and that the original stack trace of the user error is preserved, so that it\n can be reported without confusing framework code in the stack trace, if a\n tool author wishes to do so.\n\n Examples:\n\n .. code-block:: python\n\n with user_code_error_boundary(\n # Pass a class that inherits from DagsterUserCodeExecutionError\n DagsterExecutionStepExecutionError,\n # Pass a function that produces a message\n lambda: "Error occurred during step execution"\n ):\n call_user_provided_function()\n\n """\n check.callable_param(msg_fn, "msg_fn")\n check.class_param(error_cls, "error_cls", superclass=DagsterUserCodeExecutionError)\n\n with raise_execution_interrupts():\n if log_manager:\n log_manager.begin_python_log_capture()\n try:\n yield\n except DagsterError as de:\n # The system has thrown an error that is part of the user-framework contract\n raise de\n except Exception as e: # pylint: disable=W0703\n # An exception has been thrown by user code and computation should cease\n # with the error reported further up the stack\n raise error_cls(\n msg_fn(), user_exception=e, original_exc_info=sys.exc_info(), **kwargs\n ) from e\n finally:\n if log_manager:\n log_manager.end_python_log_capture()
\n\n\n
[docs]class DagsterUserCodeExecutionError(DagsterError):\n """\n This is the base class for any exception that is meant to wrap an\n :py:class:`~python:Exception` thrown by user code. It wraps that existing user code.\n The ``original_exc_info`` argument to the constructor is meant to be a tuple of the type\n returned by :py:func:`sys.exc_info <python:sys.exc_info>` at the call site of the constructor.\n\n Users should not subclass this base class for their own exceptions and should instead throw\n freely from user code. User exceptions will be automatically wrapped and rethrown.\n """\n\n def __init__(self, *args, **kwargs):\n # original_exc_info should be gotten from a sys.exc_info() call at the\n # callsite inside of the exception handler. this will allow consuming\n # code to *re-raise* the user error in its original format\n # for cleaner error reporting that does not have framework code in it\n user_exception = check.inst_param(kwargs.pop("user_exception"), "user_exception", Exception)\n original_exc_info = check.tuple_param(kwargs.pop("original_exc_info"), "original_exc_info")\n\n check.invariant(original_exc_info[0] is not None)\n\n super(DagsterUserCodeExecutionError, self).__init__(args[0], *args[1:], **kwargs)\n\n self.user_exception = check.opt_inst_param(user_exception, "user_exception", Exception)\n self.original_exc_info = original_exc_info\n\n @property\n def is_user_code_error(self):\n return True
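# --- Editor's illustrative sketch (not part of the dagster source above) ---
# Driving user_code_error_boundary (defined above) directly, the way a tool
# author might: a user exception raised inside the boundary is re-raised as the
# supplied DagsterUserCodeExecutionError subclass, with `user_exception` and
# `original_exc_info` preserved. MyToolError is a hypothetical wrapper class.
from dagster.core.errors import DagsterUserCodeExecutionError, user_code_error_boundary


class MyToolError(DagsterUserCodeExecutionError):
    """Hypothetical wrapper error used by a tool author."""


try:
    with user_code_error_boundary(MyToolError, lambda: "user-provided function failed"):
        raise ValueError("bad user input")
except MyToolError as err:
    assert err.is_user_code_error
    assert isinstance(err.user_exception, ValueError)
    assert err.original_exc_info[0] is ValueError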
\n\n\n
[docs]class DagsterTypeCheckError(DagsterUserCodeExecutionError):\n """Indicates an error in the op type system at runtime. E.g. an op receives an\n unexpected input, or produces an output that does not match the type of the output definition.\n """
\n\n\nclass DagsterExecutionLoadInputError(DagsterUserCodeExecutionError):\n """Indicates an error occurred while loading an input for a step."""\n\n def __init__(self, *args, **kwargs):\n self.step_key = check.str_param(kwargs.pop("step_key"), "step_key")\n self.input_name = check.str_param(kwargs.pop("input_name"), "input_name")\n super(DagsterExecutionLoadInputError, self).__init__(*args, **kwargs)\n\n\nclass DagsterExecutionHandleOutputError(DagsterUserCodeExecutionError):\n """Indicates an error occurred while handling an output for a step."""\n\n def __init__(self, *args, **kwargs):\n self.step_key = check.str_param(kwargs.pop("step_key"), "step_key")\n self.output_name = check.str_param(kwargs.pop("output_name"), "output_name")\n super(DagsterExecutionHandleOutputError, self).__init__(*args, **kwargs)\n\n\n
[docs]class DagsterExecutionStepExecutionError(DagsterUserCodeExecutionError):\n """Indicates an error occurred while executing the body of an execution step."""\n\n def __init__(self, *args, **kwargs):\n self.step_key = check.str_param(kwargs.pop("step_key"), "step_key")\n self.op_name = check.str_param(kwargs.pop("op_name"), "op_name")\n self.op_def_name = check.str_param(kwargs.pop("op_def_name"), "op_def_name")\n super(DagsterExecutionStepExecutionError, self).__init__(*args, **kwargs)
\n\n\n
[docs]class DagsterResourceFunctionError(DagsterUserCodeExecutionError):\n """\n Indicates an error occurred while executing the body of the ``resource_fn`` in a\n :py:class:`~dagster.ResourceDefinition` during resource initialization.\n """
\n\n\n
[docs]class DagsterConfigMappingFunctionError(DagsterUserCodeExecutionError):\n """\n Indicates that an unexpected error occurred while executing the body of a config mapping\n function defined in a :py:class:`~dagster.JobDefinition` or :py:class:`~dagster.GraphDefinition` during\n config parsing.\n """
\n\n\nclass DagsterTypeLoadingError(DagsterUserCodeExecutionError):\n """\n Indicates that an unexpected error occurred while executing the body of a type load\n function defined in a :py:class:`~dagster.DagsterTypeLoader` during loading of a custom type.\n """\n\n\nclass DagsterTypeMaterializationError(DagsterUserCodeExecutionError):\n """\n Indicates that an unexpected error occurred while executing the body of an output\n materialization function defined in a :py:class:`~dagster.DagsterTypeMaterializer` during\n materialization of a custom type.\n """\n\n\n
[docs]class DagsterUnknownResourceError(DagsterError, AttributeError):\n # inherits from AttributeError as it is raised within a __getattr__ call... used to support\n # object hasattr method\n """Indicates that an unknown resource was accessed in the body of an execution step. This typically\n happens when a resource is accessed in the compute function of an op without first supplying the\n op with the correct `required_resource_keys` argument.\n """\n\n def __init__(self, resource_name, *args, **kwargs):\n self.resource_name = check.str_param(resource_name, "resource_name")\n msg = (\n "Unknown resource `{resource_name}`. Specify `{resource_name}` as a required resource "\n "on the compute / config function that accessed it."\n ).format(resource_name=resource_name)\n super(DagsterUnknownResourceError, self).__init__(msg, *args, **kwargs)
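# --- Editor's illustrative sketch (not part of the dagster source above) ---
# Triggering DagsterUnknownResourceError as described in the docstring above:
# an op body reads context.resources.foo without declaring
# required_resource_keys={"foo"}. The op name is hypothetical, and
# build_op_context / direct op invocation are assumed from the public testing
# API; per the docstring, the scoped resources object raises this error from
# its __getattr__ when an undeclared resource is accessed.
from dagster import DagsterUnknownResourceError, build_op_context, op


@op  # note: no required_resource_keys={"foo"}
def needs_foo(context):
    return context.resources.foo


try:
    needs_foo(build_op_context())
except DagsterUnknownResourceError as err:
    assert err.resource_name == "foo"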
\n\n\nclass DagsterInvalidInvocationError(DagsterError):\n """\n Indicates that an error has occurred when an op has been invoked, but before the actual\n core compute has been reached.\n """\n\n\n
[docs]class DagsterInvalidConfigError(DagsterError):\n """Thrown when provided config is invalid (does not type check against the relevant config\n schema)."""\n\n def __init__(self, preamble, errors, config_value, *args, **kwargs):\n from dagster.config.errors import EvaluationError\n\n check.str_param(preamble, "preamble")\n self.errors = check.list_param(errors, "errors", of_type=EvaluationError)\n self.config_value = config_value\n\n error_msg = preamble\n error_messages = []\n\n for i_error, error in enumerate(self.errors):\n error_messages.append(error.message)\n error_msg += "\\n Error {i_error}: {error_message}".format(\n i_error=i_error + 1, error_message=error.message\n )\n\n self.message = error_msg\n self.error_messages = error_messages\n\n super(DagsterInvalidConfigError, self).__init__(error_msg, *args, **kwargs)
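# --- Editor's illustrative sketch (not part of the dagster source above) ---
# Run config that fails type-checking against an op's config_schema. The op and
# job names are hypothetical; execute_in_process is assumed to raise
# DagsterInvalidConfigError when the supplied run_config does not validate.
from dagster import DagsterInvalidConfigError, job, op


@op(config_schema={"threshold": int})
def thresholded(context):
    return context.op_config["threshold"]


@job
def config_job():
    thresholded()


try:
    # "threshold" is given a str where the schema requires an int
    config_job.execute_in_process(
        run_config={"ops": {"thresholded": {"config": {"threshold": "not-an-int"}}}}
    )
except DagsterInvalidConfigError as err:
    # `errors` / `error_messages` collect each EvaluationError (see __init__ above)
    print(err.error_messages)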
\n\n\n
[docs]class DagsterUnmetExecutorRequirementsError(DagsterError):\n """Indicates the resolved executor is incompatible with the state of other systems\n such as the :py:class:`~dagster.core.instance.DagsterInstance` or system storage configuration.\n """
\n\n\n
[docs]class DagsterSubprocessError(DagsterError):\n """An exception has occurred in one or more of the child processes dagster manages.\n This error forwards the message and stack trace for all of the collected errors.\n """\n\n def __init__(self, *args, **kwargs):\n from dagster.utils.error import SerializableErrorInfo\n\n self.subprocess_error_infos = check.list_param(\n kwargs.pop("subprocess_error_infos"), "subprocess_error_infos", SerializableErrorInfo\n )\n super(DagsterSubprocessError, self).__init__(*args, **kwargs)
\n\n\nclass DagsterUserCodeUnreachableError(DagsterError):\n """Dagster was unable to reach a user code server to fetch information about user code."""\n\n\nclass DagsterUserCodeProcessError(DagsterError):\n """An exception has occurred in a user code process that the host process raising this error\n was communicating with."""\n\n @staticmethod\n def from_error_info(error_info):\n from dagster.utils.error import SerializableErrorInfo\n\n check.inst_param(error_info, "error_info", SerializableErrorInfo)\n return DagsterUserCodeProcessError(\n error_info.to_string(), user_code_process_error_infos=[error_info]\n )\n\n def __init__(self, *args, **kwargs):\n from dagster.utils.error import SerializableErrorInfo\n\n self.user_code_process_error_infos = check.list_param(\n kwargs.pop("user_code_process_error_infos"),\n "user_code_process_error_infos",\n SerializableErrorInfo,\n )\n super(DagsterUserCodeProcessError, self).__init__(*args, **kwargs)\n\n\nclass DagsterRepositoryLocationLoadError(DagsterError):\n def __init__(self, *args, **kwargs):\n from dagster.utils.error import SerializableErrorInfo\n\n self.load_error_infos = check.list_param(\n kwargs.pop("load_error_infos"),\n "load_error_infos",\n SerializableErrorInfo,\n )\n super(DagsterRepositoryLocationLoadError, self).__init__(*args, **kwargs)\n\n\nclass DagsterLaunchFailedError(DagsterError):\n """Indicates an error while attempting to launch a pipeline run."""\n\n def __init__(self, *args, **kwargs):\n from dagster.utils.error import SerializableErrorInfo\n\n self.serializable_error_info = check.opt_inst_param(\n kwargs.pop("serializable_error_info", None),\n "serializable_error_info",\n SerializableErrorInfo,\n )\n super(DagsterLaunchFailedError, self).__init__(*args, **kwargs)\n\n\nclass DagsterBackfillFailedError(DagsterError):\n """Indicates an error while attempting to launch a backfill."""\n\n def __init__(self, *args, **kwargs):\n from dagster.utils.error import SerializableErrorInfo\n\n self.serializable_error_info = check.opt_inst_param(\n kwargs.pop("serializable_error_info", None),\n "serializable_error_info",\n SerializableErrorInfo,\n )\n super(DagsterBackfillFailedError, self).__init__(*args, **kwargs)\n\n\nclass DagsterInstanceSchemaOutdated(DagsterError):\n """Indicates that the dagster instance must be migrated."""\n\n def __init__(self, db_revision=None, head_revision=None):\n super(DagsterInstanceSchemaOutdated, self).__init__(\n "Raised an exception that may indicate that the Dagster database needs to be be migrated."\n "{revision_clause} To migrate, run `dagster instance migrate`.".format(\n revision_clause=(\n " Database is at revision {db_revision}, head is "\n "{head_revision}.".format(db_revision=db_revision, head_revision=head_revision)\n if db_revision or head_revision\n else ""\n ),\n )\n )\n\n\nclass DagsterRunAlreadyExists(DagsterError):\n """Indicates that a pipeline run already exists in a run storage."""\n\n\nclass DagsterSnapshotDoesNotExist(DagsterError):\n """Indicates you attempted to create a pipeline run with a nonexistent snapshot id"""\n\n\nclass DagsterRunConflict(DagsterError):\n """Indicates that a conflicting pipeline run exists in a run storage."""\n\n\n
[docs]class DagsterTypeCheckDidNotPass(DagsterError):\n """Indicates that a type check failed.\n\n This is raised when ``raise_on_error`` is ``True`` in calls to the synchronous job and\n graph execution APIs (e.g. `graph.execute_in_process()`, `job.execute_in_process()` -- typically\n within a test), and a :py:class:`~dagster.DagsterType`'s type check fails by returning either\n ``False`` or an instance of :py:class:`~dagster.TypeCheck` whose ``success`` member is ``False``.\n """\n\n def __init__(self, description=None, metadata_entries=None, dagster_type=None):\n from dagster import DagsterType, MetadataEntry\n\n super(DagsterTypeCheckDidNotPass, self).__init__(description)\n self.description = check.opt_str_param(description, "description")\n self.metadata_entries = check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=MetadataEntry\n )\n self.dagster_type = check.opt_inst_param(dagster_type, "dagster_type", DagsterType)
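# --- Editor's illustrative sketch (not part of the dagster source above) ---
# A DagsterType whose type_check_fn returns False, executed with the in-process
# API (raise_on_error defaults to True), is expected to surface
# DagsterTypeCheckDidNotPass as the docstring above describes. Names are
# hypothetical.
from dagster import DagsterType, DagsterTypeCheckDidNotPass, Out, job, op

AlwaysFails = DagsterType(
    name="AlwaysFails",
    type_check_fn=lambda _context, _value: False,  # every value fails the check
)


@op(out=Out(AlwaysFails))
def emit_anything():
    return 1


@job
def type_check_job():
    emit_anything()


try:
    type_check_job.execute_in_process()
except DagsterTypeCheckDidNotPass as err:
    print(err.description, err.dagster_type)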
\n\n\n
[docs]class DagsterEventLogInvalidForRun(DagsterError):\n """Raised when the event logs for a historical run are malformed or invalid."""\n\n def __init__(self, run_id):\n self.run_id = check.str_param(run_id, "run_id")\n super(DagsterEventLogInvalidForRun, self).__init__(\n "Event logs invalid for run id {}".format(run_id)\n )
\n\n\nclass ScheduleExecutionError(DagsterUserCodeExecutionError):\n """Errors raised in a user process during the execution of schedule."""\n\n\nclass SensorExecutionError(DagsterUserCodeExecutionError):\n """Errors raised in a user process during the execution of a sensor (or its job)."""\n\n\nclass PartitionExecutionError(DagsterUserCodeExecutionError):\n """Errors raised during the execution of user-provided functions of a partition set schedule."""\n\n\nclass DagsterInvalidAssetKey(DagsterError):\n """Error raised by invalid asset key"""\n\n\nclass DagsterInvalidMetadata(DagsterError):\n """Error raised by invalid metadata parameters"""\n\n\nclass HookExecutionError(DagsterUserCodeExecutionError):\n """Error raised during the execution of a user-defined hook."""\n\n\nclass RunStatusSensorExecutionError(DagsterUserCodeExecutionError):\n """Error raised during the execution of a user-defined run status sensor."""\n\n\nclass DagsterImportError(DagsterError):\n """Import error raised while importing user-code."""\n\n\nclass JobError(DagsterUserCodeExecutionError):\n """Errors raised during the execution of user-provided functions for a defined Job."""\n\n\nclass DagsterUnknownStepStateError(DagsterError):\n """When job execution completes with steps in an unknown state"""\n\n\nclass DagsterObjectStoreError(DagsterError):\n """Errors during an object store operation."""\n\n\nclass DagsterInvalidPropertyError(DagsterError):\n """Indicates that an invalid property was accessed. May often happen by accessing a property\n that no longer exists after breaking changes."""\n\n\nclass DagsterHomeNotSetError(DagsterError):\n """\n The user has tried to use a command that requires an instance or invoke DagsterInstance.get()\n without setting DAGSTER_HOME env var.\n """\n\n\nclass DagsterUnknownPartitionError(DagsterError):\n """\n The user has tried to access run config for a partition name that does not exist.\n """\n
", "current_page_name": "_modules/dagster/core/errors", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.errors"}, "events": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.events

\n"""Structured representations of system events."""\nimport logging\nimport os\nfrom enum import Enum\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Dict, List, NamedTuple, Optional, Union, cast\n\nfrom dagster import check\nfrom dagster.core.definitions import (\n    AssetKey,\n    AssetMaterialization,\n    AssetObservation,\n    ExpectationResult,\n    HookDefinition,\n    Materialization,\n    MetadataEntry,\n    NodeHandle,\n)\nfrom dagster.core.definitions.events import AssetLineageInfo, ObjectStoreOperationType\nfrom dagster.core.definitions.metadata import MetadataValue\nfrom dagster.core.errors import DagsterError, HookExecutionError\nfrom dagster.core.execution.context.hook import HookContext\nfrom dagster.core.execution.context.system import (\n    IPlanContext,\n    IStepContext,\n    PlanExecutionContext,\n    PlanOrchestrationContext,\n    StepExecutionContext,\n)\nfrom dagster.core.execution.plan.handle import ResolvedFromDynamicStepHandle, StepHandle\nfrom dagster.core.execution.plan.outputs import StepOutputData\nfrom dagster.core.log_manager import DagsterLogManager\nfrom dagster.core.storage.pipeline_run import PipelineRunStatus\nfrom dagster.serdes import register_serdes_tuple_fallbacks, whitelist_for_serdes\nfrom dagster.utils.error import SerializableErrorInfo, serializable_error_info_from_exc_info\nfrom dagster.utils.timing import format_duration\n\nif TYPE_CHECKING:\n    from dagster.core.definitions.events import ObjectStoreOperation\n    from dagster.core.execution.plan.inputs import StepInputData\n    from dagster.core.execution.plan.objects import StepFailureData, StepRetryData, StepSuccessData\n    from dagster.core.execution.plan.plan import ExecutionPlan\n    from dagster.core.execution.plan.step import ExecutionStep, StepKind\n\n    EventSpecificData = Union[\n        StepOutputData,\n        StepFailureData,\n        StepSuccessData,\n        "StepMaterializationData",\n        "StepExpectationResultData",\n        StepInputData,\n        "EngineEventData",\n        "HookErroredData",\n        StepRetryData,\n        "PipelineFailureData",\n        "PipelineCanceledData",\n        "ObjectStoreOperationResultData",\n        "HandledOutputData",\n        "LoadedInputData",\n        "ComputeLogsCaptureData",\n        "AssetObservationData",\n        "AssetMaterializationPlannedData",\n    ]\n\n\n
[docs]class DagsterEventType(Enum):\n """The types of events that may be yielded by solid and pipeline execution."""\n\n STEP_OUTPUT = "STEP_OUTPUT"\n STEP_INPUT = "STEP_INPUT"\n STEP_FAILURE = "STEP_FAILURE"\n STEP_START = "STEP_START"\n STEP_SUCCESS = "STEP_SUCCESS"\n STEP_SKIPPED = "STEP_SKIPPED"\n\n STEP_UP_FOR_RETRY = "STEP_UP_FOR_RETRY" # "failed" but want to retry\n STEP_RESTARTED = "STEP_RESTARTED"\n\n ASSET_MATERIALIZATION = "ASSET_MATERIALIZATION"\n ASSET_MATERIALIZATION_PLANNED = "ASSET_MATERIALIZATION_PLANNED"\n ASSET_OBSERVATION = "ASSET_OBSERVATION"\n STEP_EXPECTATION_RESULT = "STEP_EXPECTATION_RESULT"\n\n # We want to display RUN_* events in dagit and in our LogManager output, but in order to\n # support backcompat for our storage layer, we need to keep the persisted value to be strings\n # of the form "PIPELINE_*". We may have user code that pass in the DagsterEventType\n # enum values into storage APIs (like get_event_records, which takes in an EventRecordsFilter).\n RUN_ENQUEUED = "PIPELINE_ENQUEUED"\n RUN_DEQUEUED = "PIPELINE_DEQUEUED"\n RUN_STARTING = "PIPELINE_STARTING" # Launch is happening, execution hasn't started yet\n RUN_START = "PIPELINE_START" # Execution has started\n RUN_SUCCESS = "PIPELINE_SUCCESS"\n RUN_FAILURE = "PIPELINE_FAILURE"\n RUN_CANCELING = "PIPELINE_CANCELING"\n RUN_CANCELED = "PIPELINE_CANCELED"\n\n # Keep these legacy enum values around, to keep back-compatability for user code that might be\n # using these constants to filter event records\n PIPELINE_ENQUEUED = RUN_ENQUEUED\n PIPELINE_DEQUEUED = RUN_DEQUEUED\n PIPELINE_STARTING = RUN_STARTING\n PIPELINE_START = RUN_START\n PIPELINE_SUCCESS = RUN_SUCCESS\n PIPELINE_FAILURE = RUN_FAILURE\n PIPELINE_CANCELING = RUN_CANCELING\n PIPELINE_CANCELED = RUN_CANCELED\n\n OBJECT_STORE_OPERATION = "OBJECT_STORE_OPERATION"\n ASSET_STORE_OPERATION = "ASSET_STORE_OPERATION"\n LOADED_INPUT = "LOADED_INPUT"\n HANDLED_OUTPUT = "HANDLED_OUTPUT"\n\n ENGINE_EVENT = "ENGINE_EVENT"\n\n HOOK_COMPLETED = "HOOK_COMPLETED"\n HOOK_ERRORED = "HOOK_ERRORED"\n HOOK_SKIPPED = "HOOK_SKIPPED"\n\n ALERT_START = "ALERT_START"\n ALERT_SUCCESS = "ALERT_SUCCESS"\n ALERT_FAILURE = "ALERT_FAILURE"\n\n LOGS_CAPTURED = "LOGS_CAPTURED"
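# --- Editor's illustrative sketch (not part of the dagster source above) ---
# Filtering the events of a run by DagsterEventType. The op/job names are
# hypothetical, and the example assumes the in-process execution result exposes
# an `all_events` list of DagsterEvent, as in recent releases.
from dagster import DagsterEventType, job, op


@op
def emit_one() -> int:
    return 1


@job
def tiny_job():
    emit_one()


result = tiny_job.execute_in_process()
step_successes = [
    event for event in result.all_events if event.event_type == DagsterEventType.STEP_SUCCESS
]
assert len(step_successes) == 1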
\n\n\nEVENT_TYPE_VALUE_TO_DISPLAY_STRING = {\n "PIPELINE_ENQUEUED": "RUN_ENQUEUED",\n "PIPELINE_DEQUEUED": "RUN_DEQUEUED",\n "PIPELINE_STARTING": "RUN_STARTING",\n "PIPELINE_START": "RUN_START",\n "PIPELINE_SUCCESS": "RUN_SUCCESS",\n "PIPELINE_FAILURE": "RUN_FAILURE",\n "PIPELINE_CANCELING": "RUN_CANCELING",\n "PIPELINE_CANCELED": "RUN_CANCELED",\n}\n\nSTEP_EVENTS = {\n DagsterEventType.STEP_INPUT,\n DagsterEventType.STEP_START,\n DagsterEventType.STEP_OUTPUT,\n DagsterEventType.STEP_FAILURE,\n DagsterEventType.STEP_SUCCESS,\n DagsterEventType.STEP_SKIPPED,\n DagsterEventType.ASSET_MATERIALIZATION,\n DagsterEventType.ASSET_OBSERVATION,\n DagsterEventType.STEP_EXPECTATION_RESULT,\n DagsterEventType.OBJECT_STORE_OPERATION,\n DagsterEventType.HANDLED_OUTPUT,\n DagsterEventType.LOADED_INPUT,\n DagsterEventType.STEP_RESTARTED,\n DagsterEventType.STEP_UP_FOR_RETRY,\n}\n\nFAILURE_EVENTS = {\n DagsterEventType.RUN_FAILURE,\n DagsterEventType.STEP_FAILURE,\n DagsterEventType.RUN_CANCELED,\n}\n\nPIPELINE_EVENTS = {\n DagsterEventType.RUN_ENQUEUED,\n DagsterEventType.RUN_DEQUEUED,\n DagsterEventType.RUN_STARTING,\n DagsterEventType.RUN_START,\n DagsterEventType.RUN_SUCCESS,\n DagsterEventType.RUN_FAILURE,\n DagsterEventType.RUN_CANCELING,\n DagsterEventType.RUN_CANCELED,\n}\n\nHOOK_EVENTS = {\n DagsterEventType.HOOK_COMPLETED,\n DagsterEventType.HOOK_ERRORED,\n DagsterEventType.HOOK_SKIPPED,\n}\n\nALERT_EVENTS = {\n DagsterEventType.ALERT_START,\n DagsterEventType.ALERT_SUCCESS,\n DagsterEventType.ALERT_FAILURE,\n}\n\n\nEVENT_TYPE_TO_PIPELINE_RUN_STATUS = {\n DagsterEventType.RUN_START: PipelineRunStatus.STARTED,\n DagsterEventType.RUN_SUCCESS: PipelineRunStatus.SUCCESS,\n DagsterEventType.RUN_FAILURE: PipelineRunStatus.FAILURE,\n DagsterEventType.RUN_ENQUEUED: PipelineRunStatus.QUEUED,\n DagsterEventType.RUN_STARTING: PipelineRunStatus.STARTING,\n DagsterEventType.RUN_CANCELING: PipelineRunStatus.CANCELING,\n DagsterEventType.RUN_CANCELED: PipelineRunStatus.CANCELED,\n}\n\nPIPELINE_RUN_STATUS_TO_EVENT_TYPE = {v: k for k, v in EVENT_TYPE_TO_PIPELINE_RUN_STATUS.items()}\n\nASSET_EVENTS = {\n DagsterEventType.ASSET_MATERIALIZATION,\n DagsterEventType.ASSET_OBSERVATION,\n DagsterEventType.ASSET_MATERIALIZATION_PLANNED,\n}\n\n\ndef _assert_type(\n method: str, expected_type: DagsterEventType, actual_type: DagsterEventType\n) -> None:\n check.invariant(\n expected_type == actual_type,\n (\n "{method} only callable when event_type is {expected_type}, called on {actual_type}"\n ).format(method=method, expected_type=expected_type, actual_type=actual_type),\n )\n\n\ndef _validate_event_specific_data(\n event_type: DagsterEventType, event_specific_data: Optional["EventSpecificData"]\n) -> Optional["EventSpecificData"]:\n from dagster.core.execution.plan.inputs import StepInputData\n from dagster.core.execution.plan.objects import StepFailureData, StepSuccessData\n\n if event_type == DagsterEventType.STEP_OUTPUT:\n check.inst_param(event_specific_data, "event_specific_data", StepOutputData)\n elif event_type == DagsterEventType.STEP_FAILURE:\n check.inst_param(event_specific_data, "event_specific_data", StepFailureData)\n elif event_type == DagsterEventType.STEP_SUCCESS:\n check.inst_param(event_specific_data, "event_specific_data", StepSuccessData)\n elif event_type == DagsterEventType.ASSET_MATERIALIZATION:\n check.inst_param(event_specific_data, "event_specific_data", StepMaterializationData)\n elif event_type == DagsterEventType.STEP_EXPECTATION_RESULT:\n check.inst_param(event_specific_data, 
"event_specific_data", StepExpectationResultData)\n elif event_type == DagsterEventType.STEP_INPUT:\n check.inst_param(event_specific_data, "event_specific_data", StepInputData)\n elif event_type == DagsterEventType.ENGINE_EVENT:\n check.inst_param(event_specific_data, "event_specific_data", EngineEventData)\n elif event_type == DagsterEventType.HOOK_ERRORED:\n check.inst_param(event_specific_data, "event_specific_data", HookErroredData)\n elif event_type == DagsterEventType.ASSET_MATERIALIZATION_PLANNED:\n check.inst_param(\n event_specific_data, "event_specific_data", AssetMaterializationPlannedData\n )\n\n return event_specific_data\n\n\ndef log_step_event(step_context: IStepContext, event: "DagsterEvent") -> None:\n event_type = DagsterEventType(event.event_type_value)\n log_level = logging.ERROR if event_type in FAILURE_EVENTS else logging.DEBUG\n\n step_context.log.log_dagster_event(\n level=log_level,\n msg=event.message or f"{event_type} for step {step_context.step.key}",\n dagster_event=event,\n )\n\n\ndef log_pipeline_event(pipeline_context: IPlanContext, event: "DagsterEvent") -> None:\n event_type = DagsterEventType(event.event_type_value)\n log_level = logging.ERROR if event_type in FAILURE_EVENTS else logging.DEBUG\n\n pipeline_context.log.log_dagster_event(\n level=log_level,\n msg=event.message or f"{event_type} for pipeline {pipeline_context.pipeline_name}",\n dagster_event=event,\n )\n\n\ndef log_resource_event(log_manager: DagsterLogManager, event: "DagsterEvent") -> None:\n event_specific_data = cast(EngineEventData, event.event_specific_data)\n\n log_level = logging.ERROR if event_specific_data.error else logging.DEBUG\n log_manager.log_dagster_event(level=log_level, msg=event.message or "", dagster_event=event)\n\n\n
[docs]@whitelist_for_serdes\nclass DagsterEvent(\n NamedTuple(\n "_DagsterEvent",\n [\n ("event_type_value", str),\n ("pipeline_name", str),\n ("step_handle", Optional[Union[StepHandle, ResolvedFromDynamicStepHandle]]),\n ("solid_handle", Optional[NodeHandle]),\n ("step_kind_value", Optional[str]),\n ("logging_tags", Optional[Dict[str, str]]),\n ("event_specific_data", Optional["EventSpecificData"]),\n ("message", Optional[str]),\n ("pid", Optional[int]),\n ("step_key", Optional[str]),\n ],\n )\n):\n """Events yielded by solid and pipeline execution.\n\n Users should not instantiate this class.\n\n Attributes:\n event_type_value (str): Value for a DagsterEventType.\n pipeline_name (str)\n solid_handle (NodeHandle)\n step_kind_value (str): Value for a StepKind.\n logging_tags (Dict[str, str])\n event_specific_data (Any): Type must correspond to event_type_value.\n message (str)\n pid (int)\n step_key (Optional[str]): DEPRECATED\n """\n\n @staticmethod\n def from_step(\n event_type: "DagsterEventType",\n step_context: IStepContext,\n event_specific_data: Optional["EventSpecificData"] = None,\n message: Optional[str] = None,\n ) -> "DagsterEvent":\n\n event = DagsterEvent(\n event_type_value=check.inst_param(event_type, "event_type", DagsterEventType).value,\n pipeline_name=step_context.pipeline_name,\n step_handle=step_context.step.handle,\n solid_handle=step_context.step.solid_handle,\n step_kind_value=step_context.step.kind.value,\n logging_tags=step_context.logging_tags,\n event_specific_data=_validate_event_specific_data(event_type, event_specific_data),\n message=check.opt_str_param(message, "message"),\n pid=os.getpid(),\n )\n\n log_step_event(step_context, event)\n\n return event\n\n @staticmethod\n def from_pipeline(\n event_type: DagsterEventType,\n pipeline_context: IPlanContext,\n message: Optional[str] = None,\n event_specific_data: Optional["EventSpecificData"] = None,\n step_handle: Optional[Union[StepHandle, ResolvedFromDynamicStepHandle]] = None,\n ) -> "DagsterEvent":\n check.opt_inst_param(\n step_handle, "step_handle", (StepHandle, ResolvedFromDynamicStepHandle)\n )\n\n event = DagsterEvent(\n event_type_value=check.inst_param(event_type, "event_type", DagsterEventType).value,\n pipeline_name=pipeline_context.pipeline_name,\n message=check.opt_str_param(message, "message"),\n event_specific_data=_validate_event_specific_data(event_type, event_specific_data),\n step_handle=step_handle,\n pid=os.getpid(),\n )\n\n log_pipeline_event(pipeline_context, event)\n\n return event\n\n @staticmethod\n def from_resource(\n pipeline_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n message: Optional[str] = None,\n event_specific_data: Optional["EngineEventData"] = None,\n ) -> "DagsterEvent":\n\n event = DagsterEvent(\n DagsterEventType.ENGINE_EVENT.value,\n pipeline_name=pipeline_name,\n message=check.opt_str_param(message, "message"),\n event_specific_data=_validate_event_specific_data(\n DagsterEventType.ENGINE_EVENT, event_specific_data\n ),\n step_handle=execution_plan.step_handle_for_single_step_plans(),\n pid=os.getpid(),\n )\n log_resource_event(log_manager, event)\n return event\n\n @staticmethod\n def asset_materialization_planned(\n pipeline_name: str,\n asset_key: AssetKey,\n log_manager: DagsterLogManager,\n ) -> "DagsterEvent":\n event = DagsterEvent(\n event_type_value=DagsterEventType.ASSET_MATERIALIZATION_PLANNED.value,\n pipeline_name=pipeline_name,\n message=f"{pipeline_name} intends to materialize asset {asset_key.to_string()}",\n 
event_specific_data=AssetMaterializationPlannedData(asset_key),\n )\n log_level = logging.DEBUG\n log_manager.log_dagster_event(level=log_level, msg=event.message or "", dagster_event=event)\n return event\n\n def __new__(\n cls,\n event_type_value: str,\n pipeline_name: str,\n step_handle: Optional[Union[StepHandle, ResolvedFromDynamicStepHandle]] = None,\n solid_handle: Optional[NodeHandle] = None,\n step_kind_value: Optional[str] = None,\n logging_tags: Optional[Dict[str, str]] = None,\n event_specific_data: Optional["EventSpecificData"] = None,\n message: Optional[str] = None,\n pid: Optional[int] = None,\n # legacy\n step_key: Optional[str] = None,\n ):\n event_type_value, event_specific_data = _handle_back_compat(\n event_type_value, event_specific_data\n )\n\n # old events may contain solid_handle but not step_handle\n if solid_handle is not None and step_handle is None:\n step_handle = StepHandle(solid_handle)\n\n # Legacy events may have step_key set directly, preserve those to stay in sync\n # with legacy execution plan snapshots.\n if step_handle is not None and step_key is None:\n step_key = step_handle.to_key()\n\n return super(DagsterEvent, cls).__new__(\n cls,\n check.str_param(event_type_value, "event_type_value"),\n check.str_param(pipeline_name, "pipeline_name"),\n check.opt_inst_param(\n step_handle, "step_handle", (StepHandle, ResolvedFromDynamicStepHandle)\n ),\n check.opt_inst_param(solid_handle, "solid_handle", NodeHandle),\n check.opt_str_param(step_kind_value, "step_kind_value"),\n check.opt_dict_param(logging_tags, "logging_tags"),\n _validate_event_specific_data(DagsterEventType(event_type_value), event_specific_data),\n check.opt_str_param(message, "message"),\n check.opt_int_param(pid, "pid"),\n check.opt_str_param(step_key, "step_key"),\n )\n\n @property\n def solid_name(self) -> str:\n check.invariant(self.solid_handle is not None)\n solid_handle = cast(NodeHandle, self.solid_handle)\n return solid_handle.name\n\n @property\n def event_type(self) -> DagsterEventType:\n """DagsterEventType: The type of this event."""\n return DagsterEventType(self.event_type_value)\n\n @property\n def is_step_event(self) -> bool:\n return self.event_type in STEP_EVENTS\n\n @property\n def is_hook_event(self) -> bool:\n return self.event_type in HOOK_EVENTS\n\n @property\n def is_alert_event(self) -> bool:\n return self.event_type in ALERT_EVENTS\n\n @property\n def step_kind(self) -> "StepKind":\n from dagster.core.execution.plan.step import StepKind\n\n return StepKind(self.step_kind_value)\n\n @property\n def is_step_success(self) -> bool:\n return self.event_type == DagsterEventType.STEP_SUCCESS\n\n @property\n def is_successful_output(self) -> bool:\n return self.event_type == DagsterEventType.STEP_OUTPUT\n\n @property\n def is_step_start(self) -> bool:\n return self.event_type == DagsterEventType.STEP_START\n\n @property\n def is_step_failure(self) -> bool:\n return self.event_type == DagsterEventType.STEP_FAILURE\n\n @property\n def is_step_skipped(self) -> bool:\n return self.event_type == DagsterEventType.STEP_SKIPPED\n\n @property\n def is_step_up_for_retry(self) -> bool:\n return self.event_type == DagsterEventType.STEP_UP_FOR_RETRY\n\n @property\n def is_step_restarted(self) -> bool:\n return self.event_type == DagsterEventType.STEP_RESTARTED\n\n @property\n def is_pipeline_success(self) -> bool:\n return self.event_type == DagsterEventType.RUN_SUCCESS\n\n @property\n def is_pipeline_failure(self) -> bool:\n return self.event_type == DagsterEventType.RUN_FAILURE\n\n 
@property\n def is_failure(self) -> bool:\n return self.event_type in FAILURE_EVENTS\n\n @property\n def is_pipeline_event(self) -> bool:\n return self.event_type in PIPELINE_EVENTS\n\n @property\n def is_engine_event(self) -> bool:\n return self.event_type == DagsterEventType.ENGINE_EVENT\n\n @property\n def is_handled_output(self) -> bool:\n return self.event_type == DagsterEventType.HANDLED_OUTPUT\n\n @property\n def is_loaded_input(self) -> bool:\n return self.event_type == DagsterEventType.LOADED_INPUT\n\n @property\n def is_step_materialization(self) -> bool:\n return self.event_type == DagsterEventType.ASSET_MATERIALIZATION\n\n @property\n def is_expectation_result(self) -> bool:\n return self.event_type == DagsterEventType.STEP_EXPECTATION_RESULT\n\n @property\n def is_asset_observation(self) -> bool:\n return self.event_type == DagsterEventType.ASSET_OBSERVATION\n\n @property\n def asset_key(self) -> Optional[AssetKey]:\n if self.event_type == DagsterEventType.ASSET_MATERIALIZATION:\n return self.step_materialization_data.materialization.asset_key\n elif self.event_type == DagsterEventType.ASSET_OBSERVATION:\n return self.asset_observation_data.asset_observation.asset_key\n elif self.event_type == DagsterEventType.ASSET_MATERIALIZATION_PLANNED:\n return self.asset_materialization_planned_data.asset_key\n else:\n return None\n\n @property\n def partition(self) -> Optional[str]:\n if self.event_type == DagsterEventType.ASSET_MATERIALIZATION:\n return self.step_materialization_data.materialization.partition\n elif self.event_type == DagsterEventType.ASSET_OBSERVATION:\n return self.asset_observation_data.asset_observation.partition\n else:\n return None\n\n @property\n def step_input_data(self) -> "StepInputData":\n from dagster.core.execution.plan.inputs import StepInputData\n\n _assert_type("step_input_data", DagsterEventType.STEP_INPUT, self.event_type)\n return cast(StepInputData, self.event_specific_data)\n\n @property\n def step_output_data(self) -> StepOutputData:\n _assert_type("step_output_data", DagsterEventType.STEP_OUTPUT, self.event_type)\n return cast(StepOutputData, self.event_specific_data)\n\n @property\n def step_success_data(self) -> "StepSuccessData":\n from dagster.core.execution.plan.objects import StepSuccessData\n\n _assert_type("step_success_data", DagsterEventType.STEP_SUCCESS, self.event_type)\n return cast(StepSuccessData, self.event_specific_data)\n\n @property\n def step_failure_data(self) -> "StepFailureData":\n from dagster.core.execution.plan.objects import StepFailureData\n\n _assert_type("step_failure_data", DagsterEventType.STEP_FAILURE, self.event_type)\n return cast(StepFailureData, self.event_specific_data)\n\n @property\n def step_retry_data(self) -> "StepRetryData":\n from dagster.core.execution.plan.objects import StepRetryData\n\n _assert_type("step_retry_data", DagsterEventType.STEP_UP_FOR_RETRY, self.event_type)\n return cast(StepRetryData, self.event_specific_data)\n\n @property\n def step_materialization_data(self) -> "StepMaterializationData":\n _assert_type(\n "step_materialization_data", DagsterEventType.ASSET_MATERIALIZATION, self.event_type\n )\n return cast(StepMaterializationData, self.event_specific_data)\n\n @property\n def asset_observation_data(self) -> "AssetObservationData":\n _assert_type("asset_observation_data", DagsterEventType.ASSET_OBSERVATION, self.event_type)\n return cast(AssetObservationData, self.event_specific_data)\n\n @property\n def asset_materialization_planned_data(self) -> 
"AssetMaterializationPlannedData":\n _assert_type(\n "asset_materialization_planned",\n DagsterEventType.ASSET_MATERIALIZATION_PLANNED,\n self.event_type,\n )\n return cast(AssetMaterializationPlannedData, self.event_specific_data)\n\n @property\n def step_expectation_result_data(self) -> "StepExpectationResultData":\n _assert_type(\n "step_expectation_result_data",\n DagsterEventType.STEP_EXPECTATION_RESULT,\n self.event_type,\n )\n return cast(StepExpectationResultData, self.event_specific_data)\n\n @property\n def pipeline_failure_data(self) -> "PipelineFailureData":\n _assert_type("pipeline_failure_data", DagsterEventType.RUN_FAILURE, self.event_type)\n return cast(PipelineFailureData, self.event_specific_data)\n\n @property\n def engine_event_data(self) -> "EngineEventData":\n _assert_type("engine_event_data", DagsterEventType.ENGINE_EVENT, self.event_type)\n return cast(EngineEventData, self.event_specific_data)\n\n @property\n def hook_completed_data(self) -> Optional["EventSpecificData"]:\n _assert_type("hook_completed_data", DagsterEventType.HOOK_COMPLETED, self.event_type)\n return self.event_specific_data\n\n @property\n def hook_errored_data(self) -> "HookErroredData":\n _assert_type("hook_errored_data", DagsterEventType.HOOK_ERRORED, self.event_type)\n return cast(HookErroredData, self.event_specific_data)\n\n @property\n def hook_skipped_data(self) -> Optional["EventSpecificData"]:\n _assert_type("hook_skipped_data", DagsterEventType.HOOK_SKIPPED, self.event_type)\n return self.event_specific_data\n\n @property\n def logs_captured_data(self):\n _assert_type("logs_captured_data", DagsterEventType.LOGS_CAPTURED, self.event_type)\n return self.event_specific_data\n\n @staticmethod\n def step_output_event(\n step_context: StepExecutionContext, step_output_data: StepOutputData\n ) -> "DagsterEvent":\n\n output_def = step_context.solid.output_def_named(\n step_output_data.step_output_handle.output_name\n )\n\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_OUTPUT,\n step_context=step_context,\n event_specific_data=step_output_data,\n message='Yielded output "{output_name}"{mapping_clause} of type "{output_type}".{type_check_clause}'.format(\n output_name=step_output_data.step_output_handle.output_name,\n output_type=output_def.dagster_type.display_name,\n type_check_clause=(\n " Warning! 
Type check failed."\n if not step_output_data.type_check_data.success\n else " (Type check passed)."\n )\n if step_output_data.type_check_data\n else " (No type check).",\n mapping_clause=f' mapping key "{step_output_data.step_output_handle.mapping_key}"'\n if step_output_data.step_output_handle.mapping_key\n else "",\n ),\n )\n\n @staticmethod\n def step_failure_event(\n step_context: IStepContext, step_failure_data: "StepFailureData"\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_FAILURE,\n step_context=step_context,\n event_specific_data=step_failure_data,\n message='Execution of step "{step_key}" failed.'.format(step_key=step_context.step.key),\n )\n\n @staticmethod\n def step_retry_event(\n step_context: IStepContext, step_retry_data: "StepRetryData"\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_UP_FOR_RETRY,\n step_context=step_context,\n event_specific_data=step_retry_data,\n message='Execution of step "{step_key}" failed and has requested a retry{wait_str}.'.format(\n step_key=step_context.step.key,\n wait_str=" in {n} seconds".format(n=step_retry_data.seconds_to_wait)\n if step_retry_data.seconds_to_wait\n else "",\n ),\n )\n\n @staticmethod\n def step_input_event(\n step_context: StepExecutionContext, step_input_data: "StepInputData"\n ) -> "DagsterEvent":\n step_input = step_context.step.step_input_named(step_input_data.input_name)\n input_def = step_input.source.get_input_def(step_context.pipeline_def)\n\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_INPUT,\n step_context=step_context,\n event_specific_data=step_input_data,\n message='Got input "{input_name}" of type "{input_type}".{type_check_clause}'.format(\n input_name=step_input_data.input_name,\n input_type=input_def.dagster_type.display_name,\n type_check_clause=(\n " Warning! 
Type check failed."\n if not step_input_data.type_check_data.success\n else " (Type check passed)."\n )\n if step_input_data.type_check_data\n else " (No type check).",\n ),\n )\n\n @staticmethod\n def step_start_event(step_context: IStepContext) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_START,\n step_context=step_context,\n message='Started execution of step "{step_key}".'.format(\n step_key=step_context.step.key\n ),\n )\n\n @staticmethod\n def step_restarted_event(step_context: IStepContext, previous_attempts: int) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_RESTARTED,\n step_context=step_context,\n message='Started re-execution (attempt # {n}) of step "{step_key}".'.format(\n step_key=step_context.step.key, n=previous_attempts + 1\n ),\n )\n\n @staticmethod\n def step_success_event(\n step_context: IStepContext, success: "StepSuccessData"\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_SUCCESS,\n step_context=step_context,\n event_specific_data=success,\n message='Finished execution of step "{step_key}" in {duration}.'.format(\n step_key=step_context.step.key,\n duration=format_duration(success.duration_ms),\n ),\n )\n\n @staticmethod\n def step_skipped_event(step_context: IStepContext) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_SKIPPED,\n step_context=step_context,\n message='Skipped execution of step "{step_key}".'.format(\n step_key=step_context.step.key\n ),\n )\n\n @staticmethod\n def asset_materialization(\n step_context: IStepContext,\n materialization: Union[AssetMaterialization, Materialization],\n asset_lineage: Optional[List[AssetLineageInfo]] = None,\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n step_context=step_context,\n event_specific_data=StepMaterializationData(materialization, asset_lineage),\n message=materialization.description\n if materialization.description\n else "Materialized value{label_clause}.".format(\n label_clause=" {label}".format(label=materialization.label)\n if materialization.label\n else ""\n ),\n )\n\n @staticmethod\n def asset_observation(\n step_context: IStepContext, observation: AssetObservation\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.ASSET_OBSERVATION,\n step_context=step_context,\n event_specific_data=AssetObservationData(observation),\n )\n\n @staticmethod\n def step_expectation_result(\n step_context: IStepContext, expectation_result: ExpectationResult\n ) -> "DagsterEvent":\n def _msg():\n if expectation_result.description:\n return expectation_result.description\n\n return "Expectation{label_clause} {result_verb}".format(\n label_clause=" " + expectation_result.label if expectation_result.label else "",\n result_verb="passed" if expectation_result.success else "failed",\n )\n\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_EXPECTATION_RESULT,\n step_context=step_context,\n event_specific_data=StepExpectationResultData(expectation_result),\n message=_msg(),\n )\n\n @staticmethod\n def pipeline_start(pipeline_context: IPlanContext) -> "DagsterEvent":\n return DagsterEvent.from_pipeline(\n DagsterEventType.RUN_START,\n pipeline_context,\n message='Started execution of run for "{pipeline_name}".'.format(\n pipeline_name=pipeline_context.pipeline_name\n ),\n )\n\n @staticmethod\n def pipeline_success(pipeline_context: IPlanContext) -> 
"DagsterEvent":\n return DagsterEvent.from_pipeline(\n DagsterEventType.RUN_SUCCESS,\n pipeline_context,\n message='Finished execution of run for "{pipeline_name}".'.format(\n pipeline_name=pipeline_context.pipeline_name\n ),\n )\n\n @staticmethod\n def pipeline_failure(\n pipeline_context_or_name: Union[IPlanContext, str],\n context_msg: str,\n error_info: Optional[SerializableErrorInfo] = None,\n ) -> "DagsterEvent":\n check.str_param(context_msg, "context_msg")\n if isinstance(pipeline_context_or_name, IPlanContext):\n return DagsterEvent.from_pipeline(\n DagsterEventType.RUN_FAILURE,\n pipeline_context_or_name,\n message='Execution of run for "{pipeline_name}" failed. {context_msg}'.format(\n pipeline_name=pipeline_context_or_name.pipeline_name,\n context_msg=context_msg,\n ),\n event_specific_data=PipelineFailureData(error_info),\n )\n else:\n # when the failure happens trying to bring up context, the pipeline_context hasn't been\n # built and so can't use from_pipeline\n check.str_param(pipeline_context_or_name, "pipeline_name")\n event = DagsterEvent(\n event_type_value=DagsterEventType.RUN_FAILURE.value,\n pipeline_name=pipeline_context_or_name,\n event_specific_data=PipelineFailureData(error_info),\n message='Execution of run for "{pipeline_name}" failed. {context_msg}'.format(\n pipeline_name=pipeline_context_or_name,\n context_msg=context_msg,\n ),\n pid=os.getpid(),\n )\n return event\n\n @staticmethod\n def pipeline_canceled(\n pipeline_context: IPlanContext, error_info: Optional[SerializableErrorInfo] = None\n ) -> "DagsterEvent":\n return DagsterEvent.from_pipeline(\n DagsterEventType.RUN_CANCELED,\n pipeline_context,\n message='Execution of run for "{pipeline_name}" canceled.'.format(\n pipeline_name=pipeline_context.pipeline_name\n ),\n event_specific_data=PipelineCanceledData(\n check.opt_inst_param(error_info, "error_info", SerializableErrorInfo)\n ),\n )\n\n @staticmethod\n def resource_init_start(\n pipeline_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n resource_keys: AbstractSet[str],\n ) -> "DagsterEvent":\n\n return DagsterEvent.from_resource(\n pipeline_name=pipeline_name,\n execution_plan=execution_plan,\n log_manager=log_manager,\n message="Starting initialization of resources [{}].".format(\n ", ".join(sorted(resource_keys))\n ),\n event_specific_data=EngineEventData(metadata_entries=[], marker_start="resources"),\n )\n\n @staticmethod\n def resource_init_success(\n pipeline_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n resource_instances: Dict[str, Any],\n resource_init_times: Dict[str, str],\n ) -> "DagsterEvent":\n\n metadata_entries = []\n for key in resource_instances.keys():\n metadata_entries.extend(\n [\n MetadataEntry(\n key,\n value=MetadataValue.python_artifact(resource_instances[key].__class__),\n ),\n MetadataEntry(f"{key}:init_time", value=resource_init_times[key]),\n ]\n )\n\n return DagsterEvent.from_resource(\n pipeline_name=pipeline_name,\n execution_plan=execution_plan,\n log_manager=log_manager,\n message="Finished initialization of resources [{}].".format(\n ", ".join(sorted(resource_init_times.keys()))\n ),\n event_specific_data=EngineEventData(\n metadata_entries=metadata_entries,\n marker_end="resources",\n ),\n )\n\n @staticmethod\n def resource_init_failure(\n pipeline_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n resource_keys: AbstractSet[str],\n error: SerializableErrorInfo,\n ) -> "DagsterEvent":\n\n return 
DagsterEvent.from_resource(\n pipeline_name=pipeline_name,\n execution_plan=execution_plan,\n log_manager=log_manager,\n message="Initialization of resources [{}] failed.".format(", ".join(resource_keys)),\n event_specific_data=EngineEventData(\n metadata_entries=[],\n marker_end="resources",\n error=error,\n ),\n )\n\n @staticmethod\n def resource_teardown_failure(\n pipeline_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n resource_keys: AbstractSet[str],\n error: SerializableErrorInfo,\n ) -> "DagsterEvent":\n\n return DagsterEvent.from_resource(\n pipeline_name=pipeline_name,\n execution_plan=execution_plan,\n log_manager=log_manager,\n message="Teardown of resources [{}] failed.".format(", ".join(resource_keys)),\n event_specific_data=EngineEventData(\n metadata_entries=[],\n marker_start=None,\n marker_end=None,\n error=error,\n ),\n )\n\n @staticmethod\n def engine_event(\n pipeline_context: IPlanContext,\n message: str,\n event_specific_data: Optional["EngineEventData"] = None,\n step_handle: Optional[Union[StepHandle, ResolvedFromDynamicStepHandle]] = None,\n ) -> "DagsterEvent":\n return DagsterEvent.from_pipeline(\n DagsterEventType.ENGINE_EVENT,\n pipeline_context,\n message,\n event_specific_data=event_specific_data,\n step_handle=step_handle,\n )\n\n @staticmethod\n def object_store_operation(\n step_context: IStepContext, object_store_operation_result: "ObjectStoreOperation"\n ) -> "DagsterEvent":\n\n object_store_name = (\n "{object_store_name} ".format(\n object_store_name=object_store_operation_result.object_store_name\n )\n if object_store_operation_result.object_store_name\n else ""\n )\n\n serialization_strategy_modifier = (\n " using {serialization_strategy_name}".format(\n serialization_strategy_name=object_store_operation_result.serialization_strategy_name\n )\n if object_store_operation_result.serialization_strategy_name\n else ""\n )\n\n value_name = object_store_operation_result.value_name\n\n if (\n ObjectStoreOperationType(object_store_operation_result.op)\n == ObjectStoreOperationType.SET_OBJECT\n ):\n message = (\n "Stored intermediate object for output {value_name} in "\n "{object_store_name}object store{serialization_strategy_modifier}."\n ).format(\n value_name=value_name,\n object_store_name=object_store_name,\n serialization_strategy_modifier=serialization_strategy_modifier,\n )\n elif (\n ObjectStoreOperationType(object_store_operation_result.op)\n == ObjectStoreOperationType.GET_OBJECT\n ):\n message = (\n "Retrieved intermediate object for input {value_name} in "\n "{object_store_name}object store{serialization_strategy_modifier}."\n ).format(\n value_name=value_name,\n object_store_name=object_store_name,\n serialization_strategy_modifier=serialization_strategy_modifier,\n )\n elif (\n ObjectStoreOperationType(object_store_operation_result.op)\n == ObjectStoreOperationType.CP_OBJECT\n ):\n message = (\n "Copied intermediate object for input {value_name} from {key} to {dest_key}"\n ).format(\n value_name=value_name,\n key=object_store_operation_result.key,\n dest_key=object_store_operation_result.dest_key,\n )\n else:\n message = ""\n\n return DagsterEvent.from_step(\n DagsterEventType.OBJECT_STORE_OPERATION,\n step_context,\n event_specific_data=ObjectStoreOperationResultData(\n op=object_store_operation_result.op,\n value_name=value_name,\n address=object_store_operation_result.key,\n metadata_entries=[\n MetadataEntry(\n "key", value=MetadataValue.path(object_store_operation_result.key)\n ),\n ],\n 
version=object_store_operation_result.version,\n mapping_key=object_store_operation_result.mapping_key,\n ),\n message=message,\n )\n\n @staticmethod\n def handled_output(\n step_context: IStepContext,\n output_name: str,\n manager_key: str,\n message_override: Optional[str] = None,\n metadata_entries: Optional[List[MetadataEntry]] = None,\n ) -> "DagsterEvent":\n message = f'Handled output "{output_name}" using IO manager "{manager_key}"'\n return DagsterEvent.from_step(\n event_type=DagsterEventType.HANDLED_OUTPUT,\n step_context=step_context,\n event_specific_data=HandledOutputData(\n output_name=output_name,\n manager_key=manager_key,\n metadata_entries=metadata_entries if metadata_entries else [],\n ),\n message=message_override or message,\n )\n\n @staticmethod\n def loaded_input(\n step_context: IStepContext,\n input_name: str,\n manager_key: str,\n upstream_output_name: Optional[str] = None,\n upstream_step_key: Optional[str] = None,\n message_override: Optional[str] = None,\n metadata_entries: Optional[List[MetadataEntry]] = None,\n ) -> "DagsterEvent":\n\n message = f'Loaded input "{input_name}" using input manager "{manager_key}"'\n if upstream_output_name:\n message += f', from output "{upstream_output_name}" of step ' f'"{upstream_step_key}"'\n\n return DagsterEvent.from_step(\n event_type=DagsterEventType.LOADED_INPUT,\n step_context=step_context,\n event_specific_data=LoadedInputData(\n input_name=input_name,\n manager_key=manager_key,\n upstream_output_name=upstream_output_name,\n upstream_step_key=upstream_step_key,\n metadata_entries=metadata_entries if metadata_entries else [],\n ),\n message=message_override or message,\n )\n\n @staticmethod\n def hook_completed(\n step_context: StepExecutionContext, hook_def: HookDefinition\n ) -> "DagsterEvent":\n event_type = DagsterEventType.HOOK_COMPLETED\n\n event = DagsterEvent(\n event_type_value=event_type.value,\n pipeline_name=step_context.pipeline_name,\n step_handle=step_context.step.handle,\n solid_handle=step_context.step.solid_handle,\n step_kind_value=step_context.step.kind.value,\n logging_tags=step_context.logging_tags,\n message=(\n 'Finished the execution of hook "{hook_name}" triggered for "{solid_name}".'\n ).format(hook_name=hook_def.name, solid_name=step_context.solid.name),\n )\n\n step_context.log.log_dagster_event(\n level=logging.DEBUG, msg=event.message or "", dagster_event=event\n )\n\n return event\n\n @staticmethod\n def hook_errored(\n step_context: StepExecutionContext, error: HookExecutionError\n ) -> "DagsterEvent":\n event_type = DagsterEventType.HOOK_ERRORED\n\n event = DagsterEvent(\n event_type_value=event_type.value,\n pipeline_name=step_context.pipeline_name,\n step_handle=step_context.step.handle,\n solid_handle=step_context.step.solid_handle,\n step_kind_value=step_context.step.kind.value,\n logging_tags=step_context.logging_tags,\n event_specific_data=_validate_event_specific_data(\n event_type,\n HookErroredData(\n error=serializable_error_info_from_exc_info(error.original_exc_info)\n ),\n ),\n )\n\n step_context.log.log_dagster_event(level=logging.ERROR, msg=str(error), dagster_event=event)\n\n return event\n\n @staticmethod\n def hook_skipped(\n step_context: StepExecutionContext, hook_def: HookDefinition\n ) -> "DagsterEvent":\n event_type = DagsterEventType.HOOK_SKIPPED\n\n event = DagsterEvent(\n event_type_value=event_type.value,\n pipeline_name=step_context.pipeline_name,\n step_handle=step_context.step.handle,\n solid_handle=step_context.step.solid_handle,\n 
step_kind_value=step_context.step.kind.value,\n logging_tags=step_context.logging_tags,\n message=(\n 'Skipped the execution of hook "{hook_name}". It did not meet its triggering '\n 'condition during the execution of "{solid_name}".'\n ).format(hook_name=hook_def.name, solid_name=step_context.solid.name),\n )\n\n step_context.log.log_dagster_event(\n level=logging.DEBUG, msg=event.message or "", dagster_event=event\n )\n\n return event\n\n @staticmethod\n def capture_logs(pipeline_context: IPlanContext, log_key: str, steps: List["ExecutionStep"]):\n step_keys = [step.key for step in steps]\n if len(step_keys) == 1:\n message = f"Started capturing logs for step: {step_keys[0]}."\n else:\n message = f"Started capturing logs in process (pid: {os.getpid()})."\n\n if isinstance(pipeline_context, StepExecutionContext):\n return DagsterEvent.from_step(\n DagsterEventType.LOGS_CAPTURED,\n pipeline_context,\n message=message,\n event_specific_data=ComputeLogsCaptureData(\n step_keys=step_keys,\n log_key=log_key,\n ),\n )\n\n return DagsterEvent.from_pipeline(\n DagsterEventType.LOGS_CAPTURED,\n pipeline_context,\n message=message,\n event_specific_data=ComputeLogsCaptureData(\n step_keys=step_keys,\n log_key=log_key,\n ),\n )
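# --- Usage sketch (not part of the module above) ---------------------------------
# The hook constructors above (hook_completed / hook_errored / hook_skipped) emit
# DagsterEvents tagged with the corresponding DagsterEventType. A minimal, hedged
# sketch of picking those events out of a collected event stream; `events` is an
# assumed list of DagsterEvent objects, e.g. gathered from an execution iterator.
from dagster.core.events import DagsterEvent, DagsterEventType


def summarize_hook_events(events):
    """Return the messages of all hook-related events in an event stream."""
    hook_types = {
        DagsterEventType.HOOK_COMPLETED,
        DagsterEventType.HOOK_SKIPPED,
        DagsterEventType.HOOK_ERRORED,
    }
    # DagsterEvent exposes `event_type` and `message`, as used elsewhere in this module.
    return [event.message for event in events if event.event_type in hook_types]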
\n\n\ndef get_step_output_event(\n events: List[DagsterEvent], step_key: str, output_name: Optional[str] = "result"\n) -> Optional["DagsterEvent"]:\n check.list_param(events, "events", of_type=DagsterEvent)\n check.str_param(step_key, "step_key")\n check.str_param(output_name, "output_name")\n for event in events:\n if (\n event.event_type == DagsterEventType.STEP_OUTPUT\n and event.step_key == step_key\n and event.step_output_data.output_name == output_name\n ):\n return event\n return None\n\n\n@whitelist_for_serdes\nclass AssetObservationData(\n NamedTuple("_AssetObservation", [("asset_observation", AssetObservation)])\n):\n def __new__(cls, asset_observation: AssetObservation):\n return super(AssetObservationData, cls).__new__(\n cls,\n asset_observation=check.inst_param(\n asset_observation, "asset_observation", AssetObservation\n ),\n )\n\n\n@whitelist_for_serdes\nclass StepMaterializationData(\n NamedTuple(\n "_StepMaterializationData",\n [\n ("materialization", Union[Materialization, AssetMaterialization]),\n ("asset_lineage", List[AssetLineageInfo]),\n ],\n )\n):\n def __new__(\n cls,\n materialization: Union[Materialization, AssetMaterialization],\n asset_lineage: Optional[List[AssetLineageInfo]] = None,\n ):\n return super(StepMaterializationData, cls).__new__(\n cls,\n materialization=check.inst_param(\n materialization, "materialization", (Materialization, AssetMaterialization)\n ),\n asset_lineage=check.opt_list_param(\n asset_lineage, "asset_lineage", of_type=AssetLineageInfo\n ),\n )\n\n\n@whitelist_for_serdes\nclass AssetMaterializationPlannedData(\n NamedTuple("_AssetMaterializationPlannedData", [("asset_key", AssetKey)])\n):\n def __new__(cls, asset_key: AssetKey):\n return super(AssetMaterializationPlannedData, cls).__new__(\n cls, asset_key=check.inst_param(asset_key, "asset_key", AssetKey)\n )\n\n\n@whitelist_for_serdes\nclass StepExpectationResultData(\n NamedTuple(\n "_StepExpectationResultData",\n [\n ("expectation_result", ExpectationResult),\n ],\n )\n):\n def __new__(cls, expectation_result: ExpectationResult):\n return super(StepExpectationResultData, cls).__new__(\n cls,\n expectation_result=check.inst_param(\n expectation_result, "expectation_result", ExpectationResult\n ),\n )\n\n\n@whitelist_for_serdes\nclass ObjectStoreOperationResultData(\n NamedTuple(\n "_ObjectStoreOperationResultData",\n [\n ("op", ObjectStoreOperationType),\n ("value_name", Optional[str]),\n ("metadata_entries", List[MetadataEntry]),\n ("address", Optional[str]),\n ("version", Optional[str]),\n ("mapping_key", Optional[str]),\n ],\n )\n):\n def __new__(\n cls,\n op: ObjectStoreOperationType,\n value_name: Optional[str] = None,\n metadata_entries: Optional[List[MetadataEntry]] = None,\n address: Optional[str] = None,\n version: Optional[str] = None,\n mapping_key: Optional[str] = None,\n ):\n return super(ObjectStoreOperationResultData, cls).__new__(\n cls,\n op=cast(ObjectStoreOperationType, check.str_param(op, "op")),\n value_name=check.opt_str_param(value_name, "value_name"),\n metadata_entries=check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=MetadataEntry\n ),\n address=check.opt_str_param(address, "address"),\n version=check.opt_str_param(version, "version"),\n mapping_key=check.opt_str_param(mapping_key, "mapping_key"),\n )\n\n\n@whitelist_for_serdes\nclass EngineEventData(\n NamedTuple(\n "_EngineEventData",\n [\n ("metadata_entries", List[MetadataEntry]),\n ("error", Optional[SerializableErrorInfo]),\n ("marker_start", Optional[str]),\n ("marker_end", 
Optional[str]),\n ],\n )\n):\n # serdes log\n # * added optional error\n # * added marker_start / marker_end\n #\n def __new__(\n cls,\n metadata_entries: Optional[List[MetadataEntry]] = None,\n error: Optional[SerializableErrorInfo] = None,\n marker_start: Optional[str] = None,\n marker_end: Optional[str] = None,\n ):\n return super(EngineEventData, cls).__new__(\n cls,\n metadata_entries=check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=MetadataEntry\n ),\n error=check.opt_inst_param(error, "error", SerializableErrorInfo),\n marker_start=check.opt_str_param(marker_start, "marker_start"),\n marker_end=check.opt_str_param(marker_end, "marker_end"),\n )\n\n @staticmethod\n def in_process(\n pid: int, step_keys_to_execute: Optional[List[str]] = None, marker_end: Optional[str] = None\n ) -> "EngineEventData":\n return EngineEventData(\n metadata_entries=[MetadataEntry("pid", value=str(pid))]\n + (\n [MetadataEntry("step_keys", value=str(step_keys_to_execute))]\n if step_keys_to_execute\n else []\n ),\n marker_end=marker_end,\n )\n\n @staticmethod\n def multiprocess(\n pid: int, step_keys_to_execute: Optional[List[str]] = None\n ) -> "EngineEventData":\n return EngineEventData(\n metadata_entries=[MetadataEntry("pid", value=str(pid))]\n + (\n [MetadataEntry("step_keys", value=str(step_keys_to_execute))]\n if step_keys_to_execute\n else []\n )\n )\n\n @staticmethod\n def interrupted(steps_interrupted: List[str]) -> "EngineEventData":\n return EngineEventData(\n metadata_entries=[MetadataEntry("steps_interrupted", value=str(steps_interrupted))]\n )\n\n @staticmethod\n def engine_error(error: SerializableErrorInfo) -> "EngineEventData":\n return EngineEventData(metadata_entries=[], error=error)\n\n\n@whitelist_for_serdes\nclass PipelineFailureData(\n NamedTuple(\n "_PipelineFailureData",\n [\n ("error", Optional[SerializableErrorInfo]),\n ],\n )\n):\n def __new__(cls, error: Optional[SerializableErrorInfo]):\n return super(PipelineFailureData, cls).__new__(\n cls, error=check.opt_inst_param(error, "error", SerializableErrorInfo)\n )\n\n\n@whitelist_for_serdes\nclass PipelineCanceledData(\n NamedTuple(\n "_PipelineCanceledData",\n [\n ("error", Optional[SerializableErrorInfo]),\n ],\n )\n):\n def __new__(cls, error: Optional[SerializableErrorInfo]):\n return super(PipelineCanceledData, cls).__new__(\n cls, error=check.opt_inst_param(error, "error", SerializableErrorInfo)\n )\n\n\n@whitelist_for_serdes\nclass HookErroredData(\n NamedTuple(\n "_HookErroredData",\n [\n ("error", SerializableErrorInfo),\n ],\n )\n):\n def __new__(cls, error: SerializableErrorInfo):\n return super(HookErroredData, cls).__new__(\n cls, error=check.inst_param(error, "error", SerializableErrorInfo)\n )\n\n\n@whitelist_for_serdes\nclass HandledOutputData(\n NamedTuple(\n "_HandledOutputData",\n [\n ("output_name", str),\n ("manager_key", str),\n ("metadata_entries", List[MetadataEntry]),\n ],\n )\n):\n def __new__(\n cls,\n output_name: str,\n manager_key: str,\n metadata_entries: Optional[List[MetadataEntry]] = None,\n ):\n return super(HandledOutputData, cls).__new__(\n cls,\n output_name=check.str_param(output_name, "output_name"),\n manager_key=check.str_param(manager_key, "manager_key"),\n metadata_entries=check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=MetadataEntry\n ),\n )\n\n\n@whitelist_for_serdes\nclass LoadedInputData(\n NamedTuple(\n "_LoadedInputData",\n [\n ("input_name", str),\n ("manager_key", str),\n ("upstream_output_name", Optional[str]),\n ("upstream_step_key", 
Optional[str]),\n ("metadata_entries", Optional[List[MetadataEntry]]),\n ],\n )\n):\n def __new__(\n cls,\n input_name: str,\n manager_key: str,\n upstream_output_name: Optional[str] = None,\n upstream_step_key: Optional[str] = None,\n metadata_entries: Optional[List[MetadataEntry]] = None,\n ):\n return super(LoadedInputData, cls).__new__(\n cls,\n input_name=check.str_param(input_name, "input_name"),\n manager_key=check.str_param(manager_key, "manager_key"),\n upstream_output_name=check.opt_str_param(upstream_output_name, "upstream_output_name"),\n upstream_step_key=check.opt_str_param(upstream_step_key, "upstream_step_key"),\n metadata_entries=check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=MetadataEntry\n ),\n )\n\n\n@whitelist_for_serdes\nclass ComputeLogsCaptureData(\n NamedTuple(\n "_ComputeLogsCaptureData",\n [\n ("log_key", str),\n ("step_keys", List[str]),\n ],\n )\n):\n def __new__(cls, log_key, step_keys):\n return super(ComputeLogsCaptureData, cls).__new__(\n cls,\n log_key=check.str_param(log_key, "log_key"),\n step_keys=check.opt_list_param(step_keys, "step_keys", of_type=str),\n )\n\n\n###################################################################################################\n# THE GRAVEYARD\n#\n# -|- -|- -|-\n# | | |\n# _-'~~~~~`-_ . _-'~~~~~`-_ _-'~~~~~`-_\n# .' '. .' '. .' '.\n# | R I P | | R I P | | R I P |\n# | | | | | |\n# | Synthetic | | Asset | | Pipeline |\n# | Process | | Store | | Init |\n# | Events | | Operations | | Failures |\n# | | | | | |\n###################################################################################################\n\n# Keep these around to prevent issues like https://github.com/dagster-io/dagster/issues/3533\n@whitelist_for_serdes\nclass AssetStoreOperationData(NamedTuple):\n op: str\n step_key: str\n output_name: str\n asset_store_key: str\n\n\n@whitelist_for_serdes\nclass AssetStoreOperationType(Enum):\n SET_ASSET = "SET_ASSET"\n GET_ASSET = "GET_ASSET"\n\n\n@whitelist_for_serdes\nclass PipelineInitFailureData(NamedTuple):\n error: SerializableErrorInfo\n\n\ndef _handle_back_compat(event_type_value, event_specific_data):\n # transform old specific process events in to engine events\n if event_type_value == "PIPELINE_PROCESS_START":\n return DagsterEventType.ENGINE_EVENT.value, EngineEventData([])\n elif event_type_value == "PIPELINE_PROCESS_STARTED":\n return DagsterEventType.ENGINE_EVENT.value, EngineEventData([])\n elif event_type_value == "PIPELINE_PROCESS_EXITED":\n return DagsterEventType.ENGINE_EVENT.value, EngineEventData([])\n\n # changes asset store ops in to get/set asset\n elif event_type_value == "ASSET_STORE_OPERATION":\n if event_specific_data.op in ("GET_ASSET", AssetStoreOperationType.GET_ASSET):\n return (\n DagsterEventType.LOADED_INPUT.value,\n LoadedInputData(\n event_specific_data.output_name, event_specific_data.asset_store_key\n ),\n )\n if event_specific_data.op in ("SET_ASSET", AssetStoreOperationType.SET_ASSET):\n return (\n DagsterEventType.HANDLED_OUTPUT.value,\n HandledOutputData(\n event_specific_data.output_name, event_specific_data.asset_store_key, []\n ),\n )\n\n # previous name for ASSET_MATERIALIZATION was STEP_MATERIALIZATION\n if event_type_value == "STEP_MATERIALIZATION":\n return DagsterEventType.ASSET_MATERIALIZATION.value, event_specific_data\n\n # transform PIPELINE_INIT_FAILURE to PIPELINE_FAILURE\n if event_type_value == "PIPELINE_INIT_FAILURE":\n return DagsterEventType.PIPELINE_FAILURE.value, PipelineFailureData(\n event_specific_data.error\n )\n\n return 
event_type_value, event_specific_data\n\n\nregister_serdes_tuple_fallbacks(\n {\n "PipelineProcessStartedData": None,\n "PipelineProcessExitedData": None,\n "PipelineProcessStartData": None,\n }\n)\n
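# --- Usage sketch (not part of the module above) ---------------------------------
# A hedged example of how get_step_output_event (defined above) might be used to
# locate the STEP_OUTPUT event for a particular step in a collected event list.
# The step key "my_solid" and the `events` list are assumptions for illustration.
from dagster.core.events import get_step_output_event


def first_output_name_for_step(events, step_key="my_solid"):
    event = get_step_output_event(events, step_key=step_key, output_name="result")
    if event is not None:
        # STEP_OUTPUT events carry step_output_data with the output name,
        # as checked inside get_step_output_event itself.
        return event.step_output_data.output_name
    return None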
", "current_page_name": "_modules/dagster/core/events", "customsidebar": null, "log": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.events.log

\nfrom typing import Any, Dict, NamedTuple, Optional, Union\n\nfrom dagster import check\nfrom dagster.core.errors import DagsterInvariantViolationError\nfrom dagster.core.events import DagsterEvent\nfrom dagster.core.utils import coerce_valid_log_level\nfrom dagster.serdes.serdes import (\n    DefaultNamedTupleSerializer,\n    WhitelistMap,\n    deserialize_json_to_dagster_namedtuple,\n    register_serdes_tuple_fallbacks,\n    serialize_dagster_namedtuple,\n    whitelist_for_serdes,\n)\nfrom dagster.utils.error import SerializableErrorInfo\nfrom dagster.utils.log import (\n    JsonEventLoggerHandler,\n    StructuredLoggerHandler,\n    StructuredLoggerMessage,\n    construct_single_handler_logger,\n)\n\n\nclass EventLogEntrySerializer(DefaultNamedTupleSerializer):\n    @classmethod\n    def value_to_storage_dict(\n        cls,\n        value: NamedTuple,\n        whitelist_map: WhitelistMap,\n        descent_path: str,\n    ) -> Dict[str, Any]:\n        storage_dict = super().value_to_storage_dict(value, whitelist_map, descent_path)\n        # include an empty string for the message field to allow older versions of dagster to load the events\n        storage_dict["message"] = ""\n        return storage_dict\n\n\n
[docs]@whitelist_for_serdes(serializer=EventLogEntrySerializer)\nclass EventLogEntry(\n NamedTuple(\n "_EventLogEntry",\n [\n ("error_info", Optional[SerializableErrorInfo]),\n ("level", Union[str, int]),\n ("user_message", str),\n ("run_id", str),\n ("timestamp", float),\n ("step_key", Optional[str]),\n ("pipeline_name", Optional[str]),\n ("dagster_event", Optional[DagsterEvent]),\n ],\n )\n):\n """Entries in the event log.\n\n These entries may originate from the logging machinery (DagsterLogManager/context.log), from\n framework events (e.g. EngineEvent), or they may correspond to events yielded by user code\n (e.g. Output).\n\n Args:\n error_info (Optional[SerializableErrorInfo]): Error info for an associated exception, if\n any, as generated by serializable_error_info_from_exc_info and friends.\n level (Union[str, int]): The Python log level at which to log this event. Note that\n framework and user code events are also logged to Python logging. This value may be an\n integer or a (case-insensitive) string member of PYTHON_LOGGING_LEVELS_NAMES.\n user_message (str): For log messages, this is the user-generated message.\n run_id (str): The id of the run which generated this event.\n timestamp (float): The Unix timestamp of this event.\n step_key (Optional[str]): The step key for the step which generated this event. Some events\n are generated outside of a step context.\n job_name (Optional[str]): The job which generated this event. Some events are\n generated outside of a job context.\n dagster_event (Optional[DagsterEvent]): For framework and user events, the associated\n structured event.\n pipeline_name (Optional[str]): (legacy) The pipeline which generated this event. Some events are\n generated outside of a pipeline context.\n """\n\n def __new__(\n cls,\n error_info,\n level,\n user_message,\n run_id,\n timestamp,\n step_key=None,\n pipeline_name=None,\n dagster_event=None,\n job_name=None,\n ):\n if pipeline_name and job_name:\n raise DagsterInvariantViolationError(\n "Provided both `pipeline_name` and `job_name` parameters to `EventLogEntry` "\n "initialization. 
Please provide only one or the other."\n )\n\n pipeline_name = pipeline_name or job_name\n return super(EventLogEntry, cls).__new__(\n cls,\n check.opt_inst_param(error_info, "error_info", SerializableErrorInfo),\n coerce_valid_log_level(level),\n check.str_param(user_message, "user_message"),\n check.str_param(run_id, "run_id"),\n check.float_param(timestamp, "timestamp"),\n check.opt_str_param(step_key, "step_key"),\n check.opt_str_param(pipeline_name, "pipeline_name"),\n check.opt_inst_param(dagster_event, "dagster_event", DagsterEvent),\n )\n\n @property\n def is_dagster_event(self) -> bool:\n return bool(self.dagster_event)\n\n @property\n def job_name(self) -> Optional[str]:\n return self.pipeline_name\n\n def get_dagster_event(self) -> DagsterEvent:\n if not isinstance(self.dagster_event, DagsterEvent):\n check.failed(\n "Not a dagster event, check is_dagster_event before calling get_dagster_event",\n )\n\n return self.dagster_event\n\n def to_json(self):\n return serialize_dagster_namedtuple(self)\n\n @staticmethod\n def from_json(json_str):\n return deserialize_json_to_dagster_namedtuple(json_str)\n\n @property\n def dagster_event_type(self):\n return self.dagster_event.event_type if self.dagster_event else None\n\n @property\n def message(self) -> str:\n """\n Return the message from the structured DagsterEvent if present, fallback to user_message\n """\n\n if self.is_dagster_event:\n msg = self.get_dagster_event().message\n if msg is not None:\n return msg\n\n return self.user_message
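# --- Usage sketch (not part of the module above) ---------------------------------
# A hedged example of constructing an EventLogEntry (the class defined above)
# directly and round-tripping it through JSON. The run id is a placeholder, and
# the asserts reflect only the behavior visible in the class definition.
import time

entry = EventLogEntry(
    error_info=None,
    level="INFO",                       # coerced by coerce_valid_log_level
    user_message="hello from user code",
    run_id="some-run-id",
    timestamp=time.time(),
    job_name="my_job",                  # stored on the legacy pipeline_name field
)

assert not entry.is_dagster_event       # no structured DagsterEvent attached
assert entry.message == "hello from user code"
assert entry.job_name == "my_job"

# to_json / from_json delegate to the serdes machinery registered above.
round_tripped = EventLogEntry.from_json(entry.to_json())
assert round_tripped.user_message == "hello from user code"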
\n\n\ndef construct_event_record(logger_message):\n check.inst_param(logger_message, "logger_message", StructuredLoggerMessage)\n\n return EventLogEntry(\n level=logger_message.level,\n user_message=logger_message.meta["orig_message"],\n run_id=logger_message.meta["run_id"],\n timestamp=logger_message.record.created,\n step_key=logger_message.meta.get("step_key"),\n job_name=logger_message.meta.get("pipeline_name"),\n dagster_event=logger_message.meta.get("dagster_event"),\n error_info=None,\n )\n\n\ndef construct_event_logger(event_record_callback):\n """\n Callback receives a stream of event_records. Piggybacks on the logging machinery.\n """\n check.callable_param(event_record_callback, "event_record_callback")\n\n return construct_single_handler_logger(\n "event-logger",\n "debug",\n StructuredLoggerHandler(\n lambda logger_message: event_record_callback(construct_event_record(logger_message))\n ),\n )\n\n\ndef construct_json_event_logger(json_path):\n """Record a stream of event records to json"""\n check.str_param(json_path, "json_path")\n return construct_single_handler_logger(\n "json-event-record-logger",\n "debug",\n JsonEventLoggerHandler(\n json_path,\n lambda record: construct_event_record(\n StructuredLoggerMessage(\n name=record.name,\n message=record.msg,\n level=record.levelno,\n meta=record.dagster_meta,\n record=record,\n )\n ),\n ),\n )\n\n\nregister_serdes_tuple_fallbacks(\n {\n # These were originally distinguished from each other but ended up being empty subclasses\n # of EventLogEntry -- instead of using the subclasses we were relying on\n # EventLogEntry.is_dagster_event to distinguish events that originate in the logging\n # machinery from events that are yielded by user code\n "DagsterEventRecord": EventLogEntry,\n "LogMessageRecord": EventLogEntry,\n # renamed EventRecord -> EventLogEntry\n "EventRecord": EventLogEntry,\n }\n)\n
", "current_page_name": "_modules/dagster/core/events/log", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}, {"link": "../", "title": "dagster.core.events"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.events.log"}, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.events"}, "execution": {"api": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.execution.api

\nimport sys\nfrom contextlib import contextmanager\nfrom typing import Any, Dict, FrozenSet, Iterator, List, Optional, Tuple, Union\n\nfrom dagster import check\nfrom dagster.core.definitions import IPipeline, JobDefinition, PipelineDefinition\nfrom dagster.core.definitions.pipeline_base import InMemoryPipeline\nfrom dagster.core.definitions.pipeline_definition import PipelineSubsetDefinition\nfrom dagster.core.errors import DagsterExecutionInterruptedError, DagsterInvariantViolationError\nfrom dagster.core.events import DagsterEvent, EngineEventData\nfrom dagster.core.execution.context.system import PlanOrchestrationContext\nfrom dagster.core.execution.plan.execute_plan import inner_plan_execution_iterator\nfrom dagster.core.execution.plan.outputs import StepOutputHandle\nfrom dagster.core.execution.plan.plan import ExecutionPlan\nfrom dagster.core.execution.plan.state import KnownExecutionState\nfrom dagster.core.execution.retries import RetryMode\nfrom dagster.core.instance import DagsterInstance, InstanceRef\nfrom dagster.core.selector import parse_step_selection\nfrom dagster.core.storage.pipeline_run import PipelineRun, PipelineRunStatus\nfrom dagster.core.system_config.objects import ResolvedRunConfig\nfrom dagster.core.telemetry import log_repo_stats, telemetry_wrapper\nfrom dagster.core.utils import str_format_set\nfrom dagster.utils import merge_dicts\nfrom dagster.utils.error import serializable_error_info_from_exc_info\nfrom dagster.utils.interrupts import capture_interrupts\n\nfrom .context_creation_pipeline import (\n    ExecutionContextManager,\n    PlanExecutionContextManager,\n    PlanOrchestrationContextManager,\n    orchestration_context_event_generator,\n    scoped_pipeline_context,\n)\nfrom .results import PipelineExecutionResult\n\n## Brief guide to the execution APIs\n# | function name               | operates over      | sync  | supports    | creates new PipelineRun |\n# |                             |                    |       | reexecution | in instance             |\n# | --------------------------- | ------------------ | ----- | ----------- | ----------------------- |\n# | execute_pipeline_iterator   | IPipeline          | async | no          | yes                     |\n# | execute_pipeline            | IPipeline          | sync  | no          | yes                     |\n# | execute_run_iterator        | PipelineRun        | async | (1)         | no                      |\n# | execute_run                 | PipelineRun        | sync  | (1)         | no                      |\n# | execute_plan_iterator       | ExecutionPlan      | async | (2)         | no                      |\n# | execute_plan                | ExecutionPlan      | sync  | (2)         | no                      |\n# | reexecute_pipeline          | IPipeline          | sync  | yes         | yes                     |\n# | reexecute_pipeline_iterator | IPipeline          | async | yes         | yes                     |\n#\n# Notes on reexecution support:\n# (1) The appropriate bits must be set on the PipelineRun passed to this function. 
Specifically,\n#     parent_run_id and root_run_id must be set and consistent, and if a solids_to_execute or\n#     step_keys_to_execute are set they must be consistent with the parent and root runs.\n# (2) As for (1), but the ExecutionPlan passed must also agree in all relevant bits.\n\n\ndef execute_run_iterator(\n    pipeline: IPipeline,\n    pipeline_run: PipelineRun,\n    instance: DagsterInstance,\n    resume_from_failure: bool = False,\n) -> Iterator[DagsterEvent]:\n    check.inst_param(pipeline, "pipeline", IPipeline)\n    check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n    check.inst_param(instance, "instance", DagsterInstance)\n\n    if pipeline_run.status == PipelineRunStatus.CANCELED:\n        # This can happen if the run was force-terminated while it was starting\n        def gen_execute_on_cancel():\n            yield instance.report_engine_event(\n                "Not starting execution since the run was canceled before execution could start",\n                pipeline_run,\n            )\n\n        return gen_execute_on_cancel()\n\n    if not resume_from_failure:\n        if pipeline_run.status not in (PipelineRunStatus.NOT_STARTED, PipelineRunStatus.STARTING):\n            if instance.run_monitoring_enabled:\n                # This can happen if the pod was unexpectedly restarted by the cluster - ignore it since\n                # the run monitoring daemon will also spin up a new pod\n                def gen_ignore_duplicate_run_worker():\n                    yield instance.report_engine_event(\n                        "Ignoring a duplicate run that was started from somewhere other than the run monitor daemon",\n                        pipeline_run,\n                    )\n\n                return gen_ignore_duplicate_run_worker()\n            else:\n                raise Exception(\n                    f"{pipeline_run.pipeline_name} ({pipeline_run.run_id}) started "\n                    f"a new run while the run was already in state {pipeline_run.status}. 
"\n                    "This most frequently happens when the run worker unexpectedly stops and is "\n                    "restarted by the cluster.",\n                )\n    else:\n        check.invariant(\n            pipeline_run.status == PipelineRunStatus.STARTED\n            or pipeline_run.status == PipelineRunStatus.STARTING,\n            desc="Run of {} ({}) in state {}, expected STARTED or STARTING because it's "\n            "resuming from a run worker failure".format(\n                pipeline_run.pipeline_name, pipeline_run.run_id, pipeline_run.status\n            ),\n        )\n\n    if pipeline_run.solids_to_execute:\n        pipeline_def = pipeline.get_definition()\n        if isinstance(pipeline_def, PipelineSubsetDefinition):\n            check.invariant(\n                pipeline_run.solids_to_execute == pipeline.solids_to_execute,\n                "Cannot execute PipelineRun with solids_to_execute {solids_to_execute} that conflicts "\n                "with pipeline subset {pipeline_solids_to_execute}.".format(\n                    pipeline_solids_to_execute=str_format_set(pipeline.solids_to_execute),\n                    solids_to_execute=str_format_set(pipeline_run.solids_to_execute),\n                ),\n            )\n        else:\n            # when `execute_run_iterator` is directly called, the sub pipeline hasn't been created\n            # note that when we receive the solids to execute via PipelineRun, it won't support\n            # solid selection query syntax\n            pipeline = pipeline.subset_for_execution_from_existing_pipeline(\n                frozenset(pipeline_run.solids_to_execute)\n            )\n\n    execution_plan = _get_execution_plan_from_run(pipeline, pipeline_run, instance)\n\n    return iter(\n        ExecuteRunWithPlanIterable(\n            execution_plan=execution_plan,\n            iterator=pipeline_execution_iterator,\n            execution_context_manager=PlanOrchestrationContextManager(\n                context_event_generator=orchestration_context_event_generator,\n                pipeline=pipeline,\n                execution_plan=execution_plan,\n                pipeline_run=pipeline_run,\n                instance=instance,\n                run_config=pipeline_run.run_config,\n                raise_on_error=False,\n                executor_defs=None,\n                output_capture=None,\n                resume_from_failure=resume_from_failure,\n            ),\n        )\n    )\n\n\ndef execute_run(\n    pipeline: IPipeline,\n    pipeline_run: PipelineRun,\n    instance: DagsterInstance,\n    raise_on_error: bool = False,\n) -> PipelineExecutionResult:\n    """Executes an existing pipeline run synchronously.\n\n    Synchronous version of execute_run_iterator.\n\n    Args:\n        pipeline (IPipeline): The pipeline to execute.\n        pipeline_run (PipelineRun): The run to execute\n        instance (DagsterInstance): The instance in which the run has been created.\n        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n            Defaults to ``False``.\n\n    Returns:\n        PipelineExecutionResult: The result of the execution.\n    """\n    if isinstance(pipeline, PipelineDefinition):\n        if isinstance(pipeline, JobDefinition):\n            error = "execute_run requires a reconstructable job but received job definition directly instead."\n        else:\n            error = (\n                "execute_run requires a reconstructable pipeline but received pipeline definition "\n       
         "directly instead."\n            )\n        raise DagsterInvariantViolationError(\n            f"{error} To support hand-off to other processes please wrap your definition in "\n            "a call to reconstructable(). Learn more about reconstructable here: https://docs.dagster.io/_apidocs/execution#dagster.reconstructable"\n        )\n\n    check.inst_param(pipeline, "pipeline", IPipeline)\n    check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n    check.inst_param(instance, "instance", DagsterInstance)\n\n    if pipeline_run.status == PipelineRunStatus.CANCELED:\n        message = "Not starting execution since the run was canceled before execution could start"\n        instance.report_engine_event(\n            message,\n            pipeline_run,\n        )\n        raise DagsterInvariantViolationError(message)\n\n    check.invariant(\n        pipeline_run.status == PipelineRunStatus.NOT_STARTED\n        or pipeline_run.status == PipelineRunStatus.STARTING,\n        desc="Run {} ({}) in state {}, expected NOT_STARTED or STARTING".format(\n            pipeline_run.pipeline_name, pipeline_run.run_id, pipeline_run.status\n        ),\n    )\n    pipeline_def = pipeline.get_definition()\n    if pipeline_run.solids_to_execute:\n        if isinstance(pipeline_def, PipelineSubsetDefinition):\n            check.invariant(\n                pipeline_run.solids_to_execute == pipeline.solids_to_execute,\n                "Cannot execute PipelineRun with solids_to_execute {solids_to_execute} that "\n                "conflicts with pipeline subset {pipeline_solids_to_execute}.".format(\n                    pipeline_solids_to_execute=str_format_set(pipeline.solids_to_execute),\n                    solids_to_execute=str_format_set(pipeline_run.solids_to_execute),\n                ),\n            )\n        else:\n            # when `execute_run` is directly called, the sub pipeline hasn't been created\n            # note that when we receive the solids to execute via PipelineRun, it won't support\n            # solid selection query syntax\n            pipeline = pipeline.subset_for_execution_from_existing_pipeline(\n                frozenset(pipeline_run.solids_to_execute)\n            )\n\n    execution_plan = _get_execution_plan_from_run(pipeline, pipeline_run, instance)\n\n    output_capture: Optional[Dict[StepOutputHandle, Any]] = {}\n\n    _execute_run_iterable = ExecuteRunWithPlanIterable(\n        execution_plan=execution_plan,\n        iterator=pipeline_execution_iterator,\n        execution_context_manager=PlanOrchestrationContextManager(\n            context_event_generator=orchestration_context_event_generator,\n            pipeline=pipeline,\n            execution_plan=execution_plan,\n            pipeline_run=pipeline_run,\n            instance=instance,\n            run_config=pipeline_run.run_config,\n            raise_on_error=raise_on_error,\n            executor_defs=None,\n            output_capture=output_capture,\n        ),\n    )\n    event_list = list(_execute_run_iterable)\n\n    return PipelineExecutionResult(\n        pipeline.get_definition(),\n        pipeline_run.run_id,\n        event_list,\n        lambda: scoped_pipeline_context(\n            execution_plan,\n            pipeline,\n            pipeline_run.run_config,\n            pipeline_run,\n            instance,\n        ),\n        output_capture=output_capture,\n    )\n\n\n
[docs]def execute_pipeline_iterator(\n pipeline: Union[PipelineDefinition, IPipeline],\n run_config: Optional[dict] = None,\n mode: Optional[str] = None,\n preset: Optional[str] = None,\n tags: Optional[Dict[str, Any]] = None,\n solid_selection: Optional[List[str]] = None,\n instance: Optional[DagsterInstance] = None,\n) -> Iterator[DagsterEvent]:\n """Execute a pipeline iteratively.\n\n Rather than package up the result of running a pipeline into a single object, like\n :py:func:`execute_pipeline`, this function yields the stream of events resulting from pipeline\n execution.\n\n This is intended to allow the caller to handle these events on a streaming basis in whatever\n way is appropriate.\n\n Parameters:\n pipeline (Union[IPipeline, PipelineDefinition]): The pipeline to execute.\n run_config (Optional[dict]): The configuration that parametrizes this run,\n as a dict.\n mode (Optional[str]): The name of the pipeline mode to use. You may not set both ``mode``\n and ``preset``.\n preset (Optional[str]): The name of the pipeline preset to use. You may not set both\n ``mode`` and ``preset``.\n tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to pipeline\n logs.\n solid_selection (Optional[List[str]]): A list of solid selection queries (including single\n solid names) to execute. For example:\n\n - ``['some_solid']``: selects ``some_solid`` itself.\n - ``['*some_solid']``: select ``some_solid`` and all its ancestors (upstream dependencies).\n - ``['*some_solid+++']``: select ``some_solid``, all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n - ``['*some_solid', 'other_solid_a', 'other_solid_b+']``: select ``some_solid`` and all its\n ancestors, ``other_solid_a`` itself, and ``other_solid_b`` and its direct child solids.\n instance (Optional[DagsterInstance]): The instance to execute against. If this is ``None``,\n an ephemeral instance will be used, and no artifacts will be persisted from the run.\n\n Returns:\n Iterator[DagsterEvent]: The stream of events resulting from pipeline execution.\n """\n\n with ephemeral_instance_if_missing(instance) as execute_instance:\n (\n pipeline,\n run_config,\n mode,\n tags,\n solids_to_execute,\n solid_selection,\n ) = _check_execute_pipeline_args(\n pipeline=pipeline,\n run_config=run_config,\n mode=mode,\n preset=preset,\n tags=tags,\n solid_selection=solid_selection,\n )\n\n pipeline_run = execute_instance.create_run_for_pipeline(\n pipeline_def=pipeline.get_definition(),\n run_config=run_config,\n mode=mode,\n solid_selection=solid_selection,\n solids_to_execute=solids_to_execute,\n tags=tags,\n )\n\n return execute_run_iterator(pipeline, pipeline_run, execute_instance)
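# --- Usage sketch (not part of the module above) ---------------------------------
# A hedged example of consuming the event stream yielded by
# execute_pipeline_iterator, following its docstring above. `my_pipeline` is an
# assumed @pipeline definition supplied by the caller.
def print_event_stream(my_pipeline):
    for event in execute_pipeline_iterator(
        my_pipeline,
        run_config={},
        tags={"source": "streaming-example"},
    ):
        # Each yielded item is a DagsterEvent; event_type_value is its string type.
        print(f"{event.event_type_value}: {event.message}")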
\n\n\n@contextmanager\ndef ephemeral_instance_if_missing(\n instance: Optional[DagsterInstance],\n) -> Iterator[DagsterInstance]:\n if instance:\n yield instance\n else:\n with DagsterInstance.ephemeral() as ephemeral_instance:\n yield ephemeral_instance\n\n\n
[docs]def execute_pipeline(\n pipeline: Union[PipelineDefinition, IPipeline],\n run_config: Optional[dict] = None,\n mode: Optional[str] = None,\n preset: Optional[str] = None,\n tags: Optional[Dict[str, Any]] = None,\n solid_selection: Optional[List[str]] = None,\n instance: Optional[DagsterInstance] = None,\n raise_on_error: bool = True,\n) -> PipelineExecutionResult:\n """Execute a pipeline synchronously.\n\n Users will typically call this API when testing pipeline execution, or running standalone\n scripts.\n\n Parameters:\n pipeline (Union[IPipeline, PipelineDefinition]): The pipeline to execute.\n run_config (Optional[dict]): The configuration that parametrizes this run,\n as a dict.\n mode (Optional[str]): The name of the pipeline mode to use. You may not set both ``mode``\n and ``preset``.\n preset (Optional[str]): The name of the pipeline preset to use. You may not set both\n ``mode`` and ``preset``.\n tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to pipeline\n logs.\n instance (Optional[DagsterInstance]): The instance to execute against. If this is ``None``,\n an ephemeral instance will be used, and no artifacts will be persisted from the run.\n raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n Defaults to ``True``, since this is the most useful behavior in test.\n solid_selection (Optional[List[str]]): A list of solid selection queries (including single\n solid names) to execute. For example:\n\n - ``['some_solid']``: selects ``some_solid`` itself.\n - ``['*some_solid']``: select ``some_solid`` and all its ancestors (upstream dependencies).\n - ``['*some_solid+++']``: select ``some_solid``, all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n - ``['*some_solid', 'other_solid_a', 'other_solid_b+']``: select ``some_solid`` and all its\n ancestors, ``other_solid_a`` itself, and ``other_solid_b`` and its direct child solids.\n\n Returns:\n :py:class:`PipelineExecutionResult`: The result of pipeline execution.\n\n For the asynchronous version, see :py:func:`execute_pipeline_iterator`.\n """\n\n with ephemeral_instance_if_missing(instance) as execute_instance:\n return _logged_execute_pipeline(\n pipeline,\n instance=execute_instance,\n run_config=run_config,\n mode=mode,\n preset=preset,\n tags=tags,\n solid_selection=solid_selection,\n raise_on_error=raise_on_error,\n )
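# --- Usage sketch (not part of the module above) ---------------------------------
# A hedged example of a synchronous execute_pipeline call with solid config and a
# solid selection, mirroring the docstring above. The pipeline, solid, and config
# names are assumptions for illustration.
from dagster import pipeline, solid


@solid(config_schema={"greeting": str})
def say_hello(context):
    return context.solid_config["greeting"]


@pipeline
def hello_pipeline():
    say_hello()


def run_hello():
    result = execute_pipeline(
        hello_pipeline,
        run_config={"solids": {"say_hello": {"config": {"greeting": "hi"}}}},
        solid_selection=["say_hello"],
    )
    assert result.success
    return result.result_for_solid("say_hello").output_value()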
\n\n\n@telemetry_wrapper\ndef _logged_execute_pipeline(\n pipeline: Union[IPipeline, PipelineDefinition],\n instance: DagsterInstance,\n run_config: Optional[dict] = None,\n mode: Optional[str] = None,\n preset: Optional[str] = None,\n tags: Optional[Dict[str, Any]] = None,\n solid_selection: Optional[List[str]] = None,\n raise_on_error: bool = True,\n) -> PipelineExecutionResult:\n check.inst_param(instance, "instance", DagsterInstance)\n (\n pipeline,\n run_config,\n mode,\n tags,\n solids_to_execute,\n solid_selection,\n ) = _check_execute_pipeline_args(\n pipeline=pipeline,\n run_config=run_config,\n mode=mode,\n preset=preset,\n tags=tags,\n solid_selection=solid_selection,\n )\n\n log_repo_stats(instance=instance, pipeline=pipeline, source="execute_pipeline")\n\n pipeline_run = instance.create_run_for_pipeline(\n pipeline_def=pipeline.get_definition(),\n run_config=run_config,\n mode=mode,\n solid_selection=solid_selection,\n solids_to_execute=solids_to_execute,\n tags=tags,\n )\n\n return execute_run(\n pipeline,\n pipeline_run,\n instance,\n raise_on_error=raise_on_error,\n )\n\n\n
[docs]def reexecute_pipeline(\n pipeline: Union[IPipeline, PipelineDefinition],\n parent_run_id: str,\n run_config: Optional[dict] = None,\n step_selection: Optional[List[str]] = None,\n mode: Optional[str] = None,\n preset: Optional[str] = None,\n tags: Optional[Dict[str, Any]] = None,\n instance: Optional[DagsterInstance] = None,\n raise_on_error: bool = True,\n) -> PipelineExecutionResult:\n """Reexecute an existing pipeline run.\n\n Users will typically call this API when testing pipeline reexecution, or running standalone\n scripts.\n\n Parameters:\n pipeline (Union[IPipeline, PipelineDefinition]): The pipeline to execute.\n parent_run_id (str): The id of the previous run to reexecute. The run must exist in the\n instance.\n run_config (Optional[dict]): The configuration that parametrizes this run,\n as a dict.\n solid_selection (Optional[List[str]]): A list of solid selection queries (including single\n solid names) to execute. For example:\n\n - ``['some_solid']``: selects ``some_solid`` itself.\n - ``['*some_solid']``: select ``some_solid`` and all its ancestors (upstream dependencies).\n - ``['*some_solid+++']``: select ``some_solid``, all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n - ``['*some_solid', 'other_solid_a', 'other_solid_b+']``: select ``some_solid`` and all its\n ancestors, ``other_solid_a`` itself, and ``other_solid_b`` and its direct child solids.\n\n mode (Optional[str]): The name of the pipeline mode to use. You may not set both ``mode``\n and ``preset``.\n preset (Optional[str]): The name of the pipeline preset to use. You may not set both\n ``mode`` and ``preset``.\n tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to pipeline\n logs.\n instance (Optional[DagsterInstance]): The instance to execute against. 
If this is ``None``,\n an ephemeral instance will be used, and no artifacts will be persisted from the run.\n raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n Defaults to ``True``, since this is the most useful behavior in test.\n\n Returns:\n :py:class:`PipelineExecutionResult`: The result of pipeline execution.\n\n For the asynchronous version, see :py:func:`reexecute_pipeline_iterator`.\n """\n\n check.opt_list_param(step_selection, "step_selection", of_type=str)\n\n check.str_param(parent_run_id, "parent_run_id")\n\n with ephemeral_instance_if_missing(instance) as execute_instance:\n (pipeline, run_config, mode, tags, _, _) = _check_execute_pipeline_args(\n pipeline=pipeline,\n run_config=run_config,\n mode=mode,\n preset=preset,\n tags=tags,\n )\n\n parent_pipeline_run = execute_instance.get_run_by_id(parent_run_id)\n if parent_pipeline_run is None:\n check.failed(\n "No parent run with id {parent_run_id} found in instance.".format(\n parent_run_id=parent_run_id\n ),\n )\n\n execution_plan: Optional[ExecutionPlan] = None\n # resolve step selection DSL queries using parent execution information\n if step_selection:\n execution_plan = _resolve_reexecute_step_selection(\n execute_instance,\n pipeline,\n mode,\n run_config,\n parent_pipeline_run,\n step_selection,\n )\n\n pipeline_run = execute_instance.create_run_for_pipeline(\n pipeline_def=pipeline.get_definition(),\n execution_plan=execution_plan,\n run_config=run_config,\n mode=mode,\n tags=tags,\n solid_selection=parent_pipeline_run.solid_selection,\n solids_to_execute=parent_pipeline_run.solids_to_execute,\n root_run_id=parent_pipeline_run.root_run_id or parent_pipeline_run.run_id,\n parent_run_id=parent_pipeline_run.run_id,\n )\n\n return execute_run(\n pipeline,\n pipeline_run,\n execute_instance,\n raise_on_error=raise_on_error,\n )
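# --- Usage sketch (not part of the module above) ---------------------------------
# A hedged example of re-running a subset of steps from a finished run with
# reexecute_pipeline. `hello_pipeline` and the step key "say_hello" are assumptions;
# both runs are kept in the same (ephemeral) instance so the parent run can be found.
from dagster.core.instance import DagsterInstance


def rerun_subset(hello_pipeline):
    with DagsterInstance.ephemeral() as instance:
        first_result = execute_pipeline(hello_pipeline, instance=instance)
        second_result = reexecute_pipeline(
            hello_pipeline,
            parent_run_id=first_result.run_id,
            step_selection=["say_hello"],
            instance=instance,
        )
        return second_result.success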
\n\n\n
[docs]def reexecute_pipeline_iterator(\n pipeline: Union[IPipeline, PipelineDefinition],\n parent_run_id: str,\n run_config: Optional[dict] = None,\n step_selection: Optional[List[str]] = None,\n mode: Optional[str] = None,\n preset: Optional[str] = None,\n tags: Optional[Dict[str, Any]] = None,\n instance: Optional[DagsterInstance] = None,\n) -> Iterator[DagsterEvent]:\n """Reexecute a pipeline iteratively.\n\n Rather than package up the result of running a pipeline into a single object, like\n :py:func:`reexecute_pipeline`, this function yields the stream of events resulting from pipeline\n reexecution.\n\n This is intended to allow the caller to handle these events on a streaming basis in whatever\n way is appropriate.\n\n Parameters:\n pipeline (Union[IPipeline, PipelineDefinition]): The pipeline to execute.\n parent_run_id (str): The id of the previous run to reexecute. The run must exist in the\n instance.\n run_config (Optional[dict]): The configuration that parametrizes this run,\n as a dict.\n solid_selection (Optional[List[str]]): A list of solid selection queries (including single\n solid names) to execute. For example:\n\n - ``['some_solid']``: selects ``some_solid`` itself.\n - ``['*some_solid']``: select ``some_solid`` and all its ancestors (upstream dependencies).\n - ``['*some_solid+++']``: select ``some_solid``, all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n - ``['*some_solid', 'other_solid_a', 'other_solid_b+']``: select ``some_solid`` and all its\n ancestors, ``other_solid_a`` itself, and ``other_solid_b`` and its direct child solids.\n\n mode (Optional[str]): The name of the pipeline mode to use. You may not set both ``mode``\n and ``preset``.\n preset (Optional[str]): The name of the pipeline preset to use. You may not set both\n ``mode`` and ``preset``.\n tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to pipeline\n logs.\n instance (Optional[DagsterInstance]): The instance to execute against. 
If this is ``None``,\n an ephemeral instance will be used, and no artifacts will be persisted from the run.\n\n Returns:\n Iterator[DagsterEvent]: The stream of events resulting from pipeline reexecution.\n """\n\n check.opt_list_param(step_selection, "step_selection", of_type=str)\n\n check.str_param(parent_run_id, "parent_run_id")\n\n with ephemeral_instance_if_missing(instance) as execute_instance:\n (pipeline, run_config, mode, tags, _, _) = _check_execute_pipeline_args(\n pipeline=pipeline,\n run_config=run_config,\n mode=mode,\n preset=preset,\n tags=tags,\n solid_selection=None,\n )\n parent_pipeline_run = execute_instance.get_run_by_id(parent_run_id)\n if parent_pipeline_run is None:\n check.failed(\n "No parent run with id {parent_run_id} found in instance.".format(\n parent_run_id=parent_run_id\n ),\n )\n\n execution_plan: Optional[ExecutionPlan] = None\n # resolve step selection DSL queries using parent execution information\n if step_selection:\n execution_plan = _resolve_reexecute_step_selection(\n execute_instance,\n pipeline,\n mode,\n run_config,\n parent_pipeline_run,\n step_selection,\n )\n\n pipeline_run = execute_instance.create_run_for_pipeline(\n pipeline_def=pipeline.get_definition(),\n run_config=run_config,\n execution_plan=execution_plan,\n mode=mode,\n tags=tags,\n solid_selection=parent_pipeline_run.solid_selection,\n solids_to_execute=parent_pipeline_run.solids_to_execute,\n root_run_id=parent_pipeline_run.root_run_id or parent_pipeline_run.run_id,\n parent_run_id=parent_pipeline_run.run_id,\n )\n\n return execute_run_iterator(pipeline, pipeline_run, execute_instance)
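# --- Usage sketch (not part of the module above) ---------------------------------
# A hedged example of the streaming counterpart to the call above: it yields
# DagsterEvents for the re-executed steps rather than returning a result object.
# `hello_pipeline`, `parent_run_id`, `instance`, and the step key are assumptions.
def stream_reexecution(hello_pipeline, parent_run_id, instance):
    for event in reexecute_pipeline_iterator(
        hello_pipeline,
        parent_run_id=parent_run_id,
        step_selection=["say_hello"],
        instance=instance,
    ):
        print(event.event_type_value)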
\n\n\ndef execute_plan_iterator(\n execution_plan: ExecutionPlan,\n pipeline: IPipeline,\n pipeline_run: PipelineRun,\n instance: DagsterInstance,\n retry_mode: Optional[RetryMode] = None,\n run_config: Optional[dict] = None,\n) -> Iterator[DagsterEvent]:\n check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n check.inst_param(pipeline, "pipeline", IPipeline)\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n check.inst_param(instance, "instance", DagsterInstance)\n retry_mode = check.opt_inst_param(retry_mode, "retry_mode", RetryMode, RetryMode.DISABLED)\n run_config = check.opt_dict_param(run_config, "run_config")\n\n return iter(\n ExecuteRunWithPlanIterable(\n execution_plan=execution_plan,\n iterator=inner_plan_execution_iterator,\n execution_context_manager=PlanExecutionContextManager(\n pipeline=pipeline,\n retry_mode=retry_mode,\n execution_plan=execution_plan,\n run_config=run_config,\n pipeline_run=pipeline_run,\n instance=instance,\n ),\n )\n )\n\n\ndef execute_plan(\n execution_plan: ExecutionPlan,\n pipeline: IPipeline,\n instance: DagsterInstance,\n pipeline_run: PipelineRun,\n run_config: Optional[Dict] = None,\n retry_mode: Optional[RetryMode] = None,\n) -> List[DagsterEvent]:\n """This is the entry point of dagster-graphql executions. For the dagster CLI entry point, see\n execute_pipeline() above.\n """\n check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n check.inst_param(pipeline, "pipeline", IPipeline)\n check.inst_param(instance, "instance", DagsterInstance)\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n run_config = check.opt_dict_param(run_config, "run_config")\n check.opt_inst_param(retry_mode, "retry_mode", RetryMode)\n\n return list(\n execute_plan_iterator(\n execution_plan=execution_plan,\n pipeline=pipeline,\n run_config=run_config,\n pipeline_run=pipeline_run,\n instance=instance,\n retry_mode=retry_mode,\n )\n )\n\n\ndef _check_pipeline(pipeline: Union[PipelineDefinition, IPipeline]) -> IPipeline:\n # backcompat\n if isinstance(pipeline, PipelineDefinition):\n pipeline = InMemoryPipeline(pipeline)\n\n check.inst_param(pipeline, "pipeline", IPipeline)\n return pipeline\n\n\ndef _get_execution_plan_from_run(\n pipeline: IPipeline, pipeline_run: PipelineRun, instance: DagsterInstance\n) -> ExecutionPlan:\n if (\n # need to rebuild execution plan so it matches the subsetted graph\n pipeline.solids_to_execute is None\n and pipeline_run.execution_plan_snapshot_id\n ):\n execution_plan_snapshot = instance.get_execution_plan_snapshot(\n pipeline_run.execution_plan_snapshot_id\n )\n if execution_plan_snapshot.can_reconstruct_plan:\n return ExecutionPlan.rebuild_from_snapshot(\n pipeline_run.pipeline_name,\n execution_plan_snapshot,\n )\n return create_execution_plan(\n pipeline,\n run_config=pipeline_run.run_config,\n mode=pipeline_run.mode,\n step_keys_to_execute=pipeline_run.step_keys_to_execute,\n instance_ref=instance.get_ref() if instance.is_persistent else None,\n )\n\n\ndef create_execution_plan(\n pipeline: Union[IPipeline, PipelineDefinition],\n run_config: Optional[dict] = None,\n mode: Optional[str] = None,\n step_keys_to_execute: Optional[List[str]] = None,\n known_state: Optional[KnownExecutionState] = None,\n instance_ref: Optional[InstanceRef] = None,\n tags: Optional[Dict[str, str]] = None,\n) -> ExecutionPlan:\n pipeline = _check_pipeline(pipeline)\n pipeline_def = pipeline.get_definition()\n check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n run_config = 
check.opt_dict_param(run_config, "run_config", key_type=str)\n mode = check.opt_str_param(mode, "mode", default=pipeline_def.get_default_mode_name())\n check.opt_nullable_list_param(step_keys_to_execute, "step_keys_to_execute", of_type=str)\n check.opt_inst_param(instance_ref, "instance_ref", InstanceRef)\n tags = check.opt_dict_param(tags, "tags", key_type=str, value_type=str)\n\n resolved_run_config = ResolvedRunConfig.build(pipeline_def, run_config, mode=mode)\n\n return ExecutionPlan.build(\n pipeline,\n resolved_run_config,\n step_keys_to_execute=step_keys_to_execute,\n known_state=known_state,\n instance_ref=instance_ref,\n tags=tags,\n )\n\n\ndef pipeline_execution_iterator(\n pipeline_context: PlanOrchestrationContext, execution_plan: ExecutionPlan\n) -> Iterator[DagsterEvent]:\n """A complete execution of a pipeline. Yields pipeline start, success,\n and failure events.\n\n Args:\n pipeline_context (PlanOrchestrationContext):\n execution_plan (ExecutionPlan):\n """\n\n # TODO: restart event?\n if not pipeline_context.resume_from_failure:\n yield DagsterEvent.pipeline_start(pipeline_context)\n\n pipeline_exception_info = None\n pipeline_canceled_info = None\n failed_steps = []\n generator_closed = False\n try:\n for event in pipeline_context.executor.execute(pipeline_context, execution_plan):\n if event.is_step_failure:\n failed_steps.append(event.step_key)\n\n yield event\n except GeneratorExit:\n # Shouldn't happen, but avoid runtime-exception in case this generator gets GC-ed\n # (see https://amir.rachum.com/blog/2017/03/03/generator-cleanup/).\n generator_closed = True\n pipeline_exception_info = serializable_error_info_from_exc_info(sys.exc_info())\n if pipeline_context.raise_on_error:\n raise\n except (KeyboardInterrupt, DagsterExecutionInterruptedError):\n pipeline_canceled_info = serializable_error_info_from_exc_info(sys.exc_info())\n if pipeline_context.raise_on_error:\n raise\n except BaseException:\n pipeline_exception_info = serializable_error_info_from_exc_info(sys.exc_info())\n if pipeline_context.raise_on_error:\n raise # finally block will run before this is re-raised\n finally:\n if pipeline_canceled_info:\n reloaded_run = pipeline_context.instance.get_run_by_id(pipeline_context.run_id)\n if reloaded_run and reloaded_run.status == PipelineRunStatus.CANCELING:\n event = DagsterEvent.pipeline_canceled(pipeline_context, pipeline_canceled_info)\n elif reloaded_run and reloaded_run.status == PipelineRunStatus.CANCELED:\n # This happens if the run was force-terminated but was still able to send\n # a cancellation request\n event = DagsterEvent.engine_event(\n pipeline_context,\n "Computational resources were cleaned up after the run was forcibly marked as canceled.",\n EngineEventData(),\n )\n elif pipeline_context.instance.run_will_resume(pipeline_context.run_id):\n event = DagsterEvent.engine_event(\n pipeline_context,\n "Execution was interrupted unexpectedly. "\n "No user initiated termination request was found, not treating as failure because run will be resumed.",\n EngineEventData(),\n )\n else:\n event = DagsterEvent.pipeline_failure(\n pipeline_context,\n "Execution was interrupted unexpectedly. 
"\n "No user initiated termination request was found, treating as failure.",\n pipeline_canceled_info,\n )\n elif pipeline_exception_info:\n event = DagsterEvent.pipeline_failure(\n pipeline_context,\n "An exception was thrown during execution.",\n pipeline_exception_info,\n )\n elif failed_steps:\n event = DagsterEvent.pipeline_failure(\n pipeline_context,\n "Steps failed: {}.".format(failed_steps),\n )\n else:\n event = DagsterEvent.pipeline_success(pipeline_context)\n if not generator_closed:\n yield event\n\n\nclass ExecuteRunWithPlanIterable:\n """Utility class to consolidate execution logic.\n\n This is a class and not a function because, e.g., in constructing a `scoped_pipeline_context`\n for `PipelineExecutionResult`, we need to pull out the `pipeline_context` after we're done\n yielding events. This broadly follows a pattern we make use of in other places,\n cf. `dagster.utils.EventGenerationManager`.\n """\n\n def __init__(self, execution_plan, iterator, execution_context_manager):\n self.execution_plan = check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n self.iterator = check.callable_param(iterator, "iterator")\n self.execution_context_manager = check.inst_param(\n execution_context_manager, "execution_context_manager", ExecutionContextManager\n )\n\n self.pipeline_context = None\n\n def __iter__(self):\n # Since interrupts can't be raised at arbitrary points safely, delay them until designated\n # checkpoints during the execution.\n # To be maximally certain that interrupts are always caught during an execution process,\n # you can safely add an additional `with capture_interrupts()` at the very beginning of the\n # process that performs the execution.\n with capture_interrupts():\n yield from self.execution_context_manager.prepare_context()\n self.pipeline_context = self.execution_context_manager.get_context()\n generator_closed = False\n try:\n if self.pipeline_context: # False if we had a pipeline init failure\n yield from self.iterator(\n execution_plan=self.execution_plan,\n pipeline_context=self.pipeline_context,\n )\n except GeneratorExit:\n # Shouldn't happen, but avoid runtime-exception in case this generator gets GC-ed\n # (see https://amir.rachum.com/blog/2017/03/03/generator-cleanup/).\n generator_closed = True\n raise\n finally:\n for event in self.execution_context_manager.shutdown_context():\n if not generator_closed:\n yield event\n\n\ndef _check_execute_pipeline_args(\n pipeline: Union[PipelineDefinition, IPipeline],\n run_config: Optional[dict],\n mode: Optional[str],\n preset: Optional[str],\n tags: Optional[Dict[str, Any]],\n solid_selection: Optional[List[str]] = None,\n) -> Tuple[\n IPipeline,\n Optional[dict],\n Optional[str],\n Dict[str, Any],\n Optional[FrozenSet[str]],\n Optional[List[str]],\n]:\n pipeline = _check_pipeline(pipeline)\n pipeline_def = pipeline.get_definition()\n check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n\n run_config = check.opt_dict_param(run_config, "run_config")\n check.opt_str_param(mode, "mode")\n check.opt_str_param(preset, "preset")\n check.invariant(\n not (mode is not None and preset is not None),\n "You may set only one of `mode` (got {mode}) or `preset` (got {preset}).".format(\n mode=mode, preset=preset\n ),\n )\n\n tags = check.opt_dict_param(tags, "tags", key_type=str)\n check.opt_list_param(solid_selection, "solid_selection", of_type=str)\n\n if preset is not None:\n pipeline_preset = pipeline_def.get_preset(preset)\n\n if pipeline_preset.run_config is not None:\n 
check.invariant(\n (not run_config) or (pipeline_preset.run_config == run_config),\n "The environment set in preset '{preset}' does not agree with the environment "\n "passed in the `run_config` argument.".format(preset=preset),\n )\n\n run_config = pipeline_preset.run_config\n\n # load solid_selection from preset\n if pipeline_preset.solid_selection is not None:\n check.invariant(\n solid_selection is None or solid_selection == pipeline_preset.solid_selection,\n "The solid_selection set in preset '{preset}', {preset_subset}, does not agree with "\n "the `solid_selection` argument: {solid_selection}".format(\n preset=preset,\n preset_subset=pipeline_preset.solid_selection,\n solid_selection=solid_selection,\n ),\n )\n solid_selection = pipeline_preset.solid_selection\n\n check.invariant(\n mode is None or mode == pipeline_preset.mode,\n "Mode {mode} does not agree with the mode set in preset '{preset}': "\n "('{preset_mode}')".format(preset=preset, preset_mode=pipeline_preset.mode, mode=mode),\n )\n\n mode = pipeline_preset.mode\n\n tags = merge_dicts(pipeline_preset.tags, tags)\n\n if mode is not None:\n if not pipeline_def.has_mode_definition(mode):\n raise DagsterInvariantViolationError(\n (\n "You have attempted to execute pipeline {name} with mode {mode}. "\n "Available modes: {modes}"\n ).format(\n name=pipeline_def.name,\n mode=mode,\n modes=pipeline_def.available_modes,\n )\n )\n else:\n if pipeline_def.is_multi_mode:\n raise DagsterInvariantViolationError(\n (\n "Pipeline {name} has multiple modes (Available modes: {modes}) and you have "\n "attempted to execute it without specifying a mode. Set "\n "mode property on the PipelineRun object."\n ).format(name=pipeline_def.name, modes=pipeline_def.available_modes)\n )\n mode = pipeline_def.get_default_mode_name()\n\n tags = merge_dicts(pipeline_def.tags, tags)\n\n # generate pipeline subset from the given solid_selection\n if solid_selection:\n pipeline = pipeline.subset_for_execution(solid_selection)\n\n return (\n pipeline,\n run_config,\n mode,\n tags,\n pipeline.solids_to_execute,\n solid_selection,\n )\n\n\ndef _resolve_reexecute_step_selection(\n instance: DagsterInstance,\n pipeline: IPipeline,\n mode: Optional[str],\n run_config: Optional[dict],\n parent_pipeline_run: PipelineRun,\n step_selection: List[str],\n) -> ExecutionPlan:\n if parent_pipeline_run.solid_selection:\n pipeline = pipeline.subset_for_execution(parent_pipeline_run.solid_selection)\n\n parent_logs = instance.all_logs(parent_pipeline_run.run_id)\n parent_plan = create_execution_plan(\n pipeline,\n parent_pipeline_run.run_config,\n mode,\n known_state=KnownExecutionState.derive_from_logs(parent_logs),\n )\n step_keys_to_execute = parse_step_selection(parent_plan.get_all_step_deps(), step_selection)\n execution_plan = create_execution_plan(\n pipeline,\n run_config,\n mode,\n step_keys_to_execute=list(step_keys_to_execute),\n known_state=KnownExecutionState.for_reexecution(parent_logs, step_keys_to_execute),\n tags=parent_pipeline_run.tags,\n )\n return execution_plan\n
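# --- Usage sketch (not part of the module above) ---------------------------------
# A hedged example of building an ExecutionPlan directly with create_execution_plan
# (defined above) and listing its step keys. `hello_pipeline` is an assumed
# @pipeline definition; get_all_step_deps is the same accessor used above in
# _resolve_reexecute_step_selection, assumed here to be keyed by step key.
def plan_step_keys(hello_pipeline):
    plan = create_execution_plan(hello_pipeline, run_config={}, mode="default")
    return list(plan.get_all_step_deps().keys())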
", "current_page_name": "_modules/dagster/core/execution/api", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.execution.api"}, "build_resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.execution.build_resources

\nfrom contextlib import contextmanager\nfrom typing import Any, Dict, Generator, Optional, cast\n\nfrom dagster import check\nfrom dagster.config.validate import process_config\nfrom dagster.core.definitions.resource_definition import (\n    ResourceDefinition,\n    Resources,\n    ScopedResourcesBuilder,\n)\nfrom dagster.core.definitions.run_config import define_resource_dictionary_cls\nfrom dagster.core.errors import DagsterInvalidConfigError\nfrom dagster.core.execution.resources_init import resource_initialization_manager\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.log_manager import DagsterLogManager\nfrom dagster.core.storage.io_manager import IOManager, IOManagerDefinition\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.core.system_config.objects import ResourceConfig, config_map_resources\n\nfrom .api import ephemeral_instance_if_missing\nfrom .context_creation_pipeline import initialize_console_manager\n\n\ndef _get_mapped_resource_config(\n    resource_defs: Dict[str, ResourceDefinition], resource_config: Dict[str, Any]\n) -> Dict[str, ResourceConfig]:\n    resource_config_schema = define_resource_dictionary_cls(\n        resource_defs, set(resource_defs.keys())\n    )\n    config_evr = process_config(resource_config_schema, resource_config)\n    if not config_evr.success:\n        raise DagsterInvalidConfigError(\n            "Error in config for resources ",\n            config_evr.errors,\n            resource_config,\n        )\n    config_value = cast(Dict[str, Any], config_evr.value)\n    return config_map_resources(resource_defs, config_value)\n\n\n
[docs]@contextmanager\ndef build_resources(\n resources: Dict[str, Any],\n instance: Optional[DagsterInstance] = None,\n resource_config: Optional[Dict[str, Any]] = None,\n pipeline_run: Optional[PipelineRun] = None,\n log_manager: Optional[DagsterLogManager] = None,\n) -> Generator[Resources, None, None]:\n """Context manager that yields resources using provided resource definitions and run config.\n\n This API allows for using resources in an independent context. Resources will be initialized\n with the provided run config, and optionally, pipeline_run. The resulting resources will be\n yielded on a dictionary keyed identically to that provided for `resource_defs`. Upon exiting the\n context, resources will also be torn down safely.\n\n Args:\n resources (Dict[str, Any]): Resource instances or definitions to build. All\n required resource dependencies to a given resource must be contained within this\n dictionary, or the resource build will fail.\n instance (Optional[DagsterInstance]): The dagster instance configured to instantiate\n resources on.\n resource_config (Optional[Dict[str, Any]]): A dict representing the config to be\n provided to each resource during initialization and teardown.\n pipeline_run (Optional[PipelineRun]): The pipeline run to provide during resource\n initialization and teardown. If the provided resources require either the `pipeline_run`\n or `run_id` attributes of the provided context during resource initialization and/or\n teardown, this must be provided, or initialization will fail.\n log_manager (Optional[DagsterLogManager]): Log Manager to use during resource\n initialization. Defaults to system log manager.\n\n Examples:\n\n .. code-block:: python\n\n from dagster import resource, build_resources\n\n @resource\n def the_resource():\n return "foo"\n\n with build_resources(resources={"from_def": the_resource, "from_val": "bar"}) as resources:\n assert resources.from_def == "foo"\n assert resources.from_val == "bar"\n\n """\n\n resources = check.dict_param(resources, "resource_defs", key_type=str)\n instance = check.opt_inst_param(instance, "instance", DagsterInstance)\n resource_config = check.opt_dict_param(resource_config, "resource_config", key_type=str)\n log_manager = check.opt_inst_param(log_manager, "log_manager", DagsterLogManager)\n resource_defs = wrap_resources_for_execution(resources)\n mapped_resource_config = _get_mapped_resource_config(resource_defs, resource_config)\n\n with ephemeral_instance_if_missing(instance) as dagster_instance:\n resources_manager = resource_initialization_manager(\n resource_defs=resource_defs,\n resource_configs=mapped_resource_config,\n log_manager=log_manager if log_manager else initialize_console_manager(pipeline_run),\n execution_plan=None,\n pipeline_run=pipeline_run,\n resource_keys_to_init=set(resource_defs.keys()),\n instance=dagster_instance,\n emit_persistent_events=False,\n pipeline_def_for_backwards_compat=None,\n )\n try:\n list(resources_manager.generate_setup_events())\n instantiated_resources = check.inst(\n resources_manager.get_object(), ScopedResourcesBuilder\n )\n yield instantiated_resources.build(\n set(instantiated_resources.resource_instance_dict.keys())\n )\n finally:\n list(resources_manager.generate_teardown_events())
\n\n\ndef wrap_resources_for_execution(\n resources: Optional[Dict[str, Any]] = None\n) -> Dict[str, ResourceDefinition]:\n resources = check.opt_dict_param(resources, "resources", key_type=str)\n resource_defs = {}\n # Wrap instantiated resource values in a resource definition.\n # If an instantiated IO manager is provided, wrap it in an IO manager definition.\n for resource_key, resource in resources.items():\n if isinstance(resource, ResourceDefinition):\n resource_defs[resource_key] = resource\n elif isinstance(resource, IOManager):\n resource_defs[resource_key] = IOManagerDefinition.hardcoded_io_manager(resource)\n else:\n resource_defs[resource_key] = ResourceDefinition.hardcoded_resource(resource)\n\n return resource_defs\n
", "current_page_name": "_modules/dagster/core/execution/build_resources", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.execution.build_resources"}, "context": {"compute": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.execution.context.compute

\nfrom abc import ABC, abstractmethod\nfrom typing import Any, Dict, Iterator, List, Mapping, Optional\n\nfrom dagster import check\nfrom dagster.core.definitions.dependency import Node, NodeHandle\nfrom dagster.core.definitions.events import (\n    AssetMaterialization,\n    AssetObservation,\n    ExpectationResult,\n    Materialization,\n    UserEvent,\n)\nfrom dagster.core.definitions.mode import ModeDefinition\nfrom dagster.core.definitions.pipeline_definition import PipelineDefinition\nfrom dagster.core.definitions.solid_definition import SolidDefinition\nfrom dagster.core.definitions.step_launcher import StepLauncher\nfrom dagster.core.definitions.time_window_partitions import TimeWindow\nfrom dagster.core.errors import DagsterInvalidPropertyError\nfrom dagster.core.events import DagsterEvent\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.log_manager import DagsterLogManager\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.utils.forked_pdb import ForkedPdb\n\nfrom .system import StepExecutionContext\n\n\nclass AbstractComputeExecutionContext(ABC):  # pylint: disable=no-init\n    """Base class for solid context implemented by SolidExecutionContext and DagstermillExecutionContext"""\n\n    @abstractmethod\n    def has_tag(self, key) -> bool:\n        """Implement this method to check if a logging tag is set."""\n\n    @abstractmethod\n    def get_tag(self, key: str) -> Optional[str]:\n        """Implement this method to get a logging tag."""\n\n    @property\n    @abstractmethod\n    def run_id(self) -> str:\n        """The run id for the context."""\n\n    @property\n    @abstractmethod\n    def solid_def(self) -> SolidDefinition:\n        """The solid definition corresponding to the execution step being executed."""\n\n    @property\n    @abstractmethod\n    def solid(self) -> Node:\n        """The solid corresponding to the execution step being executed."""\n\n    @property\n    @abstractmethod\n    def pipeline_def(self) -> PipelineDefinition:\n        """The pipeline being executed."""\n\n    @property\n    @abstractmethod\n    def pipeline_run(self) -> PipelineRun:\n        """The PipelineRun object corresponding to the execution."""\n\n    @property\n    @abstractmethod\n    def resources(self) -> Any:\n        """Resources available in the execution context."""\n\n    @property\n    @abstractmethod\n    def log(self) -> DagsterLogManager:\n        """The log manager available in the execution context."""\n\n    @property\n    @abstractmethod\n    def solid_config(self) -> Any:\n        """The parsed config specific to this solid."""\n\n    @property\n    def op_config(self) -> Any:\n        return self.solid_config\n\n\n
[docs]class SolidExecutionContext(AbstractComputeExecutionContext):\n """The ``context`` object that can be made available as the first argument to a solid's compute\n function.\n\n The context object provides system information such as resources, config, and logging to a\n solid's compute function. Users should not instantiate this object directly.\n\n Example:\n\n .. code-block:: python\n\n @solid\n def hello_world(context: SolidExecutionContext):\n context.log.info("Hello, world!")\n\n """\n\n __slots__ = ["_step_execution_context"]\n\n def __init__(self, step_execution_context: StepExecutionContext):\n self._step_execution_context = check.inst_param(\n step_execution_context,\n "step_execution_context",\n StepExecutionContext,\n )\n self._pdb: Optional[ForkedPdb] = None\n self._events: List[DagsterEvent] = []\n self._output_metadata: Dict[str, Any] = {}\n\n @property\n def solid_config(self) -> Any:\n return self._step_execution_context.op_config\n\n @property\n def pipeline_run(self) -> PipelineRun:\n """PipelineRun: The current pipeline run"""\n return self._step_execution_context.pipeline_run\n\n @property\n def instance(self) -> DagsterInstance:\n """DagsterInstance: The current Dagster instance"""\n return self._step_execution_context.instance\n\n @property\n def pdb(self) -> ForkedPdb:\n """dagster.utils.forked_pdb.ForkedPdb: Gives access to pdb debugging from within the op.\n\n Example:\n\n .. code-block:: python\n\n @op\n def debug(context):\n context.pdb.set_trace()\n\n """\n if self._pdb is None:\n self._pdb = ForkedPdb()\n\n return self._pdb\n\n @property\n def file_manager(self):\n """Deprecated access to the file manager.\n\n :meta private:\n """\n raise DagsterInvalidPropertyError(\n "You have attempted to access the file manager which has been moved to resources in 0.10.0. 
"\n "Please access it via `context.resources.file_manager` instead."\n )\n\n @property\n def resources(self) -> Any:\n """Resources: The currently available resources."""\n return self._step_execution_context.resources\n\n @property\n def step_launcher(self) -> Optional[StepLauncher]:\n """Optional[StepLauncher]: The current step launcher, if any."""\n return self._step_execution_context.step_launcher\n\n @property\n def run_id(self) -> str:\n """str: The id of the current execution's run."""\n return self._step_execution_context.run_id\n\n @property\n def run_config(self) -> dict:\n """dict: The run config for the current execution."""\n return self._step_execution_context.run_config\n\n @property\n def pipeline_def(self) -> PipelineDefinition:\n """PipelineDefinition: The currently executing pipeline."""\n return self._step_execution_context.pipeline_def\n\n @property\n def pipeline_name(self) -> str:\n """str: The name of the currently executing pipeline."""\n return self._step_execution_context.pipeline_name\n\n @property\n def mode_def(self) -> ModeDefinition:\n """ModeDefinition: The mode of the current execution."""\n return self._step_execution_context.mode_def\n\n @property\n def log(self) -> DagsterLogManager:\n """DagsterLogManager: The log manager available in the execution context."""\n return self._step_execution_context.log\n\n @property\n def solid_handle(self) -> NodeHandle:\n """NodeHandle: The current solid's handle.\n\n :meta private:\n """\n return self._step_execution_context.solid_handle\n\n @property\n def solid(self) -> Node:\n """Solid: The current solid object.\n\n :meta private:\n\n """\n return self._step_execution_context.pipeline_def.get_solid(self.solid_handle)\n\n @property\n def solid_def(self) -> SolidDefinition:\n """SolidDefinition: The current solid definition."""\n return self._step_execution_context.pipeline_def.get_solid(self.solid_handle).definition\n\n @property\n def has_partition_key(self) -> bool:\n """Whether the current run is a partitioned run"""\n return self._step_execution_context.has_partition_key\n\n @property\n def partition_key(self) -> str:\n """The partition key for the current run.\n\n Raises an error if the current run is not a partitioned run.\n """\n return self._step_execution_context.partition_key\n\n
[docs] def output_asset_partition_key(self, output_name: str = "result") -> str:\n """Returns the asset partition key for the given output. Defaults to "result", which is the\n name of the default output.\n """\n return self._step_execution_context.asset_partition_key_for_output(output_name)
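The block below is not part of the Dagster module source above; it is a minimal usage sketch (with a hypothetical op name) showing how ``output_asset_partition_key`` might be read inside an op's compute function, assuming the op produces a partitioned asset output (the call raises otherwise).

.. code-block:: python

    from dagster import op

    @op
    def write_partition(context):
        # Hypothetical op: look up the asset partition key for the default
        # "result" output and record it in the logs.
        partition_key = context.output_asset_partition_key()
        context.log.info(f"Materializing partition {partition_key}")
        return partition_key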
\n\n
[docs] def output_asset_partitions_time_window(self, output_name: str = "result") -> TimeWindow:\n """The time window for the partitions of the output asset.\n\n Raises an error if either of the following are true:\n - The output asset has no partitioning.\n - The output asset is not partitioned with a TimeWindowPartitionsDefinition.\n """\n return self._step_execution_context.asset_partitions_time_window_for_output(output_name)
\n\n
[docs] def has_tag(self, key: str) -> bool:\n """Check if a logging tag is set.\n\n Args:\n key (str): The tag to check.\n\n Returns:\n bool: Whether the tag is set.\n """\n return self._step_execution_context.has_tag(key)
\n\n
[docs] def get_tag(self, key: str) -> Optional[str]:\n """Get a logging tag.\n\n Args:\n key (str): The tag to get.\n\n Returns:\n Optional[str]: The value of the tag, if present.\n """\n return self._step_execution_context.get_tag(key)
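The block below is not part of the module source; it is a small sketch (hypothetical op and tag names) combining ``has_tag`` and ``get_tag`` to read a run tag defensively.

.. code-block:: python

    from dagster import op

    @op
    def tag_aware(context):
        # Hypothetical "owner" tag: fall back to a default when it is absent.
        owner = context.get_tag("owner") if context.has_tag("owner") else "unknown"
        context.log.info(f"Run owner: {owner}")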
\n\n def has_events(self) -> bool:\n return bool(self._events)\n\n
[docs] def consume_events(self) -> Iterator[DagsterEvent]:\n """Pops and yields all user-generated events that have been recorded from this context.\n\n If consume_events has not yet been called, this will yield all logged events since the beginning of the op's computation. If consume_events has been called, it will yield all events since the last time consume_events was called. Designed for internal use. Users should never need to invoke this method.\n """\n events = self._events\n self._events = []\n yield from events
\n\n
[docs] def log_event(self, event: UserEvent) -> None:\n """Log an AssetMaterialization, AssetObservation, or ExpectationResult from within the body of an op.\n\n Events logged with this method will appear in the list of DagsterEvents, as well as the event log.\n\n Args:\n event (Union[AssetMaterialization, Materialization, AssetObservation, ExpectationResult]): The event to log.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import op, AssetMaterialization\n\n @op\n def log_materialization(context):\n context.log_event(AssetMaterialization("foo"))\n """\n\n if isinstance(event, (AssetMaterialization, Materialization)):\n self._events.append(\n DagsterEvent.asset_materialization(\n self._step_execution_context,\n event,\n self._step_execution_context.get_input_lineage(),\n )\n )\n elif isinstance(event, AssetObservation):\n self._events.append(DagsterEvent.asset_observation(self._step_execution_context, event))\n elif isinstance(event, ExpectationResult):\n self._events.append(\n DagsterEvent.step_expectation_result(self._step_execution_context, event)\n )\n else:\n check.failed("Unexpected event {event}".format(event=event))
\n\n
[docs] def add_output_metadata(\n self,\n metadata: Mapping[str, Any],\n output_name: Optional[str] = None,\n mapping_key: Optional[str] = None,\n ) -> None:\n """Add metadata to one of the outputs of an op.\n\n This can only be used once per output in the body of an op. Using this method with the same output_name more than once within an op will result in an error.\n\n Args:\n metadata (Mapping[str, Any]): The metadata to attach to the output\n output_name (Optional[str]): The name of the output to attach metadata to. If there is only one output on the op, then this argument does not need to be provided. The metadata will automatically be attached to the only output.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import Out, op\n from typing import Tuple\n\n @op\n def add_metadata(context):\n context.add_output_metadata({"foo": "bar"})\n return 5 # Since the default output is called "result", metadata will be attached to the output "result".\n\n @op(out={"a": Out(), "b": Out()})\n def add_metadata_two_outputs(context) -> Tuple[str, int]:\n context.add_output_metadata({"foo": "bar"}, output_name="b")\n context.add_output_metadata({"baz": "bat"}, output_name="a")\n\n return ("dog", 5)\n\n """\n metadata = check.dict_param(metadata, "metadata", key_type=str)\n output_name = check.opt_str_param(output_name, "output_name")\n mapping_key = check.opt_str_param(mapping_key, "mapping_key")\n\n self._step_execution_context.add_output_metadata(\n metadata=metadata, output_name=output_name, mapping_key=mapping_key\n )
\n\n def get_output_metadata(\n self, output_name: str, mapping_key: Optional[str] = None\n ) -> Optional[Mapping[str, Any]]:\n return self._step_execution_context.get_output_metadata(\n output_name=output_name, mapping_key=mapping_key\n )\n\n def get_step_execution_context(self) -> StepExecutionContext:\n """Allows advanced users (e.g. framework authors) to punch through to the underlying\n step execution context.\n\n :meta private:\n\n Returns:\n StepExecutionContext: The underlying system context.\n """\n return self._step_execution_context\n\n @property\n def retry_number(self) -> int:\n """\n Which retry attempt is currently executing i.e. 0 for initial attempt, 1 for first retry, etc.\n """\n\n return self._step_execution_context.previous_attempt_count\n\n def describe_op(self):\n return self._step_execution_context.describe_op()\n\n
[docs] def get_mapping_key(self) -> Optional[str]:\n """\n Which mapping_key this execution is for if downstream of a DynamicOutput, otherwise None.\n """\n return self._step_execution_context.step.get_mapping_key()
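The block below is not part of the module source; it is an illustrative sketch (hypothetical op name) of reading ``get_mapping_key`` in an op that may run downstream of a ``DynamicOutput``.

.. code-block:: python

    from dagster import op

    @op
    def process_chunk(context, chunk):
        # get_mapping_key() returns the mapping key when this op is mapped over
        # a DynamicOutput, and None otherwise.
        context.log.info(f"Processing mapping key {context.get_mapping_key()}")
        return chunk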
\n\n\n
[docs]class OpExecutionContext(SolidExecutionContext):\n pass
\n
", "current_page_name": "_modules/dagster/core/execution/context/compute", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.execution.context.compute"}, "hook": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.execution.context.hook

\nimport warnings\nfrom typing import AbstractSet, Any, Dict, Optional, Set, Union\n\nfrom dagster import check\n\nfrom ...definitions.composition import PendingNodeInvocation\nfrom ...definitions.decorators.graph_decorator import graph\nfrom ...definitions.dependency import Node\nfrom ...definitions.hook_definition import HookDefinition\nfrom ...definitions.mode import ModeDefinition\nfrom ...definitions.op_definition import OpDefinition\nfrom ...definitions.resource_definition import IContainsGenerator, Resources\nfrom ...definitions.solid_definition import SolidDefinition\nfrom ...errors import DagsterInvalidPropertyError, DagsterInvariantViolationError\nfrom ...log_manager import DagsterLogManager\nfrom ..plan.step import ExecutionStep\nfrom .system import StepExecutionContext\n\n\ndef _property_msg(prop_name: str, method_name: str) -> str:\n    return (\n        f"The {prop_name} {method_name} is not set when a `HookContext` is constructed from "\n        "`build_hook_context`."\n    )\n\n\ndef _check_property_on_test_context(\n    context: "HookContext", attr_str: str, user_facing_name: str, param_on_builder: str\n):\n    """Check if attribute is not None on context. If none, error, and point user in direction of\n    how to specify the parameter on the context object."""\n\n    value = getattr(context, attr_str)\n    if value is None:\n        raise DagsterInvalidPropertyError(\n            f"Attribute '{user_facing_name}' was not provided when "\n            f"constructing context. Provide a value for the '{param_on_builder}' parameter on "\n            "'build_hook_context'. To learn more, check out the testing hooks section of Dagster's "\n            "concepts docs: https://docs.dagster.io/concepts/ops-jobs-graphs/op-hooks#testing-hooks"\n        )\n    else:\n        return value\n\n\n
[docs]class HookContext:\n """The ``context`` object available to a hook function on an DagsterEvent.\n\n Attributes:\n log (DagsterLogManager): Centralized log dispatch from user code.\n hook_def (HookDefinition): The hook that the context object belongs to.\n solid (Solid): The solid instance associated with the hook.\n op (Op): The op instance associated with the hook.\n step_key (str): The key for the step where this hook is being triggered.\n required_resource_keys (Set[str]): Resources required by this hook.\n resources (Resources): Resources available in the hook context.\n solid_config (Any): The parsed config specific to this solid.\n op_config (Any): The parsed config specific to this op.\n pipeline_name (str): The name of the pipeline where this hook is being triggered.\n job_name (str): The name of the job where this hook is being triggered.\n run_id (str): The id of the run where this hook is being triggered.\n mode_def (ModeDefinition): The mode with which the pipeline is being run.\n op_exception (Optional[BaseException]): The thrown exception in a failed op.\n op_output_values (Dict): Computed output values in an op.\n """\n\n def __init__(\n self,\n step_execution_context: StepExecutionContext,\n hook_def: HookDefinition,\n ):\n self._step_execution_context = step_execution_context\n self._hook_def = check.inst_param(hook_def, "hook_def", HookDefinition)\n self._required_resource_keys = hook_def.required_resource_keys\n self._resources = step_execution_context.scoped_resources_builder.build(\n self._required_resource_keys\n )\n\n @property\n def pipeline_name(self) -> str:\n return self.job_name\n\n @property\n def job_name(self) -> str:\n return self._step_execution_context.job_name\n\n @property\n def run_id(self) -> str:\n return self._step_execution_context.run_id\n\n @property\n def hook_def(self) -> HookDefinition:\n return self._hook_def\n\n @property\n def solid(self) -> Node:\n return self.op\n\n @property\n def op(self) -> Node:\n return self._step_execution_context.solid\n\n @property\n def step(self) -> ExecutionStep:\n warnings.warn(\n "The step property of HookContext has been deprecated, and will be removed "\n "in a future release."\n )\n return self._step_execution_context.step\n\n @property\n def step_key(self) -> str:\n return self._step_execution_context.step.key\n\n @property\n def mode_def(self) -> Optional[ModeDefinition]:\n return self._step_execution_context.mode_def\n\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n return self._required_resource_keys\n\n @property\n def resources(self) -> "Resources":\n return self._resources\n\n @property\n def solid_config(self) -> Any:\n solid_config = self._step_execution_context.resolved_run_config.solids.get(\n str(self._step_execution_context.step.solid_handle)\n )\n return solid_config.config if solid_config else None\n\n @property\n def op_config(self) -> Any:\n return self.solid_config\n\n # Because of the fact that we directly use the log manager of the step, if a user calls\n # hook_context.log.with_tags, then they will end up mutating the step's logging tags as well.\n # This is not problematic because the hook only runs after the step has been completed.\n @property\n def log(self) -> DagsterLogManager:\n return self._step_execution_context.log\n\n @property\n def solid_exception(self) -> Optional[BaseException]:\n """The thrown exception in a failed solid.\n\n Returns:\n Optional[BaseException]: the exception object, None if the solid execution succeeds.\n """\n return 
self.op_exception\n\n @property\n def op_exception(self):\n return self._step_execution_context.step_exception\n\n @property\n def solid_output_values(self) -> Dict[str, Union[Any, Dict[str, Any]]]:\n """The computed output values.\n\n Returns a dictionary where keys are output names and the values are:\n * the output values in the normal case\n * a dictionary from mapping key to corresponding value in the mapped case\n """\n results: Dict[str, Union[Any, Dict[str, Any]]] = {}\n captured = self._step_execution_context.step_output_capture\n\n if captured is None:\n check.failed("Outputs were unexpectedly not captured for hook")\n\n # make the returned values more user-friendly\n for step_output_handle, value in captured.items():\n if step_output_handle.mapping_key:\n if results.get(step_output_handle.output_name) is None:\n results[step_output_handle.output_name] = {\n step_output_handle.mapping_key: value\n }\n else:\n results[step_output_handle.output_name][step_output_handle.mapping_key] = value\n else:\n results[step_output_handle.output_name] = value\n\n return results\n\n @property\n def op_output_values(self):\n return self.solid_output_values
\n\n\nclass UnboundHookContext(HookContext):\n def __init__(\n self,\n resources: Dict[str, Any],\n mode_def: Optional[ModeDefinition],\n op: Optional[Union[SolidDefinition, PendingNodeInvocation]],\n run_id: Optional[str],\n job_name: Optional[str],\n op_exception: Optional[Exception],\n ): # pylint: disable=super-init-not-called\n from ..build_resources import build_resources\n from ..context_creation_pipeline import initialize_console_manager\n\n self._mode_def = mode_def\n\n self._op = None\n if op is not None:\n\n @graph(name="hook_context_container")\n def temp_graph():\n op()\n\n self._op = temp_graph.solids[0]\n\n # Open resource context manager\n self._resources_cm = build_resources(resources)\n self._resources = self._resources_cm.__enter__() # pylint: disable=no-member\n self._resources_contain_cm = isinstance(self._resources, IContainsGenerator)\n\n self._run_id = run_id\n self._job_name = job_name\n self._op_exception = op_exception\n\n self._log = initialize_console_manager(None)\n\n self._cm_scope_entered = False\n\n def __enter__(self):\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc):\n self._resources_cm.__exit__(*exc) # pylint: disable=no-member\n\n def __del__(self):\n if self._resources_contain_cm and not self._cm_scope_entered:\n self._resources_cm.__exit__(None, None, None) # pylint: disable=no-member\n\n @property\n def job_name(self) -> str:\n return self.pipeline_name\n\n @property\n def run_id(self) -> str:\n return _check_property_on_test_context(\n self, attr_str="_run_id", user_facing_name="run_id", param_on_builder="run_id"\n )\n\n @property\n def hook_def(self) -> HookDefinition:\n raise DagsterInvalidPropertyError(_property_msg("hook_def", "property"))\n\n @property\n def op(self) -> Node:\n return _check_property_on_test_context(\n self, attr_str="_op", user_facing_name="op", param_on_builder="op"\n )\n\n @property\n def step(self) -> ExecutionStep:\n raise DagsterInvalidPropertyError(_property_msg("step", "property"))\n\n @property\n def step_key(self) -> str:\n raise DagsterInvalidPropertyError(_property_msg("step_key", "property"))\n\n @property\n def mode_def(self) -> Optional[ModeDefinition]:\n return self._mode_def\n\n @property\n def required_resource_keys(self) -> Set[str]:\n raise DagsterInvalidPropertyError(_property_msg("hook_def", "property"))\n\n @property\n def resources(self) -> "Resources":\n if self._resources_contain_cm and not self._cm_scope_entered:\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access "\n "resources outside of context manager scope. You can use the following syntax to "\n "open a context manager: `with build_hook_context(...) 
as context:`"\n )\n return self._resources\n\n @property\n def solid_config(self) -> Any:\n raise DagsterInvalidPropertyError(_property_msg("solid_config", "property"))\n\n @property\n def log(self) -> DagsterLogManager:\n return self._log\n\n @property\n def op_exception(self) -> Optional[BaseException]:\n return self._op_exception\n\n @property\n def solid_output_values(self) -> Dict[str, Union[Any, Dict[str, Any]]]:\n """The computed output values.\n\n Returns a dictionary where keys are output names and the values are:\n * the output values in the normal case\n * a dictionary from mapping key to corresponding value in the mapped case\n """\n raise DagsterInvalidPropertyError(_property_msg("solid_output_values", "method"))\n\n\nclass BoundHookContext(HookContext):\n def __init__(\n self,\n hook_def: HookDefinition,\n resources: Resources,\n op: Optional[Node],\n mode_def: Optional[ModeDefinition],\n log_manager: DagsterLogManager,\n run_id: Optional[str],\n job_name: Optional[str],\n op_exception: Optional[Exception],\n ): # pylint: disable=super-init-not-called\n self._hook_def = hook_def\n self._resources = resources\n self._op = op\n self._mode_def = mode_def\n self._log_manager = log_manager\n self._run_id = run_id\n self._job_name = job_name\n self._op_exception = op_exception\n\n @property\n def job_name(self) -> str:\n return _check_property_on_test_context(\n self, attr_str="_job_name", user_facing_name="job_name", param_on_builder="job_name"\n )\n\n @property\n def run_id(self) -> str:\n return _check_property_on_test_context(\n self, attr_str="_run_id", user_facing_name="run_id", param_on_builder="run_id"\n )\n\n @property\n def hook_def(self) -> HookDefinition:\n return self._hook_def\n\n @property\n def op(self) -> Node:\n return _check_property_on_test_context(\n self, attr_str="_op", user_facing_name="op", param_on_builder="op"\n )\n\n @property\n def step(self) -> ExecutionStep:\n raise DagsterInvalidPropertyError(_property_msg("step", "property"))\n\n @property\n def step_key(self) -> str:\n raise DagsterInvalidPropertyError(_property_msg("step_key", "property"))\n\n @property\n def mode_def(self) -> Optional[ModeDefinition]:\n return self._mode_def\n\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n return self._hook_def.required_resource_keys\n\n @property\n def resources(self) -> "Resources":\n return self._resources\n\n @property\n def solid_config(self) -> Any:\n raise DagsterInvalidPropertyError(_property_msg("solid_config", "property"))\n\n @property\n def log(self) -> DagsterLogManager:\n return self._log_manager\n\n @property\n def op_exception(self):\n return self._op_exception\n\n @property\n def solid_output_values(self) -> Dict[str, Union[Any, Dict[str, Any]]]:\n """The computed output values.\n\n Returns a dictionary where keys are output names and the values are:\n * the output values in the normal case\n * a dictionary from mapping key to corresponding value in the mapped case\n """\n raise DagsterInvalidPropertyError(_property_msg("solid_output_values", "method"))\n\n\n
[docs]def build_hook_context(\n resources: Optional[Dict[str, Any]] = None,\n mode_def: Optional[ModeDefinition] = None,\n solid: Optional[Union[SolidDefinition, PendingNodeInvocation]] = None,\n op: Optional[Union[OpDefinition, PendingNodeInvocation]] = None,\n run_id: Optional[str] = None,\n job_name: Optional[str] = None,\n op_exception: Optional[Exception] = None,\n) -> UnboundHookContext:\n """Builds hook context from provided parameters.\n\n ``build_hook_context`` can be used as either a function or a context manager. If there is a\n provided resource to ``build_hook_context`` that is a context manager, then it must be used as a\n context manager. This function can be used to provide the context argument to the invocation of\n a hook definition.\n\n Args:\n resources (Optional[Dict[str, Any]]): The resources to provide to the context. These can\n either be values or resource definitions.\n mode_def (Optional[ModeDefinition]): The mode definition used with the context.\n op (Optional[OpDefinition, PendingNodeInvocation]): The op definition which the\n hook may be associated with.\n solid (Optional[SolidDefinition, PendingNodeInvocation]): (legacy) The solid definition which the\n hook may be associated with.\n run_id (Optional[str]): The id of the run in which the hook is invoked (provided for mocking purposes).\n job_name (Optional[str]): The name of the job in which the hook is used (provided for mocking purposes).\n op_exception (Optional[Exception]): The exception that caused the hook to be triggered.\n\n Examples:\n .. code-block:: python\n\n context = build_hook_context()\n hook_to_invoke(context)\n\n with build_hook_context(resources={"foo": context_manager_resource}) as context:\n hook_to_invoke(context)\n """\n check.invariant(not (solid and op), "cannot set both `solid` and `op` on `build_hook_context`.")\n\n op = check.opt_inst_param(op, "op", (OpDefinition, PendingNodeInvocation))\n solid = check.opt_inst_param(solid, "solid", (SolidDefinition, PendingNodeInvocation))\n op = op or solid\n\n return UnboundHookContext(\n resources=check.opt_dict_param(resources, "resources", key_type=str),\n mode_def=check.opt_inst_param(mode_def, "mode_def", ModeDefinition),\n op=op,\n run_id=check.opt_str_param(run_id, "run_id"),\n job_name=check.opt_str_param(job_name, "job_name"),\n op_exception=check.opt_inst_param(op_exception, "op_exception", Exception),\n )
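The block below is not part of the module source; it is a hedged sketch (hypothetical hook and op names) of a hook that reads the ``HookContext`` attributes documented above, invoked directly with a context supplied by ``build_hook_context`` as described in its docstring.

.. code-block:: python

    from dagster import HookContext, build_hook_context, failure_hook, op

    @failure_hook
    def log_failure(context: HookContext):
        # Hypothetical hook body: read attributes documented on HookContext.
        context.log.error(f"Op {context.op.name} failed: {context.op_exception}")

    @op
    def my_op():
        ...

    # Direct invocation in a test, supplying the context via build_hook_context.
    log_failure(build_hook_context(op=my_op, op_exception=Exception("boom")))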
\n
", "current_page_name": "_modules/dagster/core/execution/context/hook", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.execution.context.hook"}, "init": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.execution.context.init

\nfrom typing import Any, Dict, Optional, Union\n\nfrom dagster import check\nfrom dagster.core.definitions.pipeline_definition import PipelineDefinition\nfrom dagster.core.definitions.resource_definition import (\n    IContainsGenerator,\n    ResourceDefinition,\n    Resources,\n)\nfrom dagster.core.errors import DagsterInvariantViolationError\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.log_manager import DagsterLogManager\nfrom dagster.core.storage.pipeline_run import PipelineRun\n\n\n
[docs]class InitResourceContext:\n """Resource-specific initialization context.\n\n Attributes:\n resource_config (Any): The configuration data provided by the run config. The schema\n for this data is defined by the ``config_field`` argument to\n :py:class:`ResourceDefinition`.\n resource_def (ResourceDefinition): The definition of the resource currently being\n constructed.\n log_manager (DagsterLogManager): The log manager for this run of the job or pipeline\n resources (ScopedResources): The resources that are available to the resource that we are\n initalizing.\n dagster_run (Optional[PipelineRun]): The dagster run to use. When initializing resources\n outside of execution context, this will be None.\n run_id (Optional[str]): The id for this run of the job or pipeline. When initializing resources\n outside of execution context, this will be None.\n pipeline_run (Optional[PipelineRun]): (legacy) The dagster run to use. When initializing resources\n outside of execution context, this will be None.\n\n """\n\n def __init__(\n self,\n resource_config: Any,\n resources: Resources,\n resource_def: Optional[ResourceDefinition] = None,\n instance: Optional[DagsterInstance] = None,\n dagster_run: Optional[PipelineRun] = None,\n pipeline_run: Optional[PipelineRun] = None,\n log_manager: Optional[DagsterLogManager] = None,\n pipeline_def_for_backwards_compat: Optional[PipelineDefinition] = None,\n ):\n\n if dagster_run and pipeline_run:\n raise DagsterInvariantViolationError(\n "Provided both ``dagster_run`` and ``pipeline_run`` to InitResourceContext "\n "initialization. Please provide one or the other."\n )\n self._resource_config = resource_config\n self._resource_def = resource_def\n self._log_manager = log_manager\n self._instance = instance\n self._resources = resources\n\n self._pipeline_def_for_backwards_compat = pipeline_def_for_backwards_compat\n self._dagster_run = dagster_run or pipeline_run\n\n @property\n def resource_config(self) -> Any:\n return self._resource_config\n\n @property\n def resource_def(self) -> Optional[ResourceDefinition]:\n return self._resource_def\n\n @property\n def resources(self) -> Resources:\n return self._resources\n\n @property\n def instance(self) -> Optional[DagsterInstance]:\n return self._instance\n\n @property\n def pipeline_def_for_backwards_compat(self) -> Optional[PipelineDefinition]:\n return self._pipeline_def_for_backwards_compat\n\n @property\n def dagster_run(self) -> Optional[PipelineRun]:\n return self._dagster_run\n\n @property\n def pipeline_run(self) -> Optional[PipelineRun]:\n return self.dagster_run\n\n @property\n def log(self) -> Optional[DagsterLogManager]:\n return self._log_manager\n\n # backcompat: keep around this property from when InitResourceContext used to be a NamedTuple\n @property\n def log_manager(self) -> Optional[DagsterLogManager]:\n return self._log_manager\n\n @property\n def run_id(self) -> Optional[str]:\n return self.pipeline_run.run_id if self.pipeline_run else None\n\n def replace_config(self, config: Any) -> "InitResourceContext":\n return InitResourceContext(\n resource_config=config,\n resources=self.resources,\n instance=self.instance,\n resource_def=self.resource_def,\n pipeline_run=self.pipeline_run,\n log_manager=self.log,\n )
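The block below is not part of the module source; it is a minimal sketch (hypothetical resource name and config schema) of a resource function reading the ``InitResourceContext`` fields documented above, with the context supplied by ``build_init_resource_context`` (defined later on this page).

.. code-block:: python

    from dagster import build_init_resource_context, resource

    @resource(config_schema={"prefix": str})
    def greeting_resource(init_context):
        # init_context is an InitResourceContext: read validated config and log.
        init_context.log.info("Initializing greeting_resource")
        return init_context.resource_config["prefix"] + " world"

    # Direct invocation outside of a run; returns "hello world".
    value = greeting_resource(build_init_resource_context(config={"prefix": "hello"}))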
\n\n\nclass UnboundInitResourceContext(InitResourceContext):\n """Resource initialization context outputted by ``build_init_resource_context``.\n\n Represents a context whose config has not yet been validated against a resource definition,\n hence the inability to access the `resource_def` attribute. When an instance of\n ``UnboundInitResourceContext`` is passed to a resource invocation, config is validated,\n and it is subsumed into an `InitResourceContext`, which contains the resource_def validated\n against.\n """\n\n def __init__(\n self,\n resource_config: Any,\n resources: Optional[Union[Resources, Dict[str, Any]]],\n instance: Optional[DagsterInstance],\n ):\n from dagster.core.execution.api import ephemeral_instance_if_missing\n from dagster.core.execution.build_resources import build_resources\n from dagster.core.execution.context_creation_pipeline import initialize_console_manager\n\n self._instance_provided = (\n check.opt_inst_param(instance, "instance", DagsterInstance) is not None\n )\n # Construct ephemeral instance if missing\n self._instance_cm = ephemeral_instance_if_missing(instance)\n # Pylint can't infer that the ephemeral_instance context manager has an __enter__ method,\n # so ignore lint error\n instance = self._instance_cm.__enter__() # pylint: disable=no-member\n\n # If we are provided with a Resources instance, then we do not need to initialize\n if isinstance(resources, Resources):\n self._resources_cm = None\n else:\n self._resources_cm = build_resources(\n check.opt_dict_param(resources, "resources", key_type=str), instance=instance\n )\n resources = self._resources_cm.__enter__() # pylint: disable=no-member\n self._resources_contain_cm = isinstance(resources, IContainsGenerator)\n\n self._cm_scope_entered = False\n super(UnboundInitResourceContext, self).__init__(\n resource_config=resource_config,\n resources=resources,\n resource_def=None,\n instance=instance,\n pipeline_run=None,\n log_manager=initialize_console_manager(None),\n pipeline_def_for_backwards_compat=None,\n )\n\n def __enter__(self):\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc):\n if self._resources_cm:\n self._resources_cm.__exit__(*exc) # pylint: disable=no-member\n if self._instance_provided:\n self._instance_cm.__exit__(*exc) # pylint: disable=no-member\n\n def __del__(self):\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n self._resources_cm.__exit__(None, None, None) # pylint: disable=no-member\n if self._instance_provided and not self._cm_scope_entered:\n self._instance_cm.__exit__(None, None, None) # pylint: disable=no-member\n\n @property\n def resource_config(self) -> Any:\n return self._resource_config\n\n @property\n def resource_def(self) -> Optional[ResourceDefinition]:\n raise DagsterInvariantViolationError(\n "UnboundInitLoggerContext has not been validated against a logger definition."\n )\n\n @property\n def resources(self) -> Resources:\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access "\n "resources outside of context manager scope. You can use the following syntax to "\n "open a context manager: `with build_init_resource_context(...) 
as context:`"\n )\n return self._resources\n\n @property\n def instance(self) -> Optional[DagsterInstance]:\n return self._instance\n\n @property\n def pipeline_def_for_backwards_compat(self) -> Optional[PipelineDefinition]:\n return None\n\n @property\n def pipeline_run(self) -> Optional[PipelineRun]:\n return None\n\n @property\n def log(self) -> Optional[DagsterLogManager]:\n return self._log_manager\n\n # backcompat: keep around this property from when InitResourceContext used to be a NamedTuple\n @property\n def log_manager(self) -> Optional[DagsterLogManager]:\n return self._log_manager\n\n @property\n def run_id(self) -> Optional[str]:\n return None\n\n\n
[docs]def build_init_resource_context(\n config: Optional[Dict[str, Any]] = None,\n resources: Optional[Dict[str, Any]] = None,\n instance: Optional[DagsterInstance] = None,\n) -> InitResourceContext:\n """Builds resource initialization context from provided parameters.\n\n ``build_init_resource_context`` can be used as either a function or context manager. If there is a\n provided resource to ``build_init_resource_context`` that is a context manager, then it must be\n used as a context manager. This function can be used to provide the context argument to the\n invocation of a resource.\n\n Args:\n resources (Optional[Dict[str, Any]]): The resources to provide to the context. These can be\n either values or resource definitions.\n config (Optional[Any]): The resource config to provide to the context.\n instance (Optional[DagsterInstance]): The dagster instance configured for the context.\n Defaults to DagsterInstance.ephemeral().\n\n Examples:\n .. code-block:: python\n\n context = build_init_resource_context()\n resource_to_init(context)\n\n with build_init_resource_context(\n resources={"foo": context_manager_resource}\n ) as context:\n resource_to_init(context)\n\n """\n return UnboundInitResourceContext(\n resource_config=check.opt_dict_param(config, "config", key_type=str),\n instance=check.opt_inst_param(instance, "instance", DagsterInstance),\n resources=check.opt_dict_param(resources, "resources", key_type=str),\n )
\n
", "current_page_name": "_modules/dagster/core/execution/context/init", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.execution.context.init"}, "input": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.execution.context.input

\nfrom typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Union, cast\n\nfrom dagster import check\nfrom dagster.core.definitions.events import AssetKey, AssetObservation\nfrom dagster.core.definitions.metadata import MetadataEntry, PartitionMetadataEntry\nfrom dagster.core.definitions.op_definition import OpDefinition\nfrom dagster.core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster.core.definitions.solid_definition import SolidDefinition\nfrom dagster.core.definitions.time_window_partitions import (\n    TimeWindow,\n    TimeWindowPartitionsDefinition,\n)\nfrom dagster.core.errors import DagsterInvariantViolationError\n\nif TYPE_CHECKING:\n    from dagster.core.definitions.resource_definition import Resources\n    from dagster.core.events import DagsterEvent\n    from dagster.core.execution.context.system import StepExecutionContext\n    from dagster.core.log_manager import DagsterLogManager\n    from dagster.core.types.dagster_type import DagsterType\n\n    from .output import OutputContext\n\n\n
[docs]class InputContext:\n """\n The ``context`` object available to the load_input method of :py:class:`RootInputManager`.\n\n Attributes:\n name (Optional[str]): The name of the input that we're loading.\n pipeline_name (Optional[str]): The name of the pipeline.\n solid_def (Optional[SolidDefinition]): The definition of the solid that's loading the input.\n config (Optional[Any]): The config attached to the input that we're loading.\n metadata (Optional[Dict[str, Any]]): A dict of metadata that is assigned to the\n InputDefinition that we're loading for.\n upstream_output (Optional[OutputContext]): Info about the output that produced the object\n we're loading.\n dagster_type (Optional[DagsterType]): The type of this input.\n log (Optional[DagsterLogManager]): The log manager to use for this input.\n resource_config (Optional[Dict[str, Any]]): The config associated with the resource that\n initializes the RootInputManager.\n resources (Optional[Resources]): The resources required by the resource that initializes the\n input manager. If using the :py:func:`@root_input_manager` decorator, these resources\n correspond to those requested with the `required_resource_keys` parameter.\n op_def (Optional[OpDefinition]): The definition of the op that's loading the input.\n """\n\n def __init__(\n self,\n name: Optional[str] = None,\n pipeline_name: Optional[str] = None,\n solid_def: Optional["SolidDefinition"] = None,\n config: Optional[Any] = None,\n metadata: Optional[Dict[str, Any]] = None,\n upstream_output: Optional["OutputContext"] = None,\n dagster_type: Optional["DagsterType"] = None,\n log_manager: Optional["DagsterLogManager"] = None,\n resource_config: Optional[Dict[str, Any]] = None,\n resources: Optional[Union["Resources", Dict[str, Any]]] = None,\n step_context: Optional["StepExecutionContext"] = None,\n op_def: Optional["OpDefinition"] = None,\n ):\n from dagster.core.definitions.resource_definition import IContainsGenerator, Resources\n from dagster.core.execution.build_resources import build_resources\n\n self._name = name\n self._pipeline_name = pipeline_name\n check.invariant(\n solid_def is None or op_def is None, "Can't provide both a solid_def and an op_def arg"\n )\n self._solid_def = solid_def or op_def\n self._config = config\n self._metadata = metadata\n self._upstream_output = upstream_output\n self._dagster_type = dagster_type\n self._log = log_manager\n self._resource_config = resource_config\n self._step_context = step_context\n\n if isinstance(resources, Resources):\n self._resources_cm = None\n self._resources = resources\n else:\n self._resources_cm = build_resources(\n check.opt_dict_param(resources, "resources", key_type=str)\n )\n self._resources = self._resources_cm.__enter__() # pylint: disable=no-member\n self._resources_contain_cm = isinstance(self._resources, IContainsGenerator)\n self._cm_scope_entered = False\n\n self._events: List["DagsterEvent"] = []\n self._observations: List[AssetObservation] = []\n self._metadata_entries: List[Union[MetadataEntry, PartitionMetadataEntry]] = []\n\n def __enter__(self):\n if self._resources_cm:\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc):\n if self._resources_cm:\n self._resources_cm.__exit__(*exc) # pylint: disable=no-member\n\n def __del__(self):\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n self._resources_cm.__exit__(None, None, None) # pylint: disable=no-member\n\n @property\n def has_input_name(self) -> bool:\n """If we're the 
InputContext is being used to load the result of a run from outside the run,\n then it won't have an input name."""\n return self._name is not None\n\n @property\n def name(self) -> str:\n if self._name is None:\n raise DagsterInvariantViolationError(\n "Attempting to access name, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._name\n\n @property\n def pipeline_name(self) -> str:\n if self._pipeline_name is None:\n raise DagsterInvariantViolationError(\n "Attempting to access pipeline_name, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._pipeline_name\n\n @property\n def solid_def(self) -> "SolidDefinition":\n if self._solid_def is None:\n raise DagsterInvariantViolationError(\n "Attempting to access solid_def, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._solid_def\n\n @property\n def op_def(self) -> "OpDefinition":\n if self._solid_def is None:\n raise DagsterInvariantViolationError(\n "Attempting to access op_def, "\n "but it was not provided when constructing the InputContext"\n )\n\n return cast(OpDefinition, self._solid_def)\n\n @property\n def config(self) -> Any:\n return self._config\n\n @property\n def metadata(self) -> Optional[Dict[str, Any]]:\n return self._metadata\n\n @property\n def upstream_output(self) -> Optional["OutputContext"]:\n return self._upstream_output\n\n @property\n def dagster_type(self) -> "DagsterType":\n if self._dagster_type is None:\n raise DagsterInvariantViolationError(\n "Attempting to access dagster_type, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._dagster_type\n\n @property\n def log(self) -> "DagsterLogManager":\n if self._log is None:\n raise DagsterInvariantViolationError(\n "Attempting to access log, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._log\n\n @property\n def resource_config(self) -> Optional[Dict[str, Any]]:\n return self._resource_config\n\n @property\n def resources(self) -> Any:\n if self._resources is None:\n raise DagsterInvariantViolationError(\n "Attempting to access resources, "\n "but it was not provided when constructing the InputContext"\n )\n\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access "\n "resources outside of context manager scope. You can use the following syntax to "\n "open a context manager: `with build_input_context(...) 
as context:`"\n )\n return self._resources\n\n @property\n def asset_key(self) -> Optional[AssetKey]:\n matching_input_defs = [\n input_def\n for input_def in cast(SolidDefinition, self._solid_def).input_defs\n if input_def.name == self.name\n ]\n check.invariant(len(matching_input_defs) == 1)\n return matching_input_defs[0].get_asset_key(self)\n\n @property\n def step_context(self) -> "StepExecutionContext":\n if self._step_context is None:\n raise DagsterInvariantViolationError(\n "Attempting to access step_context, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._step_context\n\n @property\n def has_partition_key(self) -> bool:\n """Whether the current run is a partitioned run"""\n return self.step_context.has_partition_key\n\n @property\n def partition_key(self) -> str:\n """The partition key for the current run.\n\n Raises an error if the current run is not a partitioned run.\n """\n return self.step_context.partition_key\n\n @property\n def has_asset_partitions(self) -> bool:\n if self._step_context is not None:\n return self._step_context.has_asset_partitions_for_input(self.name)\n else:\n return False\n\n @property\n def asset_partition_key(self) -> str:\n """The partition key for input asset.\n\n Raises an error if the input asset has no partitioning, or if the run covers a partition\n range for the input asset.\n """\n return self.step_context.asset_partition_key_for_input(self.name)\n\n @property\n def asset_partition_key_range(self) -> PartitionKeyRange:\n """The partition key range for input asset.\n\n Raises an error if the input asset has no partitioning.\n """\n return self.step_context.asset_partition_key_range_for_input(self.name)\n\n @property\n def asset_partitions_time_window(self) -> TimeWindow:\n """The time window for the partitions of the input asset.\n\n Raises an error if either of the following are true:\n - The input asset has no partitioning.\n - The input asset is not partitioned with a TimeWindowPartitionsDefinition.\n """\n if self.upstream_output is None:\n check.failed("InputContext needs upstream_output to get asset_partitions_time_window")\n\n partitions_def = self.upstream_output.solid_def.output_def_named(\n self.upstream_output.name\n ).asset_partitions_def\n\n if not partitions_def:\n raise ValueError(\n "Tried to get asset partitions for an output that does not correspond to a "\n "partitioned asset."\n )\n\n if not isinstance(partitions_def, TimeWindowPartitionsDefinition):\n raise ValueError(\n "Tried to get asset partitions for an input that correponds to a partitioned "\n "asset that is not partitioned with a TimeWindowPartitionsDefinition."\n )\n\n partition_key_range = self.asset_partition_key_range\n return TimeWindow(\n partitions_def.time_window_for_partition_key(partition_key_range.start).start,\n partitions_def.time_window_for_partition_key(partition_key_range.end).end,\n )\n\n
[docs] def consume_events(self) -> Iterator["DagsterEvent"]:\n """Pops and yields all user-generated events that have been recorded from this context.\n\n If consume_events has not yet been called, this will yield all logged events since the call to `handle_input`. If consume_events has been called, it will yield all events since the last time consume_events was called. Designed for internal use. Users should never need to invoke this method.\n """\n\n events = self._events\n self._events = []\n yield from events
\n\n
[docs] def add_input_metadata(\n self,\n metadata: Dict[str, Any],\n description: Optional[str] = None,\n ) -> None:\n """Accepts a dictionary of metadata. Metadata entries will appear on the LOADED_INPUT event.\n If the input is an asset, metadata will be attached to an asset observation.\n\n The asset observation will be yielded from the run and appear in the event log.\n Only valid if the context has an asset key.\n """\n from dagster.core.definitions.metadata import normalize_metadata\n from dagster.core.events import DagsterEvent\n\n metadata = check.dict_param(metadata, "metadata", key_type=str)\n self._metadata_entries.extend(normalize_metadata(metadata, []))\n if self.asset_key:\n check.opt_str_param(description, "description")\n\n observation = AssetObservation(\n asset_key=self.asset_key,\n description=description,\n partition=self.asset_partition_key if self.has_asset_partitions else None,\n metadata=metadata,\n )\n self._observations.append(observation)\n if self._step_context:\n self._events.append(DagsterEvent.asset_observation(self._step_context, observation))
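The block below is not part of the module source; it is an illustrative sketch (hypothetical IO manager and op names) of calling ``add_input_metadata`` from ``load_input``, with the context built via ``build_input_context`` (documented later on this page) and an op definition supplied so the input's asset key can be resolved.

.. code-block:: python

    from dagster import IOManager, In, build_input_context, op

    class CountingIOManager(IOManager):
        def handle_output(self, context, obj):
            pass

        def load_input(self, context):
            value = [1, 2, 3]
            # Metadata recorded here appears on the LOADED_INPUT event.
            context.add_input_metadata({"row_count": len(value)})
            return value

    @op(ins={"numbers": In()})
    def consume(numbers):
        return sum(numbers)

    def test_load_input():
        context = build_input_context(name="numbers", op_def=consume)
        assert CountingIOManager().load_input(context) == [1, 2, 3]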
\n\n
[docs] def get_observations(\n self,\n ) -> List[AssetObservation]:\n """Retrieve the list of user-generated asset observations that were observed via the context.\n\n User-generated events that were yielded will not appear in this list.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import IOManager, build_input_context, AssetObservation\n\n class MyIOManager(IOManager):\n def load_input(self, context, obj):\n ...\n\n def test_load_input():\n mgr = MyIOManager()\n context = build_input_context()\n mgr.load_input(context)\n observations = context.get_observations()\n ...\n """\n return self._observations
\n\n def consume_metadata_entries(self) -> List[Union[MetadataEntry, PartitionMetadataEntry]]:\n result = self._metadata_entries\n self._metadata_entries = []\n return result
\n\n\n
[docs]def build_input_context(\n name: Optional[str] = None,\n config: Optional[Any] = None,\n metadata: Optional[Dict[str, Any]] = None,\n upstream_output: Optional["OutputContext"] = None,\n dagster_type: Optional["DagsterType"] = None,\n resource_config: Optional[Dict[str, Any]] = None,\n resources: Optional[Dict[str, Any]] = None,\n op_def: Optional[OpDefinition] = None,\n step_context: Optional["StepExecutionContext"] = None,\n) -> "InputContext":\n """Builds input context from provided parameters.\n\n ``build_input_context`` can be used as either a function, or a context manager. If resources\n that are also context managers are provided, then ``build_input_context`` must be used as a\n context manager.\n\n Args:\n name (Optional[str]): The name of the input that we're loading.\n config (Optional[Any]): The config attached to the input that we're loading.\n metadata (Optional[Dict[str, Any]]): A dict of metadata that is assigned to the\n InputDefinition that we're loading for.\n upstream_output (Optional[OutputContext]): Info about the output that produced the object\n we're loading.\n dagster_type (Optional[DagsterType]): The type of this input.\n resource_config (Optional[Dict[str, Any]]): The resource config to make available from the\n input context. This usually corresponds to the config provided to the resource that\n loads the input manager.\n resources (Optional[Dict[str, Any]]): The resources to make available from the context.\n For a given key, you can provide either an actual instance of an object, or a resource\n definition.\n asset_key (Optional[AssetKey]): The asset key attached to the InputDefinition.\n op_def (Optional[OpDefinition]): The definition of the op that's loading the input.\n step_context (Optional[StepExecutionContext]): For internal use.\n\n Examples:\n\n .. code-block:: python\n\n build_input_context()\n\n with build_input_context(resources={"foo": context_manager_resource}) as context:\n do_something\n """\n from dagster.core.execution.context.output import OutputContext\n from dagster.core.execution.context.system import StepExecutionContext\n from dagster.core.execution.context_creation_pipeline import initialize_console_manager\n from dagster.core.types.dagster_type import DagsterType\n\n name = check.opt_str_param(name, "name")\n metadata = check.opt_dict_param(metadata, "metadata", key_type=str)\n upstream_output = check.opt_inst_param(upstream_output, "upstream_output", OutputContext)\n dagster_type = check.opt_inst_param(dagster_type, "dagster_type", DagsterType)\n resource_config = check.opt_dict_param(resource_config, "resource_config", key_type=str)\n resources = check.opt_dict_param(resources, "resources", key_type=str)\n op_def = check.opt_inst_param(op_def, "op_def", OpDefinition)\n step_context = check.opt_inst_param(step_context, "step_context", StepExecutionContext)\n\n return InputContext(\n name=name,\n pipeline_name=None,\n config=config,\n metadata=metadata,\n upstream_output=upstream_output,\n dagster_type=dagster_type,\n log_manager=initialize_console_manager(None),\n resource_config=resource_config,\n resources=resources,\n step_context=step_context,\n op_def=op_def,\n )
\n
", "current_page_name": "_modules/dagster/core/execution/context/input", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.execution.context.input"}, "invocation": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.execution.context.invocation

\n# pylint: disable=super-init-not-called\nfrom typing import AbstractSet, Any, Dict, List, Mapping, NamedTuple, Optional, Set, Union, cast\n\nfrom dagster import check\nfrom dagster.config import Shape\nfrom dagster.core.definitions.composition import PendingNodeInvocation\nfrom dagster.core.definitions.dependency import Node, NodeHandle\nfrom dagster.core.definitions.events import (\n    AssetMaterialization,\n    AssetObservation,\n    ExpectationResult,\n    Materialization,\n    UserEvent,\n)\nfrom dagster.core.definitions.hook_definition import HookDefinition\nfrom dagster.core.definitions.mode import ModeDefinition\nfrom dagster.core.definitions.op_definition import OpDefinition\nfrom dagster.core.definitions.pipeline_definition import PipelineDefinition\nfrom dagster.core.definitions.resource_definition import (\n    IContainsGenerator,\n    Resources,\n    ScopedResourcesBuilder,\n)\nfrom dagster.core.definitions.solid_definition import SolidDefinition\nfrom dagster.core.definitions.step_launcher import StepLauncher\nfrom dagster.core.errors import (\n    DagsterInvalidConfigError,\n    DagsterInvalidInvocationError,\n    DagsterInvalidPropertyError,\n    DagsterInvariantViolationError,\n)\nfrom dagster.core.execution.build_resources import build_resources\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.log_manager import DagsterLogManager\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.core.types.dagster_type import DagsterType\nfrom dagster.utils import merge_dicts\nfrom dagster.utils.forked_pdb import ForkedPdb\n\nfrom .compute import OpExecutionContext\nfrom .system import StepExecutionContext, TypeCheckContext\n\n\ndef _property_msg(prop_name: str, method_name: str) -> str:\n    return (\n        f"The {prop_name} {method_name} is not set on the context when a solid is directly invoked."\n    )\n\n\nclass UnboundSolidExecutionContext(OpExecutionContext):\n    """The ``context`` object available as the first argument to a solid's compute function when\n    being invoked directly. 
Can also be used as a context manager.\n    """\n\n    def __init__(\n        self,\n        solid_config: Any,\n        resources_dict: Optional[Dict[str, Any]],\n        resources_config: Dict[str, Any],\n        instance: Optional[DagsterInstance],\n        partition_key: Optional[str],\n    ):  # pylint: disable=super-init-not-called\n        from dagster.core.execution.api import ephemeral_instance_if_missing\n        from dagster.core.execution.context_creation_pipeline import initialize_console_manager\n\n        self._solid_config = solid_config\n\n        self._instance_provided = (\n            check.opt_inst_param(instance, "instance", DagsterInstance) is not None\n        )\n        # Construct ephemeral instance if missing\n        self._instance_cm = ephemeral_instance_if_missing(instance)\n        # Pylint can't infer that the ephemeral_instance context manager has an __enter__ method,\n        # so ignore lint error\n        self._instance = self._instance_cm.__enter__()  # pylint: disable=no-member\n\n        self._resources_config = resources_config\n        # Open resource context manager\n        self._resources_contain_cm = False\n        self._resources_cm = build_resources(\n            resources=check.opt_dict_param(resources_dict, "resources_dict", key_type=str),\n            instance=instance,\n            resource_config=resources_config,\n        )\n        self._resources = self._resources_cm.__enter__()  # pylint: disable=no-member\n        self._resources_contain_cm = isinstance(self._resources, IContainsGenerator)\n\n        self._log = initialize_console_manager(None)\n        self._pdb: Optional[ForkedPdb] = None\n        self._cm_scope_entered = False\n        self._partition_key = partition_key\n        self._user_events: List[UserEvent] = []\n        self._output_metadata: Dict[str, Any] = {}\n\n    def __enter__(self):\n        self._cm_scope_entered = True\n        return self\n\n    def __exit__(self, *exc):\n        self._resources_cm.__exit__(*exc)  # pylint: disable=no-member\n        if self._instance_provided:\n            self._instance_cm.__exit__(*exc)  # pylint: disable=no-member\n\n    def __del__(self):\n        if self._resources_contain_cm and not self._cm_scope_entered:\n            self._resources_cm.__exit__(None, None, None)  # pylint: disable=no-member\n        if self._instance_provided and not self._cm_scope_entered:\n            self._instance_cm.__exit__(None, None, None)  # pylint: disable=no-member\n\n    @property\n    def solid_config(self) -> Any:\n        return self._solid_config\n\n    @property\n    def resources(self) -> Resources:\n        if self._resources_contain_cm and not self._cm_scope_entered:\n            raise DagsterInvariantViolationError(\n                "At least one provided resource is a generator, but attempting to access "\n                "resources outside of context manager scope. You can use the following syntax to "\n                "open a context manager: `with build_solid_context(...) as context:`"\n            )\n        return self._resources\n\n    @property\n    def pipeline_run(self) -> PipelineRun:\n        raise DagsterInvalidPropertyError(_property_msg("pipeline_run", "property"))\n\n    @property\n    def instance(self) -> DagsterInstance:\n        return self._instance\n\n    @property\n    def pdb(self) -> ForkedPdb:\n        """dagster.utils.forked_pdb.ForkedPdb: Gives access to pdb debugging from within the solid.\n\n        Example:\n\n        .. 
code-block:: python\n\n            @solid\n            def debug_solid(context):\n                context.pdb.set_trace()\n\n        """\n        if self._pdb is None:\n            self._pdb = ForkedPdb()\n\n        return self._pdb\n\n    @property\n    def step_launcher(self) -> Optional[StepLauncher]:\n        raise DagsterInvalidPropertyError(_property_msg("step_launcher", "property"))\n\n    @property\n    def run_id(self) -> str:\n        """str: Hard-coded value to indicate that we are directly invoking solid."""\n        return "EPHEMERAL"\n\n    @property\n    def run_config(self) -> dict:\n        raise DagsterInvalidPropertyError(_property_msg("run_config", "property"))\n\n    @property\n    def pipeline_def(self) -> PipelineDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("pipeline_def", "property"))\n\n    @property\n    def pipeline_name(self) -> str:\n        raise DagsterInvalidPropertyError(_property_msg("pipeline_name", "property"))\n\n    @property\n    def mode_def(self) -> ModeDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("mode_def", "property"))\n\n    @property\n    def log(self) -> DagsterLogManager:\n        """DagsterLogManager: A console manager constructed for this context."""\n        return self._log\n\n    @property\n    def solid_handle(self) -> NodeHandle:\n        raise DagsterInvalidPropertyError(_property_msg("solid_handle", "property"))\n\n    @property\n    def solid(self) -> Node:\n        raise DagsterInvalidPropertyError(_property_msg("solid", "property"))\n\n    @property\n    def solid_def(self) -> SolidDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("solid_def", "property"))\n\n    @property\n    def has_partition_key(self) -> bool:\n        return self._partition_key is not None\n\n    @property\n    def partition_key(self) -> str:\n        if self._partition_key:\n            return self._partition_key\n        else:\n            check.failed("Tried to access partition_key for a non-partitioned run")\n\n    def has_tag(self, key: str) -> bool:\n        raise DagsterInvalidPropertyError(_property_msg("has_tag", "method"))\n\n    def get_tag(self, key: str) -> str:\n        raise DagsterInvalidPropertyError(_property_msg("get_tag", "method"))\n\n    def get_step_execution_context(self) -> StepExecutionContext:\n        raise DagsterInvalidPropertyError(_property_msg("get_step_execution_context", "methods"))\n\n    def bind(\n        self, solid_def_or_invocation: Union[SolidDefinition, PendingNodeInvocation]\n    ) -> "BoundSolidExecutionContext":\n\n        solid_def = (\n            solid_def_or_invocation\n            if isinstance(solid_def_or_invocation, SolidDefinition)\n            else solid_def_or_invocation.node_def.ensure_solid_def()\n        )\n\n        _validate_resource_requirements(self.resources, solid_def)\n\n        solid_config = _resolve_bound_config(self.solid_config, solid_def)\n\n        return BoundSolidExecutionContext(\n            solid_def=solid_def,\n            solid_config=solid_config,\n            resources=self.resources,\n            resources_config=self._resources_config,\n            instance=self.instance,\n            log_manager=self.log,\n            pdb=self.pdb,\n            tags=solid_def_or_invocation.tags\n            if isinstance(solid_def_or_invocation, PendingNodeInvocation)\n            else None,\n            hook_defs=solid_def_or_invocation.hook_defs\n            if isinstance(solid_def_or_invocation, PendingNodeInvocation)\n    
        else None,\n            alias=solid_def_or_invocation.given_alias\n            if isinstance(solid_def_or_invocation, PendingNodeInvocation)\n            else None,\n            user_events=self._user_events,\n            output_metadata=self._output_metadata,\n        )\n\n    def get_events(self) -> List[UserEvent]:\n        """Retrieve the list of user-generated events that were logged via the context.\n\n        **Examples:**\n\n        .. code-block:: python\n\n            from dagster import op, build_op_context, AssetMaterialization, ExpectationResult\n\n            @op\n            def my_op(context):\n                ...\n\n            def test_my_op():\n                context = build_op_context()\n                my_op(context)\n                all_user_events = context.get_events()\n                materializations = [event for event in all_user_events if isinstance(event, AssetMaterialization)]\n                expectation_results = [event for event in all_user_events if isinstance(event, ExpectationResult)]\n                ...\n        """\n\n        return self._user_events\n\n    def get_output_metadata(\n        self, output_name: str, mapping_key: Optional[str] = None\n    ) -> Optional[Mapping[str, Any]]:\n        """Retrieve metadata that was logged for an output and mapping_key, if it exists.\n\n        If metadata cannot be found for the particular output_name/mapping_key combination, None will be returned.\n\n        Args:\n            output_name (str): The name of the output to retrieve logged metadata for.\n            mapping_key (Optional[str]): The mapping key to retrieve metadata for (only applies when using dynamic outputs).\n\n        Returns:\n            Optional[Mapping[str, Any]]: The metadata values present for the output_name/mapping_key combination, if present.\n        """\n        metadata = self._output_metadata.get(output_name)\n        if mapping_key and metadata:\n            return metadata.get(mapping_key)\n        return metadata\n\n\ndef _validate_resource_requirements(resources: "Resources", solid_def: SolidDefinition) -> None:\n    """Validate correctness of resources against required resource keys"""\n\n    resources_dict = resources._asdict()  # type: ignore[attr-defined]\n\n    required_resource_keys: AbstractSet[str] = solid_def.required_resource_keys or set()\n    for resource_key in required_resource_keys:\n        if resource_key not in resources_dict:\n            raise DagsterInvalidInvocationError(\n                f'{solid_def.node_type_str} "{solid_def.name}" requires resource "{resource_key}", but no resource '\n                "with that key was found on the context."\n            )\n\n\ndef _resolve_bound_config(solid_config: Any, solid_def: SolidDefinition) -> Any:\n    """Validate config against config schema, and return validated config."""\n    from dagster.config.validate import process_config\n\n    # Config processing system expects the top level config schema to be a dictionary, but solid\n    # config schema can be scalar. 
Thus, we wrap it in another layer of indirection.\n    outer_config_shape = Shape({"config": solid_def.get_config_field()})\n    config_evr = process_config(\n        outer_config_shape, {"config": solid_config} if solid_config else {}\n    )\n    if not config_evr.success:\n        raise DagsterInvalidConfigError(\n            f"Error in config for {solid_def.node_type_str} ",\n            config_evr.errors,\n            solid_config,\n        )\n    validated_config = cast(Dict, config_evr.value).get("config")\n    mapped_config_evr = solid_def.apply_config_mapping({"config": validated_config})\n    if not mapped_config_evr.success:\n        raise DagsterInvalidConfigError(\n            f"Error in config for {solid_def.node_type_str} ",\n            mapped_config_evr.errors,\n            solid_config,\n        )\n    validated_config = cast(Dict, mapped_config_evr.value).get("config")\n    return validated_config\n\n\nclass BoundSolidExecutionContext(OpExecutionContext):\n    """The solid execution context that is passed to the compute function during invocation.\n\n    This context is bound to a specific solid definition, for which the resources and config have\n    been validated.\n    """\n\n    def __init__(\n        self,\n        solid_def: SolidDefinition,\n        solid_config: Any,\n        resources: "Resources",\n        resources_config: Dict[str, Any],\n        instance: DagsterInstance,\n        log_manager: DagsterLogManager,\n        pdb: Optional[ForkedPdb],\n        tags: Optional[Dict[str, str]],\n        hook_defs: Optional[AbstractSet[HookDefinition]],\n        alias: Optional[str],\n        user_events: List[UserEvent],\n        output_metadata: Dict[str, Any],\n    ):\n        self._solid_def = solid_def\n        self._solid_config = solid_config\n        self._resources = resources\n        self._instance = instance\n        self._log = log_manager\n        self._pdb = pdb\n        self._tags = merge_dicts(self._solid_def.tags, tags) if tags else self._solid_def.tags\n        self._hook_defs = hook_defs\n        self._alias = alias if alias else self._solid_def.name\n        self._resources_config = resources_config\n        self._user_events: List[UserEvent] = user_events\n        self._seen_outputs: Dict[str, Union[str, Set[str]]] = {}\n        self._output_metadata: Dict[str, Any] = output_metadata\n\n    @property\n    def solid_config(self) -> Any:\n        return self._solid_config\n\n    @property\n    def resources(self) -> Resources:\n        return self._resources\n\n    @property\n    def pipeline_run(self) -> PipelineRun:\n        raise DagsterInvalidPropertyError(_property_msg("pipeline_run", "property"))\n\n    @property\n    def instance(self) -> DagsterInstance:\n        return self._instance\n\n    @property\n    def pdb(self) -> ForkedPdb:\n        """dagster.utils.forked_pdb.ForkedPdb: Gives access to pdb debugging from within the solid.\n\n        Example:\n\n        .. 
code-block:: python\n\n            @solid\n            def debug_solid(context):\n                context.pdb.set_trace()\n\n        """\n        if self._pdb is None:\n            self._pdb = ForkedPdb()\n\n        return self._pdb\n\n    @property\n    def step_launcher(self) -> Optional[StepLauncher]:\n        raise DagsterInvalidPropertyError(_property_msg("step_launcher", "property"))\n\n    @property\n    def run_id(self) -> str:\n        """str: Hard-coded value to indicate that we are directly invoking solid."""\n        return "EPHEMERAL"\n\n    @property\n    def run_config(self) -> dict:\n        run_config = {}\n        if self._solid_config:\n            run_config["solids"] = {self._solid_def.name: {"config": self._solid_config}}\n        run_config["resources"] = self._resources_config\n        return run_config\n\n    @property\n    def pipeline_def(self) -> PipelineDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("pipeline_def", "property"))\n\n    @property\n    def pipeline_name(self) -> str:\n        raise DagsterInvalidPropertyError(_property_msg("pipeline_name", "property"))\n\n    @property\n    def mode_def(self) -> ModeDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("mode_def", "property"))\n\n    @property\n    def log(self) -> DagsterLogManager:\n        """DagsterLogManager: A console manager constructed for this context."""\n        return self._log\n\n    @property\n    def solid_handle(self) -> NodeHandle:\n        raise DagsterInvalidPropertyError(_property_msg("solid_handle", "property"))\n\n    @property\n    def solid(self) -> Node:\n        raise DagsterInvalidPropertyError(_property_msg("solid", "property"))\n\n    @property\n    def solid_def(self) -> SolidDefinition:\n        return self._solid_def\n\n    def has_tag(self, key: str) -> bool:\n        return key in self._tags\n\n    def get_tag(self, key: str) -> str:\n        return self._tags.get(key)\n\n    @property\n    def alias(self) -> str:\n        return self._alias\n\n    def get_step_execution_context(self) -> StepExecutionContext:\n        raise DagsterInvalidPropertyError(_property_msg("get_step_execution_context", "methods"))\n\n    def for_type(self, dagster_type: DagsterType) -> TypeCheckContext:\n        resources = cast(NamedTuple, self.resources)\n        return TypeCheckContext(\n            self.run_id, self.log, ScopedResourcesBuilder(resources._asdict()), dagster_type\n        )\n\n    def get_mapping_key(self) -> Optional[str]:\n        return None\n\n    def describe_op(self):\n        if isinstance(self.solid_def, OpDefinition):\n            return f'op "{self.solid_def.name}"'\n\n        return f'solid "{self.solid_def.name}"'\n\n    def log_event(self, event: UserEvent) -> None:\n\n        check.inst_param(\n            event,\n            "event",\n            (AssetMaterialization, AssetObservation, ExpectationResult, Materialization),\n        )\n        self._user_events.append(event)\n\n    def observe_output(self, output_name: str, mapping_key: Optional[str] = None) -> None:\n        if mapping_key:\n            if output_name not in self._seen_outputs:\n                self._seen_outputs[output_name] = set()\n            cast(Set[str], self._seen_outputs[output_name]).add(mapping_key)\n        else:\n            self._seen_outputs[output_name] = "seen"\n\n    def has_seen_output(self, output_name: str, mapping_key: Optional[str] = None) -> bool:\n        if mapping_key:\n            return (\n                output_name in 
self._seen_outputs and mapping_key in self._seen_outputs[output_name]\n            )\n        return output_name in self._seen_outputs\n\n    def add_output_metadata(\n        self,\n        metadata: Mapping[str, Any],\n        output_name: Optional[str] = None,\n        mapping_key: Optional[str] = None,\n    ) -> None:\n        """Add metadata to one of the outputs of an op.\n\n        This can only be used once per output in the body of an op. Using this method with the same output_name more than once within an op will result in an error.\n\n        Args:\n            metadata (Mapping[str, Any]): The metadata to attach to the output\n            output_name (Optional[str]): The name of the output to attach metadata to. If there is only one output on the op, then this argument does not need to be provided. The metadata will automatically be attached to the only output.\n            mapping_key (Optional[str]): The mapping key of the output to attach metadata to (only applies when using dynamic outputs).\n\n        **Examples:**\n\n        .. code-block:: python\n\n            from dagster import Out, op\n            from typing import Tuple\n\n            @op\n            def add_metadata(context):\n                context.add_output_metadata({"foo": "bar"})\n                return 5 # Since the default output is called "result", metadata will be attached to the output "result".\n\n            @op(out={"a": Out(), "b": Out()})\n            def add_metadata_two_outputs(context) -> Tuple[str, int]:\n                context.add_output_metadata({"foo": "bar"}, output_name="b")\n                context.add_output_metadata({"baz": "bat"}, output_name="a")\n\n                return ("dog", 5)\n\n        """\n        metadata = check.dict_param(metadata, "metadata", key_type=str)\n        output_name = check.opt_str_param(output_name, "output_name")\n        mapping_key = check.opt_str_param(mapping_key, "mapping_key")\n\n        if output_name is None and len(self.solid_def.output_defs) == 1:\n            output_def = self.solid_def.output_defs[0]\n            output_name = output_def.name\n        elif output_name is None:\n            raise DagsterInvariantViolationError(\n                "Attempted to log metadata without providing output_name, but multiple outputs exist. Please provide an output_name to the invocation of `context.add_output_metadata`."\n            )\n        else:\n            output_def = self.solid_def.output_def_named(output_name)\n\n        if self.has_seen_output(output_name, mapping_key):\n            output_desc = (\n                f"output '{output_def.name}'"\n                if not mapping_key\n                else f"output '{output_def.name}' with mapping_key '{mapping_key}'"\n            )\n            raise DagsterInvariantViolationError(\n                f"In {self.solid_def.node_type_str} '{self.solid_def.name}', attempted to log output metadata for {output_desc} which has already been yielded. Metadata must be logged before the output is yielded."\n            )\n        if output_def.is_dynamic and not mapping_key:\n            raise DagsterInvariantViolationError(\n                f"In {self.solid_def.node_type_str} '{self.solid_def.name}', attempted to log metadata for dynamic output '{output_def.name}' without providing a mapping key. 
When logging metadata for a dynamic output, it is necessary to provide a mapping key."\n            )\n\n        output_name = output_def.name\n        if output_name in self._output_metadata:\n            if not mapping_key or mapping_key in self._output_metadata[output_name]:\n                raise DagsterInvariantViolationError(\n                    f"In {self.solid_def.node_type_str} '{self.solid_def.name}', attempted to log metadata for output '{output_name}' more than once."\n                )\n        if mapping_key:\n            if not output_name in self._output_metadata:\n                self._output_metadata[output_name] = {}\n            self._output_metadata[output_name][mapping_key] = metadata\n\n        else:\n            self._output_metadata[output_name] = metadata\n\n\n
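# A minimal sketch (not part of the module source above) showing how the context produced by
# ``build_op_context`` can be inspected after directly invoking an op, using the
# ``get_events`` and ``get_output_metadata`` methods defined above. The op, its asset key,
# and the metadata values are hypothetical.

from dagster import AssetMaterialization, build_op_context, op

@op
def emits_events(context):
    # Logged events and output metadata are recorded on the invocation context.
    context.log_event(AssetMaterialization(asset_key="my_table"))
    context.add_output_metadata({"row_count": 5})
    return 5

def test_emits_events():
    context = build_op_context()
    assert emits_events(context) == 5
    # ``get_events`` returns the user events logged during the invocation.
    assert len(context.get_events()) == 1
    # ``get_output_metadata`` keys off the output name ("result" by default).
    assert context.get_output_metadata("result") == {"row_count": 5}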
[docs]def build_op_context(\n resources: Optional[Dict[str, Any]] = None,\n op_config: Any = None,\n resources_config: Optional[Dict[str, Any]] = None,\n instance: Optional[DagsterInstance] = None,\n config: Any = None,\n partition_key: Optional[str] = None,\n) -> OpExecutionContext:\n """Builds op execution context from provided parameters.\n\n ``op`` is currently built on top of ``solid``, and thus this function creates a ``SolidExecutionContext``.\n ``build_op_context`` can be used as either a function or context manager. If there is a\n provided resource that is a context manager, then ``build_op_context`` must be used as a\n context manager. This function can be used to provide the context argument when directly\n invoking an op.\n\n Args:\n resources (Optional[Dict[str, Any]]): The resources to provide to the context. These can be\n either values or resource definitions.\n op_config (Optional[Any]): The op config to provide to the context.\n instance (Optional[DagsterInstance]): The dagster instance configured for the context.\n Defaults to DagsterInstance.ephemeral().\n\n Examples:\n .. code-block:: python\n\n context = build_op_context()\n op_to_invoke(context)\n\n with build_op_context(resources={"foo": context_manager_resource}) as context:\n op_to_invoke(context)\n """\n\n if op_config and config:\n raise DagsterInvalidInvocationError(\n "Attempted to invoke ``build_op_context`` with both ``op_config``, and its "\n "legacy version, ``config``. Please provide one or the other."\n )\n\n op_config = op_config if op_config else config\n return build_solid_context(\n resources=resources,\n resources_config=resources_config,\n solid_config=op_config,\n instance=instance,\n partition_key=partition_key,\n )
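# A minimal sketch (not part of the module source above) of supplying resources through
# ``build_op_context`` when invoking an op directly. The ``api_client`` resource key and
# the fake value below are hypothetical.

from dagster import build_op_context, op

@op(required_resource_keys={"api_client"})
def fetch(context):
    # Required resource keys are validated against the provided resources when the
    # context is bound to the op (see _validate_resource_requirements above).
    return context.resources.api_client

def test_fetch():
    context = build_op_context(resources={"api_client": "fake-client"})
    assert fetch(context) == "fake-client"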
\n\n\n
[docs]def build_solid_context(\n resources: Optional[Dict[str, Any]] = None,\n solid_config: Any = None,\n resources_config: Optional[Dict[str, Any]] = None,\n instance: Optional[DagsterInstance] = None,\n config: Any = None,\n partition_key: Optional[str] = None,\n) -> UnboundSolidExecutionContext:\n """Builds solid execution context from provided parameters.\n\n ``build_solid_context`` can be used as either a function or context manager. If there is a\n provided resource that is a context manager, then ``build_solid_context`` must be used as a\n context manager. This function can be used to provide the context argument when directly\n invoking a solid.\n\n Args:\n resources (Optional[Dict[str, Any]]): The resources to provide to the context. These can be\n either values or resource definitions.\n solid_config (Optional[Any]): The solid config to provide to the context. The value provided\n here will be available as ``context.solid_config``.\n resources_config (Optional[Dict[str, Any]]): Configuration for any resource definitions\n provided to the resources arg. The configuration under a specific key should match the\n resource under a specific key in the resources dictionary.\n instance (Optional[DagsterInstance]): The dagster instance configured for the context.\n Defaults to DagsterInstance.ephemeral().\n\n Examples:\n .. code-block:: python\n\n context = build_solid_context()\n solid_to_invoke(context)\n\n with build_solid_context(resources={"foo": context_manager_resource}) as context:\n solid_to_invoke(context)\n """\n\n if solid_config and config:\n raise DagsterInvalidInvocationError(\n "Attempted to invoke ``build_solid_context`` with both ``solid_config``, and its "\n "legacy version, ``config``. Please provide one or the other."\n )\n\n solid_config = solid_config if solid_config else config\n\n return UnboundSolidExecutionContext(\n resources_dict=check.opt_dict_param(resources, "resources", key_type=str),\n resources_config=check.opt_dict_param(resources_config, "resources_config", key_type=str),\n solid_config=solid_config,\n instance=check.opt_inst_param(instance, "instance", DagsterInstance),\n partition_key=check.opt_str_param(partition_key, "partition_key"),\n )
\n
", "current_page_name": "_modules/dagster/core/execution/context/invocation", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.execution.context.invocation"}, "logger": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.execution.context.logger

\nfrom typing import Any, Optional\n\nfrom dagster import check\nfrom dagster.core.definitions.job_definition import JobDefinition\nfrom dagster.core.definitions.logger_definition import LoggerDefinition\nfrom dagster.core.definitions.pipeline_definition import PipelineDefinition\nfrom dagster.core.errors import DagsterInvariantViolationError\n\nfrom .output import RUN_ID_PLACEHOLDER\n\n\n
[docs]class InitLoggerContext:\n """Logger-specific initialization context.\n\n An instance of this class is made available as the first argument to the ``logger_fn`` decorated\n by :py:func:`@logger <logger>` or set on a :py:class:`LoggerDefinition`.\n\n Users should not instantiate this class.\n\n Attributes:\n logger_config (Any): The configuration data provided by the run config. The\n schema for this data is defined by ``config_schema`` on the :py:class:`LoggerDefinition`\n pipeline_def (Optional[PipelineDefinition]): The pipeline/job definition currently being executed.\n logger_def (Optional[LoggerDefinition]): The logger definition for the logger being constructed.\n run_id (str): The ID for this run of the pipeline.\n """\n\n def __init__(\n self,\n logger_config: Any,\n logger_def: Optional[LoggerDefinition] = None,\n pipeline_def: Optional[PipelineDefinition] = None,\n run_id: Optional[str] = None,\n ):\n self._logger_config = logger_config\n self._pipeline_def = check.opt_inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n self._logger_def = check.opt_inst_param(logger_def, "logger_def", LoggerDefinition)\n self._run_id = check.opt_str_param(run_id, "run_id")\n\n @property\n def logger_config(self) -> Any:\n return self._logger_config\n\n @property\n def pipeline_def(self) -> Optional[PipelineDefinition]:\n return self._pipeline_def\n\n @property\n def job_def(self) -> Optional[JobDefinition]:\n if not self._pipeline_def:\n return None\n if not isinstance(self._pipeline_def, JobDefinition):\n raise DagsterInvariantViolationError(\n "Attempted to access the .job_def property on an InitLoggerContext that was "\n "initialized with a PipelineDefinition. Please use .pipeline_def instead."\n )\n return self._pipeline_def\n\n @property\n def logger_def(self) -> Optional[LoggerDefinition]:\n return self._logger_def\n\n @property\n def run_id(self) -> Optional[str]:\n return self._run_id
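# A minimal sketch (not part of the module source above) of a logger definition whose
# ``logger_fn`` receives an ``InitLoggerContext`` as ``init_context`` and reads
# ``logger_config``. The config schema keys and logger name are hypothetical.

import logging

from dagster import logger

@logger(config_schema={"log_level": str, "name": str})
def my_console_logger(init_context):
    # ``logger_config`` holds the validated run-config values for this logger.
    level = init_context.logger_config["log_level"]
    name = init_context.logger_config["name"]
    python_logger = logging.Logger(name, level=level)
    python_logger.addHandler(logging.StreamHandler())
    return python_logger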
\n\n\nclass UnboundInitLoggerContext(InitLoggerContext):\n """Logger initialization context outputted by ``build_init_logger_context``.\n\n Represents a context whose config has not yet been validated against a logger definition, hence\n the inability to access the `logger_def` attribute. When an instance of\n ``UnboundInitLoggerContext`` is passed to ``LoggerDefinition.initialize``, config is validated,\n and it is subsumed into an `InitLoggerContext`, which contains the logger_def validated against.\n """\n\n def __init__(self, logger_config: Any, pipeline_def: Optional[PipelineDefinition]):\n super(UnboundInitLoggerContext, self).__init__(\n logger_config, logger_def=None, pipeline_def=pipeline_def, run_id=None\n )\n\n @property\n def logger_def(self) -> LoggerDefinition:\n raise DagsterInvariantViolationError(\n "UnboundInitLoggerContext has not been validated against a logger definition."\n )\n\n @property\n def run_id(self) -> Optional[str]:\n return RUN_ID_PLACEHOLDER\n
", "current_page_name": "_modules/dagster/core/execution/context/logger", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.execution.context.logger"}, "output": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.execution.context.output

\nimport warnings\nfrom typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Union, cast\n\nfrom dagster import check\nfrom dagster.core.definitions.events import (\n    AssetKey,\n    AssetMaterialization,\n    AssetObservation,\n    Materialization,\n    MetadataEntry,\n    PartitionMetadataEntry,\n)\nfrom dagster.core.definitions.op_definition import OpDefinition\nfrom dagster.core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster.core.definitions.solid_definition import SolidDefinition\nfrom dagster.core.definitions.time_window_partitions import TimeWindow\nfrom dagster.core.errors import DagsterInvariantViolationError\nfrom dagster.core.execution.plan.utils import build_resources_for_manager\n\nif TYPE_CHECKING:\n    from dagster.core.definitions import PipelineDefinition\n    from dagster.core.definitions.resource_definition import Resources\n    from dagster.core.events import DagsterEvent\n    from dagster.core.execution.context.system import StepExecutionContext\n    from dagster.core.execution.plan.outputs import StepOutputHandle\n    from dagster.core.execution.plan.plan import ExecutionPlan\n    from dagster.core.log_manager import DagsterLogManager\n    from dagster.core.system_config.objects import ResolvedRunConfig\n    from dagster.core.types.dagster_type import DagsterType\n\nRUN_ID_PLACEHOLDER = "__EPHEMERAL_RUN_ID"\n\n\n
[docs]class OutputContext:\n """\n The context object that is available to the `handle_output` method of an :py:class:`IOManager`.\n\n Attributes:\n step_key (Optional[str]): The step_key for the compute step that produced the output.\n name (Optional[str]): The name of the output that produced the output.\n pipeline_name (Optional[str]): The name of the pipeline definition.\n run_id (Optional[str]): The id of the run that produced the output.\n metadata (Optional[Dict[str, Any]]): A dict of the metadata that is assigned to the\n OutputDefinition that produced the output.\n mapping_key (Optional[str]): The key that identifies a unique mapped output. None for regular outputs.\n config (Optional[Any]): The configuration for the output.\n solid_def (Optional[SolidDefinition]): The definition of the solid that produced the output.\n dagster_type (Optional[DagsterType]): The type of this output.\n log (Optional[DagsterLogManager]): The log manager to use for this output.\n version (Optional[str]): (Experimental) The version of the output.\n resource_config (Optional[Dict[str, Any]]): The config associated with the resource that\n initializes the RootInputManager.\n resources (Optional[Resources]): The resources required by the output manager, specified by the\n `required_resource_keys` parameter.\n op_def (Optional[OpDefinition]): The definition of the op that produced the output.\n """\n\n def __init__(\n self,\n step_key: Optional[str] = None,\n name: Optional[str] = None,\n pipeline_name: Optional[str] = None,\n run_id: Optional[str] = None,\n metadata: Optional[Dict[str, Any]] = None,\n mapping_key: Optional[str] = None,\n config: Optional[Any] = None,\n solid_def: Optional["SolidDefinition"] = None,\n dagster_type: Optional["DagsterType"] = None,\n log_manager: Optional["DagsterLogManager"] = None,\n version: Optional[str] = None,\n resource_config: Optional[Dict[str, Any]] = None,\n resources: Optional[Union["Resources", Dict[str, Any]]] = None,\n step_context: Optional["StepExecutionContext"] = None,\n op_def: Optional["OpDefinition"] = None,\n ):\n from dagster.core.definitions.resource_definition import IContainsGenerator, Resources\n from dagster.core.execution.build_resources import build_resources\n\n self._step_key = step_key\n self._name = name\n self._pipeline_name = pipeline_name\n self._run_id = run_id\n self._metadata = metadata\n self._mapping_key = mapping_key\n self._config = config\n check.invariant(\n solid_def is None or op_def is None, "Can't provide both a solid_def and an op_def arg"\n )\n self._solid_def = solid_def or op_def\n self._dagster_type = dagster_type\n self._log = log_manager\n self._version = version\n self._resource_config = resource_config\n self._step_context = step_context\n\n if isinstance(resources, Resources):\n self._resources_cm = None\n self._resources = resources\n else:\n self._resources_cm = build_resources(\n check.opt_dict_param(resources, "resources", key_type=str)\n )\n self._resources = self._resources_cm.__enter__() # pylint: disable=no-member\n self._resources_contain_cm = isinstance(self._resources, IContainsGenerator)\n self._cm_scope_entered = False\n\n self._events: List["DagsterEvent"] = []\n self._user_events: List[Union[AssetMaterialization, AssetObservation, Materialization]] = []\n self._metadata_entries: Optional[List[Union[MetadataEntry, PartitionMetadataEntry]]] = None\n\n def __enter__(self):\n if self._resources_cm:\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc):\n if self._resources_cm:\n 
self._resources_cm.__exit__(*exc) # pylint: disable=no-member\n\n def __del__(self):\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n self._resources_cm.__exit__(None, None, None) # pylint: disable=no-member\n\n @property\n def step_key(self) -> str:\n if self._step_key is None:\n raise DagsterInvariantViolationError(\n "Attempting to access step_key, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._step_key\n\n @property\n def name(self) -> str:\n if self._name is None:\n raise DagsterInvariantViolationError(\n "Attempting to access name, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._name\n\n @property\n def pipeline_name(self) -> str:\n if self._pipeline_name is None:\n raise DagsterInvariantViolationError(\n "Attempting to access pipeline_name, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._pipeline_name\n\n @property\n def run_id(self) -> str:\n if self._run_id is None:\n raise DagsterInvariantViolationError(\n "Attempting to access run_id, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._run_id\n\n @property\n def metadata(self) -> Optional[Dict[str, Any]]:\n return self._metadata\n\n @property\n def mapping_key(self) -> Optional[str]:\n return self._mapping_key\n\n @property\n def config(self) -> Any:\n return self._config\n\n @property\n def solid_def(self) -> "SolidDefinition":\n if self._solid_def is None:\n raise DagsterInvariantViolationError(\n "Attempting to access solid_def, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._solid_def\n\n @property\n def op_def(self) -> "OpDefinition":\n if self._solid_def is None:\n raise DagsterInvariantViolationError(\n "Attempting to access op_def, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return cast(OpDefinition, self._solid_def)\n\n @property\n def dagster_type(self) -> "DagsterType":\n if self._dagster_type is None:\n raise DagsterInvariantViolationError(\n "Attempting to access dagster_type, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._dagster_type\n\n @property\n def log(self) -> "DagsterLogManager":\n if self._log is None:\n raise DagsterInvariantViolationError(\n "Attempting to access log, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._log\n\n @property\n def version(self) -> Optional[str]:\n return self._version\n\n @property\n def resource_config(self) -> Optional[Dict[str, Any]]:\n return self._resource_config\n\n @property\n def resources(self) -> Any:\n if self._resources is None:\n raise DagsterInvariantViolationError(\n "Attempting to access resources, "\n "but it was not provided when constructing the OutputContext"\n )\n\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access "\n "resources outside of context manager scope. You can use the following syntax to "\n "open a context manager: `with build_output_context(...) 
as context:`"\n )\n return self._resources\n\n @property\n def asset_key(self) -> Optional[AssetKey]:\n matching_output_defs = [\n output_def\n for output_def in cast(SolidDefinition, self._solid_def).output_defs\n if output_def.name == self.name\n ]\n check.invariant(len(matching_output_defs) == 1)\n return matching_output_defs[0].get_asset_key(self)\n\n @property\n def step_context(self) -> "StepExecutionContext":\n if self._step_context is None:\n raise DagsterInvariantViolationError(\n "Attempting to access step_context, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._step_context\n\n @property\n def has_partition_key(self) -> bool:\n """Whether the current run is a partitioned run"""\n return self.step_context.has_partition_key\n\n @property\n def partition_key(self) -> str:\n """The partition key for the current run.\n\n Raises an error if the current run is not a partitioned run.\n """\n return self.step_context.partition_key\n\n @property\n def has_asset_partitions(self) -> bool:\n if self._step_context is not None:\n return self._step_context.has_asset_partitions_for_output(self.name)\n else:\n return False\n\n @property\n def asset_partition_key(self) -> str:\n """The partition key for output asset.\n\n Raises an error if the output asset has no partitioning, or if the run covers a partition\n range for the output asset.\n """\n return self.step_context.asset_partition_key_for_output(self.name)\n\n @property\n def asset_partition_key_range(self) -> PartitionKeyRange:\n """The partition key range for output asset.\n\n Raises an error if the output asset has no partitioning.\n """\n return self.step_context.asset_partition_key_range_for_output(self.name)\n\n @property\n def asset_partitions_time_window(self) -> TimeWindow:\n """The time window for the partitions of the output asset.\n\n Raises an error if either of the following are true:\n - The output asset has no partitioning.\n - The output asset is not partitioned with a TimeWindowPartitionsDefinition.\n """\n return self.step_context.asset_partitions_time_window_for_output(self.name)\n\n
[docs] def get_run_scoped_output_identifier(self) -> List[str]:\n """Utility method to get a collection of identifiers that as a whole represent a unique\n step output.\n\n The unique identifier collection consists of\n\n - ``run_id``: the id of the run which generates the output.\n Note: This method also handles the re-execution memoization logic. If the step that\n generates the output is skipped in the re-execution, the ``run_id`` will be the id\n of its parent run.\n - ``step_key``: the key for a compute step.\n - ``name``: the name of the output. (default: 'result').\n\n Returns:\n List[str, ...]: A list of identifiers, i.e. run id, step key, and output name\n """\n\n warnings.warn(\n "`OutputContext.get_run_scoped_output_identifier` is deprecated. Use "\n "`OutputContext.get_output_identifier` instead."\n )\n # if run_id is None and this is a re-execution, it means we failed to find its source run id\n check.invariant(\n self.run_id is not None,\n "Unable to find the run scoped output identifier: run_id is None on OutputContext.",\n )\n check.invariant(\n self.step_key is not None,\n "Unable to find the run scoped output identifier: step_key is None on OutputContext.",\n )\n check.invariant(\n self.name is not None,\n "Unable to find the run scoped output identifier: name is None on OutputContext.",\n )\n run_id = cast(str, self.run_id)\n step_key = cast(str, self.step_key)\n name = cast(str, self.name)\n\n if self.mapping_key:\n return [run_id, step_key, name, self.mapping_key]\n\n return [run_id, step_key, name]
\n\n
[docs] def get_output_identifier(self) -> List[str]:\n """Utility method to get a collection of identifiers that as a whole represent a unique\n step output.\n\n If not using memoization, the unique identifier collection consists of\n\n - ``run_id``: the id of the run which generates the output.\n Note: This method also handles the re-execution memoization logic. If the step that\n generates the output is skipped in the re-execution, the ``run_id`` will be the id\n of its parent run.\n - ``step_key``: the key for a compute step.\n - ``name``: the name of the output. (default: 'result').\n\n If using memoization, the ``version`` corresponding to the step output is used in place of\n the ``run_id``.\n\n Returns:\n List[str, ...]: A list of identifiers, i.e. (run_id or version), step_key, and output_name\n """\n version = self.version\n step_key = self.step_key\n name = self.name\n if version is not None:\n check.invariant(\n self.mapping_key is None,\n f"Mapping key and version both provided for output '{name}' of step '{step_key}'. "\n "Dynamic mapping is not supported when using versioning.",\n )\n identifier = ["versioned_outputs", version, step_key, name]\n else:\n run_id = self.run_id\n identifier = [run_id, step_key, name]\n if self.mapping_key:\n identifier.append(self.mapping_key)\n\n return identifier
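# A minimal sketch (not part of the module source above) of an IOManager that keys its
# storage off of ``get_output_identifier()``, i.e. (run_id or version) / step_key /
# output name. The class name and base directory are hypothetical.

import os
import pickle

from dagster import IOManager

class PathIOManager(IOManager):
    base_dir = "/tmp/dagster_outputs"  # hypothetical storage root

    def _path(self, identifier):
        # ``identifier`` is e.g. [run_id, step_key, output_name] (plus mapping_key, if any).
        return os.path.join(self.base_dir, *identifier)

    def handle_output(self, context, obj):
        path = self._path(context.get_output_identifier())
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "wb") as f:
            pickle.dump(obj, f)

    def load_input(self, context):
        # The identifier of the upstream output locates the object to load.
        path = self._path(context.upstream_output.get_output_identifier())
        with open(path, "rb") as f:
            return pickle.load(f)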
\n\n
[docs] def log_event(\n self, event: Union[AssetObservation, AssetMaterialization, Materialization]\n ) -> None:\n """Log an AssetMaterialization or AssetObservation from within the body of an io manager's `handle_output` method.\n\n Events logged with this method will appear in the event log.\n\n Args:\n event (Union[AssetMaterialization, Materialization, AssetObservation]): The event to log.\n\n Examples:\n\n .. code-block:: python\n\n from dagster import IOManager, AssetMaterialization\n\n class MyIOManager(IOManager):\n def handle_output(self, context, obj):\n context.log_event(AssetMaterialization("foo"))\n """\n from dagster.core.events import DagsterEvent\n\n if isinstance(event, (AssetMaterialization, Materialization)):\n if self._step_context:\n self._events.append(\n DagsterEvent.asset_materialization(\n self._step_context,\n event,\n self._step_context.get_input_lineage(),\n )\n )\n self._user_events.append(event)\n elif isinstance(event, AssetObservation):\n if self._step_context:\n self._events.append(DagsterEvent.asset_observation(self._step_context, event))\n self._user_events.append(event)\n else:\n check.failed("Unexpected event {event}".format(event=event))
\n\n
[docs] def consume_events(self) -> Iterator["DagsterEvent"]:\n """Pops and yields all user-generated events that have been recorded from this context.\n\n If consume_events has not yet been called, this will yield all logged events since the call to `handle_output`. If consume_events has been called, it will yield all events since the last time consume_events was called. Designed for internal use. Users should never need to invoke this method.\n """\n\n events = self._events\n self._events = []\n yield from events
\n\n
[docs] def get_logged_events(\n self,\n ) -> List[Union[AssetMaterialization, Materialization, AssetObservation]]:\n """Retrieve the list of user-generated events that were logged via the context.\n\n\n User-generated events that were yielded will not appear in this list.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import IOManager, build_output_context, AssetMaterialization\n\n class MyIOManager(IOManager):\n def handle_output(self, context, obj):\n ...\n\n def test_handle_output():\n mgr = MyIOManager()\n context = build_output_context()\n mgr.handle_output(context)\n all_user_events = context.get_logged_events()\n materializations = [event for event in all_user_events if isinstance(event, AssetMaterialization)]\n ...\n """\n\n return self._user_events
\n\n
[docs] def add_output_metadata(self, metadata: Dict[str, Any]) -> None:\n """Add a dictionary of metadata to the handled output.\n\n Metadata entries added will show up in the HANDLED_OUTPUT and ASSET_MATERIALIZATION events for the run.\n\n Args:\n metadata (Dict[str, Any]): A metadata dictionary to log\n\n Examples:\n\n .. code-block:: python\n\n from dagster import IOManager\n\n class MyIOManager(IOManager):\n def handle_output(self, context, obj):\n context.add_output_metadata({"foo": "bar"})\n """\n from dagster.core.definitions.metadata import normalize_metadata\n\n self._metadata_entries = normalize_metadata(metadata, [])
\n\n
[docs] def get_logged_metadata_entries(\n self,\n ) -> List[Union[MetadataEntry, PartitionMetadataEntry]]:\n """Get the list of metadata entries that have been logged for use with this output."""\n return self._metadata_entries or []
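# A minimal sketch (not part of the module source above) of asserting on metadata logged by
# an IOManager by pairing ``build_output_context`` with ``get_logged_metadata_entries``.
# The ``MetadataIOManager`` class and the sample object are hypothetical.

from dagster import IOManager, build_output_context

class MetadataIOManager(IOManager):
    def handle_output(self, context, obj):
        context.add_output_metadata({"size": len(obj)})

    def load_input(self, context):
        ...

def test_logged_metadata():
    context = build_output_context()
    MetadataIOManager().handle_output(context, [1, 2, 3])
    # ``add_output_metadata`` normalizes the dict into metadata entries.
    assert len(context.get_logged_metadata_entries()) == 1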
\n\n
[docs] def consume_logged_metadata_entries(\n self,\n ) -> List[Union[MetadataEntry, PartitionMetadataEntry]]:\n """Pops and yields all user-generated metadata entries that have been recorded from this context.\n\n If consume_logged_metadata_entries has not yet been called, this will yield all logged events since the call to `handle_output`. If consume_logged_metadata_entries has been called, it will yield all events since the last time consume_logged_metadata_entries was called. Designed for internal use. Users should never need to invoke this method.\n """\n result = self._metadata_entries\n self._metadata_entries = []\n return result or []
\n\n\ndef get_output_context(\n execution_plan: "ExecutionPlan",\n pipeline_def: "PipelineDefinition",\n resolved_run_config: "ResolvedRunConfig",\n step_output_handle: "StepOutputHandle",\n run_id: Optional[str],\n log_manager: Optional["DagsterLogManager"],\n step_context: Optional["StepExecutionContext"],\n resources: Optional["Resources"],\n version: Optional[str],\n) -> "OutputContext":\n """\n Args:\n run_id (str): The run ID of the run that produced the output, not necessarily the run that\n the context will be used in.\n """\n\n step = execution_plan.get_step_by_key(step_output_handle.step_key)\n # get config\n solid_config = resolved_run_config.solids[step.solid_handle.to_string()]\n outputs_config = solid_config.outputs\n\n if outputs_config:\n output_config = outputs_config.get_output_manager_config(step_output_handle.output_name)\n else:\n output_config = None\n\n step_output = execution_plan.get_step_output(step_output_handle)\n output_def = pipeline_def.get_solid(step_output.solid_handle).output_def_named(step_output.name)\n\n io_manager_key = output_def.io_manager_key\n resource_config = resolved_run_config.resources[io_manager_key].config\n\n if step_context:\n check.invariant(\n not resources,\n "Expected either resources or step context to be set, but "\n "received both. If step context is provided, resources for IO manager will be "\n "retrieved off of that.",\n )\n resources = build_resources_for_manager(io_manager_key, step_context)\n\n return OutputContext(\n step_key=step_output_handle.step_key,\n name=step_output_handle.output_name,\n pipeline_name=pipeline_def.name,\n run_id=run_id,\n metadata=output_def.metadata,\n mapping_key=step_output_handle.mapping_key,\n config=output_config,\n solid_def=pipeline_def.get_solid(step.solid_handle).definition,\n dagster_type=output_def.dagster_type,\n log_manager=log_manager,\n version=version,\n step_context=step_context,\n resource_config=resource_config,\n resources=resources,\n )\n\n\ndef step_output_version(\n pipeline_def: "PipelineDefinition",\n execution_plan: "ExecutionPlan",\n resolved_run_config: "ResolvedRunConfig",\n step_output_handle: "StepOutputHandle",\n) -> Optional[str]:\n from dagster.core.execution.resolve_versions import resolve_step_output_versions\n\n step_output_versions = resolve_step_output_versions(\n pipeline_def, execution_plan, resolved_run_config\n )\n return (\n step_output_versions[step_output_handle]\n if step_output_handle in step_output_versions\n else None\n )\n\n\n
[docs]def build_output_context(\n step_key: Optional[str] = None,\n name: Optional[str] = None,\n metadata: Optional[Dict[str, Any]] = None,\n run_id: Optional[str] = None,\n mapping_key: Optional[str] = None,\n config: Optional[Any] = None,\n dagster_type: Optional["DagsterType"] = None,\n version: Optional[str] = None,\n resource_config: Optional[Dict[str, Any]] = None,\n resources: Optional[Dict[str, Any]] = None,\n solid_def: Optional[SolidDefinition] = None,\n op_def: Optional[OpDefinition] = None,\n) -> "OutputContext":\n """Builds output context from provided parameters.\n\n ``build_output_context`` can be used as either a function, or a context manager. If resources\n that are also context managers are provided, then ``build_output_context`` must be used as a\n context manager.\n\n Args:\n step_key (Optional[str]): The step_key for the compute step that produced the output.\n name (Optional[str]): The name of the output that produced the output.\n metadata (Optional[Dict[str, Any]]): A dict of the metadata that is assigned to the\n OutputDefinition that produced the output.\n mapping_key (Optional[str]): The key that identifies a unique mapped output. None for regular outputs.\n config (Optional[Any]): The configuration for the output.\n dagster_type (Optional[DagsterType]): The type of this output.\n version (Optional[str]): (Experimental) The version of the output.\n resource_config (Optional[Dict[str, Any]]): The resource config to make available from the\n input context. This usually corresponds to the config provided to the resource that\n loads the output manager.\n resources (Optional[Resources]): The resources to make available from the context.\n For a given key, you can provide either an actual instance of an object, or a resource\n definition.\n solid_def (Optional[SolidDefinition]): The definition of the solid that produced the output.\n op_def (Optional[OpDefinition]): The definition of the solid that produced the output.\n\n Examples:\n\n .. code-block:: python\n\n build_output_context()\n\n with build_output_context(resources={"foo": context_manager_resource}) as context:\n do_something\n\n """\n from dagster.core.execution.context_creation_pipeline import initialize_console_manager\n from dagster.core.types.dagster_type import DagsterType\n\n step_key = check.opt_str_param(step_key, "step_key")\n name = check.opt_str_param(name, "name")\n metadata = check.opt_dict_param(metadata, "metadata", key_type=str)\n run_id = check.opt_str_param(run_id, "run_id", default=RUN_ID_PLACEHOLDER)\n mapping_key = check.opt_str_param(mapping_key, "mapping_key")\n dagster_type = check.opt_inst_param(dagster_type, "dagster_type", DagsterType)\n version = check.opt_str_param(version, "version")\n resource_config = check.opt_dict_param(resource_config, "resource_config", key_type=str)\n resources = check.opt_dict_param(resources, "resources", key_type=str)\n solid_def = check.opt_inst_param(solid_def, "solid_def", SolidDefinition)\n op_def = check.opt_inst_param(op_def, "op_def", OpDefinition)\n\n return OutputContext(\n step_key=step_key,\n name=name,\n pipeline_name=None,\n run_id=run_id,\n metadata=metadata,\n mapping_key=mapping_key,\n config=config,\n solid_def=solid_def,\n dagster_type=dagster_type,\n log_manager=initialize_console_manager(None),\n version=version,\n resource_config=resource_config,\n resources=resources,\n step_context=None,\n op_def=op_def,\n )
\n
", "current_page_name": "_modules/dagster/core/execution/context/output", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.execution.context.output"}, "system": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.execution.context.system

\n"""\nThis module contains the execution context objects that are internal to the system.\nNot every property on these should be exposed to random Jane or Joe dagster user\nso we have a different layer of objects that encode the explicit public API\nin the user_context module\n"""\nfrom abc import ABC, abstractmethod\nfrom collections import defaultdict\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Set,\n    Union,\n    cast,\n)\n\nfrom dagster import check\nfrom dagster.core.definitions.events import AssetKey, AssetLineageInfo\nfrom dagster.core.definitions.hook_definition import HookDefinition\nfrom dagster.core.definitions.mode import ModeDefinition\nfrom dagster.core.definitions.op_definition import OpDefinition\nfrom dagster.core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster.core.definitions.pipeline_base import IPipeline\nfrom dagster.core.definitions.pipeline_definition import PipelineDefinition\nfrom dagster.core.definitions.policy import RetryPolicy\nfrom dagster.core.definitions.reconstruct import ReconstructablePipeline\nfrom dagster.core.definitions.resource_definition import ScopedResourcesBuilder\nfrom dagster.core.definitions.solid_definition import SolidDefinition\nfrom dagster.core.definitions.step_launcher import StepLauncher\nfrom dagster.core.definitions.time_window_partitions import (\n    TimeWindow,\n    TimeWindowPartitionsDefinition,\n)\nfrom dagster.core.errors import DagsterInvariantViolationError\nfrom dagster.core.execution.plan.outputs import StepOutputHandle\nfrom dagster.core.execution.plan.step import ExecutionStep\nfrom dagster.core.execution.retries import RetryMode\nfrom dagster.core.executor.base import Executor\nfrom dagster.core.log_manager import DagsterLogManager\nfrom dagster.core.storage.io_manager import IOManager\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.core.storage.tags import PARTITION_NAME_TAG\nfrom dagster.core.system_config.objects import ResolvedRunConfig\nfrom dagster.core.types.dagster_type import DagsterType, DagsterTypeKind\n\nfrom .input import InputContext\nfrom .output import OutputContext, get_output_context\n\nif TYPE_CHECKING:\n    from dagster.core.definitions.dependency import Node, NodeHandle\n    from dagster.core.definitions.resource_definition import Resources\n    from dagster.core.execution.plan.plan import ExecutionPlan\n    from dagster.core.instance import DagsterInstance\n\n    from .hook import HookContext\n\n\nclass IPlanContext(ABC):\n    """Context interface to represent run information that does not require access to user code.\n\n    The information available via this interface is accessible to the system throughout a run.\n    """\n\n    @property\n    @abstractmethod\n    def plan_data(self) -> "PlanData":\n        raise NotImplementedError()\n\n    @property\n    def pipeline(self) -> IPipeline:\n        return self.plan_data.pipeline\n\n    @property\n    def pipeline_run(self) -> PipelineRun:\n        return self.plan_data.pipeline_run\n\n    @property\n    def run_id(self) -> str:\n        return self.pipeline_run.run_id\n\n    @property\n    def run_config(self) -> dict:\n        return self.pipeline_run.run_config\n\n    @property\n    def pipeline_name(self) -> str:\n        return self.pipeline_run.pipeline_name\n\n    @property\n    def job_name(self) -> str:\n        return self.pipeline_name\n\n    @property\n    def instance(self) -> 
"DagsterInstance":\n        return self.plan_data.instance\n\n    @property\n    def raise_on_error(self) -> bool:\n        return self.plan_data.raise_on_error\n\n    @property\n    def retry_mode(self) -> RetryMode:\n        return self.plan_data.retry_mode\n\n    @property\n    def execution_plan(self):\n        return self.plan_data.execution_plan\n\n    @property\n    @abstractmethod\n    def output_capture(self) -> Optional[Dict[StepOutputHandle, Any]]:\n        raise NotImplementedError()\n\n    @property\n    def log(self) -> DagsterLogManager:\n        raise NotImplementedError()\n\n    @property\n    def logging_tags(self) -> Dict[str, str]:\n        return self.log.logging_metadata.to_tags()\n\n    def has_tag(self, key: str) -> bool:\n        check.str_param(key, "key")\n        return key in self.log.logging_metadata.pipeline_tags\n\n    def get_tag(self, key: str) -> Optional[str]:\n        check.str_param(key, "key")\n        return self.log.logging_metadata.pipeline_tags.get(key)\n\n\nclass PlanData(NamedTuple):\n    """The data about a run that is available during both orchestration and execution.\n\n    This object does not contain any information that requires access to user code, such as the\n    pipeline definition and resources.\n    """\n\n    pipeline: IPipeline\n    pipeline_run: PipelineRun\n    instance: "DagsterInstance"\n    execution_plan: "ExecutionPlan"\n    raise_on_error: bool = False\n    retry_mode: RetryMode = RetryMode.DISABLED\n\n\nclass ExecutionData(NamedTuple):\n    """The data that is available to the system during execution.\n\n    This object contains information that requires access to user code, such as the pipeline\n    definition and resources.\n    """\n\n    scoped_resources_builder: ScopedResourcesBuilder\n    resolved_run_config: ResolvedRunConfig\n    pipeline_def: PipelineDefinition\n    mode_def: ModeDefinition\n\n\nclass IStepContext(IPlanContext):\n    """Interface to represent data to be available during either step orchestration or execution."""\n\n    @property\n    @abstractmethod\n    def step(self) -> ExecutionStep:\n        raise NotImplementedError()\n\n    @property\n    @abstractmethod\n    def solid_handle(self) -> "NodeHandle":\n        raise NotImplementedError()\n\n\nclass PlanOrchestrationContext(IPlanContext):\n    """Context for the orchestration of a run.\n\n    This context assumes inability to run user code directly.\n    """\n\n    def __init__(\n        self,\n        plan_data: PlanData,\n        log_manager: DagsterLogManager,\n        executor: Executor,\n        output_capture: Optional[Dict[StepOutputHandle, Any]],\n        resume_from_failure: bool = False,\n    ):\n        self._plan_data = plan_data\n        self._log_manager = log_manager\n        self._executor = executor\n        self._output_capture = output_capture\n        self._resume_from_failure = resume_from_failure\n\n    @property\n    def plan_data(self) -> PlanData:\n        return self._plan_data\n\n    @property\n    def reconstructable_pipeline(self) -> ReconstructablePipeline:\n        if not isinstance(self.pipeline, ReconstructablePipeline):\n            raise DagsterInvariantViolationError(\n                "reconstructable_pipeline property must be a ReconstructablePipeline"\n            )\n        return self.pipeline\n\n    @property\n    def log(self) -> DagsterLogManager:\n        return self._log_manager\n\n    @property\n    def executor(self) -> Executor:\n        return self._executor\n\n    @property\n    def 
output_capture(self) -> Optional[Dict[StepOutputHandle, Any]]:\n        return self._output_capture\n\n    def for_step(self, step: ExecutionStep) -> "IStepContext":\n        return StepOrchestrationContext(\n            plan_data=self.plan_data,\n            log_manager=self._log_manager.with_tags(**step.logging_tags),\n            executor=self.executor,\n            step=step,\n            output_capture=self.output_capture,\n        )\n\n    @property\n    def resume_from_failure(self) -> bool:\n        return self._resume_from_failure\n\n\nclass StepOrchestrationContext(PlanOrchestrationContext, IStepContext):\n    """Context for the orchestration of a step.\n\n    This context assumes inability to run user code directly. Thus, it does not include any resource\n    information.\n    """\n\n    def __init__(self, plan_data, log_manager, executor, step, output_capture):\n        super(StepOrchestrationContext, self).__init__(\n            plan_data, log_manager, executor, output_capture\n        )\n        self._step = step\n\n    @property\n    def step(self) -> ExecutionStep:\n        return self._step\n\n    @property\n    def solid_handle(self) -> "NodeHandle":\n        return self.step.solid_handle\n\n\nclass PlanExecutionContext(IPlanContext):\n    """Context for the execution of a plan.\n\n    This context assumes that user code can be run directly, and thus includes resource and\n    information.\n    """\n\n    def __init__(\n        self,\n        plan_data: PlanData,\n        execution_data: ExecutionData,\n        log_manager: DagsterLogManager,\n        output_capture: Optional[Dict[StepOutputHandle, Any]] = None,\n    ):\n        self._plan_data = plan_data\n        self._execution_data = execution_data\n        self._log_manager = log_manager\n        self._output_capture = output_capture\n\n    @property\n    def plan_data(self) -> PlanData:\n        return self._plan_data\n\n    @property\n    def output_capture(self) -> Optional[Dict[StepOutputHandle, Any]]:\n        return self._output_capture\n\n    def for_step(self, step: ExecutionStep, previous_attempt_count: int = 0) -> IStepContext:\n\n        return StepExecutionContext(\n            plan_data=self.plan_data,\n            execution_data=self._execution_data,\n            log_manager=self._log_manager.with_tags(**step.logging_tags),\n            step=step,\n            output_capture=self.output_capture,\n            previous_attempt_count=previous_attempt_count,\n        )\n\n    @property\n    def pipeline_def(self) -> PipelineDefinition:\n        return self._execution_data.pipeline_def\n\n    @property\n    def resolved_run_config(self) -> ResolvedRunConfig:\n        return self._execution_data.resolved_run_config\n\n    @property\n    def scoped_resources_builder(self) -> ScopedResourcesBuilder:\n        return self._execution_data.scoped_resources_builder\n\n    @property\n    def log(self) -> DagsterLogManager:\n        return self._log_manager\n\n    @property\n    def partition_key(self) -> str:\n        tags = self._plan_data.pipeline_run.tags\n        check.invariant(\n            PARTITION_NAME_TAG in tags, "Tried to access partition_key for a non-partitioned run"\n        )\n        return tags[PARTITION_NAME_TAG]\n\n    @property\n    def has_partition_key(self) -> bool:\n        return PARTITION_NAME_TAG in self._plan_data.pipeline_run.tags\n\n    def for_type(self, dagster_type: DagsterType) -> "TypeCheckContext":\n        return TypeCheckContext(\n            self.run_id, self.log, 
self._execution_data.scoped_resources_builder, dagster_type\n        )\n\n\nclass StepExecutionContext(PlanExecutionContext, IStepContext):\n    """Context for the execution of a step.\n\n    This context assumes that user code can be run directly, and thus includes resource and\n    information.\n    """\n\n    def __init__(\n        self,\n        plan_data: PlanData,\n        execution_data: ExecutionData,\n        log_manager: DagsterLogManager,\n        step: ExecutionStep,\n        output_capture: Optional[Dict[StepOutputHandle, Any]],\n        previous_attempt_count: int,\n    ):\n        from dagster.core.execution.resources_init import get_required_resource_keys_for_step\n\n        super(StepExecutionContext, self).__init__(\n            plan_data=plan_data,\n            execution_data=execution_data,\n            log_manager=log_manager,\n            output_capture=output_capture,\n        )\n        self._step = step\n        self._required_resource_keys = get_required_resource_keys_for_step(\n            plan_data.pipeline.get_definition(),\n            step,\n            plan_data.execution_plan,\n        )\n        self._resources = execution_data.scoped_resources_builder.build(\n            self._required_resource_keys\n        )\n        self._previous_attempt_count = previous_attempt_count\n        self._input_lineage: List[AssetLineageInfo] = []\n\n        resources_iter = cast(Iterable, self._resources)\n\n        step_launcher_resources = [\n            resource for resource in resources_iter if isinstance(resource, StepLauncher)\n        ]\n\n        self._step_launcher: Optional[StepLauncher] = None\n        if len(step_launcher_resources) > 1:\n            raise DagsterInvariantViolationError(\n                "Multiple required resources for {described_op} have inherited StepLauncher"\n                "There should be at most one step launcher resource per {node_type}.".format(\n                    described_op=self.describe_op(), node_type=self.solid_def.node_type_str\n                )\n            )\n        elif len(step_launcher_resources) == 1:\n            self._step_launcher = step_launcher_resources[0]\n\n        self._step_exception: Optional[BaseException] = None\n\n        self._step_output_capture: Optional[Dict[StepOutputHandle, Any]] = None\n        # Enable step output capture if there are any hooks which will receive them.\n        # Expect in the future that hooks may control whether or not they get outputs,\n        # but for now presence of any will cause output capture.\n        if self.pipeline_def.get_all_hooks_for_handle(self.solid_handle):\n            self._step_output_capture = {}\n\n        self._output_metadata: Dict[str, Any] = {}\n        self._seen_outputs: Dict[str, Union[str, Set[str]]] = {}\n\n    @property\n    def step(self) -> ExecutionStep:\n        return self._step\n\n    @property\n    def solid_handle(self) -> "NodeHandle":\n        return self.step.solid_handle\n\n    @property\n    def required_resource_keys(self) -> Set[str]:\n        return self._required_resource_keys\n\n    @property\n    def resources(self) -> "Resources":\n        return self._resources\n\n    @property\n    def step_launcher(self) -> Optional[StepLauncher]:\n        return self._step_launcher\n\n    @property\n    def solid_def(self) -> SolidDefinition:\n        return self.solid.definition.ensure_solid_def()\n\n    @property\n    def pipeline_def(self) -> PipelineDefinition:\n        return self._execution_data.pipeline_def\n\n    @property\n    
def mode_def(self) -> ModeDefinition:\n        return self._execution_data.mode_def\n\n    @property\n    def solid(self) -> "Node":\n        return self.pipeline_def.get_solid(self._step.solid_handle)\n\n    @property\n    def solid_retry_policy(self) -> Optional[RetryPolicy]:\n        return self.pipeline_def.get_retry_policy_for_handle(self.solid_handle)\n\n    def describe_op(self):\n        if isinstance(self.solid_def, OpDefinition):\n            return f'op "{str(self.solid_handle)}"'\n\n        return f'solid "{str(self.solid_handle)}"'\n\n    def get_io_manager(self, step_output_handle) -> IOManager:\n        step_output = self.execution_plan.get_step_output(step_output_handle)\n        io_manager_key = (\n            self.pipeline_def.get_solid(step_output.solid_handle)\n            .output_def_named(step_output.name)\n            .io_manager_key\n        )\n\n        output_manager = getattr(self.resources, io_manager_key)\n        return check.inst(output_manager, IOManager)\n\n    def get_output_context(self, step_output_handle) -> OutputContext:\n        return get_output_context(\n            self.execution_plan,\n            self.pipeline_def,\n            self.resolved_run_config,\n            step_output_handle,\n            self._get_source_run_id(step_output_handle),\n            log_manager=self.log,\n            step_context=self,\n            resources=None,\n            version=self.execution_plan.get_version_for_step_output_handle(step_output_handle),\n        )\n\n    def for_input_manager(\n        self,\n        name: str,\n        config: Any,\n        metadata: Any,\n        dagster_type: DagsterType,\n        source_handle: Optional[StepOutputHandle] = None,\n        resource_config: Any = None,\n        resources: Optional["Resources"] = None,\n    ) -> InputContext:\n        return InputContext(\n            pipeline_name=self.pipeline_def.name,\n            name=name,\n            solid_def=self.solid_def,\n            config=config,\n            metadata=metadata,\n            upstream_output=self.get_output_context(source_handle) if source_handle else None,\n            dagster_type=dagster_type,\n            log_manager=self.log,\n            step_context=self,\n            resource_config=resource_config,\n            resources=resources,\n        )\n\n    def for_hook(self, hook_def: HookDefinition) -> "HookContext":\n        from .hook import HookContext\n\n        return HookContext(self, hook_def)\n\n    def can_load(self, step_output_handle: StepOutputHandle) -> bool:\n        # Whether IO Manager can load the source\n        # FIXME https://github.com/dagster-io/dagster/issues/3511\n        # This is a stopgap which asks the instance to check the event logs to find out step skipping\n\n        from dagster.core.events import DagsterEventType\n\n        # can load from upstream in the same run\n        for record in self.instance.all_logs(self.run_id, of_type=DagsterEventType.STEP_OUTPUT):\n            if step_output_handle == record.dagster_event.event_specific_data.step_output_handle:\n                return True\n\n        if (\n            self._should_load_from_previous_runs(step_output_handle)\n            # should and can load from a previous run\n            and self._get_source_run_id_from_logs(step_output_handle)\n        ):\n            return True\n\n        return False\n\n    def observe_output(self, output_name: str, mapping_key: Optional[str] = None) -> None:\n        if mapping_key:\n            if output_name not in 
self._seen_outputs:\n                self._seen_outputs[output_name] = set()\n            cast(Set[str], self._seen_outputs[output_name]).add(mapping_key)\n        else:\n            self._seen_outputs[output_name] = "seen"\n\n    def has_seen_output(self, output_name: str, mapping_key: Optional[str] = None) -> bool:\n        if mapping_key:\n            return (\n                output_name in self._seen_outputs and mapping_key in self._seen_outputs[output_name]\n            )\n        return output_name in self._seen_outputs\n\n    def add_output_metadata(\n        self,\n        metadata: Mapping[str, Any],\n        output_name: Optional[str] = None,\n        mapping_key: Optional[str] = None,\n    ) -> None:\n\n        if output_name is None and len(self.solid_def.output_defs) == 1:\n            output_def = self.solid_def.output_defs[0]\n            output_name = output_def.name\n        elif output_name is None:\n            raise DagsterInvariantViolationError(\n                "Attempted to log metadata without providing output_name, but multiple outputs exist. Please provide an output_name to the invocation of `context.add_output_metadata`."\n            )\n        else:\n            output_def = self.solid_def.output_def_named(output_name)\n\n        if self.has_seen_output(output_name, mapping_key):\n            output_desc = (\n                f"output '{output_def.name}'"\n                if not mapping_key\n                else f"output '{output_def.name}' with mapping_key '{mapping_key}'"\n            )\n            raise DagsterInvariantViolationError(\n                f"In {self.solid_def.node_type_str} '{self.solid.name}', attempted to log output metadata for {output_desc} which has already been yielded. Metadata must be logged before the output is yielded."\n            )\n        if output_def.is_dynamic and not mapping_key:\n            raise DagsterInvariantViolationError(\n                f"In {self.solid_def.node_type_str} '{self.solid.name}', attempted to log metadata for dynamic output '{output_def.name}' without providing a mapping key. 
When logging metadata for a dynamic output, it is necessary to provide a mapping key."\n            )\n\n        output_name = output_def.name\n        if output_name in self._output_metadata:\n            if not mapping_key or mapping_key in self._output_metadata[output_name]:\n                raise DagsterInvariantViolationError(\n                    f"In {self.solid_def.node_type_str} '{self.solid.name}', attempted to log metadata for output '{output_name}' more than once."\n                )\n        if mapping_key:\n            if not output_name in self._output_metadata:\n                self._output_metadata[output_name] = {}\n            self._output_metadata[output_name][mapping_key] = metadata\n\n        else:\n            self._output_metadata[output_name] = metadata\n\n    def get_output_metadata(\n        self, output_name: str, mapping_key: Optional[str] = None\n    ) -> Optional[Mapping[str, Any]]:\n        metadata = self._output_metadata.get(output_name)\n        if mapping_key and metadata:\n            return metadata.get(mapping_key)\n        return metadata\n\n    def _get_source_run_id_from_logs(self, step_output_handle: StepOutputHandle) -> Optional[str]:\n        from dagster.core.events import DagsterEventType\n\n        # walk through event logs to find the right run_id based on the run lineage\n        run_group = self.instance.get_run_group(self.run_id)\n        if run_group is None:\n            check.failed(f"Failed to load run group {self.run_id}")\n\n        _, runs = run_group\n        run_id_to_parent_run_id = {run.run_id: run.parent_run_id for run in runs}\n        source_run_id = self.pipeline_run.parent_run_id\n        while source_run_id:\n            # note: this would cost N db calls where N = number of parent runs\n            step_output_record = self.instance.all_logs(\n                source_run_id, of_type=DagsterEventType.STEP_OUTPUT\n            )\n            # if the parent run has yielded an StepOutput event for the given step output,\n            # we find the source run id\n            for r in step_output_record:\n                if r.dagster_event.step_output_data.step_output_handle == step_output_handle:\n                    return source_run_id\n            # else, keep looking backwards\n            source_run_id = run_id_to_parent_run_id.get(source_run_id)\n\n        # When a fixed path is provided via io manager, it's able to run step subset using an execution\n        # plan when the ascendant outputs were not previously created by dagster-controlled\n        # computations. for example, in backfills, with fixed path io manager, we allow users to\n        # "re-execute" runs with steps where the outputs weren't previously stored by dagster.\n\n        # Warn about this special case because it will also reach here when all previous runs have\n        # skipped yielding this output. From the logs, we have no easy way to differentiate the fixed\n        # path case and the skipping case, until we record the skipping info in KnownExecutionState,\n        # i.e. resolve https://github.com/dagster-io/dagster/issues/3511\n        self.log.warn(\n            f"No previously stored outputs found for source {step_output_handle}. 
"\n            "This is either because you are using an IO Manager that does not depend on run ID, "\n            "or because all the previous runs have skipped the output in conditional execution."\n        )\n        return None\n\n    def _should_load_from_previous_runs(self, step_output_handle: StepOutputHandle) -> bool:\n        return (  # this is re-execution\n            self.pipeline_run.parent_run_id is not None\n            # we are not re-executing the entire pipeline\n            and self.pipeline_run.step_keys_to_execute is not None\n            # this step is not being executed\n            and step_output_handle.step_key not in self.pipeline_run.step_keys_to_execute\n        )\n\n    def _get_source_run_id(self, step_output_handle: StepOutputHandle) -> Optional[str]:\n        if self._should_load_from_previous_runs(step_output_handle):\n            return self._get_source_run_id_from_logs(step_output_handle)\n        else:\n            return self.pipeline_run.run_id\n\n    def capture_step_exception(self, exception: BaseException):\n        self._step_exception = check.inst_param(exception, "exception", BaseException)\n\n    @property\n    def step_exception(self) -> Optional[BaseException]:\n        return self._step_exception\n\n    @property\n    def step_output_capture(self) -> Optional[Dict[StepOutputHandle, Any]]:\n        return self._step_output_capture\n\n    @property\n    def previous_attempt_count(self) -> int:\n        return self._previous_attempt_count\n\n    @property\n    def op_config(self) -> Any:\n        solid_config = self.resolved_run_config.solids.get(str(self.solid_handle))\n        return solid_config.config if solid_config else None\n\n    def has_asset_partitions_for_input(self, input_name: str) -> bool:\n        op_config = self.op_config\n        if op_config is not None and "assets" in op_config:\n            all_input_asset_partitions = op_config["assets"].get("input_partitions")\n            if all_input_asset_partitions is not None:\n                this_input_asset_partitions = all_input_asset_partitions.get(input_name)\n                if this_input_asset_partitions is not None:\n                    return True\n\n        return False\n\n    def asset_partition_key_range_for_input(self, input_name: str) -> PartitionKeyRange:\n        op_config = self.op_config\n        if op_config is not None and "assets" in op_config:\n            all_input_asset_partitions = op_config["assets"].get("input_partitions")\n            if all_input_asset_partitions is not None:\n                this_input_asset_partitions = all_input_asset_partitions.get(input_name)\n                if this_input_asset_partitions is not None:\n                    return PartitionKeyRange(\n                        this_input_asset_partitions["start"], this_input_asset_partitions["end"]\n                    )\n\n        check.failed("The input has no asset partitions")\n\n    def asset_partition_key_for_input(self, input_name: str) -> str:\n        start, end = self.asset_partition_key_range_for_input(input_name)\n        if start == end:\n            return start\n        else:\n            check.failed(\n                f"Tried to access partition key for input '{input_name}' of step '{self.step.key}', "\n                f"but the step input has a partition range: '{start}' to '{end}'."\n            )\n\n    def has_asset_partitions_for_output(self, output_name: str) -> bool:\n        op_config = self.op_config\n        if op_config is not None and "assets" in op_config:\n 
           all_output_asset_partitions = op_config["assets"].get("output_partitions")\n            if all_output_asset_partitions is not None:\n                this_output_asset_partitions = all_output_asset_partitions.get(output_name)\n                if this_output_asset_partitions is not None:\n                    return True\n\n        return False\n\n    def asset_partition_key_range_for_output(self, output_name: str) -> PartitionKeyRange:\n        op_config = self.op_config\n        if op_config is not None and "assets" in op_config:\n            all_output_asset_partitions = op_config["assets"].get("output_partitions")\n            if all_output_asset_partitions is not None:\n                this_output_asset_partitions = all_output_asset_partitions.get(output_name)\n                if this_output_asset_partitions is not None:\n                    return PartitionKeyRange(\n                        this_output_asset_partitions["start"], this_output_asset_partitions["end"]\n                    )\n\n        check.failed("The output has no asset partitions")\n\n    def asset_partition_key_for_output(self, output_name: str) -> str:\n        start, end = self.asset_partition_key_range_for_output(output_name)\n        if start == end:\n            return start\n        else:\n            check.failed(\n                f"Tried to access partition key for output '{output_name}' of step '{self.step.key}', "\n                f"but the step output has a partition range: '{start}' to '{end}'."\n            )\n\n    def asset_partitions_time_window_for_output(self, output_name: str) -> TimeWindow:\n        """The time window for the partitions of the asset correponding to the given output.\n\n        Raises an error if either of the following are true:\n        - The output asset has no partitioning.\n        - The output asset is not partitioned with a TimeWindowPartitionsDefinition.\n        """\n        partitions_def = self.solid_def.output_def_named(output_name).asset_partitions_def\n\n        if not partitions_def:\n            raise ValueError(\n                "Tried to get asset partitions for an output that does not correspond to a "\n                "partitioned asset."\n            )\n\n        if not isinstance(partitions_def, TimeWindowPartitionsDefinition):\n            raise ValueError(\n                "Tried to get asset partitions for an output that correponds to a partitioned "\n                "asset that is not partitioned with a TimeWindowPartitionsDefinition."\n            )\n\n        partition_key_range = self.asset_partition_key_range_for_output(output_name)\n        return TimeWindow(\n            partitions_def.time_window_for_partition_key(partition_key_range.start).start,\n            partitions_def.time_window_for_partition_key(partition_key_range.end).end,\n        )\n\n    def get_input_lineage(self) -> List[AssetLineageInfo]:\n        if not self._input_lineage:\n\n            for step_input in self.step.step_inputs:\n                input_def = step_input.source.get_input_def(self.pipeline_def)\n                dagster_type = input_def.dagster_type\n\n                if dagster_type.kind == DagsterTypeKind.NOTHING:\n                    continue\n\n                self._input_lineage.extend(step_input.source.get_asset_lineage(self))\n\n        self._input_lineage = _dedup_asset_lineage(self._input_lineage)\n\n        return self._input_lineage\n\n\ndef _dedup_asset_lineage(asset_lineage: List[AssetLineageInfo]) -> List[AssetLineageInfo]:\n    """Method to remove 
duplicate specifications of the same Asset/Partition pair from the lineage\n    information. Duplicates can occur naturally when calculating transitive dependencies from solids\n    with multiple Outputs, which in turn have multiple Inputs (because each Output of the solid will\n    inherit all dependencies from all of the solid Inputs).\n    """\n    key_partition_mapping: Dict[AssetKey, Set[str]] = defaultdict(set)\n\n    for lineage_info in asset_lineage:\n        if not lineage_info.partitions:\n            key_partition_mapping[lineage_info.asset_key] |= set()\n        for partition in lineage_info.partitions:\n            key_partition_mapping[lineage_info.asset_key].add(partition)\n    return [\n        AssetLineageInfo(asset_key=asset_key, partitions=partitions)\n        for asset_key, partitions in key_partition_mapping.items()\n    ]\n\n\n
[docs]class TypeCheckContext:\n """The ``context`` object available to a type check function on a DagsterType.\n\n Attributes:\n log (DagsterLogManager): Centralized log dispatch from user code.\n resources (Any): An object whose attributes contain the resources available to this op.\n run_id (str): The id of this job run.\n """\n\n def __init__(\n self,\n run_id: str,\n log_manager: DagsterLogManager,\n scoped_resources_builder: ScopedResourcesBuilder,\n dagster_type: DagsterType,\n ):\n self._run_id = run_id\n self._log = log_manager\n self._resources = scoped_resources_builder.build(dagster_type.required_resource_keys)\n\n @property\n def resources(self) -> "Resources":\n return self._resources\n\n @property\n def run_id(self) -> str:\n return self._run_id\n\n @property\n def log(self) -> DagsterLogManager:\n return self._log
\n
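# --- Editor's illustrative sketch (not part of the module source above) -----
# TypeCheckContext is the ``context`` object handed to a DagsterType type check
# function. The type name and check function below are hypothetical; only the
# (context, value) call signature and the ``log``/``run_id`` attributes
# documented above are assumed.
from dagster import DagsterType

def _check_positive_int(context, value):
    # ``context`` is a TypeCheckContext, so ``log``, ``resources``, and
    # ``run_id`` are available exactly as defined above.
    context.log.info(f"type checking a value in run {context.run_id}")
    return isinstance(value, int) and value > 0

PositiveInt = DagsterType(name="PositiveInt", type_check_fn=_check_positive_int)
# -----------------------------------------------------------------------------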
", "current_page_name": "_modules/dagster/core/execution/context/system", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.execution.context.system"}}, "execute_in_process_result": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.execution.execute_in_process_result

\nfrom typing import Any, Dict, List, Optional, Union, cast\n\nfrom dagster import check\nfrom dagster.core.definitions import NodeDefinition, NodeHandle\nfrom dagster.core.definitions.events import AssetMaterialization, AssetObservation, Materialization\nfrom dagster.core.definitions.utils import DEFAULT_OUTPUT\nfrom dagster.core.errors import DagsterError, DagsterInvariantViolationError\nfrom dagster.core.events import (\n    AssetObservationData,\n    DagsterEvent,\n    DagsterEventType,\n    StepMaterializationData,\n)\nfrom dagster.core.execution.plan.outputs import StepOutputHandle\nfrom dagster.core.storage.pipeline_run import DagsterRun\n\n\n
[docs]class ExecuteInProcessResult:\n def __init__(\n self,\n node_def: NodeDefinition,\n all_events: List[DagsterEvent],\n dagster_run: DagsterRun,\n output_capture: Optional[Dict[StepOutputHandle, Any]],\n ):\n self._node_def = node_def\n\n # If top-level result, no handle will be provided\n self._handle = NodeHandle(node_def.name, parent=None)\n self._event_list = all_events\n self._dagster_run = dagster_run\n\n self._output_capture = check.opt_dict_param(\n output_capture, "output_capture", key_type=StepOutputHandle\n )\n\n @property\n def success(self) -> bool:\n """bool: Whether execution was successful."""\n return self._dagster_run.is_success\n\n @property\n def all_node_events(self) -> List[DagsterEvent]:\n """List[DagsterEvent]: All dagster events from the in-process execution."""\n\n step_events = []\n\n for node_name in self._node_def.ensure_graph_def().node_dict.keys():\n handle = NodeHandle(node_name, None)\n step_events += _filter_events_by_handle(self._event_list, handle)\n\n return step_events\n\n @property\n def all_events(self) -> List[DagsterEvent]:\n """List[DagsterEvent]: All dagster events emitted during in-process execution."""\n\n return self._event_list\n\n @property\n def run_id(self) -> str:\n """str: The run id for the executed run"""\n return self._dagster_run.run_id\n\n @property\n def dagster_run(self) -> DagsterRun:\n """DagsterRun: the DagsterRun object for the completed execution."""\n return self._dagster_run\n\n
[docs] def events_for_node(self, node_name: str) -> List[DagsterEvent]:\n """Retrieves all dagster events for a specific node.\n\n Args:\n node_name (str): The name of the node for which events should be retrieved.\n\n Returns:\n List[DagsterEvent]: A list of all dagster events associated with the provided node name.\n """\n check.str_param(node_name, "node_name")\n\n return _filter_events_by_handle(self._event_list, NodeHandle.from_string(node_name))
\n\n def asset_materializations_for_node(\n self, node_name\n ) -> List[Union[Materialization, AssetMaterialization]]:\n return [\n cast(StepMaterializationData, event.event_specific_data).materialization\n for event in self.events_for_node(node_name)\n if event.event_type_value == DagsterEventType.ASSET_MATERIALIZATION.value\n ]\n\n def asset_observations_for_node(self, node_name) -> List[AssetObservation]:\n return [\n cast(AssetObservationData, event.event_specific_data).asset_observation\n for event in self.events_for_node(node_name)\n if event.event_type_value == DagsterEventType.ASSET_OBSERVATION.value\n ]\n\n
[docs] def output_value(self, output_name: str = DEFAULT_OUTPUT) -> Any:\n """Retrieves output of top-level job, if an output is returned.\n\n If the top-level job has no output, calling this method will result in a\n DagsterInvariantViolationError.\n\n Args:\n output_name (Optional[str]): The name of the output to retrieve. Defaults to `result`,\n the default output name in dagster.\n\n Returns:\n Any: The value of the retrieved output.\n """\n\n check.str_param(output_name, "output_name")\n\n graph_def = self._node_def.ensure_graph_def()\n if not graph_def.has_output(output_name) and len(graph_def.output_mappings) == 0:\n raise DagsterInvariantViolationError(\n f"Attempted to retrieve top-level outputs for '{graph_def.name}', which has no outputs."\n )\n elif not graph_def.has_output(output_name):\n raise DagsterInvariantViolationError(\n f"Could not find top-level output '{output_name}' in '{graph_def.name}'."\n )\n # Resolve the first layer of mapping\n output_mapping = graph_def.get_output_mapping(output_name)\n mapped_node = graph_def.solid_named(output_mapping.maps_from.solid_name)\n origin_output_def, origin_handle = mapped_node.definition.resolve_output_to_origin(\n output_mapping.maps_from.output_name,\n NodeHandle(mapped_node.name, None),\n )\n\n # Get output from origin node\n return _filter_outputs_by_handle(\n self._output_capture, origin_handle, origin_output_def.name\n )
\n\n
[docs] def output_for_node(self, node_str: str, output_name: Optional[str] = DEFAULT_OUTPUT) -> Any:\n """Retrieves output value with a particular name from the in-process run of the job.\n\n Args:\n node_str (str): Name of the op/graph whose output should be retrieved. If the intended\n graph/op is nested within another graph, the syntax is `outer_graph.inner_node`.\n output_name (Optional[str]): Name of the output on the op/graph to retrieve. Defaults to\n `result`, the default output name in dagster.\n\n Returns:\n Any: The value of the retrieved output.\n """\n\n # resolve handle of node that node_str is referring to\n target_handle = NodeHandle.from_string(node_str)\n target_node_def = self._node_def.ensure_graph_def().get_solid(target_handle).definition\n origin_output_def, origin_handle = target_node_def.resolve_output_to_origin(\n output_name, NodeHandle.from_string(node_str)\n )\n\n # retrieve output value from resolved handle\n return _filter_outputs_by_handle(\n self._output_capture, origin_handle, origin_output_def.name\n )
\n\n
[docs] def get_job_success_event(self):\n """Returns a DagsterEvent with type DagsterEventType.PIPELINE_SUCCESS if it occurred during\n execution.\n """\n events = list(\n filter(\n lambda event: event.event_type == DagsterEventType.PIPELINE_SUCCESS, self.all_events\n )\n )\n\n if len(events) == 0:\n raise DagsterError("No event of type DagsterEventType.PIPELINE_SUCCESS found.")\n\n return events[0]
\n\n
[docs] def get_job_failure_event(self):\n """Returns a DagsterEvent with type DagsterEventType.PIPELINE_FAILURE if it occurred during\n execution.\n """\n events = list(\n filter(\n lambda event: event.event_type == DagsterEventType.PIPELINE_FAILURE, self.all_events\n )\n )\n\n if len(events) == 0:\n raise DagsterError("No event of type DagsterEventType.PIPELINE_FAILURE found.")\n\n return events[0]
\n\n\ndef _filter_events_by_handle(\n event_list: List[DagsterEvent], handle: NodeHandle\n) -> List[DagsterEvent]:\n step_events = []\n for event in event_list:\n if event.is_step_event:\n event_handle = cast(\n NodeHandle, event.solid_handle\n ) # step events are guaranteed to have a node handle.\n if event_handle.is_or_descends_from(handle):\n step_events.append(event)\n\n return step_events\n\n\ndef _filter_outputs_by_handle(\n output_dict: Dict[StepOutputHandle, Any],\n node_handle: NodeHandle,\n output_name: str,\n) -> Any:\n mapped_outputs = {}\n step_key = str(node_handle)\n output_found = False\n for step_output_handle, value in output_dict.items():\n\n # For the mapped output case, where step keys are in the format\n # "step_key[upstream_mapped_output_name]" within the step output handle.\n if step_output_handle.step_key.startswith(f"{step_key}["):\n output_found = True\n key_start = step_output_handle.step_key.find("[")\n key_end = step_output_handle.step_key.find("]")\n upstream_mapped_output_name = step_output_handle.step_key[key_start + 1 : key_end]\n mapped_outputs[upstream_mapped_output_name] = value\n\n # For all other cases, search for exact match.\n elif (\n step_key == step_output_handle.step_key\n and step_output_handle.output_name == output_name\n ):\n output_found = True\n if not step_output_handle.mapping_key:\n return output_dict[step_output_handle]\n mapped_outputs[step_output_handle.mapping_key] = value\n\n if not output_found:\n raise DagsterInvariantViolationError(f"No outputs found for node '{node_handle}'.")\n return mapped_outputs\n
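# --- Editor's illustrative sketch (not part of the module source above) -----
# Typical use of ExecuteInProcessResult. The op and job names are hypothetical;
# only the accessors defined above (``success``, ``output_for_node``,
# ``events_for_node``) plus ``JobDefinition.execute_in_process`` are assumed.
from dagster import job, op

@op
def return_five():
    return 5

@job
def my_job():
    return_five()

if __name__ == "__main__":
    result = my_job.execute_in_process()  # returns an ExecuteInProcessResult
    assert result.success
    print(result.output_for_node("return_five"))        # 5
    print(len(result.events_for_node("return_five")))   # all events for that op
# -----------------------------------------------------------------------------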
", "current_page_name": "_modules/dagster/core/execution/execute_in_process_result", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.execution.execute_in_process_result"}, "results": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.execution.results

\nfrom collections import defaultdict\n\nfrom dagster import check\nfrom dagster.core.definitions import GraphDefinition, Node, NodeHandle, PipelineDefinition\nfrom dagster.core.definitions.utils import DEFAULT_OUTPUT\nfrom dagster.core.errors import DagsterInvariantViolationError\nfrom dagster.core.events import DagsterEvent, DagsterEventType\nfrom dagster.core.execution.plan.outputs import StepOutputHandle\nfrom dagster.core.execution.plan.step import StepKind\nfrom dagster.core.execution.plan.utils import build_resources_for_manager\n\n\ndef _construct_events_by_step_key(event_list):\n    events_by_step_key = defaultdict(list)\n    for event in event_list:\n        events_by_step_key[event.step_key].append(event)\n\n    return dict(events_by_step_key)\n\n\nclass GraphExecutionResult:\n    def __init__(\n        self,\n        container,\n        event_list,\n        reconstruct_context,\n        pipeline_def,\n        handle=None,\n        output_capture=None,\n    ):\n        self.container = check.inst_param(container, "container", GraphDefinition)\n        self.event_list = check.list_param(event_list, "step_event_list", of_type=DagsterEvent)\n        self.reconstruct_context = check.callable_param(reconstruct_context, "reconstruct_context")\n        self.pipeline_def = check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n        self.handle = check.opt_inst_param(handle, "handle", NodeHandle)\n        self.output_capture = check.opt_dict_param(\n            output_capture, "output_capture", key_type=StepOutputHandle\n        )\n        self._events_by_step_key = _construct_events_by_step_key(event_list)\n\n    @property\n    def success(self):\n        """bool: Whether all steps in the execution were successful."""\n        return all([not event.is_failure for event in self.event_list])\n\n    @property\n    def step_event_list(self):\n        """List[DagsterEvent] The full list of events generated by steps in the execution.\n\n        Excludes events generated by the pipeline lifecycle, e.g., ``PIPELINE_START``.\n        """\n        return [event for event in self.event_list if event.is_step_event]\n\n    @property\n    def events_by_step_key(self):\n        return self._events_by_step_key\n\n    def result_for_solid(self, name):\n        """Get the result of a top level solid.\n\n        Args:\n            name (str): The name of the top-level solid or aliased solid for which to retrieve the\n                result.\n\n        Returns:\n            Union[CompositeSolidExecutionResult, SolidExecutionResult]: The result of the solid\n            execution within the pipeline.\n        """\n        if not self.container.has_solid_named(name):\n            raise DagsterInvariantViolationError(\n                "Tried to get result for solid '{name}' in '{container}'. No such top level "\n                "solid.".format(name=name, container=self.container.name)\n            )\n\n        return self.result_for_handle(NodeHandle(name, None))\n\n    def output_for_solid(self, handle_str, output_name=DEFAULT_OUTPUT):\n        """Get the output of a solid by its solid handle string and output name.\n\n        Args:\n            handle_str (str): The string handle for the solid.\n            output_name (str): Optional. 
The name of the output, default to DEFAULT_OUTPUT.\n\n        Returns:\n            The output value for the handle and output_name.\n        """\n        check.str_param(handle_str, "handle_str")\n        check.str_param(output_name, "output_name")\n        return self.result_for_handle(NodeHandle.from_string(handle_str)).output_value(output_name)\n\n    @property\n    def solid_result_list(self):\n        """List[Union[CompositeSolidExecutionResult, SolidExecutionResult]]: The results for each\n        top level solid."""\n        return [self.result_for_solid(solid.name) for solid in self.container.solids]\n\n    def _result_for_handle(self, solid, handle):\n        if not solid:\n            raise DagsterInvariantViolationError(\n                "Can not find solid handle {handle_str}.".format(handle_str=handle.to_string())\n            )\n\n        events_by_kind = defaultdict(list)\n\n        if solid.is_graph:\n            events = []\n            for event in self.event_list:\n                if event.is_step_event:\n                    if event.solid_handle.is_or_descends_from(handle.with_ancestor(self.handle)):\n                        events_by_kind[event.step_kind].append(event)\n                        events.append(event)\n\n            return CompositeSolidExecutionResult(\n                solid,\n                events,\n                events_by_kind,\n                self.reconstruct_context,\n                self.pipeline_def,\n                handle=handle.with_ancestor(self.handle),\n                output_capture=self.output_capture,\n            )\n        else:\n            for event in self.event_list:\n                if event.is_step_event:\n                    if event.solid_handle.is_or_descends_from(handle.with_ancestor(self.handle)):\n                        events_by_kind[event.step_kind].append(event)\n\n            return SolidExecutionResult(\n                solid,\n                events_by_kind,\n                self.reconstruct_context,\n                self.pipeline_def,\n                output_capture=self.output_capture,\n            )\n\n    def result_for_handle(self, handle):\n        """Get the result of a solid by its solid handle.\n\n        This allows indexing into top-level solids to retrieve the results of children of\n        composite solids.\n\n        Args:\n            handle (Union[str,NodeHandle]): The handle for the solid.\n\n        Returns:\n            Union[CompositeSolidExecutionResult, SolidExecutionResult]: The result of the given\n            solid.\n        """\n        if isinstance(handle, str):\n            handle = NodeHandle.from_string(handle)\n        else:\n            check.inst_param(handle, "handle", NodeHandle)\n\n        solid = self.container.get_solid(handle)\n\n        return self._result_for_handle(solid, handle)\n\n\n
[docs]class PipelineExecutionResult(GraphExecutionResult):\n """The result of executing a pipeline.\n\n Returned by :py:func:`execute_pipeline`. Users should not instantiate this class directly.\n """\n\n def __init__(\n self,\n pipeline_def,\n run_id,\n event_list,\n reconstruct_context,\n output_capture=None,\n ):\n self.run_id = check.str_param(run_id, "run_id")\n check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n\n super(PipelineExecutionResult, self).__init__(\n container=pipeline_def.graph,\n event_list=event_list,\n reconstruct_context=reconstruct_context,\n pipeline_def=pipeline_def,\n output_capture=output_capture,\n )
\n\n\n
[docs]class CompositeSolidExecutionResult(GraphExecutionResult):\n """Execution result for a composite solid in a pipeline.\n\n Users should not instantiate this class directly.\n """\n\n def __init__(\n self,\n solid,\n event_list,\n step_events_by_kind,\n reconstruct_context,\n pipeline_def,\n handle=None,\n output_capture=None,\n ):\n check.inst_param(solid, "solid", Node)\n check.invariant(\n solid.is_graph,\n desc="Tried to instantiate a CompositeSolidExecutionResult with a noncomposite solid",\n )\n self.solid = solid\n self.step_events_by_kind = check.dict_param(\n step_events_by_kind, "step_events_by_kind", key_type=StepKind, value_type=list\n )\n self.output_capture = check.opt_dict_param(\n output_capture, "output_capture", key_type=StepOutputHandle\n )\n super(CompositeSolidExecutionResult, self).__init__(\n container=solid.definition,\n event_list=event_list,\n reconstruct_context=reconstruct_context,\n pipeline_def=pipeline_def,\n handle=handle,\n output_capture=output_capture,\n )\n\n def output_values_for_solid(self, name):\n check.str_param(name, "name")\n return self.result_for_solid(name).output_values\n\n def output_values_for_handle(self, handle_str):\n check.str_param(handle_str, "handle_str")\n\n return self.result_for_handle(handle_str).output_values\n\n def output_value_for_solid(self, name, output_name=DEFAULT_OUTPUT):\n check.str_param(name, "name")\n check.str_param(output_name, "output_name")\n\n return self.result_for_solid(name).output_value(output_name)\n\n def output_value_for_handle(self, handle_str, output_name=DEFAULT_OUTPUT):\n check.str_param(handle_str, "handle_str")\n check.str_param(output_name, "output_name")\n\n return self.result_for_handle(handle_str).output_value(output_name)\n\n @property\n def output_values(self):\n values = {}\n\n for output_name in self.solid.definition.output_dict:\n output_mapping = self.solid.definition.get_output_mapping(output_name)\n\n inner_solid_values = self._result_for_handle(\n self.solid.definition.solid_named(output_mapping.maps_from.solid_name),\n NodeHandle(output_mapping.maps_from.solid_name, None),\n ).output_values\n\n if inner_solid_values is not None: # may be None if inner solid was skipped\n if output_mapping.maps_from.output_name in inner_solid_values:\n values[output_name] = inner_solid_values[output_mapping.maps_from.output_name]\n\n return values\n\n def output_value(self, output_name=DEFAULT_OUTPUT):\n check.str_param(output_name, "output_name")\n\n if not self.solid.definition.has_output(output_name):\n raise DagsterInvariantViolationError(\n "Output '{output_name}' not defined in composite solid '{solid}': "\n "{outputs_clause}. If you were expecting this output to be present, you may "\n "be missing an output_mapping from an inner solid to its enclosing composite "\n "solid.".format(\n output_name=output_name,\n solid=self.solid.name,\n outputs_clause="found outputs {output_names}".format(\n output_names=str(list(self.solid.definition.output_dict.keys()))\n )\n if self.solid.definition.output_dict\n else "no output mappings were defined",\n )\n )\n\n output_mapping = self.solid.definition.get_output_mapping(output_name)\n\n return self._result_for_handle(\n self.solid.definition.solid_named(output_mapping.maps_from.solid_name),\n NodeHandle(output_mapping.maps_from.solid_name, None),\n ).output_value(output_mapping.maps_from.output_name)
\n\n\n
[docs]class SolidExecutionResult:\n """Execution result for a leaf solid in a pipeline.\n\n Users should not instantiate this class.\n """\n\n def __init__(\n self, solid, step_events_by_kind, reconstruct_context, pipeline_def, output_capture=None\n ):\n check.inst_param(solid, "solid", Node)\n check.invariant(\n not solid.is_graph,\n desc="Tried to instantiate a SolidExecutionResult with a composite solid",\n )\n self.solid = solid\n self.step_events_by_kind = check.dict_param(\n step_events_by_kind, "step_events_by_kind", key_type=StepKind, value_type=list\n )\n self.reconstruct_context = check.callable_param(reconstruct_context, "reconstruct_context")\n self.output_capture = check.opt_dict_param(output_capture, "output_capture")\n self.pipeline_def = check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n\n @property\n def compute_input_event_dict(self):\n """Dict[str, DagsterEvent]: All events of type ``STEP_INPUT``, keyed by input name."""\n return {se.event_specific_data.input_name: se for se in self.input_events_during_compute}\n\n @property\n def input_events_during_compute(self):\n """List[DagsterEvent]: All events of type ``STEP_INPUT``."""\n return self._compute_steps_of_type(DagsterEventType.STEP_INPUT)\n\n
[docs] def get_output_event_for_compute(self, output_name="result"):\n """The ``STEP_OUTPUT`` event for the given output name.\n\n Throws if not present.\n\n Args:\n output_name (Optional[str]): The name of the output. (default: 'result')\n\n Returns:\n DagsterEvent: The corresponding event.\n """\n events = self.get_output_events_for_compute(output_name)\n check.invariant(\n len(events) == 1, "Multiple output events returned, use get_output_events_for_compute"\n )\n return events[0]
\n\n @property\n def compute_output_events_dict(self):\n """Dict[str, List[DagsterEvent]]: All events of type ``STEP_OUTPUT``, keyed by output name"""\n results = defaultdict(list)\n for se in self.output_events_during_compute:\n results[se.step_output_data.output_name].append(se)\n\n return dict(results)\n\n
[docs] def get_output_events_for_compute(self, output_name="result"):\n """The ``STEP_OUTPUT`` events for the given output name.\n\n Throws if none are present.\n\n Args:\n output_name (Optional[str]): The name of the output. (default: 'result')\n\n Returns:\n List[DagsterEvent]: The corresponding events.\n """\n return self.compute_output_events_dict[output_name]
\n\n @property\n def output_events_during_compute(self):\n """List[DagsterEvent]: All events of type ``STEP_OUTPUT``."""\n return self._compute_steps_of_type(DagsterEventType.STEP_OUTPUT)\n\n @property\n def compute_step_events(self):\n """List[DagsterEvent]: All events generated by execution of the solid compute function."""\n return self.step_events_by_kind.get(StepKind.COMPUTE, [])\n\n @property\n def step_events(self):\n return self.compute_step_events\n\n @property\n def materializations_during_compute(self):\n """List[Materialization]: All materializations yielded by the solid."""\n return [\n mat_event.event_specific_data.materialization\n for mat_event in self.materialization_events_during_compute\n ]\n\n @property\n def materialization_events_during_compute(self):\n """List[DagsterEvent]: All events of type ``ASSET_MATERIALIZATION``."""\n return self._compute_steps_of_type(DagsterEventType.ASSET_MATERIALIZATION)\n\n @property\n def expectation_events_during_compute(self):\n """List[DagsterEvent]: All events of type ``STEP_EXPECTATION_RESULT``."""\n return self._compute_steps_of_type(DagsterEventType.STEP_EXPECTATION_RESULT)\n\n def _compute_steps_of_type(self, dagster_event_type):\n return list(\n filter(lambda se: se.event_type == dagster_event_type, self.compute_step_events)\n )\n\n @property\n def expectation_results_during_compute(self):\n """List[ExpectationResult]: All expectation results yielded by the solid"""\n return [\n expt_event.event_specific_data.expectation_result\n for expt_event in self.expectation_events_during_compute\n ]\n\n
[docs] def get_step_success_event(self):\n """DagsterEvent: The ``STEP_SUCCESS`` event, throws if not present."""\n for step_event in self.compute_step_events:\n if step_event.event_type == DagsterEventType.STEP_SUCCESS:\n return step_event\n\n check.failed("Step success not found for solid {}".format(self.solid.name))
\n\n @property\n def compute_step_failure_event(self):\n """DagsterEvent: The ``STEP_FAILURE`` event, throws if it did not fail."""\n if self.success:\n raise DagsterInvariantViolationError(\n "Cannot call compute_step_failure_event if successful"\n )\n\n step_failure_events = self._compute_steps_of_type(DagsterEventType.STEP_FAILURE)\n check.invariant(len(step_failure_events) == 1)\n return step_failure_events[0]\n\n @property\n def success(self):\n """bool: Whether solid execution was successful."""\n any_success = False\n for step_event in self.compute_step_events:\n if step_event.event_type == DagsterEventType.STEP_FAILURE:\n return False\n if step_event.event_type == DagsterEventType.STEP_SUCCESS:\n any_success = True\n\n return any_success\n\n @property\n def skipped(self):\n """bool: Whether solid execution was skipped."""\n return all(\n [\n step_event.event_type == DagsterEventType.STEP_SKIPPED\n for step_event in self.compute_step_events\n ]\n )\n\n @property\n def output_values(self):\n """Union[None, Dict[str, Union[Any, Dict[str, Any]]]: The computed output values.\n\n Returns ``None`` if execution did not succeed.\n\n Returns a dictionary where keys are output names and the values are:\n * the output values in the normal case\n * a dictionary from mapping key to corresponding value in the mapped case\n\n Note that accessing this property will reconstruct the pipeline context (including, e.g.,\n resources) to retrieve materialized output values.\n """\n if not self.success or not self.compute_step_events:\n return None\n\n results = {}\n with self.reconstruct_context() as context:\n for compute_step_event in self.compute_step_events:\n if compute_step_event.is_successful_output:\n output = compute_step_event.step_output_data\n step = context.execution_plan.get_step_by_key(compute_step_event.step_key)\n value = self._get_value(context.for_step(step), output)\n check.invariant(\n not (output.mapping_key and step.get_mapping_key()),\n "Not set up to handle mapped outputs downstream of mapped steps",\n )\n mapping_key = output.mapping_key or step.get_mapping_key()\n if mapping_key:\n if results.get(output.output_name) is None:\n results[output.output_name] = {mapping_key: value}\n else:\n results[output.output_name][mapping_key] = value\n else:\n results[output.output_name] = value\n\n return results\n\n
[docs] def output_value(self, output_name=DEFAULT_OUTPUT):\n """Get a computed output value.\n\n Note that calling this method will reconstruct the pipeline context (including, e.g.,\n resources) to retrieve materialized output values.\n\n Args:\n output_name(str): The output name for which to retrieve the value. (default: 'result')\n\n Returns:\n Union[None, Any, Dict[str, Any]]: ``None`` if execution did not succeed, the output value\n in the normal case, and a dict of mapping keys to values in the mapped case.\n """\n check.str_param(output_name, "output_name")\n\n if not self.solid.definition.has_output(output_name):\n raise DagsterInvariantViolationError(\n "Output '{output_name}' not defined in solid '{solid}': found outputs "\n "{output_names}".format(\n output_name=output_name,\n solid=self.solid.name,\n output_names=str(list(self.solid.definition.output_dict.keys())),\n )\n )\n\n if not self.success:\n return None\n\n with self.reconstruct_context() as context:\n found = False\n result = None\n for compute_step_event in self.compute_step_events:\n if (\n compute_step_event.is_successful_output\n and compute_step_event.step_output_data.output_name == output_name\n ):\n found = True\n output = compute_step_event.step_output_data\n step = context.execution_plan.get_step_by_key(compute_step_event.step_key)\n value = self._get_value(context.for_step(step), output)\n check.invariant(\n not (output.mapping_key and step.get_mapping_key()),\n "Not set up to handle mapped outputs downstream of mapped steps",\n )\n mapping_key = output.mapping_key or step.get_mapping_key()\n if mapping_key:\n if result is None:\n result = {mapping_key: value}\n else:\n result[mapping_key] = value\n else:\n result = value\n\n if found:\n return result\n\n raise DagsterInvariantViolationError(\n (\n "Did not find result {output_name} in solid {self.solid.name} "\n "execution result"\n ).format(output_name=output_name, self=self)\n )
\n\n def _get_value(self, context, step_output_data):\n step_output_handle = step_output_data.step_output_handle\n # output capture dictionary will only have values in the in process case, but will not have\n # values from steps launched via step launcher.\n if self.output_capture and step_output_handle in self.output_capture:\n return self.output_capture[step_output_handle]\n manager = context.get_io_manager(step_output_handle)\n manager_key = context.execution_plan.get_manager_key(step_output_handle, self.pipeline_def)\n res = manager.load_input(\n context.for_input_manager(\n name=None,\n config=None,\n metadata=None,\n dagster_type=self.solid.output_def_named(step_output_data.output_name).dagster_type,\n source_handle=step_output_handle,\n resource_config=context.resolved_run_config.resources[manager_key].config,\n resources=build_resources_for_manager(manager_key, context),\n )\n )\n return res\n\n @property\n def failure_data(self):\n """Union[None, StepFailureData]: Any data corresponding to this step's failure, if it\n failed."""\n for step_event in self.compute_step_events:\n if step_event.event_type == DagsterEventType.STEP_FAILURE:\n return step_event.step_failure_data\n\n @property\n def retry_attempts(self) -> int:\n """Number of times this step retried"""\n count = 0\n for step_event in self.compute_step_events:\n if step_event.event_type == DagsterEventType.STEP_RESTARTED:\n count += 1\n return count
\n
", "current_page_name": "_modules/dagster/core/execution/results", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.execution.results"}, "validate_run_config": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.execution.validate_run_config

\nfrom typing import Any, Dict, Optional, cast\n\nfrom dagster import check\nfrom dagster.core.definitions import JobDefinition, PipelineDefinition\nfrom dagster.core.system_config.objects import ResolvedRunConfig\n\n\n
[docs]def validate_run_config(\n job_def: Optional[JobDefinition] = None,\n run_config: Optional[Dict[str, Any]] = None,\n mode: Optional[str] = None,\n pipeline_def: Optional[PipelineDefinition] = None,\n) -> Dict[str, Any]:\n """Function to validate a provided run config blob against a given job. For legacy APIs, a\n pipeline/mode can also be passed in.\n\n If validation is successful, this function will return a dictionary representation of the\n validated config actually used during execution.\n\n Args:\n job_def (Union[PipelineDefinition, JobDefinition]): The job definition to validate run\n config against\n run_config (Optional[Dict[str, Any]]): The run config to validate\n mode (str): The mode of the pipeline to validate against (different modes may require\n different config)\n pipeline_def (PipelineDefinition): The pipeline definition to validate run config against.\n\n Returns:\n Dict[str, Any]: A dictionary representation of the validated config.\n """\n\n job_def = check.opt_inst_param(job_def, "job_def", (JobDefinition, PipelineDefinition))\n pipeline_def = check.opt_inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n run_config = check.opt_dict_param(run_config, "run_config", key_type=str)\n\n if job_def and pipeline_def:\n check.failed("Cannot specify both a job_def and a pipeline_def")\n\n pipeline_or_job_def = pipeline_def or job_def\n\n if pipeline_or_job_def is None:\n check.failed("Must specify at least one of job_def and pipeline_def")\n\n pipeline_or_job_def = cast(PipelineDefinition, pipeline_def or job_def)\n mode = check.opt_str_param(mode, "mode", default=pipeline_or_job_def.get_default_mode_name())\n\n return ResolvedRunConfig.build(pipeline_or_job_def, run_config, mode=mode).to_dict()
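# --- Illustrative example (not part of this module) -------------------------
# A minimal sketch of validating run config against a job before launching
# anything. `greet` and `greeting_job` are hypothetical names; `@op`, `@job`,
# and the top-level `validate_run_config` export are assumed to be available
# in this release line. On success a dict representation of the resolved
# config is returned; invalid config raises a Dagster config error instead.
from dagster import job, op, validate_run_config


@op(config_schema={"name": str})
def greet(context):
    context.log.info(f"hello, {context.op_config['name']}")


@job
def greeting_job():
    greet()


validated = validate_run_config(
    greeting_job,
    run_config={"ops": {"greet": {"config": {"name": "dagster"}}}},
)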
\n
", "current_page_name": "_modules/dagster/core/execution/validate_run_config", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.execution.validate_run_config"}}, "executor": {"base": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.executor.base

\nfrom abc import ABC, abstractmethod\n\nfrom dagster.core.execution.retries import RetryMode\n\n\n
[docs]class Executor(ABC): # pylint: disable=no-init\n
[docs] @abstractmethod\n def execute(self, plan_context, execution_plan):\n """\n For the given context and execution plan, orchestrate a series of sub-plan executions such that the whole plan is executed.\n\n Args:\n plan_context (PlanOrchestrationContext): The plan's orchestration context.\n execution_plan (ExecutionPlan): The plan to execute.\n\n Returns:\n A stream of Dagster events.\n """
\n\n @property\n @abstractmethod\n def retries(self) -> RetryMode:\n """\n Whether retries are enabled or disabled for this instance of the executor.\n\n Executors should allow this to be controlled via configuration if possible.\n\n Returns: RetryMode\n """
\n
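# --- Illustrative example (not part of this module) -------------------------
# A skeleton of the two members a concrete Executor must provide, per the
# abstract interface above. `MySerialExecutor` is a hypothetical name; a real
# implementation would drive Dagster's plan-execution machinery inside
# execute() and yield the resulting DagsterEvents.
from dagster.core.execution.retries import RetryMode
from dagster.core.executor.base import Executor


class MySerialExecutor(Executor):
    def __init__(self, retries: RetryMode = RetryMode.DISABLED):
        self._retries = retries

    @property
    def retries(self) -> RetryMode:
        # Report whether retries are enabled for this executor instance.
        return self._retries

    def execute(self, plan_context, execution_plan):
        # Orchestrate the sub-plan executions for `execution_plan` here and
        # yield DagsterEvents as they are produced (sketch only).
        raise NotImplementedError("sketch only")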
", "current_page_name": "_modules/dagster/core/executor/base", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.executor.base"}, "init": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.executor.init

\nfrom typing import Dict, NamedTuple\n\nfrom dagster import check\nfrom dagster.core.definitions import ExecutorDefinition, IPipeline\nfrom dagster.core.instance import DagsterInstance\n\n\n
[docs]class InitExecutorContext(\n NamedTuple(\n "InitExecutorContext",\n [\n ("job", IPipeline),\n ("executor_def", ExecutorDefinition),\n ("executor_config", Dict[str, object]),\n ("instance", DagsterInstance),\n ],\n )\n):\n """Executor-specific initialization context.\n\n Attributes:\n job (IPipeline): The job to be executed.\n executor_def (ExecutorDefinition): The definition of the executor currently being\n constructed.\n executor_config (dict): The parsed config passed to the executor.\n instance (DagsterInstance): The current instance.\n """\n\n def __new__(\n cls,\n job: IPipeline,\n executor_def: ExecutorDefinition,\n executor_config: Dict[str, object],\n instance: DagsterInstance,\n ):\n return super(InitExecutorContext, cls).__new__(\n cls,\n job=check.inst_param(job, "job", IPipeline),\n executor_def=check.inst_param(executor_def, "executor_def", ExecutorDefinition),\n executor_config=check.dict_param(executor_config, "executor_config", key_type=str),\n instance=check.inst_param(instance, "instance", DagsterInstance),\n )\n\n @property\n def pipeline(self) -> IPipeline:\n return self.job
\n
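# --- Illustrative example (not part of this module) -------------------------
# A sketch of where an InitExecutorContext typically shows up: the function
# decorated with dagster's `@executor` receives one and returns an Executor.
# `my_serial_executor` is a hypothetical name, and `MySerialExecutor` is the
# hypothetical Executor subclass sketched under dagster.core.executor.base.
from dagster import executor
from dagster.core.execution.retries import RetryMode


@executor(name="my_serial_executor", config_schema={"retries_enabled": bool})
def my_serial_executor(init_context):
    # init_context is an InitExecutorContext; executor_config holds the parsed
    # config declared via config_schema above.
    mode = (
        RetryMode.ENABLED
        if init_context.executor_config["retries_enabled"]
        else RetryMode.DISABLED
    )
    return MySerialExecutor(retries=mode)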
", "current_page_name": "_modules/dagster/core/executor/init", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.executor.init"}}, "instance": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.instance

\nimport inspect\nimport logging\nimport logging.config\nimport os\nimport sys\nimport time\nimport warnings\nimport weakref\nfrom collections import defaultdict\nfrom contextlib import ExitStack\nfrom enum import Enum\nfrom tempfile import TemporaryDirectory\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport yaml\n\nfrom dagster import check\nfrom dagster.core.definitions.events import AssetKey\nfrom dagster.core.definitions.pipeline_base import InMemoryPipeline\nfrom dagster.core.definitions.pipeline_definition import (\n    PipelineDefinition,\n    PipelineSubsetDefinition,\n)\nfrom dagster.core.errors import (\n    DagsterHomeNotSetError,\n    DagsterInvariantViolationError,\n    DagsterRunAlreadyExists,\n    DagsterRunConflict,\n)\nfrom dagster.core.storage.pipeline_run import (\n    IN_PROGRESS_RUN_STATUSES,\n    DagsterRun,\n    JobBucket,\n    PipelineRun,\n    PipelineRunStatsSnapshot,\n    PipelineRunStatus,\n    RunRecord,\n    RunsFilter,\n    TagBucket,\n)\nfrom dagster.core.storage.tags import MEMOIZED_RUN_TAG\nfrom dagster.core.system_config.objects import ResolvedRunConfig\nfrom dagster.core.utils import str_format_list\nfrom dagster.serdes import ConfigurableClass\nfrom dagster.seven import get_current_datetime_in_utc\nfrom dagster.utils import traced\nfrom dagster.utils.backcompat import experimental_functionality_warning\nfrom dagster.utils.error import serializable_error_info_from_exc_info\n\nfrom .config import DAGSTER_CONFIG_YAML_FILENAME, is_dagster_home_set\nfrom .ref import InstanceRef\n\n# 'airflow_execution_date' and 'is_airflow_ingest_pipeline' are hardcoded tags used in the\n# airflow ingestion logic (see: dagster_pipeline_factory.py). 
'airflow_execution_date' stores the\n# 'execution_date' used in Airflow operator execution and 'is_airflow_ingest_pipeline' determines\n# whether 'airflow_execution_date' is needed.\n# https://github.com/dagster-io/dagster/issues/2403\nAIRFLOW_EXECUTION_DATE_STR = "airflow_execution_date"\nIS_AIRFLOW_INGEST_PIPELINE_STR = "is_airflow_ingest_pipeline"\n\n\nif TYPE_CHECKING:\n    from dagster.core.debug import DebugRunPayload\n    from dagster.core.events import DagsterEvent, DagsterEventType\n    from dagster.core.events.log import EventLogEntry\n    from dagster.core.execution.stats import RunStepKeyStatsSnapshot\n    from dagster.core.host_representation import HistoricalPipeline\n    from dagster.core.launcher import RunLauncher\n    from dagster.core.run_coordinator import RunCoordinator\n    from dagster.core.scheduler import Scheduler\n    from dagster.core.scheduler.instigation import InstigatorTick, TickStatus\n    from dagster.core.snap import ExecutionPlanSnapshot, PipelineSnapshot\n    from dagster.core.storage.compute_log_manager import ComputeLogManager\n    from dagster.core.storage.event_log import EventLogStorage\n    from dagster.core.storage.event_log.base import EventLogRecord, EventRecordsFilter\n    from dagster.core.storage.root import LocalArtifactStorage\n    from dagster.core.storage.runs import RunStorage\n    from dagster.core.storage.schedules import ScheduleStorage\n    from dagster.core.workspace.workspace import IWorkspace\n    from dagster.daemon.types import DaemonHeartbeat\n\n\ndef _check_run_equality(\n    pipeline_run: PipelineRun, candidate_run: PipelineRun\n) -> Dict[str, Tuple[Any, Any]]:\n    field_diff = {}\n    for field in pipeline_run._fields:\n        expected_value = getattr(pipeline_run, field)\n        candidate_value = getattr(candidate_run, field)\n        if expected_value != candidate_value:\n            field_diff[field] = (expected_value, candidate_value)\n\n    return field_diff\n\n\ndef _format_field_diff(field_diff: Dict[str, Tuple[Any, Any]]) -> str:\n    return "\\n".join(\n        [\n            (\n                "    {field_name}:\\n"\n                + "        Expected: {expected_value}\\n"\n                + "        Received: {candidate_value}"\n            ).format(\n                field_name=field_name,\n                expected_value=expected_value,\n                candidate_value=candidate_value,\n            )\n            for field_name, (\n                expected_value,\n                candidate_value,\n            ) in field_diff.items()\n        ]\n    )\n\n\nclass _EventListenerLogHandler(logging.Handler):\n    def __init__(self, instance):\n        self._instance = instance\n        super(_EventListenerLogHandler, self).__init__()\n\n    def emit(self, record):\n        from dagster.core.events import EngineEventData\n        from dagster.core.events.log import StructuredLoggerMessage, construct_event_record\n\n        event = construct_event_record(\n            StructuredLoggerMessage(\n                name=record.name,\n                message=record.msg,\n                level=record.levelno,\n                meta=record.dagster_meta,\n                record=record,\n            )\n        )\n\n        try:\n            self._instance.handle_new_event(event)\n        except Exception as e:\n            sys.stderr.write(f"Exception while writing logger call to event log: {str(e)}\\n")\n            if event.dagster_event:\n                # Swallow user-generated log failures so that the entire step/run 
doesn't fail, but\n                # raise failures writing system-generated log events since they are the source of\n                # truth for the state of the run\n                raise\n            elif event.run_id:\n                self._instance.report_engine_event(\n                    "Exception while writing logger call to event log",\n                    pipeline_name=event.pipeline_name,\n                    run_id=event.run_id,\n                    step_key=event.step_key,\n                    engine_event_data=EngineEventData(\n                        error=serializable_error_info_from_exc_info(sys.exc_info()),\n                    ),\n                )\n\n\nclass InstanceType(Enum):\n    PERSISTENT = "PERSISTENT"\n    EPHEMERAL = "EPHEMERAL"\n\n\nclass MayHaveInstanceWeakref:\n    """Mixin for classes that can have a weakref back to a Dagster instance."""\n\n    def __init__(self):\n        self._instance_weakref: weakref.ReferenceType["DagsterInstance"] = None\n\n    @property\n    def _instance(self) -> "DagsterInstance":\n        instance = (\n            self._instance_weakref()\n            # Backcompat with custom subclasses that don't call super().__init__()\n            # in their own __init__ implementations\n            if (hasattr(self, "_instance_weakref") and self._instance_weakref is not None)\n            else None\n        )\n        return cast("DagsterInstance", instance)\n\n    def register_instance(self, instance: "DagsterInstance"):\n        check.invariant(\n            # Backcompat with custom subclasses that don't call super().__init__()\n            # in their own __init__ implementations\n            (not hasattr(self, "_instance_weakref") or self._instance_weakref is None),\n            "Must only call initialize once",\n        )\n\n        # Store a weakref to avoid a circular reference / enable GC\n        self._instance_weakref = weakref.ref(instance)\n\n\n
[docs]class DagsterInstance:\n """Core abstraction for managing Dagster's access to storage and other resources.\n\n Use DagsterInstance.get() to grab the current DagsterInstance which will load based on\n the values in the ``dagster.yaml`` file in ``$DAGSTER_HOME``.\n\n Alternatively, DagsterInstance.ephemeral() can use used which provides a set of\n transient in-memory components.\n\n Configuration of this class should be done by setting values in ``$DAGSTER_HOME/dagster.yaml``.\n For example, to use Postgres for run and event log storage, you can write a ``dagster.yaml``\n such as the following:\n\n .. literalinclude:: ../../../../../examples/docs_snippets/docs_snippets/deploying/postgres_dagster.yaml\n :caption: dagster.yaml\n :language: YAML\n\n Args:\n instance_type (InstanceType): Indicates whether the instance is ephemeral or persistent.\n Users should not attempt to set this value directly or in their ``dagster.yaml`` files.\n local_artifact_storage (LocalArtifactStorage): The local artifact storage is used to\n configure storage for any artifacts that require a local disk, such as schedules, or\n when using the filesystem system storage to manage files and intermediates. By default,\n this will be a :py:class:`dagster.core.storage.root.LocalArtifactStorage`. Configurable\n in ``dagster.yaml`` using the :py:class:`~dagster.serdes.ConfigurableClass`\n machinery.\n run_storage (RunStorage): The run storage is used to store metadata about ongoing and past\n pipeline runs. By default, this will be a\n :py:class:`dagster.core.storage.runs.SqliteRunStorage`. Configurable in ``dagster.yaml``\n using the :py:class:`~dagster.serdes.ConfigurableClass` machinery.\n event_storage (EventLogStorage): Used to store the structured event logs generated by\n pipeline runs. By default, this will be a\n :py:class:`dagster.core.storage.event_log.SqliteEventLogStorage`. Configurable in\n ``dagster.yaml`` using the :py:class:`~dagster.serdes.ConfigurableClass` machinery.\n compute_log_manager (ComputeLogManager): The compute log manager handles stdout and stderr\n logging for solid compute functions. By default, this will be a\n :py:class:`dagster.core.storage.local_compute_log_manager.LocalComputeLogManager`.\n Configurable in ``dagster.yaml`` using the\n :py:class:`~dagster.serdes.ConfigurableClass` machinery.\n run_coordinator (RunCoordinator): A runs coordinator may be used to manage the execution\n of pipeline runs.\n run_launcher (Optional[RunLauncher]): Optionally, a run launcher may be used to enable\n a Dagster instance to launch pipeline runs, e.g. on a remote Kubernetes cluster, in\n addition to running them locally.\n settings (Optional[Dict]): Specifies certain per-instance settings,\n such as feature flags. 
These are set in the ``dagster.yaml`` under a set of whitelisted\n keys.\n ref (Optional[InstanceRef]): Used by internal machinery to pass instances across process\n boundaries.\n """\n\n _PROCESS_TEMPDIR: Optional[TemporaryDirectory] = None\n _EXIT_STACK = None\n\n def __init__(\n self,\n instance_type: InstanceType,\n local_artifact_storage: "LocalArtifactStorage",\n run_storage: "RunStorage",\n event_storage: "EventLogStorage",\n compute_log_manager: "ComputeLogManager",\n run_coordinator: "RunCoordinator",\n run_launcher: "RunLauncher",\n scheduler: Optional["Scheduler"] = None,\n schedule_storage: Optional["ScheduleStorage"] = None,\n settings: Optional[Dict[str, Any]] = None,\n ref: Optional[InstanceRef] = None,\n ):\n from dagster.core.launcher import RunLauncher\n from dagster.core.run_coordinator import RunCoordinator\n from dagster.core.scheduler import Scheduler\n from dagster.core.storage.compute_log_manager import ComputeLogManager\n from dagster.core.storage.event_log import EventLogStorage\n from dagster.core.storage.root import LocalArtifactStorage\n from dagster.core.storage.runs import RunStorage\n from dagster.core.storage.schedules import ScheduleStorage\n\n self._instance_type = check.inst_param(instance_type, "instance_type", InstanceType)\n self._local_artifact_storage = check.inst_param(\n local_artifact_storage, "local_artifact_storage", LocalArtifactStorage\n )\n self._event_storage = check.inst_param(event_storage, "event_storage", EventLogStorage)\n self._event_storage.register_instance(self)\n\n self._run_storage = check.inst_param(run_storage, "run_storage", RunStorage)\n self._run_storage.register_instance(self)\n\n self._compute_log_manager = check.inst_param(\n compute_log_manager, "compute_log_manager", ComputeLogManager\n )\n self._compute_log_manager.register_instance(self)\n self._scheduler = check.opt_inst_param(scheduler, "scheduler", Scheduler)\n\n self._schedule_storage = check.opt_inst_param(\n schedule_storage, "schedule_storage", ScheduleStorage\n )\n if self._schedule_storage:\n self._schedule_storage.register_instance(self)\n\n self._run_coordinator = check.inst_param(run_coordinator, "run_coordinator", RunCoordinator)\n self._run_coordinator.register_instance(self)\n\n self._run_launcher = check.inst_param(run_launcher, "run_launcher", RunLauncher)\n self._run_launcher.register_instance(self)\n\n self._settings = check.opt_dict_param(settings, "settings")\n\n self._ref = check.opt_inst_param(ref, "ref", InstanceRef)\n\n self._subscribers: Dict[str, List[Callable]] = defaultdict(list)\n\n run_monitoring_enabled = self.run_monitoring_settings.get("enabled", False)\n if run_monitoring_enabled and not self.run_launcher.supports_check_run_worker_health:\n run_monitoring_enabled = False\n warnings.warn(\n "The configured run launcher does not support run monitoring, disabling it.",\n )\n self._run_monitoring_enabled = run_monitoring_enabled\n if self.run_monitoring_enabled and self.run_monitoring_max_resume_run_attempts:\n check.invariant(\n self.run_launcher.supports_resume_run,\n "The configured run launcher does not support resuming runs. "\n "Set max_resume_run_attempts to 0 to use run monitoring. 
Any runs with a failed run "\n "worker will be marked as failed, but will not be resumed.",\n )\n\n # ctors\n\n @staticmethod\n def ephemeral(\n tempdir: Optional[str] = None, preload: Optional[List["DebugRunPayload"]] = None\n ) -> "DagsterInstance":\n from dagster.core.launcher.sync_in_memory_run_launcher import SyncInMemoryRunLauncher\n from dagster.core.run_coordinator import DefaultRunCoordinator\n from dagster.core.storage.event_log import InMemoryEventLogStorage\n from dagster.core.storage.noop_compute_log_manager import NoOpComputeLogManager\n from dagster.core.storage.root import LocalArtifactStorage\n from dagster.core.storage.runs import InMemoryRunStorage\n\n if tempdir is None:\n tempdir = DagsterInstance.temp_storage()\n\n return DagsterInstance(\n instance_type=InstanceType.EPHEMERAL,\n local_artifact_storage=LocalArtifactStorage(tempdir),\n run_storage=InMemoryRunStorage(preload=preload),\n event_storage=InMemoryEventLogStorage(preload=preload),\n compute_log_manager=NoOpComputeLogManager(),\n run_coordinator=DefaultRunCoordinator(),\n run_launcher=SyncInMemoryRunLauncher(),\n )\n\n @staticmethod\n def get() -> "DagsterInstance":\n dagster_home_path = os.getenv("DAGSTER_HOME")\n\n if not dagster_home_path:\n raise DagsterHomeNotSetError(\n (\n "The environment variable $DAGSTER_HOME is not set. \\n"\n "Dagster requires this environment variable to be set to an existing directory in your filesystem. "\n "This directory is used to store metadata across sessions, or load the dagster.yaml "\n "file which can configure storing metadata in an external database.\\n"\n "You can resolve this error by exporting the environment variable. For example, you can run the following command in your shell or include it in your shell configuration file:\\n"\n '\\texport DAGSTER_HOME=~"/dagster_home"\\n'\n "or PowerShell\\n"\n "$env:DAGSTER_HOME = ($home + '\\\\dagster_home')"\n "or batch"\n "set DAGSTER_HOME=%UserProfile%/dagster_home"\n "Alternatively, DagsterInstance.ephemeral() can be used for a transient instance.\\n"\n )\n )\n\n dagster_home_path = os.path.expanduser(dagster_home_path)\n\n if not os.path.isabs(dagster_home_path):\n raise DagsterInvariantViolationError(\n (\n '$DAGSTER_HOME "{}" must be an absolute path. Dagster requires this '\n "environment variable to be set to an existing directory in your filesystem."\n ).format(dagster_home_path)\n )\n\n if not (os.path.exists(dagster_home_path) and os.path.isdir(dagster_home_path)):\n raise DagsterInvariantViolationError(\n (\n '$DAGSTER_HOME "{}" is not a directory or does not exist. 
Dagster requires this '\n "environment variable to be set to an existing directory in your filesystem"\n ).format(dagster_home_path)\n )\n\n return DagsterInstance.from_config(dagster_home_path)\n\n @staticmethod\n def local_temp(tempdir=None, overrides=None) -> "DagsterInstance":\n if tempdir is None:\n tempdir = DagsterInstance.temp_storage()\n\n return DagsterInstance.from_ref(InstanceRef.from_dir(tempdir, overrides=overrides))\n\n @staticmethod\n def from_config(\n config_dir: str,\n config_filename: str = DAGSTER_CONFIG_YAML_FILENAME,\n ) -> "DagsterInstance":\n instance_ref = InstanceRef.from_dir(config_dir, config_filename=config_filename)\n return DagsterInstance.from_ref(instance_ref)\n\n @staticmethod\n def from_ref(instance_ref: InstanceRef) -> "DagsterInstance":\n check.inst_param(instance_ref, "instance_ref", InstanceRef)\n\n # DagsterInstance doesn't implement ConfigurableClass, but we may still sometimes want to\n # have custom subclasses of DagsterInstance. This machinery allows for those custom\n # subclasses to receive additional keyword arguments passed through the config YAML.\n klass = instance_ref.custom_instance_class or DagsterInstance\n kwargs = instance_ref.custom_instance_class_config\n\n return klass( # type: ignore\n instance_type=InstanceType.PERSISTENT,\n local_artifact_storage=instance_ref.local_artifact_storage,\n run_storage=instance_ref.run_storage,\n event_storage=instance_ref.event_storage,\n compute_log_manager=instance_ref.compute_log_manager,\n schedule_storage=instance_ref.schedule_storage,\n scheduler=instance_ref.scheduler,\n run_coordinator=instance_ref.run_coordinator,\n run_launcher=instance_ref.run_launcher,\n settings=instance_ref.settings,\n ref=instance_ref,\n **kwargs,\n )\n\n # flags\n\n @property\n def is_persistent(self) -> bool:\n return self._instance_type == InstanceType.PERSISTENT\n\n @property\n def is_ephemeral(self) -> bool:\n return self._instance_type == InstanceType.EPHEMERAL\n\n def get_ref(self) -> InstanceRef:\n if self._ref:\n return self._ref\n\n check.failed(\n "Attempted to prepare an ineligible DagsterInstance ({inst_type}) for cross "\n "process communication.{dagster_home_msg}".format(\n inst_type=self._instance_type,\n dagster_home_msg="\\nDAGSTER_HOME environment variable is not set, set it to "\n "a directory on the filesystem for dagster to use for storage and cross "\n "process coordination."\n if os.getenv("DAGSTER_HOME") is None\n else "",\n )\n )\n\n @property\n def root_directory(self) -> str:\n return self._local_artifact_storage.base_dir\n\n @staticmethod\n def temp_storage() -> str:\n from dagster.core.test_utils import environ\n\n if DagsterInstance._PROCESS_TEMPDIR is None:\n DagsterInstance._EXIT_STACK = ExitStack()\n DagsterInstance._EXIT_STACK.enter_context(\n environ({"DAGSTER_TELEMETRY_DISABLED": "yes"})\n )\n DagsterInstance._PROCESS_TEMPDIR = TemporaryDirectory()\n return cast(TemporaryDirectory, DagsterInstance._PROCESS_TEMPDIR).name\n\n def _info(self, component):\n # ConfigurableClass may not have inst_data if it's a direct instantiation\n # which happens for ephemeral instances\n if isinstance(component, ConfigurableClass) and component.inst_data:\n return component.inst_data.info_dict()\n if type(component) is dict:\n return component\n return component.__class__.__name__\n\n def _info_str_for_component(self, component_name, component):\n return yaml.dump(\n {component_name: self._info(component)}, default_flow_style=False, sort_keys=False\n )\n\n def info_dict(self):\n\n settings = 
self._settings if self._settings else {}\n\n ret = {\n "local_artifact_storage": self._info(self._local_artifact_storage),\n "run_storage": self._info(self._run_storage),\n "event_log_storage": self._info(self._event_storage),\n "compute_logs": self._info(self._compute_log_manager),\n "schedule_storage": self._info(self._schedule_storage),\n "scheduler": self._info(self._scheduler),\n "run_coordinator": self._info(self._run_coordinator),\n "run_launcher": self._info(self._run_launcher),\n }\n ret.update(\n {\n settings_key: self._info(settings_value)\n for settings_key, settings_value in settings.items()\n }\n )\n\n return ret\n\n def info_str(self) -> str:\n return yaml.dump(self.info_dict(), default_flow_style=False, sort_keys=False)\n\n @property\n def run_storage(self) -> "RunStorage":\n return self._run_storage\n\n @property\n def event_log_storage(self) -> "EventLogStorage":\n return self._event_storage\n\n # schedule storage\n\n @property\n def schedule_storage(self) -> Optional["ScheduleStorage"]:\n return self._schedule_storage\n\n @property\n def scheduler(self) -> Optional["Scheduler"]:\n return self._scheduler\n\n @property\n def scheduler_class(self) -> Optional[str]:\n return self.scheduler.__class__.__name__ if self.scheduler else None\n\n # run coordinator\n\n @property\n def run_coordinator(self) -> "RunCoordinator":\n return self._run_coordinator\n\n # run launcher\n\n @property\n def run_launcher(self) -> "RunLauncher":\n return self._run_launcher\n\n # compute logs\n\n @property\n def compute_log_manager(self) -> "ComputeLogManager":\n return self._compute_log_manager\n\n def get_settings(self, settings_key: str) -> Any:\n check.str_param(settings_key, "settings_key")\n if self._settings and settings_key in self._settings:\n return self._settings.get(settings_key)\n return {}\n\n @property\n def telemetry_enabled(self) -> bool:\n if self.is_ephemeral:\n return False\n\n dagster_telemetry_enabled_default = True\n\n telemetry_settings = self.get_settings("telemetry")\n\n if not telemetry_settings:\n return dagster_telemetry_enabled_default\n\n if "enabled" in telemetry_settings:\n return telemetry_settings["enabled"]\n elif "experimental_dagit" in telemetry_settings:\n return telemetry_settings["experimental_dagit"]\n else:\n return dagster_telemetry_enabled_default\n\n # run monitoring\n\n @property\n def run_monitoring_enabled(self) -> bool:\n return self._run_monitoring_enabled\n\n @property\n def run_monitoring_settings(self) -> Dict:\n return self.get_settings("run_monitoring")\n\n @property\n def run_monitoring_start_timeout_seconds(self) -> int:\n return self.run_monitoring_settings.get("start_timeout_seconds", 180)\n\n @property\n def run_monitoring_max_resume_run_attempts(self) -> int:\n default_max_resume_run_attempts = 3 if self.run_launcher.supports_resume_run else 0\n return self.run_monitoring_settings.get(\n "max_resume_run_attempts", default_max_resume_run_attempts\n )\n\n @property\n def run_monitoring_poll_interval_seconds(self) -> int:\n return self.run_monitoring_settings.get("poll_interval_seconds", 120)\n\n @property\n def cancellation_thread_poll_interval_seconds(self) -> int:\n return self.get_settings("run_monitoring").get(\n "cancellation_thread_poll_interval_seconds", 10\n )\n\n # python logs\n\n @property\n def managed_python_loggers(self) -> List[str]:\n python_log_settings = self.get_settings("python_logs") or {}\n return python_log_settings.get("managed_python_loggers", [])\n\n @property\n def python_log_level(self) -> Optional[str]:\n 
python_log_settings = self.get_settings("python_logs") or {}\n return python_log_settings.get("python_log_level")\n\n def upgrade(self, print_fn=None):\n from dagster.core.storage.migration.utils import upgrading_instance\n\n with upgrading_instance(self):\n\n if print_fn:\n print_fn("Updating run storage...")\n self._run_storage.upgrade()\n self._run_storage.migrate(print_fn)\n\n if print_fn:\n print_fn("Updating event storage...")\n self._event_storage.upgrade()\n self._event_storage.reindex_assets(print_fn=print_fn)\n\n if print_fn:\n print_fn("Updating schedule storage...")\n self._schedule_storage.upgrade()\n self._schedule_storage.migrate(print_fn)\n\n def optimize_for_dagit(self, statement_timeout):\n if self._schedule_storage:\n self._schedule_storage.optimize_for_dagit(statement_timeout=statement_timeout)\n self._run_storage.optimize_for_dagit(statement_timeout=statement_timeout)\n self._event_storage.optimize_for_dagit(statement_timeout=statement_timeout)\n\n def reindex(self, print_fn=lambda _: None):\n print_fn("Checking for reindexing...")\n self._event_storage.reindex_events(print_fn)\n self._event_storage.reindex_assets(print_fn)\n self._run_storage.optimize(print_fn)\n self._schedule_storage.optimize(print_fn)\n print_fn("Done.")\n\n def dispose(self):\n self._run_storage.dispose()\n self.run_coordinator.dispose()\n self._run_launcher.dispose()\n self._event_storage.dispose()\n self._compute_log_manager.dispose()\n\n # run storage\n @traced\n def get_run_by_id(self, run_id: str) -> Optional[PipelineRun]:\n return self._run_storage.get_run_by_id(run_id)\n\n @traced\n def get_pipeline_snapshot(self, snapshot_id: str) -> "PipelineSnapshot":\n return self._run_storage.get_pipeline_snapshot(snapshot_id)\n\n @traced\n def has_pipeline_snapshot(self, snapshot_id: str) -> bool:\n return self._run_storage.has_pipeline_snapshot(snapshot_id)\n\n @traced\n def has_snapshot(self, snapshot_id: str) -> bool:\n return self._run_storage.has_snapshot(snapshot_id)\n\n @traced\n def get_historical_pipeline(self, snapshot_id: str) -> "HistoricalPipeline":\n from dagster.core.host_representation import HistoricalPipeline\n\n snapshot = self._run_storage.get_pipeline_snapshot(snapshot_id)\n parent_snapshot = (\n self._run_storage.get_pipeline_snapshot(snapshot.lineage_snapshot.parent_snapshot_id)\n if snapshot.lineage_snapshot\n else None\n )\n return HistoricalPipeline(snapshot, snapshot_id, parent_snapshot)\n\n @traced\n def has_historical_pipeline(self, snapshot_id: str) -> bool:\n return self._run_storage.has_pipeline_snapshot(snapshot_id)\n\n @traced\n def get_execution_plan_snapshot(self, snapshot_id: str) -> "ExecutionPlanSnapshot":\n return self._run_storage.get_execution_plan_snapshot(snapshot_id)\n\n @traced\n def get_run_stats(self, run_id: str) -> PipelineRunStatsSnapshot:\n return self._event_storage.get_stats_for_run(run_id)\n\n @traced\n def get_run_step_stats(self, run_id, step_keys=None) -> List["RunStepKeyStatsSnapshot"]:\n return self._event_storage.get_step_stats_for_run(run_id, step_keys)\n\n @traced\n def get_run_tags(self) -> List[Tuple[str, Set[str]]]:\n return self._run_storage.get_run_tags()\n\n @traced\n def get_run_group(self, run_id: str) -> Optional[Tuple[str, Iterable[PipelineRun]]]:\n return self._run_storage.get_run_group(run_id)\n\n def create_run_for_pipeline(\n self,\n pipeline_def,\n execution_plan=None,\n run_id=None,\n run_config=None,\n mode=None,\n solids_to_execute=None,\n status=None,\n tags=None,\n root_run_id=None,\n parent_run_id=None,\n 
solid_selection=None,\n external_pipeline_origin=None,\n pipeline_code_origin=None,\n ):\n from dagster.core.execution.api import create_execution_plan\n from dagster.core.execution.plan.plan import ExecutionPlan\n from dagster.core.snap import snapshot_from_execution_plan\n\n check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n check.opt_inst_param(execution_plan, "execution_plan", ExecutionPlan)\n\n # note that solids_to_execute is required to execute the solid subset, which is the\n # frozenset version of the previous solid_subset.\n # solid_selection is not required and will not be converted to solids_to_execute here.\n # i.e. this function doesn't handle solid queries.\n # solid_selection is only used to pass the user queries further down.\n check.opt_set_param(solids_to_execute, "solids_to_execute", of_type=str)\n check.opt_list_param(solid_selection, "solid_selection", of_type=str)\n\n if solids_to_execute:\n if isinstance(pipeline_def, PipelineSubsetDefinition):\n # for the case when pipeline_def is created by IPipeline or ExternalPipeline\n check.invariant(\n solids_to_execute == pipeline_def.solids_to_execute,\n "Cannot create a PipelineRun from pipeline subset {pipeline_solids_to_execute} "\n "that conflicts with solids_to_execute arg {solids_to_execute}".format(\n pipeline_solids_to_execute=str_format_list(pipeline_def.solids_to_execute),\n solids_to_execute=str_format_list(solids_to_execute),\n ),\n )\n else:\n # for cases when `create_run_for_pipeline` is directly called\n pipeline_def = pipeline_def.get_pipeline_subset_def(\n solids_to_execute=solids_to_execute\n )\n\n step_keys_to_execute = None\n\n if execution_plan:\n step_keys_to_execute = execution_plan.step_keys_to_execute\n\n else:\n execution_plan = create_execution_plan(\n pipeline=InMemoryPipeline(pipeline_def),\n run_config=run_config,\n mode=mode,\n instance_ref=self.get_ref() if self.is_persistent else None,\n tags=tags,\n )\n\n return self.create_run(\n pipeline_name=pipeline_def.name,\n run_id=run_id,\n run_config=run_config,\n mode=check.opt_str_param(mode, "mode", default=pipeline_def.get_default_mode_name()),\n solid_selection=solid_selection,\n solids_to_execute=solids_to_execute,\n step_keys_to_execute=step_keys_to_execute,\n status=status,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n pipeline_snapshot=pipeline_def.get_pipeline_snapshot(),\n execution_plan_snapshot=snapshot_from_execution_plan(\n execution_plan,\n pipeline_def.get_pipeline_snapshot_id(),\n ),\n parent_pipeline_snapshot=pipeline_def.get_parent_pipeline_snapshot(),\n external_pipeline_origin=external_pipeline_origin,\n pipeline_code_origin=pipeline_code_origin,\n )\n\n def _construct_run_with_snapshots(\n self,\n pipeline_name,\n run_id,\n run_config,\n mode,\n solids_to_execute,\n step_keys_to_execute,\n status,\n tags,\n root_run_id,\n parent_run_id,\n pipeline_snapshot,\n execution_plan_snapshot,\n parent_pipeline_snapshot,\n solid_selection=None,\n external_pipeline_origin=None,\n pipeline_code_origin=None,\n ):\n\n # https://github.com/dagster-io/dagster/issues/2403\n if tags and IS_AIRFLOW_INGEST_PIPELINE_STR in tags:\n if AIRFLOW_EXECUTION_DATE_STR not in tags:\n tags[AIRFLOW_EXECUTION_DATE_STR] = get_current_datetime_in_utc().isoformat()\n\n check.invariant(\n not (not pipeline_snapshot and execution_plan_snapshot),\n "It is illegal to have an execution plan snapshot and not have a pipeline snapshot. 
"\n "It is possible to have no execution plan snapshot since we persist runs "\n "that do not successfully compile execution plans in the scheduled case.",\n )\n\n pipeline_snapshot_id = (\n self._ensure_persisted_pipeline_snapshot(pipeline_snapshot, parent_pipeline_snapshot)\n if pipeline_snapshot\n else None\n )\n\n execution_plan_snapshot_id = (\n self._ensure_persisted_execution_plan_snapshot(\n execution_plan_snapshot, pipeline_snapshot_id, step_keys_to_execute\n )\n if execution_plan_snapshot and pipeline_snapshot_id\n else None\n )\n\n return DagsterRun(\n pipeline_name=pipeline_name,\n run_id=run_id,\n run_config=run_config,\n mode=mode,\n solid_selection=solid_selection,\n solids_to_execute=solids_to_execute,\n step_keys_to_execute=step_keys_to_execute,\n status=status,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n pipeline_snapshot_id=pipeline_snapshot_id,\n execution_plan_snapshot_id=execution_plan_snapshot_id,\n external_pipeline_origin=external_pipeline_origin,\n pipeline_code_origin=pipeline_code_origin,\n )\n\n def _ensure_persisted_pipeline_snapshot(self, pipeline_snapshot, parent_pipeline_snapshot):\n from dagster.core.snap import PipelineSnapshot, create_pipeline_snapshot_id\n\n check.inst_param(pipeline_snapshot, "pipeline_snapshot", PipelineSnapshot)\n check.opt_inst_param(parent_pipeline_snapshot, "parent_pipeline_snapshot", PipelineSnapshot)\n\n if pipeline_snapshot.lineage_snapshot:\n if not self._run_storage.has_pipeline_snapshot(\n pipeline_snapshot.lineage_snapshot.parent_snapshot_id\n ):\n check.invariant(\n create_pipeline_snapshot_id(parent_pipeline_snapshot)\n == pipeline_snapshot.lineage_snapshot.parent_snapshot_id,\n "Parent pipeline snapshot id out of sync with passed parent pipeline snapshot",\n )\n\n returned_pipeline_snapshot_id = self._run_storage.add_pipeline_snapshot(\n parent_pipeline_snapshot\n )\n check.invariant(\n pipeline_snapshot.lineage_snapshot.parent_snapshot_id\n == returned_pipeline_snapshot_id\n )\n\n pipeline_snapshot_id = create_pipeline_snapshot_id(pipeline_snapshot)\n if not self._run_storage.has_pipeline_snapshot(pipeline_snapshot_id):\n returned_pipeline_snapshot_id = self._run_storage.add_pipeline_snapshot(\n pipeline_snapshot\n )\n check.invariant(pipeline_snapshot_id == returned_pipeline_snapshot_id)\n\n return pipeline_snapshot_id\n\n def _ensure_persisted_execution_plan_snapshot(\n self, execution_plan_snapshot, pipeline_snapshot_id, step_keys_to_execute\n ):\n from dagster.core.snap.execution_plan_snapshot import (\n ExecutionPlanSnapshot,\n create_execution_plan_snapshot_id,\n )\n\n check.inst_param(execution_plan_snapshot, "execution_plan_snapshot", ExecutionPlanSnapshot)\n check.str_param(pipeline_snapshot_id, "pipeline_snapshot_id")\n check.opt_nullable_list_param(step_keys_to_execute, "step_keys_to_execute", of_type=str)\n\n check.invariant(\n execution_plan_snapshot.pipeline_snapshot_id == pipeline_snapshot_id,\n (\n "Snapshot mismatch: Snapshot ID in execution plan snapshot is "\n '"{ep_pipeline_snapshot_id}" and snapshot_id created in memory is '\n '"{pipeline_snapshot_id}"'\n ).format(\n ep_pipeline_snapshot_id=execution_plan_snapshot.pipeline_snapshot_id,\n pipeline_snapshot_id=pipeline_snapshot_id,\n ),\n )\n\n execution_plan_snapshot_id = create_execution_plan_snapshot_id(execution_plan_snapshot)\n\n if not self._run_storage.has_execution_plan_snapshot(execution_plan_snapshot_id):\n returned_execution_plan_snapshot_id = self._run_storage.add_execution_plan_snapshot(\n 
execution_plan_snapshot\n )\n\n check.invariant(execution_plan_snapshot_id == returned_execution_plan_snapshot_id)\n\n return execution_plan_snapshot_id\n\n def _log_asset_materialization_planned_events(self, pipeline_run, execution_plan_snapshot):\n from dagster.core.events import DagsterEvent\n from dagster.core.execution.context_creation_pipeline import initialize_console_manager\n\n pipeline_name = pipeline_run.pipeline_name\n\n for step in execution_plan_snapshot.steps:\n if step.key in execution_plan_snapshot.step_keys_to_execute:\n for output in step.outputs:\n asset_key = output.properties.asset_key\n if asset_key:\n # Logs and stores asset_materialization_planned event\n DagsterEvent.asset_materialization_planned(\n pipeline_name, asset_key, initialize_console_manager(pipeline_run, self)\n )\n\n def create_run(\n self,\n pipeline_name,\n run_id,\n run_config,\n mode,\n solids_to_execute,\n step_keys_to_execute,\n status,\n tags,\n root_run_id,\n parent_run_id,\n pipeline_snapshot,\n execution_plan_snapshot,\n parent_pipeline_snapshot,\n solid_selection=None,\n external_pipeline_origin=None,\n pipeline_code_origin=None,\n ):\n\n pipeline_run = self._construct_run_with_snapshots(\n pipeline_name=pipeline_name,\n run_id=run_id,\n run_config=run_config,\n mode=mode,\n solid_selection=solid_selection,\n solids_to_execute=solids_to_execute,\n step_keys_to_execute=step_keys_to_execute,\n status=status,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n pipeline_snapshot=pipeline_snapshot,\n execution_plan_snapshot=execution_plan_snapshot,\n parent_pipeline_snapshot=parent_pipeline_snapshot,\n external_pipeline_origin=external_pipeline_origin,\n pipeline_code_origin=pipeline_code_origin,\n )\n\n pipeline_run = self._run_storage.add_run(pipeline_run)\n\n if execution_plan_snapshot:\n self._log_asset_materialization_planned_events(pipeline_run, execution_plan_snapshot)\n\n return pipeline_run\n\n def register_managed_run(\n self,\n pipeline_name,\n run_id,\n run_config,\n mode,\n solids_to_execute,\n step_keys_to_execute,\n tags,\n root_run_id,\n parent_run_id,\n pipeline_snapshot,\n execution_plan_snapshot,\n parent_pipeline_snapshot,\n solid_selection=None,\n ):\n # The usage of this method is limited to dagster-airflow, specifically in Dagster\n # Operators that are executed in Airflow. Because a common workflow in Airflow is to\n # retry dags from arbitrary tasks, we need any node to be capable of creating a\n # PipelineRun.\n #\n # The try-except DagsterRunAlreadyExists block handles the race when multiple "root" tasks\n # simultaneously execute self._run_storage.add_run(pipeline_run). 
When this happens, only\n # one task succeeds in creating the run, while the others get DagsterRunAlreadyExists\n # error; at this point, the failed tasks try again to fetch the existing run.\n # https://github.com/dagster-io/dagster/issues/2412\n\n pipeline_run = self._construct_run_with_snapshots(\n pipeline_name=pipeline_name,\n run_id=run_id,\n run_config=run_config,\n mode=mode,\n solid_selection=solid_selection,\n solids_to_execute=solids_to_execute,\n step_keys_to_execute=step_keys_to_execute,\n status=PipelineRunStatus.MANAGED,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n pipeline_snapshot=pipeline_snapshot,\n execution_plan_snapshot=execution_plan_snapshot,\n parent_pipeline_snapshot=parent_pipeline_snapshot,\n )\n\n def get_run():\n candidate_run = self.get_run_by_id(pipeline_run.run_id)\n\n field_diff = _check_run_equality(pipeline_run, candidate_run)\n\n if field_diff:\n raise DagsterRunConflict(\n "Found conflicting existing run with same id {run_id}. Runs differ in:"\n "\\n{field_diff}".format(\n run_id=pipeline_run.run_id,\n field_diff=_format_field_diff(field_diff),\n ),\n )\n return candidate_run\n\n if self.has_run(pipeline_run.run_id):\n return get_run()\n\n try:\n return self._run_storage.add_run(pipeline_run)\n except DagsterRunAlreadyExists:\n return get_run()\n\n @traced\n def add_run(self, pipeline_run: PipelineRun):\n return self._run_storage.add_run(pipeline_run)\n\n @traced\n def add_snapshot(self, snapshot, snapshot_id=None):\n return self._run_storage.add_snapshot(snapshot, snapshot_id)\n\n @traced\n def handle_run_event(self, run_id: str, event: "DagsterEvent"):\n return self._run_storage.handle_run_event(run_id, event)\n\n @traced\n def add_run_tags(self, run_id: str, new_tags: Dict[str, str]):\n return self._run_storage.add_run_tags(run_id, new_tags)\n\n @traced\n def has_run(self, run_id: str) -> bool:\n return self._run_storage.has_run(run_id)\n\n @traced\n def get_runs(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> Iterable[PipelineRun]:\n return self._run_storage.get_runs(filters, cursor, limit, bucket_by)\n\n @traced\n def get_runs_count(self, filters: Optional[RunsFilter] = None) -> int:\n return self._run_storage.get_runs_count(filters)\n\n @traced\n def get_run_groups(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Dict[str, Dict[str, Union[Iterable[PipelineRun], int]]]:\n return self._run_storage.get_run_groups(filters=filters, cursor=cursor, limit=limit)\n\n @traced\n def get_run_records(\n self,\n filters: Optional[RunsFilter] = None,\n limit: Optional[int] = None,\n order_by: Optional[str] = None,\n ascending: bool = False,\n cursor: Optional[str] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> List[RunRecord]:\n """Return a list of run records stored in the run storage, sorted by the given column in given order.\n\n Args:\n filters (Optional[RunsFilter]): the filter by which to filter runs.\n limit (Optional[int]): Number of results to get. Defaults to infinite.\n order_by (Optional[str]): Name of the column to sort by. Defaults to id.\n ascending (Optional[bool]): Sort the result in ascending order if True, descending\n otherwise. 
Defaults to descending.\n\n Returns:\n List[RunRecord]: List of run records stored in the run storage.\n """\n return self._run_storage.get_run_records(\n filters, limit, order_by, ascending, cursor, bucket_by\n )\n\n @property\n def supports_bucket_queries(self):\n return self._run_storage.supports_bucket_queries\n\n def wipe(self):\n self._run_storage.wipe()\n self._event_storage.wipe()\n\n @traced\n def delete_run(self, run_id: str):\n self._run_storage.delete_run(run_id)\n self._event_storage.delete_events(run_id)\n\n # event storage\n @traced\n def logs_after(\n self,\n run_id,\n cursor,\n of_type: Optional["DagsterEventType"] = None,\n limit: Optional[int] = None,\n ):\n return self._event_storage.get_logs_for_run(\n run_id,\n cursor=cursor,\n of_type=of_type,\n limit=limit,\n )\n\n @traced\n def all_logs(\n self, run_id, of_type: Optional[Union["DagsterEventType", Set["DagsterEventType"]]] = None\n ):\n return self._event_storage.get_logs_for_run(run_id, of_type=of_type)\n\n def watch_event_logs(self, run_id, cursor, cb):\n return self._event_storage.watch(run_id, cursor, cb)\n\n def end_watch_event_logs(self, run_id, cb):\n return self._event_storage.end_watch(run_id, cb)\n\n # asset storage\n\n @traced\n def all_asset_keys(self):\n return self._event_storage.all_asset_keys()\n\n @traced\n def get_asset_keys(self, prefix=None, limit=None, cursor=None):\n return self._event_storage.get_asset_keys(prefix=prefix, limit=limit, cursor=cursor)\n\n @traced\n def has_asset_key(self, asset_key: AssetKey) -> bool:\n return self._event_storage.has_asset_key(asset_key)\n\n @traced\n def get_latest_materialization_events(\n self, asset_keys: Sequence[AssetKey]\n ) -> Mapping[AssetKey, Optional["EventLogEntry"]]:\n return self._event_storage.get_latest_materialization_events(asset_keys)\n\n @traced\n def get_event_records(\n self,\n event_records_filter: Optional["EventRecordsFilter"] = None,\n limit: Optional[int] = None,\n ascending: bool = False,\n ) -> Iterable["EventLogRecord"]:\n """Return a list of event records stored in the event log storage.\n\n Args:\n event_records_filter (Optional[EventRecordsFilter]): the filter by which to filter event\n records.\n limit (Optional[int]): Number of results to get. Defaults to infinite.\n ascending (Optional[bool]): Sort the result in ascending order if True, descending\n otherwise. Defaults to descending.\n\n Returns:\n List[EventLogRecord]: List of event log records stored in the event log storage.\n """\n return self._event_storage.get_event_records(event_records_filter, limit, ascending)\n\n @traced\n def events_for_asset_key(\n self,\n asset_key,\n partitions=None,\n before_cursor=None,\n after_cursor=None,\n cursor=None,\n before_timestamp=None,\n limit=None,\n ascending=False,\n ):\n check.inst_param(asset_key, "asset_key", AssetKey)\n\n warnings.warn(\n """\nThe method `events_for_asset_key` on DagsterInstance has been deprecated as of `0.12.0` in favor of\nthe method `get_event_records`. The method `get_event_records` takes in an `EventRecordsFilter`\nargument that allows for filtering by asset key and asset key partitions. 
The return value is a\nlist of `EventLogRecord` objects, each of which contains a storage_id and an event log entry.\n\nExample:\nrecords = instance.get_event_records(\n EventRecordsFilter(\n asset_key=asset_key,\n asset_partitions=partitions,\n after_cursor=after_cursor,\n ),\n)\n"""\n )\n\n return self._event_storage.get_asset_events(\n asset_key,\n partitions,\n before_cursor,\n after_cursor,\n limit,\n before_timestamp=before_timestamp,\n ascending=ascending,\n include_cursor=True,\n cursor=cursor,\n )\n\n @traced\n def run_ids_for_asset_key(self, asset_key):\n check.inst_param(asset_key, "asset_key", AssetKey)\n return self._event_storage.get_asset_run_ids(asset_key)\n\n @traced\n def wipe_assets(self, asset_keys):\n check.list_param(asset_keys, "asset_keys", of_type=AssetKey)\n for asset_key in asset_keys:\n self._event_storage.wipe_asset(asset_key)\n\n @traced\n def get_materialization_count_by_partition(\n self, asset_keys: Sequence[AssetKey]\n ) -> Mapping[AssetKey, Mapping[str, int]]:\n return self._event_storage.get_materialization_count_by_partition(asset_keys)\n\n # event subscriptions\n\n def _get_yaml_python_handlers(self):\n if self._settings:\n logging_config = self.get_settings("python_logs").get("dagster_handler_config", {})\n\n if logging_config:\n experimental_functionality_warning("Handling yaml-defined logging configuration")\n\n # Handlers can only be retrieved from dictConfig configuration if they are attached\n # to a logger. We add a dummy logger to the configuration that allows us to access user\n # defined handlers.\n handler_names = logging_config.get("handlers", {}).keys()\n\n dagster_dummy_logger_name = "dagster_dummy_logger"\n\n processed_dict_conf = {\n "version": 1,\n "disable_existing_loggers": False,\n "loggers": {dagster_dummy_logger_name: {"handlers": handler_names}},\n }\n processed_dict_conf.update(logging_config)\n\n logging.config.dictConfig(processed_dict_conf)\n\n dummy_logger = logging.getLogger(dagster_dummy_logger_name)\n return dummy_logger.handlers\n return []\n\n def _get_event_log_handler(self):\n event_log_handler = _EventListenerLogHandler(self)\n event_log_handler.setLevel(10)\n return event_log_handler\n\n def get_handlers(self):\n handlers = [self._get_event_log_handler()]\n handlers.extend(self._get_yaml_python_handlers())\n return handlers\n\n def store_event(self, event):\n self._event_storage.store_event(event)\n\n def handle_new_event(self, event):\n run_id = event.run_id\n\n self._event_storage.store_event(event)\n\n if event.is_dagster_event and event.dagster_event.is_pipeline_event:\n self._run_storage.handle_run_event(run_id, event.dagster_event)\n\n for sub in self._subscribers[run_id]:\n sub(event)\n\n def add_event_listener(self, run_id, cb):\n self._subscribers[run_id].append(cb)\n\n
[docs] def report_engine_event(\n self,\n message,\n pipeline_run=None,\n engine_event_data=None,\n cls=None,\n step_key=None,\n pipeline_name=None,\n run_id=None,\n ):\n """\n Report a EngineEvent that occurred outside of a pipeline execution context.\n """\n from dagster.core.events import DagsterEvent, DagsterEventType, EngineEventData\n from dagster.core.events.log import EventLogEntry\n\n check.opt_class_param(cls, "cls")\n check.str_param(message, "message")\n check.opt_inst_param(pipeline_run, "pipeline_run", PipelineRun)\n check.opt_str_param(run_id, "run_id")\n check.opt_str_param(pipeline_name, "pipeline_name")\n\n check.invariant(\n pipeline_run or (pipeline_name and run_id),\n "Must include either pipeline_run or pipeline_name and run_id",\n )\n\n run_id = run_id if run_id else pipeline_run.run_id\n pipeline_name = pipeline_name if pipeline_name else pipeline_run.pipeline_name\n\n engine_event_data = check.opt_inst_param(\n engine_event_data,\n "engine_event_data",\n EngineEventData,\n EngineEventData([]),\n )\n\n if cls:\n message = "[{}] {}".format(cls.__name__, message)\n\n log_level = logging.INFO\n if engine_event_data and engine_event_data.error:\n log_level = logging.ERROR\n\n dagster_event = DagsterEvent(\n event_type_value=DagsterEventType.ENGINE_EVENT.value,\n pipeline_name=pipeline_name,\n message=message,\n event_specific_data=engine_event_data,\n step_key=step_key,\n )\n event_record = EventLogEntry(\n user_message="",\n level=log_level,\n pipeline_name=pipeline_name,\n run_id=run_id,\n error_info=None,\n timestamp=time.time(),\n step_key=step_key,\n dagster_event=dagster_event,\n )\n\n self.handle_new_event(event_record)\n return dagster_event
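# --- Illustrative example (not part of this module) -------------------------
# A sketch of reporting an engine event from outside an execution context.
# `instance` and `run` are assumed to be an existing DagsterInstance and
# PipelineRun; per the invariant above, either pipeline_run or both
# pipeline_name and run_id must be supplied.
instance.report_engine_event(
    "Provisioning external resources for run.",
    pipeline_run=run,
)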
\n\n def report_run_canceling(self, run, message=None):\n\n from dagster.core.events import DagsterEvent, DagsterEventType\n from dagster.core.events.log import EventLogEntry\n\n check.inst_param(run, "run", PipelineRun)\n message = check.opt_str_param(\n message,\n "message",\n "Sending run termination request.",\n )\n canceling_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_CANCELING.value,\n pipeline_name=run.pipeline_name,\n message=message,\n )\n\n event_record = EventLogEntry(\n user_message="",\n level=logging.INFO,\n pipeline_name=run.pipeline_name,\n run_id=run.run_id,\n error_info=None,\n timestamp=time.time(),\n dagster_event=canceling_event,\n )\n\n self.handle_new_event(event_record)\n\n def report_run_canceled(\n self,\n pipeline_run,\n message=None,\n ):\n from dagster.core.events import DagsterEvent, DagsterEventType\n from dagster.core.events.log import EventLogEntry\n\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n\n message = check.opt_str_param(\n message,\n "mesage",\n "This run has been marked as canceled from outside the execution context.",\n )\n\n dagster_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_CANCELED.value,\n pipeline_name=pipeline_run.pipeline_name,\n message=message,\n )\n event_record = EventLogEntry(\n user_message="",\n level=logging.ERROR,\n pipeline_name=pipeline_run.pipeline_name,\n run_id=pipeline_run.run_id,\n error_info=None,\n timestamp=time.time(),\n dagster_event=dagster_event,\n )\n\n self.handle_new_event(event_record)\n return dagster_event\n\n def report_run_failed(self, pipeline_run, message=None):\n from dagster.core.events import DagsterEvent, DagsterEventType\n from dagster.core.events.log import EventLogEntry\n\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n\n message = check.opt_str_param(\n message,\n "message",\n "This run has been marked as failed from outside the execution context.",\n )\n\n dagster_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_FAILURE.value,\n pipeline_name=pipeline_run.pipeline_name,\n message=message,\n )\n event_record = EventLogEntry(\n user_message="",\n level=logging.ERROR,\n pipeline_name=pipeline_run.pipeline_name,\n run_id=pipeline_run.run_id,\n error_info=None,\n timestamp=time.time(),\n dagster_event=dagster_event,\n )\n\n self.handle_new_event(event_record)\n return dagster_event\n\n # directories\n\n def file_manager_directory(self, run_id):\n return self._local_artifact_storage.file_manager_dir(run_id)\n\n def storage_directory(self):\n return self._local_artifact_storage.storage_dir\n\n def schedules_directory(self):\n return self._local_artifact_storage.schedules_dir\n\n # Runs coordinator\n\n
[docs] def submit_run(self, run_id, workspace: "IWorkspace") -> PipelineRun:\n """Submit a pipeline run to the coordinator.\n\n This method delegates to the ``RunCoordinator``, configured on the instance, and will\n call its implementation of ``RunCoordinator.submit_run()`` to send the run to the\n coordinator for execution. Runs should be created in the instance (e.g., by calling\n ``DagsterInstance.create_run()``) *before* this method is called, and\n should be in the ``PipelineRunStatus.NOT_STARTED`` state. They also must have a non-null\n ExternalPipelineOrigin.\n\n Args:\n run_id (str): The id of the run.\n """\n\n from dagster.core.host_representation import ExternalPipelineOrigin\n from dagster.core.origin import PipelinePythonOrigin\n from dagster.core.run_coordinator import SubmitRunContext\n\n run = self.get_run_by_id(run_id)\n if run is None:\n raise DagsterInvariantViolationError(\n f"Could not load run {run_id} that was passed to submit_run"\n )\n\n check.inst(\n run.external_pipeline_origin,\n ExternalPipelineOrigin,\n "External pipeline origin must be set for submitted runs",\n )\n check.inst(\n run.pipeline_code_origin,\n PipelinePythonOrigin,\n "Python origin must be set for submitted runs",\n )\n\n try:\n submitted_run = self._run_coordinator.submit_run(\n SubmitRunContext(run, workspace=workspace)\n )\n except:\n from dagster.core.events import EngineEventData\n\n error = serializable_error_info_from_exc_info(sys.exc_info())\n self.report_engine_event(\n error.message,\n run,\n EngineEventData.engine_error(error),\n )\n self.report_run_failed(run)\n raise\n\n return submitted_run
\n\n # Run launcher\n\n
[docs] def launch_run(self, run_id: str, workspace: "IWorkspace"):\n """Launch a pipeline run.\n\n This method is typically called using `instance.submit_run` rather than being invoked\n directly. This method delegates to the ``RunLauncher``, if any, configured on the instance,\n and will call its implementation of ``RunLauncher.launch_run()`` to begin the execution of\n the specified run. Runs should be created in the instance (e.g., by calling\n ``DagsterInstance.create_run()``) *before* this method is called, and should be in the\n ``PipelineRunStatus.NOT_STARTED`` state.\n\n Args:\n run_id (str): The id of the run to launch.\n """\n from dagster.core.events import DagsterEvent, DagsterEventType, EngineEventData\n from dagster.core.events.log import EventLogEntry\n from dagster.core.launcher import LaunchRunContext\n\n run = self.get_run_by_id(run_id)\n if run is None:\n raise DagsterInvariantViolationError(\n f"Could not load run {run_id} that was passed to launch_run"\n )\n\n launch_started_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_STARTING.value,\n pipeline_name=run.pipeline_name,\n )\n\n event_record = EventLogEntry(\n user_message="",\n level=logging.INFO,\n pipeline_name=run.pipeline_name,\n run_id=run.run_id,\n error_info=None,\n timestamp=time.time(),\n dagster_event=launch_started_event,\n )\n\n self.handle_new_event(event_record)\n\n run = self.get_run_by_id(run_id)\n if run is None:\n check.failed(f"Failed to reload run {run_id}")\n\n try:\n self._run_launcher.launch_run(LaunchRunContext(pipeline_run=run, workspace=workspace))\n except:\n error = serializable_error_info_from_exc_info(sys.exc_info())\n self.report_engine_event(\n error.message,\n run,\n EngineEventData.engine_error(error),\n )\n self.report_run_failed(run)\n raise\n\n return run
\n\n
[docs] def resume_run(self, run_id: str, workspace: "IWorkspace", attempt_number: int):\n """Resume a pipeline run.\n\n This method should be called on runs which have already been launched, but whose run workers\n have died.\n\n Args:\n run_id (str): The id of the run to resume.\n """\n from dagster.core.events import EngineEventData\n from dagster.core.launcher import ResumeRunContext\n from dagster.daemon.monitoring import RESUME_RUN_LOG_MESSAGE\n\n run = self.get_run_by_id(run_id)\n if run is None:\n raise DagsterInvariantViolationError(\n f"Could not load run {run_id} that was passed to resume_run"\n )\n if run.status not in IN_PROGRESS_RUN_STATUSES:\n raise DagsterInvariantViolationError(\n f"Run {run_id} is not in a state that can be resumed"\n )\n\n self.report_engine_event(\n RESUME_RUN_LOG_MESSAGE,\n run,\n )\n\n try:\n self._run_launcher.resume_run(\n ResumeRunContext(\n pipeline_run=run,\n workspace=workspace,\n resume_attempt_number=attempt_number,\n )\n )\n except:\n error = serializable_error_info_from_exc_info(sys.exc_info())\n self.report_engine_event(\n error.message,\n run,\n EngineEventData.engine_error(error),\n )\n self.report_run_failed(run)\n raise\n\n return run
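A rough sketch (an assumption, not taken from the source) of how ``resume_run`` combines with the monitoring helpers defined just below (``count_resume_run_attempts`` and ``run_will_resume``); ``workspace`` and ``run_id`` are placeholders for a workspace and a run whose worker has died:

# Hypothetical sketch: resume a dead run worker, bounded by
# run_monitoring_max_resume_run_attempts via run_will_resume().
attempts = instance.count_resume_run_attempts(run_id)
if instance.run_will_resume(run_id):
    instance.resume_run(run_id, workspace, attempt_number=attempts + 1)
else:
    # Out of attempts: mark the run as failed instead of resuming it.
    instance.report_run_failed(instance.get_run_by_id(run_id))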
\n\n def count_resume_run_attempts(self, run_id: str):\n from dagster.core.events import DagsterEventType\n from dagster.daemon.monitoring import RESUME_RUN_LOG_MESSAGE\n\n events = self.all_logs(run_id, of_type=DagsterEventType.ENGINE_EVENT)\n return len([event for event in events if event.message == RESUME_RUN_LOG_MESSAGE])\n\n def run_will_resume(self, run_id: str):\n if not self.run_monitoring_enabled:\n return False\n return self.count_resume_run_attempts(run_id) < self.run_monitoring_max_resume_run_attempts\n\n # Scheduler\n\n def start_schedule(self, external_schedule):\n return self._scheduler.start_schedule(self, external_schedule)\n\n def stop_schedule(self, schedule_origin_id, external_schedule):\n return self._scheduler.stop_schedule(self, schedule_origin_id, external_schedule)\n\n def scheduler_debug_info(self):\n from dagster.core.definitions.run_request import InstigatorType\n from dagster.core.scheduler import SchedulerDebugInfo\n\n errors = []\n\n schedules = []\n for schedule_state in self.all_instigator_state(instigator_type=InstigatorType.SCHEDULE):\n schedule_info = {\n schedule_state.instigator_name: {\n "status": schedule_state.status.value,\n "cron_schedule": schedule_state.instigator_data.cron_schedule,\n "schedule_origin_id": schedule_state.instigator_origin_id,\n "repository_origin_id": schedule_state.repository_origin_id,\n }\n }\n\n schedules.append(yaml.safe_dump(schedule_info, default_flow_style=False))\n\n return SchedulerDebugInfo(\n scheduler_config_info=self._info_str_for_component("Scheduler", self.scheduler),\n scheduler_info=self.scheduler.debug_info(),\n schedule_storage=schedules,\n errors=errors,\n )\n\n # Schedule / Sensor Storage\n\n def start_sensor(self, external_sensor):\n from dagster.core.definitions.run_request import InstigatorType\n from dagster.core.scheduler.instigation import (\n InstigatorState,\n InstigatorStatus,\n SensorInstigatorData,\n )\n\n state = self.get_instigator_state(\n external_sensor.get_external_origin_id(), external_sensor.selector_id\n )\n\n if external_sensor.get_current_instigator_state(state).is_running:\n raise Exception(\n "You have attempted to start sensor {name}, but it is already running".format(\n name=external_sensor.name\n )\n )\n\n if not state:\n return self.add_instigator_state(\n InstigatorState(\n external_sensor.get_external_origin(),\n InstigatorType.SENSOR,\n InstigatorStatus.RUNNING,\n SensorInstigatorData(min_interval=external_sensor.min_interval_seconds),\n )\n )\n else:\n return self.update_instigator_state(state.with_status(InstigatorStatus.RUNNING))\n\n def stop_sensor(self, instigator_origin_id, external_sensor):\n from dagster.core.definitions.run_request import InstigatorType\n from dagster.core.scheduler.instigation import (\n InstigatorState,\n InstigatorStatus,\n SensorInstigatorData,\n )\n\n state = self.get_instigator_state(instigator_origin_id, external_sensor.selector_id)\n\n if not state:\n return self.add_instigator_state(\n InstigatorState(\n external_sensor.get_external_origin(),\n InstigatorType.SENSOR,\n InstigatorStatus.STOPPED,\n SensorInstigatorData(min_interval=external_sensor.min_interval_seconds),\n )\n )\n else:\n return self.update_instigator_state(state.with_status(InstigatorStatus.STOPPED))\n\n @traced\n def all_instigator_state(\n self, repository_origin_id=None, repository_selector_id=None, instigator_type=None\n ):\n return self._schedule_storage.all_instigator_state(\n repository_origin_id, repository_selector_id, instigator_type\n )\n\n @traced\n def 
get_instigator_state(self, origin_id, selector_id):\n return self._schedule_storage.get_instigator_state(origin_id, selector_id)\n\n def add_instigator_state(self, state):\n return self._schedule_storage.add_instigator_state(state)\n\n def update_instigator_state(self, state):\n return self._schedule_storage.update_instigator_state(state)\n\n def delete_instigator_state(self, origin_id, selector_id):\n return self._schedule_storage.delete_instigator_state(origin_id, selector_id)\n\n @property\n def supports_batch_tick_queries(self):\n return self._schedule_storage and self._schedule_storage.supports_batch_queries\n\n @traced\n def get_batch_ticks(\n self,\n selector_ids: Sequence[str],\n limit: Optional[int] = None,\n statuses: Optional[Sequence["TickStatus"]] = None,\n ) -> Mapping[str, Iterable["InstigatorTick"]]:\n if not self._schedule_storage:\n return {}\n return self._schedule_storage.get_batch_ticks(selector_ids, limit, statuses)\n\n @traced\n def get_tick(self, origin_id, selector_id, timestamp):\n matches = self._schedule_storage.get_ticks(\n origin_id, selector_id, before=timestamp + 1, after=timestamp - 1, limit=1\n )\n return matches[0] if len(matches) else None\n\n @traced\n def get_ticks(self, origin_id, selector_id, before=None, after=None, limit=None, statuses=None):\n return self._schedule_storage.get_ticks(\n origin_id, selector_id, before=before, after=after, limit=limit, statuses=statuses\n )\n\n def create_tick(self, tick_data):\n return self._schedule_storage.create_tick(tick_data)\n\n def update_tick(self, tick):\n return self._schedule_storage.update_tick(tick)\n\n def purge_ticks(self, origin_id, selector_id, tick_status, before):\n self._schedule_storage.purge_ticks(origin_id, selector_id, tick_status, before)\n\n def wipe_all_schedules(self):\n if self._scheduler:\n self._scheduler.wipe(self)\n\n self._schedule_storage.wipe()\n\n def logs_path_for_schedule(self, schedule_origin_id):\n return self._scheduler.get_logs_path(self, schedule_origin_id)\n\n def __enter__(self):\n return self\n\n def __exit__(self, exception_type, exception_value, traceback):\n self.dispose()\n if DagsterInstance._EXIT_STACK:\n DagsterInstance._EXIT_STACK.close()\n\n
[docs] def get_addresses_for_step_output_versions(self, step_output_versions):\n """\n For each given step output, finds whether an output exists with the given\n version, and returns its address if it does.\n\n Args:\n step_output_versions (Dict[(str, StepOutputHandle), str]):\n (pipeline name, step output handle) -> version.\n\n Returns:\n Dict[(str, StepOutputHandle), str]: (pipeline name, step output handle) -> address.\n For each step output, an address if there is one and None otherwise.\n """\n return self._event_storage.get_addresses_for_step_output_versions(step_output_versions)
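A hypothetical call sketch that only illustrates the dictionary shapes described in the docstring; ``step_output_handle`` stands in for a ``StepOutputHandle`` built by an execution plan and is not defined here:

# Hypothetical sketch of the input and output shapes.
versions = {("my_pipeline", step_output_handle): "v1"}
addresses = instance.get_addresses_for_step_output_versions(versions)
# addresses -> {("my_pipeline", step_output_handle): address-or-None}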
\n\n # dagster daemon\n
[docs] def add_daemon_heartbeat(self, daemon_heartbeat: "DaemonHeartbeat"):\n """Called on a regular interval by the daemon"""\n self._run_storage.add_daemon_heartbeat(daemon_heartbeat)
\n\n
[docs] def get_daemon_heartbeats(self) -> Dict[str, "DaemonHeartbeat"]:\n """Latest heartbeats of all daemon types"""\n return self._run_storage.get_daemon_heartbeats()
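A small, assumed inspection sketch for the heartbeat accessors above; the printed fields are illustrative only:

# Hypothetical sketch: read the latest heartbeat recorded by each daemon type.
heartbeats = instance.get_daemon_heartbeats()  # Dict[str, DaemonHeartbeat]
for daemon_type, heartbeat in heartbeats.items():
    print(daemon_type, heartbeat)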
\n\n def wipe_daemon_heartbeats(self):\n self._run_storage.wipe_daemon_heartbeats()\n\n def get_required_daemon_types(self):\n from dagster.core.run_coordinator import QueuedRunCoordinator\n from dagster.core.scheduler import DagsterDaemonScheduler\n from dagster.daemon.daemon import (\n BackfillDaemon,\n MonitoringDaemon,\n SchedulerDaemon,\n SensorDaemon,\n )\n from dagster.daemon.run_coordinator.queued_run_coordinator_daemon import (\n QueuedRunCoordinatorDaemon,\n )\n\n if self.is_ephemeral:\n return []\n\n daemons = [SensorDaemon.daemon_type(), BackfillDaemon.daemon_type()]\n if isinstance(self.scheduler, DagsterDaemonScheduler):\n daemons.append(SchedulerDaemon.daemon_type())\n if isinstance(self.run_coordinator, QueuedRunCoordinator):\n daemons.append(QueuedRunCoordinatorDaemon.daemon_type())\n if self.run_monitoring_enabled:\n daemons.append(MonitoringDaemon.daemon_type())\n return daemons\n\n # backfill\n def get_backfills(self, status=None, cursor=None, limit=None):\n return self._run_storage.get_backfills(status=status, cursor=cursor, limit=limit)\n\n def get_backfill(self, backfill_id):\n return self._run_storage.get_backfill(backfill_id)\n\n def add_backfill(self, partition_backfill):\n self._run_storage.add_backfill(partition_backfill)\n\n def update_backfill(self, partition_backfill):\n return self._run_storage.update_backfill(partition_backfill)\n\n @property\n def should_start_background_run_thread(self) -> bool:\n """\n Gate on an experimental feature to start a thread that monitors for if the run should be canceled.\n """\n return False
\n\n\ndef is_dagit_telemetry_enabled(instance):\n telemetry_settings = instance.get_settings("telemetry")\n if not telemetry_settings:\n return False\n\n if "experimental_dagit" in telemetry_settings:\n return telemetry_settings["experimental_dagit"]\n else:\n return False\n
", "current_page_name": "_modules/dagster/core/instance", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "ref": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.instance.ref

\nimport os\nfrom typing import Dict, NamedTuple, Optional\n\nimport yaml\n\nfrom dagster import check\nfrom dagster.serdes import ConfigurableClassData, class_from_code_pointer, whitelist_for_serdes\n\nfrom .config import DAGSTER_CONFIG_YAML_FILENAME, dagster_instance_config\n\n\ndef _runs_directory(base):\n    return os.path.join(base, "history", "")\n\n\ndef compute_logs_directory(base):\n    return os.path.join(base, "storage")\n\n\ndef _event_logs_directory(base):\n    return os.path.join(base, "history", "runs", "")\n\n\ndef _schedule_directory(base):\n    return os.path.join(base, "schedules")\n\n\ndef configurable_class_data(config_field):\n    return ConfigurableClassData(\n        check.str_elem(config_field, "module"),\n        check.str_elem(config_field, "class"),\n        yaml.dump(check.opt_dict_elem(config_field, "config"), default_flow_style=False),\n    )\n\n\ndef configurable_class_data_or_default(config_value, field_name, default):\n    return (\n        configurable_class_data(config_value[field_name])\n        if config_value.get(field_name)\n        else default\n    )\n\n\n
[docs]@whitelist_for_serdes\nclass InstanceRef(\n NamedTuple(\n "_InstanceRef",\n [\n ("local_artifact_storage_data", ConfigurableClassData),\n ("run_storage_data", ConfigurableClassData),\n ("event_storage_data", ConfigurableClassData),\n ("compute_logs_data", ConfigurableClassData),\n ("schedule_storage_data", Optional[ConfigurableClassData]),\n ("scheduler_data", Optional[ConfigurableClassData]),\n ("run_coordinator_data", Optional[ConfigurableClassData]),\n ("run_launcher_data", Optional[ConfigurableClassData]),\n ("settings", Dict[str, object]),\n ("custom_instance_class_data", Optional[ConfigurableClassData]),\n ],\n )\n):\n """Serializable representation of a :py:class:`DagsterInstance`.\n\n Users should not instantiate this class directly.\n """\n\n def __new__(\n cls,\n local_artifact_storage_data: ConfigurableClassData,\n run_storage_data: ConfigurableClassData,\n event_storage_data: ConfigurableClassData,\n compute_logs_data: ConfigurableClassData,\n schedule_storage_data: Optional[ConfigurableClassData],\n scheduler_data: Optional[ConfigurableClassData],\n run_coordinator_data: Optional[ConfigurableClassData],\n run_launcher_data: Optional[ConfigurableClassData],\n settings: Dict[str, object],\n custom_instance_class_data: Optional[ConfigurableClassData] = None,\n ):\n return super(cls, InstanceRef).__new__(\n cls,\n local_artifact_storage_data=check.inst_param(\n local_artifact_storage_data, "local_artifact_storage_data", ConfigurableClassData\n ),\n run_storage_data=check.inst_param(\n run_storage_data, "run_storage_data", ConfigurableClassData\n ),\n event_storage_data=check.inst_param(\n event_storage_data, "event_storage_data", ConfigurableClassData\n ),\n compute_logs_data=check.inst_param(\n compute_logs_data, "compute_logs_data", ConfigurableClassData\n ),\n schedule_storage_data=check.opt_inst_param(\n schedule_storage_data, "schedule_storage_data", ConfigurableClassData\n ),\n scheduler_data=check.opt_inst_param(\n scheduler_data, "scheduler_data", ConfigurableClassData\n ),\n run_coordinator_data=check.opt_inst_param(\n run_coordinator_data, "run_coordinator_data", ConfigurableClassData\n ),\n run_launcher_data=check.opt_inst_param(\n run_launcher_data, "run_launcher_data", ConfigurableClassData\n ),\n settings=check.opt_dict_param(settings, "settings", key_type=str),\n custom_instance_class_data=check.opt_inst_param(\n custom_instance_class_data,\n "instance_class",\n ConfigurableClassData,\n ),\n )\n\n @staticmethod\n def config_defaults(base_dir):\n return {\n "local_artifact_storage": ConfigurableClassData(\n "dagster.core.storage.root",\n "LocalArtifactStorage",\n yaml.dump({"base_dir": base_dir}, default_flow_style=False),\n ),\n "run_storage": ConfigurableClassData(\n "dagster.core.storage.runs",\n "SqliteRunStorage",\n yaml.dump({"base_dir": _runs_directory(base_dir)}, default_flow_style=False),\n ),\n "event_log_storage": ConfigurableClassData(\n "dagster.core.storage.event_log",\n "SqliteEventLogStorage",\n yaml.dump({"base_dir": _event_logs_directory(base_dir)}, default_flow_style=False),\n ),\n "compute_logs": ConfigurableClassData(\n "dagster.core.storage.local_compute_log_manager",\n "LocalComputeLogManager",\n yaml.dump({"base_dir": compute_logs_directory(base_dir)}, default_flow_style=False),\n ),\n "schedule_storage": ConfigurableClassData(\n "dagster.core.storage.schedules",\n "SqliteScheduleStorage",\n yaml.dump({"base_dir": _schedule_directory(base_dir)}, default_flow_style=False),\n ),\n "scheduler": ConfigurableClassData(\n 
"dagster.core.scheduler",\n "DagsterDaemonScheduler",\n yaml.dump({}),\n ),\n "run_coordinator": ConfigurableClassData(\n "dagster.core.run_coordinator", "DefaultRunCoordinator", yaml.dump({})\n ),\n "run_launcher": ConfigurableClassData(\n "dagster",\n "DefaultRunLauncher",\n yaml.dump({}),\n ),\n }\n\n @staticmethod\n def from_dir(base_dir, config_filename=DAGSTER_CONFIG_YAML_FILENAME, overrides=None):\n overrides = check.opt_dict_param(overrides, "overrides")\n config_value, custom_instance_class = dagster_instance_config(\n base_dir, config_filename=config_filename, overrides=overrides\n )\n\n if custom_instance_class:\n config_keys = set(custom_instance_class.config_schema().keys())\n custom_instance_class_config = {\n key: val for key, val in config_value.items() if key in config_keys\n }\n custom_instance_class_data = ConfigurableClassData(\n config_value["instance_class"]["module"],\n config_value["instance_class"]["class"],\n yaml.dump(custom_instance_class_config, default_flow_style=False),\n )\n defaults = custom_instance_class.config_defaults(base_dir)\n else:\n custom_instance_class_data = None\n defaults = InstanceRef.config_defaults(base_dir)\n\n local_artifact_storage_data = configurable_class_data_or_default(\n config_value, "local_artifact_storage", defaults["local_artifact_storage"]\n )\n\n run_storage_data = configurable_class_data_or_default(\n config_value, "run_storage", defaults["run_storage"]\n )\n\n event_storage_data = configurable_class_data_or_default(\n config_value, "event_log_storage", defaults["event_log_storage"]\n )\n\n compute_logs_data = configurable_class_data_or_default(\n config_value,\n "compute_logs",\n defaults["compute_logs"],\n )\n\n schedule_storage_data = configurable_class_data_or_default(\n config_value, "schedule_storage", defaults["schedule_storage"]\n )\n\n scheduler_data = configurable_class_data_or_default(\n config_value, "scheduler", defaults["scheduler"]\n )\n\n run_coordinator_data = configurable_class_data_or_default(\n config_value,\n "run_coordinator",\n defaults["run_coordinator"],\n )\n\n run_launcher_data = configurable_class_data_or_default(\n config_value,\n "run_launcher",\n defaults["run_launcher"],\n )\n\n settings_keys = {"telemetry", "python_logs", "run_monitoring"}\n settings = {key: config_value.get(key) for key in settings_keys if config_value.get(key)}\n\n return InstanceRef(\n local_artifact_storage_data=local_artifact_storage_data,\n run_storage_data=run_storage_data,\n event_storage_data=event_storage_data,\n compute_logs_data=compute_logs_data,\n schedule_storage_data=schedule_storage_data,\n scheduler_data=scheduler_data,\n run_coordinator_data=run_coordinator_data,\n run_launcher_data=run_launcher_data,\n settings=settings,\n custom_instance_class_data=custom_instance_class_data,\n )\n\n @staticmethod\n def from_dict(instance_ref_dict):\n def value_for_ref_item(k, v):\n if v is None:\n return None\n if k == "settings":\n return v\n return ConfigurableClassData(*v)\n\n return InstanceRef(**{k: value_for_ref_item(k, v) for k, v in instance_ref_dict.items()})\n\n @property\n def local_artifact_storage(self):\n return self.local_artifact_storage_data.rehydrate()\n\n @property\n def run_storage(self):\n return self.run_storage_data.rehydrate()\n\n @property\n def event_storage(self):\n return self.event_storage_data.rehydrate()\n\n @property\n def compute_log_manager(self):\n return self.compute_logs_data.rehydrate()\n\n @property\n def schedule_storage(self):\n return self.schedule_storage_data.rehydrate() if 
self.schedule_storage_data else None\n\n @property\n def scheduler(self):\n return self.scheduler_data.rehydrate() if self.scheduler_data else None\n\n @property\n def run_coordinator(self):\n return self.run_coordinator_data.rehydrate() if self.run_coordinator_data else None\n\n @property\n def run_launcher(self):\n return self.run_launcher_data.rehydrate() if self.run_launcher_data else None\n\n @property\n def custom_instance_class(self):\n return (\n class_from_code_pointer(\n self.custom_instance_class_data.module_name,\n self.custom_instance_class_data.class_name,\n )\n if self.custom_instance_class_data\n else None\n )\n\n @property\n def custom_instance_class_config(self):\n return (\n self.custom_instance_class_data.config_dict if self.custom_instance_class_data else {}\n )\n\n def to_dict(self):\n return self._asdict()
\n
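A brief sketch, assuming ``$DAGSTER_HOME`` points at a directory containing a ``dagster.yaml`` (or nothing at all, in which case the defaults above apply), showing how an ``InstanceRef`` is resolved and one of its components rehydrated:

# Hypothetical sketch: resolve an InstanceRef from a directory.
import os

from dagster.core.instance.ref import InstanceRef

ref = InstanceRef.from_dir(os.environ["DAGSTER_HOME"])  # falls back to config_defaults()
print(ref.run_storage_data.module_name, ref.run_storage_data.class_name)
run_storage = ref.run_storage  # rehydrates the configured run storage class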
", "current_page_name": "_modules/dagster/core/instance/ref", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}, {"link": "../", "title": "dagster.core.instance"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.instance.ref"}, "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.instance"}, "launcher": {"base": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.launcher.base

\nfrom abc import ABC, abstractmethod\nfrom enum import Enum\nfrom typing import NamedTuple, Optional\n\nfrom dagster.core.instance import MayHaveInstanceWeakref\nfrom dagster.core.origin import PipelinePythonOrigin\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.core.workspace.workspace import IWorkspace\nfrom dagster.serdes import whitelist_for_serdes\n\n\nclass LaunchRunContext(NamedTuple):\n    """\n    Context available within a run launcher's launch_run call.\n    """\n\n    pipeline_run: PipelineRun\n    workspace: Optional[IWorkspace]\n\n    @property\n    def pipeline_code_origin(self) -> Optional[PipelinePythonOrigin]:\n        return self.pipeline_run.pipeline_code_origin\n\n\nclass ResumeRunContext(NamedTuple):\n    """\n    Context available within a run launcher's resume_run call.\n    """\n\n    pipeline_run: PipelineRun\n    workspace: Optional[IWorkspace]\n    resume_attempt_number: Optional[int] = None\n\n    @property\n    def pipeline_code_origin(self) -> Optional[PipelinePythonOrigin]:\n        return self.pipeline_run.pipeline_code_origin\n\n\n@whitelist_for_serdes\nclass WorkerStatus(Enum):\n    RUNNING = "RUNNING"\n    NOT_FOUND = "NOT_FOUND"\n    FAILED = "FAILED"\n    SUCCESS = "SUCCESS"\n    UNKNOWN = "UNKNOWN"\n\n\nclass CheckRunHealthResult(NamedTuple):\n    """\n    Result of a check_run_worker_health call.\n    """\n\n    status: WorkerStatus\n    msg: Optional[str] = None\n\n    def __str__(self) -> str:\n        return f"{self.status.value}: '{self.msg}'"\n\n\n
[docs]class RunLauncher(ABC, MayHaveInstanceWeakref):\n @abstractmethod\n def launch_run(self, context: LaunchRunContext) -> None:\n """Launch a run.\n\n This method should begin the execution of the specified run, and may emit engine events.\n Runs should be created in the instance (e.g., by calling\n ``DagsterInstance.create_run()``) *before* this method is called, and\n should be in the ``PipelineRunStatus.STARTING`` state. Typically, this method will\n not be invoked directly, but should be invoked through ``DagsterInstance.launch_run()``.\n\n Args:\n context (LaunchRunContext): information about the launch - every run launcher\n will need the PipelineRun, and some run launchers may need information from the\n IWorkspace from which the run was launched.\n """\n\n @abstractmethod\n def can_terminate(self, run_id):\n """\n Can this run_id be terminated by this run launcher?\n """\n\n @abstractmethod\n def terminate(self, run_id):\n """\n Terminates a process.\n\n Returns False if the process was already terminated. Returns True if\n the process was alive and was successfully terminated.\n """\n\n def dispose(self):\n """\n Do any resource cleanup that should happen when the DagsterInstance is\n cleaning itself up.\n """\n\n def join(self, timeout=30):\n pass\n\n @property\n def supports_check_run_worker_health(self):\n """\n Whether the run launcher supports check_run_worker_health.\n """\n return False\n\n def check_run_worker_health(self, run: PipelineRun) -> CheckRunHealthResult:\n raise NotImplementedError(\n "This run launcher does not support run monitoring. Please disable it on your instance."\n )\n\n @property\n def supports_resume_run(self):\n """\n Whether the run launcher supports resume_run.\n """\n return False\n\n def resume_run(self, context: ResumeRunContext) -> None:\n raise NotImplementedError(\n "This run launcher does not support resuming runs. If using "\n "run monitoring, set max_resume_run_attempts to 0."\n )
\n
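A compressed, illustrative subclass (not from the source) showing the minimum surface a custom launcher has to provide; the in-memory bookkeeping here stands in for whatever process, container, or cluster mechanism a real launcher would use:

# Hypothetical sketch of the smallest custom RunLauncher.
from dagster.core.launcher.base import LaunchRunContext, RunLauncher


class NoopRunLauncher(RunLauncher):
    """Illustrative only: records run ids instead of starting run workers."""

    def __init__(self):
        self._launched = set()
        super().__init__()

    def launch_run(self, context: LaunchRunContext) -> None:
        # A real launcher would start a process, container, or k8s job here.
        self._launched.add(context.pipeline_run.run_id)

    def can_terminate(self, run_id):
        return run_id in self._launched

    def terminate(self, run_id):
        # True only if the run was "alive" and we terminated it, per the contract above.
        if run_id in self._launched:
            self._launched.discard(run_id)
            return True
        return False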
", "current_page_name": "_modules/dagster/core/launcher/base", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.launcher.base"}, "default_run_launcher": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.launcher.default_run_launcher

\nimport time\n\nfrom dagster import Bool, Field, check, seven\nfrom dagster.core.errors import (\n    DagsterInvariantViolationError,\n    DagsterLaunchFailedError,\n    DagsterUserCodeUnreachableError,\n)\nfrom dagster.core.host_representation.grpc_server_registry import ProcessGrpcServerRegistry\nfrom dagster.core.host_representation.repository_location import GrpcServerRepositoryLocation\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.core.storage.tags import GRPC_INFO_TAG\nfrom dagster.grpc.client import DagsterGrpcClient\nfrom dagster.grpc.types import (\n    CanCancelExecutionRequest,\n    CancelExecutionRequest,\n    ExecuteExternalPipelineArgs,\n    StartRunResult,\n)\nfrom dagster.serdes import ConfigurableClass, deserialize_as, deserialize_json_to_dagster_namedtuple\nfrom dagster.utils import merge_dicts\n\nfrom .base import LaunchRunContext, RunLauncher\n\n\n
[docs]class DefaultRunLauncher(RunLauncher, ConfigurableClass):\n """Launches runs against running GRPC servers."""\n\n def __init__(self, inst_data=None, wait_for_processes=False):\n self._inst_data = inst_data\n\n # Whether to wait for any processes that were used to launch runs to finish\n # before disposing of this launcher. Primarily useful for test cleanup where\n # we want to make sure that resources used by the test are cleaned up before\n # the test ends.\n self._wait_for_processes = check.bool_param(wait_for_processes, "wait_for_processes")\n\n self._run_ids = set()\n\n self._locations_to_wait_for = []\n\n super().__init__()\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {"wait_for_processes": Field(Bool, is_required=False)}\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return DefaultRunLauncher(\n inst_data=inst_data, wait_for_processes=config_value.get("wait_for_processes", False)\n )\n\n def launch_run(self, context: LaunchRunContext) -> None:\n run = context.pipeline_run\n\n check.inst_param(run, "run", PipelineRun)\n\n if not context.workspace:\n raise DagsterInvariantViolationError(\n "DefaultRunLauncher requires a workspace to be included in its LaunchRunContext"\n )\n\n external_pipeline_origin = check.not_none(run.external_pipeline_origin)\n repository_location = context.workspace.get_location(\n external_pipeline_origin.external_repository_origin.repository_location_origin.location_name\n )\n\n check.inst(\n repository_location,\n GrpcServerRepositoryLocation,\n "DefaultRunLauncher: Can't launch runs for pipeline not loaded from a GRPC server",\n )\n\n self._instance.add_run_tags(\n run.run_id,\n {\n GRPC_INFO_TAG: seven.json.dumps(\n merge_dicts(\n {"host": repository_location.host},\n (\n {"port": repository_location.port}\n if repository_location.port\n else {"socket": repository_location.socket}\n ),\n ({"use_ssl": True} if repository_location.use_ssl else {}),\n )\n )\n },\n )\n\n res = deserialize_as(\n repository_location.client.start_run(\n ExecuteExternalPipelineArgs(\n pipeline_origin=external_pipeline_origin,\n pipeline_run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n )\n ),\n StartRunResult,\n )\n if not res.success:\n raise (\n DagsterLaunchFailedError(\n res.message, serializable_error_info=res.serializable_error_info\n )\n )\n\n self._run_ids.add(run.run_id)\n\n if self._wait_for_processes:\n self._locations_to_wait_for.append(repository_location)\n\n def _get_grpc_client_for_termination(self, run_id):\n if not self._instance:\n return None\n\n run = self._instance.get_run_by_id(run_id)\n if not run or run.is_finished:\n return None\n\n tags = run.tags\n\n if GRPC_INFO_TAG not in tags:\n return None\n\n grpc_info = seven.json.loads(tags.get(GRPC_INFO_TAG))\n\n return DagsterGrpcClient(\n port=grpc_info.get("port"),\n socket=grpc_info.get("socket"),\n host=grpc_info.get("host"),\n use_ssl=bool(grpc_info.get("use_ssl", False)),\n )\n\n def can_terminate(self, run_id):\n check.str_param(run_id, "run_id")\n\n client = self._get_grpc_client_for_termination(run_id)\n if not client:\n return False\n\n try:\n res = deserialize_json_to_dagster_namedtuple(\n client.can_cancel_execution(CanCancelExecutionRequest(run_id=run_id), timeout=5)\n )\n except DagsterUserCodeUnreachableError:\n # Server that created the run may no longer exist\n return False\n\n return res.can_cancel\n\n def terminate(self, run_id):\n check.str_param(run_id, "run_id")\n if not 
self._instance:\n return False\n\n run = self._instance.get_run_by_id(run_id)\n if not run:\n return False\n\n client = self._get_grpc_client_for_termination(run_id)\n\n if not client:\n self._instance.report_engine_event(\n message="Unable to get grpc client to send termination request to.",\n pipeline_run=run,\n cls=self.__class__,\n )\n return False\n\n self._instance.report_run_canceling(run)\n res = deserialize_json_to_dagster_namedtuple(\n client.cancel_execution(CancelExecutionRequest(run_id=run_id))\n )\n return res.success\n\n def join(self, timeout=30):\n # If this hasn't been initialized at all, we can just do a noop\n if not self._instance:\n return\n\n total_time = 0\n interval = 0.01\n\n while True:\n active_run_ids = [\n run_id\n for run_id in self._run_ids\n if (\n self._instance.get_run_by_id(run_id)\n and not self._instance.get_run_by_id(run_id).is_finished\n )\n ]\n\n if len(active_run_ids) == 0:\n return\n\n if total_time >= timeout:\n raise Exception(\n "Timed out waiting for these runs to finish: {active_run_ids}".format(\n active_run_ids=repr(active_run_ids)\n )\n )\n\n total_time += interval\n time.sleep(interval)\n interval = interval * 2\n\n def dispose(self):\n if not self._wait_for_processes:\n return\n\n for location in self._locations_to_wait_for:\n if isinstance(location, GrpcServerRepositoryLocation) and isinstance(\n location.grpc_server_registry, ProcessGrpcServerRegistry\n ):\n location.grpc_server_registry.wait_for_processes()
\n
", "current_page_name": "_modules/dagster/core/launcher/default_run_launcher", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.launcher.default_run_launcher"}}, "log_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.log_manager

\nimport datetime\nimport logging\nfrom typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Union\n\nfrom dagster import check\nfrom dagster.core.utils import coerce_valid_log_level, make_new_run_id\nfrom dagster.utils.log import get_dagster_logger\n\nif TYPE_CHECKING:\n    from dagster import DagsterInstance, PipelineRun\n    from dagster.core.events import DagsterEvent\n\nDAGSTER_META_KEY = "dagster_meta"\n\n\nclass DagsterMessageProps(\n    NamedTuple(\n        "_DagsterMessageProps",\n        [\n            ("orig_message", Optional[str]),\n            ("log_message_id", Optional[str]),\n            ("log_timestamp", Optional[str]),\n            ("dagster_event", Optional[Any]),\n        ],\n    )\n):\n    """Internal class used to represent specific attributes about a logged message"""\n\n    def __new__(\n        cls,\n        orig_message: str,\n        log_message_id: Optional[str] = None,\n        log_timestamp: Optional[str] = None,\n        dagster_event: Optional["DagsterEvent"] = None,\n    ):\n        return super().__new__(\n            cls,\n            orig_message=check.str_param(orig_message, "orig_message"),\n            log_message_id=check.opt_str_param(\n                log_message_id, "log_message_id", default=make_new_run_id()\n            ),\n            log_timestamp=check.opt_str_param(\n                log_timestamp, "log_timestamp", default=datetime.datetime.utcnow().isoformat()\n            ),\n            dagster_event=dagster_event,\n        )\n\n    @property\n    def error_str(self) -> Optional[str]:\n        if self.dagster_event is None:\n            return None\n\n        event_specific_data = self.dagster_event.event_specific_data\n        if not event_specific_data:\n            return None\n\n        error = getattr(event_specific_data, "error", None)\n        if error:\n            return "\\n\\n" + getattr(event_specific_data, "error_display_string", error.to_string())\n        return None\n\n    @property\n    def pid(self) -> Optional[str]:\n        if self.dagster_event is None or self.dagster_event.pid is None:\n            return None\n        return str(self.dagster_event.pid)\n\n    @property\n    def step_key(self) -> Optional[str]:\n        if self.dagster_event is None:\n            return None\n        return self.dagster_event.step_key\n\n    @property\n    def event_type_value(self) -> Optional[str]:\n        if self.dagster_event is None:\n            return None\n        return self.dagster_event.event_type_value\n\n\nclass DagsterLoggingMetadata(\n    NamedTuple(\n        "_DagsterLoggingMetadata",\n        [\n            ("run_id", Optional[str]),\n            ("pipeline_name", Optional[str]),\n            ("pipeline_tags", Dict[str, str]),\n            ("step_key", Optional[str]),\n            ("solid_name", Optional[str]),\n            ("resource_name", Optional[str]),\n            ("resource_fn_name", Optional[str]),\n        ],\n    )\n):\n    """Internal class used to represent the context in which a given message was logged (i.e. 
the\n    step, pipeline run, resource, etc.)\n    """\n\n    def __new__(\n        cls,\n        run_id: Optional[str] = None,\n        pipeline_name: Optional[str] = None,\n        pipeline_tags: Optional[Dict[str, str]] = None,\n        step_key: Optional[str] = None,\n        solid_name: Optional[str] = None,\n        resource_name: Optional[str] = None,\n        resource_fn_name: Optional[str] = None,\n    ):\n        return super().__new__(\n            cls,\n            run_id=run_id,\n            pipeline_name=pipeline_name,\n            pipeline_tags=pipeline_tags or {},\n            step_key=step_key,\n            solid_name=solid_name,\n            resource_name=resource_name,\n            resource_fn_name=resource_fn_name,\n        )\n\n    @property\n    def log_source(self):\n        if self.resource_name is None:\n            return self.pipeline_name or "system"\n        return f"resource:{self.resource_name}"\n\n    def to_tags(self) -> Dict[str, str]:\n        # converts all values into strings\n        return {k: str(v) for k, v in self._asdict().items()}\n\n\ndef construct_log_string(\n    logging_metadata: DagsterLoggingMetadata, message_props: DagsterMessageProps\n) -> str:\n\n    from dagster.core.events import EVENT_TYPE_VALUE_TO_DISPLAY_STRING\n\n    event_type_str = (\n        EVENT_TYPE_VALUE_TO_DISPLAY_STRING[message_props.event_type_value]\n        if message_props.event_type_value in EVENT_TYPE_VALUE_TO_DISPLAY_STRING\n        else message_props.event_type_value\n    )\n    return " - ".join(\n        filter(\n            None,\n            (\n                logging_metadata.log_source,\n                logging_metadata.run_id,\n                message_props.pid,\n                logging_metadata.step_key,\n                event_type_str,\n                message_props.orig_message,\n            ),\n        )\n    ) + (message_props.error_str or "")\n\n\ndef get_dagster_meta_dict(\n    logging_metadata: DagsterLoggingMetadata, dagster_message_props: DagsterMessageProps\n) -> Dict[str, Any]:\n    # combine all dagster meta information into a single dictionary\n    meta_dict = {\n        **logging_metadata._asdict(),\n        **dagster_message_props._asdict(),\n    }\n    # step-level events can be logged from a pipeline context. for these cases, pull the step\n    # key from the underlying DagsterEvent\n    if meta_dict["step_key"] is None:\n        meta_dict["step_key"] = dagster_message_props.step_key\n\n    return meta_dict\n\n\nclass DagsterLogHandler(logging.Handler):\n    """Internal class used to turn regular logs into Dagster logs by adding Dagster-specific\n    metadata (such as pipeline_name or step_key), as well as reformatting the underlying message.\n\n    Note: The `loggers` argument will be populated with the set of @loggers supplied to the current\n    pipeline run. 
These essentially work as handlers (they do not create their own log messages,\n    they simply re-log messages that are created from context.log.x() calls), which is why they are\n    referenced from within this handler class.\n    """\n\n    def __init__(\n        self,\n        logging_metadata: DagsterLoggingMetadata,\n        loggers: List[logging.Logger],\n        handlers: List[logging.Handler],\n    ):\n        self._logging_metadata = logging_metadata\n        self._loggers = loggers\n        self._handlers = handlers\n        self._should_capture = True\n        super().__init__()\n\n    @property\n    def logging_metadata(self):\n        return self._logging_metadata\n\n    def with_tags(self, **new_tags):\n        return DagsterLogHandler(\n            logging_metadata=self.logging_metadata._replace(**new_tags),\n            loggers=self._loggers,\n            handlers=self._handlers,\n        )\n\n    def _extract_extra(self, record: logging.LogRecord) -> Dict[str, Any]:\n        """In the logging.Logger log() implementation, the elements of the `extra` dictionary\n        argument are smashed into the __dict__ of the underlying logging.LogRecord.\n        This function figures out what the original `extra` values of the log call were by\n        comparing the set of attributes in the received record to those of a default record.\n        """\n        ref_attrs = list(logging.makeLogRecord({}).__dict__.keys()) + ["message", "asctime"]\n        return {k: v for k, v in record.__dict__.items() if k not in ref_attrs}\n\n    def _convert_record(self, record: logging.LogRecord) -> logging.LogRecord:\n        # we store the originating DagsterEvent in the DAGSTER_META_KEY field, if applicable\n        dagster_meta = getattr(record, DAGSTER_META_KEY, None)\n\n        # generate some properties for this specific record\n        dagster_message_props = DagsterMessageProps(\n            orig_message=record.getMessage(), dagster_event=dagster_meta\n        )\n\n        # set the dagster meta info for the record\n        setattr(\n            record,\n            DAGSTER_META_KEY,\n            get_dagster_meta_dict(self._logging_metadata, dagster_message_props),\n        )\n\n        # update the message to be formatted like other dagster logs\n        record.msg = construct_log_string(self._logging_metadata, dagster_message_props)\n        record.args = ()\n\n        return record\n\n    def filter(self, record: logging.LogRecord) -> bool:\n        """If you list multiple levels of a python logging hierarchy as managed loggers, and do not\n        set the propagate attribute to False, this will result in that record getting logged\n        multiple times, as the DagsterLogHandler will be invoked at each level of the hierarchy as\n        the message is propagated. 
This filter prevents this from happening.\n        """\n        return self._should_capture and not isinstance(\n            getattr(record, DAGSTER_META_KEY, None), dict\n        )\n\n    def emit(self, record: logging.LogRecord):\n        """For any received record, add Dagster metadata, and have handlers handle it"""\n\n        try:\n            # to prevent the potential for infinite loops in which a handler produces log messages\n            # which are then captured and then handled by that same handler (etc.), do not capture\n            # any log messages while one is currently being emitted\n            self._should_capture = False\n            dagster_record = self._convert_record(record)\n            # built-in handlers\n            for handler in self._handlers:\n                if dagster_record.levelno >= handler.level:\n                    handler.handle(dagster_record)\n            # user-defined @loggers\n            for logger in self._loggers:\n                logger.log(\n                    dagster_record.levelno,\n                    dagster_record.msg,\n                    exc_info=dagster_record.exc_info,\n                    extra=self._extract_extra(record),\n                )\n        finally:\n            self._should_capture = True\n\n\n
[docs]class DagsterLogManager(logging.Logger):\n """Centralized dispatch for logging from user code.\n\n Handles the construction of uniform structured log messages and passes them through to the\n underlying loggers/handlers.\n\n An instance of the log manager is made available to ops as ``context.log``. Users should not\n initialize instances of the log manager directly. To configure custom loggers, set the\n ``logger_defs`` argument in an `@job` decorator or when calling the `to_job()` method on a\n :py:class:`GraphDefinition`.\n\n The log manager inherits standard convenience methods like those exposed by the Python standard\n library :py:mod:`python:logging` module (i.e., within the body of an op,\n ``context.log.{debug, info, warning, warn, error, critical, fatal}``).\n\n The underlying integer API can also be called directly using, e.g.\n ``context.log.log(5, msg)``, and the log manager will delegate to the ``log`` method\n defined on each of the loggers it manages.\n\n User-defined custom log levels are not supported, and calls to, e.g.,\n ``context.log.trace`` or ``context.log.notice`` will result in hard exceptions **at runtime**.\n """\n\n def __init__(\n self,\n dagster_handler: DagsterLogHandler,\n level: int = logging.NOTSET,\n managed_loggers: Optional[List[logging.Logger]] = None,\n ):\n super().__init__(name="dagster", level=coerce_valid_log_level(level))\n self._managed_loggers = check.opt_list_param(\n managed_loggers, "managed_loggers", of_type=logging.Logger\n )\n self._dagster_handler = dagster_handler\n self.addHandler(dagster_handler)\n\n @classmethod\n def create(\n cls,\n loggers: List[logging.Logger],\n handlers: Optional[List[logging.Handler]] = None,\n instance: Optional["DagsterInstance"] = None,\n pipeline_run: Optional["PipelineRun"] = None,\n ) -> "DagsterLogManager":\n """Create a DagsterLogManager with a set of subservient loggers."""\n\n handlers = check.opt_list_param(handlers, "handlers", of_type=logging.Handler)\n\n managed_loggers = [get_dagster_logger()]\n python_log_level = logging.NOTSET\n\n if instance:\n handlers += instance.get_handlers()\n managed_loggers += [\n logging.getLogger(lname) if lname != "root" else logging.getLogger()\n for lname in instance.managed_python_loggers\n ]\n if instance.python_log_level is not None:\n python_log_level = coerce_valid_log_level(instance.python_log_level)\n\n # set all loggers to the declared logging level\n for logger in managed_loggers:\n logger.setLevel(python_log_level)\n\n if pipeline_run:\n logging_metadata = DagsterLoggingMetadata(\n run_id=pipeline_run.run_id,\n pipeline_name=pipeline_run.pipeline_name,\n pipeline_tags=pipeline_run.tags,\n )\n else:\n logging_metadata = DagsterLoggingMetadata()\n\n return cls(\n dagster_handler=DagsterLogHandler(\n logging_metadata=logging_metadata,\n loggers=loggers,\n handlers=handlers,\n ),\n level=python_log_level,\n managed_loggers=managed_loggers,\n )\n\n @property\n def logging_metadata(self) -> DagsterLoggingMetadata:\n return self._dagster_handler.logging_metadata\n\n def begin_python_log_capture(self):\n for logger in self._managed_loggers:\n logger.addHandler(self._dagster_handler)\n\n def end_python_log_capture(self):\n for logger in self._managed_loggers:\n logger.removeHandler(self._dagster_handler)\n\n def log_dagster_event(self, level: Union[str, int], msg: str, dagster_event: "DagsterEvent"):\n """Log a DagsterEvent at the given level. 
Attributes about the context it was logged in\n (such as the solid name or pipeline name) will be automatically attached to the created record.\n\n Args:\n level (str, int): either a string representing the desired log level ("INFO", "WARN"),\n or an integer level such as logging.INFO or logging.DEBUG.\n msg (str): message describing the event\n dagster_event (DagsterEvent): DagsterEvent that will be logged\n """\n self.log(level=level, msg=msg, extra={DAGSTER_META_KEY: dagster_event})\n\n def log(self, level, msg, *args, **kwargs):\n """Log a message at the given level. Attributes about the context it was logged in (such as\n the solid name or pipeline name) will be automatically attached to the created record.\n\n Args:\n level (str, int): either a string representing the desired log level ("INFO", "WARN"),\n or an integer level such as logging.INFO or logging.DEBUG.\n msg (str): the message to be logged\n *args: the logged message will be msg % args\n """\n level = coerce_valid_log_level(level)\n # log DagsterEvents regardless of level\n if self.isEnabledFor(level) or ("extra" in kwargs and DAGSTER_META_KEY in kwargs["extra"]):\n self._log(level, msg, args, **kwargs)\n\n def with_tags(self, **new_tags):\n """Add new tags in "new_tags" to the set of tags attached to this log manager instance, and\n return a new DagsterLogManager with the merged set of tags.\n\n Args:\n new_tags (Dict[str,str]): Dictionary of tags\n\n Returns:\n DagsterLogManager: a new DagsterLogManager namedtuple with updated tags for the same\n run ID and loggers.\n """\n return DagsterLogManager(\n dagster_handler=self._dagster_handler.with_tags(**new_tags),\n managed_loggers=self._managed_loggers,\n level=self.level,\n )
\n
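A short usage sketch, assuming the ``@op``/``@job`` APIs from the same release; it exercises only the convenience and integer-level methods described above and is not part of the module:

# Hypothetical sketch: using the log manager exposed to ops as context.log.
import logging

from dagster import job, op


@op
def say_hello(context):
    context.log.info("hello from an op")                    # convenience method
    context.log.log(logging.DEBUG, "low-level %s", "call")  # integer-level API


@job
def hello_job():
    say_hello()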
", "current_page_name": "_modules/dagster/core/log_manager", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.log_manager"}, "run_coordinator": {"default_run_coordinator": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.run_coordinator.default_run_coordinator

\nfrom dagster import check\nfrom dagster.core.storage.pipeline_run import PipelineRun, PipelineRunStatus\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\n\nfrom .base import RunCoordinator, SubmitRunContext\n\n\n
[docs]class DefaultRunCoordinator(RunCoordinator, ConfigurableClass):\n """Immediately send runs to the run launcher."""\n\n def __init__(self, inst_data=None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n super().__init__()\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {}\n\n @classmethod\n def from_config_value(cls, inst_data, config_value):\n return cls(inst_data=inst_data, **config_value)\n\n def submit_run(self, context: SubmitRunContext) -> PipelineRun:\n pipeline_run = context.pipeline_run\n check.invariant(pipeline_run.status == PipelineRunStatus.NOT_STARTED)\n\n self._instance.launch_run(pipeline_run.run_id, context.workspace)\n run = self._instance.get_run_by_id(pipeline_run.run_id)\n if run is None:\n check.failed(f"Failed to reload run {pipeline_run.run_id}")\n return run\n\n def can_cancel_run(self, run_id):\n return self._instance.run_launcher.can_terminate(run_id)\n\n def cancel_run(self, run_id):\n return self._instance.run_launcher.terminate(run_id)
\n
", "current_page_name": "_modules/dagster/core/run_coordinator/default_run_coordinator", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.run_coordinator.default_run_coordinator"}, "queued_run_coordinator": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.run_coordinator.queued_run_coordinator

\nimport logging\nimport time\nfrom typing import Any, Dict, List, NamedTuple, Optional\n\nfrom dagster import DagsterEvent, DagsterEventType, IntSource, String, check\nfrom dagster.builtins import Bool\nfrom dagster.config import Field\nfrom dagster.config.config_type import Array, Noneable, ScalarUnion\nfrom dagster.config.field_utils import Shape\nfrom dagster.core.events.log import EventLogEntry\nfrom dagster.core.storage.pipeline_run import PipelineRun, PipelineRunStatus\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\n\nfrom .base import RunCoordinator, SubmitRunContext\n\n\nclass RunQueueConfig(\n    NamedTuple(\n        "_RunQueueConfig",\n        [("max_concurrent_runs", int), ("tag_concurrency_limits", Optional[List[Dict[str, Any]]])],\n    )\n):\n    pass\n\n\n
[docs]class QueuedRunCoordinator(RunCoordinator, ConfigurableClass):\n """\n Enqueues runs via the run storage, to be dequeued by the Dagster Daemon process. Requires\n the Dagster Daemon process to be alive in order for runs to be launched.\n """\n\n def __init__(\n self,\n max_concurrent_runs=None,\n tag_concurrency_limits=None,\n dequeue_interval_seconds=None,\n inst_data=None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self._max_concurrent_runs = check.opt_int_param(\n max_concurrent_runs, "max_concurrent_runs", 10\n )\n self._tag_concurrency_limits = check.opt_list_param(\n tag_concurrency_limits,\n "tag_concurrency_limits",\n )\n self._dequeue_interval_seconds = check.opt_int_param(\n dequeue_interval_seconds, "dequeue_interval_seconds", 5\n )\n\n super().__init__()\n\n @property\n def inst_data(self):\n return self._inst_data\n\n def get_run_queue_config(self):\n return RunQueueConfig(\n max_concurrent_runs=self._max_concurrent_runs,\n tag_concurrency_limits=self._tag_concurrency_limits,\n )\n\n @property\n def dequeue_interval_seconds(self):\n return self._dequeue_interval_seconds\n\n @classmethod\n def config_type(cls):\n return {\n "max_concurrent_runs": Field(\n config=IntSource,\n is_required=False,\n description="The maximum number of runs that are allowed to be in progress at once",\n ),\n "tag_concurrency_limits": Field(\n config=Noneable(\n Array(\n Shape(\n {\n "key": String,\n "value": Field(\n ScalarUnion(\n scalar_type=String,\n non_scalar_schema=Shape({"applyLimitPerUniqueValue": Bool}),\n ),\n is_required=False,\n ),\n "limit": Field(int),\n }\n )\n )\n ),\n is_required=False,\n description="A set of limits that are applied to runs with particular tags. "\n "If a value is set, the limit is applied to only that key-value pair. "\n "If no value is set, the limit is applied across all values of that key. 
"\n "If the value is set to a dict with `applyLimitPerUniqueValue: true`, the limit "\n "will apply to the number of unique values for that key.",\n ),\n "dequeue_interval_seconds": Field(\n config=IntSource,\n is_required=False,\n description="The interval in seconds at which the Dagster Daemon "\n "should periodically check the run queue for new runs to launch.",\n ),\n }\n\n @classmethod\n def from_config_value(cls, inst_data, config_value):\n return cls(\n inst_data=inst_data,\n max_concurrent_runs=config_value.get("max_concurrent_runs"),\n tag_concurrency_limits=config_value.get("tag_concurrency_limits"),\n dequeue_interval_seconds=config_value.get("dequeue_interval_seconds"),\n )\n\n def submit_run(self, context: SubmitRunContext) -> PipelineRun:\n pipeline_run = context.pipeline_run\n check.invariant(pipeline_run.status == PipelineRunStatus.NOT_STARTED)\n\n enqueued_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_ENQUEUED.value,\n pipeline_name=pipeline_run.pipeline_name,\n )\n event_record = EventLogEntry(\n user_message="",\n level=logging.INFO,\n pipeline_name=pipeline_run.pipeline_name,\n run_id=pipeline_run.run_id,\n error_info=None,\n timestamp=time.time(),\n dagster_event=enqueued_event,\n )\n self._instance.handle_new_event(event_record)\n\n run = self._instance.get_run_by_id(pipeline_run.run_id)\n if run is None:\n check.failed(f"Failed to reload run {pipeline_run.run_id}")\n return run\n\n def can_cancel_run(self, run_id):\n run = self._instance.get_run_by_id(run_id)\n if not run:\n return False\n if run.status == PipelineRunStatus.QUEUED:\n return True\n else:\n return self._instance.run_launcher.can_terminate(run_id)\n\n def cancel_run(self, run_id):\n run = self._instance.get_run_by_id(run_id)\n if not run:\n return False\n # NOTE: possible race condition if the dequeuer acts on this run at the same time\n # https://github.com/dagster-io/dagster/issues/3323\n if run.status == PipelineRunStatus.QUEUED:\n self._instance.report_run_canceling(\n run,\n message="Canceling run from the queue.",\n )\n self._instance.report_run_canceled(run)\n return True\n else:\n return self._instance.run_launcher.terminate(run_id)
\n
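A hedged construction sketch showing the shape of ``tag_concurrency_limits`` accepted by the config schema above; in practice these values normally come from ``dagster.yaml`` rather than direct construction:

# Hypothetical sketch: tag-based limits mirroring config_type() above.
coordinator = QueuedRunCoordinator(
    max_concurrent_runs=10,
    tag_concurrency_limits=[
        {"key": "team", "value": "data-eng", "limit": 2},  # limit this exact key/value pair
        {"key": "gpu", "limit": 1},                        # limit across all values of "gpu"
    ],
    dequeue_interval_seconds=5,
)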
", "current_page_name": "_modules/dagster/core/run_coordinator/queued_run_coordinator", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.run_coordinator.queued_run_coordinator"}}, "scheduler": {"scheduler": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.scheduler.scheduler

\nimport abc\nimport os\nfrom typing import List, NamedTuple\n\nfrom dagster import check\nfrom dagster.config import Field\nfrom dagster.config.source import IntSource\nfrom dagster.core.definitions.run_request import InstigatorType\nfrom dagster.core.errors import DagsterError\nfrom dagster.core.host_representation import ExternalSchedule\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.scheduler.instigation import (\n    InstigatorState,\n    InstigatorStatus,\n    ScheduleInstigatorData,\n)\nfrom dagster.serdes import ConfigurableClass\nfrom dagster.seven import get_current_datetime_in_utc\nfrom dagster.utils import mkdir_p\n\n\nclass DagsterSchedulerError(DagsterError):\n    """Base class for all Dagster Scheduler errors"""\n\n\nclass DagsterScheduleDoesNotExist(DagsterSchedulerError):\n    """Errors raised when fetching a schedule."""\n\n\nclass SchedulerDebugInfo(\n    NamedTuple(\n        "SchedulerDebugInfo",\n        [\n            ("errors", List[str]),\n            ("scheduler_config_info", str),\n            ("scheduler_info", str),\n            ("schedule_storage", List[str]),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        errors: List[str],\n        scheduler_config_info: str,\n        scheduler_info: str,\n        schedule_storage: List[str],\n    ):\n        return super(SchedulerDebugInfo, cls).__new__(\n            cls,\n            errors=check.list_param(errors, "errors", of_type=str),\n            scheduler_config_info=check.str_param(scheduler_config_info, "scheduler_config_info"),\n            scheduler_info=check.str_param(scheduler_info, "scheduler_info"),\n            schedule_storage=check.list_param(schedule_storage, "schedule_storage", of_type=str),\n        )\n\n\n
[docs]class Scheduler(abc.ABC):\n """Abstract base class for a scheduler. This component is responsible for interfacing with\n an external system such as cron to ensure repeated execution according to a schedule.\n """\n\n def start_schedule(self, instance, external_schedule):\n """\n Updates the status of the given schedule to `InstigatorStatus.RUNNING` in schedule storage.\n\n This should not be overridden by subclasses.\n\n Args:\n instance (DagsterInstance): The current instance.\n external_schedule (ExternalSchedule): The schedule to start.\n\n """\n\n check.inst_param(instance, "instance", DagsterInstance)\n check.inst_param(external_schedule, "external_schedule", ExternalSchedule)\n\n schedule_state = instance.get_instigator_state(\n external_schedule.get_external_origin_id(), external_schedule.selector_id\n )\n if external_schedule.get_current_instigator_state(schedule_state).is_running:\n raise DagsterSchedulerError(\n "You have attempted to start schedule {name}, but it is already running".format(\n name=external_schedule.name\n )\n )\n\n new_instigator_data = ScheduleInstigatorData(\n external_schedule.cron_schedule,\n get_current_datetime_in_utc().timestamp(),\n )\n\n if not schedule_state:\n started_schedule = InstigatorState(\n external_schedule.get_external_origin(),\n InstigatorType.SCHEDULE,\n InstigatorStatus.RUNNING,\n new_instigator_data,\n )\n instance.add_instigator_state(started_schedule)\n else:\n started_schedule = schedule_state.with_status(InstigatorStatus.RUNNING).with_data(\n new_instigator_data\n )\n instance.update_instigator_state(started_schedule)\n return started_schedule\n\n def stop_schedule(self, instance, schedule_origin_id, external_schedule):\n """\n Updates the status of the given schedule to `InstigatorStatus.STOPPED` in schedule storage.\n\n This should not be overridden by subclasses.\n\n Args:\n schedule_origin_id (string): The id of the schedule target to stop running.\n """\n\n check.str_param(schedule_origin_id, "schedule_origin_id")\n check.opt_inst_param(external_schedule, "external_schedule", ExternalSchedule)\n\n schedule_state = instance.get_instigator_state(\n schedule_origin_id, external_schedule.selector_id\n )\n if (\n external_schedule\n and not external_schedule.get_current_instigator_state(schedule_state).is_running\n ) or (schedule_state and not schedule_state.is_running):\n raise DagsterSchedulerError(\n "You have attempted to stop schedule {name}, but it is already stopped".format(\n name=external_schedule.name\n )\n )\n\n if not schedule_state:\n stopped_schedule = InstigatorState(\n external_schedule.get_external_origin(),\n InstigatorType.SCHEDULE,\n InstigatorStatus.STOPPED,\n ScheduleInstigatorData(\n external_schedule.cron_schedule,\n ),\n )\n instance.add_instigator_state(stopped_schedule)\n else:\n stopped_schedule = schedule_state.with_status(InstigatorStatus.STOPPED).with_data(\n ScheduleInstigatorData(\n cron_schedule=schedule_state.instigator_data.cron_schedule,\n )\n )\n instance.update_instigator_state(stopped_schedule)\n\n return stopped_schedule\n\n @abc.abstractmethod\n def debug_info(self):\n """Returns debug information about the scheduler"""\n\n @abc.abstractmethod\n def get_logs_path(self, instance, schedule_origin_id):\n """Get path to store logs for schedule\n\n Args:\n schedule_origin_id (string): The id of the schedule target to retrieve the log path for\n """
\n\n\nDEFAULT_MAX_CATCHUP_RUNS = 5\n\n\n
[docs]class DagsterDaemonScheduler(Scheduler, ConfigurableClass):\n """Default scheduler implementation that submits runs from the `dagster-daemon`\n long-lived process. Periodically checks each running schedule for execution times that don't\n have runs yet and launches them.\n """\n\n def __init__(\n self, max_catchup_runs=DEFAULT_MAX_CATCHUP_RUNS, max_tick_retries=0, inst_data=None\n ):\n self.max_catchup_runs = check.opt_int_param(\n max_catchup_runs, "max_catchup_runs", DEFAULT_MAX_CATCHUP_RUNS\n )\n self.max_tick_retries = check.opt_int_param(max_tick_retries, "max_tick_retries", 0)\n self._inst_data = inst_data\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {\n "max_catchup_runs": Field(\n IntSource,\n is_required=False,\n default_value=DEFAULT_MAX_CATCHUP_RUNS,\n description="""For partitioned schedules, controls the maximum number of past\n partitions for each schedule that will be considered when looking for missing\n runs . Generally this parameter will only come into play if the scheduler\n falls behind or launches after experiencing downtime. This parameter will not be checked for\n schedules without partition sets (for example, schedules created using the @schedule\n decorator) - only the most recent execution time will be considered for those schedules.\n\n Note that no matter what this value is, the scheduler will never launch a run from a time\n before the schedule was turned on (even if the start_date on the schedule is earlier) - if\n you want to launch runs for earlier partitions, launch a backfill.\n """,\n ),\n "max_tick_retries": Field(\n IntSource,\n default_value=0,\n is_required=False,\n description="For each schedule tick that raises an error, how many times to retry that tick",\n ),\n }\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return DagsterDaemonScheduler(inst_data=inst_data, **config_value)\n\n def debug_info(self):\n return ""\n\n def wipe(self, instance):\n pass\n\n def _get_or_create_logs_directory(self, instance, schedule_origin_id):\n check.inst_param(instance, "instance", DagsterInstance)\n check.str_param(schedule_origin_id, "schedule_origin_id")\n\n logs_directory = os.path.join(instance.schedules_directory(), "logs", schedule_origin_id)\n if not os.path.isdir(logs_directory):\n mkdir_p(logs_directory)\n\n return logs_directory\n\n def get_logs_path(self, instance, schedule_origin_id):\n check.inst_param(instance, "instance", DagsterInstance)\n check.str_param(schedule_origin_id, "schedule_origin_id")\n\n logs_directory = self._get_or_create_logs_directory(instance, schedule_origin_id)\n return os.path.join(logs_directory, "scheduler.log")
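A minimal sketch of how the config schema above maps onto construction via `from_config_value`; the values shown are illustrative, not recommendations.

scheduler = DagsterDaemonScheduler.from_config_value(
    inst_data=None,
    config_value={
        "max_catchup_runs": 3,  # consider at most 3 missed partitions per schedule
        "max_tick_retries": 1,  # retry an erroring tick once
    },
)
assert scheduler.max_catchup_runs == 3
assert scheduler.max_tick_retries == 1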
\n
", "current_page_name": "_modules/dagster/core/scheduler/scheduler", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.scheduler.scheduler"}}, "storage": {"compute_log_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.compute_log_manager

\nfrom abc import ABC, abstractmethod\nfrom contextlib import contextmanager\nfrom enum import Enum\nfrom typing import NamedTuple, Optional\n\nfrom rx import Observable\n\nfrom dagster import check\nfrom dagster.core.instance import MayHaveInstanceWeakref\nfrom dagster.core.storage.pipeline_run import PipelineRun\n\nMAX_BYTES_FILE_READ = 33554432  # 32 MB\nMAX_BYTES_CHUNK_READ = 4194304  # 4 MB\n\n\nclass ComputeIOType(Enum):\n    STDOUT = "stdout"\n    STDERR = "stderr"\n\n\nclass ComputeLogFileData(\n    NamedTuple(\n        "ComputeLogFileData",\n        [\n            ("path", str),\n            ("data", Optional[str]),\n            ("cursor", int),\n            ("size", int),\n            ("download_url", Optional[str]),\n        ],\n    )\n):\n    """Representation of a chunk of compute execution log data"""\n\n    def __new__(\n        cls, path: str, data: Optional[str], cursor: int, size: int, download_url: Optional[str]\n    ):\n        return super(ComputeLogFileData, cls).__new__(\n            cls,\n            path=check.str_param(path, "path"),\n            data=check.opt_str_param(data, "data"),\n            cursor=check.int_param(cursor, "cursor"),\n            size=check.int_param(size, "size"),\n            download_url=check.opt_str_param(download_url, "download_url"),\n        )\n\n\n
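For illustration, one chunk of captured stdout represented with the record above; the path and contents are made up.

chunk = ComputeLogFileData(
    path="/tmp/dagster-home/storage/<run_id>/compute_logs/my_solid.compute.out",
    data="starting step...\n",
    cursor=17,          # byte offset to resume from on the next read
    size=17,            # bytes currently available in the captured file
    download_url=None,  # local storage exposes no separate download URL
)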
[docs]class ComputeLogManager(ABC, MayHaveInstanceWeakref):\n """Abstract base class for storing unstructured compute logs (stdout/stderr) from the compute\n steps of pipeline solids."""\n\n @contextmanager\n def watch(self, pipeline_run, step_key=None):\n """\n Watch the stdout/stderr for a given execution for a given run_id / step_key and persist it.\n\n Args:\n pipeline_run (PipelineRun): The pipeline run config\n step_key (Optional[String]): The step_key for a compute step\n """\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n check.opt_str_param(step_key, "step_key")\n\n if not self.enabled(pipeline_run, step_key):\n yield\n return\n\n self.on_watch_start(pipeline_run, step_key)\n with self._watch_logs(pipeline_run, step_key):\n yield\n self.on_watch_finish(pipeline_run, step_key)\n\n @contextmanager\n @abstractmethod\n def _watch_logs(self, pipeline_run, step_key=None):\n """\n Method to watch the stdout/stderr logs for a given run_id / step_key. Kept separate from\n blessed `watch` method, which triggers all the start/finish hooks that are necessary to\n implement the different remote implementations.\n\n Args:\n pipeline_run (PipelineRun): The pipeline run config\n step_key (Optional[String]): The step_key for a compute step\n """\n\n def get_local_path(self, run_id, key, io_type):\n """Get the local path of the logfile for a given execution step. This determines the\n location on the local filesystem to which stdout/stderr will be rerouted.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. `solid_invocation.compute`)\n io_type (ComputeIOType): Flag indicating the I/O type, either ComputeIOType.STDOUT or\n ComputeIOType.STDERR\n\n Returns:\n str\n """\n\n @abstractmethod\n def is_watch_completed(self, run_id, key):\n """Flag indicating when computation for a given execution step has completed.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. `solid_invocation.compute`)\n\n Returns:\n Boolean\n """\n\n @abstractmethod\n def on_watch_start(self, pipeline_run, step_key):\n """Hook called when starting to watch compute logs.\n\n Args:\n pipeline_run (PipelineRun): The pipeline run config\n step_key (Optional[String]): The step_key for a compute step\n """\n\n @abstractmethod\n def on_watch_finish(self, pipeline_run, step_key):\n """Hook called when computation for a given execution step is finished.\n\n Args:\n pipeline_run (PipelineRun): The pipeline run config\n step_key (Optional[String]): The step_key for a compute step\n """\n\n @abstractmethod\n def download_url(self, run_id, key, io_type):\n """Get a URL where the logs can be downloaded.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. `solid_invocation.compute`)\n io_type (ComputeIOType): Flag indicating the I/O type, either stdout or stderr\n\n Returns:\n String\n """\n\n @abstractmethod\n def read_logs_file(self, run_id, key, io_type, cursor=0, max_bytes=MAX_BYTES_FILE_READ):\n """Get compute log data for a given compute step.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. 
`solid_invocation.compute`)\n io_type (ComputeIOType): Flag indicating the I/O type, either stdout or stderr\n cursor (Optional[Int]): Starting cursor (byte) of log file\n max_bytes (Optional[Int]): Maximum number of bytes to be read and returned\n\n Returns:\n ComputeLogFileData\n """\n\n def enabled(self, _pipeline_run, _step_key):\n """Hook for disabling compute log capture.\n\n Args:\n _step_key (Optional[String]): The step_key for a compute step\n\n Returns:\n Boolean\n """\n return True\n\n @abstractmethod\n def on_subscribe(self, subscription):\n """Hook for managing streaming subscriptions for log data from `dagit`\n\n Args:\n subscription (ComputeLogSubscription): subscription object which manages when to send\n back data to the subscriber\n """\n\n def on_unsubscribe(self, subscription):\n pass\n\n def observable(self, run_id, key, io_type, cursor=None):\n """Return an Observable which streams back log data from the execution logs for a given\n compute step.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. `solid_invocation.compute`)\n io_type (ComputeIOType): Flag indicating the I/O type, either stdout or stderr\n cursor (Optional[Int]): Starting cursor (byte) of log file\n\n Returns:\n Observable\n """\n check.str_param(run_id, "run_id")\n check.str_param(key, "key")\n check.inst_param(io_type, "io_type", ComputeIOType)\n check.opt_str_param(cursor, "cursor")\n\n if cursor:\n cursor = int(cursor)\n else:\n cursor = 0\n\n subscription = ComputeLogSubscription(self, run_id, key, io_type, cursor)\n self.on_subscribe(subscription)\n return Observable.create(subscription) # pylint: disable=E1101\n\n def dispose(self):\n pass
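A hedged usage sketch of the `watch` / `read_logs_file` round trip; `manager` is assumed to be a concrete ComputeLogManager and `pipeline_run` the current PipelineRun, both obtained from the instance in real usage.

with manager.watch(pipeline_run, step_key="my_solid.compute"):
    # stdout/stderr emitted inside this block is rerouted and persisted
    print("inside the watched compute step")

# Read the captured stdout back from the beginning of the file.
update = manager.read_logs_file(
    pipeline_run.run_id, "my_solid.compute", ComputeIOType.STDOUT, cursor=0
)
print(update.data)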
\n\n\nclass ComputeLogSubscription:\n """Observable object that generates ComputeLogFileData objects as compute step execution logs\n are written\n """\n\n def __init__(self, manager, run_id, key, io_type, cursor):\n self.manager = manager\n self.run_id = run_id\n self.key = key\n self.io_type = io_type\n self.cursor = cursor\n self.observer = None\n\n def __call__(self, observer):\n self.observer = observer\n self.fetch()\n if self.manager.is_watch_completed(self.run_id, self.key):\n self.complete()\n return self\n\n def dispose(self):\n # called when the connection gets closed, allowing the observer to get GC'ed\n if self.observer and callable(getattr(self.observer, "dispose", None)):\n self.observer.dispose()\n self.observer = None\n self.manager.on_unsubscribe(self)\n\n def fetch(self):\n if not self.observer:\n return\n\n should_fetch = True\n while should_fetch:\n update = self.manager.read_logs_file(\n self.run_id,\n self.key,\n self.io_type,\n self.cursor,\n max_bytes=MAX_BYTES_CHUNK_READ,\n )\n if not self.cursor or update.cursor != self.cursor:\n self.observer.on_next(update)\n self.cursor = update.cursor\n should_fetch = update.data and len(update.data.encode("utf-8")) >= MAX_BYTES_CHUNK_READ\n\n def complete(self):\n if not self.observer:\n return\n self.observer.on_completed()\n
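A sketch of the streaming path used by dagit, under the assumption that `manager` and `run_id` are available as above: `observable` creates a ComputeLogSubscription, registers it via `on_subscribe`, and wraps it in an rx Observable; the lambdas stand in for dagit's subscriber.

observable = manager.observable(run_id, "my_solid.compute", ComputeIOType.STDOUT)
observable.subscribe(
    on_next=lambda update: print(f"cursor={update.cursor}, bytes={len(update.data or '')}"),
    on_completed=lambda: print("log stream complete"),
)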
", "current_page_name": "_modules/dagster/core/storage/compute_log_manager", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.compute_log_manager"}, "event_log": {"base": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.event_log.base

\nimport warnings\nfrom abc import ABC, abstractmethod\nfrom datetime import datetime\nfrom typing import (\n    Callable,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n)\n\nfrom dagster import check\nfrom dagster.core.definitions.events import AssetKey\nfrom dagster.core.events import DagsterEventType\nfrom dagster.core.events.log import EventLogEntry\nfrom dagster.core.execution.stats import (\n    RunStepKeyStatsSnapshot,\n    build_run_stats_from_events,\n    build_run_step_stats_from_events,\n)\nfrom dagster.core.instance import MayHaveInstanceWeakref\nfrom dagster.core.storage.pipeline_run import PipelineRunStatsSnapshot\nfrom dagster.serdes import whitelist_for_serdes\n\n\n
[docs]class RunShardedEventsCursor(NamedTuple):\n    """Pairs an id-based event log cursor with a timestamp-based run cursor, for improved\n    performance on run-sharded event log storages (e.g. the default SqliteEventLogStorage). For\n    run-sharded storages, the id field is ignored, since ids may not be unique across shards.\n    """\n\n    id: int\n    run_updated_after: datetime
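For illustration, a cursor that a run-sharded storage resolves by run update time (the id field is carried along but ignored there); `datetime` is already imported at the top of this module.

cursor = RunShardedEventsCursor(id=1234, run_updated_after=datetime(2022, 1, 1))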
\n\n\n
[docs]class EventLogRecord(NamedTuple):\n """Internal representation of an event record, as stored in a\n :py:class:`~dagster.core.storage.event_log.EventLogStorage`.\n """\n\n storage_id: int\n event_log_entry: EventLogEntry
\n\n\n
[docs]@whitelist_for_serdes\nclass EventRecordsFilter(\n NamedTuple(\n "_EventRecordsFilter",\n [\n ("event_type", Optional[DagsterEventType]),\n ("asset_key", Optional[AssetKey]),\n ("asset_partitions", Optional[List[str]]),\n ("after_cursor", Optional[Union[int, RunShardedEventsCursor]]),\n ("before_cursor", Optional[Union[int, RunShardedEventsCursor]]),\n ("after_timestamp", Optional[float]),\n ("before_timestamp", Optional[float]),\n ],\n )\n):\n """Defines a set of filter fields for fetching a set of event log entries or event log records.\n\n Args:\n event_type (Optional[DagsterEventType]): Filter argument for dagster event type\n asset_key (Optional[AssetKey]): Asset key for which to get asset materialization event\n entries / records.\n asset_partitions (Optional[List[str]]): Filter parameter such that only asset\n materialization events with a partition value matching one of the provided values. Only\n valid when the `asset_key` parameter is provided.\n after_cursor (Optional[Union[int, RunShardedEventsCursor]]): Filter parameter such that only\n records with storage_id greater than the provided value are returned. Using a\n run-sharded events cursor will result in a significant performance gain when run against\n a SqliteEventLogStorage implementation (which is run-sharded)\n before_cursor (Optional[Union[int, RunShardedEventsCursor]]): Filter parameter such that\n records with storage_id less than the provided value are returned. Using a run-sharded\n events cursor will result in a significant performance gain when run against\n a SqliteEventLogStorage implementation (which is run-sharded)\n after_timestamp (Optional[float]): Filter parameter such that only event records for\n events with timestamp greater than the provided value are returned.\n before_timestamp (Optional[float]): Filter parameter such that only event records for\n events with timestamp less than the provided value are returned.\n """\n\n def __new__(\n cls,\n event_type: Optional[DagsterEventType] = None,\n asset_key: Optional[AssetKey] = None,\n asset_partitions: Optional[List[str]] = None,\n after_cursor: Optional[Union[int, RunShardedEventsCursor]] = None,\n before_cursor: Optional[Union[int, RunShardedEventsCursor]] = None,\n after_timestamp: Optional[float] = None,\n before_timestamp: Optional[float] = None,\n ):\n check.opt_list_param(asset_partitions, "asset_partitions", of_type=str)\n return super(EventRecordsFilter, cls).__new__(\n cls,\n event_type=check.opt_inst_param(event_type, "event_type", DagsterEventType),\n asset_key=check.opt_inst_param(asset_key, "asset_key", AssetKey),\n asset_partitions=asset_partitions,\n after_cursor=check.opt_inst_param(\n after_cursor, "after_cursor", (int, RunShardedEventsCursor)\n ),\n before_cursor=check.opt_inst_param(\n before_cursor, "before_cursor", (int, RunShardedEventsCursor)\n ),\n after_timestamp=check.opt_float_param(after_timestamp, "after_timestamp"),\n before_timestamp=check.opt_float_param(before_timestamp, "before_timestamp"),\n )
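An illustrative filter built from the fields documented above; the asset key and partition names are hypothetical.

records_filter = EventRecordsFilter(
    event_type=DagsterEventType.ASSET_MATERIALIZATION,
    asset_key=AssetKey(["my_table"]),               # hypothetical asset
    asset_partitions=["2022-01-01", "2022-01-02"],  # hypothetical partitions
    after_cursor=1234,                              # only records with storage_id > 1234
)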
\n\n\n
[docs]class EventLogStorage(ABC, MayHaveInstanceWeakref):\n """Abstract base class for storing structured event logs from pipeline runs.\n\n Note that event log storages using SQL databases as backing stores should implement\n :py:class:`~dagster.core.storage.event_log.SqlEventLogStorage`.\n\n Users should not directly instantiate concrete subclasses of this class; they are instantiated\n by internal machinery when ``dagit`` and ``dagster-graphql`` load, based on the values in the\n ``dagster.yaml`` file in ``$DAGSTER_HOME``. Configuration of concrete subclasses of this class\n should be done by setting values in that file.\n """\n\n @abstractmethod\n def get_logs_for_run(\n self,\n run_id: str,\n cursor: Optional[int] = -1,\n of_type: Optional[Union[DagsterEventType, Set[DagsterEventType]]] = None,\n limit: Optional[int] = None,\n ) -> Iterable[EventLogEntry]:\n """Get all of the logs corresponding to a run.\n\n Args:\n run_id (str): The id of the run for which to fetch logs.\n cursor (Optional[int]): Zero-indexed logs will be returned starting from cursor + 1,\n i.e., if cursor is -1, all logs will be returned. (default: -1)\n of_type (Optional[DagsterEventType]): the dagster event type to filter the logs.\n """\n\n def get_stats_for_run(self, run_id: str) -> PipelineRunStatsSnapshot:\n """Get a summary of events that have ocurred in a run."""\n return build_run_stats_from_events(run_id, self.get_logs_for_run(run_id))\n\n def get_step_stats_for_run(self, run_id: str, step_keys=None) -> List[RunStepKeyStatsSnapshot]:\n """Get per-step stats for a pipeline run."""\n logs = self.get_logs_for_run(run_id)\n if step_keys:\n logs = [\n event\n for event in logs\n if event.is_dagster_event and event.get_dagster_event().step_key in step_keys\n ]\n\n return build_run_step_stats_from_events(run_id, logs)\n\n @abstractmethod\n def store_event(self, event: EventLogEntry):\n """Store an event corresponding to a pipeline run.\n\n Args:\n event (EventLogEntry): The event to store.\n """\n\n @abstractmethod\n def delete_events(self, run_id: str):\n """Remove events for a given run id"""\n\n @abstractmethod\n def upgrade(self):\n """This method should perform any schema migrations necessary to bring an\n out-of-date instance of the storage up to date.\n """\n\n @abstractmethod\n def reindex_events(self, print_fn: Callable = lambda _: None, force: bool = False):\n """Call this method to run any data migrations across the event_log tables."""\n\n @abstractmethod\n def reindex_assets(self, print_fn: Callable = lambda _: None, force: bool = False):\n """Call this method to run any data migrations across the asset tables."""\n\n @abstractmethod\n def wipe(self):\n """Clear the log storage."""\n\n @abstractmethod\n def watch(self, run_id: str, start_cursor: int, callback: Callable):\n """Call this method to start watching."""\n\n @abstractmethod\n def end_watch(self, run_id: str, handler: Callable):\n """Call this method to stop watching."""\n\n @property\n @abstractmethod\n def is_persistent(self) -> bool:\n """bool: Whether the storage is persistent."""\n\n def dispose(self):\n """Explicit lifecycle management."""\n\n def optimize_for_dagit(self, statement_timeout: int):\n """Allows for optimizing database connection / use in the context of a long lived dagit process"""\n\n @abstractmethod\n def get_event_records(\n self,\n event_records_filter: Optional[EventRecordsFilter] = None,\n limit: Optional[int] = None,\n ascending: bool = False,\n ) -> Iterable[EventLogRecord]:\n pass\n\n @abstractmethod\n def 
has_asset_key(self, asset_key: AssetKey) -> bool:\n pass\n\n @abstractmethod\n def all_asset_keys(self) -> Iterable[AssetKey]:\n pass\n\n def get_asset_keys(\n self,\n prefix: Optional[List[str]] = None,\n limit: Optional[int] = None,\n cursor: Optional[str] = None,\n ) -> Iterable[AssetKey]:\n # base implementation of get_asset_keys, using the existing `all_asset_keys` and doing the\n # filtering in-memory\n asset_keys = sorted(self.all_asset_keys(), key=str)\n if prefix:\n asset_keys = [\n asset_key for asset_key in asset_keys if asset_key.path[: len(prefix)] == prefix\n ]\n if cursor:\n cursor_asset = AssetKey.from_db_string(cursor)\n if cursor_asset and cursor_asset in asset_keys:\n idx = asset_keys.index(cursor_asset)\n asset_keys = asset_keys[idx + 1 :]\n if limit:\n asset_keys = asset_keys[:limit]\n return asset_keys\n\n @abstractmethod\n def get_latest_materialization_events(\n self, asset_keys: Sequence[AssetKey]\n ) -> Mapping[AssetKey, Optional[EventLogEntry]]:\n pass\n\n @abstractmethod\n def get_asset_events(\n self,\n asset_key: AssetKey,\n partitions: Optional[List[str]] = None,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n limit: Optional[int] = None,\n ascending: bool = False,\n include_cursor: bool = False,\n before_timestamp=None,\n cursor: Optional[int] = None, # deprecated\n ) -> Union[Iterable[EventLogEntry], Iterable[Tuple[int, EventLogEntry]]]:\n pass\n\n @abstractmethod\n def get_asset_run_ids(self, asset_key: AssetKey) -> Iterable[str]:\n pass\n\n @abstractmethod\n def wipe_asset(self, asset_key: AssetKey):\n """Remove asset index history from event log for given asset_key"""\n\n @abstractmethod\n def get_materialization_count_by_partition(\n self, asset_keys: Sequence[AssetKey]\n ) -> Mapping[AssetKey, Mapping[str, int]]:\n pass
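A hedged usage sketch: `storage` is assumed to be a concrete EventLogStorage obtained from the Dagster instance, and the filter mirrors the EventRecordsFilter example above.

records = storage.get_event_records(
    EventRecordsFilter(
        event_type=DagsterEventType.ASSET_MATERIALIZATION,
        asset_key=AssetKey(["my_table"]),
    ),
    limit=5,
    ascending=False,  # newest records first
)
for record in records:
    print(record.storage_id, record.event_log_entry.timestamp)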
\n\n\ndef extract_asset_events_cursor(cursor, before_cursor, after_cursor, ascending):\n    if cursor:\n        warnings.warn(\n            "Parameter `cursor` is deprecated for `get_asset_events`. Use `before_cursor` or `after_cursor` instead"\n        )\n        if ascending and after_cursor is None:\n            after_cursor = cursor\n        if not ascending and before_cursor is None:\n            before_cursor = cursor\n\n    if after_cursor is not None:\n        try:\n            after_cursor = int(after_cursor)\n        except ValueError:\n            after_cursor = None\n\n    if before_cursor is not None:\n        try:\n            before_cursor = int(before_cursor)\n        except ValueError:\n            before_cursor = None\n\n    return before_cursor, after_cursor\n
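A worked example of the deprecation shim above: a legacy `cursor` becomes `after_cursor` when ascending and `before_cursor` when descending (a warning is emitted either way).

assert extract_asset_events_cursor(10, None, None, ascending=True) == (None, 10)
assert extract_asset_events_cursor(10, None, None, ascending=False) == (10, None)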
", "current_page_name": "_modules/dagster/core/storage/event_log/base", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.event_log.base"}, "sql_event_log": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.event_log.sql_event_log

\nimport logging\nfrom abc import abstractmethod\nfrom collections import OrderedDict\nfrom datetime import datetime\nfrom typing import Dict, Iterable, List, Mapping, Optional, Sequence, cast\n\nimport pendulum\nimport sqlalchemy as db\n\nfrom dagster import check, seven\nfrom dagster.core.assets import AssetDetails\nfrom dagster.core.definitions.events import AssetKey, AssetMaterialization\nfrom dagster.core.errors import DagsterEventLogInvalidForRun\nfrom dagster.core.events import DagsterEventType\nfrom dagster.core.events.log import EventLogEntry\nfrom dagster.core.execution.stats import build_run_step_stats_from_events\nfrom dagster.serdes import deserialize_json_to_dagster_namedtuple, serialize_dagster_namedtuple\nfrom dagster.serdes.errors import DeserializationError\nfrom dagster.utils import datetime_as_float, utc_datetime_from_naive, utc_datetime_from_timestamp\n\nfrom ..pipeline_run import PipelineRunStatsSnapshot\nfrom .base import (\n    EventLogRecord,\n    EventLogStorage,\n    EventRecordsFilter,\n    RunShardedEventsCursor,\n    extract_asset_events_cursor,\n)\nfrom .migration import ASSET_DATA_MIGRATIONS, ASSET_KEY_INDEX_COLS, EVENT_LOG_DATA_MIGRATIONS\nfrom .schema import AssetKeyTable, SecondaryIndexMigrationTable, SqlEventLogStorageTable\n\nMIN_ASSET_ROWS = 25\n\n\n
[docs]class SqlEventLogStorage(EventLogStorage):\n """Base class for SQL backed event log storages.\n\n Distinguishes between run-based connections and index connections in order to support run-level\n sharding, while maintaining the ability to do cross-run queries\n """\n\n @abstractmethod\n def run_connection(self, run_id):\n """Context manager yielding a connection to access the event logs for a specific run.\n\n Args:\n run_id (Optional[str]): Enables those storages which shard based on run_id, e.g.,\n SqliteEventLogStorage, to connect appropriately.\n """\n\n @abstractmethod\n def index_connection(self):\n """Context manager yielding a connection to access cross-run indexed tables.\n\n Args:\n run_id (Optional[str]): Enables those storages which shard based on run_id, e.g.,\n SqliteEventLogStorage, to connect appropriately.\n """\n\n @abstractmethod\n def upgrade(self):\n """This method should perform any schema migrations necessary to bring an\n out-of-date instance of the storage up to date.\n """\n\n def prepare_insert_event(self, event):\n """Helper method for preparing the event log SQL insertion statement. Abstracted away to\n have a single place for the logical table representation of the event, while having a way\n for SQL backends to implement different execution implementations for `store_event`. See\n the `dagster-postgres` implementation which overrides the generic SQL implementation of\n `store_event`.\n """\n\n dagster_event_type = None\n asset_key_str = None\n partition = None\n step_key = event.step_key\n\n if event.is_dagster_event:\n dagster_event_type = event.dagster_event.event_type_value\n step_key = event.dagster_event.step_key\n if event.dagster_event.asset_key:\n check.inst_param(event.dagster_event.asset_key, "asset_key", AssetKey)\n asset_key_str = event.dagster_event.asset_key.to_string()\n if event.dagster_event.partition:\n partition = event.dagster_event.partition\n\n # https://stackoverflow.com/a/54386260/324449\n return SqlEventLogStorageTable.insert().values( # pylint: disable=no-value-for-parameter\n run_id=event.run_id,\n event=serialize_dagster_namedtuple(event),\n dagster_event_type=dagster_event_type,\n # Postgres requires a datetime that is in UTC but has no timezone info set\n # in order to be stored correctly\n timestamp=datetime.utcfromtimestamp(event.timestamp),\n step_key=step_key,\n asset_key=asset_key_str,\n partition=partition,\n )\n\n def has_asset_key_index_cols(self):\n with self.index_connection() as conn:\n column_names = [x.get("name") for x in db.inspect(conn).get_columns(AssetKeyTable.name)]\n return "last_materialization_timestamp" in column_names\n\n def store_asset(self, event):\n check.inst_param(event, "event", EventLogEntry)\n if not event.is_dagster_event or not event.dagster_event.asset_key:\n return\n\n # The AssetKeyTable contains a `last_materialization_timestamp` column that is exclusively\n # used to determine if an asset exists (last materialization timestamp > wipe timestamp).\n # This column is used nowhere else, and as of AssetObservation creation, we want to extend\n # this functionality to ensure that assets with observation OR materialization timestamp\n # > wipe timestamp display in Dagit.\n\n # As of the following PR, we update last_materialization_timestamp to store the timestamp\n # of the latest asset observation or materialization that has occurred.\n # https://github.com/dagster-io/dagster/pull/6885\n if event.dagster_event.is_asset_observation:\n self.store_asset_observation(event)\n elif 
event.dagster_event.is_step_materialization:\n self.store_asset_materialization(event)\n\n def store_asset_observation(self, event):\n # last_materialization_timestamp is updated upon observation or materialization\n # See store_asset method above for more details\n if self.has_asset_key_index_cols():\n insert_statement = AssetKeyTable.insert().values(\n asset_key=event.dagster_event.asset_key.to_string(),\n last_materialization_timestamp=utc_datetime_from_timestamp(event.timestamp),\n )\n update_statement = AssetKeyTable.update().values(\n last_materialization_timestamp=utc_datetime_from_timestamp(event.timestamp),\n )\n\n with self.index_connection() as conn:\n try:\n conn.execute(insert_statement)\n except db.exc.IntegrityError:\n conn.execute(update_statement)\n\n def store_asset_materialization(self, event):\n # We switched to storing the entire event record of the last materialization instead of just\n # the AssetMaterialization object, so that we have access to metadata like timestamp,\n # pipeline, run_id, etc.\n #\n # This should make certain asset queries way more performant, without having to do extra\n # queries against the event log.\n #\n # This should be accompanied by a schema change in 0.12.0, renaming `last_materialization`\n # to `last_materialization_event`, for clarity. For now, we should do some back-compat.\n #\n # https://github.com/dagster-io/dagster/issues/3945\n\n # last_materialization_timestamp is updated upon observation or materialization\n # See store_asset method above for more details\n if self.has_asset_key_index_cols():\n materialization = event.dagster_event.step_materialization_data.materialization\n insert_statement = (\n AssetKeyTable.insert().values( # pylint: disable=no-value-for-parameter\n asset_key=event.dagster_event.asset_key.to_string(),\n last_materialization=serialize_dagster_namedtuple(event),\n last_materialization_timestamp=utc_datetime_from_timestamp(event.timestamp),\n last_run_id=event.run_id,\n tags=seven.json.dumps(materialization.tags) if materialization.tags else None,\n )\n )\n update_statement = (\n AssetKeyTable.update()\n .values( # pylint: disable=no-value-for-parameter\n last_materialization=serialize_dagster_namedtuple(event),\n last_materialization_timestamp=utc_datetime_from_timestamp(event.timestamp),\n last_run_id=event.run_id,\n tags=seven.json.dumps(materialization.tags) if materialization.tags else None,\n )\n .where(\n AssetKeyTable.c.asset_key == event.dagster_event.asset_key.to_string(),\n )\n )\n else:\n insert_statement = (\n AssetKeyTable.insert().values( # pylint: disable=no-value-for-parameter\n asset_key=event.dagster_event.asset_key.to_string(),\n last_materialization=serialize_dagster_namedtuple(event),\n last_run_id=event.run_id,\n )\n )\n update_statement = (\n AssetKeyTable.update()\n .values( # pylint: disable=no-value-for-parameter\n last_materialization=serialize_dagster_namedtuple(event),\n last_run_id=event.run_id,\n )\n .where(\n AssetKeyTable.c.asset_key == event.dagster_event.asset_key.to_string(),\n )\n )\n\n with self.index_connection() as conn:\n try:\n conn.execute(insert_statement)\n except db.exc.IntegrityError:\n conn.execute(update_statement)\n\n def store_event(self, event):\n """Store an event corresponding to a pipeline run.\n\n Args:\n event (EventLogEntry): The event to store.\n """\n check.inst_param(event, "event", EventLogEntry)\n insert_event_statement = self.prepare_insert_event(event)\n run_id = event.run_id\n\n with self.run_connection(run_id) as conn:\n 
conn.execute(insert_event_statement)\n\n if (\n event.is_dagster_event\n and (\n event.dagster_event.is_step_materialization\n or event.dagster_event.is_asset_observation\n )\n and event.dagster_event.asset_key\n ):\n self.store_asset(event)\n\n def get_logs_for_run_by_log_id(\n self,\n run_id,\n cursor=-1,\n dagster_event_type=None,\n limit=None,\n ):\n check.str_param(run_id, "run_id")\n check.int_param(cursor, "cursor")\n check.invariant(\n cursor >= -1,\n "Don't know what to do with negative cursor {cursor}".format(cursor=cursor),\n )\n\n dagster_event_types = (\n {dagster_event_type}\n if isinstance(dagster_event_type, DagsterEventType)\n else check.opt_set_param(\n dagster_event_type, "dagster_event_type", of_type=DagsterEventType\n )\n )\n\n query = (\n db.select([SqlEventLogStorageTable.c.id, SqlEventLogStorageTable.c.event])\n .where(SqlEventLogStorageTable.c.run_id == run_id)\n .order_by(SqlEventLogStorageTable.c.id.asc())\n )\n if dagster_event_types:\n query = query.where(\n SqlEventLogStorageTable.c.dagster_event_type.in_(\n [dagster_event_type.value for dagster_event_type in dagster_event_types]\n )\n )\n\n # adjust 0 based index cursor to SQL offset\n query = query.offset(cursor + 1)\n\n if limit:\n query = query.limit(limit)\n\n with self.run_connection(run_id) as conn:\n results = conn.execute(query).fetchall()\n\n events = {}\n try:\n for (\n record_id,\n json_str,\n ) in results:\n events[record_id] = check.inst_param(\n deserialize_json_to_dagster_namedtuple(json_str), "event", EventLogEntry\n )\n except (seven.JSONDecodeError, DeserializationError) as err:\n raise DagsterEventLogInvalidForRun(run_id=run_id) from err\n\n return events\n\n def get_logs_for_run(\n self,\n run_id,\n cursor=-1,\n of_type=None,\n limit=None,\n ):\n """Get all of the logs corresponding to a run.\n\n Args:\n run_id (str): The id of the run for which to fetch logs.\n cursor (Optional[int]): Zero-indexed logs will be returned starting from cursor + 1,\n i.e., if cursor is -1, all logs will be returned. 
(default: -1)\n of_type (Optional[DagsterEventType]): the dagster event type to filter the logs.\n limit (Optional[int]): the maximum number of events to fetch\n """\n check.str_param(run_id, "run_id")\n check.int_param(cursor, "cursor")\n check.invariant(\n cursor >= -1,\n "Don't know what to do with negative cursor {cursor}".format(cursor=cursor),\n )\n\n check.invariant(\n not of_type\n or isinstance(of_type, DagsterEventType)\n or isinstance(of_type, (frozenset, set))\n )\n\n events_by_id = self.get_logs_for_run_by_log_id(run_id, cursor, of_type, limit)\n return [event for id, event in sorted(events_by_id.items(), key=lambda x: x[0])]\n\n def get_stats_for_run(self, run_id):\n check.str_param(run_id, "run_id")\n\n query = (\n db.select(\n [\n SqlEventLogStorageTable.c.dagster_event_type,\n db.func.count().label("n_events_of_type"),\n db.func.max(SqlEventLogStorageTable.c.timestamp).label("last_event_timestamp"),\n ]\n )\n .where(\n db.and_(\n SqlEventLogStorageTable.c.run_id == run_id,\n SqlEventLogStorageTable.c.dagster_event_type != None,\n )\n )\n .group_by("dagster_event_type")\n )\n\n with self.run_connection(run_id) as conn:\n results = conn.execute(query).fetchall()\n\n try:\n counts = {}\n times = {}\n for result in results:\n (dagster_event_type, n_events_of_type, last_event_timestamp) = result\n check.invariant(dagster_event_type is not None)\n counts[dagster_event_type] = n_events_of_type\n times[dagster_event_type] = last_event_timestamp\n\n enqueued_time = times.get(DagsterEventType.PIPELINE_ENQUEUED.value, None)\n launch_time = times.get(DagsterEventType.PIPELINE_STARTING.value, None)\n start_time = times.get(DagsterEventType.PIPELINE_START.value, None)\n end_time = times.get(\n DagsterEventType.PIPELINE_SUCCESS.value,\n times.get(\n DagsterEventType.PIPELINE_FAILURE.value,\n times.get(DagsterEventType.PIPELINE_CANCELED.value, None),\n ),\n )\n\n return PipelineRunStatsSnapshot(\n run_id=run_id,\n steps_succeeded=counts.get(DagsterEventType.STEP_SUCCESS.value, 0),\n steps_failed=counts.get(DagsterEventType.STEP_FAILURE.value, 0),\n materializations=counts.get(DagsterEventType.ASSET_MATERIALIZATION.value, 0),\n expectations=counts.get(DagsterEventType.STEP_EXPECTATION_RESULT.value, 0),\n enqueued_time=datetime_as_float(enqueued_time) if enqueued_time else None,\n launch_time=datetime_as_float(launch_time) if launch_time else None,\n start_time=datetime_as_float(start_time) if start_time else None,\n end_time=datetime_as_float(end_time) if end_time else None,\n )\n except (seven.JSONDecodeError, DeserializationError) as err:\n raise DagsterEventLogInvalidForRun(run_id=run_id) from err\n\n def get_step_stats_for_run(self, run_id, step_keys=None):\n check.str_param(run_id, "run_id")\n check.opt_list_param(step_keys, "step_keys", of_type=str)\n\n # Originally, this was two different queries:\n # 1) one query which aggregated top-level step stats by grouping by event type / step_key in\n # a single query, using pure SQL (e.g. start_time, end_time, status, attempt counts).\n # 2) one query which fetched all the raw events for a specific event type and then inspected\n # the deserialized event object to aggregate stats derived from sequences of events.\n # (e.g. marker events, materializations, expectations resuls, attempts timing, etc.)\n #\n # For simplicity, we now just do the second type of query and derive the stats in Python\n # from the raw events. 
This has the benefit of being easier to read and also the benefit of\n # being able to share code with the in-memory event log storage implementation. We may\n # choose to revisit this in the future, especially if we are able to do JSON-column queries\n # in SQL as a way of bypassing the serdes layer in all cases.\n raw_event_query = (\n db.select([SqlEventLogStorageTable.c.event])\n .where(SqlEventLogStorageTable.c.run_id == run_id)\n .where(SqlEventLogStorageTable.c.step_key != None)\n .where(\n SqlEventLogStorageTable.c.dagster_event_type.in_(\n [\n DagsterEventType.STEP_START.value,\n DagsterEventType.STEP_SUCCESS.value,\n DagsterEventType.STEP_SKIPPED.value,\n DagsterEventType.STEP_FAILURE.value,\n DagsterEventType.STEP_RESTARTED.value,\n DagsterEventType.ASSET_MATERIALIZATION.value,\n DagsterEventType.STEP_EXPECTATION_RESULT.value,\n DagsterEventType.STEP_RESTARTED.value,\n DagsterEventType.STEP_UP_FOR_RETRY.value,\n DagsterEventType.ENGINE_EVENT.value,\n ]\n )\n )\n .order_by(SqlEventLogStorageTable.c.id.asc())\n )\n if step_keys:\n raw_event_query = raw_event_query.where(\n SqlEventLogStorageTable.c.step_key.in_(step_keys)\n )\n\n with self.run_connection(run_id) as conn:\n results = conn.execute(raw_event_query).fetchall()\n\n try:\n records = [\n check.inst_param(\n deserialize_json_to_dagster_namedtuple(json_str), "event", EventLogEntry\n )\n for (json_str,) in results\n ]\n return build_run_step_stats_from_events(run_id, records)\n except (seven.JSONDecodeError, DeserializationError) as err:\n raise DagsterEventLogInvalidForRun(run_id=run_id) from err\n\n def _apply_migration(self, migration_name, migration_fn, print_fn, force):\n if self.has_secondary_index(migration_name):\n if not force:\n if print_fn:\n print_fn(f"Skipping already applied data migration: {migration_name}")\n return\n if print_fn:\n print_fn(f"Starting data migration: {migration_name}")\n migration_fn()(self, print_fn)\n self.enable_secondary_index(migration_name)\n if print_fn:\n print_fn(f"Finished data migration: {migration_name}")\n\n def reindex_events(self, print_fn=None, force=False):\n """Call this method to run any data migrations across the event_log table"""\n for migration_name, migration_fn in EVENT_LOG_DATA_MIGRATIONS.items():\n self._apply_migration(migration_name, migration_fn, print_fn, force)\n\n def reindex_assets(self, print_fn=None, force=False):\n """Call this method to run any data migrations across the asset_keys table"""\n for migration_name, migration_fn in ASSET_DATA_MIGRATIONS.items():\n self._apply_migration(migration_name, migration_fn, print_fn, force)\n\n def wipe(self):\n """Clears the event log storage."""\n # Should be overridden by SqliteEventLogStorage and other storages that shard based on\n # run_id\n\n # https://stackoverflow.com/a/54386260/324449\n with self.run_connection(run_id=None) as conn:\n conn.execute(SqlEventLogStorageTable.delete()) # pylint: disable=no-value-for-parameter\n conn.execute(AssetKeyTable.delete()) # pylint: disable=no-value-for-parameter\n\n with self.index_connection() as conn:\n conn.execute(SqlEventLogStorageTable.delete()) # pylint: disable=no-value-for-parameter\n conn.execute(AssetKeyTable.delete()) # pylint: disable=no-value-for-parameter\n\n def delete_events(self, run_id):\n with self.run_connection(run_id) as conn:\n self.delete_events_for_run(conn, run_id)\n\n def delete_events_for_run(self, conn, run_id):\n check.str_param(run_id, "run_id")\n\n delete_statement = (\n SqlEventLogStorageTable.delete().where( # pylint: 
disable=no-value-for-parameter\n SqlEventLogStorageTable.c.run_id == run_id\n )\n )\n removed_asset_key_query = (\n db.select([SqlEventLogStorageTable.c.asset_key])\n .where(SqlEventLogStorageTable.c.run_id == run_id)\n .where(SqlEventLogStorageTable.c.asset_key != None)\n .group_by(SqlEventLogStorageTable.c.asset_key)\n )\n\n removed_asset_keys = [\n AssetKey.from_db_string(row[0])\n for row in conn.execute(removed_asset_key_query).fetchall()\n ]\n conn.execute(delete_statement)\n if len(removed_asset_keys) > 0:\n keys_to_check = []\n keys_to_check.extend([key.to_string() for key in removed_asset_keys])\n keys_to_check.extend([key.to_string(legacy=True) for key in removed_asset_keys])\n remaining_asset_keys = [\n AssetKey.from_db_string(row[0])\n for row in conn.execute(\n db.select([SqlEventLogStorageTable.c.asset_key])\n .where(SqlEventLogStorageTable.c.asset_key.in_(keys_to_check))\n .group_by(SqlEventLogStorageTable.c.asset_key)\n )\n ]\n to_remove = set(removed_asset_keys) - set(remaining_asset_keys)\n if to_remove:\n keys_to_remove = []\n keys_to_remove.extend([key.to_string() for key in to_remove])\n keys_to_remove.extend([key.to_string(legacy=True) for key in to_remove])\n conn.execute(\n AssetKeyTable.delete().where( # pylint: disable=no-value-for-parameter\n AssetKeyTable.c.asset_key.in_(keys_to_remove)\n )\n )\n\n @property\n def is_persistent(self):\n return True\n\n def update_event_log_record(self, record_id, event):\n """Utility method for migration scripts to update SQL representation of event records."""\n check.int_param(record_id, "record_id")\n check.inst_param(event, "event", EventLogEntry)\n dagster_event_type = None\n asset_key_str = None\n if event.is_dagster_event:\n dagster_event_type = event.dagster_event.event_type_value\n if event.dagster_event.asset_key:\n check.inst_param(event.dagster_event.asset_key, "asset_key", AssetKey)\n asset_key_str = event.dagster_event.asset_key.to_string()\n\n with self.run_connection(run_id=event.run_id) as conn:\n conn.execute(\n SqlEventLogStorageTable.update() # pylint: disable=no-value-for-parameter\n .where(SqlEventLogStorageTable.c.id == record_id)\n .values(\n event=serialize_dagster_namedtuple(event),\n dagster_event_type=dagster_event_type,\n timestamp=datetime.utcfromtimestamp(event.timestamp),\n step_key=event.step_key,\n asset_key=asset_key_str,\n )\n )\n\n def get_event_log_table_data(self, run_id, record_id):\n """Utility method to test representation of the record in the SQL table. Returns all of\n the columns stored in the event log storage (as opposed to the deserialized `EventLogEntry`).\n This allows checking that certain fields are extracted to support performant lookups (e.g.\n extracting `step_key` for fast filtering)"""\n with self.run_connection(run_id=run_id) as conn:\n query = (\n db.select([SqlEventLogStorageTable])\n .where(SqlEventLogStorageTable.c.id == record_id)\n .order_by(SqlEventLogStorageTable.c.id.asc())\n )\n return conn.execute(query).fetchone()\n\n def has_secondary_index(self, name):\n """This method uses a checkpoint migration table to see if summary data has been constructed\n in a secondary index table. 
Can be used to checkpoint event_log data migrations.\n """\n query = (\n db.select([1])\n .where(SecondaryIndexMigrationTable.c.name == name)\n .where(SecondaryIndexMigrationTable.c.migration_completed != None)\n .limit(1)\n )\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n return len(results) > 0\n\n def enable_secondary_index(self, name):\n """This method marks an event_log data migration as complete, to indicate that a summary\n data migration is complete.\n """\n query = (\n SecondaryIndexMigrationTable.insert().values( # pylint: disable=no-value-for-parameter\n name=name,\n migration_completed=datetime.now(),\n )\n )\n with self.index_connection() as conn:\n try:\n conn.execute(query)\n except db.exc.IntegrityError:\n conn.execute(\n SecondaryIndexMigrationTable.update() # pylint: disable=no-value-for-parameter\n .where(SecondaryIndexMigrationTable.c.name == name)\n .values(migration_completed=datetime.now())\n )\n\n def _apply_filter_to_query(\n self,\n query,\n event_records_filter=None,\n asset_details=None,\n apply_cursor_filters=True,\n ):\n if not event_records_filter:\n return query\n\n if event_records_filter.event_type:\n query = query.where(\n SqlEventLogStorageTable.c.dagster_event_type\n == event_records_filter.event_type.value\n )\n\n if event_records_filter.asset_key:\n query = query.where(\n db.or_(\n SqlEventLogStorageTable.c.asset_key\n == event_records_filter.asset_key.to_string(),\n SqlEventLogStorageTable.c.asset_key\n == event_records_filter.asset_key.to_string(legacy=True),\n )\n )\n\n if event_records_filter.asset_partitions:\n query = query.where(\n SqlEventLogStorageTable.c.partition.in_(event_records_filter.asset_partitions)\n )\n\n if asset_details and asset_details.last_wipe_timestamp:\n query = query.where(\n SqlEventLogStorageTable.c.timestamp\n > datetime.utcfromtimestamp(asset_details.last_wipe_timestamp)\n )\n\n if apply_cursor_filters:\n # allow the run-sharded sqlite implementation to disable this cursor filtering so that\n # it can implement its own custom cursor logic, as cursor ids are not unique across run\n # shards\n if event_records_filter.before_cursor is not None:\n before_cursor_id = (\n event_records_filter.before_cursor.id\n if isinstance(event_records_filter.before_cursor, RunShardedEventsCursor)\n else event_records_filter.before_cursor\n )\n before_query = db.select([SqlEventLogStorageTable.c.id]).where(\n SqlEventLogStorageTable.c.id == before_cursor_id\n )\n query = query.where(SqlEventLogStorageTable.c.id < before_query)\n\n if event_records_filter.after_cursor is not None:\n after_cursor_id = (\n event_records_filter.after_cursor.id\n if isinstance(event_records_filter.after_cursor, RunShardedEventsCursor)\n else event_records_filter.after_cursor\n )\n query = query.where(SqlEventLogStorageTable.c.id > after_cursor_id)\n\n if event_records_filter.before_timestamp:\n query = query.where(\n SqlEventLogStorageTable.c.timestamp\n < datetime.utcfromtimestamp(event_records_filter.before_timestamp)\n )\n\n if event_records_filter.after_timestamp:\n query = query.where(\n SqlEventLogStorageTable.c.timestamp\n > datetime.utcfromtimestamp(event_records_filter.after_timestamp)\n )\n\n return query\n\n def get_event_records(\n self,\n event_records_filter: Optional[EventRecordsFilter] = None,\n limit: Optional[int] = None,\n ascending: bool = False,\n ) -> Iterable[EventLogRecord]:\n """Returns a list of (record_id, record)."""\n check.opt_inst_param(event_records_filter, "event_records_filter", 
EventRecordsFilter)\n check.opt_int_param(limit, "limit")\n check.bool_param(ascending, "ascending")\n\n query = db.select([SqlEventLogStorageTable.c.id, SqlEventLogStorageTable.c.event])\n if event_records_filter and event_records_filter.asset_key:\n asset_details = next(iter(self._get_assets_details([event_records_filter.asset_key])))\n else:\n asset_details = None\n\n query = self._apply_filter_to_query(\n query=query,\n event_records_filter=event_records_filter,\n asset_details=asset_details,\n )\n if limit:\n query = query.limit(limit)\n\n if ascending:\n query = query.order_by(SqlEventLogStorageTable.c.id.asc())\n else:\n query = query.order_by(SqlEventLogStorageTable.c.id.desc())\n\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n event_records = []\n for row_id, json_str in results:\n try:\n event_record = deserialize_json_to_dagster_namedtuple(json_str)\n if not isinstance(event_record, EventLogEntry):\n logging.warning(\n "Could not resolve event record as EventLogEntry for id `{}`.".format(\n row_id\n )\n )\n continue\n else:\n event_records.append(\n EventLogRecord(storage_id=row_id, event_log_entry=event_record)\n )\n except seven.JSONDecodeError:\n logging.warning("Could not parse event record id `{}`.".format(row_id))\n\n return event_records\n\n def has_asset_key(self, asset_key: AssetKey) -> bool:\n check.inst_param(asset_key, "asset_key", AssetKey)\n rows = self._fetch_asset_rows(asset_keys=[asset_key])\n return bool(rows)\n\n def all_asset_keys(self):\n rows = self._fetch_asset_rows()\n asset_keys = [AssetKey.from_db_string(row[0]) for row in sorted(rows, key=lambda x: x[0])]\n return [asset_key for asset_key in asset_keys if asset_key]\n\n def get_asset_keys(\n self,\n prefix: Optional[List[str]] = None,\n limit: Optional[int] = None,\n cursor: Optional[str] = None,\n ) -> Iterable[AssetKey]:\n rows = self._fetch_asset_rows(prefix=prefix, limit=limit, cursor=cursor)\n asset_keys = [AssetKey.from_db_string(row[0]) for row in sorted(rows, key=lambda x: x[0])]\n return [asset_key for asset_key in asset_keys if asset_key]\n\n def get_latest_materialization_events(\n self, asset_keys: Sequence[AssetKey]\n ) -> Mapping[AssetKey, Optional[EventLogEntry]]:\n check.list_param(asset_keys, "asset_keys", AssetKey)\n rows = self._fetch_asset_rows(asset_keys=asset_keys)\n to_backcompat_fetch = set()\n results: Dict[AssetKey, Optional[EventLogEntry]] = {}\n for row in rows:\n asset_key = AssetKey.from_db_string(row[0])\n if not asset_key:\n continue\n event_or_materialization = (\n deserialize_json_to_dagster_namedtuple(row[1]) if row[1] else None\n )\n if isinstance(event_or_materialization, EventLogEntry):\n results[asset_key] = event_or_materialization\n else:\n to_backcompat_fetch.add(asset_key)\n\n if to_backcompat_fetch:\n latest_event_subquery = (\n db.select(\n [\n SqlEventLogStorageTable.c.asset_key,\n db.func.max(SqlEventLogStorageTable.c.timestamp).label("timestamp"),\n ]\n )\n .where(\n db.and_(\n SqlEventLogStorageTable.c.asset_key.in_(\n [asset_key.to_string() for asset_key in to_backcompat_fetch]\n ),\n SqlEventLogStorageTable.c.dagster_event_type\n == DagsterEventType.ASSET_MATERIALIZATION.value,\n )\n )\n .group_by(SqlEventLogStorageTable.c.asset_key)\n .alias("latest_materializations")\n )\n backcompat_query = db.select(\n [SqlEventLogStorageTable.c.asset_key, SqlEventLogStorageTable.c.event]\n ).select_from(\n latest_event_subquery.join(\n SqlEventLogStorageTable,\n db.and_(\n SqlEventLogStorageTable.c.asset_key == 
latest_event_subquery.c.asset_key,\n SqlEventLogStorageTable.c.timestamp == latest_event_subquery.c.timestamp,\n ),\n )\n )\n with self.index_connection() as conn:\n event_rows = conn.execute(backcompat_query).fetchall()\n\n for row in event_rows:\n asset_key = AssetKey.from_db_string(row[0])\n if asset_key:\n results[asset_key] = cast(\n EventLogEntry, deserialize_json_to_dagster_namedtuple(row[1])\n )\n\n return results\n\n def _fetch_asset_rows(self, asset_keys=None, prefix=None, limit=None, cursor=None):\n # fetches rows containing asset_key, last_materialization, and asset_details from the DB,\n # applying the filters specified in the arguments.\n #\n # Differs from _fetch_raw_asset_rows, in that it loops through to make sure enough rows are\n # returned to satisfy the limit.\n #\n # returns a list of rows where each row is a tuple of serialized asset_key, materialization,\n # and asset_details\n should_query = True\n current_cursor = cursor\n if self.has_secondary_index(ASSET_KEY_INDEX_COLS):\n # if we have migrated, we can limit using SQL\n fetch_limit = limit\n else:\n # if we haven't migrated, overfetch in case the first N results are wiped\n fetch_limit = max(limit, MIN_ASSET_ROWS) if limit else None\n result = []\n\n while should_query:\n rows, has_more, current_cursor = self._fetch_raw_asset_rows(\n asset_keys=asset_keys, prefix=prefix, limit=fetch_limit, cursor=current_cursor\n )\n result.extend(rows)\n should_query = bool(has_more) and bool(limit) and len(result) < cast(int, limit)\n\n is_partial_query = bool(asset_keys) or bool(prefix) or bool(limit) or bool(cursor)\n if not is_partial_query and self._can_mark_assets_as_migrated(rows):\n self.enable_secondary_index(ASSET_KEY_INDEX_COLS)\n\n return result[:limit] if limit else result\n\n def _fetch_raw_asset_rows(self, asset_keys=None, prefix=None, limit=None, cursor=None):\n # fetches rows containing asset_key, last_materialization, and asset_details from the DB,\n # applying the filters specified in the arguments. Does not guarantee that the number of\n # rows returned will match the limit specified. 
This helper function is used to fetch a\n # chunk of asset key rows, which may or may not be wiped.\n #\n # Returns a tuple of (rows, has_more, cursor), where each row is a tuple of serialized\n # asset_key, materialization, and asset_details\n\n columns = [\n AssetKeyTable.c.asset_key,\n AssetKeyTable.c.last_materialization,\n AssetKeyTable.c.asset_details,\n ]\n\n is_partial_query = bool(asset_keys) or bool(prefix) or bool(limit) or bool(cursor)\n if self.has_asset_key_index_cols() and not is_partial_query:\n # if the schema has been migrated, fetch the last_materialization_timestamp to see if\n # we can lazily migrate the data table\n columns.append(AssetKeyTable.c.last_materialization_timestamp)\n columns.append(AssetKeyTable.c.wipe_timestamp)\n\n query = db.select(columns).order_by(AssetKeyTable.c.asset_key.asc())\n query = self._apply_asset_filter_to_query(query, asset_keys, prefix, limit, cursor)\n\n if self.has_secondary_index(ASSET_KEY_INDEX_COLS):\n query = query.where(\n db.or_(\n AssetKeyTable.c.wipe_timestamp == None,\n AssetKeyTable.c.last_materialization_timestamp > AssetKeyTable.c.wipe_timestamp,\n )\n )\n with self.index_connection() as conn:\n rows = conn.execute(query).fetchall()\n\n return rows, False, None\n\n with self.index_connection() as conn:\n rows = conn.execute(query).fetchall()\n\n wiped_timestamps_by_asset_key = {}\n row_by_asset_key = OrderedDict()\n\n for row in rows:\n asset_key = AssetKey.from_db_string(row[0])\n if not asset_key:\n continue\n asset_details = AssetDetails.from_db_string(row[2])\n if not asset_details or not asset_details.last_wipe_timestamp:\n row_by_asset_key[asset_key] = row\n continue\n materialization_or_event = (\n deserialize_json_to_dagster_namedtuple(row[1]) if row[1] else None\n )\n if isinstance(materialization_or_event, EventLogEntry):\n if asset_details.last_wipe_timestamp > materialization_or_event.timestamp:\n # this asset has not been materialized since being wiped, skip\n continue\n else:\n # add the key\n row_by_asset_key[asset_key] = row\n else:\n row_by_asset_key[asset_key] = row\n wiped_timestamps_by_asset_key[asset_key] = asset_details.last_wipe_timestamp\n\n if wiped_timestamps_by_asset_key:\n materialization_times = self._fetch_backcompat_materialization_times(\n wiped_timestamps_by_asset_key.keys()\n )\n for asset_key, wiped_timestamp in wiped_timestamps_by_asset_key.items():\n materialization_time = materialization_times.get(asset_key)\n if not materialization_time or utc_datetime_from_naive(\n materialization_time\n ) < utc_datetime_from_timestamp(wiped_timestamp):\n # remove rows that have not been materialized since being wiped\n row_by_asset_key.pop(asset_key)\n\n has_more = limit and len(rows) == limit\n new_cursor = rows[-1][0] if rows else None\n\n return row_by_asset_key.values(), has_more, new_cursor\n\n def _fetch_backcompat_materialization_times(self, asset_keys):\n # fetches the latest materialization timestamp for the given asset_keys. 
Uses the (slower)\n # raw event log table.\n backcompat_query = (\n db.select(\n [\n SqlEventLogStorageTable.c.asset_key,\n db.func.max(SqlEventLogStorageTable.c.timestamp),\n ]\n )\n .where(\n SqlEventLogStorageTable.c.asset_key.in_(\n [asset_key.to_string() for asset_key in asset_keys]\n )\n )\n .group_by(SqlEventLogStorageTable.c.asset_key)\n .order_by(db.func.max(SqlEventLogStorageTable.c.timestamp).asc())\n )\n with self.index_connection() as conn:\n backcompat_rows = conn.execute(backcompat_query).fetchall()\n return {AssetKey.from_db_string(row[0]): row[1] for row in backcompat_rows}\n\n def _can_mark_assets_as_migrated(self, rows):\n if not self.has_asset_key_index_cols():\n return False\n\n if self.has_secondary_index(ASSET_KEY_INDEX_COLS):\n # we have already migrated\n return False\n\n for row in rows:\n if not _get_from_row(row, "last_materialization_timestamp"):\n return False\n\n if _get_from_row(row, "asset_details") and not _get_from_row(row, "wipe_timestamp"):\n return False\n\n return True\n\n def _apply_asset_filter_to_query(\n self,\n query,\n asset_keys=None,\n prefix=None,\n limit=None,\n cursor=None,\n ):\n if asset_keys:\n query = query.where(\n AssetKeyTable.c.asset_key.in_([asset_key.to_string() for asset_key in asset_keys])\n )\n\n if prefix:\n prefix_str = seven.dumps(prefix)[:-1]\n query = query.where(AssetKeyTable.c.asset_key.startswith(prefix_str))\n\n if cursor:\n query = query.where(AssetKeyTable.c.asset_key > cursor)\n\n if limit:\n query = query.limit(limit)\n return query\n\n def _get_assets_details(self, asset_keys: Sequence[AssetKey]):\n check.list_param(asset_keys, "asset_key", AssetKey)\n rows = None\n with self.index_connection() as conn:\n rows = conn.execute(\n db.select([AssetKeyTable.c.asset_key, AssetKeyTable.c.asset_details]).where(\n AssetKeyTable.c.asset_key.in_(\n [asset_key.to_string() for asset_key in asset_keys]\n ),\n )\n ).fetchall()\n\n asset_key_to_details = {\n row[0]: (deserialize_json_to_dagster_namedtuple(row[1]) if row[1] else None)\n for row in rows\n }\n\n # returns a list of the corresponding asset_details to provided asset_keys\n return [\n asset_key_to_details.get(asset_key.to_string(), None) for asset_key in asset_keys\n ]\n\n def _add_assets_wipe_filter_to_query(\n self, query, assets_details: Sequence[str], asset_keys: Sequence[AssetKey]\n ):\n check.invariant(\n len(assets_details) == len(asset_keys),\n "asset_details and asset_keys must be the same length",\n )\n for i in range(len(assets_details)):\n asset_key, asset_details = asset_keys[i], assets_details[i]\n if asset_details and asset_details.last_wipe_timestamp: # type: ignore[attr-defined]\n asset_key_in_row = db.or_(\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(),\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(legacy=True),\n )\n # If asset key is in row, keep the row if the timestamp > wipe timestamp, else remove the row.\n # If asset key is not in row, keep the row.\n query = query.where(\n db.or_(\n db.and_(\n asset_key_in_row,\n SqlEventLogStorageTable.c.timestamp\n > datetime.utcfromtimestamp(asset_details.last_wipe_timestamp), # type: ignore[attr-defined]\n ),\n db.not_(asset_key_in_row),\n )\n )\n\n return query\n\n def get_asset_events(\n self,\n asset_key,\n partitions=None,\n before_cursor=None,\n after_cursor=None,\n limit=None,\n ascending=False,\n include_cursor=False, # deprecated\n before_timestamp=None,\n cursor=None, # deprecated\n ):\n check.inst_param(asset_key, "asset_key", AssetKey)\n 
check.opt_list_param(partitions, "partitions", of_type=str)\n before_cursor, after_cursor = extract_asset_events_cursor(\n cursor, before_cursor, after_cursor, ascending\n )\n event_records = self.get_event_records(\n EventRecordsFilter(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n asset_key=asset_key,\n asset_partitions=partitions,\n before_cursor=before_cursor,\n after_cursor=after_cursor,\n before_timestamp=before_timestamp,\n ),\n limit=limit,\n ascending=ascending,\n )\n if include_cursor:\n return [tuple([record.storage_id, record.event_log_entry]) for record in event_records]\n else:\n return [record.event_log_entry for record in event_records]\n\n def get_asset_run_ids(self, asset_key):\n check.inst_param(asset_key, "asset_key", AssetKey)\n query = (\n db.select(\n [SqlEventLogStorageTable.c.run_id, db.func.max(SqlEventLogStorageTable.c.timestamp)]\n )\n .where(\n db.or_(\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(),\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(legacy=True),\n )\n )\n .group_by(\n SqlEventLogStorageTable.c.run_id,\n )\n .order_by(db.func.max(SqlEventLogStorageTable.c.timestamp).desc())\n )\n\n asset_keys = [asset_key]\n asset_details = self._get_assets_details(asset_keys)\n query = self._add_assets_wipe_filter_to_query(query, asset_details, asset_keys)\n\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n return [run_id for (run_id, _timestamp) in results]\n\n def _asset_materialization_from_json_column(self, json_str):\n if not json_str:\n return None\n\n # We switched to storing the entire event record of the last materialization instead of just\n # the AssetMaterialization object, so that we have access to metadata like timestamp,\n # pipeline, run_id, etc.\n #\n # This should make certain asset queries way more performant, without having to do extra\n # queries against the event log.\n #\n # This should be accompanied by a schema change in 0.12.0, renaming `last_materialization`\n # to `last_materialization_event`, for clarity. 
For now, we should do some back-compat.\n #\n # https://github.com/dagster-io/dagster/issues/3945\n\n event_or_materialization = deserialize_json_to_dagster_namedtuple(json_str)\n if isinstance(event_or_materialization, AssetMaterialization):\n return event_or_materialization\n\n if (\n not isinstance(event_or_materialization, EventLogEntry)\n or not event_or_materialization.is_dagster_event\n or not event_or_materialization.dagster_event.asset_key\n ):\n return None\n\n return event_or_materialization.dagster_event.step_materialization_data.materialization\n\n def wipe_asset(self, asset_key):\n check.inst_param(asset_key, "asset_key", AssetKey)\n\n wipe_timestamp = pendulum.now("UTC").timestamp()\n\n if self.has_asset_key_index_cols():\n with self.index_connection() as conn:\n conn.execute(\n AssetKeyTable.update() # pylint: disable=no-value-for-parameter\n .where(\n db.or_(\n AssetKeyTable.c.asset_key == asset_key.to_string(),\n AssetKeyTable.c.asset_key == asset_key.to_string(legacy=True),\n )\n )\n .values(\n asset_details=serialize_dagster_namedtuple(\n AssetDetails(last_wipe_timestamp=wipe_timestamp)\n ),\n wipe_timestamp=utc_datetime_from_timestamp(wipe_timestamp),\n )\n )\n\n else:\n with self.index_connection() as conn:\n conn.execute(\n AssetKeyTable.update() # pylint: disable=no-value-for-parameter\n .where(\n db.or_(\n AssetKeyTable.c.asset_key == asset_key.to_string(),\n AssetKeyTable.c.asset_key == asset_key.to_string(legacy=True),\n )\n )\n .values(\n asset_details=serialize_dagster_namedtuple(\n AssetDetails(last_wipe_timestamp=wipe_timestamp)\n ),\n )\n )\n\n def get_materialization_count_by_partition(\n self, asset_keys: Sequence[AssetKey]\n ) -> Mapping[AssetKey, Mapping[str, int]]:\n check.list_param(asset_keys, "asset_keys", AssetKey)\n\n query = (\n db.select(\n [\n SqlEventLogStorageTable.c.asset_key,\n SqlEventLogStorageTable.c.partition,\n db.func.count(SqlEventLogStorageTable.c.id),\n ]\n )\n .where(\n db.and_(\n db.or_(\n SqlEventLogStorageTable.c.asset_key.in_(\n [asset_key.to_string() for asset_key in asset_keys]\n ),\n SqlEventLogStorageTable.c.asset_key.in_(\n [asset_key.to_string(legacy=True) for asset_key in asset_keys]\n ),\n ),\n SqlEventLogStorageTable.c.partition != None,\n )\n )\n .group_by(SqlEventLogStorageTable.c.asset_key, SqlEventLogStorageTable.c.partition)\n )\n\n assets_details = self._get_assets_details(asset_keys)\n query = self._add_assets_wipe_filter_to_query(query, assets_details, asset_keys)\n\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n materialization_count_by_partition: Dict[AssetKey, Dict[str, int]] = {\n asset_key: {} for asset_key in asset_keys\n }\n for row in results:\n asset_key = AssetKey.from_db_string(row[0])\n if asset_key:\n materialization_count_by_partition[asset_key][row[1]] = row[2]\n\n return materialization_count_by_partition
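The asset query methods above can be exercised directly against any ``SqlEventLogStorage`` implementation. A minimal sketch, assuming ``storage`` is such an instance and the asset key is hypothetical:

.. code-block:: python

    from dagster.core.definitions.events import AssetKey

    asset_key = AssetKey(["my", "partitioned", "asset"])  # hypothetical key

    # per-partition materialization counts, already filtered for wiped assets
    counts = storage.get_materialization_count_by_partition([asset_key])
    for partition, count in counts[asset_key].items():
        print(partition, count)

    # run ids that materialized this asset, most recent first
    for run_id in storage.get_asset_run_ids(asset_key):
        print(run_id)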
\n\n\ndef _get_from_row(row, column):\n """utility function for extracting a column from a sqlalchemy row proxy, since '_asdict' is not\n supported in sqlalchemy 1.3"""\n if not row.has_key(column):\n return None\n return row[column]\n
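Stripped of the SQL, the wipe handling in this module reduces to a single rule: a materialization only counts if its timestamp is strictly greater than the asset's ``last_wipe_timestamp``. A self-contained sketch of that predicate (plain Python, illustrative only; the dict shapes here are not part of the storage class):

.. code-block:: python

    from typing import Mapping, Optional, Set

    def surviving_asset_keys(
        latest_materialization_ts: Mapping[str, Optional[float]],
        last_wipe_ts: Mapping[str, Optional[float]],
    ) -> Set[str]:
        """Keep an asset only if it was materialized after it was last wiped."""
        keep = set()
        for key, mat_ts in latest_materialization_ts.items():
            wipe_ts = last_wipe_ts.get(key)
            if wipe_ts is None:
                # never wiped: keep as long as it has any materialization
                if mat_ts is not None:
                    keep.add(key)
            elif mat_ts is not None and mat_ts > wipe_ts:
                keep.add(key)
        return keep

    assert surviving_asset_keys(
        {"a": 100.0, "b": 50.0, "c": None},
        {"a": 90.0, "b": 60.0, "c": None},
    ) == {"a"}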
", "current_page_name": "_modules/dagster/core/storage/event_log/sql_event_log", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.event_log.sql_event_log"}, "sqlite": {"consolidated_sqlite_event_log": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.event_log.sqlite.consolidated_sqlite_event_log

\nimport logging\nimport os\nfrom collections import defaultdict\nfrom contextlib import contextmanager\n\nfrom sqlalchemy.pool import NullPool\nfrom watchdog.events import PatternMatchingEventHandler\nfrom watchdog.observers import Observer\n\nfrom dagster import check\nfrom dagster.config.source import StringSource\nfrom dagster.core.storage.pipeline_run import PipelineRunStatus\nfrom dagster.core.storage.sql import (\n    check_alembic_revision,\n    create_engine,\n    get_alembic_config,\n    handle_schema_errors,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster.core.storage.sqlite import create_db_conn_string\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster.utils import mkdir_p\n\nfrom ..schema import SqlEventLogStorageMetadata\nfrom ..sql_event_log import SqlEventLogStorage\n\nSQLITE_EVENT_LOG_FILENAME = "event_log"\n\n\n
[docs]class ConsolidatedSqliteEventLogStorage(SqlEventLogStorage, ConfigurableClass):\n """SQLite-backed consolidated event log storage intended for test cases only.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagit`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n To explicitly specify the consolidated SQLite for event log storage, you can add a block such as\n the following to your ``dagster.yaml``:\n\n .. code-block:: YAML\n\n run_storage:\n module: dagster.core.storage.event_log\n class: ConsolidatedSqliteEventLogStorage\n config:\n base_dir: /path/to/dir\n\n The ``base_dir`` param tells the event log storage where on disk to store the database.\n """\n\n def __init__(self, base_dir, inst_data=None):\n self._base_dir = check.str_param(base_dir, "base_dir")\n self._conn_string = create_db_conn_string(base_dir, SQLITE_EVENT_LOG_FILENAME)\n self._secondary_index_cache = {}\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self._watchers = defaultdict(dict)\n self._obs = None\n\n if not os.path.exists(self.get_db_path()):\n self._init_db()\n\n super().__init__()\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {"base_dir": StringSource}\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return ConsolidatedSqliteEventLogStorage(inst_data=inst_data, **config_value)\n\n def _init_db(self):\n mkdir_p(self._base_dir)\n engine = create_engine(self._conn_string, poolclass=NullPool)\n alembic_config = get_alembic_config(__file__)\n\n should_mark_indexes = False\n with engine.connect() as connection:\n db_revision, head_revision = check_alembic_revision(alembic_config, connection)\n if not (db_revision and head_revision):\n SqlEventLogStorageMetadata.create_all(engine)\n engine.execute("PRAGMA journal_mode=WAL;")\n stamp_alembic_rev(alembic_config, connection)\n should_mark_indexes = True\n\n if should_mark_indexes:\n # mark all secondary indexes\n self.reindex_events()\n self.reindex_assets()\n\n @contextmanager\n def _connect(self):\n engine = create_engine(self._conn_string, poolclass=NullPool)\n conn = engine.connect()\n try:\n with handle_schema_errors(conn, get_alembic_config(__file__)):\n yield conn\n finally:\n conn.close()\n\n def run_connection(self, run_id):\n return self._connect()\n\n def index_connection(self):\n return self._connect()\n\n def get_db_path(self):\n return os.path.join(self._base_dir, "{}.db".format(SQLITE_EVENT_LOG_FILENAME))\n\n def upgrade(self):\n alembic_config = get_alembic_config(__file__)\n with self._connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n def has_secondary_index(self, name):\n if name not in self._secondary_index_cache:\n self._secondary_index_cache[name] = super(\n ConsolidatedSqliteEventLogStorage, self\n ).has_secondary_index(name)\n return self._secondary_index_cache[name]\n\n def enable_secondary_index(self, name):\n super(ConsolidatedSqliteEventLogStorage, self).enable_secondary_index(name)\n if name in self._secondary_index_cache:\n del self._secondary_index_cache[name]\n\n def watch(self, run_id, start_cursor, callback):\n if not self._obs:\n self._obs = Observer()\n self._obs.start()\n self._obs.schedule(\n ConsolidatedSqliteEventLogStorageWatchdog(self), self._base_dir, True\n )\n\n cursor = start_cursor if 
start_cursor is not None else -1\n self._watchers[run_id][callback] = cursor\n\n def on_modified(self):\n keys = [\n (run_id, callback)\n for run_id, callback_dict in self._watchers.items()\n for callback, _ in callback_dict.items()\n ]\n for run_id, callback in keys:\n cursor = self._watchers[run_id][callback]\n\n # fetch events\n events = self.get_logs_for_run(run_id, cursor)\n\n # update cursor\n self._watchers[run_id][callback] = cursor + len(events)\n\n for event in events:\n status = None\n try:\n status = callback(event)\n except Exception:\n logging.exception("Exception in callback for event watch on run %s.", run_id)\n\n if (\n status == PipelineRunStatus.SUCCESS\n or status == PipelineRunStatus.FAILURE\n or status == PipelineRunStatus.CANCELED\n ):\n self.end_watch(run_id, callback)\n\n def end_watch(self, run_id, handler):\n if run_id in self._watchers and handler in self._watchers[run_id]:\n del self._watchers[run_id][handler]\n\n def dispose(self):\n if self._obs:\n self._obs.stop()\n self._obs.join(timeout=15)
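As the docstring notes, this storage is intended for test cases; outside of ``dagster.yaml`` configuration it can also be constructed directly. A minimal sketch, assuming a throwaway base directory:

.. code-block:: python

    import tempfile

    from dagster.core.storage.event_log import ConsolidatedSqliteEventLogStorage

    with tempfile.TemporaryDirectory() as base_dir:
        storage = ConsolidatedSqliteEventLogStorage(base_dir=base_dir)
        try:
            # every run shares the single consolidated SQLite file under base_dir
            assert storage.get_db_path().endswith("event_log.db")
        finally:
            storage.dispose()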
\n\n\nclass ConsolidatedSqliteEventLogStorageWatchdog(PatternMatchingEventHandler):\n def __init__(self, event_log_storage, **kwargs):\n self._event_log_storage = check.inst_param(\n event_log_storage, "event_log_storage", ConsolidatedSqliteEventLogStorage\n )\n self._log_path = event_log_storage.get_db_path()\n super(ConsolidatedSqliteEventLogStorageWatchdog, self).__init__(\n patterns=[self._log_path], **kwargs\n )\n\n def on_modified(self, event):\n check.invariant(event.src_path == self._log_path)\n self._event_log_storage.on_modified()\n
", "current_page_name": "_modules/dagster/core/storage/event_log/sqlite/consolidated_sqlite_event_log", "customsidebar": null, "parents": [{"link": "../../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.event_log.sqlite.consolidated_sqlite_event_log"}, "sqlite_event_log": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.event_log.sqlite.sqlite_event_log

\nimport glob\nimport logging\nimport os\nimport sqlite3\nimport threading\nimport time\nimport warnings\nfrom collections import defaultdict\nfrom contextlib import contextmanager\nfrom typing import Iterable, Optional\n\nimport sqlalchemy as db\nfrom sqlalchemy.pool import NullPool\nfrom tqdm import tqdm\nfrom watchdog.events import PatternMatchingEventHandler\nfrom watchdog.observers import Observer\n\nfrom dagster import check, seven\nfrom dagster.config.source import StringSource\nfrom dagster.core.events import DagsterEventType\nfrom dagster.core.events.log import EventLogEntry\nfrom dagster.core.storage.event_log.base import EventLogRecord, EventRecordsFilter\nfrom dagster.core.storage.pipeline_run import PipelineRunStatus, RunsFilter\nfrom dagster.core.storage.sql import (\n    check_alembic_revision,\n    create_engine,\n    get_alembic_config,\n    handle_schema_errors,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster.core.storage.sqlite import create_db_conn_string\nfrom dagster.serdes import (\n    ConfigurableClass,\n    ConfigurableClassData,\n    deserialize_json_to_dagster_namedtuple,\n)\nfrom dagster.utils import mkdir_p\n\nfrom ..schema import SqlEventLogStorageMetadata, SqlEventLogStorageTable\nfrom ..sql_event_log import RunShardedEventsCursor, SqlEventLogStorage\n\nINDEX_SHARD_NAME = "index"\n\n\n
[docs]class SqliteEventLogStorage(SqlEventLogStorage, ConfigurableClass):\n """SQLite-backed event log storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagit`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n This is the default event log storage when none is specified in the ``dagster.yaml``.\n\n To explicitly specify SQLite for event log storage, you can add a block such as the following\n to your ``dagster.yaml``:\n\n .. code-block:: YAML\n\n event_log_storage:\n module: dagster.core.storage.event_log\n class: SqliteEventLogStorage\n config:\n base_dir: /path/to/dir\n\n The ``base_dir`` param tells the event log storage where on disk to store the databases. To\n improve concurrent performance, event logs are stored in a separate SQLite database for each\n run.\n """\n\n def __init__(self, base_dir, inst_data=None):\n """Note that idempotent initialization of the SQLite database is done on a per-run_id\n basis in the body of connect, since each run is stored in a separate database."""\n self._base_dir = os.path.abspath(check.str_param(base_dir, "base_dir"))\n mkdir_p(self._base_dir)\n\n self._obs = None\n\n self._watchers = defaultdict(dict)\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n # Used to ensure that each run ID attempts to initialize its DB the first time it connects,\n # ensuring that the database will be created if it doesn't exist\n self._initialized_dbs = set()\n\n # Ensure that multiple threads (like the event log watcher) interact safely with each other\n self._db_lock = threading.Lock()\n\n if not os.path.exists(self.path_for_shard(INDEX_SHARD_NAME)):\n conn_string = self.conn_string_for_shard(INDEX_SHARD_NAME)\n engine = create_engine(conn_string, poolclass=NullPool)\n self._initdb(engine)\n self.reindex_events()\n self.reindex_assets()\n\n super().__init__()\n\n def upgrade(self):\n all_run_ids = self.get_all_run_ids()\n print( # pylint: disable=print-call\n f"Updating event log storage for {len(all_run_ids)} runs on disk..."\n )\n alembic_config = get_alembic_config(__file__)\n if all_run_ids:\n for run_id in tqdm(all_run_ids):\n with self.run_connection(run_id) as conn:\n run_alembic_upgrade(alembic_config, conn, run_id)\n\n print("Updating event log storage for index db on disk...") # pylint: disable=print-call\n with self.index_connection() as conn:\n run_alembic_upgrade(alembic_config, conn, "index")\n\n self._initialized_dbs = set()\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {"base_dir": StringSource}\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return SqliteEventLogStorage(inst_data=inst_data, **config_value)\n\n def get_all_run_ids(self):\n all_filenames = glob.glob(os.path.join(self._base_dir, "*.db"))\n return [\n os.path.splitext(os.path.basename(filename))[0]\n for filename in all_filenames\n if os.path.splitext(os.path.basename(filename))[0] != INDEX_SHARD_NAME\n ]\n\n def path_for_shard(self, run_id):\n return os.path.join(self._base_dir, "{run_id}.db".format(run_id=run_id))\n\n def conn_string_for_shard(self, shard_name):\n check.str_param(shard_name, "shard_name")\n return create_db_conn_string(self._base_dir, shard_name)\n\n def _initdb(self, engine):\n alembic_config = get_alembic_config(__file__)\n\n retry_limit = 10\n\n 
while True:\n try:\n\n with engine.connect() as connection:\n db_revision, head_revision = check_alembic_revision(alembic_config, connection)\n\n if not (db_revision and head_revision):\n SqlEventLogStorageMetadata.create_all(engine)\n engine.execute("PRAGMA journal_mode=WAL;")\n stamp_alembic_rev(alembic_config, connection)\n\n break\n except (db.exc.DatabaseError, sqlite3.DatabaseError, sqlite3.OperationalError) as exc:\n # This is SQLite-specific handling for concurrency issues that can arise when\n # multiple processes (e.g. the dagit process and user code process) contend with\n # each other to init the db. When we hit the following errors, we know that another\n # process is on the case and we should retry.\n err_msg = str(exc)\n\n if not (\n "table asset_keys already exists" in err_msg\n or "table secondary_indexes already exists" in err_msg\n or "table event_logs already exists" in err_msg\n or "database is locked" in err_msg\n or "table alembic_version already exists" in err_msg\n or "UNIQUE constraint failed: alembic_version.version_num" in err_msg\n ):\n raise\n\n if retry_limit == 0:\n raise\n else:\n logging.info(\n "SqliteEventLogStorage._initdb: Encountered apparent concurrent init, "\n "retrying ({retry_limit} retries left). Exception: {str_exc}".format(\n retry_limit=retry_limit, str_exc=err_msg\n )\n )\n time.sleep(0.2)\n retry_limit -= 1\n\n @contextmanager\n def _connect(self, shard):\n with self._db_lock:\n check.str_param(shard, "shard")\n\n conn_string = self.conn_string_for_shard(shard)\n engine = create_engine(conn_string, poolclass=NullPool)\n\n if not shard in self._initialized_dbs:\n self._initdb(engine)\n self._initialized_dbs.add(shard)\n\n conn = engine.connect()\n\n try:\n with handle_schema_errors(conn, get_alembic_config(__file__)):\n yield conn\n finally:\n conn.close()\n engine.dispose()\n\n def run_connection(self, run_id=None):\n return self._connect(run_id)\n\n def index_connection(self):\n return self._connect(INDEX_SHARD_NAME)\n\n def store_event(self, event):\n """\n Overridden method to replicate asset events in a central assets.db sqlite shard, enabling\n cross-run asset queries.\n\n Args:\n event (EventLogEntry): The event to store.\n """\n check.inst_param(event, "event", EventLogEntry)\n insert_event_statement = self.prepare_insert_event(event)\n run_id = event.run_id\n\n with self.run_connection(run_id) as conn:\n conn.execute(insert_event_statement)\n\n if event.is_dagster_event and event.dagster_event.asset_key:\n check.invariant(\n event.dagster_event_type == DagsterEventType.ASSET_MATERIALIZATION\n or event.dagster_event_type == DagsterEventType.ASSET_OBSERVATION\n or event.dagster_event_type == DagsterEventType.ASSET_MATERIALIZATION_PLANNED,\n "Can only store asset materializations, materialization_planned, and observations in index database",\n )\n\n # mirror the event in the cross-run index database\n with self.index_connection() as conn:\n conn.execute(insert_event_statement)\n\n if (\n event.dagster_event.is_step_materialization\n or event.dagster_event.is_asset_observation\n ):\n self.store_asset(event)\n\n def get_event_records(\n self,\n event_records_filter: Optional[EventRecordsFilter] = None,\n limit: Optional[int] = None,\n ascending: bool = False,\n ) -> Iterable[EventLogRecord]:\n """Overridden method to enable cross-run event queries in sqlite.\n\n The record id in sqlite does not auto increment cross runs, so instead of fetching events\n after record id, we only fetch events whose runs updated after update_timestamp.\n 
"""\n check.opt_inst_param(event_records_filter, "event_records_filter", EventRecordsFilter)\n check.opt_int_param(limit, "limit")\n check.bool_param(ascending, "ascending")\n\n is_asset_query = event_records_filter and (\n event_records_filter.event_type == DagsterEventType.ASSET_MATERIALIZATION\n or event_records_filter.event_type == DagsterEventType.ASSET_OBSERVATION\n )\n if is_asset_query:\n # asset materializations and observations get mirrored into the index shard, so no\n # custom run shard-aware cursor logic needed\n return super(SqliteEventLogStorage, self).get_event_records(\n event_records_filter=event_records_filter, limit=limit, ascending=ascending\n )\n\n query = db.select([SqlEventLogStorageTable.c.id, SqlEventLogStorageTable.c.event])\n if event_records_filter and event_records_filter.asset_key:\n asset_details = next(iter(self._get_assets_details([event_records_filter.asset_key])))\n else:\n asset_details = None\n\n if not event_records_filter or not (\n isinstance(event_records_filter.after_cursor, RunShardedEventsCursor)\n ):\n warnings.warn(\n """\n Called `get_event_records` on a run-sharded event log storage with a query that\n is not run aware (e.g. not using a RunShardedEventsCursor). This likely has poor\n performance characteristics. Consider adding a RunShardedEventsCursor to your query\n or switching your instance configuration to use a non-run sharded event log storage\n (e.g. PostgresEventLogStorage, ConsolidatedSqliteEventLogStorage)\n """\n )\n\n query = self._apply_filter_to_query(\n query=query,\n event_records_filter=event_records_filter,\n asset_details=asset_details,\n apply_cursor_filters=False, # run-sharded cursor filters don't really make sense\n )\n if limit:\n query = query.limit(limit)\n if ascending:\n query = query.order_by(SqlEventLogStorageTable.c.timestamp.asc())\n else:\n query = query.order_by(SqlEventLogStorageTable.c.timestamp.desc())\n\n # workaround for the run-shard sqlite to enable cross-run queries: get a list of run_ids\n # whose events may qualify the query, and then open run_connection per run_id at a time.\n run_updated_after = (\n event_records_filter.after_cursor.run_updated_after\n if event_records_filter\n and isinstance(event_records_filter.after_cursor, RunShardedEventsCursor)\n else None\n )\n run_records = self._instance.get_run_records(\n filters=RunsFilter(updated_after=run_updated_after),\n order_by="update_timestamp",\n ascending=ascending,\n )\n\n event_records = []\n for run_record in run_records:\n run_id = run_record.pipeline_run.run_id\n with self.run_connection(run_id) as conn:\n results = conn.execute(query).fetchall()\n\n for row_id, json_str in results:\n try:\n event_record = deserialize_json_to_dagster_namedtuple(json_str)\n if not isinstance(event_record, EventLogEntry):\n logging.warning(\n "Could not resolve event record as EventLogEntry for id `{}`.".format(\n row_id\n )\n )\n continue\n else:\n event_records.append(\n EventLogRecord(storage_id=row_id, event_log_entry=event_record)\n )\n if limit and len(event_records) >= limit:\n break\n except seven.JSONDecodeError:\n logging.warning("Could not parse event record id `{}`.".format(row_id))\n\n if limit and len(event_records) >= limit:\n break\n\n return event_records[:limit]\n\n def delete_events(self, run_id):\n with self.run_connection(run_id) as conn:\n self.delete_events_for_run(conn, run_id)\n\n # delete the mirrored event in the cross-run index database\n with self.index_connection() as conn:\n self.delete_events_for_run(conn, run_id)\n\n def 
wipe(self):\n # should delete all the run-sharded dbs as well as the index db\n for filename in (\n glob.glob(os.path.join(self._base_dir, "*.db"))\n + glob.glob(os.path.join(self._base_dir, "*.db-wal"))\n + glob.glob(os.path.join(self._base_dir, "*.db-shm"))\n ):\n os.unlink(filename)\n\n self._initialized_dbs = set()\n\n def _delete_mirrored_events_for_asset_key(self, asset_key):\n with self.index_connection() as conn:\n conn.execute(\n SqlEventLogStorageTable.delete().where( # pylint: disable=no-value-for-parameter\n db.or_(\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(),\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(legacy=True),\n )\n )\n )\n\n def wipe_asset(self, asset_key):\n # default implementation will update the event_logs in the sharded dbs, and the asset_key\n # table in the asset shard, but will not remove the mirrored event_log events in the asset\n # shard\n super(SqliteEventLogStorage, self).wipe_asset(asset_key)\n self._delete_mirrored_events_for_asset_key(asset_key)\n\n def watch(self, run_id, start_cursor, callback):\n if not self._obs:\n self._obs = Observer()\n self._obs.start()\n\n watchdog = SqliteEventLogStorageWatchdog(self, run_id, callback, start_cursor)\n self._watchers[run_id][callback] = (\n watchdog,\n self._obs.schedule(watchdog, self._base_dir, True),\n )\n\n def end_watch(self, run_id, handler):\n if handler in self._watchers[run_id]:\n event_handler, watch = self._watchers[run_id][handler]\n self._obs.remove_handler_for_watch(event_handler, watch)\n del self._watchers[run_id][handler]\n\n def dispose(self):\n if self._obs:\n self._obs.stop()\n self._obs.join(timeout=15)
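Because asset events are mirrored into the index shard, asset-scoped queries do not need the run-sharded cursor described above. A sketch, assuming ``storage`` is a ``SqliteEventLogStorage`` instance and the asset key is hypothetical:

.. code-block:: python

    from dagster.core.definitions.events import AssetKey
    from dagster.core.events import DagsterEventType
    from dagster.core.storage.event_log.base import EventRecordsFilter

    records = storage.get_event_records(
        EventRecordsFilter(
            event_type=DagsterEventType.ASSET_MATERIALIZATION,
            asset_key=AssetKey("my_asset"),
        ),
        limit=10,
        ascending=False,
    )
    for record in records:
        print(record.storage_id, record.event_log_entry.timestamp)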
\n\n\nclass SqliteEventLogStorageWatchdog(PatternMatchingEventHandler):\n def __init__(self, event_log_storage, run_id, callback, start_cursor, **kwargs):\n self._event_log_storage = check.inst_param(\n event_log_storage, "event_log_storage", SqliteEventLogStorage\n )\n self._run_id = check.str_param(run_id, "run_id")\n self._cb = check.callable_param(callback, "callback")\n self._log_path = event_log_storage.path_for_shard(run_id)\n self._cursor = start_cursor if start_cursor is not None else -1\n super(SqliteEventLogStorageWatchdog, self).__init__(patterns=[self._log_path], **kwargs)\n\n def _process_log(self):\n events = self._event_log_storage.get_logs_for_run(self._run_id, self._cursor)\n self._cursor += len(events)\n for event in events:\n status = None\n try:\n status = self._cb(event)\n except Exception:\n logging.exception("Exception in callback for event watch on run %s.", self._run_id)\n\n if (\n status == PipelineRunStatus.SUCCESS\n or status == PipelineRunStatus.FAILURE\n or status == PipelineRunStatus.CANCELED\n ):\n self._event_log_storage.end_watch(self._run_id, self._cb)\n\n def on_modified(self, event):\n check.invariant(event.src_path == self._log_path)\n self._process_log()\n
", "current_page_name": "_modules/dagster/core/storage/event_log/sqlite/sqlite_event_log", "customsidebar": null, "parents": [{"link": "../../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.event_log.sqlite.sqlite_event_log"}}}, "file_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.file_manager

\nimport io\nimport os\nimport shutil\nimport uuid\nfrom abc import ABC, abstractmethod\nfrom contextlib import contextmanager\nfrom typing import BinaryIO, Optional, TextIO, Union\n\nfrom dagster import check\nfrom dagster.config import Field\nfrom dagster.config.source import StringSource\nfrom dagster.core.definitions.resource_definition import resource\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.types.decorator import usable_as_dagster_type\nfrom dagster.utils import mkdir_p\n\nfrom .temp_file_manager import TempfileManager\n\n\n# pylint: disable=no-init\n
[docs]@usable_as_dagster_type\nclass FileHandle(ABC):\n """A reference to a file as manipulated by a FileManager\n\n Subclasses may handle files that are resident on the local file system, in an object store, or\n in any arbitrary place where a file can be stored.\n\n This exists to handle the very common case where you wish to write a computation that reads,\n transforms, and writes files, but where you also want the same code to work in local development\n as well as on a cluster where the files will be stored in a globally available object store\n such as S3.\n """\n\n @property\n @abstractmethod\n def path_desc(self) -> str:\n """A representation of the file path for display purposes only."""\n raise NotImplementedError()
\n\n\n
[docs]@usable_as_dagster_type\nclass LocalFileHandle(FileHandle):\n """A reference to a file on a local filesystem."""\n\n def __init__(self, path: str):\n self._path = check.str_param(path, "path")\n\n @property\n def path(self) -> str:\n """The file's path."""\n return self._path\n\n @property\n def path_desc(self) -> str:\n """A representation of the file path for display purposes only."""\n return self._path
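A ``LocalFileHandle`` is nothing more than a typed wrapper around a local path; a tiny illustration (path hypothetical):

.. code-block:: python

    handle = LocalFileHandle("/tmp/example/output.csv")
    assert handle.path == "/tmp/example/output.csv"
    assert handle.path_desc == handle.path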
\n\n\n
[docs]class FileManager(ABC): # pylint: disable=no-init\n """Base class for all file managers in dagster.\n\n The file manager is an interface that can be implemented by resources to provide abstract\n access to a file system such as local disk, S3, or other cloud storage.\n\n For examples of usage, see the documentation of the concrete file manager implementations.\n """\n\n
[docs] @abstractmethod\n def copy_handle_to_local_temp(self, file_handle: FileHandle) -> str:\n """Copy a file represented by a file handle to a temp file.\n\n In an implementation built around an object store such as S3, this method would be expected\n to download the file from S3 to local filesystem in a location assigned by the standard\n library's :py:mod:`python:tempfile` module.\n\n Temp files returned by this method are *not* guaranteed to be reusable across solid\n boundaries. For files that must be available across solid boundaries, use the\n :py:meth:`~dagster.core.storage.file_manager.FileManager.read`,\n :py:meth:`~dagster.core.storage.file_manager.FileManager.read_data`,\n :py:meth:`~dagster.core.storage.file_manager.FileManager.write`, and\n :py:meth:`~dagster.core.storage.file_manager.FileManager.write_data` methods.\n\n Args:\n file_handle (FileHandle): The handle to the file to make available as a local temp file.\n\n Returns:\n str: Path to the local temp file.\n """\n raise NotImplementedError()
\n\n
[docs] @abstractmethod\n def delete_local_temp(self):\n """Delete all local temporary files created by previous calls to\n :py:meth:`~dagster.core.storage.file_manager.FileManager.copy_handle_to_local_temp`.\n\n Should typically only be called by framework implementors.\n """\n raise NotImplementedError()
\n\n
[docs] @abstractmethod\n def read(self, file_handle: FileHandle, mode: str = "rb") -> Union[TextIO, BinaryIO]:\n """Return a file-like stream for the file handle.\n\n This may incur an expensive network call for file managers backed by object stores\n such as S3.\n\n Args:\n file_handle (FileHandle): The file handle to make available as a stream.\n mode (str): The mode in which to open the file. Default: ``"rb"``.\n\n Returns:\n Union[TextIO, BinaryIO]: A file-like stream.\n """\n raise NotImplementedError()
\n\n
[docs] @abstractmethod\n def read_data(self, file_handle: FileHandle) -> bytes:\n """Return the bytes for a given file handle. This may incur an expensive network\n call for file managers backed by object stores such as s3.\n\n Args:\n file_handle (FileHandle): The file handle for which to return bytes.\n\n Returns:\n bytes: Bytes for a given file handle.\n """\n raise NotImplementedError()
\n\n
[docs] @abstractmethod\n def write(\n self, file_obj: Union[TextIO, BinaryIO], mode: str = "wb", ext: Optional[str] = None\n ) -> FileHandle:\n """Write the bytes contained within the given file object into the file manager.\n\n Args:\n file_obj (Union[TextIO, BinaryIO]): A file-like object.\n mode (str): The mode in which to write the file into the file manager.\n Default: ``"wb"``.\n ext (Optional[str]): For file managers that support file extensions, the extension with\n which to write the file. Default: ``None``.\n\n Returns:\n FileHandle: A handle to the newly created file.\n """\n raise NotImplementedError()
\n\n
[docs] @abstractmethod\n def write_data(self, data: bytes, ext: Optional[str] = None) -> FileHandle:\n """Write raw bytes into the file manager.\n\n Args:\n data (bytes): The bytes to write into the file manager.\n ext (Optional[str]): For file managers that support file extensions, the extension with\n which to write the file. Default: ``None``.\n\n Returns:\n FileHandle: A handle to the newly created file.\n """\n raise NotImplementedError()
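To make the abstract surface above concrete, here is a deliberately small, illustrative in-memory implementation. It is not a file manager that ships with dagster; the names are hypothetical and the sketch assumes only the abstract methods defined above.

.. code-block:: python

    import io
    import os
    import tempfile
    import uuid
    from typing import BinaryIO, Optional, TextIO, Union

    class InMemoryFileHandle(FileHandle):
        def __init__(self, key: str):
            self._key = key

        @property
        def path_desc(self) -> str:
            return f"memory://{self._key}"

    class InMemoryFileManager(FileManager):
        def __init__(self):
            self._blobs = {}       # path_desc -> bytes
            self._temp_paths = []  # local temp files we created

        def copy_handle_to_local_temp(self, file_handle: FileHandle) -> str:
            tmp = tempfile.NamedTemporaryFile(delete=False)
            tmp.write(self._blobs[file_handle.path_desc])
            tmp.close()
            self._temp_paths.append(tmp.name)
            return tmp.name

        def delete_local_temp(self):
            for path in self._temp_paths:
                os.unlink(path)
            self._temp_paths = []

        def read(self, file_handle: FileHandle, mode: str = "rb") -> Union[TextIO, BinaryIO]:
            data = self._blobs[file_handle.path_desc]
            return io.BytesIO(data) if "b" in mode else io.StringIO(data.decode("utf-8"))

        def read_data(self, file_handle: FileHandle) -> bytes:
            return self._blobs[file_handle.path_desc]

        def write(self, file_obj, mode: str = "wb", ext: Optional[str] = None) -> FileHandle:
            payload = file_obj.read()
            if not isinstance(payload, bytes):
                payload = payload.encode("utf-8")
            return self.write_data(payload, ext=ext)

        def write_data(self, data: bytes, ext: Optional[str] = None) -> FileHandle:
            key = str(uuid.uuid4()) + (("." + ext) if ext is not None else "")
            handle = InMemoryFileHandle(key)
            self._blobs[handle.path_desc] = data
            return handle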
\n\n\n
[docs]@resource(config_schema={"base_dir": Field(StringSource, is_required=False)})\ndef local_file_manager(init_context):\n """FileManager that provides abstract access to a local filesystem.\n\n By default, files will be stored in `<local_artifact_storage>/storage/file_manager` where\n `<local_artifact_storage>` can be configured in the ``dagster.yaml`` file in ``$DAGSTER_HOME``.\n\n Implements the :py:class:`~dagster.core.storage.file_manager.FileManager` API.\n\n Examples:\n\n .. code-block:: python\n\n import tempfile\n\n from dagster import ModeDefinition, local_file_manager, pipeline, solid\n\n\n @solid(required_resource_keys={"file_manager"})\n def write_files(context):\n fh_1 = context.resources.file_manager.write_data(b"foo")\n\n with tempfile.NamedTemporaryFile("w+") as fd:\n fd.write("bar")\n fd.seek(0)\n fh_2 = context.resources.file_manager.write(fd, mode="w", ext=".txt")\n\n return (fh_1, fh_2)\n\n\n @solid(required_resource_keys={"file_manager"})\n def read_files(context, file_handles):\n fh_1, fh_2 = file_handles\n assert context.resources.file_manager.read_data(fh_2) == b"bar"\n fd = context.resources.file_manager.read(fh_2, mode="r")\n assert fd.read() == "bar"\n fd.close()\n\n\n @pipeline(mode_defs=[ModeDefinition(resource_defs={"file_manager": local_file_manager})])\n def files_pipeline():\n read_files(write_files())\n\n Or to specify the file directory:\n\n .. code-block:: python\n\n @pipeline(\n mode_defs=[\n ModeDefinition(\n resource_defs={\n "file_manager": local_file_manager.configured({"base_dir": "/my/base/dir"})\n }\n )\n ]\n )\n def files_pipeline():\n read_files(write_files())\n\n """\n\n return LocalFileManager(\n base_dir=init_context.resource_config.get(\n "base_dir", os.path.join(init_context.instance.storage_directory(), "file_manager")\n )\n )
\n\n\ndef check_file_like_obj(obj):\n check.invariant(obj and hasattr(obj, "read") and hasattr(obj, "write"))\n\n\nclass LocalFileManager(FileManager):\n def __init__(self, base_dir):\n self.base_dir = base_dir\n self._base_dir_ensured = False\n self._temp_file_manager = TempfileManager()\n\n @staticmethod\n def for_instance(instance, run_id):\n check.inst_param(instance, "instance", DagsterInstance)\n return LocalFileManager(instance.file_manager_directory(run_id))\n\n def ensure_base_dir_exists(self):\n if self._base_dir_ensured:\n return\n\n mkdir_p(self.base_dir)\n\n self._base_dir_ensured = True\n\n def copy_handle_to_local_temp(self, file_handle):\n check.inst_param(file_handle, "file_handle", FileHandle)\n with self.read(file_handle, "rb") as handle_obj:\n temp_file_obj = self._temp_file_manager.tempfile()\n temp_file_obj.write(handle_obj.read())\n temp_name = temp_file_obj.name\n temp_file_obj.close()\n return temp_name\n\n @contextmanager\n def read(self, file_handle, mode="rb"):\n check.inst_param(file_handle, "file_handle", LocalFileHandle)\n check.str_param(mode, "mode")\n check.param_invariant(mode in {"r", "rb"}, "mode")\n\n with open(file_handle.path, mode) as file_obj:\n yield file_obj\n\n def read_data(self, file_handle):\n with self.read(file_handle, mode="rb") as file_obj:\n return file_obj.read()\n\n def write_data(self, data, ext=None):\n check.inst_param(data, "data", bytes)\n return self.write(io.BytesIO(data), mode="wb", ext=ext)\n\n def write(self, file_obj, mode="wb", ext=None):\n check_file_like_obj(file_obj)\n check.opt_str_param(ext, "ext")\n\n self.ensure_base_dir_exists()\n\n dest_file_path = os.path.join(\n self.base_dir, str(uuid.uuid4()) + (("." + ext) if ext is not None else "")\n )\n\n with open(dest_file_path, mode) as dest_file_obj:\n shutil.copyfileobj(file_obj, dest_file_obj)\n return LocalFileHandle(dest_file_path)\n\n def delete_local_temp(self):\n self._temp_file_manager.close()\n
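``LocalFileManager`` can also be exercised directly, outside of a pipeline or job; in real usage it is obtained through the ``local_file_manager`` resource shown above. A sketch with a temporary base directory (hypothetical):

.. code-block:: python

    import tempfile

    with tempfile.TemporaryDirectory() as base_dir:
        manager = LocalFileManager(base_dir=base_dir)
        handle = manager.write_data(b"hello")
        assert manager.read_data(handle) == b"hello"
        local_path = manager.copy_handle_to_local_temp(handle)
        manager.delete_local_temp()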
", "current_page_name": "_modules/dagster/core/storage/file_manager", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.file_manager"}, "fs_asset_io_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.fs_asset_io_manager

\nimport os\n\nfrom dagster.config import Field\nfrom dagster.config.source import StringSource\nfrom dagster.core.storage.io_manager import io_manager\n\nfrom .fs_io_manager import PickledObjectFilesystemIOManager\n\n\n
[docs]@io_manager(config_schema={"base_dir": Field(StringSource, is_required=False)})\ndef fs_asset_io_manager(init_context):\n """IO manager that stores values on the local filesystem, serializing them with pickle.\n\n Each asset is assigned to a single filesystem path, at "<base_dir>/<asset_key>". If the asset\n key has multiple components, the final component is used as the name of the file, and the\n preceding components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n\n If not provided via configuration, the base dir is the local_artifact_storage in your\n dagster.yaml file. That will be a temporary directory if not explicitly set.\n\n So, with a base directory of "/my/base/path", an asset with key\n `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory\n with path "/my/base/path/one/two/".\n\n Example usage:\n\n 1. Specify a collection-level IO manager using the reserved resource key ``"io_manager"``,\n which will set the given IO manager on all assets in the collection.\n\n .. code-block:: python\n\n from dagster import AssetGroup, asset, fs_asset_io_manager\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return asset1[:5]\n\n asset_group = AssetGroup(\n [asset1, asset2],\n resource_defs={\n "io_manager": fs_asset_io_manager.configured({"base_dir": "/my/base/path"})\n },\n )\n\n 2. Specify IO manager on the asset, which allows the user to set different IO managers on\n different assets.\n\n .. code-block:: python\n\n from dagster import AssetGroup, asset, fs_asset_io_manager\n\n @asset(io_manager_key="my_io_manager")\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return asset1[:5]\n\n asset_group = AssetGroup(\n [asset1, asset2],\n resource_defs={\n "my_io_manager": fs_asset_io_manager.configured({"base_dir": "/my/base/path"})\n },\n )\n """\n base_dir = init_context.resource_config.get(\n "base_dir", init_context.instance.storage_directory()\n )\n\n return AssetPickledObjectFilesystemIOManager(base_dir=base_dir)
\n\n\nclass AssetPickledObjectFilesystemIOManager(PickledObjectFilesystemIOManager):\n def _get_path(self, context):\n return os.path.join(self.base_dir, *context.asset_key.path)\n
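The path rule described in the docstring, in isolation: the final asset key component becomes the filename, and the preceding components become parent directories. A minimal sketch mirroring the docstring example (POSIX path separators assumed):

.. code-block:: python

    import os

    def asset_path(base_dir: str, asset_key_path: list) -> str:
        return os.path.join(base_dir, *asset_key_path)

    assert asset_path("/my/base/path", ["one", "two", "three"]) == "/my/base/path/one/two/three"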
", "current_page_name": "_modules/dagster/core/storage/fs_asset_io_manager", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.fs_asset_io_manager"}, "fs_io_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.fs_io_manager

\nimport os\nimport pickle\n\nfrom dagster import check\nfrom dagster.config import Field\nfrom dagster.config.source import StringSource\nfrom dagster.core.definitions.events import AssetKey, AssetMaterialization\nfrom dagster.core.definitions.metadata import MetadataEntry, MetadataValue\nfrom dagster.core.errors import DagsterInvariantViolationError\nfrom dagster.core.execution.context.input import InputContext\nfrom dagster.core.execution.context.output import OutputContext\nfrom dagster.core.storage.io_manager import IOManager, io_manager\nfrom dagster.core.storage.memoizable_io_manager import MemoizableIOManager\nfrom dagster.utils import PICKLE_PROTOCOL, mkdir_p\nfrom dagster.utils.backcompat import experimental\n\n\n
[docs]@io_manager(config_schema={"base_dir": Field(StringSource, is_required=False)})\ndef fs_io_manager(init_context):\n """Built-in filesystem IO manager that stores and retrieves values using pickling.\n\n Allows users to specify a base directory where all the step outputs will be stored. By\n default, step outputs will be stored in the directory specified by local_artifact_storage in\n your dagster.yaml file (which will be a temporary directory if not explicitly set).\n\n Serializes and deserializes output values using pickling and automatically constructs\n the filepaths for the assets.\n\n Example usage:\n\n 1. Specify a job-level IO manager using the reserved resource key ``"io_manager"``,\n which will set the given IO manager on all ops in a job.\n\n .. code-block:: python\n\n from dagster import fs_io_manager, job, op\n\n @op\n def op_a():\n # create df ...\n return df\n\n @op\n def op_b(df):\n return df[:5]\n\n @job(\n resource_defs={\n "io_manager": fs_io_manager.configured({"base_dir": "/my/base/path"})\n }\n )\n def my_job():\n op_b(op_a())\n\n\n 2. Specify IO manager on :py:class:`Out`, which allows the user to set different IO managers on\n different step outputs.\n\n .. code-block:: python\n\n from dagster import fs_io_manager, job, op, Out\n\n @op(out=Out(io_manager_key="my_io_manager"))\n def op_a():\n # create df ...\n return df\n\n @op\n def op_b(df):\n return df[:5]\n\n @job(resource_defs={"my_io_manager": fs_io_manager})\n def my_job():\n op_b(op_a())\n\n """\n base_dir = init_context.resource_config.get(\n "base_dir", init_context.instance.storage_directory()\n )\n\n return PickledObjectFilesystemIOManager(base_dir=base_dir)
\n\n\nclass PickledObjectFilesystemIOManager(MemoizableIOManager):\n """Built-in filesystem IO manager that stores and retrieves values using pickling.\n\n Args:\n base_dir (Optional[str]): base directory where all the step outputs which use this object\n manager will be stored in.\n """\n\n def __init__(self, base_dir=None):\n self.base_dir = check.opt_str_param(base_dir, "base_dir")\n self.write_mode = "wb"\n self.read_mode = "rb"\n\n def _get_path(self, context):\n """Automatically construct filepath."""\n keys = context.get_output_identifier()\n\n return os.path.join(self.base_dir, *keys)\n\n def has_output(self, context):\n filepath = self._get_path(context)\n\n return os.path.exists(filepath)\n\n def handle_output(self, context, obj):\n """Pickle the data and store the object to a file.\n\n This method omits the AssetMaterialization event so assets generated by it won't be tracked\n by the Asset Catalog.\n """\n check.inst_param(context, "context", OutputContext)\n\n filepath = self._get_path(context)\n context.log.debug(f"Writing file at: {filepath}")\n\n # Ensure path exists\n mkdir_p(os.path.dirname(filepath))\n\n with open(filepath, self.write_mode) as write_obj:\n try:\n pickle.dump(obj, write_obj, PICKLE_PROTOCOL)\n except (AttributeError, RecursionError, ImportError, pickle.PicklingError) as e:\n executor = context.step_context.pipeline_def.mode_definitions[0].executor_defs[0]\n\n if isinstance(e, RecursionError):\n # if obj can't be pickled because of RecursionError then __str__() will also\n # throw a RecursionError\n obj_repr = f"{obj.__class__} exceeds recursion limit and"\n else:\n obj_repr = obj.__str__()\n\n raise DagsterInvariantViolationError(\n f"Object {obj_repr} is not picklable. You are currently using the "\n f"fs_io_manager and the {executor.name}. You will need to use a different "\n "io manager to continue using this output. 
For example, you can use the "\n "mem_io_manager with the in_process_executor.\\n"\n "For more information on io managers, visit "\n "https://docs.dagster.io/concepts/io-management/io-managers \\n"\n "For more information on executors, visit "\n "https://docs.dagster.io/deployment/executors#overview"\n )\n\n def load_input(self, context):\n """Unpickle the file and load it into a data object."""\n check.inst_param(context, "context", InputContext)\n\n filepath = self._get_path(context.upstream_output)\n context.log.debug(f"Loading file from: {filepath}")\n\n with open(filepath, self.read_mode) as read_obj:\n return pickle.load(read_obj)\n\n\nclass CustomPathPickledObjectFilesystemIOManager(IOManager):\n """Built-in filesystem IO manager that stores and retrieves values using pickling and\n allows users to specify a file path for outputs.\n\n Args:\n base_dir (Optional[str]): base directory where all the step outputs which use this object\n manager will be stored.\n """\n\n def __init__(self, base_dir=None):\n self.base_dir = check.opt_str_param(base_dir, "base_dir")\n self.write_mode = "wb"\n self.read_mode = "rb"\n\n def _get_path(self, path):\n return os.path.join(self.base_dir, path)\n\n def handle_output(self, context, obj):\n """Pickle the data and store the object to a custom file path.\n\n This method emits an AssetMaterialization event so the assets will be tracked by the\n Asset Catalog.\n """\n check.inst_param(context, "context", OutputContext)\n metadata = context.metadata\n path = check.str_param(metadata.get("path"), "metadata.path")\n\n filepath = self._get_path(path)\n\n # Ensure path exists\n mkdir_p(os.path.dirname(filepath))\n context.log.debug(f"Writing file at: {filepath}")\n\n with open(filepath, self.write_mode) as write_obj:\n pickle.dump(obj, write_obj, PICKLE_PROTOCOL)\n\n return AssetMaterialization(\n asset_key=AssetKey([context.pipeline_name, context.step_key, context.name]),\n metadata_entries=[\n MetadataEntry("path", value=MetadataValue.path(os.path.abspath(filepath)))\n ],\n )\n\n def load_input(self, context):\n """Unpickle the file from a given file path and load it into a data object."""\n check.inst_param(context, "context", InputContext)\n metadata = context.upstream_output.metadata\n path = check.str_param(metadata.get("path"), "metadata.path")\n filepath = self._get_path(path)\n context.log.debug(f"Loading file from: {filepath}")\n\n with open(filepath, self.read_mode) as read_obj:\n return pickle.load(read_obj)\n\n\n
[docs]@io_manager(config_schema={"base_dir": Field(StringSource, is_required=True)})\n@experimental\ndef custom_path_fs_io_manager(init_context):\n """Built-in IO manager that allows users to customize the output file path per output definition.\n\n It requires users to specify a base directory where all the step outputs will be stored. It\n serializes and deserializes output values (assets) using pickling and stores the pickled object\n in the user-provided file paths.\n\n Example usage:\n\n .. code-block:: python\n\n from dagster import Out, custom_path_fs_io_manager, job, op\n\n @op(out=Out(metadata={"path": "path/to/sample_output"}))\n def sample_data(df):\n return df[:5]\n\n my_custom_path_fs_io_manager = custom_path_fs_io_manager.configured(\n {"base_dir": "path/to/basedir"}\n )\n\n @job(resource_defs={"io_manager": my_custom_path_fs_io_manager})\n def my_job():\n sample_data()\n\n """\n\n return CustomPathPickledObjectFilesystemIOManager(\n base_dir=init_context.resource_config.get("base_dir")\n )
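The on-disk contract of the custom-path manager, in isolation: each output is a pickle written at ``<base_dir>/<the "path" metadata on the Out>``. A sketch of reading one back by hand after a run of ``my_job`` above (paths mirror the hypothetical values used in the docstring example):

.. code-block:: python

    import os
    import pickle

    output_file = os.path.join("path/to/basedir", "path/to/sample_output")

    with open(output_file, "rb") as f:
        sample_output = pickle.load(f)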
\n
", "current_page_name": "_modules/dagster/core/storage/fs_io_manager", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.fs_io_manager"}, "io_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.io_manager

\nfrom abc import abstractmethod\nfrom functools import update_wrapper\nfrom typing import Optional, Set\n\nfrom dagster import check\nfrom dagster.core.definitions.config import is_callable_valid_config_arg\nfrom dagster.core.definitions.definition_config_schema import (\n    convert_user_facing_definition_config_schema,\n)\nfrom dagster.core.definitions.events import AssetKey\nfrom dagster.core.definitions.resource_definition import ResourceDefinition\nfrom dagster.core.storage.input_manager import InputManager\nfrom dagster.core.storage.output_manager import IOutputManagerDefinition, OutputManager\nfrom dagster.core.storage.root_input_manager import IInputManagerDefinition\n\n\n
[docs]class IOManagerDefinition(ResourceDefinition, IInputManagerDefinition, IOutputManagerDefinition):\n """Definition of an IO manager resource.\n\n IOManagers are used to store op outputs and load them as inputs to downstream ops.\n\n An IOManagerDefinition is a :py:class:`ResourceDefinition` whose `resource_fn` returns an\n :py:class:`IOManager`.\n\n The easiest way to create an IOManagerDefinition is with the :py:func:`@io_manager <io_manager>`\n decorator.\n """\n\n def __init__(\n self,\n resource_fn=None,\n config_schema=None,\n description=None,\n required_resource_keys=None,\n version=None,\n input_config_schema=None,\n output_config_schema=None,\n ):\n self._input_config_schema = convert_user_facing_definition_config_schema(\n input_config_schema\n )\n # Unlike other configurable objects, whose config schemas default to Any, output_config_schema\n # defaults to None. This is because IOManager input / output config shares config\n # namespace with dagster type loaders and materializers. The absence of a provided\n # output_config_schema means that we should fall back to using the materializer that\n # corresponds to the output dagster type.\n self._output_config_schema = (\n convert_user_facing_definition_config_schema(output_config_schema)\n if output_config_schema is not None\n else None\n )\n super(IOManagerDefinition, self).__init__(\n resource_fn=resource_fn,\n config_schema=config_schema,\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n )\n\n @property\n def input_config_schema(self):\n return self._input_config_schema\n\n @property\n def output_config_schema(self):\n return self._output_config_schema\n\n def copy_for_configured(self, description, config_schema, _):\n return IOManagerDefinition(\n config_schema=config_schema,\n description=description or self.description,\n resource_fn=self.resource_fn,\n required_resource_keys=self.required_resource_keys,\n input_config_schema=self.input_config_schema,\n output_config_schema=self.output_config_schema,\n )\n\n
[docs] @staticmethod\n def hardcoded_io_manager(value, description=None):\n """A helper function that creates an ``IOManagerDefinition`` with a hardcoded IOManager.\n\n Args:\n value (Any): A hardcoded IO Manager which helps mock the definition.\n description ([Optional[str]]): The description of the IO Manager. Defaults to None.\n\n Returns:\n [IOManagerDefinition]: A hardcoded resource.\n """\n check.inst_param(value, "value", IOManager)\n return IOManagerDefinition(resource_fn=lambda _init_context: value, description=description)
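``hardcoded_io_manager`` is mostly useful in tests, where an already-constructed ``IOManager`` instance should be exposed as a definition. A minimal sketch, where ``my_io_manager_instance`` is assumed to be an existing ``IOManager`` instance:

.. code-block:: python

    mocked_io_manager_def = IOManagerDefinition.hardcoded_io_manager(
        my_io_manager_instance, description="test-only IO manager"
    )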
\n\n\n
[docs]class IOManager(InputManager, OutputManager):\n """\n Base class for user-provided IO managers.\n\n IOManagers are used to store op outputs and load them as inputs to downstream ops.\n\n Extend this class to handle how objects are loaded and stored. Users should implement\n ``handle_output`` to store an object and ``load_input`` to retrieve an object.\n """\n\n
[docs] @abstractmethod\n def load_input(self, context):\n """User-defined method that loads an input to an op.\n\n Args:\n context (InputContext): The input context, which describes the input that's being loaded\n and the upstream output that's being loaded from.\n\n Returns:\n Any: The data object.\n """
\n\n
[docs] @abstractmethod\n def handle_output(self, context, obj):\n """User-defined method that stores an output of an op.\n\n Args:\n context (OutputContext): The context of the step output that produces this object.\n obj (Any): The object, returned by the op, to be stored.\n """
\n\n
[docs] def get_output_asset_key(self, _context) -> Optional[AssetKey]:\n """User-defined method that associates outputs handled by this IOManager with a particular\n AssetKey.\n\n Args:\n context (OutputContext): The context of the step output that produces this object.\n """\n return None
\n\n
[docs] def get_output_asset_partitions(self, _context) -> Set[str]:\n """User-defined method that associates outputs handled by this IOManager with a set of\n partitions of an AssetKey.\n\n Args:\n context (OutputContext): The context of the step output that produces this object.\n """\n return set()
\n\n
[docs] def get_input_asset_key(self, context) -> Optional[AssetKey]:\n """User-defined method that associates inputs loaded by this IOManager with a particular\n AssetKey.\n\n Args:\n context (InputContext): The input context, which describes the input that's being loaded\n and the upstream output that's being loaded from.\n """\n return self.get_output_asset_key(context.upstream_output)
\n\n
[docs] def get_input_asset_partitions(self, context) -> Set[str]:\n """User-defined method that associates inputs loaded by this IOManager with a set of\n partitions of an AssetKey.\n\n Args:\n context (InputContext): The input context, which describes the input that's being loaded\n and the upstream output that's being loaded from.\n """\n return self.get_output_asset_partitions(context.upstream_output)
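The four asset hooks above let an ``IOManager`` tie the outputs it handles to asset keys; by default the input-side hooks defer to the output-side ones via ``context.upstream_output``. A sketch of overriding the output-side hook (storage details elided, the key prefix is hypothetical):

.. code-block:: python

    class AssetAwareIOManager(IOManager):
        def handle_output(self, context, obj):
            ...  # persist obj somewhere (elided)

        def load_input(self, context):
            ...  # load the upstream output back (elided)

        def get_output_asset_key(self, context):
            # associate every output handled by this manager with an AssetKey;
            # the default get_input_asset_key picks this up for downstream inputs
            return AssetKey(["warehouse", context.name])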
\n\n\n
[docs]def io_manager(\n config_schema=None,\n description=None,\n output_config_schema=None,\n input_config_schema=None,\n required_resource_keys=None,\n version=None,\n):\n """\n Define an IO manager.\n\n IOManagers are used to store op outputs and load them as inputs to downstream ops.\n\n The decorated function should accept an :py:class:`InitResourceContext` and return an\n :py:class:`IOManager`.\n\n Args:\n config_schema (Optional[ConfigSchema]): The schema for the resource config. Configuration\n data available in `init_context.resource_config`. If not set, Dagster will accept any\n config provided.\n description(Optional[str]): A human-readable description of the resource.\n output_config_schema (Optional[ConfigSchema]): The schema for per-output config. If not set,\n no per-output configuration will be allowed.\n input_config_schema (Optional[ConfigSchema]): The schema for per-input config. If not set,\n Dagster will accept any config provided.\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by the object\n manager.\n version (Optional[str]): (Experimental) The version of a resource function. Two wrapped\n resource functions should only have the same version if they produce the same resource\n definition when provided with the same inputs.\n\n **Examples:**\n\n .. code-block:: python\n\n class MyIOManager(IOManager):\n def handle_output(self, context, obj):\n write_csv("some/path")\n\n def load_input(self, context):\n return read_csv("some/path")\n\n @io_manager\n def my_io_manager(init_context):\n return MyIOManager()\n\n @op(out=Out(io_manager_key="my_io_manager_key"))\n def my_op(_):\n return do_stuff()\n\n @job(resource_defs={"my_io_manager_key": my_io_manager})\n def my_job():\n my_op()\n\n """\n if callable(config_schema) and not is_callable_valid_config_arg(config_schema):\n return _IOManagerDecoratorCallable()(config_schema)\n\n def _wrap(resource_fn):\n return _IOManagerDecoratorCallable(\n config_schema=config_schema,\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n output_config_schema=output_config_schema,\n input_config_schema=input_config_schema,\n )(resource_fn)\n\n return _wrap
\n\n\nclass _IOManagerDecoratorCallable:\n def __init__(\n self,\n config_schema=None,\n description=None,\n required_resource_keys=None,\n version=None,\n output_config_schema=None,\n input_config_schema=None,\n ):\n # type validation happens in IOManagerDefinition\n self.config_schema = config_schema\n self.description = description\n self.required_resource_keys = required_resource_keys\n self.version = version\n self.output_config_schema = output_config_schema\n self.input_config_schema = input_config_schema\n\n def __call__(self, fn):\n check.callable_param(fn, "fn")\n\n io_manager_def = IOManagerDefinition(\n resource_fn=fn,\n config_schema=self.config_schema,\n description=self.description,\n required_resource_keys=self.required_resource_keys,\n version=self.version,\n output_config_schema=self.output_config_schema,\n input_config_schema=self.input_config_schema,\n )\n\n update_wrapper(io_manager_def, wrapped=fn)\n\n return io_manager_def\n
", "current_page_name": "_modules/dagster/core/storage/io_manager", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.io_manager"}, "local_compute_log_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.local_compute_log_manager

\nimport hashlib\nimport os\nimport sys\nfrom collections import defaultdict\nfrom contextlib import contextmanager\n\nfrom watchdog.events import PatternMatchingEventHandler\nfrom watchdog.observers.polling import PollingObserver\n\nfrom dagster import Field, Float, StringSource, check\nfrom dagster.core.execution.compute_logs import mirror_stream_to_file\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster.utils import ensure_dir, touch_file\n\nfrom .compute_log_manager import (\n    MAX_BYTES_FILE_READ,\n    ComputeIOType,\n    ComputeLogFileData,\n    ComputeLogManager,\n    ComputeLogSubscription,\n)\n\nDEFAULT_WATCHDOG_POLLING_TIMEOUT = 2.5\n\nIO_TYPE_EXTENSION = {ComputeIOType.STDOUT: "out", ComputeIOType.STDERR: "err"}\n\nMAX_FILENAME_LENGTH = 255\n\n\n
[docs]class LocalComputeLogManager(ComputeLogManager, ConfigurableClass):\n """Stores copies of stdout & stderr for each compute step locally on disk."""\n\n def __init__(self, base_dir, polling_timeout=None, inst_data=None):\n self._base_dir = base_dir\n self._polling_timeout = check.opt_float_param(\n polling_timeout, "polling_timeout", DEFAULT_WATCHDOG_POLLING_TIMEOUT\n )\n self._subscription_manager = LocalComputeLogSubscriptionManager(self)\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @contextmanager\n def _watch_logs(self, pipeline_run, step_key=None):\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n check.opt_str_param(step_key, "step_key")\n\n key = self.get_key(pipeline_run, step_key)\n outpath = self.get_local_path(pipeline_run.run_id, key, ComputeIOType.STDOUT)\n errpath = self.get_local_path(pipeline_run.run_id, key, ComputeIOType.STDERR)\n with mirror_stream_to_file(sys.stdout, outpath):\n with mirror_stream_to_file(sys.stderr, errpath):\n yield\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @property\n def polling_timeout(self):\n return self._polling_timeout\n\n @classmethod\n def config_type(cls):\n return {\n "base_dir": StringSource,\n "polling_timeout": Field(Float, is_required=False),\n }\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return LocalComputeLogManager(inst_data=inst_data, **config_value)\n\n def _run_directory(self, run_id):\n return os.path.join(self._base_dir, run_id, "compute_logs")\n\n def get_local_path(self, run_id, key, io_type):\n check.inst_param(io_type, "io_type", ComputeIOType)\n return self._get_local_path(run_id, key, IO_TYPE_EXTENSION[io_type])\n\n def complete_artifact_path(self, run_id, key):\n return self._get_local_path(run_id, key, "complete")\n\n def _get_local_path(self, run_id, key, extension):\n filename = "{}.{}".format(key, extension)\n if len(filename) > MAX_FILENAME_LENGTH:\n filename = "{}.{}".format(hashlib.md5(key.encode("utf-8")).hexdigest(), extension)\n return os.path.join(self._run_directory(run_id), filename)\n\n def read_logs_file(self, run_id, key, io_type, cursor=0, max_bytes=MAX_BYTES_FILE_READ):\n path = self.get_local_path(run_id, key, io_type)\n\n if not os.path.exists(path) or not os.path.isfile(path):\n return ComputeLogFileData(path=path, data=None, cursor=0, size=0, download_url=None)\n\n # See: https://docs.python.org/2/library/stdtypes.html#file.tell for Windows behavior\n with open(path, "rb") as f:\n f.seek(cursor, os.SEEK_SET)\n data = f.read(max_bytes)\n cursor = f.tell()\n stats = os.fstat(f.fileno())\n\n # local download path\n download_url = self.download_url(run_id, key, io_type)\n return ComputeLogFileData(\n path=path,\n data=data.decode("utf-8"),\n cursor=cursor,\n size=stats.st_size,\n download_url=download_url,\n )\n\n def is_watch_completed(self, run_id, key):\n return os.path.exists(self.complete_artifact_path(run_id, key))\n\n def on_watch_start(self, pipeline_run, step_key):\n pass\n\n def get_key(self, pipeline_run, step_key):\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n check.opt_str_param(step_key, "step_key")\n return step_key or pipeline_run.pipeline_name\n\n def on_watch_finish(self, pipeline_run, step_key=None):\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n check.opt_str_param(step_key, "step_key")\n key = self.get_key(pipeline_run, step_key)\n touchpath = self.complete_artifact_path(pipeline_run.run_id, key)\n touch_file(touchpath)\n\n def 
download_url(self, run_id, key, io_type):\n check.inst_param(io_type, "io_type", ComputeIOType)\n return "/download/{}/{}/{}".format(run_id, key, io_type.value)\n\n def on_subscribe(self, subscription):\n self._subscription_manager.add_subscription(subscription)\n\n def on_unsubscribe(self, subscription):\n self._subscription_manager.remove_subscription(subscription)\n\n def dispose(self):\n self._subscription_manager.dispose()
\n\n\nclass LocalComputeLogSubscriptionManager:\n def __init__(self, manager):\n self._manager = manager\n self._subscriptions = defaultdict(list)\n self._watchers = {}\n self._observer = None\n\n def _watch_key(self, run_id, key):\n return "{}:{}".format(run_id, key)\n\n def add_subscription(self, subscription):\n check.inst_param(subscription, "subscription", ComputeLogSubscription)\n if self._manager.is_watch_completed(subscription.run_id, subscription.key):\n subscription.fetch()\n subscription.complete()\n else:\n watch_key = self._watch_key(subscription.run_id, subscription.key)\n self._subscriptions[watch_key].append(subscription)\n self.watch(subscription.run_id, subscription.key)\n\n def remove_subscription(self, subscription):\n check.inst_param(subscription, "subscription", ComputeLogSubscription)\n watch_key = self._watch_key(subscription.run_id, subscription.key)\n if subscription in self._subscriptions[watch_key]:\n self._subscriptions[watch_key].remove(subscription)\n subscription.complete()\n\n def remove_all_subscriptions(self, run_id, step_key):\n watch_key = self._watch_key(run_id, step_key)\n for subscription in self._subscriptions.pop(watch_key, []):\n subscription.complete()\n\n def watch(self, run_id, step_key):\n watch_key = self._watch_key(run_id, step_key)\n if watch_key in self._watchers:\n return\n\n update_paths = [\n self._manager.get_local_path(run_id, step_key, ComputeIOType.STDOUT),\n self._manager.get_local_path(run_id, step_key, ComputeIOType.STDERR),\n ]\n complete_paths = [self._manager.complete_artifact_path(run_id, step_key)]\n directory = os.path.dirname(\n self._manager.get_local_path(run_id, step_key, ComputeIOType.STDERR)\n )\n\n if not self._observer:\n self._observer = PollingObserver(self._manager.polling_timeout)\n self._observer.start()\n\n ensure_dir(directory)\n\n self._watchers[watch_key] = self._observer.schedule(\n LocalComputeLogFilesystemEventHandler(\n self, run_id, step_key, update_paths, complete_paths\n ),\n str(directory),\n )\n\n def notify_subscriptions(self, run_id, step_key):\n watch_key = self._watch_key(run_id, step_key)\n for subscription in self._subscriptions[watch_key]:\n subscription.fetch()\n\n def unwatch(self, run_id, step_key, handler):\n watch_key = self._watch_key(run_id, step_key)\n if watch_key in self._watchers:\n self._observer.remove_handler_for_watch(handler, self._watchers[watch_key])\n del self._watchers[watch_key]\n\n def dispose(self):\n if self._observer:\n self._observer.stop()\n self._observer.join(15)\n\n\nclass LocalComputeLogFilesystemEventHandler(PatternMatchingEventHandler):\n def __init__(self, manager, run_id, key, update_paths, complete_paths):\n self.manager = manager\n self.run_id = run_id\n self.key = key\n self.update_paths = update_paths\n self.complete_paths = complete_paths\n patterns = update_paths + complete_paths\n super(LocalComputeLogFilesystemEventHandler, self).__init__(patterns=patterns)\n\n def on_created(self, event):\n if event.src_path in self.complete_paths:\n self.manager.remove_all_subscriptions(self.run_id, self.key)\n self.manager.unwatch(self.run_id, self.key, self)\n\n def on_modified(self, event):\n if event.src_path in self.update_paths:\n self.manager.notify_subscriptions(self.run_id, self.key)\n
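For context, a minimal sketch of reading a step's captured stdout back through ``LocalComputeLogManager``. The ``base_dir``, ``run_id``, and ``key`` values are placeholders; in a deployed instance the manager is normally constructed from ``dagster.yaml`` rather than instantiated directly.

.. code-block:: python

    from dagster.core.storage.compute_log_manager import ComputeIOType
    from dagster.core.storage.local_compute_log_manager import LocalComputeLogManager

    # Placeholder values -- in practice these come from the instance and the run.
    manager = LocalComputeLogManager(base_dir="/tmp/dagster/compute_logs")

    log_data = manager.read_logs_file(
        run_id="some_run_id",
        key="some_step_key",
        io_type=ComputeIOType.STDOUT,
    )
    print(log_data.data)    # decoded stdout captured so far (None if no file exists yet)
    print(log_data.cursor)  # byte offset to pass back in for incremental reads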
", "current_page_name": "_modules/dagster/core/storage/local_compute_log_manager", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.local_compute_log_manager"}, "mem_io_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.mem_io_manager

\nfrom dagster.core.storage.io_manager import IOManager, io_manager\n\n\nclass InMemoryIOManager(IOManager):\n    def __init__(self):\n        self.values = {}\n\n    def handle_output(self, context, obj):\n        keys = tuple(context.get_output_identifier())\n        self.values[keys] = obj\n\n    def load_input(self, context):\n        keys = tuple(context.upstream_output.get_output_identifier())\n        return self.values[keys]\n\n\n
[docs]@io_manager\ndef mem_io_manager(_):\n """Built-in IO manager that stores and retrieves values in memory."""\n\n return InMemoryIOManager()
\n
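A small usage sketch for ``mem_io_manager``, following the same pattern as the ``io_manager`` docstring example above; the op and job names are illustrative.

.. code-block:: python

    from dagster import job, mem_io_manager, op


    @op
    def return_one():
        return 1


    @op
    def plus_one(x):
        return x + 1


    @job(resource_defs={"io_manager": mem_io_manager})
    def inprocess_job():
        # intermediate values never touch disk; they live in the
        # InMemoryIOManager dict for the duration of the process
        plus_one(return_one())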
", "current_page_name": "_modules/dagster/core/storage/mem_io_manager", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.mem_io_manager"}, "memoizable_io_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.memoizable_io_manager

\nimport os\nimport pickle\nfrom abc import abstractmethod\n\nfrom dagster import check\nfrom dagster.config import Field\nfrom dagster.config.source import StringSource\nfrom dagster.core.execution.context.output import OutputContext\nfrom dagster.core.storage.io_manager import IOManager, io_manager\nfrom dagster.utils import PICKLE_PROTOCOL, mkdir_p\nfrom dagster.utils.backcompat import experimental\n\n\n
[docs]class MemoizableIOManager(IOManager):\n    """\n    Base class for an IO manager enabled to work with memoized execution. Users should implement\n    the ``load_input`` and ``handle_output`` methods described in the ``IOManager`` API, and the\n    ``has_output`` method, which returns a boolean representing whether a data object can be found.\n    """\n\n
[docs] @abstractmethod\n def has_output(self, context: OutputContext) -> bool:\n """The user-defined method that returns whether data exists given the metadata.\n\n Args:\n context (OutputContext): The context of the step performing this check.\n\n Returns:\n bool: True if there is data present that matches the provided context. False otherwise.\n """
\n\n\nclass VersionedPickledObjectFilesystemIOManager(MemoizableIOManager):\n def __init__(self, base_dir=None):\n self.base_dir = check.opt_str_param(base_dir, "base_dir")\n self.write_mode = "wb"\n self.read_mode = "rb"\n\n def _get_path(self, context):\n # automatically construct filepath\n step_key = check.str_param(context.step_key, "context.step_key")\n output_name = check.str_param(context.name, "context.name")\n version = check.str_param(context.version, "context.version")\n\n return os.path.join(self.base_dir, step_key, output_name, version)\n\n def handle_output(self, context, obj):\n """Pickle the data with the associated version, and store the object to a file.\n\n This method omits the AssetMaterialization event so assets generated by it won't be tracked\n by the Asset Catalog.\n """\n\n filepath = self._get_path(context)\n\n context.log.debug(f"Writing file at: {filepath}")\n\n # Ensure path exists\n mkdir_p(os.path.dirname(filepath))\n\n with open(filepath, self.write_mode) as write_obj:\n pickle.dump(obj, write_obj, PICKLE_PROTOCOL)\n\n def load_input(self, context):\n """Unpickle the file and Load it to a data object."""\n\n filepath = self._get_path(context.upstream_output)\n\n context.log.debug(f"Loading file from: {filepath}")\n\n with open(filepath, self.read_mode) as read_obj:\n return pickle.load(read_obj)\n\n def has_output(self, context):\n """Returns true if data object exists with the associated version, False otherwise."""\n\n filepath = self._get_path(context)\n\n context.log.debug(f"Checking for file at: {filepath}")\n\n return os.path.exists(filepath) and not os.path.isdir(filepath)\n\n\n@io_manager(config_schema={"base_dir": Field(StringSource, is_required=False)})\n@experimental\ndef versioned_filesystem_io_manager(init_context):\n """Filesystem IO manager that utilizes versioning of stored objects.\n\n It requires users to specify a base directory where all the step outputs will be stored in. It\n serializes and deserializes output values (assets) using pickling and automatically constructs\n the filepaths for the assets using the provided directory, and the version for a provided step\n output.\n """\n return VersionedPickledObjectFilesystemIOManager(\n base_dir=init_context.resource_config.get(\n "base_dir", os.path.join(init_context.instance.storage_directory(), "versioned_outputs")\n )\n )\n
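As a companion to ``VersionedPickledObjectFilesystemIOManager`` above, here is a minimal in-memory sketch of the ``MemoizableIOManager`` contract; the class and resource names are hypothetical.

.. code-block:: python

    from dagster.core.storage.io_manager import io_manager
    from dagster.core.storage.memoizable_io_manager import MemoizableIOManager


    class InMemoryMemoizableIOManager(MemoizableIOManager):
        """Hypothetical memoizable IO manager keyed on (step_key, output name, version)."""

        def __init__(self):
            self._values = {}

        def _key(self, context):
            return (context.step_key, context.name, context.version)

        def handle_output(self, context, obj):
            self._values[self._key(context)] = obj

        def load_input(self, context):
            return self._values[self._key(context.upstream_output)]

        def has_output(self, context):
            # memoized execution skips the step when this returns True
            return self._key(context) in self._values


    @io_manager
    def in_memory_memoizable_io_manager(_):
        return InMemoryMemoizableIOManager()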
", "current_page_name": "_modules/dagster/core/storage/memoizable_io_manager", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.memoizable_io_manager"}, "pipeline_run": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.pipeline_run

\nimport warnings\nfrom datetime import datetime\nfrom enum import Enum\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Dict, List, NamedTuple, Optional, Type\n\nfrom dagster import check\nfrom dagster.core.origin import PipelinePythonOrigin\nfrom dagster.core.storage.tags import PARENT_RUN_ID_TAG, ROOT_RUN_ID_TAG\nfrom dagster.core.utils import make_new_run_id\nfrom dagster.serdes.serdes import (\n    DefaultNamedTupleSerializer,\n    EnumSerializer,\n    WhitelistMap,\n    register_serdes_enum_fallbacks,\n    register_serdes_tuple_fallbacks,\n    replace_storage_keys,\n    unpack_inner_value,\n    whitelist_for_serdes,\n)\n\nfrom .tags import (\n    BACKFILL_ID_TAG,\n    PARTITION_NAME_TAG,\n    PARTITION_SET_TAG,\n    RESUME_RETRY_TAG,\n    SCHEDULE_NAME_TAG,\n    SENSOR_NAME_TAG,\n)\n\nif TYPE_CHECKING:\n    from dagster.core.host_representation.origin import ExternalPipelineOrigin\n\n\nclass DagsterRunStatusSerializer(EnumSerializer):\n    @classmethod\n    def value_from_storage_str(cls, storage_str: str, klass: Type) -> Enum:\n        return getattr(klass, storage_str)\n\n    @classmethod\n    def value_to_storage_str(\n        cls, value: Enum, whitelist_map: WhitelistMap, descent_path: str\n    ) -> str:\n        enum_value = value.value\n        # Store DagsterRunStatus with backcompat name PipelineRunStatus\n        backcompat_name = "PipelineRunStatus"\n        return ".".join([backcompat_name, enum_value])\n\n\n
[docs]@whitelist_for_serdes(serializer=DagsterRunStatusSerializer)\nclass DagsterRunStatus(Enum):\n """The status of pipeline execution."""\n\n QUEUED = "QUEUED"\n NOT_STARTED = "NOT_STARTED"\n MANAGED = "MANAGED"\n STARTING = "STARTING"\n STARTED = "STARTED"\n SUCCESS = "SUCCESS"\n FAILURE = "FAILURE"\n CANCELING = "CANCELING"\n CANCELED = "CANCELED"
\n\n\nPipelineRunStatus = DagsterRunStatus\nregister_serdes_enum_fallbacks({"PipelineRunStatus": DagsterRunStatus})\n\n# These statuses that indicate a run may be using compute resources\nIN_PROGRESS_RUN_STATUSES = [\n PipelineRunStatus.STARTING,\n PipelineRunStatus.STARTED,\n PipelineRunStatus.CANCELING,\n]\n\n# This serves as an explicit list of run statuses that indicate that the run is not using compute\n# resources. This and the enum above should cover all run statuses.\nNON_IN_PROGRESS_RUN_STATUSES = [\n PipelineRunStatus.QUEUED,\n PipelineRunStatus.NOT_STARTED,\n PipelineRunStatus.SUCCESS,\n PipelineRunStatus.FAILURE,\n PipelineRunStatus.MANAGED,\n PipelineRunStatus.CANCELED,\n]\n\n\n@whitelist_for_serdes\nclass PipelineRunStatsSnapshot(\n NamedTuple(\n "_PipelineRunStatsSnapshot",\n [\n ("run_id", str),\n ("steps_succeeded", int),\n ("steps_failed", int),\n ("materializations", int),\n ("expectations", int),\n ("enqueued_time", Optional[float]),\n ("launch_time", Optional[float]),\n ("start_time", Optional[float]),\n ("end_time", Optional[float]),\n ],\n )\n):\n def __new__(\n cls,\n run_id: str,\n steps_succeeded: int,\n steps_failed: int,\n materializations: int,\n expectations: int,\n enqueued_time: Optional[float],\n launch_time: Optional[float],\n start_time: Optional[float],\n end_time: Optional[float],\n ):\n return super(PipelineRunStatsSnapshot, cls).__new__(\n cls,\n run_id=check.str_param(run_id, "run_id"),\n steps_succeeded=check.int_param(steps_succeeded, "steps_succeeded"),\n steps_failed=check.int_param(steps_failed, "steps_failed"),\n materializations=check.int_param(materializations, "materializations"),\n expectations=check.int_param(expectations, "expectations"),\n enqueued_time=check.opt_float_param(enqueued_time, "enqueued_time"),\n launch_time=check.opt_float_param(launch_time, "launch_time"),\n start_time=check.opt_float_param(start_time, "start_time"),\n end_time=check.opt_float_param(end_time, "end_time"),\n )\n\n\nclass DagsterRunSerializer(DefaultNamedTupleSerializer):\n @classmethod\n def value_from_storage_dict(\n cls,\n storage_dict,\n klass,\n args_for_class,\n whitelist_map,\n descent_path,\n ):\n # unpack all stored fields\n unpacked_dict = {\n key: unpack_inner_value(value, whitelist_map, f"{descent_path}.{key}")\n for key, value in storage_dict.items()\n }\n # called by the serdes layer, delegates to helper method with expanded kwargs\n return pipeline_run_from_storage(**unpacked_dict)\n\n @classmethod\n def value_to_storage_dict(\n cls,\n value: NamedTuple,\n whitelist_map: WhitelistMap,\n descent_path: str,\n ) -> Dict[str, Any]:\n storage = super().value_to_storage_dict(\n value,\n whitelist_map,\n descent_path,\n )\n # persist using legacy name PipelineRun\n storage["__class__"] = "PipelineRun"\n return storage\n\n\ndef pipeline_run_from_storage(\n pipeline_name=None,\n run_id=None,\n run_config=None,\n mode=None,\n solid_selection=None,\n solids_to_execute=None,\n step_keys_to_execute=None,\n status=None,\n tags=None,\n root_run_id=None,\n parent_run_id=None,\n pipeline_snapshot_id=None,\n execution_plan_snapshot_id=None,\n # backcompat\n environment_dict=None,\n previous_run_id=None,\n selector=None,\n solid_subset=None,\n reexecution_config=None, # pylint: disable=unused-argument\n external_pipeline_origin=None,\n pipeline_code_origin=None,\n **kwargs,\n):\n\n # serdes log\n # * removed reexecution_config - serdes logic expected to strip unknown keys so no need to preserve\n # * added pipeline_snapshot_id\n # * renamed previous_run_id -> 
parent_run_id, added root_run_id\n # * added execution_plan_snapshot_id\n # * removed selector\n # * added solid_subset\n # * renamed solid_subset -> solid_selection, added solids_to_execute\n # * renamed environment_dict -> run_config\n\n # back compat for environment dict => run_config\n if environment_dict:\n check.invariant(\n not run_config,\n "Cannot set both run_config and environment_dict. Use run_config parameter.",\n )\n run_config = environment_dict\n\n # back compat for previous_run_id => parent_run_id, root_run_id\n if previous_run_id and not (parent_run_id and root_run_id):\n parent_run_id = previous_run_id\n root_run_id = previous_run_id\n\n # back compat for selector => pipeline_name, solids_to_execute\n selector = check.opt_inst_param(selector, "selector", ExecutionSelector)\n if selector:\n check.invariant(\n pipeline_name is None or selector.name == pipeline_name,\n (\n "Conflicting pipeline name {pipeline_name} in arguments to PipelineRun: "\n "selector was passed with pipeline {selector_pipeline}".format(\n pipeline_name=pipeline_name, selector_pipeline=selector.name\n )\n ),\n )\n if pipeline_name is None:\n pipeline_name = selector.name\n\n check.invariant(\n solids_to_execute is None or set(selector.solid_subset) == solids_to_execute,\n (\n "Conflicting solids_to_execute {solids_to_execute} in arguments to PipelineRun: "\n "selector was passed with subset {selector_subset}".format(\n solids_to_execute=solids_to_execute, selector_subset=selector.solid_subset\n )\n ),\n )\n # for old runs that only have selector but no solids_to_execute\n if solids_to_execute is None:\n solids_to_execute = frozenset(selector.solid_subset) if selector.solid_subset else None\n\n # back compat for solid_subset => solids_to_execute\n check.opt_list_param(solid_subset, "solid_subset", of_type=str)\n if solid_subset:\n solids_to_execute = frozenset(solid_subset)\n\n # warn about unused arguments\n if len(kwargs):\n warnings.warn(\n "Found unhandled arguments from stored PipelineRun: {args}".format(args=kwargs.keys())\n )\n\n return DagsterRun( # pylint: disable=redundant-keyword-arg\n pipeline_name=pipeline_name,\n run_id=run_id,\n run_config=run_config,\n mode=mode,\n solid_selection=solid_selection,\n solids_to_execute=solids_to_execute,\n step_keys_to_execute=step_keys_to_execute,\n status=status,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n pipeline_snapshot_id=pipeline_snapshot_id,\n execution_plan_snapshot_id=execution_plan_snapshot_id,\n external_pipeline_origin=external_pipeline_origin,\n pipeline_code_origin=pipeline_code_origin,\n )\n\n\n
[docs]class PipelineRun(\n NamedTuple(\n "_PipelineRun",\n [\n ("pipeline_name", str),\n ("run_id", str),\n ("run_config", Dict[str, object]),\n ("mode", Optional[str]),\n ("solid_selection", Optional[List[str]]),\n ("solids_to_execute", Optional[AbstractSet[str]]),\n ("step_keys_to_execute", Optional[List[str]]),\n ("status", PipelineRunStatus),\n ("tags", Dict[str, str]),\n ("root_run_id", Optional[str]),\n ("parent_run_id", Optional[str]),\n ("pipeline_snapshot_id", Optional[str]),\n ("execution_plan_snapshot_id", Optional[str]),\n ("external_pipeline_origin", Optional["ExternalPipelineOrigin"]),\n ("pipeline_code_origin", Optional[PipelinePythonOrigin]),\n ],\n )\n):\n """Serializable internal representation of a pipeline run, as stored in a\n :py:class:`~dagster.core.storage.runs.RunStorage`.\n """\n\n def __new__(\n cls,\n pipeline_name: str,\n run_id: Optional[str] = None,\n run_config: Optional[Dict[str, object]] = None,\n mode: Optional[str] = None,\n solid_selection: Optional[List[str]] = None,\n solids_to_execute: Optional[AbstractSet[str]] = None,\n step_keys_to_execute: Optional[List[str]] = None,\n status: Optional[PipelineRunStatus] = None,\n tags: Optional[Dict[str, str]] = None,\n root_run_id: Optional[str] = None,\n parent_run_id: Optional[str] = None,\n pipeline_snapshot_id: Optional[str] = None,\n execution_plan_snapshot_id: Optional[str] = None,\n external_pipeline_origin: Optional["ExternalPipelineOrigin"] = None,\n pipeline_code_origin: Optional[PipelinePythonOrigin] = None,\n ):\n check.invariant(\n (root_run_id is not None and parent_run_id is not None)\n or (root_run_id is None and parent_run_id is None),\n (\n "Must set both root_run_id and parent_run_id when creating a PipelineRun that "\n "belongs to a run group"\n ),\n )\n # a frozenset which contains the names of the solids to execute\n solids_to_execute = check.opt_nullable_set_param(\n solids_to_execute, "solids_to_execute", of_type=str\n )\n # a list of solid queries provided by the user\n # possible to be None when only solids_to_execute is set by the user directly\n solid_selection = check.opt_nullable_list_param(\n solid_selection, "solid_selection", of_type=str\n )\n check.opt_nullable_list_param(step_keys_to_execute, "step_keys_to_execute", of_type=str)\n\n # Placing this with the other imports causes a cyclic import\n # https://github.com/dagster-io/dagster/issues/3181\n from dagster.core.host_representation.origin import ExternalPipelineOrigin\n\n if status == PipelineRunStatus.QUEUED:\n check.inst_param(\n external_pipeline_origin,\n "external_pipeline_origin",\n ExternalPipelineOrigin,\n "external_pipeline_origin is required for queued runs",\n )\n\n if run_id is None:\n run_id = make_new_run_id()\n\n return super(PipelineRun, cls).__new__(\n cls,\n pipeline_name=check.str_param(pipeline_name, "pipeline_name"),\n run_id=check.str_param(run_id, "run_id"),\n run_config=check.opt_dict_param(run_config, "run_config", key_type=str),\n mode=check.opt_str_param(mode, "mode"),\n solid_selection=solid_selection,\n solids_to_execute=solids_to_execute,\n step_keys_to_execute=step_keys_to_execute,\n status=check.opt_inst_param(\n status, "status", PipelineRunStatus, PipelineRunStatus.NOT_STARTED\n ),\n tags=check.opt_dict_param(tags, "tags", key_type=str, value_type=str),\n root_run_id=check.opt_str_param(root_run_id, "root_run_id"),\n parent_run_id=check.opt_str_param(parent_run_id, "parent_run_id"),\n pipeline_snapshot_id=check.opt_str_param(pipeline_snapshot_id, "pipeline_snapshot_id"),\n 
execution_plan_snapshot_id=check.opt_str_param(\n execution_plan_snapshot_id, "execution_plan_snapshot_id"\n ),\n external_pipeline_origin=check.opt_inst_param(\n external_pipeline_origin, "external_pipeline_origin", ExternalPipelineOrigin\n ),\n pipeline_code_origin=check.opt_inst_param(\n pipeline_code_origin, "pipeline_code_origin", PipelinePythonOrigin\n ),\n )\n\n def with_status(self, status):\n if status == PipelineRunStatus.QUEUED:\n # Placing this with the other imports causes a cyclic import\n # https://github.com/dagster-io/dagster/issues/3181\n from dagster.core.host_representation.origin import ExternalPipelineOrigin\n\n check.inst(\n self.external_pipeline_origin,\n ExternalPipelineOrigin,\n "external_pipeline_origin is required for queued runs",\n )\n\n return self._replace(status=status)\n\n def with_mode(self, mode):\n return self._replace(mode=mode)\n\n def with_tags(self, tags):\n return self._replace(tags=tags)\n\n def get_root_run_id(self):\n return self.tags.get(ROOT_RUN_ID_TAG)\n\n def get_parent_run_id(self):\n return self.tags.get(PARENT_RUN_ID_TAG)\n\n @property\n def is_finished(self):\n return (\n self.status == PipelineRunStatus.SUCCESS\n or self.status == PipelineRunStatus.FAILURE\n or self.status == PipelineRunStatus.CANCELED\n )\n\n @property\n def is_success(self):\n return self.status == PipelineRunStatus.SUCCESS\n\n @property\n def is_failure(self):\n return self.status == PipelineRunStatus.FAILURE\n\n @property\n def is_failure_or_canceled(self):\n return self.status == PipelineRunStatus.FAILURE or self.status == PipelineRunStatus.CANCELED\n\n @property\n def is_resume_retry(self):\n return self.tags.get(RESUME_RETRY_TAG) == "true"\n\n @property\n def previous_run_id(self):\n # Compat\n return self.parent_run_id\n\n @staticmethod\n def tags_for_schedule(schedule):\n return {SCHEDULE_NAME_TAG: schedule.name}\n\n @staticmethod\n def tags_for_sensor(sensor):\n return {SENSOR_NAME_TAG: sensor.name}\n\n @staticmethod\n def tags_for_backfill_id(backfill_id):\n return {BACKFILL_ID_TAG: backfill_id}\n\n @staticmethod\n def tags_for_partition_set(partition_set, partition):\n return {PARTITION_NAME_TAG: partition.name, PARTITION_SET_TAG: partition_set.name}
\n\n\n@whitelist_for_serdes(serializer=DagsterRunSerializer)\nclass DagsterRun(PipelineRun):\n """Serializable internal representation of a dagster run, as stored in a\n :py:class:`~dagster.core.storage.runs.RunStorage`.\n\n Subclasses PipelineRun for backcompat purposes. DagsterRun is the actual initialized class used throughout the system.\n """\n\n\n# DagsterRun is serialized as PipelineRun so that it can be read by older (pre 0.13.x) version of\n# Dagster, but is read back in as a DagsterRun.\nregister_serdes_tuple_fallbacks({"PipelineRun": DagsterRun})\n\n\nclass RunsFilterSerializer(DefaultNamedTupleSerializer):\n @classmethod\n def value_to_storage_dict(\n cls,\n value: NamedTuple,\n whitelist_map: WhitelistMap,\n descent_path: str,\n ) -> Dict[str, Any]:\n storage = super().value_to_storage_dict(\n value,\n whitelist_map,\n descent_path,\n )\n # For backcompat, we store:\n # job_name as pipeline_name\n return replace_storage_keys(storage, {"job_name": "pipeline_name"})\n\n\n@whitelist_for_serdes(serializer=RunsFilterSerializer)\nclass RunsFilter(\n NamedTuple(\n "_RunsFilter",\n [\n ("run_ids", List[str]),\n ("job_name", Optional[str]),\n ("statuses", List[PipelineRunStatus]),\n ("tags", Dict[str, str]),\n ("snapshot_id", Optional[str]),\n ("updated_after", Optional[datetime]),\n ("mode", Optional[str]),\n ("created_before", Optional[datetime]),\n ],\n )\n):\n def __new__(\n cls,\n run_ids: Optional[List[str]] = None,\n job_name: Optional[str] = None,\n statuses: Optional[List[PipelineRunStatus]] = None,\n tags: Optional[Dict[str, str]] = None,\n snapshot_id: Optional[str] = None,\n updated_after: Optional[datetime] = None,\n mode: Optional[str] = None,\n created_before: Optional[datetime] = None,\n pipeline_name: Optional[str] = None, # for backcompat purposes\n ):\n job_name = job_name or pipeline_name\n\n check.invariant(run_ids != [], "When filtering on run ids, a non-empty list must be used.")\n\n return super(RunsFilter, cls).__new__(\n cls,\n run_ids=check.opt_list_param(run_ids, "run_ids", of_type=str),\n job_name=check.opt_str_param(job_name, "job_name"),\n statuses=check.opt_list_param(statuses, "statuses", of_type=PipelineRunStatus),\n tags=check.opt_dict_param(tags, "tags", key_type=str, value_type=str),\n snapshot_id=check.opt_str_param(snapshot_id, "snapshot_id"),\n updated_after=check.opt_inst_param(updated_after, "updated_after", datetime),\n mode=check.opt_str_param(mode, "mode"),\n created_before=check.opt_inst_param(created_before, "created_before", datetime),\n )\n\n @property\n def pipeline_name(self):\n return self.job_name\n\n @staticmethod\n def for_schedule(schedule):\n return RunsFilter(tags=PipelineRun.tags_for_schedule(schedule))\n\n @staticmethod\n def for_partition(partition_set, partition):\n return RunsFilter(tags=PipelineRun.tags_for_partition_set(partition_set, partition))\n\n @staticmethod\n def for_sensor(sensor):\n return RunsFilter(tags=PipelineRun.tags_for_sensor(sensor))\n\n @staticmethod\n def for_backfill(backfill_id):\n return RunsFilter(tags=PipelineRun.tags_for_backfill_id(backfill_id))\n\n\nregister_serdes_tuple_fallbacks({"PipelineRunsFilter": RunsFilter})\n# DEPRECATED - keeping around for backcompat reasons (some folks might have imported directly)\nPipelineRunsFilter = RunsFilter\n\n\nclass JobBucket(NamedTuple):\n job_names: List[str]\n bucket_limit: Optional[int]\n\n\nclass TagBucket(NamedTuple):\n tag_key: str\n tag_values: List[str]\n bucket_limit: Optional[int]\n\n\nclass RunRecord(\n NamedTuple(\n "_RunRecord",\n [\n 
("storage_id", int),\n ("pipeline_run", PipelineRun),\n ("create_timestamp", datetime),\n ("update_timestamp", datetime),\n ("start_time", Optional[float]),\n ("end_time", Optional[float]),\n ],\n )\n):\n """Internal representation of a run record, as stored in a\n :py:class:`~dagster.core.storage.runs.RunStorage`.\n """\n\n def __new__(\n cls,\n storage_id,\n pipeline_run,\n create_timestamp,\n update_timestamp,\n start_time=None,\n end_time=None,\n ):\n return super(RunRecord, cls).__new__(\n cls,\n storage_id=check.int_param(storage_id, "storage_id"),\n pipeline_run=check.inst_param(pipeline_run, "pipeline_run", PipelineRun),\n create_timestamp=check.inst_param(create_timestamp, "create_timestamp", datetime),\n update_timestamp=check.inst_param(update_timestamp, "update_timestamp", datetime),\n # start_time and end_time fields will be populated once the run has started and ended, respectively, but will be None beforehand.\n start_time=check.opt_float_param(start_time, "start_time"),\n end_time=check.opt_float_param(end_time, "end_time"),\n )\n\n\n###################################################################################################\n# GRAVEYARD\n#\n# -|-\n# |\n# _-'~~~~~`-_\n# .' '.\n# | R I P |\n# | |\n# | Execution |\n# | Selector |\n# | |\n# | |\n###################################################################################################\n\n\n@whitelist_for_serdes\nclass ExecutionSelector(\n NamedTuple("_ExecutionSelector", [("name", str), ("solid_subset", Optional[List[str]])])\n):\n """\n Kept here to maintain loading of PipelineRuns from when it was still alive.\n """\n\n def __new__(cls, name: str, solid_subset: Optional[List[str]] = None):\n return super(ExecutionSelector, cls).__new__(\n cls,\n name=check.str_param(name, "name"),\n solid_subset=None\n if solid_subset is None\n else check.list_param(solid_subset, "solid_subset", of_type=str),\n )\n
", "current_page_name": "_modules/dagster/core/storage/pipeline_run", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.pipeline_run"}, "root": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.root

\nimport os\n\nfrom dagster import StringSource, check\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\n\n\n
[docs]class LocalArtifactStorage(ConfigurableClass):\n def __init__(self, base_dir, inst_data=None):\n self._base_dir = base_dir\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @property\n def base_dir(self):\n return self._base_dir\n\n def file_manager_dir(self, run_id):\n check.str_param(run_id, "run_id")\n return os.path.join(self.base_dir, "storage", run_id, "files")\n\n @property\n def storage_dir(self):\n return os.path.join(self.base_dir, "storage")\n\n @property\n def schedules_dir(self):\n return os.path.join(self.base_dir, "schedules")\n\n
[docs] @staticmethod\n def from_config_value(inst_data, config_value):\n return LocalArtifactStorage(inst_data=inst_data, **config_value)
\n\n
[docs] @classmethod\n def config_type(cls):\n return {"base_dir": StringSource}
\n
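A short sketch of the directory layout implied by ``LocalArtifactStorage``; the base directory and run id are placeholders.

.. code-block:: python

    from dagster.core.storage.root import LocalArtifactStorage

    storage = LocalArtifactStorage(base_dir="/tmp/dagster-home")

    print(storage.storage_dir)                 # /tmp/dagster-home/storage
    print(storage.schedules_dir)               # /tmp/dagster-home/schedules
    print(storage.file_manager_dir("abc123"))  # /tmp/dagster-home/storage/abc123/files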
", "current_page_name": "_modules/dagster/core/storage/root", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.root"}, "root_input_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.root_input_manager

\nfrom abc import abstractmethod\nfrom functools import update_wrapper\n\nfrom dagster import check\nfrom dagster.core.definitions.config import is_callable_valid_config_arg\nfrom dagster.core.definitions.definition_config_schema import (\n    convert_user_facing_definition_config_schema,\n)\nfrom dagster.core.definitions.resource_definition import ResourceDefinition, is_context_provided\nfrom dagster.core.storage.input_manager import InputManager\nfrom dagster.utils.backcompat import experimental\n\nfrom ..decorator_utils import get_function_params\n\n\nclass IInputManagerDefinition:\n    @property\n    @abstractmethod\n    def input_config_schema(self):\n        """The schema for per-input configuration for inputs that are managed by this\n        input manager"""\n\n\n
[docs]class RootInputManagerDefinition(ResourceDefinition, IInputManagerDefinition):\n """Definition of a root input manager resource.\n\n Root input managers load op inputs that aren't connected to upstream outputs.\n\n An RootInputManagerDefinition is a :py:class:`ResourceDefinition` whose resource_fn returns an\n :py:class:`RootInputManager`.\n\n The easiest way to create an RootInputManagerDefinition is with the\n :py:func:`@root_input_manager <root_input_manager>` decorator.\n """\n\n def __init__(\n self,\n resource_fn=None,\n config_schema=None,\n description=None,\n input_config_schema=None,\n required_resource_keys=None,\n version=None,\n ):\n self._input_config_schema = convert_user_facing_definition_config_schema(\n input_config_schema\n )\n super(RootInputManagerDefinition, self).__init__(\n resource_fn=resource_fn,\n config_schema=config_schema,\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n )\n\n @property\n def input_config_schema(self):\n return self._input_config_schema\n\n def copy_for_configured(self, description, config_schema, _):\n return RootInputManagerDefinition(\n config_schema=config_schema,\n description=description or self.description,\n resource_fn=self.resource_fn,\n required_resource_keys=self.required_resource_keys,\n input_config_schema=self.input_config_schema,\n )
\n\n\n
[docs]class RootInputManager(InputManager):\n    """RootInputManagers are used to load inputs to ops at the root of a job.\n\n    The easiest way to define a RootInputManager is with the\n    :py:func:`@root_input_manager <root_input_manager>` decorator.\n    """\n\n
[docs]    @abstractmethod\n    def load_input(self, context):\n        """The user-defined read method that loads data given its metadata.\n\n        Args:\n            context (InputContext): The context of the input that is being loaded.\n\n        Returns:\n            Any: The data object.\n        """
\n\n\n
[docs]@experimental\ndef root_input_manager(\n config_schema=None,\n description=None,\n input_config_schema=None,\n required_resource_keys=None,\n version=None,\n):\n """Define a root input manager.\n\n Root input managers load op inputs that aren't connected to upstream outputs.\n\n The decorated function should accept a :py:class:`InputContext` and resource config, and return\n a loaded object that will be passed into one of the inputs of an op.\n\n The decorator produces an :py:class:`RootInputManagerDefinition`.\n\n Args:\n config_schema (Optional[ConfigSchema]): The schema for the resource-level config. If not\n set, Dagster will accept any config provided.\n description (Optional[str]): A human-readable description of the resource.\n input_config_schema (Optional[ConfigSchema]): A schema for the input-level config. Each\n input that uses this input manager can be configured separately using this config.\n If not set, Dagster will accept any config provided.\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by the input\n manager.\n version (Optional[str]): (Experimental) the version of the input manager definition.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import root_input_manager, op, job, In\n\n @root_input_manager\n def csv_loader(_):\n return read_csv("some/path")\n\n @op(ins={"input1": In(root_manager_key="csv_loader_key")})\n def my_op(_, input1):\n do_stuff(input1)\n\n @job(resource_defs={"csv_loader_key": csv_loader})\n def my_job():\n my_op()\n\n @root_input_manager(config_schema={"base_dir": str})\n def csv_loader(context):\n return read_csv(context.resource_config["base_dir"] + "/some/path")\n\n @root_input_manager(input_config_schema={"path": str})\n def csv_loader(context):\n return read_csv(context.config["path"])\n """\n\n if callable(config_schema) and not is_callable_valid_config_arg(config_schema):\n return _InputManagerDecoratorCallable()(config_schema)\n\n def _wrap(load_fn):\n return _InputManagerDecoratorCallable(\n config_schema=config_schema,\n description=description,\n version=version,\n input_config_schema=input_config_schema,\n required_resource_keys=required_resource_keys,\n )(load_fn)\n\n return _wrap
\n\n\nclass RootInputManagerWrapper(RootInputManager):\n def __init__(self, load_fn):\n self._load_fn = load_fn\n\n def load_input(self, context):\n return (\n self._load_fn(context)\n if is_context_provided(get_function_params(self._load_fn))\n else self._load_fn()\n )\n\n\nclass _InputManagerDecoratorCallable:\n def __init__(\n self,\n config_schema=None,\n description=None,\n version=None,\n input_config_schema=None,\n required_resource_keys=None,\n ):\n self.config_schema = config_schema\n self.description = check.opt_str_param(description, "description")\n self.version = check.opt_str_param(version, "version")\n self.input_config_schema = input_config_schema\n self.required_resource_keys = required_resource_keys\n\n def __call__(self, load_fn):\n check.callable_param(load_fn, "load_fn")\n\n def _resource_fn(_):\n return RootInputManagerWrapper(load_fn)\n\n root_input_manager_def = RootInputManagerDefinition(\n resource_fn=_resource_fn,\n config_schema=self.config_schema,\n description=self.description,\n version=self.version,\n input_config_schema=self.input_config_schema,\n required_resource_keys=self.required_resource_keys,\n )\n\n update_wrapper(root_input_manager_def, wrapped=load_fn)\n\n return root_input_manager_def\n
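One pattern not covered by the docstring examples above is combining ``required_resource_keys`` with a root input manager. A hedged sketch follows; the ``database`` resource, the keys, and the op/job names are hypothetical.

.. code-block:: python

    from dagster import In, job, op, resource, root_input_manager


    @resource(config_schema={"conn_string": str})
    def database(init_context):
        # stand-in for a real connection object
        return init_context.resource_config["conn_string"]


    @root_input_manager(required_resource_keys={"database"})
    def table_loader(context):
        # required resources are available on the input context
        conn_string = context.resources.database
        return f"rows loaded from my_table via {conn_string}"  # stand-in for a real query


    @op(ins={"table1": In(root_manager_key="table_loader_key")})
    def process_table(table1):
        return len(table1)


    @job(resource_defs={"table_loader_key": table_loader, "database": database})
    def etl_job():
        process_table()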
", "current_page_name": "_modules/dagster/core/storage/root_input_manager", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.root_input_manager"}, "runs": {"base": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.runs.base

\nfrom abc import ABC, abstractmethod\nfrom typing import Callable, Dict, Iterable, List, Optional, Set, Tuple, Union\n\nfrom dagster.core.events import DagsterEvent\nfrom dagster.core.execution.backfill import BulkActionStatus, PartitionBackfill\nfrom dagster.core.instance import MayHaveInstanceWeakref\nfrom dagster.core.snap import ExecutionPlanSnapshot, PipelineSnapshot\nfrom dagster.core.storage.pipeline_run import (\n    JobBucket,\n    PipelineRun,\n    RunRecord,\n    RunsFilter,\n    TagBucket,\n)\nfrom dagster.daemon.types import DaemonHeartbeat\n\n\n
[docs]class RunStorage(ABC, MayHaveInstanceWeakref):\n """Abstract base class for storing pipeline run history.\n\n Note that run storages using SQL databases as backing stores should implement\n :py:class:`~dagster.core.storage.runs.SqlRunStorage`.\n\n Users should not directly instantiate concrete subclasses of this class; they are instantiated\n by internal machinery when ``dagit`` and ``dagster-graphql`` load, based on the values in the\n ``dagster.yaml`` file in ``$DAGSTER_HOME``. Configuration of concrete subclasses of this class\n should be done by setting values in that file.\n """\n\n @abstractmethod\n def add_run(self, pipeline_run: PipelineRun) -> PipelineRun:\n """Add a run to storage.\n\n If a run already exists with the same ID, raise DagsterRunAlreadyExists\n If the run's snapshot ID does not exist raise DagsterSnapshotDoesNotExist\n\n Args:\n pipeline_run (PipelineRun): The run to add.\n """\n\n @abstractmethod\n def handle_run_event(self, run_id: str, event: DagsterEvent):\n """Update run storage in accordance to a pipeline run related DagsterEvent\n\n Args:\n run_id (str)\n event (DagsterEvent)\n """\n\n @abstractmethod\n def get_runs(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> Iterable[PipelineRun]:\n """Return all the runs present in the storage that match the given filters.\n\n Args:\n filters (Optional[RunsFilter]) -- The\n :py:class:`~dagster.core.storage.pipeline_run.RunsFilter` by which to filter\n runs\n cursor (Optional[str]): Starting cursor (run_id) of range of runs\n limit (Optional[int]): Number of results to get. Defaults to infinite.\n\n Returns:\n List[PipelineRun]\n """\n\n @abstractmethod\n def get_runs_count(self, filters: Optional[RunsFilter] = None) -> int:\n """Return the number of runs present in the storage that match the given filters.\n\n Args:\n filters (Optional[RunsFilter]) -- The\n :py:class:`~dagster.core.storage.pipeline_run.PipelineRunFilter` by which to filter\n runs\n\n Returns:\n int: The number of runs that match the given filters.\n """\n\n @abstractmethod\n def get_run_group(self, run_id: str) -> Optional[Tuple[str, Iterable[PipelineRun]]]:\n """Get the run group to which a given run belongs.\n\n Args:\n run_id (str): If the corresponding run is the descendant of some root run (i.e., there\n is a root_run_id on the :py:class:`PipelineRun`), that root run and all of its\n descendants are returned; otherwise, the group will consist only of the given run\n (a run that does not descend from any root is its own root).\n\n Returns:\n Optional[Tuple[string, List[PipelineRun]]]: If there is a corresponding run group, tuple\n whose first element is the root_run_id and whose second element is a list of all the\n descendent runs. Otherwise `None`.\n """\n\n @abstractmethod\n def get_run_groups(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Dict[str, Dict[str, Union[Iterable[PipelineRun], int]]]:\n """Return all of the run groups present in the storage that include rows matching the\n given filter.\n\n Args:\n filter (Optional[RunsFilter]) -- The\n :py:class:`~dagster.core.storage.pipeline_run.RunsFilter` by which to filter\n runs\n cursor (Optional[str]): Starting cursor (run_id) of range of runs\n limit (Optional[int]): Number of results to get. 
Defaults to infinite.\n\n Returns:\n Dict[str, Dict[str, Union[List[PipelineRun], int]]]: Specifically, a dict of the form\n ``{'pipeline_run_id': {'runs': [PipelineRun, ...], 'count': int}, ...}``. The\n instances of :py:class:`~dagster.core.pipeline_run.PipelineRun` returned in this\n data structure correspond to all of the runs that would have been returned by\n calling :py:meth:`get_run_groups` with the same arguments, plus their corresponding\n root runs, if any. The keys of this structure are the run_ids of all of the root\n runs (a run with no root is its own root). The integer counts are inclusive of all\n of the root runs' children, including those that would not have been returned by\n calling :py:meth:`get_run_groups` with the same arguments, but exclusive of the root\n run itself; i.e., if a run has no children, the count will be 0.\n """\n\n # Note that we could have made the opposite decision here and filtered for root runs\n # matching a given filter, etc., rather than for child runs; so that asking for the last 5\n # run groups would give the last 5 roots and their descendants, rather than the last 5\n # children and their roots. Consider the case where we have just been retrying runs\n # belonging to a group created long ago; it makes sense to bump these to the top of the\n # interface rather than burying them deeply paginated down. Note also that this query can\n # return no more run groups than there are runs in an equivalent call to get_runs, and no\n # more than 2x total instances of PipelineRun.\n\n @abstractmethod\n def get_run_by_id(self, run_id: str) -> Optional[PipelineRun]:\n """Get a run by its id.\n\n Args:\n run_id (str): The id of the run\n\n Returns:\n Optional[PipelineRun]\n """\n\n @abstractmethod\n def get_run_records(\n self,\n filters: Optional[RunsFilter] = None,\n limit: Optional[int] = None,\n order_by: Optional[str] = None,\n ascending: bool = False,\n cursor: Optional[str] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> List[RunRecord]:\n """Return a list of run records stored in the run storage, sorted by the given column in given order.\n\n Args:\n filters (Optional[RunsFilter]): the filter by which to filter runs.\n limit (Optional[int]): Number of results to get. Defaults to infinite.\n order_by (Optional[str]): Name of the column to sort by. Defaults to id.\n ascending (Optional[bool]): Sort the result in ascending order if True, descending\n otherwise. Defaults to descending.\n\n Returns:\n List[RunRecord]: List of run records stored in the run storage.\n """\n\n @abstractmethod\n def get_run_tags(self) -> List[Tuple[str, Set[str]]]:\n """Get a list of tag keys and the values that have been associated with them.\n\n Returns:\n List[Tuple[str, Set[str]]]\n """\n\n @abstractmethod\n def add_run_tags(self, run_id: str, new_tags: Dict[str, str]):\n """Add additional tags for a pipeline run.\n\n Args:\n run_id (str)\n new_tags (Dict[string, string])\n """\n\n @abstractmethod\n def has_run(self, run_id: str) -> bool:\n """Check if the storage contains a run.\n\n Args:\n run_id (str): The id of the run\n\n Returns:\n bool\n """\n\n def add_snapshot(\n self,\n snapshot: Union[PipelineSnapshot, ExecutionPlanSnapshot],\n snapshot_id: Optional[str] = None,\n ):\n """Add a snapshot to the storage.\n\n Args:\n snapshot (Union[PipelineSnapshot, ExecutionPlanSnapshot])\n snapshot_id (Optional[str]): [Internal] The id of the snapshot. If not provided, the\n snapshot id will be generated from a hash of the snapshot. 
This should only be used\n in debugging, where we might want to import a historical run whose snapshots were\n calculated using a different hash function than the current code.\n """\n if isinstance(snapshot, PipelineSnapshot):\n self.add_pipeline_snapshot(snapshot, snapshot_id)\n else:\n self.add_execution_plan_snapshot(snapshot, snapshot_id)\n\n def has_snapshot(self, snapshot_id: str):\n return self.has_pipeline_snapshot(snapshot_id) or self.has_execution_plan_snapshot(\n snapshot_id\n )\n\n @abstractmethod\n def has_pipeline_snapshot(self, pipeline_snapshot_id: str) -> bool:\n """Check to see if storage contains a pipeline snapshot.\n\n Args:\n pipeline_snapshot_id (str): The id of the run.\n\n Returns:\n bool\n """\n\n @abstractmethod\n def add_pipeline_snapshot(\n self, pipeline_snapshot: PipelineSnapshot, snapshot_id: Optional[str] = None\n ) -> str:\n """Add a pipeline snapshot to the run store.\n\n Pipeline snapshots are content-addressable, meaning\n that the ID for a snapshot is a hash based on the\n body of the snapshot. This function returns\n that snapshot ID.\n\n Args:\n pipeline_snapshot (PipelineSnapshot)\n snapshot_id (Optional[str]): [Internal] The id of the snapshot. If not provided, the\n snapshot id will be generated from a hash of the snapshot. This should only be used\n in debugging, where we might want to import a historical run whose snapshots were\n calculated using a different hash function than the current code.\n\n Return:\n str: The pipeline_snapshot_id\n """\n\n @abstractmethod\n def get_pipeline_snapshot(self, pipeline_snapshot_id: str) -> PipelineSnapshot:\n """Fetch a snapshot by ID\n\n Args:\n pipeline_snapshot_id (str)\n\n Returns:\n PipelineSnapshot\n """\n\n @abstractmethod\n def has_execution_plan_snapshot(self, execution_plan_snapshot_id: str) -> bool:\n """Check to see if storage contains an execution plan snapshot.\n\n Args:\n execution_plan_snapshot_id (str): The id of the execution plan.\n\n Returns:\n bool\n """\n\n @abstractmethod\n def add_execution_plan_snapshot(\n self, execution_plan_snapshot: ExecutionPlanSnapshot, snapshot_id: Optional[str] = None\n ) -> str:\n """Add an execution plan snapshot to the run store.\n\n Execution plan snapshots are content-addressable, meaning\n that the ID for a snapshot is a hash based on the\n body of the snapshot. This function returns\n that snapshot ID.\n\n Args:\n execution_plan_snapshot (ExecutionPlanSnapshot)\n snapshot_id (Optional[str]): [Internal] The id of the snapshot. If not provided, the\n snapshot id will be generated from a hash of the snapshot. 
This should only be used\n in debugging, where we might want to import a historical run whose snapshots were\n calculated using a different hash function than the current code.\n\n Return:\n str: The execution_plan_snapshot_id\n """\n\n @abstractmethod\n def get_execution_plan_snapshot(self, execution_plan_snapshot_id: str) -> ExecutionPlanSnapshot:\n """Fetch a snapshot by ID\n\n Args:\n execution_plan_snapshot_id (str)\n\n Returns:\n ExecutionPlanSnapshot\n """\n\n @abstractmethod\n def wipe(self):\n """Clears the run storage."""\n\n @abstractmethod\n def delete_run(self, run_id: str):\n """Remove a run from storage"""\n\n @property\n def supports_bucket_queries(self):\n return True\n\n def migrate(self, print_fn: Optional[Callable] = None, force_rebuild_all: bool = False):\n """Call this method to run any required data migrations"""\n\n def optimize(self, print_fn: Optional[Callable] = None, force_rebuild_all: bool = False):\n """Call this method to run any optional data migrations for optimized reads"""\n\n def dispose(self):\n """Explicit lifecycle management."""\n\n def optimize_for_dagit(self, statement_timeout: int):\n """Allows for optimizing database connection / use in the context of a long lived dagit process"""\n\n # Daemon Heartbeat Storage\n #\n # Holds heartbeats from the Dagster Daemon so that other system components can alert when it's not\n # alive.\n # This is temporarily placed along with run storage to avoid adding a new instance concept. It\n # should be split out once all metadata storages are configured together.\n\n @abstractmethod\n def add_daemon_heartbeat(self, daemon_heartbeat: DaemonHeartbeat):\n """Called on a regular interval by the daemon"""\n\n @abstractmethod\n def get_daemon_heartbeats(self) -> Dict[str, DaemonHeartbeat]:\n """Latest heartbeats of all daemon types"""\n\n @abstractmethod\n def wipe_daemon_heartbeats(self):\n """Wipe all daemon heartbeats"""\n\n # Backfill storage\n @abstractmethod\n def get_backfills(\n self,\n status: Optional[BulkActionStatus] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> List[PartitionBackfill]:\n """Get a list of partition backfills"""\n\n @abstractmethod\n def get_backfill(self, backfill_id: str) -> Optional[PartitionBackfill]:\n """Get the partition backfill of the given backfill id."""\n\n @abstractmethod\n def add_backfill(self, partition_backfill: PartitionBackfill):\n """Add partition backfill to run storage"""\n\n @abstractmethod\n def update_backfill(self, partition_backfill: PartitionBackfill):\n """Update a partition backfill in run storage"""
\n
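The abstract interface above is normally exercised through a ``DagsterInstance``, which delegates to whichever ``RunStorage`` is configured. A rough sketch, assuming a configured ``$DAGSTER_HOME``; the job name is a placeholder.

.. code-block:: python

    from dagster import DagsterInstance
    from dagster.core.storage.pipeline_run import PipelineRunStatus, RunsFilter

    instance = DagsterInstance.get()  # loads the run storage configured in dagster.yaml

    # Page through failed runs of a job, ten at a time, using the last run id as the cursor.
    failed = instance.get_runs(
        filters=RunsFilter(job_name="my_job", statuses=[PipelineRunStatus.FAILURE]),
        limit=10,
    )
    if failed:
        next_page = instance.get_runs(
            filters=RunsFilter(job_name="my_job", statuses=[PipelineRunStatus.FAILURE]),
            cursor=failed[-1].run_id,
            limit=10,
        )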
", "current_page_name": "_modules/dagster/core/storage/runs/base", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.runs.base"}, "sql_run_storage": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.runs.sql_run_storage

\nimport logging\nimport uuid\nimport zlib\nfrom abc import abstractmethod\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom enum import Enum\nfrom typing import Callable, Dict, Iterable, List, Optional, Set, Tuple, Union\n\nimport pendulum\nimport sqlalchemy as db\n\nfrom dagster import check\nfrom dagster.core.errors import (\n    DagsterInvariantViolationError,\n    DagsterRunAlreadyExists,\n    DagsterRunNotFoundError,\n    DagsterSnapshotDoesNotExist,\n)\nfrom dagster.core.events import EVENT_TYPE_TO_PIPELINE_RUN_STATUS, DagsterEvent, DagsterEventType\nfrom dagster.core.execution.backfill import BulkActionStatus, PartitionBackfill\nfrom dagster.core.snap import (\n    ExecutionPlanSnapshot,\n    PipelineSnapshot,\n    create_execution_plan_snapshot_id,\n    create_pipeline_snapshot_id,\n)\nfrom dagster.core.storage.tags import PARTITION_NAME_TAG, PARTITION_SET_TAG, ROOT_RUN_ID_TAG\nfrom dagster.daemon.types import DaemonHeartbeat\nfrom dagster.serdes import (\n    deserialize_as,\n    deserialize_json_to_dagster_namedtuple,\n    serialize_dagster_namedtuple,\n)\nfrom dagster.seven import JSONDecodeError\nfrom dagster.utils import merge_dicts, utc_datetime_from_timestamp\n\nfrom ..pipeline_run import JobBucket, PipelineRun, RunRecord, RunsFilter, TagBucket\nfrom .base import RunStorage\nfrom .migration import OPTIONAL_DATA_MIGRATIONS, REQUIRED_DATA_MIGRATIONS, RUN_PARTITIONS\nfrom .schema import (\n    BulkActionsTable,\n    DaemonHeartbeatsTable,\n    InstanceInfo,\n    RunTagsTable,\n    RunsTable,\n    SecondaryIndexMigrationTable,\n    SnapshotsTable,\n)\n\n\nclass SnapshotType(Enum):\n    PIPELINE = "PIPELINE"\n    EXECUTION_PLAN = "EXECUTION_PLAN"\n\n\n
[docs]class SqlRunStorage(RunStorage): # pylint: disable=no-init\n """Base class for SQL based run storages"""\n\n @abstractmethod\n def connect(self):\n """Context manager yielding a sqlalchemy.engine.Connection."""\n\n @abstractmethod\n def upgrade(self):\n """This method should perform any schema or data migrations necessary to bring an\n out-of-date instance of the storage up to date.\n """\n\n def fetchall(self, query):\n with self.connect() as conn:\n result_proxy = conn.execute(query)\n res = result_proxy.fetchall()\n result_proxy.close()\n\n return res\n\n def fetchone(self, query):\n with self.connect() as conn:\n result_proxy = conn.execute(query)\n row = result_proxy.fetchone()\n result_proxy.close()\n\n return row\n\n def add_run(self, pipeline_run: PipelineRun) -> PipelineRun:\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n\n if pipeline_run.pipeline_snapshot_id and not self.has_pipeline_snapshot(\n pipeline_run.pipeline_snapshot_id\n ):\n raise DagsterSnapshotDoesNotExist(\n "Snapshot {ss_id} does not exist in run storage".format(\n ss_id=pipeline_run.pipeline_snapshot_id\n )\n )\n\n has_tags = pipeline_run.tags and len(pipeline_run.tags) > 0\n partition = pipeline_run.tags.get(PARTITION_NAME_TAG) if has_tags else None\n partition_set = pipeline_run.tags.get(PARTITION_SET_TAG) if has_tags else None\n\n runs_insert = RunsTable.insert().values( # pylint: disable=no-value-for-parameter\n run_id=pipeline_run.run_id,\n pipeline_name=pipeline_run.pipeline_name,\n status=pipeline_run.status.value,\n run_body=serialize_dagster_namedtuple(pipeline_run),\n snapshot_id=pipeline_run.pipeline_snapshot_id,\n partition=partition,\n partition_set=partition_set,\n )\n with self.connect() as conn:\n try:\n conn.execute(runs_insert)\n except db.exc.IntegrityError as exc:\n raise DagsterRunAlreadyExists from exc\n\n if pipeline_run.tags and len(pipeline_run.tags) > 0:\n conn.execute(\n RunTagsTable.insert(), # pylint: disable=no-value-for-parameter\n [\n dict(run_id=pipeline_run.run_id, key=k, value=v)\n for k, v in pipeline_run.tags.items()\n ],\n )\n\n return pipeline_run\n\n def handle_run_event(self, run_id: str, event: DagsterEvent):\n check.str_param(run_id, "run_id")\n check.inst_param(event, "event", DagsterEvent)\n\n if event.event_type not in EVENT_TYPE_TO_PIPELINE_RUN_STATUS:\n return\n\n run = self.get_run_by_id(run_id)\n if not run:\n # TODO log?\n return\n\n new_pipeline_status = EVENT_TYPE_TO_PIPELINE_RUN_STATUS[event.event_type]\n\n run_stats_cols_in_index = self.has_run_stats_index_cols()\n\n kwargs = {}\n\n # consider changing the `handle_run_event` signature to get timestamp off of the\n # EventLogEntry instead of the DagsterEvent, for consistency\n now = pendulum.now("UTC")\n\n if run_stats_cols_in_index and event.event_type == DagsterEventType.PIPELINE_START:\n kwargs["start_time"] = now.timestamp()\n\n if run_stats_cols_in_index and event.event_type in {\n DagsterEventType.PIPELINE_CANCELED,\n DagsterEventType.PIPELINE_FAILURE,\n DagsterEventType.PIPELINE_SUCCESS,\n }:\n kwargs["end_time"] = now.timestamp()\n\n with self.connect() as conn:\n\n conn.execute(\n RunsTable.update() # pylint: disable=no-value-for-parameter\n .where(RunsTable.c.run_id == run_id)\n .values(\n status=new_pipeline_status.value,\n run_body=serialize_dagster_namedtuple(run.with_status(new_pipeline_status)),\n update_timestamp=now,\n **kwargs,\n )\n )\n\n def _row_to_run(self, row: Tuple) -> PipelineRun:\n return deserialize_as(row[0], PipelineRun)\n\n def _rows_to_runs(self, rows: 
Iterable[Tuple]) -> List[PipelineRun]:\n return list(map(self._row_to_run, rows))\n\n def _add_cursor_limit_to_query(\n self,\n query,\n cursor: Optional[str],\n limit: Optional[int],\n order_by: Optional[str],\n ascending: Optional[bool],\n ):\n """Helper function to deal with cursor/limit pagination args"""\n\n if cursor:\n cursor_query = db.select([RunsTable.c.id]).where(RunsTable.c.run_id == cursor)\n query = query.where(RunsTable.c.id < cursor_query)\n\n if limit:\n query = query.limit(limit)\n\n sorting_column = getattr(RunsTable.c, order_by) if order_by else RunsTable.c.id\n direction = db.asc if ascending else db.desc\n query = query.order_by(direction(sorting_column))\n\n return query\n\n def _add_filters_to_query(self, query, filters: RunsFilter):\n check.inst_param(filters, "filters", RunsFilter)\n\n if filters.run_ids:\n query = query.where(RunsTable.c.run_id.in_(filters.run_ids))\n\n if filters.job_name:\n query = query.where(RunsTable.c.pipeline_name == filters.job_name)\n\n if filters.mode:\n query = query.where(RunsTable.c.mode == filters.mode)\n\n if filters.statuses:\n query = query.where(\n RunsTable.c.status.in_([status.value for status in filters.statuses])\n )\n\n if filters.tags:\n query = query.where(\n db.or_(\n *(\n db.and_(RunTagsTable.c.key == key, RunTagsTable.c.value == value)\n for key, value in filters.tags.items()\n )\n )\n ).group_by(RunsTable.c.run_body, RunsTable.c.id)\n\n if len(filters.tags) > 0:\n query = query.having(db.func.count(RunsTable.c.run_id) == len(filters.tags))\n\n if filters.snapshot_id:\n query = query.where(RunsTable.c.snapshot_id == filters.snapshot_id)\n\n if filters.updated_after:\n query = query.where(RunsTable.c.update_timestamp > filters.updated_after)\n\n if filters.created_before:\n query = query.where(RunsTable.c.create_timestamp < filters.created_before)\n\n return query\n\n def _runs_query(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n columns: Optional[List[str]] = None,\n order_by: Optional[str] = None,\n ascending: bool = False,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ):\n filters = check.opt_inst_param(filters, "filters", RunsFilter, default=RunsFilter())\n check.opt_str_param(cursor, "cursor")\n check.opt_int_param(limit, "limit")\n check.opt_list_param(columns, "columns")\n check.opt_str_param(order_by, "order_by")\n check.opt_bool_param(ascending, "ascending")\n\n if columns is None:\n columns = ["run_body"]\n\n if bucket_by:\n if limit or cursor:\n check.failed("cannot specify bucket_by and limit/cursor at the same time")\n return self._bucketed_runs_query(bucket_by, filters, columns, order_by, ascending)\n\n query_columns = [getattr(RunsTable.c, column) for column in columns]\n if filters.tags:\n base_query = db.select(query_columns).select_from(\n RunsTable.join(RunTagsTable, RunsTable.c.run_id == RunTagsTable.c.run_id)\n )\n else:\n base_query = db.select(query_columns).select_from(RunsTable)\n\n base_query = self._add_filters_to_query(base_query, filters)\n return self._add_cursor_limit_to_query(base_query, cursor, limit, order_by, ascending)\n\n def _bucket_rank_column(self, bucket_by, order_by, ascending):\n check.inst_param(bucket_by, "bucket_by", (JobBucket, TagBucket))\n check.invariant(\n self.supports_bucket_queries, "Bucket queries are not supported by this storage layer"\n )\n sorting_column = getattr(RunsTable.c, order_by) if order_by else RunsTable.c.id\n direction = db.asc if ascending else db.desc\n bucket_column = 
(\n RunsTable.c.pipeline_name if isinstance(bucket_by, JobBucket) else RunTagsTable.c.value\n )\n return (\n db.func.rank()\n .over(order_by=direction(sorting_column), partition_by=bucket_column)\n .label("rank")\n )\n\n def _bucketed_runs_query(\n self,\n bucket_by: Union[JobBucket, TagBucket],\n filters: RunsFilter,\n columns: List[str],\n order_by: Optional[str] = None,\n ascending: bool = False,\n ):\n bucket_rank = self._bucket_rank_column(bucket_by, order_by, ascending)\n query_columns = [getattr(RunsTable.c, column) for column in columns] + [bucket_rank]\n\n if isinstance(bucket_by, JobBucket):\n # bucketing by job\n base_query = (\n db.select(query_columns)\n .select_from(\n RunsTable.join(RunTagsTable, RunsTable.c.run_id == RunTagsTable.c.run_id)\n if filters.tags\n else RunsTable\n )\n .where(RunsTable.c.pipeline_name.in_(bucket_by.job_names))\n )\n base_query = self._add_filters_to_query(base_query, filters)\n\n elif not filters.tags:\n # bucketing by tag, no tag filters\n base_query = (\n db.select(query_columns)\n .select_from(\n RunsTable.join(RunTagsTable, RunsTable.c.run_id == RunTagsTable.c.run_id)\n )\n .where(RunTagsTable.c.key == bucket_by.tag_key)\n .where(RunTagsTable.c.value.in_(bucket_by.tag_values))\n )\n base_query = self._add_filters_to_query(base_query, filters)\n\n else:\n # there are tag filters as well as tag buckets, so we have to apply the tag filters in\n # a separate join\n filtered_query = db.select([RunsTable.c.run_id]).select_from(\n RunsTable.join(RunTagsTable, RunsTable.c.run_id == RunTagsTable.c.run_id)\n )\n filtered_query = self._add_filters_to_query(filtered_query, filters)\n filtered_query = filtered_query.alias("filtered_query")\n\n base_query = (\n db.select(query_columns)\n .select_from(\n RunsTable.join(RunTagsTable, RunsTable.c.run_id == RunTagsTable.c.run_id).join(\n filtered_query, RunsTable.c.run_id == filtered_query.c.run_id\n )\n )\n .where(RunTagsTable.c.key == bucket_by.tag_key)\n .where(RunTagsTable.c.value.in_(bucket_by.tag_values))\n )\n\n subquery = base_query.alias("subquery")\n\n # select all the columns, but skip the bucket_rank column, which is only used for applying\n # the limit / order\n subquery_columns = [getattr(subquery.c, column) for column in columns]\n query = db.select(subquery_columns).order_by(subquery.c.rank.asc())\n if bucket_by.bucket_limit:\n query = query.where(subquery.c.rank <= bucket_by.bucket_limit)\n\n return query\n\n def get_runs(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> List[PipelineRun]:\n query = self._runs_query(filters, cursor, limit, bucket_by=bucket_by)\n rows = self.fetchall(query)\n return self._rows_to_runs(rows)\n\n def get_runs_count(self, filters: Optional[RunsFilter] = None) -> int:\n subquery = self._runs_query(filters=filters).alias("subquery")\n\n # We use an alias here because Postgres requires subqueries to be\n # aliased.\n subquery = subquery.alias("subquery")\n\n query = db.select([db.func.count()]).select_from(subquery)\n rows = self.fetchall(query)\n count = rows[0][0]\n return count\n\n def get_run_by_id(self, run_id: str) -> Optional[PipelineRun]:\n """Get a run by its id.\n\n Args:\n run_id (str): The id of the run\n\n Returns:\n Optional[PipelineRun]\n """\n check.str_param(run_id, "run_id")\n\n query = db.select([RunsTable.c.run_body]).where(RunsTable.c.run_id == run_id)\n rows = self.fetchall(query)\n return deserialize_as(rows[0][0], 
PipelineRun) if len(rows) else None\n\n def get_run_records(\n self,\n filters: Optional[RunsFilter] = None,\n limit: Optional[int] = None,\n order_by: Optional[str] = None,\n ascending: bool = False,\n cursor: Optional[str] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> List[RunRecord]:\n filters = check.opt_inst_param(filters, "filters", RunsFilter, default=RunsFilter())\n check.opt_int_param(limit, "limit")\n\n columns = ["id", "run_body", "create_timestamp", "update_timestamp"]\n\n if self.has_run_stats_index_cols():\n columns += ["start_time", "end_time"]\n # only fetch columns we use to build RunRecord\n query = self._runs_query(\n filters=filters,\n limit=limit,\n columns=columns,\n order_by=order_by,\n ascending=ascending,\n cursor=cursor,\n bucket_by=bucket_by,\n )\n\n rows = self.fetchall(query)\n return [\n RunRecord(\n storage_id=check.int_param(row["id"], "id"),\n pipeline_run=deserialize_as(\n check.str_param(row["run_body"], "run_body"), PipelineRun\n ),\n create_timestamp=check.inst(row["create_timestamp"], datetime),\n update_timestamp=check.inst(row["update_timestamp"], datetime),\n start_time=check.opt_inst(row["start_time"], float)\n if "start_time" in row\n else None,\n end_time=check.opt_inst(row["end_time"], float) if "end_time" in row else None,\n )\n for row in rows\n ]\n\n def get_run_tags(self) -> List[Tuple[str, Set[str]]]:\n result = defaultdict(set)\n query = db.select([RunTagsTable.c.key, RunTagsTable.c.value]).distinct(\n RunTagsTable.c.key, RunTagsTable.c.value\n )\n rows = self.fetchall(query)\n for r in rows:\n result[r[0]].add(r[1])\n return sorted(list([(k, v) for k, v in result.items()]), key=lambda x: x[0])\n\n def add_run_tags(self, run_id: str, new_tags: Dict[str, str]):\n check.str_param(run_id, "run_id")\n check.dict_param(new_tags, "new_tags", key_type=str, value_type=str)\n\n run = self.get_run_by_id(run_id)\n if not run:\n raise DagsterRunNotFoundError(\n f"Run {run_id} was not found in instance.", invalid_run_id=run_id\n )\n current_tags = run.tags if run.tags else {}\n\n all_tags = merge_dicts(current_tags, new_tags)\n partition = all_tags.get(PARTITION_NAME_TAG)\n partition_set = all_tags.get(PARTITION_SET_TAG)\n\n with self.connect() as conn:\n conn.execute(\n RunsTable.update() # pylint: disable=no-value-for-parameter\n .where(RunsTable.c.run_id == run_id)\n .values(\n run_body=serialize_dagster_namedtuple(\n run.with_tags(merge_dicts(current_tags, new_tags))\n ),\n partition=partition,\n partition_set=partition_set,\n update_timestamp=pendulum.now("UTC"),\n )\n )\n\n current_tags_set = set(current_tags.keys())\n new_tags_set = set(new_tags.keys())\n\n existing_tags = current_tags_set & new_tags_set\n added_tags = new_tags_set.difference(existing_tags)\n\n for tag in existing_tags:\n conn.execute(\n RunTagsTable.update() # pylint: disable=no-value-for-parameter\n .where(db.and_(RunTagsTable.c.run_id == run_id, RunTagsTable.c.key == tag))\n .values(value=new_tags[tag])\n )\n\n if added_tags:\n conn.execute(\n RunTagsTable.insert(), # pylint: disable=no-value-for-parameter\n [dict(run_id=run_id, key=tag, value=new_tags[tag]) for tag in added_tags],\n )\n\n def get_run_group(self, run_id: str) -> Optional[Tuple[str, Iterable[PipelineRun]]]:\n check.str_param(run_id, "run_id")\n pipeline_run = self.get_run_by_id(run_id)\n if not pipeline_run:\n raise DagsterRunNotFoundError(\n f"Run {run_id} was not found in instance.", invalid_run_id=run_id\n )\n\n # find root_run\n root_run_id = pipeline_run.root_run_id if 
pipeline_run.root_run_id else pipeline_run.run_id\n root_run = self.get_run_by_id(root_run_id)\n if not root_run:\n raise DagsterRunNotFoundError(\n f"Run id {root_run} set as root run id for run {run_id} was not found in instance.",\n invalid_run_id=root_run,\n )\n\n # root_run_id to run_id 1:1 mapping\n # https://github.com/dagster-io/dagster/issues/2495\n # Note: we currently use tags to persist the run group info\n root_to_run = (\n db.select(\n [RunTagsTable.c.value.label("root_run_id"), RunTagsTable.c.run_id.label("run_id")]\n )\n .where(\n db.and_(RunTagsTable.c.key == ROOT_RUN_ID_TAG, RunTagsTable.c.value == root_run_id)\n )\n .alias("root_to_run")\n )\n # get run group\n run_group_query = (\n db.select([RunsTable.c.run_body])\n .select_from(\n root_to_run.join(\n RunsTable,\n root_to_run.c.run_id == RunsTable.c.run_id,\n isouter=True,\n )\n )\n .alias("run_group")\n )\n\n with self.connect() as conn:\n res = conn.execute(run_group_query)\n run_group = self._rows_to_runs(res)\n\n return (root_run_id, [root_run] + run_group)\n\n def get_run_groups(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Dict[str, Dict[str, Union[Iterable[PipelineRun], int]]]:\n # The runs that would be returned by calling RunStorage.get_runs with the same arguments\n runs = self._runs_query(\n filters=filters, cursor=cursor, limit=limit, columns=["run_body", "run_id"]\n ).alias("runs")\n\n # Gets us the run_id and associated root_run_id for every run in storage that is a\n # descendant run of some root\n #\n # pseudosql:\n # with all_descendant_runs as (\n # select *\n # from run_tags\n # where key = @ROOT_RUN_ID_TAG\n # )\n\n all_descendant_runs = (\n db.select([RunTagsTable])\n .where(RunTagsTable.c.key == ROOT_RUN_ID_TAG)\n .alias("all_descendant_runs")\n )\n\n # Augment the runs in our query, for those runs that are the descendant of some root run,\n # with the root_run_id\n #\n # pseudosql:\n #\n # with runs_augmented as (\n # select\n # runs.run_id as run_id,\n # all_descendant_runs.value as root_run_id\n # from runs\n # left outer join all_descendant_runs\n # on all_descendant_runs.run_id = runs.run_id\n # )\n\n runs_augmented = (\n db.select(\n [\n runs.c.run_id.label("run_id"),\n all_descendant_runs.c.value.label("root_run_id"),\n ]\n )\n .select_from(\n runs.join(\n all_descendant_runs,\n all_descendant_runs.c.run_id == RunsTable.c.run_id,\n isouter=True,\n )\n )\n .alias("runs_augmented")\n )\n\n # Get all the runs our query will return. This includes runs as well as their root runs.\n #\n # pseudosql:\n #\n # with runs_and_root_runs as (\n # select runs.run_id as run_id\n # from runs, runs_augmented\n # where\n # runs.run_id = runs_augmented.run_id or\n # runs.run_id = runs_augmented.root_run_id\n # )\n\n runs_and_root_runs = (\n db.select([RunsTable.c.run_id.label("run_id")])\n .select_from(runs_augmented)\n .where(\n db.or_(\n RunsTable.c.run_id == runs_augmented.c.run_id,\n RunsTable.c.run_id == runs_augmented.c.root_run_id,\n )\n )\n .distinct(RunsTable.c.run_id)\n ).alias("runs_and_root_runs")\n\n # We count the descendants of all of the runs in our query that are roots so that\n # we can accurately display when a root run has more descendants than are returned by this\n # query and afford a drill-down. 
This might be an unnecessary complication, but the\n # alternative isn't obvious -- we could go and fetch *all* the runs in any group that we're\n # going to return in this query, and then append those.\n #\n # pseudosql:\n #\n # select runs.run_body, count(all_descendant_runs.id) as child_counts\n # from runs\n # join runs_and_root_runs on runs.run_id = runs_and_root_runs.run_id\n # left outer join all_descendant_runs\n # on all_descendant_runs.value = runs_and_root_runs.run_id\n # group by runs.run_body\n # order by child_counts desc\n\n runs_and_root_runs_with_descendant_counts = (\n db.select(\n [\n RunsTable.c.run_body,\n db.func.count(all_descendant_runs.c.id).label("child_counts"),\n ]\n )\n .select_from(\n RunsTable.join(\n runs_and_root_runs, RunsTable.c.run_id == runs_and_root_runs.c.run_id\n ).join(\n all_descendant_runs,\n all_descendant_runs.c.value == runs_and_root_runs.c.run_id,\n isouter=True,\n )\n )\n .group_by(RunsTable.c.run_body)\n .order_by(db.desc(db.column("child_counts")))\n )\n\n with self.connect() as conn:\n res = conn.execute(runs_and_root_runs_with_descendant_counts).fetchall()\n\n # Postprocess: descendant runs get aggregated with their roots\n root_run_id_to_group: Dict[str, List[PipelineRun]] = defaultdict(list)\n root_run_id_to_count: Dict[str, int] = defaultdict(int)\n for (run_body, count) in res:\n row = (run_body,)\n pipeline_run = self._row_to_run(row)\n root_run_id = pipeline_run.get_root_run_id()\n if root_run_id is not None:\n root_run_id_to_group[root_run_id].append(pipeline_run)\n else:\n root_run_id_to_group[pipeline_run.run_id].append(pipeline_run)\n root_run_id_to_count[pipeline_run.run_id] = count + 1\n\n return {\n root_run_id: {\n "runs": list(run_group),\n "count": root_run_id_to_count[root_run_id],\n }\n for root_run_id, run_group in root_run_id_to_group.items()\n }\n\n def has_run(self, run_id: str) -> bool:\n check.str_param(run_id, "run_id")\n return bool(self.get_run_by_id(run_id))\n\n def delete_run(self, run_id: str):\n check.str_param(run_id, "run_id")\n query = db.delete(RunsTable).where(RunsTable.c.run_id == run_id)\n with self.connect() as conn:\n conn.execute(query)\n\n def has_pipeline_snapshot(self, pipeline_snapshot_id: str) -> bool:\n check.str_param(pipeline_snapshot_id, "pipeline_snapshot_id")\n return self._has_snapshot_id(pipeline_snapshot_id)\n\n def add_pipeline_snapshot(\n self, pipeline_snapshot: PipelineSnapshot, snapshot_id: Optional[str] = None\n ) -> str:\n check.inst_param(pipeline_snapshot, "pipeline_snapshot", PipelineSnapshot)\n check.opt_str_param(snapshot_id, "snapshot_id")\n\n if not snapshot_id:\n snapshot_id = create_pipeline_snapshot_id(pipeline_snapshot)\n\n return self._add_snapshot(\n snapshot_id=snapshot_id,\n snapshot_obj=pipeline_snapshot,\n snapshot_type=SnapshotType.PIPELINE,\n )\n\n def get_pipeline_snapshot(self, pipeline_snapshot_id: str) -> PipelineSnapshot:\n check.str_param(pipeline_snapshot_id, "pipeline_snapshot_id")\n return self._get_snapshot(pipeline_snapshot_id)\n\n def has_execution_plan_snapshot(self, execution_plan_snapshot_id: str) -> bool:\n check.str_param(execution_plan_snapshot_id, "execution_plan_snapshot_id")\n return bool(self.get_execution_plan_snapshot(execution_plan_snapshot_id))\n\n def add_execution_plan_snapshot(\n self, execution_plan_snapshot: ExecutionPlanSnapshot, snapshot_id: Optional[str] = None\n ) -> str:\n check.inst_param(execution_plan_snapshot, "execution_plan_snapshot", ExecutionPlanSnapshot)\n check.opt_str_param(snapshot_id, "snapshot_id")\n\n if not 
snapshot_id:\n snapshot_id = create_execution_plan_snapshot_id(execution_plan_snapshot)\n\n return self._add_snapshot(\n snapshot_id=snapshot_id,\n snapshot_obj=execution_plan_snapshot,\n snapshot_type=SnapshotType.EXECUTION_PLAN,\n )\n\n def get_execution_plan_snapshot(self, execution_plan_snapshot_id: str) -> ExecutionPlanSnapshot:\n check.str_param(execution_plan_snapshot_id, "execution_plan_snapshot_id")\n return self._get_snapshot(execution_plan_snapshot_id)\n\n def _add_snapshot(self, snapshot_id: str, snapshot_obj, snapshot_type: SnapshotType) -> str:\n check.str_param(snapshot_id, "snapshot_id")\n check.not_none_param(snapshot_obj, "snapshot_obj")\n check.inst_param(snapshot_type, "snapshot_type", SnapshotType)\n\n with self.connect() as conn:\n snapshot_insert = (\n SnapshotsTable.insert().values( # pylint: disable=no-value-for-parameter\n snapshot_id=snapshot_id,\n snapshot_body=zlib.compress(\n serialize_dagster_namedtuple(snapshot_obj).encode("utf-8")\n ),\n snapshot_type=snapshot_type.value,\n )\n )\n conn.execute(snapshot_insert)\n return snapshot_id\n\n def get_run_storage_id(self) -> str:\n query = db.select([InstanceInfo.c.run_storage_id])\n row = self.fetchone(query)\n if not row:\n run_storage_id = str(uuid.uuid4())\n with self.connect() as conn:\n conn.execute(InstanceInfo.insert().values(run_storage_id=run_storage_id))\n return run_storage_id\n else:\n return row[0]\n\n def _has_snapshot_id(self, snapshot_id: str) -> bool:\n query = db.select([SnapshotsTable.c.snapshot_id]).where(\n SnapshotsTable.c.snapshot_id == snapshot_id\n )\n\n row = self.fetchone(query)\n\n return bool(row)\n\n def _get_snapshot(self, snapshot_id: str):\n query = db.select([SnapshotsTable.c.snapshot_body]).where(\n SnapshotsTable.c.snapshot_id == snapshot_id\n )\n\n row = self.fetchone(query)\n\n return defensively_unpack_pipeline_snapshot_query(logging, row) if row else None\n\n def _get_partition_runs(\n self, partition_set_name: str, partition_name: str\n ) -> List[PipelineRun]:\n # utility method to help test reads off of the partition column\n if not self.has_built_index(RUN_PARTITIONS):\n # query by tags\n return self.get_runs(\n filters=RunsFilter(\n tags={\n PARTITION_SET_TAG: partition_set_name,\n PARTITION_NAME_TAG: partition_name,\n }\n )\n )\n else:\n query = (\n self._runs_query()\n .where(RunsTable.c.partition == partition_name)\n .where(RunsTable.c.partition_set == partition_set_name)\n )\n rows = self.fetchall(query)\n return self._rows_to_runs(rows)\n\n # Tracking data migrations over secondary indexes\n\n def _execute_data_migrations(\n self, migrations, print_fn: Optional[Callable] = None, force_rebuild_all: bool = False\n ):\n for migration_name, migration_fn in migrations.items():\n if self.has_built_index(migration_name):\n if not force_rebuild_all:\n if print_fn:\n print_fn(f"Skipping already applied data migration: {migration_name}")\n continue\n if print_fn:\n print_fn(f"Starting data migration: {migration_name}")\n migration_fn()(self, print_fn)\n self.mark_index_built(migration_name)\n if print_fn:\n print_fn(f"Finished data migration: {migration_name}")\n\n def migrate(self, print_fn: Optional[Callable] = None, force_rebuild_all: bool = False):\n self._execute_data_migrations(REQUIRED_DATA_MIGRATIONS, print_fn, force_rebuild_all)\n\n def optimize(self, print_fn: Optional[Callable] = None, force_rebuild_all: bool = False):\n self._execute_data_migrations(OPTIONAL_DATA_MIGRATIONS, print_fn, force_rebuild_all)\n\n def has_built_index(self, migration_name: str) -> bool:\n 
query = (\n db.select([1])\n .where(SecondaryIndexMigrationTable.c.name == migration_name)\n .where(SecondaryIndexMigrationTable.c.migration_completed != None)\n .limit(1)\n )\n with self.connect() as conn:\n results = conn.execute(query).fetchall()\n\n return len(results) > 0\n\n def mark_index_built(self, migration_name: str):\n query = (\n SecondaryIndexMigrationTable.insert().values( # pylint: disable=no-value-for-parameter\n name=migration_name,\n migration_completed=datetime.now(),\n )\n )\n with self.connect() as conn:\n try:\n conn.execute(query)\n except db.exc.IntegrityError:\n conn.execute(\n SecondaryIndexMigrationTable.update() # pylint: disable=no-value-for-parameter\n .where(SecondaryIndexMigrationTable.c.name == migration_name)\n .values(migration_completed=datetime.now())\n )\n\n # Checking for migrations\n\n def has_run_stats_index_cols(self):\n with self.connect() as conn:\n column_names = [x.get("name") for x in db.inspect(conn).get_columns(RunsTable.name)]\n return "start_time" in column_names and "end_time" in column_names\n\n # Daemon heartbeats\n\n def add_daemon_heartbeat(self, daemon_heartbeat: DaemonHeartbeat):\n with self.connect() as conn:\n\n # insert, or update if already present\n try:\n conn.execute(\n DaemonHeartbeatsTable.insert().values( # pylint: disable=no-value-for-parameter\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_type=daemon_heartbeat.daemon_type,\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_dagster_namedtuple(daemon_heartbeat),\n )\n )\n except db.exc.IntegrityError:\n conn.execute(\n DaemonHeartbeatsTable.update() # pylint: disable=no-value-for-parameter\n .where(DaemonHeartbeatsTable.c.daemon_type == daemon_heartbeat.daemon_type)\n .values( # pylint: disable=no-value-for-parameter\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_dagster_namedtuple(daemon_heartbeat),\n )\n )\n\n def get_daemon_heartbeats(self) -> Dict[str, DaemonHeartbeat]:\n\n with self.connect() as conn:\n rows = conn.execute(db.select(DaemonHeartbeatsTable.columns))\n heartbeats = []\n for row in rows:\n heartbeats.append(deserialize_as(row.body, DaemonHeartbeat))\n return {heartbeat.daemon_type: heartbeat for heartbeat in heartbeats}\n\n def wipe(self):\n """Clears the run storage."""\n with self.connect() as conn:\n # https://stackoverflow.com/a/54386260/324449\n conn.execute(RunsTable.delete()) # pylint: disable=no-value-for-parameter\n conn.execute(RunTagsTable.delete()) # pylint: disable=no-value-for-parameter\n conn.execute(SnapshotsTable.delete()) # pylint: disable=no-value-for-parameter\n conn.execute(DaemonHeartbeatsTable.delete()) # pylint: disable=no-value-for-parameter\n\n def wipe_daemon_heartbeats(self):\n with self.connect() as conn:\n # https://stackoverflow.com/a/54386260/324449\n conn.execute(DaemonHeartbeatsTable.delete()) # pylint: disable=no-value-for-parameter\n\n def get_backfills(\n self,\n status: Optional[BulkActionStatus] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> List[PartitionBackfill]:\n check.opt_inst_param(status, "status", BulkActionStatus)\n query = db.select([BulkActionsTable.c.body])\n if status:\n query = query.where(BulkActionsTable.c.status == status.value)\n if cursor:\n cursor_query = db.select([BulkActionsTable.c.id]).where(\n BulkActionsTable.c.key == cursor\n )\n query = query.where(BulkActionsTable.c.id < cursor_query)\n if limit:\n query = query.limit(limit)\n query = 
query.order_by(BulkActionsTable.c.id.desc())\n rows = self.fetchall(query)\n return [deserialize_as(row[0], PartitionBackfill) for row in rows]\n\n def get_backfill(self, backfill_id: str) -> Optional[PartitionBackfill]:\n check.str_param(backfill_id, "backfill_id")\n query = db.select([BulkActionsTable.c.body]).where(BulkActionsTable.c.key == backfill_id)\n row = self.fetchone(query)\n return deserialize_as(row[0], PartitionBackfill) if row else None\n\n def add_backfill(self, partition_backfill: PartitionBackfill):\n check.inst_param(partition_backfill, "partition_backfill", PartitionBackfill)\n with self.connect() as conn:\n conn.execute(\n BulkActionsTable.insert().values( # pylint: disable=no-value-for-parameter\n key=partition_backfill.backfill_id,\n status=partition_backfill.status.value,\n timestamp=utc_datetime_from_timestamp(partition_backfill.backfill_timestamp),\n body=serialize_dagster_namedtuple(partition_backfill),\n )\n )\n\n def update_backfill(self, partition_backfill: PartitionBackfill):\n check.inst_param(partition_backfill, "partition_backfill", PartitionBackfill)\n backfill_id = partition_backfill.backfill_id\n if not self.get_backfill(backfill_id):\n raise DagsterInvariantViolationError(\n f"Backfill {backfill_id} is not present in storage"\n )\n with self.connect() as conn:\n conn.execute(\n BulkActionsTable.update() # pylint: disable=no-value-for-parameter\n .where(BulkActionsTable.c.key == backfill_id)\n .values(\n status=partition_backfill.status.value,\n body=serialize_dagster_namedtuple(partition_backfill),\n )\n )
\n\n\nGET_PIPELINE_SNAPSHOT_QUERY_ID = "get-pipeline-snapshot"\n\n\ndef defensively_unpack_pipeline_snapshot_query(logger, row):\n # no checking here because sqlalchemy returns a special\n # row proxy and don't want to instance check on an internal\n # implementation detail\n\n def _warn(msg):\n logger.warning("get-pipeline-snapshot: {msg}".format(msg=msg))\n\n if not isinstance(row[0], bytes):\n _warn("First entry in row is not a binary type.")\n return None\n\n try:\n uncompressed_bytes = zlib.decompress(row[0])\n except zlib.error:\n _warn("Could not decompress bytes stored in snapshot table.")\n return None\n\n try:\n decoded_str = uncompressed_bytes.decode("utf-8")\n except UnicodeDecodeError:\n _warn("Could not unicode decode decompressed bytes stored in snapshot table.")\n return None\n\n try:\n return deserialize_json_to_dagster_namedtuple(decoded_str)\n except JSONDecodeError:\n _warn("Could not parse json in snapshot table.")\n return None\n
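A minimal sketch of what a concrete SqlRunStorage subclass has to supply beyond the SQL machinery above: connect() as a context manager yielding a sqlalchemy Connection, and upgrade() to bring the schema up to date. The InMemorySqlRunStorage name is hypothetical, and RunStorage may declare further members that are omitted here.

from contextlib import contextmanager

import sqlalchemy as db

from dagster.core.storage.runs.schema import RunStorageSqlMetadata
from dagster.core.storage.runs.sql_run_storage import SqlRunStorage


class InMemorySqlRunStorage(SqlRunStorage):  # hypothetical, for illustration only
    """Keeps run data in an in-memory SQLite database for the lifetime of the object."""

    def __init__(self):
        # SQLAlchemy reuses the same in-memory SQLite connection per thread, so the
        # tables created here stay visible to later connect() calls on that thread.
        self._engine = db.create_engine("sqlite://")
        RunStorageSqlMetadata.create_all(self._engine)
        super().__init__()

    @contextmanager
    def connect(self):
        conn = self._engine.connect()
        try:
            yield conn
        finally:
            conn.close()

    def upgrade(self):
        # a freshly created schema has nothing to migrate
        pass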
", "current_page_name": "_modules/dagster/core/storage/runs/sql_run_storage", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.runs.sql_run_storage"}, "sqlite": {"sqlite_run_storage": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.runs.sqlite.sqlite_run_storage

\nimport os\nfrom contextlib import contextmanager\nfrom urllib.parse import urljoin, urlparse\n\nimport sqlalchemy as db\nfrom sqlalchemy.pool import NullPool\n\nfrom dagster import StringSource, check\nfrom dagster.core.storage.sql import (\n    check_alembic_revision,\n    create_engine,\n    get_alembic_config,\n    handle_schema_errors,\n    run_alembic_downgrade,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster.core.storage.sqlite import create_db_conn_string, get_sqlite_version\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster.utils import mkdir_p\n\nfrom ..schema import InstanceInfo, RunStorageSqlMetadata, RunTagsTable, RunsTable\nfrom ..sql_run_storage import SqlRunStorage\n\nMINIMUM_SQLITE_BUCKET_VERSION = [3, 25, 0]\n\n\n
[docs]class SqliteRunStorage(SqlRunStorage, ConfigurableClass):\n """SQLite-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagit`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n This is the default run storage when none is specified in the ``dagster.yaml``.\n\n To explicitly specify SQLite for run storage, you can add a block such as the following to your\n ``dagster.yaml``:\n\n .. code-block:: YAML\n\n run_storage:\n module: dagster.core.storage.runs\n class: SqliteRunStorage\n config:\n base_dir: /path/to/dir\n\n The ``base_dir`` param tells the run storage where on disk to store the database.\n """\n\n def __init__(self, conn_string, inst_data=None):\n check.str_param(conn_string, "conn_string")\n self._conn_string = conn_string\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n super().__init__()\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {"base_dir": StringSource}\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return SqliteRunStorage.from_local(inst_data=inst_data, **config_value)\n\n @classmethod\n def from_local(cls, base_dir, inst_data=None):\n check.str_param(base_dir, "base_dir")\n mkdir_p(base_dir)\n conn_string = create_db_conn_string(base_dir, "runs")\n engine = create_engine(conn_string, poolclass=NullPool)\n alembic_config = get_alembic_config(__file__)\n\n should_mark_indexes = False\n with engine.connect() as connection:\n db_revision, head_revision = check_alembic_revision(alembic_config, connection)\n if not (db_revision and head_revision):\n RunStorageSqlMetadata.create_all(engine)\n engine.execute("PRAGMA journal_mode=WAL;")\n stamp_alembic_rev(alembic_config, connection)\n should_mark_indexes = True\n\n table_names = db.inspect(engine).get_table_names()\n if "instance_info" not in table_names:\n InstanceInfo.create(engine)\n\n run_storage = cls(conn_string, inst_data)\n\n if should_mark_indexes:\n run_storage.migrate()\n run_storage.optimize()\n\n return run_storage\n\n @contextmanager\n def connect(self):\n engine = create_engine(self._conn_string, poolclass=NullPool)\n conn = engine.connect()\n try:\n with handle_schema_errors(conn, get_alembic_config(__file__)):\n yield conn\n finally:\n conn.close()\n\n def _alembic_upgrade(self, rev="head"):\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_upgrade(alembic_config, conn, rev=rev)\n\n def _alembic_downgrade(self, rev="head"):\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_downgrade(alembic_config, conn, rev=rev)\n\n @property\n def supports_bucket_queries(self):\n parts = get_sqlite_version().split(".")\n try:\n for i in range(min(len(parts), len(MINIMUM_SQLITE_BUCKET_VERSION))):\n curr = int(parts[i])\n if curr < MINIMUM_SQLITE_BUCKET_VERSION[i]:\n return False\n if curr > MINIMUM_SQLITE_BUCKET_VERSION[i]:\n return True\n except ValueError:\n return False\n\n return False\n\n def upgrade(self):\n self._check_for_version_066_migration_and_perform()\n self._alembic_upgrade()\n\n # In version 0.6.6, we changed the layout of the of the sqllite dbs on disk\n # to move from the root of DAGSTER_HOME/runs.db to DAGSTER_HOME/history/runs.bd\n # This function checks for that condition and does the 
move\n def _check_for_version_066_migration_and_perform(self):\n old_conn_string = "sqlite://" + urljoin(urlparse(self._conn_string).path, "../runs.db")\n path_to_old_db = urlparse(old_conn_string).path\n # sqlite URLs look like `sqlite:///foo/bar/baz on Unix/Mac` but on Windows they look like\n # `sqlite:///D:/foo/bar/baz` (or `sqlite:///D:\\foo\\bar\\baz`)\n if os.name == "nt":\n path_to_old_db = path_to_old_db.lstrip("/")\n if os.path.exists(path_to_old_db):\n old_storage = SqliteRunStorage(old_conn_string)\n old_runs = old_storage.get_runs()\n for run in old_runs:\n self.add_run(run)\n os.unlink(path_to_old_db)\n\n def delete_run(self, run_id):\n """Override the default sql delete run implementation until we can get full\n support on cascading deletes"""\n check.str_param(run_id, "run_id")\n remove_tags = db.delete(RunTagsTable).where(RunTagsTable.c.run_id == run_id)\n remove_run = db.delete(RunsTable).where(RunsTable.c.run_id == run_id)\n with self.connect() as conn:\n conn.execute(remove_tags)\n conn.execute(remove_run)
\n
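For tests or one-off scripts, from_local above can be used directly instead of going through dagster.yaml. A hedged usage sketch follows; the base_dir path is hypothetical and the import location of RunsFilter / PipelineRunStatus is assumed for this Dagster version.

from dagster.core.storage.pipeline_run import PipelineRunStatus, RunsFilter
from dagster.core.storage.runs.sqlite.sqlite_run_storage import SqliteRunStorage

# creates <base_dir>/runs.db (and runs any pending data migrations) on first use
storage = SqliteRunStorage.from_local("/tmp/dagster_home/history")  # hypothetical path

# page through failed runs, newest first, using run_id cursors
cursor = None
while True:
    page = storage.get_runs(
        filters=RunsFilter(statuses=[PipelineRunStatus.FAILURE]),
        cursor=cursor,
        limit=25,
    )
    if not page:
        break
    for run in page:
        print(run.run_id, run.pipeline_name)
    cursor = page[-1].run_id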
", "current_page_name": "_modules/dagster/core/storage/runs/sqlite/sqlite_run_storage", "customsidebar": null, "parents": [{"link": "../../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.runs.sqlite.sqlite_run_storage"}}}, "schedules": {"base": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.schedules.base

\nimport abc\nfrom typing import Callable, Iterable, List, Mapping, Optional, Sequence\n\nfrom dagster.core.definitions.run_request import InstigatorType\nfrom dagster.core.instance import MayHaveInstanceWeakref\nfrom dagster.core.scheduler.instigation import InstigatorState, InstigatorTick, TickData, TickStatus\n\n\n
[docs]class ScheduleStorage(abc.ABC, MayHaveInstanceWeakref):\n """Abstract class for managing persistance of scheduler artifacts"""\n\n @abc.abstractmethod\n def wipe(self):\n """Delete all schedules from storage"""\n\n @abc.abstractmethod\n def all_instigator_state(\n self,\n repository_origin_id: Optional[str] = None,\n repository_selector_id: Optional[str] = None,\n instigator_type: Optional[InstigatorType] = None,\n ) -> Iterable[InstigatorState]:\n """Return all InstigationStates present in storage\n\n Args:\n repository_origin_id (Optional[str]): The ExternalRepository target id to scope results to\n repository_selector_id (Optional[str]): The repository selector id to scope results to\n instigator_type (Optional[InstigatorType]): The InstigatorType to scope results to\n """\n\n @abc.abstractmethod\n def get_instigator_state(self, origin_id: str, selector_id: str) -> InstigatorState:\n """Return the instigator state for the given id\n\n Args:\n origin_id (str): The unique instigator identifier\n selector_id (str): The logical instigator identifier\n """\n\n @abc.abstractmethod\n def add_instigator_state(self, state: InstigatorState):\n """Add an instigator state to storage.\n\n Args:\n state (InstigatorState): The state to add\n """\n\n @abc.abstractmethod\n def update_instigator_state(self, state: InstigatorState):\n """Update an instigator state in storage.\n\n Args:\n state (InstigatorState): The state to update\n """\n\n @abc.abstractmethod\n def delete_instigator_state(self, origin_id: str, selector_id: str):\n """Delete a state in storage.\n\n Args:\n origin_id (str): The id of the instigator target to delete\n selector_id (str): The logical instigator identifier\n """\n\n @property\n def supports_batch_queries(self):\n return False\n\n def get_batch_ticks(\n self,\n selector_ids: Sequence[str],\n limit: Optional[int] = None,\n statuses: Optional[Sequence[TickStatus]] = None,\n ) -> Mapping[str, Iterable[InstigatorTick]]:\n raise NotImplementedError()\n\n @abc.abstractmethod\n def get_ticks(\n self,\n origin_id: str,\n selector_id: str,\n before: Optional[float] = None,\n after: Optional[float] = None,\n limit: Optional[int] = None,\n statuses: Optional[List[TickStatus]] = None,\n ) -> Iterable[InstigatorTick]:\n """Get the ticks for a given instigator.\n\n Args:\n origin_id (str): The id of the instigator target\n selector_id (str): The logical instigator identifier\n """\n\n @abc.abstractmethod\n def create_tick(self, tick_data: TickData):\n """Add a tick to storage.\n\n Args:\n tick_data (TickData): The tick to add\n """\n\n @abc.abstractmethod\n def update_tick(self, tick: InstigatorTick):\n """Update a tick already in storage.\n\n Args:\n tick (InstigatorTick): The tick to update\n """\n\n @abc.abstractmethod\n def purge_ticks(self, origin_id: str, selector_id: str, tick_status: TickStatus, before: float):\n """Wipe ticks for an instigator for a certain status and timestamp.\n\n Args:\n origin_id (str): The id of the instigator target to delete\n selector_id (str): The logical instigator identifier\n tick_status (TickStatus): The tick status to wipe\n before (datetime): All ticks before this datetime will get purged\n """\n\n @abc.abstractmethod\n def upgrade(self):\n """Perform any needed migrations"""\n\n def migrate(self, print_fn: Optional[Callable] = None, force_rebuild_all: bool = False):\n """Call this method to run any required data migrations"""\n\n def optimize(self, print_fn: Optional[Callable] = None, force_rebuild_all: bool = False):\n """Call this method to 
run any optional data migrations for optimized reads"""\n\n def optimize_for_dagit(self, statement_timeout: int):\n """Allows for optimizing database connection / use in the context of a long lived dagit process"""
\n
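Note that the purge_ticks signature above takes before as a Unix timestamp float (the docstring calls it a datetime); the SQL implementation converts it with utc_datetime_from_timestamp. A small hedged sketch follows; the helper name is illustrative and TickStatus.SKIPPED is assumed to exist in the surrounding codebase.

import pendulum

from dagster.core.scheduler.instigation import TickStatus
from dagster.core.storage.schedules.base import ScheduleStorage


def purge_old_skipped_ticks(storage: ScheduleStorage, origin_id: str, selector_id: str) -> None:
    # drop skipped ticks older than 30 days; `before` is a Unix timestamp float
    cutoff = pendulum.now("UTC").subtract(days=30).timestamp()
    storage.purge_ticks(origin_id, selector_id, TickStatus.SKIPPED, before=cutoff)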
", "current_page_name": "_modules/dagster/core/storage/schedules/base", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.schedules.base"}, "sql_schedule_storage": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.schedules.sql_schedule_storage

\nfrom abc import abstractmethod\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom typing import Callable, Iterable, Mapping, Optional, Sequence, cast\n\nimport pendulum\nimport sqlalchemy as db\n\nfrom dagster import check\nfrom dagster.core.definitions.run_request import InstigatorType\nfrom dagster.core.errors import DagsterInvariantViolationError\nfrom dagster.core.scheduler.instigation import InstigatorState, InstigatorTick, TickData, TickStatus\nfrom dagster.serdes import deserialize_json_to_dagster_namedtuple, serialize_dagster_namedtuple\nfrom dagster.utils import utc_datetime_from_timestamp\n\nfrom .base import ScheduleStorage\nfrom .migration import (\n    OPTIONAL_SCHEDULE_DATA_MIGRATIONS,\n    REQUIRED_SCHEDULE_DATA_MIGRATIONS,\n    SCHEDULE_JOBS_SELECTOR_ID,\n    SCHEDULE_TICKS_SELECTOR_ID,\n)\nfrom .schema import InstigatorsTable, JobTable, JobTickTable, SecondaryIndexMigrationTable\n\n\n
[docs]class SqlScheduleStorage(ScheduleStorage):\n """Base class for SQL backed schedule storage"""\n\n @abstractmethod\n def connect(self):\n """Context manager yielding a sqlalchemy.engine.Connection."""\n\n def execute(self, query):\n with self.connect() as conn:\n result_proxy = conn.execute(query)\n res = result_proxy.fetchall()\n result_proxy.close()\n\n return res\n\n def _deserialize_rows(self, rows):\n return list(map(lambda r: deserialize_json_to_dagster_namedtuple(r[0]), rows))\n\n def all_instigator_state(\n self, repository_origin_id=None, repository_selector_id=None, instigator_type=None\n ):\n check.opt_inst_param(instigator_type, "instigator_type", InstigatorType)\n\n if self.has_instigators_table() and self.has_built_index(SCHEDULE_JOBS_SELECTOR_ID):\n query = db.select([InstigatorsTable.c.instigator_body]).select_from(InstigatorsTable)\n if repository_selector_id:\n query = query.where(\n InstigatorsTable.c.repository_selector_id == repository_selector_id\n )\n if instigator_type:\n query = query.where(InstigatorsTable.c.instigator_type == instigator_type.value)\n else:\n query = db.select([JobTable.c.job_body]).select_from(JobTable)\n if repository_origin_id:\n query = query.where(JobTable.c.repository_origin_id == repository_origin_id)\n if instigator_type:\n query = query.where(JobTable.c.job_type == instigator_type.value)\n\n rows = self.execute(query)\n return self._deserialize_rows(rows)\n\n def get_instigator_state(self, origin_id, selector_id):\n check.str_param(origin_id, "origin_id")\n check.str_param(selector_id, "selector_id")\n\n if self.has_instigators_table() and self.has_built_index(SCHEDULE_JOBS_SELECTOR_ID):\n query = (\n db.select([InstigatorsTable.c.instigator_body])\n .select_from(InstigatorsTable)\n .where(InstigatorsTable.c.selector_id == selector_id)\n )\n else:\n query = (\n db.select([JobTable.c.job_body])\n .select_from(JobTable)\n .where(JobTable.c.job_origin_id == origin_id)\n )\n\n rows = self.execute(query)\n return self._deserialize_rows(rows[:1])[0] if len(rows) else None\n\n def _has_instigator_state_by_selector(self, selector_id):\n check.str_param(selector_id, "selector_id")\n\n query = (\n db.select([JobTable.c.job_body])\n .select_from(JobTable)\n .where(JobTable.c.selector_id == selector_id)\n )\n\n rows = self.execute(query)\n return self._deserialize_rows(rows[:1])[0] if len(rows) else None\n\n def _add_or_update_instigators_table(self, conn, state):\n selector_id = state.selector_id\n try:\n conn.execute(\n InstigatorsTable.insert().values( # pylint: disable=no-value-for-parameter\n selector_id=selector_id,\n repository_selector_id=state.repository_selector_id,\n status=state.status.value,\n instigator_type=state.instigator_type.value,\n instigator_body=serialize_dagster_namedtuple(state),\n )\n )\n except db.exc.IntegrityError:\n conn.execute(\n InstigatorsTable.update()\n .where(InstigatorsTable.c.selector_id == selector_id)\n .values(\n status=state.status.value,\n instigator_type=state.instigator_type.value,\n instigator_body=serialize_dagster_namedtuple(state),\n update_timestamp=pendulum.now("UTC"),\n )\n )\n\n def add_instigator_state(self, state):\n check.inst_param(state, "state", InstigatorState)\n with self.connect() as conn:\n try:\n conn.execute(\n JobTable.insert().values( # pylint: disable=no-value-for-parameter\n job_origin_id=state.instigator_origin_id,\n repository_origin_id=state.repository_origin_id,\n status=state.status.value,\n job_type=state.instigator_type.value,\n 
job_body=serialize_dagster_namedtuple(state),\n )\n )\n except db.exc.IntegrityError as exc:\n raise DagsterInvariantViolationError(\n f"InstigatorState {state.instigator_origin_id} is already present in storage"\n ) from exc\n\n # try writing to the instigators table\n if self._has_instigators_table(conn):\n self._add_or_update_instigators_table(conn, state)\n\n return state\n\n def update_instigator_state(self, state):\n check.inst_param(state, "state", InstigatorState)\n if not self.get_instigator_state(state.instigator_origin_id, state.selector_id):\n raise DagsterInvariantViolationError(\n "InstigatorState {id} is not present in storage".format(\n id=state.instigator_origin_id\n )\n )\n\n values = {\n "status": state.status.value,\n "job_body": serialize_dagster_namedtuple(state),\n "update_timestamp": pendulum.now("UTC"),\n }\n if self.has_instigators_table():\n values["selector_id"] = state.selector_id\n\n with self.connect() as conn:\n conn.execute(\n JobTable.update() # pylint: disable=no-value-for-parameter\n .where(JobTable.c.job_origin_id == state.instigator_origin_id)\n .values(**values)\n )\n if self._has_instigators_table(conn):\n self._add_or_update_instigators_table(conn, state)\n\n def delete_instigator_state(self, origin_id, selector_id):\n check.str_param(origin_id, "origin_id")\n check.str_param(selector_id, "selector_id")\n\n if not self.get_instigator_state(origin_id, selector_id):\n raise DagsterInvariantViolationError(\n "InstigatorState {id} is not present in storage".format(id=origin_id)\n )\n\n with self.connect() as conn:\n conn.execute(\n JobTable.delete().where( # pylint: disable=no-value-for-parameter\n JobTable.c.job_origin_id == origin_id\n )\n )\n\n if self._has_instigators_table(conn):\n if not self._jobs_has_selector_state(conn, selector_id):\n conn.execute(\n InstigatorsTable.delete().where( # pylint: disable=no-value-for-parameter\n InstigatorsTable.c.selector_id == selector_id\n )\n )\n\n def _jobs_has_selector_state(self, conn, selector_id):\n query = (\n db.select([db.func.count()])\n .select_from(JobTable)\n .where(JobTable.c.selector_id == selector_id)\n )\n result = conn.execute(query)\n row = result.fetchone()\n result.close()\n return row[0] > 0\n\n def _add_filter_limit(self, query, before=None, after=None, limit=None, statuses=None):\n check.opt_float_param(before, "before")\n check.opt_float_param(after, "after")\n check.opt_int_param(limit, "limit")\n check.opt_list_param(statuses, "statuses", of_type=TickStatus)\n\n if before:\n query = query.where(JobTickTable.c.timestamp < utc_datetime_from_timestamp(before))\n if after:\n query = query.where(JobTickTable.c.timestamp > utc_datetime_from_timestamp(after))\n if limit:\n query = query.limit(limit)\n if statuses:\n query = query.where(JobTickTable.c.status.in_([status.value for status in statuses]))\n return query\n\n @property\n def supports_batch_queries(self):\n return self.has_instigators_table() and self.has_built_index(SCHEDULE_TICKS_SELECTOR_ID)\n\n def has_instigators_table(self):\n with self.connect() as conn:\n return self._has_instigators_table(conn)\n\n def _has_instigators_table(self, conn):\n table_names = db.inspect(conn).get_table_names()\n return "instigators" in table_names\n\n def get_batch_ticks(\n self,\n selector_ids: Sequence[str],\n limit: Optional[int] = None,\n statuses: Optional[Sequence[TickStatus]] = None,\n ) -> Mapping[str, Iterable[InstigatorTick]]:\n check.list_param(selector_ids, "selector_ids", of_type=str)\n check.opt_int_param(limit, "limit")\n 
check.opt_list_param(statuses, "statuses", of_type=TickStatus)\n\n bucket_rank_column = (\n db.func.rank()\n .over(\n order_by=db.desc(JobTickTable.c.timestamp),\n partition_by=JobTickTable.c.selector_id,\n )\n .label("rank")\n )\n subquery = (\n db.select(\n [\n JobTickTable.c.id,\n JobTickTable.c.selector_id,\n JobTickTable.c.tick_body,\n bucket_rank_column,\n ]\n )\n .select_from(JobTickTable)\n .where(JobTickTable.c.selector_id.in_(selector_ids))\n .alias("subquery")\n )\n if statuses:\n subquery = subquery.where(\n JobTickTable.c.status.in_([status.value for status in statuses])\n )\n\n query = (\n db.select([subquery.c.id, subquery.c.selector_id, subquery.c.tick_body])\n .order_by(subquery.c.rank.asc())\n .where(subquery.c.rank <= limit)\n )\n\n rows = self.execute(query)\n results = defaultdict(list)\n for row in rows:\n tick_id = row[0]\n selector_id = row[1]\n tick_data = cast(TickData, deserialize_json_to_dagster_namedtuple(row[2]))\n results[selector_id].append(InstigatorTick(tick_id, tick_data))\n return results\n\n def get_ticks(self, origin_id, selector_id, before=None, after=None, limit=None, statuses=None):\n check.str_param(origin_id, "origin_id")\n check.opt_float_param(before, "before")\n check.opt_float_param(after, "after")\n check.opt_int_param(limit, "limit")\n check.opt_list_param(statuses, "statuses", of_type=TickStatus)\n\n base_query = (\n db.select([JobTickTable.c.id, JobTickTable.c.tick_body])\n .select_from(JobTickTable)\n .order_by(JobTickTable.c.timestamp.desc())\n )\n if self.has_instigators_table():\n query = base_query.where(\n db.or_(\n JobTickTable.c.selector_id == selector_id,\n db.and_(\n JobTickTable.c.selector_id == None,\n JobTickTable.c.job_origin_id == origin_id,\n ),\n )\n )\n else:\n query = base_query.where(JobTickTable.c.job_origin_id == origin_id)\n\n query = self._add_filter_limit(\n query, before=before, after=after, limit=limit, statuses=statuses\n )\n\n rows = self.execute(query)\n return list(\n map(lambda r: InstigatorTick(r[0], deserialize_json_to_dagster_namedtuple(r[1])), rows)\n )\n\n def create_tick(self, tick_data):\n check.inst_param(tick_data, "tick_data", TickData)\n\n values = {\n "job_origin_id": tick_data.instigator_origin_id,\n "status": tick_data.status.value,\n "type": tick_data.instigator_type.value,\n "timestamp": utc_datetime_from_timestamp(tick_data.timestamp),\n "tick_body": serialize_dagster_namedtuple(tick_data),\n }\n if self.has_instigators_table() and tick_data.selector_id:\n values["selector_id"] = tick_data.selector_id\n\n with self.connect() as conn:\n try:\n tick_insert = JobTickTable.insert().values(\n **values\n ) # pylint: disable=no-value-for-parameter\n result = conn.execute(tick_insert)\n tick_id = result.inserted_primary_key[0]\n return InstigatorTick(tick_id, tick_data)\n except db.exc.IntegrityError as exc:\n raise DagsterInvariantViolationError(\n f"Unable to insert InstigatorTick for job {tick_data.instigator_name} in storage"\n ) from exc\n\n def update_tick(self, tick):\n check.inst_param(tick, "tick", InstigatorTick)\n\n values = {\n "status": tick.status.value,\n "type": tick.instigator_type.value,\n "timestamp": utc_datetime_from_timestamp(tick.timestamp),\n "tick_body": serialize_dagster_namedtuple(tick.tick_data),\n }\n if self.has_instigators_table() and tick.selector_id:\n values["selector_id"] = tick.selector_id\n\n with self.connect() as conn:\n conn.execute(\n JobTickTable.update() # pylint: disable=no-value-for-parameter\n .where(JobTickTable.c.id == tick.tick_id)\n 
.values(**values)\n )\n\n return tick\n\n def purge_ticks(self, origin_id, selector_id, tick_status, before):\n check.str_param(origin_id, "origin_id")\n check.inst_param(tick_status, "tick_status", TickStatus)\n check.float_param(before, "before")\n\n utc_before = utc_datetime_from_timestamp(before)\n\n base_query = (\n JobTickTable.delete() # pylint: disable=no-value-for-parameter\n .where(JobTickTable.c.status == tick_status.value)\n .where(JobTickTable.c.timestamp < utc_before)\n )\n\n if self.has_instigators_table():\n query = base_query.where(\n db.or_(\n JobTickTable.c.selector_id == selector_id,\n db.and_(\n JobTickTable.c.selector_id == None,\n JobTickTable.c.job_origin_id == origin_id,\n ),\n )\n )\n else:\n query = base_query.where(JobTickTable.c.job_origin_id == origin_id)\n\n with self.connect() as conn:\n conn.execute(query)\n\n def wipe(self):\n """Clears the schedule storage."""\n with self.connect() as conn:\n # https://stackoverflow.com/a/54386260/324449\n conn.execute(JobTable.delete()) # pylint: disable=no-value-for-parameter\n conn.execute(JobTickTable.delete()) # pylint: disable=no-value-for-parameter\n if self._has_instigators_table(conn):\n conn.execute(InstigatorsTable.delete())\n\n # MIGRATIONS\n\n def has_secondary_index_table(self):\n with self.connect() as conn:\n return "secondary_indexes" in db.inspect(conn).get_table_names()\n\n def has_built_index(self, migration_name: str) -> bool:\n if not self.has_secondary_index_table():\n return False\n\n query = (\n db.select([1])\n .where(SecondaryIndexMigrationTable.c.name == migration_name)\n .where(SecondaryIndexMigrationTable.c.migration_completed != None)\n .limit(1)\n )\n with self.connect() as conn:\n results = conn.execute(query).fetchall()\n\n return len(results) > 0\n\n def mark_index_built(self, migration_name: str):\n query = (\n SecondaryIndexMigrationTable.insert().values( # pylint: disable=no-value-for-parameter\n name=migration_name,\n migration_completed=datetime.now(),\n )\n )\n with self.connect() as conn:\n try:\n conn.execute(query)\n except db.exc.IntegrityError:\n conn.execute(\n SecondaryIndexMigrationTable.update() # pylint: disable=no-value-for-parameter\n .where(SecondaryIndexMigrationTable.c.name == migration_name)\n .values(migration_completed=datetime.now())\n )\n\n def _execute_data_migrations(\n self, migrations, print_fn: Optional[Callable] = None, force_rebuild_all: bool = False\n ):\n for migration_name, migration_fn in migrations.items():\n if self.has_built_index(migration_name):\n if not force_rebuild_all:\n if print_fn:\n print_fn("Skipping already applied migration: {}".format(migration_name))\n continue\n if print_fn:\n print_fn(f"Starting data migration: {migration_name}")\n migration_fn()(self, print_fn)\n self.mark_index_built(migration_name)\n if print_fn:\n print_fn(f"Finished data migration: {migration_name}")\n\n def migrate(self, print_fn: Optional[Callable] = None, force_rebuild_all: bool = False):\n self._execute_data_migrations(\n REQUIRED_SCHEDULE_DATA_MIGRATIONS, print_fn, force_rebuild_all\n )\n\n def optimize(self, print_fn: Optional[Callable] = None, force_rebuild_all: bool = False):\n self._execute_data_migrations(\n OPTIONAL_SCHEDULE_DATA_MIGRATIONS, print_fn, force_rebuild_all\n )
\n
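When supports_batch_queries is true, get_batch_ticks above ranks ticks per selector_id so that the most recent ticks for many instigators come back in a single query. A hedged usage sketch (the helper name and selector ids are illustrative):

from typing import Dict, List

from dagster.core.scheduler.instigation import InstigatorTick
from dagster.core.storage.schedules.sql_schedule_storage import SqlScheduleStorage


def latest_tick_per_instigator(
    storage: SqlScheduleStorage, selector_ids: List[str]
) -> Dict[str, InstigatorTick]:
    # limit=1 keeps only the highest-ranked (newest) tick for each selector
    if not storage.supports_batch_queries:
        return {}
    batch = storage.get_batch_ticks(selector_ids, limit=1)
    return {selector_id: ticks[0] for selector_id, ticks in batch.items() if ticks}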
", "current_page_name": "_modules/dagster/core/storage/schedules/sql_schedule_storage", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.schedules.sql_schedule_storage"}, "sqlite": {"sqlite_schedule_storage": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.schedules.sqlite.sqlite_schedule_storage

\nfrom contextlib import contextmanager\n\nfrom sqlalchemy.pool import NullPool\n\nfrom dagster import StringSource, check\nfrom dagster.core.storage.sql import (\n    check_alembic_revision,\n    create_engine,\n    get_alembic_config,\n    handle_schema_errors,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster.core.storage.sqlite import create_db_conn_string, get_sqlite_version\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster.utils import mkdir_p\n\nfrom ..schema import ScheduleStorageSqlMetadata\nfrom ..sql_schedule_storage import SqlScheduleStorage\n\nMINIMUM_SQLITE_BATCH_VERSION = "3.25.0"\n\n\n
[docs]class SqliteScheduleStorage(SqlScheduleStorage, ConfigurableClass):\n """Local SQLite backed schedule storage"""\n\n def __init__(self, conn_string, inst_data=None):\n check.str_param(conn_string, "conn_string")\n self._conn_string = conn_string\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n super().__init__()\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {"base_dir": StringSource}\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return SqliteScheduleStorage.from_local(inst_data=inst_data, **config_value)\n\n @classmethod\n def from_local(cls, base_dir, inst_data=None):\n check.str_param(base_dir, "base_dir")\n mkdir_p(base_dir)\n conn_string = create_db_conn_string(base_dir, "schedules")\n engine = create_engine(conn_string, poolclass=NullPool)\n alembic_config = get_alembic_config(__file__)\n\n should_migrate_data = False\n with engine.connect() as connection:\n db_revision, head_revision = check_alembic_revision(alembic_config, connection)\n if not (db_revision and head_revision):\n ScheduleStorageSqlMetadata.create_all(engine)\n engine.execute("PRAGMA journal_mode=WAL;")\n stamp_alembic_rev(alembic_config, connection)\n should_migrate_data = True\n\n schedule_storage = cls(conn_string, inst_data)\n if should_migrate_data:\n schedule_storage.migrate()\n schedule_storage.optimize()\n\n return schedule_storage\n\n @contextmanager\n def connect(self):\n engine = create_engine(self._conn_string, poolclass=NullPool)\n conn = engine.connect()\n try:\n with handle_schema_errors(\n conn,\n get_alembic_config(__file__),\n ):\n yield conn\n finally:\n conn.close()\n\n @property\n def supports_batch_queries(self):\n return (\n get_sqlite_version() > MINIMUM_SQLITE_BATCH_VERSION and super().supports_batch_queries\n )\n\n def upgrade(self):\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_upgrade(alembic_config, conn)
\n
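Like the run storage above, this class is normally built by internal machinery from the schedule_storage block of dagster.yaml (via from_config_value); for local experiments it can be constructed directly. The base_dir path below is hypothetical.

from dagster.core.storage.schedules.sqlite.sqlite_schedule_storage import SqliteScheduleStorage

# creates <base_dir>/schedules.db and stamps the alembic revision if the database is new
schedule_storage = SqliteScheduleStorage.from_local("/tmp/dagster_home/schedules")  # hypothetical path

# batch tick queries need SQLite newer than 3.25.0 plus the instigator data migrations
print(schedule_storage.supports_batch_queries)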
", "current_page_name": "_modules/dagster/core/storage/schedules/sqlite/sqlite_schedule_storage", "customsidebar": null, "parents": [{"link": "../../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.schedules.sqlite.sqlite_schedule_storage"}}}}, "types": {"config_schema": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.types.config_schema

\nimport hashlib\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Callable, Generator, Optional\n\nfrom dagster import check\nfrom dagster.config.config_type import ConfigType\nfrom dagster.core.decorator_utils import get_function_params, validate_expected_params\nfrom dagster.core.definitions.events import AssetMaterialization\nfrom dagster.core.errors import DagsterInvalidDefinitionError\nfrom dagster.utils import ensure_gen\nfrom dagster.utils.backcompat import experimental_arg_warning\n\nif TYPE_CHECKING:\n    from dagster.core.execution.context.system import StepExecutionContext\n\n\n
[docs]class DagsterTypeLoader(ABC):\n """\n Dagster type loaders are used to load unconnected inputs of the dagster type they are attached\n to.\n\n The recommended way to define a type loader is with the\n :py:func:`@dagster_type_loader <dagster_type_loader>` decorator.\n """\n\n @property\n @abstractmethod\n def schema_type(self) -> ConfigType:\n pass\n\n @property\n def loader_version(self) -> Optional[str]:\n return None\n\n def compute_loaded_input_version(self, _config_value: object) -> Optional[str]:\n return None\n\n def construct_from_config_value(\n self, _context: "StepExecutionContext", config_value: object\n ) -> object:\n """\n How to create a runtime value from config data.\n """\n return config_value\n\n def required_resource_keys(self) -> AbstractSet[str]:\n return frozenset()
\n\n\n
[docs]class DagsterTypeMaterializer(ABC):\n """\n Dagster type materializers are used to materialize outputs of the dagster type they are attached\n to.\n\n The recommended way to define a type materializer is with the\n :py:func:`@dagster_type_materializer <dagster_type_materializer>` decorator.\n """\n\n @property\n @abstractmethod\n def schema_type(self) -> ConfigType:\n pass\n\n @abstractmethod\n def materialize_runtime_values(\n self, _context: "StepExecutionContext", _config_value: object, _runtime_value: object\n ) -> object:\n """\n How to materialize a runtime value given configuration.\n """\n\n def required_resource_keys(self) -> AbstractSet[str]:\n return frozenset()
\n\n\nclass DagsterTypeLoaderFromDecorator(DagsterTypeLoader):\n def __init__(\n self,\n config_type,\n func,\n required_resource_keys,\n loader_version=None,\n external_version_fn=None,\n ):\n self._config_type = check.inst_param(config_type, "config_type", ConfigType)\n self._func = check.callable_param(func, "func")\n self._required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys", of_type=str\n )\n self._loader_version = check.opt_str_param(loader_version, "loader_version")\n if self._loader_version:\n experimental_arg_warning("loader_version", "DagsterTypeLoaderFromDecorator.__init__")\n self._external_version_fn = check.opt_callable_param(\n external_version_fn, "external_version_fn"\n )\n if self._external_version_fn:\n experimental_arg_warning(\n "external_version_fn", "DagsterTypeLoaderFromDecorator.__init__"\n )\n\n @property\n def schema_type(self) -> ConfigType:\n return self._config_type\n\n @property\n def loader_version(self) -> Optional[str]:\n return self._loader_version\n\n def compute_loaded_input_version(self, config_value: object) -> Optional[str]:\n """Compute the type-loaded input from a given config_value.\n\n Args:\n config_value (object): Config value to be ingested by the external version\n loading function.\n Returns:\n Optional[str]: Hash of concatenated loader version and external input version if both\n are provided, else None.\n """\n version = ""\n if self.loader_version:\n version += str(self.loader_version)\n if self._external_version_fn:\n ext_version = self._external_version_fn(config_value)\n version += str(ext_version)\n\n if version == "":\n return None # Sentinel value for no version provided.\n else:\n return hashlib.sha1(version.encode("utf-8")).hexdigest()\n\n def construct_from_config_value(self, context: "StepExecutionContext", config_value: object):\n return self._func(context, config_value)\n\n def required_resource_keys(self):\n return frozenset(self._required_resource_keys)\n\n\ndef _create_type_loader_for_decorator(\n config_type: ConfigType,\n func,\n required_resource_keys: AbstractSet[str],\n loader_version=None,\n external_version_fn=None,\n):\n return DagsterTypeLoaderFromDecorator(\n config_type, func, required_resource_keys, loader_version, external_version_fn\n )\n\n\n
[docs]def dagster_type_loader(\n config_schema: object,\n required_resource_keys=None,\n loader_version=None,\n external_version_fn=None,\n):\n """Create an dagster type loader that maps config data to a runtime value.\n\n The decorated function should take the execution context and parsed config value and return the\n appropriate runtime value.\n\n Args:\n config_schema (ConfigSchema): The schema for the config that's passed to the decorated\n function.\n loader_version (str): (Experimental) The version of the decorated compute function. Two\n loading functions should have the same version if and only if they deterministically\n produce the same outputs when provided the same inputs.\n external_version_fn (Callable): (Experimental) A function that takes in the same parameters as the loader\n function (config_value) and returns a representation of the version of the external\n asset (str). Two external assets with identical versions are treated as identical to one\n another.\n\n Examples:\n\n .. code-block:: python\n\n @dagster_type_loader(Permissive())\n def load_dict(_context, value):\n return value\n """\n from dagster.config.field import resolve_to_config_type\n\n config_type = resolve_to_config_type(config_schema)\n EXPECTED_POSITIONALS = ["context", "*"]\n\n def wrapper(func):\n params = get_function_params(func)\n missing_positional = validate_expected_params(params, EXPECTED_POSITIONALS)\n if missing_positional:\n raise DagsterInvalidDefinitionError(\n "@dagster_type_loader '{solid_name}' decorated function does not have required positional "\n "parameter '{missing_param}'. @dagster_type_loader decorated functions should only have keyword arguments "\n "that match input names and a first positional parameter named 'context'.".format(\n solid_name=func.__name__, missing_param=missing_positional\n )\n )\n\n return _create_type_loader_for_decorator(\n config_type, func, required_resource_keys, loader_version, external_version_fn\n )\n\n return wrapper
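As a follow-on to the docstring example above, here is a minimal sketch of attaching a decorated loader to a :py:class:`DagsterType`; ``MyDictType`` is a hypothetical name and the loader mirrors ``load_dict`` from the example.

.. code-block:: python

    from dagster import DagsterType, Permissive, dagster_type_loader

    @dagster_type_loader(Permissive())
    def load_dict(_context, value):
        # The parsed config value is passed through unchanged.
        return value

    # Attaching the loader lets unconnected inputs of this type be supplied via config.
    MyDictType = DagsterType(
        name="MyDictType",
        type_check_fn=lambda _context, value: isinstance(value, dict),
        loader=load_dict,
    )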
\n\n\nclass DagsterTypeMaterializerForDecorator(DagsterTypeMaterializer):\n def __init__(self, config_type, func, required_resource_keys):\n self._config_type = check.inst_param(config_type, "config_type", ConfigType)\n self._func = check.callable_param(func, "func")\n self._required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys", of_type=str\n )\n\n @property\n def schema_type(self) -> ConfigType:\n return self._config_type\n\n def materialize_runtime_values(\n self, context: "StepExecutionContext", config_value: object, runtime_value: object\n ) -> Generator[AssetMaterialization, Any, Any]:\n return ensure_gen(self._func(context, config_value, runtime_value))\n\n def required_resource_keys(self) -> AbstractSet[str]:\n return frozenset(self._required_resource_keys)\n\n\ndef _create_output_materializer_for_decorator(\n config_type: ConfigType,\n func: Callable[["StepExecutionContext", object, object], AssetMaterialization],\n required_resource_keys: Optional[AbstractSet[str]],\n) -> DagsterTypeMaterializerForDecorator:\n return DagsterTypeMaterializerForDecorator(config_type, func, required_resource_keys)\n\n\n
[docs]def dagster_type_materializer(\n config_schema: object, required_resource_keys: Optional[AbstractSet[str]] = None\n) -> Callable[\n [Callable[["StepExecutionContext", object, object], AssetMaterialization]],\n DagsterTypeMaterializerForDecorator,\n]:\n """Create an output materialization hydration config that configurably materializes a runtime\n value.\n\n The decorated function should take the execution context, the parsed config value, and the\n runtime value. It should materialize the runtime value, and should\n return an appropriate :py:class:`AssetMaterialization`.\n\n Args:\n config_schema (object): The type of the config data expected by the decorated function.\n\n Examples:\n\n .. code-block:: python\n\n # Takes a list of dicts such as might be read in using csv.DictReader, as well as a config\n value, and writes\n @dagster_type_materializer(str)\n def materialize_df(_context, path, value):\n with open(path, 'w') as fd:\n writer = csv.DictWriter(fd, fieldnames=value[0].keys())\n writer.writeheader()\n writer.writerows(rowdicts=value)\n\n return AssetMaterialization.file(path)\n\n """\n from dagster.config.field import resolve_to_config_type\n\n config_type = resolve_to_config_type(config_schema)\n return lambda func: _create_output_materializer_for_decorator(\n config_type, func, required_resource_keys # type: ignore\n )
\n
", "current_page_name": "_modules/dagster/core/types/config_schema", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.types.config_schema"}, "dagster_type": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.types.dagster_type

\nimport typing as t\nfrom abc import abstractmethod\nfrom enum import Enum as PythonEnum\nfrom functools import partial\nfrom typing import cast\n\nfrom dagster import check\nfrom dagster.builtins import BuiltinEnum\nfrom dagster.config.config_type import Array, ConfigType\nfrom dagster.config.config_type import Noneable as ConfigNoneable\nfrom dagster.core.definitions.events import TypeCheck\nfrom dagster.core.definitions.metadata import MetadataEntry, RawMetadataValue, normalize_metadata\nfrom dagster.core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\nfrom dagster.serdes import whitelist_for_serdes\n\nfrom .builtin_config_schemas import BuiltinSchemas\nfrom .config_schema import DagsterTypeLoader, DagsterTypeMaterializer\n\nif t.TYPE_CHECKING:\n    from dagster.core.execution.context.system import (  # pylint: disable=unused-import\n        StepExecutionContext,\n        TypeCheckContext,\n    )\n\nTypeCheckFn = t.Callable[["TypeCheckContext", object], t.Union[TypeCheck, bool]]\n\n\n@whitelist_for_serdes\nclass DagsterTypeKind(PythonEnum):\n    ANY = "ANY"\n    SCALAR = "SCALAR"\n    LIST = "LIST"\n    NOTHING = "NOTHING"\n    NULLABLE = "NULLABLE"\n    REGULAR = "REGULAR"\n\n\n
[docs]class DagsterType:\n """Define a type in dagster. These can be used in the inputs and outputs of ops.\n\n Args:\n type_check_fn (Callable[[TypeCheckContext, Any], [Union[bool, TypeCheck]]]):\n The function that defines the type check. It takes the value flowing\n through the input or output of the op. If it passes, return either\n ``True`` or a :py:class:`~dagster.TypeCheck` with ``success`` set to ``True``. If it fails,\n return either ``False`` or a :py:class:`~dagster.TypeCheck` with ``success`` set to ``False``.\n The first argument must be named ``context`` (or, if unused, ``_``, ``_context``, or ``context_``).\n Use ``required_resource_keys`` for access to resources.\n key (Optional[str]): The unique key to identify types programmatically.\n The key property always has a value. If you omit key to the argument\n to the init function, it instead receives the value of ``name``. If\n neither ``key`` nor ``name`` is provided, a ``CheckError`` is thrown.\n\n In the case of a generic type such as ``List`` or ``Optional``, this is\n generated programmatically based on the type parameters.\n\n For most use cases, name should be set and the key argument should\n not be specified.\n name (Optional[str]): A unique name given by a user. If ``key`` is ``None``, ``key``\n becomes this value. Name is not given in a case where the user does\n not specify a unique name for this type, such as a generic class.\n description (Optional[str]): A markdown-formatted string, displayed in tooling.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`~dagster.DagsterTypeLoader` and can map config data to a value of\n this type. Specify this argument if you will need to shim values of this type using the\n config machinery. As a rule, you should use the\n :py:func:`@dagster_type_loader <dagster.dagster_type_loader>` decorator to construct\n these arguments.\n materializer (Optional[DagsterTypeMaterializer]): An instance of a class\n that inherits from :py:class:`~dagster.DagsterTypeMaterializer` and can persist values of\n this type. As a rule, you should use the\n :py:func:`@dagster_type_materializer <dagster.dagster_type_materializer>`\n decorator to construct these arguments.\n required_resource_keys (Optional[Set[str]]): Resource keys required by the ``type_check_fn``.\n is_builtin (bool): Defaults to False. This is used by tools to display or\n filter built-in types (such as :py:class:`~dagster.String`, :py:class:`~dagster.Int`) to visually distinguish\n them from user-defined types. Meant for internal use.\n kind (DagsterTypeKind): Defaults to None. This is used to determine the kind of runtime type\n for InputDefinition and OutputDefinition type checking.\n typing_type: Defaults to None. A valid python typing type (e.g. Optional[List[int]]) for the\n value contained within the DagsterType. 
Meant for internal use.\n """\n\n def __init__(\n self,\n type_check_fn: TypeCheckFn,\n key: t.Optional[str] = None,\n name: t.Optional[str] = None,\n is_builtin: bool = False,\n description: t.Optional[str] = None,\n loader: t.Optional[DagsterTypeLoader] = None,\n materializer: t.Optional[DagsterTypeMaterializer] = None,\n required_resource_keys: t.Optional[t.Set[str]] = None,\n kind: DagsterTypeKind = DagsterTypeKind.REGULAR,\n typing_type: t.Any = None,\n metadata_entries: t.Optional[t.List[MetadataEntry]] = None,\n metadata: t.Optional[t.Dict[str, RawMetadataValue]] = None,\n ):\n check.opt_str_param(key, "key")\n check.opt_str_param(name, "name")\n\n check.invariant(not (name is None and key is None), "Must set key or name")\n if name is None:\n key = check.not_none(\n key,\n "If name is not provided, must provide key.",\n )\n self.key, self._name = key, None\n elif key is None:\n name = check.not_none(\n name,\n "If key is not provided, must provide name.",\n )\n self.key, self._name = name, name\n else:\n check.invariant(key and name)\n self.key, self._name = key, name\n\n self.description = check.opt_str_param(description, "description")\n self.loader = check.opt_inst_param(loader, "loader", DagsterTypeLoader)\n self.materializer = check.opt_inst_param(\n materializer, "materializer", DagsterTypeMaterializer\n )\n\n self.required_resource_keys = check.opt_set_param(\n required_resource_keys,\n "required_resource_keys",\n )\n\n self._type_check_fn = check.callable_param(type_check_fn, "type_check_fn")\n _validate_type_check_fn(self._type_check_fn, self._name)\n\n self.is_builtin = check.bool_param(is_builtin, "is_builtin")\n check.invariant(\n self.display_name is not None,\n "All types must have a valid display name, got None for key {}".format(key),\n )\n\n self.kind = check.inst_param(kind, "kind", DagsterTypeKind)\n\n self.typing_type = typing_type\n\n metadata_entries = check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=MetadataEntry\n )\n metadata = check.opt_dict_param(metadata, "metadata", key_type=str)\n self._metadata_entries = normalize_metadata(metadata, metadata_entries)\n\n def type_check(self, context: "TypeCheckContext", value: object) -> TypeCheck:\n retval = self._type_check_fn(context, value)\n\n if not isinstance(retval, (bool, TypeCheck)):\n raise DagsterInvariantViolationError(\n (\n "You have returned {retval} of type {retval_type} from the type "\n 'check function of type "{type_key}". Return value must be instance '\n "of TypeCheck or a bool."\n ).format(retval=repr(retval), retval_type=type(retval), type_key=self.key)\n )\n\n return TypeCheck(success=retval) if isinstance(retval, bool) else retval\n\n def __eq__(self, other):\n return isinstance(other, DagsterType) and self.key == other.key\n\n def __ne__(self, other):\n return not self.__eq__(other)\n\n @staticmethod\n def from_builtin_enum(builtin_enum) -> "DagsterType":\n check.invariant(BuiltinEnum.contains(builtin_enum), "must be member of BuiltinEnum")\n return _RUNTIME_MAP[builtin_enum]\n\n @property\n def metadata_entries(self) -> t.List[MetadataEntry]:\n return self._metadata_entries # type: ignore\n\n @property\n def display_name(self) -> str:\n """Either the name or key (if name is `None`) of the type, overridden in many subclasses"""\n return cast(str, self._name or self.key)\n\n @property\n def unique_name(self) -> t.Optional[str]:\n """The unique name of this type. 
Can be None if the type is not unique, such as container types"""\n # TODO: docstring and body inconsistent-- can this be None or not?\n check.invariant(\n self._name is not None,\n "unique_name requested but is None for type {}".format(self.display_name),\n )\n return self._name\n\n @property\n def has_unique_name(self) -> bool:\n return self._name is not None\n\n @property\n def inner_types(self) -> t.List["DagsterType"]:\n return []\n\n @property\n def loader_schema_key(self) -> t.Optional[str]:\n return self.loader.schema_type.key if self.loader else None\n\n @property\n def materializer_schema_key(self) -> t.Optional[str]:\n return self.materializer.schema_type.key if self.materializer else None\n\n @property\n def type_param_keys(self) -> t.List[str]:\n return []\n\n @property\n def is_nothing(self) -> bool:\n return self.kind == DagsterTypeKind.NOTHING\n\n @property\n def supports_fan_in(self) -> bool:\n return False\n\n def get_inner_type_for_fan_in(self) -> "DagsterType":\n check.failed(\n "DagsterType {name} does not support fan-in, should have checked supports_fan_in before calling getter.".format(\n name=self.display_name\n )\n )
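A minimal sketch of the ``type_check_fn`` contract described in the docstring above: the function may return a bare ``bool`` or a :py:class:`TypeCheck` carrying a description. ``EvenInt`` and ``even_int_check`` are hypothetical names.

.. code-block:: python

    from dagster import DagsterType, TypeCheck

    def even_int_check(_context, value):
        # Returning a TypeCheck lets a failure carry a human-readable description.
        if not isinstance(value, int):
            return TypeCheck(
                success=False,
                description=f"Expected int, got {type(value).__name__}",
            )
        return TypeCheck(success=value % 2 == 0)

    EvenInt = DagsterType(name="EvenInt", type_check_fn=even_int_check)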
\n\n\ndef _validate_type_check_fn(fn: t.Callable, name: t.Optional[str]) -> bool:\n from dagster.seven import get_args\n\n args = get_args(fn)\n\n # py2 doesn't filter out self\n if len(args) >= 1 and args[0] == "self":\n args = args[1:]\n\n if len(args) == 2:\n possible_names = {\n "_",\n "context",\n "_context",\n "context_",\n }\n if args[0] not in possible_names:\n DagsterInvalidDefinitionError(\n 'type_check function on type "{name}" must have first '\n 'argument named "context" (or _, _context, context_).'.format(\n name=name,\n )\n )\n return True\n\n raise DagsterInvalidDefinitionError(\n 'type_check_fn argument on type "{name}" must take 2 arguments, '\n "received {count}.".format(name=name, count=len(args))\n )\n\n\nclass BuiltinScalarDagsterType(DagsterType):\n def __init__(self, name: str, type_check_fn: TypeCheckFn, typing_type: t.Type, **kwargs):\n super(BuiltinScalarDagsterType, self).__init__(\n key=name,\n name=name,\n kind=DagsterTypeKind.SCALAR,\n type_check_fn=type_check_fn,\n is_builtin=True,\n typing_type=typing_type,\n **kwargs,\n )\n\n # This is passed to the constructor of subclasses as the argument `type_check_fn`-- that's why\n # it exists together with the `type_check_fn` arg.\n def type_check_fn(self, _context, value) -> TypeCheck:\n return self.type_check_scalar_value(value)\n\n @abstractmethod\n def type_check_scalar_value(self, _value) -> TypeCheck:\n raise NotImplementedError()\n\n\ndef _typemismatch_error_str(value: object, expected_type_desc: str) -> str:\n return 'Value "{value}" of python type "{python_type}" must be a {type_desc}.'.format(\n value=value, python_type=type(value).__name__, type_desc=expected_type_desc\n )\n\n\ndef _fail_if_not_of_type(\n value: object, value_type: t.Type[t.Any], value_type_desc: str\n) -> TypeCheck:\n\n if not isinstance(value, value_type):\n return TypeCheck(success=False, description=_typemismatch_error_str(value, value_type_desc))\n\n return TypeCheck(success=True)\n\n\nclass _Int(BuiltinScalarDagsterType):\n def __init__(self):\n super(_Int, self).__init__(\n name="Int",\n loader=BuiltinSchemas.INT_INPUT,\n materializer=BuiltinSchemas.INT_OUTPUT,\n type_check_fn=self.type_check_fn,\n typing_type=int,\n )\n\n def type_check_scalar_value(self, value) -> TypeCheck:\n return _fail_if_not_of_type(value, int, "int")\n\n\nclass _String(BuiltinScalarDagsterType):\n def __init__(self):\n super(_String, self).__init__(\n name="String",\n loader=BuiltinSchemas.STRING_INPUT,\n materializer=BuiltinSchemas.STRING_OUTPUT,\n type_check_fn=self.type_check_fn,\n typing_type=str,\n )\n\n def type_check_scalar_value(self, value: object) -> TypeCheck:\n return _fail_if_not_of_type(value, str, "string")\n\n\nclass _Float(BuiltinScalarDagsterType):\n def __init__(self):\n super(_Float, self).__init__(\n name="Float",\n loader=BuiltinSchemas.FLOAT_INPUT,\n materializer=BuiltinSchemas.FLOAT_OUTPUT,\n type_check_fn=self.type_check_fn,\n typing_type=float,\n )\n\n def type_check_scalar_value(self, value: object) -> TypeCheck:\n return _fail_if_not_of_type(value, float, "float")\n\n\nclass _Bool(BuiltinScalarDagsterType):\n def __init__(self):\n super(_Bool, self).__init__(\n name="Bool",\n loader=BuiltinSchemas.BOOL_INPUT,\n materializer=BuiltinSchemas.BOOL_OUTPUT,\n type_check_fn=self.type_check_fn,\n typing_type=bool,\n )\n\n def type_check_scalar_value(self, value: object) -> TypeCheck:\n return _fail_if_not_of_type(value, bool, "bool")\n\n\nclass Anyish(DagsterType):\n def __init__(\n self,\n key: t.Optional[str],\n name: 
t.Optional[str],\n loader: t.Optional[DagsterTypeLoader] = None,\n materializer: t.Optional[DagsterTypeMaterializer] = None,\n is_builtin: bool = False,\n description: t.Optional[str] = None,\n ):\n super(Anyish, self).__init__(\n key=key,\n name=name,\n kind=DagsterTypeKind.ANY,\n loader=loader,\n materializer=materializer,\n is_builtin=is_builtin,\n type_check_fn=self.type_check_method,\n description=description,\n typing_type=t.Any,\n )\n\n def type_check_method(self, _context: "TypeCheckContext", _value: object) -> TypeCheck:\n return TypeCheck(success=True)\n\n @property\n def supports_fan_in(self) -> bool:\n return True\n\n def get_inner_type_for_fan_in(self) -> DagsterType:\n # Anyish all the way down\n return self\n\n\nclass _Any(Anyish):\n def __init__(self):\n super(_Any, self).__init__(\n key="Any",\n name="Any",\n loader=BuiltinSchemas.ANY_INPUT,\n materializer=BuiltinSchemas.ANY_OUTPUT,\n is_builtin=True,\n )\n\n\ndef create_any_type(\n name: str,\n loader: t.Optional[DagsterTypeLoader] = None,\n materializer: t.Optional[DagsterTypeMaterializer] = None,\n description: t.Optional[str] = None,\n) -> Anyish:\n return Anyish(\n key=name,\n name=name,\n description=description,\n loader=loader,\n materializer=materializer,\n )\n\n\nclass _Nothing(DagsterType):\n def __init__(self):\n super(_Nothing, self).__init__(\n key="Nothing",\n name="Nothing",\n kind=DagsterTypeKind.NOTHING,\n loader=None,\n materializer=None,\n type_check_fn=self.type_check_method,\n is_builtin=True,\n )\n\n def type_check_method(self, _context: "TypeCheckContext", value: object) -> TypeCheck:\n if value is not None:\n return TypeCheck(\n success=False,\n description="Value must be None, got a {value_type}".format(value_type=type(value)),\n )\n\n return TypeCheck(success=True)\n\n @property\n def supports_fan_in(self) -> bool:\n return True\n\n def get_inner_type_for_fan_in(self) -> DagsterType:\n return self\n\n\ndef isinstance_type_check_fn(\n expected_python_type: t.Union[t.Type, t.Tuple[t.Type, ...]],\n dagster_type_name: str,\n expected_python_type_str: str,\n) -> TypeCheckFn:\n def type_check(_context: "TypeCheckContext", value: object) -> TypeCheck:\n if not isinstance(value, expected_python_type):\n return TypeCheck(\n success=False,\n description=(\n f"Value of type {type(value)} failed type check for Dagster type {dagster_type_name}, "\n f"expected value to be of Python type {expected_python_type_str}."\n ),\n )\n\n return TypeCheck(success=True)\n\n return type_check\n\n\n
[docs]class PythonObjectDagsterType(DagsterType):\n """Define a type in dagster whose typecheck is an isinstance check.\n\n Specifically, the type can either be a single python type (e.g. int),\n or a tuple of types (e.g. (int, float)) which is treated as a union.\n\n Examples:\n .. code-block:: python\n\n ntype = PythonObjectDagsterType(python_type=int)\n assert ntype.name == 'int'\n assert_success(ntype, 1)\n assert_failure(ntype, 'a')\n\n .. code-block:: python\n\n ntype = PythonObjectDagsterType(python_type=(int, float))\n assert ntype.name == 'Union[int, float]'\n assert_success(ntype, 1)\n assert_success(ntype, 1.5)\n assert_failure(ntype, 'a')\n\n\n Args:\n python_type (Union[Type, Tuple[Type, ...]): The dagster typecheck function calls instanceof on\n this type.\n name (Optional[str]): Name the type. Defaults to the name of ``python_type``.\n key (Optional[str]): Key of the type. Defaults to name.\n description (Optional[str]): A markdown-formatted string, displayed in tooling.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`~dagster.DagsterTypeLoader` and can map config data to a value of\n this type. Specify this argument if you will need to shim values of this type using the\n config machinery. As a rule, you should use the\n :py:func:`@dagster_type_loader <dagster.dagster_type_loader>` decorator to construct\n these arguments.\n materializer (Optional[DagsterTypeMaterializer]): An instance of a class\n that inherits from :py:class:`~dagster.DagsterTypeMaterializer` and can persist values of\n this type. As a rule, you should use the\n :py:func:`@dagster_type_mate <dagster.dagster_type_mate>`\n decorator to construct these arguments.\n """\n\n def __init__(\n self,\n python_type: t.Union[t.Type, t.Tuple[t.Type, ...]],\n key: t.Optional[str] = None,\n name: t.Optional[str] = None,\n **kwargs,\n ):\n if isinstance(python_type, tuple):\n self.python_type = check.tuple_param(\n python_type, "python_type", of_shape=tuple(type for item in python_type)\n )\n self.type_str = "Union[{}]".format(\n ", ".join(python_type.__name__ for python_type in python_type)\n )\n typing_type = t.Union[python_type] # type: ignore\n\n else:\n self.python_type = check.class_param(python_type, "python_type") # type: ignore\n self.type_str = cast(str, python_type.__name__)\n typing_type = self.python_type # type: ignore\n name = check.opt_str_param(name, "name", self.type_str)\n key = check.opt_str_param(key, "key", name)\n super(PythonObjectDagsterType, self).__init__(\n key=key,\n name=name,\n type_check_fn=isinstance_type_check_fn(python_type, name, self.type_str),\n typing_type=typing_type,\n **kwargs,\n )
\n\n\nclass NoneableInputSchema(DagsterTypeLoader):\n def __init__(self, inner_dagster_type: DagsterType):\n self._inner_dagster_type = check.inst_param(\n inner_dagster_type, "inner_dagster_type", DagsterType\n )\n self._inner_loader = check.not_none_param(inner_dagster_type.loader, "inner_dagster_type")\n self._schema_type = ConfigNoneable(self._inner_loader.schema_type)\n\n @property\n def schema_type(self) -> ConfigType:\n return self._schema_type\n\n def construct_from_config_value(\n self, context: "StepExecutionContext", config_value: object\n ) -> object:\n if config_value is None:\n return None\n return self._inner_loader.construct_from_config_value(context, config_value)\n\n\ndef _create_nullable_input_schema(inner_type: DagsterType) -> t.Optional[DagsterTypeLoader]:\n if not inner_type.loader:\n return None\n\n return NoneableInputSchema(inner_type)\n\n\nclass OptionalType(DagsterType):\n def __init__(self, inner_type: DagsterType):\n inner_type = resolve_dagster_type(inner_type)\n\n if inner_type is Nothing:\n raise DagsterInvalidDefinitionError(\n "Type Nothing can not be wrapped in List or Optional"\n )\n\n key = "Optional." + cast(str, inner_type.key)\n self.inner_type = inner_type\n super(OptionalType, self).__init__(\n key=key,\n name=None,\n kind=DagsterTypeKind.NULLABLE,\n type_check_fn=self.type_check_method,\n loader=_create_nullable_input_schema(inner_type),\n # This throws a type error with Py\n typing_type=t.Optional[inner_type.typing_type], # type: ignore\n )\n\n @property\n def display_name(self) -> str:\n return self.inner_type.display_name + "?"\n\n def type_check_method(self, context, value):\n return (\n TypeCheck(success=True) if value is None else self.inner_type.type_check(context, value)\n )\n\n @property\n def inner_types(self):\n return [self.inner_type] + self.inner_type.inner_types\n\n @property\n def type_param_keys(self):\n return [self.inner_type.key]\n\n @property\n def supports_fan_in(self):\n return self.inner_type.supports_fan_in\n\n def get_inner_type_for_fan_in(self):\n return self.inner_type.get_inner_type_for_fan_in()\n\n\nclass ListInputSchema(DagsterTypeLoader):\n def __init__(self, inner_dagster_type):\n self._inner_dagster_type = check.inst_param(\n inner_dagster_type, "inner_dagster_type", DagsterType\n )\n check.param_invariant(inner_dagster_type.loader, "inner_dagster_type")\n self._schema_type = Array(inner_dagster_type.loader.schema_type)\n\n @property\n def schema_type(self):\n return self._schema_type\n\n def construct_from_config_value(self, context, config_value):\n convert_item = partial(self._inner_dagster_type.loader.construct_from_config_value, context)\n return list(map(convert_item, config_value))\n\n\ndef _create_list_input_schema(inner_type):\n if not inner_type.loader:\n return None\n\n return ListInputSchema(inner_type)\n\n\nclass ListType(DagsterType):\n def __init__(self, inner_type: DagsterType):\n key = "List." 
+ inner_type.key\n self.inner_type = inner_type\n super(ListType, self).__init__(\n key=key,\n name=None,\n kind=DagsterTypeKind.LIST,\n type_check_fn=self.type_check_method,\n loader=_create_list_input_schema(inner_type),\n typing_type=t.List[inner_type.typing_type], # type: ignore\n )\n\n @property\n def display_name(self):\n return "[" + self.inner_type.display_name + "]"\n\n def type_check_method(self, context, value):\n value_check = _fail_if_not_of_type(value, list, "list")\n if not value_check.success:\n return value_check\n\n for item in value:\n item_check = self.inner_type.type_check(context, item)\n if not item_check.success:\n return item_check\n\n return TypeCheck(success=True)\n\n @property\n def inner_types(self):\n return [self.inner_type] + self.inner_type.inner_types\n\n @property\n def type_param_keys(self):\n return [self.inner_type.key]\n\n @property\n def supports_fan_in(self):\n return True\n\n def get_inner_type_for_fan_in(self):\n return self.inner_type\n\n\nclass DagsterListApi:\n def __getitem__(self, inner_type):\n check.not_none_param(inner_type, "inner_type")\n return _List(resolve_dagster_type(inner_type))\n\n def __call__(self, inner_type):\n check.not_none_param(inner_type, "inner_type")\n return _List(inner_type)\n\n\nList = DagsterListApi()\n\n\ndef _List(inner_type):\n check.inst_param(inner_type, "inner_type", DagsterType)\n if inner_type is Nothing:\n raise DagsterInvalidDefinitionError("Type Nothing can not be wrapped in List or Optional")\n return ListType(inner_type)\n\n\nclass Stringish(DagsterType):\n def __init__(self, key: t.Optional[str] = None, name: t.Optional[str] = None, **kwargs):\n name = check.opt_str_param(name, "name", type(self).__name__)\n key = check.opt_str_param(key, "key", name)\n super(Stringish, self).__init__(\n key=key,\n name=name,\n kind=DagsterTypeKind.SCALAR,\n type_check_fn=self.type_check_method,\n loader=BuiltinSchemas.STRING_INPUT,\n materializer=BuiltinSchemas.STRING_OUTPUT,\n typing_type=str,\n **kwargs,\n )\n\n def type_check_method(self, _context: "TypeCheckContext", value: object) -> TypeCheck:\n return _fail_if_not_of_type(value, str, "string")\n\n\ndef create_string_type(name, description=None):\n return Stringish(name=name, key=name, description=description)\n\n\nAny = _Any()\nBool = _Bool()\nFloat = _Float()\nInt = _Int()\nString = _String()\nNothing = _Nothing()\n\n_RUNTIME_MAP = {\n BuiltinEnum.ANY: Any,\n BuiltinEnum.BOOL: Bool,\n BuiltinEnum.FLOAT: Float,\n BuiltinEnum.INT: Int,\n BuiltinEnum.STRING: String,\n BuiltinEnum.NOTHING: Nothing,\n}\n\n_PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY: t.Dict[type, DagsterType] = {}\n"""Python types corresponding to user-defined RunTime types created using @map_to_dagster_type or\nas_dagster_type are registered here so that we can remap the Python types to runtime types."""\n\n\n
[docs]def make_python_type_usable_as_dagster_type(python_type: t.Type, dagster_type: DagsterType) -> None:\n """\n Take any existing python type and map it to a dagster type (generally created with\n :py:class:`DagsterType <dagster.DagsterType>`) This can only be called once\n on a given python type.\n """\n check.inst_param(python_type, "python_type", type)\n check.inst_param(dagster_type, "dagster_type", DagsterType)\n registered_dagster_type = _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY.get(python_type)\n\n if registered_dagster_type is None:\n _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY[python_type] = dagster_type\n elif registered_dagster_type is not dagster_type:\n # This would be just a great place to insert a short URL pointing to the type system\n # documentation into the error message\n # https://github.com/dagster-io/dagster/issues/1831\n if isinstance(registered_dagster_type, TypeHintInferredDagsterType):\n raise DagsterInvalidDefinitionError(\n f"A Dagster type has already been registered for the Python type "\n f'{python_type}. The Dagster type was "auto-registered" - i.e. a solid definition '\n f"used the Python type as an annotation for one of its arguments or for its return "\n f"value before make_python_type_usable_as_dagster_type was called, and we "\n f"generated a Dagster type to correspond to it. To override the auto-generated "\n f"Dagster type, call make_python_type_usable_as_dagster_type before any solid "\n f"definitions refer to the Python type."\n )\n else:\n raise DagsterInvalidDefinitionError(\n f"A Dagster type has already been registered for the Python type "\n f"{python_type}. make_python_type_usable_as_dagster_type can only "\n f"be called once on a python type as it is registering a 1:1 mapping "\n f"between that python type and a dagster type."\n )
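A minimal sketch of the mapping described above; ``Order`` and ``OrderDagsterType`` are hypothetical names.

.. code-block:: python

    from dagster import DagsterType, make_python_type_usable_as_dagster_type

    class Order:
        pass

    OrderDagsterType = DagsterType(
        name="Order",
        type_check_fn=lambda _context, value: isinstance(value, Order),
    )

    # After registration, an annotation of Order on a solid/op input or output
    # resolves to OrderDagsterType instead of an auto-registered type.
    make_python_type_usable_as_dagster_type(Order, OrderDagsterType)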
\n\n\nDAGSTER_INVALID_TYPE_ERROR_MESSAGE = (\n "Invalid type: dagster_type must be an instance of DagsterType or a Python type: "\n "got {dagster_type}{additional_msg}"\n)\n\n\nclass TypeHintInferredDagsterType(DagsterType):\n def __init__(self, python_type: t.Type):\n qualified_name = f"{python_type.__module__}.{python_type.__name__}"\n self.python_type = python_type\n super(TypeHintInferredDagsterType, self).__init__(\n key=f"_TypeHintInferred[{qualified_name}]",\n description=f"DagsterType created from a type hint for the Python type {qualified_name}",\n type_check_fn=isinstance_type_check_fn(\n python_type, python_type.__name__, qualified_name\n ),\n typing_type=python_type,\n )\n\n @property\n def display_name(self) -> str:\n return self.python_type.__name__\n\n\ndef resolve_dagster_type(dagster_type: object) -> DagsterType:\n # circular dep\n from dagster.primitive_mapping import (\n is_supported_runtime_python_builtin,\n remap_python_builtin_for_runtime,\n )\n from dagster.utils.typing_api import is_typing_type\n\n from .python_dict import Dict, PythonDict\n from .python_set import DagsterSetApi, PythonSet\n from .python_tuple import DagsterTupleApi, PythonTuple\n from .transform_typing import transform_typing_type\n\n check.invariant(\n not (isinstance(dagster_type, type) and issubclass(dagster_type, ConfigType)),\n "Cannot resolve a config type to a runtime type",\n )\n\n check.invariant(\n not (isinstance(dagster_type, type) and issubclass(dagster_type, DagsterType)),\n "Do not pass runtime type classes. Got {}".format(dagster_type),\n )\n\n # First check to see if it is part of python's typing library\n if is_typing_type(dagster_type):\n dagster_type = transform_typing_type(dagster_type)\n\n if isinstance(dagster_type, DagsterType):\n return dagster_type\n\n # Test for unhashable objects -- this is if, for instance, someone has passed us an instance of\n # a dict where they meant to pass dict or Dict, etc.\n try:\n hash(dagster_type)\n except TypeError:\n raise DagsterInvalidDefinitionError(\n DAGSTER_INVALID_TYPE_ERROR_MESSAGE.format(\n additional_msg=(\n ", which isn't hashable. 
Did you pass an instance of a type instead of "\n "the type?"\n ),\n dagster_type=str(dagster_type),\n )\n )\n\n if BuiltinEnum.contains(dagster_type):\n return DagsterType.from_builtin_enum(dagster_type)\n\n if is_supported_runtime_python_builtin(dagster_type):\n return remap_python_builtin_for_runtime(dagster_type)\n\n if dagster_type is None:\n return Any\n\n if dagster_type is Dict:\n return PythonDict\n if isinstance(dagster_type, DagsterTupleApi):\n return PythonTuple\n if isinstance(dagster_type, DagsterSetApi):\n return PythonSet\n if isinstance(dagster_type, DagsterListApi):\n return List(Any)\n\n if isinstance(dagster_type, type):\n return resolve_python_type_to_dagster_type(dagster_type)\n\n raise DagsterInvalidDefinitionError(\n DAGSTER_INVALID_TYPE_ERROR_MESSAGE.format(\n dagster_type=str(dagster_type), additional_msg="."\n )\n )\n\n\ndef resolve_python_type_to_dagster_type(python_type: t.Type) -> DagsterType:\n """\n Resolves a Python type to a Dagster type.\n """\n check.inst_param(python_type, "python_type", type)\n\n if python_type in _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY:\n return _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY[python_type]\n else:\n dagster_type = TypeHintInferredDagsterType(python_type)\n _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY[python_type] = dagster_type\n return dagster_type\n\n\nALL_RUNTIME_BUILTINS = list(_RUNTIME_MAP.values())\n\n\ndef construct_dagster_type_dictionary(solid_defs):\n type_dict_by_name = {t.unique_name: t for t in ALL_RUNTIME_BUILTINS}\n type_dict_by_key = {t.key: t for t in ALL_RUNTIME_BUILTINS}\n for solid_def in solid_defs:\n for dagster_type in solid_def.all_dagster_types():\n # We don't do uniqueness check on key because with classes\n # like Array, Noneable, etc, those are ephemeral objects\n # and it is perfectly fine to have many of them.\n type_dict_by_key[dagster_type.key] = dagster_type\n\n if not dagster_type.has_unique_name:\n continue\n\n if dagster_type.unique_name not in type_dict_by_name:\n type_dict_by_name[dagster_type.unique_name] = dagster_type\n continue\n\n if type_dict_by_name[dagster_type.unique_name] is not dagster_type:\n raise DagsterInvalidDefinitionError(\n (\n 'You have created two dagster types with the same name "{type_name}". '\n "Dagster types have must have unique names."\n ).format(type_name=dagster_type.display_name)\n )\n\n return type_dict_by_key\n\n\nclass DagsterOptionalApi:\n def __getitem__(self, inner_type: t.Union[t.Type, DagsterType]) -> OptionalType:\n inner_type = resolve_dagster_type(check.not_none_param(inner_type, "inner_type"))\n return OptionalType(inner_type)\n\n\nOptional = DagsterOptionalApi()\n
", "current_page_name": "_modules/dagster/core/types/dagster_type", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.types.dagster_type"}, "decorator": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.types.decorator

\nfrom dagster import check\n\nfrom .dagster_type import PythonObjectDagsterType, make_python_type_usable_as_dagster_type\n\n\n
[docs]def usable_as_dagster_type(\n name=None,\n description=None,\n loader=None,\n materializer=None,\n):\n """Decorate a Python class to make it usable as a Dagster Type.\n\n This is intended to make it straightforward to annotate existing business logic classes to\n make them dagster types whose typecheck is an isinstance check against that python class.\n\n Args:\n python_type (cls): The python type to make usable as python type.\n name (Optional[str]): Name of the new Dagster type. If ``None``, the name (``__name__``) of\n the ``python_type`` will be used.\n description (Optional[str]): A user-readable description of the type.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`DagsterTypeLoader` and can map config data to a value of\n this type. Specify this argument if you will need to shim values of this type using the\n config machinery. As a rule, you should use the\n :py:func:`@dagster_type_loader <dagster.dagster_type_loader>` decorator to construct\n these arguments.\n materializer (Optional[DagsterTypeMaterializer]): An instance of a class\n that inherits from :py:class:`DagsterTypeMaterializer` and can persist values of\n this type. As a rule, you should use the\n :py:func:`@dagster_type_materializer <dagster.dagster_type_materializer>`\n decorator to construct these arguments.\n\n Examples:\n\n .. code-block:: python\n\n # dagster_aws.s3.file_manager.S3FileHandle\n @usable_as_dagster_type\n class S3FileHandle(FileHandle):\n def __init__(self, s3_bucket, s3_key):\n self._s3_bucket = check.str_param(s3_bucket, 's3_bucket')\n self._s3_key = check.str_param(s3_key, 's3_key')\n\n @property\n def s3_bucket(self):\n return self._s3_bucket\n\n @property\n def s3_key(self):\n return self._s3_key\n\n @property\n def path_desc(self):\n return self.s3_path\n\n @property\n def s3_path(self):\n return 's3://{bucket}/{key}'.format(bucket=self.s3_bucket, key=self.s3_key)\n """\n\n def _with_args(bare_cls):\n check.class_param(bare_cls, "bare_cls")\n new_name = name if name else bare_cls.__name__\n\n make_python_type_usable_as_dagster_type(\n bare_cls,\n PythonObjectDagsterType(\n name=new_name,\n description=description,\n python_type=bare_cls,\n loader=loader,\n materializer=materializer,\n ),\n )\n return bare_cls\n\n # check for no args, no parens case\n if callable(name):\n bare_cls = name # with no parens, name is actually the decorated class\n make_python_type_usable_as_dagster_type(\n bare_cls,\n PythonObjectDagsterType(python_type=bare_cls, name=bare_cls.__name__, description=None),\n )\n return bare_cls\n\n return _with_args
\n
", "current_page_name": "_modules/dagster/core/types/decorator", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.types.decorator"}}}, "serdes": {"config_class": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.serdes.config_class

\nimport importlib\nfrom abc import ABC, abstractmethod\nfrom typing import NamedTuple\n\nimport yaml\n\nfrom dagster import check\n\nfrom .serdes import whitelist_for_serdes\n\n\n
[docs]@whitelist_for_serdes\nclass ConfigurableClassData(\n NamedTuple(\n "_ConfigurableClassData",\n [\n ("module_name", str),\n ("class_name", str),\n ("config_yaml", str),\n ],\n )\n):\n """Serializable tuple describing where to find a class and the config fragment that should\n be used to instantiate it.\n\n Users should not instantiate this class directly.\n\n Classes intended to be serialized in this way should implement the\n :py:class:`dagster.serdes.ConfigurableClass` mixin.\n """\n\n def __new__(cls, module_name, class_name, config_yaml):\n return super(ConfigurableClassData, cls).__new__(\n cls,\n check.str_param(module_name, "module_name"),\n check.str_param(class_name, "class_name"),\n check.str_param(config_yaml, "config_yaml"),\n )\n\n @property\n def config_dict(self):\n return yaml.safe_load(self.config_yaml)\n\n def info_dict(self):\n return {\n "module": self.module_name,\n "class": self.class_name,\n "config": self.config_dict,\n }\n\n def rehydrate(self):\n from dagster.config.field import resolve_to_config_type\n from dagster.config.validate import process_config\n from dagster.core.errors import DagsterInvalidConfigError\n\n try:\n module = importlib.import_module(self.module_name)\n except ModuleNotFoundError:\n check.failed(\n f"Couldn't import module {self.module_name} when attempting to load the "\n f"configurable class {self.module_name}.{self.class_name}"\n )\n try:\n klass = getattr(module, self.class_name)\n except AttributeError:\n check.failed(\n f"Couldn't find class {self.class_name} in module when attempting to load the "\n f"configurable class {self.module_name}.{self.class_name}"\n )\n\n if not issubclass(klass, ConfigurableClass):\n raise check.CheckError(\n klass,\n f"class {self.class_name} in module {self.module_name}",\n ConfigurableClass,\n )\n\n config_dict = self.config_dict\n result = process_config(resolve_to_config_type(klass.config_type()), config_dict)\n if not result.success:\n raise DagsterInvalidConfigError(\n f"Errors whilst loading configuration for {klass.config_type()}.",\n result.errors,\n config_dict,\n )\n return klass.from_config_value(self, result.value)
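A minimal sketch of rehydrating a pluggable class from serialized data, reusing the ``SqliteScheduleStorage`` shown earlier on this page; the ``base_dir`` value is hypothetical.

.. code-block:: python

    from dagster.serdes import ConfigurableClassData

    inst_data = ConfigurableClassData(
        module_name="dagster.core.storage.schedules.sqlite.sqlite_schedule_storage",
        class_name="SqliteScheduleStorage",
        config_yaml='base_dir: "/tmp/dagster/schedules"\n',
    )

    # rehydrate() imports the module, validates config_yaml against the class's
    # config_type(), and calls from_config_value() with the validated value.
    storage = inst_data.rehydrate()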
\n\n\n
[docs]class ConfigurableClass(ABC):\n """Abstract mixin for classes that can be loaded from config.\n\n This supports a powerful plugin pattern which avoids both a) a lengthy, hard-to-synchronize list\n of conditional imports / optional extras_requires in dagster core and b) a magic directory or\n file in which third parties can place plugin packages. Instead, the intention is to make, e.g.,\n run storage, pluggable with a config chunk like:\n\n .. code-block:: yaml\n\n run_storage:\n module: very_cool_package.run_storage\n class: SplendidRunStorage\n config:\n magic_word: "quux"\n\n This same pattern should eventually be viable for other system components, e.g. engines.\n\n The ``ConfigurableClass`` mixin provides the necessary hooks for classes to be instantiated from\n an instance of ``ConfigurableClassData``.\n\n Pieces of the Dagster system which we wish to make pluggable in this way should consume a config\n type such as:\n\n .. code-block:: python\n\n {'module': str, 'class': str, 'config': Field(Permissive())}\n\n """\n\n @property\n @abstractmethod\n def inst_data(self):\n """\n Subclass must be able to return the inst_data as a property if it has been constructed\n through the from_config_value code path.\n """\n\n
[docs] @classmethod\n @abstractmethod\n def config_type(cls):\n """dagster.ConfigType: The config type against which to validate a config yaml fragment\n serialized in an instance of ``ConfigurableClassData``.\n """\n raise NotImplementedError(f"{cls.__name__} must implement the config_type classmethod")
\n\n
[docs] @staticmethod\n @abstractmethod\n def from_config_value(inst_data, config_value):\n """New up an instance of the ConfigurableClass from a validated config value.\n\n Called by ConfigurableClassData.rehydrate.\n\n Args:\n config_value (dict): The validated config value to use. Typically this should be the\n ``value`` attribute of a\n :py:class:`~dagster.core.types.evaluator.evaluation.EvaluateValueResult`.\n\n\n A common pattern is for the implementation to align the config_value with the signature\n of the ConfigurableClass's constructor:\n\n .. code-block:: python\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return MyConfigurableClass(inst_data=inst_data, **config_value)\n\n """\n raise NotImplementedError(\n "ConfigurableClass subclasses must implement the from_config_value staticmethod"\n )
\n\n\ndef class_from_code_pointer(module_name, class_name):\n try:\n module = importlib.import_module(module_name)\n except ModuleNotFoundError:\n check.failed(\n "Couldn't import module {module_name} when attempting to load the "\n "class {klass}".format(\n module_name=module_name,\n klass=module_name + "." + class_name,\n )\n )\n try:\n return getattr(module, class_name)\n except AttributeError:\n check.failed(\n "Couldn't find class {class_name} in module when attempting to load the "\n "class {klass}".format(\n class_name=class_name,\n klass=module_name + "." + class_name,\n )\n )\n
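Tying the three abstract members above together, here is a minimal sketch of a complete ``ConfigurableClass`` implementation; ``MyRunLogger`` and its ``verbosity`` field are hypothetical.

.. code-block:: python

    from dagster import Field, StringSource
    from dagster.serdes import ConfigurableClass, ConfigurableClassData

    class MyRunLogger(ConfigurableClass):
        def __init__(self, verbosity="info", inst_data=None):
            self._verbosity = verbosity
            self._inst_data = inst_data

        @property
        def inst_data(self):
            return self._inst_data

        @classmethod
        def config_type(cls):
            # Validated against the `config` chunk of the yaml fragment.
            return {"verbosity": Field(StringSource, default_value="info")}

        @staticmethod
        def from_config_value(inst_data, config_value):
            # Align the validated config with the constructor signature.
            return MyRunLogger(inst_data=inst_data, **config_value)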
", "current_page_name": "_modules/dagster/serdes/config_class", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.serdes.config_class"}}, "utils": {"alabaster_version": "0.7.12", "backcompat": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.utils.backcompat

\nimport inspect\nimport warnings\nfrom functools import wraps\nfrom typing import Optional\n\nfrom dagster import check\n\nEXPERIMENTAL_WARNING_HELP = (\n    "To mute warnings for experimental functionality, invoke"\n    ' warnings.filterwarnings("ignore", category=dagster.ExperimentalWarning) or use'\n    " one of the other methods described at"\n    " https://docs.python.org/3/library/warnings.html#describing-warning-filters."\n)\n\n\ndef canonicalize_backcompat_args(new_val, new_arg, old_val, old_arg, breaking_version, **kwargs):\n    """\n    Utility for managing backwards compatibility of two related arguments.\n\n    For example if you had an existing function\n\n    def is_new(old_flag):\n        return not new_flag\n\n    And you decided you wanted a new function to be:\n\n    def is_new(new_flag):\n        return new_flag\n\n    However you want an in between period where either flag is accepted. Use\n    canonicalize_backcompat_args to manage that:\n\n    def is_new(old_flag=None, new_flag=None):\n        return canonicalize_backcompat_args(\n            new_val=new_flag,\n            new_arg='new_flag',\n            old_val=old_flag,\n            old_arg='old_flag',\n            breaking_version='0.9.0',\n            coerce_old_to_new=lambda val: not val,\n        )\n\n\n    In this example, if the caller sets both new_flag and old_flag, it will fail by throwing\n    a CheckError. If the caller sets old_flag, it will run it through the coercion function\n    , warn, and then execute.\n\n    canonicalize_backcompat_args returns the value as if *only* new_val were specified\n    """\n    coerce_old_to_new = kwargs.get("coerce_old_to_new")\n    additional_warn_txt = kwargs.get("additional_warn_txt")\n    # stacklevel=3 punches up to the caller of canonicalize_backcompat_args\n    stacklevel = kwargs.get("stacklevel", 3)\n\n    check.str_param(new_arg, "new_arg")\n    check.str_param(old_arg, "old_arg")\n    check.opt_callable_param(coerce_old_to_new, "coerce_old_to_new")\n    check.opt_str_param(additional_warn_txt, "additional_warn_txt")\n    check.opt_int_param(stacklevel, "stacklevel")\n    if new_val is not None:\n        if old_val is not None:\n            check.failed(\n                'Do not use deprecated "{old_arg}" now that you are using "{new_arg}".'.format(\n                    old_arg=old_arg, new_arg=new_arg\n                )\n            )\n        return new_val\n    if old_val is not None:\n        _additional_warn_txt = f'Use "{new_arg}" instead.' 
+ (\n            (" " + additional_warn_txt) if additional_warn_txt else ""\n        )\n        deprecation_warning(\n            f'Argument "{old_arg}"', breaking_version, _additional_warn_txt, stacklevel + 1\n        )\n        return coerce_old_to_new(old_val) if coerce_old_to_new else old_val\n\n    return new_val\n\n\ndef deprecation_warning(\n    subject: str,\n    breaking_version: str,\n    additional_warn_txt: Optional[str] = None,\n    stacklevel: int = 3,\n):\n    warnings.warn(\n        f"{subject} is deprecated and will be removed in {breaking_version}."\n        + ((" " + additional_warn_txt) if additional_warn_txt else ""),\n        category=DeprecationWarning,\n        stacklevel=stacklevel,\n    )\n\n\ndef rename_warning(new_name, old_name, breaking_version, additional_warn_txt=None, stacklevel=3):\n    """\n    Common utility for managing backwards compatibility of renaming.\n    """\n    warnings.warn(\n        '"{old_name}" is deprecated and will be removed in {breaking_version}, use "{new_name}" instead.'.format(\n            old_name=old_name,\n            new_name=new_name,\n            breaking_version=breaking_version,\n        )\n        + ((" " + additional_warn_txt) if additional_warn_txt else ""),\n        category=DeprecationWarning,\n        stacklevel=stacklevel,\n    )\n\n\n
[docs]class ExperimentalWarning(Warning):\n pass
\n\n\ndef experimental_fn_warning(name, stacklevel=3):\n """Utility for warning that a function is experimental"""\n warnings.warn(\n '"{name}" is an experimental function. It may break in future versions, even between dot'\n " releases. {help}".format(name=name, help=EXPERIMENTAL_WARNING_HELP),\n ExperimentalWarning,\n stacklevel=stacklevel,\n )\n\n\ndef experimental_decorator_warning(name, stacklevel=3):\n """Utility for warning that a decorator is experimental"""\n warnings.warn(\n f'"{name}" is an experimental decorator. It may break in future versions, even between dot'\n f" releases. {EXPERIMENTAL_WARNING_HELP}",\n ExperimentalWarning,\n stacklevel=stacklevel,\n )\n\n\ndef experimental_class_warning(name, stacklevel=3):\n """Utility for warning that a class is experimental. Expected to be called from the class's\n __init__ method.\n\n Usage:\n\n .. code-block:: python\n\n class MyExperimentalClass:\n def __init__(self, some_arg):\n experimental_class_warning('MyExperimentalClass')\n # do other initialization stuff\n """\n warnings.warn(\n '"{name}" is an experimental class. It may break in future versions, even between dot'\n " releases. {help}".format(name=name, help=EXPERIMENTAL_WARNING_HELP),\n ExperimentalWarning,\n stacklevel=stacklevel,\n )\n\n\ndef experimental_arg_warning(arg_name, fn_name, stacklevel=3):\n """Utility for warning that an argument to a function is experimental"""\n warnings.warn(\n '"{arg_name}" is an experimental argument to function "{fn_name}". '\n "It may break in future versions, even between dot releases. {help}".format(\n arg_name=arg_name, fn_name=fn_name, help=EXPERIMENTAL_WARNING_HELP\n ),\n ExperimentalWarning,\n stacklevel=stacklevel,\n )\n\n\ndef experimental_functionality_warning(desc, stacklevel=3):\n """Utility for warning that a particular functionality is experimental"""\n warnings.warn(\n f"{desc} is currently experimental functionality. It may break in future versions, even "\n f"between dot releases. {EXPERIMENTAL_WARNING_HELP}",\n ExperimentalWarning,\n stacklevel=stacklevel,\n )\n\n\ndef experimental_class_param_warning(param_name, class_name, stacklevel=3):\n """Utility for warning that an argument to a constructor is experimental"""\n warnings.warn(\n (\n f'"{param_name}" is an experimental parameter to the class "{class_name}". It may '\n f"break in future versions, even between dot releases. {EXPERIMENTAL_WARNING_HELP}"\n ),\n ExperimentalWarning,\n stacklevel=stacklevel,\n )\n\n\ndef experimental(callable_):\n """\n Spews an "experimental" warning whenever the given callable is called. If the argument is a\n class, this means the warning will be emitted when the class is instantiated.\n\n Usage:\n\n .. 
code-block:: python\n\n @experimental\n def my_experimental_function(my_arg):\n do_stuff()\n\n @experimental\n class MyExperimentalClass:\n pass\n """\n check.callable_param(callable_, "callable_")\n\n if inspect.isfunction(callable_):\n\n @wraps(callable_)\n def _inner(*args, **kwargs):\n experimental_fn_warning(callable_.__name__, stacklevel=3)\n return callable_(*args, **kwargs)\n\n return _inner\n\n if inspect.isclass(callable_):\n\n undecorated_init = callable_.__init__\n\n def __init__(self, *args, **kwargs):\n experimental_class_warning(callable_.__name__, stacklevel=3)\n # Tuples must be handled differently, because the undecorated_init does not take any\n # arguments-- they're assigned in __new__.\n if issubclass(callable_, tuple):\n undecorated_init(self)\n else:\n undecorated_init(self, *args, **kwargs)\n\n callable_.__init__ = __init__\n\n return callable_\n\n\ndef experimental_decorator(decorator):\n """\n Spews an "experimental" warning whenever the given decorator is invoked.\n\n Usage:\n\n .. code-block:: python\n\n @experimental_decorator\n def my_experimental_decorator(...):\n ...\n """\n check.callable_param(decorator, "decorator")\n\n @wraps(decorator)\n def _inner(*args, **kwargs):\n experimental_decorator_warning(decorator.__name__, stacklevel=3)\n return decorator(*args, **kwargs)\n\n return _inner\n
", "current_page_name": "_modules/dagster/utils/backcompat", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster.utils"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.utils.backcompat"}, "body": "

Source code for dagster.utils

\nimport _thread as thread\nimport contextlib\nimport contextvars\nimport datetime\nimport errno\nimport functools\nimport inspect\nimport multiprocessing\nimport os\nimport re\nimport signal\nimport socket\nimport subprocess\nimport sys\nimport tempfile\nimport threading\nfrom collections import OrderedDict, defaultdict, namedtuple\nfrom datetime import timezone\nfrom enum import Enum\nfrom typing import TYPE_CHECKING, Any, Callable, ContextManager, Generator, Generic, Iterator\nfrom typing import Mapping as TypingMapping\nfrom typing import Optional, Type, TypeVar, Union, cast, overload\nfrom warnings import warn\n\nimport yaml\n\nfrom dagster import check, seven\nfrom dagster.core.errors import DagsterExecutionInterruptedError, DagsterInvariantViolationError\nfrom dagster.seven import IS_WINDOWS\nfrom dagster.seven.abc import Mapping\n\nfrom .merger import merge_dicts\nfrom .yaml_utils import load_yaml_from_glob_list, load_yaml_from_globs, load_yaml_from_path\n\nif sys.version_info > (3,):\n    from pathlib import Path  # pylint: disable=import-error\nelse:\n    from pathlib2 import Path  # pylint: disable=import-error\n\nif TYPE_CHECKING:\n    from dagster.core.events import DagsterEvent\n\nT = TypeVar("T")\n\nEPOCH = datetime.datetime.utcfromtimestamp(0)\n\nPICKLE_PROTOCOL = 4\n\n\nDEFAULT_WORKSPACE_YAML_FILENAME = "workspace.yaml"\n\n\n# Back-compat after make_email_on_pipeline_failure_sensor and make_email_on_run_failure_sensor\n# were moved to avoid circular-dependency issues\ndef make_email_on_pipeline_failure_sensor(*args, **kwargs):\n    from .alert import make_email_on_pipeline_failure_sensor  # pylint: disable=redefined-outer-name\n\n    return make_email_on_pipeline_failure_sensor(*args, **kwargs)\n\n\n
[docs]def make_email_on_run_failure_sensor(*args, **kwargs):\n from .alert import make_email_on_run_failure_sensor # pylint: disable=redefined-outer-name\n\n return make_email_on_run_failure_sensor(*args, **kwargs)
\n\n\n
[docs]def file_relative_path(dunderfile: str, relative_path: str) -> str:\n """Get a path relative to the currently executing Python file.\n\n This function is useful when one needs to load a file that is relative to the position of\n the current file. (Such as when you encode a configuration file path in a source file and want\n it to be runnable from any current working directory.)\n\n Args:\n dunderfile (str): Should always be ``__file__``.\n relative_path (str): Path to get relative to the currently executing file.\n\n **Examples**:\n\n .. code-block:: python\n\n file_relative_path(__file__, 'path/relative/to/file')\n\n """\n\n check.str_param(dunderfile, "dunderfile")\n check.str_param(relative_path, "relative_path")\n\n return os.path.join(os.path.dirname(dunderfile), relative_path)
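A slightly fuller sketch of the common pattern: resolving a YAML file that sits next to the calling module and loading it with the ``load_yaml_from_path`` helper re-exported by this module. The file name ``run_config.yaml`` is hypothetical.

.. code-block:: python

    from dagster.utils import file_relative_path, load_yaml_from_path

    # resolves relative to this source file, regardless of the current working directory
    config = load_yaml_from_path(file_relative_path(__file__, "run_config.yaml"))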
\n\n\ndef script_relative_path(file_path: str) -> str:\n """\n Useful for testing with local files. Use a path relative to where the\n test resides and this function will return the absolute path\n of that file. Otherwise it will be relative to script that\n ran the test\n\n Note: this is function is very, very expensive (on the order of 1\n millisecond per invocation) so this should only be used in performance\n insensitive contexts. Prefer file_relative_path for anything with\n performance constraints.\n\n """\n # from http://bit.ly/2snyC6s\n\n check.str_param(file_path, "file_path")\n scriptdir = inspect.stack()[1][1]\n return os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(scriptdir)), file_path))\n\n\n# Adapted from https://github.com/okunishinishi/python-stringcase/blob/master/stringcase.py\ndef camelcase(string):\n check.str_param(string, "string")\n\n string = re.sub(r"^[\\-_\\.]", "", str(string))\n if not string:\n return string\n return str(string[0]).upper() + re.sub(\n r"[\\-_\\.\\s]([a-z])", lambda matched: str(matched.group(1)).upper(), string[1:]\n )\n\n\ndef ensure_single_item(ddict):\n check.dict_param(ddict, "ddict")\n check.param_invariant(len(ddict) == 1, "ddict", "Expected dict with single item")\n return list(ddict.items())[0]\n\n\n@contextlib.contextmanager\ndef pushd(path):\n old_cwd = os.getcwd()\n os.chdir(path)\n try:\n yield path\n finally:\n os.chdir(old_cwd)\n\n\ndef safe_isfile(path):\n """ "Backport of Python 3.8 os.path.isfile behavior.\n\n This is intended to backport https://docs.python.org/dev/whatsnew/3.8.html#os-path. I'm not\n sure that there are other ways to provoke this behavior on Unix other than the null byte,\n but there are certainly other ways to do it on Windows. Afaict, we won't mask other\n ValueErrors, and the behavior in the status quo ante is rough because we risk throwing an\n unexpected, uncaught ValueError from very deep in our logic.\n """\n try:\n return os.path.isfile(path)\n except ValueError:\n return False\n\n\ndef mkdir_p(path):\n try:\n os.makedirs(path)\n return path\n except OSError as exc: # Python >2.5\n if exc.errno == errno.EEXIST and os.path.isdir(path):\n pass\n else:\n raise\n\n\n# TODO: Make frozendict generic for type annotations\n# https://github.com/dagster-io/dagster/issues/3641\nclass frozendict(dict):\n def __readonly__(self, *args, **kwargs):\n raise RuntimeError("Cannot modify ReadOnlyDict")\n\n # https://docs.python.org/3/library/pickle.html#object.__reduce__\n #\n # For a dict, the default behavior for pickle is to iteratively call __setitem__ (see 5th item\n # in __reduce__ tuple). Since we want to disable __setitem__ and still inherit dict, we\n # override this behavior by defining __reduce__. 
We return the 3rd item in the tuple, which is\n # passed to __setstate__, allowing us to restore the frozendict.\n\n def __reduce__(self):\n return (frozendict, (), dict(self))\n\n def __setstate__(self, state):\n self.__init__(state)\n\n __setitem__ = __readonly__\n __delitem__ = __readonly__\n pop = __readonly__ # type: ignore[assignment]\n popitem = __readonly__\n clear = __readonly__\n update = __readonly__ # type: ignore[assignment]\n setdefault = __readonly__\n del __readonly__\n\n\nclass frozenlist(list):\n def __readonly__(self, *args, **kwargs):\n raise RuntimeError("Cannot modify ReadOnlyList")\n\n # https://docs.python.org/3/library/pickle.html#object.__reduce__\n #\n # Like frozendict, implement __reduce__ and __setstate__ to handle pickling.\n # Otherwise, __setstate__ will be called to restore the frozenlist, causing\n # a RuntimeError because frozenlist is not mutable.\n\n def __reduce__(self):\n return (frozenlist, (), list(self))\n\n def __setstate__(self, state):\n self.__init__(state)\n\n __setitem__ = __readonly__ # type: ignore[assignment]\n __delitem__ = __readonly__\n append = __readonly__\n clear = __readonly__\n extend = __readonly__\n insert = __readonly__\n pop = __readonly__\n remove = __readonly__\n reverse = __readonly__\n sort = __readonly__ # type: ignore[assignment]\n\n def __hash__(self):\n return hash(tuple(self))\n\n\ndef make_readonly_value(value):\n if isinstance(value, list):\n return frozenlist(list(map(make_readonly_value, value)))\n elif isinstance(value, dict):\n return frozendict({key: make_readonly_value(value) for key, value in value.items()})\n else:\n return value\n\n\ndef get_prop_or_key(elem, key):\n if isinstance(elem, Mapping):\n return elem.get(key)\n else:\n return getattr(elem, key)\n\n\ndef list_pull(alist, key):\n return list(map(lambda elem: get_prop_or_key(elem, key), alist))\n\n\ndef all_none(kwargs):\n for value in kwargs.values():\n if value is not None:\n return False\n return True\n\n\ndef check_script(path, return_code=0):\n try:\n subprocess.check_output([sys.executable, path])\n except subprocess.CalledProcessError as exc:\n if return_code != 0:\n if exc.returncode == return_code:\n return\n raise\n\n\ndef check_cli_execute_file_pipeline(path, pipeline_fn_name, env_file=None):\n from dagster.core.test_utils import instance_for_test\n\n with instance_for_test():\n cli_cmd = [\n sys.executable,\n "-m",\n "dagster",\n "pipeline",\n "execute",\n "-f",\n path,\n "-a",\n pipeline_fn_name,\n ]\n\n if env_file:\n cli_cmd.append("-c")\n cli_cmd.append(env_file)\n\n try:\n subprocess.check_output(cli_cmd)\n except subprocess.CalledProcessError as cpe:\n print(cpe) # pylint: disable=print-call\n raise cpe\n\n\ndef safe_tempfile_path_unmanaged() -> str:\n # This gets a valid temporary file path in the safest possible way, although there is still no\n # guarantee that another process will not create a file at this path. 
The NamedTemporaryFile is\n # deleted when the context manager exits and the file object is closed.\n #\n # This is preferable to using NamedTemporaryFile as a context manager and passing the name\n # attribute of the file object around because NamedTemporaryFiles cannot be opened a second time\n # if already open on Windows NT or later:\n # https://docs.python.org/3.8/library/tempfile.html#tempfile.NamedTemporaryFile\n # https://github.com/dagster-io/dagster/issues/1582\n with tempfile.NamedTemporaryFile() as fd:\n path = fd.name\n return Path(path).as_posix()\n\n\n@contextlib.contextmanager\ndef safe_tempfile_path() -> Iterator[str]:\n try:\n path = safe_tempfile_path_unmanaged()\n yield path\n finally:\n if os.path.exists(path):\n os.unlink(path)\n\n\n@overload\ndef ensure_gen(thing_or_gen: Generator[T, Any, Any]) -> Generator[T, Any, Any]:\n pass\n\n\n@overload\ndef ensure_gen(thing_or_gen: T) -> Generator[T, Any, Any]:\n pass\n\n\ndef ensure_gen(thing_or_gen):\n if not inspect.isgenerator(thing_or_gen):\n\n def _gen_thing():\n yield thing_or_gen\n\n return _gen_thing()\n\n return thing_or_gen\n\n\ndef ensure_dir(file_path):\n try:\n os.makedirs(file_path)\n except OSError as e:\n if e.errno != errno.EEXIST:\n raise\n\n\ndef ensure_file(path):\n ensure_dir(os.path.dirname(path))\n if not os.path.exists(path):\n touch_file(path)\n\n\ndef touch_file(path):\n ensure_dir(os.path.dirname(path))\n with open(path, "a"):\n os.utime(path, None)\n\n\ndef _kill_on_event(termination_event):\n termination_event.wait()\n send_interrupt()\n\n\ndef send_interrupt():\n if IS_WINDOWS:\n # This will raise a KeyboardInterrupt in python land - meaning this wont be able to\n # interrupt things like sleep()\n thread.interrupt_main()\n else:\n # If on unix send an os level signal to interrupt any situation we may be stuck in\n os.kill(os.getpid(), signal.SIGINT)\n\n\n# Function to be invoked by daemon thread in processes which seek to be cancellable.\n# The motivation for this approach is to be able to exit cleanly on Windows. 
An alternative\n# path is to change how the processes are opened and send CTRL_BREAK signals, which at\n# the time of authoring seemed a more costly approach.\n#\n# Reading for the curious:\n# * https://stackoverflow.com/questions/35772001/how-to-handle-the-signal-in-python-on-windows-machine\n# * https://stefan.sofa-rockers.org/2013/08/15/handling-sub-process-hierarchies-python-linux-os-x/\ndef start_termination_thread(termination_event):\n check.inst_param(termination_event, "termination_event", ttype=type(multiprocessing.Event()))\n\n int_thread = threading.Thread(\n target=_kill_on_event, args=(termination_event,), name="kill-on-event"\n )\n int_thread.daemon = True\n int_thread.start()\n\n\n# Executes the next() function within an instance of the supplied context manager class\n# (leaving the context before yielding each result)\ndef iterate_with_context(\n context_fn: Callable[[], ContextManager], iterator: Iterator[T]\n) -> Iterator[T]:\n while True:\n # Allow interrupts during user code so that we can terminate slow/hanging steps\n with context_fn():\n try:\n next_output = next(iterator)\n except StopIteration:\n return\n\n yield next_output\n\n\ndef datetime_as_float(dt):\n check.inst_param(dt, "dt", datetime.datetime)\n return float((dt - EPOCH).total_seconds())\n\n\n# hashable frozen string to string dict\nclass frozentags(frozendict):\n def __init__(self, *args, **kwargs):\n super(frozentags, self).__init__(*args, **kwargs)\n check.dict_param(self, "self", key_type=str, value_type=str)\n\n def __hash__(self):\n return hash(tuple(sorted(self.items())))\n\n def updated_with(self, new_tags):\n check.dict_param(new_tags, "new_tags", key_type=str, value_type=str)\n updated = dict(self)\n for key, value in new_tags.items():\n updated[key] = value\n\n return frozentags(updated)\n\n\nGeneratedContext = TypeVar("GeneratedContext")\n\n\nclass EventGenerationManager(Generic[GeneratedContext]):\n """Utility class that wraps an event generator function, that also yields a single instance of\n a typed object. 
All events yielded before the typed object are yielded through the method\n `generate_setup_events` and all events yielded after the typed object are yielded through the\n method `generate_teardown_events`.\n\n This is used to help replace the context managers used in pipeline initialization with\n generators so that we can begin emitting initialization events AND construct a pipeline context\n object, while managing explicit setup/teardown.\n\n This does require calling `generate_setup_events` AND `generate_teardown_events` in order to\n get the typed object.\n """\n\n def __init__(\n self,\n generator: Generator[Union["DagsterEvent", GeneratedContext], None, None],\n object_cls: Type[GeneratedContext],\n require_object: Optional[bool] = True,\n ):\n self.generator = check.generator(generator)\n self.object_cls: Type[GeneratedContext] = check.class_param(object_cls, "object_cls")\n self.require_object = check.bool_param(require_object, "require_object")\n self.object: Optional[GeneratedContext] = None\n self.did_setup = False\n self.did_teardown = False\n\n def generate_setup_events(self) -> Iterator["DagsterEvent"]:\n self.did_setup = True\n try:\n while self.object is None:\n obj = next(self.generator)\n if isinstance(obj, self.object_cls):\n self.object = obj\n else:\n yield obj\n except StopIteration:\n if self.require_object:\n check.inst_param(\n self.object,\n "self.object",\n self.object_cls,\n "generator never yielded object of type {}".format(self.object_cls.__name__),\n )\n\n def get_object(self) -> GeneratedContext:\n if not self.did_setup:\n check.failed("Called `get_object` before `generate_setup_events`")\n return cast(GeneratedContext, self.object)\n\n def generate_teardown_events(self) -> Iterator["DagsterEvent"]:\n self.did_teardown = True\n if self.object:\n yield from self.generator\n\n\ndef utc_datetime_from_timestamp(timestamp):\n tz = timezone.utc\n return datetime.datetime.fromtimestamp(timestamp, tz=tz)\n\n\ndef utc_datetime_from_naive(dt):\n tz = timezone.utc\n return dt.replace(tzinfo=tz)\n\n\ndef is_enum_value(value):\n return False if value is None else issubclass(value.__class__, Enum)\n\n\ndef git_repository_root():\n return subprocess.check_output(["git", "rev-parse", "--show-toplevel"]).decode("utf-8").strip()\n\n\ndef segfault():\n """Reliable cross-Python version segfault.\n\n https://bugs.python.org/issue1215#msg143236\n """\n import ctypes\n\n ctypes.string_at(0)\n\n\ndef find_free_port():\n with contextlib.closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:\n s.bind(("", 0))\n s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)\n return s.getsockname()[1]\n\n\n@contextlib.contextmanager\ndef alter_sys_path(to_add, to_remove):\n to_restore = [path for path in sys.path]\n\n # remove paths\n for path in to_remove:\n if path in sys.path:\n sys.path.remove(path)\n\n # add paths\n for path in to_add:\n sys.path.insert(0, path)\n\n try:\n yield\n finally:\n sys.path = to_restore\n\n\n@contextlib.contextmanager\ndef restore_sys_modules():\n sys_modules = {k: v for k, v in sys.modules.items()}\n try:\n yield\n finally:\n to_delete = set(sys.modules) - set(sys_modules)\n for key in to_delete:\n del sys.modules[key]\n\n\ndef process_is_alive(pid):\n if IS_WINDOWS:\n import psutil # pylint: disable=import-error\n\n return psutil.pid_exists(pid=pid)\n else:\n try:\n subprocess.check_output(["ps", str(pid)])\n except subprocess.CalledProcessError as exc:\n assert exc.returncode == 1\n return False\n return True\n\n\ndef compose(*args):\n """\n 
Compose python functions args such that compose(f, g)(x) is equivalent to f(g(x)).\n """\n # reduce using functional composition over all the arguments, with the identity function as\n # initializer\n return functools.reduce(lambda f, g: lambda x: f(g(x)), args, lambda x: x)\n\n\ndef dict_without_keys(ddict, *keys):\n return {key: value for key, value in ddict.items() if key not in set(keys)}\n\n\nclass Counter:\n def __init__(self):\n self._lock = threading.Lock()\n self._counts = OrderedDict()\n super(Counter, self).__init__()\n\n def increment(self, key: str):\n with self._lock:\n self._counts[key] = self._counts.get(key, 0) + 1\n\n def counts(self) -> TypingMapping[str, int]:\n with self._lock:\n copy = {k: v for k, v in self._counts.items()}\n return copy\n\n\ntraced_counter = contextvars.ContextVar("traced_counts", default=Counter())\n\n\ndef traced(func=None):\n """\n A decorator that keeps track of how many times a function is called.\n """\n\n def inner(*args, **kwargs):\n counter = traced_counter.get()\n if counter and isinstance(counter, Counter):\n counter.increment(func.__qualname__)\n\n return func(*args, **kwargs)\n\n return inner\n
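To make the setup/teardown protocol of ``EventGenerationManager`` concrete, here is a minimal sketch using stand-in event and context types (real callers yield ``DagsterEvent`` objects). It assumes the class is importable from ``dagster.utils``; ``MyContext`` and the string events are hypothetical.

.. code-block:: python

    from dagster.utils import EventGenerationManager


    class MyContext:
        pass


    def events_and_context():
        yield "setup-event"     # emitted by generate_setup_events
        yield MyContext()       # captured as the typed object
        yield "teardown-event"  # emitted by generate_teardown_events


    manager = EventGenerationManager(events_and_context(), MyContext)
    setup_events = list(manager.generate_setup_events())        # ["setup-event"]
    context = manager.get_object()                               # the MyContext instance
    teardown_events = list(manager.generate_teardown_events())   # ["teardown-event"]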
", "current_page_name": "_modules/dagster/utils", "customsidebar": null, "forked_pdb": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.utils.forked_pdb

\nimport pdb\nimport sys\n\n\n# From https://stackoverflow.com/questions/4716533/how-to-attach-debugger-to-a-python-subproccess\n
[docs]class ForkedPdb(pdb.Pdb):\n """A pdb subclass that may be used from a forked multiprocessing child\n\n **Examples**:\n\n .. code-block:: python\n\n from dagster.utils.forked_pdb import ForkedPdb\n\n @solid\n def complex_solid(_):\n # some complicated stuff\n\n ForkedPdb().set_trace()\n\n # some other complicated stuff\n\n You can initiate pipeline execution via dagit and use the pdb debugger to examine/step through\n execution at the breakpoint.\n """\n\n def interaction(self, frame, traceback):\n _stdin = sys.stdin\n try:\n sys.stdin = open("/dev/stdin")\n pdb.Pdb.interaction(self, frame, traceback)\n finally:\n sys.stdin = _stdin
\n
", "current_page_name": "_modules/dagster/utils/forked_pdb", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster.utils"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.utils.forked_pdb"}, "log": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.utils.log

\nimport copy\nimport logging\nimport sys\nimport traceback\nfrom contextlib import contextmanager\nfrom typing import Dict, NamedTuple, Optional\n\nimport coloredlogs\nimport pendulum\n\nfrom dagster import check, seven\nfrom dagster.config import Enum, EnumValue\nfrom dagster.core.definitions.logger_definition import logger\nfrom dagster.core.utils import PYTHON_LOGGING_LEVELS_MAPPING, coerce_valid_log_level\n\nLogLevelEnum = Enum("log_level", list(map(EnumValue, PYTHON_LOGGING_LEVELS_MAPPING.keys())))\n\n\nclass JsonFileHandler(logging.Handler):\n    def __init__(self, json_path):\n        super(JsonFileHandler, self).__init__()\n        self.json_path = check.str_param(json_path, "json_path")\n\n    def emit(self, record):\n        try:\n            log_dict = copy.copy(record.__dict__)\n\n            # This horrific monstrosity is to maintain backwards compatability\n            # with the old behavior of the JsonFileHandler, which the clarify\n            # project has a dependency on. It relied on the dagster-defined\n            # properties smashing all the properties of the LogRecord object\n            # and uploads all of those properties to a redshift table for\n            # in order to do analytics on the log\n\n            if "dagster_meta" in log_dict:\n                dagster_meta_dict = log_dict["dagster_meta"]\n                del log_dict["dagster_meta"]\n            else:\n                dagster_meta_dict = {}\n\n            log_dict.update(dagster_meta_dict)\n\n            with open(self.json_path, "a") as ff:\n                text_line = seven.json.dumps(log_dict)\n                ff.write(text_line + "\\n")\n        # Need to catch Exception here, so disabling lint\n        except Exception as e:  # pylint: disable=W0703\n            logging.critical("[{}] Error during logging!".format(self.__class__.__name__))\n            logging.exception(str(e))\n\n\nclass StructuredLoggerMessage(\n    NamedTuple(\n        "_StructuredLoggerMessage",\n        [\n            ("name", str),\n            ("message", str),\n            ("level", int),\n            ("meta", Dict[object, object]),\n            ("record", logging.LogRecord),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        name: str,\n        message: str,\n        level: int,\n        meta: Dict[object, object],\n        record: logging.LogRecord,\n    ):\n        return super(StructuredLoggerMessage, cls).__new__(\n            cls,\n            check.str_param(name, "name"),\n            check.str_param(message, "message"),\n            coerce_valid_log_level(level),\n            check.dict_param(meta, "meta"),\n            check.inst_param(record, "record", logging.LogRecord),\n        )\n\n\nclass JsonEventLoggerHandler(logging.Handler):\n    def __init__(self, json_path, construct_event_record):\n        super(JsonEventLoggerHandler, self).__init__()\n        self.json_path = check.str_param(json_path, "json_path")\n        self.construct_event_record = construct_event_record\n\n    def emit(self, record):\n        try:\n            event_record = self.construct_event_record(record)\n            with open(self.json_path, "a") as ff:\n                text_line = seven.json.dumps(event_record.to_dict())\n                ff.write(text_line + "\\n")\n\n        # Need to catch Exception here, so disabling lint\n        except Exception as e:  # pylint: disable=W0703\n            logging.critical("[{}] Error during logging!".format(self.__class__.__name__))\n            
logging.exception(str(e))\n\n\nclass StructuredLoggerHandler(logging.Handler):\n    def __init__(self, callback):\n        super(StructuredLoggerHandler, self).__init__()\n        self.callback = check.is_callable(callback, "callback")\n\n    def emit(self, record):\n        try:\n            self.callback(\n                StructuredLoggerMessage(\n                    name=record.name,\n                    message=record.msg,\n                    level=record.levelno,\n                    meta=record.dagster_meta,\n                    record=record,\n                )\n            )\n        # Need to catch Exception here, so disabling lint\n        except Exception as e:  # pylint: disable=W0703\n            logging.critical("[{}] Error during logging!".format(self.__class__.__name__))\n            logging.exception(str(e))\n\n\ndef construct_single_handler_logger(name, level, handler):\n    check.str_param(name, "name")\n    check.inst_param(handler, "handler", logging.Handler)\n\n    level = coerce_valid_log_level(level)\n\n    @logger\n    def single_handler_logger(_init_context):\n        klass = logging.getLoggerClass()\n        logger_ = klass(name, level=level)\n        logger_.addHandler(handler)\n        handler.setLevel(level)\n        return logger_\n\n    return single_handler_logger\n\n\n# Base python logger whose messages will be captured as structured Dagster log messages.\nBASE_DAGSTER_LOGGER = logging.getLogger(name="dagster")\n\n\n
[docs]def get_dagster_logger(name: Optional[str] = None) -> logging.Logger:\n """\n Creates a python logger whose output messages will be captured and converted into Dagster log\n messages. This means they will have structured information such as the step_key, run_id, etc.\n embedded into them, and will show up in the Dagster event log.\n\n This can be used as a more convenient alternative to `context.log` in most cases. If log level\n is not set explicitly, defaults to DEBUG.\n\n Args:\n name (Optional[str]): If supplied, will create a logger with the name "dagster.builtin.{name}",\n with properties inherited from the base Dagster logger. If omitted, the returned logger\n will be named "dagster.builtin".\n\n Returns:\n :class:`logging.Logger`: A logger whose output will be captured by Dagster.\n\n Example:\n\n .. code-block:: python\n\n from dagster import get_dagster_logger, op\n\n @op\n def hello_op():\n log = get_dagster_logger()\n for i in range(5):\n # do something\n log.info(f"Did {i+1} things!")\n\n """\n\n # enforce that the parent logger will always have a DEBUG log level\n BASE_DAGSTER_LOGGER.setLevel(logging.DEBUG)\n base_builtin = BASE_DAGSTER_LOGGER.getChild("builtin")\n if name:\n return base_builtin.getChild(name)\n return base_builtin
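The naming behavior described in the docstring can be seen directly. ``get_dagster_logger`` is re-exported from the top-level ``dagster`` package, as used by ``dagster_airbyte.resources`` later in this document.

.. code-block:: python

    from dagster import get_dagster_logger

    log = get_dagster_logger()                 # logger named "dagster.builtin"
    scoped_log = get_dagster_logger("my_lib")  # logger named "dagster.builtin.my_lib"

    assert log.name == "dagster.builtin"
    assert scoped_log.name == "dagster.builtin.my_lib"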
\n\n\ndef define_structured_logger(name, callback, level):\n check.str_param(name, "name")\n check.callable_param(callback, "callback")\n level = coerce_valid_log_level(level)\n\n return construct_single_handler_logger(name, level, StructuredLoggerHandler(callback))\n\n\ndef define_json_file_logger(name, json_path, level):\n check.str_param(name, "name")\n check.str_param(json_path, "json_path")\n level = coerce_valid_log_level(level)\n\n stream_handler = JsonFileHandler(json_path)\n stream_handler.setFormatter(define_default_formatter())\n return construct_single_handler_logger(name, level, stream_handler)\n\n\ndef get_stack_trace_array(exception):\n check.inst_param(exception, "exception", Exception)\n if hasattr(exception, "__traceback__"):\n tb = exception.__traceback__\n else:\n _exc_type, _exc_value, tb = sys.exc_info()\n return traceback.format_tb(tb)\n\n\ndef _mockable_formatTime(record, datefmt=None): # pylint: disable=unused-argument\n """Uses pendulum.now to determine the logging time, causing pendulum\n mocking to affect the logger timestamp in tests."""\n return pendulum.now().strftime(datefmt if datefmt else default_date_format_string())\n\n\ndef default_format_string():\n return "%(asctime)s - %(name)s - %(levelname)s - %(message)s"\n\n\ndef default_date_format_string():\n return "%Y-%m-%d %H:%M:%S %z"\n\n\ndef define_default_formatter():\n return logging.Formatter(default_format_string(), default_date_format_string())\n\n\n@contextmanager\ndef quieten(quiet=True, level=logging.WARNING):\n if quiet:\n logging.disable(level)\n try:\n yield\n finally:\n if quiet:\n logging.disable(logging.NOTSET)\n\n\ndef configure_loggers(handler="default", log_level="INFO"):\n LOGGING_CONFIG = {\n "version": 1,\n "disable_existing_loggers": False,\n "formatters": {\n "colored": {\n "()": coloredlogs.ColoredFormatter,\n "fmt": default_format_string(),\n "datefmt": default_date_format_string(),\n "field_styles": {"levelname": {"color": "blue"}, "asctime": {"color": "green"}},\n "level_styles": {"debug": {}, "error": {"color": "red"}},\n },\n },\n "handlers": {\n "default": {\n "formatter": "colored",\n "class": "logging.StreamHandler",\n "stream": sys.stdout,\n "level": log_level,\n },\n "null": {\n "class": "logging.NullHandler",\n },\n },\n "loggers": {\n "dagster": {\n "handlers": [handler],\n "level": "INFO",\n },\n "dagit": {\n "handlers": [handler],\n "level": "INFO",\n },\n },\n }\n\n logging.config.dictConfig(LOGGING_CONFIG)\n\n if handler == "default":\n for name in ["dagster", "dagit"]:\n logging.getLogger(name).handlers[0].formatter.formatTime = _mockable_formatTime\n
", "current_page_name": "_modules/dagster/utils/log", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster.utils"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.utils.log"}, "parents": [{"link": "../../", "title": "Module code"}], "partitions": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.utils.partitions

\nimport datetime\nfrom typing import Callable, Union\n\nimport pendulum\n\nfrom dagster import check\nfrom dagster.core.definitions.partition import Partition, PartitionSetDefinition\nfrom dagster.core.definitions.run_request import SkipReason\nfrom dagster.core.definitions.schedule_definition import ScheduleEvaluationContext\nfrom dagster.core.errors import DagsterInvariantViolationError\nfrom dagster.seven.compat.pendulum import PendulumDateTime, to_timezone\n\nDEFAULT_MONTHLY_FORMAT = "%Y-%m"\nDEFAULT_DATE_FORMAT = "%Y-%m-%d"\nDEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE = "%Y-%m-%d-%H:%M"\nDEFAULT_HOURLY_FORMAT_WITH_TIMEZONE = DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE + "%z"\n\n\n
[docs]def date_partition_range(\n start,\n end=None,\n delta_range="days",\n fmt=None,\n inclusive=False,\n timezone=None,\n):\n """Utility function that returns a partition generating function to be used in creating a\n `PartitionSet` definition.\n\n Args:\n start (datetime): Datetime capturing the start of the time range.\n end (Optional(datetime)): Datetime capturing the end of the partition. By default, the\n current time is used. The range is not inclusive of the end\n value.\n delta_range (Optional(str)): string representing the time duration of each partition.\n Must be a valid argument to pendulum.period.range ("days", "hours", "months", etc.).\n fmt (Optional(str)): Format string to represent each partition by its start time\n inclusive (Optional(bool)): By default, the partition set only contains date interval\n partitions for which the end time of the interval is less than current time. In other\n words, the partition set contains date interval partitions that are completely in the\n past. If inclusive is set to True, then the partition set will include all date\n interval partitions for which the start time of the interval is less than the\n current time.\n timezone (Optional(str)): Timezone in which the partition values should be expressed.\n Returns:\n Callable[[], List[Partition]]\n """\n\n check.inst_param(start, "start", datetime.datetime)\n check.opt_inst_param(end, "end", datetime.datetime)\n check.str_param(delta_range, "delta_range")\n fmt = check.opt_str_param(fmt, "fmt", default=DEFAULT_DATE_FORMAT)\n check.opt_str_param(timezone, "timezone")\n\n delta_amount = 1\n\n if end and start > end:\n raise DagsterInvariantViolationError(\n 'Selected date range start "{start}" is after date range end "{end}'.format(\n start=start.strftime(fmt),\n end=end.strftime(fmt),\n )\n )\n\n def get_date_range_partitions(current_time=None):\n check.opt_inst_param(current_time, "current_time", datetime.datetime)\n tz = timezone if timezone else "UTC"\n _start = (\n to_timezone(start, tz)\n if isinstance(start, PendulumDateTime)\n else pendulum.instance(start, tz=tz)\n )\n\n if end:\n _end = end\n elif current_time:\n _end = current_time\n else:\n _end = pendulum.now(tz)\n\n # coerce to the definition timezone\n if isinstance(_end, PendulumDateTime):\n _end = to_timezone(_end, tz)\n else:\n _end = pendulum.instance(_end, tz=tz)\n\n period = pendulum.period(_start, _end)\n date_names = [\n Partition(value=current, name=current.strftime(fmt))\n for current in period.range(delta_range, delta_amount)\n ]\n\n # We don't include the last element here by default since we only want\n # fully completed intervals, and the _end time is in the middle of the interval\n # represented by the last element of date_names\n if inclusive:\n return date_names\n\n return date_names[:-1]\n\n return get_date_range_partitions
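A minimal sketch of plugging the returned partition function into a ``PartitionSetDefinition``, mirroring the examples later in this module; the pipeline name and run-config function are hypothetical.

.. code-block:: python

    import datetime

    from dagster.core.definitions.partition import PartitionSetDefinition
    from dagster.utils.partitions import date_partition_range


    def run_config_for_partition(partition):
        # partition.name is the start of the interval, formatted with fmt
        return {"solids": {"my_solid": {"config": {"date": partition.name}}}}


    partition_set = PartitionSetDefinition(
        name="daily_partitions",
        pipeline_name="my_pipeline",
        partition_fn=date_partition_range(
            start=datetime.datetime(2021, 1, 1),
            delta_range="days",
            fmt="%Y-%m-%d",
            timezone="US/Central",
        ),
        run_config_fn_for_partition=run_config_for_partition,
    )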
\n\n\n
[docs]def identity_partition_selector(context, partition_set_def):\n """Utility function for supplying a partition selector when creating a schedule from a\n partition set made of ``datetime`` objects that assumes the schedule always executes at the\n partition time.\n\n It's important that the cron string passed into ``create_schedule_definition`` match\n the partition set times. For example, a schedule created from a partition set with partitions for each day at\n midnight would create its partition selector as follows:\n\n .. code-block:: python\n\n partition_set = PartitionSetDefinition(\n name='hello_world_partition_set',\n pipeline_name='hello_world_pipeline',\n partition_fn= date_partition_range(\n start=datetime.datetime(2021, 1, 1),\n delta_range="days",\n timezone="US/Central",\n )\n run_config_fn_for_partition=my_run_config_fn,\n )\n\n schedule_definition = partition_set.create_schedule_definition(\n "hello_world_daily_schedule",\n "0 0 * * *",\n partition_selector=identity_partition_selector,\n execution_timezone="US/Central",\n )\n """\n\n return create_offset_partition_selector(lambda d: d)(context, partition_set_def)
\n\n\n
[docs]def create_offset_partition_selector(\n execution_time_to_partition_fn,\n) -> Callable[[ScheduleEvaluationContext, PartitionSetDefinition], Union[Partition, SkipReason]]:\n """Utility function for supplying a partition selector when creating a schedule from a\n partition set made of ``datetime`` objects that assumes a fixed time offset between the\n partition time and the time at which the schedule executes.\n\n It's important to keep the cron string that's supplied to\n ``PartitionSetDefinition.create_schedule_definition`` in sync with the offset that's\n supplied to this function. For example, a schedule created from a partition set with\n partitions for each day at midnight that fills in the partition for day N at day N+1 at\n 10:00AM would create the partition selector as follows:\n\n .. code-block:: python\n\n partition_set = PartitionSetDefinition(\n name='hello_world_partition_set',\n pipeline_name='hello_world_pipeline',\n partition_fn= date_partition_range(\n start=datetime.datetime(2021, 1, 1),\n delta_range="days",\n timezone="US/Central",\n )\n run_config_fn_for_partition=my_run_config_fn,\n )\n\n schedule_definition = partition_set.create_schedule_definition(\n "daily_10am_schedule",\n "0 10 * * *",\n partition_selector=create_offset_partition_selector(lambda d: d.subtract(hours=10, days=1))\n execution_timezone="US/Central",\n )\n\n Args:\n execution_time_to_partition_fn (Callable[[datetime.datetime], datetime.datetime]): A\n function that maps the execution time of the schedule to the partition time.\n """\n\n check.callable_param(execution_time_to_partition_fn, "execution_time_to_partition_fn")\n\n def offset_partition_selector(\n context: ScheduleEvaluationContext, partition_set_def: PartitionSetDefinition\n ) -> Union[Partition, SkipReason]:\n no_partitions_skip_reason = SkipReason(\n "Partition selector did not return a partition. Make sure that the timezone "\n "on your partition set matches your execution timezone."\n )\n\n earliest_possible_partition = next(iter(partition_set_def.get_partitions(None)), None)\n if not earliest_possible_partition:\n return no_partitions_skip_reason\n\n valid_partitions = partition_set_def.get_partitions(context.scheduled_execution_time)\n\n if not context.scheduled_execution_time:\n if not valid_partitions:\n return no_partitions_skip_reason\n return valid_partitions[-1]\n\n partition_time = execution_time_to_partition_fn(context.scheduled_execution_time)\n\n if partition_time < earliest_possible_partition.value:\n return SkipReason(\n f"Your partition ({partition_time.isoformat()}) is before the beginning of "\n f"the partition set ({earliest_possible_partition.value.isoformat()}). "\n "Verify your schedule's start_date is correct."\n )\n\n if partition_time > valid_partitions[-1].value:\n return SkipReason(\n f"Your partition ({partition_time.isoformat()}) is after the end of "\n f"the partition set ({valid_partitions[-1].value.isoformat()}). "\n "Verify your schedule's end_date is correct."\n )\n\n for partition in valid_partitions:\n if partition.value.isoformat() == partition_time.isoformat():\n return partition\n\n return no_partitions_skip_reason\n\n return offset_partition_selector
\n
", "current_page_name": "_modules/dagster/utils/partitions", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster.utils"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.utils.partitions"}, "sidebars": ["globaltoc.html", "searchbox.html"], "test": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.utils.test

\nimport os\nimport shutil\nimport tempfile\nimport uuid\nfrom collections import defaultdict\nfrom contextlib import contextmanager\n\n# top-level include is dangerous in terms of incurring circular deps\nfrom dagster import (\n    DagsterInvariantViolationError,\n    DependencyDefinition,\n    Failure,\n    ModeDefinition,\n    NodeInvocation,\n    PipelineDefinition,\n    RepositoryDefinition,\n    TypeCheck,\n    check,\n    execute_pipeline,\n    lambda_solid,\n)\nfrom dagster.core.definitions.logger_definition import LoggerDefinition\nfrom dagster.core.definitions.pipeline_base import InMemoryPipeline\nfrom dagster.core.definitions.resource_definition import ScopedResourcesBuilder\nfrom dagster.core.definitions.solid_definition import NodeDefinition\nfrom dagster.core.execution.api import create_execution_plan, scoped_pipeline_context\nfrom dagster.core.execution.context.system import PlanExecutionContext\nfrom dagster.core.execution.context_creation_pipeline import (\n    create_context_creation_data,\n    create_execution_data,\n    create_executor,\n    create_log_manager,\n    create_plan_data,\n)\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.scheduler import Scheduler\nfrom dagster.core.scheduler.scheduler import DagsterScheduleDoesNotExist, DagsterSchedulerError\nfrom dagster.core.snap import snapshot_from_execution_plan\nfrom dagster.core.storage.file_manager import LocalFileManager\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.core.types.dagster_type import resolve_dagster_type\nfrom dagster.core.utility_solids import define_stub_solid\nfrom dagster.core.utils import make_new_run_id\nfrom dagster.serdes import ConfigurableClass\n\n# pylint: disable=unused-import\nfrom ..temp_file import (\n    get_temp_dir,\n    get_temp_file_handle,\n    get_temp_file_handle_with_data,\n    get_temp_file_name,\n    get_temp_file_name_with_data,\n    get_temp_file_names,\n)\nfrom ..typing_api import is_typing_type\n\n\ndef create_test_pipeline_execution_context(logger_defs=None):\n    loggers = check.opt_dict_param(\n        logger_defs, "logger_defs", key_type=str, value_type=LoggerDefinition\n    )\n    mode_def = ModeDefinition(logger_defs=loggers)\n    pipeline_def = PipelineDefinition(\n        name="test_legacy_context", solid_defs=[], mode_defs=[mode_def]\n    )\n    run_config = {"loggers": {key: {} for key in loggers}}\n    pipeline_run = PipelineRun(pipeline_name="test_legacy_context", run_config=run_config)\n    instance = DagsterInstance.ephemeral()\n    execution_plan = create_execution_plan(pipeline=pipeline_def, run_config=run_config)\n    creation_data = create_context_creation_data(\n        InMemoryPipeline(pipeline_def), execution_plan, run_config, pipeline_run, instance\n    )\n    log_manager = create_log_manager(creation_data)\n    scoped_resources_builder = ScopedResourcesBuilder()\n    executor = create_executor(creation_data)\n\n    return PlanExecutionContext(\n        plan_data=create_plan_data(creation_data, True, executor.retries),\n        execution_data=create_execution_data(\n            context_creation_data=creation_data,\n            scoped_resources_builder=scoped_resources_builder,\n        ),\n        log_manager=log_manager,\n        output_capture=None,\n    )\n\n\ndef _dep_key_of(solid):\n    return NodeInvocation(solid.definition.name, solid.name)\n\n\ndef build_pipeline_with_input_stubs(pipeline_def, inputs):\n    check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n    
check.dict_param(inputs, "inputs", key_type=str, value_type=dict)\n\n    deps = defaultdict(dict)\n    for solid_name, dep_dict in pipeline_def.dependencies.items():\n        for input_name, dep in dep_dict.items():\n            deps[solid_name][input_name] = dep\n\n    stub_solid_defs = []\n\n    for solid_name, input_dict in inputs.items():\n        if not pipeline_def.has_solid_named(solid_name):\n            raise DagsterInvariantViolationError(\n                (\n                    "You are injecting an input value for solid {solid_name} "\n                    "into pipeline {pipeline_name} but that solid was not found"\n                ).format(solid_name=solid_name, pipeline_name=pipeline_def.name)\n            )\n\n        solid = pipeline_def.solid_named(solid_name)\n        for input_name, input_value in input_dict.items():\n            stub_solid_def = define_stub_solid(\n                "__stub_{solid_name}_{input_name}".format(\n                    solid_name=solid_name, input_name=input_name\n                ),\n                input_value,\n            )\n            stub_solid_defs.append(stub_solid_def)\n            deps[_dep_key_of(solid)][input_name] = DependencyDefinition(stub_solid_def.name)\n\n    return PipelineDefinition(\n        name=pipeline_def.name + "_stubbed",\n        solid_defs=pipeline_def.top_level_solid_defs + stub_solid_defs,\n        mode_defs=pipeline_def.mode_definitions,\n        dependencies=deps,\n    )\n\n\n
[docs]def execute_solids_within_pipeline(\n pipeline_def,\n solid_names,\n inputs=None,\n run_config=None,\n mode=None,\n preset=None,\n tags=None,\n instance=None,\n):\n """Execute a set of solids within an existing pipeline.\n\n Intended to support tests. Input values may be passed directly.\n\n Args:\n pipeline_def (PipelineDefinition): The pipeline within which to execute the solid.\n solid_names (FrozenSet[str]): A set of the solid names, or the aliased solids, to execute.\n inputs (Optional[Dict[str, Dict[str, Any]]]): A dict keyed on solid names, whose values are\n dicts of input names to input values, used to pass input values to the solids directly.\n You may also use the ``run_config`` to configure any inputs that are configurable.\n run_config (Optional[dict]): The configuration that parameterized this\n execution, as a dict.\n mode (Optional[str]): The name of the pipeline mode to use. You may not set both ``mode``\n and ``preset``.\n preset (Optional[str]): The name of the pipeline preset to use. You may not set both\n ``mode`` and ``preset``.\n tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to pipeline\n logs.\n instance (Optional[DagsterInstance]): The instance to execute against. If this is ``None``,\n an ephemeral instance will be used, and no artifacts will be persisted from the run.\n\n Returns:\n Dict[str, Union[CompositeSolidExecutionResult, SolidExecutionResult]]: The results of\n executing the solids, keyed by solid name.\n """\n check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n check.set_param(solid_names, "solid_names", of_type=str)\n inputs = check.opt_dict_param(inputs, "inputs", key_type=str, value_type=dict)\n\n sub_pipeline = pipeline_def.get_pipeline_subset_def(solid_names)\n stubbed_pipeline = build_pipeline_with_input_stubs(sub_pipeline, inputs)\n result = execute_pipeline(\n stubbed_pipeline,\n run_config=run_config,\n mode=mode,\n preset=preset,\n tags=tags,\n instance=instance,\n )\n\n return {sr.solid.name: sr for sr in result.solid_result_list}
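A minimal test sketch: executing a subset of an existing pipeline while stubbing the input that its excluded upstream solid would normally supply. The pipeline and solid names are hypothetical, and this assumes the legacy ``@solid``/``@pipeline`` APIs from this Dagster version.

.. code-block:: python

    from dagster import pipeline, solid
    from dagster.utils.test import execute_solids_within_pipeline


    @solid
    def emit_one(_) -> int:
        return 1


    @solid
    def add_one(_, num: int) -> int:
        return num + 1


    @pipeline
    def my_pipeline():
        add_one(emit_one())


    # run only add_one, supplying the value emit_one would have produced
    results = execute_solids_within_pipeline(
        my_pipeline, solid_names={"add_one"}, inputs={"add_one": {"num": 5}}
    )
    assert results["add_one"].success
    assert results["add_one"].output_value() == 6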
\n\n\n
[docs]def execute_solid_within_pipeline(\n pipeline_def,\n solid_name,\n inputs=None,\n run_config=None,\n mode=None,\n preset=None,\n tags=None,\n instance=None,\n):\n """Execute a single solid within an existing pipeline.\n\n Intended to support tests. Input values may be passed directly.\n\n Args:\n pipeline_def (PipelineDefinition): The pipeline within which to execute the solid.\n solid_name (str): The name of the solid, or the aliased solid, to execute.\n inputs (Optional[Dict[str, Any]]): A dict of input names to input values, used to\n pass input values to the solid directly. You may also use the ``run_config`` to\n configure any inputs that are configurable.\n run_config (Optional[dict]): The configuration that parameterized this\n execution, as a dict.\n mode (Optional[str]): The name of the pipeline mode to use. You may not set both ``mode``\n and ``preset``.\n preset (Optional[str]): The name of the pipeline preset to use. You may not set both\n ``mode`` and ``preset``.\n tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to pipeline\n logs.\n instance (Optional[DagsterInstance]): The instance to execute against. If this is ``None``,\n an ephemeral instance will be used, and no artifacts will be persisted from the run.\n\n Returns:\n Union[CompositeSolidExecutionResult, SolidExecutionResult]: The result of executing the\n solid.\n """\n\n return execute_solids_within_pipeline(\n pipeline_def,\n solid_names={solid_name},\n inputs={solid_name: inputs} if inputs else None,\n run_config=run_config,\n mode=mode,\n preset=preset,\n tags=tags,\n instance=instance,\n )[solid_name]
\n\n\n@contextmanager\ndef yield_empty_pipeline_context(run_id=None, instance=None):\n pipeline = InMemoryPipeline(PipelineDefinition([], "empty"))\n pipeline_def = pipeline.get_definition()\n instance = check.opt_inst_param(\n instance, "instance", DagsterInstance, default=DagsterInstance.ephemeral()\n )\n\n execution_plan = create_execution_plan(pipeline)\n\n pipeline_run = instance.create_run(\n pipeline_name="<empty>",\n run_id=run_id,\n run_config=None,\n mode=None,\n solids_to_execute=None,\n step_keys_to_execute=None,\n status=None,\n tags=None,\n root_run_id=None,\n parent_run_id=None,\n pipeline_snapshot=pipeline_def.get_pipeline_snapshot(),\n execution_plan_snapshot=snapshot_from_execution_plan(\n execution_plan, pipeline_def.get_pipeline_snapshot_id()\n ),\n parent_pipeline_snapshot=pipeline_def.get_parent_pipeline_snapshot(),\n )\n with scoped_pipeline_context(execution_plan, pipeline, {}, pipeline_run, instance) as context:\n yield context\n\n\n
[docs]def execute_solid(\n solid_def,\n mode_def=None,\n input_values=None,\n tags=None,\n run_config=None,\n raise_on_error=True,\n):\n """Execute a single solid in an ephemeral pipeline.\n\n Intended to support unit tests. Input values may be passed directly, and no pipeline need be\n specified -- an ephemeral pipeline will be constructed.\n\n Args:\n solid_def (SolidDefinition): The solid to execute.\n mode_def (Optional[ModeDefinition]): The mode within which to execute the solid. Use this\n if, e.g., custom resources, loggers, or executors are desired.\n input_values (Optional[Dict[str, Any]]): A dict of input names to input values, used to\n pass inputs to the solid directly. You may also use the ``run_config`` to\n configure any inputs that are configurable.\n tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to pipeline\n logs.\n run_config (Optional[dict]): The configuration that parameterized this\n execution, as a dict.\n raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n Defaults to ``True``, since this is the most useful behavior in test.\n\n Returns:\n Union[CompositeSolidExecutionResult, SolidExecutionResult]: The result of executing the\n solid.\n """\n check.inst_param(solid_def, "solid_def", NodeDefinition)\n check.opt_inst_param(mode_def, "mode_def", ModeDefinition)\n input_values = check.opt_dict_param(input_values, "input_values", key_type=str)\n solid_defs = [solid_def]\n\n def create_value_solid(input_name, input_value):\n @lambda_solid(name=input_name)\n def input_solid():\n return input_value\n\n return input_solid\n\n dependencies = defaultdict(dict)\n\n for input_name, input_value in input_values.items():\n dependencies[solid_def.name][input_name] = DependencyDefinition(input_name)\n solid_defs.append(create_value_solid(input_name, input_value))\n\n result = execute_pipeline(\n PipelineDefinition(\n name="ephemeral_{}_solid_pipeline".format(solid_def.name),\n solid_defs=solid_defs,\n dependencies=dependencies,\n mode_defs=[mode_def] if mode_def else None,\n ),\n run_config=run_config,\n mode=mode_def.name if mode_def else None,\n tags=tags,\n raise_on_error=raise_on_error,\n )\n return result.result_for_handle(solid_def.name)
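The single-solid variant in a unit test, as a minimal sketch; the solid here is hypothetical and assumes the legacy ``@solid`` API.

.. code-block:: python

    from dagster import solid
    from dagster.utils.test import execute_solid


    @solid
    def add(_, x: int, y: int) -> int:
        return x + y


    result = execute_solid(add, input_values={"x": 2, "y": 3})
    assert result.success
    assert result.output_value() == 5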
\n\n\n
[docs]def check_dagster_type(dagster_type, value):\n """Test a custom Dagster type.\n\n Args:\n dagster_type (Any): The Dagster type to test. Should be one of the\n :ref:`built-in types <builtin>`, a dagster type explicitly constructed with\n :py:func:`as_dagster_type`, :py:func:`@usable_as_dagster_type <dagster_type>`, or\n :py:func:`PythonObjectDagsterType`, or a Python type.\n value (Any): The runtime value to test.\n\n Returns:\n TypeCheck: The result of the type check.\n\n\n Examples:\n\n .. code-block:: python\n\n assert check_dagster_type(Dict[Any, Any], {'foo': 'bar'}).success\n """\n\n if is_typing_type(dagster_type):\n raise DagsterInvariantViolationError(\n (\n "Must pass in a type from dagster module. You passed {dagster_type} "\n "which is part of python's typing module."\n ).format(dagster_type=dagster_type)\n )\n\n dagster_type = resolve_dagster_type(dagster_type)\n with yield_empty_pipeline_context() as pipeline_context:\n context = pipeline_context.for_type(dagster_type)\n try:\n type_check = dagster_type.type_check(context, value)\n except Failure as failure:\n return TypeCheck(success=False, description=failure.description)\n\n if not isinstance(type_check, TypeCheck):\n raise DagsterInvariantViolationError(\n "Type checks can only return TypeCheck. Type {type_name} returned {value}.".format(\n type_name=dagster_type.display_name, value=repr(type_check)\n )\n )\n return type_check
\n\n\n@contextmanager\ndef copy_directory(src):\n with tempfile.TemporaryDirectory() as temp_dir:\n dst = os.path.join(temp_dir, os.path.basename(src))\n shutil.copytree(src, dst)\n yield dst\n\n\nclass FilesystemTestScheduler(Scheduler, ConfigurableClass):\n """This class is used in dagster core and dagster_graphql to test the scheduler's interactions\n with schedule storage, which are implemented in the methods defined on the base Scheduler class.\n Therefore, the following methods used to actually schedule jobs (e.g. create and remove cron jobs\n on a cron tab) are left unimplemented.\n """\n\n def __init__(self, artifacts_dir, inst_data=None):\n check.str_param(artifacts_dir, "artifacts_dir")\n self._artifacts_dir = artifacts_dir\n self._inst_data = inst_data\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {"base_dir": str}\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return FilesystemTestScheduler(artifacts_dir=config_value["base_dir"], inst_data=inst_data)\n\n def debug_info(self):\n return ""\n\n def get_logs_path(self, _instance, schedule_origin_id):\n check.str_param(schedule_origin_id, "schedule_origin_id")\n return os.path.join(self._artifacts_dir, "logs", schedule_origin_id, "scheduler.log")\n\n def wipe(self, instance):\n pass\n
", "current_page_name": "_modules/dagster/utils/test", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster.utils"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.utils.test"}, "title": "dagster.utils"}}, "dagster_airbyte": {"asset_defs": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_airbyte.asset_defs

\nfrom typing import List, Optional\n\nfrom dagster_airbyte.utils import generate_materializations\n\nfrom dagster import AssetKey, Out, Output, check\nfrom dagster.core.asset_defs import AssetsDefinition, multi_asset\nfrom dagster.utils.backcompat import experimental\n\n\n
[docs]@experimental\ndef build_airbyte_assets(\n connection_id: str,\n destination_tables: List[str],\n asset_key_prefix: Optional[List[str]] = None,\n) -> List[AssetsDefinition]:\n """\n Builds a set of assets representing the tables created by an Airbyte sync operation.\n\n Args:\n connection_id (str): The Airbyte Connection ID that this op will sync. You can retrieve this\n value from the "Connections" tab of a given connector in the Airbyte UI.\n destination_tables (List[str]): The names of the tables that you want to be represented\n in the Dagster asset graph for this sync. This will generally map to the name of the\n stream in Airbyte, unless a stream prefix has been specified in Airbyte.\n asset_key_prefix (Optional[List[str]]): A prefix for the asset keys inside this asset.\n If left blank, assets will have a key of `AssetKey([table_name])`.\n """\n\n asset_key_prefix = check.opt_list_param(asset_key_prefix, "asset_key_prefix", of_type=str)\n\n @multi_asset(\n name=f"airbyte_sync_{connection_id[:5]}",\n outs={\n table: Out(\n asset_key=AssetKey(\n asset_key_prefix + [table],\n )\n )\n for table in destination_tables\n },\n required_resource_keys={"airbyte"},\n compute_kind="airbyte",\n )\n def _assets(context):\n ab_output = context.resources.airbyte.sync_and_poll(connection_id=connection_id)\n for materialization in generate_materializations(ab_output, asset_key_prefix):\n table_name = materialization.asset_key.path[-1]\n if table_name in destination_tables:\n yield Output(\n value=None,\n output_name=table_name,\n metadata={\n entry.label: entry.entry_data for entry in materialization.metadata_entries\n },\n )\n else:\n yield materialization\n\n return [_assets]
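A minimal usage sketch; the connection id and table names are hypothetical. The generated assets declare a required ``airbyte`` resource, so materializing them needs a configured ``airbyte_resource`` (see ``dagster_airbyte.resources`` below).

.. code-block:: python

    from dagster_airbyte.asset_defs import build_airbyte_assets

    airbyte_assets = build_airbyte_assets(
        connection_id="my-airbyte-connection-id",  # from the Airbyte "Connections" tab
        destination_tables=["users", "orders"],
        asset_key_prefix=["airbyte"],
    )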
\n
", "current_page_name": "_modules/dagster_airbyte/asset_defs", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_airbyte.asset_defs"}, "ops": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_airbyte.ops

\nfrom dagster_airbyte.resources import DEFAULT_POLL_INTERVAL_SECONDS\nfrom dagster_airbyte.types import AirbyteOutput\nfrom dagster_airbyte.utils import generate_materializations\n\nfrom dagster import Array, Bool, Field, In, Noneable, Nothing, Out, Output, op\n\n\n
[docs]@op(\n required_resource_keys={"airbyte"},\n ins={"start_after": In(Nothing)},\n out=Out(\n AirbyteOutput,\n description="Parsed json dictionary representing the details of the Airbyte connector after "\n "the sync successfully completes. "\n "See the [Airbyte API Docs](https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#overview) "\n "to see detailed information on this response.",\n ),\n config_schema={\n "connection_id": Field(\n str,\n is_required=True,\n description="The Airbyte Connection ID that this op will sync. You can retrieve this "\n 'value from the "Connections" tab of a given connector in the Airbyte UI.',\n ),\n "poll_interval": Field(\n float,\n default_value=DEFAULT_POLL_INTERVAL_SECONDS,\n description="The time (in seconds) that will be waited between successive polls.",\n ),\n "poll_timeout": Field(\n Noneable(float),\n default_value=None,\n description="The maximum time that will waited before this operation is timed out. By "\n "default, this will never time out.",\n ),\n "yield_materializations": Field(\n config=Bool,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the Airbyte sync will "\n "be yielded when the op executes."\n ),\n ),\n "asset_key_prefix": Field(\n config=Array(str),\n default_value=["airbyte"],\n description=(\n "If provided and yield_materializations is True, these components will be used to "\n "prefix the generated asset keys."\n ),\n ),\n },\n tags={"kind": "airbyte"},\n)\ndef airbyte_sync_op(context):\n """\n Executes a Airbyte job sync for a given ``connection_id``, and polls until that sync\n completes, raising an error if it is unsuccessful. It outputs a AirbyteOutput which contains\n the job details for a given ``connection_id``.\n\n It requires the use of the :py:class:`~dagster_airbyte.airbyte_resource`, which allows it to\n communicate with the Airbyte API.\n\n Examples:\n\n .. code-block:: python\n\n from dagster import job\n from dagster_airbyte import airbyte_resource, airbyte_sync_op\n\n my_airbyte_resource = airbyte_resource.configured(\n {\n "host": {"env": "AIRBYTE_HOST"},\n "port": {"env": "AIRBYTE_PORT"},\n }\n )\n\n sync_foobar = airbyte_sync_op.configured({"connection_id": "foobar"}, name="sync_foobar")\n\n @job(resource_defs={"airbyte": my_airbyte_resource})\n def my_simple_airbyte_job():\n sync_foobar()\n\n @job(resource_defs={"airbyte": my_airbyte_resource})\n def my_composed_airbyte_job():\n final_foobar_state = sync_foobar(start_after=some_op())\n other_op(final_foobar_state)\n """\n\n airbyte_output = context.resources.airbyte.sync_and_poll(\n connection_id=context.op_config["connection_id"],\n poll_interval=context.op_config["poll_interval"],\n poll_timeout=context.op_config["poll_timeout"],\n )\n if context.op_config["yield_materializations"]:\n yield from generate_materializations(\n airbyte_output, asset_key_prefix=context.op_config["asset_key_prefix"]\n )\n yield Output(\n airbyte_output,\n metadata={\n **airbyte_output.job_details.get("attempts", [{}])[-1]\n .get("attempt", {})\n .get("totalStats", {})\n },\n )
\n
", "current_page_name": "_modules/dagster_airbyte/ops", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_airbyte.ops"}, "resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_airbyte.resources

\nimport logging\nimport sys\nimport time\nfrom typing import Any, Dict, Optional\n\nimport requests\nfrom dagster_airbyte.types import AirbyteOutput\nfrom requests.exceptions import RequestException\n\nfrom dagster import Failure, Field, StringSource, __version__, get_dagster_logger, resource\n\nDEFAULT_POLL_INTERVAL_SECONDS = 10\n\n\nclass AirbyteState:\n    RUNNING = "running"\n    SUCCEEDED = "succeeded"\n    CANCELLED = "cancelled"\n    PENDING = "pending"\n    FAILED = "failed"\n    ERROR = "error"\n    INCOMPLETE = "incomplete"\n\n\n
[docs]class AirbyteResource:\n """\n This class exposes methods on top of the Airbyte REST API.\n """\n\n def __init__(\n self,\n host: str,\n port: str,\n use_https: bool,\n request_max_retries: int = 3,\n request_retry_delay: float = 0.25,\n log: logging.Logger = get_dagster_logger(),\n ):\n self._host = host\n self._port = port\n self._use_https = use_https\n self._request_max_retries = request_max_retries\n self._request_retry_delay = request_retry_delay\n\n self._log = log\n\n @property\n def api_base_url(self) -> str:\n return (\n ("https://" if self._use_https else "http://")\n + (f"{self._host}:{self._port}" if self._port else self._host)\n + "/api/v1"\n )\n\n
[docs] def make_request(self, endpoint: str, data: Optional[Dict[str, Any]]):\n """\n Creates and sends a request to the desired Airbyte REST API endpoint.\n\n Args:\n endpoint (str): The Airbyte API endpoint to send this request to.\n data (Optional[str]): JSON-formatted data string to be included in the request.\n\n Returns:\n Dict[str, Any]: Parsed json data from the response to this request\n """\n\n headers = {"accept": "application/json"}\n\n num_retries = 0\n while True:\n try:\n response = requests.request(\n method="POST",\n url=self.api_base_url + endpoint,\n headers=headers,\n json=data,\n timeout=15,\n )\n response.raise_for_status()\n return response.json()\n except RequestException as e:\n self._log.error("Request to Airbyte API failed: %s", e)\n if num_retries == self._request_max_retries:\n break\n num_retries += 1\n time.sleep(self._request_retry_delay)\n\n raise Failure("Exceeded max number of retries.")
\n\n def start_sync(self, connection_id: str) -> dict:\n return self.make_request(endpoint="/connections/sync", data={"connectionId": connection_id})\n\n def get_job_status(self, job_id: int) -> dict:\n return self.make_request(endpoint="/jobs/get", data={"id": job_id})\n\n def get_connection_details(self, connection_id: str) -> dict:\n return self.make_request(endpoint="/connections/get", data={"connectionId": connection_id})\n\n
[docs] def sync_and_poll(\n self,\n connection_id: str,\n poll_interval: float = DEFAULT_POLL_INTERVAL_SECONDS,\n poll_timeout: Optional[float] = None,\n ):\n """\n Initializes a sync operation for the given connector, and polls until it completes.\n\n Args:\n connection_id (str): The Airbyte Connection ID. You can retrieve this value from the\n "Connections" tab of a given connection in the Airbyte UI.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will be waited before this operation is timed\n out. By default, this will never time out.\n\n Returns:\n :py:class:`~AirbyteOutput`:\n Details of the sync job.\n """\n connection_details = self.get_connection_details(connection_id)\n job_details = self.start_sync(connection_id)\n job_id = job_details.get("job", {}).get("id")\n self._log.info(f"Job {job_id} initialized for connection_id={connection_id}.")\n start = time.monotonic()\n logged_attempts = 0\n logged_lines = 0\n\n while True:\n if poll_timeout and start + poll_timeout < time.monotonic():\n raise Failure(\n f"Timeout: Airbyte job {job_id} is not ready after the timeout {poll_timeout} seconds"\n )\n time.sleep(poll_interval)\n job_details = self.get_job_status(job_id)\n cur_attempt = len(job_details.get("attempts", []))\n # spit out the available Airbyte log info\n if cur_attempt:\n log_lines = (\n job_details["attempts"][logged_attempts].get("logs", {}).get("logLines", [])\n )\n\n for line in log_lines[logged_lines:]:\n sys.stdout.write(line + "\\n")\n sys.stdout.flush()\n logged_lines = len(log_lines)\n\n # if there's a next attempt, this one will have no more log messages\n if logged_attempts < cur_attempt - 1:\n logged_lines = 0\n logged_attempts += 1\n\n state = job_details.get("job", {}).get("status")\n\n if state in (AirbyteState.RUNNING, AirbyteState.PENDING, AirbyteState.INCOMPLETE):\n continue\n elif state == AirbyteState.SUCCEEDED:\n break\n elif state == AirbyteState.ERROR:\n raise Failure(f"Job failed: {job_id}")\n elif state == AirbyteState.CANCELLED:\n raise Failure(f"Job was cancelled: {job_id}")\n else:\n raise Failure(f"Encountered unexpected state `{state}` for job_id {job_id}")\n\n return AirbyteOutput(job_details=job_details, connection_details=connection_details)
\n\n\n
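Outside of an op, the client can also be used directly. A minimal sketch, assuming an Airbyte server on localhost:8000 and a hypothetical connection id:

.. code-block:: python

    from dagster_airbyte.resources import AirbyteResource

    # Placeholder host, port, and connection id; adjust for your deployment.
    client = AirbyteResource(host="localhost", port="8000", use_https=False)

    # Starts a sync and blocks until Airbyte reports a terminal state, polling
    # every DEFAULT_POLL_INTERVAL_SECONDS and timing out after 10 minutes.
    output = client.sync_and_poll(
        connection_id="11111111-2222-3333-4444-555555555555",
        poll_timeout=600.0,
    )

    # AirbyteOutput bundles the final job details and the connection details.
    print(output.job_details.get("job", {}).get("status"))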
[docs]@resource(\n config_schema={\n "host": Field(\n StringSource,\n is_required=True,\n description="The Airbyte Server Address.",\n ),\n "port": Field(\n StringSource,\n is_required=False,\n description="Port for the Airbyte Server.",\n ),\n "use_https": Field(\n bool,\n default_value=False,\n description="Use https to connect in Airbyte Server.",\n ),\n "request_max_retries": Field(\n int,\n default_value=3,\n description="The maximum number of times requests to the Airbyte API should be retried "\n "before failing.",\n ),\n "request_retry_delay": Field(\n float,\n default_value=0.25,\n description="Time (in seconds) to wait between each request retry.",\n ),\n },\n description="This resource helps manage Airbyte connectors",\n)\ndef airbyte_resource(context) -> AirbyteResource:\n """\n This resource allows users to programatically interface with the Airbyte REST API to launch\n syncs and monitor their progress. This currently implements only a subset of the functionality\n exposed by the API.\n\n For a complete set of documentation on the Airbyte REST API, including expected response JSON\n schema, see the `Airbyte API Docs <https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#overview>`_.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/overview/configuration#configured>`_ method.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import job\n from dagster_airbyte import airbyte_resource\n\n my_airbyte_resource = airbyte_resource.configured(\n {\n "host": {"env": "AIRBYTE_HOST"},\n "port": {"env": "AIRBYTE_PORT"},\n }\n )\n\n @job(resource_defs={"airbyte":my_airbyte_resource})\n def my_airbyte_job():\n ...\n\n """\n return AirbyteResource(\n host=context.resource_config["host"],\n port=context.resource_config["port"],\n use_https=context.resource_config["use_https"],\n request_max_retries=context.resource_config["request_max_retries"],\n request_retry_delay=context.resource_config["request_retry_delay"],\n log=context.log,\n )
\n
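Beyond host and port, the config schema above also accepts HTTPS and retry settings; a sketch (env var names and values are illustrative):

.. code-block:: python

    from dagster_airbyte import airbyte_resource

    # Retries apply to each Airbyte API request made by the resource.
    my_airbyte_resource = airbyte_resource.configured(
        {
            "host": {"env": "AIRBYTE_HOST"},
            "port": {"env": "AIRBYTE_PORT"},
            "use_https": True,
            "request_max_retries": 5,
            "request_retry_delay": 1.0,
        }
    )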
", "current_page_name": "_modules/dagster_airbyte/resources", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_airbyte.resources"}}, "dagster_airflow": {"dagster_job_factory": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_airflow.dagster_job_factory

\nfrom dagster_airflow.dagster_pipeline_factory import make_dagster_pipeline_from_airflow_dag\n\n\n
[docs]def make_dagster_job_from_airflow_dag(\n dag, tags=None, use_airflow_template_context=False, unique_id=None\n):\n """Construct a Dagster job corresponding to a given Airflow DAG.\n\n Tasks in the resulting job will execute the ``execute()`` method on the corresponding\n Airflow Operator. Dagster, any dependencies required by Airflow Operators, and the module\n containing your DAG definition must be available in the Python environment within which your\n Dagster solids execute.\n\n To set Airflow's ``execution_date`` for use with Airflow Operator's ``execute()`` methods,\n either:\n\n 1. (Best for ad hoc runs) Execute job directly. This will set execution_date to the\n time (in UTC) of the run.\n\n 2. Add ``{'airflow_execution_date': utc_date_string}`` to the job tags. This will override\n behavior from (1).\n\n .. code-block:: python\n\n my_dagster_job = make_dagster_job_from_airflow_dag(\n dag=dag,\n tags={'airflow_execution_date': utc_execution_date_str}\n )\n my_dagster_job.execute_in_process()\n\n 3. (Recommended) Add ``{'airflow_execution_date': utc_date_string}`` to the run tags,\n such as in the Dagit UI. This will override behavior from (1) and (2)\n\n\n We apply normalized_name() to the dag id and task ids when generating job name and op\n names to ensure that names conform to Dagster's naming conventions.\n\n Args:\n dag (DAG): The Airflow DAG to compile into a Dagster job\n tags (Dict[str, Field]): Job tags. Optionally include\n `tags={'airflow_execution_date': utc_date_string}` to specify execution_date used within\n execution of Airflow Operators.\n use_airflow_template_context (bool): If True, will call get_template_context() on the\n Airflow TaskInstance model which requires and modifies the DagRun table.\n (default: False)\n unique_id (int): If not None, this id will be postpended to generated op names. Used by\n framework authors to enforce unique op names within a repo.\n\n Returns:\n JobDefinition: The generated Dagster job\n\n """\n pipeline_def = make_dagster_pipeline_from_airflow_dag(\n dag, tags, use_airflow_template_context, unique_id\n )\n # pass in tags manually because pipeline_def.graph doesn't have it threaded\n return pipeline_def.graph.to_job(tags={**pipeline_def.tags})
\n
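A minimal end-to-end sketch of compiling a hand-written DAG (the DAG, tasks, and operator import path below are illustrative and assume an Airflow 1.10-style environment):

.. code-block:: python

    from datetime import datetime

    from airflow.models.dag import DAG
    from airflow.operators.dummy_operator import DummyOperator
    from dagster_airflow.dagster_job_factory import make_dagster_job_from_airflow_dag

    # A toy Airflow DAG with two dependent tasks.
    dag = DAG(
        dag_id="toy_dag",
        default_args={"start_date": datetime(2021, 1, 1)},
        schedule_interval=None,
    )
    first = DummyOperator(task_id="first", dag=dag)
    second = DummyOperator(task_id="second", dag=dag)
    first >> second

    # Each Airflow task becomes an op named via normalized_name(), e.g. "airflow_first";
    # executing the job ad hoc sets airflow_execution_date to the current UTC time.
    my_dagster_job = make_dagster_job_from_airflow_dag(dag=dag)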
", "current_page_name": "_modules/dagster_airflow/dagster_job_factory", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_airflow.dagster_job_factory"}, "dagster_pipeline_factory": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_airflow.dagster_pipeline_factory

\nimport datetime\nimport logging\nimport sys\nfrom contextlib import contextmanager\n\nimport dateutil\nimport lazy_object_proxy\nimport pendulum\nfrom airflow.models import TaskInstance\nfrom airflow.models.baseoperator import BaseOperator\nfrom airflow.models.dag import DAG\nfrom airflow.models.dagbag import DagBag\nfrom airflow.settings import LOG_FORMAT\nfrom dagster_airflow.patch_airflow_example_dag import patch_airflow_example_dag\n\nfrom dagster import (\n    DagsterInvariantViolationError,\n    DependencyDefinition,\n    InputDefinition,\n    MultiDependencyDefinition,\n    Nothing,\n    OutputDefinition,\n    PipelineDefinition,\n    SolidDefinition,\n    check,\n    repository,\n    solid,\n)\nfrom dagster.core.definitions.utils import VALID_NAME_REGEX, validate_tags\nfrom dagster.core.instance import AIRFLOW_EXECUTION_DATE_STR, IS_AIRFLOW_INGEST_PIPELINE_STR\n\n\nclass DagsterAirflowError(Exception):\n    pass\n\n\ndef contains_duplicate_task_names(dag_bag, refresh_from_airflow_db):\n    check.inst_param(dag_bag, "dag_bag", DagBag)\n    check.bool_param(refresh_from_airflow_db, "refresh_from_airflow_db")\n    seen_task_names = set()\n\n    # To enforce predictable iteration order\n    sorted_dag_ids = sorted(dag_bag.dag_ids)\n    for dag_id in sorted_dag_ids:\n        dag = dag_bag.dags.get(dag_id) if not refresh_from_airflow_db else dag_bag.get_dag(dag_id)\n        for task in dag.tasks:\n            if task.task_id in seen_task_names:\n                return True\n            else:\n                seen_task_names.add(task.task_id)\n    return False\n\n\n
[docs]def make_dagster_repo_from_airflow_dag_bag(\n dag_bag, repo_name, refresh_from_airflow_db=False, use_airflow_template_context=False\n):\n """Construct a Dagster repository corresponding to Airflow DAGs in DagBag.\n\n Usage:\n Create `make_dagster_repo.py`:\n from dagster_airflow.dagster_pipeline_factory import make_dagster_repo_from_airflow_dag_bag\n from airflow_home import my_dag_bag\n\n def make_repo_from_dag_bag():\n return make_dagster_repo_from_airflow_dag_bag(my_dag_bag, 'my_repo_name')\n\n Use RepositoryDefinition as usual, for example:\n `dagit -f path/to/make_dagster_repo.py -n make_repo_from_dag_bag`\n\n Args:\n dag_path (str): Path to directory or file that contains Airflow Dags\n repo_name (str): Name for generated RepositoryDefinition\n refresh_from_airflow_db (bool): If True, will refresh DAG if expired via DagBag.get_dag(),\n which requires access to initialized Airflow DB. If False (recommended), gets dag from\n DagBag's dags dict without depending on Airflow DB. (default: False)\n use_airflow_template_context (bool): If True, will call get_template_context() on the\n Airflow TaskInstance model which requires and modifies the DagRun table.\n (default: False)\n\n Returns:\n RepositoryDefinition\n """\n check.inst_param(dag_bag, "dag_bag", DagBag)\n check.str_param(repo_name, "repo_name")\n check.bool_param(refresh_from_airflow_db, "refresh_from_airflow_db")\n check.bool_param(use_airflow_template_context, "use_airflow_template_context")\n\n use_unique_id = contains_duplicate_task_names(dag_bag, refresh_from_airflow_db)\n\n pipeline_defs = []\n count = 0\n # To enforce predictable iteration order\n sorted_dag_ids = sorted(dag_bag.dag_ids)\n for dag_id in sorted_dag_ids:\n # Only call Airflow DB via dag_bag.get_dag(dag_id) if refresh_from_airflow_db is True\n dag = dag_bag.dags.get(dag_id) if not refresh_from_airflow_db else dag_bag.get_dag(dag_id)\n if not use_unique_id:\n pipeline_defs.append(\n make_dagster_pipeline_from_airflow_dag(\n dag=dag, tags=None, use_airflow_template_context=use_airflow_template_context\n )\n )\n else:\n pipeline_defs.append(\n make_dagster_pipeline_from_airflow_dag(\n dag=dag,\n tags=None,\n use_airflow_template_context=use_airflow_template_context,\n unique_id=count,\n )\n )\n count += 1\n\n @repository(name=repo_name)\n def _repo():\n return pipeline_defs\n\n return _repo
\n\n\n
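A sketch of the usage described above, assuming the DAG files live in a local folder (the path and repository name are placeholders):

.. code-block:: python

    from airflow.models.dagbag import DagBag
    from dagster_airflow.dagster_pipeline_factory import make_dagster_repo_from_airflow_dag_bag

    # Build a DagBag from a folder of DAG files without touching the Airflow DB.
    my_dag_bag = DagBag(dag_folder="/path/to/dags/", include_examples=False)

    # Load with: dagit -f make_dagster_repo.py -n make_repo_from_dag_bag
    def make_repo_from_dag_bag():
        return make_dagster_repo_from_airflow_dag_bag(my_dag_bag, "my_repo_name")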
[docs]def make_dagster_repo_from_airflow_example_dags(repo_name="airflow_example_dags_repo"):\n """Construct a Dagster repository for Airflow's example DAGs.\n\n Execution of the following Airflow example DAGs is not currently supported:\n 'example_external_task_marker_child',\n 'example_pig_operator',\n 'example_skip_dag',\n 'example_trigger_target_dag',\n 'example_xcom',\n 'test_utils',\n\n Usage:\n\n Create `make_dagster_repo.py`:\n from dagster_airflow.dagster_pipeline_factory import make_dagster_repo_from_airflow_example_dags\n\n def make_airflow_example_dags():\n return make_dagster_repo_from_airflow_example_dags()\n\n Use RepositoryDefinition as usual, for example:\n `dagit -f path/to/make_dagster_repo.py -n make_airflow_example_dags`\n\n Args:\n repo_name (str): Name for generated RepositoryDefinition\n\n Returns:\n RepositoryDefinition\n """\n dag_bag = DagBag(\n dag_folder="some/empty/folder/with/no/dags", # prevent defaulting to settings.DAGS_FOLDER\n include_examples=True,\n )\n\n # There is a bug in Airflow v1.10.8, v1.10.9, v1.10.10 where the python_callable for task\n # 'search_catalog' is missing a required position argument '_'. It is currently fixed in master.\n # v1.10 stable: https://github.com/apache/airflow/blob/v1-10-stable/airflow/example_dags/example_complex.py#L133\n # master (05-05-2020): https://github.com/apache/airflow/blob/master/airflow/example_dags/example_complex.py#L136\n patch_airflow_example_dag(dag_bag)\n\n return make_dagster_repo_from_airflow_dag_bag(dag_bag, repo_name)
\n\n\n
[docs]def make_dagster_repo_from_airflow_dags_path(\n dag_path,\n repo_name,\n safe_mode=True,\n store_serialized_dags=False,\n use_airflow_template_context=False,\n):\n """Construct a Dagster repository corresponding to Airflow DAGs in dag_path.\n\n ``DagBag.get_dag()`` dependency requires Airflow DB to be initialized.\n\n Usage:\n Create ``make_dagster_repo.py``:\n\n .. code-block:: python\n\n from dagster_airflow.dagster_pipeline_factory import make_dagster_repo_from_airflow_dags_path\n\n def make_repo_from_dir():\n return make_dagster_repo_from_airflow_dags_path(\n '/path/to/dags/', 'my_repo_name'\n )\n\n Use RepositoryDefinition as usual, for example:\n ``dagit -f path/to/make_dagster_repo.py -n make_repo_from_dir``\n\n Args:\n dag_path (str): Path to directory or file that contains Airflow Dags\n repo_name (str): Name for generated RepositoryDefinition\n include_examples (bool): True to include Airflow's example DAGs. (default: False)\n safe_mode (bool): True to use Airflow's default heuristic to find files that contain DAGs\n (ie find files that contain both b'DAG' and b'airflow') (default: True)\n store_serialized_dags (bool): True to read Airflow DAGS from Airflow DB. False to read DAGS\n from Python files. (default: False)\n use_airflow_template_context (bool): If True, will call get_template_context() on the\n Airflow TaskInstance model which requires and modifies the DagRun table.\n (default: False)\n\n Returns:\n RepositoryDefinition\n """\n check.str_param(dag_path, "dag_path")\n check.str_param(repo_name, "repo_name")\n check.bool_param(safe_mode, "safe_mode")\n check.bool_param(store_serialized_dags, "store_serialized_dags")\n check.bool_param(use_airflow_template_context, "use_airflow_template_context")\n\n try:\n dag_bag = DagBag(\n dag_folder=dag_path,\n include_examples=False, # Exclude Airflow example dags\n safe_mode=safe_mode,\n store_serialized_dags=store_serialized_dags,\n )\n except Exception:\n raise DagsterAirflowError("Error initializing airflow.models.dagbag object with arguments")\n\n return make_dagster_repo_from_airflow_dag_bag(dag_bag, repo_name, use_airflow_template_context)
\n\n\n
[docs]def make_dagster_pipeline_from_airflow_dag(\n dag, tags=None, use_airflow_template_context=False, unique_id=None\n):\n """Construct a Dagster pipeline corresponding to a given Airflow DAG.\n\n Tasks in the resulting pipeline will execute the ``execute()`` method on the corresponding\n Airflow Operator. Dagster, any dependencies required by Airflow Operators, and the module\n containing your DAG definition must be available in the Python environment within which your\n Dagster solids execute.\n\n To set Airflow's ``execution_date`` for use with Airflow Operator's ``execute()`` methods,\n either:\n\n 1. (Best for ad hoc runs) Run Pipeline with 'default' preset, which sets execution_date to the\n time (in UTC) of pipeline invocation:\n\n .. code-block:: python\n\n execute_pipeline(\n pipeline=make_dagster_pipeline_from_airflow_dag(dag=dag),\n preset='default')\n\n 2. Add ``{'airflow_execution_date': utc_date_string}`` to the PipelineDefinition tags. This will\n override behavior from (1).\n\n .. code-block:: python\n\n execute_pipeline(\n make_dagster_pipeline_from_airflow_dag(\n dag=dag,\n tags={'airflow_execution_date': utc_execution_date_str}\n )\n )\n\n 3. (Recommended) Add ``{'airflow_execution_date': utc_date_string}`` to the PipelineRun tags,\n such as in the Dagit UI. This will override behavior from (1) and (2)\n\n\n We apply normalized_name() to the dag id and task ids when generating pipeline name and solid\n names to ensure that names conform to Dagster's naming conventions.\n\n Args:\n dag (DAG): The Airflow DAG to compile into a Dagster pipeline\n tags (Dict[str, Field]): Pipeline tags. Optionally include\n `tags={'airflow_execution_date': utc_date_string}` to specify execution_date used within\n execution of Airflow Operators.\n use_airflow_template_context (bool): If True, will call get_template_context() on the\n Airflow TaskInstance model which requires and modifies the DagRun table.\n (default: False)\n unique_id (int): If not None, this id will be postpended to generated solid names. Used by\n framework authors to enforce unique solid names within a repo.\n\n Returns:\n pipeline_def (PipelineDefinition): The generated Dagster pipeline\n\n """\n check.inst_param(dag, "dag", DAG)\n tags = check.opt_dict_param(tags, "tags")\n check.bool_param(use_airflow_template_context, "use_airflow_template_context")\n unique_id = check.opt_int_param(unique_id, "unique_id")\n\n if IS_AIRFLOW_INGEST_PIPELINE_STR not in tags:\n tags[IS_AIRFLOW_INGEST_PIPELINE_STR] = "true"\n\n tags = validate_tags(tags)\n\n pipeline_dependencies, solid_defs = _get_pipeline_definition_args(\n dag, use_airflow_template_context, unique_id\n )\n pipeline_def = PipelineDefinition(\n name=normalized_name(dag.dag_id, None),\n solid_defs=solid_defs,\n dependencies=pipeline_dependencies,\n tags=tags,\n )\n return pipeline_def
\n\n\n# Airflow DAG ids and Task ids allow a larger valid character set (alphanumeric characters,\n# dashes, dots and underscores) than Dagster's naming conventions (alphanumeric characters,\n# underscores), so Dagster will strip invalid characters and replace with '_'\ndef normalized_name(name, unique_id):\n base_name = "airflow_" + "".join(c if VALID_NAME_REGEX.match(c) else "_" for c in name)\n if not unique_id:\n return base_name\n else:\n return base_name + "_" + str(unique_id)\n\n\ndef _get_pipeline_definition_args(dag, use_airflow_template_context, unique_id=None):\n check.inst_param(dag, "dag", DAG)\n check.bool_param(use_airflow_template_context, "use_airflow_template_context")\n unique_id = check.opt_int_param(unique_id, "unique_id")\n\n pipeline_dependencies = {}\n solid_defs = []\n seen_tasks = []\n\n # To enforce predictable iteration order\n dag_roots = sorted(dag.roots, key=lambda x: x.task_id)\n for task in dag_roots:\n _traverse_airflow_dag(\n task,\n seen_tasks,\n pipeline_dependencies,\n solid_defs,\n use_airflow_template_context,\n unique_id,\n )\n return (pipeline_dependencies, solid_defs)\n\n\ndef _traverse_airflow_dag(\n task, seen_tasks, pipeline_dependencies, solid_defs, use_airflow_template_context, unique_id\n):\n check.inst_param(task, "task", BaseOperator)\n check.list_param(seen_tasks, "seen_tasks", BaseOperator)\n check.list_param(solid_defs, "solid_defs", SolidDefinition)\n check.bool_param(use_airflow_template_context, "use_airflow_template_context")\n unique_id = check.opt_int_param(unique_id, "unique_id")\n\n seen_tasks.append(task)\n current_solid = make_dagster_solid_from_airflow_task(\n task, use_airflow_template_context, unique_id\n )\n solid_defs.append(current_solid)\n\n if len(task.upstream_list) > 0:\n # To enforce predictable iteration order\n task_upstream_list = sorted(task.upstream_list, key=lambda x: x.task_id)\n\n pipeline_dependencies[current_solid.name] = {\n "airflow_task_ready": MultiDependencyDefinition(\n [\n DependencyDefinition(\n solid=normalized_name(task_upstream.task_id, unique_id),\n output="airflow_task_complete",\n )\n for task_upstream in task_upstream_list\n ]\n )\n }\n\n # To enforce predictable iteration order\n task_downstream_list = sorted(task.downstream_list, key=lambda x: x.task_id)\n for child_task in task_downstream_list:\n if child_task not in seen_tasks:\n _traverse_airflow_dag(\n child_task,\n seen_tasks,\n pipeline_dependencies,\n solid_defs,\n use_airflow_template_context,\n unique_id,\n )\n\n\n@contextmanager\ndef replace_airflow_logger_handlers():\n try:\n # Redirect airflow handlers to stdout / compute logs\n prev_airflow_handlers = logging.getLogger("airflow.task").handlers\n handler = logging.StreamHandler(sys.stdout)\n handler.setFormatter(logging.Formatter(LOG_FORMAT))\n root = logging.getLogger("airflow.task")\n root.handlers = [handler]\n yield\n finally:\n # Restore previous log handlers\n logging.getLogger("airflow.task").handlers = prev_airflow_handlers\n\n\n# If unique_id is not None, this id will be postpended to generated solid names, generally used\n# to enforce unique solid names within a repo.\ndef make_dagster_solid_from_airflow_task(task, use_airflow_template_context, unique_id=None):\n check.inst_param(task, "task", BaseOperator)\n check.bool_param(use_airflow_template_context, "use_airflow_template_context")\n unique_id = check.opt_int_param(unique_id, "unique_id")\n\n @solid(\n name=normalized_name(task.task_id, unique_id),\n input_defs=[InputDefinition("airflow_task_ready", Nothing)],\n 
output_defs=[OutputDefinition(Nothing, "airflow_task_complete")],\n )\n def _solid(context): # pylint: disable=unused-argument\n if AIRFLOW_EXECUTION_DATE_STR not in context.pipeline_run.tags:\n raise DagsterInvariantViolationError(\n 'Could not find "{AIRFLOW_EXECUTION_DATE_STR}" in {target} tags "{tags}". Please '\n 'add "{AIRFLOW_EXECUTION_DATE_STR}" to {target} tags before executing'.format(\n target="job" if context.pipeline_def.is_graph_job_op_target else "pipeline",\n AIRFLOW_EXECUTION_DATE_STR=AIRFLOW_EXECUTION_DATE_STR,\n tags=context.pipeline_run.tags,\n )\n )\n execution_date_str = context.pipeline_run.tags.get(AIRFLOW_EXECUTION_DATE_STR)\n\n check.str_param(execution_date_str, "execution_date_str")\n try:\n execution_date = dateutil.parser.parse(execution_date_str)\n except ValueError:\n raise DagsterInvariantViolationError(\n 'Could not parse execution_date "{execution_date_str}". Please use datetime format '\n "compatible with dateutil.parser.parse.".format(\n execution_date_str=execution_date_str,\n )\n )\n except OverflowError:\n raise DagsterInvariantViolationError(\n 'Date "{execution_date_str}" exceeds the largest valid C integer on the system.'.format(\n execution_date_str=execution_date_str,\n )\n )\n\n check.inst_param(execution_date, "execution_date", datetime.datetime)\n\n with replace_airflow_logger_handlers():\n task_instance = TaskInstance(task=task, execution_date=execution_date)\n\n ti_context = (\n dagster_get_template_context(task_instance, task, execution_date)\n if not use_airflow_template_context\n else task_instance.get_template_context()\n )\n task.render_template_fields(ti_context)\n\n task.execute(ti_context)\n\n return None\n\n return _solid\n\n\ndef dagster_get_template_context(task_instance, task, execution_date):\n """\n Modified from /airflow/models/taskinstance.py to not reference Airflow DB\n (1) Removes the following block, which queries DB, removes dagrun instances, recycles run_id\n if hasattr(task, 'dag'):\n if task.dag.params:\n params.update(task.dag.params)\n from airflow.models.dagrun import DagRun # Avoid circular import\n\n dag_run = (\n session.query(DagRun)\n .filter_by(dag_id=task.dag.dag_id, execution_date=execution_date)\n .first()\n )\n run_id = dag_run.run_id if dag_run else None\n session.expunge_all()\n session.commit()\n (2) Removes returning 'conf': conf which passes along Airflow config\n (3) Removes 'var': {'value': VariableAccessor(), 'json': VariableJsonAccessor()}, which allows\n fetching Variable from Airflow DB\n """\n from airflow import macros\n\n tables = None\n if "tables" in task.params:\n tables = task.params["tables"]\n\n params = {}\n run_id = ""\n dag_run = None\n\n ds = execution_date.strftime("%Y-%m-%d")\n ts = execution_date.isoformat()\n yesterday_ds = (execution_date - datetime.timedelta(1)).strftime("%Y-%m-%d")\n tomorrow_ds = (execution_date + datetime.timedelta(1)).strftime("%Y-%m-%d")\n\n # For manually triggered dagruns that aren't run on a schedule, next/previous\n # schedule dates don't make sense, and should be set to execution date for\n # consistency with how execution_date is set for manually triggered tasks, i.e.\n # triggered_date == execution_date.\n if dag_run and dag_run.external_trigger:\n prev_execution_date = execution_date\n next_execution_date = execution_date\n else:\n prev_execution_date = task.dag.previous_schedule(execution_date)\n next_execution_date = task.dag.following_schedule(execution_date)\n\n next_ds = None\n next_ds_nodash = None\n if next_execution_date:\n next_ds = 
next_execution_date.strftime("%Y-%m-%d")\n next_ds_nodash = next_ds.replace("-", "")\n next_execution_date = pendulum.instance(next_execution_date)\n\n prev_ds = None\n prev_ds_nodash = None\n if prev_execution_date:\n prev_ds = prev_execution_date.strftime("%Y-%m-%d")\n prev_ds_nodash = prev_ds.replace("-", "")\n prev_execution_date = pendulum.instance(prev_execution_date)\n\n ds_nodash = ds.replace("-", "")\n ts_nodash = execution_date.strftime("%Y%m%dT%H%M%S")\n ts_nodash_with_tz = ts.replace("-", "").replace(":", "")\n yesterday_ds_nodash = yesterday_ds.replace("-", "")\n tomorrow_ds_nodash = tomorrow_ds.replace("-", "")\n\n ti_key_str = "{dag_id}__{task_id}__{ds_nodash}".format(\n dag_id=task.dag_id, task_id=task.task_id, ds_nodash=ds_nodash\n )\n\n if task.params:\n params.update(task.params)\n\n return {\n "dag": task.dag,\n "ds": ds,\n "next_ds": next_ds,\n "next_ds_nodash": next_ds_nodash,\n "prev_ds": prev_ds,\n "prev_ds_nodash": prev_ds_nodash,\n "ds_nodash": ds_nodash,\n "ts": ts,\n "ts_nodash": ts_nodash,\n "ts_nodash_with_tz": ts_nodash_with_tz,\n "yesterday_ds": yesterday_ds,\n "yesterday_ds_nodash": yesterday_ds_nodash,\n "tomorrow_ds": tomorrow_ds,\n "tomorrow_ds_nodash": tomorrow_ds_nodash,\n "END_DATE": ds,\n "end_date": ds,\n "dag_run": dag_run,\n "run_id": run_id,\n "execution_date": pendulum.instance(execution_date),\n "prev_execution_date": prev_execution_date,\n "prev_execution_date_success": lazy_object_proxy.Proxy(\n lambda: task_instance.previous_execution_date_success\n ),\n "prev_start_date_success": lazy_object_proxy.Proxy(\n lambda: task_instance.previous_start_date_success\n ),\n "next_execution_date": next_execution_date,\n "latest_date": ds,\n "macros": macros,\n "params": params,\n "tables": tables,\n "task": task,\n "task_instance": task_instance,\n "ti": task_instance,\n "task_instance_key_str": ti_key_str,\n "test_mode": task_instance.test_mode,\n "inlets": task.inlets,\n "outlets": task.outlets,\n }\n
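As a concrete illustration of the name normalization performed by the module-level ``normalized_name`` helper above (the ids are hypothetical): characters outside Dagster's naming conventions are replaced with underscores, and any ``unique_id`` is appended.

.. code-block:: python

    from dagster_airflow.dagster_pipeline_factory import normalized_name

    # Dashes and dots fall outside VALID_NAME_REGEX, so they become underscores.
    assert normalized_name("my-dag.id", None) == "airflow_my_dag_id"
    assert normalized_name("my-dag.id", 2) == "airflow_my_dag_id_2"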
", "current_page_name": "_modules/dagster_airflow/dagster_pipeline_factory", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_airflow.dagster_pipeline_factory"}, "factory": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_airflow.factory

\nimport datetime\nimport os\nimport re\nfrom collections import namedtuple\n\nfrom airflow import DAG\nfrom airflow.models.baseoperator import BaseOperator\nfrom dagster_airflow.operators.util import check_storage_specified\n\nfrom dagster import check, seven\nfrom dagster.core.definitions.reconstruct import ReconstructableRepository\nfrom dagster.core.execution.api import create_execution_plan\nfrom dagster.core.instance import DagsterInstance, is_dagster_home_set\nfrom dagster.core.instance.ref import InstanceRef\nfrom dagster.core.snap import ExecutionPlanSnapshot, PipelineSnapshot, snapshot_from_execution_plan\nfrom dagster.utils.backcompat import canonicalize_backcompat_args\n\nfrom .compile import coalesce_execution_steps\nfrom .operators.docker_operator import DagsterDockerOperator\nfrom .operators.python_operator import DagsterPythonOperator\n\nDEFAULT_ARGS = {\n    "depends_on_past": False,\n    "email": ["airflow@example.com"],\n    "email_on_failure": False,\n    "email_on_retry": False,\n    "owner": "airflow",\n    "retries": 1,\n    "retry_delay": datetime.timedelta(0, 300),\n    "start_date": datetime.datetime(1900, 1, 1, 0, 0),\n}\n\n# Airflow DAG names are not allowed to be longer than 250 chars\nAIRFLOW_MAX_DAG_NAME_LEN = 250\n\n\ndef _make_dag_description(pipeline_name):\n    return """Editable scaffolding autogenerated by dagster-airflow from pipeline {pipeline_name}\n    """.format(\n        pipeline_name=pipeline_name\n    )\n\n\ndef _rename_for_airflow(name):\n    """Modify pipeline name for Airflow to meet constraints on DAG names:\n    https://github.com/apache/airflow/blob/1.10.3/airflow/utils/helpers.py#L52-L63\n\n    Here, we just substitute underscores for illegal characters to avoid imposing Airflow's\n    constraints on our naming schemes.\n    """\n    return re.sub(r"[^\\w\\-\\.]", "_", name)[:AIRFLOW_MAX_DAG_NAME_LEN]\n\n\nclass DagsterOperatorInvocationArgs(\n    namedtuple(\n        "DagsterOperatorInvocationArgs",\n        "recon_repo pipeline_name run_config mode step_keys instance_ref pipeline_snapshot "\n        "execution_plan_snapshot parent_pipeline_snapshot",\n    )\n):\n    def __new__(\n        cls,\n        recon_repo,\n        pipeline_name,\n        run_config,\n        mode,\n        step_keys,\n        instance_ref,\n        pipeline_snapshot,\n        execution_plan_snapshot,\n        parent_pipeline_snapshot,\n    ):\n        return super(DagsterOperatorInvocationArgs, cls).__new__(\n            cls,\n            recon_repo=recon_repo,\n            pipeline_name=pipeline_name,\n            run_config=run_config,\n            mode=mode,\n            step_keys=step_keys,\n            instance_ref=instance_ref,\n            pipeline_snapshot=pipeline_snapshot,\n            execution_plan_snapshot=execution_plan_snapshot,\n            parent_pipeline_snapshot=parent_pipeline_snapshot,\n        )\n\n\nclass DagsterOperatorParameters(\n    namedtuple(\n        "_DagsterOperatorParameters",\n        (\n            "recon_repo pipeline_name run_config "\n            "mode task_id step_keys dag instance_ref op_kwargs pipeline_snapshot "\n            "execution_plan_snapshot parent_pipeline_snapshot"\n        ),\n    )\n):\n    def __new__(\n        cls,\n        pipeline_name,\n        task_id,\n        recon_repo=None,\n        run_config=None,\n        mode=None,\n        step_keys=None,\n        dag=None,\n        instance_ref=None,\n        op_kwargs=None,\n        pipeline_snapshot=None,\n        execution_plan_snapshot=None,\n        
parent_pipeline_snapshot=None,\n    ):\n        pipeline_def = recon_repo.get_definition().get_pipeline(pipeline_name)\n\n        if mode is None:\n            mode = pipeline_def.get_default_mode_name()\n\n        mode_def = pipeline_def.get_mode_definition(mode)\n\n        check_storage_specified(pipeline_def, mode_def)\n\n        return super(DagsterOperatorParameters, cls).__new__(\n            cls,\n            recon_repo=check.opt_inst_param(recon_repo, "recon_repo", ReconstructableRepository),\n            pipeline_name=check.str_param(pipeline_name, "pipeline_name"),\n            run_config=check.opt_dict_param(run_config, "run_config", key_type=str),\n            mode=check.opt_str_param(mode, "mode"),\n            task_id=check.str_param(task_id, "task_id"),\n            step_keys=check.opt_list_param(step_keys, "step_keys", of_type=str),\n            dag=check.opt_inst_param(dag, "dag", DAG),\n            instance_ref=check.opt_inst_param(instance_ref, "instance_ref", InstanceRef),\n            op_kwargs=check.opt_dict_param(op_kwargs.copy(), "op_kwargs", key_type=str),\n            pipeline_snapshot=check.inst_param(\n                pipeline_snapshot, "pipeline_snapshot", PipelineSnapshot\n            ),\n            execution_plan_snapshot=check.inst_param(\n                execution_plan_snapshot, "execution_plan_snapshot", ExecutionPlanSnapshot\n            ),\n            parent_pipeline_snapshot=check.opt_inst_param(\n                parent_pipeline_snapshot, "parent_pipeline_snapshot", PipelineSnapshot\n            ),\n        )\n\n    @property\n    def invocation_args(self):\n        return DagsterOperatorInvocationArgs(\n            recon_repo=self.recon_repo,\n            pipeline_name=self.pipeline_name,\n            run_config=self.run_config,\n            mode=self.mode,\n            step_keys=self.step_keys,\n            instance_ref=self.instance_ref,\n            pipeline_snapshot=self.pipeline_snapshot,\n            execution_plan_snapshot=self.execution_plan_snapshot,\n            parent_pipeline_snapshot=self.parent_pipeline_snapshot,\n        )\n\n\ndef _make_airflow_dag(\n    recon_repo,\n    job_name,\n    run_config=None,\n    mode=None,\n    instance=None,\n    dag_id=None,\n    dag_description=None,\n    dag_kwargs=None,\n    op_kwargs=None,\n    operator=DagsterPythonOperator,\n):\n    check.inst_param(recon_repo, "recon_repo", ReconstructableRepository)\n    check.str_param(job_name, "job_name")\n    run_config = check.opt_dict_param(run_config, "run_config", key_type=str)\n    mode = check.opt_str_param(mode, "mode")\n    # Default to use the (persistent) system temp directory rather than a TemporaryDirectory,\n    # which would not be consistent between Airflow task invocations.\n\n    if instance is None:\n        if is_dagster_home_set():\n            instance = DagsterInstance.get()\n        else:\n            instance = DagsterInstance.local_temp(tempdir=seven.get_system_temp_directory())\n\n    check.inst_param(instance, "instance", DagsterInstance)\n\n    # Only used for Airflow; internally we continue to use pipeline.name\n    dag_id = check.opt_str_param(dag_id, "dag_id", _rename_for_airflow(job_name))\n\n    dag_description = check.opt_str_param(\n        dag_description, "dag_description", _make_dag_description(job_name)\n    )\n    check.class_param(operator, "operator", superclass=BaseOperator)\n\n    dag_kwargs = dict(\n        {"default_args": DEFAULT_ARGS},\n        **check.opt_dict_param(dag_kwargs, "dag_kwargs", key_type=str),\n    
)\n\n    op_kwargs = check.opt_dict_param(op_kwargs, "op_kwargs", key_type=str)\n\n    dag = DAG(dag_id=dag_id, description=dag_description, **dag_kwargs)\n    pipeline = recon_repo.get_definition().get_pipeline(job_name)\n\n    if mode is None:\n        mode = pipeline.get_default_mode_name()\n\n    execution_plan = create_execution_plan(pipeline, run_config, mode=mode)\n\n    tasks = {}\n\n    coalesced_plan = coalesce_execution_steps(execution_plan)\n\n    for solid_handle, solid_steps in coalesced_plan.items():\n        step_keys = [step.key for step in solid_steps]\n\n        operator_parameters = DagsterOperatorParameters(\n            recon_repo=recon_repo,\n            pipeline_name=job_name,\n            run_config=run_config,\n            mode=mode,\n            task_id=solid_handle,\n            step_keys=step_keys,\n            dag=dag,\n            instance_ref=instance.get_ref(),\n            op_kwargs=op_kwargs,\n            pipeline_snapshot=pipeline.get_pipeline_snapshot(),\n            execution_plan_snapshot=snapshot_from_execution_plan(\n                execution_plan, pipeline_snapshot_id=pipeline.get_pipeline_snapshot_id()\n            ),\n        )\n        task = operator(operator_parameters)\n\n        tasks[solid_handle] = task\n\n        for solid_step in solid_steps:\n            for step_input in solid_step.step_inputs:\n                for key in step_input.dependency_keys:\n                    prev_solid_handle = execution_plan.get_step_by_key(key).solid_handle.to_string()\n                    if solid_handle != prev_solid_handle:\n                        tasks[prev_solid_handle].set_downstream(task)\n\n    return (dag, [tasks[solid_handle] for solid_handle in coalesced_plan.keys()])\n\n\n
[docs]def make_airflow_dag(\n module_name,\n job_name,\n run_config=None,\n mode=None,\n instance=None,\n dag_id=None,\n dag_description=None,\n dag_kwargs=None,\n op_kwargs=None,\n pipeline_name=None,\n):\n """Construct an Airflow DAG corresponding to a given Dagster job/pipeline.\n\n Tasks in the resulting DAG will execute the Dagster logic they encapsulate as a Python\n callable, run by an underlying :py:class:`PythonOperator <airflow:PythonOperator>`. As a\n consequence, both dagster, any Python dependencies required by your solid logic, and the module\n containing your pipeline definition must be available in the Python environment within which\n your Airflow tasks execute. If you cannot install requirements into this environment, or you\n are looking for a containerized solution to provide better isolation, see instead\n :py:func:`make_airflow_dag_containerized`.\n\n This function should be invoked in an Airflow DAG definition file, such as that created by an\n invocation of the dagster-airflow scaffold CLI tool.\n\n Args:\n module_name (str): The name of the importable module in which the pipeline/job definition can be\n found.\n job_name (str): The name of the job definition.\n run_config (Optional[dict]): The config, if any, with which to compile\n the pipeline/job to an execution plan, as a Python dict.\n mode (Optional[str]): The mode in which to execute the pipeline.\n instance (Optional[DagsterInstance]): The Dagster instance to use to execute the pipeline/job.\n dag_id (Optional[str]): The id to use for the compiled Airflow DAG (passed through to\n :py:class:`DAG <airflow:airflow.models.DAG>`).\n dag_description (Optional[str]): The description to use for the compiled Airflow DAG\n (passed through to :py:class:`DAG <airflow:airflow.models.DAG>`)\n dag_kwargs (Optional[dict]): Any additional kwargs to pass to the Airflow\n :py:class:`DAG <airflow:airflow.models.DAG>` constructor, including ``default_args``.\n op_kwargs (Optional[dict]): Any additional kwargs to pass to the underlying Airflow\n operator (a subclass of\n :py:class:`PythonOperator <airflow:airflow.operators.python_operator.PythonOperator>`).\n pipeline_name (str): (legacy) The name of the pipeline definition.\n\n Returns:\n (airflow.models.DAG, List[airflow.models.BaseOperator]): The generated Airflow DAG, and a\n list of its constituent tasks.\n\n """\n check.str_param(module_name, "module_name")\n job_name = canonicalize_backcompat_args(\n new_val=job_name,\n new_arg="job_name",\n old_val=pipeline_name,\n old_arg="pipeline_name",\n breaking_version="future versions",\n coerce_old_to_new=lambda val: val,\n )\n\n recon_repo = ReconstructableRepository.for_module(module_name, job_name, os.getcwd())\n return _make_airflow_dag(\n recon_repo=recon_repo,\n job_name=job_name,\n run_config=run_config,\n mode=mode,\n instance=instance,\n dag_id=dag_id,\n dag_description=dag_description,\n dag_kwargs=dag_kwargs,\n op_kwargs=op_kwargs,\n )
\n\n\n
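A sketch of what such an Airflow DAG definition file might contain (the module and job names are placeholders):

.. code-block:: python

    from dagster_airflow.factory import make_airflow_dag

    # Airflow discovers `dag` at module scope; each coalesced group of Dagster steps
    # becomes a PythonOperator task in the generated DAG.
    dag, tasks = make_airflow_dag(
        module_name="my_package.my_jobs",
        job_name="my_job",
        dag_kwargs={"schedule_interval": "@daily"},
    )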
[docs]def make_airflow_dag_for_operator(\n recon_repo,\n job_name,\n operator,\n run_config=None,\n mode=None,\n dag_id=None,\n dag_description=None,\n dag_kwargs=None,\n op_kwargs=None,\n pipeline_name=None,\n):\n """Construct an Airflow DAG corresponding to a given Dagster job/pipeline and custom operator.\n\n `Custom operator template <https://github.com/dagster-io/dagster/blob/master/python_modules/dagster-test/dagster_test/dagster_airflow/custom_operator.py>`_\n\n Tasks in the resulting DAG will execute the Dagster logic they encapsulate run by the given\n Operator :py:class:`BaseOperator <airflow.models.BaseOperator>`. If you\n are looking for a containerized solution to provide better isolation, see instead\n :py:func:`make_airflow_dag_containerized`.\n\n This function should be invoked in an Airflow DAG definition file, such as that created by an\n invocation of the dagster-airflow scaffold CLI tool.\n\n Args:\n recon_repo (:class:`dagster.ReconstructableRepository`): reference to a Dagster RepositoryDefinition\n that can be reconstructed in another process\n job_name (str): The name of the job definition.\n operator (type): The operator to use. Must be a class that inherits from\n :py:class:`BaseOperator <airflow.models.BaseOperator>`\n run_config (Optional[dict]): The config, if any, with which to compile\n the pipeline to an execution plan, as a Python dict.\n mode (Optional[str]): The mode in which to execute the pipeline.\n instance (Optional[DagsterInstance]): The Dagster instance to use to execute the pipeline.\n dag_id (Optional[str]): The id to use for the compiled Airflow DAG (passed through to\n :py:class:`DAG <airflow:airflow.models.DAG>`).\n dag_description (Optional[str]): The description to use for the compiled Airflow DAG\n (passed through to :py:class:`DAG <airflow:airflow.models.DAG>`)\n dag_kwargs (Optional[dict]): Any additional kwargs to pass to the Airflow\n :py:class:`DAG <airflow:airflow.models.DAG>` constructor, including ``default_args``.\n op_kwargs (Optional[dict]): Any additional kwargs to pass to the underlying Airflow\n operator.\n pipeline_name (str): (legacy) The name of the pipeline definition.\n\n Returns:\n (airflow.models.DAG, List[airflow.models.BaseOperator]): The generated Airflow DAG, and a\n list of its constituent tasks.\n """\n check.class_param(operator, "operator", superclass=BaseOperator)\n\n job_name = canonicalize_backcompat_args(\n new_val=job_name,\n new_arg="job_name",\n old_val=pipeline_name,\n old_arg="pipeline_name",\n breaking_version="future versions",\n coerce_old_to_new=lambda val: val,\n )\n\n return _make_airflow_dag(\n recon_repo=recon_repo,\n job_name=job_name,\n run_config=run_config,\n mode=mode,\n dag_id=dag_id,\n dag_description=dag_description,\n dag_kwargs=dag_kwargs,\n op_kwargs=op_kwargs,\n operator=operator,\n )
\n\n\ndef make_airflow_dag_for_recon_repo(\n recon_repo,\n job_name,\n run_config=None,\n mode=None,\n dag_id=None,\n dag_description=None,\n dag_kwargs=None,\n op_kwargs=None,\n pipeline_name=None,\n):\n job_name = canonicalize_backcompat_args(\n new_val=job_name,\n new_arg="job_name",\n old_val=pipeline_name,\n old_arg="pipeline_name",\n breaking_version="future versions",\n coerce_old_to_new=lambda val: val,\n )\n return _make_airflow_dag(\n recon_repo=recon_repo,\n job_name=job_name,\n run_config=run_config,\n mode=mode,\n dag_id=dag_id,\n dag_description=dag_description,\n dag_kwargs=dag_kwargs,\n op_kwargs=op_kwargs,\n )\n\n\n
[docs]def make_airflow_dag_containerized(\n module_name,\n job_name,\n image,\n run_config=None,\n mode=None,\n dag_id=None,\n dag_description=None,\n dag_kwargs=None,\n op_kwargs=None,\n pipeline_name=None,\n):\n """Construct a containerized Airflow DAG corresponding to a given Dagster job/pipeline.\n\n Tasks in the resulting DAG will execute the Dagster logic they encapsulate using a subclass of\n :py:class:`DockerOperator <airflow:airflow.operators.docker_operator.DockerOperator>`. As a\n consequence, both dagster, any Python dependencies required by your solid logic, and the module\n containing your pipeline definition must be available in the container spun up by this operator.\n Typically you'll want to install these requirements onto the image you're using.\n\n This function should be invoked in an Airflow DAG definition file, such as that created by an\n invocation of the dagster-airflow scaffold CLI tool.\n\n Args:\n module_name (str): The name of the importable module in which the pipeline/job definition can be\n found.\n job_name (str): The name of the job definition.\n image (str): The name of the Docker image to use for execution (passed through to\n :py:class:`DockerOperator <airflow:airflow.operators.docker_operator.DockerOperator>`).\n run_config (Optional[dict]): The config, if any, with which to compile\n the pipeline/job to an execution plan, as a Python dict.\n mode (Optional[str]): The mode in which to execute the pipeline.\n dag_id (Optional[str]): The id to use for the compiled Airflow DAG (passed through to\n :py:class:`DAG <airflow:airflow.models.DAG>`).\n dag_description (Optional[str]): The description to use for the compiled Airflow DAG\n (passed through to :py:class:`DAG <airflow:airflow.models.DAG>`)\n dag_kwargs (Optional[dict]): Any additional kwargs to pass to the Airflow\n :py:class:`DAG <airflow:airflow.models.DAG>` constructor, including ``default_args``.\n op_kwargs (Optional[dict]): Any additional kwargs to pass to the underlying Airflow\n operator (a subclass of\n :py:class:`DockerOperator <airflow:airflow.operators.docker_operator.DockerOperator>`).\n pipeline_name (str): (legacy) The name of the pipeline definition.\n\n Returns:\n (airflow.models.DAG, List[airflow.models.BaseOperator]): The generated Airflow DAG, and a\n list of its constituent tasks.\n """\n check.str_param(module_name, "module_name")\n check.str_param(job_name, "job_name")\n check.str_param(image, "image")\n check.opt_dict_param(run_config, "run_config")\n check.opt_str_param(mode, "mode")\n check.opt_str_param(dag_id, "dag_id")\n check.opt_str_param(dag_description, "dag_description")\n check.opt_dict_param(dag_kwargs, "dag_kwargs")\n check.opt_dict_param(op_kwargs, "op_kwargs")\n\n job_name = canonicalize_backcompat_args(\n new_val=job_name,\n new_arg="job_name",\n old_val=pipeline_name,\n old_arg="pipeline_name",\n breaking_version="future versions",\n coerce_old_to_new=lambda val: val,\n )\n recon_repo = ReconstructableRepository.for_module(module_name, job_name, os.getcwd())\n\n op_kwargs = check.opt_dict_param(op_kwargs, "op_kwargs", key_type=str)\n op_kwargs["image"] = image\n\n return _make_airflow_dag(\n recon_repo=recon_repo,\n job_name=job_name,\n run_config=run_config,\n mode=mode,\n dag_id=dag_id,\n dag_description=dag_description,\n dag_kwargs=dag_kwargs,\n op_kwargs=op_kwargs,\n operator=DagsterDockerOperator,\n )
\n\n\ndef make_airflow_dag_containerized_for_recon_repo(\n recon_repo,\n job_name,\n image,\n run_config=None,\n mode=None,\n dag_id=None,\n dag_description=None,\n dag_kwargs=None,\n op_kwargs=None,\n instance=None,\n pipeline_name=None,\n):\n check.inst_param(recon_repo, "recon_repo", ReconstructableRepository)\n check.str_param(job_name, "job_name")\n check.str_param(image, "image")\n check.opt_dict_param(run_config, "run_config")\n check.opt_str_param(mode, "mode")\n check.opt_str_param(dag_id, "dag_id")\n check.opt_str_param(dag_description, "dag_description")\n check.opt_dict_param(dag_kwargs, "dag_kwargs")\n op_kwargs = check.opt_dict_param(op_kwargs, "op_kwargs", key_type=str)\n check.opt_str_param(pipeline_name, "pipeline_name")\n\n op_kwargs["image"] = image\n\n job_name = canonicalize_backcompat_args(\n new_val=job_name,\n new_arg="job_name",\n old_val=pipeline_name,\n old_arg="pipeline_name",\n breaking_version="future versions",\n coerce_old_to_new=lambda val: val,\n )\n return _make_airflow_dag(\n recon_repo=recon_repo,\n job_name=job_name,\n run_config=run_config,\n mode=mode,\n dag_id=dag_id,\n dag_description=dag_description,\n dag_kwargs=dag_kwargs,\n op_kwargs=op_kwargs,\n operator=DagsterDockerOperator,\n instance=instance,\n )\n
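A sketch of the containerized variant, which differs mainly in requiring an image (names below are placeholders):

.. code-block:: python

    from dagster_airflow.factory import make_airflow_dag_containerized

    # Tasks run inside this image via a DockerOperator subclass, so the image must contain
    # dagster, the job's dependencies, and the module named here.
    dag, tasks = make_airflow_dag_containerized(
        module_name="my_package.my_jobs",
        job_name="my_job",
        image="my-registry/my-dagster-image:latest",
        dag_kwargs={"schedule_interval": "@daily"},
    )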
", "current_page_name": "_modules/dagster_airflow/factory", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_airflow.factory"}}, "dagster_aws": {"ecs": {"launcher": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_aws.ecs.launcher

\nimport warnings\nfrom collections import namedtuple\nfrom contextlib import suppress\n\nimport boto3\nfrom botocore.exceptions import ClientError\n\nfrom dagster import Array, Field, Noneable, ScalarUnion, StringSource, check\nfrom dagster.core.events import EngineEventData, MetadataEntry\nfrom dagster.core.launcher.base import LaunchRunContext, RunLauncher\nfrom dagster.grpc.types import ExecuteRunArgs\nfrom dagster.serdes import ConfigurableClass\nfrom dagster.utils import merge_dicts\n\nfrom ..secretsmanager import get_secrets_from_arns, get_tagged_secrets\nfrom .tasks import default_ecs_task_definition, default_ecs_task_metadata\nfrom .utils import sanitize_family\n\nTags = namedtuple("Tags", ["arn", "cluster", "cpu", "memory"])\n\n\n
[docs]class EcsRunLauncher(RunLauncher, ConfigurableClass):\n """RunLauncher that starts a task in ECS for each Dagster job run."""\n\n def __init__(\n self,\n inst_data=None,\n task_definition=None,\n container_name="run",\n secrets=None,\n secrets_tag="dagster",\n include_sidecars=False,\n ):\n self._inst_data = inst_data\n self.ecs = boto3.client("ecs")\n self.ec2 = boto3.resource("ec2")\n self.secrets_manager = boto3.client("secretsmanager")\n\n self.task_definition = task_definition\n self.container_name = container_name\n\n self.secrets = secrets or []\n if all(isinstance(secret, str) for secret in self.secrets):\n warnings.warn(\n "Setting secrets as a list of ARNs is deprecated. "\n "Secrets should instead follow the same structure as the ECS API: "\n "https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_Secret.html",\n DeprecationWarning,\n )\n self.secrets = get_secrets_from_arns(self.secrets_manager, self.secrets)\n else:\n self.secrets = {secret["name"]: secret["valueFrom"] for secret in self.secrets}\n\n self.secrets_tag = secrets_tag\n self.include_sidecars = include_sidecars\n\n if self.task_definition:\n task_definition = self.ecs.describe_task_definition(taskDefinition=task_definition)\n container_names = [\n container.get("name")\n for container in task_definition["taskDefinition"]["containerDefinitions"]\n ]\n check.invariant(\n container_name in container_names,\n f"Cannot override container '{container_name}' in task definition "\n f"'{self.task_definition}' because the container is not defined.",\n )\n self.task_definition = task_definition["taskDefinition"]["taskDefinitionArn"]\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {\n "task_definition": Field(\n StringSource,\n is_required=False,\n description=(\n "The task definition to use when launching new tasks. "\n "If none is provided, each run will create its own task "\n "definition."\n ),\n ),\n "container_name": Field(\n StringSource,\n is_required=False,\n default_value="run",\n description=(\n "The container name to use when launching new tasks. Defaults to 'run'."\n ),\n ),\n "secrets": Field(\n Array(\n ScalarUnion(\n scalar_type=str,\n non_scalar_schema={"name": StringSource, "valueFrom": StringSource},\n )\n ),\n is_required=False,\n description=(\n "An array of AWS Secrets Manager secrets. These secrets will "\n "be mounted as environment variabls in the container. See "\n "https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_Secret.html."\n ),\n ),\n "secrets_tag": Field(\n Noneable(StringSource),\n is_required=False,\n default_value="dagster",\n description=(\n "AWS Secrets Manager secrets with this tag will be mounted as "\n "environment variables in the container. Defaults to 'dagster'."\n ),\n ),\n "include_sidecars": Field(\n bool,\n is_required=False,\n default_value=False,\n description=(\n "Whether each run should use the same sidecars as the task that launches it. 
"\n "Defaults to False."\n ),\n ),\n }\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return EcsRunLauncher(inst_data=inst_data, **config_value)\n\n def _set_ecs_tags(self, run_id, task_arn):\n try:\n tags = [{"key": "dagster/run_id", "value": run_id}]\n self.ecs.tag_resource(resourceArn=task_arn, tags=tags)\n except ClientError:\n pass\n\n def _set_run_tags(self, run_id, task_arn):\n cluster = self._task_metadata().cluster\n tags = {"ecs/task_arn": task_arn, "ecs/cluster": cluster}\n self._instance.add_run_tags(run_id, tags)\n\n def _get_run_tags(self, run_id):\n run = self._instance.get_run_by_id(run_id)\n tags = run.tags if run else {}\n arn = tags.get("ecs/task_arn")\n cluster = tags.get("ecs/cluster")\n cpu = tags.get("ecs/cpu")\n memory = tags.get("ecs/memory")\n\n return Tags(arn, cluster, cpu, memory)\n\n def launch_run(self, context: LaunchRunContext) -> None:\n\n """\n Launch a run in an ECS task.\n\n Currently, Fargate is the only supported launchType and awsvpc is the\n only supported networkMode. These are the defaults that are set up by\n docker-compose when you use the Dagster ECS reference deployment.\n """\n run = context.pipeline_run\n family = sanitize_family(\n run.external_pipeline_origin.external_repository_origin.repository_location_origin.location_name\n )\n metadata = self._task_metadata()\n pipeline_origin = context.pipeline_code_origin\n image = pipeline_origin.repository_origin.container_image\n task_definition = self._task_definition(family, metadata, image)["family"]\n\n args = ExecuteRunArgs(\n pipeline_origin=pipeline_origin,\n pipeline_run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n )\n command = args.get_command_args()\n\n # Set cpu or memory overrides\n # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html\n cpu_and_memory_overrides = {}\n tags = self._get_run_tags(run.run_id)\n if tags.cpu:\n cpu_and_memory_overrides["cpu"] = tags.cpu\n if tags.memory:\n cpu_and_memory_overrides["memory"] = tags.memory\n\n # Run a task using the same network configuration as this processes's\n # task.\n response = self.ecs.run_task(\n taskDefinition=task_definition,\n cluster=metadata.cluster,\n overrides={\n "containerOverrides": [\n {\n "name": self.container_name,\n "command": command,\n # containerOverrides expects cpu/memory as integers\n **{k: int(v) for k, v in cpu_and_memory_overrides.items()},\n }\n ],\n # taskOverrides expects cpu/memory as strings\n **cpu_and_memory_overrides,\n },\n networkConfiguration={\n "awsvpcConfiguration": {\n "subnets": metadata.subnets,\n "assignPublicIp": metadata.assign_public_ip,\n "securityGroups": metadata.security_groups,\n }\n },\n launchType="FARGATE",\n )\n\n tasks = response["tasks"]\n\n if not tasks:\n failures = response["failures"]\n exceptions = []\n for failure in failures:\n arn = failure.get("arn")\n reason = failure.get("reason")\n detail = failure.get("detail")\n exceptions.append(Exception(f"Task {arn} failed because {reason}: {detail}"))\n raise Exception(exceptions)\n\n arn = tasks[0]["taskArn"]\n self._set_run_tags(run.run_id, task_arn=arn)\n self._set_ecs_tags(run.run_id, task_arn=arn)\n self._instance.report_engine_event(\n message="Launching run in ECS task",\n pipeline_run=run,\n engine_event_data=EngineEventData(\n [\n MetadataEntry("ECS Task ARN", value=arn),\n MetadataEntry("ECS Cluster", value=metadata.cluster),\n MetadataEntry("Run ID", value=run.run_id),\n ]\n ),\n cls=self.__class__,\n )\n\n def can_terminate(self, run_id):\n 
tags = self._get_run_tags(run_id)\n\n if not (tags.arn and tags.cluster):\n return False\n\n tasks = self.ecs.describe_tasks(tasks=[tags.arn], cluster=tags.cluster).get("tasks")\n if not tasks:\n return False\n\n status = tasks[0].get("lastStatus")\n if status and status != "STOPPED":\n return True\n\n return False\n\n def terminate(self, run_id):\n tags = self._get_run_tags(run_id)\n\n if not (tags.arn and tags.cluster):\n return False\n\n tasks = self.ecs.describe_tasks(tasks=[tags.arn], cluster=tags.cluster).get("tasks")\n if not tasks:\n return False\n\n status = tasks[0].get("lastStatus")\n if status == "STOPPED":\n return False\n\n self.ecs.stop_task(task=tags.arn, cluster=tags.cluster)\n return True\n\n def _task_definition(self, family, metadata, image):\n """\n Return the launcher's task definition if it's configured.\n\n Otherwise, a new task definition revision is registered for every run.\n First, the process that calls this method finds its own task\n definition. Next, it creates a new task definition based on its own\n but it overrides the image with the pipeline origin's image.\n """\n if self.task_definition:\n task_definition = self.ecs.describe_task_definition(taskDefinition=self.task_definition)\n return task_definition["taskDefinition"]\n\n secrets = merge_dicts(\n (\n get_tagged_secrets(self.secrets_manager, self.secrets_tag)\n if self.secrets_tag\n else {}\n ),\n self.secrets,\n )\n secrets_dict = (\n {"secrets": [{"name": key, "valueFrom": value} for key, value in secrets.items()]}\n if secrets\n else {}\n )\n\n task_definition = {}\n with suppress(ClientError):\n task_definition = self.ecs.describe_task_definition(taskDefinition=family)[\n "taskDefinition"\n ]\n\n container_definitions = task_definition.get("containerDefinitions", [{}])\n for container_definition in container_definitions:\n if (\n container_definition.get("image") == image\n and container_definition.get("name") == self.container_name\n and container_definition.get("secrets") == secrets_dict.get("secrets", [])\n ):\n return task_definition\n\n return default_ecs_task_definition(\n self.ecs,\n family,\n metadata,\n image,\n self.container_name,\n secrets=secrets_dict,\n include_sidecars=self.include_sidecars,\n )\n\n def _task_metadata(self):\n return default_ecs_task_metadata(self.ec2, self.ecs)
\n
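The run launcher above reads optional ``ecs/cpu`` and ``ecs/memory`` run tags and forwards them as ECS task overrides. A minimal sketch of supplying those tags from user code (the job and op names are hypothetical, and the values assume a Fargate-compatible cpu/memory combination):

.. code-block:: python

    from dagster import job, op

    @op
    def my_op():
        ...

    # Tags declared on the job become run tags, which the launcher's _get_run_tags
    # reads at launch time and passes through as task cpu/memory overrides.
    @job(tags={"ecs/cpu": "1024", "ecs/memory": "4096"})
    def my_ecs_job():
        my_op()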
", "current_page_name": "_modules/dagster_aws/ecs/launcher", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_aws.ecs.launcher"}}, "emr": {"emr": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_aws.emr.emr

\n# Portions of this file are copied from the Yelp MRJob project:\n#\n#   https://github.com/Yelp/mrjob\n#\n#\n# Copyright 2009-2013 Yelp, David Marin\n# Copyright 2015 Yelp\n# Copyright 2017 Yelp\n# Copyright 2018 Contributors\n# Copyright 2019 Yelp and Contributors\n#\n# Licensed under the Apache License, Version 2.0 (the "License");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an "AS IS" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport gzip\nimport re\nfrom io import BytesIO\nfrom urllib.parse import urlparse\n\nimport boto3\nfrom botocore.exceptions import WaiterError\nfrom dagster_aws.utils.mrjob.utils import _boto3_now, _wrap_aws_client, strip_microseconds\n\nimport dagster\nfrom dagster import check\n\nfrom .types import EMR_CLUSTER_TERMINATED_STATES, EmrClusterState, EmrStepState\n\n# if we can't create or find our own service role, use the one\n# created by the AWS console and CLI\n_FALLBACK_SERVICE_ROLE = "EMR_DefaultRole"\n\n# if we can't create or find our own instance profile, use the one\n# created by the AWS console and CLI\n_FALLBACK_INSTANCE_PROFILE = "EMR_EC2_DefaultRole"\n\n\n
[docs]class EmrError(Exception):\n pass
\n\n\n
[docs]class EmrJobRunner:\n def __init__(\n self,\n region,\n check_cluster_every=30,\n aws_access_key_id=None,\n aws_secret_access_key=None,\n ):\n """This object encapsulates various utilities for interacting with EMR clusters and invoking\n steps (jobs) on them.\n\n See also :py:class:`~dagster_aws.emr.EmrPySparkResource`, which wraps this job runner in a\n resource for pyspark workloads.\n\n Args:\n region (str): AWS region to use\n check_cluster_every (int, optional): How frequently to poll boto3 APIs for updates.\n Defaults to 30 seconds.\n aws_access_key_id ([type], optional): AWS access key ID. Defaults to None, which will\n use the default boto3 credentials chain.\n aws_secret_access_key ([type], optional): AWS secret access key. Defaults to None, which\n will use the default boto3 credentials chain.\n """\n self.region = check.str_param(region, "region")\n\n # This is in seconds\n self.check_cluster_every = check.int_param(check_cluster_every, "check_cluster_every")\n self.aws_access_key_id = check.opt_str_param(aws_access_key_id, "aws_access_key_id")\n self.aws_secret_access_key = check.opt_str_param(\n aws_secret_access_key, "aws_secret_access_key"\n )\n\n def make_emr_client(self):\n """Creates a boto3 EMR client. Construction is wrapped in retries in case client connection\n fails transiently.\n\n Returns:\n botocore.client.EMR: An EMR client\n """\n raw_emr_client = boto3.client(\n "emr",\n aws_access_key_id=self.aws_access_key_id,\n aws_secret_access_key=self.aws_secret_access_key,\n region_name=self.region,\n )\n return _wrap_aws_client(raw_emr_client, min_backoff=self.check_cluster_every)\n\n def cluster_id_from_name(self, cluster_name):\n """Get a cluster ID in the format "j-123ABC123ABC1" given a cluster name "my cool cluster".\n\n Args:\n cluster_name (str): The name of the cluster for which to find an ID\n\n Returns:\n str: The ID of the cluster\n\n Raises:\n EmrError: No cluster with the specified name exists\n """\n check.str_param(cluster_name, "cluster_name")\n\n response = self.make_emr_client().list_clusters().get("Clusters", [])\n for cluster in response:\n if cluster["Name"] == cluster_name:\n return cluster["Id"]\n\n raise EmrError(\n "cluster {cluster_name} not found in region {region}".format(\n cluster_name=cluster_name, region=self.region\n )\n )\n\n @staticmethod\n def construct_step_dict_for_command(step_name, command, action_on_failure="CONTINUE"):\n """Construct an EMR step definition which uses command-runner.jar to execute a shell command\n on the EMR master.\n\n Args:\n step_name (str): The name of the EMR step (will show up in the EMR UI)\n command (str): The shell command to execute with command-runner.jar\n action_on_failure (str, optional): Configure action on failure (e.g., continue, or\n terminate the cluster). 
Defaults to 'CONTINUE'.\n\n Returns:\n dict: Step definition dict\n """\n check.str_param(step_name, "step_name")\n check.list_param(command, "command", of_type=str)\n check.str_param(action_on_failure, "action_on_failure")\n\n return {\n "Name": step_name,\n "ActionOnFailure": action_on_failure,\n "HadoopJarStep": {"Jar": "command-runner.jar", "Args": command},\n }\n\n def add_tags(self, log, tags, cluster_id):\n """Add tags in the dict tags to cluster cluster_id.\n\n Args:\n log (DagsterLogManager): Log manager, for logging\n tags (dict): Dictionary of {'key': 'value'} tags\n cluster_id (str): The ID of the cluster to tag\n """\n check.dict_param(tags, "tags")\n check.str_param(cluster_id, "cluster_id")\n\n tags_items = sorted(tags.items())\n\n self.make_emr_client().add_tags(\n ResourceId=cluster_id, Tags=[dict(Key=k, Value=v) for k, v in tags_items]\n )\n\n log.info(\n "Added EMR tags to cluster %s: %s"\n % (cluster_id, ", ".join("%s=%s" % (tag, value) for tag, value in tags_items))\n )\n\n def run_job_flow(self, log, cluster_config):\n """Create an empty cluster on EMR, and return the ID of that job flow.\n\n Args:\n log (DagsterLogManager): Log manager, for logging\n cluster_config (dict): Configuration for this EMR job flow. See:\n https://docs.aws.amazon.com/emr/latest/APIReference/API_RunJobFlow.html\n\n Returns:\n str: The cluster ID, e.g. "j-ZKIY4CKQRX72"\n """\n check.dict_param(cluster_config, "cluster_config")\n\n log.debug("Creating Elastic MapReduce cluster")\n emr_client = self.make_emr_client()\n\n log.debug(\n "Calling run_job_flow(%s)"\n % (", ".join("%s=%r" % (k, v) for k, v in sorted(cluster_config.items())))\n )\n cluster_id = emr_client.run_job_flow(**cluster_config)["JobFlowId"]\n\n log.info("Created new cluster %s" % cluster_id)\n\n # set EMR tags for the cluster\n tags_items = cluster_config.get("Tags", [])\n tags = {k: v for k, v in tags_items}\n tags["__dagster_version"] = dagster.__version__\n self.add_tags(log, tags, cluster_id)\n return cluster_id\n\n def describe_cluster(self, cluster_id):\n """Thin wrapper over boto3 describe_cluster.\n\n Args:\n cluster_id (str): Cluster to inspect\n\n Returns:\n dict: The cluster info. See:\n https://docs.aws.amazon.com/emr/latest/APIReference/API_DescribeCluster.html\n """\n check.str_param(cluster_id, "cluster_id")\n\n emr_client = self.make_emr_client()\n return emr_client.describe_cluster(ClusterId=cluster_id)\n\n def describe_step(self, cluster_id, step_id):\n """Thin wrapper over boto3 describe_step.\n\n Args:\n cluster_id (str): Cluster to inspect\n step_id (str): Step ID to describe\n\n Returns:\n dict: The step info. 
See:\n https://docs.aws.amazon.com/emr/latest/APIReference/API_DescribeStep.html\n """\n check.str_param(cluster_id, "cluster_id")\n check.str_param(step_id, "step_id")\n\n emr_client = self.make_emr_client()\n return emr_client.describe_step(ClusterId=cluster_id, StepId=step_id)\n\n def add_job_flow_steps(self, log, cluster_id, step_defs):\n """Submit the constructed job flow steps to EMR for execution.\n\n Args:\n log (DagsterLogManager): Log manager, for logging\n cluster_id (str): The ID of the cluster\n step_defs (List[dict]): List of steps; see also `construct_step_dict_for_command`\n\n Returns:\n List[str]: list of step IDs.\n """\n check.str_param(cluster_id, "cluster_id")\n check.list_param(step_defs, "step_defs", of_type=dict)\n\n emr_client = self.make_emr_client()\n\n steps_kwargs = dict(JobFlowId=cluster_id, Steps=step_defs)\n log.debug(\n "Calling add_job_flow_steps(%s)"\n % ",".join(("%s=%r" % (k, v)) for k, v in steps_kwargs.items())\n )\n return emr_client.add_job_flow_steps(**steps_kwargs)["StepIds"]\n\n def is_emr_step_complete(self, log, cluster_id, emr_step_id):\n step = self.describe_step(cluster_id, emr_step_id)["Step"]\n step_state = EmrStepState(step["Status"]["State"])\n\n if step_state == EmrStepState.Pending:\n cluster = self.describe_cluster(cluster_id)["Cluster"]\n\n reason = _get_reason(cluster)\n reason_desc = (": %s" % reason) if reason else ""\n\n log.info("PENDING (cluster is %s%s)" % (cluster["Status"]["State"], reason_desc))\n return False\n\n elif step_state == EmrStepState.Running:\n time_running_desc = ""\n\n start = step["Status"]["Timeline"].get("StartDateTime")\n if start:\n time_running_desc = " for %s" % strip_microseconds(_boto3_now() - start)\n\n log.info("RUNNING%s" % time_running_desc)\n return False\n\n # we're done, will return at the end of this\n elif step_state == EmrStepState.Completed:\n log.info("COMPLETED")\n return True\n else:\n # step has failed somehow. *reason* seems to only be set\n # when job is cancelled (e.g. 'Job terminated')\n reason = _get_reason(step)\n reason_desc = (" (%s)" % reason) if reason else ""\n\n log.info("%s%s" % (step_state.value, reason_desc))\n\n # print cluster status; this might give more context\n # why step didn't succeed\n cluster = self.describe_cluster(cluster_id)["Cluster"]\n reason = _get_reason(cluster)\n reason_desc = (": %s" % reason) if reason else ""\n log.info(\n "Cluster %s %s %s%s"\n % (\n cluster["Id"],\n "was" if "ED" in cluster["Status"]["State"] else "is",\n cluster["Status"]["State"],\n reason_desc,\n )\n )\n\n if EmrClusterState(cluster["Status"]["State"]) in EMR_CLUSTER_TERMINATED_STATES:\n # was it caused by IAM roles?\n self._check_for_missing_default_iam_roles(log, cluster)\n\n # TODO: extract logs here to surface failure reason\n # See: https://github.com/dagster-io/dagster/issues/1954\n\n if step_state == EmrStepState.Failed:\n log.error("EMR step %s failed" % emr_step_id)\n\n raise EmrError("EMR step %s failed" % emr_step_id)\n\n def _check_for_missing_default_iam_roles(self, log, cluster):\n """If cluster couldn't start due to missing IAM roles, tell user what to do."""\n\n check.dict_param(cluster, "cluster")\n\n reason = _get_reason(cluster)\n if any(\n reason.endswith("/%s is invalid" % role)\n for role in (_FALLBACK_INSTANCE_PROFILE, _FALLBACK_SERVICE_ROLE)\n ):\n log.warning(\n "IAM roles are missing. 
See documentation for IAM roles on EMR here: "\n "https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-iam-roles.html"\n )\n\n def log_location_for_cluster(self, cluster_id):\n """EMR clusters are typically launched with S3 logging configured. This method inspects a\n cluster using boto3 describe_cluster to retrieve the log URI.\n\n Args:\n cluster_id (str): The cluster to inspect.\n\n Raises:\n EmrError: the log URI was missing (S3 log mirroring not enabled for this cluster)\n\n Returns:\n (str, str): log bucket and key\n """\n check.str_param(cluster_id, "cluster_id")\n\n # The S3 log URI is specified per job flow (cluster)\n log_uri = self.describe_cluster(cluster_id)["Cluster"].get("LogUri", None)\n\n # ugh, seriously boto3?! This will come back as string "None"\n if log_uri == "None" or log_uri is None:\n raise EmrError("Log URI not specified, cannot retrieve step execution logs")\n\n # For some reason the API returns an s3n:// protocol log URI instead of s3://\n log_uri = re.sub("^s3n", "s3", log_uri)\n log_uri_parsed = urlparse(log_uri)\n log_bucket = log_uri_parsed.netloc\n log_key_prefix = log_uri_parsed.path.lstrip("/")\n return log_bucket, log_key_prefix\n\n def retrieve_logs_for_step_id(self, log, cluster_id, step_id):\n """Retrieves stdout and stderr logs for the given step ID.\n\n Args:\n log (DagsterLogManager): Log manager, for logging\n cluster_id (str): EMR cluster ID\n step_id (str): EMR step ID for the job that was submitted.\n\n Returns\n (str, str): Tuple of stdout log string contents, and stderr log string contents\n """\n check.str_param(cluster_id, "cluster_id")\n check.str_param(step_id, "step_id")\n\n log_bucket, log_key_prefix = self.log_location_for_cluster(cluster_id)\n\n prefix = "{log_key_prefix}{cluster_id}/steps/{step_id}".format(\n log_key_prefix=log_key_prefix, cluster_id=cluster_id, step_id=step_id\n )\n stdout_log = self.wait_for_log(log, log_bucket, "{prefix}/stdout.gz".format(prefix=prefix))\n stderr_log = self.wait_for_log(log, log_bucket, "{prefix}/stderr.gz".format(prefix=prefix))\n return stdout_log, stderr_log\n\n def wait_for_log(self, log, log_bucket, log_key, waiter_delay=30, waiter_max_attempts=20):\n """Wait for gzipped EMR logs to appear on S3. 
Note that EMR syncs logs to S3 every 5\n minutes, so this may take a long time.\n\n Args:\n log_bucket (str): S3 bucket where log is expected to appear\n log_key (str): S3 key for the log file\n waiter_delay (int): How long to wait between attempts to check S3 for the log file\n waiter_max_attempts (int): Number of attempts before giving up on waiting\n\n Raises:\n EmrError: Raised if we waited the full duration and the logs did not appear\n\n Returns:\n str: contents of the log file\n """\n check.str_param(log_bucket, "log_bucket")\n check.str_param(log_key, "log_key")\n check.int_param(waiter_delay, "waiter_delay")\n check.int_param(waiter_max_attempts, "waiter_max_attempts")\n\n log.info(\n "Attempting to get log: s3://{log_bucket}/{log_key}".format(\n log_bucket=log_bucket, log_key=log_key\n )\n )\n\n s3 = _wrap_aws_client(boto3.client("s3"), min_backoff=self.check_cluster_every)\n waiter = s3.get_waiter("object_exists")\n try:\n waiter.wait(\n Bucket=log_bucket,\n Key=log_key,\n WaiterConfig={"Delay": waiter_delay, "MaxAttempts": waiter_max_attempts},\n )\n except WaiterError as err:\n raise EmrError("EMR log file did not appear on S3 after waiting") from err\n\n obj = BytesIO(s3.get_object(Bucket=log_bucket, Key=log_key)["Body"].read())\n gzip_file = gzip.GzipFile(fileobj=obj)\n return gzip_file.read().decode("utf-8")
\n\n\ndef _get_reason(cluster_or_step):\n """Get state change reason message."""\n # StateChangeReason is {} before the first state change\n return cluster_or_step["Status"]["StateChangeReason"].get("Message", "")\n
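A minimal sketch of driving ``EmrJobRunner`` directly; the region, cluster name, and shell command are placeholders, and a standard ``logging`` logger stands in for Dagster's log manager (the runner only calls ``debug``/``info``/``error`` on it):

.. code-block:: python

    import logging
    import time

    from dagster_aws.emr import EmrJobRunner

    log = logging.getLogger(__name__)

    runner = EmrJobRunner(region="us-west-2")
    cluster_id = runner.cluster_id_from_name("my cool cluster")  # assumes this cluster exists

    step_def = EmrJobRunner.construct_step_dict_for_command("print hostname", ["hostname"])
    step_id = runner.add_job_flow_steps(log, cluster_id, [step_def])[0]

    # Poll until the step finishes; is_emr_step_complete raises EmrError if the step fails.
    while not runner.is_emr_step_complete(log, cluster_id, step_id):
        time.sleep(runner.check_cluster_every)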
", "current_page_name": "_modules/dagster_aws/emr/emr", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_aws.emr.emr"}, "pyspark_step_launcher": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_aws.emr.pyspark_step_launcher

\nimport os\nimport pickle\nimport sys\nimport tempfile\nimport time\n\nimport boto3\nfrom botocore.exceptions import ClientError\nfrom dagster_aws.emr import EmrError, EmrJobRunner, emr_step_main\nfrom dagster_aws.emr.configs_spark import spark_config as get_spark_config\nfrom dagster_aws.utils.mrjob.log4j import parse_hadoop_log4j_records\n\nfrom dagster import Field, StringSource, check, resource\nfrom dagster.core.definitions.step_launcher import StepLauncher\nfrom dagster.core.errors import DagsterInvariantViolationError, raise_execution_interrupts\nfrom dagster.core.events import log_step_event\nfrom dagster.core.execution.plan.external_step import (\n    PICKLED_EVENTS_FILE_NAME,\n    PICKLED_STEP_RUN_REF_FILE_NAME,\n    step_context_to_step_run_ref,\n)\n\n# On EMR, Spark is installed here\nEMR_SPARK_HOME = "/usr/lib/spark/"\n\nCODE_ZIP_NAME = "code.zip"\n\n\n
[docs]@resource(\n    {\n        "spark_config": get_spark_config(),\n        "cluster_id": Field(\n            StringSource, description="Name of the job flow (cluster) on which to execute."\n        ),\n        "region_name": Field(StringSource, description="The AWS region that the cluster is in."),\n        "action_on_failure": Field(\n            str,\n            is_required=False,\n            default_value="CANCEL_AND_WAIT",\n            description="The EMR action to take when the cluster step fails: "\n            "https://docs.aws.amazon.com/emr/latest/APIReference/API_StepConfig.html",\n        ),\n        "staging_bucket": Field(\n            StringSource,\n            is_required=True,\n            description="S3 bucket to use for passing files between the plan process and EMR "\n            "process.",\n        ),\n        "staging_prefix": Field(\n            StringSource,\n            is_required=False,\n            default_value="emr_staging",\n            description="S3 key prefix inside the staging_bucket to use for files passed between "\n            "the plan process and EMR process",\n        ),\n        "wait_for_logs": Field(\n            bool,\n            is_required=False,\n            default_value=False,\n            description="If set, the system will wait for EMR logs to appear on S3. Note that logs "\n            "are copied every 5 minutes, so enabling this will add several minutes to the job "\n            "runtime.",\n        ),\n        "local_job_package_path": Field(\n            StringSource,\n            is_required=False,\n            description="Absolute path to the package that contains the job definition(s) "\n            "whose steps will execute remotely on EMR. This is a path on the local filesystem of "\n            "the process executing the job. The expectation is that this package will "\n            "also be available on the python path of the launched process running the Spark step "\n            "on EMR, either deployed on step launch via the deploy_local_job_package option, "\n            "referenced on s3 via the s3_job_package_path option, or installed on the cluster "\n            "via bootstrap actions.",\n        ),\n        "local_pipeline_package_path": Field(\n            StringSource,\n            is_required=False,\n            description="(legacy) Absolute path to the package that contains the pipeline definition(s) "\n            "whose steps will execute remotely on EMR. This is a path on the local filesystem of "\n            "the process executing the pipeline. The expectation is that this package will "\n            "also be available on the python path of the launched process running the Spark step "\n            "on EMR, either deployed on step launch via the deploy_local_pipeline_package option, "\n            "referenced on s3 via the s3_pipeline_package_path option, or installed on the cluster "\n            "via bootstrap actions.",\n        ),\n        "deploy_local_job_package": Field(\n            bool,\n            default_value=False,\n            is_required=False,\n            description="If set, before every step run, the launcher will zip up all the code in "\n            "local_job_package_path, upload it to s3, and pass it to spark-submit's "\n            "--py-files option. This gives the remote process access to up-to-date user code. "\n            "If not set, the assumption is that some other mechanism is used for distributing code "\n            "to the EMR cluster. If this option is set to True, s3_job_package_path should "\n            "not also be set.",\n        ),\n        "deploy_local_pipeline_package": Field(\n            bool,\n            default_value=False,\n            is_required=False,\n            description="(legacy) If set, before every step run, the launcher will zip up all the code in "\n            "local_job_package_path, upload it to s3, and pass it to spark-submit's "\n            "--py-files option. This gives the remote process access to up-to-date user code. "\n            "If not set, the assumption is that some other mechanism is used for distributing code "\n            "to the EMR cluster. 
If this option is set to True, s3_job_package_path should "\n            "not also be set.",\n        ),\n        "s3_job_package_path": Field(\n            StringSource,\n            is_required=False,\n            description="If set, this path will be passed to the --py-files option of spark-submit. "\n            "This should usually be a path to a zip file. If this option is set, "\n            "deploy_local_job_package should not be set to True.",\n        ),\n        "s3_pipeline_package_path": Field(\n            StringSource,\n            is_required=False,\n            description="If set, this path will be passed to the --py-files option of spark-submit. "\n            "This should usually be a path to a zip file. If this option is set, "\n            "deploy_local_pipeline_package should not be set to True.",\n        ),\n    }\n)\ndef emr_pyspark_step_launcher(context):\n\n    # Resolve legacy arguments\n    if context.resource_config.get("local_job_package_path") and context.resource_config.get(\n        "local_pipeline_package_path"\n    ):\n        raise DagsterInvariantViolationError(\n            "Provided both ``local_job_package_path`` and legacy version "\n            "``local_pipeline_package_path`` arguments to ``emr_pyspark_step_launcher`` "\n            "resource. Please choose one or the other."\n        )\n\n    if not context.resource_config.get(\n        "local_job_package_path"\n    ) and not context.resource_config.get("local_pipeline_package_path"):\n        raise DagsterInvariantViolationError(\n            "For resource ``emr_pyspark_step_launcher``, no config value provided for required "\n            "schema entry ``local_job_package_path``."\n        )\n\n    local_job_package_path = context.resource_config.get(\n        "local_job_package_path"\n    ) or context.resource_config.get("local_pipeline_package_path")\n\n    if context.resource_config.get("deploy_local_job_package") and context.resource_config.get(\n        "deploy_local_pipeline_package"\n    ):\n        raise DagsterInvariantViolationError(\n            "Provided both ``deploy_local_job_package`` and legacy version "\n            "``deploy_local_pipeline_package`` arguments to ``emr_pyspark_step_launcher`` "\n            "resource. Please choose one or the other."\n        )\n\n    deploy_local_job_package = context.resource_config.get(\n        "deploy_local_job_package"\n    ) or context.resource_config.get("deploy_local_pipeline_package")\n\n    if context.resource_config.get("s3_job_package_path") and context.resource_config.get(\n        "s3_pipeline_package_path"\n    ):\n        raise DagsterInvariantViolationError(\n            "Provided both ``s3_job_package_path`` and legacy version "\n            "``s3_pipeline_package_path`` arguments to ``emr_pyspark_step_launcher`` "\n            "resource. Please choose one or the other."\n        )\n\n    s3_job_package_path = context.resource_config.get(\n        "s3_job_package_path"\n    ) or context.resource_config.get("s3_pipeline_package_path")\n\n    return EmrPySparkStepLauncher(\n        region_name=context.resource_config.get("region_name"),\n        staging_bucket=context.resource_config.get("staging_bucket"),\n        staging_prefix=context.resource_config.get("staging_prefix"),\n        wait_for_logs=context.resource_config.get("wait_for_logs"),\n        action_on_failure=context.resource_config.get("action_on_failure"),\n        cluster_id=context.resource_config.get("cluster_id"),\n        spark_config=context.resource_config.get("spark_config"),\n        local_job_package_path=local_job_package_path,\n        deploy_local_job_package=deploy_local_job_package,\n        s3_job_package_path=s3_job_package_path,\n    )
\n\n\nemr_pyspark_step_launcher.__doc__ = "\\n".join(\n "- **" + option + "**: " + (field.description or "")\n for option, field in emr_pyspark_step_launcher.config_schema.config_type.fields.items()\n)\n\n\nclass EmrPySparkStepLauncher(StepLauncher):\n def __init__(\n self,\n region_name,\n staging_bucket,\n staging_prefix,\n wait_for_logs,\n action_on_failure,\n cluster_id,\n spark_config,\n local_job_package_path,\n deploy_local_job_package,\n s3_job_package_path=None,\n ):\n self.region_name = check.str_param(region_name, "region_name")\n self.staging_bucket = check.str_param(staging_bucket, "staging_bucket")\n self.staging_prefix = check.str_param(staging_prefix, "staging_prefix")\n self.wait_for_logs = check.bool_param(wait_for_logs, "wait_for_logs")\n self.action_on_failure = check.str_param(action_on_failure, "action_on_failure")\n self.cluster_id = check.str_param(cluster_id, "cluster_id")\n self.spark_config = spark_config\n\n check.invariant(\n not deploy_local_job_package or not s3_job_package_path,\n "If deploy_local_job_package is set to True, s3_job_package_path should not "\n "also be set.",\n )\n\n self.local_job_package_path = check.str_param(\n local_job_package_path, "local_job_package_path"\n )\n self.deploy_local_job_package = check.bool_param(\n deploy_local_job_package, "deploy_local_job_package"\n )\n self.s3_job_package_path = check.opt_str_param(s3_job_package_path, "s3_job_package_path")\n\n self.emr_job_runner = EmrJobRunner(region=self.region_name)\n\n def _post_artifacts(self, log, step_run_ref, run_id, step_key):\n """\n Synchronize the step run ref and pyspark code to an S3 staging bucket for use on EMR.\n\n For the zip file, consider the following toy example:\n\n # Folder: my_pyspark_project/\n # a.py\n def foo():\n print(1)\n\n # b.py\n def bar():\n print(2)\n\n # main.py\n from a import foo\n from b import bar\n\n foo()\n bar()\n\n This will zip up `my_pyspark_project/` as `my_pyspark_project.zip`. 
Then, when running\n `spark-submit --py-files my_pyspark_project.zip emr_step_main.py` on EMR this will\n print 1, 2.\n """\n from dagster_pyspark.utils import build_pyspark_zip\n\n with tempfile.TemporaryDirectory() as temp_dir:\n s3 = boto3.client("s3", region_name=self.region_name)\n\n # Upload step run ref\n def _upload_file_to_s3(local_path, s3_filename):\n key = self._artifact_s3_key(run_id, step_key, s3_filename)\n s3_uri = self._artifact_s3_uri(run_id, step_key, s3_filename)\n log.debug(\n "Uploading file {local_path} to {s3_uri}".format(\n local_path=local_path, s3_uri=s3_uri\n )\n )\n s3.upload_file(Filename=local_path, Bucket=self.staging_bucket, Key=key)\n\n # Upload main file.\n # The remote Dagster installation should also have the file, but locating it there\n # could be a pain.\n main_local_path = self._main_file_local_path()\n _upload_file_to_s3(main_local_path, self._main_file_name())\n\n if self.deploy_local_job_package:\n # Zip and upload package containing job\n zip_local_path = os.path.join(temp_dir, CODE_ZIP_NAME)\n\n build_pyspark_zip(zip_local_path, self.local_job_package_path)\n _upload_file_to_s3(zip_local_path, CODE_ZIP_NAME)\n\n # Create step run ref pickle file\n step_run_ref_local_path = os.path.join(temp_dir, PICKLED_STEP_RUN_REF_FILE_NAME)\n with open(step_run_ref_local_path, "wb") as step_pickle_file:\n pickle.dump(step_run_ref, step_pickle_file)\n\n _upload_file_to_s3(step_run_ref_local_path, PICKLED_STEP_RUN_REF_FILE_NAME)\n\n def launch_step(self, step_context, prior_attempts_count):\n step_run_ref = step_context_to_step_run_ref(\n step_context, prior_attempts_count, self.local_job_package_path\n )\n\n run_id = step_context.pipeline_run.run_id\n log = step_context.log\n\n step_key = step_run_ref.step_key\n self._post_artifacts(log, step_run_ref, run_id, step_key)\n\n emr_step_def = self._get_emr_step_def(run_id, step_key, step_context.solid.name)\n emr_step_id = self.emr_job_runner.add_job_flow_steps(log, self.cluster_id, [emr_step_def])[\n 0\n ]\n\n yield from self.wait_for_completion_and_log(\n log, run_id, step_key, emr_step_id, step_context\n )\n\n def wait_for_completion_and_log(self, log, run_id, step_key, emr_step_id, step_context):\n s3 = boto3.resource("s3", region_name=self.region_name)\n try:\n for event in self.wait_for_completion(log, s3, run_id, step_key, emr_step_id):\n log_step_event(step_context, event)\n yield event\n except EmrError as emr_error:\n if self.wait_for_logs:\n self._log_logs_from_s3(log, emr_step_id)\n raise emr_error\n\n if self.wait_for_logs:\n self._log_logs_from_s3(log, emr_step_id)\n\n def wait_for_completion(self, log, s3, run_id, step_key, emr_step_id, check_interval=15):\n """We want to wait for the EMR steps to complete, and while that's happening, we want to\n yield any events that have been written to S3 for us by the remote process.\n After the the EMR steps complete, we want a final chance to fetch events before finishing\n the step.\n """\n done = False\n all_events = []\n # If this is being called within a `capture_interrupts` context, allow interrupts\n # while waiting for the pyspark execution to complete, so that we can terminate slow or\n # hanging steps\n while not done:\n with raise_execution_interrupts():\n time.sleep(check_interval) # AWS rate-limits us if we poll it too often\n done = self.emr_job_runner.is_emr_step_complete(log, self.cluster_id, emr_step_id)\n\n all_events_new = self.read_events(s3, run_id, step_key)\n\n if len(all_events_new) > len(all_events):\n for i in range(len(all_events), 
len(all_events_new)):\n yield all_events_new[i]\n all_events = all_events_new\n\n def read_events(self, s3, run_id, step_key):\n events_s3_obj = s3.Object( # pylint: disable=no-member\n self.staging_bucket, self._artifact_s3_key(run_id, step_key, PICKLED_EVENTS_FILE_NAME)\n )\n\n try:\n events_data = events_s3_obj.get()["Body"].read()\n return pickle.loads(events_data)\n except ClientError as ex:\n # The file might not be there yet, which is fine\n if ex.response["Error"]["Code"] == "NoSuchKey":\n return []\n else:\n raise ex\n\n def _log_logs_from_s3(self, log, emr_step_id):\n """Retrieves the logs from the remote PySpark process that EMR posted to S3 and logs\n them to the given log."""\n stdout_log, stderr_log = self.emr_job_runner.retrieve_logs_for_step_id(\n log, self.cluster_id, emr_step_id\n )\n # Since stderr is YARN / Hadoop Log4J output, parse and reformat those log lines for\n # Dagster's logging system.\n records = parse_hadoop_log4j_records(stderr_log)\n for record in records:\n if record.level:\n log.log(\n level=record.level,\n msg="".join(["Spark Driver stderr: ", record.logger, ": ", record.message]),\n )\n else:\n log.debug(f"Spark Driver stderr: {record.message}")\n\n sys.stdout.write(\n "---------- Spark Driver stdout: ----------\\n"\n + stdout_log\n + "\\n"\n + "---------- End of Spark Driver stdout ----------\\n"\n )\n\n def _get_emr_step_def(self, run_id, step_key, solid_name):\n """From the local Dagster instance, construct EMR steps that will kick off execution on a\n remote EMR cluster.\n """\n from dagster_spark.utils import flatten_dict, format_for_cli\n\n action_on_failure = self.action_on_failure\n\n # Execute Solid via spark-submit\n conf = dict(flatten_dict(self.spark_config))\n conf["spark.app.name"] = conf.get("spark.app.name", solid_name)\n\n check.invariant(\n conf.get("spark.master", "yarn") == "yarn",\n desc="spark.master is configured as %s; cannot set Spark master on EMR to anything "\n 'other than "yarn"' % conf.get("spark.master"),\n )\n\n command = (\n [\n EMR_SPARK_HOME + "bin/spark-submit",\n "--master",\n "yarn",\n "--deploy-mode",\n conf.get("spark.submit.deployMode", "client"),\n ]\n + format_for_cli(list(flatten_dict(conf)))\n + [\n "--py-files",\n self._artifact_s3_uri(run_id, step_key, CODE_ZIP_NAME),\n self._artifact_s3_uri(run_id, step_key, self._main_file_name()),\n self.staging_bucket,\n self._artifact_s3_key(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),\n ]\n )\n\n return EmrJobRunner.construct_step_dict_for_command(\n "Execute Solid/Op %s" % solid_name, command, action_on_failure=action_on_failure\n )\n\n def _main_file_name(self):\n return os.path.basename(self._main_file_local_path())\n\n def _main_file_local_path(self):\n return emr_step_main.__file__\n\n def _artifact_s3_uri(self, run_id, step_key, filename):\n key = self._artifact_s3_key(run_id, step_key, filename)\n return "s3://{bucket}/{key}".format(bucket=self.staging_bucket, key=key)\n\n def _artifact_s3_key(self, run_id, step_key, filename):\n return "/".join([self.staging_prefix, run_id, step_key, os.path.basename(filename)])\n
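A minimal sketch (assumed placeholder bucket, cluster id, and package path, and assuming the usual top-level ``dagster_aws.emr`` export) of attaching the step launcher to a job; ``spark_config`` is omitted on the assumption that it is optional, and a real deployment would typically also provide a ``pyspark`` resource alongside the launcher:

.. code-block:: python

    from dagster import job, op
    from dagster_aws.emr import emr_pyspark_step_launcher

    @op(required_resource_keys={"pyspark_step_launcher"})
    def heavy_spark_op(context):
        ...

    @job(
        resource_defs={
            "pyspark_step_launcher": emr_pyspark_step_launcher.configured(
                {
                    "cluster_id": {"env": "EMR_CLUSTER_ID"},
                    "region_name": "us-west-2",
                    "staging_bucket": "my-staging-bucket",
                    "local_job_package_path": ".",
                    "deploy_local_job_package": True,
                }
            ),
        }
    )
    def my_emr_job():
        heavy_spark_op()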
", "current_page_name": "_modules/dagster_aws/emr/pyspark_step_launcher", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_aws.emr.pyspark_step_launcher"}, "types": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_aws.emr.types

\nfrom enum import Enum as PyEnum\n\nfrom dagster import Enum, EnumValue\n\nEbsVolumeType = Enum(\n    name="EbsVolumeType", enum_values=[EnumValue("gp2"), EnumValue("io1"), EnumValue("standard")]\n)\n\n\n
[docs]class EmrClusterState(PyEnum):\n Starting = "STARTING"\n Bootstrapping = "BOOTSTRAPPING"\n Running = "RUNNING"\n Waiting = "WAITING"\n Terminating = "TERMINATING"\n Terminated = "TERMINATED"\n TerminatedWithErrors = "TERMINATED_WITH_ERRORS"
\n\n\nEMR_CLUSTER_TERMINATED_STATES = [\n EmrClusterState.Terminating,\n EmrClusterState.Terminated,\n EmrClusterState.TerminatedWithErrors,\n]\n\nEMR_CLUSTER_DONE_STATES = EMR_CLUSTER_TERMINATED_STATES + [EmrClusterState.Waiting]\n\n\n
[docs]class EmrStepState(PyEnum):\n Pending = "PENDING"\n Running = "RUNNING"\n Continue = "CONTINUE"\n Completed = "COMPLETED"\n Cancelled = "CANCELLED"\n Failed = "FAILED"\n Interrupted = "INTERRUPTED"
\n\n\nEmrActionOnFailure = Enum(\n name="EmrActionOnFailure",\n enum_values=[\n EnumValue("TERMINATE_JOB_FLOW"),\n EnumValue("TERMINATE_CLUSTER"),\n EnumValue("CANCEL_AND_WAIT"),\n EnumValue("CONTINUE"),\n ],\n)\n\nEmrAdjustmentType = Enum(\n name="EmrAdjustmentType",\n enum_values=[\n EnumValue("CHANGE_IN_CAPACITY"),\n EnumValue("PERCENT_CHANGE_IN_CAPACITY"),\n EnumValue("EXACT_CAPACITY"),\n ],\n)\n\nEmrComparisonOperator = Enum(\n name="EmrComparisonOperator",\n enum_values=[\n EnumValue("GREATER_THAN_OR_EQUAL"),\n EnumValue("GREATER_THAN"),\n EnumValue("LESS_THAN"),\n EnumValue("LESS_THAN_OR_EQUAL"),\n ],\n)\n\nEmrInstanceRole = Enum(\n name="EmrInstanceRole", enum_values=[EnumValue("MASTER"), EnumValue("CORE"), EnumValue("TASK")]\n)\n\nEmrMarket = Enum(name="EmrMarket", enum_values=[EnumValue("ON_DEMAND"), EnumValue("SPOT")])\n\nEmrRepoUpgradeOnBoot = Enum(\n name="EmrRepoUpgradeOnBoot", enum_values=[EnumValue("SECURITY"), EnumValue("NONE")]\n)\n\nEmrScaleDownBehavior = Enum(\n name="EmrScaleDownBehavior",\n enum_values=[\n EnumValue("TERMINATE_AT_INSTANCE_HOUR"),\n EnumValue("TERMINATE_AT_TASK_COMPLETION"),\n ],\n)\n\nEmrStatistic = Enum(\n name="EmrStatistic",\n enum_values=[\n EnumValue("SAMPLE_COUNT"),\n EnumValue("AVERAGE"),\n EnumValue("SUM"),\n EnumValue("MINIMUM"),\n EnumValue("MAXIMUM"),\n ],\n)\n\nEmrSupportedProducts = Enum(\n name="EmrSupportedProducts", enum_values=[EnumValue("mapr-m3"), EnumValue("mapr-m5")]\n)\n\nEmrTimeoutAction = Enum(\n name="EmrTimeoutAction",\n enum_values=[EnumValue("SWITCH_TO_ON_DEMAND"), EnumValue("TERMINATE_CLUSTER")],\n)\n\nEmrUnit = Enum(\n name="EmrUnit",\n enum_values=[\n EnumValue("NONE"),\n EnumValue("SECONDS"),\n EnumValue("MICRO_SECONDS"),\n EnumValue("MILLI_SECONDS"),\n EnumValue("BYTES"),\n EnumValue("KILO_BYTES"),\n EnumValue("MEGA_BYTES"),\n EnumValue("GIGA_BYTES"),\n EnumValue("TERA_BYTES"),\n EnumValue("BITS"),\n EnumValue("KILO_BITS"),\n EnumValue("MEGA_BITS"),\n EnumValue("GIGA_BITS"),\n EnumValue("TERA_BITS"),\n EnumValue("PERCENT"),\n EnumValue("COUNT"),\n EnumValue("BYTES_PER_SECOND"),\n EnumValue("KILO_BYTES_PER_SECOND"),\n EnumValue("MEGA_BYTES_PER_SECOND"),\n EnumValue("GIGA_BYTES_PER_SECOND"),\n EnumValue("TERA_BYTES_PER_SECOND"),\n EnumValue("BITS_PER_SECOND"),\n EnumValue("KILO_BITS_PER_SECOND"),\n EnumValue("MEGA_BITS_PER_SECOND"),\n EnumValue("GIGA_BITS_PER_SECOND"),\n EnumValue("TERA_BITS_PER_SECOND"),\n EnumValue("COUNT_PER_SECOND"),\n ],\n)\n
", "current_page_name": "_modules/dagster_aws/emr/types", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_aws.emr.types"}}, "redshift": {"resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_aws.redshift.resources

\nimport abc\nfrom contextlib import contextmanager\n\nimport psycopg2\nimport psycopg2.extensions\n\nfrom dagster import Field, IntSource, StringSource, check, resource\n\n\nclass RedshiftError(Exception):\n    pass\n\n\nclass _BaseRedshiftResource(abc.ABC):\n    def __init__(self, context):  # pylint: disable=too-many-locals\n        # Extract parameters from resource config\n        self.conn_args = {\n            k: context.resource_config.get(k)\n            for k in (\n                "host",\n                "port",\n                "user",\n                "password",\n                "database",\n                "schema",\n                "connect_timeout",\n                "sslmode",\n            )\n            if context.resource_config.get(k) is not None\n        }\n\n        self.autocommit = context.resource_config.get("autocommit")\n        self.log = context.log\n\n    @abc.abstractmethod\n    def execute_query(self, query, fetch_results=False, cursor_factory=None, error_callback=None):\n        pass\n\n    @abc.abstractmethod\n    def execute_queries(\n        self, queries, fetch_results=False, cursor_factory=None, error_callback=None\n    ):\n        pass\n\n\nclass RedshiftResource(_BaseRedshiftResource):\n    def execute_query(self, query, fetch_results=False, cursor_factory=None, error_callback=None):\n        """Synchronously execute a single query against Redshift. Will return a list of rows, where\n        each row is a tuple of values, e.g. SELECT 1 will return [(1,)].\n\n        Args:\n            query (str): The query to execute.\n            fetch_results (Optional[bool]): Whether to return the results of executing the query.\n                Defaults to False, in which case the query will be executed without retrieving the\n                results.\n            cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative\n                cursor_factory; defaults to None. Will be used when constructing the cursor.\n            error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A\n                callback function, invoked when an exception is encountered during query execution;\n                this is intended to support executing additional queries to provide diagnostic\n                information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. If no\n                function is provided, exceptions during query execution will be raised directly.\n\n        Returns:\n            Optional[List[Tuple[Any, ...]]]: Results of the query, as a list of tuples, when\n                fetch_results is set. 
Otherwise return None.\n        """\n        check.str_param(query, "query")\n        check.bool_param(fetch_results, "fetch_results")\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n        check.opt_callable_param(error_callback, "error_callback")\n\n        with self._get_conn() as conn:\n            with self._get_cursor(conn, cursor_factory=cursor_factory) as cursor:\n                try:\n                    self.log.info("Executing query '{query}'".format(query=query))\n                    cursor.execute(query)\n\n                    if fetch_results and cursor.rowcount > 0:\n                        return cursor.fetchall()\n                    else:\n                        self.log.info("Empty result from query")\n\n                except Exception as e:\n                    # If autocommit is disabled or not set (it is disabled by default), Redshift\n                    # will be in the middle of a transaction at exception time, and because of\n                    # the failure the current transaction will not accept any further queries.\n                    #\n                    # This conn.commit() call closes the open transaction before handing off\n                    # control to the error callback, so that the user can issue additional\n                    # queries. Notably, for e.g. pg_last_copy_id() to work, it requires you to\n                    # use the same conn/cursor, so you have to do this conn.commit() to ensure\n                    # things are in a usable state in the error callback.\n                    if not self.autocommit:\n                        conn.commit()\n\n                    if error_callback is not None:\n                        error_callback(e, cursor, self.log)\n                    else:\n                        raise\n\n    def execute_queries(\n        self, queries, fetch_results=False, cursor_factory=None, error_callback=None\n    ):\n        """Synchronously execute a list of queries against Redshift. Will return a list of list of\n        rows, where each row is a tuple of values, e.g. ['SELECT 1', 'SELECT 1'] will return\n        [[(1,)], [(1,)]].\n\n        Args:\n            queries (List[str]): The queries to execute.\n            fetch_results (Optional[bool]): Whether to return the results of executing the query.\n                Defaults to False, in which case the query will be executed without retrieving the\n                results.\n            cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative\n            cursor_factory; defaults to None. Will be used when constructing the cursor.\n            error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A\n                callback function, invoked when an exception is encountered during query execution;\n                this is intended to support executing additional queries to provide diagnostic\n                information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. If no\n                function is provided, exceptions during query execution will be raised directly.\n\n        Returns:\n            Optional[List[List[Tuple[Any, ...]]]]: Results of the query, as a list of list of\n                tuples, when fetch_results is set. 
Otherwise return None.\n        """\n        check.list_param(queries, "queries", of_type=str)\n        check.bool_param(fetch_results, "fetch_results")\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n        check.opt_callable_param(error_callback, "error_callback")\n\n        results = []\n        with self._get_conn() as conn:\n            with self._get_cursor(conn, cursor_factory=cursor_factory) as cursor:\n                for query in queries:\n                    try:\n                        self.log.info("Executing query '{query}'".format(query=query))\n                        cursor.execute(query)\n\n                        if fetch_results and cursor.rowcount > 0:\n                            results.append(cursor.fetchall())\n                        else:\n                            results.append([])\n                            self.log.info("Empty result from query")\n\n                    except Exception as e:\n                        # If autocommit is disabled or not set (it is disabled by default), Redshift\n                        # will be in the middle of a transaction at exception time, and because of\n                        # the failure the current transaction will not accept any further queries.\n                        #\n                        # This conn.commit() call closes the open transaction before handing off\n                        # control to the error callback, so that the user can issue additional\n                        # queries. Notably, for e.g. pg_last_copy_id() to work, it requires you to\n                        # use the same conn/cursor, so you have to do this conn.commit() to ensure\n                        # things are in a usable state in the error callback.\n                        if not self.autocommit:\n                            conn.commit()\n\n                        if error_callback is not None:\n                            error_callback(e, cursor, self.log)\n                        else:\n                            raise\n\n        if fetch_results:\n            return results\n\n    @contextmanager\n    def _get_conn(self):\n        conn = None\n        try:\n            conn = psycopg2.connect(**self.conn_args)\n            yield conn\n        finally:\n            if conn:\n                conn.close()\n\n    @contextmanager\n    def _get_cursor(self, conn, cursor_factory=None):\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n\n        # Could be none, in which case we should respect the connection default. Otherwise\n        # explicitly set to true/false.\n        if self.autocommit is not None:\n            conn.autocommit = self.autocommit\n\n        with conn:\n            with conn.cursor(cursor_factory=cursor_factory) as cursor:\n                yield cursor\n\n            # If autocommit is set, we'll commit after each and every query execution. 
Otherwise, we\n            # want to do a final commit after we're wrapped up executing the full set of one or more\n            # queries.\n            if not self.autocommit:\n                conn.commit()\n\n\nclass FakeRedshiftResource(_BaseRedshiftResource):\n    QUERY_RESULT = [(1,)]\n\n    def execute_query(self, query, fetch_results=False, cursor_factory=None, error_callback=None):\n        """Fake for execute_query; returns [self.QUERY_RESULT]\n\n        Args:\n            query (str): The query to execute.\n            fetch_results (Optional[bool]): Whether to return the results of executing the query.\n                Defaults to False, in which case the query will be executed without retrieving the\n                results.\n            cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative\n                cursor_factory; defaults to None. Will be used when constructing the cursor.\n            error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A\n                callback function, invoked when an exception is encountered during query execution;\n                this is intended to support executing additional queries to provide diagnostic\n                information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. If no\n                function is provided, exceptions during query execution will be raised directly.\n\n        Returns:\n            Optional[List[Tuple[Any, ...]]]: Results of the query, as a list of tuples, when\n                fetch_results is set. Otherwise return None.\n        """\n        check.str_param(query, "query")\n        check.bool_param(fetch_results, "fetch_results")\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n        check.opt_callable_param(error_callback, "error_callback")\n\n        self.log.info("Executing query '{query}'".format(query=query))\n        if fetch_results:\n            return self.QUERY_RESULT\n\n    def execute_queries(\n        self, queries, fetch_results=False, cursor_factory=None, error_callback=None\n    ):\n        """Fake for execute_queries; returns [self.QUERY_RESULT] * 3\n\n        Args:\n            queries (List[str]): The queries to execute.\n            fetch_results (Optional[bool]): Whether to return the results of executing the query.\n                Defaults to False, in which case the query will be executed without retrieving the\n                results.\n            cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative\n                cursor_factory; defaults to None. Will be used when constructing the cursor.\n            error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A\n                callback function, invoked when an exception is encountered during query execution;\n                this is intended to support executing additional queries to provide diagnostic\n                information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. If no\n                function is provided, exceptions during query execution will be raised directly.\n\n        Returns:\n            Optional[List[List[Tuple[Any, ...]]]]: Results of the query, as a list of list of\n                tuples, when fetch_results is set. 
Otherwise return None.\n        """\n        check.list_param(queries, "queries", of_type=str)\n        check.bool_param(fetch_results, "fetch_results")\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n        check.opt_callable_param(error_callback, "error_callback")\n\n        for query in queries:\n            self.log.info("Executing query '{query}'".format(query=query))\n        if fetch_results:\n            return [self.QUERY_RESULT] * 3\n\n\ndef define_redshift_config():\n    """Redshift configuration. See the Redshift documentation for reference:\n\n    https://docs.aws.amazon.com/redshift/latest/mgmt/connecting-to-cluster.html\n    """\n\n    return {\n        "host": Field(StringSource, description="Redshift host", is_required=True),\n        "port": Field(\n            IntSource, description="Redshift port", is_required=False, default_value=5439\n        ),\n        "user": Field(\n            StringSource,\n            description="Username for Redshift connection",\n            is_required=False,\n        ),\n        "password": Field(\n            StringSource,\n            description="Password for Redshift connection",\n            is_required=False,\n        ),\n        "database": Field(\n            StringSource,\n            description="Name of the default database to use. After login, you can use USE DATABASE"\n            " to change the database.",\n            is_required=False,\n        ),\n        "schema": Field(\n            StringSource,\n            description="Name of the default schema to use. After login, you can use USE SCHEMA to "\n            "change the schema.",\n            is_required=False,\n        ),\n        "autocommit": Field(\n            bool,\n            description="None by default, which honors the Redshift parameter AUTOCOMMIT. Set to "\n            "True or False to enable or disable autocommit mode in the session, respectively.",\n            is_required=False,\n        ),\n        "connect_timeout": Field(\n            int,\n            description="Connection timeout in seconds. 5 seconds by default",\n            is_required=False,\n            default_value=5,\n        ),\n        "sslmode": Field(\n            str,\n            description="SSL mode to use. See the Redshift documentation for more information on "\n            "usage: https://docs.aws.amazon.com/redshift/latest/mgmt/connecting-ssl-support.html",\n            is_required=False,\n            default_value="require",\n        ),\n    }\n\n\n
[docs]@resource(\n config_schema=define_redshift_config(),\n description="Resource for connecting to the Redshift data warehouse",\n)\ndef redshift_resource(context):\n """This resource enables connecting to a Redshift cluster and issuing queries against that\n cluster.\n\n Example:\n\n .. code-block:: python\n\n from dagster import build_op_context, op\n from dagster_aws.redshift import redshift_resource\n\n @op(required_resource_keys={'redshift'})\n def example_redshift_op(context):\n return context.resources.redshift.execute_query('SELECT 1', fetch_results=True)\n\n redshift_configured = redshift_resource.configured({\n 'host': 'my-redshift-cluster.us-east-1.redshift.amazonaws.com',\n 'port': 5439,\n 'user': 'dagster',\n 'password': 'dagster',\n 'database': 'dev',\n })\n context = build_op_context(resources={'redshift': redshift_configured})\n assert example_redshift_op(context) == [(1,)]\n\n """\n return RedshiftResource(context)
\n\n\n
[docs]@resource(\n config_schema=define_redshift_config(),\n description="Fake resource for connecting to the Redshift data warehouse. Usage is identical "\n "to the real redshift_resource. Will always return [(1,)] for the single query case and "\n "[[(1,)], [(1,)], [(1,)]] for the multi query case.",\n)\ndef fake_redshift_resource(context):\n return FakeRedshiftResource(context)
\n
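The ``error_callback`` hook documented above is easiest to see with a concrete sketch; the table, bucket, and IAM role below are placeholders, and the diagnostic query simply follows the ``stl_load_errors`` / ``pg_last_copy_id()`` pattern mentioned in the docstrings:

.. code-block:: python

    from dagster import op

    def log_load_errors(error, cursor, log):
        # Runs on the same connection/cursor as the failed query, so pg_last_copy_id()
        # still refers to the COPY that just failed.
        cursor.execute("SELECT * FROM stl_load_errors WHERE query = pg_last_copy_id()")
        log.error(f"COPY failed ({error}); stl_load_errors rows: {cursor.fetchall()}")

    @op(required_resource_keys={"redshift"})
    def copy_op(context):
        # The 'redshift' resource is wired up exactly as in the redshift_resource example above.
        context.resources.redshift.execute_query(
            "COPY my_table FROM 's3://my-bucket/data' IAM_ROLE 'arn:aws:iam::123456789012:role/my-role'",
            error_callback=log_load_errors,
        )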
", "current_page_name": "_modules/dagster_aws/redshift/resources", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_aws.redshift.resources"}}, "s3": {"compute_log_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_aws.s3.compute_log_manager

\nimport os\nfrom contextlib import contextmanager\n\nimport boto3\nfrom botocore.errorfactory import ClientError\n\nfrom dagster import Field, StringSource, check, seven\nfrom dagster.core.storage.compute_log_manager import (\n    MAX_BYTES_FILE_READ,\n    ComputeIOType,\n    ComputeLogFileData,\n    ComputeLogManager,\n)\nfrom dagster.core.storage.local_compute_log_manager import IO_TYPE_EXTENSION, LocalComputeLogManager\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster.utils import ensure_dir, ensure_file\n\n\n
[docs]class S3ComputeLogManager(ComputeLogManager, ConfigurableClass):\n """Logs compute function stdout and stderr to S3.\n\n Users should not instantiate this class directly. Instead, use a YAML block in ``dagster.yaml``\n such as the following:\n\n .. code-block:: YAML\n\n compute_logs:\n module: dagster_aws.s3.compute_log_manager\n class: S3ComputeLogManager\n config:\n bucket: "mycorp-dagster-compute-logs"\n local_dir: "/tmp/cool"\n prefix: "dagster-test-"\n use_ssl: true\n verify: true\n verify_cert_path: "/path/to/cert/bundle.pem"\n endpoint_url: "http://alternate-s3-host.io"\n skip_empty_files: true\n\n Args:\n bucket (str): The name of the s3 bucket to which to log.\n local_dir (Optional[str]): Path to the local directory in which to stage logs. Default:\n ``dagster.seven.get_system_temp_directory()``.\n prefix (Optional[str]): Prefix for the log file keys.\n use_ssl (Optional[bool]): Whether or not to use SSL. Default True.\n verify (Optional[bool]): Whether or not to verify SSL certificates. Default True.\n verify_cert_path (Optional[str]): A filename of the CA cert bundle to use. Only used if\n `verify` set to False.\n endpoint_url (Optional[str]): Override for the S3 endpoint url.\n skip_empty_files: (Optional[bool]): Skip upload of empty log files.\n inst_data (Optional[ConfigurableClassData]): Serializable representation of the compute\n log manager when newed up from config.\n """\n\n def __init__(\n self,\n bucket,\n local_dir=None,\n inst_data=None,\n prefix="dagster",\n use_ssl=True,\n verify=True,\n verify_cert_path=None,\n endpoint_url=None,\n skip_empty_files=False,\n ):\n _verify = False if not verify else verify_cert_path\n self._s3_session = boto3.resource(\n "s3", use_ssl=use_ssl, verify=_verify, endpoint_url=endpoint_url\n ).meta.client\n self._s3_bucket = check.str_param(bucket, "bucket")\n self._s3_prefix = check.str_param(prefix, "prefix")\n\n # proxy calls to local compute log manager (for subscriptions, etc)\n if not local_dir:\n local_dir = seven.get_system_temp_directory()\n\n self.local_manager = LocalComputeLogManager(local_dir)\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self._skip_empty_files = check.bool_param(skip_empty_files, "skip_empty_files")\n\n @contextmanager\n def _watch_logs(self, pipeline_run, step_key=None):\n # proxy watching to the local compute log manager, interacting with the filesystem\n with self.local_manager._watch_logs( # pylint: disable=protected-access\n pipeline_run, step_key\n ):\n yield\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {\n "bucket": StringSource,\n "local_dir": Field(StringSource, is_required=False),\n "prefix": Field(StringSource, is_required=False, default_value="dagster"),\n "use_ssl": Field(bool, is_required=False, default_value=True),\n "verify": Field(bool, is_required=False, default_value=True),\n "verify_cert_path": Field(StringSource, is_required=False),\n "endpoint_url": Field(StringSource, is_required=False),\n "skip_empty_files": Field(bool, is_required=False, default_value=False),\n }\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return S3ComputeLogManager(inst_data=inst_data, **config_value)\n\n def get_local_path(self, run_id, key, io_type):\n return self.local_manager.get_local_path(run_id, key, io_type)\n\n def on_watch_start(self, pipeline_run, step_key):\n self.local_manager.on_watch_start(pipeline_run, step_key)\n\n def on_watch_finish(self, pipeline_run, 
step_key):\n self.local_manager.on_watch_finish(pipeline_run, step_key)\n key = self.local_manager.get_key(pipeline_run, step_key)\n self._upload_from_local(pipeline_run.run_id, key, ComputeIOType.STDOUT)\n self._upload_from_local(pipeline_run.run_id, key, ComputeIOType.STDERR)\n\n def is_watch_completed(self, run_id, key):\n return self.local_manager.is_watch_completed(run_id, key)\n\n def download_url(self, run_id, key, io_type):\n if not self.is_watch_completed(run_id, key):\n return self.local_manager.download_url(run_id, key, io_type)\n key = self._bucket_key(run_id, key, io_type)\n\n url = self._s3_session.generate_presigned_url(\n ClientMethod="get_object", Params={"Bucket": self._s3_bucket, "Key": key}\n )\n\n return url\n\n def read_logs_file(self, run_id, key, io_type, cursor=0, max_bytes=MAX_BYTES_FILE_READ):\n if self._should_download(run_id, key, io_type):\n self._download_to_local(run_id, key, io_type)\n data = self.local_manager.read_logs_file(run_id, key, io_type, cursor, max_bytes)\n return self._from_local_file_data(run_id, key, io_type, data)\n\n def on_subscribe(self, subscription):\n self.local_manager.on_subscribe(subscription)\n\n def on_unsubscribe(self, subscription):\n self.local_manager.on_unsubscribe(subscription)\n\n def _should_download(self, run_id, key, io_type):\n local_path = self.get_local_path(run_id, key, io_type)\n if os.path.exists(local_path):\n return False\n\n try: # https://stackoverflow.com/a/38376288/14656695\n self._s3_session.head_object(\n Bucket=self._s3_bucket, Key=self._bucket_key(run_id, key, io_type)\n )\n except ClientError:\n return False\n\n return True\n\n def _from_local_file_data(self, run_id, key, io_type, local_file_data):\n is_complete = self.is_watch_completed(run_id, key)\n path = (\n "s3://{}/{}".format(self._s3_bucket, self._bucket_key(run_id, key, io_type))\n if is_complete\n else local_file_data.path\n )\n\n return ComputeLogFileData(\n path,\n local_file_data.data,\n local_file_data.cursor,\n local_file_data.size,\n self.download_url(run_id, key, io_type),\n )\n\n def _upload_from_local(self, run_id, key, io_type):\n path = self.get_local_path(run_id, key, io_type)\n ensure_file(path)\n if self._skip_empty_files and os.stat(path).st_size == 0:\n return\n\n key = self._bucket_key(run_id, key, io_type)\n with open(path, "rb") as data:\n self._s3_session.upload_fileobj(data, self._s3_bucket, key)\n\n def _download_to_local(self, run_id, key, io_type):\n path = self.get_local_path(run_id, key, io_type)\n ensure_dir(os.path.dirname(path))\n with open(path, "wb") as fileobj:\n self._s3_session.download_fileobj(\n self._s3_bucket, self._bucket_key(run_id, key, io_type), fileobj\n )\n\n def _bucket_key(self, run_id, key, io_type):\n check.inst_param(io_type, "io_type", ComputeIOType)\n extension = IO_TYPE_EXTENSION[io_type]\n paths = [\n self._s3_prefix,\n "storage",\n run_id,\n "compute_logs",\n "{}.{}".format(key, extension),\n ]\n return "/".join(paths) # s3 path delimiter\n\n def dispose(self):\n self.local_manager.dispose()
\n
", "current_page_name": "_modules/dagster_aws/s3/compute_log_manager", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_aws.s3.compute_log_manager"}, "file_cache": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_aws.s3.file_cache

\nimport boto3\nfrom botocore.exceptions import ClientError\n\nfrom dagster import Field, check, resource\nfrom dagster.core.storage.file_cache import FileCache\n\nfrom .file_manager import S3FileHandle\n\n\n
[docs]class S3FileCache(FileCache):\n def __init__(self, s3_bucket, s3_key, s3_session, overwrite=False):\n super(S3FileCache, self).__init__(overwrite=overwrite)\n\n self.s3_bucket = s3_bucket\n self.s3_key = s3_key\n self.s3 = s3_session\n\n def has_file_object(self, file_key):\n check.str_param(file_key, "file_key")\n try:\n self.s3.get_object(Bucket=self.s3_bucket, Key=self.get_full_key(file_key))\n except ClientError:\n return False\n return True\n\n def get_full_key(self, file_key):\n return "{base_key}/{file_key}".format(base_key=self.s3_key, file_key=file_key)\n\n def write_file_object(self, file_key, source_file_object):\n check.str_param(file_key, "file_key")\n\n self.s3.put_object(\n Body=source_file_object, Bucket=self.s3_bucket, Key=self.get_full_key(file_key)\n )\n return self.get_file_handle(file_key)\n\n def get_file_handle(self, file_key):\n check.str_param(file_key, "file_key")\n return S3FileHandle(self.s3_bucket, self.get_full_key(file_key))
\n\n\n@resource(\n {\n "bucket": Field(str),\n "key": Field(str),\n "overwrite": Field(bool, is_required=False, default_value=False),\n }\n)\ndef s3_file_cache(init_context):\n return S3FileCache(\n s3_bucket=init_context.resource_config["bucket"],\n s3_key=init_context.resource_config["key"],\n overwrite=init_context.resource_config["overwrite"],\n # TODO: resource dependencies\n s3_session=boto3.resource("s3", use_ssl=True).meta.client,\n )\n
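As a rough usage sketch, ``S3FileCache`` can also be constructed directly with a boto3 client rather than through the ``s3_file_cache`` resource above; the bucket and base key names here are hypothetical.

.. code-block:: python

    import io

    import boto3

    from dagster_aws.s3.file_cache import S3FileCache

    # Hypothetical bucket and base key; any boto3 S3 client will do.
    file_cache = S3FileCache(
        s3_bucket="my-bucket",
        s3_key="file-cache",
        s3_session=boto3.client("s3"),
        overwrite=False,
    )

    if not file_cache.has_file_object("report.csv"):
        handle = file_cache.write_file_object("report.csv", io.BytesIO(b"a,b\n1,2\n"))
        print(handle.s3_path)  # s3://my-bucket/file-cache/report.csv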
", "current_page_name": "_modules/dagster_aws/s3/file_cache", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_aws.s3.file_cache"}, "file_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_aws.s3.file_manager

\nimport io\nimport uuid\nfrom contextlib import contextmanager\n\nfrom dagster import check, usable_as_dagster_type\nfrom dagster.core.storage.file_manager import (\n    FileHandle,\n    FileManager,\n    TempfileManager,\n    check_file_like_obj,\n)\n\n\n
[docs]@usable_as_dagster_type\nclass S3FileHandle(FileHandle):\n """A reference to a file on S3."""\n\n def __init__(self, s3_bucket: str, s3_key: str):\n self._s3_bucket = check.str_param(s3_bucket, "s3_bucket")\n self._s3_key = check.str_param(s3_key, "s3_key")\n\n @property\n def s3_bucket(self) -> str:\n """str: The name of the S3 bucket."""\n return self._s3_bucket\n\n @property\n def s3_key(self) -> str:\n """str: The S3 key."""\n return self._s3_key\n\n @property\n def path_desc(self) -> str:\n """str: The file's S3 URL."""\n return self.s3_path\n\n @property\n def s3_path(self) -> str:\n """str: The file's S3 URL."""\n return "s3://{bucket}/{key}".format(bucket=self.s3_bucket, key=self.s3_key)
\n\n\nclass S3FileManager(FileManager):\n def __init__(self, s3_session, s3_bucket, s3_base_key):\n self._s3_session = s3_session\n self._s3_bucket = check.str_param(s3_bucket, "s3_bucket")\n self._s3_base_key = check.str_param(s3_base_key, "s3_base_key")\n self._local_handle_cache = {}\n self._temp_file_manager = TempfileManager()\n\n def copy_handle_to_local_temp(self, file_handle):\n self._download_if_not_cached(file_handle)\n return self._get_local_path(file_handle)\n\n def _download_if_not_cached(self, file_handle):\n if not self._file_handle_cached(file_handle):\n # instigate download\n temp_file_obj = self._temp_file_manager.tempfile()\n temp_name = temp_file_obj.name\n self._s3_session.download_file(\n Bucket=file_handle.s3_bucket, Key=file_handle.s3_key, Filename=temp_name\n )\n self._local_handle_cache[file_handle.s3_path] = temp_name\n\n return file_handle\n\n @contextmanager\n def read(self, file_handle, mode="rb"):\n check.inst_param(file_handle, "file_handle", S3FileHandle)\n check.str_param(mode, "mode")\n check.param_invariant(mode in {"r", "rb"}, "mode")\n\n self._download_if_not_cached(file_handle)\n\n with open(self._get_local_path(file_handle), mode) as file_obj:\n yield file_obj\n\n def _file_handle_cached(self, file_handle):\n return file_handle.s3_path in self._local_handle_cache\n\n def _get_local_path(self, file_handle):\n return self._local_handle_cache[file_handle.s3_path]\n\n def read_data(self, file_handle):\n with self.read(file_handle, mode="rb") as file_obj:\n return file_obj.read()\n\n def write_data(self, data, ext=None):\n check.inst_param(data, "data", bytes)\n return self.write(io.BytesIO(data), mode="wb", ext=ext)\n\n def write(self, file_obj, mode="wb", ext=None):\n check_file_like_obj(file_obj)\n s3_key = self.get_full_key(str(uuid.uuid4()) + (("." + ext) if ext is not None else ""))\n self._s3_session.put_object(Body=file_obj, Bucket=self._s3_bucket, Key=s3_key)\n return S3FileHandle(self._s3_bucket, s3_key)\n\n def get_full_key(self, file_key):\n return "{base_key}/{file_key}".format(base_key=self._s3_base_key, file_key=file_key)\n\n def delete_local_temp(self):\n self._temp_file_manager.close()\n
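A minimal sketch of the ``S3FileHandle`` value object defined above; the bucket and key are hypothetical.

.. code-block:: python

    from dagster_aws.s3.file_manager import S3FileHandle

    handle = S3FileHandle(s3_bucket="my-bucket", s3_key="uploads/report.csv")
    assert handle.s3_path == "s3://my-bucket/uploads/report.csv"
    assert handle.path_desc == handle.s3_path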
", "current_page_name": "_modules/dagster_aws/s3/file_manager", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_aws.s3.file_manager"}, "io_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_aws.s3.io_manager

\nimport io\nimport pickle\n\nfrom dagster import Field, MemoizableIOManager, StringSource, check, io_manager\nfrom dagster.utils import PICKLE_PROTOCOL\n\n\nclass PickledObjectS3IOManager(MemoizableIOManager):\n    def __init__(\n        self,\n        s3_bucket,\n        s3_session,\n        s3_prefix=None,\n    ):\n        self.bucket = check.str_param(s3_bucket, "s3_bucket")\n        self.s3_prefix = check.str_param(s3_prefix, "s3_prefix")\n        self.s3 = s3_session\n        self.s3.head_bucket(Bucket=self.bucket)\n\n    def _get_path(self, context):\n        return "/".join([self.s3_prefix, "storage", *context.get_output_identifier()])\n\n    def has_output(self, context):\n        key = self._get_path(context)\n        return self._has_object(key)\n\n    def _rm_object(self, key):\n        check.str_param(key, "key")\n        check.param_invariant(len(key) > 0, "key")\n\n        # delete_object wont fail even if the item has been deleted.\n        self.s3.delete_object(Bucket=self.bucket, Key=key)\n\n    def _has_object(self, key):\n        check.str_param(key, "key")\n        check.param_invariant(len(key) > 0, "key")\n\n        found_object = False\n\n        try:\n            self.s3.get_object(Bucket=self.bucket, Key=key)\n            found_object = True\n        except self.s3.exceptions.NoSuchKey:\n            found_object = False\n\n        return found_object\n\n    def _uri_for_key(self, key):\n        check.str_param(key, "key")\n        return "s3://" + self.bucket + "/" + "{key}".format(key=key)\n\n    def load_input(self, context):\n        key = self._get_path(context.upstream_output)\n        context.log.debug(f"Loading S3 object from: {self._uri_for_key(key)}")\n        obj = pickle.loads(self.s3.get_object(Bucket=self.bucket, Key=key)["Body"].read())\n\n        return obj\n\n    def handle_output(self, context, obj):\n        key = self._get_path(context)\n        context.log.debug(f"Writing S3 object at: {self._uri_for_key(key)}")\n\n        if self._has_object(key):\n            context.log.warning(f"Removing existing S3 key: {key}")\n            self._rm_object(key)\n\n        pickled_obj = pickle.dumps(obj, PICKLE_PROTOCOL)\n        pickled_obj_bytes = io.BytesIO(pickled_obj)\n        self.s3.upload_fileobj(pickled_obj_bytes, self.bucket, key)\n\n\n
[docs]@io_manager(\n config_schema={\n "s3_bucket": Field(StringSource),\n "s3_prefix": Field(StringSource, is_required=False, default_value="dagster"),\n },\n required_resource_keys={"s3"},\n)\ndef s3_pickle_io_manager(init_context):\n """Persistent IO manager using S3 for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for S3 and the backing bucket.\n\n Attach this resource definition to your job to make it available to your ops.\n\n .. code-block:: python\n\n @job(resource_defs={'io_manager': s3_pickle_io_manager, "s3": s3_resource, ...})\n def my_job():\n ...\n\n You may configure this storage as follows:\n\n .. code-block:: YAML\n\n resources:\n io_manager:\n config:\n s3_bucket: my-cool-bucket\n s3_prefix: good/prefix-for-files-\n """\n s3_session = init_context.resources.s3\n s3_bucket = init_context.resource_config["s3_bucket"]\n s3_prefix = init_context.resource_config.get("s3_prefix") # s3_prefix is optional\n pickled_io_manager = PickledObjectS3IOManager(s3_bucket, s3_session, s3_prefix=s3_prefix)\n return pickled_io_manager
\n\n\nclass PickledObjectS3AssetIOManager(PickledObjectS3IOManager):\n def _get_path(self, context):\n return "/".join([self.s3_prefix, *context.asset_key.path])\n\n\n
[docs]@io_manager(\n config_schema={\n "s3_bucket": Field(StringSource),\n "s3_prefix": Field(StringSource, is_required=False, default_value="dagster"),\n },\n required_resource_keys={"s3"},\n)\ndef s3_pickle_asset_io_manager(init_context):\n """Persistent IO manager using S3 for storage, meant for use with software-defined assets.\n\n Each asset is assigned to a single filesystem path, so subsequent materializations of an asset\n will overwrite previous materializations of that asset.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for S3 and the backing bucket.\n\n Attach this resource definition to your job to make it available to your ops.\n\n .. code-block:: python\n\n asset_group = AssetGroup(\n assets...,\n resource_defs={'io_manager': s3_pickle_asset_io_manager, "s3": s3_resource, ...}),\n )\n\n You may configure this IO manager as follows:\n\n .. code-block:: YAML\n\n resources:\n io_manager:\n config:\n s3_bucket: my-cool-bucket\n s3_prefix: good/prefix-for-files-\n """\n s3_session = init_context.resources.s3\n s3_bucket = init_context.resource_config["s3_bucket"]\n s3_prefix = init_context.resource_config.get("s3_prefix") # s3_prefix is optional\n pickled_io_manager = PickledObjectS3AssetIOManager(s3_bucket, s3_session, s3_prefix=s3_prefix)\n return pickled_io_manager
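To make the difference between the two IO managers concrete, here is a plain-Python sketch of the object keys their ``_get_path`` implementations produce. The run id, output identifier, and asset key parts are hypothetical examples.

.. code-block:: python

    prefix = "dagster"

    # s3_pickle_io_manager: <prefix>/storage/<output identifier parts>
    run_scoped_key = "/".join([prefix, "storage", "8f1b6d2e", "my_op", "result"])
    # dagster/storage/8f1b6d2e/my_op/result

    # s3_pickle_asset_io_manager: <prefix>/<asset key path parts>
    asset_scoped_key = "/".join([prefix, "analytics", "daily_orders"])
    # dagster/analytics/daily_orders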
\n
", "current_page_name": "_modules/dagster_aws/s3/io_manager", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_aws.s3.io_manager"}, "resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_aws.s3.resources

\nfrom dagster import Field, StringSource, resource\nfrom dagster.utils.merger import merge_dicts\n\nfrom .file_manager import S3FileManager\nfrom .utils import construct_s3_client\n\nS3_SESSION_CONFIG = {\n    "use_unsigned_session": Field(\n        bool,\n        description="Specifies whether to use an unsigned S3 session",\n        is_required=False,\n        default_value=False,\n    ),\n    "region_name": Field(\n        str, description="Specifies a custom region for the S3 session", is_required=False\n    ),\n    "endpoint_url": Field(\n        StringSource,\n        description="Specifies a custom endpoint for the S3 session",\n        is_required=False,\n    ),\n    "max_attempts": Field(\n        int,\n        description="This provides Boto3's retry handler with a value of maximum retry attempts, "\n        "where the initial call counts toward the max_attempts value that you provide",\n        is_required=False,\n        default_value=5,\n    ),\n    "profile_name": Field(\n        str,\n        description="Specifies a profile to use for the session",\n        is_required=False,\n    ),\n}\n\n\n
[docs]@resource(S3_SESSION_CONFIG)\ndef s3_resource(context):\n """Resource that gives access to S3.\n\n The underlying S3 session is created by calling\n :py:func:`boto3.session.Session(profile_name) <boto3:boto3.session>`.\n The returned resource object is an S3 client, an instance of `botocore.client.S3`.\n\n Example:\n\n .. code-block:: python\n\n from dagster import build_op_context, job, op\n from dagster_aws.s3 import s3_resource\n\n @op(required_resource_keys={'s3'})\n def example_s3_op(context):\n return context.resources.s3.list_objects_v2(\n Bucket='my-bucket',\n Prefix='some-key'\n )\n\n @job(resource_defs={'s3': s3_resource})\n def example_job(context):\n example_s3_op()\n\n example_job.execute_in_process(\n run_config={\n 'resources': {\n 's3': {\n 'config': {\n 'region_name': 'us-west-1',\n }\n }\n }\n }\n )\n\n Note that your ops must also declare that they require this resource with\n `required_resource_keys`, or it will not be initialized for the execution of their compute\n functions.\n\n You may configure this resource as follows:\n\n .. code-block:: YAML\n\n resources:\n s3:\n config:\n region_name: "us-west-1"\n # Optional[str]: Specifies a custom region for the S3 session. Default is chosen\n # through the ordinary boto credential chain.\n use_unsigned_session: false\n # Optional[bool]: Specifies whether to use an unsigned S3 session. Default: True\n endpoint_url: "http://localhost"\n # Optional[str]: Specifies a custom endpoint for the S3 session. Default is None.\n profile_name: "dev"\n # Optional[str]: Specifies a custom profile for S3 session. Default is default\n # profile as specified in ~/.aws/credentials file\n\n """\n return construct_s3_client(\n max_attempts=context.resource_config["max_attempts"],\n region_name=context.resource_config.get("region_name"),\n endpoint_url=context.resource_config.get("endpoint_url"),\n use_unsigned_session=context.resource_config["use_unsigned_session"],\n profile_name=context.resource_config.get("profile_name"),\n )
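For unit tests, a hedged sketch of invoking an op that requires the ``s3`` resource directly, supplying a plain boto3 client through ``build_op_context`` instead of configuring ``s3_resource``; the bucket and prefix names are hypothetical.

.. code-block:: python

    import boto3

    from dagster import build_op_context, op

    @op(required_resource_keys={"s3"})
    def list_keys(context):
        response = context.resources.s3.list_objects_v2(Bucket="my-bucket", Prefix="some-key")
        return [obj["Key"] for obj in response.get("Contents", [])]

    # build_op_context accepts already-constructed resource values, so the
    # real s3_resource definition is not needed in the test.
    list_keys(build_op_context(resources={"s3": boto3.client("s3")}))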
\n\n\n
[docs]@resource(\n merge_dicts(\n S3_SESSION_CONFIG,\n {\n "s3_bucket": Field(StringSource),\n "s3_prefix": Field(StringSource, is_required=False, default_value="dagster"),\n },\n )\n)\ndef s3_file_manager(context):\n """FileManager that provides abstract access to S3.\n\n Implements the :py:class:`~dagster.core.storage.file_manager.FileManager` API.\n """\n return S3FileManager(\n s3_session=construct_s3_client(\n max_attempts=context.resource_config["max_attempts"],\n region_name=context.resource_config.get("region_name"),\n endpoint_url=context.resource_config.get("endpoint_url"),\n use_unsigned_session=context.resource_config["use_unsigned_session"],\n profile_name=context.resource_config.get("profile_name"),\n ),\n s3_bucket=context.resource_config["s3_bucket"],\n s3_base_key=context.resource_config["s3_prefix"],\n )
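A hedged sketch of wiring the ``s3_file_manager`` resource above into a job and writing bytes from an op; the resource key name ``file_manager`` and the bucket/prefix values are hypothetical.

.. code-block:: python

    from dagster import job, op
    from dagster_aws.s3.resources import s3_file_manager

    @op(required_resource_keys={"file_manager"})
    def save_report(context):
        # write_data returns an S3FileHandle pointing at the uploaded object
        handle = context.resources.file_manager.write_data(b"hello world")
        context.log.info(f"wrote {handle.s3_path}")

    @job(
        resource_defs={
            "file_manager": s3_file_manager.configured(
                {"s3_bucket": "my-bucket", "s3_prefix": "reports"}
            )
        }
    )
    def report_job():
        save_report()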
\n
", "current_page_name": "_modules/dagster_aws/s3/resources", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_aws.s3.resources"}}, "secretsmanager": {"resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_aws.secretsmanager.resources

\nfrom contextlib import contextmanager\n\nfrom dagster import Array, Field, Noneable, check, resource\nfrom dagster.core.test_utils import environ\nfrom dagster.utils.merger import merge_dicts\n\nfrom .secrets import construct_secretsmanager_client, get_secrets_from_arns, get_tagged_secrets\n\nSECRETSMANAGER_SESSION_CONFIG = {\n    "region_name": Field(\n        str,\n        description="Specifies a custom region for the SecretsManager session",\n        is_required=False,\n    ),\n    "max_attempts": Field(\n        int,\n        description="This provides Boto3's retry handler with a value of maximum retry attempts, "\n        "where the initial call counts toward the max_attempts value that you provide",\n        is_required=False,\n        default_value=5,\n    ),\n    "profile_name": Field(\n        str,\n        description="Specifies a profile to use for the session",\n        is_required=False,\n    ),\n}\n\n\n
[docs]@resource(SECRETSMANAGER_SESSION_CONFIG)\ndef secretsmanager_resource(context):\n """Resource that gives access to AWS SecretsManager.\n\n The underlying SecretsManager session is created by calling\n :py:func:`boto3.session.Session(profile_name) <boto3:boto3.session>`.\n The returned resource object is a SecretsManager client, an instance of `botocore.client.SecretsManager`.\n\n Example:\n\n .. code-block:: python\n\n from dagster import build_op_context, job, op\n from dagster_aws.secretsmanager import secretsmanager_resource\n\n @op(required_resource_keys={'secretsmanager'})\n def example_secretsmanager_op(context):\n return context.resources.secretsmanager.get_secret_value(\n SecretId='arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf'\n )\n\n @job(resource_defs={'secretsmanager': secretsmanager_resource})\n def example_job(context):\n example_secretsmanager_op()\n\n example_job.execute_in_process(\n run_config={\n 'resources': {\n 'secretsmanager': {\n 'config': {\n 'region_name': 'us-west-1',\n }\n }\n }\n }\n )\n\n Note that your ops must also declare that they require this resource with\n `required_resource_keys`, or it will not be initialized for the execution of their compute\n functions.\n\n You may configure this resource as follows:\n\n .. code-block:: YAML\n\n resources:\n secretsmanager:\n config:\n region_name: "us-west-1"\n # Optional[str]: Specifies a custom region for the SecretsManager session. Default is chosen\n # through the ordinary boto credential chain.\n profile_name: "dev"\n # Optional[str]: Specifies a custom profile for SecretsManager session. Default is default\n # profile as specified in ~/.aws/credentials file\n\n """\n return construct_secretsmanager_client(\n max_attempts=context.resource_config["max_attempts"],\n region_name=context.resource_config.get("region_name"),\n profile_name=context.resource_config.get("profile_name"),\n )
\n\n\n
[docs]@resource(\n merge_dicts(\n SECRETSMANAGER_SESSION_CONFIG,\n {\n "secrets": Field(\n Array(str),\n is_required=False,\n default_value=[],\n description=("An array of AWS Secrets Manager secrets arns to fetch."),\n ),\n "secrets_tag": Field(\n Noneable(str),\n is_required=False,\n default_value=None,\n description=(\n "AWS Secrets Manager secrets with this tag will be fetched and made available."\n ),\n ),\n "add_to_environment": Field(\n bool,\n is_required=False,\n default_value=False,\n description=("Whether to mount the secrets as environment variables."),\n ),\n },\n )\n)\n@contextmanager\ndef secretsmanager_secrets_resource(context):\n """Resource that provides a dict which maps selected SecretsManager secrets to\n their string values. Also optionally sets chosen secrets as environment variables.\n\n Example:\n\n .. code-block:: python\n\n import os\n from dagster import build_op_context, job, op\n from dagster_aws.secretsmanager import secretsmanager_secrets_resource\n\n @op(required_resource_keys={'secrets'})\n def example_secretsmanager_secrets_op(context):\n return context.resources.secrets.get("my-secret-name")\n\n @op(required_resource_keys={'secrets'})\n def example_secretsmanager_secrets_op_2(context):\n return os.getenv("my-other-secret-name")\n\n @job(resource_defs={'secrets': secretsmanager_secrets_resource})\n def example_job(context):\n example_secretsmanager_secrets_op()\n example_secretsmanager_secrets_op_2()\n\n example_job.execute_in_process(\n run_config={\n 'resources': {\n 'secrets': {\n 'config': {\n 'region_name': 'us-west-1',\n 'secrets_tag': 'dagster',\n 'add_to_environment': True,\n }\n }\n }\n }\n )\n\n Note that your ops must also declare that they require this resource with\n `required_resource_keys`, or it will not be initialized for the execution of their compute\n functions.\n\n You may configure this resource as follows:\n\n .. code-block:: YAML\n\n resources:\n secretsmanager:\n config:\n region_name: "us-west-1"\n # Optional[str]: Specifies a custom region for the SecretsManager session. Default is chosen\n # through the ordinary boto credential chain.\n profile_name: "dev"\n # Optional[str]: Specifies a custom profile for SecretsManager session. Default is default\n # profile as specified in ~/.aws/credentials file\n secrets: ["arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf"]\n # Optional[List[str]]: Specifies a list of secret ARNs to pull from SecretsManager.\n secrets_tag: "dagster"\n # Optional[str]: Specifies a tag, all secrets which have the tag set will be pulled\n # from SecretsManager.\n add_to_environment: true\n # Optional[bool]: Whether to set the selected secrets as environment variables. 
Defaults\n # to false.\n\n """\n add_to_environment = check.bool_param(\n context.resource_config["add_to_environment"], "add_to_environment"\n )\n secrets_tag = check.opt_str_param(context.resource_config["secrets_tag"], "secrets_tag")\n secrets = check.list_param(context.resource_config["secrets"], "secrets", of_type=str)\n\n secrets_manager = construct_secretsmanager_client(\n max_attempts=context.resource_config["max_attempts"],\n region_name=context.resource_config.get("region_name"),\n profile_name=context.resource_config.get("profile_name"),\n )\n\n secret_arns = merge_dicts(\n (get_tagged_secrets(secrets_manager, secrets_tag) if secrets_tag else {}),\n get_secrets_from_arns(secrets_manager, secrets),\n )\n\n secrets_map = {\n name: secrets_manager.get_secret_value(SecretId=arn).get("SecretString")\n for name, arn in secret_arns.items()\n }\n with environ(secrets_map if add_to_environment else {}):\n yield secrets_map
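One detail worth calling out from the body above: tagged secrets and explicitly listed ARNs are combined with ``merge_dicts``, with the ARN-based entries second, so a secret given by ARN wins when the names collide. A plain-Python sketch of that precedence, using a local stand-in for ``merge_dicts`` and hypothetical names/ARNs:

.. code-block:: python

    def merge_dicts(left, right):
        # Stand-in for dagster.utils.merger.merge_dicts: right-hand values win.
        merged = dict(left)
        merged.update(right)
        return merged

    tagged = {"db_password": "arn:aws:secretsmanager:us-west-1:111111111111:secret:db-old"}
    by_arn = {"db_password": "arn:aws:secretsmanager:us-west-1:111111111111:secret:db-new"}

    secret_arns = merge_dicts(tagged, by_arn)
    assert secret_arns["db_password"].endswith("db-new")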
\n
", "current_page_name": "_modules/dagster_aws/secretsmanager/resources", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_aws.secretsmanager.resources"}}}, "dagster_azure": {"adls2": {"fake_adls2_resource": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_azure.adls2.fake_adls2_resource

\nimport io\nimport random\nfrom collections import defaultdict\nfrom contextlib import contextmanager\nfrom unittest import mock\n\nfrom dagster_azure.blob import FakeBlobServiceClient\n\nfrom .resources import ADLS2Resource\nfrom .utils import ResourceNotFoundError\n\n\n
[docs]class FakeADLS2Resource(ADLS2Resource):\n """Stateful mock of an ADLS2Resource for testing.\n\n Wraps a ``mock.MagicMock``. Containers are implemented using an in-memory dict.\n """\n\n def __init__(\n self, account_name, credential="fake-creds"\n ): # pylint: disable=unused-argument,super-init-not-called\n self._adls2_client = FakeADLS2ServiceClient(account_name)\n self._blob_client = FakeBlobServiceClient(account_name)
\n\n\nclass FakeADLS2ServiceClient:\n """Stateful mock of an ADLS2 service client for testing.\n\n Wraps a ``mock.MagicMock``. Containers are implemented using an in-memory dict.\n """\n\n def __init__(self, account_name, credential="fake-creds"):\n\n self._account_name = account_name\n self._credential = mock.MagicMock()\n self._credential.account_key = credential\n self._file_systems = {}\n\n @property\n def account_name(self):\n return self._account_name\n\n @property\n def credential(self):\n return self._credential\n\n @property\n def file_systems(self):\n return self._file_systems\n\n def get_file_system_client(self, file_system):\n return self._file_systems.setdefault(\n file_system, FakeADLS2FilesystemClient(self.account_name, file_system)\n )\n\n def get_file_client(self, file_system, file_path):\n return self.get_file_system_client(file_system).get_file_client(file_path)\n\n\nclass FakeADLS2FilesystemClient:\n """Stateful mock of an ADLS2 filesystem client for testing."""\n\n def __init__(self, account_name, file_system_name):\n self._file_system = defaultdict(FakeADLS2FileClient)\n self._account_name = account_name\n self._file_system_name = file_system_name\n\n @property\n def account_name(self):\n return self._account_name\n\n @property\n def file_system_name(self):\n return self._file_system_name\n\n def keys(self):\n return self._file_system.keys()\n\n def get_file_system_properties(self):\n return {"account_name": self.account_name, "file_system_name": self.file_system_name}\n\n def has_file(self, path):\n return bool(self._file_system.get(path))\n\n def get_file_client(self, file_path):\n return self._file_system[file_path]\n\n def create_file(self, file):\n return self._file_system[file]\n\n def delete_file(self, file):\n for k in list(self._file_system.keys()):\n if k.startswith(file):\n del self._file_system[k]\n\n\nclass FakeADLS2FileClient:\n """Stateful mock of an ADLS2 file client for testing."""\n\n def __init__(self):\n self.contents = None\n self.lease = None\n\n def get_file_properties(self):\n if self.contents is None:\n raise ResourceNotFoundError("File does not exist!")\n return {"lease": self.lease}\n\n def upload_data(self, contents, overwrite=False, lease=None):\n if self.lease is not None:\n if lease != self.lease:\n raise Exception("Invalid lease!")\n if self.contents is not None or overwrite is True:\n if isinstance(contents, str):\n self.contents = contents.encode("utf8")\n elif isinstance(contents, io.BytesIO):\n self.contents = contents.read()\n elif isinstance(contents, io.StringIO):\n self.contents = contents.read().encode("utf8")\n elif isinstance(contents, bytes):\n self.contents = contents\n else:\n self.contents = contents\n\n @contextmanager\n def acquire_lease(self, lease_duration=-1): # pylint: disable=unused-argument\n if self.lease is None:\n self.lease = random.randint(0, 2**9)\n try:\n yield self.lease\n finally:\n self.lease = None\n else:\n raise Exception("Lease already held")\n\n def download_file(self):\n if self.contents is None:\n raise ResourceNotFoundError("File does not exist!")\n return FakeADLS2FileDownloader(contents=self.contents)\n\n\nclass FakeADLS2FileDownloader:\n """Mock of an ADLS2 file downloader for testing."""\n\n def __init__(self, contents):\n self.contents = contents\n\n def readall(self):\n return self.contents\n\n def readinto(self, fileobj):\n fileobj.write(self.contents)\n
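A hedged sketch of using ``FakeADLS2Resource`` in a test: upload through the fake ADLS2 client and read the bytes back, entirely in memory. The account and file-system names are hypothetical.

.. code-block:: python

    from dagster_azure.adls2.fake_adls2_resource import FakeADLS2Resource

    fake = FakeADLS2Resource(account_name="fakeaccount")

    file_client = fake.adls2_client.get_file_client("my-filesystem", "reports/day.csv")
    file_client.upload_data(b"a,b\n1,2\n", overwrite=True)

    fs_client = fake.adls2_client.get_file_system_client("my-filesystem")
    assert fs_client.has_file("reports/day.csv")
    assert file_client.download_file().readall() == b"a,b\n1,2\n"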
", "current_page_name": "_modules/dagster_azure/adls2/fake_adls2_resource", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_azure.adls2.fake_adls2_resource"}, "file_cache": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_azure.adls2.file_cache

\nfrom dagster import Field, Selector, StringSource, check, resource\nfrom dagster.core.storage.file_cache import FileCache\n\nfrom .file_manager import ADLS2FileHandle\nfrom .utils import ResourceNotFoundError, create_adls2_client\n\n\nclass ADLS2FileCache(FileCache):\n    def __init__(\n        self, storage_account, file_system, prefix, credential=None, overwrite=False, client=None\n    ):\n        super(ADLS2FileCache, self).__init__(overwrite=overwrite)\n\n        self.storage_account = storage_account\n        self.file_system = file_system\n        self.prefix = prefix\n\n        self.client = client or create_adls2_client(storage_account, credential)\n\n    def has_file_object(self, file_key):\n        check.str_param(file_key, "file_key")\n        try:\n            file = self.client.get_file_client(self.file_system, self.get_full_key(file_key))\n            file.get_file_properties()\n        except ResourceNotFoundError:\n            return False\n        return True\n\n    def get_full_key(self, file_key):\n        return "{base_key}/{file_key}".format(base_key=self.prefix, file_key=file_key)\n\n    def write_file_object(self, file_key, source_file_object):\n        check.str_param(file_key, "file_key")\n\n        adls2_key = self.get_full_key(file_key)\n        adls2_file = self.client.get_file_client(file_system=self.file_system, file_path=adls2_key)\n        adls2_file.upload_data(source_file_object, overwrite=True)\n        return self.get_file_handle(file_key)\n\n    def get_file_handle(self, file_key):\n        check.str_param(file_key, "file_key")\n        return ADLS2FileHandle(\n            self.client.account_name, self.file_system, self.get_full_key(file_key)\n        )\n\n\n
[docs]@resource(\n {\n "storage_account": Field(StringSource, description="The storage account name."),\n "credential": Field(\n Selector(\n {\n "sas": Field(StringSource, description="SAS token for the account."),\n "key": Field(StringSource, description="Shared Access Key for the account"),\n }\n ),\n description="The credentials with which to authenticate.",\n ),\n "prefix": Field(StringSource, description="The base path prefix to use in ADLS2"),\n "file_system": Field(\n StringSource, description="The storage account filesystem (aka container)"\n ),\n "overwrite": Field(bool, is_required=False, default_value=False),\n }\n)\ndef adls2_file_cache(init_context):\n return ADLS2FileCache(\n storage_account=init_context.resource_config["storage_account"],\n file_system=init_context.resource_config["file_system"],\n prefix=init_context.resource_config["prefix"],\n credential=init_context.resource_config["credential"],\n overwrite=init_context.resource_config["overwrite"],\n # TODO: resource dependencies\n )
\n
", "current_page_name": "_modules/dagster_azure/adls2/file_cache", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_azure.adls2.file_cache"}, "file_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_azure.adls2.file_manager

\nimport io\nimport uuid\nfrom contextlib import contextmanager\n\nfrom dagster import check, usable_as_dagster_type\nfrom dagster.core.storage.file_manager import (\n    FileHandle,\n    FileManager,\n    TempfileManager,\n    check_file_like_obj,\n)\n\n\n
[docs]@usable_as_dagster_type\nclass ADLS2FileHandle(FileHandle):\n """A reference to a file on ADLS2."""\n\n def __init__(self, account: str, file_system: str, key: str):\n self._account = check.str_param(account, "account")\n self._file_system = check.str_param(file_system, "file_system")\n self._key = check.str_param(key, "key")\n\n @property\n def account(self):\n """str: The name of the ADLS2 account."""\n return self._account\n\n @property\n def file_system(self):\n """str: The name of the ADLS2 file system."""\n return self._file_system\n\n @property\n def key(self):\n """str: The ADLS2 key."""\n return self._key\n\n @property\n def path_desc(self):\n """str: The file's ADLS2 URL."""\n return self.adls2_path\n\n @property\n def adls2_path(self):\n """str: The file's ADLS2 URL."""\n return "adfss://{file_system}@{account}.dfs.core.windows.net/{key}".format(\n file_system=self.file_system,\n account=self.account,\n key=self.key,\n )
\n\n\nclass ADLS2FileManager(FileManager):\n def __init__(self, adls2_client, file_system, prefix):\n self._client = adls2_client\n self._file_system = check.str_param(file_system, "file_system")\n self._prefix = check.str_param(prefix, "prefix")\n self._local_handle_cache = {}\n self._temp_file_manager = TempfileManager()\n\n def copy_handle_to_local_temp(self, file_handle):\n self._download_if_not_cached(file_handle)\n return self._get_local_path(file_handle)\n\n def _download_if_not_cached(self, file_handle):\n if not self._file_handle_cached(file_handle):\n # instigate download\n temp_file_obj = self._temp_file_manager.tempfile()\n temp_name = temp_file_obj.name\n file = self._client.get_file_client(\n file_system=file_handle.file_system,\n file_path=file_handle.key,\n )\n download = file.download_file()\n with open(temp_name, "wb") as file_obj:\n download.readinto(file_obj)\n self._local_handle_cache[file_handle.adls2_path] = temp_name\n\n return file_handle\n\n @contextmanager\n def read(self, file_handle, mode="rb"):\n check.inst_param(file_handle, "file_handle", ADLS2FileHandle)\n check.str_param(mode, "mode")\n check.param_invariant(mode in {"r", "rb"}, "mode")\n\n self._download_if_not_cached(file_handle)\n\n with open(self._get_local_path(file_handle), mode) as file_obj:\n yield file_obj\n\n def _file_handle_cached(self, file_handle):\n return file_handle.adls2_path in self._local_handle_cache\n\n def _get_local_path(self, file_handle):\n return self._local_handle_cache[file_handle.adls2_path]\n\n def read_data(self, file_handle):\n with self.read(file_handle, mode="rb") as file_obj:\n return file_obj.read()\n\n def write_data(self, data, ext=None):\n check.inst_param(data, "data", bytes)\n return self.write(io.BytesIO(data), mode="wb", ext=ext)\n\n def write(self, file_obj, mode="wb", ext=None): # pylint: disable=unused-argument\n check_file_like_obj(file_obj)\n adls2_key = self.get_full_key(str(uuid.uuid4()) + (("." + ext) if ext is not None else ""))\n adls2_file = self._client.get_file_client(\n file_system=self._file_system, file_path=adls2_key\n )\n adls2_file.upload_data(file_obj, overwrite=True)\n return ADLS2FileHandle(self._client.account_name, self._file_system, adls2_key)\n\n def get_full_key(self, file_key):\n return "{base_key}/{file_key}".format(base_key=self._prefix, file_key=file_key)\n\n def delete_local_temp(self):\n self._temp_file_manager.close()\n
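A minimal sketch of the ``ADLS2FileHandle`` value object defined above; the account, file system, and key are hypothetical, and the printed URL follows the ``adls2_path`` format string as written.

.. code-block:: python

    from dagster_azure.adls2.file_manager import ADLS2FileHandle

    handle = ADLS2FileHandle(
        account="myaccount", file_system="my-filesystem", key="uploads/report.csv"
    )
    print(handle.adls2_path)
    # adfss://my-filesystem@myaccount.dfs.core.windows.net/uploads/report.csv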
", "current_page_name": "_modules/dagster_azure/adls2/file_manager", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_azure.adls2.file_manager"}, "io_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_azure.adls2.io_manager

\nimport pickle\n\nfrom dagster_azure.adls2.utils import ResourceNotFoundError\n\nfrom dagster import Field, IOManager, StringSource, check, io_manager\nfrom dagster.utils import PICKLE_PROTOCOL\n\n_LEASE_DURATION = 60  # One minute\n\n\nclass PickledObjectADLS2IOManager(IOManager):\n    def __init__(self, file_system, adls2_client, blob_client, prefix="dagster"):\n        self.adls2_client = adls2_client\n        self.file_system_client = self.adls2_client.get_file_system_client(file_system)\n        # We also need a blob client to handle copying as ADLS doesn't have a copy API yet\n        self.blob_client = blob_client\n        self.blob_container_client = self.blob_client.get_container_client(file_system)\n        self.prefix = check.str_param(prefix, "prefix")\n\n        self.lease_duration = _LEASE_DURATION\n        self.file_system_client.get_file_system_properties()\n\n    def _get_path(self, context):\n        keys = context.get_output_identifier()\n        run_id = keys[0]\n        output_identifiers = keys[1:]  # variable length because of mapping key\n        return "/".join(\n            [\n                self.prefix,\n                "storage",\n                run_id,\n                "files",\n                *output_identifiers,\n            ]\n        )\n\n    def _rm_object(self, key):\n        check.str_param(key, "key")\n        check.param_invariant(len(key) > 0, "key")\n\n        # This operates recursively already so is nice and simple.\n        self.file_system_client.delete_file(key)\n\n    def _has_object(self, key):\n        check.str_param(key, "key")\n        check.param_invariant(len(key) > 0, "key")\n\n        try:\n            file = self.file_system_client.get_file_client(key)\n            file.get_file_properties()\n            return True\n        except ResourceNotFoundError:\n            return False\n\n    def _uri_for_key(self, key, protocol=None):\n        check.str_param(key, "key")\n        protocol = check.opt_str_param(protocol, "protocol", default="abfss://")\n        return "{protocol}{filesystem}@{account}.dfs.core.windows.net/{key}".format(\n            protocol=protocol,\n            filesystem=self.file_system_client.file_system_name,\n            account=self.file_system_client.account_name,\n            key=key,\n        )\n\n    def load_input(self, context):\n        key = self._get_path(context.upstream_output)\n        context.log.debug(f"Loading ADLS2 object from: {self._uri_for_key(key)}")\n        file = self.file_system_client.get_file_client(key)\n        stream = file.download_file()\n        obj = pickle.loads(stream.readall())\n\n        return obj\n\n    def handle_output(self, context, obj):\n        key = self._get_path(context)\n        context.log.debug(f"Writing ADLS2 object at: {self._uri_for_key(key)}")\n\n        if self._has_object(key):\n            context.log.warning(f"Removing existing ADLS2 key: {key}")\n            self._rm_object(key)\n\n        pickled_obj = pickle.dumps(obj, PICKLE_PROTOCOL)\n\n        file = self.file_system_client.create_file(key)\n        with file.acquire_lease(self.lease_duration) as lease:\n            file.upload_data(pickled_obj, lease=lease, overwrite=True)\n\n\n
[docs]@io_manager(\n config_schema={\n "adls2_file_system": Field(StringSource, description="ADLS Gen2 file system name"),\n "adls2_prefix": Field(StringSource, is_required=False, default_value="dagster"),\n },\n required_resource_keys={"adls2"},\n)\ndef adls2_pickle_io_manager(init_context):\n """Persistent IO manager using Azure Data Lake Storage Gen2 for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for ADLS and the backing\n container.\n\n Attach this resource definition to your job in order to make it available all your ops:\n\n .. code-block:: python\n\n @job(resource_defs={\n 'io_manager': adls2_pickle_io_manager,\n 'adls2': adls2_resource,\n ...,\n })\n def my_job():\n ...\n\n You may configure this storage as follows:\n\n .. code-block:: YAML\n\n resources:\n io_manager:\n config:\n adls2_file_system: my-cool-file-system\n adls2_prefix: good/prefix-for-files-\n """\n adls_resource = init_context.resources.adls2\n adls2_client = adls_resource.adls2_client\n blob_client = adls_resource.blob_client\n pickled_io_manager = PickledObjectADLS2IOManager(\n init_context.resource_config["adls2_file_system"],\n adls2_client,\n blob_client,\n init_context.resource_config.get("adls2_prefix"),\n )\n return pickled_io_manager
\n\n\nclass PickledObjectADLS2AssetIOManager(PickledObjectADLS2IOManager):\n def _get_path(self, context):\n return "/".join([self.prefix, *context.asset_key.path])\n\n\n
[docs]@io_manager(\n config_schema={\n "adls2_file_system": Field(StringSource, description="ADLS Gen2 file system name"),\n "adls2_prefix": Field(StringSource, is_required=False, default_value="dagster"),\n },\n required_resource_keys={"adls2"},\n)\ndef adls2_pickle_asset_io_manager(init_context):\n """Persistent IO manager using Azure Data Lake Storage Gen2 for storage, meant for use with\n software-defined assets.\n\n Each asset is assigned to a single filesystem path, so subsequent materializations of an asset\n will overwrite previous materializations of that asset.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for ADLS and the backing\n container.\n\n Attach this resource definition to your job in order to make it available all your ops:\n\n .. code-block:: python\n\n asset_group = AssetGroup(\n assets...,\n resource_defs={'io_manager': adls2_pickle_io_manager, "adls2": adls2_resource, ...}),\n )\n\n You may configure this storage as follows:\n\n .. code-block:: YAML\n\n resources:\n io_manager:\n config:\n adls2_file_system: my-cool-file-system\n adls2_prefix: good/prefix-for-files\n """\n adls_resource = init_context.resources.adls2\n adls2_client = adls_resource.adls2_client\n blob_client = adls_resource.blob_client\n pickled_io_manager = PickledObjectADLS2AssetIOManager(\n init_context.resource_config["adls2_file_system"],\n adls2_client,\n blob_client,\n init_context.resource_config.get("adls2_prefix"),\n )\n return pickled_io_manager
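As with the S3 variants, a plain-Python sketch of the keys and URIs the ADLS2 IO managers above produce; the run id, output identifier, and asset key parts are hypothetical.

.. code-block:: python

    prefix, account, file_system = "dagster", "myaccount", "my-filesystem"

    # adls2_pickle_io_manager: <prefix>/storage/<run_id>/files/<output identifier parts>
    run_scoped_key = "/".join([prefix, "storage", "8f1b6d2e", "files", "my_op", "result"])

    # adls2_pickle_asset_io_manager: <prefix>/<asset key path parts>
    asset_scoped_key = "/".join([prefix, "analytics", "daily_orders"])

    # _uri_for_key renders keys with the abfss:// scheme by default:
    uri = f"abfss://{file_system}@{account}.dfs.core.windows.net/{run_scoped_key}"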
\n
", "current_page_name": "_modules/dagster_azure/adls2/io_manager", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_azure.adls2.io_manager"}, "resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_azure.adls2.resources

\nfrom dagster_azure.blob.utils import create_blob_client\n\nfrom dagster import Field, Selector, StringSource, resource\nfrom dagster.utils.merger import merge_dicts\n\nfrom .file_manager import ADLS2FileManager\nfrom .utils import create_adls2_client\n\nADLS2_CLIENT_CONFIG = {\n    "storage_account": Field(StringSource, description="The storage account name."),\n    "credential": Field(\n        Selector(\n            {\n                "sas": Field(StringSource, description="SAS token for the account."),\n                "key": Field(StringSource, description="Shared Access Key for the account"),\n            }\n        ),\n        description="The credentials with which to authenticate.",\n    ),\n}\n\n\n
[docs]@resource(ADLS2_CLIENT_CONFIG)\ndef adls2_resource(context):\n """Resource that gives ops access to Azure Data Lake Storage Gen2.\n\n The underlying client is a :py:class:`~azure.storage.filedatalake.DataLakeServiceClient`.\n\n Attach this resource definition to a :py:class:`~dagster.JobDefinition` in order to make it\n available to your ops.\n\n Example:\n\n .. code-block:: python\n\n from dagster import job, op\n from dagster_azure.adls2 import adls2_resource\n\n @op(required_resource_keys={'adls2'})\n def example_adls2_op(context):\n return list(context.resources.adls2.adls2_client.list_file_systems())\n\n @job(resource_defs={"adls2": adls2_resource})\n def my_job():\n example_adls2_op()\n\n Note that your ops must also declare that they require this resource with\n `required_resource_keys`, or it will not be initialized for the execution of their compute\n functions.\n\n You may pass credentials to this resource using either a SAS token or a key, using\n environment variables if desired:\n\n .. code-block:: YAML\n\n resources:\n adls2:\n config:\n storage_account: my_storage_account\n # str: The storage account name.\n credential:\n sas: my_sas_token\n # str: the SAS token for the account.\n key:\n env: AZURE_DATA_LAKE_STORAGE_KEY\n # str: The shared access key for the account.\n """\n return _adls2_resource_from_config(context.resource_config)
\n\n\n
[docs]@resource(\n merge_dicts(\n ADLS2_CLIENT_CONFIG,\n {\n "adls2_file_system": Field(StringSource, description="ADLS Gen2 file system name"),\n "adls2_prefix": Field(StringSource, is_required=False, default_value="dagster"),\n },\n )\n)\ndef adls2_file_manager(context):\n """FileManager that provides abstract access to ADLS2.\n\n Implements the :py:class:`~dagster.core.storage.file_manager.FileManager` API.\n """\n adls2_client = _adls2_resource_from_config(context.resource_config).adls2_client\n\n return ADLS2FileManager(\n adls2_client=adls2_client,\n file_system=context.resource_config["adls2_file_system"],\n prefix=context.resource_config["adls2_prefix"],\n )
\n\n\nclass ADLS2Resource:\n """Resource containing clients to access Azure Data Lake Storage Gen2.\n\n Contains a client for both the Data Lake and Blob APIs, to work around the limitations\n of each.\n """\n\n def __init__(self, storage_account, credential):\n self._adls2_client = create_adls2_client(storage_account, credential)\n self._blob_client = create_blob_client(storage_account, credential)\n\n @property\n def adls2_client(self):\n return self._adls2_client\n\n @property\n def blob_client(self):\n return self._blob_client\n\n\ndef _adls2_resource_from_config(config):\n """\n Args:\n config: A configuration containing the fields in ADLS2_CLIENT_CONFIG.\n\n Returns: An adls2 client.\n """\n storage_account = config["storage_account"]\n credential = config["credential"].copy().popitem()[1]\n return ADLS2Resource(storage_account, credential)\n
", "current_page_name": "_modules/dagster_azure/adls2/resources", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_azure.adls2.resources"}}, "blob": {"compute_log_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_azure.blob.compute_log_manager

\nimport itertools\nimport os\nfrom contextlib import contextmanager\n\nfrom dagster import Field, StringSource, check, seven\nfrom dagster.core.storage.compute_log_manager import (\n    MAX_BYTES_FILE_READ,\n    ComputeIOType,\n    ComputeLogFileData,\n    ComputeLogManager,\n)\nfrom dagster.core.storage.local_compute_log_manager import IO_TYPE_EXTENSION, LocalComputeLogManager\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster.utils import ensure_dir, ensure_file\n\nfrom .utils import create_blob_client, generate_blob_sas\n\n\n
[docs]class AzureBlobComputeLogManager(ComputeLogManager, ConfigurableClass):\n """Logs op compute function stdout and stderr to Azure Blob Storage.\n\n This is also compatible with Azure Data Lake Storage.\n\n Users should not instantiate this class directly. Instead, use a YAML block in ``dagster.yaml``\n such as the following:\n\n .. code-block:: YAML\n\n compute_logs:\n module: dagster_azure.blob.compute_log_manager\n class: AzureBlobComputeLogManager\n config:\n storage_account: my-storage-account\n container: my-container\n credential: sas-token-or-secret-key\n prefix: "dagster-test-"\n local_dir: "/tmp/cool"\n\n Args:\n storage_account (str): The storage account name to which to log.\n container (str): The container (or ADLS2 filesystem) to which to log.\n secret_key (str): Secret key for the storage account. SAS tokens are not\n supported because we need a secret key to generate a SAS token for a download URL.\n local_dir (Optional[str]): Path to the local directory in which to stage logs. Default:\n ``dagster.seven.get_system_temp_directory()``.\n prefix (Optional[str]): Prefix for the log file keys.\n inst_data (Optional[ConfigurableClassData]): Serializable representation of the compute\n log manager when newed up from config.\n """\n\n def __init__(\n self,\n storage_account,\n container,\n secret_key,\n local_dir=None,\n inst_data=None,\n prefix="dagster",\n ):\n self._storage_account = check.str_param(storage_account, "storage_account")\n self._container = check.str_param(container, "container")\n self._blob_prefix = check.str_param(prefix, "prefix")\n check.str_param(secret_key, "secret_key")\n\n self._blob_client = create_blob_client(storage_account, secret_key)\n self._container_client = self._blob_client.get_container_client(container)\n self._download_urls = {}\n\n # proxy calls to local compute log manager (for subscriptions, etc)\n if not local_dir:\n local_dir = seven.get_system_temp_directory()\n\n self.local_manager = LocalComputeLogManager(local_dir)\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @contextmanager\n def _watch_logs(self, pipeline_run, step_key=None):\n # proxy watching to the local compute log manager, interacting with the filesystem\n with self.local_manager._watch_logs( # pylint: disable=protected-access\n pipeline_run, step_key\n ):\n yield\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {\n "storage_account": StringSource,\n "container": StringSource,\n "secret_key": StringSource,\n "local_dir": Field(StringSource, is_required=False),\n "prefix": Field(StringSource, is_required=False, default_value="dagster"),\n }\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return AzureBlobComputeLogManager(inst_data=inst_data, **config_value)\n\n def get_local_path(self, run_id, key, io_type):\n return self.local_manager.get_local_path(run_id, key, io_type)\n\n def on_watch_start(self, pipeline_run, step_key):\n self.local_manager.on_watch_start(pipeline_run, step_key)\n\n def on_watch_finish(self, pipeline_run, step_key):\n self.local_manager.on_watch_finish(pipeline_run, step_key)\n key = self.local_manager.get_key(pipeline_run, step_key)\n self._upload_from_local(pipeline_run.run_id, key, ComputeIOType.STDOUT)\n self._upload_from_local(pipeline_run.run_id, key, ComputeIOType.STDERR)\n\n def is_watch_completed(self, run_id, key):\n return self.local_manager.is_watch_completed(run_id, key)\n\n def download_url(self, run_id, key, 
io_type):\n if not self.is_watch_completed(run_id, key):\n return self.local_manager.download_url(run_id, key, io_type)\n key = self._blob_key(run_id, key, io_type)\n if key in self._download_urls:\n return self._download_urls[key]\n blob = self._container_client.get_blob_client(key)\n sas = generate_blob_sas(\n self._storage_account,\n self._container,\n key,\n account_key=self._blob_client.credential.account_key,\n )\n url = blob.url + sas\n self._download_urls[key] = url\n return url\n\n def read_logs_file(self, run_id, key, io_type, cursor=0, max_bytes=MAX_BYTES_FILE_READ):\n if self._should_download(run_id, key, io_type):\n self._download_to_local(run_id, key, io_type)\n data = self.local_manager.read_logs_file(run_id, key, io_type, cursor, max_bytes)\n return self._from_local_file_data(run_id, key, io_type, data)\n\n def on_subscribe(self, subscription):\n self.local_manager.on_subscribe(subscription)\n\n def on_unsubscribe(self, subscription):\n self.local_manager.on_unsubscribe(subscription)\n\n def _should_download(self, run_id, key, io_type):\n local_path = self.get_local_path(run_id, key, io_type)\n if os.path.exists(local_path):\n return False\n blob_objects = self._container_client.list_blobs(self._blob_key(run_id, key, io_type))\n # Limit the generator to avoid paging since we only need one element\n # to return True\n limited_blob_objects = itertools.islice(blob_objects, 1)\n return len(list(limited_blob_objects)) > 0\n\n def _from_local_file_data(self, run_id, key, io_type, local_file_data):\n is_complete = self.is_watch_completed(run_id, key)\n path = (\n "https://{account}.blob.core.windows.net/{container}/{key}".format(\n account=self._storage_account,\n container=self._container,\n key=self._blob_key(run_id, key, io_type),\n )\n if is_complete\n else local_file_data.path\n )\n\n return ComputeLogFileData(\n path,\n local_file_data.data,\n local_file_data.cursor,\n local_file_data.size,\n self.download_url(run_id, key, io_type),\n )\n\n def _upload_from_local(self, run_id, key, io_type):\n path = self.get_local_path(run_id, key, io_type)\n ensure_file(path)\n key = self._blob_key(run_id, key, io_type)\n with open(path, "rb") as data:\n blob = self._container_client.get_blob_client(key)\n blob.upload_blob(data)\n\n def _download_to_local(self, run_id, key, io_type):\n path = self.get_local_path(run_id, key, io_type)\n ensure_dir(os.path.dirname(path))\n key = self._blob_key(run_id, key, io_type)\n with open(path, "wb") as fileobj:\n blob = self._container_client.get_blob_client(key)\n blob.download_blob().readinto(fileobj)\n\n def _blob_key(self, run_id, key, io_type):\n check.inst_param(io_type, "io_type", ComputeIOType)\n extension = IO_TYPE_EXTENSION[io_type]\n paths = [\n self._blob_prefix,\n "storage",\n run_id,\n "compute_logs",\n "{}.{}".format(key, extension),\n ]\n return "/".join(paths) # blob path delimiter\n\n def dispose(self):\n self.local_manager.dispose()
\n
", "current_page_name": "_modules/dagster_azure/blob/compute_log_manager", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_azure.blob.compute_log_manager"}}}, "dagster_celery": {"executor": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_celery.executor

\nfrom dagster import (\n    Executor,\n    Field,\n    Noneable,\n    Permissive,\n    StringSource,\n    check,\n    executor,\n    multiple_process_executor_requirements,\n)\nfrom dagster.core.execution.retries import RetryMode, get_retries_config\nfrom dagster.grpc.types import ExecuteStepArgs\nfrom dagster.serdes import pack_value\n\nfrom .config import DEFAULT_CONFIG, dict_wrapper\nfrom .defaults import broker_url, result_backend\n\nCELERY_CONFIG = {\n    "broker": Field(\n        Noneable(StringSource),\n        is_required=False,\n        description=(\n            "The URL of the Celery broker. Default: "\n            "'pyamqp://guest@{os.getenv('DAGSTER_CELERY_BROKER_HOST',"\n            "'localhost')}//'."\n        ),\n    ),\n    "backend": Field(\n        Noneable(StringSource),\n        is_required=False,\n        default_value="rpc://",\n        description="The URL of the Celery results backend. Default: 'rpc://'.",\n    ),\n    "include": Field(\n        [str], is_required=False, description="List of modules every worker should import"\n    ),\n    "config_source": Field(\n        Noneable(Permissive()),\n        is_required=False,\n        description="Additional settings for the Celery app.",\n    ),\n    "retries": get_retries_config(),\n}\n\n\n
[docs]@executor(\n name="celery",\n config_schema=CELERY_CONFIG,\n requirements=multiple_process_executor_requirements(),\n)\ndef celery_executor(init_context):\n """Celery-based executor.\n\n The Celery executor exposes config settings for the underlying Celery app under\n the ``config_source`` key. This config corresponds to the "new lowercase settings" introduced\n in Celery version 4.0 and the object constructed from config will be passed to the\n :py:class:`celery.Celery` constructor as its ``config_source`` argument.\n (See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)\n\n The executor also exposes the ``broker``, `backend`, and ``include`` arguments to the\n :py:class:`celery.Celery` constructor.\n\n In the most common case, you may want to modify the ``broker`` and ``backend`` (e.g., to use\n Redis instead of RabbitMQ). We expect that ``config_source`` will be less frequently\n modified, but that when solid executions are especially fast or slow, or when there are\n different requirements around idempotence or retry, it may make sense to execute jobs\n with variations on these settings.\n\n To use the `celery_executor`, set it as the `executor_def` when defining a job:\n\n .. code-block:: python\n\n from dagster import job\n from dagster_celery import celery_executor\n\n @job(executor_def=celery_executor)\n def celery_enabled_job():\n pass\n\n Then you can configure the executor as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n broker: 'pyamqp://guest@localhost//' # Optional[str]: The URL of the Celery broker\n backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n config_source: # Dict[str, Any]: Any additional parameters to pass to the\n #... # Celery workers. This dict will be passed as the `config_source`\n #... # argument of celery.Celery().\n\n Note that the YAML you provide here must align with the configuration with which the Celery\n workers on which you hope to run were started. If, for example, you point the executor at a\n different broker than the one your workers are listening to, the workers will never be able to\n pick up tasks for execution.\n """\n\n return CeleryExecutor(\n broker=init_context.executor_config.get("broker"),\n backend=init_context.executor_config.get("backend"),\n config_source=init_context.executor_config.get("config_source"),\n include=init_context.executor_config.get("include"),\n retries=RetryMode.from_config(init_context.executor_config["retries"]),\n )
\n\n\ndef _submit_task(app, plan_context, step, queue, priority, known_state):\n from .tasks import create_task\n\n execute_step_args = ExecuteStepArgs(\n pipeline_origin=plan_context.reconstructable_pipeline.get_python_origin(),\n pipeline_run_id=plan_context.pipeline_run.run_id,\n step_keys_to_execute=[step.key],\n instance_ref=plan_context.instance.get_ref(),\n retry_mode=plan_context.executor.retries.for_inner_plan(),\n known_state=known_state,\n )\n\n task = create_task(app)\n task_signature = task.si(\n execute_step_args_packed=pack_value(execute_step_args),\n executable_dict=plan_context.reconstructable_pipeline.to_dict(),\n )\n return task_signature.apply_async(\n priority=priority,\n queue=queue,\n routing_key="{queue}.execute_plan".format(queue=queue),\n )\n\n\nclass CeleryExecutor(Executor):\n def __init__(\n self,\n retries,\n broker=None,\n backend=None,\n include=None,\n config_source=None,\n ):\n self.broker = check.opt_str_param(broker, "broker", default=broker_url)\n self.backend = check.opt_str_param(backend, "backend", default=result_backend)\n self.include = check.opt_list_param(include, "include", of_type=str)\n self.config_source = dict_wrapper(\n dict(DEFAULT_CONFIG, **check.opt_dict_param(config_source, "config_source"))\n )\n self._retries = check.inst_param(retries, "retries", RetryMode)\n\n @property\n def retries(self):\n return self._retries\n\n def execute(self, plan_context, execution_plan):\n from .core_execution_loop import core_celery_execution_loop\n\n return core_celery_execution_loop(\n plan_context, execution_plan, step_execution_fn=_submit_task\n )\n\n @staticmethod\n def for_cli(broker=None, backend=None, include=None, config_source=None):\n return CeleryExecutor(\n retries=RetryMode(RetryMode.DISABLED),\n broker=broker,\n backend=backend,\n include=include,\n config_source=config_source,\n )\n\n def app_args(self):\n return {\n "broker": self.broker,\n "backend": self.backend,\n "include": self.include,\n "config_source": self.config_source,\n "retries": self.retries,\n }\n
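A hedged sketch of the Python-dict form of the executor config shown in the YAML above, e.g. for supplying as ``run_config`` when launching a run; the broker/backend URLs and the Celery settings under ``config_source`` are hypothetical and must match how your workers were started.

.. code-block:: python

    run_config = {
        "execution": {
            "config": {
                "broker": "redis://localhost:6379/0",
                "backend": "redis://localhost:6379/1",
                "include": ["my_module"],
                "config_source": {"task_acks_late": True},
            }
        }
    }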
", "current_page_name": "_modules/dagster_celery/executor", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_celery.executor"}}, "dagster_celery_docker": {"executor": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_celery_docker.executor

\nimport json\nimport os\n\nimport docker.client\nfrom dagster_celery.config import DEFAULT_CONFIG, dict_wrapper\nfrom dagster_celery.core_execution_loop import DELEGATE_MARKER, core_celery_execution_loop\nfrom dagster_celery.defaults import broker_url, result_backend\nfrom dagster_celery.executor import CELERY_CONFIG\n\nfrom dagster import (\n    DagsterInstance,\n    Executor,\n    Field,\n    MetadataEntry,\n    StringSource,\n    check,\n    executor,\n    multiple_process_executor_requirements,\n)\nfrom dagster.cli.api import ExecuteStepArgs\nfrom dagster.core.events import EngineEventData\nfrom dagster.core.execution.retries import RetryMode\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.serdes import pack_value, serialize_dagster_namedtuple, unpack_value\nfrom dagster.utils import merge_dicts\n\nCELERY_DOCKER_CONFIG_KEY = "celery-docker"\n\n\ndef celery_docker_config():\n    additional_config = {\n        "docker": Field(\n            {\n                "image": Field(\n                    StringSource,\n                    is_required=False,\n                    description="The docker image to be used for step execution.",\n                ),\n                "registry": Field(\n                    {\n                        "url": Field(StringSource),\n                        "username": Field(StringSource),\n                        "password": Field(StringSource),\n                    },\n                    is_required=False,\n                    description="Information for using a non local/public docker registry",\n                ),\n                "env_vars": Field(\n                    [str],\n                    is_required=False,\n                    description="The list of environment variables names to forward from the celery worker in to the docker container",\n                ),\n                "network": Field(\n                    str,\n                    is_required=False,\n                    description="Name of the network this container will be connected to at creation time",\n                ),\n            },\n            is_required=True,\n            description="The configuration for interacting with docker in the celery worker.",\n        ),\n    }\n\n    cfg = merge_dicts(CELERY_CONFIG, additional_config)\n    return cfg\n\n\n
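In run config, the ``docker`` section defined by ``celery_docker_config()`` above accepts a dictionary shaped as follows; every concrete value here is a placeholder:

.. code-block:: python

    # Example shape of the "docker" section of the celery-docker executor's
    # run config. The {"env": ...} form is how StringSource fields read a
    # value from an environment variable.
    docker_section = {
        "image": "my_repo.com/image_name:latest",
        "registry": {
            "url": "my_repo.com",
            "username": "my_user",
            "password": {"env": "DOCKER_PASSWORD"},
        },
        # Names of environment variables forwarded from the Celery worker
        # into the step container.
        "env_vars": ["DAGSTER_HOME"],
        "network": "host",
    }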
[docs]@executor(\n name=CELERY_DOCKER_CONFIG_KEY,\n config_schema=celery_docker_config(),\n requirements=multiple_process_executor_requirements(),\n)\ndef celery_docker_executor(init_context):\n """Celery-based executor which launches tasks in docker containers.\n\n The Celery executor exposes config settings for the underlying Celery app under\n the ``config_source`` key. This config corresponds to the "new lowercase settings" introduced\n in Celery version 4.0 and the object constructed from config will be passed to the\n :py:class:`celery.Celery` constructor as its ``config_source`` argument.\n (See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)\n\n The executor also exposes the ``broker``, `backend`, and ``include`` arguments to the\n :py:class:`celery.Celery` constructor.\n\n In the most common case, you may want to modify the ``broker`` and ``backend`` (e.g., to use\n Redis instead of RabbitMQ). We expect that ``config_source`` will be less frequently\n modified, but that when op executions are especially fast or slow, or when there are\n different requirements around idempotence or retry, it may make sense to execute jobs\n with variations on these settings.\n\n To use the `celery_docker_executor`, set it as the `executor_def` when defining a job:\n\n .. code-block:: python\n\n from dagster import job\n from dagster_celery_docker.executor import celery_executor\n\n @job(executor_def=celery_docker_executor)\n def celery_enabled_job():\n pass\n\n Then you can configure the executor as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n docker:\n image: 'my_repo.com/image_name:latest'\n registry:\n url: 'my_repo.com'\n username: 'my_user'\n password: {env: 'DOCKER_PASSWORD'}\n env_vars: ["DAGSTER_HOME"] # environment vars to pass from celery worker to docker\n broker: 'pyamqp://guest@localhost//' # Optional[str]: The URL of the Celery broker\n backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n config_source: # Dict[str, Any]: Any additional parameters to pass to the\n #... # Celery workers. This dict will be passed as the `config_source`\n #... # argument of celery.Celery().\n\n Note that the YAML you provide here must align with the configuration with which the Celery\n workers on which you hope to run were started. If, for example, you point the executor at a\n different broker than the one your workers are listening to, the workers will never be able to\n pick up tasks for execution.\n\n In deployments where the celery_k8s_job_executor is used all appropriate celery and dagster_celery\n commands must be invoked with the `-A dagster_celery_docker.app` argument.\n """\n\n exc_cfg = init_context.executor_config\n\n return CeleryDockerExecutor(\n broker=exc_cfg.get("broker"),\n backend=exc_cfg.get("backend"),\n config_source=exc_cfg.get("config_source"),\n include=exc_cfg.get("include"),\n retries=RetryMode.from_config(exc_cfg.get("retries")),\n docker_config=exc_cfg.get("docker"),\n )
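A self-contained version of the example above, using the ``celery_docker_executor`` defined in this module:

.. code-block:: python

    from dagster import job, op
    from dagster_celery_docker.executor import celery_docker_executor


    @op
    def ping():
        return "pong"


    @job(executor_def=celery_docker_executor)
    def celery_enabled_job():
        ping()

The ``docker``, ``broker``, and related settings are then supplied at launch time under ``execution: config:``, as in the YAML shown in the docstring.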
\n\n\nclass CeleryDockerExecutor(Executor):\n def __init__(\n self,\n retries,\n docker_config,\n broker=None,\n backend=None,\n include=None,\n config_source=None,\n ):\n self._retries = check.inst_param(retries, "retries", RetryMode)\n self.broker = check.opt_str_param(broker, "broker", default=broker_url)\n self.backend = check.opt_str_param(backend, "backend", default=result_backend)\n self.include = check.opt_list_param(include, "include", of_type=str)\n self.config_source = dict_wrapper(\n dict(DEFAULT_CONFIG, **check.opt_dict_param(config_source, "config_source"))\n )\n self.docker_config = check.dict_param(docker_config, "docker_config")\n\n @property\n def retries(self):\n return self._retries\n\n def execute(self, plan_context, execution_plan):\n\n return core_celery_execution_loop(\n plan_context, execution_plan, step_execution_fn=_submit_task_docker\n )\n\n def app_args(self):\n return {\n "broker": self.broker,\n "backend": self.backend,\n "include": self.include,\n "config_source": self.config_source,\n "retries": self.retries,\n }\n\n\ndef _submit_task_docker(app, plan_context, step, queue, priority, known_state):\n execute_step_args = ExecuteStepArgs(\n pipeline_origin=plan_context.reconstructable_pipeline.get_python_origin(),\n pipeline_run_id=plan_context.pipeline_run.run_id,\n step_keys_to_execute=[step.key],\n instance_ref=plan_context.instance.get_ref(),\n retry_mode=plan_context.executor.retries.for_inner_plan(),\n known_state=known_state,\n )\n\n task = create_docker_task(app)\n task_signature = task.si(\n execute_step_args_packed=pack_value(execute_step_args),\n docker_config=plan_context.executor.docker_config,\n )\n return task_signature.apply_async(\n priority=priority,\n queue=queue,\n routing_key="{queue}.execute_step_docker".format(queue=queue),\n )\n\n\ndef create_docker_task(celery_app, **task_kwargs):\n @celery_app.task(bind=True, name="execute_step_docker", **task_kwargs)\n def _execute_step_docker(\n self,\n execute_step_args_packed,\n docker_config,\n ):\n """Run step execution in a Docker container."""\n execute_step_args = unpack_value(\n check.dict_param(\n execute_step_args_packed,\n "execute_step_args_packed",\n )\n )\n check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)\n\n check.dict_param(docker_config, "docker_config")\n\n instance = DagsterInstance.from_ref(execute_step_args.instance_ref)\n pipeline_run = instance.get_run_by_id(execute_step_args.pipeline_run_id)\n check.inst(\n pipeline_run,\n PipelineRun,\n "Could not load run {}".format(execute_step_args.pipeline_run_id),\n )\n step_keys_str = ", ".join(execute_step_args.step_keys_to_execute)\n\n input_json = serialize_dagster_namedtuple(execute_step_args)\n\n command = "dagster api execute_step {}".format(json.dumps(input_json))\n\n docker_image = (\n docker_config["image"]\n if docker_config.get("image")\n else execute_step_args.pipeline_origin.repository_origin.container_image\n )\n\n if not docker_image:\n raise Exception("No docker image specified by either the job or the repository")\n\n client = docker.client.from_env()\n\n if docker_config.get("registry"):\n client.login(\n registry=docker_config["registry"]["url"],\n username=docker_config["registry"]["username"],\n password=docker_config["registry"]["password"],\n )\n\n # Post event for starting execution\n engine_event = instance.report_engine_event(\n "Executing steps {} in Docker container {}".format(step_keys_str, docker_image),\n pipeline_run,\n EngineEventData(\n [\n MetadataEntry("Step keys", 
value=step_keys_str),\n MetadataEntry("Image", value=docker_image),\n MetadataEntry("Celery worker", value=self.request.hostname),\n ],\n marker_end=DELEGATE_MARKER,\n ),\n CeleryDockerExecutor,\n step_key=execute_step_args.step_keys_to_execute[0],\n )\n\n serialized_events = [serialize_dagster_namedtuple(engine_event)]\n\n docker_env = {}\n if docker_config.get("env_vars"):\n docker_env = {env_name: os.getenv(env_name) for env_name in docker_config["env_vars"]}\n\n try:\n docker_response = client.containers.run(\n docker_image,\n command=command,\n detach=False,\n auto_remove=True,\n # pass through this worker's environment for things like AWS creds etc.\n environment=docker_env,\n network=docker_config.get("network", None),\n )\n\n res = docker_response.decode("utf-8")\n except docker.errors.ContainerError as err:\n entries = [MetadataEntry("Job image", value=docker_image)]\n if err.stderr is not None:\n entries.append(MetadataEntry("Docker stderr", value=err.stderr))\n\n instance.report_engine_event(\n "Failed to run steps {} in Docker container {}".format(step_keys_str, docker_image),\n pipeline_run,\n EngineEventData(entries),\n CeleryDockerExecutor,\n step_key=execute_step_args.step_keys_to_execute[0],\n )\n raise\n else:\n if res is None:\n raise Exception("No response from execute_step in CeleryDockerExecutor")\n\n serialized_events += [event for event in res.split("\\n") if event]\n\n return serialized_events\n\n return _execute_step_docker\n
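The container launch performed by ``_execute_step_docker`` above reduces to the standard docker-py pattern: build a client from the environment, optionally authenticate against a registry, then run an image to completion and capture its output. A stripped-down sketch with a placeholder image and command:

.. code-block:: python

    import os

    import docker


    def run_step_container(image, command, env_var_names, network=None):
        """Run a one-off container to completion and return its decoded logs."""
        client = docker.from_env()

        # Forward selected environment variables from this process into the
        # container, mirroring the executor's env_vars handling.
        environment = {name: os.getenv(name) for name in env_var_names}

        output = client.containers.run(
            image,
            command=command,
            detach=False,      # block until the container exits
            auto_remove=True,  # remove the container afterwards
            environment=environment,
            network=network,
        )
        return output.decode("utf-8")


    if __name__ == "__main__":
        # Placeholder image and command; the real executor runs
        # `dagster api execute_step <serialized args>` inside the container.
        print(run_step_container("alpine:3.15", "echo hello", ["DAGSTER_HOME"]))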
", "current_page_name": "_modules/dagster_celery_docker/executor", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_celery_docker.executor"}}, "dagster_celery_k8s": {"executor": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_celery_k8s.executor

\nimport logging\nimport os\nimport sys\nimport time\n\nimport kubernetes\nfrom dagster_celery.config import DEFAULT_CONFIG, dict_wrapper\nfrom dagster_celery.core_execution_loop import DELEGATE_MARKER\nfrom dagster_celery.defaults import broker_url, result_backend\nfrom dagster_k8s import DagsterK8sJobConfig, construct_dagster_k8s_job\nfrom dagster_k8s.client import (\n    DagsterK8sAPIRetryLimitExceeded,\n    DagsterK8sError,\n    DagsterK8sPipelineStatusException,\n    DagsterK8sTimeoutError,\n    DagsterK8sUnrecoverableAPIError,\n)\nfrom dagster_k8s.job import (\n    UserDefinedDagsterK8sConfig,\n    get_k8s_job_name,\n    get_user_defined_k8s_config,\n)\nfrom dagster_k8s.utils import (\n    delete_job,\n    filter_dagster_events_from_pod_logs,\n    get_pod_names_in_job,\n    retrieve_pod_logs,\n    wait_for_job_success,\n)\n\nfrom dagster import (\n    DagsterEvent,\n    DagsterEventType,\n    DagsterInstance,\n    Executor,\n    MetadataEntry,\n    check,\n    executor,\n    multiple_process_executor_requirements,\n)\nfrom dagster.cli.api import ExecuteStepArgs\nfrom dagster.core.errors import DagsterUnmetExecutorRequirementsError\nfrom dagster.core.events import EngineEventData\nfrom dagster.core.events.log import EventLogEntry\nfrom dagster.core.execution.plan.objects import StepFailureData, UserFailureData\nfrom dagster.core.execution.retries import RetryMode\nfrom dagster.core.storage.pipeline_run import PipelineRun, PipelineRunStatus\nfrom dagster.serdes import pack_value, serialize_dagster_namedtuple, unpack_value\nfrom dagster.utils.error import serializable_error_info_from_exc_info\n\nfrom .config import CELERY_K8S_CONFIG_KEY, celery_k8s_executor_config\nfrom .launcher import CeleryK8sRunLauncher\n\n\n
[docs]@executor(\n name=CELERY_K8S_CONFIG_KEY,\n config_schema=celery_k8s_executor_config(),\n requirements=multiple_process_executor_requirements(),\n)\ndef celery_k8s_job_executor(init_context):\n """Celery-based executor which launches tasks as Kubernetes Jobs.\n\n The Celery executor exposes config settings for the underlying Celery app under\n the ``config_source`` key. This config corresponds to the "new lowercase settings" introduced\n in Celery version 4.0 and the object constructed from config will be passed to the\n :py:class:`celery.Celery` constructor as its ``config_source`` argument.\n (See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)\n\n The executor also exposes the ``broker``, `backend`, and ``include`` arguments to the\n :py:class:`celery.Celery` constructor.\n\n In the most common case, you may want to modify the ``broker`` and ``backend`` (e.g., to use\n Redis instead of RabbitMQ). We expect that ``config_source`` will be less frequently\n modified, but that when op executions are especially fast or slow, or when there are\n different requirements around idempotence or retry, it may make sense to execute dagster jobs\n with variations on these settings.\n\n To use the `celery_k8s_job_executor`, set it as the `executor_def` when defining a job:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-celery-k8s/dagster_celery_k8s_tests/example_celery_mode_def.py\n :language: python\n\n Then you can configure the executor as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n job_image: 'my_repo.com/image_name:latest'\n job_namespace: 'some-namespace'\n broker: 'pyamqp://guest@localhost//' # Optional[str]: The URL of the Celery broker\n backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n config_source: # Dict[str, Any]: Any additional parameters to pass to the\n #... # Celery workers. This dict will be passed as the `config_source`\n #... # argument of celery.Celery().\n\n Note that the YAML you provide here must align with the configuration with which the Celery\n workers on which you hope to run were started. 
If, for example, you point the executor at a\n different broker than the one your workers are listening to, the workers will never be able to\n pick up tasks for execution.\n\n In deployments where the celery_k8s_job_executor is used all appropriate celery and dagster_celery\n commands must be invoked with the `-A dagster_celery_k8s.app` argument.\n """\n\n run_launcher = init_context.instance.run_launcher\n exc_cfg = init_context.executor_config\n\n if not isinstance(run_launcher, CeleryK8sRunLauncher):\n raise DagsterUnmetExecutorRequirementsError(\n "This engine is only compatible with a CeleryK8sRunLauncher; configure the "\n "CeleryK8sRunLauncher on your instance to use it.",\n )\n\n job_config = run_launcher.get_k8s_job_config(\n job_image=exc_cfg.get("job_image") or os.getenv("DAGSTER_CURRENT_IMAGE"), exc_config=exc_cfg\n )\n\n # Set on the instance but overrideable here\n broker = run_launcher.broker or exc_cfg.get("broker")\n backend = run_launcher.backend or exc_cfg.get("backend")\n config_source = run_launcher.config_source or exc_cfg.get("config_source")\n include = run_launcher.include or exc_cfg.get("include")\n retries = run_launcher.retries or RetryMode.from_config(exc_cfg.get("retries"))\n\n return CeleryK8sJobExecutor(\n broker=broker,\n backend=backend,\n config_source=config_source,\n include=include,\n retries=retries,\n job_config=job_config,\n job_namespace=exc_cfg.get("job_namespace"),\n load_incluster_config=exc_cfg.get("load_incluster_config"),\n kubeconfig_file=exc_cfg.get("kubeconfig_file"),\n repo_location_name=exc_cfg.get("repo_location_name"),\n job_wait_timeout=exc_cfg.get("job_wait_timeout"),\n )
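As a concrete sketch of attaching this executor to a job (the op is a placeholder):

.. code-block:: python

    from dagster import job, op
    from dagster_celery_k8s.executor import celery_k8s_job_executor


    @op
    def ping():
        return "pong"


    @job(executor_def=celery_k8s_job_executor)
    def celery_k8s_enabled_job():
        ping()

Note that, per the check above, runs of such a job must be launched on an instance whose run launcher is a ``CeleryK8sRunLauncher``; otherwise ``DagsterUnmetExecutorRequirementsError`` is raised.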
\n\n\nclass CeleryK8sJobExecutor(Executor):\n def __init__(\n self,\n retries,\n broker=None,\n backend=None,\n include=None,\n config_source=None,\n job_config=None,\n job_namespace=None,\n load_incluster_config=False,\n kubeconfig_file=None,\n repo_location_name=None,\n job_wait_timeout=None,\n ):\n\n if load_incluster_config:\n check.invariant(\n kubeconfig_file is None,\n "`kubeconfig_file` is set but `load_incluster_config` is True.",\n )\n else:\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n\n self._retries = check.inst_param(retries, "retries", RetryMode)\n self.broker = check.opt_str_param(broker, "broker", default=broker_url)\n self.backend = check.opt_str_param(backend, "backend", default=result_backend)\n self.include = check.opt_list_param(include, "include", of_type=str)\n self.config_source = dict_wrapper(\n dict(DEFAULT_CONFIG, **check.opt_dict_param(config_source, "config_source"))\n )\n self.job_config = check.inst_param(job_config, "job_config", DagsterK8sJobConfig)\n self.job_namespace = check.opt_str_param(job_namespace, "job_namespace", default="default")\n\n self.load_incluster_config = check.bool_param(\n load_incluster_config, "load_incluster_config"\n )\n\n self.kubeconfig_file = check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n self.repo_location_name = check.opt_str_param(repo_location_name, "repo_location_name")\n self.job_wait_timeout = check.float_param(job_wait_timeout, "job_wait_timeout")\n\n @property\n def retries(self):\n return self._retries\n\n def execute(self, plan_context, execution_plan):\n from dagster_celery.core_execution_loop import core_celery_execution_loop\n\n return core_celery_execution_loop(\n plan_context, execution_plan, step_execution_fn=_submit_task_k8s_job\n )\n\n def app_args(self):\n return {\n "broker": self.broker,\n "backend": self.backend,\n "include": self.include,\n "config_source": self.config_source,\n "retries": self.retries,\n }\n\n\ndef _submit_task_k8s_job(app, plan_context, step, queue, priority, known_state):\n user_defined_k8s_config = get_user_defined_k8s_config(step.tags)\n\n pipeline_origin = plan_context.reconstructable_pipeline.get_python_origin()\n\n execute_step_args = ExecuteStepArgs(\n pipeline_origin=pipeline_origin,\n pipeline_run_id=plan_context.pipeline_run.run_id,\n step_keys_to_execute=[step.key],\n instance_ref=plan_context.instance.get_ref(),\n retry_mode=plan_context.executor.retries.for_inner_plan(),\n known_state=known_state,\n should_verify_step=True,\n )\n\n job_config = plan_context.executor.job_config\n if not job_config.job_image:\n job_config = job_config.with_image(pipeline_origin.repository_origin.container_image)\n\n if not job_config.job_image:\n raise Exception("No image included in either executor config or the dagster job")\n\n task = create_k8s_job_task(app)\n task_signature = task.si(\n execute_step_args_packed=pack_value(execute_step_args),\n job_config_dict=job_config.to_dict(),\n job_namespace=plan_context.executor.job_namespace,\n user_defined_k8s_config_dict=user_defined_k8s_config.to_dict(),\n load_incluster_config=plan_context.executor.load_incluster_config,\n job_wait_timeout=plan_context.executor.job_wait_timeout,\n kubeconfig_file=plan_context.executor.kubeconfig_file,\n )\n\n return task_signature.apply_async(\n priority=priority,\n queue=queue,\n routing_key="{queue}.execute_step_k8s_job".format(queue=queue),\n )\n\n\ndef construct_step_failure_event_and_handle(pipeline_run, step_key, err, instance):\n step_failure_event = DagsterEvent(\n 
event_type_value=DagsterEventType.STEP_FAILURE.value,\n pipeline_name=pipeline_run.pipeline_name,\n step_key=step_key,\n event_specific_data=StepFailureData(\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n user_failure_data=UserFailureData(label="K8sError"),\n ),\n )\n event_record = EventLogEntry(\n user_message=str(err),\n level=logging.ERROR,\n pipeline_name=pipeline_run.pipeline_name,\n run_id=pipeline_run.run_id,\n error_info=None,\n step_key=step_key,\n timestamp=time.time(),\n dagster_event=step_failure_event,\n )\n instance.handle_new_event(event_record)\n return step_failure_event\n\n\ndef create_k8s_job_task(celery_app, **task_kwargs):\n @celery_app.task(bind=True, name="execute_step_k8s_job", **task_kwargs)\n def _execute_step_k8s_job(\n self,\n execute_step_args_packed,\n job_config_dict,\n job_namespace,\n load_incluster_config,\n job_wait_timeout,\n user_defined_k8s_config_dict=None,\n kubeconfig_file=None,\n ):\n """Run step execution in a K8s job pod."""\n execute_step_args = unpack_value(\n check.dict_param(\n execute_step_args_packed,\n "execute_step_args_packed",\n )\n )\n check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)\n check.invariant(\n len(execute_step_args.step_keys_to_execute) == 1,\n "Celery K8s task executor can only execute 1 step at a time",\n )\n\n # Celery will serialize this as a list\n job_config = DagsterK8sJobConfig.from_dict(job_config_dict)\n check.inst_param(job_config, "job_config", DagsterK8sJobConfig)\n check.str_param(job_namespace, "job_namespace")\n\n check.bool_param(load_incluster_config, "load_incluster_config")\n\n user_defined_k8s_config = UserDefinedDagsterK8sConfig.from_dict(\n user_defined_k8s_config_dict\n )\n check.opt_inst_param(\n user_defined_k8s_config,\n "user_defined_k8s_config",\n UserDefinedDagsterK8sConfig,\n )\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n\n # For when launched via DinD or running the cluster\n if load_incluster_config:\n kubernetes.config.load_incluster_config()\n else:\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n instance = DagsterInstance.from_ref(execute_step_args.instance_ref)\n pipeline_run = instance.get_run_by_id(execute_step_args.pipeline_run_id)\n\n check.inst(\n pipeline_run,\n PipelineRun,\n "Could not load run {}".format(execute_step_args.pipeline_run_id),\n )\n step_key = execute_step_args.step_keys_to_execute[0]\n\n celery_worker_name = self.request.hostname\n celery_pod_name = os.environ.get("HOSTNAME")\n instance.report_engine_event(\n "Task for step {step_key} picked up by Celery".format(step_key=step_key),\n pipeline_run,\n EngineEventData(\n [\n MetadataEntry("Celery worker name", value=celery_worker_name),\n MetadataEntry("Celery worker Kubernetes Pod name", value=celery_pod_name),\n ]\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n\n if pipeline_run.status != PipelineRunStatus.STARTED:\n instance.report_engine_event(\n "Not scheduling step because dagster run status is not STARTED",\n pipeline_run,\n EngineEventData(\n [\n MetadataEntry("Step key", value=step_key),\n ]\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n return []\n\n # Ensure we stay below k8s name length limits\n k8s_name_key = get_k8s_job_name(execute_step_args.pipeline_run_id, step_key)\n\n retry_state = execute_step_args.known_state.get_retry_state()\n\n if retry_state.get_attempt_count(step_key):\n attempt_number = retry_state.get_attempt_count(step_key)\n job_name = "dagster-step-%s-%d" % (k8s_name_key, attempt_number)\n pod_name = 
"dagster-step-%s-%d" % (k8s_name_key, attempt_number)\n else:\n job_name = "dagster-step-%s" % (k8s_name_key)\n pod_name = "dagster-step-%s" % (k8s_name_key)\n\n args = execute_step_args.get_command_args()\n\n job = construct_dagster_k8s_job(\n job_config,\n args,\n job_name,\n user_defined_k8s_config,\n pod_name,\n component="step_worker",\n labels={\n "dagster/job": execute_step_args.pipeline_origin.pipeline_name,\n "dagster/op": step_key,\n "dagster/run-id": execute_step_args.pipeline_run_id,\n },\n )\n\n # Running list of events generated from this task execution\n events = []\n\n # Post event for starting execution\n job_name = job.metadata.name\n engine_event = instance.report_engine_event(\n "Executing step {} in Kubernetes job {}".format(step_key, job_name),\n pipeline_run,\n EngineEventData(\n [\n MetadataEntry("Step key", value=step_key),\n MetadataEntry("Kubernetes Job name", value=job_name),\n MetadataEntry("Job image", value=job_config.job_image),\n MetadataEntry("Image pull policy", value=job_config.image_pull_policy),\n MetadataEntry("Image pull secrets", value=str(job_config.image_pull_secrets)),\n MetadataEntry(\n "Service account name", value=str(job_config.service_account_name)\n ),\n ],\n marker_end=DELEGATE_MARKER,\n ),\n CeleryK8sJobExecutor,\n # validated above that step_keys is length 1, and it is not possible to use ETH or\n # execution plan in this function (Celery K8s workers should not access to user code)\n step_key=step_key,\n )\n events.append(engine_event)\n try:\n kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)\n except kubernetes.client.rest.ApiException as e:\n if e.reason == "Conflict":\n # There is an existing job with the same name so proceed and see if the existing job succeeded\n instance.report_engine_event(\n "Did not create Kubernetes job {} for step {} since job name already "\n "exists, proceeding with existing job.".format(job_name, step_key),\n pipeline_run,\n EngineEventData(\n [\n MetadataEntry("Step key", value=step_key),\n MetadataEntry("Kubernetes Job name", value=job_name),\n ],\n marker_end=DELEGATE_MARKER,\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n else:\n instance.report_engine_event(\n "Encountered unexpected error while creating Kubernetes job {} for step {}, "\n "exiting.".format(job_name, step_key),\n pipeline_run,\n EngineEventData(\n [\n MetadataEntry("Step key", value=step_key),\n ],\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n return []\n\n try:\n wait_for_job_success(\n job_name=job_name,\n namespace=job_namespace,\n instance=instance,\n run_id=execute_step_args.pipeline_run_id,\n wait_timeout=job_wait_timeout,\n )\n except (DagsterK8sError, DagsterK8sTimeoutError) as err:\n step_failure_event = construct_step_failure_event_and_handle(\n pipeline_run, step_key, err, instance=instance\n )\n events.append(step_failure_event)\n except DagsterK8sPipelineStatusException:\n instance.report_engine_event(\n "Terminating Kubernetes Job because dagster run status is not STARTED",\n pipeline_run,\n EngineEventData(\n [\n MetadataEntry("Step key", value=step_key),\n MetadataEntry("Kubernetes Job name", value=job_name),\n MetadataEntry("Kubernetes Job namespace", value=job_namespace),\n ]\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n delete_job(job_name=job_name, namespace=job_namespace)\n return []\n except (\n DagsterK8sUnrecoverableAPIError,\n DagsterK8sAPIRetryLimitExceeded,\n # We shouldn't see 
unwrapped APIExceptions anymore, as they should all be wrapped in\n # a retry boundary. We still catch it here just in case we missed one so that we can\n # report it to the event log\n kubernetes.client.rest.ApiException,\n ) as err:\n instance.report_engine_event(\n "Encountered unexpected error while waiting on Kubernetes job {} for step {}, "\n "exiting.".format(job_name, step_key),\n pipeline_run,\n EngineEventData(\n [\n MetadataEntry("Step key", value=step_key),\n ],\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n return []\n\n try:\n pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)\n except kubernetes.client.rest.ApiException as e:\n instance.report_engine_event(\n "Encountered unexpected error retreiving Pods for Kubernetes job {} for step {}, "\n "exiting.".format(job_name, step_key),\n pipeline_run,\n EngineEventData(\n [\n MetadataEntry("Step key", value=step_key),\n ],\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n return []\n\n # Post engine event for log retrieval\n engine_event = instance.report_engine_event(\n "Retrieving logs from Kubernetes Job pods",\n pipeline_run,\n EngineEventData([MetadataEntry("Pod names", value="\\n".join(pod_names))]),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n events.append(engine_event)\n\n logs = []\n for pod_name in pod_names:\n try:\n raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)\n logs += raw_logs.split("\\n")\n except kubernetes.client.rest.ApiException as e:\n instance.report_engine_event(\n "Encountered unexpected error while fetching pod logs for Kubernetes job {}, "\n "Pod name {} for step {}. Will attempt to continue with other pods.".format(\n job_name, pod_name, step_key\n ),\n pipeline_run,\n EngineEventData(\n [\n MetadataEntry("Step key", value=step_key),\n ],\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n\n events += filter_dagster_events_from_pod_logs(logs)\n serialized_events = [serialize_dagster_namedtuple(event) for event in events]\n return serialized_events\n\n return _execute_step_k8s_job\n
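Behind the ``dagster_k8s`` helpers used above (``wait_for_job_success``, ``get_pod_names_in_job``, ``retrieve_pod_logs``) sits the plain Kubernetes Python client. A rough, self-contained sketch of the create-then-collect-logs flow, with a placeholder job body and namespace, and none of the retry or run-status handling the real helpers provide:

.. code-block:: python

    import time

    import kubernetes


    def run_job_and_collect_logs(job_body, namespace="default", poll_interval=5):
        """Create a Job, wait for it to finish, and return its pods' logs (sketch only)."""
        kubernetes.config.load_kube_config()  # or load_incluster_config() when in-cluster

        batch = kubernetes.client.BatchV1Api()
        core = kubernetes.client.CoreV1Api()

        batch.create_namespaced_job(body=job_body, namespace=namespace)
        job_name = job_body["metadata"]["name"]

        # Naive polling loop; wait_for_job_success additionally handles API
        # retries, timeouts, and termination of cancelled runs.
        while True:
            status = batch.read_namespaced_job(name=job_name, namespace=namespace).status
            if status.succeeded or status.failed:
                break
            time.sleep(poll_interval)

        # The Job controller labels its pods with job-name=<job_name>.
        pods = core.list_namespaced_pod(
            namespace=namespace, label_selector="job-name={}".format(job_name)
        )
        return [
            core.read_namespaced_pod_log(name=pod.metadata.name, namespace=namespace)
            for pod in pods.items
        ]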
", "current_page_name": "_modules/dagster_celery_k8s/executor", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_celery_k8s.executor"}, "launcher": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_celery_k8s.launcher

\nimport sys\n\nimport kubernetes\nfrom dagster_k8s.job import (\n    DagsterK8sJobConfig,\n    construct_dagster_k8s_job,\n    get_job_name_from_run_id,\n    get_user_defined_k8s_config,\n)\nfrom dagster_k8s.utils import delete_job\n\nfrom dagster import DagsterInvariantViolationError, MetadataEntry, check\nfrom dagster.config.field import resolve_to_config_type\nfrom dagster.config.validate import process_config\nfrom dagster.core.events import EngineEventData\nfrom dagster.core.execution.retries import RetryMode\nfrom dagster.core.launcher import LaunchRunContext, RunLauncher\nfrom dagster.core.launcher.base import CheckRunHealthResult, WorkerStatus\nfrom dagster.core.storage.pipeline_run import PipelineRun, PipelineRunStatus\nfrom dagster.core.storage.tags import DOCKER_IMAGE_TAG\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster.utils import frozentags, merge_dicts\nfrom dagster.utils.error import serializable_error_info_from_exc_info\n\nfrom .config import CELERY_K8S_CONFIG_KEY, celery_k8s_executor_config\n\n\n
[docs]class CeleryK8sRunLauncher(RunLauncher, ConfigurableClass):\n """In contrast to the :py:class:`K8sRunLauncher`, which launches dagster runs as single K8s\n Jobs, this run launcher is intended for use in concert with\n :py:func:`dagster_celery_k8s.celery_k8s_job_executor`.\n\n With this run launcher, execution is delegated to:\n\n 1. A run worker Kubernetes Job, which traverses the dagster run execution plan and\n submits steps to Celery queues for execution;\n 2. The step executions which are submitted to Celery queues are picked up by Celery workers,\n and each step execution spawns a step execution Kubernetes Job. See the implementation\n defined in :py:func:`dagster_celery_k8.executor.create_k8s_job_task`.\n\n You can configure a Dagster instance to use this RunLauncher by adding a section to your\n ``dagster.yaml`` like the following:\n\n .. code-block:: yaml\n\n run_launcher:\n module: dagster_k8s.launcher\n class: CeleryK8sRunLauncher\n config:\n instance_config_map: "dagster-k8s-instance-config-map"\n dagster_home: "/some/path"\n postgres_password_secret: "dagster-k8s-pg-password"\n broker: "some_celery_broker_url"\n backend: "some_celery_backend_url"\n\n """\n\n def __init__(\n self,\n instance_config_map,\n dagster_home,\n postgres_password_secret,\n load_incluster_config=True,\n kubeconfig_file=None,\n broker=None,\n backend=None,\n include=None,\n config_source=None,\n retries=None,\n inst_data=None,\n k8s_client_batch_api=None,\n env_config_maps=None,\n env_secrets=None,\n volume_mounts=None,\n volumes=None,\n service_account_name=None,\n image_pull_policy=None,\n image_pull_secrets=None,\n labels=None,\n fail_pod_on_run_failure=None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n if load_incluster_config:\n check.invariant(\n kubeconfig_file is None,\n "`kubeconfig_file` is set but `load_incluster_config` is True.",\n )\n kubernetes.config.load_incluster_config()\n else:\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n self._fixed_batch_api = k8s_client_batch_api\n\n self.instance_config_map = check.str_param(instance_config_map, "instance_config_map")\n self.dagster_home = check.str_param(dagster_home, "dagster_home")\n self.postgres_password_secret = check.str_param(\n postgres_password_secret, "postgres_password_secret"\n )\n self.broker = check.opt_str_param(broker, "broker")\n self.backend = check.opt_str_param(backend, "backend")\n self.include = check.opt_list_param(include, "include")\n self.config_source = check.opt_dict_param(config_source, "config_source")\n\n retries = check.opt_dict_param(retries, "retries") or {"enabled": {}}\n self.retries = RetryMode.from_config(retries)\n\n self._env_config_maps = check.opt_list_param(\n env_config_maps, "env_config_maps", of_type=str\n )\n self._env_secrets = check.opt_list_param(env_secrets, "env_secrets", of_type=str)\n\n self._volume_mounts = check.opt_list_param(volume_mounts, "volume_mounts")\n self._volumes = check.opt_list_param(volumes, "volumes")\n\n self._service_account_name = check.opt_str_param(\n service_account_name, "service_account_name"\n )\n self._image_pull_policy = check.opt_str_param(\n image_pull_policy, "image_pull_policy", "IfNotPresent"\n )\n self._image_pull_secrets = check.opt_list_param(\n image_pull_secrets, "image_pull_secrets", of_type=dict\n )\n self._labels = check.opt_dict_param(labels, "labels", key_type=str, value_type=str)\n self._fail_pod_on_run_failure = 
check.opt_bool_param(\n fail_pod_on_run_failure, "fail_pod_on_run_failure"\n )\n\n super().__init__()\n\n @property\n def _batch_api(self):\n return self._fixed_batch_api if self._fixed_batch_api else kubernetes.client.BatchV1Api()\n\n @classmethod\n def config_type(cls):\n from dagster_celery.executor import CELERY_CONFIG\n\n return merge_dicts(DagsterK8sJobConfig.config_type_run_launcher(), CELERY_CONFIG)\n\n @classmethod\n def from_config_value(cls, inst_data, config_value):\n return cls(inst_data=inst_data, **config_value)\n\n @property\n def inst_data(self):\n return self._inst_data\n\n def launch_run(self, context: LaunchRunContext) -> None:\n run = context.pipeline_run\n\n job_name = get_job_name_from_run_id(run.run_id)\n pod_name = job_name\n exc_config = _get_validated_celery_k8s_executor_config(run.run_config)\n env_vars = None\n\n job_image_from_executor_config = exc_config.get("job_image")\n\n pipeline_origin = context.pipeline_code_origin\n repository_origin = pipeline_origin.repository_origin\n\n job_image = repository_origin.container_image\n\n if job_image:\n if job_image_from_executor_config:\n job_image = job_image_from_executor_config\n self._instance.report_engine_event(\n f"You have specified a job_image {job_image_from_executor_config} in your executor configuration, "\n f"but also {job_image} in your user-code deployment. Using the job image {job_image_from_executor_config} "\n f"from executor configuration as it takes precedence.",\n run,\n cls=self.__class__,\n )\n else:\n if not job_image_from_executor_config:\n raise DagsterInvariantViolationError(\n "You have not specified a job_image in your executor configuration. "\n "To resolve this error, specify the job_image configuration in the executor "\n "config section in your run config. \\n"\n "Note: You may also be seeing this error because you are using the configured API. 
"\n "Using configured with the celery-k8s executor is not supported at this time, "\n "and the job_image must be configured at the top-level executor config without "\n "using configured."\n )\n\n job_image = job_image_from_executor_config\n\n job_config = self.get_k8s_job_config(job_image, exc_config)\n\n self._instance.add_run_tags(\n run.run_id,\n {DOCKER_IMAGE_TAG: job_config.job_image},\n )\n\n user_defined_k8s_config = get_user_defined_k8s_config(frozentags(run.tags))\n\n from dagster.cli.api import ExecuteRunArgs\n\n run_args = ExecuteRunArgs(\n pipeline_origin=pipeline_origin,\n pipeline_run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n set_exit_code_on_failure=self._fail_pod_on_run_failure,\n ).get_command_args()\n\n job = construct_dagster_k8s_job(\n job_config,\n args=run_args,\n job_name=job_name,\n pod_name=pod_name,\n component="run_worker",\n user_defined_k8s_config=user_defined_k8s_config,\n env_vars=env_vars,\n labels={\n "dagster/job": pipeline_origin.pipeline_name,\n "dagster/run-id": run.run_id,\n },\n )\n\n job_namespace = exc_config.get("job_namespace")\n\n self._instance.report_engine_event(\n "Creating Kubernetes run worker job",\n run,\n EngineEventData(\n [\n MetadataEntry("Kubernetes Job name", value=job_name),\n MetadataEntry("Kubernetes Namespace", value=job_namespace),\n MetadataEntry("Run ID", value=run.run_id),\n ]\n ),\n cls=self.__class__,\n )\n\n self._batch_api.create_namespaced_job(body=job, namespace=job_namespace)\n self._instance.report_engine_event(\n "Kubernetes run worker job created",\n run,\n EngineEventData(\n [\n MetadataEntry("Kubernetes Job name", value=job_name),\n MetadataEntry("Kubernetes Namespace", value=job_namespace),\n MetadataEntry("Run ID", value=run.run_id),\n ]\n ),\n cls=self.__class__,\n )\n\n def get_k8s_job_config(self, job_image, exc_config):\n return DagsterK8sJobConfig(\n dagster_home=self.dagster_home,\n instance_config_map=self.instance_config_map,\n postgres_password_secret=self.postgres_password_secret,\n job_image=check.opt_str_param(job_image, "job_image"),\n image_pull_policy=exc_config.get("image_pull_policy", self._image_pull_policy),\n image_pull_secrets=exc_config.get("image_pull_secrets", []) + self._image_pull_secrets,\n service_account_name=exc_config.get("service_account_name", self._service_account_name),\n env_config_maps=exc_config.get("env_config_maps", []) + self._env_config_maps,\n env_secrets=exc_config.get("env_secrets", []) + self._env_secrets,\n volume_mounts=exc_config.get("volume_mounts", []) + self._volume_mounts,\n volumes=exc_config.get("volumes", []) + self._volumes,\n labels=merge_dicts(self._labels, exc_config.get("labels", {})),\n )\n\n # https://github.com/dagster-io/dagster/issues/2741\n def can_terminate(self, run_id):\n check.str_param(run_id, "run_id")\n\n pipeline_run = self._instance.get_run_by_id(run_id)\n if not pipeline_run:\n return False\n\n if pipeline_run.status != PipelineRunStatus.STARTED:\n return False\n\n return True\n\n def terminate(self, run_id):\n check.str_param(run_id, "run_id")\n\n run = self._instance.get_run_by_id(run_id)\n if not run:\n return False\n\n can_terminate = self.can_terminate(run_id)\n if not can_terminate:\n self._instance.report_engine_event(\n message="Unable to terminate dagster job: can_terminate returned {}.".format(\n can_terminate\n ),\n pipeline_run=run,\n cls=self.__class__,\n )\n return False\n\n job_name = get_job_name_from_run_id(run_id)\n\n job_namespace = self.get_namespace_from_run_config(run_id)\n\n 
self._instance.report_run_canceling(run)\n\n try:\n termination_result = delete_job(job_name=job_name, namespace=job_namespace)\n if termination_result:\n self._instance.report_engine_event(\n message="Dagster Job was terminated successfully.",\n pipeline_run=run,\n cls=self.__class__,\n )\n else:\n self._instance.report_engine_event(\n message="Dagster Job was not terminated successfully; delete_job returned {}".format(\n termination_result\n ),\n pipeline_run=run,\n cls=self.__class__,\n )\n return termination_result\n except Exception:\n self._instance.report_engine_event(\n message="Dagster Job was not terminated successfully; encountered error in delete_job",\n pipeline_run=run,\n engine_event_data=EngineEventData.engine_error(\n serializable_error_info_from_exc_info(sys.exc_info())\n ),\n cls=self.__class__,\n )\n\n def get_namespace_from_run_config(self, run_id):\n check.str_param(run_id, "run_id")\n\n pipeline_run = self._instance.get_run_by_id(run_id)\n run_config = pipeline_run.run_config\n executor_config = _get_validated_celery_k8s_executor_config(run_config)\n return executor_config.get("job_namespace")\n\n @property\n def supports_check_run_worker_health(self):\n return True\n\n def check_run_worker_health(self, run: PipelineRun):\n job_namespace = _get_validated_celery_k8s_executor_config(run.run_config).get(\n "job_namespace"\n )\n job_name = get_job_name_from_run_id(run.run_id)\n try:\n job = self._batch_api.read_namespaced_job(namespace=job_namespace, name=job_name)\n except Exception:\n return CheckRunHealthResult(\n WorkerStatus.UNKNOWN, str(serializable_error_info_from_exc_info(sys.exc_info()))\n )\n if job.status.failed:\n return CheckRunHealthResult(WorkerStatus.FAILED, "K8s job failed")\n return CheckRunHealthResult(WorkerStatus.RUNNING)
\n\n\ndef _get_validated_celery_k8s_executor_config(run_config):\n check.dict_param(run_config, "run_config")\n\n executor_config = run_config.get("execution", {})\n execution_config_schema = resolve_to_config_type(celery_k8s_executor_config())\n\n # In run config on jobs, we don't have an executor key\n if not CELERY_K8S_CONFIG_KEY in executor_config:\n\n execution_run_config = executor_config.get("config", {})\n else:\n execution_run_config = (run_config["execution"][CELERY_K8S_CONFIG_KEY] or {}).get(\n "config", {}\n )\n\n res = process_config(execution_config_schema, execution_run_config)\n\n check.invariant(\n res.success,\n "Incorrect execution schema provided. Note: You may also be seeing this error "\n "because you are using the configured API. "\n "Using configured with the {config_key} executor is not supported at this time, "\n "and all executor config must be directly in the run config without using configured.".format(\n config_key=CELERY_K8S_CONFIG_KEY,\n ),\n )\n\n return res.value\n
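The helper above leans on Dagster's config machinery (``resolve_to_config_type`` plus ``process_config``), which can be exercised on its own against any schema. A small sketch with a made-up schema and value:

.. code-block:: python

    from dagster import Field, StringSource, check
    from dagster.config.field import resolve_to_config_type
    from dagster.config.validate import process_config

    # Hypothetical schema, loosely shaped like a slice of executor config.
    schema = {
        "job_namespace": Field(StringSource, is_required=False, default_value="default"),
        "broker": Field(StringSource, is_required=False),
    }

    result = process_config(resolve_to_config_type(schema), {"broker": "rpc://"})

    check.invariant(result.success, "config did not validate")
    # result.value carries the processed config, with defaults filled in.
    print(result.value)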
", "current_page_name": "_modules/dagster_celery_k8s/launcher", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_celery_k8s.launcher"}}, "dagster_dask": {"executor": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dask.executor

\nimport dask\nimport dask.distributed\n\nfrom dagster import (\n    Executor,\n    Field,\n    Permissive,\n    Selector,\n    StringSource,\n    check,\n    multiple_process_executor_requirements,\n    seven,\n)\nfrom dagster.core.definitions.executor_definition import executor\nfrom dagster.core.errors import raise_execution_interrupts\nfrom dagster.core.events import DagsterEvent\nfrom dagster.core.execution.api import create_execution_plan, execute_plan\nfrom dagster.core.execution.context.system import PlanOrchestrationContext\nfrom dagster.core.execution.plan.plan import ExecutionPlan\nfrom dagster.core.execution.retries import RetryMode\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.utils import frozentags, iterate_with_context\n\n# Dask resource requirements are specified under this key\nDASK_RESOURCE_REQUIREMENTS_KEY = "dagster-dask/resource_requirements"\n\n\n
[docs]@executor(\n name="dask",\n requirements=multiple_process_executor_requirements(),\n config_schema={\n "cluster": Field(\n Selector(\n {\n "existing": Field(\n {"address": StringSource},\n description="Connect to an existing scheduler.",\n ),\n "local": Field(\n Permissive(), is_required=False, description="Local cluster configuration."\n ),\n "yarn": Field(\n Permissive(), is_required=False, description="YARN cluster configuration."\n ),\n "ssh": Field(\n Permissive(), is_required=False, description="SSH cluster configuration."\n ),\n "pbs": Field(\n Permissive(), is_required=False, description="PBS cluster configuration."\n ),\n "moab": Field(\n Permissive(), is_required=False, description="Moab cluster configuration."\n ),\n "sge": Field(\n Permissive(), is_required=False, description="SGE cluster configuration."\n ),\n "lsf": Field(\n Permissive(), is_required=False, description="LSF cluster configuration."\n ),\n "slurm": Field(\n Permissive(), is_required=False, description="SLURM cluster configuration."\n ),\n "oar": Field(\n Permissive(), is_required=False, description="OAR cluster configuration."\n ),\n "kube": Field(\n Permissive(),\n is_required=False,\n description="Kubernetes cluster configuration.",\n ),\n }\n )\n )\n },\n)\ndef dask_executor(init_context):\n """Dask-based executor.\n\n The 'cluster' can be one of the following:\n ('existing', 'local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube').\n\n If the Dask executor is used without providing executor-specific config, a local Dask cluster\n will be created (as when calling :py:class:`dask.distributed.Client() <dask:distributed.Client>`\n with :py:class:`dask.distributed.LocalCluster() <dask:distributed.LocalCluster>`).\n\n The Dask executor optionally takes the following config:\n\n .. code-block:: none\n\n cluster:\n {\n local?: # takes distributed.LocalCluster parameters\n {\n timeout?: 5, # Timeout duration for initial connection to the scheduler\n n_workers?: 4 # Number of workers to start\n threads_per_worker?: 1 # Number of threads per each worker\n }\n }\n\n To use the `dask_executor`, set it as the `executor_def` when defining a job:\n\n .. code-block:: python\n\n from dagster import job\n from dagster_dask import dask_executor\n\n @job(executor_def=dask_executor)\n def dask_enabled_job():\n pass\n\n """\n ((cluster_type, cluster_configuration),) = init_context.executor_config["cluster"].items()\n return DaskExecutor(cluster_type, cluster_configuration)
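The cluster selection can also be fixed at definition time by baking run config into the job; the local-cluster parameters below are placeholders:

.. code-block:: python

    from dagster import job, op
    from dagster_dask import dask_executor


    @op
    def ping():
        return "pong"


    @job(
        executor_def=dask_executor,
        config={
            "execution": {
                "config": {
                    "cluster": {
                        "local": {"n_workers": 2, "threads_per_worker": 1},
                    }
                }
            }
        },
    )
    def dask_enabled_job():
        ping()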
\n\n\ndef query_on_dask_worker(\n dependencies,\n recon_pipeline,\n pipeline_run,\n run_config,\n step_keys,\n mode,\n instance_ref,\n known_state,\n): # pylint: disable=unused-argument\n """Note that we need to pass "dependencies" to ensure Dask sequences futures during task\n scheduling, even though we do not use this argument within the function.\n """\n\n with DagsterInstance.from_ref(instance_ref) as instance:\n subset_pipeline = recon_pipeline.subset_for_execution_from_existing_pipeline(\n pipeline_run.solids_to_execute\n )\n\n execution_plan = create_execution_plan(\n subset_pipeline,\n run_config=run_config,\n step_keys_to_execute=step_keys,\n mode=mode,\n known_state=known_state,\n )\n\n return execute_plan(\n execution_plan, subset_pipeline, instance, pipeline_run, run_config=run_config\n )\n\n\ndef get_dask_resource_requirements(tags):\n check.inst_param(tags, "tags", frozentags)\n req_str = tags.get(DASK_RESOURCE_REQUIREMENTS_KEY)\n if req_str is not None:\n return seven.json.loads(req_str)\n\n return {}\n\n\nclass DaskExecutor(Executor):\n def __init__(self, cluster_type, cluster_configuration):\n self.cluster_type = check.opt_str_param(cluster_type, "cluster_type", default="local")\n self.cluster_configuration = check.opt_dict_param(\n cluster_configuration, "cluster_configuration"\n )\n\n @property\n def retries(self):\n return RetryMode.DISABLED\n\n def execute(self, plan_context, execution_plan):\n check.inst_param(plan_context, "plan_context", PlanOrchestrationContext)\n check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n check.param_invariant(\n isinstance(plan_context.executor, DaskExecutor),\n "plan_context",\n "Expected executor to be DaskExecutor got {}".format(plan_context.executor),\n )\n\n check.invariant(\n plan_context.instance.is_persistent,\n "Dask execution requires a persistent DagsterInstance",\n )\n\n step_levels = execution_plan.get_steps_to_execute_by_level()\n\n pipeline_name = plan_context.pipeline_name\n\n instance = plan_context.instance\n\n cluster_type = self.cluster_type\n if cluster_type == "existing":\n # address passed directly to Client() below to connect to existing Scheduler\n cluster = self.cluster_configuration["address"]\n elif cluster_type == "local":\n from dask.distributed import LocalCluster\n\n cluster = LocalCluster(**self.build_dict(pipeline_name))\n elif cluster_type == "yarn":\n from dask_yarn import YarnCluster\n\n cluster = YarnCluster(**self.build_dict(pipeline_name))\n elif cluster_type == "ssh":\n from dask.distributed import SSHCluster\n\n cluster = SSHCluster(**self.build_dict(pipeline_name))\n elif cluster_type == "pbs":\n from dask_jobqueue import PBSCluster\n\n cluster = PBSCluster(**self.build_dict(pipeline_name))\n elif cluster_type == "moab":\n from dask_jobqueue import MoabCluster\n\n cluster = MoabCluster(**self.build_dict(pipeline_name))\n elif cluster_type == "sge":\n from dask_jobqueue import SGECluster\n\n cluster = SGECluster(**self.build_dict(pipeline_name))\n elif cluster_type == "lsf":\n from dask_jobqueue import LSFCluster\n\n cluster = LSFCluster(**self.build_dict(pipeline_name))\n elif cluster_type == "slurm":\n from dask_jobqueue import SLURMCluster\n\n cluster = SLURMCluster(**self.build_dict(pipeline_name))\n elif cluster_type == "oar":\n from dask_jobqueue import OARCluster\n\n cluster = OARCluster(**self.build_dict(pipeline_name))\n elif cluster_type == "kube":\n from dask_kubernetes import KubeCluster\n\n cluster = KubeCluster(**self.build_dict(pipeline_name))\n else:\n raise 
ValueError(\n f"Must be providing one of the following ('existing', 'local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube') not {cluster_type}"\n )\n\n with dask.distributed.Client(cluster) as client:\n execution_futures = []\n execution_futures_dict = {}\n\n for step_level in step_levels:\n for step in step_level:\n # We ensure correctness in sequencing by letting Dask schedule futures and\n # awaiting dependencies within each step.\n dependencies = []\n for step_input in step.step_inputs:\n for key in step_input.dependency_keys:\n dependencies.append(execution_futures_dict[key])\n\n if plan_context.pipeline.get_definition().is_job:\n run_config = plan_context.run_config\n else:\n run_config = dict(plan_context.run_config, execution={"in_process": {}})\n\n dask_task_name = "%s.%s" % (pipeline_name, step.key)\n\n recon_pipeline = plan_context.reconstructable_pipeline\n\n future = client.submit(\n query_on_dask_worker,\n dependencies,\n recon_pipeline,\n plan_context.pipeline_run,\n run_config,\n [step.key],\n plan_context.pipeline_run.mode,\n instance.get_ref(),\n execution_plan.known_state,\n key=dask_task_name,\n resources=get_dask_resource_requirements(step.tags),\n )\n\n execution_futures.append(future)\n execution_futures_dict[step.key] = future\n\n # This tells Dask to awaits the step executions and retrieve their results to the\n # master\n futures = dask.distributed.as_completed(execution_futures, with_results=True)\n\n # Allow interrupts while waiting for the results from Dask\n for future, result in iterate_with_context(raise_execution_interrupts, futures):\n for step_event in result:\n check.inst(step_event, DagsterEvent)\n yield step_event\n\n def build_dict(self, pipeline_name):\n """Returns a dict we can use for kwargs passed to dask client instantiation.\n\n Intended to be used like:\n\n with dask.distributed.Client(**cfg.build_dict()) as client:\n << use client here >>\n\n """\n if self.cluster_type in ["yarn", "pbs", "moab", "sge", "lsf", "slurm", "oar", "kube"]:\n dask_cfg = {"name": pipeline_name}\n else:\n dask_cfg = {}\n\n if self.cluster_configuration:\n for k, v in self.cluster_configuration.items():\n dask_cfg[k] = v\n\n # if address is set, don't add LocalCluster args\n # context: https://github.com/dask/distributed/issues/3313\n if (self.cluster_type == "local") and ("address" not in dask_cfg):\n # We set threads_per_worker because Dagster is not thread-safe. Even though\n # environments=True by default, there is a clever piece of machinery\n # (dask.distributed.deploy.local.nprocesses_nthreads) that automagically makes execution\n # multithreaded by default when the number of available cores is greater than 4.\n # See: https://github.com/dagster-io/dagster/issues/2181\n # We may want to try to figure out a way to enforce this on remote Dask clusters against\n # which users run Dagster workloads.\n dask_cfg["threads_per_worker"] = 1\n\n return dask_cfg\n
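The orchestration loop above is a Dagster-aware version of the usual ``dask.distributed`` pattern: submit work as futures whose arguments include upstream futures, so the scheduler sequences them, then drain results with ``as_completed``. A minimal stand-alone sketch:

.. code-block:: python

    import dask.distributed


    def add_one(upstream, x):
        # `upstream` exists only to express a dependency on the prior future;
        # Dask resolves it to that future's result before running this task.
        return x + 1


    if __name__ == "__main__":
        with dask.distributed.LocalCluster(n_workers=2, threads_per_worker=1) as cluster:
            with dask.distributed.Client(cluster) as client:
                first = client.submit(add_one, None, 0, key="step-1")
                second = client.submit(add_one, first, 10, key="step-2")

                completed = dask.distributed.as_completed(
                    [first, second], with_results=True
                )
                for future, result in completed:
                    print(future.key, result)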
", "current_page_name": "_modules/dagster_dask/executor", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dask.executor"}}, "dagster_databricks": {"databricks": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_databricks.databricks

\nimport base64\nimport time\n\nimport requests.exceptions\nfrom databricks_api import DatabricksAPI\n\nimport dagster\nfrom dagster import check\n\nfrom .types import (\n    DATABRICKS_RUN_TERMINATED_STATES,\n    DatabricksRunLifeCycleState,\n    DatabricksRunResultState,\n)\n\n# wait at most 24 hours by default for run execution\nDEFAULT_RUN_MAX_WAIT_TIME_SEC = 24 * 60 * 60\n\n\n
[docs]class DatabricksError(Exception):\n    """Raised when an interaction with the Databricks API fails."""
\n\n\nclass DatabricksClient:\n """A thin wrapper over the Databricks REST API."""\n\n def __init__(self, host, token, workspace_id=None):\n self.host = host\n self.workspace_id = workspace_id\n self.client = DatabricksAPI(host=host, token=token)\n\n def submit_run(self, *args, **kwargs):\n """Submit a run directly to the 'Runs Submit' API."""\n return self.client.jobs.submit_run(*args, **kwargs)["run_id"] # pylint: disable=no-member\n\n def read_file(self, dbfs_path, block_size=1024**2):\n """Read a file from DBFS to a **byte string**."""\n\n if dbfs_path.startswith("dbfs://"):\n dbfs_path = dbfs_path[7:]\n data = b""\n bytes_read = 0\n jdoc = self.client.dbfs.read(path=dbfs_path, length=block_size) # pylint: disable=no-member\n data += base64.b64decode(jdoc["data"])\n while jdoc["bytes_read"] == block_size:\n bytes_read += jdoc["bytes_read"]\n jdoc = self.client.dbfs.read( # pylint: disable=no-member\n path=dbfs_path, offset=bytes_read, length=block_size\n )\n data += base64.b64decode(jdoc["data"])\n return data\n\n def put_file(self, file_obj, dbfs_path, overwrite=False, block_size=1024**2):\n """Upload an arbitrary large file to DBFS.\n\n This doesn't use the DBFS `Put` API because that endpoint is limited to 1MB.\n """\n if dbfs_path.startswith("dbfs://"):\n dbfs_path = dbfs_path[7:]\n create_response = self.client.dbfs.create( # pylint: disable=no-member\n path=dbfs_path, overwrite=overwrite\n )\n handle = create_response["handle"]\n\n block = file_obj.read(block_size)\n while block:\n data = base64.b64encode(block).decode("utf-8")\n self.client.dbfs.add_block(data=data, handle=handle) # pylint: disable=no-member\n block = file_obj.read(block_size)\n\n self.client.dbfs.close(handle=handle) # pylint: disable=no-member\n\n def get_run_state(self, databricks_run_id):\n """Get the state of a run by Databricks run ID (_not_ dagster run ID).\n\n Return a `DatabricksRunState` object. Note that the `result_state`\n attribute may be `None` if the run hasn't yet terminated.\n """\n run = self.client.jobs.get_run(databricks_run_id) # pylint: disable=no-member\n state = run["state"]\n result_state = state.get("result_state")\n if result_state:\n result_state = DatabricksRunResultState(result_state)\n return DatabricksRunState(\n life_cycle_state=DatabricksRunLifeCycleState(state["life_cycle_state"]),\n result_state=result_state,\n state_message=state["state_message"],\n )\n\n\nclass DatabricksRunState:\n """Represents the state of a Databricks job run."""\n\n def __init__(self, life_cycle_state, result_state, state_message):\n self.life_cycle_state = life_cycle_state\n self.result_state = result_state\n self.state_message = state_message\n\n def has_terminated(self):\n """Has the job terminated?"""\n return self.life_cycle_state in DATABRICKS_RUN_TERMINATED_STATES\n\n def is_successful(self):\n """Was the job successful?"""\n return self.result_state == DatabricksRunResultState.Success\n\n def __repr__(self):\n return str(self.__dict__)\n\n\nclass DatabricksJobRunner:\n """Submits jobs created using Dagster config to Databricks, and monitors their progress."""\n\n def __init__(\n self, host, token, poll_interval_sec=5, max_wait_time_sec=DEFAULT_RUN_MAX_WAIT_TIME_SEC\n ):\n """Args:\n host (str): Databricks host, e.g. 
https://uksouth.azuredatabricks.net\n token (str): Databricks token\n """\n self.host = check.str_param(host, "host")\n self.token = check.str_param(token, "token")\n self.poll_interval_sec = check.numeric_param(poll_interval_sec, "poll_interval_sec")\n self.max_wait_time_sec = check.int_param(max_wait_time_sec, "max_wait_time_sec")\n\n self._client = DatabricksClient(host=self.host, token=self.token)\n\n @property\n def client(self):\n """Return the underlying `DatabricksClient` object."""\n return self._client\n\n def submit_run(self, run_config, task):\n """Submit a new run using the 'Runs submit' API."""\n existing_cluster_id = run_config["cluster"].get("existing")\n\n new_cluster = run_config["cluster"].get("new")\n\n # The Databricks API needs different keys to be present in API calls depending\n # on new/existing cluster, so we need to process the new_cluster\n # config first.\n if new_cluster:\n new_cluster = new_cluster.copy()\n\n nodes = new_cluster.pop("nodes")\n if "instance_pool_id" in nodes:\n new_cluster["instance_pool_id"] = nodes["instance_pool_id"]\n else:\n node_types = nodes["node_types"]\n new_cluster["node_type_id"] = node_types["node_type_id"]\n if "driver_node_type_id" in node_types:\n new_cluster["driver_node_type_id"] = node_types["driver_node_type_id"]\n\n cluster_size = new_cluster.pop("size")\n if "num_workers" in cluster_size:\n new_cluster["num_workers"] = cluster_size["num_workers"]\n else:\n new_cluster["autoscale"] = cluster_size["autoscale"]\n\n tags = new_cluster.get("custom_tags", [])\n tags.append({"key": "__dagster_version", "value": dagster.__version__})\n new_cluster["custom_tags"] = tags\n\n check.invariant(\n existing_cluster_id is not None or new_cluster is not None,\n "Invalid value for run_config.cluster",\n )\n\n # We'll always need some libraries, namely dagster/dagster_databricks/dagster_pyspark,\n # since they're imported by our scripts.\n # Add them if they're not already added by users in config.\n libraries = list(run_config.get("libraries", []))\n python_libraries = {\n x["pypi"]["package"].split("==")[0].replace("_", "-") for x in libraries if "pypi" in x\n }\n for library in ["dagster", "dagster-databricks", "dagster-pyspark"]:\n if library not in python_libraries:\n libraries.append(\n {"pypi": {"package": "{}=={}".format(library, dagster.__version__)}}\n )\n\n # Only one task should be able to be chosen really; make sure of that here.\n check.invariant(\n sum(\n task.get(key) is not None\n for key in [\n "notebook_task",\n "spark_python_task",\n "spark_jar_task",\n "spark_submit_task",\n ]\n )\n == 1,\n "Multiple tasks specified in Databricks run",\n )\n\n config = dict(\n run_name=run_config.get("run_name"),\n new_cluster=new_cluster,\n existing_cluster_id=existing_cluster_id,\n libraries=libraries,\n **task,\n )\n return self.client.submit_run(**config)\n\n def retrieve_logs_for_run_id(self, log, databricks_run_id):\n """Retrieve the stdout and stderr logs for a run."""\n api_client = self.client.client\n run = api_client.jobs.get_run(databricks_run_id) # pylint: disable=no-member\n cluster = api_client.cluster.get_cluster( # pylint: disable=no-member\n run["cluster_instance"]["cluster_id"]\n )\n log_config = cluster.get("cluster_log_conf")\n if log_config is None:\n log.warn(\n "Logs not configured for cluster {cluster} used for run {run}".format(\n cluster=cluster["cluster_id"], run=databricks_run_id\n )\n )\n return None\n if "s3" in log_config:\n logs_prefix = log_config["s3"]["destination"]\n log.warn("Retrieving S3 logs not 
yet implemented")\n return None\n elif "dbfs" in log_config:\n logs_prefix = log_config["dbfs"]["destination"]\n stdout = self.wait_for_dbfs_logs(log, logs_prefix, cluster["cluster_id"], "stdout")\n stderr = self.wait_for_dbfs_logs(log, logs_prefix, cluster["cluster_id"], "stderr")\n return stdout, stderr\n\n def wait_for_dbfs_logs(\n self, log, prefix, cluster_id, filename, waiter_delay=10, waiter_max_attempts=10\n ):\n """Attempt up to `waiter_max_attempts` attempts to get logs from DBFS."""\n path = "/".join([prefix, cluster_id, "driver", filename])\n log.info("Retrieving logs from {}".format(path))\n num_attempts = 0\n while num_attempts <= waiter_max_attempts:\n try:\n logs = self.client.read_file(path)\n return logs.decode("utf-8")\n except requests.exceptions.HTTPError:\n num_attempts += 1\n time.sleep(waiter_delay)\n log.warn("Could not retrieve cluster logs!")\n\n def wait_for_run_to_complete(self, log, databricks_run_id):\n return wait_for_run_to_complete(\n self.client, log, databricks_run_id, self.poll_interval_sec, self.max_wait_time_sec\n )\n\n\ndef poll_run_state(\n client,\n log,\n start_poll_time: float,\n databricks_run_id: int,\n max_wait_time_sec: float,\n):\n run_state = client.get_run_state(databricks_run_id)\n if run_state.has_terminated():\n if run_state.is_successful():\n log.info("Run %s completed successfully" % databricks_run_id)\n return True\n else:\n error_message = "Run %s failed with result state: %s. Message: %s" % (\n databricks_run_id,\n run_state.result_state,\n run_state.state_message,\n )\n log.error(error_message)\n raise DatabricksError(error_message)\n else:\n log.info("Run %s in state %s" % (databricks_run_id, run_state))\n if time.time() - start_poll_time > max_wait_time_sec:\n raise DatabricksError(\n "Job run {} took more than {}s to complete; failing".format(\n databricks_run_id, max_wait_time_sec\n )\n )\n return False\n\n\ndef wait_for_run_to_complete(client, log, databricks_run_id, poll_interval_sec, max_wait_time_sec):\n """Wait for a Databricks run to complete."""\n check.int_param(databricks_run_id, "databricks_run_id")\n log.info("Waiting for Databricks run %s to complete..." % databricks_run_id)\n start = time.time()\n while True:\n if poll_run_state(client, log, start, databricks_run_id, max_wait_time_sec):\n return\n time.sleep(poll_interval_sec)\n
", "current_page_name": "_modules/dagster_databricks/databricks", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_databricks.databricks"}, "databricks_pyspark_step_launcher": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_databricks.databricks_pyspark_step_launcher

\nimport io\nimport os.path\nimport pickle\nimport tempfile\nimport time\n\nfrom dagster_databricks import databricks_step_main\nfrom dagster_databricks.databricks import (\n    DEFAULT_RUN_MAX_WAIT_TIME_SEC,\n    DatabricksJobRunner,\n    poll_run_state,\n)\nfrom dagster_pyspark.utils import build_pyspark_zip\nfrom requests import HTTPError\n\nfrom dagster import Bool, Field, IntSource, StringSource, check, resource\nfrom dagster.core.definitions.step_launcher import StepLauncher\nfrom dagster.core.errors import raise_execution_interrupts\nfrom dagster.core.execution.plan.external_step import (\n    PICKLED_EVENTS_FILE_NAME,\n    PICKLED_STEP_RUN_REF_FILE_NAME,\n    step_context_to_step_run_ref,\n)\nfrom dagster.serdes import deserialize_value\nfrom dagster.utils.backoff import backoff\n\nfrom .configs import (\n    define_databricks_secrets_config,\n    define_databricks_storage_config,\n    define_databricks_submit_run_config,\n)\n\nCODE_ZIP_NAME = "code.zip"\nPICKLED_CONFIG_FILE_NAME = "config.pkl"\n\n\n
[docs]@resource(\n {\n "run_config": define_databricks_submit_run_config(),\n "databricks_host": Field(\n StringSource,\n is_required=True,\n description="Databricks host, e.g. uksouth.azuredatabricks.net",\n ),\n "databricks_token": Field(\n StringSource,\n is_required=True,\n description="Databricks access token",\n ),\n "secrets_to_env_variables": define_databricks_secrets_config(),\n "storage": define_databricks_storage_config(),\n "local_pipeline_package_path": Field(\n StringSource,\n is_required=False,\n description="Absolute path to the package that contains the pipeline definition(s) "\n "whose steps will execute remotely on Databricks. This is a path on the local "\n "filesystem of the process executing the pipeline. Before every step run, the "\n "launcher will zip up the code in this path, upload it to DBFS, and unzip it "\n "into the Python path of the remote Spark process. This gives the remote process "\n "access to up-to-date user code.",\n ),\n "local_dagster_job_package_path": Field(\n StringSource,\n is_required=False,\n description="Absolute path to the package that contains the dagster job definition(s) "\n "whose steps will execute remotely on Databricks. This is a path on the local "\n "filesystem of the process executing the dagster job. Before every step run, the "\n "launcher will zip up the code in this path, upload it to DBFS, and unzip it "\n "into the Python path of the remote Spark process. This gives the remote process "\n "access to up-to-date user code.",\n ),\n "staging_prefix": Field(\n StringSource,\n is_required=False,\n default_value="/dagster_staging",\n description="Directory in DBFS to use for uploaded job code. Must be absolute.",\n ),\n "wait_for_logs": Field(\n Bool,\n is_required=False,\n default_value=False,\n description="If set, and if the specified cluster is configured to export logs, "\n "the system will wait after job completion for the logs to appear in the configured "\n "location. Note that logs are copied every 5 minutes, so enabling this will add "\n "several minutes to the job runtime. NOTE: this integration will export stdout/stderr "\n "from the remote Databricks process automatically, so this option is not generally "\n "necessary.",\n ),\n "max_completion_wait_time_seconds": Field(\n IntSource,\n is_required=False,\n default_value=DEFAULT_RUN_MAX_WAIT_TIME_SEC,\n description="If the Databricks job run takes more than this many seconds, then "\n "consider it failed and terminate the step.",\n ),\n "poll_interval_sec": Field(\n float,\n is_required=False,\n default_value=5.0,\n description="How frequently Dagster will poll Databricks to determine the state of the job.",\n ),\n }\n)\ndef databricks_pyspark_step_launcher(context):\n """Resource for running ops as a Databricks Job.\n\n When this resource is used, the op will be executed in Databricks using the 'Run Submit'\n API. Pipeline code will be zipped up and copied to a directory in DBFS along with the op's\n execution context.\n\n Use the 'run_config' configuration to specify the details of the Databricks cluster used, and\n the 'storage' key to configure persistent storage on that cluster. Storage is accessed by\n setting the credentials in the Spark context, as documented `here for S3`_ and `here for ADLS`_.\n\n .. _`here for S3`: https://docs.databricks.com/data/data-sources/aws/amazon-s3.html#alternative-1-set-aws-keys-in-the-spark-context\n .. 
_`here for ADLS`: https://docs.microsoft.com/en-gb/azure/databricks/data/data-sources/azure/azure-datalake-gen2#--access-directly-using-the-storage-account-access-key\n """\n return DatabricksPySparkStepLauncher(**context.resource_config)
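A minimal configuration sketch for this resource. The nested shapes of "run_config", "storage", and "secrets_to_env_variables" come from the config builders imported above (not shown on this page), so the keys inside them are illustrative rather than exhaustive; paths and secret names are placeholders:

from dagster_databricks import databricks_pyspark_step_launcher

step_launcher = databricks_pyspark_step_launcher.configured(
    {
        "databricks_host": {"env": "DATABRICKS_HOST"},
        "databricks_token": {"env": "DATABRICKS_TOKEN"},
        "local_dagster_job_package_path": "/path/to/my_package",
        "run_config": {
            "run_name": "dagster step",
            "cluster": {
                "new": {
                    "size": {"num_workers": 2},
                    "spark_version": "7.3.x-scala2.12",
                    "nodes": {"node_types": {"node_type_id": "i3.xlarge"}},
                }
            },
        },
        "secrets_to_env_variables": [],
        "storage": {
            "s3": {
                "secret_scope": "my-scope",
                "access_key_key": "aws-access-key",
                "secret_key_key": "aws-secret-key",
            }
        },
    }
)

# Ops that should execute remotely then require this resource under the step launcher
# resource key used by your job (e.g. "pyspark_step_launcher").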
\n\n\nclass DatabricksPySparkStepLauncher(StepLauncher):\n def __init__(\n self,\n run_config,\n databricks_host,\n databricks_token,\n secrets_to_env_variables,\n storage,\n staging_prefix,\n wait_for_logs,\n max_completion_wait_time_seconds,\n poll_interval_sec=5,\n local_pipeline_package_path=None,\n local_dagster_job_package_path=None,\n ):\n self.run_config = check.dict_param(run_config, "run_config")\n self.databricks_host = check.str_param(databricks_host, "databricks_host")\n self.databricks_token = check.str_param(databricks_token, "databricks_token")\n self.secrets = check.list_param(secrets_to_env_variables, "secrets_to_env_variables", dict)\n self.storage = check.dict_param(storage, "storage")\n check.invariant(\n local_dagster_job_package_path is not None or local_pipeline_package_path is not None,\n "Missing config: need to provide either 'local_dagster_job_package_path' or 'local_pipeline_package_path' config entry",\n )\n check.invariant(\n local_dagster_job_package_path is None or local_pipeline_package_path is None,\n "Error in config: Provided both 'local_dagster_job_package_path' and 'local_pipeline_package_path' entries. Need to specify one or the other.",\n )\n self.local_dagster_job_package_path = check.str_param(\n local_pipeline_package_path or local_dagster_job_package_path,\n "local_dagster_job_package_path",\n )\n self.staging_prefix = check.str_param(staging_prefix, "staging_prefix")\n check.invariant(staging_prefix.startswith("/"), "staging_prefix must be an absolute path")\n self.wait_for_logs = check.bool_param(wait_for_logs, "wait_for_logs")\n\n self.databricks_runner = DatabricksJobRunner(\n host=databricks_host,\n token=databricks_token,\n poll_interval_sec=poll_interval_sec,\n max_wait_time_sec=max_completion_wait_time_seconds,\n )\n\n def launch_step(self, step_context, prior_attempts_count):\n step_run_ref = step_context_to_step_run_ref(\n step_context, prior_attempts_count, self.local_dagster_job_package_path\n )\n run_id = step_context.pipeline_run.run_id\n log = step_context.log\n\n step_key = step_run_ref.step_key\n self._upload_artifacts(log, step_run_ref, run_id, step_key)\n\n task = self._get_databricks_task(run_id, step_key)\n databricks_run_id = self.databricks_runner.submit_run(self.run_config, task)\n\n try:\n # If this is being called within a `capture_interrupts` context, allow interrupts while\n # waiting for the execution to complete, so that we can terminate slow or hanging steps\n with raise_execution_interrupts():\n yield from self.step_events_iterator(step_context, step_key, databricks_run_id)\n finally:\n self.log_compute_logs(log, run_id, step_key)\n # this is somewhat obsolete\n if self.wait_for_logs:\n self._log_logs_from_cluster(log, databricks_run_id)\n\n def log_compute_logs(self, log, run_id, step_key):\n stdout = self.databricks_runner.client.read_file(\n self._dbfs_path(run_id, step_key, "stdout")\n ).decode()\n stderr = self.databricks_runner.client.read_file(\n self._dbfs_path(run_id, step_key, "stderr")\n ).decode()\n log.info(f"Captured stdout for step {step_key}:")\n log.info(stdout)\n log.info(f"Captured stderr for step {step_key}:")\n log.info(stderr)\n\n def step_events_iterator(self, step_context, step_key: str, databricks_run_id: int):\n """The launched Databricks job writes all event records to a specific dbfs file. 
This iterator\n regularly reads the contents of the file, adds any events that have not yet been seen to\n the instance, and yields any DagsterEvents.\n\n By doing this, we simulate having the remote Databricks process able to directly write to\n the local DagsterInstance. Importantly, this means that timestamps (and all other record\n properties) will be sourced from the Databricks process, rather than recording when this\n process happens to log them.\n """\n\n check.int_param(databricks_run_id, "databricks_run_id")\n processed_events = 0\n start = time.time()\n done = False\n step_context.log.info("Waiting for Databricks run %s to complete..." % databricks_run_id)\n while not done:\n with raise_execution_interrupts():\n step_context.log.debug(\n "Waiting %.1f seconds...", self.databricks_runner.poll_interval_sec\n )\n time.sleep(self.databricks_runner.poll_interval_sec)\n try:\n done = poll_run_state(\n self.databricks_runner.client,\n step_context.log,\n start,\n databricks_run_id,\n self.databricks_runner.max_wait_time_sec,\n )\n finally:\n all_events = self.get_step_events(\n step_context.run_id, step_key, step_context.previous_attempt_count\n )\n # we get all available records on each poll, but we only want to process the\n # ones we haven't seen before\n for event in all_events[processed_events:]:\n # write each event from the DataBricks instance to the local instance\n step_context.instance.handle_new_event(event)\n if event.is_dagster_event:\n yield event.dagster_event\n processed_events = len(all_events)\n\n step_context.log.info(f"Databricks run {databricks_run_id} completed.")\n\n def get_step_events(self, run_id: str, step_key: str, retry_number: int):\n path = self._dbfs_path(run_id, step_key, f"{retry_number}_{PICKLED_EVENTS_FILE_NAME}")\n\n def _get_step_records():\n serialized_records = self.databricks_runner.client.read_file(path)\n if not serialized_records:\n return []\n return deserialize_value(pickle.loads(serialized_records))\n\n try:\n # reading from dbfs while it writes can be flaky\n # allow for retry if we get malformed data\n return backoff(\n fn=_get_step_records,\n retry_on=(pickle.UnpicklingError,),\n max_retries=2,\n )\n # if you poll before the Databricks process has had a chance to create the file,\n # we expect to get this error\n except HTTPError as e:\n if e.response.json().get("error_code") == "RESOURCE_DOES_NOT_EXIST":\n return []\n\n return []\n\n def _get_databricks_task(self, run_id, step_key):\n """Construct the 'task' parameter to be submitted to the Databricks API.\n\n This will create a 'spark_python_task' dict where `python_file` is a path on DBFS\n pointing to the 'databricks_step_main.py' file, and `parameters` is an array with a single\n element, a path on DBFS pointing to the picked `step_run_ref` data.\n\n See https://docs.databricks.com/dev-tools/api/latest/jobs.html#jobssparkpythontask.\n """\n python_file = self._dbfs_path(run_id, step_key, self._main_file_name())\n parameters = [\n self._internal_dbfs_path(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),\n self._internal_dbfs_path(run_id, step_key, PICKLED_CONFIG_FILE_NAME),\n self._internal_dbfs_path(run_id, step_key, CODE_ZIP_NAME),\n ]\n return {"spark_python_task": {"python_file": python_file, "parameters": parameters}}\n\n def _upload_artifacts(self, log, step_run_ref, run_id, step_key):\n """Upload the step run ref and pyspark code to DBFS to run as a job."""\n\n log.info("Uploading main file to DBFS")\n main_local_path = self._main_file_local_path()\n with 
open(main_local_path, "rb") as infile:\n self.databricks_runner.client.put_file(\n infile, self._dbfs_path(run_id, step_key, self._main_file_name()), overwrite=True\n )\n\n log.info("Uploading dagster job to DBFS")\n with tempfile.TemporaryDirectory() as temp_dir:\n # Zip and upload package containing dagster job\n zip_local_path = os.path.join(temp_dir, CODE_ZIP_NAME)\n build_pyspark_zip(zip_local_path, self.local_dagster_job_package_path)\n with open(zip_local_path, "rb") as infile:\n self.databricks_runner.client.put_file(\n infile, self._dbfs_path(run_id, step_key, CODE_ZIP_NAME), overwrite=True\n )\n\n log.info("Uploading step run ref file to DBFS")\n step_pickle_file = io.BytesIO()\n\n pickle.dump(step_run_ref, step_pickle_file)\n step_pickle_file.seek(0)\n self.databricks_runner.client.put_file(\n step_pickle_file,\n self._dbfs_path(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),\n overwrite=True,\n )\n\n databricks_config = DatabricksConfig(\n storage=self.storage,\n secrets=self.secrets,\n )\n log.info("Uploading Databricks configuration to DBFS")\n databricks_config_file = io.BytesIO()\n\n pickle.dump(databricks_config, databricks_config_file)\n databricks_config_file.seek(0)\n self.databricks_runner.client.put_file(\n databricks_config_file,\n self._dbfs_path(run_id, step_key, PICKLED_CONFIG_FILE_NAME),\n overwrite=True,\n )\n\n def _log_logs_from_cluster(self, log, run_id):\n logs = self.databricks_runner.retrieve_logs_for_run_id(log, run_id)\n if logs is None:\n return\n stdout, stderr = logs\n if stderr:\n log.info(stderr)\n if stdout:\n log.info(stdout)\n\n def _main_file_name(self):\n return os.path.basename(self._main_file_local_path())\n\n def _main_file_local_path(self):\n return databricks_step_main.__file__\n\n def _sanitize_step_key(self, step_key: str) -> str:\n # step_keys of dynamic steps contain brackets, which are invalid characters\n return step_key.replace("[", "__").replace("]", "__")\n\n def _dbfs_path(self, run_id, step_key, filename):\n path = "/".join(\n [\n self.staging_prefix,\n run_id,\n self._sanitize_step_key(step_key),\n os.path.basename(filename),\n ]\n )\n return "dbfs://{}".format(path)\n\n def _internal_dbfs_path(self, run_id, step_key, filename):\n """Scripts running on Databricks should access DBFS at /dbfs/."""\n path = "/".join(\n [\n self.staging_prefix,\n run_id,\n self._sanitize_step_key(step_key),\n os.path.basename(filename),\n ]\n )\n return "/dbfs/{}".format(path)\n\n\nclass DatabricksConfig:\n """Represents configuration required by Databricks to run jobs.\n\n Instances of this class will be created when a Databricks step is launched and will contain\n all configuration and secrets required to set up storage and environment variables within\n the Databricks environment. The instance will be serialized and uploaded to Databricks\n by the step launcher, then deserialized as part of the 'main' script when the job is running\n in Databricks.\n\n The `setup` method handles the actual setup prior to op execution on the Databricks side.\n\n This config is separated out from the regular Dagster run config system because the setup\n is done by the 'main' script before entering a Dagster context (i.e. 
using `run_step_from_ref`).\n We use a separate class to avoid coupling the setup to the format of the `step_run_ref` object.\n """\n\n def __init__(self, storage, secrets):\n """Create a new DatabricksConfig object.\n\n `storage` and `secrets` should be of the same shape as the `storage` and\n `secrets_to_env_variables` config passed to `databricks_pyspark_step_launcher`.\n """\n self.storage = storage\n self.secrets = secrets\n\n def setup(self, dbutils, sc):\n """Set up storage and environment variables on Databricks.\n\n The `dbutils` and `sc` arguments must be passed in by the 'main' script, as they\n aren't accessible by any other modules.\n """\n self.setup_storage(dbutils, sc)\n self.setup_environment(dbutils)\n\n def setup_storage(self, dbutils, sc):\n """Set up storage using either S3 or ADLS2."""\n if "s3" in self.storage:\n self.setup_s3_storage(self.storage["s3"], dbutils, sc)\n elif "adls2" in self.storage:\n self.setup_adls2_storage(self.storage["adls2"], dbutils, sc)\n\n def setup_s3_storage(self, s3_storage, dbutils, sc):\n """Obtain AWS credentials from Databricks secrets and export so both Spark and boto can use them."""\n\n scope = s3_storage["secret_scope"]\n\n access_key = dbutils.secrets.get(scope=scope, key=s3_storage["access_key_key"])\n secret_key = dbutils.secrets.get(scope=scope, key=s3_storage["secret_key_key"])\n\n # Spark APIs will use this.\n # See https://docs.databricks.com/data/data-sources/aws/amazon-s3.html#alternative-1-set-aws-keys-in-the-spark-context.\n sc._jsc.hadoopConfiguration().set( # pylint: disable=protected-access\n "fs.s3n.awsAccessKeyId", access_key\n )\n sc._jsc.hadoopConfiguration().set( # pylint: disable=protected-access\n "fs.s3n.awsSecretAccessKey", secret_key\n )\n\n # Boto will use these.\n os.environ["AWS_ACCESS_KEY_ID"] = access_key\n os.environ["AWS_SECRET_ACCESS_KEY"] = secret_key\n\n def setup_adls2_storage(self, adls2_storage, dbutils, sc):\n """Obtain an Azure Storage Account key from Databricks secrets and export so Spark can use it."""\n storage_account_key = dbutils.secrets.get(\n scope=adls2_storage["secret_scope"], key=adls2_storage["storage_account_key_key"]\n )\n # Spark APIs will use this.\n # See https://docs.microsoft.com/en-gb/azure/databricks/data/data-sources/azure/azure-datalake-gen2#--access-directly-using-the-storage-account-access-key\n # sc is globally defined in the Databricks runtime and points to the Spark context\n sc._jsc.hadoopConfiguration().set( # pylint: disable=protected-access\n "fs.azure.account.key.{}.dfs.core.windows.net".format(\n adls2_storage["storage_account_name"]\n ),\n storage_account_key,\n )\n\n def setup_environment(self, dbutils):\n """Setup any environment variables required by the run.\n\n Extract any secrets in the run config and export them as environment variables.\n\n This is important for any `StringSource` config since the environment variables\n won't ordinarily be available in the Databricks execution environment.\n """\n for secret in self.secrets:\n name = secret["name"]\n key = secret["key"]\n scope = secret["scope"]\n print( # pylint: disable=print-call\n "Exporting {} from Databricks secret {}, scope {}".format(name, key, scope)\n )\n val = dbutils.secrets.get(scope=scope, key=key)\n os.environ[name] = val\n
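To make the hand-off concrete, here is a small sketch of the launcher-side serialization of this object; the storage and secret shapes mirror `setup_s3_storage` and `setup_environment` above, and the scope/key names are placeholders. On the Databricks side, the 'main' script unpickles these bytes and calls `setup(dbutils, sc)`:

import io
import pickle

from dagster_databricks.databricks_pyspark_step_launcher import DatabricksConfig

config = DatabricksConfig(
    storage={
        "s3": {
            "secret_scope": "my-scope",
            "access_key_key": "aws-access-key",
            "secret_key_key": "aws-secret-key",
        }
    },
    secrets=[{"name": "MY_TOKEN", "key": "token", "scope": "my-scope"}],
)

buf = io.BytesIO()
pickle.dump(config, buf)  # the step launcher uploads these bytes to DBFS as config.pkl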
", "current_page_name": "_modules/dagster_databricks/databricks_pyspark_step_launcher", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_databricks.databricks_pyspark_step_launcher"}, "solids": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_databricks.solids

\nfrom dagster import Field, InputDefinition, Nothing, OutputDefinition, Permissive, check, op, solid\n\nfrom .databricks import wait_for_run_to_complete\n\n_START = "start"\n\n_DEFAULT_POLL_INTERVAL = 10\n# wait at most 24 hours by default for run execution\n_DEFAULT_RUN_MAX_WAIT_TIME_SEC = 24 * 60 * 60\n\n\n
[docs]def create_databricks_job_op(\n name="databricks_job",\n num_inputs=1,\n description=None,\n required_resource_keys=frozenset(["databricks_client"]),\n):\n """\n Creates an op that launches a databricks job (not to be confused with Dagster's job API).\n\n As config, the op accepts a blob of the form described in Databricks' job API:\n https://docs.databricks.com/dev-tools/api/latest/jobs.html.\n\n Returns:\n OpDefinition: An op definition.\n\n Example:\n\n .. code-block:: python\n\n from dagster import graph\n from dagster_databricks import create_databricks_job_op, databricks_client\n\n sparkpi = create_databricks_job_op().configured(\n {\n "job": {\n "name": "SparkPi Python job",\n "new_cluster": {\n "spark_version": "7.3.x-scala2.12",\n "node_type_id": "i3.xlarge",\n "num_workers": 2,\n },\n "spark_python_task": {"python_file": "dbfs:/docs/pi.py", "parameters": ["10"]},\n }\n },\n name="sparkpi",\n )\n\n @graph\n def my_spark():\n sparkpi()\n\n my_spark.to_job(\n resource_defs={\n "databricks_client": databricks_client.configured(\n {"host": "my.workspace.url", "token": "my.access.token"}\n )\n }\n )\n """\n return core_create_databricks_job(\n dagster_decorator=op,\n name=name,\n num_inputs=num_inputs,\n description=description,\n required_resource_keys=required_resource_keys,\n )
\n\n\n
[docs]def create_databricks_job_solid(\n name="databricks_job",\n num_inputs=1,\n description=None,\n required_resource_keys=frozenset(["databricks_client"]),\n):\n """\n Creates a solid that launches a databricks job.\n\n As config, the solid accepts a blob of the form described in Databricks' job API:\n https://docs.databricks.com/dev-tools/api/latest/jobs.html.\n\n Returns:\n SolidDefinition: A solid definition.\n\n Example:\n\n .. code-block:: python\n\n from dagster import ModeDefinition, pipeline\n from dagster_databricks import create_databricks_job_solid, databricks_client\n\n sparkpi = create_databricks_job_solid().configured(\n {\n "job": {\n "name": "SparkPi Python job",\n "new_cluster": {\n "spark_version": "7.3.x-scala2.12",\n "node_type_id": "i3.xlarge",\n "num_workers": 2,\n },\n "spark_python_task": {"python_file": "dbfs:/docs/pi.py", "parameters": ["10"]},\n }\n },\n name="sparkspi",\n )\n\n\n @pipeline(\n mode_defs=[\n ModeDefinition(\n resource_defs={\n "databricks_client": databricks_client.configured(\n {"host": "my.workspace.url", "token": "my.access.token"}\n )\n }\n )\n ]\n )\n def my_pipeline():\n sparkpi()\n """\n return core_create_databricks_job(\n dagster_decorator=solid,\n name=name,\n num_inputs=num_inputs,\n description=description,\n required_resource_keys=required_resource_keys,\n )
\n\n\ndef core_create_databricks_job(\n dagster_decorator,\n name="databricks_job",\n num_inputs=1,\n description=None,\n required_resource_keys=frozenset(["databricks_client"]),\n):\n check.str_param(name, "name")\n check.opt_str_param(description, "description")\n check.int_param(num_inputs, "num_inputs")\n check.set_param(required_resource_keys, "required_resource_keys", of_type=str)\n\n input_defs = [InputDefinition("input_" + str(i), Nothing) for i in range(num_inputs)]\n\n @dagster_decorator(\n name=name,\n description=description,\n config_schema={\n "job": Field(\n Permissive(),\n description="Databricks job run configuration, in the form described in "\n "Databricks' job API: https://docs.databricks.com/dev-tools/api/latest/jobs.html",\n ),\n "poll_interval_sec": Field(\n float,\n description="Check whether the job is done at this interval.",\n default_value=_DEFAULT_POLL_INTERVAL,\n ),\n "max_wait_time_sec": Field(\n float,\n description="If the job is not complete after this length of time, raise an error.",\n default_value=_DEFAULT_RUN_MAX_WAIT_TIME_SEC,\n ),\n },\n input_defs=input_defs,\n output_defs=[OutputDefinition(Nothing)],\n required_resource_keys=required_resource_keys,\n tags={"kind": "databricks"},\n )\n def databricks_fn(context):\n job_config = context.op_config["job"]\n databricks_client = context.resources.databricks_client\n run_id = databricks_client.submit_run(**job_config)\n\n context.log.info(\n "Launched Databricks job with run id {run_id}. UI: {url}. Waiting for run to complete...".format(\n run_id=run_id, url=create_ui_url(databricks_client, context.op_config)\n )\n )\n wait_for_run_to_complete(\n databricks_client,\n context.log,\n run_id,\n context.op_config["poll_interval_sec"],\n context.op_config["max_wait_time_sec"],\n )\n\n return databricks_fn\n\n\ndef create_ui_url(databricks_client, op_config):\n host = databricks_client.host\n workspace_id = databricks_client.workspace_id or "<workspace_id>"\n if "existing_cluster_id" in op_config["job"]:\n return "https://{host}/?o={workspace_id}#/setting/clusters/{cluster_id}/sparkUi".format(\n host=host,\n workspace_id=workspace_id,\n cluster_id=op_config["job"]["existing_cluster_id"],\n )\n else:\n return "https://{host}/?o={workspace_id}#joblist".format(\n host=host, workspace_id=workspace_id\n )\n
", "current_page_name": "_modules/dagster_databricks/solids", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_databricks.solids"}}, "dagster_datadog": {"resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_datadog.resources

\nfrom datadog import DogStatsd, initialize, statsd\n\nfrom dagster import Field, StringSource, resource\n\n\nclass DataDogResource:\n    # Mirroring levels from the dogstatsd library\n    OK, WARNING, CRITICAL, UNKNOWN = (\n        DogStatsd.OK,\n        DogStatsd.WARNING,\n        DogStatsd.CRITICAL,\n        DogStatsd.UNKNOWN,\n    )\n\n    def __init__(self, api_key, app_key):\n        initialize(api_key=api_key, app_key=app_key)\n\n        # Pull in methods from the dogstatsd library\n        for method in [\n            "event",\n            "gauge",\n            "increment",\n            "decrement",\n            "histogram",\n            "distribution",\n            "set",\n            "service_check",\n            "timed",\n            "timing",\n        ]:\n            setattr(self, method, getattr(statsd, method))\n\n\n
[docs]@resource(\n {\n "api_key": Field(StringSource, description="Datadog API key"),\n "app_key": Field(StringSource, description="Datadog application key"),\n },\n description="This resource is for publishing to DataDog",\n)\ndef datadog_resource(context):\n """This resource is a thin wrapper over the\n `dogstatsd library <https://datadogpy.readthedocs.io/en/latest/>`_.\n\n As such, we directly mirror the public API methods of DogStatsd here; you can refer to the\n `DataDog documentation <https://docs.datadoghq.com/developers/dogstatsd/>`_ for how to use this\n resource.\n\n Examples:\n\n .. code-block:: python\n\n @op(required_resource_keys={'datadog'})\n def datadog_op(context):\n dd = context.resources.datadog\n\n dd.event('Man down!', 'This server needs assistance.')\n dd.gauge('users.online', 1001, tags=["protocol:http"])\n dd.increment('page.views')\n dd.decrement('page.views')\n dd.histogram('album.photo.count', 26, tags=["gender:female"])\n dd.distribution('album.photo.count', 26, tags=["color:blue"])\n dd.set('visitors.uniques', 999, tags=["browser:ie"])\n dd.service_check('svc.check_name', dd.WARNING)\n dd.timing("query.response.time", 1234)\n\n # Use timed decorator\n @dd.timed('run_fn')\n def run_fn():\n pass\n\n run_fn()\n\n @job(resource_defs={'datadog': datadog_resource})\n def dd_job():\n datadog_op()\n\n result = dd_job.execute_in_process(\n run_config={'resources': {'datadog': {'config': {'api_key': 'YOUR_KEY', 'app_key': 'YOUR_KEY'}}}}\n )\n\n """\n return DataDogResource(\n context.resource_config.get("api_key"), context.resource_config.get("app_key")\n )
\n
", "current_page_name": "_modules/dagster_datadog/resources", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_datadog.resources"}}, "dagster_dbt": {"asset_defs": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dbt.asset_defs

\nimport json\nimport os\nimport textwrap\nfrom typing import AbstractSet, Any, Callable, Dict, Mapping, Optional, Sequence, Set, Tuple\n\nfrom dagster_dbt.cli.types import DbtCliOutput\nfrom dagster_dbt.cli.utils import execute_cli\nfrom dagster_dbt.utils import generate_materializations\n\nfrom dagster import (\n    AssetKey,\n    MetadataValue,\n    Out,\n    Output,\n    SolidExecutionContext,\n    TableColumn,\n    TableSchema,\n    check,\n    get_dagster_logger,\n)\nfrom dagster.core.asset_defs import AssetsDefinition, multi_asset\n\n\ndef _load_manifest_for_project(\n    project_dir: str, profiles_dir: str, target_dir: str, select: str\n) -> Tuple[Mapping[str, Any], DbtCliOutput]:\n    # running "dbt ls" regenerates the manifest.json, which includes a superset of the actual\n    # "dbt ls" output\n    cli_output = execute_cli(\n        executable="dbt",\n        command="ls",\n        log=get_dagster_logger(),\n        flags_dict={\n            "project-dir": project_dir,\n            "profiles-dir": profiles_dir,\n            "select": select,\n            "resource-type": "model",\n            "output": "json",\n        },\n        warn_error=False,\n        ignore_handled_error=False,\n        target_path=target_dir,\n    )\n    manifest_path = os.path.join(target_dir, "manifest.json")\n    with open(manifest_path, "r") as f:\n        return json.load(f), cli_output\n\n\ndef _get_node_name(node_info: Mapping[str, Any]):\n    return "__".join([node_info["resource_type"], node_info["package_name"], node_info["name"]])\n\n\ndef _get_node_asset_key(node_info):\n    return AssetKey(node_info["name"])\n\n\ndef _dbt_nodes_to_assets(\n    dbt_nodes: Mapping[str, Any],\n    select: str,\n    selected_unique_ids: AbstractSet[str],\n    runtime_metadata_fn: Optional[\n        Callable[[SolidExecutionContext, Mapping[str, Any]], Mapping[str, Any]]\n    ] = None,\n    io_manager_key: Optional[str] = None,\n    node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey] = _get_node_asset_key,\n) -> AssetsDefinition:\n    outs: Dict[str, Out] = {}\n    sources: Set[AssetKey] = set()\n    out_name_to_node_info: Dict[str, Mapping[str, Any]] = {}\n    internal_asset_deps: Dict[str, Set[AssetKey]] = {}\n    for unique_id in selected_unique_ids:\n        asset_deps = set()\n        node_info = dbt_nodes[unique_id]\n        for dep_name in node_info["depends_on"]["nodes"]:\n            dep_type = dbt_nodes[dep_name]["resource_type"]\n            # ignore seeds/snapshots\n            if dep_type not in ["source", "model"]:\n                continue\n            dep_asset_key = node_info_to_asset_key(dbt_nodes[dep_name])\n\n            # if it's a source, it will be used as an input to this multi-asset\n            if dep_type == "source":\n                sources.add(dep_asset_key)\n            # regardless of type, list this as a dependency for the current asset\n            asset_deps.add(dep_asset_key)\n        code_block = textwrap.indent(node_info["raw_sql"], "    ")\n        description_sections = [\n            node_info["description"],\n            f"#### Raw SQL:\\n```\\n{code_block}\\n```",\n        ]\n        description = "\\n\\n".join(filter(None, description_sections))\n\n        node_name = node_info["name"]\n        outs[node_name] = Out(\n            dagster_type=None,\n            asset_key=node_info_to_asset_key(node_info),\n            description=description,\n            io_manager_key=io_manager_key,\n            metadata=_columns_to_metadata(node_info["columns"]),\n        
)\n        out_name_to_node_info[node_name] = node_info\n        internal_asset_deps[node_name] = asset_deps\n\n    @multi_asset(\n        name="dbt_project",\n        non_argument_deps=sources,\n        outs=outs,\n        required_resource_keys={"dbt"},\n        compute_kind="dbt",\n        internal_asset_deps=internal_asset_deps,\n    )\n    def _dbt_project_multi_assset(context):\n        dbt_output = context.resources.dbt.run(select=select)\n        # yield an Output for each materialization generated in the run\n        for materialization in generate_materializations(dbt_output):\n            output_name = materialization.asset_key.path[-1]\n            if runtime_metadata_fn:\n                yield Output(\n                    value=None,\n                    output_name=output_name,\n                    metadata=runtime_metadata_fn(context, out_name_to_node_info[output_name]),\n                )\n            else:\n                yield Output(\n                    value=None,\n                    output_name=output_name,\n                    metadata_entries=materialization.metadata_entries,\n                )\n\n    return _dbt_project_multi_assset\n\n\ndef _columns_to_metadata(columns: Mapping[str, Any]) -> Optional[Mapping[str, Any]]:\n    return (\n        {\n            "schema": MetadataValue.table_schema(\n                TableSchema(\n                    columns=[\n                        TableColumn(\n                            name=name,\n                            type=metadata.get("data_type") or "?",\n                            description=metadata.get("description"),\n                        )\n                        for name, metadata in columns.items()\n                    ]\n                )\n            )\n        }\n        if len(columns) > 0\n        else None\n    )\n\n\n
[docs]def load_assets_from_dbt_project(\n project_dir: str,\n profiles_dir: Optional[str] = None,\n target_dir: Optional[str] = None,\n select: Optional[str] = None,\n runtime_metadata_fn: Optional[\n Callable[[SolidExecutionContext, Mapping[str, Any]], Mapping[str, Any]]\n ] = None,\n io_manager_key: Optional[str] = None,\n node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey] = _get_node_asset_key,\n) -> Sequence[AssetsDefinition]:\n """\n Loads a set of DBT models from a DBT project into Dagster assets.\n\n Creates one Dagster asset for each dbt model. All assets will be re-materialized using a single\n `dbt run` command.\n\n Args:\n project_dir (Optional[str]): The directory containing the DBT project to load.\n profiles_dir (Optional[str]): The profiles directory to use for loading the DBT project.\n Defaults to a directory called "config" inside the project_dir.\n target_dir (Optional[str]): The target directory where DBT will place compiled artifacts.\n Defaults to "target" underneath the project_dir.\n select (str): A DBT selection string for the models in a project that you want to include.\n Defaults to "*".\n runtime_metadata_fn: (Optional[Callable[[SolidExecutionContext, Mapping[str, Any]], Mapping[str, Any]]]):\n A function that will be run after any of the assets are materialized and returns\n metadata entries for the asset, to be displayed in the asset catalog for that run.\n io_manager_key (Optional[str]): The IO manager key that will be set on each of the returned\n assets. When other ops are downstream of the loaded assets, the IOManager specified\n here determines how the inputs to those ops are loaded. Defaults to "io_manager".\n node_info_to_asset_key: (Mapping[str, Any] -> AssetKey): A function that takes a dictionary\n of dbt node info and returns the AssetKey that you want to represent that node. By\n default, the asset key will simply be the name of the dbt model.\n """\n check.str_param(project_dir, "project_dir")\n profiles_dir = check.opt_str_param(\n profiles_dir, "profiles_dir", os.path.join(project_dir, "config")\n )\n target_dir = check.opt_str_param(target_dir, "target_dir", os.path.join(project_dir, "target"))\n\n manifest_json, cli_output = _load_manifest_for_project(\n project_dir, profiles_dir, target_dir, select or "*"\n )\n selected_unique_ids: Set[str] = set(\n filter(None, (line.get("unique_id") for line in cli_output.logs))\n )\n\n dbt_nodes = {**manifest_json["nodes"], **manifest_json["sources"]}\n return [\n _dbt_nodes_to_assets(\n dbt_nodes,\n select=select or "*",\n selected_unique_ids=selected_unique_ids,\n runtime_metadata_fn=runtime_metadata_fn,\n io_manager_key=io_manager_key,\n node_info_to_asset_key=node_info_to_asset_key,\n ),\n ]
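A rough usage sketch for the loader above; the project layout and the custom asset-key function are illustrative, and the returned assets still need a "dbt" resource (for example `dbt_cli_resource`) when they are materialized:

from dagster import AssetKey
from dagster_dbt.asset_defs import load_assets_from_dbt_project

# Prefix every dbt model's asset key with "warehouse" instead of using the bare model name.
def prefixed_asset_key(node_info):
    return AssetKey(["warehouse", node_info["name"]])

dbt_assets = load_assets_from_dbt_project(
    project_dir="path/to/dbt_project",          # directory containing dbt_project.yml
    profiles_dir="path/to/dbt_project/config",  # this is the default if omitted
    select="my_model+",                         # optional dbt selection string
    node_info_to_asset_key=prefixed_asset_key,
)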
\n\n\n
[docs]def load_assets_from_dbt_manifest(\n manifest_json: Mapping[str, Any],\n runtime_metadata_fn: Optional[\n Callable[[SolidExecutionContext, Mapping[str, Any]], Mapping[str, Any]]\n ] = None,\n io_manager_key: Optional[str] = None,\n selected_unique_ids: Optional[AbstractSet[str]] = None,\n node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey] = _get_node_asset_key,\n) -> Sequence[AssetsDefinition]:\n """\n Loads a set of dbt models, described in a manifest.json, into Dagster assets.\n\n Creates one Dagster asset for each dbt model. All assets will be re-materialized using a single\n `dbt run` command.\n\n Args:\n manifest_json (Optional[Mapping[str, Any]]): The contents of a DBT manifest.json, which contains\n a set of models to load into assets.\n runtime_metadata_fn: (Optional[Callable[[SolidExecutionContext, Mapping[str, Any]], Mapping[str, Any]]]):\n A function that will be run after any of the assets are materialized and returns\n metadata entries for the asset, to be displayed in the asset catalog for that run.\n io_manager_key (Optional[str]): The IO manager key that will be set on each of the returned\n assets. When other ops are downstream of the loaded assets, the IOManager specified\n here determines how the inputs to those ops are loaded. Defaults to "io_manager".\n selected_unique_ids (Optional[Set[str]]): The set of dbt unique_ids that you want to load\n as assets.\n node_info_to_asset_key: (Mapping[str, Any] -> AssetKey): A function that takes a dictionary\n of dbt node info and returns the AssetKey that you want to represent that node. By\n default, the asset key will simply be the name of the dbt model.\n """\n check.dict_param(manifest_json, "manifest_json", key_type=str)\n dbt_nodes = {**manifest_json["nodes"], **manifest_json["sources"]}\n\n def _unique_id_to_selector(uid):\n # take the fully-qualified node name and use it to select the model\n return ".".join(dbt_nodes[uid]["fqn"])\n\n select = (\n "*"\n if selected_unique_ids is None\n else " ".join(_unique_id_to_selector(uid) for uid in selected_unique_ids)\n )\n selected_unique_ids = selected_unique_ids or set(\n unique_id\n for unique_id, node_info in dbt_nodes.items()\n if node_info["resource_type"] == "model"\n )\n return [\n _dbt_nodes_to_assets(\n dbt_nodes,\n runtime_metadata_fn=runtime_metadata_fn,\n io_manager_key=io_manager_key,\n select=select,\n selected_unique_ids=selected_unique_ids,\n node_info_to_asset_key=node_info_to_asset_key,\n )\n ]
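The manifest-based loader is typically fed a manifest.json produced by an earlier dbt invocation; a short sketch with an illustrative path:

import json

from dagster_dbt.asset_defs import load_assets_from_dbt_manifest

with open("path/to/dbt_project/target/manifest.json") as f:
    manifest_json = json.load(f)

# One AssetsDefinition covering every model in the manifest.
dbt_assets = load_assets_from_dbt_manifest(manifest_json)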
\n
", "current_page_name": "_modules/dagster_dbt/asset_defs", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dbt.asset_defs"}, "cli": {"resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dbt.cli.resources

\nfrom typing import Any, Dict, List, Optional, Set\n\nfrom dagster import Permissive, check, resource\nfrom dagster.utils.merger import merge_dicts\n\nfrom ..dbt_resource import DbtResource\nfrom .constants import CLI_COMMON_FLAGS_CONFIG_SCHEMA, CLI_COMMON_OPTIONS_CONFIG_SCHEMA\nfrom .types import DbtCliOutput\nfrom .utils import execute_cli\n\n\n
[docs]class DbtCliResource(DbtResource):\n """\n A resource that allows you to execute dbt cli commands. For the most up-to-date documentation on\n the specific parameters available to you for each command, check out the dbt docs:\n\n https://docs.getdbt.com/reference/commands/run\n\n To use this as a dagster resource, we recommend using\n :func:`dbt_cli_resource <dagster_dbt.dbt_cli_resource>`.\n """\n\n def __init__(\n self,\n executable: str,\n default_flags: Dict[str, Any],\n warn_error: bool,\n ignore_handled_error: bool,\n target_path: str,\n logger: Optional[Any] = None,\n ):\n self._default_flags = default_flags\n self._executable = executable\n self._warn_error = warn_error\n self._ignore_handled_error = ignore_handled_error\n self._target_path = target_path\n super().__init__(logger)\n\n @property\n def default_flags(self) -> Dict[str, Any]:\n """\n A set of params populated from resource config that are passed as flags to each dbt CLI command.\n """\n return self._format_params(self._default_flags, replace_underscores=True)\n\n @property\n def strict_flags(self) -> Set[str]:\n """\n A set of flags that should not be auto-populated from the default flags unless they are\n arguments to the associated function.\n """\n return {"models", "exclude", "select"}\n\n
[docs] def cli(self, command: str, **kwargs) -> DbtCliOutput:\n """\n Executes a dbt CLI command. Params passed in as keyword arguments will be merged with the\n default flags that were configured on resource initialization (if any) overriding the\n default values if necessary.\n\n Args:\n command (str): The command you wish to run (e.g. 'run', 'test', 'docs generate', etc.)\n\n Returns:\n DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n parsed log output as well as the contents of run_results.json (if applicable).\n """\n command = check.str_param(command, "command")\n extra_flags = {} if kwargs is None else kwargs\n\n # remove default flags that are declared as "strict" and not explicitly passed in\n default_flags = {\n k: v\n for k, v in self.default_flags.items()\n if not (k in self.strict_flags and k not in extra_flags)\n }\n\n flags = merge_dicts(\n default_flags, self._format_params(extra_flags, replace_underscores=True)\n )\n\n return execute_cli(\n executable=self._executable,\n command=command,\n flags_dict=flags,\n log=self.logger,\n warn_error=self._warn_error,\n ignore_handled_error=self._ignore_handled_error,\n target_path=self._target_path,\n )
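The interplay between `default_flags` and `strict_flags` is easiest to see with a directly constructed resource (normally you would build it via `dbt_cli_resource`); the project path is a placeholder, and calling `cli` really does shell out to the dbt executable:

from dagster_dbt.cli.resources import DbtCliResource

dbt = DbtCliResource(
    executable="dbt",
    default_flags={"project_dir": "path/to/dbt_project", "models": ["staging_*"]},
    warn_error=False,
    ignore_handled_error=False,
    target_path="target",
)

# "models" is a strict flag, so the default above is dropped when it isn't passed explicitly:
dbt.cli("run")                     # only the project-dir default is applied
dbt.cli("run", models=["orders"])  # the explicit value overrides the default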
\n\n
[docs] def compile(\n self, models: Optional[List[str]] = None, exclude: Optional[List[str]] = None, **kwargs\n ) -> DbtCliOutput:\n """\n Run the ``compile`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n models (List[str], optional): the models to include in compilation.\n exclude (List[str], optional): the models to exclude from compilation.\n\n Returns:\n DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n parsed log output as well as the contents of run_results.json (if applicable).\n """\n return self.cli("compile", models=models, exclude=exclude, **kwargs)
\n\n
[docs] def run(\n self, models: Optional[List[str]] = None, exclude: Optional[List[str]] = None, **kwargs\n ) -> DbtCliOutput:\n """\n Run the ``run`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n models (List[str], optional): the models to include in the run.\n exclude (List[str], optional): the models to exclude from the run.\n\n Returns:\n DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n parsed log output as well as the contents of run_results.json (if applicable).\n """\n return self.cli("run", models=models, exclude=exclude, **kwargs)
\n\n
[docs] def snapshot(\n self, select: Optional[List[str]] = None, exclude: Optional[List[str]] = None, **kwargs\n ) -> DbtCliOutput:\n """\n Run the ``snapshot`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n select (List[str], optional): the snapshots to include in the run.\n exclude (List[str], optional): the snapshots to exclude from the run.\n\n Returns:\n DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n parsed log output as well as the contents of run_results.json (if applicable).\n """\n return self.cli("snapshot", select=select, exclude=exclude, **kwargs)
\n\n
[docs] def test(\n self,\n models: Optional[List[str]] = None,\n exclude: Optional[List[str]] = None,\n data: bool = True,\n schema: bool = True,\n **kwargs,\n ) -> DbtCliOutput:\n """\n Run the ``test`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n models (List[str], optional): the models to include in testing.\n exclude (List[str], optional): the models to exclude from testing.\n data (bool, optional): If ``True`` (default), then run data tests.\n schema (bool, optional): If ``True`` (default), then run schema tests.\n\n Returns:\n DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n parsed log output as well as the contents of run_results.json (if applicable).\n """\n if data and schema:\n # do not include these arguments if both are True, as these are deprecated in later\n # versions of dbt, and for older versions the functionality is the same regardless of\n # if both are set or neither are set.\n return self.cli("test", models=models, exclude=exclude, **kwargs)\n return self.cli("test", models=models, exclude=exclude, data=data, schema=schema, **kwargs)
\n\n
[docs] def seed(\n self,\n show: bool = False,\n select: Optional[List[str]] = None,\n exclude: Optional[List[str]] = None,\n **kwargs,\n ) -> DbtCliOutput:\n """\n Run the ``seed`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n show (bool, optional): If ``True``, then show a sample of the seeded data in the\n response. Defaults to ``False``.\n select (List[str], optional): the seeds to include in the run.\n exclude (List[str], optional): the seeds to exclude from the run.\n\n Returns:\n DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n parsed log output as well as the contents of run_results.json (if applicable).\n """\n return self.cli("seed", show=show, select=select, exclude=exclude, **kwargs)
\n\n
[docs] def ls(\n self,\n select: Optional[List[str]] = None,\n models: Optional[List[str]] = None,\n exclude: Optional[List[str]] = None,\n **kwargs,\n ) -> DbtCliOutput:\n """\n Run the ``ls`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n select (List[str], optional): the resources to include in the output.\n models (List[str], optional): the models to include in the output.\n exclude (List[str], optional): the resources to exclude from the output.\n\n Returns:\n DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n parsed log output as well as the contents of run_results.json (if applicable).\n """\n return self.cli("ls", select=select, models=models, exclude=exclude, **kwargs)
\n\n
[docs] def freshness(self, select: Optional[List[str]] = None, **kwargs) -> DbtCliOutput:\n """\n Run the ``source snapshot-freshness`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n select (List[str], optional): the sources to include in the run.\n\n Returns:\n DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n parsed log output as well as the contents of run_results.json (if applicable).\n """\n return self.cli("source snapshot-freshness", select=select, **kwargs)
\n\n
[docs] def generate_docs(self, compile_project: bool = False, **kwargs) -> DbtCliOutput:\n """\n Run the ``docs generate`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n compile_project (bool, optional): If true, compile the project before generating a catalog.\n\n Returns:\n DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n parsed log output as well as the contents of run_results.json (if applicable).\n """\n return self.cli("docs generate", compile=compile_project, **kwargs)
\n\n
[docs] def run_operation(\n self, macro: str, args: Optional[Dict[str, Any]] = None, **kwargs\n ) -> DbtCliOutput:\n """\n Run the ``run-operation`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n macro (str): the dbt macro to invoke.\n args (Dict[str, Any], optional): the keyword arguments to be supplied to the macro.\n\n Returns:\n DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n parsed log output as well as the contents of run_results.json (if applicable).\n """\n\n return self.cli(f"run-operation {macro}", args=args, **kwargs)
\n\n\n
[docs]@resource(\n config_schema=Permissive(\n {\n k.replace("-", "_"): v\n for k, v in dict(\n **CLI_COMMON_FLAGS_CONFIG_SCHEMA, **CLI_COMMON_OPTIONS_CONFIG_SCHEMA\n ).items()\n }\n ),\n description="A resource that can run dbt CLI commands.",\n)\ndef dbt_cli_resource(context) -> DbtCliResource:\n """This resource defines a dbt CLI interface.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/overview/configuration#configured>`_ method.\n\n Examples:\n\n .. code-block:: python\n\n custom_dbt_cli_resource = dbt_cli_resource.configured({"project-dir": "path/to/my/dbt_project"})\n\n @pipeline(mode_defs=[ModeDefinition(resource_defs={"dbt": custom_dbt_cli_resource})])\n def dbt_cli_pipeline():\n # Run solids with `required_resource_keys={"dbt", ...}`.\n\n You may configure this resource as follows:\n\n .. code-block:: YAML\n\n resources:\n dbt_cli_resource:\n config:\n project_dir: "."\n # Optional[str]: Which directory to look in for the dbt_project.yml file. Default is\n # the current working directory and its parents.\n profiles_dir: $DBT_PROFILES_DIR or $HOME/.dbt\n # Optional[str]: Which directory to look in for the profiles.yml file.\n profile: ""\n # Optional[str]: Which profile to load. Overrides setting in dbt_project.yml.\n target: ""\n # Optional[str]: Which target to load for the given profile.\n vars: {}\n # Optional[Permissive]: Supply variables to the project. This argument overrides\n # variables defined in your dbt_project.yml file. This argument should be a\n # dictionary, eg. "{'my_variable': 'my_value'}"\n bypass_cache: False\n # Optional[bool]: If set, bypass the adapter-level cache of database state.\n\n\n """\n # set of options in the config schema that are not flags\n non_flag_options = {k.replace("-", "_") for k in CLI_COMMON_OPTIONS_CONFIG_SCHEMA}\n # all config options that are intended to be used as flags for dbt commands\n default_flags = {k: v for k, v in context.resource_config.items() if k not in non_flag_options}\n return DbtCliResource(\n executable=context.resource_config["dbt_executable"],\n default_flags=default_flags,\n warn_error=context.resource_config["warn_error"],\n ignore_handled_error=context.resource_config["ignore_handled_error"],\n target_path=context.resource_config["target_path"],\n logger=context.log,\n )
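Inside an op or solid that declares `required_resource_keys={"dbt"}`, the methods defined on `DbtCliResource` above are available on `context.resources.dbt`. A minimal sketch using the job API; the project path and model selector are placeholders:

from dagster import job, op
from dagster_dbt import dbt_cli_resource

@op(required_resource_keys={"dbt"})
def run_staging_models(context):
    # Roughly equivalent to `dbt run --models staging_*` with the configured project settings.
    dbt_output = context.resources.dbt.run(models=["staging_*"])
    context.log.info(f"dbt emitted {len(dbt_output.logs)} log lines")

@job(resource_defs={"dbt": dbt_cli_resource.configured({"project_dir": "path/to/dbt_project"})})
def dbt_job():
    run_staging_models()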
\n
", "current_page_name": "_modules/dagster_dbt/cli/resources", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dbt.cli.resources"}, "solids": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dbt.cli.solids

\nfrom dagster import (\n    Array,\n    Bool,\n    InputDefinition,\n    Noneable,\n    Nothing,\n    Output,\n    OutputDefinition,\n    Permissive,\n    StringSource,\n    solid,\n)\nfrom dagster.config.field import Field\nfrom dagster.utils.backcompat import experimental\n\nfrom ..utils import generate_materializations\nfrom .constants import (\n    CLI_COMMON_FLAGS_CONFIG_SCHEMA,\n    CLI_COMMON_OPTIONS_CONFIG_SCHEMA,\n    DEFAULT_DBT_TARGET_PATH,\n)\nfrom .types import DbtCliOutput\nfrom .utils import execute_cli\n\nCLI_CONFIG_SCHEMA = {**CLI_COMMON_FLAGS_CONFIG_SCHEMA, **CLI_COMMON_OPTIONS_CONFIG_SCHEMA}\nCLI_COMMON_FLAGS = set(CLI_COMMON_FLAGS_CONFIG_SCHEMA.keys())\n\n\ndef passthrough_flags_only(solid_config, additional_flags):\n    return {\n        flag: solid_config[flag]\n        for flag in (CLI_COMMON_FLAGS | set(additional_flags))\n        if solid_config.get(flag) is not None\n    }\n\n\n
[docs]@solid(\n description="A solid to invoke dbt run via CLI.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="dbt_cli_output", dagster_type=DbtCliOutput)],\n config_schema={\n **CLI_CONFIG_SCHEMA,\n "threads": Field(\n config=Noneable(int),\n default_value=None,\n is_required=False,\n description=(\n "Specify number of threads to use while executing models. Overrides settings "\n "in profiles.yml."\n ),\n ),\n "models": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to run.",\n ),\n "exclude": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n "full-refresh": Field(\n config=bool,\n description=(\n "If specified, DBT will drop incremental models and fully-recalculate the "\n "incremental table from the model definition. (--full-refresh)"\n ),\n is_required=False,\n default_value=False,\n ),\n "fail-fast": Field(\n config=bool,\n description="Stop execution upon a first failure. (--fail-fast)",\n is_required=False,\n default_value=False,\n ),\n "yield_materializations": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation will "\n "be yielded when the solid executes. Default: True"\n ),\n ),\n "asset_key_prefix": Field(\n config=Array(str),\n is_required=False,\n default_value=[],\n description=(\n "If provided and yield_materializations is True, these components will be used to "\n "prefix the generated asset keys."\n ),\n ),\n },\n tags={"kind": "dbt"},\n)\n@experimental\ndef dbt_cli_run(context):\n """This solid executes ``dbt run`` via the dbt CLI. See the solid definition for available\n parameters.\n """\n\n cli_output = execute_cli(\n context.solid_config["dbt_executable"],\n command="run",\n flags_dict=passthrough_flags_only(\n context.solid_config, ("threads", "models", "exclude", "full-refresh", "fail-fast")\n ),\n log=context.log,\n warn_error=context.solid_config["warn-error"],\n ignore_handled_error=context.solid_config["ignore_handled_error"],\n target_path=context.solid_config["target-path"],\n )\n\n if context.solid_config["yield_materializations"]:\n for materialization in generate_materializations(\n cli_output,\n asset_key_prefix=context.solid_config["asset_key_prefix"],\n ):\n yield materialization\n\n yield Output(cli_output, "dbt_cli_output")
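# Illustrative usage sketch (not taken from the module above): invoking the dbt_cli_run solid
# with run config. Solid-level config uses the hyphenated flag names shown above; the
# "project-dir" key is assumed to be part of CLI_COMMON_FLAGS_CONFIG_SCHEMA (constants.py,
# not shown here).
from dagster import ModeDefinition, execute_pipeline, pipeline
from dagster_dbt import dbt_cli_run

@pipeline(mode_defs=[ModeDefinition()])
def my_dbt_run_pipeline():
    dbt_cli_run()

result = execute_pipeline(
    my_dbt_run_pipeline,
    run_config={
        "solids": {
            "dbt_cli_run": {
                "config": {"project-dir": "path/to/dbt_project", "models": ["my_model+"]}
            }
        }
    },
)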
\n\n\n
[docs]@solid(\n description="A solid to invoke dbt test via CLI.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="dbt_cli_output", dagster_type=DbtCliOutput)],\n config_schema={\n **CLI_CONFIG_SCHEMA,\n "data": Field(\n config=bool,\n description='Run data tests defined in "tests" directory.',\n is_required=False,\n default_value=False,\n ),\n "schema": Field(\n config=bool,\n description="Run constraint validations from schema.yml files.",\n is_required=False,\n default_value=False,\n ),\n "fail-fast": Field(\n config=bool,\n description="Stop execution upon a first test failure.",\n is_required=False,\n default_value=False,\n ),\n "threads": Field(\n config=Noneable(int),\n default_value=None,\n is_required=False,\n description=(\n "Specify number of threads to use while executing models. Overrides settings "\n "in profiles.yml."\n ),\n ),\n "models": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to run.",\n ),\n "exclude": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n "target-path": Field(\n config=StringSource,\n is_required=False,\n default_value=DEFAULT_DBT_TARGET_PATH,\n description=(\n "The directory path for target if different from the default `target-path` in "\n "your dbt project configuration file."\n ),\n ),\n },\n tags={"kind": "dbt"},\n)\n@experimental\ndef dbt_cli_test(context):\n """This solid executes ``dbt test`` via the dbt CLI. See the solid definition for available\n parameters.\n """\n cli_output = execute_cli(\n context.solid_config["dbt_executable"],\n command="test",\n flags_dict=passthrough_flags_only(\n context.solid_config, ("data", "schema", "fail-fast", "threads", "models", "exclude")\n ),\n log=context.log,\n warn_error=context.solid_config["warn-error"],\n ignore_handled_error=context.solid_config["ignore_handled_error"],\n target_path=context.solid_config["target-path"],\n )\n\n yield Output(cli_output, "dbt_cli_output")
\n\n\n
[docs]@solid(\n description="A solid to invoke dbt snapshot via CLI.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="dbt_cli_output", dagster_type=DbtCliOutput)],\n config_schema={\n **CLI_CONFIG_SCHEMA,\n "threads": Field(\n config=Noneable(int),\n default_value=None,\n is_required=False,\n description=(\n "Specify number of threads to use while executing models. Overrides settings in "\n "profiles.yml."\n ),\n ),\n "select": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to include.",\n ),\n "exclude": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n },\n tags={"kind": "dbt"},\n)\n@experimental\ndef dbt_cli_snapshot(context):\n """This solid executes ``dbt snapshot`` via the dbt CLI."""\n cli_output = execute_cli(\n context.solid_config["dbt_executable"],\n command="snapshot",\n flags_dict=passthrough_flags_only(context.solid_config, ("threads", "select", "exclude")),\n log=context.log,\n warn_error=context.solid_config["warn-error"],\n ignore_handled_error=context.solid_config["ignore_handled_error"],\n target_path=context.solid_config["target-path"],\n )\n\n yield Output(cli_output, "dbt_cli_output")
\n\n\n
[docs]@solid(\n description="A solid to invoke dbt run-operation via CLI.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="dbt_cli_output", dagster_type=DbtCliOutput)],\n config_schema={\n **CLI_CONFIG_SCHEMA,\n "macro": Field(\n config=StringSource,\n description=(\n "Specify the macro to invoke. dbt will call this macro with the supplied "\n "arguments and then exit."\n ),\n ),\n "args": Field(\n config=Permissive({}),\n is_required=False,\n description=(\n "Supply arguments to the macro. This dictionary will be mapped to the keyword "\n "arguments defined in the selected macro. This argument should be a dictionary, "\n "eg. {'my_variable': 'my_value'}"\n ),\n ),\n },\n tags={"kind": "dbt"},\n)\n@experimental\ndef dbt_cli_run_operation(context):\n """This solid executes ``dbt run-operation`` via the dbt CLI."""\n cli_output = execute_cli(\n context.solid_config["dbt_executable"],\n command=f"run-operation {context.solid_config['macro']}",\n flags_dict=passthrough_flags_only(context.solid_config, ("args",)),\n log=context.log,\n warn_error=context.solid_config["warn-error"],\n ignore_handled_error=context.solid_config["ignore_handled_error"],\n target_path=context.solid_config["target-path"],\n )\n\n yield Output(cli_output, "dbt_cli_output")
\n\n\n
[docs]@solid(\n description="A solid to invoke dbt source snapshot-freshness via CLI.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="dbt_cli_output", dagster_type=DbtCliOutput)],\n config_schema={\n **CLI_CONFIG_SCHEMA,\n "select": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="Specify the sources to snapshot freshness.",\n ),\n "output": Field(\n config=StringSource,\n is_required=False,\n description=(\n "Specify the output path for the json report. By default, outputs to "\n "target/sources.json"\n ),\n ),\n "threads": Field(\n config=Noneable(int),\n default_value=None,\n is_required=False,\n description=(\n "Specify number of threads to use while executing models. Overrides "\n "settings in profiles.yml."\n ),\n ),\n },\n tags={"kind": "dbt"},\n)\n@experimental\ndef dbt_cli_snapshot_freshness(context):\n """This solid executes ``dbt source snapshot-freshness`` via the dbt CLI."""\n cli_output = execute_cli(\n context.solid_config["dbt_executable"],\n command="source snapshot-freshness",\n flags_dict=passthrough_flags_only(context.solid_config, ("select", "output", "threads")),\n log=context.log,\n warn_error=context.solid_config["warn-error"],\n ignore_handled_error=context.solid_config["ignore_handled_error"],\n target_path=context.solid_config["target-path"],\n )\n\n yield Output(cli_output, "dbt_cli_output")
\n\n\n
[docs]@solid(\n description="A solid to invoke dbt compile via CLI.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="dbt_cli_output", dagster_type=DbtCliOutput)],\n config_schema={\n **CLI_CONFIG_SCHEMA,\n "parse-only": Field(\n config=bool,\n is_required=False,\n default_value=False,\n ),\n "threads": Field(\n config=Noneable(int),\n default_value=None,\n is_required=False,\n description=(\n "Specify number of threads to use while executing models. Overrides settings "\n "in profiles.yml."\n ),\n ),\n "no-version-check": Field(\n config=bool,\n description=(\n "Skip the check that dbt's version matches the one specified in the "\n "dbt_project.yml file ('require-dbt-version')"\n ),\n is_required=False,\n default_value=False,\n ),\n "models": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to run.",\n ),\n "exclude": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n "selector": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The selector name to use, as defined in your selectors.yml",\n ),\n "state": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description=(\n "If set, use the given directory as the source for json files to compare with "\n "this project."\n ),\n ),\n "full-refresh": Field(\n config=bool,\n description=(\n "If specified, DBT will drop incremental models and fully-recalculate "\n "the incremental table from the model definition. (--full-refresh)"\n ),\n is_required=False,\n default_value=False,\n ),\n },\n tags={"kind": "dbt"},\n)\n@experimental\ndef dbt_cli_compile(context):\n """This solid executes ``dbt compile`` via the dbt CLI."""\n cli_output = execute_cli(\n context.solid_config["dbt_executable"],\n command="compile",\n flags_dict=passthrough_flags_only(\n context.solid_config,\n (\n "parse-only",\n "threads",\n "no-version-check",\n "models",\n "exclude",\n "selector",\n "state",\n "full-refresh",\n ),\n ),\n log=context.log,\n warn_error=context.solid_config["warn-error"],\n ignore_handled_error=context.solid_config["ignore_handled_error"],\n target_path=context.solid_config["target-path"],\n )\n\n yield Output(cli_output, "dbt_cli_output")
\n\n\n@solid(\n description="A solid to invoke dbt docs generate via CLI.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="dbt_cli_output", dagster_type=DbtCliOutput)],\n config_schema={\n **CLI_CONFIG_SCHEMA,\n "threads": Field(\n config=Noneable(int),\n default_value=None,\n is_required=False,\n description=(\n "Specify number of threads to use while executing models. Overrides settings "\n "in profiles.yml."\n ),\n ),\n "no-version-check": Field(\n config=bool,\n description=(\n "Skip the check that dbt's version matches the one specified in the "\n "dbt_project.yml file ('require-dbt-version')"\n ),\n is_required=False,\n default_value=False,\n ),\n "models": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to run.",\n ),\n "exclude": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n "selector": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The selector name to use, as defined in your selectors.yml",\n ),\n "state": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description=(\n "If set, use the given directory as the source for json files to compare with "\n "this project."\n ),\n ),\n },\n tags={"kind": "dbt"},\n)\n@experimental\ndef dbt_cli_docs_generate(context):\n """This solid executes ``dbt docs generate`` via the dbt CLI."""\n cli_output = execute_cli(\n context.solid_config["dbt_executable"],\n command="docs generate",\n flags_dict=passthrough_flags_only(\n context.solid_config,\n (\n "threads",\n "no-version-check",\n "models",\n "exclude",\n "selector",\n "state",\n ),\n ),\n log=context.log,\n warn_error=context.solid_config["warn-error"],\n ignore_handled_error=context.solid_config["ignore_handled_error"],\n target_path=context.solid_config["target-path"],\n )\n\n yield Output(cli_output, "dbt_cli_output")\n\n\n@solid(\n description="A solid to invoke dbt seed via CLI.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="dbt_cli_output", dagster_type=DbtCliOutput)],\n config_schema={\n **CLI_CONFIG_SCHEMA,\n "full-refresh": Field(\n config=bool,\n default_value=False,\n is_required=False,\n description=("Drop existing seed tables and recreate them."),\n ),\n "show": Field(\n config=bool,\n default_value=False,\n is_required=False,\n description=("Show a sample of the loaded data in the terminal."),\n ),\n "threads": Field(\n config=Noneable(int),\n default_value=None,\n is_required=False,\n description=(\n "Specify number of threads to use while executing models. 
Overrides settings "\n "in profiles.yml."\n ),\n ),\n "no-version-check": Field(\n config=bool,\n description=(\n "Skip the check that dbt's version matches the one specified in the "\n "dbt_project.yml file ('require-dbt-version')"\n ),\n is_required=False,\n default_value=False,\n ),\n "select": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="Specify the nodes to include.",\n ),\n "exclude": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n "selector": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The selector name to use, as defined in your selectors.yml",\n ),\n "state": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description=(\n "If set, use the given directory as the source for json files to compare with "\n "this project."\n ),\n ),\n },\n tags={"kind": "dbt"},\n)\n@experimental\ndef dbt_cli_seed(context):\n """This solid executes ``dbt seed`` via the dbt CLI."""\n cli_output = execute_cli(\n context.solid_config["dbt_executable"],\n command="seed",\n flags_dict=passthrough_flags_only(\n context.solid_config,\n (\n "full-refresh",\n "show",\n "threads",\n "no-version-check",\n "select",\n "exclude",\n "selector",\n "state",\n ),\n ),\n log=context.log,\n warn_error=context.solid_config["warn-error"],\n ignore_handled_error=context.solid_config["ignore_handled_error"],\n target_path=context.solid_config["target-path"],\n )\n\n yield Output(cli_output, "dbt_cli_output")\n
", "current_page_name": "_modules/dagster_dbt/cli/solids", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dbt.cli.solids"}, "types": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dbt.cli.types

\nfrom typing import Any, Dict, List\n\nfrom dagster import check, usable_as_dagster_type\n\nfrom ..types import DbtOutput\n\n\n
[docs]@usable_as_dagster_type\nclass DbtCliOutput(DbtOutput):\n """The results of executing a dbt command, along with additional metadata about the dbt CLI\n process that was run.\n\n Note that users should not construct instances of this class directly. This class is intended\n to be constructed from the JSON output of dbt commands.\n\n Attributes:\n command (str): The full shell command that was executed.\n return_code (int): The return code of the dbt CLI process.\n raw_output (str): The raw output (``stdout``) of the dbt CLI process.\n logs (List[Dict[str, Any]]): List of parsed JSON logs produced by the dbt command.\n result (Optional[Dict[str, Any]]): Dictionary containing dbt-reported result information\n contained in run_results.json. Some dbt commands do not produce results, and will\n therefore have result = None.\n """\n\n def __init__(\n self,\n command: str,\n return_code: int,\n raw_output: str,\n logs: List[Dict[str, Any]],\n result: Dict[str, Any],\n ):\n self._command = check.str_param(command, "command")\n self._return_code = check.int_param(return_code, "return_code")\n self._raw_output = check.str_param(raw_output, "raw_output")\n self._logs = check.list_param(logs, "logs", of_type=dict)\n super().__init__(result)\n\n @property\n def command(self) -> str:\n return self._command\n\n @property\n def return_code(self) -> int:\n return self._return_code\n\n @property\n def raw_output(self) -> str:\n return self._raw_output\n\n @property\n def logs(self) -> List[Dict[str, Any]]:\n return self._logs
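# Illustrative sketch (not taken from the module above): a downstream op consuming the
# DbtCliOutput produced by one of the CLI solids/ops and inspecting the attributes
# documented above (command, return_code, logs). The op name is a placeholder.
from dagster import op
from dagster_dbt.cli.types import DbtCliOutput

@op
def summarize_dbt_cli_output(dbt_cli_output: DbtCliOutput) -> str:
    return (
        f"`{dbt_cli_output.command}` exited with code {dbt_cli_output.return_code} "
        f"and produced {len(dbt_cli_output.logs)} log lines"
    )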
\n
", "current_page_name": "_modules/dagster_dbt/cli/types", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dbt.cli.types"}}, "cloud": {"ops": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dbt.cloud.ops

\nfrom dagster import Array, Bool, Field, In, Noneable, Nothing, Out, Output, op\n\nfrom ..utils import generate_materializations\nfrom .resources import DEFAULT_POLL_INTERVAL\nfrom .types import DbtCloudOutput\n\n\n
[docs]@op(\n required_resource_keys={"dbt_cloud"},\n ins={"start_after": In(Nothing)},\n out=Out(DbtCloudOutput, description="Parsed output from running the dbt Cloud job."),\n config_schema={\n "job_id": Field(\n config=int,\n is_required=True,\n description=(\n "The integer ID of the relevant dbt Cloud job. You can find this value by going to "\n "the details page of your job in the dbt Cloud UI. It will be the final number in the "\n "url, e.g.: "\n " https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/"\n ),\n ),\n "poll_interval": Field(\n float,\n default_value=DEFAULT_POLL_INTERVAL,\n description="The time (in seconds) that will be waited between successive polls.",\n ),\n "poll_timeout": Field(\n Noneable(float),\n default_value=None,\n description="The maximum time that will be waited before this operation is timed out. By "\n "default, this will never time out.",\n ),\n "yield_materializations": Field(\n config=Bool,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation will "\n "be yielded when the op executes."\n ),\n ),\n "asset_key_prefix": Field(\n config=Array(str),\n default_value=["dbt"],\n description=(\n "If provided and yield_materializations is True, these components will be used to "\n "prefix the generated asset keys."\n ),\n ),\n },\n tags={"kind": "dbt_cloud"},\n)\ndef dbt_cloud_run_op(context):\n """\n Initiates a run for a dbt Cloud job, then polls until the run completes. If the job\n fails or is otherwise stopped before succeeding, a `dagster.Failure` exception will be raised,\n and this op will fail.\n\n It requires the use of a 'dbt_cloud' resource, which is used to connect to the dbt Cloud API.\n\n **Config Options:**\n\n job_id (int)\n The integer ID of the relevant dbt Cloud job. You can find this value by going to the details\n page of your job in the dbt Cloud UI. It will be the final number in the url, e.g.:\n ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n poll_interval (float)\n The time (in seconds) that will be waited between successive polls. Defaults to ``10``.\n poll_timeout (float)\n The maximum time (in seconds) that will be waited before this operation is timed out. By\n default, this will never time out.\n yield_materializations (bool)\n If True, materializations corresponding to the results of the dbt operation will be\n yielded when the op executes. Defaults to ``True``.\n asset_key_prefix (List[str])\n If provided and yield_materializations is True, these components will be used to\n prefix the generated asset keys. Defaults to ["dbt"].\n\n **Examples:**\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_dbt import dbt_cloud_resource, dbt_cloud_run_op\n\n my_dbt_cloud_resource = dbt_cloud_resource.configured(\n {"auth_token": {"env": "DBT_CLOUD_AUTH_TOKEN"}, "account_id": 77777}\n )\n run_dbt_nightly_sync = dbt_cloud_run_op.configured(\n {"job_id": 54321}, name="run_dbt_nightly_sync"\n )\n\n @job(resource_defs={"dbt_cloud": my_dbt_cloud_resource})\n def dbt_cloud():\n run_dbt_nightly_sync()\n\n\n """\n dbt_output = context.resources.dbt_cloud.run_job_and_poll(\n context.op_config["job_id"],\n poll_interval=context.op_config["poll_interval"],\n poll_timeout=context.op_config["poll_timeout"],\n )\n if context.op_config["yield_materializations"] and "results" in dbt_output.result:\n yield from generate_materializations(\n dbt_output, asset_key_prefix=context.op_config["asset_key_prefix"]\n )\n yield Output(\n dbt_output,\n metadata={\n "created_at": dbt_output.run_details["created_at"],\n "started_at": dbt_output.run_details["started_at"],\n "finished_at": dbt_output.run_details["finished_at"],\n "total_duration": dbt_output.run_details["duration"],\n "run_duration": dbt_output.run_details["run_duration"],\n },\n )
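# Illustrative variant of the docstring example above (not taken from the module): it also
# sets the optional poll_timeout and yield_materializations config fields defined on this op.
# The account and job ids are placeholders.
from dagster import job
from dagster_dbt import dbt_cloud_resource, dbt_cloud_run_op

my_dbt_cloud_resource = dbt_cloud_resource.configured(
    {"auth_token": {"env": "DBT_CLOUD_AUTH_TOKEN"}, "account_id": 77777}
)
run_dbt_nightly_sync = dbt_cloud_run_op.configured(
    {"job_id": 54321, "poll_timeout": 3600.0, "yield_materializations": False},
    name="run_dbt_nightly_sync",
)

@job(resource_defs={"dbt_cloud": my_dbt_cloud_resource})
def dbt_cloud_nightly_job():
    run_dbt_nightly_sync()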
\n
", "current_page_name": "_modules/dagster_dbt/cloud/ops", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dbt.cloud.ops"}, "resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dbt.cloud.resources

\nimport datetime\nimport json\nimport logging\nimport time\nfrom typing import Any, Dict, List, Optional, cast\nfrom urllib.parse import urljoin\n\nimport requests\nfrom requests.exceptions import RequestException\n\nfrom dagster import (\n    Failure,\n    Field,\n    MetadataValue,\n    StringSource,\n    __version__,\n    check,\n    get_dagster_logger,\n    resource,\n)\nfrom dagster.utils.merger import deep_merge_dicts\n\nfrom .types import DbtCloudOutput\n\nDBT_DEFAULT_HOST = "https://cloud.getdbt.com/"\nDBT_ACCOUNTS_PATH = "api/v2/accounts/"\n\n# default polling interval (in seconds)\nDEFAULT_POLL_INTERVAL = 10\n\n\n
[docs]class DbtCloudResourceV2:\n """This class exposes methods on top of the dbt Cloud REST API v2.\n\n For a complete set of documentation on the dbt Cloud Administrative REST API, including expected\n response JSON schemae, see the `dbt Cloud API Docs <https://docs.getdbt.com/dbt-cloud/api-v2>`_.\n """\n\n def __init__(\n self,\n auth_token: str,\n account_id: int,\n disable_schedule_on_trigger: bool = True,\n request_max_retries: int = 3,\n request_retry_delay: float = 0.25,\n dbt_cloud_host: str = DBT_DEFAULT_HOST,\n log: logging.Logger = get_dagster_logger(),\n log_requests: bool = False,\n ):\n self._auth_token = auth_token\n self._account_id = account_id\n self._disable_schedule_on_trigger = disable_schedule_on_trigger\n\n self._request_max_retries = request_max_retries\n self._request_retry_delay = request_retry_delay\n\n self._dbt_cloud_host = dbt_cloud_host\n self._log = log\n self._log_requests = log_requests\n\n @property\n def api_base_url(self) -> str:\n return urljoin(self._dbt_cloud_host, DBT_ACCOUNTS_PATH)\n\n
[docs] def make_request(\n self,\n method: str,\n endpoint: str,\n data: Optional[Dict[str, Any]] = None,\n return_text: bool = False,\n ) -> Dict[str, Any]:\n """\n Creates and sends a request to the desired dbt Cloud API endpoint.\n\n Args:\n method (str): The http method to use for this request (e.g. "POST", "GET", "PATCH").\n endpoint (str): The dbt Cloud API endpoint to send this request to.\n data (Optional[str]): JSON-formatted data string to be included in the request.\n return_text (bool): Override default behavior and return unparsed {"text": response.text}\n blob instead of json.\n Returns:\n Dict[str, Any]: Parsed json data from the response to this request\n """\n\n headers = {\n "User-Agent": f"dagster-dbt/{__version__}",\n "Content-Type": "application/json",\n "Authorization": f"Bearer {self._auth_token}",\n }\n url = urljoin(self.api_base_url, endpoint)\n\n if self._log_requests:\n self._log.debug(f"Making Request: method={method} url={url} data={data}")\n\n num_retries = 0\n while True:\n try:\n response = requests.request(\n method=method,\n url=url,\n headers=headers,\n data=json.dumps(data),\n allow_redirects=False,\n )\n response.raise_for_status()\n return {"text": response.text} if return_text else response.json()["data"]\n except RequestException as e:\n self._log.error("Request to dbt Cloud API failed: %s", e)\n if num_retries == self._request_max_retries:\n break\n num_retries += 1\n time.sleep(self._request_retry_delay)\n\n raise Failure("Exceeded max number of retries.")
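# Illustrative sketch (not taken from the module above): make_request can be called directly
# for endpoints without a dedicated helper. The "{account_id}/jobs/" (list jobs) endpoint is
# assumed from the dbt Cloud v2 API docs rather than from this module; the token and account
# id are placeholders.
account_id = 11111
dbt_cloud = DbtCloudResourceV2(auth_token="my-token", account_id=account_id)
jobs = dbt_cloud.make_request("GET", f"{account_id}/jobs/")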
\n\n
[docs] def get_job(self, job_id: int) -> Dict[str, Any]:\n """\n Gets details about a given dbt job from the dbt Cloud API.\n\n Args:\n job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n the details page of your job in the dbt Cloud UI. It will be the final number in the\n url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n Returns:\n Dict[str, Any]: Parsed json data from the response to this request\n """\n return self.make_request("GET", f"{self._account_id}/jobs/{job_id}/")
\n\n
[docs] def update_job(self, job_id: int, **kwargs) -> Dict[str, Any]:\n """\n Updates specific properties of a dbt job. Documentation on the full set of potential\n parameters can be found here: https://docs.getdbt.com/dbt-cloud/api-v2#operation/updateJobById\n\n Args:\n job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n the details page of your job in the dbt Cloud UI. It will be the final number in the\n url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n kwargs: Passed in as the properties to be changed.\n Returns:\n Dict[str, Any]: Parsed json data from the response to this request\n\n Examples:\n\n .. code-block:: python\n\n # disable schedule for job with id=12345\n my_dbt_cloud_resource.update_job(12345, triggers={"schedule": False})\n """\n # API requires you to supply a bunch of values, so we can just use the current state\n # as the defaults\n job_data = self.get_job(job_id)\n return self.make_request(\n "POST", f"{self._account_id}/jobs/{job_id}/", data=deep_merge_dicts(job_data, kwargs)\n )
\n\n
[docs] def run_job(self, job_id: int, **kwargs) -> Dict[str, Any]:\n """\n Initializes a run for a job. Overrides for specific properties can be set by passing in\n values to the kwargs. A full list of overridable properties can be found here:\n https://docs.getdbt.com/dbt-cloud/api-v2#operation/triggerRun\n\n Args:\n job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n the details page of your job in the dbt Cloud UI. It will be the final number in the\n url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n kwargs: Passed in as the properties to be overridden.\n\n Returns:\n Dict[str, Any]: Parsed json data from the response to this request\n """\n if self._disable_schedule_on_trigger:\n self._log.info("Disabling dbt Cloud job schedule.")\n self.update_job(job_id, triggers={"schedule": False})\n self._log.info(f"Initializing run for job with job_id={job_id}")\n if "cause" not in kwargs:\n kwargs["cause"] = "Triggered via Dagster"\n resp = self.make_request("POST", f"{self._account_id}/jobs/{job_id}/run/", data=kwargs)\n self._log.info(\n f"Run initialized with run_id={resp['id']}. View this run in "\n f"the dbt Cloud UI: {resp['href']}"\n )\n return resp
\n\n
[docs] def get_run(self, run_id: int, include_related: Optional[List[str]] = None) -> Dict[str, Any]:\n """\n Gets details about a specific job run.\n\n Args:\n run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n the details page of your run in the dbt Cloud UI. It will be the final number in the\n url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n include_related (List[str]): List of related fields to pull with the run. Valid values\n are "trigger", "job", and "debug_logs".\n\n Returns:\n Dict[str, Any]: A dictionary containing the parsed contents of the dbt Cloud run details.\n See: https://docs.getdbt.com/dbt-cloud/api-v2#operation/getRunById for schema.\n """\n query_params = f"?include_related={','.join(include_related)}" if include_related else ""\n return self.make_request(\n "GET",\n f"{self._account_id}/runs/{run_id}/{query_params}",\n )
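# Illustrative sketch (not taken from the module above): fetching run details together with
# related objects. Per the docstring above, valid include_related values are "trigger",
# "job", and "debug_logs"; the credentials and run id are placeholders.
dbt_cloud = DbtCloudResourceV2(auth_token="my-token", account_id=11111)
run = dbt_cloud.get_run(999999, include_related=["job", "trigger"])
print(run["status_humanized"], run["href"])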
\n\n
[docs] def get_run_steps(self, run_id: int) -> List[str]:\n """\n Gets the steps of an initialized dbt Cloud run.\n\n Args:\n run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n the details page of your run in the dbt Cloud UI. It will be the final number in the\n url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n\n Returns:\n List[str]: List of commands for each step of the run.\n """\n run_details = self.get_run(run_id, include_related=["trigger", "job"])\n steps = run_details["job"]["execute_steps"]\n steps_override = run_details["trigger"]["steps_override"]\n return steps_override or steps
\n\n
[docs] def cancel_run(self, run_id: int) -> Dict[str, Any]:\n """\n Cancels a dbt Cloud run.\n\n Args:\n run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n the details page of your run in the dbt Cloud UI. It will be the final number in the\n url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n\n Returns:\n Dict[str, Any]: A dictionary containing the parsed contents of the dbt Cloud run details.\n See: https://docs.getdbt.com/dbt-cloud/api-v2#operation/getRunById for schema.\n """\n self._log.info(f"Cancelling run with id '{run_id}'")\n return self.make_request("POST", f"{self._account_id}/runs/{run_id}/cancel/")
\n\n
[docs] def list_run_artifacts(self, run_id: int, step: Optional[int] = None) -> List[str]:\n """\n Lists the paths of the available run artifacts from a completed dbt Cloud run.\n\n Args:\n run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n the details page of your run in the dbt Cloud UI. It will be the final number in the\n url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n step (int): The index of the step in the run to query for artifacts. The first step in\n the run has the index 1. If the step parameter is omitted, then this endpoint will\n return the artifacts compiled for the last step in the run\n\n Returns:\n List[str]: List of the paths of the available run artifacts\n """\n query_params = f"?step={step}" if step else ""\n return cast(\n list,\n self.make_request(\n "GET",\n f"{self._account_id}/runs/{run_id}/artifacts/{query_params}",\n data={"step": step} if step else None,\n ),\n )
\n\n
[docs] def get_run_artifact(self, run_id: int, path: str, step: Optional[int] = None) -> str:\n """\n The string contents of a run artifact from a dbt Cloud run.\n\n Args:\n run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n the details page of your run in the dbt Cloud UI. It will be the final number in the\n url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n path (str): The path to this run artifact (e.g. 'run/my_new_project/models/example/my_first_dbt_model.sql')\n step (int): The index of the step in the run to query for artifacts. The first step in\n the run has the index 1. If the step parameter is omitted, then this endpoint will\n return the artifacts compiled for the last step in the run.\n\n Returns:\n str: The string contents of the specified run artifact.\n """\n query_params = f"?step={step}" if step else ""\n return self.make_request(\n "GET",\n f"{self._account_id}/runs/{run_id}/artifacts/{path}{query_params}",\n data={"step": step} if step else None,\n return_text=True,\n )["text"]
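# Illustrative sketch (not taken from the module above) combining list_run_artifacts and
# get_run_artifact: list what a completed run produced, then fetch one artifact as text.
# "manifest.json" is a standard dbt artifact name; the credentials and run id are placeholders.
dbt_cloud = DbtCloudResourceV2(auth_token="my-token", account_id=11111)
artifact_paths = dbt_cloud.list_run_artifacts(999999)
if "manifest.json" in artifact_paths:
    manifest_text = dbt_cloud.get_run_artifact(999999, "manifest.json")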
\n\n
[docs] def get_manifest(self, run_id: int, step: Optional[int] = None) -> Dict[str, Any]:\n """\n The parsed contents of a manifest.json file created by a completed run.\n\n Args:\n run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n the details page of your run in the dbt Cloud UI. It will be the final number in the\n url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n step (int): The index of the step in the run to query for artifacts. The first step in\n the run has the index 1. If the step parameter is omitted, then this endpoint will\n return the artifacts compiled for the last step in the run.\n\n Returns:\n Dict[str, Any]: Parsed contents of the manifest.json file\n """\n return json.loads(self.get_run_artifact(run_id, "manifest.json", step=step))
\n\n
[docs] def get_run_results(self, run_id: int, step: Optional[int] = None) -> Dict[str, Any]:\n """\n The parsed contents of a run_results.json file created by a completed run.\n\n Args:\n run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n the details page of your run in the dbt Cloud UI. It will be the final number in the\n url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n step (int): The index of the step in the run to query for artifacts. The first step in\n the run has the index 1. If the step parameter is omitted, then this endpoint will\n return the artifacts compiled for the last step in the run.\n\n Returns:\n Dict[str, Any]: Parsed contents of the run_results.json file\n """\n return json.loads(self.get_run_artifact(run_id, "run_results.json", step=step))
\n\n
[docs] def poll_run(\n self,\n run_id: int,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n href: Optional[str] = None,\n ) -> Dict[str, Any]:\n """\n Polls a dbt Cloud job run until it completes. Will raise a `dagster.Failure` exception if the\n run does not complete successfully.\n\n Args:\n run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n the details page of your run in the dbt Cloud UI. It will be the final number in the\n url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n poll_interval (float): The time (in seconds) that should be waited between successive\n polls of the dbt Cloud API.\n poll_timeout (float): The maximum time (in seconds) that should be waited for this run\n to complete. If this threshold is exceeded, the run will be cancelled and an\n exception will be thrown. By default, this will poll forever.\n href (str): For internal use, generally should not be set manually.\n\n Returns:\n Dict[str, Any]: A dictionary containing the parsed contents of the dbt Cloud run details.\n See: https://docs.getdbt.com/dbt-cloud/api-v2#operation/getRunById for schema.\n """\n\n if not href:\n href = self.get_run(run_id).get("href")\n\n poll_start = datetime.datetime.now()\n while True:\n run_details = self.get_run(run_id)\n status = run_details["status_humanized"]\n self._log.info(f"Polled run {run_id}. Status: [{status}]")\n\n # completed successfully\n if status == "Success":\n return self.get_run(run_id, include_related=["job", "trigger"])\n elif status in ["Error", "Cancelled"]:\n break\n elif status not in ["Queued", "Starting", "Running"]:\n check.failed(f"Received unexpected status '{status}'. This should never happen")\n\n if poll_timeout and datetime.datetime.now() > poll_start + datetime.timedelta(\n seconds=poll_timeout\n ):\n self.cancel_run(run_id)\n raise Failure(\n f"Run {run_id} timed out after "\n f"{datetime.datetime.now() - poll_start}. Attempted to cancel.",\n metadata={"run_page_url": MetadataValue.url(href)},\n )\n\n # Sleep for the configured time interval before polling again.\n time.sleep(poll_interval)\n\n run_details = self.get_run(run_id, include_related=["trigger"])\n raise Failure(\n f"Run {run_id} failed. Status Message: {run_details['status_message']}",\n metadata={\n "run_details": MetadataValue.json(run_details),\n "run_page_url": MetadataValue.url(href),\n },\n )
\n\n
[docs] def run_job_and_poll(\n self,\n job_id: int,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n ) -> DbtCloudOutput:\n """\n Runs a dbt Cloud job and polls until it completes. Will raise a `dagster.Failure` exception\n if the run does not complete successfully.\n\n Args:\n job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n the details page of your job in the dbt Cloud UI. It will be the final number in the\n url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n poll_interval (float): The time (in seconds) that should be waited between successive\n polls of the dbt Cloud API.\n poll_timeout (float): The maximum time (in seconds) that should be waited for this run\n to complete. If this threshold is exceeded, the run will be cancelled and an\n exception will be thrown. By default, this will poll forever.\n\n Returns:\n :py:class:`~DbtCloudOutput`: Class containing details about the specific job run and the\n parsed run results.\n """\n run_details = self.run_job(job_id)\n run_id = run_details["id"]\n href = run_details["href"]\n final_run_details = self.poll_run(\n run_id, poll_interval=poll_interval, poll_timeout=poll_timeout, href=href\n )\n output = DbtCloudOutput(run_details=final_run_details, result=self.get_run_results(run_id))\n if output.docs_url:\n self._log.info(f"Docs for this run can be viewed here: {output.docs_url}")\n return output
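# Illustrative sketch (not taken from the module above): trigger a job and block until it
# finishes, then inspect the DbtCloudOutput returned above (run_details and the parsed
# run_results). The credentials and job id are placeholders.
dbt_cloud = DbtCloudResourceV2(auth_token="my-token", account_id=11111)
output = dbt_cloud.run_job_and_poll(54321, poll_interval=15, poll_timeout=1800)
print(output.run_details["status_humanized"], len(output.result.get("results", [])))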
\n\n\n
[docs]@resource(\n config_schema={\n "auth_token": Field(\n StringSource,\n is_required=True,\n description="dbt Cloud API Token. User tokens can be found in the "\n "[dbt Cloud UI](https://cloud.getdbt.com/#/profile/api/), or see the "\n "[dbt Cloud Docs](https://docs.getdbt.com/docs/dbt-cloud/dbt-cloud-api/service-tokens) "\n "for instructions on creating a Service Account token.",\n ),\n "account_id": Field(\n int,\n is_required=True,\n description="dbt Cloud Account ID. This value can be found in the url of a variety of "\n "views in the dbt Cloud UI, e.g. https://cloud.getdbt.com/#/accounts/{account_id}/settings/.",\n ),\n "disable_schedule_on_trigger": Field(\n bool,\n default_value=True,\n description="Specifies if you would like any job that is triggered using this "\n "resource to automatically disable its schedule.",\n ),\n "request_max_retries": Field(\n int,\n default_value=3,\n description="The maximum number of times requests to the dbt Cloud API should be retried "\n "before failing.",\n ),\n "request_retry_delay": Field(\n float,\n default_value=0.25,\n description="Time (in seconds) to wait between each request retry.",\n ),\n },\n description="This resource helps interact with dbt Cloud connectors",\n)\ndef dbt_cloud_resource(context) -> DbtCloudResourceV2:\n """\n This resource allows users to programatically interface with the dbt Cloud Administrative REST\n API (v2) to launch jobs and monitor their progress. This currently implements only a subset of\n the functionality exposed by the API.\n\n For a complete set of documentation on the dbt Cloud Administrative REST API, including expected\n response JSON schemae, see the `dbt Cloud API Docs <https://docs.getdbt.com/dbt-cloud/api-v2>`_.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/overview/configuration#configured>`_ method.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import job\n from dagster_dbt import dbt_cloud_resource\n\n my_dbt_cloud_resource = dbt_cloud_resource.configured(\n {\n "auth_token": {"env": "DBT_CLOUD_AUTH_TOKEN"},\n "account_id": 30000,\n }\n )\n\n @job(resource_defs={"dbt_cloud":my_dbt_cloud_resource})\n def my_dbt_cloud_job():\n ...\n """\n return DbtCloudResourceV2(\n auth_token=context.resource_config["auth_token"],\n account_id=context.resource_config["account_id"],\n disable_schedule_on_trigger=context.resource_config["disable_schedule_on_trigger"],\n request_max_retries=context.resource_config["request_max_retries"],\n request_retry_delay=context.resource_config["request_retry_delay"],\n log=context.log,\n )
\n
", "current_page_name": "_modules/dagster_dbt/cloud/resources", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dbt.cloud.resources"}}, "dbt_resource": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dbt.dbt_resource

\nimport logging\nfrom abc import abstractmethod\nfrom typing import Any, Dict, List, Optional\n\nfrom dagster import get_dagster_logger\n\nfrom .types import DbtOutput\n\n\n
[docs]class DbtResource:\n """Base class for a resource allowing users to interface with dbt"""\n\n def __init__(\n self,\n logger: Optional[logging.Logger] = None,\n ):\n """Constructor\n\n Args:\n logger (Optional[Any]): A property for injecting a logger dependency.\n Default is ``None``.\n """\n self._logger = logger or get_dagster_logger()\n\n def _format_params(\n self, flags: Dict[str, Any], replace_underscores: bool = False\n ) -> Dict[str, Any]:\n """\n Reformats arguments that are easier to express as a list into the format that dbt expects,\n and deletes any keys with no value.\n """\n\n # remove any keys with a value of None\n if replace_underscores:\n flags = {k.replace("_", "-"): v for k, v in flags.items() if v is not None}\n else:\n flags = {k: v for k, v in flags.items() if v is not None}\n\n for param in ["select", "exclude", "models"]:\n if param in flags:\n if isinstance(flags[param], list):\n # if it's a list, format as space-separated\n flags[param] = " ".join(set(flags[param]))\n\n return flags\n\n @property\n def logger(self) -> logging.Logger:\n """logging.Logger: A property for injecting a logger dependency."""\n return self._logger\n\n
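# Purely illustrative (not taken from the module above): exercising the _format_params helper
# through a concrete subclass, DbtRpcResource from dagster_dbt.rpc.resources. None-valued
# flags are dropped, underscores become hyphens when replace_underscores=True, and list-valued
# selection flags are joined into a single space-separated string.
from dagster_dbt.rpc.resources import DbtRpcResource

flags = DbtRpcResource()._format_params(
    {"models": ["model_a", "model_b"], "full_refresh": True, "threads": None},
    replace_underscores=True,
)
# e.g. {"models": "model_a model_b", "full-refresh": True}  (model ordering may vary)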
[docs] @abstractmethod\n def compile(\n self, models: Optional[List[str]] = None, exclude: Optional[List[str]] = None, **kwargs\n ) -> DbtOutput:\n """\n Run the ``compile`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n models (List[str], optional): the models to include in compilation.\n exclude (List[str], optional): the models to exclude from compilation.\n\n Returns:\n DbtOutput: object containing parsed output from dbt\n """
\n\n
[docs] @abstractmethod\n def run(\n self, models: Optional[List[str]] = None, exclude: Optional[List[str]] = None, **kwargs\n ) -> DbtOutput:\n """\n Run the ``run`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n models (List[str], optional): the models to include in the run.\n exclude (List[str], optional): the models to exclude from the run.\n\n Returns:\n DbtOutput: object containing parsed output from dbt\n """
\n\n
[docs] @abstractmethod\n def snapshot(\n self, select: Optional[List[str]] = None, exclude: Optional[List[str]] = None, **kwargs\n ) -> DbtOutput:\n """\n Run the ``snapshot`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n select (List[str], optional): the snapshots to include in the run.\n exclude (List[str], optional): the snapshots to exclude from the run.\n\n Returns:\n DbtOutput: object containing parsed output from dbt\n """
\n\n
[docs] @abstractmethod\n def test(\n self,\n models: Optional[List[str]] = None,\n exclude: Optional[List[str]] = None,\n data: bool = True,\n schema: bool = True,\n **kwargs,\n ) -> DbtOutput:\n """\n Run the ``test`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n models (List[str], optional): the models to include in testing.\n exclude (List[str], optional): the models to exclude from testing.\n data (bool, optional): If ``True`` (default), then run data tests.\n schema (bool, optional): If ``True`` (default), then run schema tests.\n\n Returns:\n DbtOutput: object containing parsed output from dbt\n """
\n\n
[docs] @abstractmethod\n def seed(\n self,\n show: bool = False,\n select: Optional[List[str]] = None,\n exclude: Optional[List[str]] = None,\n **kwargs,\n ) -> DbtOutput:\n """\n Run the ``seed`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n show (bool, optional): If ``True``, then show a sample of the seeded data in the\n response. Defaults to ``False``.\n select (List[str], optional): the seeds to include in the run.\n exclude (List[str], optional): the seeds to exclude from the run.\n\n Returns:\n DbtOutput: object containing parsed output from dbt\n """
\n\n
[docs] @abstractmethod\n def ls(\n self,\n select: Optional[List[str]] = None,\n models: Optional[List[str]] = None,\n exclude: Optional[List[str]] = None,\n **kwargs,\n ) -> DbtOutput:\n """\n Run the ``ls`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n select (List[str], optional): the resources to include in the output.\n models (List[str], optional): the models to include in the output.\n exclude (List[str], optional): the resources to exclude from the output.\n\n\n Returns:\n DbtOutput: object containing parsed output from dbt\n """
\n\n
[docs] @abstractmethod\n def generate_docs(self, compile_project: bool = False, **kwargs) -> DbtOutput:\n """\n Run the ``docs generate`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n compile_project (bool, optional): If true, compile the project before generating a catalog.\n\n Returns:\n DbtOutput: object containing parsed output from dbt\n """
\n\n
[docs] @abstractmethod\n def run_operation(\n self, macro: str, args: Optional[Dict[str, Any]] = None, **kwargs\n ) -> DbtOutput:\n """\n Run the ``run-operation`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n macro (str): the dbt macro to invoke.\n args (Dict[str, Any], optional): the keyword arguments to be supplied to the macro.\n\n Returns:\n DbtOutput: object containing parsed output from dbt\n """
\n
", "current_page_name": "_modules/dagster_dbt/dbt_resource", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dbt.dbt_resource"}, "errors": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dbt.errors

\nfrom abc import ABC\nfrom typing import Any, Dict, List\n\nfrom dagster import Failure, MetadataEntry, check\n\n\n
[docs]class DagsterDbtError(Failure, ABC):\n """The base exception of the ``dagster-dbt`` library."""
\n\n\n
[docs]class DagsterDbtCliUnexpectedOutputError(DagsterDbtError):\n """Represents an error when parsing the output of a dbt CLI command."""\n\n invalid_line_nos: List[int]\n\n def __init__(self, invalid_line_nos: List[int]):\n check.list_param(invalid_line_nos, "invalid_line_nos", int)\n line_nos_str = ", ".join(map(str, invalid_line_nos))\n description = f"dbt CLI emitted unexpected output on lines {line_nos_str}"\n metadata_entries = [\n MetadataEntry("Invalid CLI Output Line Numbers", value={"line_nos": invalid_line_nos})\n ]\n super().__init__(description, metadata_entries)\n self.invalid_line_nos = invalid_line_nos
\n\n\n
[docs]class DagsterDbtCliRuntimeError(DagsterDbtError, ABC):\n """Represents an error while executing a dbt CLI command."""\n\n def __init__(self, description: str, logs: List[Dict[str, Any]], raw_output: str):\n metadata_entries = [\n MetadataEntry(\n "Parsed CLI Output (JSON)",\n value={"logs": logs},\n ),\n MetadataEntry(\n "Parsed CLI Output (JSON) Message Attributes",\n value=DagsterDbtCliRuntimeError.stitch_messages(logs),\n ),\n MetadataEntry(\n "Raw CLI Output",\n value=raw_output,\n ),\n ]\n super().__init__(description, metadata_entries)\n\n @staticmethod\n def stitch_messages(logs: List[dict]) -> str:\n return "\\n".join(\n log["message"].strip("\\n")\n for log in logs\n if isinstance(log.get("message"), str) # defensive\n )
\n\n\n
[docs]class DagsterDbtCliHandledRuntimeError(DagsterDbtCliRuntimeError):\n """Represents a model error reported by the dbt CLI at runtime (return code 1)."""\n\n def __init__(self, logs: List[Dict[str, Any]], raw_output: str):\n super().__init__("Handled error in the dbt CLI (return code 1)", logs, raw_output)
\n\n\n
[docs]class DagsterDbtCliFatalRuntimeError(DagsterDbtCliRuntimeError):\n """Represents a fatal error in the dbt CLI (return code 2)."""\n\n def __init__(self, logs: List[Dict[str, Any]], raw_output: str):\n super().__init__("Fatal error in the dbt CLI (return code 2)", logs, raw_output)
\n\n\n
[docs]class DagsterDbtRpcUnexpectedPollOutputError(DagsterDbtError):\n """Represents an unexpected response when polling the dbt RPC server."""
\n\n\n
[docs]class DagsterDbtCliOutputsNotFoundError(DagsterDbtError):\n """Represents a problem in finding the ``target/run_results.json`` artifact when executing a dbt\n CLI command.\n\n For more details on ``target/run_results.json``, see\n https://docs.getdbt.com/reference/dbt-artifacts#run_resultsjson.\n """\n\n def __init__(self, path: str):\n super().__init__("Expected to find file at path {}".format(path))
\n
", "current_page_name": "_modules/dagster_dbt/errors", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dbt.errors"}, "ops": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dbt.ops

\nfrom dagster import Array, Bool, Field, In, Nothing, Out, Output, op\n\nfrom .types import DbtOutput\nfrom .utils import generate_materializations\n\n_DEFAULT_OP_PROPS = dict(\n    required_resource_keys={"dbt"},\n    ins={"start_after": In(Nothing)},\n    out=Out(DbtOutput, description="Parsed output from running the dbt command."),\n    tags={"kind": "dbt"},\n)\n\n\ndef _get_doc(op_name: str, dbt_command: str) -> str:\n    return f"""\nThis op executes a ``dbt {dbt_command}`` command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the :py:class:`~dagster_dbt.dbt_cli_resource`) or\nover RPC (using the :py:class:`~dbt_rpc_sync_resource`).\n\nExamples:\n\n.. code-block:: python\n\n    from dagster import job\n    from dagster_dbt import {op_name}, dbt_cli_resource, dbt_rpc_sync_resource\n\n    @job(resource_defs={{"dbt":dbt_cli_resource}})\n    def my_dbt_cli_job():\n        {op_name}()\n\n    @job(resource_defs={{"dbt":dbt_rpc_sync_resource}})\n    def my_dbt_rpc_job():\n        {op_name}()\n    """\n\n\n
[docs]@op(\n **_DEFAULT_OP_PROPS,\n config_schema={\n "yield_materializations": Field(\n config=Bool,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation will "\n "be yielded when the op executes. Default: True"\n ),\n ),\n "asset_key_prefix": Field(\n config=Array(str),\n default_value=["dbt"],\n description=(\n "If provided and yield_materializations is True, these components will be used to "\n "prefix the generated asset keys."\n ),\n ),\n },\n)\ndef dbt_run_op(context):\n dbt_output = context.resources.dbt.run()\n if context.op_config["yield_materializations"] and "results" in dbt_output.result:\n yield from generate_materializations(\n dbt_output, asset_key_prefix=context.op_config["asset_key_prefix"]\n )\n yield Output(dbt_output)
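# Illustrative sketch (not taken from the module above): the two config fields above can be
# pre-set with .configured, e.g. to prefix the asset keys emitted for dbt models. The
# resource wiring matches the generated docstrings below (a "dbt" resource such as
# dbt_cli_resource).
from dagster import job
from dagster_dbt import dbt_cli_resource, dbt_run_op

prefixed_dbt_run = dbt_run_op.configured(
    {"asset_key_prefix": ["analytics", "dbt"]}, name="prefixed_dbt_run"
)

@job(resource_defs={"dbt": dbt_cli_resource})
def analytics_dbt_job():
    prefixed_dbt_run()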
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_compile_op(context):\n return context.resources.dbt.compile()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_ls_op(context):\n return context.resources.dbt.ls()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_test_op(context):\n return context.resources.dbt.test()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_snapshot_op(context):\n return context.resources.dbt.snapshot()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_seed_op(context):\n return context.resources.dbt.seed()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_docs_generate_op(context):\n return context.resources.dbt.generate_docs()
\n\n\nfor op, cmd in [\n (dbt_run_op, "run"),\n (dbt_compile_op, "compile"),\n (dbt_ls_op, "ls"),\n (dbt_test_op, "test"),\n (dbt_snapshot_op, "snapshot"),\n (dbt_seed_op, "seed"),\n (dbt_docs_generate_op, "docs generate"),\n]:\n op.__doc__ = _get_doc(op.name, cmd)\n
", "current_page_name": "_modules/dagster_dbt/ops", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dbt.ops"}, "rpc": {"resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dbt.rpc.resources

\nimport json\nimport platform\nimport sys\nimport time\nimport uuid\nfrom base64 import standard_b64encode as b64\nfrom typing import Any, Dict, List, Optional\n\nimport requests\n\nfrom dagster import Failure, Field, IntSource, RetryRequested, StringSource, check, resource\nfrom dagster.core.utils import coerce_valid_log_level\n\nfrom ..dbt_resource import DbtResource\nfrom .types import DbtRpcOutput\nfrom .utils import is_fatal_code\n\n\n
[docs]class DbtRpcResource(DbtResource):\n """A client for a dbt RPC server.\n\n To use this as a dagster resource, we recommend using\n :func:`dbt_rpc_resource <dagster_dbt.dbt_rpc_resource>`.\n """\n\n def __init__(\n self,\n host: str = "0.0.0.0",\n port: int = 8580,\n jsonrpc_version: str = "2.0",\n logger: Optional[Any] = None,\n **_,\n ):\n """Constructor\n\n Args:\n host (str): The IP address of the host of the dbt RPC server. Default is ``"0.0.0.0"``.\n port (int): The port of the dbt RPC server. Default is ``8580``.\n jsonrpc_version (str): The JSON-RPC version to send in RPC requests.\n Default is ``"2.0"``.\n logger (Optional[Any]): A property for injecting a logger dependency.\n Default is ``None``.\n """\n check.str_param(host, "host")\n check.int_param(port, "port")\n check.str_param(jsonrpc_version, "jsonrpc_version")\n\n self._host = host\n self._port = port\n self._jsonrpc_version = jsonrpc_version\n super().__init__(logger)\n\n @staticmethod\n def _construct_user_agent() -> str:\n """A helper method to construct a standard User-Agent string to be used in HTTP request\n headers.\n\n Returns:\n str: The constructed User-Agent value.\n """\n client = "dagster/dbt-rpc-client"\n python_version = (\n f"Python/{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"\n )\n system_info = f"{platform.system()}/{platform.release()}"\n user_agent = " ".join([python_version, client, system_info])\n return user_agent\n\n def _construct_headers(self) -> Dict[str, str]:\n """Constructs a standard set of headers for HTTP requests.\n\n Returns:\n Dict[str, str]: The HTTP request headers.\n """\n headers = requests.utils.default_headers()\n headers["User-Agent"] = self._construct_user_agent()\n headers["Content-Type"] = "application/json"\n headers["Accept"] = "application/json"\n return headers\n\n def _post(self, data: Optional[str] = None) -> DbtRpcOutput:\n """Constructs and sends a POST request to the dbt RPC server.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n headers = self._construct_headers()\n try:\n response = requests.post(self.url, headers=headers, data=data)\n response.raise_for_status()\n except requests.exceptions.HTTPError as e:\n if is_fatal_code(e):\n raise e\n else:\n raise RetryRequested(max_retries=5, seconds_to_wait=30)\n return DbtRpcOutput(response)\n\n def _get_result(self, data: Optional[str] = None) -> DbtRpcOutput:\n """Constructs and sends a POST request to the dbt RPC server.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n return self._post(data)\n\n def _default_request(\n self, method: str, params: Optional[Dict[str, Any]] = None\n ) -> Dict[str, Any]:\n """Constructs a standard HTTP request body, to be sent to a dbt RPC server.\n\n Args:\n method (str): a dbt RPC method.\n\n Returns:\n Dict: the constructed HTTP request body.\n """\n data = {\n "jsonrpc": self.jsonrpc_version,\n "method": method,\n "id": str(uuid.uuid1()),\n "params": params or {},\n }\n return data\n\n @property\n def host(self) -> str:\n """str: The IP address of the host of the dbt RPC server."""\n return self._host\n\n @property\n def port(self) -> int:\n """int: The port of the dbt RPC server."""\n return self._port\n\n @property\n def jsonrpc_version(self) -> str:\n """str: The JSON-RPC version to send in RPC requests."""\n return self._jsonrpc_version\n\n @property\n def logger(self) -> Optional[Any]:\n """Optional[Any]: A property for injecting a logger dependency."""\n return self._logger\n\n @property\n 
def url(self) -> str:\n """str: The URL for sending dbt RPC requests."""\n return f"http://{self.host}:{self.port}/jsonrpc"\n\n
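# Illustrative sketch (not part of this module): with the defaults above, requests are\n # POSTed to http://0.0.0.0:8580/jsonrpc, and ``_default_request`` builds the JSON-RPC\n # 2.0 envelope, e.g.:\n #\n # client = DbtRpcResource(host="127.0.0.1", port=8580)  # hypothetical host\n # body = client._default_request(method="status")\n # # body == {"jsonrpc": "2.0", "method": "status", "id": "<uuid1>", "params": {}}\n # # and is sent to client.url == "http://127.0.0.1:8580/jsonrpc"\n\n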
[docs] def status(self):\n """Sends a request with the method ``status`` to the dbt RPC server, and returns the\n response. For more details, see the dbt docs for the RPC method `status\n <https://docs.getdbt.com/reference/commands/rpc/#status>`_.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n data = self._default_request(method="status")\n return self._post(data=json.dumps(data))
\n\n
[docs] def ls(\n self,\n select: Optional[List[str]] = None,\n models: Optional[List[str]] = None,\n exclude: Optional[List[str]] = None,\n **kwargs,\n ) -> DbtRpcOutput:\n """Sends a request with the method ``list`` to the dbt RPC server, and returns the\n response. For more details, see the dbt docs for `list\n <https://docs.getdbt.com/reference/commands/rpc/#list>`_.\n\n Args:\n select (List[str], optional): the resources to include in the output.\n models (List[str], optional): the models to include in the output.\n exclude (List[str], optional): the resources to exclude from the output.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n\n explicit_params = dict(select=select, models=models, exclude=exclude)\n params = self._format_params({**explicit_params, **kwargs})\n data = self._default_request(method="list", params=params)\n\n return self._get_result(data=json.dumps(data))
\n\n
[docs] def poll(self, request_token: str, logs: bool = False, logs_start: int = 0) -> DbtRpcOutput:\n """Sends a request with the method ``poll`` to the dbt RPC server, and returns the response.\n For more details, see the dbt docs for the RPC method `poll\n <https://docs.getdbt.com/reference/commands/rpc/#poll>`_.\n\n Args:\n request_token (str): the token to poll responses for.\n logs (bool): Whether logs should be returned in the response. Defaults to ``False``.\n logs_start (int): The zero-indexed log line to fetch logs from. Defaults to ``0``.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n data = self._default_request(method="poll")\n data["params"] = {"request_token": request_token, "logs": logs, "logs_start": logs_start}\n return self._post(data=json.dumps(data))
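\n\n # Illustrative polling sketch (assumes a reachable dbt RPC server; not part of this\n # module): asynchronous methods such as ``run`` return a request token, which is then\n # fed back into ``poll`` until the task leaves the "running" state:\n #\n # out = client.run(models=["my_model"])  # hypothetical model name\n # token = out.result.get("request_token")\n # while client.poll(request_token=token).result.get("state") == "running":\n # time.sleep(1)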
\n\n
[docs] def ps(self, completed: bool = False) -> DbtRpcOutput:\n """Sends a request with the method ``ps`` to the dbt RPC server, and returns the response.\n For more details, see the dbt docs for the RPC method `ps\n <https://docs.getdbt.com/reference/commands/rpc/#ps>`_.\n\n Args:\n completed (bool): If ``True``, then also return completed tasks. Defaults to ``False``.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n data = self._default_request(method="ps")\n data["params"] = {"completed": completed}\n return self._post(data=json.dumps(data))
\n\n
[docs] def kill(self, task_id: str) -> DbtRpcOutput:\n """Sends a request with the method ``kill`` to the dbt RPC server, and returns the response.\n For more details, see the dbt docs for the RPC method `kill\n <https://docs.getdbt.com/reference/commands/rpc/#kill>`_.\n\n Args:\n task_id (str): the ID of the task to terminate.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n data = self._default_request(method="kill")\n data["params"] = {"task_id": task_id}\n return self._post(data=json.dumps(data))
\n\n
[docs] def cli(self, command: str, **kwargs) -> DbtRpcOutput:\n """Sends a request with CLI syntax to the dbt RPC server, and returns the response.\n For more details, see the dbt docs for `running CLI commands via RPC\n <https://docs.getdbt.com/reference/commands/rpc/#running-a-task-with-cli-syntax>`_.\n\n Args:\n command (str): a dbt command in CLI syntax.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n params = self._format_params({"cli": command, **kwargs})\n data = self._default_request(method="cli_args", params=params)\n\n return self._get_result(data=json.dumps(data))
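\n\n # Illustrative sketch (not part of this module): ``cli`` accepts a dbt command in CLI\n # syntax, so the two calls below are roughly equivalent ways to start a run:\n #\n # client.run(models=["my_model"])  # hypothetical model name\n # client.cli("run --models my_model")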
\n\n
[docs] def compile(\n self, models: Optional[List[str]] = None, exclude: Optional[List[str]] = None, **kwargs\n ) -> DbtRpcOutput:\n """Sends a request with the method ``compile`` to the dbt RPC server, and returns the\n response. For more details, see the dbt docs for `compiling projects via RPC\n <https://docs.getdbt.com/reference/commands/rpc/#compile-a-project>`_.\n\n Args:\n models (List[str], optional): the models to include in compilation.\n exclude (List[str], optional): the models to exclude from compilation.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n\n explicit_params = dict(models=models, exclude=exclude)\n params = self._format_params({**explicit_params, **kwargs})\n data = self._default_request(method="compile", params=params)\n\n return self._get_result(data=json.dumps(data))
\n\n
[docs] def run(\n self, models: Optional[List[str]] = None, exclude: Optional[List[str]] = None, **kwargs\n ) -> DbtRpcOutput:\n """Sends a request with the method ``run`` to the dbt RPC server, and returns the response.\n For more details, see the dbt docs for the RPC method `run\n <https://docs.getdbt.com/reference/commands/rpc/#run-models>`_.\n\n Args:\n models (List[str], optional): the models to include in the run.\n exclude (List[str], optional): the models to exclude from the run.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n explicit_params = dict(models=models, exclude=exclude)\n params = self._format_params({**explicit_params, **kwargs})\n data = self._default_request(method="run", params=params)\n\n return self._get_result(data=json.dumps(data))
\n\n
[docs] def snapshot(\n self, select: Optional[List[str]] = None, exclude: Optional[List[str]] = None, **kwargs\n ) -> DbtRpcOutput:\n """Sends a request with the method ``snapshot`` to the dbt RPC server, and returns the\n response. For more details, see the dbt docs for the command `snapshot\n <https://docs.getdbt.com/reference/commands/snapshot>`_.\n\n Args:\n select (List[str], optional): the snapshots to include in the run.\n exclude (List[str], optional): the snapshots to exclude from the run.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n explicit_params = dict(select=select, exclude=exclude)\n params = self._format_params({**explicit_params, **kwargs})\n data = self._default_request(method="snapshot", params=params)\n\n return self._get_result(data=json.dumps(data))
\n\n
[docs] def test(\n self,\n models: Optional[List[str]] = None,\n exclude: Optional[List[str]] = None,\n data: bool = True,\n schema: bool = True,\n **kwargs,\n ) -> DbtRpcOutput:\n """Sends a request with the method ``test`` to the dbt RPC server, and returns the response.\n For more details, see the dbt docs for the RPC method `test\n <https://docs.getdbt.com/reference/commands/rpc/#run-test>`_.\n\n Args:\n models (List[str], optional): the models to include in testing.\n exclude (List[str], optional): the models to exclude from testing.\n data (bool, optional): If ``True`` (default), then run data tests.\n schema (bool, optional): If ``True`` (default), then run schema tests.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n explicit_params = dict(models=models, exclude=exclude, data=data, schema=schema)\n params = self._format_params({**explicit_params, **kwargs})\n data = self._default_request(method="test", params=params)\n\n return self._get_result(data=json.dumps(data))
\n\n
[docs] def seed(\n self,\n show: bool = False,\n select: Optional[List[str]] = None,\n exclude: Optional[List[str]] = None,\n **kwargs,\n ) -> DbtRpcOutput:\n """Sends a request with the method ``seed`` to the dbt RPC server, and returns the response.\n For more details, see the dbt docs for the RPC method `seed\n <https://docs.getdbt.com/reference/commands/rpc/#run-seed>`_.\n\n Args:\n show (bool, optional): If ``True``, then show a sample of the seeded data in the\n response. Defaults to ``False``.\n select (List[str], optional): the seed files to include in the run.\n exclude (List[str], optional): the seed files to exclude from the run.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n data = self._default_request(method="seed")\n data["params"] = {"show": show}\n\n if kwargs:\n data["params"]["task_tags"] = kwargs\n\n return self._get_result(data=json.dumps(data))
\n\n
[docs] def generate_docs(\n self,\n compile_project: bool = False,\n **kwargs,\n ) -> DbtRpcOutput:\n """Sends a request with the method ``docs.generate`` to the dbt RPC server, and returns the\n response. For more details, see the dbt docs for the RPC method `docs.generate\n <https://docs.getdbt.com/reference/commands/rpc/#generate-docs>`_.\n\n Args:\n compile_project (bool, optional): If ``True``, compile the project before generating\n a catalog. Defaults to ``False``.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n explicit_params = dict(compile=compile_project)\n params = self._format_params({**explicit_params, **kwargs})\n data = self._default_request(method="docs.generate", params=params)\n\n return self._get_result(data=json.dumps(data))
\n\n
[docs] def run_operation(\n self, macro: str, args: Optional[Dict[str, Any]] = None, **kwargs\n ) -> DbtRpcOutput:\n """Sends a request with the method ``run-operation`` to the dbt RPC server, and returns the\n response. For more details, see the dbt docs for the command `run-operation\n <https://docs.getdbt.com/reference/commands/run-operation>`_.\n\n Args:\n macro (str): the dbt macro to invoke.\n args (Dict[str, Any], optional): the keyword arguments to be supplied to the macro.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n explicit_params = dict(macro=macro, args=args)\n params = self._format_params({**explicit_params, **kwargs})\n data = self._default_request(method="run-operation", params=params)\n\n return self._get_result(data=json.dumps(data))
\n\n
[docs] def snapshot_freshness(self, select: Optional[List[str]] = None, **kwargs) -> DbtRpcOutput:\n """Sends a request with the method ``snapshot-freshness`` to the dbt RPC server, and returns\n the response. For more details, see the dbt docs for the command `source snapshot-freshness\n <https://docs.getdbt.com/reference/commands/source#dbt-source-snapshot-freshness>`_.\n\n Args:\n select (List[str], optional): the models to include in calculating snapshot freshness.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n explicit_params = dict(select=select)\n params = self._format_params({**explicit_params, **kwargs})\n data = self._default_request(method="snapshot-freshness", params=params)\n\n return self._get_result(data=json.dumps(data))
\n\n
[docs] def compile_sql(self, sql: str, name: str) -> DbtRpcOutput:\n """Sends a request with the method ``compile_sql`` to the dbt RPC server, and returns the\n response. For more details, see the dbt docs for `compiling SQL via RPC\n <https://docs.getdbt.com/reference/commands/rpc#compiling-a-query>`_.\n\n Args:\n sql (str): the SQL to compile. It will be base-64 encoded before it is sent to the\n server.\n name (str): a name for the compiled SQL.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n explicit_params = dict(sql=b64(sql.encode("utf-8")).decode("utf-8"), name=name)\n params = self._format_params(explicit_params)\n data = self._default_request(method="compile_sql", params=params)\n\n return self._get_result(data=json.dumps(data))
\n\n
[docs] def run_sql(self, sql: str, name: str) -> DbtRpcOutput:\n """Sends a request with the method ``run_sql`` to the dbt RPC server, and returns the\n response. For more details, see the dbt docs for `running SQL via RPC\n <https://docs.getdbt.com/reference/commands/rpc#executing-a-query>`_.\n\n Args:\n sql (str): the SQL to run. It will be base-64 encoded before it is sent to the\n server.\n name (str): a name for the executed SQL.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n explicit_params = dict(sql=b64(sql.encode("utf-8")).decode("utf-8"), name=name)\n params = self._format_params(explicit_params)\n data = self._default_request(method="run_sql", params=params)\n\n return self._get_result(data=json.dumps(data))
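\n\n # Illustrative sketch (not part of this module): ``compile_sql`` and ``run_sql`` take\n # plain SQL and perform the required base-64 encoding themselves before sending it:\n #\n # out = client.compile_sql(sql="select 1 as id", name="adhoc_query")  # hypothetical query\n # token = out.result.get("request_token")  # poll this token as usual for the result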
\n\n\n
[docs]class DbtRpcSyncResource(DbtRpcResource):\n def __init__(\n self,\n host: str = "0.0.0.0",\n port: int = 8580,\n jsonrpc_version: str = "2.0",\n logger: Optional[Any] = None,\n poll_interval: int = 1,\n **_,\n ):\n """Constructor\n\n Args:\n host (str): The IP address of the host of the dbt RPC server. Default is ``"0.0.0.0"``.\n port (int): The port of the dbt RPC server. Default is ``8580``.\n jsonrpc_version (str): The JSON-RPC version to send in RPC requests.\n Default is ``"2.0"``.\n logger (Optional[Any]): A property for injecting a logger dependency.\n Default is ``None``.\n poll_interval (int): The polling interval in seconds.\n """\n super().__init__(host, port, jsonrpc_version, logger)\n self.poll_interval = poll_interval\n\n def _get_result(self, data: Optional[str] = None) -> DbtRpcOutput:\n """Sends a request to the dbt RPC server and continuously polls for the status of a request until the state is ``success``."""\n\n out = super()._get_result(data)\n request_token = out.result.get("request_token")\n\n logs_start = 0\n\n elapsed_time = -1\n current_state = None\n\n while True:\n out = self.poll(\n request_token=request_token,\n logs=True,\n logs_start=logs_start,\n )\n logs = out.result.get("logs", [])\n for log in logs:\n self.logger.log(\n msg=log["message"],\n level=coerce_valid_log_level(log.get("levelname", "INFO")),\n extra=log.get("extra"),\n )\n logs_start += len(logs)\n\n current_state = out.result.get("state")\n # Stop polling if request's state is no longer "running".\n if current_state != "running":\n break\n\n elapsed_time = out.result.get("elapsed", 0)\n # Sleep for the configured time interval before polling again.\n time.sleep(self.poll_interval)\n\n if current_state != "success":\n raise Failure(\n description=(\n f"Request {request_token} finished with state '{current_state}' in "\n f"{elapsed_time} seconds"\n ),\n )\n\n return out
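\n\n# Illustrative sketch (not part of this module): unlike ``DbtRpcResource``, whose methods\n# return as soon as the server acknowledges a request, ``DbtRpcSyncResource`` keeps polling\n# and only returns once the task has finished, so the output already holds the final results:\n#\n# sync_client = DbtRpcSyncResource(host="127.0.0.1", port=8580, poll_interval=2)  # hypothetical host\n# out = sync_client.run(models=["my_model"])  # blocks until state != "running"\n# results = out.result.get("results", [])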
\n\n\n
[docs]@resource(\n description="A resource representing a dbt RPC client.",\n config_schema={\n "host": Field(StringSource),\n "port": Field(IntSource, is_required=False, default_value=8580),\n },\n)\ndef dbt_rpc_resource(context) -> DbtRpcResource:\n """This resource defines a dbt RPC client.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/overview/configuration#configured>`_ method.\n\n Examples:\n\n .. code-block:: python\n\n from dagster_dbt import dbt_rpc_resource\n\n custom_dbt_rpc_resource = dbt_rpc_resource.configured({"host": "80.80.80.80", "port": 8080})\n\n @job(resource_defs={"dbt_rpc": custom_dbt_rpc_resource})\n def dbt_rpc_job():\n # Run ops with `required_resource_keys={"dbt_rpc", ...}`.\n ...\n\n """\n return DbtRpcResource(\n host=context.resource_config["host"], port=context.resource_config["port"]\n )
\n\n\n
[docs]@resource(\n description="A resource representing a synchronous dbt RPC client.",\n config_schema={\n "host": Field(StringSource),\n "port": Field(IntSource, is_required=False, default_value=8580),\n "poll_interval": Field(IntSource, is_required=False, default_value=1),\n },\n)\ndef dbt_rpc_sync_resource(\n context,\n) -> DbtRpcSyncResource:\n """This resource defines a synchronous dbt RPC client, which sends requests to a dbt RPC server,\n and waits for the request to complete before returning.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/overview/configuration#configured>`_ method.\n\n Examples:\n\n .. code-block:: python\n\n from dagster_dbt import dbt_rpc_sync_resource\n\n custom_dbt_rpc_sync_resource = dbt_rpc_sync_resource.configured({"host": "80.80.80.80", "port": 8080})\n\n @job(resource_defs={"dbt_rpc": custom_dbt_rpc_sync_resource})\n def dbt_rpc_sync_job():\n # Run ops with `required_resource_keys={"dbt_rpc", ...}`.\n ...\n\n """\n return DbtRpcSyncResource(\n host=context.resource_config["host"],\n port=context.resource_config["port"],\n poll_interval=context.resource_config["poll_interval"],\n )
\n\n\nlocal_dbt_rpc_resource = dbt_rpc_resource.configured({"host": "0.0.0.0", "port": 8580})\nlocal_dbt_rpc_resource.__doc__ = """This resource defines a dbt RPC client for an RPC server running\non 0.0.0.0:8580."""\n
", "current_page_name": "_modules/dagster_dbt/rpc/resources", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dbt.rpc.resources"}, "solids": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dbt.rpc.solids

\nimport json\nimport time\nfrom typing import Callable, Optional\n\nimport pandas as pd\nfrom dagster_pandas import DataFrame\n\nfrom dagster import (\n    Array,\n    Bool,\n    DagsterInvalidDefinitionError,\n    Failure,\n    Field,\n    InputDefinition,\n    Int,\n    Noneable,\n    Nothing,\n    Output,\n    OutputDefinition,\n    Permissive,\n    RetryRequested,\n    String,\n    check,\n    solid,\n)\nfrom dagster.core.execution.context.compute import SolidExecutionContext\n\nfrom ..errors import DagsterDbtRpcUnexpectedPollOutputError\nfrom .types import DbtRpcOutput\nfrom .utils import log_rpc, raise_for_rpc_error\n\n\ndef _poll_rpc(\n    context: SolidExecutionContext, request_token: str, should_yield_materializations: bool = True\n):\n    """Polls the dbt RPC server for the status of a request until the state is ``success``."""\n    from ..utils import generate_materializations\n\n    logs_start = 0\n    interval = context.solid_config.get("interval")\n\n    elapsed_time = -1\n    current_state = None\n\n    while True:\n        # Poll for the dbt RPC request.\n        context.log.debug(f"RequestToken: {request_token}")\n        out = context.resources.dbt_rpc.poll(\n            request_token=request_token, logs=context.solid_config["logs"], logs_start=logs_start\n        )\n        raise_for_rpc_error(context, out.response)\n\n        # Pass dbt RPC logs into the Dagster/Dagit logger.\n        if context.solid_config["logs"]:\n            logs = out.result.get("logs", [])\n            if len(logs) > 0:\n                log_rpc(context, logs)\n            logs_start += len(logs)\n\n        current_state = out.result.get("state")\n        # Stop polling if request's state is no longer "running".\n        if current_state != "running":\n            break\n\n        elapsed_time = out.result.get("elapsed", 0)\n        # Sleep for the configured time interval before polling again.\n        context.log.debug(\n            f"Request {request_token} currently in state '{current_state}' (elapsed time "\n            f"{elapsed_time} seconds). Sleeping for {interval}s..."\n        )\n        time.sleep(interval)\n\n    if current_state != "success":\n        raise Failure(\n            description=(\n                f"Request {request_token} finished with state '{current_state}' in "\n                f"{elapsed_time} seconds"\n            ),\n        )\n\n    context.log.info(\n        f"Request {request_token} finished with state '{current_state}' in {elapsed_time} seconds"\n    )\n    context.log.debug(json.dumps(out.result, indent=2))\n\n    if should_yield_materializations:\n        for materialization in generate_materializations(out):\n            yield materialization\n\n    yield Output(out)\n\n\ndef unwrap_result(poll_rpc_generator) -> DbtRpcOutput:\n    """A helper function that extracts the `DbtRpcOutput` value from a generator.\n\n    The parameter `poll_rpc_generator` is expected to be an invocation of `_poll_rpc`.\n    """\n    output = None\n    for x in poll_rpc_generator:\n        output = x\n\n    if output is None:\n        raise DagsterDbtRpcUnexpectedPollOutputError(\n            description="poll_rpc yielded None as its last value. Expected value of type Output containing DbtRpcOutput.",\n        )\n\n    if not isinstance(output, Output):\n        raise DagsterDbtRpcUnexpectedPollOutputError(\n            description=f"poll_rpc yielded value of type {type(output)} as its last value. 
Expected value of type Output containing DbtRpcOutput.",\n        )\n\n    if not isinstance(output.value, DbtRpcOutput):\n        raise DagsterDbtRpcUnexpectedPollOutputError(\n            description=f"poll_rpc yielded Output containing {type(output.value)}. Expected DbtRpcOutput.",\n        )\n\n    return output.value\n\n\n
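# Illustrative sketch (not part of this module): ``_poll_rpc`` is a generator that may yield\n# AssetMaterialization events before its final ``Output``; ``unwrap_result`` drains it and\n# returns just the wrapped ``DbtRpcOutput``, e.g. inside a solid body:\n#\n#     out = unwrap_result(\n#         _poll_rpc(context, request_token, should_yield_materializations=False)\n#     )\n#     context.log.info(str(out.result.get("state")))\n\n\n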
[docs]@solid(\n description="A solid to invoke dbt run over RPC.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[\n OutputDefinition(\n name="request_token",\n dagster_type=String,\n description="The request token of the invoked dbt run.",\n )\n ],\n config_schema={\n "models": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to run.",\n ),\n "exclude": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_run(context: SolidExecutionContext) -> String:\n """This solid sends the ``dbt run`` command to a dbt RPC server and returns the request token.\n\n This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\n poll the progress of the running dbt process.\n """\n out = context.resources.dbt_rpc.run(\n models=context.solid_config["models"], exclude=context.solid_config["exclude"]\n )\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n return out.result.get("request_token")
\n\n\n
[docs]@solid(\n description="A solid to invoke dbt run over RPC and poll the resulting RPC process until it's complete.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="result", dagster_type=DbtRpcOutput)],\n config_schema={\n "models": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to run.",\n ),\n "exclude": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n "full_refresh": Field(\n config=Bool,\n description="Whether or not to perform a --full-refresh.",\n is_required=False,\n default_value=False,\n ),\n "fail_fast": Field(\n config=Bool,\n description="Whether or not to --fail-fast.",\n is_required=False,\n default_value=False,\n ),\n "warn_error": Field(\n config=Bool,\n description="Whether or not to --warn-error.",\n is_required=False,\n default_value=False,\n ),\n "interval": Field(\n config=Int,\n is_required=False,\n default_value=10,\n description="The interval (in seconds) at which to poll the dbt rpc process.",\n ),\n "logs": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description="Whether or not to return logs from the process.",\n ),\n "task_tags": Permissive(),\n "max_retries": Field(config=Int, is_required=False, default_value=5),\n "retry_interval": Field(config=Int, is_required=False, default_value=120),\n "yield_materializations": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation will "\n "be yielded when the solid executes. Default: True"\n ),\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_run_and_wait(context: SolidExecutionContext) -> DbtRpcOutput:\n """This solid sends the ``dbt run`` command to a dbt RPC server and returns the result of the\n executed dbt process.\n\n This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\n process is completed.\n """\n if context.solid_config["task_tags"]:\n results = context.resources.dbt_rpc.ps().json()\n for task in results["result"]["rows"]:\n if task["tags"] == context.solid_config["task_tags"]:\n context.log.warning(\n f"RPC task with tags {json.dumps(task['tags'])} currently running."\n )\n raise RetryRequested(\n max_retries=context.solid_config["max_retries"],\n seconds_to_wait=context.solid_config["retry_interval"],\n )\n\n command = ""\n\n if context.solid_config["warn_error"]:\n command += " --warn-error"\n\n command += " run"\n\n if context.solid_config["models"]:\n models = " ".join(set(context.solid_config["models"]))\n command += f" --models {models}"\n\n if context.solid_config["exclude"]:\n exclude = " ".join(set(context.solid_config["exclude"]))\n command += f" --exclude {exclude}"\n\n if context.solid_config["full_refresh"]:\n command += " --full-refresh"\n\n if context.solid_config["fail_fast"]:\n command += " --fail-fast"\n\n context.log.debug(f"Running dbt command: dbt {command}")\n out = context.resources.dbt_rpc.cli(\n command=command, task_tags=context.solid_config["task_tags"]\n )\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n request_token = out.result.get("request_token")\n return _poll_rpc(\n context,\n request_token,\n should_yield_materializations=context.solid_config["yield_materializations"],\n )
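\n\n# Illustrative usage sketch (resource and model names are hypothetical; not part of this\n# module): the solid above is driven entirely by its config schema, e.g.:\n#\n#     @job(resource_defs={"dbt_rpc": my_dbt_rpc_resource})\n#     def run_dbt():\n#         dbt_rpc_run_and_wait()\n#\n#     run_dbt.execute_in_process(\n#         run_config={"ops": {"dbt_rpc_run_and_wait": {"config": {"models": ["my_model"]}}}}\n#     )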
\n\n\n
[docs]@solid(\n description="A solid to invoke dbt test over RPC.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[\n OutputDefinition(\n name="request_token",\n dagster_type=String,\n description="The request token of the invoked dbt test.",\n )\n ],\n config_schema={\n "models": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to test.",\n ),\n "exclude": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n "data": Field(\n config=Bool,\n default_value=True,\n is_required=False,\n description="Whether or not to run custom data tests.",\n ),\n "schema": Field(\n config=Bool,\n default_value=True,\n is_required=False,\n description="Whether or not to run schema tests.",\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_test(context: SolidExecutionContext) -> String:\n """This solid sends the ``dbt test`` command to a dbt RPC server and returns the request token.\n\n This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\n poll the progress of the running dbt process.\n """\n out = context.resources.dbt_rpc.test(\n models=context.solid_config["models"],\n exclude=context.solid_config["exclude"],\n data=context.solid_config["data"],\n schema=context.solid_config["schema"],\n )\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n return out.result.get("request_token")
\n\n\n
[docs]@solid(\n description=(\n "A solid to invoke dbt test over RPC and poll the resulting RPC process until it's "\n "complete."\n ),\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="result", dagster_type=DbtRpcOutput)],\n config_schema={\n "models": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to test.",\n ),\n "exclude": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n "data": Field(\n config=Bool,\n default_value=True,\n is_required=False,\n description="Whether or not to run custom data tests.",\n ),\n "schema": Field(\n config=Bool,\n default_value=True,\n is_required=False,\n description="Whether or not to run schema tests.",\n ),\n "interval": Field(\n config=Int,\n is_required=False,\n default_value=10,\n description="The interval (in seconds) at which to poll the dbt rpc process.",\n ),\n "logs": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description="Whether or not to return logs from the process.",\n ),\n "yield_materializations": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation will "\n "be yielded when the solid executes. Default: True"\n ),\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_test_and_wait(context: SolidExecutionContext) -> DbtRpcOutput:\n """This solid sends the ``dbt test`` command to a dbt RPC server and returns the result of the\n executed dbt process.\n\n This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\n process is completed.\n """\n out = context.resources.dbt_rpc.test(\n models=context.solid_config["models"],\n exclude=context.solid_config["exclude"],\n data=context.solid_config["data"],\n schema=context.solid_config["schema"],\n )\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n request_token = out.result.get("request_token")\n return _poll_rpc(\n context,\n request_token,\n should_yield_materializations=context.solid_config["yield_materializations"],\n )
\n\n\n
[docs]@solid(\n description="A solid to invoke a dbt run operation over RPC.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[\n OutputDefinition(\n name="request_token",\n dagster_type=String,\n description="The request token of the invoked dbt run operation.",\n )\n ],\n config_schema={\n "macro": Field(\n config=String,\n is_required=True,\n description="The dbt macro to invoke as a run operation",\n ),\n "args": Field(\n config=Noneable(Permissive()),\n is_required=False,\n default_value=None,\n description="Arguments to supply to the invoked macro.",\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_run_operation(context: SolidExecutionContext) -> String:\n """This solid sends the ``dbt run-operation`` command to a dbt RPC server and returns the\n request token.\n\n This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\n poll the progress of the running dbt process.\n """\n out = context.resources.dbt_rpc.run_operation(\n macro=context.solid_config["macro"], args=context.solid_config["args"]\n )\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n return out.result.get("request_token")
\n\n\n
[docs]@solid(\n description=(\n "A solid to invoke a dbt run operation over RPC and poll the resulting RPC process until "\n "it's complete."\n ),\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="result", dagster_type=DbtRpcOutput)],\n config_schema={\n "macro": Field(\n config=String,\n is_required=True,\n description="The dbt macro to invoke as a run operation",\n ),\n "args": Field(\n config=Noneable(Permissive()),\n is_required=False,\n default_value=None,\n description="Arguments to supply to the invoked macro.",\n ),\n "interval": Field(\n config=Int,\n is_required=False,\n default_value=10,\n description="The interval (in seconds) at which to poll the dbt rpc process.",\n ),\n "logs": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description="Whether or not to return logs from the process.",\n ),\n "yield_materializations": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation will "\n "be yielded when the solid executes. Default: True"\n ),\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_run_operation_and_wait(context: SolidExecutionContext) -> DbtRpcOutput:\n """This solid sends the ``dbt run-operation`` command to a dbt RPC server and returns the\n result of the executed dbt process.\n\n This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\n process is completed.\n """\n out = context.resources.dbt_rpc.run_operation(\n macro=context.solid_config["macro"], args=context.solid_config["args"]\n )\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n request_token = out.result.get("request_token")\n return _poll_rpc(\n context,\n request_token,\n should_yield_materializations=context.solid_config["yield_materializations"],\n )
\n\n\n
[docs]@solid(\n description="A solid to invoke a dbt snapshot over RPC.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[\n OutputDefinition(\n name="request_token",\n dagster_type=String,\n description="The request token of the invoked dbt snapshot.",\n )\n ],\n config_schema={\n "select": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt snapshot files to snapshot.",\n ),\n "exclude": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt snapshot files to exclude from the snapshot.",\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_snapshot(context: SolidExecutionContext) -> String:\n """This solid sends the ``dbt snapshot`` command to a dbt RPC server and returns the\n request token.\n\n This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\n poll the progress of the running dbt process.\n """\n out = context.resources.dbt_rpc.snapshot(\n select=context.solid_config["select"], exclude=context.solid_config["exclude"]\n )\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n return out.result.get("request_token")
\n\n\n
[docs]@solid(\n description=(\n "A solid to invoke a dbt snapshot over RPC and poll the resulting RPC process until "\n "it's complete."\n ),\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="result", dagster_type=DbtRpcOutput)],\n config_schema={\n "select": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt snapshot files to snapshot.",\n ),\n "exclude": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt snapshot files to exclude from the snapshot.",\n ),\n "interval": Field(\n config=Int,\n is_required=False,\n default_value=10,\n description="The interval (in seconds) at which to poll the dbt rpc process.",\n ),\n "logs": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description="Whether or not to return logs from the process.",\n ),\n "task_tags": Permissive(),\n "max_retries": Field(config=Int, is_required=False, default_value=5),\n "retry_interval": Field(config=Int, is_required=False, default_value=120),\n "yield_materializations": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation will "\n "be yielded when the solid executes. Default: True"\n ),\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_snapshot_and_wait(context: SolidExecutionContext) -> DbtRpcOutput:\n """This solid sends the ``dbt snapshot`` command to a dbt RPC server and returns the result of\n the executed dbt process.\n\n This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\n process is completed.\n """\n if context.solid_config["task_tags"]:\n results = context.resources.dbt_rpc.ps().json()\n for task in results["result"]["rows"]:\n if task["tags"] == context.solid_config["task_tags"]:\n context.log.warning(\n f"RPC task with tags {json.dumps(task['tags'])} currently running."\n )\n raise RetryRequested(\n max_retries=context.solid_config["max_retries"],\n seconds_to_wait=context.solid_config["retry_interval"],\n )\n\n out = context.resources.dbt_rpc.snapshot(\n select=context.solid_config["select"], exclude=context.solid_config["exclude"]\n )\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n request_token = out.result.get("request_token")\n return _poll_rpc(\n context,\n request_token,\n should_yield_materializations=context.solid_config["yield_materializations"],\n )
\n\n\n
[docs]@solid(\n description="A solid to invoke dbt source snapshot-freshness over RPC.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[\n OutputDefinition(\n name="request_token",\n dagster_type=String,\n description="The request token of the invoked dbt snapshot.",\n )\n ],\n config_schema={\n "select": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt sources to snapshot-freshness for.",\n ),\n "warn_error": Field(\n config=Bool,\n description="Whether or not to --warn-error.",\n is_required=False,\n default_value=False,\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_snapshot_freshness(context: SolidExecutionContext) -> String:\n """This solid sends the ``dbt source snapshot-freshness`` command to a dbt RPC server and\n returns the request token.\n\n This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\n poll the progress of the running dbt process.\n """\n command = ""\n\n if context.solid_config["warn_error"]:\n command += " --warn-error"\n\n command += " source snapshot-freshness"\n\n if context.solid_config["select"]:\n select = " ".join(set(context.solid_config["select"]))\n command += f" --select {select}"\n\n context.log.debug(f"Running dbt command: dbt {command}")\n out = context.resources.dbt_rpc.cli(command=command)\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n return out.result.get("request_token")
\n\n\n
[docs]@solid(\n description=(\n "A solid to invoke dbt source snapshot-freshness over RPC and poll the resulting "\n "RPC process until it's complete."\n ),\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="result", dagster_type=DbtRpcOutput)],\n config_schema={\n "select": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt sources to snapshot-freshness for.",\n ),\n "warn_error": Field(\n config=Bool,\n description="Whether or not to --warn-error.",\n is_required=False,\n default_value=False,\n ),\n "interval": Field(\n config=Int,\n is_required=False,\n default_value=10,\n description="The interval (in seconds) at which to poll the dbt rpc process.",\n ),\n "logs": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description="Whether or not to return logs from the process.",\n ),\n "yield_materializations": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation will "\n "be yielded when the solid executes. Default: True"\n ),\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_snapshot_freshness_and_wait(context: SolidExecutionContext) -> DbtRpcOutput:\n """This solid sends the ``dbt source snapshot`` command to a dbt RPC server and returns the\n result of the executed dbt process.\n\n This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\n process is completed.\n """\n command = ""\n\n if context.solid_config["warn_error"]:\n command += " --warn-error"\n\n command += " source snapshot-freshness"\n\n if context.solid_config["select"]:\n select = " ".join(set(context.solid_config["select"]))\n command += f" --select {select}"\n\n context.log.debug(f"Running dbt command: dbt {command}")\n out = context.resources.dbt_rpc.cli(command=command)\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n request_token = out.result.get("request_token")\n return _poll_rpc(\n context,\n request_token,\n should_yield_materializations=context.solid_config["yield_materializations"],\n )
\n\n\n
[docs]@solid(\n description="A solid to compile a SQL query in context of a dbt project over RPC.",\n input_defs=[\n InputDefinition(name="start_after", dagster_type=Nothing),\n InputDefinition(\n name="sql", description="The SQL query to be compiled.", dagster_type=String\n ),\n ],\n output_defs=[\n OutputDefinition(name="sql", description="The compiled SQL query.", dagster_type=String)\n ],\n config_schema={\n "name": Field(config=String),\n "interval": Field(\n config=Int,\n is_required=False,\n default_value=10,\n description="The interval (in seconds) at which to poll the dbt rpc process.",\n ),\n "logs": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description="Whether or not to return logs from the process.",\n ),\n "yield_materializations": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation will "\n "be yielded when the solid executes. Default: True"\n ),\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_compile_sql(context: SolidExecutionContext, sql: String) -> DbtRpcOutput:\n """This solid sends the ``dbt compile`` command to a dbt RPC server and returns the request\n token.\n\n This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\n poll the progress of the running dbt process.\n """\n out = context.resources.dbt_rpc.compile_sql(sql=sql, name=context.solid_config["name"])\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n request_token = out.result.get("request_token")\n return unwrap_result(\n _poll_rpc(\n context,\n request_token,\n should_yield_materializations=context.solid_config["yield_materializations"],\n )\n )
\n\n\n
[docs]def create_dbt_rpc_run_sql_solid(\n name: str, output_def: Optional[OutputDefinition] = None, **kwargs\n) -> Callable:\n """This function is a factory which constructs a solid that will copy the results of a SQL query\n run within the context of a dbt project to a pandas ``DataFrame``.\n\n Any kwargs passed to this function will be passed along to the underlying :func:`@solid\n <dagster.solid>` decorator. However, note that overriding ``config_schema``, ``input_defs``, and\n ``required_resource_keys`` is not allowed and will throw a :class:`DagsterInvalidDefinitionError\n <dagster.DagsterInvalidDefinitionError>`.\n\n If you would like to configure this solid with different config fields, you could consider using\n :func:`@composite_solid <dagster.composite_solid>` to wrap this solid.\n\n Args:\n name (str): The name of this solid.\n output_def (OutputDefinition, optional): The :class:`OutputDefinition\n <dagster.OutputDefinition>` for the solid. This value should always be a representation\n of a pandas ``DataFrame``. If not specified, the solid will default to an\n :class:`OutputDefinition <dagster.OutputDefinition>` named "df" with a ``DataFrame``\n dagster type.\n\n Returns:\n SolidDefinition: Returns the constructed solid definition.\n """\n check.str_param(obj=name, param_name="name")\n check.opt_inst_param(obj=output_def, param_name="output_def", ttype=OutputDefinition)\n\n if "config_schema" in kwargs:\n raise DagsterInvalidDefinitionError("Overriding config_schema is not supported.")\n\n if "input_defs" in kwargs:\n raise DagsterInvalidDefinitionError("Overriding input_defs is not supported.")\n\n if "required_resource_keys" in kwargs:\n raise DagsterInvalidDefinitionError("Overriding required_resource_keys is not supported.")\n\n @solid(\n name=name,\n description=kwargs.pop(\n "description",\n "A solid to run a SQL query in context of a dbt project over RPC and return the "\n "results in a pandas DataFrame.",\n ),\n input_defs=[\n InputDefinition(name="start_after", dagster_type=Nothing),\n InputDefinition(\n name="sql", description="The SQL query to be run.", dagster_type=String\n ),\n ],\n output_defs=[\n output_def\n or OutputDefinition(\n name="df", description="The results of the SQL query.", dagster_type=DataFrame\n )\n ],\n config_schema={\n "name": Field(config=String),\n "interval": Field(\n config=Int,\n is_required=False,\n default_value=10,\n description="The interval (in seconds) at which to poll the dbt rpc process.",\n ),\n "logs": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description="Whether or not to return logs from the process.",\n ),\n "yield_materializations": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation "\n "will be yielded when the solid executes. 
Default: True"\n ),\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n **kwargs,\n )\n def _dbt_rpc_run_sql(context: SolidExecutionContext, sql: String) -> DataFrame:\n out = context.resources.dbt_rpc.run_sql(sql=sql, name=context.solid_config["name"])\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n request_token = out.result.get("request_token")\n out = unwrap_result(\n _poll_rpc(\n context,\n request_token,\n should_yield_materializations=context.solid_config["yield_materializations"],\n )\n )\n table = out.result["results"][0]["table"]\n return pd.DataFrame.from_records(data=table["rows"], columns=table["column_names"])\n\n return _dbt_rpc_run_sql
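\n\n# Illustrative sketch (not part of this module): the factory above is typically invoked at\n# module scope; the returned solid takes a ``sql`` input and needs a ``name`` in its config:\n#\n#     run_sql_as_df = create_dbt_rpc_run_sql_solid(name="run_sql_as_df")  # hypothetical name\n#     # ...wire ``run_sql_as_df`` into a pipeline/job and pass it a SQL string as input.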
\n\n\n@solid(\n description="A solid to invoke dbt seed over RPC.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[\n OutputDefinition(\n name="request_token",\n description="The request token of the invoked dbt seed.",\n dagster_type=String,\n ),\n ],\n config_schema={\n "show": Field(\n config=Bool,\n is_required=False,\n default_value=False,\n description="If True, show a sample of the seeded data in the response.",\n ),\n "task_tags": Permissive(),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_seed(context: SolidExecutionContext) -> String:\n """This solid sends the ``dbt seed`` command to a dbt RPC server and returns the request\n token.\n\n This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\n poll the progress of the running dbt process.\n """\n out = context.resources.dbt_rpc.seed(\n show=context.solid_config["show"],\n **context.solid_config["task_tags"],\n )\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n return out.result.get("request_token")\n\n\n@solid(\n description=(\n "A solid to invoke dbt seed over RPC and poll the resulting RPC process until it's "\n "complete."\n ),\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="result", dagster_type=DbtRpcOutput)],\n config_schema={\n "show": Field(\n config=Bool,\n is_required=False,\n default_value=False,\n description="If True, show a sample of the seeded data in the response.",\n ),\n "task_tags": Permissive(),\n "interval": Field(\n config=Int,\n is_required=False,\n default_value=10,\n description="The interval (in seconds) at which to poll the dbt rpc process.",\n ),\n "logs": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description="Whether or not to return logs from the process.",\n ),\n "yield_materializations": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation will "\n "be yielded when the solid executes. 
Default: True"\n ),\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_seed_and_wait(context: SolidExecutionContext) -> DbtRpcOutput:\n """This solid sends the ``dbt seed`` command to a dbt RPC server and returns the\n result of the executed dbt process.\n\n This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\n process is completed.\n """\n out = context.resources.dbt_rpc.seed(\n show=context.solid_config["show"],\n task_tags=context.solid_config["task_tags"],\n )\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n request_token = out.result.get("request_token")\n return _poll_rpc(\n context,\n request_token,\n should_yield_materializations=context.solid_config["yield_materializations"],\n )\n\n\n@solid(\n description="A solid to invoke dbt docs generate over RPC.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[\n OutputDefinition(\n name="request_token",\n dagster_type=String,\n description="The request token of the invoked dbt run.",\n )\n ],\n config_schema={\n "models": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to compile and generate docs for.",\n ),\n "exclude": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n "compile": Field(\n config=Bool,\n is_required=False,\n default_value=False,\n description="If True, compile the project before generating a catalog.",\n ),\n "task_tags": Permissive(),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_docs_generate(context: SolidExecutionContext) -> String:\n """This solid sends the ``dbt docs generate`` command to a dbt RPC server and returns the\n request token.\n\n This dbt RPC solid is asynchronous. 
The request token can be used in subsequent RPC requests to\n poll the progress of the running dbt process.\n """\n out = context.resources.dbt_rpc.run(\n models=context.solid_config["models"],\n exclude=context.solid_config["exclude"],\n compile=context.solid_config["compile"],\n **context.solid_config["task_tags"],\n )\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n return out.result.get("request_token")\n\n\n@solid(\n description="A solid to invoke dbt docs generate over RPC.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="result", dagster_type=DbtRpcOutput)],\n config_schema={\n "models": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to compile and generate docs for.",\n ),\n "exclude": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n "compile": Field(\n config=Bool,\n is_required=False,\n default_value=False,\n description="If True, compile the project before generating a catalog.",\n ),\n "task_tags": Permissive(),\n "interval": Field(\n config=Int,\n is_required=False,\n default_value=10,\n description="The interval (in seconds) at which to poll the dbt rpc process.",\n ),\n "logs": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description="Whether or not to return logs from the process.",\n ),\n "yield_materializations": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation will "\n "be yielded when the solid executes. Default: True"\n ),\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_docs_generate_and_wait(context: SolidExecutionContext) -> DbtRpcOutput:\n """This solid sends the ``dbt docs generate`` command to a dbt RPC server and returns the\n result of the executed dbt process.\n\n This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\n process is completed.\n """\n out = context.resources.dbt_rpc.run(\n models=context.solid_config["models"],\n exclude=context.solid_config["exclude"],\n compile=context.solid_config["compile"],\n task_tags=context.solid_config["task_tags"],\n )\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n request_token = out.result.get("request_token")\n return _poll_rpc(\n context,\n request_token,\n should_yield_materializations=context.solid_config["yield_materializations"],\n )\n
", "current_page_name": "_modules/dagster_dbt/rpc/solids", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dbt.rpc.solids"}, "types": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dbt.rpc.types

\nfrom typing import Any, Dict\n\nimport requests\n\nfrom dagster import usable_as_dagster_type\n\nfrom ..types import DbtOutput\n\n\n
[docs]@usable_as_dagster_type\nclass DbtRpcOutput(DbtOutput):\n """The output from executing a dbt command via the dbt RPC server.\n\n Attributes:\n result (Dict[str, Any]): The parsed contents of the "result" field of the JSON response from\n the rpc server (if any).\n response_dict (Dict[str, Any]): The entire contents of the JSON response from the rpc server.\n response (requests.Response): The original Response from which this output was generated.\n """\n\n def __init__(self, response: requests.Response):\n\n self._response = response\n self._response_dict = response.json()\n\n super().__init__(result=self._response_dict.get("result", {}))\n\n @property\n def response(self) -> requests.Response:\n return self._response\n\n @property\n def response_dict(self) -> Dict[str, Any]:\n return self._response_dict
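\n\n# Illustrative sketch (not part of this module): a ``DbtRpcOutput`` keeps all three views of\n# the server's reply available:\n#\n#     out = DbtRpcOutput(response)   # ``response`` is the requests.Response from the server\n#     out.result                     # the parsed "result" field (a dict, possibly empty)\n#     out.response_dict              # the full JSON body of the response\n#     out.response.status_code       # the underlying requests.Response object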
\n
", "current_page_name": "_modules/dagster_dbt/rpc/types", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dbt.rpc.types"}}, "types": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dbt.types

\nfrom typing import Any, Dict, Optional\n\nfrom dagster import check\n\n\n
[docs]class DbtOutput:\n """\n Base class for both DbtCliOutput and DbtRPCOutput. Contains a single field, `result`, which\n represents the dbt-formatted result of the command that was run (if any).\n\n Used internally, should not be instantiated directly by the user.\n """\n\n def __init__(self, result: Dict[str, Any]):\n self._result = check.dict_param(result, "result", key_type=str)\n\n @property\n def result(self) -> Dict[str, Any]:\n return self._result\n\n @property\n def docs_url(self) -> Optional[str]:\n return None
\n
", "current_page_name": "_modules/dagster_dbt/types", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dbt.types"}, "utils": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dbt.utils

\nfrom typing import Any, Dict, Iterator, List, Optional\n\nimport dateutil\n\nfrom dagster import AssetMaterialization, MetadataEntry, MetadataValue, check\n\nfrom .types import DbtOutput\n\n\ndef _get_asset_materialization(\n    unique_id: str, asset_key_prefix: List[str], metadata: List[MetadataEntry]\n) -> AssetMaterialization:\n    return AssetMaterialization(\n        description=f"dbt node: {unique_id}",\n        metadata_entries=metadata,\n        asset_key=asset_key_prefix + unique_id.split("."),\n    )\n\n\ndef _node_result_to_metadata(node_result: Dict[str, Any]) -> List[MetadataEntry]:\n    return [\n        MetadataEntry(\n            "Materialization Strategy",\n            value=node_result["config"]["materialized"],\n        ),\n        MetadataEntry("Database", value=node_result["database"]),\n        MetadataEntry("Schema", value=node_result["schema"]),\n        MetadataEntry("Alias", value=node_result["alias"]),\n        MetadataEntry("Description", value=node_result["description"]),\n    ]\n\n\ndef _timing_to_metadata(timings: List[Dict[str, Any]]) -> List[MetadataEntry]:\n    metadata = []\n    for timing in timings:\n        if timing["name"] == "execute":\n            desc = "Execution"\n        elif timing["name"] == "compile":\n            desc = "Compilation"\n        else:\n            continue\n\n        started_at = dateutil.parser.isoparse(timing["started_at"])\n        completed_at = dateutil.parser.isoparse(timing["completed_at"])\n        duration = completed_at - started_at\n        metadata.extend(\n            [\n                MetadataEntry(f"{desc} Started At", value=started_at.isoformat(timespec="seconds")),\n                MetadataEntry(\n                    f"{desc} Completed At", value=started_at.isoformat(timespec="seconds")\n                ),\n                MetadataEntry(f"{desc} Duration", value=duration.total_seconds()),\n            ]\n        )\n    return metadata\n\n\ndef result_to_materialization(\n    result: Dict[str, Any],\n    asset_key_prefix: Optional[List[str]] = None,\n    docs_url: Optional[str] = None,\n) -> Optional[AssetMaterialization]:\n    """\n    This is a hacky solution that attempts to consolidate parsing many of the potential formats\n    that dbt can provide its results in. 
This is known to work for CLI Outputs for dbt versions 0.18+,\n    as well as RPC responses for a similar time period, but as the RPC response schema is not documented\n    nor enforced, this can become out of date easily.\n    """\n\n    asset_key_prefix = check.opt_list_param(asset_key_prefix, "asset_key_prefix", of_type=str)\n\n    # status comes from set of fields rather than "status"\n    if "fail" in result:\n        success = not result.get("fail") and not result.get("skip") and not result.get("error")\n    else:\n        success = result["status"] == "success"\n\n    if not success:\n        return None\n\n    # all versions represent timing the same way\n    metadata = [\n        MetadataEntry("Execution Time (seconds)", value=result["execution_time"])\n    ] + _timing_to_metadata(result["timing"])\n\n    # working with a response that contains the node block (RPC and CLI 0.18.x)\n    if "node" in result:\n\n        unique_id = result["node"]["unique_id"]\n        metadata += _node_result_to_metadata(result["node"])\n    else:\n        unique_id = result["unique_id"]\n\n    id_prefix = unique_id.split(".")\n\n    # only generate materializations for models\n    if id_prefix[0] != "model":\n        return None\n\n    if docs_url:\n        metadata = [\n            MetadataEntry("docs_url", value=MetadataValue.url(f"{docs_url}#!/model/{unique_id}"))\n        ] + metadata\n\n    return AssetMaterialization(\n        description=f"dbt node: {unique_id}",\n        metadata_entries=metadata,\n        asset_key=asset_key_prefix + id_prefix,\n    )\n\n\n
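As a rough illustration of the parsing above, a hypothetical CLI-style result entry (one that contains a ``node`` block) could be converted as follows; the dictionary is invented for this example and includes only the fields the function reads:

.. code-block:: python

    from dagster_dbt.utils import result_to_materialization  # module path as in this listing

    # Hypothetical dbt result entry shaped like a CLI 0.18.x / RPC response entry.
    fake_result = {
        "status": "success",
        "execution_time": 1.23,
        "timing": [
            {
                "name": "execute",
                "started_at": "2021-01-01T00:00:00Z",
                "completed_at": "2021-01-01T00:00:01Z",
            }
        ],
        "node": {
            "unique_id": "model.my_project.my_model",
            "config": {"materialized": "table"},
            "database": "analytics",
            "schema": "public",
            "alias": "my_model",
            "description": "An example model.",
        },
    }

    materialization = result_to_materialization(fake_result, asset_key_prefix=["dbt"])

    # Failed results and non-model nodes return None instead of an AssetMaterialization.
    print(materialization.asset_key)  # AssetKey(["dbt", "model", "my_project", "my_model"])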
[docs]def generate_materializations(\n dbt_output: DbtOutput, asset_key_prefix: Optional[List[str]] = None\n) -> Iterator[AssetMaterialization]:\n """\n This function yields :py:class:`dagster.AssetMaterialization` events for each model created by\n a dbt run command (with information parsed from a :py:class:`~DbtOutput` object).\n\n Note that this will not work with output from the `dbt_rpc_resource`, because this resource does\n not wait for a response from the RPC server before returning. Instead, use the\n `dbt_rpc_sync_resource`, which will wait for execution to complete.\n\n Examples:\n\n .. code-block:: python\n\n from dagster import job, op, Output\n from dagster_dbt.utils import generate_materializations\n from dagster_dbt import dbt_cli_resource, dbt_rpc_sync_resource\n\n @op(required_resource_keys={"dbt"})\n def my_custom_dbt_run(context):\n dbt_output = context.resources.dbt.run()\n for materialization in generate_materializations(dbt_output):\n # you can modify the materialization object to add extra metadata, if desired\n yield materialization\n yield Output(dbt_output)\n\n @job(resource_defs={"dbt": dbt_cli_resource})\n def my_dbt_cli_job():\n my_custom_dbt_run()\n\n @job(resource_defs={"dbt": dbt_rpc_sync_resource})\n def my_dbt_rpc_job():\n my_custom_dbt_run()\n """\n\n asset_key_prefix = check.opt_list_param(asset_key_prefix, "asset_key_prefix", of_type=str)\n\n for result in dbt_output.result["results"]:\n materialization = result_to_materialization(\n result, asset_key_prefix, docs_url=dbt_output.docs_url\n )\n if materialization is not None:\n yield materialization
\n
", "current_page_name": "_modules/dagster_dbt/utils", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dbt.utils"}}, "dagster_docker": {"docker_executor": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_docker.docker_executor

\nimport os\nfrom typing import List\n\nimport docker\nfrom dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_config, validate_docker_image\n\nfrom dagster import check, executor\nfrom dagster.core.definitions.executor_definition import multiple_process_executor_requirements\nfrom dagster.core.events import DagsterEvent, DagsterEventType, EngineEventData, MetadataEntry\nfrom dagster.core.execution.plan.objects import StepFailureData\nfrom dagster.core.execution.retries import RetryMode, get_retries_config\nfrom dagster.core.executor.base import Executor\nfrom dagster.core.executor.init import InitExecutorContext\nfrom dagster.core.executor.step_delegating import StepDelegatingExecutor\nfrom dagster.core.executor.step_delegating.step_handler.base import StepHandler, StepHandlerContext\nfrom dagster.serdes.utils import hash_str\nfrom dagster.utils import merge_dicts\nfrom dagster.utils.backcompat import experimental\n\n\n
[docs]@executor(\n name="docker",\n config_schema=merge_dicts(\n DOCKER_CONFIG_SCHEMA,\n {\n "retries": get_retries_config(),\n },\n ),\n requirements=multiple_process_executor_requirements(),\n)\n@experimental\ndef docker_executor(init_context: InitExecutorContext) -> Executor:\n """\n Executor which launches steps as Docker containers.\n\n To use the `docker_executor`, set it as the `executor_def` when defining a job:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-docker/dagster_docker_tests/test_example_executor.py\n :start-after: start_marker\n :end-before: end_marker\n :language: python\n\n Then you can configure the executor with run config as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n registry: ...\n network: ...\n networks: ...\n container_kwargs: ...\n\n If you're using the DockerRunLauncher, configuration set on the containers created by the run\n launcher will also be set on the containers that are created for each step.\n """\n from . import DockerRunLauncher\n\n image = init_context.executor_config.get("image")\n registry = init_context.executor_config.get("registry")\n env_vars = init_context.executor_config.get("env_vars")\n network = init_context.executor_config.get("network")\n networks = init_context.executor_config.get("networks")\n container_kwargs = init_context.executor_config.get("container_kwargs")\n\n run_launcher = init_context.instance.run_launcher\n if isinstance(run_launcher, DockerRunLauncher):\n image = image or run_launcher.image\n registry = registry or run_launcher.registry\n env_vars = run_launcher.env_vars + (env_vars or [])\n networks = run_launcher.networks + (networks or [])\n container_kwargs = merge_dicts(run_launcher.container_kwargs, container_kwargs or {})\n\n validate_docker_config(network, networks, container_kwargs)\n\n return StepDelegatingExecutor(\n DockerStepHandler(\n image,\n registry,\n env_vars,\n network,\n networks,\n container_kwargs,\n ),\n retries=RetryMode.from_config(init_context.executor_config["retries"]),\n )
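Because the ``literalinclude`` above does not render in this listing, here is a minimal sketch of attaching the executor to a job, assuming ``docker_executor`` is re-exported at the top level of ``dagster_docker`` (op and job names are placeholders):

.. code-block:: python

    from dagster import job, op

    from dagster_docker import docker_executor  # top-level export assumed


    @op
    def count_rows():
        # Placeholder computation; with this executor, each step runs in its own container.
        return 42


    @job(executor_def=docker_executor)
    def my_dockerized_job():
        count_rows()

Executor settings such as ``registry``, ``network``/``networks``, and ``container_kwargs`` are then supplied through run config under ``execution: config:``, as in the YAML snippet above.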
\n\n\nclass DockerStepHandler(StepHandler):\n def __init__(\n self,\n image=None,\n registry=None,\n env_vars=None,\n network=None,\n networks=None,\n container_kwargs=None,\n ):\n super().__init__()\n\n self._image = image\n self._registry = registry\n self._env_vars = env_vars\n\n if network:\n self._networks = [network]\n elif networks:\n self._networks = networks\n else:\n self._networks = []\n\n self._container_kwargs = check.opt_dict_param(\n container_kwargs, "container_kwargs", key_type=str\n )\n\n @property\n def name(self) -> str:\n return "DockerStepHandler"\n\n def _get_client(self):\n client = docker.client.from_env()\n if self._registry:\n client.login(\n registry=self._registry["url"],\n username=self._registry["username"],\n password=self._registry["password"],\n )\n return client\n\n def _get_container_name(self, run_id, step_key):\n return f"dagster-step-{hash_str(run_id + step_key)}"\n\n def _create_step_container(self, client, step_image, execute_step_args):\n return client.containers.create(\n step_image,\n name=self._get_container_name(\n execute_step_args.pipeline_run_id, execute_step_args.step_keys_to_execute[0]\n ),\n detach=True,\n network=self._networks[0] if len(self._networks) else None,\n command=execute_step_args.get_command_args(),\n environment=(\n {env_name: os.getenv(env_name) for env_name in self._env_vars}\n if self._env_vars\n else {}\n ),\n **self._container_kwargs,\n )\n\n def launch_step(self, step_handler_context: StepHandlerContext) -> List[DagsterEvent]:\n client = self._get_client()\n\n step_image = (\n step_handler_context.execute_step_args.pipeline_origin.repository_origin.container_image\n )\n\n if not step_image:\n step_image = self._image\n\n if not step_image:\n raise Exception("No docker image specified by the executor config or repository")\n\n validate_docker_image(step_image)\n\n try:\n step_container = self._create_step_container(\n client, step_image, step_handler_context.execute_step_args\n )\n except docker.errors.ImageNotFound:\n client.images.pull(step_image)\n step_container = self._create_step_container(\n client, step_image, step_handler_context.execute_step_args\n )\n\n if len(self._networks) > 1:\n for network_name in self._networks[1:]:\n network = client.networks.get(network_name)\n network.connect(step_container)\n\n assert (\n len(step_handler_context.execute_step_args.step_keys_to_execute) == 1\n ), "Launching multiple steps is not currently supported"\n step_key = step_handler_context.execute_step_args.step_keys_to_execute[0]\n\n events = [\n DagsterEvent(\n event_type_value=DagsterEventType.ENGINE_EVENT.value,\n pipeline_name=step_handler_context.execute_step_args.pipeline_origin.pipeline_name,\n step_key=step_key,\n message="Launching step in Docker container",\n event_specific_data=EngineEventData(\n [\n MetadataEntry("Step key", value=step_key),\n MetadataEntry("Docker container id", value=step_container.id),\n ],\n ),\n )\n ]\n\n step_container.start()\n\n return events\n\n def check_step_health(self, step_handler_context: StepHandlerContext) -> List[DagsterEvent]:\n step_key = step_handler_context.execute_step_args.step_keys_to_execute[0]\n\n client = self._get_client()\n\n container_name = self._get_container_name(\n step_handler_context.execute_step_args.pipeline_run_id,\n step_key,\n )\n\n try:\n container = client.containers.get(container_name)\n\n except Exception as e:\n return [\n DagsterEvent(\n event_type_value=DagsterEventType.STEP_FAILURE.value,\n 
pipeline_name=step_handler_context.execute_step_args.pipeline_origin.pipeline_name,\n step_key=step_key,\n message=f"Error when checking on step container health: {e}",\n event_specific_data=StepFailureData(\n error=None,\n user_failure_data=None,\n ),\n )\n ]\n\n if container.status == "running":\n return []\n\n try:\n container_info = container.wait(timeout=0.1)\n except Exception as e:\n return [\n DagsterEvent(\n event_type_value=DagsterEventType.STEP_FAILURE.value,\n pipeline_name=step_handler_context.execute_step_args.pipeline_origin.pipeline_name,\n step_key=step_key,\n message=f"Container status is {container.status}. Hit exception attempting to get its return code: {e}",\n event_specific_data=StepFailureData(\n error=None,\n user_failure_data=None,\n ),\n )\n ]\n\n ret_code = container_info.get("StatusCode")\n if ret_code == 0:\n return []\n\n return [\n DagsterEvent(\n event_type_value=DagsterEventType.STEP_FAILURE.value,\n pipeline_name=step_handler_context.execute_step_args.pipeline_origin.pipeline_name,\n step_key=step_key,\n message=f"Container status is {container.status}. Return code is {str(ret_code)}.",\n event_specific_data=StepFailureData(\n error=None,\n user_failure_data=None,\n ),\n )\n ]\n\n def terminate_step(self, step_handler_context: StepHandlerContext) -> List[DagsterEvent]:\n\n assert (\n len(step_handler_context.execute_step_args.step_keys_to_execute) == 1\n ), "Launching multiple steps is not currently supported"\n step_key = step_handler_context.execute_step_args.step_keys_to_execute[0]\n\n events = [\n DagsterEvent(\n event_type_value=DagsterEventType.ENGINE_EVENT.value,\n pipeline_name=step_handler_context.execute_step_args.pipeline_origin.pipeline_name,\n step_key=step_key,\n message="Stopping Docker container for step",\n event_specific_data=EngineEventData(),\n )\n ]\n\n client = self._get_client()\n\n try:\n container = client.containers.get(\n self._get_container_name(\n step_handler_context.execute_step_args.pipeline_run_id,\n step_handler_context.execute_step_args.step_keys_to_execute[0],\n )\n )\n container.stop()\n except Exception as e:\n events.append(\n DagsterEvent(\n event_type_value=DagsterEventType.ENGINE_EVENT.value,\n pipeline_name=step_handler_context.execute_step_args.pipeline_origin.pipeline_name,\n step_key=step_key,\n message=f"Hit error while terminating Docker container:\\n{e}",\n event_specific_data=EngineEventData(),\n )\n )\n\n return events\n
", "current_page_name": "_modules/dagster_docker/docker_executor", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_docker.docker_executor"}, "docker_run_launcher": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_docker.docker_run_launcher

\nimport os\n\nimport docker\nfrom dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_config, validate_docker_image\n\nfrom dagster import check\nfrom dagster.core.launcher.base import (\n    CheckRunHealthResult,\n    LaunchRunContext,\n    ResumeRunContext,\n    RunLauncher,\n    WorkerStatus,\n)\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.core.storage.tags import DOCKER_IMAGE_TAG\nfrom dagster.grpc.types import ExecuteRunArgs, ResumeRunArgs\nfrom dagster.serdes import ConfigurableClass\n\nDOCKER_CONTAINER_ID_TAG = "docker/container_id"\n\n\n
[docs]class DockerRunLauncher(RunLauncher, ConfigurableClass):\n """Launches runs in a Docker container."""\n\n def __init__(\n self,\n inst_data=None,\n image=None,\n registry=None,\n env_vars=None,\n network=None,\n networks=None,\n container_kwargs=None,\n ):\n self._inst_data = inst_data\n self.image = image\n self.registry = registry\n self.env_vars = env_vars\n\n validate_docker_config(network, networks, container_kwargs)\n\n if network:\n self.networks = [network]\n elif networks:\n self.networks = networks\n else:\n self.networks = []\n\n self.container_kwargs = check.opt_dict_param(\n container_kwargs, "container_kwargs", key_type=str\n )\n\n super().__init__()\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return DOCKER_CONFIG_SCHEMA\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return DockerRunLauncher(inst_data=inst_data, **config_value)\n\n def _get_client(self):\n client = docker.client.from_env()\n if self.registry:\n client.login(\n registry=self.registry["url"],\n username=self.registry["username"],\n password=self.registry["password"],\n )\n return client\n\n def _get_docker_image(self, pipeline_code_origin):\n docker_image = pipeline_code_origin.repository_origin.container_image\n\n if not docker_image:\n docker_image = self.image\n\n if not docker_image:\n raise Exception("No docker image specified by the instance config or repository")\n\n validate_docker_image(docker_image)\n return docker_image\n\n def _launch_container_with_command(self, run, docker_image, command):\n docker_env = (\n {env_name: os.getenv(env_name) for env_name in self.env_vars} if self.env_vars else {}\n )\n\n client = self._get_client()\n\n try:\n container = client.containers.create(\n image=docker_image,\n command=command,\n detach=True,\n environment=docker_env,\n network=self.networks[0] if len(self.networks) else None,\n **self.container_kwargs,\n )\n\n except docker.errors.ImageNotFound:\n client.images.pull(docker_image)\n container = client.containers.create(\n image=docker_image,\n command=command,\n detach=True,\n environment=docker_env,\n network=self.networks[0] if len(self.networks) else None,\n **self.container_kwargs,\n )\n\n if len(self.networks) > 1:\n for network_name in self.networks[1:]:\n network = client.networks.get(network_name)\n network.connect(container)\n\n self._instance.report_engine_event(\n message="Launching run in a new container {container_id} with image {docker_image}".format(\n container_id=container.id,\n docker_image=docker_image,\n ),\n pipeline_run=run,\n cls=self.__class__,\n )\n\n self._instance.add_run_tags(\n run.run_id,\n {DOCKER_CONTAINER_ID_TAG: container.id, DOCKER_IMAGE_TAG: docker_image},\n )\n\n container.start()\n\n def launch_run(self, context: LaunchRunContext) -> None:\n run = context.pipeline_run\n pipeline_code_origin = context.pipeline_code_origin\n docker_image = self._get_docker_image(pipeline_code_origin)\n\n command = ExecuteRunArgs(\n pipeline_origin=pipeline_code_origin,\n pipeline_run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n ).get_command_args()\n\n self._launch_container_with_command(run, docker_image, command)\n\n @property\n def supports_resume_run(self):\n return True\n\n def resume_run(self, context: ResumeRunContext) -> None:\n run = context.pipeline_run\n pipeline_code_origin = context.pipeline_code_origin\n docker_image = self._get_docker_image(pipeline_code_origin)\n\n command = ResumeRunArgs(\n 
pipeline_origin=pipeline_code_origin,\n pipeline_run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n ).get_command_args()\n\n self._launch_container_with_command(run, docker_image, command)\n\n def _get_container(self, run):\n if not run or run.is_finished:\n return None\n\n container_id = run.tags.get(DOCKER_CONTAINER_ID_TAG)\n\n if not container_id:\n return None\n\n try:\n return self._get_client().containers.get(container_id)\n except Exception:\n return None\n\n def can_terminate(self, run_id):\n run = self._instance.get_run_by_id(run_id)\n return self._get_container(run) != None\n\n def terminate(self, run_id):\n run = self._instance.get_run_by_id(run_id)\n container = self._get_container(run)\n\n if not container:\n self._instance.report_engine_event(\n message="Unable to get docker container to send termination request to.",\n pipeline_run=run,\n cls=self.__class__,\n )\n return False\n\n self._instance.report_run_canceling(run)\n\n container.stop()\n\n return True\n\n @property\n def supports_check_run_worker_health(self):\n return True\n\n def check_run_worker_health(self, run: PipelineRun):\n container = self._get_container(run)\n if container == None:\n return CheckRunHealthResult(WorkerStatus.NOT_FOUND)\n if container.status == "running":\n return CheckRunHealthResult(WorkerStatus.RUNNING)\n return CheckRunHealthResult(\n WorkerStatus.FAILED, msg=f"Container status is {container.status}"\n )
\n
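For completeness, a small sketch of how a config block maps onto the constructor via ``from_config_value``; the values are placeholders, and in practice this launcher is configured on the Dagster instance rather than constructed by hand:

.. code-block:: python

    from dagster_docker import DockerRunLauncher  # top-level export implied by the import in docker_executor above

    # Hypothetical config values; normally these come from the instance's run launcher config.
    launcher = DockerRunLauncher.from_config_value(
        inst_data=None,
        config_value={
            "image": "my-org/my-dagster-image:latest",
            "network": "dagster_network",
            "env_vars": ["DAGSTER_POSTGRES_PASSWORD"],
        },
    )

    # A single `network` is normalized into the `networks` list, as in __init__ above.
    print(launcher.image, launcher.networks)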
", "current_page_name": "_modules/dagster_docker/docker_run_launcher", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_docker.docker_run_launcher"}}, "dagster_fivetran": {"asset_defs": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_fivetran.asset_defs

\nfrom typing import List, Optional\n\nfrom dagster_fivetran.resources import DEFAULT_POLL_INTERVAL\nfrom dagster_fivetran.utils import generate_materializations\n\nfrom dagster import AssetKey, AssetsDefinition, Out, Output, check, multi_asset\nfrom dagster.utils.backcompat import experimental\n\n\n
[docs]@experimental\ndef build_fivetran_assets(\n connector_id: str,\n destination_tables: List[str],\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n io_manager_key: Optional[str] = None,\n asset_key_prefix: Optional[List[str]] = None,\n) -> List[AssetsDefinition]:\n\n """\n Build a set of assets for a given Fivetran connector.\n\n Returns an AssetsDefintion which connects the specified ``asset_keys`` to the computation that\n will update them. Internally, executes a Fivetran sync for a given ``connector_id``, and\n polls until that sync completes, raising an error if it is unsuccessful. Requires the use of the\n :py:class:`~dagster_fivetran.fivetran_resource`, which allows it to communicate with the\n Fivetran API.\n\n Args:\n connector_id (str): The Fivetran Connector ID that this op will sync. You can retrieve this\n value from the "Setup" tab of a given connector in the Fivetran UI.\n destination_tables (List[str]): `schema_name.table_name` for each table that you want to be\n represented in the Dagster asset graph for this connection.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (Optional[float]): The maximum time that will waited before this operation is\n timed out. By default, this will never time out.\n io_manager_key (Optional[str]): The io_manager to be used to handle each of these assets.\n asset_key_prefix (Optional[List[str]]): A prefix for the asset keys inside this asset.\n If left blank, assets will have a key of `AssetKey([schema_name, table_name])`.\n\n Examples:\n\n .. code-block:: python\n\n from dagster import AssetKey, build_assets_job\n\n from dagster_fivetran import fivetran_resource\n from dagster_fivetran.assets import build_fivetran_assets\n\n my_fivetran_resource = fivetran_resource.configured(\n {\n "api_key": {"env": "FIVETRAN_API_KEY"},\n "api_secret": {"env": "FIVETRAN_API_SECRET"},\n }\n )\n\n fivetran_assets = build_fivetran_assets(\n connector_id="foobar",\n table_names=["schema1.table1", "schema2.table2"],\n ])\n\n my_fivetran_job = build_assets_job(\n "my_fivetran_job",\n assets=[fivetran_assets],\n resource_defs={"fivetran": my_fivetran_resource}\n )\n\n """\n\n asset_key_prefix = check.opt_list_param(asset_key_prefix, "asset_key_prefix", of_type=str)\n\n tracked_asset_keys = {\n AssetKey(asset_key_prefix + table.split(".")) for table in destination_tables\n }\n\n @multi_asset(\n name=f"fivetran_sync_{connector_id}",\n outs={\n "_".join(key.path): Out(io_manager_key=io_manager_key, asset_key=key)\n for key in tracked_asset_keys\n },\n required_resource_keys={"fivetran"},\n compute_kind="fivetran",\n )\n def _assets(context):\n fivetran_output = context.resources.fivetran.sync_and_poll(\n connector_id=connector_id,\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n )\n for materialization in generate_materializations(\n fivetran_output, asset_key_prefix=asset_key_prefix\n ):\n # scan through all tables actually created, if it was expected then emit an Output.\n # otherwise, emit a runtime AssetMaterialization\n if materialization.asset_key in tracked_asset_keys:\n yield Output(\n value=None,\n output_name="_".join(materialization.asset_key.path),\n metadata={\n entry.label: entry.entry_data for entry in materialization.metadata_entries\n },\n )\n else:\n yield materialization\n\n return [_assets]
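A minimal, self-contained usage sketch along the lines of the docstring example above; the connector ID, table names, and environment variable names are placeholders:

.. code-block:: python

    from dagster import build_assets_job

    from dagster_fivetran import fivetran_resource
    from dagster_fivetran.asset_defs import build_fivetran_assets  # module path as in this listing

    my_fivetran_resource = fivetran_resource.configured(
        {
            "api_key": {"env": "FIVETRAN_API_KEY"},
            "api_secret": {"env": "FIVETRAN_API_SECRET"},
        }
    )

    # One asset per destination table, keyed as AssetKey([schema_name, table_name]).
    fivetran_assets = build_fivetran_assets(
        connector_id="my_connector_id",
        destination_tables=["schema1.table1", "schema2.table2"],
    )

    my_fivetran_job = build_assets_job(
        "my_fivetran_job",
        assets=fivetran_assets,  # build_fivetran_assets already returns a list
        resource_defs={"fivetran": my_fivetran_resource},
    )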
\n
", "current_page_name": "_modules/dagster_fivetran/asset_defs", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_fivetran.asset_defs"}, "ops": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_fivetran.ops

\nfrom dagster_fivetran.resources import DEFAULT_POLL_INTERVAL\nfrom dagster_fivetran.types import FivetranOutput\nfrom dagster_fivetran.utils import generate_materializations\n\nfrom dagster import Array, AssetKey, Bool, Field, In, Noneable, Nothing, Out, Output, Permissive, op\n\n\n
[docs]@op(\n required_resource_keys={"fivetran"},\n ins={"start_after": In(Nothing)},\n out=Out(\n FivetranOutput,\n description="Parsed json dictionary representing the details of the Fivetran connector after "\n "the sync successfully completes. "\n "See the [Fivetran API Docs](https://fivetran.com/docs/rest-api/connectors#retrieveconnectordetails) "\n "to see detailed information on this response.",\n ),\n config_schema={\n "connector_id": Field(\n str,\n is_required=True,\n description="The Fivetran Connector ID that this op will sync. You can retrieve this "\n 'value from the "Setup" tab of a given connector in the Fivetran UI.',\n ),\n "poll_interval": Field(\n float,\n default_value=DEFAULT_POLL_INTERVAL,\n description="The time (in seconds) that will be waited between successive polls.",\n ),\n "poll_timeout": Field(\n Noneable(float),\n default_value=None,\n description="The maximum time that will waited before this operation is timed out. By "\n "default, this will never time out.",\n ),\n "yield_materializations": Field(\n config=Bool,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the Fivetran sync will "\n "be yielded when the op executes."\n ),\n ),\n "asset_key_prefix": Field(\n config=Array(str),\n default_value=["fivetran"],\n description=(\n "If provided and yield_materializations is True, these components will be used to "\n "prefix the generated asset keys."\n ),\n ),\n },\n tags={"kind": "fivetran"},\n)\ndef fivetran_sync_op(context):\n """\n Executes a Fivetran sync for a given ``connector_id``, and polls until that sync\n completes, raising an error if it is unsuccessful. It outputs a FivetranOutput which contains\n the details of the Fivetran connector after the sync successfully completes, as well as details\n about which tables the sync updates.\n\n It requires the use of the :py:class:`~dagster_fivetran.fivetran_resource`, which allows it to\n communicate with the Fivetran API.\n\n Examples:\n\n .. code-block:: python\n\n from dagster import job\n from dagster_fivetran import fivetran_resource, fivetran_sync_op\n\n my_fivetran_resource = fivetran_resource.configured(\n {\n "api_key": {"env": "FIVETRAN_API_KEY"},\n "api_secret": {"env": "FIVETRAN_API_SECRET"},\n }\n )\n\n sync_foobar = fivetran_sync_op.configured({"connector_id": "foobar"}, name="sync_foobar")\n\n @job(resource_defs={"fivetran": my_fivetran_resource})\n def my_simple_fivetran_job():\n sync_foobar()\n\n @job(resource_defs={"fivetran": my_fivetran_resource})\n def my_composed_fivetran_job():\n final_foobar_state = sync_foobar(start_after=some_op())\n other_op(final_foobar_state)\n """\n\n fivetran_output = context.resources.fivetran.sync_and_poll(\n connector_id=context.op_config["connector_id"],\n poll_interval=context.op_config["poll_interval"],\n poll_timeout=context.op_config["poll_timeout"],\n )\n if context.op_config["yield_materializations"]:\n yield from generate_materializations(\n fivetran_output, asset_key_prefix=context.op_config["asset_key_prefix"]\n )\n yield Output(fivetran_output)
\n\n\n@op(\n required_resource_keys={"fivetran"},\n ins={"start_after": In(Nothing)},\n out=Out(\n FivetranOutput,\n description="Parsed json dictionary representing the details of the Fivetran connector after "\n "the resync successfully completes. "\n "See the [Fivetran API Docs](https://fivetran.com/docs/rest-api/connectors#retrieveconnectordetails) "\n "to see detailed information on this response.",\n ),\n config_schema={\n "connector_id": Field(\n str,\n is_required=True,\n description="The Fivetran Connector ID that this op will sync. You can retrieve this "\n 'value from the "Setup" tab of a given connector in the Fivetran UI.',\n ),\n "resync_parameters": Field(\n Permissive(),\n is_required=True,\n description="The resync parameters to send in the payload to the Fivetran API. You "\n "can find an example resync payload here: https://fivetran.com/docs/rest-api/connectors#request_6",\n ),\n "poll_interval": Field(\n float,\n default_value=DEFAULT_POLL_INTERVAL,\n description="The time (in seconds) that will be waited between successive polls.",\n ),\n "poll_timeout": Field(\n Noneable(float),\n default_value=None,\n description="The maximum time that will waited before this operation is timed out. By "\n "default, this will never time out.",\n ),\n "yield_materializations": Field(\n config=Bool,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the Fivetran sync will "\n "be yielded when the op executes."\n ),\n ),\n "asset_key_prefix": Field(\n config=Array(str),\n default_value=["fivetran"],\n description=(\n "If provided and yield_materializations is True, these components will be used to "\n "prefix the generated asset keys."\n ),\n ),\n },\n tags={"kind": "fivetran"},\n)\ndef fivetran_resync_op(context):\n """\n Executes a Fivetran historical resync for a given ``connector_id``, and polls until that resync\n completes, raising an error if it is unsuccessful. It outputs a FivetranOutput which contains\n the details of the Fivetran connector after the resync successfully completes, as well as details\n about which tables the resync updates.\n\n It requires the use of the :py:class:`~dagster_fivetran.fivetran_resource`, which allows it to\n communicate with the Fivetran API.\n\n Examples:\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_fivetran import fivetran_resource, fivetran_resync_op\n\n my_fivetran_resource = fivetran_resource.configured(\n {\n "api_key": {"env": "FIVETRAN_API_KEY"},\n "api_secret": {"env": "FIVETRAN_API_SECRET"},\n }\n )\n\n sync_foobar = fivetran_resync_op.configured(\n {\n "connector_id": "foobar",\n "resync_parameters": {\n "schema_a": ["table_a", "table_b"],\n "schema_b": ["table_c"]\n }\n },\n name="sync_foobar"\n )\n\n @job(resource_defs={"fivetran": my_fivetran_resource})\n def my_simple_fivetran_job():\n sync_foobar()\n\n @job(resource_defs={"fivetran": my_fivetran_resource})\n def my_composed_fivetran_job():\n final_foobar_state = sync_foobar(start_after=some_op())\n other_op(final_foobar_state)\n """\n\n fivetran_output = context.resources.fivetran.resync_and_poll(\n connector_id=context.op_config["connector_id"],\n resync_parameters=context.op_config["resync_parameters"],\n poll_interval=context.op_config["poll_interval"],\n poll_timeout=context.op_config["poll_timeout"],\n )\n if context.op_config["yield_materializations"]:\n asset_key_filter = [\n AssetKey(context.op_config["asset_key_prefix"] + [schema, table])\n for schema, tables in context.op_config["resync_parameters"].items()\n for table in tables\n ]\n for mat in generate_materializations(\n fivetran_output, asset_key_prefix=context.op_config["asset_key_prefix"]\n ):\n if mat.asset_key in asset_key_filter:\n yield mat\n\n yield Output(fivetran_output)\n
", "current_page_name": "_modules/dagster_fivetran/ops", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_fivetran.ops"}, "resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_fivetran.resources

\nimport datetime\nimport json\nimport logging\nimport time\nfrom typing import Any, Dict, List, Optional, Tuple\nfrom urllib.parse import urljoin\n\nimport requests\nfrom dagster_fivetran.types import FivetranOutput\nfrom dagster_fivetran.utils import get_fivetran_connector_url, get_fivetran_logs_url\nfrom dateutil import parser\nfrom requests.auth import HTTPBasicAuth\nfrom requests.exceptions import RequestException\n\nfrom dagster import (\n    Failure,\n    Field,\n    MetadataValue,\n    StringSource,\n    __version__,\n    check,\n    get_dagster_logger,\n    resource,\n)\n\nFIVETRAN_API_BASE = "https://api.fivetran.com"\nFIVETRAN_CONNECTOR_PATH = "v1/connectors/"\n\n# default polling interval (in seconds)\nDEFAULT_POLL_INTERVAL = 10\n\n\n
[docs]class FivetranResource:\n """\n This class exposes methods on top of the Fivetran REST API.\n """\n\n def __init__(\n self,\n api_key: str,\n api_secret: str,\n disable_schedule_on_trigger: bool = True,\n request_max_retries: int = 3,\n request_retry_delay: float = 0.25,\n log: logging.Logger = get_dagster_logger(),\n ):\n self._auth = HTTPBasicAuth(api_key, api_secret)\n self._disable_schedule_on_trigger = disable_schedule_on_trigger\n\n self._request_max_retries = request_max_retries\n self._request_retry_delay = request_retry_delay\n\n self._log = log\n\n @property\n def api_base_url(self) -> str:\n return urljoin(FIVETRAN_API_BASE, FIVETRAN_CONNECTOR_PATH)\n\n
[docs] def make_request(\n self, method: str, endpoint: str, data: Optional[str] = None\n ) -> Dict[str, Any]:\n """\n Creates and sends a request to the desired Fivetran Connector API endpoint.\n\n Args:\n method (str): The http method to use for this request (e.g. "POST", "GET", "PATCH").\n endpoint (str): The Fivetran API endpoint to send this request to.\n data (Optional[str]): JSON-formatted data string to be included in the request.\n\n Returns:\n Dict[str, Any]: Parsed json data from the response to this request\n """\n\n headers = {\n "User-Agent": f"dagster-fivetran/{__version__}",\n "Content-Type": "application/json;version=2",\n }\n\n num_retries = 0\n while True:\n try:\n response = requests.request(\n method=method,\n url=urljoin(self.api_base_url, endpoint),\n headers=headers,\n auth=self._auth,\n data=data,\n )\n response.raise_for_status()\n resp_dict = response.json()\n return resp_dict["data"] if "data" in resp_dict else resp_dict\n except RequestException as e:\n self._log.error("Request to Fivetran API failed: %s", e)\n if num_retries == self._request_max_retries:\n break\n num_retries += 1\n time.sleep(self._request_retry_delay)\n\n raise Failure("Exceeded max number of retries.")
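As an illustration, the higher-level methods below are thin wrappers over this helper; the calls here are equivalent to ``get_connector_details`` and ``get_connector_schema_config`` (the connector ID and environment variable names are placeholders):

.. code-block:: python

    import os

    from dagster_fivetran.resources import FivetranResource  # module path as in this listing

    fivetran = FivetranResource(
        api_key=os.environ["FIVETRAN_API_KEY"],
        api_secret=os.environ["FIVETRAN_API_SECRET"],
    )
    connector_id = "my_connector_id"  # placeholder

    # Equivalent to fivetran.get_connector_details(connector_id)
    details = fivetran.make_request(method="GET", endpoint=connector_id)

    # Equivalent to fivetran.get_connector_schema_config(connector_id)
    schemas = fivetran.make_request(method="GET", endpoint=f"{connector_id}/schemas")

    print(details["status"]["sync_state"], list(schemas))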
\n\n
[docs] def get_connector_details(self, connector_id: str) -> Dict[str, Any]:\n """\n Gets details about a given connector from the Fivetran Connector API.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n\n Returns:\n Dict[str, Any]: Parsed json data from the response to this request\n """\n return self.make_request(method="GET", endpoint=connector_id)
\n\n def _assert_syncable_connector(self, connector_id: str):\n """\n Confirms that a given connector is eligible to sync. Will raise a Failure in the event that\n the connector is either paused or not fully set up.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n """\n connector_details = self.get_connector_details(connector_id)\n if connector_details["paused"]:\n raise Failure(f"Connector '{connector_id}' cannot be synced as it is currently paused.")\n if connector_details["status"]["setup_state"] != "connected":\n raise Failure(f"Connector '{connector_id}' cannot be synced as it has not been set up.")\n\n 
[docs] def get_connector_sync_status(self, connector_id: str) -> Tuple[datetime.datetime, bool, str]:\n """\n Gets details about the status of the most recent Fivetran sync operation for a given\n connector.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n\n Returns:\n Tuple[datetime.datetime, bool, str]:\n Tuple containing the timestamp of the last completed sync, whether it succeeded, and\n the currently reported sync state.\n """\n connector_details = self.get_connector_details(connector_id)\n\n min_time_str = "0001-01-01 00:00:00+00"\n succeeded_at = parser.parse(connector_details["succeeded_at"] or min_time_str)\n failed_at = parser.parse(connector_details["failed_at"] or min_time_str)\n\n return (\n max(succeeded_at, failed_at),\n succeeded_at > failed_at,\n connector_details["status"]["sync_state"],\n )
\n\n
[docs] def update_connector(\n self, connector_id: str, properties: Optional[Dict[str, Any]] = None\n ) -> Dict[str, Any]:\n """\n Updates properties of a Fivetran Connector.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n properties (Dict[str, Any]): The properties to be updated. For a comprehensive list of\n properties, see the [Fivetran docs](https://fivetran.com/docs/rest-api/connectors#modifyaconnector).\n\n Returns:\n Dict[str, Any]: Parsed json data representing the API response.\n """\n return self.make_request(method="PATCH", endpoint=connector_id, data=json.dumps(properties))
\n\n
[docs] def update_schedule_type(\n self, connector_id: str, schedule_type: Optional[str] = None\n ) -> Dict[str, Any]:\n """\n Updates the schedule type property of the connector to either "auto" or "manual".\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n schedule_type (Optional[str]): Either "auto" (to turn the schedule on) or "manual" (to\n turn it off).\n\n Returns:\n Dict[str, Any]: Parsed json data representing the API response.\n """\n if schedule_type not in ["auto", "manual"]:\n check.failed("schedule_type must be either 'auto' or 'manual'.")\n return self.update_connector(connector_id, properties={"schedule_type": schedule_type})
\n\n def get_connector_schema_config(self, connector_id: str) -> Dict[str, Any]:\n return self.make_request("GET", endpoint=f"{connector_id}/schemas")\n\n
[docs] def start_sync(self, connector_id: str) -> Dict[str, Any]:\n """\n Initiates a sync of a Fivetran connector.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n\n Returns:\n Dict[str, Any]: Parsed json data representing the connector details API response after\n the sync is started.\n """\n if self._disable_schedule_on_trigger:\n self._log.info("Disabling Fivetran sync schedule.")\n self.update_schedule_type(connector_id, "manual")\n self._assert_syncable_connector(connector_id)\n self.make_request(method="POST", endpoint=f"{connector_id}/force")\n connector_details = self.get_connector_details(connector_id)\n self._log.info(\n f"Sync initialized for connector_id={connector_id}. View this sync in the Fivetran UI: "\n + get_fivetran_connector_url(connector_details)\n )\n return connector_details
\n\n
[docs] def start_resync(\n self, connector_id: str, resync_parameters: Dict[str, List[str]]\n ) -> Dict[str, Any]:\n """\n Initiates a historical sync of all data for multiple schema tables within a Fivetran connector.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n resync_parameters (Dict[str, List[str]]): The resync parameters to send to the Fivetran API.\n An example payload can be found here: https://fivetran.com/docs/rest-api/connectors#request_6\n\n Returns:\n Dict[str, Any]: Parsed json data representing the connector details API response after\n the resync is started.\n """\n if self._disable_schedule_on_trigger:\n self._log.info("Disabling Fivetran sync schedule.")\n self.update_schedule_type(connector_id, "manual")\n self._assert_syncable_connector(connector_id)\n self.make_request(\n method="POST",\n endpoint=f"{connector_id}/schemas/tables/resync",\n data=json.dumps(resync_parameters),\n )\n connector_details = self.get_connector_details(connector_id)\n self._log.info(\n f"Sync initialized for connector_id={connector_id}. View this resync in the Fivetran UI: "\n + get_fivetran_connector_url(connector_details)\n )\n return connector_details
\n\n
[docs] def poll_sync(\n self,\n connector_id: str,\n initial_last_sync_completion: datetime.datetime,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n ) -> Dict[str, Any]:\n """\n Given a Fivetran connector and the timestamp at which the previous sync completed, poll\n until the next sync completes.\n\n The previous sync completion time is necessary because the only way to tell when a sync\n completes is when this value changes.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n initial_last_sync_completion (datetime.datetime): The timestamp of the last completed sync\n (successful or otherwise) for this connector, prior to running this method.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. By default, this will never time out.\n\n Returns:\n Dict[str, Any]: Parsed json data representing the API response.\n """\n poll_start = datetime.datetime.now()\n while True:\n (\n curr_last_sync_completion,\n curr_last_sync_succeeded,\n curr_sync_state,\n ) = self.get_connector_sync_status(connector_id)\n self._log.info(f"Polled '{connector_id}'. Status: [{curr_sync_state}]")\n\n if curr_last_sync_completion > initial_last_sync_completion:\n break\n\n if poll_timeout and datetime.datetime.now() > poll_start + datetime.timedelta(\n seconds=poll_timeout\n ):\n raise Failure(\n f"Sync for connector '{connector_id}' timed out after {datetime.datetime.now() - poll_start}."\n )\n\n # Sleep for the configured time interval before polling again.\n time.sleep(poll_interval)\n\n connector_details = self.get_connector_details(connector_id)\n if not curr_last_sync_succeeded:\n raise Failure(\n f"Sync for connector '{connector_id}' failed!",\n metadata={\n "connector_details": MetadataValue.json(connector_details),\n "log_url": MetadataValue.url(get_fivetran_logs_url(connector_details)),\n },\n )\n return connector_details
\n\n
[docs] def sync_and_poll(\n self,\n connector_id: str,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n ) -> FivetranOutput:\n """\n Initializes a sync operation for the given connector, and polls until it completes.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. By default, this will never time out.\n\n Returns:\n :py:class:`~FivetranOutput`:\n Object containing details about the connector and the tables it updates\n """\n schema_config = self.get_connector_schema_config(connector_id)\n init_last_sync_timestamp, _, _ = self.get_connector_sync_status(connector_id)\n self.start_sync(connector_id)\n final_details = self.poll_sync(\n connector_id,\n init_last_sync_timestamp,\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n )\n return FivetranOutput(connector_details=final_details, schema_config=schema_config)
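A short sketch of calling this method from inside an op, mirroring what ``fivetran_sync_op`` does internally; the resource key, connector ID, and timeouts are placeholders, and ``FivetranOutput`` is assumed to expose the ``connector_details`` field it is constructed with above:

.. code-block:: python

    from dagster import job, op

    from dagster_fivetran import fivetran_resource

    my_fivetran_resource = fivetran_resource.configured(
        {"api_key": {"env": "FIVETRAN_API_KEY"}, "api_secret": {"env": "FIVETRAN_API_SECRET"}}
    )


    @op(required_resource_keys={"fivetran"})
    def sync_my_connector(context):
        # Blocks until the sync completes, raising Failure on error or timeout.
        fivetran_output = context.resources.fivetran.sync_and_poll(
            connector_id="my_connector_id",
            poll_interval=10,
            poll_timeout=600,
        )
        return fivetran_output.connector_details["status"]


    @job(resource_defs={"fivetran": my_fivetran_resource})
    def my_fivetran_sync_job():
        sync_my_connector()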
\n\n
[docs] def resync_and_poll(\n self,\n connector_id: str,\n resync_parameters: Dict[str, List[str]],\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n ) -> FivetranOutput:\n """\n Initializes a historical resync operation for the given connector, and polls until it completes.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n resync_parameters (Dict[str, List[str]]): The payload to send to the Fivetran API.\n This should be a dictionary with schema names as the keys and a list of tables\n to resync as the values.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. By default, this will never time out.\n\n Returns:\n :py:class:`~FivetranOutput`:\n Object containing details about the connector and the tables it updates\n """\n schema_config = self.get_connector_schema_config(connector_id)\n init_last_sync_timestamp, _, _ = self.get_connector_sync_status(connector_id)\n self.start_resync(connector_id, resync_parameters)\n final_details = self.poll_sync(\n connector_id,\n init_last_sync_timestamp,\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n )\n return FivetranOutput(connector_details=final_details, schema_config=schema_config)
\n\n\n
[docs]@resource(\n config_schema={\n "api_key": Field(\n StringSource,\n is_required=True,\n description="Fivetran API Key. You can find this value on the Fivetran settings page: "\n "https://fivetran.com/account/settings",\n ),\n "api_secret": Field(\n StringSource,\n is_required=True,\n description="Fivetran API Secret. You can find this value on the Fivetran settings page: "\n "https://fivetran.com/account/settings",\n ),\n "disable_schedule_on_trigger": Field(\n bool,\n default_value=True,\n description="Specifies if you would like any connector that is sync'd using this "\n "resource to be automatically taken off its Fivetran schedule.",\n ),\n "request_max_retries": Field(\n int,\n default_value=3,\n description="The maximum number of times requests to the Fivetran API should be retried "\n "before failing.",\n ),\n "request_retry_delay": Field(\n float,\n default_value=0.25,\n description="Time (in seconds) to wait between each request retry.",\n ),\n },\n description="This resource helps manage Fivetran connectors",\n)\ndef fivetran_resource(context) -> FivetranResource:\n """\n This resource allows users to programatically interface with the Fivetran REST API to launch\n syncs and monitor their progress. This currently implements only a subset of the functionality\n exposed by the API.\n\n For a complete set of documentation on the Fivetran REST API, including expected response JSON\n schemae, see the `Fivetran API Docs <https://fivetran.com/docs/rest-api/connectors>`_.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/overview/configuration#configured>`_ method.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import job\n from dagster_fivetran import fivetran_resource\n\n my_fivetran_resource = fivetran_resource.configured(\n {\n "api_key": {"env": "FIVETRAN_API_KEY"},\n "api_secret": {"env": "FIVETRAN_API_SECRET"},\n }\n )\n\n @job(resource_defs={"fivetran":my_fivetran_resource})\n def my_fivetran_job():\n ...\n\n """\n return FivetranResource(\n api_key=context.resource_config["api_key"],\n api_secret=context.resource_config["api_secret"],\n disable_schedule_on_trigger=context.resource_config["disable_schedule_on_trigger"],\n request_max_retries=context.resource_config["request_max_retries"],\n request_retry_delay=context.resource_config["request_retry_delay"],\n log=context.log,\n )
\n
", "current_page_name": "_modules/dagster_fivetran/resources", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_fivetran.resources"}}, "dagster_gcp": {"bigquery": {"ops": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_gcp.bigquery.ops

\nimport hashlib\n\nfrom dagster_pandas import DataFrame\nfrom google.cloud.bigquery.job import LoadJobConfig, QueryJobConfig\nfrom google.cloud.bigquery.table import EncryptionConfiguration, TimePartitioning\n\nfrom dagster import InputDefinition, List, Nothing, OutputDefinition, check, op, solid\n\nfrom .configs import (\n    define_bigquery_create_dataset_config,\n    define_bigquery_delete_dataset_config,\n    define_bigquery_load_config,\n    define_bigquery_query_config,\n)\nfrom .types import BigQueryLoadSource\n\n_START = "start"\n\n\ndef _preprocess_config(cfg):\n    destination_encryption_configuration = cfg.get("destination_encryption_configuration")\n    time_partitioning = cfg.get("time_partitioning")\n\n    if destination_encryption_configuration is not None:\n        cfg["destination_encryption_configuration"] = EncryptionConfiguration(\n            kms_key_name=destination_encryption_configuration\n        )\n\n    if time_partitioning is not None:\n        cfg["time_partitioning"] = TimePartitioning(**time_partitioning)\n\n    return cfg\n\n\ndef _bq_core_command(dagster_decorator, decorator_name, sql_queries):\n    sql_queries = check.list_param(sql_queries, "sql queries", of_type=str)\n    m = hashlib.sha1()\n    for query in sql_queries:\n        m.update(query.encode("utf-8"))\n    hash_str = m.hexdigest()[:10]\n    name = f"bq_{decorator_name}_{hash_str}"\n\n    @dagster_decorator(\n        name=name,\n        input_defs=[InputDefinition(_START, Nothing)],\n        output_defs=[OutputDefinition(List[DataFrame])],\n        config_schema=define_bigquery_query_config(),\n        required_resource_keys={"bigquery"},\n        tags={"kind": "sql", "sql": "\\n".join(sql_queries)},\n    )\n    def _bq_fn(context):  # pylint: disable=unused-argument\n        query_job_config = _preprocess_config(context.op_config.get("query_job_config", {}))\n\n        # Retrieve results as pandas DataFrames\n        results = []\n        for sql_query in sql_queries:\n            # We need to construct a new QueryJobConfig for each query.\n            # See: https://bit.ly/2VjD6sl\n            cfg = QueryJobConfig(**query_job_config) if query_job_config else None\n            context.log.info(\n                "executing query %s with config: %s"\n                % (sql_query, cfg.to_api_repr() if cfg else "(no config provided)")\n            )\n            results.append(\n                context.resources.bigquery.query(sql_query, job_config=cfg).to_dataframe()\n            )\n\n        return results\n\n    return _bq_fn\n\n\n
[docs]def bq_solid_for_queries(sql_queries):\n """\n Executes BigQuery SQL queries.\n\n Expects a BQ client to be provisioned in resources as context.resources.bigquery.\n """\n\n return _bq_core_command(solid, "solid", sql_queries)
\n\n\n
[docs]def bq_op_for_queries(sql_queries):\n """\n Executes BigQuery SQL queries.\n\n Expects a BQ client to be provisioned in resources as context.resources.bigquery.\n """\n\n return _bq_core_command(op, "op", sql_queries)
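A hedged sketch of wiring a generated op into a job; it assumes ``bigquery_resource`` (shown later in this package's resources listing) forwards its config directly to ``google.cloud.bigquery.Client``, so the ``project`` field is an assumption, and the queries and project name are placeholders:

.. code-block:: python

    from dagster import job

    from dagster_gcp.bigquery.ops import bq_op_for_queries  # module path as in this listing
    from dagster_gcp.bigquery.resources import bigquery_resource

    # Returns an op named bq_op_<hash> that emits one DataFrame per query.
    run_queries = bq_op_for_queries(["SELECT 1 AS one", "SELECT 2 AS two"])


    @job(resource_defs={"bigquery": bigquery_resource.configured({"project": "my-gcp-project"})})
    def my_bq_job():
        run_queries()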
\n\n\nBIGQUERY_LOAD_CONFIG = define_bigquery_load_config()\n\n\n
[docs]@op(\n input_defs=[InputDefinition("paths", List[str])],\n output_defs=[OutputDefinition(Nothing)],\n config_schema=BIGQUERY_LOAD_CONFIG,\n required_resource_keys={"bigquery"},\n)\ndef import_gcs_paths_to_bq(context, paths):\n return _execute_load_in_source(context, paths, BigQueryLoadSource.GCS)
\n\n\n
[docs]@op(\n input_defs=[InputDefinition("df", DataFrame)],\n output_defs=[OutputDefinition(Nothing)],\n config_schema=BIGQUERY_LOAD_CONFIG,\n required_resource_keys={"bigquery"},\n)\ndef import_df_to_bq(context, df):\n return _execute_load_in_source(context, df, BigQueryLoadSource.DataFrame)
\n\n\n
[docs]@op(\n input_defs=[InputDefinition("path", str)],\n output_defs=[OutputDefinition(Nothing)],\n config_schema=BIGQUERY_LOAD_CONFIG,\n required_resource_keys={"bigquery"},\n)\ndef import_file_to_bq(context, path):\n return _execute_load_in_source(context, path, BigQueryLoadSource.File)
\n\n\ndef _execute_load_in_source(context, source, source_name):\n destination = context.op_config.get("destination")\n load_job_config = _preprocess_config(context.op_config.get("load_job_config", {}))\n cfg = LoadJobConfig(**load_job_config) if load_job_config else None\n\n context.log.info(\n "executing BQ load with config: %s for source %s"\n % (cfg.to_api_repr() if cfg else "(no config provided)", source)\n )\n\n if source_name == BigQueryLoadSource.DataFrame:\n context.resources.bigquery.load_table_from_dataframe(\n source, destination, job_config=cfg\n ).result()\n\n # Load from file. See: https://cloud.google.com/bigquery/docs/loading-data-local\n elif source_name == BigQueryLoadSource.File:\n with open(source, "rb") as file_obj:\n context.resources.bigquery.load_table_from_file(\n file_obj, destination, job_config=cfg\n ).result()\n\n # Load from GCS. See: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage\n elif source_name == BigQueryLoadSource.GCS:\n context.resources.bigquery.load_table_from_uri(source, destination, job_config=cfg).result()\n\n\n
[docs]@op(\n input_defs=[InputDefinition(_START, Nothing)],\n config_schema=define_bigquery_create_dataset_config(),\n required_resource_keys={"bigquery"},\n)\ndef bq_create_dataset(context):\n """BigQuery Create Dataset.\n\n This op encapsulates creating a BigQuery dataset.\n\n Expects a BQ client to be provisioned in resources as context.resources.bigquery.\n """\n (dataset, exists_ok) = [context.op_config.get(k) for k in ("dataset", "exists_ok")]\n context.log.info("executing BQ create_dataset for dataset %s" % (dataset))\n context.resources.bigquery.create_dataset(dataset, exists_ok)
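For example, the ``dataset`` and ``exists_ok`` fields read above could be supplied through standard op run config along these lines (the dataset name is a placeholder):

.. code-block:: python

    run_config = {
        "ops": {
            "bq_create_dataset": {
                "config": {
                    "dataset": "my_gcp_project.my_dataset",
                    "exists_ok": True,
                }
            }
        }
    }

    # e.g. my_job.execute_in_process(run_config=run_config), for a job that includes this op
    # and provides the "bigquery" resource.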
\n\n\n
[docs]@op(\n input_defs=[InputDefinition(_START, Nothing)],\n config_schema=define_bigquery_delete_dataset_config(),\n required_resource_keys={"bigquery"},\n)\ndef bq_delete_dataset(context):\n """BigQuery Delete Dataset.\n\n This op encapsulates deleting a BigQuery dataset.\n\n Expects a BQ client to be provisioned in resources as context.resources.bigquery.\n """\n\n (dataset, delete_contents, not_found_ok) = [\n context.op_config.get(k) for k in ("dataset", "delete_contents", "not_found_ok")\n ]\n\n context.log.info("executing BQ delete_dataset for dataset %s" % dataset)\n\n context.resources.bigquery.delete_dataset(\n dataset, delete_contents=delete_contents, not_found_ok=not_found_ok\n )
\n
", "current_page_name": "_modules/dagster_gcp/bigquery/ops", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_gcp.bigquery.ops"}, "resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_gcp.bigquery.resources

\nfrom google.cloud import bigquery  # type: ignore\n\nfrom dagster import resource\n\nfrom .configs import bq_resource_config\n\n\n
[docs]@resource(\n config_schema=bq_resource_config(), description="Dagster resource for connecting to BigQuery"\n)\ndef bigquery_resource(context):\n return bigquery.Client(**context.resource_config)
\n
", "current_page_name": "_modules/dagster_gcp/bigquery/resources", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_gcp.bigquery.resources"}, "types": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_gcp.bigquery.types

\nimport re\nfrom enum import Enum as PyEnum\n\nfrom google.cloud.bigquery.job import (\n    CreateDisposition,\n    Encoding,\n    QueryPriority,\n    SchemaUpdateOption,\n    SourceFormat,\n    WriteDisposition,\n)\n\nfrom dagster import Enum, EnumValue\nfrom dagster.config import ConfigScalar, ConfigScalarKind, PostProcessingError\n\n\nclass BigQueryLoadSource(PyEnum):\n    DataFrame = "DATA_FRAME"\n    GCS = "GCS"\n    File = "FILE"\n\n\nBQCreateDisposition = Enum(\n    name="BQCreateDisposition",\n    enum_values=[\n        EnumValue(CreateDisposition.CREATE_IF_NEEDED),\n        EnumValue(CreateDisposition.CREATE_NEVER),\n    ],\n)\n\nBQPriority = Enum(\n    name="BQPriority",\n    enum_values=[EnumValue(QueryPriority.BATCH), EnumValue(QueryPriority.INTERACTIVE)],\n)\n\nBQSchemaUpdateOption = Enum(\n    name="BQSchemaUpdateOption",\n    enum_values=[\n        EnumValue(\n            SchemaUpdateOption.ALLOW_FIELD_ADDITION,\n            description="Allow adding a nullable field to the schema.",\n        ),\n        EnumValue(\n            SchemaUpdateOption.ALLOW_FIELD_RELAXATION,\n            description="Allow relaxing a required field in the original schema to nullable.",\n        ),\n    ],\n)\n\nBQWriteDisposition = Enum(\n    name="BQWriteDisposition",\n    enum_values=[\n        EnumValue(WriteDisposition.WRITE_APPEND),\n        EnumValue(WriteDisposition.WRITE_EMPTY),\n        EnumValue(WriteDisposition.WRITE_TRUNCATE),\n    ],\n)\n\nBQEncoding = Enum(\n    name="BQEncoding", enum_values=[EnumValue(Encoding.ISO_8859_1), EnumValue(Encoding.UTF_8)]\n)\n\nBQSourceFormat = Enum(\n    name="BQSourceFormat",\n    enum_values=[\n        EnumValue(SourceFormat.AVRO),\n        EnumValue(SourceFormat.CSV),\n        EnumValue(SourceFormat.DATASTORE_BACKUP),\n        EnumValue(SourceFormat.NEWLINE_DELIMITED_JSON),\n        EnumValue(SourceFormat.ORC),\n        EnumValue(SourceFormat.PARQUET),\n    ],\n)\n\n\n# Project names are permitted to have alphanumeric, dashes and underscores, up to 1024 characters.\nRE_PROJECT = r"[\\w\\d\\-\\_]{1,1024}"\n\n# Datasets and tables are permitted to have alphanumeric or underscores, no dashes allowed, up to\n# 1024 characters\nRE_DS_TABLE = r"[\\w\\d\\_]{1,1024}"\n\n# BigQuery supports writes directly to date partitions with the syntax foo.bar$20190101\nRE_PARTITION_SUFFIX = r"(\\$\\d{8})?"\n\n\ndef _is_valid_dataset(config_value):\n    """Datasets must be of form "project.dataset" or "dataset" """\n    return re.match(\n        # regex matches: project.dataset -- OR -- dataset\n        r"^" + RE_PROJECT + r"\\." + RE_DS_TABLE + r"$|^" + RE_DS_TABLE + r"$",\n        config_value,\n    )\n\n\ndef _is_valid_table(config_value):\n    """Tables must be of form "project.dataset.table" or "dataset.table" with optional\n    date-partition suffix\n    """\n    return re.match(\n        r"^"\n        + RE_PROJECT  #          project\n        + r"\\."  #               .\n        + RE_DS_TABLE  #         dataset\n        + r"\\."  #               .\n        + RE_DS_TABLE  #         table\n        + RE_PARTITION_SUFFIX  # date partition suffix\n        + r"$|^"  #              -- OR --\n        + RE_DS_TABLE  #         dataset\n        + r"\\."  
#               .\n        + RE_DS_TABLE  #         table\n        + RE_PARTITION_SUFFIX  # date partition suffix\n        + r"$",\n        config_value,\n    )\n\n\nclass _Dataset(ConfigScalar):\n    def __init__(self):\n        super(_Dataset, self).__init__(\n            key=type(self).__name__,\n            given_name=type(self).__name__,\n            scalar_kind=ConfigScalarKind.STRING,\n        )\n\n    def post_process(self, value):\n        if not _is_valid_dataset(value):\n            raise PostProcessingError('Datasets must be of the form "project.dataset" or "dataset"')\n        return value\n\n\nclass _Table(ConfigScalar):\n    def __init__(self):\n        super(_Table, self).__init__(\n            key=type(self).__name__,\n            given_name=type(self).__name__,\n            scalar_kind=ConfigScalarKind.STRING,\n        )\n\n    def post_process(self, value):\n        if not _is_valid_table(value):\n            raise PostProcessingError(\n                (\n                    'Tables must be of the form "project.dataset.table" or "dataset.table" '\n                    "with optional date-partition suffix"\n                )\n            )\n\n        return value\n\n\n# https://github.com/dagster-io/dagster/issues/1971\nTable = _Table()\nDataset = _Dataset()\n\n\n
[docs]class BigQueryError(Exception):\n pass
\n
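The Dataset and Table config scalars above reject malformed identifiers at config time via ``post_process``. A minimal sketch of how they might appear in an op's config schema (the op and job names here are illustrative, not part of dagster_gcp):

.. code-block:: python

    from dagster import Field, job, op
    from dagster_gcp.bigquery.types import Dataset, Table  # module shown above

    # Values are validated when run config is processed: "project.dataset" and
    # "dataset.table$20190101" pass, while strings with illegal characters raise
    # PostProcessingError before the op ever runs.
    @op(config_schema={"dataset": Field(Dataset), "destination": Field(Table)})
    def describe_target(context):
        context.log.info(
            f"dataset={context.op_config['dataset']} table={context.op_config['destination']}"
        )

    @job
    def bq_target_job():
        describe_target()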
", "current_page_name": "_modules/dagster_gcp/bigquery/types", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_gcp.bigquery.types"}}, "dataproc": {"ops": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_gcp.dataproc.ops

\nfrom dagster import Bool, Field, op, solid\nfrom dagster.seven import json\n\nfrom .configs import define_dataproc_submit_job_config\n\nDATAPROC_CONFIG_SCHEMA = {\n    "job_config": define_dataproc_submit_job_config(),\n    "job_scoped_cluster": Field(\n        Bool,\n        description="whether to create a cluster or use an existing cluster",\n        is_required=False,\n        default_value=True,\n    ),\n}\n\n\ndef _dataproc_compute(context):\n    job_config = context.solid_config["job_config"]\n\n    context.log.info("submitting job with config: %s" % str(json.dumps(job_config)))\n\n    if context.solid_config["job_scoped_cluster"]:\n        # Cluster context manager, creates and then deletes cluster\n        with context.resources.dataproc.cluster_context_manager() as cluster:\n            # Submit the job specified by this solid to the cluster defined by the associated resource\n            result = cluster.submit_job(job_config)\n\n            job_id = result["reference"]["jobId"]\n            context.log.info("Submitted job ID {}".format(job_id))\n            cluster.wait_for_job(job_id)\n    else:\n        # Submit to an existing cluster\n        # Submit the job specified by this solid to the cluster defined by the associated resource\n        result = context.resources.dataproc.submit_job(job_config)\n\n        job_id = result["reference"]["jobId"]\n        context.log.info("Submitted job ID {}".format(job_id))\n        context.resources.dataproc.wait_for_job(job_id)\n\n\n
[docs]@solid(required_resource_keys={"dataproc"}, config_schema=DATAPROC_CONFIG_SCHEMA)\ndef dataproc_solid(context):\n return _dataproc_compute(context)
\n\n\n
[docs]@op(required_resource_keys={"dataproc"}, config_schema=DATAPROC_CONFIG_SCHEMA)\ndef dataproc_op(context):\n return _dataproc_compute(context)
\n
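``dataproc_op`` above only declares the ``"dataproc"`` resource key and its config schema; the cluster itself comes from the ``dataproc_resource`` defined in the next module. A minimal wiring sketch (the job name and all run-config values are placeholders):

.. code-block:: python

    from dagster import job
    from dagster_gcp.dataproc.ops import dataproc_op
    from dagster_gcp.dataproc.resources import dataproc_resource

    @job(resource_defs={"dataproc": dataproc_resource})
    def dataproc_job():
        dataproc_op()

    # At launch time, run config fills in DATAPROC_CONFIG_SCHEMA for the op and the
    # cluster fields for the resource, for example (values are placeholders):
    #
    # ops:
    #   dataproc_op:
    #     config:
    #       job_scoped_cluster: true
    #       job_config: {...}        # see define_dataproc_submit_job_config()
    # resources:
    #   dataproc:
    #     config: {...}              # projectId, region, clusterName, cluster_config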
", "current_page_name": "_modules/dagster_gcp/dataproc/ops", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_gcp.dataproc.ops"}, "resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_gcp.dataproc.resources

\nimport time\nfrom contextlib import contextmanager\n\nfrom googleapiclient.discovery import build\nfrom oauth2client.client import GoogleCredentials\n\nfrom dagster import resource\n\nfrom .configs import define_dataproc_create_cluster_config\nfrom .types import DataprocError\n\nTWENTY_MINUTES = 20 * 60\nDEFAULT_ITER_TIME_SEC = 5\n\n\nclass DataprocResource:\n    """Builds a client to the dataproc API."""\n\n    def __init__(self, config):\n        # Use Application Default Credentials to check the\n        # GOOGLE_APPLICATION_CREDENTIALS environment variable\n        # for the location of the service account key file.\n        credentials = GoogleCredentials.get_application_default()\n\n        # See https://github.com/googleapis/google-api-python-client/issues/299 for the\n        # cache_discovery=False configuration below\n        self.dataproc = build("dataproc", "v1", credentials=credentials, cache_discovery=False)\n\n        self.config = config\n\n        (self.project_id, self.region, self.cluster_name, self.cluster_config) = (\n            self.config.get(k) for k in ("projectId", "region", "clusterName", "cluster_config")\n        )\n\n    @property\n    def dataproc_clusters(self):\n        return (\n            # Google APIs dynamically genned, so pylint pukes\n            # pylint: disable=no-member\n            self.dataproc.projects()\n            .regions()\n            .clusters()\n        )\n\n    @property\n    def dataproc_jobs(self):\n        return (\n            # Google APIs dynamically genned, so pylint pukes\n            # pylint: disable=no-member\n            self.dataproc.projects()\n            .regions()\n            .jobs()\n        )\n\n    def create_cluster(self):\n        (\n            self.dataproc_clusters.create(\n                projectId=self.project_id,\n                region=self.region,\n                body={\n                    "projectId": self.project_id,\n                    "clusterName": self.cluster_name,\n                    "config": self.cluster_config,\n                },\n            ).execute()\n        )\n\n        def iter_fn():\n            # TODO: Add logging\n            # See: https://bit.ly/2UW5JaN\n            cluster = self.get_cluster()\n            return cluster["status"]["state"] in {"RUNNING", "UPDATING"}\n\n        done = DataprocResource._iter_and_sleep_until_ready(iter_fn)\n        if not done:\n            cluster = self.get_cluster()\n            raise DataprocError(\n                "Could not provision cluster -- status: %s" % str(cluster["status"])\n            )\n\n    def get_cluster(self):\n        return self.dataproc_clusters.get(\n            projectId=self.project_id, region=self.region, clusterName=self.cluster_name\n        ).execute()\n\n    def delete_cluster(self):\n        return self.dataproc_clusters.delete(\n            projectId=self.project_id, region=self.region, clusterName=self.cluster_name\n        ).execute()\n\n    def submit_job(self, job_details):\n        return self.dataproc_jobs.submit(\n            projectId=self.project_id, region=self.region, body=job_details\n        ).execute()\n\n    def get_job(self, job_id):\n        return self.dataproc_jobs.get(\n            projectId=self.project_id, region=self.region, jobId=job_id\n        ).execute()\n\n    def wait_for_job(self, job_id):\n        """This method polls job status every 5 seconds"""\n        # TODO: Add logging here print('Waiting for job ID {} to finish...'.format(job_id))\n        def iter_fn():\n            # 
See: https://bit.ly/2Lg2tHr\n            result = self.get_job(job_id)\n\n            # Handle exceptions\n            if result["status"]["state"] in {"CANCELLED", "ERROR"}:\n                raise DataprocError("Job error: %s" % str(result["status"]))\n\n            if result["status"]["state"] == "DONE":\n                return True\n\n            return False\n\n        done = DataprocResource._iter_and_sleep_until_ready(iter_fn)\n        if not done:\n            job = self.get_job(job_id)\n            raise DataprocError("Job run timed out: %s" % str(job["status"]))\n\n    @staticmethod\n    def _iter_and_sleep_until_ready(\n        callable_fn, max_wait_time_sec=TWENTY_MINUTES, iter_time=DEFAULT_ITER_TIME_SEC\n    ):\n        """Iterates and sleeps until callable_fn returns True"""\n        # Wait for cluster ready state\n        ready, curr_iter = False, 0\n        max_iter = max_wait_time_sec / iter_time\n        while not ready and curr_iter < max_iter:\n            ready = callable_fn()\n            time.sleep(iter_time)\n            curr_iter += 1\n\n        # Returns False if it ran up to max_iter without success\n        return ready\n\n    @contextmanager\n    def cluster_context_manager(self):\n        """This context manager gives syntactic sugar so you can run:\n\n        with context.resources.dataproc.cluster_context_manager() as cluster:\n            # do stuff...\n        """\n        self.create_cluster()\n        try:\n            yield self\n        finally:\n            self.delete_cluster()\n\n\n
[docs]@resource(\n config_schema=define_dataproc_create_cluster_config(),\n description="Manage a Dataproc cluster resource",\n)\ndef dataproc_resource(context):\n return DataprocResource(context.resource_config)
\n
", "current_page_name": "_modules/dagster_gcp/dataproc/resources", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_gcp.dataproc.resources"}}, "gcs": {"file_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_gcp.gcs.file_manager

\nimport io\nimport uuid\nfrom contextlib import contextmanager\n\nfrom google.cloud import storage  # type: ignore\n\nfrom dagster import check, usable_as_dagster_type\nfrom dagster.core.storage.file_manager import (\n    FileHandle,\n    FileManager,\n    TempfileManager,\n    check_file_like_obj,\n)\n\n\n
[docs]@usable_as_dagster_type\nclass GCSFileHandle(FileHandle):\n """A reference to a file on GCS."""\n\n def __init__(self, gcs_bucket: str, gcs_key: str):\n self._gcs_bucket = check.str_param(gcs_bucket, "gcs_bucket")\n self._gcs_key = check.str_param(gcs_key, "gcs_key")\n\n @property\n def gcs_bucket(self) -> str:\n """str: The name of the GCS bucket."""\n return self._gcs_bucket\n\n @property\n def gcs_key(self) -> str:\n """str: The GCS key."""\n return self._gcs_key\n\n @property\n def path_desc(self) -> str:\n """str: The file's GCS URL."""\n return self.gcs_path\n\n @property\n def gcs_path(self) -> str:\n """str: The file's GCS URL."""\n return "gs://{bucket}/{key}".format(bucket=self.gcs_bucket, key=self.gcs_key)
\n\n\nclass GCSFileManager(FileManager):\n def __init__(self, client, gcs_bucket, gcs_base_key):\n self._client = check.inst_param(client, "client", storage.client.Client)\n self._gcs_bucket = check.str_param(gcs_bucket, "gcs_bucket")\n self._gcs_base_key = check.str_param(gcs_base_key, "gcs_base_key")\n self._local_handle_cache = {}\n self._temp_file_manager = TempfileManager()\n\n def copy_handle_to_local_temp(self, file_handle):\n self._download_if_not_cached(file_handle)\n return self._get_local_path(file_handle)\n\n def _download_if_not_cached(self, file_handle):\n if not self._file_handle_cached(file_handle):\n # instigate download\n temp_file_obj = self._temp_file_manager.tempfile()\n temp_name = temp_file_obj.name\n bucket_obj = self._client.bucket(file_handle.gcs_bucket)\n bucket_obj.blob(file_handle.gcs_key).download_to_file(temp_file_obj)\n self._local_handle_cache[file_handle.gcs_path] = temp_name\n\n return file_handle\n\n @contextmanager\n def read(self, file_handle, mode="rb"):\n check.inst_param(file_handle, "file_handle", GCSFileHandle)\n check.str_param(mode, "mode")\n check.param_invariant(mode in {"r", "rb"}, "mode")\n\n self._download_if_not_cached(file_handle)\n\n with open(self._get_local_path(file_handle), mode) as file_obj:\n yield file_obj\n\n def _file_handle_cached(self, file_handle):\n return file_handle.gcs_path in self._local_handle_cache\n\n def _get_local_path(self, file_handle):\n return self._local_handle_cache[file_handle.gcs_path]\n\n def read_data(self, file_handle):\n with self.read(file_handle, mode="rb") as file_obj:\n return file_obj.read()\n\n def write_data(self, data, ext=None):\n check.inst_param(data, "data", bytes)\n return self.write(io.BytesIO(data), mode="wb", ext=ext)\n\n def write(self, file_obj, mode="wb", ext=None):\n check_file_like_obj(file_obj)\n gcs_key = self.get_full_key(str(uuid.uuid4()) + (("." + ext) if ext is not None else ""))\n bucket_obj = self._client.bucket(self._gcs_bucket)\n bucket_obj.blob(gcs_key).upload_from_file(file_obj)\n return GCSFileHandle(self._gcs_bucket, gcs_key)\n\n def get_full_key(self, file_key):\n return "{base_key}/{file_key}".format(base_key=self._gcs_base_key, file_key=file_key)\n\n def delete_local_temp(self):\n self._temp_file_manager.close()\n
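A minimal sketch of the GCSFileManager round trip through ``GCSFileHandle``, using the ``gcs_file_manager`` resource defined later in this package; the resource key and bucket name are placeholders chosen for this example:

.. code-block:: python

    from dagster import job, op
    from dagster_gcp.gcs.resources import gcs_file_manager

    @op(required_resource_keys={"file_manager"})
    def archive_report(context):
        # write_data uploads the bytes under "<gcs_prefix>/<uuid>" and returns a GCSFileHandle
        handle = context.resources.file_manager.write_data(b"hello, GCS")
        context.log.info(f"wrote {handle.gcs_path}")
        # read_data downloads the blob to a cached local temp file and returns its bytes
        return context.resources.file_manager.read_data(handle)

    @job(resource_defs={"file_manager": gcs_file_manager.configured({"gcs_bucket": "my-bucket"})})
    def archive_job():
        archive_report()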
", "current_page_name": "_modules/dagster_gcp/gcs/file_manager", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_gcp.gcs.file_manager"}, "io_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_gcp.gcs.io_manager

\nimport pickle\n\nfrom google.api_core.exceptions import Forbidden, TooManyRequests\nfrom google.cloud import storage  # type: ignore\n\nfrom dagster import Field, IOManager, StringSource, check, io_manager\nfrom dagster.utils import PICKLE_PROTOCOL\nfrom dagster.utils.backoff import backoff\n\nDEFAULT_LEASE_DURATION = 60  # One minute\n\n\nclass PickledObjectGCSIOManager(IOManager):\n    def __init__(self, bucket, client=None, prefix="dagster"):\n        self.bucket = check.str_param(bucket, "bucket")\n        self.client = client or storage.Client()\n        self.bucket_obj = self.client.bucket(bucket)\n        check.invariant(self.bucket_obj.exists())\n        self.prefix = check.str_param(prefix, "prefix")\n\n    def _get_path(self, context):\n        parts = context.get_output_identifier()\n        run_id = parts[0]\n        output_parts = parts[1:]\n        return "/".join([self.prefix, "storage", run_id, "files", *output_parts])\n\n    def _rm_object(self, key):\n        check.str_param(key, "key")\n        check.param_invariant(len(key) > 0, "key")\n\n        if self.bucket_obj.blob(key).exists():\n            self.bucket_obj.blob(key).delete()\n\n    def _has_object(self, key):\n        check.str_param(key, "key")\n        check.param_invariant(len(key) > 0, "key")\n        blobs = self.client.list_blobs(self.bucket, prefix=key)\n        return len(list(blobs)) > 0\n\n    def _uri_for_key(self, key):\n        check.str_param(key, "key")\n        return "gs://" + self.bucket + "/" + "{key}".format(key=key)\n\n    def load_input(self, context):\n        key = self._get_path(context.upstream_output)\n        context.log.debug(f"Loading GCS object from: {self._uri_for_key(key)}")\n\n        bytes_obj = self.bucket_obj.blob(key).download_as_bytes()\n        obj = pickle.loads(bytes_obj)\n\n        return obj\n\n    def handle_output(self, context, obj):\n        key = self._get_path(context)\n        context.log.debug(f"Writing GCS object at: {self._uri_for_key(key)}")\n\n        if self._has_object(key):\n            context.log.warning(f"Removing existing GCS key: {key}")\n            self._rm_object(key)\n\n        pickled_obj = pickle.dumps(obj, PICKLE_PROTOCOL)\n\n        backoff(\n            self.bucket_obj.blob(key).upload_from_string,\n            args=[pickled_obj],\n            retry_on=(TooManyRequests, Forbidden),\n        )\n\n\n
[docs]@io_manager(\n config_schema={\n "gcs_bucket": Field(StringSource),\n "gcs_prefix": Field(StringSource, is_required=False, default_value="dagster"),\n },\n required_resource_keys={"gcs"},\n)\ndef gcs_pickle_io_manager(init_context):\n """Persistent IO manager using GCS for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for GCS and the backing bucket.\n\n Attach this resource definition to your job to make it available to your ops.\n\n .. code-block:: python\n\n @job(resource_defs={'io_manager': gcs_pickle_io_manager, 'gcs': gcs_resource, ...})\n def my_job():\n my_op()\n\n You may configure this storage as follows:\n\n .. code-block:: YAML\n\n resources:\n io_manager:\n config:\n gcs_bucket: my-cool-bucket\n gcs_prefix: good/prefix-for-files-\n """\n client = init_context.resources.gcs\n pickled_io_manager = PickledObjectGCSIOManager(\n init_context.resource_config["gcs_bucket"],\n client,\n init_context.resource_config["gcs_prefix"],\n )\n return pickled_io_manager
\n\n\nclass PickledObjectGCSAssetIOManager(PickledObjectGCSIOManager):\n def _get_path(self, context):\n return "/".join([self.prefix, *context.asset_key.path])\n\n\n
[docs]@io_manager(\n config_schema={\n "gcs_bucket": Field(StringSource),\n "gcs_prefix": Field(StringSource, is_required=False, default_value="dagster"),\n },\n required_resource_keys={"gcs"},\n)\ndef gcs_pickle_asset_io_manager(init_context):\n """Persistent IO manager using GCS for storage, meant for use with software-defined assets.\n\n Each asset is assigned to a single filesystem path, so subsequent materializations of an asset\n will overwrite previous materializations of that asset.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for GCS and the backing bucket.\n\n Attach this resource definition to your job to make it available to your ops.\n\n .. code-block:: python\n\n asset_group = AssetGroup(\n assets...,\n resource_defs={'io_manager': gcs_pickle_asset_io_manager, "gcs": gcs_resource, ...}),\n )\n\n You may configure this IO manager as follows:\n\n .. code-block:: YAML\n\n resources:\n io_manager:\n config:\n gcs_bucket: my-cool-bucket\n gcs_prefix: good/prefix-for-files-\n """\n client = init_context.resources.gcs\n pickled_io_manager = PickledObjectGCSAssetIOManager(\n init_context.resource_config["gcs_bucket"],\n client,\n init_context.resource_config["gcs_prefix"],\n )\n return pickled_io_manager
\n
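The docstrings above show YAML-based configuration; the same IO manager can also be configured in code. A minimal sketch (the bucket and prefix reuse the placeholder values from the docstring):

.. code-block:: python

    from dagster import job, op
    from dagster_gcp.gcs.io_manager import gcs_pickle_io_manager
    from dagster_gcp.gcs.resources import gcs_resource

    @op
    def produce_number():
        return 42

    @op
    def double(x):
        # x is unpickled from GCS by the IO manager between the two ops
        return 2 * x

    @job(
        resource_defs={
            "gcs": gcs_resource,
            "io_manager": gcs_pickle_io_manager.configured(
                {"gcs_bucket": "my-cool-bucket", "gcs_prefix": "good/prefix-for-files-"}
            ),
        }
    )
    def pickle_io_job():
        double(produce_number())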
", "current_page_name": "_modules/dagster_gcp/gcs/io_manager", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_gcp.gcs.io_manager"}, "resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_gcp.gcs.resources

\nfrom google.cloud import storage  # type: ignore\n\nfrom dagster import Field, Noneable, StringSource, resource\nfrom dagster.utils.merger import merge_dicts\n\nfrom .file_manager import GCSFileManager\n\nGCS_CLIENT_CONFIG = {\n    "project": Field(Noneable(StringSource), is_required=False, description="Project name")\n}\n\n\n
[docs]@resource(\n GCS_CLIENT_CONFIG,\n description="This resource provides a GCS client",\n)\ndef gcs_resource(init_context):\n return _gcs_client_from_config(init_context.resource_config)
\n\n\n
[docs]@resource(\n merge_dicts(\n GCS_CLIENT_CONFIG,\n {\n "gcs_bucket": Field(StringSource),\n "gcs_prefix": Field(StringSource, is_required=False, default_value="dagster"),\n },\n )\n)\ndef gcs_file_manager(context):\n """FileManager that provides abstract access to GCS.\n\n Implements the :py:class:`~dagster.core.storage.file_manager.FileManager` API.\n """\n gcs_client = _gcs_client_from_config(context.resource_config)\n return GCSFileManager(\n client=gcs_client,\n gcs_bucket=context.resource_config["gcs_bucket"],\n gcs_base_key=context.resource_config["gcs_prefix"],\n )
\n\n\ndef _gcs_client_from_config(config):\n """\n Args:\n config: A configuration containing the fields in GCS_CLIENT_CONFIG.\n\n Returns: A GCS client.\n """\n project = config.get("project", None)\n return storage.client.Client(project=project)\n
", "current_page_name": "_modules/dagster_gcp/gcs/resources", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_gcp.gcs.resources"}}}, "dagster_ge": {"factory": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_ge.factory

\nimport datetime\n\nimport great_expectations as ge\nfrom dagster_pandas import DataFrame\nfrom great_expectations.render.renderer import ValidationResultsPageRenderer\nfrom great_expectations.render.view import DefaultMarkdownPageView\n\nfrom dagster import (\n    ExpectationResult,\n    InputDefinition,\n    MetadataEntry,\n    MetadataValue,\n    Noneable,\n    Output,\n    OutputDefinition,\n    StringSource,\n    check,\n    op,\n    resource,\n    solid,\n)\n\ntry:\n    # ge < v0.13.0\n    from great_expectations.core import convert_to_json_serializable\nexcept ImportError:\n    # ge >= v0.13.0\n    from great_expectations.core.util import convert_to_json_serializable\n\n\n@resource(config_schema={"ge_root_dir": Noneable(StringSource)})\ndef ge_data_context(context):\n    if context.resource_config["ge_root_dir"] is None:\n        yield ge.data_context.DataContext()\n    else:\n        yield ge.data_context.DataContext(context_root_dir=context.resource_config["ge_root_dir"])\n\n\ndef core_ge_validation_factory(\n    dagster_decorator,\n    decorator_name,\n    name,\n    datasource_name,\n    suite_name,\n    validation_operator_name=None,\n    input_dagster_type=DataFrame,\n    batch_kwargs=None,\n):\n    check.str_param(datasource_name, "datasource_name")\n    check.str_param(suite_name, "suite_name")\n    check.opt_str_param(validation_operator_name, "validation_operator_name")\n    batch_kwargs = check.opt_dict_param(batch_kwargs, "batch_kwargs")\n\n    @dagster_decorator(\n        name=name,\n        input_defs=[InputDefinition("dataset", input_dagster_type)],\n        output_defs=[\n            OutputDefinition(\n                dagster_type=dict,\n                description=f"""\n        This {decorator_name} yields an expectationResult with a structured dict of metadata from\n        the GE suite, as well as the full result in case a user wants to process it differently.\n        The structured dict contains both summary stats from the suite as well as expectation by\n        expectation results/details.\n        """,\n            )\n        ],\n        required_resource_keys={"ge_data_context"},\n        tags={"kind": "ge"},\n    )\n    def _ge_validation_fn(context, dataset):\n        data_context = context.resources.ge_data_context\n        if validation_operator_name is not None:\n            validation_operator = validation_operator_name\n        else:\n            data_context.add_validation_operator(\n                "ephemeral_validation",\n                {"class_name": "ActionListValidationOperator", "action_list": []},\n            )\n            validation_operator = "ephemeral_validation"\n        suite = data_context.get_expectation_suite(suite_name)\n        final_batch_kwargs = batch_kwargs or {"dataset": dataset}\n        if "datasource" in batch_kwargs:\n            context.log.warning(\n                "`datasource` field of `batch_kwargs` will be ignored; use the `datasource_name` "\n                f"parameter of the {decorator_name} factory instead."\n            )\n        final_batch_kwargs["datasource"] = datasource_name\n        batch = data_context.get_batch(final_batch_kwargs, suite)\n        run_id = {\n            "run_name": datasource_name + " run",\n            "run_time": datetime.datetime.utcnow(),\n        }\n        results = data_context.run_validation_operator(\n            validation_operator, assets_to_validate=[batch], run_id=run_id\n        )\n        res = convert_to_json_serializable(results.list_validation_results())[0]\n        
validation_results_page_renderer = ValidationResultsPageRenderer(run_info_at_end=True)\n        rendered_document_content_list = (\n            validation_results_page_renderer.render_validation_operator_result(results)\n        )\n        md_str = " ".join(DefaultMarkdownPageView().render(rendered_document_content_list))\n\n        meta_stats = MetadataEntry("Expectation Results", value=MetadataValue.md(md_str))\n        yield ExpectationResult(\n            success=res["success"],\n            metadata_entries=[\n                meta_stats,\n            ],\n        )\n        yield Output(res)\n\n    return _ge_validation_fn\n\n\n
[docs]def ge_validation_solid_factory(\n name,\n datasource_name,\n suite_name,\n validation_operator_name=None,\n input_dagster_type=DataFrame,\n batch_kwargs=None,\n):\n """Generates solids for interacting with GE.\n\n Args:\n name (str): the name of the solid\n datasource_name (str): the name of your DataSource, see your great_expectations.yml\n suite_name (str): the name of your expectation suite, see your great_expectations.yml\n validation_operator_name (Optional[str]): what validation operator to run -- defaults to None,\n which generates an ephemeral validator.\n If you want to save data docs, use 'action_list_operator'.\n See https://docs.greatexpectations.io/en/latest/reference/core_concepts/validation_operators_and_actions.html\n input_dagster_type (DagsterType): the Dagster type used to type check the input to the\n solid. Defaults to `dagster_pandas.DataFrame`.\n batch_kwargs (Optional[dict]): overrides the `batch_kwargs` parameter when calling the\n `ge_data_context`'s `get_batch` method. Defaults to `{"dataset": dataset}`,\n where `dataset` is the input to the generated solid.\n Returns:\n A solid that takes in a set of data and yields both an expectation with relevant metadata\n and an output with all the metadata (for user processing)\n """\n\n return core_ge_validation_factory(\n solid,\n "solid",\n name,\n datasource_name,\n suite_name,\n validation_operator_name,\n input_dagster_type,\n batch_kwargs,\n )
\n\n\n
[docs]def ge_validation_op_factory(\n name,\n datasource_name,\n suite_name,\n validation_operator_name=None,\n input_dagster_type=DataFrame,\n batch_kwargs=None,\n):\n """Generates ops for interacting with GE.\n\n Args:\n name (str): the name of the op\n datasource_name (str): the name of your DataSource, see your great_expectations.yml\n suite_name (str): the name of your expectation suite, see your great_expectations.yml\n validation_operator_name (Optional[str]): what validation operator to run -- defaults to\n None, which generates an ephemeral validator. If you want to save data docs, use\n 'action_list_operator'.\n See https://docs.greatexpectations.io/en/latest/reference/core_concepts/validation_operators_and_actions.html\n input_dagster_type (DagsterType): the Dagster type used to type check the input to the op.\n Defaults to `dagster_pandas.DataFrame`.\n batch_kwargs (Optional[dict]): overrides the `batch_kwargs` parameter when calling the\n `ge_data_context`'s `get_batch` method. Defaults to `{"dataset": dataset}`, where\n `dataset` is the input to the generated op.\n Returns:\n An op that takes in a set of data and yields both an expectation with relevant metadata\n and an output with all the metadata (for user processing)\n """\n\n return core_ge_validation_factory(\n op,\n "op",\n name,\n datasource_name,\n suite_name,\n validation_operator_name,\n input_dagster_type,\n batch_kwargs,\n )
\n\n\ndef core_ge_validation_factory_v3(\n dagster_decorator,\n decorator_name,\n name,\n datasource_name,\n data_connector_name,\n data_asset_name,\n suite_name,\n batch_identifiers: dict,\n input_dagster_type=DataFrame,\n runtime_method_type="batch_data",\n extra_kwargs=None,\n):\n check.str_param(datasource_name, "datasource_name")\n check.str_param(data_connector_name, "data_connector_name")\n check.str_param(suite_name, "suite_name")\n\n extra_kwargs = check.opt_dict_param(extra_kwargs, "extra_kwargs")\n\n @dagster_decorator(\n name=name,\n input_defs=[InputDefinition("dataset", input_dagster_type)],\n output_defs=[\n OutputDefinition(\n dagster_type=dict,\n description=f"""\n This {decorator_name} yields an ExpectationResult with a structured dict of metadata from\n the GE suite, as well as the full result in case a user wants to process it differently.\n The structured dict contains both summary stats from the suite as well as expectation by\n expectation results/details.\n """,\n )\n ],\n required_resource_keys={"ge_data_context"},\n tags={"kind": "ge"},\n )\n def _ge_validation_fn(context, dataset):\n data_context = context.resources.ge_data_context\n validator_kwargs = {\n "datasource_name": datasource_name,\n "data_connector_name": data_connector_name,\n "data_asset_name": datasource_name or data_asset_name,\n "runtime_parameters": {runtime_method_type: dataset},\n "batch_identifiers": batch_identifiers,\n "expectation_suite_name": suite_name,\n **extra_kwargs,\n }\n validator = data_context.get_validator(**validator_kwargs)\n\n run_id = {\n "run_name": datasource_name + " run",\n "run_time": datetime.datetime.utcnow(),\n }\n results = validator.validate(run_id=run_id)\n\n validation_results_page_renderer = ValidationResultsPageRenderer(run_info_at_end=True)\n rendered_document_content_list = validation_results_page_renderer.render(\n validation_results=results\n )\n md_str = "".join(DefaultMarkdownPageView().render(rendered_document_content_list))\n\n meta_stats = MetadataEntry("Expectation Results", value=MetadataValue.md(md_str))\n yield ExpectationResult(\n success=bool(results["success"]),\n metadata_entries=[meta_stats],\n )\n yield Output(results.to_json_dict())\n\n return _ge_validation_fn\n\n\ndef ge_validation_solid_factory_v3(\n name,\n datasource_name,\n data_connector_name,\n data_asset_name,\n suite_name,\n batch_identifiers: dict,\n input_dagster_type=DataFrame,\n runtime_method_type="batch_data",\n extra_kwargs=None,\n):\n """Generates solids for interacting with GE (v3 API)\n\n Args:\n name (str): the name of the solid\n datasource_name (str): the name of your DataSource, see your great_expectations.yml\n data_connector_name (str): the name of the data connector for this datasource. This should\n point to a RuntimeDataConnector. For information on how to set this up, see:\n https://docs.greatexpectations.io/docs/guides/connecting_to_your_data/how_to_create_a_batch_of_data_from_an_in_memory_spark_or_pandas_dataframe\n data_asset_name (str): the name of the data asset that this solid will be validating.\n suite_name (str): the name of your expectation suite, see your great_expectations.yml\n batch_identifier_fn (dict): A dicitonary of batch identifiers to uniquely identify this\n batch of data. To learn more about batch identifiers, see:\n https://docs.greatexpectations.io/docs/reference/datasources#batches.\n input_dagster_type (DagsterType): the Dagster type used to type check the input to the\n solid. 
Defaults to `dagster_pandas.DataFrame`.\n runtime_method_type (str): how GE should interperet the solid input. One of ("batch_data",\n "path", "query"). Defaults to "batch_data", which will interperet the input as an in-memory\n object.\n extra_kwargs (Optional[dict]): adds extra kwargs to the invocation of `ge_data_context`'s\n `get_validator` method. If not set, input will be:\n {\n "datasource_name": datasource_name,\n "data_connector_name": data_connector_name,\n "data_asset_name": data_asset_name,\n "runtime_parameters": {\n "<runtime_method_type>": <solid input>\n },\n "batch_identifiers": batch_identifiers,\n "expectation_suite_name": suite_name,\n }\n\n Returns:\n A solid that takes in a set of data and yields both an expectation with relevant metadata\n and an output with all the metadata (for user processing)\n\n """\n return core_ge_validation_factory_v3(\n solid,\n "solid",\n name,\n datasource_name,\n data_connector_name,\n data_asset_name,\n suite_name,\n batch_identifiers,\n input_dagster_type,\n runtime_method_type,\n extra_kwargs,\n )\n\n\ndef ge_validation_op_factory_v3(\n name,\n datasource_name,\n data_connector_name,\n data_asset_name,\n suite_name,\n batch_identifiers: dict,\n input_dagster_type=DataFrame,\n runtime_method_type="batch_data",\n extra_kwargs=None,\n):\n """Generates ops for interacting with GE (v3 API)\n\n Args:\n name (str): the name of the op\n datasource_name (str): the name of your DataSource, see your great_expectations.yml\n data_connector_name (str): the name of the data connector for this datasource. This should\n point to a RuntimeDataConnector. For information on how to set this up, see:\n https://docs.greatexpectations.io/docs/guides/connecting_to_your_data/how_to_create_a_batch_of_data_from_an_in_memory_spark_or_pandas_dataframe\n data_asset_name (str): the name of the data asset that this op will be validating.\n suite_name (str): the name of your expectation suite, see your great_expectations.yml\n batch_identifier_fn (dict): A dicitonary of batch identifiers to uniquely identify this\n batch of data. To learn more about batch identifiers, see:\n https://docs.greatexpectations.io/docs/reference/datasources#batches.\n input_dagster_type (DagsterType): the Dagster type used to type check the input to the op.\n Defaults to `dagster_pandas.DataFrame`.\n runtime_method_type (str): how GE should interperet the solid input. One of ("batch_data",\n "path", "query"). Defaults to "batch_data", which will interperet the input as an\n in-memory object.\n extra_kwargs (Optional[dict]): adds extra kwargs to the invocation of `ge_data_context`'s\n `get_validator` method. If not set, input will be:\n {\n "datasource_name": datasource_name,\n "data_connector_name": data_connector_name,\n "data_asset_name": data_asset_name,\n "runtime_parameters": {\n "<runtime_method_type>": <op input>\n },\n "batch_identifiers": batch_identifiers,\n "expectation_suite_name": suite_name,\n }\n\n Returns:\n An op that takes in a set of data and yields both an expectation with relevant metadata and\n an output with all the metadata (for user processing)\n\n """\n return core_ge_validation_factory_v3(\n op,\n "op",\n name,\n datasource_name,\n data_connector_name,\n data_asset_name,\n suite_name,\n batch_identifiers,\n input_dagster_type,\n runtime_method_type,\n extra_kwargs,\n )\n
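A minimal sketch of using the (pre-v3) op factory together with the ``ge_data_context`` resource defined above; the datasource, suite, and root-dir values are placeholders for a hypothetical great_expectations project:

.. code-block:: python

    import pandas as pd

    from dagster import job, op
    from dagster_ge.factory import ge_data_context, ge_validation_op_factory

    payroll_expectations = ge_validation_op_factory(
        name="payroll_expectations",
        datasource_name="getest",       # placeholder datasource from great_expectations.yml
        suite_name="basic.warning",     # placeholder expectation suite
    )

    @op
    def load_payroll():
        return pd.DataFrame({"amount": [100, 200, 300]})

    @job(
        resource_defs={
            "ge_data_context": ge_data_context.configured(
                {"ge_root_dir": "./great_expectations"}  # placeholder project root
            )
        }
    )
    def payroll_validation_job():
        payroll_expectations(load_payroll())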
", "current_page_name": "_modules/dagster_ge/factory", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_ge.factory"}}, "dagster_github": {"resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_github.resources

\nimport time\nfrom datetime import datetime\n\nimport jwt\nimport requests\n\nfrom dagster import Field, IntSource, StringSource, resource\n\n\ndef to_seconds(dt):\n    return (dt - datetime(1970, 1, 1)).total_seconds()\n\n\nclass GithubResource:\n    def __init__(self, client, app_id, app_private_rsa_key, default_installation_id, hostname=None):\n        self.client = client\n        self.app_private_rsa_key = app_private_rsa_key\n        self.app_id = app_id\n        self.default_installation_id = default_installation_id\n        self.installation_tokens = {}\n        self.app_token = {}\n        self.hostname = hostname\n\n    def __set_app_token(self):\n        # from https://developer.github.com/apps/building-github-apps/authenticating-with-github-apps/\n        # needing to self-sign a JWT\n        now = int(time.time())\n        # JWT expiration time (10 minute maximum)\n        expires = now + (10 * 60)\n        encoded_token = jwt.encode(\n            {\n                # issued at time\n                "iat": now,\n                # JWT expiration time\n                "exp": expires,\n                # GitHub App's identifier\n                "iss": self.app_id,\n            },\n            self.app_private_rsa_key,\n            algorithm="RS256",\n        )\n        self.app_token = {\n            "value": encoded_token,\n            "expires": expires,\n        }\n\n    def __check_app_token(self):\n        if ("expires" not in self.app_token) or (\n            self.app_token["expires"] < (int(time.time()) + 60)\n        ):\n            self.__set_app_token()\n\n    def get_installations(self, headers=None):\n        if headers is None:\n            headers = {}\n        self.__check_app_token()\n        headers["Authorization"] = "Bearer {}".format(self.app_token["value"])\n        headers["Accept"] = "application/vnd.github.machine-man-preview+json"\n        request = self.client.get(\n            "https://api.github.com/app/installations"\n            if self.hostname is None\n            else "https://{}/api/v3/app/installations".format(self.hostname),\n            headers=headers,\n        )\n        request.raise_for_status()\n        return request.json()\n\n    def __set_installation_token(self, installation_id, headers=None):\n        if headers is None:\n            headers = {}\n        self.__check_app_token()\n        headers["Authorization"] = "Bearer {}".format(self.app_token["value"])\n        headers["Accept"] = "application/vnd.github.machine-man-preview+json"\n        request = requests.post(\n            "https://api.github.com/app/installations/{}/access_tokens".format(installation_id)\n            if self.hostname is None\n            else "https://{}/api/v3/app/installations/{}/access_tokens".format(\n                self.hostname, installation_id\n            ),\n            headers=headers,\n        )\n        request.raise_for_status()\n        auth = request.json()\n        self.installation_tokens[installation_id] = {\n            "value": auth["token"],\n            "expires": to_seconds(datetime.strptime(auth["expires_at"], "%Y-%m-%dT%H:%M:%SZ")),\n        }\n\n    def __check_installation_tokens(self, installation_id):\n        if (installation_id not in self.installation_tokens) or (\n            self.installation_tokens[installation_id]["expires"] < (int(time.time()) + 60)\n        ):\n            self.__set_installation_token(installation_id)\n\n    def execute(self, query, variables, headers=None, installation_id=None):\n        if headers 
is None:\n            headers = {}\n        if installation_id is None:\n            installation_id = self.default_installation_id\n        self.__check_installation_tokens(installation_id)\n        headers["Authorization"] = "token {}".format(\n            self.installation_tokens[installation_id]["value"]\n        )\n        request = requests.post(\n            "https://api.github.com/graphql"\n            if self.hostname is None\n            else "https://{}/api/graphql".format(self.hostname),\n            json={"query": query, "variables": variables},\n            headers=headers,\n        )\n        request.raise_for_status()\n        return request.json()\n\n    def create_issue(self, repo_name, repo_owner, title, body, installation_id=None):\n        if installation_id is None:\n            installation_id = self.default_installation_id\n        res = self.execute(\n            query="""\n            query get_repo_id($repo_name: String!, $repo_owner: String!) {\n                repository(name: $repo_name, owner: $repo_owner) {\n                    id\n                }\n            }\n            """,\n            variables={"repo_name": repo_name, "repo_owner": repo_owner},\n            installation_id=installation_id,\n        )\n\n        return self.execute(\n            query="""\n                mutation CreateIssue($id: ID!, $title: String!, $body: String!) {\n                createIssue(input: {\n                    repositoryId: $id,\n                    title: $title,\n                    body: $body\n                }) {\n                    clientMutationId,\n                    issue {\n                        body\n                        title\n                        url\n                    }\n                }\n                }\n            """,\n            variables={\n                "id": res["data"]["repository"]["id"],\n                "title": title,\n                "body": body,\n            },\n            installation_id=installation_id,\n        )\n\n\n
[docs]@resource(\n config_schema={\n "github_app_id": Field(\n IntSource,\n description="Github Application ID, for more info see https://developer.github.com/apps/",\n ),\n "github_app_private_rsa_key": Field(\n StringSource,\n description="Github Application Private RSA key text, for more info see https://developer.github.com/apps/",\n ),\n "github_installation_id": Field(\n IntSource,\n is_required=False,\n description="Github Application Installation ID, for more info see https://developer.github.com/apps/",\n ),\n "github_hostname": Field(\n StringSource,\n is_required=False,\n description="Github hostname. Defaults to `api.github.com`, for more info see https://developer.github.com/apps/",\n ),\n },\n description="This resource is for connecting to Github",\n)\ndef github_resource(context):\n return GithubResource(\n client=requests.Session(),\n app_id=context.resource_config["github_app_id"],\n app_private_rsa_key=context.resource_config["github_app_private_rsa_key"],\n default_installation_id=context.resource_config["github_installation_id"],\n hostname=context.resource_config.get("github_hostname", None),\n )
\n
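A minimal sketch of calling ``GithubResource.create_issue`` from an op wired to the ``github_resource`` above; the repository names are placeholders, and credentials are supplied through run config (StringSource/IntSource fields also accept ``{env: ...}``):

.. code-block:: python

    from dagster import job, op
    from dagster_github.resources import github_resource

    @op(required_resource_keys={"github"})
    def file_tracking_issue(context):
        context.resources.github.create_issue(
            repo_name="my-repo",    # placeholder repository
            repo_owner="my-org",    # placeholder owner/org
            title="Nightly run failed",
            body="Opened automatically from a Dagster op.",
        )

    @job(resource_defs={"github": github_resource})
    def github_issue_job():
        file_tracking_issue()

    # Example run config (values are placeholders):
    #
    # resources:
    #   github:
    #     config:
    #       github_app_id: 12345
    #       github_app_private_rsa_key: {env: GITHUB_PRIVATE_KEY}
    #       github_installation_id: 67890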
", "current_page_name": "_modules/dagster_github/resources", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_github.resources"}}, "dagster_graphql": {"client": {"client": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_graphql.client.client

\nfrom itertools import chain\nfrom typing import Any, Dict, Iterable, List, Optional\n\nimport requests.exceptions\nfrom gql import Client, gql\nfrom gql.transport import Transport\nfrom gql.transport.requests import RequestsHTTPTransport\n\nfrom dagster import check\nfrom dagster.core.definitions.utils import validate_tags\nfrom dagster.core.storage.pipeline_run import PipelineRunStatus\nfrom dagster.utils.backcompat import experimental_class_warning\n\nfrom .client_queries import (\n    CLIENT_GET_REPO_LOCATIONS_NAMES_AND_PIPELINES_QUERY,\n    CLIENT_SUBMIT_PIPELINE_RUN_MUTATION,\n    GET_PIPELINE_RUN_STATUS_QUERY,\n    RELOAD_REPOSITORY_LOCATION_MUTATION,\n    SHUTDOWN_REPOSITORY_LOCATION_MUTATION,\n)\nfrom .utils import (\n    DagsterGraphQLClientError,\n    InvalidOutputErrorInfo,\n    PipelineInfo,\n    ReloadRepositoryLocationInfo,\n    ReloadRepositoryLocationStatus,\n    ShutdownRepositoryLocationInfo,\n    ShutdownRepositoryLocationStatus,\n)\n\n\n
[docs]class DagsterGraphQLClient:\n """Official Dagster Python Client for GraphQL\n\n Utilizes the gql library to dispatch queries over HTTP to a remote Dagster GraphQL Server\n\n As of now, all operations on this client are synchronous.\n\n Intended usage:\n\n .. code-block:: python\n\n client = DagsterGraphQLClient("localhost", port_number=3000)\n status = client.get_run_status(**SOME_RUN_ID**)\n\n Args:\n hostname (str): Hostname for the Dagster GraphQL API, like `localhost` or\n `dagit.dagster.YOUR_ORG_HERE`.\n port_number (Optional[int], optional): Optional port number to connect to on the host.\n Defaults to None.\n transport (Optional[Transport], optional): A custom transport to use to connect to the\n GraphQL API with (e.g. for custom auth). Defaults to None.\n use_https (bool, optional): Whether to use https in the URL connection string for the\n GraphQL API. Defaults to False.\n\n Raises:\n :py:class:`~requests.exceptions.ConnectionError`: if the client cannot connect to the host.\n """\n\n def __init__(\n self,\n hostname: str,\n port_number: Optional[int] = None,\n transport: Optional[Transport] = None,\n use_https: bool = False,\n ):\n experimental_class_warning(self.__class__.__name__)\n\n self._hostname = check.str_param(hostname, "hostname")\n self._port_number = check.opt_int_param(port_number, "port_number")\n self._use_https = check.bool_param(use_https, "use_https")\n\n self._url = (\n ("https://" if self._use_https else "http://")\n + (f"{self._hostname}:{self._port_number}" if self._port_number else self._hostname)\n + "/graphql"\n )\n\n self._transport = check.opt_inst_param(\n transport,\n "transport",\n Transport,\n default=RequestsHTTPTransport(url=self._url, use_json=True),\n )\n try:\n self._client = Client(transport=self._transport, fetch_schema_from_transport=True)\n except requests.exceptions.ConnectionError as exc:\n raise DagsterGraphQLClientError(\n f"Error when connecting to url {self._url}. 
"\n + f"Did you specify hostname: {self._hostname} "\n + (f"and port_number: {self._port_number} " if self._port_number else "")\n + "correctly?"\n ) from exc\n\n def _execute(self, query: str, variables: Optional[Dict[str, Any]] = None):\n try:\n return self._client.execute(gql(query), variable_values=variables)\n except Exception as exc: # catch generic Exception from the gql client\n raise DagsterGraphQLClientError(\n f"Exception occured during execution of query \\n{query}\\n with variables \\n{variables}\\n"\n ) from exc\n\n def _get_repo_locations_and_names_with_pipeline(self, pipeline_name: str) -> List[PipelineInfo]:\n res_data = self._execute(CLIENT_GET_REPO_LOCATIONS_NAMES_AND_PIPELINES_QUERY)\n query_res = res_data["repositoriesOrError"]\n repo_connection_status = query_res["__typename"]\n if repo_connection_status == "RepositoryConnection":\n valid_nodes: Iterable[PipelineInfo] = chain(\n *map(PipelineInfo.from_node, query_res["nodes"])\n )\n return [info for info in valid_nodes if info.pipeline_name == pipeline_name]\n else:\n raise DagsterGraphQLClientError(repo_connection_status, query_res["message"])\n\n def _core_submit_execution(\n self,\n pipeline_name: str,\n repository_location_name: Optional[str] = None,\n repository_name: Optional[str] = None,\n run_config: Optional[Any] = None,\n mode: Optional[str] = None,\n preset: Optional[str] = None,\n tags: Optional[Dict[str, Any]] = None,\n solid_selection: Optional[List[str]] = None,\n is_using_job_op_graph_apis: Optional[bool] = False,\n ):\n check.opt_str_param(repository_location_name, "repository_location_name")\n check.opt_str_param(repository_name, "repository_name")\n check.str_param(pipeline_name, "pipeline_name")\n check.opt_str_param(mode, "mode")\n check.opt_str_param(preset, "preset")\n run_config = check.opt_dict_param(run_config, "run_config")\n\n # The following invariant will never fail when a job is executed\n check.invariant(\n (mode is not None and run_config is not None) or preset is not None,\n "Either a mode and run_config or a preset must be specified in order to "\n f"submit the pipeline {pipeline_name} for execution",\n )\n tags = validate_tags(tags)\n\n pipeline_or_job = "Job" if is_using_job_op_graph_apis else "Pipeline"\n\n if not repository_location_name or not repository_name:\n pipeline_info_lst = self._get_repo_locations_and_names_with_pipeline(pipeline_name)\n if len(pipeline_info_lst) == 0:\n raise DagsterGraphQLClientError(\n f"{pipeline_or_job}NotFoundError",\n f"No {'jobs' if is_using_job_op_graph_apis else 'pipelines'} with the name `{pipeline_name}` exist",\n )\n elif len(pipeline_info_lst) == 1:\n pipeline_info = pipeline_info_lst[0]\n repository_location_name = pipeline_info.repository_location_name\n repository_name = pipeline_info.repository_name\n else:\n raise DagsterGraphQLClientError(\n "Must specify repository_location_name and repository_name"\n f" since there are multiple {'jobs' if is_using_job_op_graph_apis else 'pipelines'} with the name {pipeline_name}."\n f"\\n\\tchoose one of: {pipeline_info_lst}"\n )\n\n variables: Dict[str, Any] = {\n "executionParams": {\n "selector": {\n "repositoryLocationName": repository_location_name,\n "repositoryName": repository_name,\n "pipelineName": pipeline_name,\n "solidSelection": solid_selection,\n }\n }\n }\n if preset is not None:\n variables["executionParams"]["preset"] = preset\n if mode is not None and run_config is not None:\n variables["executionParams"] = {\n **variables["executionParams"],\n "runConfigData": run_config,\n 
"mode": mode,\n "executionMetadata": {"tags": [{"key": k, "value": v} for k, v in tags.items()]}\n if tags\n else {},\n }\n\n res_data: Dict[str, Any] = self._execute(CLIENT_SUBMIT_PIPELINE_RUN_MUTATION, variables)\n query_result = res_data["launchPipelineExecution"]\n query_result_type = query_result["__typename"]\n if (\n query_result_type == "LaunchRunSuccess"\n or query_result_type == "LaunchPipelineRunSuccess"\n ):\n return query_result["run"]["runId"]\n elif query_result_type == "InvalidStepError":\n raise DagsterGraphQLClientError(query_result_type, query_result["invalidStepKey"])\n elif query_result_type == "InvalidOutputError":\n error_info = InvalidOutputErrorInfo(\n step_key=query_result["stepKey"],\n invalid_output_name=query_result["invalidOutputName"],\n )\n raise DagsterGraphQLClientError(query_result_type, body=error_info)\n elif (\n query_result_type == "RunConfigValidationInvalid"\n or query_result_type == "PipelineConfigValidationInvalid"\n ):\n raise DagsterGraphQLClientError(query_result_type, query_result["errors"])\n else:\n # query_result_type is a ConflictingExecutionParamsError, a PresetNotFoundError\n # a PipelineNotFoundError, a RunConflict, or a PythonError\n raise DagsterGraphQLClientError(query_result_type, query_result["message"])\n\n
[docs] def submit_pipeline_execution(\n self,\n pipeline_name: str,\n repository_location_name: Optional[str] = None,\n repository_name: Optional[str] = None,\n run_config: Optional[Any] = None,\n mode: Optional[str] = None,\n preset: Optional[str] = None,\n tags: Optional[Dict[str, Any]] = None,\n solid_selection: Optional[List[str]] = None,\n ) -> str:\n """Submits a Pipeline with attached configuration for execution.\n\n Args:\n pipeline_name (str): The pipeline's name\n repository_location_name (Optional[str], optional): The name of the repository location where\n the pipeline is located. If omitted, the client will try to infer the repository location\n from the available options on the Dagster deployment. Defaults to None.\n repository_name (Optional[str], optional): The name of the repository where the pipeline is located.\n If omitted, the client will try to infer the repository from the available options\n on the Dagster deployment. Defaults to None.\n run_config (Optional[Any], optional): This is the run config to execute the pipeline with.\n Note that runConfigData is any-typed in the GraphQL type system. This type is used when passing in\n an arbitrary object for run config. However, it must conform to the constraints of the config\n schema for this pipeline. If it does not, the client will throw a DagsterGraphQLClientError with a message of\n RunConfigValidationInvalid. Defaults to None.\n mode (Optional[str], optional): The mode to run the pipeline with. If you have not\n defined any custom modes for your pipeline, the default mode is "default". Defaults to None.\n preset (Optional[str], optional): The name of a pre-defined preset to use instead of a\n run config. Defaults to None.\n tags (Optional[Dict[str, Any]], optional): A set of tags to add to the pipeline execution.\n\n Raises:\n DagsterGraphQLClientError("InvalidStepError", invalid_step_key): the pipeline has an invalid step\n DagsterGraphQLClientError("InvalidOutputError", body=error_object): some solid has an invalid output within the pipeline.\n The error_object is of type dagster_graphql.InvalidOutputErrorInfo.\n DagsterGraphQLClientError("ConflictingExecutionParamsError", invalid_step_key): a preset and a run_config & mode are present\n that conflict with one another\n DagsterGraphQLClientError("PresetNotFoundError", message): if the provided preset name is not found\n DagsterGraphQLClientError("RunConflict", message): a `DagsterRunConflict` occured during execution.\n This indicates that a conflicting pipeline run already exists in run storage.\n DagsterGraphQLClientError("PipelineConfigurationInvalid", invalid_step_key): the run_config is not in the expected format\n for the pipeline\n DagsterGraphQLClientError("PipelineNotFoundError", message): the requested pipeline does not exist\n DagsterGraphQLClientError("PythonError", message): an internal framework error occurred\n\n Returns:\n str: run id of the submitted pipeline run\n """\n return self._core_submit_execution(\n pipeline_name,\n repository_location_name,\n repository_name,\n run_config,\n mode,\n preset,\n tags,\n solid_selection,\n is_using_job_op_graph_apis=False,\n )
\n\n
[docs] def submit_job_execution(\n self,\n job_name: str,\n repository_location_name: Optional[str] = None,\n repository_name: Optional[str] = None,\n run_config: Optional[Dict[str, Any]] = None,\n tags: Optional[Dict[str, Any]] = None,\n op_selection: Optional[List[str]] = None,\n ) -> str:\n """Submits a job with attached configuration for execution.\n\n Args:\n job_name (str): The job's name\n repository_location_name (Optional[str]): The name of the repository location where\n the job is located. If omitted, the client will try to infer the repository location\n from the available options on the Dagster deployment. Defaults to None.\n repository_name (Optional[str]): The name of the repository where the job is located.\n If omitted, the client will try to infer the repository from the available options\n on the Dagster deployment. Defaults to None.\n run_config (Optional[Dict[str, Any]]): This is the run config to execute the job with.\n Note that runConfigData is any-typed in the GraphQL type system. This type is used when passing in\n an arbitrary object for run config. However, it must conform to the constraints of the config\n schema for this job. If it does not, the client will throw a DagsterGraphQLClientError with a message of\n JobConfigValidationInvalid. Defaults to None.\n tags (Optional[Dict[str, Any]]): A set of tags to add to the job execution.\n\n Raises:\n DagsterGraphQLClientError("InvalidStepError", invalid_step_key): the job has an invalid step\n DagsterGraphQLClientError("InvalidOutputError", body=error_object): some solid has an invalid output within the job.\n The error_object is of type dagster_graphql.InvalidOutputErrorInfo.\n DagsterGraphQLClientError("RunConflict", message): a `DagsterRunConflict` occured during execution.\n This indicates that a conflicting job run already exists in run storage.\n DagsterGraphQLClientError("PipelineConfigurationInvalid", invalid_step_key): the run_config is not in the expected format\n for the job\n DagsterGraphQLClientError("JobNotFoundError", message): the requested job does not exist\n DagsterGraphQLClientError("PythonError", message): an internal framework error occurred\n\n Returns:\n str: run id of the submitted pipeline run\n """\n return self._core_submit_execution(\n pipeline_name=job_name,\n repository_location_name=repository_location_name,\n repository_name=repository_name,\n run_config=run_config,\n mode="default",\n preset=None,\n tags=tags,\n solid_selection=op_selection,\n is_using_job_op_graph_apis=True,\n )
\n\n
[docs] def get_run_status(self, run_id: str) -> PipelineRunStatus:\n """Get the status of a given Pipeline Run\n\n Args:\n run_id (str): run id of the requested pipeline run.\n\n Raises:\n DagsterGraphQLClientError("PipelineNotFoundError", message): if the requested run id is not found\n DagsterGraphQLClientError("PythonError", message): on internal framework errors\n\n Returns:\n PipelineRunStatus: returns a status Enum describing the state of the requested pipeline run\n """\n check.str_param(run_id, "run_id")\n\n res_data: Dict[str, Dict[str, Any]] = self._execute(\n GET_PIPELINE_RUN_STATUS_QUERY, {"runId": run_id}\n )\n query_result: Dict[str, Any] = res_data["pipelineRunOrError"]\n query_result_type: str = query_result["__typename"]\n if query_result_type == "PipelineRun" or query_result_type == "Run":\n return PipelineRunStatus(query_result["status"])\n else:\n raise DagsterGraphQLClientError(query_result_type, query_result["message"])
\n\n
[docs] def reload_repository_location(\n self, repository_location_name: str\n ) -> ReloadRepositoryLocationInfo:\n """Reloads a Dagster Repository Location, which reloads all repositories in that repository location.\n\n This is useful in a variety of contexts, including refreshing Dagit without restarting\n the server.\n\n Args:\n repository_location_name (str): The name of the repository location\n\n Returns:\n ReloadRepositoryLocationInfo: Object with information about the result of the reload request\n """\n check.str_param(repository_location_name, "repository_location_name")\n\n res_data: Dict[str, Dict[str, Any]] = self._execute(\n RELOAD_REPOSITORY_LOCATION_MUTATION,\n {"repositoryLocationName": repository_location_name},\n )\n\n query_result: Dict[str, Any] = res_data["reloadRepositoryLocation"]\n query_result_type: str = query_result["__typename"]\n if query_result_type == "WorkspaceLocationEntry":\n location_or_error_type = query_result["locationOrLoadError"]["__typename"]\n if location_or_error_type == "RepositoryLocation":\n return ReloadRepositoryLocationInfo(status=ReloadRepositoryLocationStatus.SUCCESS)\n else:\n return ReloadRepositoryLocationInfo(\n status=ReloadRepositoryLocationStatus.FAILURE,\n failure_type="PythonError",\n message=query_result["locationOrLoadError"]["message"],\n )\n else:\n # query_result_type is either ReloadNotSupported or RepositoryLocationNotFound\n return ReloadRepositoryLocationInfo(\n status=ReloadRepositoryLocationStatus.FAILURE,\n failure_type=query_result_type,\n message=query_result["message"],\n )
\n\n
[docs] def shutdown_repository_location(\n self, repository_location_name: str\n ) -> ShutdownRepositoryLocationInfo:\n """Shuts down the server that is serving metadata for the provided repository location.\n\n This is primarily useful when you want the server to be restarted by the compute environment\n in which it is running (for example, in Kubernetes, the pod in which the server is running\n will automatically restart when the server is shut down, and the repository metadata will\n be reloaded).\n\n Args:\n repository_location_name (str): The name of the repository location\n\n Returns:\n ShutdownRepositoryLocationInfo: Object with information about the result of the shutdown request\n """\n check.str_param(repository_location_name, "repository_location_name")\n\n res_data: Dict[str, Dict[str, Any]] = self._execute(\n SHUTDOWN_REPOSITORY_LOCATION_MUTATION,\n {"repositoryLocationName": repository_location_name},\n )\n\n query_result: Dict[str, Any] = res_data["shutdownRepositoryLocation"]\n query_result_type: str = query_result["__typename"]\n if query_result_type == "ShutdownRepositoryLocationSuccess":\n return ShutdownRepositoryLocationInfo(status=ShutdownRepositoryLocationStatus.SUCCESS)\n elif (\n query_result_type == "RepositoryLocationNotFound" or query_result_type == "PythonError"\n ):\n return ShutdownRepositoryLocationInfo(\n status=ShutdownRepositoryLocationStatus.FAILURE,\n message=query_result["message"],\n )\n else:\n raise Exception(f"Unexpected query result type {query_result_type}")
\n
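As a companion sketch (again, not part of the module source), reloading and then shutting down a repository location might look like the following. The location name ``my_location`` and the client construction are placeholders, and ``ReloadRepositoryLocationStatus`` is assumed to be importable from ``dagster_graphql`` as its client utilities suggest.

.. code-block:: python

    from dagster_graphql import DagsterGraphQLClient, ReloadRepositoryLocationStatus

    client = DagsterGraphQLClient("localhost", port_number=3000)

    reload_info = client.reload_repository_location("my_location")
    if reload_info.status == ReloadRepositoryLocationStatus.FAILURE:
        print(f"Reload failed ({reload_info.failure_type}): {reload_info.message}")

    # shutdown_repository_location asks the code server to exit so that the
    # surrounding environment (e.g. a Kubernetes pod) restarts it
    shutdown_info = client.shutdown_repository_location("my_location")
    if shutdown_info.message:
        print(f"Shutdown reported: {shutdown_info.message}")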
", "current_page_name": "_modules/dagster_graphql/client/client", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_graphql.client.client"}, "utils": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_graphql.client.utils

\nfrom enum import Enum\nfrom typing import Any, Dict, List, NamedTuple, Optional\n\n\n
[docs]class DagsterGraphQLClientError(Exception):\n def __init__(self, *args, body=None):\n super().__init__(*args)\n self.body = body
\n\n\n
[docs]class ReloadRepositoryLocationStatus(Enum):\n """This enum describes the status of a GraphQL mutation to reload a Dagster repository location\n\n Args:\n Enum (str): can be either `ReloadRepositoryLocationStatus.SUCCESS`\n or `ReloadRepositoryLocationStatus.FAILURE`.\n """\n\n SUCCESS = "SUCCESS"\n FAILURE = "FAILURE"
\n\n\nclass ShutdownRepositoryLocationStatus(Enum):\n SUCCESS = "SUCCESS"\n FAILURE = "FAILURE"\n\n\n
[docs]class ReloadRepositoryLocationInfo(NamedTuple):\n """This class gives information about the result of reloading\n a Dagster repository location with a GraphQL mutation.\n\n Args:\n status (ReloadRepositoryLocationStatus): The status of the reload repository location mutation\n failure_type: (Optional[str], optional): the failure type if `status == ReloadRepositoryLocationStatus.FAILURE`.\n Can be one of `ReloadNotSupported`, `RepositoryLocationNotFound`, or `RepositoryLocationLoadFailure`. Defaults to None.\n message (Optional[str], optional): the failure message/reason if\n `status == ReloadRepositoryLocationStatus.FAILURE`. Defaults to None.\n """\n\n status: ReloadRepositoryLocationStatus\n failure_type: Optional[str] = None\n message: Optional[str] = None
\n\n\nclass ShutdownRepositoryLocationInfo(NamedTuple):\n """This class gives information about the result of shutting down the server for\n a Dagster repository location using a GraphQL mutation.\n\n Args:\n status (ShutdownRepositoryLocationStatus) Whether the shutdown succeeded or failed.\n message (Optional[str], optional): the failure message/reason if\n `status == ShutdownRepositoryLocationStatus.FAILURE`. Defaults to None.\n """\n\n status: ShutdownRepositoryLocationStatus\n message: Optional[str] = None\n\n\nclass PipelineInfo(NamedTuple):\n repository_location_name: str\n repository_name: str\n pipeline_name: str\n\n @staticmethod\n def from_node(node: Dict[str, Any]) -> List["PipelineInfo"]:\n repo_name = node["name"]\n repo_location_name = node["location"]["name"]\n return [\n PipelineInfo(\n repository_location_name=repo_location_name,\n repository_name=repo_name,\n pipeline_name=pipeline["name"],\n )\n for pipeline in node["pipelines"]\n ]\n\n\n
[docs]class InvalidOutputErrorInfo(NamedTuple):\n """This class gives information about an InvalidOutputError from submitting a pipeline for execution\n from GraphQL.\n\n Args:\n step_key (str): key of the step that failed\n invalid_output_name (str): the name of the invalid output from the given step\n """\n\n step_key: str\n invalid_output_name: str
\n
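A short, hypothetical sketch of how these classes are typically consumed on the calling side; it assumes the same placeholder client setup as in the client module above, and relies on ``InvalidOutputErrorInfo`` being importable as ``dagster_graphql.InvalidOutputErrorInfo``, as the client docstrings state.

.. code-block:: python

    from dagster_graphql import (
        DagsterGraphQLClient,
        DagsterGraphQLClientError,
        InvalidOutputErrorInfo,
    )

    client = DagsterGraphQLClient("localhost", port_number=3000)
    try:
        client.submit_job_execution("my_job", op_selection=["my_op"])
    except DagsterGraphQLClientError as exc:
        # The structured error payload, when present, is attached as `body`
        if isinstance(exc.body, InvalidOutputErrorInfo):
            print(
                f"Invalid output {exc.body.invalid_output_name} "
                f"on step {exc.body.step_key}"
            )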
", "current_page_name": "_modules/dagster_graphql/client/utils", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_graphql.client.utils"}}}, "dagster_k8s": {"executor": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_k8s.executor

\nimport kubernetes\nfrom dagster_k8s.launcher import K8sRunLauncher\n\nfrom dagster import Field, StringSource, check, executor\nfrom dagster.core.definitions.executor_definition import multiple_process_executor_requirements\nfrom dagster.core.errors import DagsterUnmetExecutorRequirementsError\nfrom dagster.core.events import DagsterEvent, DagsterEventType, EngineEventData, MetadataEntry\nfrom dagster.core.execution.plan.objects import StepFailureData\nfrom dagster.core.execution.retries import RetryMode, get_retries_config\nfrom dagster.core.executor.base import Executor\nfrom dagster.core.executor.init import InitExecutorContext\nfrom dagster.core.executor.step_delegating import StepDelegatingExecutor\nfrom dagster.core.executor.step_delegating.step_handler import StepHandler\nfrom dagster.core.executor.step_delegating.step_handler.base import StepHandlerContext\nfrom dagster.core.types.dagster_type import Optional\nfrom dagster.utils import frozentags, merge_dicts\n\nfrom .job import (\n    DagsterK8sJobConfig,\n    construct_dagster_k8s_job,\n    get_k8s_job_name,\n    get_user_defined_k8s_config,\n)\nfrom .utils import delete_job\n\n\n
[docs]@executor(\n name="k8s",\n config_schema=merge_dicts(\n DagsterK8sJobConfig.config_type_job(),\n {"job_namespace": Field(StringSource, is_required=False)},\n {"retries": get_retries_config()},\n ),\n requirements=multiple_process_executor_requirements(),\n)\ndef k8s_job_executor(init_context: InitExecutorContext) -> Executor:\n """\n Executor which launches steps as Kubernetes Jobs.\n\n To use the `k8s_job_executor`, set it as the `executor_def` when defining a job:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-k8s/dagster_k8s_tests/unit_tests/test_example_executor_mode_def.py\n :start-after: start_marker\n :end-before: end_marker\n :language: python\n\n Then you can configure the executor with run config as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n job_namespace: 'some-namespace'\n image_pull_policy: ...\n image_pull_secrets: ...\n service_account_name: ...\n env_config_maps: ...\n env_secrets: ...\n env_vars: ...\n job_image: ... # leave out if using userDeployments\n\n Configuration set on the Kubernetes Jobs and Pods created by the `K8sRunLauncher` will also be\n set on Kubernetes Jobs and Pods created by the `k8s_job_executor`.\n """\n\n run_launcher = init_context.instance.run_launcher\n if not isinstance(run_launcher, K8sRunLauncher):\n raise DagsterUnmetExecutorRequirementsError(\n "This engine is only compatible with a K8sRunLauncher; configure the "\n "K8sRunLauncher on your instance to use it.",\n )\n\n exc_cfg = init_context.executor_config\n job_config = DagsterK8sJobConfig(\n dagster_home=run_launcher.dagster_home,\n instance_config_map=run_launcher.instance_config_map,\n postgres_password_secret=run_launcher.postgres_password_secret,\n job_image=exc_cfg.get("job_image"),\n image_pull_policy=(\n exc_cfg.get("image_pull_policy")\n if exc_cfg.get("image_pull_policy") != None\n else run_launcher.image_pull_policy\n ),\n image_pull_secrets=run_launcher.image_pull_secrets\n + (exc_cfg.get("image_pull_secrets") or []),\n service_account_name=(\n exc_cfg.get("service_account_name")\n if exc_cfg.get("service_account_name") != None\n else run_launcher.service_account_name\n ),\n env_config_maps=run_launcher.env_config_maps + (exc_cfg.get("env_config_maps") or []),\n env_secrets=run_launcher.env_secrets + (exc_cfg.get("env_secrets") or []),\n env_vars=run_launcher.env_vars + (exc_cfg.get("env_vars") or []),\n volume_mounts=run_launcher.volume_mounts + (exc_cfg.get("volume_mounts") or []),\n volumes=run_launcher.volumes + (exc_cfg.get("volumes") or []),\n labels=merge_dicts(run_launcher.labels, exc_cfg.get("labels", {})),\n )\n\n return StepDelegatingExecutor(\n K8sStepHandler(\n job_config=job_config,\n job_namespace=(\n exc_cfg.get("job_namespace")\n if exc_cfg.get("job_namespace") != None\n else run_launcher.job_namespace\n ),\n load_incluster_config=run_launcher.load_incluster_config,\n kubeconfig_file=run_launcher.kubeconfig_file,\n ),\n retries=RetryMode.from_config(init_context.executor_config["retries"]),\n should_verify_step=True,\n )
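For reference, a minimal sketch of wiring this executor into a job definition; the op and job names are placeholders and are not taken from the literalinclude referenced in the docstring above.

.. code-block:: python

    from dagster import job, op
    from dagster_k8s import k8s_job_executor

    @op
    def do_work():
        return 1

    # Each step of this job is launched as its own Kubernetes Job,
    # assuming the instance is configured with a K8sRunLauncher
    @job(executor_def=k8s_job_executor)
    def k8s_example_job():
        do_work()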
\n\n\nclass K8sStepHandler(StepHandler):\n @property\n def name(self):\n return "K8sStepHandler"\n\n def __init__(\n self,\n job_config: DagsterK8sJobConfig,\n job_namespace: str,\n load_incluster_config: bool,\n kubeconfig_file: Optional[str],\n k8s_client_batch_api=None,\n ):\n super().__init__()\n\n self._job_config = job_config\n self._job_namespace = job_namespace\n self._fixed_k8s_client_batch_api = k8s_client_batch_api\n\n if load_incluster_config:\n check.invariant(\n kubeconfig_file is None,\n "`kubeconfig_file` is set but `load_incluster_config` is True.",\n )\n kubernetes.config.load_incluster_config()\n else:\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n @property\n def _batch_api(self):\n return self._fixed_k8s_client_batch_api or kubernetes.client.BatchV1Api()\n\n def _get_k8s_step_job_name(self, step_handler_context):\n step_key = step_handler_context.execute_step_args.step_keys_to_execute[0]\n\n name_key = get_k8s_job_name(\n step_handler_context.execute_step_args.pipeline_run_id,\n step_key,\n )\n\n if step_handler_context.execute_step_args.known_state:\n retry_state = step_handler_context.execute_step_args.known_state.get_retry_state()\n if retry_state.get_attempt_count(step_key):\n return "dagster-step-%s-%d" % (name_key, retry_state.get_attempt_count(step_key))\n\n return "dagster-step-%s" % (name_key)\n\n def launch_step(self, step_handler_context: StepHandlerContext):\n events = []\n\n assert (\n len(step_handler_context.execute_step_args.step_keys_to_execute) == 1\n ), "Launching multiple steps is not currently supported"\n step_key = step_handler_context.execute_step_args.step_keys_to_execute[0]\n\n job_name = self._get_k8s_step_job_name(step_handler_context)\n pod_name = job_name\n\n args = step_handler_context.execute_step_args.get_command_args()\n\n job_config = self._job_config\n if not job_config.job_image:\n job_config = job_config.with_image(\n step_handler_context.execute_step_args.pipeline_origin.repository_origin.container_image\n )\n\n if not job_config.job_image:\n raise Exception("No image included in either executor config or the job")\n\n user_defined_k8s_config = get_user_defined_k8s_config(\n frozentags(step_handler_context.step_tags[step_key])\n )\n\n job = construct_dagster_k8s_job(\n job_config=job_config,\n args=args,\n job_name=job_name,\n pod_name=pod_name,\n component="step_worker",\n user_defined_k8s_config=user_defined_k8s_config,\n labels={\n "dagster/job": step_handler_context.execute_step_args.pipeline_origin.pipeline_name,\n "dagster/op": step_key,\n "dagster/run-id": step_handler_context.execute_step_args.pipeline_run_id,\n },\n )\n\n events.append(\n DagsterEvent(\n event_type_value=DagsterEventType.ENGINE_EVENT.value,\n pipeline_name=step_handler_context.execute_step_args.pipeline_origin.pipeline_name,\n step_key=step_key,\n message=f"Executing step {step_key} in Kubernetes job {job_name}",\n event_specific_data=EngineEventData(\n [\n MetadataEntry("Step key", value=step_key),\n MetadataEntry("Kubernetes Job name", value=job_name),\n ],\n ),\n )\n )\n\n self._batch_api.create_namespaced_job(body=job, namespace=self._job_namespace)\n\n return events\n\n def check_step_health(self, step_handler_context: StepHandlerContext):\n assert (\n len(step_handler_context.execute_step_args.step_keys_to_execute) == 1\n ), "Launching multiple steps is not currently supported"\n step_key = step_handler_context.execute_step_args.step_keys_to_execute[0]\n\n job_name = 
self._get_k8s_step_job_name(step_handler_context)\n\n job = self._batch_api.read_namespaced_job(namespace=self._job_namespace, name=job_name)\n if job.status.failed:\n return [\n DagsterEvent(\n event_type_value=DagsterEventType.STEP_FAILURE.value,\n pipeline_name=step_handler_context.execute_step_args.pipeline_origin.pipeline_name,\n step_key=step_key,\n message=f"Discovered failed Kubernetes job {job_name} for step {step_key}",\n event_specific_data=StepFailureData(\n error=None,\n user_failure_data=None,\n ),\n )\n ]\n return []\n\n def terminate_step(self, step_handler_context: StepHandlerContext):\n assert (\n len(step_handler_context.execute_step_args.step_keys_to_execute) == 1\n ), "Launching multiple steps is not currently supported"\n\n job_name = self._get_k8s_step_job_name(step_handler_context)\n\n delete_job(job_name=job_name, namespace=self._job_namespace)\n return []\n
", "current_page_name": "_modules/dagster_k8s/executor", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_k8s.executor"}, "launcher": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_k8s.launcher

\nimport sys\n\nimport kubernetes\n\nfrom dagster import Field, MetadataEntry, StringSource, check\nfrom dagster.cli.api import ExecuteRunArgs\nfrom dagster.core.events import EngineEventData\nfrom dagster.core.launcher import LaunchRunContext, ResumeRunContext, RunLauncher\nfrom dagster.core.launcher.base import CheckRunHealthResult, WorkerStatus\nfrom dagster.core.storage.pipeline_run import PipelineRun, PipelineRunStatus\nfrom dagster.core.storage.tags import DOCKER_IMAGE_TAG\nfrom dagster.grpc.types import ResumeRunArgs\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster.utils import frozentags, merge_dicts\nfrom dagster.utils.error import serializable_error_info_from_exc_info\n\nfrom .job import (\n    DagsterK8sJobConfig,\n    construct_dagster_k8s_job,\n    get_job_name_from_run_id,\n    get_user_defined_k8s_config,\n)\nfrom .utils import delete_job\n\n\n
[docs]class K8sRunLauncher(RunLauncher, ConfigurableClass):\n """RunLauncher that starts a Kubernetes Job for each Dagster job run.\n\n Encapsulates each run in a separate, isolated invocation of ``dagster-graphql``.\n\n You can configure a Dagster instance to use this RunLauncher by adding a section to your\n ``dagster.yaml`` like the following:\n\n .. code-block:: yaml\n\n run_launcher:\n module: dagster_k8s.launcher\n class: K8sRunLauncher\n config:\n service_account_name: your_service_account\n job_image: my_project/dagster_image:latest\n instance_config_map: dagster-instance\n postgres_password_secret: dagster-postgresql-secret\n\n """\n\n def __init__(\n self,\n service_account_name,\n instance_config_map,\n postgres_password_secret=None,\n dagster_home=None,\n job_image=None,\n image_pull_policy=None,\n image_pull_secrets=None,\n load_incluster_config=True,\n kubeconfig_file=None,\n inst_data=None,\n job_namespace="default",\n env_config_maps=None,\n env_secrets=None,\n env_vars=None,\n k8s_client_batch_api=None,\n volume_mounts=None,\n volumes=None,\n labels=None,\n fail_pod_on_run_failure=None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.job_namespace = check.str_param(job_namespace, "job_namespace")\n\n self.load_incluster_config = load_incluster_config\n self.kubeconfig_file = kubeconfig_file\n if load_incluster_config:\n check.invariant(\n kubeconfig_file is None,\n "`kubeconfig_file` is set but `load_incluster_config` is True.",\n )\n kubernetes.config.load_incluster_config()\n else:\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n self._fixed_batch_api = k8s_client_batch_api\n\n self._job_config = None\n self._job_image = check.opt_str_param(job_image, "job_image")\n self.dagster_home = check.str_param(dagster_home, "dagster_home")\n self._image_pull_policy = check.opt_str_param(\n image_pull_policy, "image_pull_policy", "IfNotPresent"\n )\n self._image_pull_secrets = check.opt_list_param(\n image_pull_secrets, "image_pull_secrets", of_type=dict\n )\n self._service_account_name = check.str_param(service_account_name, "service_account_name")\n self.instance_config_map = check.str_param(instance_config_map, "instance_config_map")\n self.postgres_password_secret = check.opt_str_param(\n postgres_password_secret, "postgres_password_secret"\n )\n self._env_config_maps = check.opt_list_param(\n env_config_maps, "env_config_maps", of_type=str\n )\n self._env_secrets = check.opt_list_param(env_secrets, "env_secrets", of_type=str)\n self._env_vars = check.opt_list_param(env_vars, "env_vars", of_type=str)\n self._volume_mounts = check.opt_list_param(volume_mounts, "volume_mounts")\n self._volumes = check.opt_list_param(volumes, "volumes")\n self._labels = check.opt_dict_param(labels, "labels", key_type=str, value_type=str)\n self._fail_pod_on_run_failure = check.opt_bool_param(\n fail_pod_on_run_failure, "fail_pod_on_run_failure"\n )\n\n super().__init__()\n\n @property\n def image_pull_policy(self):\n return self._image_pull_policy\n\n @property\n def image_pull_secrets(self):\n return self._image_pull_secrets\n\n @property\n def service_account_name(self):\n return self._service_account_name\n\n @property\n def env_config_maps(self):\n return self._env_config_maps\n\n @property\n def env_secrets(self):\n return self._env_secrets\n\n @property\n def volume_mounts(self):\n return self._volume_mounts\n\n @property\n def volumes(self):\n return self._volumes\n\n 
@property\n def env_vars(self):\n return self._env_vars\n\n @property\n def labels(self):\n return self._labels\n\n @property\n def fail_pod_on_run_failure(self):\n return self._fail_pod_on_run_failure\n\n @property\n def _batch_api(self):\n return self._fixed_batch_api if self._fixed_batch_api else kubernetes.client.BatchV1Api()\n\n @classmethod\n def config_type(cls):\n """Include all arguments required for DagsterK8sJobConfig along with additional arguments\n needed for the RunLauncher itself.\n """\n job_cfg = DagsterK8sJobConfig.config_type_run_launcher()\n\n run_launcher_extra_cfg = {\n "job_namespace": Field(StringSource, is_required=False, default_value="default"),\n }\n return merge_dicts(job_cfg, run_launcher_extra_cfg)\n\n @classmethod\n def from_config_value(cls, inst_data, config_value):\n return cls(inst_data=inst_data, **config_value)\n\n @property\n def inst_data(self):\n return self._inst_data\n\n def get_static_job_config(self):\n if self._job_config:\n return self._job_config\n else:\n self._job_config = DagsterK8sJobConfig(\n job_image=check.str_param(self._job_image, "job_image"),\n dagster_home=check.str_param(self.dagster_home, "dagster_home"),\n image_pull_policy=check.str_param(self._image_pull_policy, "image_pull_policy"),\n image_pull_secrets=check.opt_list_param(\n self._image_pull_secrets, "image_pull_secrets", of_type=dict\n ),\n service_account_name=check.str_param(\n self._service_account_name, "service_account_name"\n ),\n instance_config_map=check.str_param(\n self.instance_config_map, "instance_config_map"\n ),\n postgres_password_secret=check.opt_str_param(\n self.postgres_password_secret, "postgres_password_secret"\n ),\n env_config_maps=check.opt_list_param(\n self._env_config_maps, "env_config_maps", of_type=str\n ),\n env_secrets=check.opt_list_param(self._env_secrets, "env_secrets", of_type=str),\n env_vars=check.opt_list_param(self._env_vars, "env_vars", of_type=str),\n volume_mounts=self._volume_mounts,\n volumes=self._volumes,\n labels=self._labels,\n )\n return self._job_config\n\n def _get_grpc_job_config(self, job_image):\n return DagsterK8sJobConfig(\n job_image=check.str_param(job_image, "job_image"),\n dagster_home=check.str_param(self.dagster_home, "dagster_home"),\n image_pull_policy=check.str_param(self._image_pull_policy, "image_pull_policy"),\n image_pull_secrets=check.opt_list_param(\n self._image_pull_secrets, "image_pull_secrets", of_type=dict\n ),\n service_account_name=check.str_param(\n self._service_account_name, "service_account_name"\n ),\n instance_config_map=check.str_param(self.instance_config_map, "instance_config_map"),\n postgres_password_secret=check.opt_str_param(\n self.postgres_password_secret, "postgres_password_secret"\n ),\n env_config_maps=check.opt_list_param(\n self._env_config_maps, "env_config_maps", of_type=str\n ),\n env_secrets=check.opt_list_param(self._env_secrets, "env_secrets", of_type=str),\n env_vars=check.opt_list_param(self._env_vars, "env_vars", of_type=str),\n volume_mounts=self._volume_mounts,\n volumes=self._volumes,\n labels=self._labels,\n )\n\n def _launch_k8s_job_with_args(self, job_name, args, run, pipeline_origin):\n pod_name = job_name\n\n user_defined_k8s_config = get_user_defined_k8s_config(frozentags(run.tags))\n repository_origin = pipeline_origin.repository_origin\n\n job_config = (\n self._get_grpc_job_config(repository_origin.container_image)\n if repository_origin.container_image\n else self.get_static_job_config()\n )\n\n self._instance.add_run_tags(\n run.run_id,\n 
{DOCKER_IMAGE_TAG: job_config.job_image},\n )\n\n job = construct_dagster_k8s_job(\n job_config=job_config,\n args=args,\n job_name=job_name,\n pod_name=pod_name,\n component="run_worker",\n user_defined_k8s_config=user_defined_k8s_config,\n labels={\n "dagster/job": pipeline_origin.pipeline_name,\n "dagster/run-id": run.run_id,\n },\n )\n\n self._instance.report_engine_event(\n "Creating Kubernetes run worker job",\n run,\n EngineEventData(\n [\n MetadataEntry("Kubernetes Job name", value=job_name),\n MetadataEntry("Kubernetes Namespace", value=self.job_namespace),\n MetadataEntry("Run ID", value=run.run_id),\n ]\n ),\n cls=self.__class__,\n )\n\n self._batch_api.create_namespaced_job(body=job, namespace=self.job_namespace)\n self._instance.report_engine_event(\n "Kubernetes run worker job created",\n run,\n cls=self.__class__,\n )\n\n def launch_run(self, context: LaunchRunContext) -> None:\n run = context.pipeline_run\n job_name = get_job_name_from_run_id(run.run_id)\n pipeline_origin = context.pipeline_code_origin\n\n args = ExecuteRunArgs(\n pipeline_origin=pipeline_origin,\n pipeline_run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n set_exit_code_on_failure=self._fail_pod_on_run_failure,\n ).get_command_args()\n\n self._launch_k8s_job_with_args(job_name, args, run, pipeline_origin)\n\n @property\n def supports_resume_run(self):\n return True\n\n def resume_run(self, context: ResumeRunContext) -> None:\n run = context.pipeline_run\n job_name = get_job_name_from_run_id(\n run.run_id, resume_attempt_number=context.resume_attempt_number\n )\n pipeline_origin = context.pipeline_code_origin\n\n args = ResumeRunArgs(\n pipeline_origin=pipeline_origin,\n pipeline_run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n set_exit_code_on_failure=self._fail_pod_on_run_failure,\n ).get_command_args()\n\n self._launch_k8s_job_with_args(job_name, args, run, pipeline_origin)\n\n # https://github.com/dagster-io/dagster/issues/2741\n def can_terminate(self, run_id):\n check.str_param(run_id, "run_id")\n\n pipeline_run = self._instance.get_run_by_id(run_id)\n if not pipeline_run:\n return False\n if pipeline_run.status != PipelineRunStatus.STARTED:\n return False\n return True\n\n def terminate(self, run_id):\n check.str_param(run_id, "run_id")\n run = self._instance.get_run_by_id(run_id)\n\n if not run:\n return False\n\n can_terminate = self.can_terminate(run_id)\n if not can_terminate:\n self._instance.report_engine_event(\n message="Unable to terminate run; can_terminate returned {}".format(can_terminate),\n pipeline_run=run,\n cls=self.__class__,\n )\n return False\n\n self._instance.report_run_canceling(run)\n\n job_name = get_job_name_from_run_id(\n run_id, resume_attempt_number=self._instance.count_resume_run_attempts(run.run_id)\n )\n\n try:\n termination_result = delete_job(job_name=job_name, namespace=self.job_namespace)\n if termination_result:\n self._instance.report_engine_event(\n message="Run was terminated successfully.",\n pipeline_run=run,\n cls=self.__class__,\n )\n else:\n self._instance.report_engine_event(\n message="Run was not terminated successfully; delete_job returned {}".format(\n termination_result\n ),\n pipeline_run=run,\n cls=self.__class__,\n )\n return termination_result\n except Exception:\n self._instance.report_engine_event(\n message="Run was not terminated successfully; encountered error in delete_job",\n pipeline_run=run,\n engine_event_data=EngineEventData.engine_error(\n serializable_error_info_from_exc_info(sys.exc_info())\n ),\n 
cls=self.__class__,\n )\n\n @property\n def supports_check_run_worker_health(self):\n return True\n\n def check_run_worker_health(self, run: PipelineRun):\n job_name = get_job_name_from_run_id(\n run.run_id, resume_attempt_number=self._instance.count_resume_run_attempts(run.run_id)\n )\n try:\n job = self._batch_api.read_namespaced_job(namespace=self.job_namespace, name=job_name)\n except Exception:\n return CheckRunHealthResult(\n WorkerStatus.UNKNOWN, str(serializable_error_info_from_exc_info(sys.exc_info()))\n )\n if job.status.failed:\n return CheckRunHealthResult(WorkerStatus.FAILED, "K8s job failed")\n if job.status.succeeded:\n return CheckRunHealthResult(WorkerStatus.SUCCESS)\n return CheckRunHealthResult(WorkerStatus.RUNNING)
\n
", "current_page_name": "_modules/dagster_k8s/launcher", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_k8s.launcher"}}, "dagster_mlflow": {"resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_mlflow.resources

\n"""\nThis module contains the mlflow resource provided by the MlFlow\nclass. This resource provides an easy way to configure mlflow for logging various\nthings from dagster runs.\n"""\nimport atexit\nimport sys\nfrom itertools import islice\nfrom os import environ\nfrom typing import Any, Optional\n\nimport mlflow\nfrom mlflow.entities.run_status import RunStatus\n\nfrom dagster import Field, Noneable, Permissive, resource\n\nCONFIG_SCHEMA = {\n    "experiment_name": Field(str, is_required=True, description="MlFlow experiment name."),\n    "mlflow_tracking_uri": Field(\n        Noneable(str),\n        default_value=None,\n        is_required=False,\n        description="MlFlow tracking server uri.",\n    ),\n    "parent_run_id": Field(\n        Noneable(str),\n        default_value=None,\n        is_required=False,\n        description="Mlflow run ID of parent run if this is a nested run.",\n    ),\n    "env": Field(Permissive(), description="Environment variables for mlflow setup."),\n    "env_to_tag": Field(\n        Noneable(list),\n        default_value=None,\n        is_required=False,\n        description="List of environment variables to log as tags in mlflow.",\n    ),\n    "extra_tags": Field(Permissive(), description="Any extra key-value tags to log to mlflow."),\n}\n\n\nclass MlflowMeta(type):\n    """Mlflow Metaclass to create methods that "inherit" all of Mlflow's\n    methods. If the class has a method defined it is excluded from the\n    attribute setting from mlflow.\n    """\n\n    def __new__(cls, name, bases, attrs):\n        class_cls = super(MlflowMeta, cls).__new__(cls, name, bases, attrs)\n        for attr in (attr for attr in dir(mlflow) if attr not in dir(class_cls)):\n            mlflow_attribute = getattr(mlflow, attr)\n            if callable(mlflow_attribute):\n                setattr(class_cls, attr, staticmethod(mlflow_attribute))\n            else:\n                setattr(class_cls, attr, mlflow_attribute)\n        return class_cls\n\n\nclass MlFlow(metaclass=MlflowMeta):\n    """Class for setting up an mlflow resource for dagster runs.\n    This takes care of all the configuration required to use mlflow tracking and the complexities of\n    mlflow tracking dagster parallel runs.\n    """\n\n    def __init__(self, context):\n\n        # Context associated attributes\n        self.log = context.log\n        self.run_name = context.pipeline_run.pipeline_name\n        self.dagster_run_id = context.run_id\n\n        # resource config attributes\n        resource_config = context.resource_config\n        self.tracking_uri = resource_config.get("mlflow_tracking_uri")\n        if self.tracking_uri:\n            mlflow.set_tracking_uri(self.tracking_uri)\n        self.parent_run_id = resource_config.get("parent_run_id")\n        self.experiment_name = resource_config["experiment_name"]\n        self.env_tags_to_log = resource_config.get("env_to_tag") or []\n        self.extra_tags = resource_config.get("extra_tags")\n\n        # Update env variables if any are given\n        self.env_vars = resource_config.get("env", {})\n        if self.env_vars:\n            environ.update(self.env_vars)\n\n        # If the experiment exists then the set won't do anything\n        mlflow.set_experiment(self.experiment_name)\n        self.experiment = mlflow.get_experiment_by_name(self.experiment_name)\n\n        # Get the client object\n        self.tracking_client = mlflow.tracking.MlflowClient()\n\n        # Set up the active run and tags\n        self._setup()\n\n    def 
_setup(self):\n        """\n        Sets the active run and tags. If an Mlflow run_id exists then the\n        active run is set to it. This way a single Dagster run outputs data\n        to the same Mlflow run, even when multiprocess executors are used.\n        """\n        # Get the run id\n        run_id = self._get_current_run_id()  # pylint: disable=no-member\n        self._set_active_run(run_id=run_id)\n        self._set_all_tags()\n\n        # hack needed to stop mlflow from marking run as finished when\n        # a process exits in parallel runs\n        atexit.unregister(mlflow.end_run)\n\n    def _get_current_run_id(\n        self, experiment: Optional[Any] = None, dagster_run_id: Optional[str] = None\n    ):\n        """Gets the run id of a specific dagster run and experiment id.\n        If it doesn't exist then it returns a None.\n\n        Args:\n            experiment (optional): Mlflow experiment.\n            When none is passed it fetches the experiment object set in\n            the constructor.  Defaults to None.\n            dagster_run_id (optional): The Dagster run id.\n            When none is passed it fetches the dagster_run_id object set in\n            the constructor.  Defaults to None.\n        Returns:\n            run_id (str or None): run_id if it is found else None\n        """\n        experiment = experiment or self.experiment\n        dagster_run_id = dagster_run_id or self.dagster_run_id\n        if experiment:\n            # Check if a run with this dagster run id has already been started\n            # in mlflow, will get an empty dataframe if not\n            current_run_df = mlflow.search_runs(\n                experiment_ids=[experiment.experiment_id],\n                filter_string=f"tags.dagster_run_id='{dagster_run_id}'",\n            )\n            if not current_run_df.empty:\n                return current_run_df.run_id.values[0]  # pylint: disable=no-member\n\n    def _set_active_run(self, run_id=None):\n        """\n        This method sets the active run to be that of the specified\n        run_id. If None is passed then a new run is started. The new run also\n        takes care of nested runs.\n\n        Args:\n            run_id (str, optional): Mlflow run_id. 
Defaults to None.\n        """\n        nested_run = False\n        if self.parent_run_id is not None:\n            self._start_run(run_id=self.parent_run_id, run_name=self.run_name)\n            nested_run = True\n        self._start_run(run_id=run_id, run_name=self.run_name, nested=nested_run)\n\n    def _start_run(self, **kwargs):\n        """\n        Catches the Mlflow exception if a run is already active.\n        """\n\n        try:\n            run = mlflow.start_run(**kwargs)\n            self.log.info(\n                f"Starting a new mlflow run with id {run.info.run_id} "\n                f"in experiment {self.experiment_name}"\n            )\n        except Exception as ex:\n            run = mlflow.active_run()\n            if "is already active" not in str(ex):\n                raise (ex)\n            self.log.info(f"Run with id {run.info.run_id} is already active.")\n\n    def _set_all_tags(self):\n        """Method collects dagster_run_id plus all env variables/tags that have been\n            specified by the user in the config_schema and logs them as tags in mlflow.\n\n        Returns:\n            tags [dict]: Dictionary of all the tags\n        """\n        tags = {tag: environ.get(tag) for tag in self.env_tags_to_log}\n        tags["dagster_run_id"] = self.dagster_run_id\n        if self.extra_tags:\n            tags.update(self.extra_tags)\n\n        mlflow.set_tags(tags)\n\n    def cleanup_on_error(self):\n        """Method ends mlflow run with correct exit status for failed runs. Note that\n        this method does not work when a job running in dagit fails, it seems\n        that in this case a different process runs the job and when it fails\n        the stack trace is therefore not available. For this case we can use the\n        cleanup_on_failure hook defined below.\n        """\n        any_error = sys.exc_info()\n\n        if any_error[1]:\n            if isinstance(any_error[1], KeyboardInterrupt):\n                mlflow.end_run(status=RunStatus.to_string(RunStatus.KILLED))\n            else:\n                mlflow.end_run(status=RunStatus.to_string(RunStatus.FAILED))\n\n    @staticmethod\n    def log_params(params: dict):\n        """Overload of the mlflow.log_params. If len(params) >100 then\n        params is sent to mlflow in chunks.\n\n        Args:\n            params (dict): Parameters to be logged\n        """\n        for param_chunk in MlFlow.chunks(params, 100):\n            mlflow.log_params(param_chunk)\n\n    @staticmethod\n    def chunks(params: dict, size: int = 100):\n        """Method that chunks a dictionary into batches of size.\n\n        Args:\n            params (dict): Dictionary set to be batched\n            size (int, optional): Number of batches. Defaults to 100.\n\n        Yields:\n            (dict): Batch of dictionary\n        """\n        it = iter(params)\n        for _ in range(0, len(params), size):\n            yield {k: params[k] for k in islice(it, size)}\n\n\n
[docs]@resource(config_schema=CONFIG_SCHEMA)\ndef mlflow_tracking(context):\n """\n This resource initializes an MLflow run that's used for all steps within a Dagster run.\n\n This resource provides access to all of mlflow's methods as well as the mlflow tracking client's\n methods.\n\n Usage:\n\n 1. Add the mlflow resource to any solids in which you want to invoke mlflow tracking APIs.\n 2. Add the `end_mlflow_on_run_finished` hook to your pipeline to end the MLflow run\n when the Dagster run is finished.\n\n Examples:\n\n .. code-block:: python\n\n from dagster_mlflow import end_mlflow_on_run_finished, mlflow_tracking\n\n @op(required_resource_keys={"mlflow"})\n def mlflow_solid(context):\n mlflow.log_params(some_params)\n mlflow.tracking.MlflowClient().create_registered_model(some_model_name)\n\n @end_mlflow_on_run_finished\n @job(resource_defs={"mlflow": mlflow_tracking})\n def mlf_example():\n mlflow_op()\n\n # example using an mlflow instance with s3 storage\n mlf_example.execute_in_process(run_config={\n "resources": {\n "mlflow": {\n "config": {\n "experiment_name": my_experiment,\n "mlflow_tracking_uri": "http://localhost:5000",\n\n # if want to run a nested run, provide parent_run_id\n "parent_run_id": an_existing_mlflow_run_id,\n\n # env variables to pass to mlflow\n "env": {\n "MLFLOW_S3_ENDPOINT_URL": my_s3_endpoint,\n "AWS_ACCESS_KEY_ID": my_aws_key_id,\n "AWS_SECRET_ACCESS_KEY": my_secret,\n },\n\n # env variables you want to log as mlflow tags\n "env_to_tag": ["DOCKER_IMAGE_TAG"],\n\n # key-value tags to add to your experiment\n "extra_tags": {"super": "experiment"},\n }\n }\n }\n })\n """\n mlf = MlFlow(context)\n yield mlf\n mlf.cleanup_on_error()
\n
", "current_page_name": "_modules/dagster_mlflow/resources", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_mlflow.resources"}}, "dagster_msteams": {"hooks": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_msteams.hooks

\nfrom typing import Callable, Optional\n\nfrom dagster_msteams.card import Card\n\nfrom dagster.core.definitions import failure_hook, success_hook\nfrom dagster.core.execution.context.hook import HookContext\n\n\ndef _default_status_message(context: HookContext, status: str) -> str:\n    return "Solid {solid_name} on pipeline {pipeline_name} {status}!\\nRun ID: {run_id}".format(\n        solid_name=context.solid.name,\n        pipeline_name=context.pipeline_name,\n        run_id=context.run_id,\n        status=status,\n    )\n\n\ndef _default_failure_message(context: HookContext) -> str:\n    return _default_status_message(context, status="failed")\n\n\ndef _default_success_message(context: HookContext) -> str:\n    return _default_status_message(context, status="succeeded")\n\n\n
[docs]def teams_on_failure(\n message_fn: Callable[[HookContext], str] = _default_failure_message,\n dagit_base_url: Optional[str] = None,\n):\n """Create a hook on step failure events that will message the given MS Teams webhook URL.\n\n Args:\n message_fn (Optional(Callable[[HookContext], str])): Function which takes in the\n HookContext outputs the message you want to send.\n dagit_base_url: (Optional[str]): The base url of your Dagit instance. Specify this\n to allow messages to include deeplinks to the specific pipeline run that triggered\n the hook.\n\n Examples:\n .. code-block:: python\n\n @teams_on_failure(dagit_base_url="http://localhost:3000")\n @pipeline(...)\n def my_pipeline():\n pass\n\n .. code-block:: python\n\n def my_message_fn(context: HookContext) -> str:\n return "Solid {solid_name} failed!".format(\n solid_name=context.solid\n )\n\n @solid\n def a_solid(context):\n pass\n\n @pipeline(...)\n def my_pipeline():\n a_solid.with_hooks(hook_defs={teams_on_failure("#foo", my_message_fn)})\n\n """\n\n @failure_hook(required_resource_keys={"msteams"})\n def _hook(context: HookContext):\n text = message_fn(context)\n if dagit_base_url:\n text += "<a href='{base_url}/instance/runs/{run_id}'>View in Dagit</a>".format(\n base_url=dagit_base_url,\n run_id=context.run_id,\n )\n card = Card()\n card.add_attachment(text_message=text)\n context.resources.msteams.post_message(payload=card.payload)\n\n return _hook
\n\n\n
[docs]def teams_on_success(\n message_fn: Callable[[HookContext], str] = _default_success_message,\n dagit_base_url: Optional[str] = None,\n):\n """Create a hook on step success events that will message the given MS Teams webhook URL.\n\n Args:\n message_fn (Optional(Callable[[HookContext], str])): Function which takes in the\n HookContext outputs the message you want to send.\n dagit_base_url: (Optional[str]): The base url of your Dagit instance. Specify this\n to allow messages to include deeplinks to the specific pipeline run that triggered\n the hook.\n\n Examples:\n .. code-block:: python\n\n @teams_on_success(dagit_base_url="http://localhost:3000")\n @pipeline(...)\n def my_pipeline():\n pass\n\n .. code-block:: python\n\n def my_message_fn(context: HookContext) -> str:\n return "Solid {solid_name} failed!".format(\n solid_name=context.solid\n )\n\n @solid\n def a_solid(context):\n pass\n\n @pipeline(...)\n def my_pipeline():\n a_solid.with_hooks(hook_defs={teams_on_success("#foo", my_message_fn)})\n\n """\n\n @success_hook(required_resource_keys={"msteams"})\n def _hook(context: HookContext):\n text = message_fn(context)\n if dagit_base_url:\n text += "<a href='{base_url}/instance/runs/{run_id}'>View in Dagit</a>".format(\n base_url=dagit_base_url,\n run_id=context.run_id,\n )\n card = Card()\n card.add_attachment(text_message=text)\n context.resources.msteams.post_message(payload=card.payload)\n\n return _hook
\n
", "current_page_name": "_modules/dagster_msteams/hooks", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_msteams.hooks"}, "resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_msteams.resources

\nfrom dagster_msteams.client import TeamsClient\n\nfrom dagster import Bool, Field, Float, StringSource, resource\n\n\n
[docs]@resource(\n {\n "hook_url": Field(\n StringSource,\n description="""To send messages to an MS Teams channel, an incoming webhook has to\n be created. The incoming webhook url must be given as a part of the\n resource config to the msteams_resource in dagster.\n """,\n ),\n "http_proxy": Field(StringSource, is_required=False),\n "https_proxy": Field(StringSource, is_required=False),\n "timeout": Field(Float, default_value=60, is_required=False),\n "verify": Field(Bool, is_required=False),\n },\n description="This resource is for connecting to MS Teams",\n)\ndef msteams_resource(context):\n """This resource is for connecting to Microsoft Teams.\n\n The resource object is a `dagster_msteams.TeamsClient`.\n\n By configuring this resource, you can post messages to MS Teams from any Dagster solid:\n\n Examples:\n\n .. code-block:: python\n\n import os\n\n from dagster import ModeDefinition, execute_pipeline, pipeline, solid\n from dagster_msteams import Card, msteams_resource\n\n\n @solid(required_resource_keys={"msteams"})\n def teams_solid(context):\n card = Card()\n card.add_attachment(text_message="Hello There !!")\n context.resources.msteams.post_message(payload=card.payload)\n\n\n @pipeline(\n mode_defs=[ModeDefinition(resource_defs={"msteams": msteams_resource})],\n )\n def teams_pipeline():\n teams_solid()\n\n\n execute_pipeline(\n teams_pipeline,\n {"resources": {"msteams": {"config": {"hook_url": os.getenv("TEAMS_WEBHOOK_URL")}}}},\n )\n\n """\n return TeamsClient(\n hook_url=context.resource_config.get("hook_url"),\n http_proxy=context.resource_config.get("http_proxy"),\n https_proxy=context.resource_config.get("https_proxy"),\n timeout=context.resource_config.get("timeout"),\n verify=context.resource_config.get("verify"),\n )
\n
", "current_page_name": "_modules/dagster_msteams/resources", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_msteams.resources"}, "sensors": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_msteams.sensors

\nfrom typing import Callable, Optional\n\nfrom dagster_msteams.card import Card\nfrom dagster_msteams.client import TeamsClient\n\nfrom dagster import DefaultSensorStatus\nfrom dagster.core.definitions.run_status_sensor_definition import (\n    PipelineFailureSensorContext,\n    pipeline_failure_sensor,\n)\n\n\ndef _default_failure_message(context: PipelineFailureSensorContext) -> str:\n    return "\\n".join(\n        [\n            f"Pipeline {context.pipeline_run.pipeline_name} failed!",\n            f"Run ID: {context.pipeline_run.run_id}",\n            f"Mode: {context.pipeline_run.mode}",\n            f"Error: {context.failure_event.message}",\n        ]\n    )\n\n\n
[docs]def make_teams_on_pipeline_failure_sensor(\n hook_url: str,\n message_fn: Callable[[PipelineFailureSensorContext], str] = _default_failure_message,\n http_proxy: Optional[str] = None,\n https_proxy: Optional[str] = None,\n timeout: Optional[float] = 60,\n verify: Optional[bool] = None,\n name: Optional[str] = None,\n dagit_base_url: Optional[str] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n):\n """Create a sensor on pipeline failures that will message the given MS Teams webhook URL.\n\n Args:\n hook_url (str): MS Teams incoming webhook URL.\n message_fn (Optional(Callable[[PipelineFailureSensorContext], str])): Function which\n takes in the ``PipelineFailureSensorContext`` and outputs the message you want to send.\n Defaults to a text message that contains error message, pipeline name, and run ID.\n http_proxy : (Optional[str]): Proxy for requests using http protocol.\n https_proxy : (Optional[str]): Proxy for requests using https protocol.\n timeout: (Optional[float]): Connection timeout in seconds. Defaults to 60.\n verify: (Optional[bool]): Whether to verify the servers TLS certificate.\n name: (Optional[str]): The name of the sensor. Defaults to "teams_on_pipeline_failure".\n dagit_base_url: (Optional[str]): The base url of your Dagit instance. Specify this to allow\n messages to include deeplinks to the failed pipeline run.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n\n Examples:\n\n .. code-block:: python\n\n teams_on_pipeline_failure = make_teams_on_pipeline_failure_sensor(\n hook_url=os.getenv("TEAMS_WEBHOOK_URL")\n )\n\n @repository\n def my_repo():\n return [my_pipeline + teams_on_pipeline_failure]\n\n .. code-block:: python\n\n def my_message_fn(context: PipelineFailureSensorContext) -> str:\n return "Pipeline {pipeline_name} failed! Error: {error}".format(\n pipeline_name=context.pipeline_run.pipeline_name,\n error=context.failure_event.message,\n )\n\n teams_on_pipeline_failure = make_teams_on_pipeline_failure_sensor(\n hook_url=os.getenv("TEAMS_WEBHOOK_URL"),\n message_fn=my_message_fn,\n dagit_base_url="http://localhost:3000",\n )\n\n\n """\n\n teams_client = TeamsClient(\n hook_url=hook_url,\n http_proxy=http_proxy,\n https_proxy=https_proxy,\n timeout=timeout,\n verify=verify,\n )\n\n @pipeline_failure_sensor(name=name, default_status=default_status)\n def teams_on_pipeline_failure(context: PipelineFailureSensorContext):\n\n text = message_fn(context)\n if dagit_base_url:\n text += "<a href='{base_url}/instance/runs/{run_id}'>View in Dagit</a>".format(\n base_url=dagit_base_url,\n run_id=context.pipeline_run.run_id,\n )\n card = Card()\n card.add_attachment(text_message=text)\n teams_client.post_message(payload=card.payload)\n\n return teams_on_pipeline_failure
\n
", "current_page_name": "_modules/dagster_msteams/sensors", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_msteams.sensors"}}, "dagster_mysql": {"event_log": {"event_log": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_mysql.event_log.event_log

\nimport sqlalchemy as db\n\nfrom dagster import check, seven\nfrom dagster.core.storage.event_log import (\n    AssetKeyTable,\n    SqlEventLogStorage,\n    SqlEventLogStorageMetadata,\n    SqlPollingEventWatcher,\n)\nfrom dagster.core.storage.event_log.migration import ASSET_KEY_INDEX_COLS\nfrom dagster.core.storage.sql import stamp_alembic_rev  # pylint: disable=unused-import\nfrom dagster.core.storage.sql import create_engine, run_alembic_upgrade\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData, serialize_dagster_namedtuple\nfrom dagster.utils import utc_datetime_from_timestamp\n\nfrom ..utils import (\n    MYSQL_POOL_RECYCLE,\n    create_mysql_connection,\n    mysql_alembic_config,\n    mysql_config,\n    mysql_url_from_config,\n    retry_mysql_connection_fn,\n    retry_mysql_creation_fn,\n)\n\nCHANNEL_NAME = "run_events"\n\n\n
[docs]class MySQLEventLogStorage(SqlEventLogStorage, ConfigurableClass):\n """MySQL-backed event log storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagit`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-mysql.yaml\n :caption: dagster.yaml\n :start-after: start_marker_event_log\n :end-before: end_marker_event_log\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n\n """\n\n def __init__(self, mysql_url, inst_data=None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.mysql_url = check.str_param(mysql_url, "mysql_url")\n self._disposed = False\n\n self._event_watcher = SqlPollingEventWatcher(self)\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.mysql_url,\n isolation_level="AUTOCOMMIT",\n poolclass=db.pool.NullPool,\n )\n self._secondary_index_cache = {}\n\n table_names = retry_mysql_connection_fn(db.inspect(self._engine).get_table_names)\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared with other storage classes)\n if "event_logs" not in table_names:\n retry_mysql_creation_fn(self._init_db)\n # mark all secondary indexes to be used\n self.reindex_events()\n self.reindex_assets()\n\n super().__init__()\n\n def _init_db(self):\n with self._connect() as conn:\n with conn.begin():\n SqlEventLogStorageMetadata.create_all(conn)\n stamp_alembic_rev(mysql_alembic_config(__file__), conn)\n\n def optimize_for_dagit(self, statement_timeout):\n # When running in dagit, hold an open connection\n # https://github.com/dagster-io/dagster/issues/3719\n self._engine = create_engine(\n self.mysql_url,\n isolation_level="AUTOCOMMIT",\n pool_size=1,\n pool_recycle=MYSQL_POOL_RECYCLE,\n )\n\n def upgrade(self):\n alembic_config = mysql_alembic_config(__file__)\n with self._connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return mysql_config()\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return MySQLEventLogStorage(\n inst_data=inst_data, mysql_url=mysql_url_from_config(config_value)\n )\n\n @staticmethod\n def wipe_storage(mysql_url):\n engine = create_engine(mysql_url, isolation_level="AUTOCOMMIT", poolclass=db.pool.NullPool)\n try:\n SqlEventLogStorageMetadata.drop_all(engine)\n finally:\n engine.dispose()\n\n @staticmethod\n def create_clean_storage(conn_string):\n MySQLEventLogStorage.wipe_storage(conn_string)\n return MySQLEventLogStorage(conn_string)\n\n def store_asset_observation(self, event):\n # last_materialization_timestamp is updated upon observation or materialization\n # See store_asset method in SqlEventLogStorage for more details\n if self.has_secondary_index(ASSET_KEY_INDEX_COLS):\n with self.index_connection() as conn:\n conn.execute(\n db.dialects.mysql.insert(AssetKeyTable)\n .values(\n asset_key=event.dagster_event.asset_key.to_string(),\n 
last_materialization_timestamp=utc_datetime_from_timestamp(event.timestamp),\n )\n .on_duplicate_key_update(\n last_materialization_timestamp=utc_datetime_from_timestamp(event.timestamp),\n )\n )\n\n def store_asset_materialization(self, event):\n # last_materialization_timestamp is updated upon observation or materialization\n # See store_asset method in SqlEventLogStorage for more details\n materialization = event.dagster_event.step_materialization_data.materialization\n\n if self.has_secondary_index(ASSET_KEY_INDEX_COLS):\n with self.index_connection() as conn:\n conn.execute(\n db.dialects.mysql.insert(AssetKeyTable)\n .values(\n asset_key=event.dagster_event.asset_key.to_string(),\n last_materialization=serialize_dagster_namedtuple(materialization),\n last_materialization_timestamp=utc_datetime_from_timestamp(event.timestamp),\n last_run_id=event.run_id,\n tags=seven.json.dumps(materialization.tags)\n if materialization.tags\n else None,\n )\n .on_duplicate_key_update(\n last_materialization=serialize_dagster_namedtuple(materialization),\n last_materialization_timestamp=utc_datetime_from_timestamp(event.timestamp),\n last_run_id=event.run_id,\n tags=seven.json.dumps(materialization.tags)\n if materialization.tags\n else None,\n )\n )\n else:\n with self.index_connection() as conn:\n conn.execute(\n db.dialects.mysql.insert(AssetKeyTable)\n .values(\n asset_key=event.dagster_event.asset_key.to_string(),\n last_materialization=serialize_dagster_namedtuple(materialization),\n last_run_id=event.run_id,\n )\n .on_duplicate_key_update(\n last_materialization=serialize_dagster_namedtuple(materialization),\n last_run_id=event.run_id,\n )\n )\n\n def _connect(self):\n return create_mysql_connection(self._engine, __file__, "event log")\n\n def run_connection(self, run_id=None):\n return self._connect()\n\n def index_connection(self):\n return self._connect()\n\n def has_secondary_index(self, name):\n if name not in self._secondary_index_cache:\n self._secondary_index_cache[name] = super(\n MySQLEventLogStorage, self\n ).has_secondary_index(name)\n return self._secondary_index_cache[name]\n\n def enable_secondary_index(self, name):\n super(MySQLEventLogStorage, self).enable_secondary_index(name)\n if name in self._secondary_index_cache:\n del self._secondary_index_cache[name]\n\n def watch(self, run_id, start_cursor, callback):\n self._event_watcher.watch_run(run_id, start_cursor, callback)\n\n def end_watch(self, run_id, handler):\n self._event_watcher.unwatch_run(run_id, handler)\n\n @property\n def event_watcher(self):\n return self._event_watcher\n\n def __del__(self):\n self.dispose()\n\n def dispose(self):\n if not self._disposed:\n self._disposed = True\n self._event_watcher.close()
\n
", "current_page_name": "_modules/dagster_mysql/event_log/event_log", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_mysql.event_log.event_log"}}, "run_storage": {"run_storage": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_mysql.run_storage.run_storage

\nimport sqlalchemy as db\n\nfrom dagster import check\nfrom dagster.core.storage.runs import (\n    DaemonHeartbeatsTable,\n    InstanceInfo,\n    RunStorageSqlMetadata,\n    SqlRunStorage,\n)\nfrom dagster.core.storage.sql import stamp_alembic_rev  # pylint: disable=unused-import\nfrom dagster.core.storage.sql import create_engine, run_alembic_upgrade\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData, serialize_dagster_namedtuple\nfrom dagster.utils import utc_datetime_from_timestamp\n\nfrom ..utils import (\n    MYSQL_POOL_RECYCLE,\n    create_mysql_connection,\n    mysql_alembic_config,\n    mysql_config,\n    mysql_url_from_config,\n    retry_mysql_connection_fn,\n    retry_mysql_creation_fn,\n)\n\n\n
[docs]class MySQLRunStorage(SqlRunStorage, ConfigurableClass):\n """MySQL-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagit`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-mysql.yaml\n :caption: dagster.yaml\n :start-after: start_marker_runs\n :end-before: end_marker_runs\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n """\n\n def __init__(self, mysql_url, inst_data=None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.mysql_url = mysql_url\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.mysql_url,\n isolation_level="AUTOCOMMIT",\n poolclass=db.pool.NullPool,\n )\n\n self._index_migration_cache = {}\n table_names = retry_mysql_connection_fn(db.inspect(self._engine).get_table_names)\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared with other storage classes)\n if "runs" not in table_names:\n retry_mysql_creation_fn(self._init_db)\n self.migrate()\n self.optimize()\n\n elif "instance_info" not in table_names:\n InstanceInfo.create(self._engine)\n\n super().__init__()\n\n def _init_db(self):\n with self.connect() as conn:\n with conn.begin():\n RunStorageSqlMetadata.create_all(conn)\n stamp_alembic_rev(mysql_alembic_config(__file__), conn)\n\n def optimize_for_dagit(self, statement_timeout):\n # When running in dagit, hold 1 open connection\n # https://github.com/dagster-io/dagster/issues/3719\n self._engine = create_engine(\n self.mysql_url,\n isolation_level="AUTOCOMMIT",\n pool_size=1,\n pool_recycle=MYSQL_POOL_RECYCLE,\n )\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return mysql_config()\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return MySQLRunStorage(inst_data=inst_data, mysql_url=mysql_url_from_config(config_value))\n\n @staticmethod\n def wipe_storage(mysql_url):\n engine = create_engine(mysql_url, isolation_level="AUTOCOMMIT", poolclass=db.pool.NullPool)\n try:\n RunStorageSqlMetadata.drop_all(engine)\n finally:\n engine.dispose()\n\n @staticmethod\n def create_clean_storage(mysql_url):\n MySQLRunStorage.wipe_storage(mysql_url)\n return MySQLRunStorage(mysql_url)\n\n def connect(self, run_id=None): # pylint: disable=arguments-differ, unused-argument\n return create_mysql_connection(self._engine, __file__, "run")\n\n def upgrade(self):\n alembic_config = mysql_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n def has_built_index(self, migration_name):\n if migration_name not in self._index_migration_cache:\n self._index_migration_cache[migration_name] = super(\n MySQLRunStorage, self\n ).has_built_index(migration_name)\n return self._index_migration_cache[migration_name]\n\n def mark_index_built(self, migration_name):\n super(MySQLRunStorage, self).mark_index_built(migration_name)\n if migration_name in self._index_migration_cache:\n del 
self._index_migration_cache[migration_name]\n\n def add_daemon_heartbeat(self, daemon_heartbeat):\n with self.connect() as conn:\n conn.execute(\n db.dialects.mysql.insert(DaemonHeartbeatsTable)\n .values(\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_type=daemon_heartbeat.daemon_type,\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_dagster_namedtuple(daemon_heartbeat),\n )\n .on_duplicate_key_update(\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_dagster_namedtuple(daemon_heartbeat),\n )\n )
\n
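\n\n# Editor's hedged sketch (appended; not part of the dagster_mysql source): building the run\n# storage directly for local experimentation instead of via dagster.yaml. The connection URL is\n# an illustrative assumption -- dagster-mysql normally assembles it from the username, password,\n# hostname, db_name and port fields of its config, and the exact driver scheme may differ.\ndef _example_build_run_storage():\n    storage = MySQLRunStorage(\n        mysql_url="mysql+mysqlconnector://user:password@localhost:3306/dagster"\n    )\n    storage.upgrade()  # apply any pending alembic migrations to the run storage tables\n    return storage\n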
", "current_page_name": "_modules/dagster_mysql/run_storage/run_storage", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_mysql.run_storage.run_storage"}}, "schedule_storage": {"schedule_storage": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_mysql.schedule_storage.schedule_storage

\nimport sqlalchemy as db\n\nfrom dagster import check\nfrom dagster.core.storage.schedules import ScheduleStorageSqlMetadata, SqlScheduleStorage\nfrom dagster.core.storage.sql import create_engine, run_alembic_upgrade, stamp_alembic_rev\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\n\nfrom ..utils import (\n    MYSQL_POOL_RECYCLE,\n    create_mysql_connection,\n    mysql_alembic_config,\n    mysql_config,\n    mysql_url_from_config,\n    retry_mysql_connection_fn,\n    retry_mysql_creation_fn,\n)\n\n\n
[docs]class MySQLScheduleStorage(SqlScheduleStorage, ConfigurableClass):\n """MySQL-backed schedule storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagit`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-mysql.yaml\n :caption: dagster.yaml\n :start-after: start_marker_schedules\n :end-before: end_marker_schedules\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n """\n\n def __init__(self, mysql_url, inst_data=None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.mysql_url = mysql_url\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.mysql_url,\n isolation_level="AUTOCOMMIT",\n poolclass=db.pool.NullPool,\n )\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared with other storage classes)\n table_names = retry_mysql_connection_fn(db.inspect(self._engine).get_table_names)\n if "jobs" not in table_names:\n retry_mysql_creation_fn(self._init_db)\n\n super().__init__()\n\n def _init_db(self):\n with self.connect() as conn:\n with conn.begin():\n ScheduleStorageSqlMetadata.create_all(conn)\n stamp_alembic_rev(mysql_alembic_config(__file__), conn)\n\n # mark all the data migrations as applied\n self.migrate()\n self.optimize()\n\n def optimize_for_dagit(self, statement_timeout):\n # When running in dagit, hold an open connection\n # https://github.com/dagster-io/dagster/issues/3719\n self._engine = create_engine(\n self.mysql_url,\n isolation_level="AUTOCOMMIT",\n pool_size=1,\n pool_recycle=MYSQL_POOL_RECYCLE,\n )\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return mysql_config()\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return MySQLScheduleStorage(\n inst_data=inst_data, mysql_url=mysql_url_from_config(config_value)\n )\n\n @staticmethod\n def wipe_storage(mysql_url):\n engine = create_engine(mysql_url, isolation_level="AUTOCOMMIT", poolclass=db.pool.NullPool)\n try:\n ScheduleStorageSqlMetadata.drop_all(engine)\n finally:\n engine.dispose()\n\n @staticmethod\n def create_clean_storage(mysql_url):\n MySQLScheduleStorage.wipe_storage(mysql_url)\n return MySQLScheduleStorage(mysql_url)\n\n def connect(self, run_id=None): # pylint: disable=arguments-differ, unused-argument\n return create_mysql_connection(self._engine, __file__, "schedule")\n\n def upgrade(self):\n alembic_config = mysql_alembic_config(__file__)\n run_alembic_upgrade(alembic_config, self._engine)
\n
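\n\n# Editor's hedged sketch (appended; not part of the dagster_mysql source): the static helpers\n# above make it easy to provision a throwaway schedule storage for tests, assuming a disposable\n# MySQL database reachable at the illustrative URL below.\ndef _example_clean_schedule_storage_for_tests():\n    # wipe_storage drops the schedule tables; __init__ then recreates them via _init_db\n    return MySQLScheduleStorage.create_clean_storage(\n        "mysql+mysqlconnector://user:password@localhost:3306/dagster_test"\n    )\n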
", "current_page_name": "_modules/dagster_mysql/schedule_storage/schedule_storage", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_mysql.schedule_storage.schedule_storage"}}}, "dagster_pagerduty": {"resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_pagerduty.resources

\nimport pypd\n\nfrom dagster import Field, resource\n\n\nclass PagerDutyService:\n    """Integrates with PagerDuty via the pypd library.\n\n    See:\n        https://v2.developer.pagerduty.com/docs/events-api-v2\n        https://v2.developer.pagerduty.com/docs/send-an-event-events-api-v2\n        https://support.pagerduty.com/docs/services-and-integrations#section-events-api-v2\n        https://github.com/PagerDuty/pagerduty-api-python-client\n\n    for documentation and more information.\n    """\n\n    def __init__(self, routing_key):\n        self.routing_key = routing_key\n\n    def EventV2_create(\n        self,\n        summary,\n        source,\n        severity,\n        event_action="trigger",\n        dedup_key=None,\n        timestamp=None,\n        component=None,\n        group=None,\n        event_class=None,\n        custom_details=None,\n    ):\n        """Events API v2 enables you to add PagerDuty's advanced event and incident management\n        functionality to any system that can make an outbound HTTP connection.\n\n        Arguments:\n            summary {string} -- A high-level, text summary message of the event. Will be used to\n                                construct an alert's description.\n\n                                Example: "PING OK - Packet loss = 0%, RTA = 1.41 ms" "Host\n                                         'acme-andromeda-sv1-c40 :: 179.21.24.50' is DOWN"\n\n            source {string} -- Specific human-readable unique identifier, such as a hostname, for\n                               the system having the problem.\n\n                               Examples:\n                               "prod05.theseus.acme-widgets.com"\n                               "171.26.23.22"\n                               "aws:elasticache:us-east-1:852511987:cluster/api-stats-prod-003"\n                               "9c09acd49a25"\n\n            severity {string} -- How impacted the affected system is. Displayed to users in lists\n                                 and influences the priority of any created incidents. Must be one\n                                 of {info, warning, error, critical}\n\n        Keyword Arguments:\n            event_action {str} -- There are three types of events that PagerDuty recognizes, and\n                                  are used to represent different types of activity in your\n                                  monitored systems. (default: 'trigger')\n                * trigger: When PagerDuty receives a trigger event, it will either open a new alert,\n                           or add a new trigger log entry to an existing alert, depending on the\n                           provided dedup_key. Your monitoring tools should send PagerDuty a trigger\n                           when a new problem has been detected. You may send additional triggers\n                           when a previously detected problem has occurred again.\n\n                * acknowledge: acknowledge events cause the referenced incident to enter the\n                               acknowledged state. While an incident is acknowledged, it won't\n                               generate any additional notifications, even if it receives new\n                               trigger events. 
Your monitoring tools should send PagerDuty an\n                               acknowledge event when they know someone is presently working on the\n                               problem.\n\n                * resolve: resolve events cause the referenced incident to enter the resolved state.\n                           Once an incident is resolved, it won't generate any additional\n                           notifications. New trigger events with the same dedup_key as a resolved\n                           incident won't re-open the incident. Instead, a new incident will be\n                           created. Your monitoring tools should send PagerDuty a resolve event when\n                           the problem that caused the initial trigger event has been fixed.\n\n            dedup_key {string} -- Deduplication key for correlating triggers and resolves. The\n                                  maximum permitted length of this property is 255 characters.\n\n            timestamp {string} -- Timestamp (ISO 8601). When the upstream system detected / created\n                                  the event. This is useful if a system batches or holds events\n                                  before sending them to PagerDuty.\n\n                                  Optional - Will be auto-generated by PagerDuty if not provided.\n\n                                  Example:\n                                  2015-07-17T08:42:58.315+0000\n\n            component {string} -- The part or component of the affected system that is broken.\n\n                                  Examples:\n                                  "keepalive"\n                                  "webping"\n                                  "mysql"\n                                  "wqueue"\n\n            group {string} -- A cluster or grouping of sources. For example, sources\n                              "prod-datapipe-02" and "prod-datapipe-03" might both be part of\n                              "prod-datapipe"\n\n                              Examples:\n                              "prod-datapipe"\n                              "www"\n                              "web_stack"\n\n            event_class {string} -- The class/type of the event.\n\n                                    Examples:\n                                    "High CPU"\n                                    "Latency"\n                                    "500 Error"\n\n            custom_details {Dict[str, str]} -- Additional details about the event and affected\n                                               system.\n\n                                               Example:\n                                               {"ping time": "1500ms", "load avg": 0.75 }\n        """\n\n        data = {\n            "routing_key": self.routing_key,\n            "event_action": event_action,\n            "payload": {"summary": summary, "source": source, "severity": severity},\n        }\n\n        if dedup_key is not None:\n            data["dedup_key"] = dedup_key\n\n        if timestamp is not None:\n            data["payload"]["timestamp"] = timestamp\n\n        if component is not None:\n            data["payload"]["component"] = component\n\n        if group is not None:\n            data["payload"]["group"] = group\n\n        if event_class is not None:\n            data["payload"]["class"] = event_class\n\n        if custom_details is not None:\n            data["payload"]["custom_details"] = custom_details\n\n        return pypd.EventV2.create(data=data)\n\n\n
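# Editor's hedged sketch (not part of the dagster_pagerduty source): a direct call against the\n# EventV2_create signature documented above. The routing key and all event field values are\n# illustrative placeholders.\ndef _example_trigger_event():\n    service = PagerDutyService(routing_key="0123456789abcdef0123456789abcdef")\n    return service.EventV2_create(\n        summary="PING OK - Packet loss = 0%, RTA = 1.41 ms",\n        source="prod05.theseus.acme-widgets.com",\n        severity="error",\n        event_action="trigger",\n        dedup_key="ping-prod05",\n        component="webping",\n        group="prod-datapipe",\n        event_class="Latency",\n        custom_details={"ping time": "1500ms", "load avg": 0.75},\n    )\n\n\n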
[docs]@resource(\n {\n "routing_key": Field(\n str,\n description="""The routing key provisions access to your PagerDuty service. You\n will need to include the integration key for your new integration, as a\n routing_key in the event payload.""",\n )\n },\n description="""This resource is for posting events to PagerDuty.""",\n)\ndef pagerduty_resource(context):\n """A resource for posting events (alerts) to PagerDuty.\n\n Example:\n\n .. code-block:: python\n\n @op(required_resource_keys={'pagerduty'})\n def pagerduty_op(context):\n context.resources.pagerduty.EventV2_create(\n summary='alert from dagster',\n source='localhost',\n severity='error',\n event_action='trigger',\n )\n\n @job(resource_defs={ 'pagerduty': pagerduty_resource })\n def pagerduty_test():\n pagerduty_op()\n\n pagerduty_test.execute_in_process(\n run_config={\n "resources": {\n 'pagerduty': {'config': {'routing_key': '0123456789abcdef0123456789abcdef'}}\n }\n }\n )\n """\n return PagerDutyService(context.resource_config.get("routing_key"))
\n
", "current_page_name": "_modules/dagster_pagerduty/resources", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_pagerduty.resources"}}, "dagster_pandas": {"constraints": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_pandas.constraints

\nimport sys\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom functools import wraps\n\nimport pandas as pd\nfrom pandas import DataFrame\n\nfrom dagster import DagsterType, MetadataEntry, TypeCheck, check\nfrom dagster.utils.backcompat import experimental_class_warning\n\n\nclass ConstraintViolationException(Exception):\n    """Indicates that a constraint has been violated."""\n\n\nclass ConstraintWithMetadataException(Exception):\n    """\n    This class defines the response generated when a pandas DF fails validation -- it can be used to generate either a\n    failed typecheck or an exception.\n\n    Args:\n        constraint_name (str):  the name of the violated constraint\n        constraint_description (Optional[str]): the description of the violated constraint\n        expectation (Optional[Union[dict,list, str, set]]): what result was expected -- typically a jsonlike, though it can be a string\n        offending (Optional[Union[dict,list, str, set]]):  which pieces of the dataframe violated the expectation, typically list or string\n        actual (Optional[Union[dict,list, str, set]]): what those pieces of the dataframe actually were -- typically a jsonlike\n    """\n\n    def __init__(\n        self,\n        constraint_name,\n        constraint_description="",\n        expectation=None,\n        offending=None,\n        actual=None,\n    ):\n        self.constraint_name = constraint_name\n        self.constraint_description = constraint_description\n        self.expectation = check.opt_inst_param(expectation, "expectation", (dict, list, str, set))\n        self.offending = check.opt_inst_param(offending, "offending", (dict, list, str, set))\n        self.actual = check.opt_inst_param(actual, "actual", (dict, list, str, set))\n        super(ConstraintWithMetadataException, self).__init__(\n            "Violated {} - {}, {} was/were expected, but we received {} which was/were {}".format(\n                constraint_name,\n                constraint_description,\n                expectation,\n                offending,\n                actual,\n            )\n        )\n\n    def normalize_metadata_json_value(self, val):\n        if isinstance(val, set):\n            return list(val)\n        else:\n            return val\n\n    def convert_to_metadata(self):\n        return MetadataEntry(\n            "constraint-metadata",\n            value={\n                "constraint_name": self.constraint_name,\n                "constraint_description": self.constraint_description,\n                "expected": self.normalize_metadata_json_value(self.expectation),\n                "offending": self.normalize_metadata_json_value(self.offending),\n                "actual": self.normalize_metadata_json_value(self.actual),\n            },\n        )\n\n    def return_as_typecheck(self):\n        return TypeCheck(\n            success=False, description=self.args[0], metadata_entries=[self.convert_to_metadata()]\n        )\n\n\nclass DataFrameConstraintViolationException(ConstraintViolationException):\n    """Indicates a dataframe level constraint has been violated."""\n\n    def __init__(self, constraint_name, constraint_description):\n        super(DataFrameConstraintViolationException, self).__init__(\n            "Violated {constraint_name} - {constraint_description}".format(\n                constraint_name=constraint_name, constraint_description=constraint_description\n            )\n        )\n\n\nclass 
DataFrameWithMetadataException(ConstraintWithMetadataException):\n    def __init__(self, constraint_name, constraint_description, expectation, actual):\n        super(DataFrameWithMetadataException, self).__init__(\n            constraint_name, constraint_description, expectation, "a malformed dataframe", actual\n        )\n\n\nclass ColumnConstraintViolationException(ConstraintViolationException):\n    """Indicates that a column constraint has been violated."""\n\n    def __init__(self, constraint_name, constraint_description, column_name, offending_rows=None):\n        self.constraint_name = constraint_name\n        self.constraint_description = constraint_description\n        self.column_name = column_name\n        self.offending_rows = offending_rows\n        super(ColumnConstraintViolationException, self).__init__(self.construct_message())\n\n    def construct_message(self):\n        base_message = 'Violated "{constraint_name}" for column "{column_name}" - {constraint_description}'.format(\n            constraint_name=self.constraint_name,\n            constraint_description=self.constraint_description,\n            column_name=self.column_name,\n        )\n        if self.offending_rows is not None:\n            base_message += "The offending (index, row values) are the following: {}".format(\n                self.offending_rows\n            )\n        return base_message\n\n\nclass ColumnWithMetadataException(ConstraintWithMetadataException):\n    def __init__(self, constraint_name, constraint_description, expectation, offending, actual):\n        super(ColumnWithMetadataException, self).__init__(\n            "the column constraint " + constraint_name,\n            constraint_description,\n            expectation,\n            offending,\n            actual,\n        )\n\n\nclass Constraint:\n    """\n    Base constraint object that all constraints inherit from.\n\n    Args:\n        error_description (Optional[str]): The plain string description that is output in the terminal if the constraint fails.\n        markdown_description (Optional[str]): A markdown supported description that is emitted by dagit if the constraint fails.\n    """\n\n    def __init__(self, error_description=None, markdown_description=None):\n        self.name = self.__class__.__name__\n        self.markdown_description = check.str_param(markdown_description, "markdown_description")\n        self.error_description = check.str_param(error_description, "error_description")\n\n\nclass ConstraintWithMetadata:\n    """\n    This class defines a base constraint over pandas DFs with organized metadata\n\n    args:\n        description (str): description of the constraint\n        validation_fn (Callable[[DataFrame], Tuple[bool, dict[str, Union[dict,list, str, set]]]]:\n                    the validation function to run over inputted data\n                    This function should return a tuple of a boolean for success or failure, and a dict containing\n                    metadata about the test -- this metadata will be passed to the resulting exception if validation\n                    fails.\n        resulting_exception (ConstraintWithMetadataException):  what response a failed typecheck should induce\n        raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event\n                    (if set to False) when validation fails\n        name (Optional[str]): what to call the constraint, defaults to the class name.\n    """\n\n    # TODO:  validation_fn 
returning metadata is sorta broken.  maybe have it yield typecheck events and grab metadata?\n\n    def __init__(\n        self, description, validation_fn, resulting_exception, raise_or_typecheck=True, name=None\n    ):\n        experimental_class_warning(self.__class__.__name__)\n        if name is None:\n            self.name = self.__class__.__name__\n        else:\n            self.name = name\n        self.description = description\n        # should return a tuple of (bool, and either an empty dict or a dict of extra params)\n        self.validation_fn = validation_fn\n        self.resulting_exception = resulting_exception\n        self.raise_or_typecheck = raise_or_typecheck\n\n    def validate(self, data, *args, **kwargs):\n        res = self.validation_fn(data, *args, **kwargs)\n        if not res[0]:\n            exc = self.resulting_exception(\n                constraint_name=self.name, constraint_description=self.description, **res[1]\n            )\n\n            if self.raise_or_typecheck:\n                raise exc\n            else:\n                return exc.return_as_typecheck()\n\n        else:\n            if res[0]:\n                return TypeCheck(success=True)\n\n    # TODO:  composition of validations\n    def as_dagster_type(self, *args, **kwargs):\n        if self.raise_or_typecheck:\n            raise Exception(\n                "Dagster types can only be constructed from constraints that return typechecks"\n            )\n        return DagsterType(\n            name=self.name,\n            description="A Pandas DataFrame with the following validation: {}".format(\n                self.description\n            ),\n            type_check_fn=lambda x: self.validate(x, *args),\n            **kwargs,\n        )\n\n\nclass MultiConstraintWithMetadata(ConstraintWithMetadata):\n    """\n    Use this class if you have multiple constraints to check over the entire dataframe\n\n    args:\n        description (str): description of the constraint\n        validation_fn_arr(List[Callable[[DataFrame], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):\n                    a list of the validation functions to run over inputted data\n                    Each function should return a tuple of a boolean for success or failure, and a dict containing\n                    metadata about the test -- this metadata will be passed to the resulting exception if validation\n                    fails.\n        resulting_exception (ConstraintWithMetadataException):  what response a failed typecheck should induce\n        raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event\n                    (if set to False) when validation fails\n        name (Optional[str]): what to call the constraint, defaults to the class name.\n    """\n\n    def __init__(\n        self,\n        description,\n        validation_fn_arr,\n        resulting_exception,\n        raise_or_typecheck=True,\n        name=None,\n    ):\n        validation_fn_arr = check.list_param(validation_fn_arr, "validation_fn_arr")\n\n        def validation_fn(data, *args, **kwargs):\n\n            results = [f(data, *args, **kwargs) for f in validation_fn_arr]\n            truthparam = all(item[0] for item in results)\n            metadict = defaultdict(dict)\n            for i, dicta in enumerate(item[1] for item in results):\n                if len(dicta.keys()) > 0:\n                    for key in dicta:\n                        
metadict[key][validation_fn_arr[i].__name__] = dicta[key]\n            return (truthparam, metadict)\n\n        super(MultiConstraintWithMetadata, self).__init__(\n            description,\n            validation_fn,\n            resulting_exception,\n            raise_or_typecheck=raise_or_typecheck,\n            name=name,\n        )\n\n\nclass StrictColumnsWithMetadata(ConstraintWithMetadata):\n    def __init__(self, column_list, enforce_ordering=False, raise_or_typecheck=True, name=None):\n        self.enforce_ordering = check.bool_param(enforce_ordering, "enforce_ordering")\n        self.column_list = check.list_param(column_list, "strict_column_list", of_type=str)\n\n        def validation_fcn(inframe):\n            if list(inframe.columns) == column_list:\n                return (True, {})\n            else:\n                if self.enforce_ordering:\n                    resdict = {"expectation": self.column_list, "actual": list(inframe.columns)}\n                    return (False, resdict)\n                else:\n                    if set(inframe.columns) == set(column_list):\n                        return (True, {})\n                    else:\n                        extra = [x for x in inframe.columns if x not in set(column_list)]\n                        missing = [x for x in set(column_list) if x not in inframe.columns]\n                        resdict = {\n                            "expectation": self.column_list,\n                            "actual": {"extra_columns": extra, "missing_columns": missing},\n                        }\n                        return (False, resdict)\n\n        basestr = "ensuring that the right columns, {} were present".format(self.column_list)\n        if enforce_ordering:\n            basestr += " in the right order"\n        super(StrictColumnsWithMetadata, self).__init__(\n            basestr,\n            validation_fcn,\n            DataFrameWithMetadataException,\n            raise_or_typecheck=raise_or_typecheck,\n            name=name,\n        )\n\n\nclass DataFrameConstraint(Constraint):\n    """\n    Base constraint object that represent Dataframe shape constraints.\n\n    Args:\n        error_description (Optional[str]): The plain string description that is output in the terminal if the constraint fails.\n        markdown_description (Optional[str]): A markdown supported description that is emitted by dagit if the constraint fails.\n    """\n\n    def __init__(self, error_description=None, markdown_description=None):\n        super(DataFrameConstraint, self).__init__(\n            error_description=error_description, markdown_description=markdown_description\n        )\n\n    def validate(self, dataframe):\n        raise NotImplementedError()\n\n\n
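# Editor's hedged sketch (not part of the dagster_pandas source): a minimal ConstraintWithMetadata\n# whose validation_fn returns the (success, metadata) tuple described above. The non-empty check\n# and its metadata strings are illustrative assumptions.\ndef _example_non_empty_constraint():\n    def _has_rows(df):\n        if len(df) > 0:\n            return (True, {})\n        return (False, {"expectation": "at least one row", "actual": "0 rows"})\n\n    return ConstraintWithMetadata(\n        description="confirms the dataframe is non-empty",\n        validation_fn=_has_rows,\n        resulting_exception=ConstraintWithMetadataException,\n        raise_or_typecheck=False,  # emit a failed TypeCheck instead of raising\n    )\n\n\n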
[docs]class StrictColumnsConstraint(DataFrameConstraint):\n """\n A dataframe constraint that validates column existence and ordering.\n\n Args:\n strict_column_list (List[str]): The exact list of columns that your dataframe must have.\n enforce_ordering (Optional[bool]): If true, will enforce that the ordering of column names must match.\n Default is False.\n """\n\n def __init__(self, strict_column_list, enforce_ordering=False):\n self.enforce_ordering = check.bool_param(enforce_ordering, "enforce_ordering")\n self.strict_column_list = check.list_param(\n strict_column_list, "strict_column_list", of_type=str\n )\n description = "No columns outside of {cols} allowed. ".format(cols=self.strict_column_list)\n if enforce_ordering:\n description += "Columns must be in that order."\n super(StrictColumnsConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe):\n check.inst_param(dataframe, "dataframe", DataFrame)\n columns_received = list(dataframe.columns)\n if self.enforce_ordering:\n if self.strict_column_list != columns_received:\n raise DataFrameConstraintViolationException(\n constraint_name=self.name,\n constraint_description="Expected the following ordering of columns {expected}. Received: {received}".format(\n expected=self.strict_column_list, received=columns_received\n ),\n )\n for column in columns_received:\n if column not in self.strict_column_list:\n raise DataFrameConstraintViolationException(\n constraint_name=self.name,\n constraint_description="Expected {}. Received {}.".format(\n self.strict_column_list, columns_received\n ),\n )
\n\n\n
[docs]class RowCountConstraint(DataFrameConstraint):\n """\n A dataframe constraint that validates the expected count of rows.\n\n Args:\n num_allowed_rows (int): The number of allowed rows in your dataframe.\n error_tolerance (Optional[int]): The acceptable threshold if you are not completely certain. Defaults to 0.\n """\n\n def __init__(self, num_allowed_rows, error_tolerance=0):\n self.num_allowed_rows = check.int_param(num_allowed_rows, "num_allowed_rows")\n self.error_tolerance = abs(check.int_param(error_tolerance, "error_tolerance"))\n if self.error_tolerance > self.num_allowed_rows:\n raise ValueError("Tolerance can't be greater than the number of rows you expect.")\n description = "Dataframe must have {} +- {} rows.".format(\n self.num_allowed_rows, self.error_tolerance\n )\n super(RowCountConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe):\n check.inst_param(dataframe, "dataframe", DataFrame)\n\n if not (\n self.num_allowed_rows - self.error_tolerance\n <= len(dataframe)\n <= self.num_allowed_rows + self.error_tolerance\n ):\n raise DataFrameConstraintViolationException(\n constraint_name=self.name,\n constraint_description="Expected {expected} +- {tolerance} rows. Got {received}".format(\n expected=self.num_allowed_rows,\n tolerance=self.error_tolerance,\n received=len(dataframe),\n ),\n )
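\n\n\n# Editor's hedged sketch (not part of the dagster_pandas source): exercising the two dataframe-level\n# constraints above directly. The column names and row counts are illustrative; each validate() call\n# raises DataFrameConstraintViolationException if its check fails.\ndef _example_dataframe_constraints():\n    df = DataFrame({"foo": [1, 2, 3], "bar": ["a", "b", "c"]})\n    StrictColumnsConstraint(["foo", "bar"], enforce_ordering=True).validate(df)\n    RowCountConstraint(num_allowed_rows=3, error_tolerance=1).validate(df)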
\n\n\ndef apply_ignore_missing_data_to_mask(mask, column):\n return mask & ~column.isnull()\n\n\nclass ColumnAggregateConstraintWithMetadata(ConstraintWithMetadata):\n """\n Similar to the base class, but now your validation functions should take in columns (pd.Series) not Dataframes.\n args:\n description (str): description of the constraint\n validation_fn (Callable[[pd.Series], Tuple[bool, dict[str, Union[dict,list, str, set]]]]:\n the validation function to run over inputted data\n This function should return a tuple of a boolean for success or failure, and a dict containing\n metadata about the test -- this metadata will be passed to the resulting exception if validation\n fails.\n resulting_exception (ConstraintWithMetadataException): what response a failed typecheck should induce\n raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event\n (if set to False) when validation fails\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def validate(self, data, *columns, **kwargs):\n if len(columns) == 0:\n columns = data.columns\n columns = [column for column in columns if column in data.columns]\n relevant_data = data[list(columns)]\n\n offending_columns = set()\n offending_values = {}\n for column in columns:\n # TODO: grab extra metadata\n res = self.validation_fn(relevant_data[column])\n if not res[0]:\n offending_columns.add(column)\n if not res[1].get("actual") is None:\n offending_values[column] = [x.item() for x in res[1].get("actual").to_numpy()]\n else:\n offending_values[column] = [x.item() for x in relevant_data[column].to_numpy()]\n if len(offending_columns) == 0 and not self.raise_or_typecheck:\n return TypeCheck(success=True)\n elif len(offending_columns) > 0:\n metadict = {\n "expectation": self.description.replace("Confirms", ""),\n "actual": offending_values,\n "offending": offending_columns,\n }\n exc = self.resulting_exception(\n constraint_name=self.name, constraint_description=self.description, **metadict\n )\n\n if self.raise_or_typecheck:\n raise exc\n else:\n return exc.return_as_typecheck()\n\n\nclass ColumnConstraintWithMetadata(ConstraintWithMetadata):\n """\n This class is useful for constructing single constraints that\n you want to apply to multiple columns of your dataframe\n The main difference from the base class in terms of construction is that now, your validation_fns should operate on\n individual values.\n args:\n description (str): description of the constraint\n validation_fn (Callable[[Any], Tuple[bool, dict[str, Union[dict,list, str, set]]]]:\n the validation function to run over inputted data\n This function should return a tuple of a boolean for success or failure, and a dict containing\n metadata about the test -- this metadata will be passed to the resulting exception if validation\n fails.\n resulting_exception (ConstraintWithMetadataException): what response a failed typecheck should induce\n raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event\n (if set to False) when validation fails\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def validate(self, data, *columns, **kwargs):\n if len(columns) == 0:\n columns = data.columns\n\n columns = [column for column in columns if column in data.columns]\n relevant_data = data[list(columns)]\n offending = {}\n offending_values = {}\n # TODO: grab metadata from here\n inverse_validation = lambda x: 
not self.validation_fn(x)[0]\n for column in columns:\n results = relevant_data[relevant_data[column].apply(inverse_validation)]\n if len(results.index.tolist()) > 0:\n offending[column] = ["row " + str(i) for i in (results.index.tolist())]\n offending_values[column] = results[column].tolist()\n if len(offending) == 0:\n if not self.raise_or_typecheck:\n return TypeCheck(success=True)\n else:\n metadict = {\n "expectation": self.validation_fn.__doc__,\n "actual": offending_values,\n "offending": offending,\n }\n exc = self.resulting_exception(\n constraint_name=self.name, constraint_description=self.description, **metadict\n )\n\n if self.raise_or_typecheck:\n raise exc\n else:\n return exc.return_as_typecheck()\n\n\nclass MultiColumnConstraintWithMetadata(ColumnConstraintWithMetadata):\n """\n This class is useful for constructing more complicated relationships between columns\n and expectations -- i.e. you want some validations on column A, others on column B, etc.\n This lets you package up the metadata neatly,\n and also allows for cases like 'fail if any one of these constraints fails but still run all of them'\n\n Args:\n description (str): description of the overall set of validations\n fn_and_columns_dict (Dict[str, List[Callable[[Any], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):\n while this is a relatively complex type,\n what it amounts to is 'a dict mapping columns to the functions to\n run on them'\n resulting_exception (type): the response to generate if validation fails. Subclass of\n ConstraintWithMetadataException\n raise_or_typecheck (Optional[bool]): whether to raise an exception (true) or a failed typecheck (false)\n type_for_internal (Optional[type]): what type to use for internal validators. Subclass of\n ConstraintWithMetadata\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def __init__(\n self,\n description,\n fn_and_columns_dict,\n resulting_exception,\n raise_or_typecheck=True,\n type_for_internal=ColumnConstraintWithMetadata,\n name=None,\n ):\n # TODO: support multiple descriptions\n self.column_to_fn_dict = check.dict_param(\n fn_and_columns_dict, "fn_and_columns_dict", key_type=str\n )\n\n def validation_fn(data, *args, **kwargs):\n metadict = defaultdict(dict)\n truthparam = True\n for column, fn_arr in self.column_to_fn_dict.items():\n if column not in data.columns:\n continue\n for fn in fn_arr:\n # TODO: do this more effectively\n new_validator = type_for_internal(\n fn.__doc__, fn, ColumnWithMetadataException, raise_or_typecheck=False\n )\n result = new_validator.validate(\n DataFrame(data[column]), column, *args, **kwargs\n )\n result_val = result.success\n if result_val:\n continue\n result_dict = result.metadata_entries[0].entry_data.data\n truthparam = truthparam and result_val\n for key in result_dict.keys():\n if "constraint" not in key:\n if key == "expected":\n new_key = "expectation"\n result_dict[key] = result_dict[key].replace("returns", "").strip()\n if column not in metadict[new_key] or new_key not in metadict:\n metadict[new_key][column] = dict()\n metadict[new_key][column][fn.__name__] = result_dict[key]\n else:\n if column not in metadict[key] or key not in metadict:\n metadict[key][column] = dict()\n if isinstance(result_dict[key], dict):\n metadict[key][column][fn.__name__] = result_dict[key][column]\n else:\n metadict[key][column][fn.__name__] = "a violation"\n return truthparam, metadict\n\n super(MultiColumnConstraintWithMetadata, self).__init__(\n description,\n validation_fn,\n 
resulting_exception,\n raise_or_typecheck=raise_or_typecheck,\n name=name,\n )\n\n def validate(self, data, *args, **kwargs):\n return ConstraintWithMetadata.validate(self, data, *args, **kwargs)\n\n\nclass MultiAggregateConstraintWithMetadata(MultiColumnConstraintWithMetadata):\n """\n This class is similar to multicolumn, but takes in functions that operate on the whole column at once\n rather than ones that operate on each value --\n consider this similar to the difference between apply-map and apply aggregate.\n\n Args:\n description (str): description of the overall set of validations (TODO: support multiple descriptions)\n fn_and_columns_dict (Dict[str, List[Callable[[pd.Series], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):\n while this is a relatively complex type,\n what it amounts to is a dict mapping columns to the functions to\n run on them'\n resulting_exception (type): the response to generate if validation fails. Subclass of\n ConstraintWithMetadataException\n raise_or_typecheck (Optional[bool]): whether to raise an exception (true) or a failed typecheck (false)\n type_for_internal (Optional[type]): what type to use for internal validators. Subclass of\n ConstraintWithMetadata\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def __init__(\n self,\n description,\n fn_and_columns_dict,\n resulting_exception,\n raise_or_typecheck=True,\n name=None,\n ):\n super(MultiAggregateConstraintWithMetadata, self).__init__(\n description,\n fn_and_columns_dict,\n resulting_exception,\n raise_or_typecheck=raise_or_typecheck,\n type_for_internal=ColumnAggregateConstraintWithMetadata,\n name=name,\n )\n\n\ndef non_null_validation(x):\n """\n validates that a particular value in a column is not null\n Usage:\n pass this as a column validator to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n Generally, you should prefer to use nonnull as a decorator/wrapper rather than using this\n directly.\n """\n return not pd.isnull(x), {}\n\n\ndef all_unique_validator(column, ignore_missing_vals=False):\n """\n validates that all values in an iterable are unique\n Returns duplicated values as metadata\n\n Usage:\n As a validation function for a\n :py:class:'~dagster_pandas.constraints.ColumnAggregateConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiAggregateConstraintWithMetadata'\n Example:\n .. 
code-block:: python\n aggregate_validator = MultiAggregateConstraintWithMetadata(\n "confirms all values are unique",\n {'bar': [all_unique_validator]},\n ConstraintWithMetadataException,\n raise_or_typecheck=False,\n )\n ntype = create_structured_dataframe_type(\n "NumericType",\n columns_aggregate_validator=aggregate_validator\n )\n @op(out={'basic_dataframe': Out(dagster_type=ntype)})\n def create_dataframe(_):\n yield Output(\n DataFrame({'foo': [1, 2, 3], 'bar': [9, 10, 10]}),\n output_name='basic_dataframe',\n )\n #will fail with\n metadata['offending'] == {'bar': {'all_unique_validator': 'a violation'}}\n metadata['actual'] == {'bar': {'all_unique_validator': [10.0]}}\n """\n column = pd.Series(column)\n duplicated = column.duplicated()\n if ignore_missing_vals:\n duplicated = apply_ignore_missing_data_to_mask(duplicated, column)\n return not duplicated.any(), {"actual": column[duplicated]}\n\n\ndef nonnull(func):\n """\n decorator for column validation functions to make them error on nulls\n Usage:\n pass decorated functions as column validators to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n Args:\n func (Callable[[Any], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):\n the column validator you want to error on nulls\n """\n\n @wraps(func)\n def nvalidator(val):\n origval = func(val)\n nval = non_null_validation(val)\n return origval[0] and nval[0], {}\n\n nvalidator.__doc__ += " and ensures no values are null"\n\n return nvalidator\n\n\ndef column_range_validation_factory(minim=None, maxim=None, ignore_missing_vals=False):\n """\n factory for validators testing if column values are within a range\n Args:\n minim(Optional[Comparable]): the low end of the range\n maxim(Optional[Comparable]): the high end of the range\n ignore_missing_vals(Optional[bool]): whether to ignore nulls\n\n Returns: a validation function for this constraint\n Usage:\n pass returned functions as column validators to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n Examples:\n .. 
code-block:: python\n in_range_validator = column_range_validation_factory(1, 3, ignore_missing_vals=True)\n column_validator = MultiColumnConstraintWithMetadata(\n "confirms values are numbers in a range",\n {'foo': [in_range_validator]},\n ColumnWithMetadataException,\n raise_or_typecheck=False,\n )\n ntype = create_structured_dataframe_type(\n "NumericType",\n columns_validator=column_validator\n )\n @op(out={'basic_dataframe': Out(dagster_type=ntype)})\n def create_dataframe(_):\n yield Output(\n DataFrame({'foo': [1, 2, 7], 'bar': [9, 10, 10]}),\n output_name='basic_dataframe',\n )\n #will fail with\n metadata['offending'] == {'foo': {'in_range_validation_fn': ['row 2']}}\n metadata['actual'] == {'foo': {'in_range_validation_fn': [7]}}\n\n """\n if minim is None:\n if isinstance(maxim, datetime):\n minim = datetime.min\n else:\n minim = -1 * (sys.maxsize - 1)\n if maxim is None:\n if isinstance(minim, datetime):\n maxim = datetime.max\n else:\n maxim = sys.maxsize\n\n def in_range_validation_fn(x):\n if ignore_missing_vals and pd.isnull(x):\n return True, {}\n return (isinstance(x, (type(minim), type(maxim)))) and (x <= maxim) and (x >= minim), {}\n\n in_range_validation_fn.__doc__ = "checks whether values are between {} and {}".format(\n minim, maxim\n )\n if ignore_missing_vals:\n in_range_validation_fn.__doc__ += ", ignoring nulls"\n\n return in_range_validation_fn\n\n\ndef categorical_column_validator_factory(categories, ignore_missing_vals=False):\n """\n factory for validators testing if all values are in some set\n Args:\n categories(Union[Sequence, set]): the set of allowed values\n ignore_missing_vals(Optional[bool]): whether to ignore nulls\n\n Returns: a validation function for this constraint\n\n Usage:\n pass returned functions as column validators to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n\n Example:\n .. 
code-block:: python\n categorical_validation_fn = categorical_column_validator_factory([1, 2])\n column_validator = MultiColumnConstraintWithMetadata(\n "confirms values are numbers in a range",\n {'foo': [categorical_validation_fn]},\n ColumnWithMetadataException,\n raise_or_typecheck=False,\n )\n ntype = create_structured_dataframe_type(\n "NumericType",\n columns_validator=column_validator\n )\n @op(out={'basic_dataframe': Out(dagster_type=ntype)})\n def create_dataframe(_):\n yield Output(\n DataFrame({'foo': [1, 2, 7], 'bar': [9, 10, 10]}),\n output_name='basic_dataframe',\n )\n #will fail with\n metadata['offending'] == {'foo': {'categorical_validation_fn': ['row 2']}}\n metadata['actual'] == {'foo': {'categorical_validation_fn': [7]}}\n\n """\n\n categories = set(categories)\n\n def categorical_validation_fn(x):\n if ignore_missing_vals and pd.isnull(x):\n return True, {}\n return (x in categories), {}\n\n categorical_validation_fn.__doc__ = (\n "checks whether values are within this set of values: {}".format(categories)\n )\n if ignore_missing_vals:\n categorical_validation_fn.__doc__ += ", ignoring nulls"\n\n return categorical_validation_fn\n\n\ndef dtype_in_set_validation_factory(datatypes, ignore_missing_vals=False):\n """\n factory for testing if the dtype of a val falls within some allowed set\n Args:\n datatypes(Union[set[type], type]): which datatype/datatypes are allowed\n ignore_missing_vals(Optional[bool]): whether to ignore nulls\n\n Returns: a validation function for this constraint\n\n Usage:\n pass returned functions as column validators to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n\n Examples:\n .. code-block:: python\n dtype_is_num_validator = dtype_in_set_validation_factory((int, float, int64, float64))\n column_validator = MultiColumnConstraintWithMetadata(\n "confirms values are numbers in a range",\n {'foo': [dtype_is_num_validator]},\n ColumnWithMetadataException,\n raise_or_typecheck=False,\n )\n ntype = create_structured_dataframe_type(\n "NumericType",\n columns_validator=column_validator\n )\n @op(out={'basic_dataframe': Out(dagster_type=ntype)})\n def create_dataframe(_):\n yield Output(\n DataFrame({'foo': [1, 'a', 7], 'bar': [9, 10, 10]}),\n output_name='basic_dataframe',\n )\n #will fail with\n metadata['offending'] == {'foo': {'categorical_validation_fn': ['row 1']}}\n metadata['actual'] == {'foo': {'categorical_validation_fn': ['a']}}\n\n """\n\n def dtype_in_set_validation_fn(x):\n if ignore_missing_vals and pd.isnull(x):\n return True, {}\n return isinstance(x, datatypes), {}\n\n dtype_in_set_validation_fn.__doc__ = "checks whether values are this type/types: {}".format(\n datatypes\n )\n if ignore_missing_vals:\n dtype_in_set_validation_fn.__doc__ += ", ignoring nulls"\n\n return dtype_in_set_validation_fn\n\n\nclass ColumnRangeConstraintWithMetadata(ColumnConstraintWithMetadata):\n def __init__(self, minim=None, maxim=None, columns=None, raise_or_typecheck=True):\n self.name = self.__class__.__name__\n\n description = "Confirms values are between {} and {}".format(minim, maxim)\n super(ColumnRangeConstraintWithMetadata, self).__init__(\n description=description,\n validation_fn=column_range_validation_factory(minim=minim, maxim=maxim),\n resulting_exception=ColumnWithMetadataException,\n raise_or_typecheck=raise_or_typecheck,\n )\n self.columns = columns\n\n def validate(self, data, *args, **kwargs):\n if self.columns is None:\n 
self.columns = list(data.columns)\n self.columns.extend(args)\n return super(ColumnRangeConstraintWithMetadata, self).validate(\n data, *self.columns, **kwargs\n )\n\n\nclass ColumnConstraint(Constraint):\n """\n Base constraint object that represent dataframe column shape constraints.\n\n Args:\n error_description (Optional[str]): The plain string description that is output in the terminal if the constraint fails.\n markdown_description (Optional[str]): A markdown supported description that is emitted by dagit if the constraint fails.\n """\n\n def __init__(self, error_description=None, markdown_description=None):\n super(ColumnConstraint, self).__init__(\n error_description=error_description, markdown_description=markdown_description\n )\n\n def validate(self, dataframe, column_name):\n pass\n\n @staticmethod\n def get_offending_row_pairs(dataframe, column_name):\n return zip(dataframe.index.tolist(), dataframe[column_name].tolist())\n\n\nclass ColumnDTypeFnConstraint(ColumnConstraint):\n """\n A column constraint that applies a pandas dtype validation function to a columns dtype.\n\n Args:\n type_fn (Callable[[Set[str]], bool]): This is a function that takes the pandas columns dtypes and\n returns if those dtypes match the types it expects. See pandas.core.dtypes.common for examples.\n """\n\n def __init__(self, type_fn):\n self.type_fn = check.callable_param(type_fn, "type_fn")\n description = f'Dtype must satisfy "{self.type_fn.__name__}"'\n super(ColumnDTypeFnConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe, column_name):\n column_dtype = dataframe[column_name].dtype\n if not self.type_fn(column_dtype):\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=f'{self.error_description}, but was "{column_dtype}"',\n column_name=column_name,\n )\n\n\nclass ColumnDTypeInSetConstraint(ColumnConstraint):\n """\n A column constraint that validates the pandas column dtypes based on the expected set of dtypes.\n\n Args:\n expected_dtype_set (Set[str]): The set of pandas dtypes that the pandas column dtypes must match.\n """\n\n def __init__(self, expected_dtype_set):\n self.expected_dtype_set = check.set_param(expected_dtype_set, "expected_dtype_set")\n description = "Column dtype must be in the following set {}.".format(\n self.expected_dtype_set\n )\n super(ColumnDTypeInSetConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe, column_name):\n received_dtypes = dataframe[column_name].dtype\n if str(received_dtypes) not in self.expected_dtype_set:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description="{base_error_message}. 
DTypes received: {received_dtypes}".format(\n base_error_message=self.error_description, received_dtypes=received_dtypes\n ),\n column_name=column_name,\n )\n\n\nclass NonNullableColumnConstraint(ColumnConstraint):\n """\n A column constraint that ensures all values in a pandas column are not null.\n """\n\n def __init__(self):\n description = "No Null values allowed."\n super(NonNullableColumnConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe, column_name):\n rows_with_null_columns = dataframe[dataframe[column_name].isna()]\n if not rows_with_null_columns.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=self.get_offending_row_pairs(rows_with_null_columns, column_name),\n )\n\n\nclass UniqueColumnConstraint(ColumnConstraint):\n """\n A column constraint that ensures all values in a pandas column are unique.\n\n Args:\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.\n """\n\n def __init__(self, ignore_missing_vals):\n description = "Column must be unique."\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(UniqueColumnConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe, column_name):\n invalid = dataframe[column_name].duplicated()\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n rows_with_duplicated_values = dataframe[invalid]\n if not rows_with_duplicated_values.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=rows_with_duplicated_values,\n )\n\n\nclass CategoricalColumnConstraint(ColumnConstraint):\n """\n A column constraint that ensures all values in a pandas column are a valid category.\n\n Args:\n categories (Set[str]): Set of categories that values in your pandas column must match.\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.\n """\n\n def __init__(self, categories, ignore_missing_vals):\n self.categories = list(check.set_param(categories, "categories", of_type=str))\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(CategoricalColumnConstraint, self).__init__(\n error_description="Expected Categories are {}".format(self.categories),\n markdown_description="Category examples are {}...".format(self.categories[:5]),\n )\n\n def validate(self, dataframe, column_name):\n invalid = ~dataframe[column_name].isin(self.categories)\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n rows_with_unexpected_buckets = dataframe[invalid]\n if not rows_with_unexpected_buckets.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=rows_with_unexpected_buckets,\n )\n\n\nclass MinValueColumnConstraint(ColumnConstraint):\n """\n A column constraint that ensures all values in a pandas column are greater than the provided\n lower bound [inclusive].\n\n Args:\n min_value (Union[int, float, datetime.datetime]): The lower bound.\n ignore_missing_vals (bool): If 
true, this constraint will enforce the constraint on non missing values.\n """\n\n def __init__(self, min_value, ignore_missing_vals):\n self.min_value = check.inst_param(min_value, "min_value", (int, float, datetime))\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(MinValueColumnConstraint, self).__init__(\n markdown_description="values > {}".format(self.min_value),\n error_description="Column must have values > {}".format(self.min_value),\n )\n\n def validate(self, dataframe, column_name):\n invalid = dataframe[column_name] < self.min_value\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n out_of_bounds_rows = dataframe[invalid]\n if not out_of_bounds_rows.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=out_of_bounds_rows,\n )\n\n\nclass MaxValueColumnConstraint(ColumnConstraint):\n """\n A column constraint that ensures all values in a pandas column are less than the provided\n upper bound [inclusive].\n\n Args:\n max_value (Union[int, float, datetime.datetime]): The upper bound.\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.\n """\n\n def __init__(self, max_value, ignore_missing_vals):\n self.max_value = check.inst_param(max_value, "max_value", (int, float, datetime))\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(MaxValueColumnConstraint, self).__init__(\n markdown_description="values < {}".format(self.max_value),\n error_description="Column must have values < {}".format(self.max_value),\n )\n\n def validate(self, dataframe, column_name):\n invalid = dataframe[column_name] > self.max_value\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n out_of_bounds_rows = dataframe[invalid]\n if not out_of_bounds_rows.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=out_of_bounds_rows,\n )\n\n\nclass InRangeColumnConstraint(ColumnConstraint):\n """\n A column constraint that ensures all values in a pandas column are between the lower and upper\n bound [inclusive].\n\n Args:\n min_value (Union[int, float, datetime.datetime]): The lower bound.\n max_value (Union[int, float, datetime.datetime]): The upper bound.\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non\n missing values.\n """\n\n def __init__(self, min_value, max_value, ignore_missing_vals):\n self.min_value = check.inst_param(min_value, "min_value", (int, float, datetime))\n self.max_value = check.inst_param(max_value, "max_value", (int, float, datetime))\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(InRangeColumnConstraint, self).__init__(\n markdown_description="{} < values < {}".format(self.min_value, self.max_value),\n error_description="Column must have values between {} and {} inclusive.".format(\n self.min_value, self.max_value\n ),\n )\n\n def validate(self, dataframe, column_name):\n invalid = ~dataframe[column_name].between(self.min_value, self.max_value)\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n out_of_bounds_rows = dataframe[invalid]\n if not 
out_of_bounds_rows.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=out_of_bounds_rows,\n )\n
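\n\n# Editor's hedged sketch (not part of the dagster_pandas source): applying two of the per-column\n# constraints above to an illustrative dataframe. Each validate() call raises\n# ColumnConstraintViolationException if the named column fails the check.\ndef _example_column_constraints():\n    df = DataFrame({"score": [1, 2, 3]})\n    NonNullableColumnConstraint().validate(df, "score")\n    InRangeColumnConstraint(min_value=0, max_value=10, ignore_missing_vals=True).validate(df, "score")\n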
", "current_page_name": "_modules/dagster_pandas/constraints", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_pandas.constraints"}, "data_frame": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_pandas.data_frame

\nimport pandas as pd\nfrom dagster_pandas.constraints import (\n    ColumnDTypeFnConstraint,\n    ColumnDTypeInSetConstraint,\n    ConstraintViolationException,\n)\nfrom dagster_pandas.validation import PandasColumn, validate_constraints\n\nfrom dagster import (\n    AssetMaterialization,\n    DagsterInvariantViolationError,\n    DagsterType,\n    Field,\n    MetadataEntry,\n    StringSource,\n    TypeCheck,\n    check,\n    dagster_type_loader,\n    dagster_type_materializer,\n)\nfrom dagster.check import CheckError\nfrom dagster.config.field_utils import Selector\nfrom dagster.core.definitions.metadata import normalize_metadata\nfrom dagster.core.errors import DagsterInvalidMetadata\nfrom dagster.utils import dict_without_keys\nfrom dagster.utils.backcompat import experimental\n\nCONSTRAINT_BLACKLIST = {ColumnDTypeFnConstraint, ColumnDTypeInSetConstraint}\n\n\n@dagster_type_materializer(\n    Selector(\n        {\n            "csv": {\n                "path": StringSource,\n                "sep": Field(StringSource, is_required=False, default_value=","),\n            },\n            "parquet": {"path": StringSource},\n            "table": {"path": StringSource},\n            "pickle": {"path": StringSource},\n        },\n    )\n)\ndef dataframe_materializer(_context, config, pandas_df):\n    check.inst_param(pandas_df, "pandas_df", pd.DataFrame)\n    file_type, file_options = list(config.items())[0]\n\n    if file_type == "csv":\n        path = file_options["path"]\n        pandas_df.to_csv(path, index=False, **dict_without_keys(file_options, "path"))\n    elif file_type == "parquet":\n        pandas_df.to_parquet(file_options["path"])\n    elif file_type == "table":\n        pandas_df.to_csv(file_options["path"], sep="\\t", index=False)\n    elif file_type == "pickle":\n        pandas_df.to_pickle(file_options["path"])\n    else:\n        check.failed("Unsupported file_type {file_type}".format(file_type=file_type))\n\n    return AssetMaterialization.file(file_options["path"])\n\n\n@dagster_type_loader(\n    Selector(\n        {\n            "csv": {\n                "path": StringSource,\n                "sep": Field(StringSource, is_required=False, default_value=","),\n            },\n            "parquet": {"path": StringSource},\n            "table": {"path": StringSource},\n            "pickle": {"path": StringSource},\n        },\n    )\n)\ndef dataframe_loader(_context, config):\n    file_type, file_options = list(config.items())[0]\n\n    if file_type == "csv":\n        path = file_options["path"]\n        return pd.read_csv(path, **dict_without_keys(file_options, "path"))\n    elif file_type == "parquet":\n        return pd.read_parquet(file_options["path"])\n    elif file_type == "table":\n        return pd.read_csv(file_options["path"], sep="\\t")\n    elif file_type == "pickle":\n        return pd.read_pickle(file_options["path"])\n    else:\n        raise DagsterInvariantViolationError(\n            "Unsupported file_type {file_type}".format(file_type=file_type)\n        )\n\n\ndef df_type_check(_, value):\n    if not isinstance(value, pd.DataFrame):\n        return TypeCheck(success=False)\n    return TypeCheck(\n        success=True,\n        metadata_entries=[\n            MetadataEntry("row_count", value=str(len(value))),\n            # string cast columns since they may be things like datetime\n            MetadataEntry("metadata", value={"columns": list(map(str, value.columns))}),\n        ],\n    )\n\n\nDataFrame = DagsterType(\n    name="PandasDataFrame",\n    
description="""Two-dimensional size-mutable, potentially heterogeneous\n    tabular data structure with labeled axes (rows and columns).\n    See http://pandas.pydata.org/""",\n    loader=dataframe_loader,\n    materializer=dataframe_materializer,\n    type_check_fn=df_type_check,\n)\n\n\ndef _construct_constraint_list(constraints):\n    def add_bullet(constraint_list, constraint_description):\n        return constraint_list + "+ {constraint_description}\\n".format(\n            constraint_description=constraint_description\n        )\n\n    constraint_list = ""\n    for constraint in constraints:\n        if constraint.__class__ not in CONSTRAINT_BLACKLIST:\n            constraint_list = add_bullet(constraint_list, constraint.markdown_description)\n    return constraint_list\n\n\ndef _build_column_header(column_name, constraints):\n    header = "**{column_name}**".format(column_name=column_name)\n    for constraint in constraints:\n        if isinstance(constraint, ColumnDTypeInSetConstraint):\n            dtypes_tuple = tuple(constraint.expected_dtype_set)\n            return header + ": `{expected_dtypes}`".format(\n                expected_dtypes=dtypes_tuple if len(dtypes_tuple) > 1 else dtypes_tuple[0]\n            )\n        elif isinstance(constraint, ColumnDTypeFnConstraint):\n            return header + ": Validator `{expected_dtype_fn}`".format(\n                expected_dtype_fn=constraint.type_fn.__name__\n            )\n    return header\n\n\ndef create_dagster_pandas_dataframe_description(description, columns):\n    title = "\\n".join([description, "### Columns", ""])\n    buildme = title\n    for column in columns:\n        buildme += "{}\\n{}\\n".format(\n            _build_column_header(column.name, column.constraints),\n            _construct_constraint_list(column.constraints),\n        )\n    return buildme\n\n\n
[docs]def create_dagster_pandas_dataframe_type(\n name,\n description=None,\n columns=None,\n event_metadata_fn=None,\n dataframe_constraints=None,\n loader=None,\n materializer=None,\n):\n """\n Constructs a custom pandas dataframe dagster type.\n\n Args:\n name (str): Name of the dagster pandas type.\n description (Optional[str]): A markdown-formatted string, displayed in tooling.\n columns (Optional[List[PandasColumn]]): A list of :py:class:`~dagster.PandasColumn` objects\n which express dataframe column schemas and constraints.\n event_metadata_fn (Optional[Callable[[], Union[Dict[str, Union[str, float, int, Dict, MetadataValue]], List[MetadataEntry]]]]):\n A callable which takes your dataframe and returns a dict with string label keys and\n MetadataValue values. Can optionally return a List[MetadataEntry].\n dataframe_constraints (Optional[List[DataFrameConstraint]]): A list of objects that inherit from\n :py:class:`~dagster.DataFrameConstraint`. This allows you to express dataframe-level constraints.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`~dagster.DagsterTypeLoader`. If None, we will default\n to using `dataframe_loader`.\n materializer (Optional[DagsterTypeMaterializer]): An instance of a class\n that inherits from :py:class:`~dagster.DagsterTypeMaterializer`. If None, we will\n default to using `dataframe_materializer`.\n """\n # We allow for the plugging in of dagster_type_loaders/materializers so that\n # Users can load and materialize their custom dataframes via configuration their own way if the default\n # configs don't suffice. This is purely optional.\n check.str_param(name, "name")\n event_metadata_fn = check.opt_callable_param(event_metadata_fn, "event_metadata_fn")\n description = create_dagster_pandas_dataframe_description(\n check.opt_str_param(description, "description", default=""),\n check.opt_list_param(columns, "columns", of_type=PandasColumn),\n )\n\n def _dagster_type_check(_, value):\n if not isinstance(value, pd.DataFrame):\n return TypeCheck(\n success=False,\n description="Must be a pandas.DataFrame. Got value of type. {type_name}".format(\n type_name=type(value).__name__\n ),\n )\n\n try:\n validate_constraints(\n value, pandas_columns=columns, dataframe_constraints=dataframe_constraints\n )\n except ConstraintViolationException as e:\n return TypeCheck(success=False, description=str(e))\n\n return TypeCheck(\n success=True,\n metadata_entries=_execute_summary_stats(name, value, event_metadata_fn)\n if event_metadata_fn\n else None,\n )\n\n return DagsterType(\n name=name,\n type_check_fn=_dagster_type_check,\n loader=loader if loader else dataframe_loader,\n materializer=materializer if materializer else dataframe_materializer,\n description=description,\n )
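A sketch of how this factory is typically combined with ``PandasColumn`` and an op; the type name, column names, op, and job are illustrative, not part of this module:

.. code-block:: python

    import pandas as pd

    from dagster import Out, job, op
    from dagster_pandas.data_frame import create_dagster_pandas_dataframe_type
    from dagster_pandas.validation import PandasColumn

    # Hypothetical dataframe type: schema-checked columns plus a row-count metadata entry.
    TripDataFrame = create_dagster_pandas_dataframe_type(
        name="TripDataFrame",
        columns=[
            PandasColumn.integer_column("bike_id", min_value=0),
            PandasColumn.string_column("color", non_nullable=True),
        ],
        event_metadata_fn=lambda df: {"row_count": len(df)},
    )

    @op(out=Out(TripDataFrame))
    def load_trips():
        return pd.DataFrame({"bike_id": [1, 2, 3], "color": ["red", "blue", "red"]})

    @job
    def trips_job():
        load_trips()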
\n\n\n@experimental\ndef create_structured_dataframe_type(\n name,\n description=None,\n columns_validator=None,\n columns_aggregate_validator=None,\n dataframe_validator=None,\n loader=None,\n materializer=None,\n):\n """\n\n Args:\n name (str): the name of the new type\n description (Optional[str]): the description of the new type\n columns_validator (Optional[Union[ColumnConstraintWithMetadata, MultiColumnConstraintWithMetadata]]):\n what column-level row by row validation you want to have applied.\n Leave empty for no column-level row by row validation.\n columns_aggregate_validator (Optional[Union[ColumnAggregateConstraintWithMetadata,\n MultiAggregateConstraintWithMetadata]]):\n what column-level aggregate validation you want to have applied,\n Leave empty for no column-level aggregate validation.\n dataframe_validator (Optional[Union[ConstraintWithMetadata, MultiConstraintWithMetadata]]):\n what dataframe-wide validation you want to have applied.\n Leave empty for no dataframe-wide validation.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`~dagster.DagsterTypeLoader`. If None, we will default\n to using `dataframe_loader`.\n materializer (Optional[DagsterTypeMaterializer]): An instance of a class\n that inherits from :py:class:`~dagster.DagsterTypeMaterializer`. If None, we will\n default to using `dataframe_materializer`.\n\n Returns:\n a DagsterType with the corresponding name and packaged validation.\n\n """\n\n def _dagster_type_check(_, value):\n if not isinstance(value, pd.DataFrame):\n return TypeCheck(\n success=False,\n description="Must be a pandas.DataFrame. Got value of type. {type_name}".format(\n type_name=type(value).__name__\n ),\n )\n individual_result_dict = {}\n\n if dataframe_validator is not None:\n individual_result_dict["dataframe"] = dataframe_validator.validate(value)\n if columns_validator is not None:\n individual_result_dict["columns"] = columns_validator.validate(value)\n\n if columns_aggregate_validator is not None:\n individual_result_dict["column-aggregates"] = columns_aggregate_validator.validate(\n value\n )\n\n typechecks_succeeded = True\n metadata = []\n overall_description = "Failed Constraints: {}"\n constraint_clauses = []\n for key, result in individual_result_dict.items():\n result_val = result.success\n if result_val:\n continue\n typechecks_succeeded = typechecks_succeeded and result_val\n result_dict = result.metadata_entries[0].entry_data.data\n metadata.append(\n MetadataEntry(\n "{}-constraint-metadata".format(key),\n value=result_dict,\n )\n )\n constraint_clauses.append("{} failing constraints, {}".format(key, result.description))\n # returns aggregates, then column, then dataframe\n return TypeCheck(\n success=typechecks_succeeded,\n description=overall_description.format(constraint_clauses),\n metadata_entries=sorted(metadata, key=lambda x: x.label),\n )\n\n description = check.opt_str_param(description, "description", default="")\n return DagsterType(\n name=name,\n type_check_fn=_dagster_type_check,\n loader=loader if loader else dataframe_loader,\n materializer=materializer if loader else dataframe_materializer,\n description=description,\n )\n\n\ndef _execute_summary_stats(type_name, value, event_metadata_fn):\n if not event_metadata_fn:\n return []\n\n metadata_or_metadata_entries = event_metadata_fn(value)\n\n invalid_message = (\n "The return value of the user-defined summary_statistics function for pandas "\n f"data frame type {type_name} returned {value}. 
This function must return "\n "Union[Dict[str, Union[str, float, int, Dict, MetadataValue]], List[MetadataEntry]]"\n )\n\n metadata = None\n metadata_entries = None\n\n if isinstance(metadata_or_metadata_entries, list):\n metadata_entries = metadata_or_metadata_entries\n elif isinstance(metadata_or_metadata_entries, dict):\n metadata = metadata_or_metadata_entries\n else:\n raise DagsterInvariantViolationError(invalid_message)\n\n try:\n return normalize_metadata(metadata, metadata_entries)\n except (DagsterInvalidMetadata, CheckError):\n raise DagsterInvariantViolationError(invalid_message)\n
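Because the default loader above is config-driven via a ``Selector``, an input of this dagster type can be populated from disk through run config alone. A minimal sketch of such config, assuming a hypothetical op ``process_trips`` with a DataFrame-typed input named ``trips``; the path is a placeholder:

.. code-block:: python

    run_config = {
        "ops": {
            "process_trips": {
                "inputs": {
                    # "csv" selects the csv branch of the dataframe_loader Selector above.
                    "trips": {"csv": {"path": "/path/to/trips.csv", "sep": ","}}
                }
            }
        }
    }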
", "current_page_name": "_modules/dagster_pandas/data_frame", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_pandas.data_frame"}, "validation": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_pandas.validation

\nfrom dagster_pandas.constraints import (\n    CategoricalColumnConstraint,\n    ColumnDTypeFnConstraint,\n    ColumnDTypeInSetConstraint,\n    Constraint,\n    ConstraintViolationException,\n    DataFrameConstraint,\n    InRangeColumnConstraint,\n    NonNullableColumnConstraint,\n    UniqueColumnConstraint,\n)\nfrom pandas import DataFrame, Timestamp\nfrom pandas.core.dtypes.common import (\n    is_bool_dtype,\n    is_float_dtype,\n    is_integer_dtype,\n    is_numeric_dtype,\n    is_string_dtype,\n)\n\nfrom dagster import DagsterInvariantViolationError, check\n\nPANDAS_NUMERIC_TYPES = {"int64", "float"}\n\n\ndef _construct_keyword_constraints(non_nullable, unique, ignore_missing_vals):\n    non_nullable = check.bool_param(non_nullable, "exists")\n    unique = check.bool_param(unique, "unique")\n    ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n    if non_nullable and ignore_missing_vals:\n        raise DagsterInvariantViolationError(\n            "PandasColumn cannot have a non-null constraint while also ignore missing values"\n        )\n    constraints = []\n    if non_nullable:\n        constraints.append(NonNullableColumnConstraint())\n    if unique:\n        constraints.append(UniqueColumnConstraint(ignore_missing_vals=ignore_missing_vals))\n    return constraints\n\n\n
[docs]class PandasColumn:\n """\n The main API for expressing column level schemas and constraints for your custom dataframe\n types.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If th column exists, the validate function will validate the column. Defaults to True.\n constraints (Optional[List[Constraint]]): List of constraint objects that indicate the\n validation rules for the pandas column.\n """\n\n def __init__(self, name, constraints=None, is_required=None):\n self.name = check.str_param(name, "name")\n self.is_required = check.opt_bool_param(is_required, "is_required", default=True)\n self.constraints = check.opt_list_param(constraints, "constraints", of_type=Constraint)\n\n def validate(self, dataframe):\n if self.name not in dataframe.columns:\n # Ignore validation if column is missing from dataframe and is not required\n if self.is_required:\n raise ConstraintViolationException(\n "Required column {column_name} not in dataframe with columns {dataframe_columns}".format(\n column_name=self.name, dataframe_columns=dataframe.columns\n )\n )\n else:\n for constraint in self.constraints:\n constraint.validate(dataframe, self.name)\n\n
[docs] @staticmethod\n def exists(name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None):\n """\n Simple constructor for PandasColumns that expresses existence constraints.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=_construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )
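For example (column name illustrative), a column that only has to be present and unique, with nulls tolerated by the uniqueness check:

.. code-block:: python

    import pandas as pd

    from dagster_pandas.validation import PandasColumn

    # Presence plus uniqueness, no dtype constraint; missing values are skipped by the unique check.
    user_id_col = PandasColumn.exists("user_id", unique=True, ignore_missing_vals=True)

    user_id_col.validate(pd.DataFrame({"user_id": [1, 2, None]}))  # passes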
\n\n
[docs] @staticmethod\n def boolean_column(\n name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None\n ):\n """\n Simple constructor for PandasColumns that expresses boolean constraints on boolean dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[ColumnDTypeFnConstraint(is_bool_dtype)]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )
\n\n
[docs] @staticmethod\n def numeric_column(\n name,\n min_value=-float("inf"),\n max_value=float("inf"),\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """\n Simple constructor for PandasColumns that expresses numeric constraints numeric dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n min_value (Optional[Union[int,float]]): The lower bound for values you expect in this column. Defaults to -float('inf')\n max_value (Optional[Union[int,float]]): The upper bound for values you expect in this column. Defaults to float('inf')\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeFnConstraint(is_numeric_dtype),\n InRangeColumnConstraint(\n check.numeric_param(min_value, "min_value"),\n check.numeric_param(max_value, "max_value"),\n ignore_missing_vals=ignore_missing_vals,\n ),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )
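For instance (name and bounds illustrative), a percentage column that must be numeric, within [0, 100] inclusive, and non-null:

.. code-block:: python

    import pandas as pd

    from dagster_pandas.validation import PandasColumn

    pct_col = PandasColumn.numeric_column(
        "pct_complete", min_value=0, max_value=100, non_nullable=True
    )

    pct_col.validate(pd.DataFrame({"pct_complete": [0.0, 42.5, 100.0]}))  # passes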
\n\n
[docs] @staticmethod\n def integer_column(\n name,\n min_value=-float("inf"),\n max_value=float("inf"),\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """\n Simple constructor for PandasColumns that expresses numeric constraints on integer dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n min_value (Optional[Union[int,float]]): The lower bound for values you expect in this column. Defaults to -float('inf')\n max_value (Optional[Union[int,float]]): The upper bound for values you expect in this column. Defaults to float('inf')\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeFnConstraint(is_integer_dtype),\n InRangeColumnConstraint(\n check.numeric_param(min_value, "min_value"),\n check.numeric_param(max_value, "max_value"),\n ignore_missing_vals=ignore_missing_vals,\n ),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )
\n\n
[docs] @staticmethod\n def float_column(\n name,\n min_value=-float("inf"),\n max_value=float("inf"),\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """\n Simple constructor for PandasColumns that expresses numeric constraints on float dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n min_value (Optional[Union[int,float]]): The lower bound for values you expect in this column. Defaults to -float('inf')\n max_value (Optional[Union[int,float]]): The upper bound for values you expect in this column. Defaults to float('inf')\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeFnConstraint(is_float_dtype),\n InRangeColumnConstraint(\n check.numeric_param(min_value, "min_value"),\n check.numeric_param(max_value, "max_value"),\n ignore_missing_vals=ignore_missing_vals,\n ),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )
\n\n
[docs] @staticmethod\n def datetime_column(\n name,\n min_datetime=Timestamp.min,\n max_datetime=Timestamp.max,\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n tz=None,\n ):\n """\n Simple constructor for PandasColumns that expresses datetime constraints on 'datetime64[ns]' dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n min_datetime (Optional[Union[int,float]]): The lower bound for values you expect in this column.\n Defaults to pandas.Timestamp.min.\n max_datetime (Optional[Union[int,float]]): The upper bound for values you expect in this column.\n Defaults to pandas.Timestamp.max.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. Default to True.\n tz (Optional[str]): Required timezone for values eg: tz='UTC', tz='Europe/Dublin', tz='US/Eastern'.\n Defaults to None, meaning naive datetime values.\n """\n if tz is None:\n datetime_constraint = ColumnDTypeInSetConstraint({"datetime64[ns]"})\n else:\n datetime_constraint = ColumnDTypeInSetConstraint({f"datetime64[ns, {tz}]"})\n # One day more/less than absolute min/max to prevent OutOfBoundsDatetime errors when converting min/max to be tz aware\n if min_datetime.tz_localize(None) == Timestamp.min:\n min_datetime = Timestamp("1677-09-22 00:12:43.145225Z")\n if max_datetime.tz_localize(None) == Timestamp.max:\n max_datetime = Timestamp("2262-04-10 23:47:16.854775807Z")\n # Convert bounds to same tz\n if Timestamp(min_datetime).tz is None:\n min_datetime = Timestamp(min_datetime).tz_localize(tz)\n if Timestamp(max_datetime).tz is None:\n max_datetime = Timestamp(max_datetime).tz_localize(tz)\n\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n datetime_constraint,\n InRangeColumnConstraint(\n min_datetime, max_datetime, ignore_missing_vals=ignore_missing_vals\n ),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )
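A sketch with a required timezone (column name and bounds illustrative); note that naive bounds end up localized to the same ``tz``, as described above:

.. code-block:: python

    import pandas as pd

    from dagster_pandas.validation import PandasColumn

    created_at_col = PandasColumn.datetime_column(
        "created_at",
        min_datetime=pd.Timestamp("2021-01-01"),
        max_datetime=pd.Timestamp("2022-01-01"),
        tz="UTC",
    )

    created_at_col.validate(
        pd.DataFrame(
            {"created_at": pd.to_datetime(["2021-06-01 12:00:00"]).tz_localize("UTC")}
        )
    )  # passes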
\n\n
[docs] @staticmethod\n def string_column(\n name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None\n ):\n """\n Simple constructor for PandasColumns that expresses constraints on string dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[ColumnDTypeFnConstraint(is_string_dtype)]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )
\n\n
[docs] @staticmethod\n def categorical_column(\n name,\n categories,\n of_types=frozenset({"category", "object"}),\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """\n Simple constructor for PandasColumns that expresses categorical constraints on specified dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n categories (List[Any]): The valid set of buckets that all values in the column must match.\n of_types (Optional[Union[str, Set[str]]]): The expected dtype[s] that your categories and values must\n abide by.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in\n the column ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the\n constraint will only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. Default to True.\n """\n of_types = {of_types} if isinstance(of_types, str) else of_types\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeInSetConstraint(of_types),\n CategoricalColumnConstraint(categories, ignore_missing_vals=ignore_missing_vals),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )
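For example (column name and buckets illustrative), a status column restricted to a fixed set of string values on an ``object`` dtype:

.. code-block:: python

    import pandas as pd

    from dagster_pandas.validation import PandasColumn

    # Categories passed as a set of string buckets.
    status_col = PandasColumn.categorical_column(
        "status", categories={"queued", "running", "done"}
    )

    status_col.validate(pd.DataFrame({"status": ["queued", "done"]}))  # passes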
\n\n\ndef validate_constraints(dataframe, pandas_columns=None, dataframe_constraints=None):\n dataframe = check.inst_param(dataframe, "dataframe", DataFrame)\n pandas_columns = check.opt_list_param(\n pandas_columns, "column_constraints", of_type=PandasColumn\n )\n dataframe_constraints = check.opt_list_param(\n dataframe_constraints, "dataframe_constraints", of_type=DataFrameConstraint\n )\n\n if pandas_columns:\n for column in pandas_columns:\n column.validate(dataframe)\n\n if dataframe_constraints:\n for dataframe_constraint in dataframe_constraints:\n dataframe_constraint.validate(dataframe)\n
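``validate_constraints`` can also be called directly, e.g. from a unit test, without going through a DagsterType type check. A minimal sketch with illustrative column names:

.. code-block:: python

    import pandas as pd

    from dagster_pandas.validation import PandasColumn, validate_constraints

    validate_constraints(
        pd.DataFrame({"bike_id": [1, 2], "color": ["red", "blue"]}),
        pandas_columns=[
            PandasColumn.integer_column("bike_id", min_value=0),
            PandasColumn.string_column("color"),
        ],
    )  # raises ConstraintViolationException on failure; returns None on success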
", "current_page_name": "_modules/dagster_pandas/validation", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_pandas.validation"}}, "dagster_postgres": {"event_log": {"event_log": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_postgres.event_log.event_log

\nimport logging\nimport threading\nfrom collections import defaultdict\nfrom typing import Callable, List, MutableMapping, Optional\n\nimport sqlalchemy as db\n\nfrom dagster import check, seven\nfrom dagster.core.events.log import EventLogEntry\nfrom dagster.core.storage.event_log import (\n    AssetKeyTable,\n    SqlEventLogStorage,\n    SqlEventLogStorageMetadata,\n    SqlEventLogStorageTable,\n)\nfrom dagster.core.storage.event_log.migration import ASSET_KEY_INDEX_COLS\nfrom dagster.core.storage.event_log.polling_event_watcher import CallbackAfterCursor\nfrom dagster.core.storage.sql import create_engine, run_alembic_upgrade, stamp_alembic_rev\nfrom dagster.serdes import (\n    ConfigurableClass,\n    ConfigurableClassData,\n    deserialize_json_to_dagster_namedtuple,\n    serialize_dagster_namedtuple,\n)\nfrom dagster.utils import utc_datetime_from_timestamp\n\nfrom ..pynotify import await_pg_notifications\nfrom ..utils import (\n    create_pg_connection,\n    pg_alembic_config,\n    pg_config,\n    pg_statement_timeout,\n    pg_url_from_config,\n    retry_pg_connection_fn,\n    retry_pg_creation_fn,\n)\n\nCHANNEL_NAME = "run_events"\n\n\n
[docs]class PostgresEventLogStorage(SqlEventLogStorage, ConfigurableClass):\n """Postgres-backed event log storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagit`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n To use Postgres for event log storage, you can add a block such as the following to your\n ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg.yaml\n :caption: dagster.yaml\n :lines: 12-21\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n\n """\n\n def __init__(self, postgres_url, should_autocreate_tables=True, inst_data=None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.postgres_url = check.str_param(postgres_url, "postgres_url")\n self.should_autocreate_tables = check.bool_param(\n should_autocreate_tables, "should_autocreate_tables"\n )\n\n self._disposed = False\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.postgres_url, isolation_level="AUTOCOMMIT", poolclass=db.pool.NullPool\n )\n\n # lazy init\n self._event_watcher: Optional[PostgresEventWatcher] = None\n\n self._secondary_index_cache = {}\n\n table_names = retry_pg_connection_fn(lambda: db.inspect(self._engine).get_table_names())\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared with other storage classes)\n if self.should_autocreate_tables and "event_logs" not in table_names:\n retry_pg_creation_fn(self._init_db)\n self.reindex_events()\n self.reindex_assets()\n\n super().__init__()\n\n def _init_db(self):\n with self._connect() as conn:\n with conn.begin():\n SqlEventLogStorageMetadata.create_all(conn)\n stamp_alembic_rev(pg_alembic_config(__file__), conn)\n\n def optimize_for_dagit(self, statement_timeout):\n # When running in dagit, hold an open connection and set statement_timeout\n self._engine = create_engine(\n self.postgres_url,\n isolation_level="AUTOCOMMIT",\n pool_size=1,\n connect_args={"options": pg_statement_timeout(statement_timeout)},\n )\n\n def upgrade(self):\n alembic_config = pg_alembic_config(__file__)\n with self._connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return pg_config()\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return PostgresEventLogStorage(\n inst_data=inst_data,\n postgres_url=pg_url_from_config(config_value),\n should_autocreate_tables=config_value.get("should_autocreate_tables", True),\n )\n\n @staticmethod\n def create_clean_storage(conn_string, should_autocreate_tables=True):\n engine = create_engine(\n conn_string, isolation_level="AUTOCOMMIT", poolclass=db.pool.NullPool\n )\n try:\n SqlEventLogStorageMetadata.drop_all(engine)\n finally:\n engine.dispose()\n\n return PostgresEventLogStorage(conn_string, should_autocreate_tables)\n\n def store_event(self, event):\n """Store an event corresponding to a pipeline run.\n Args:\n event (EventLogEntry): The event to store.\n """\n check.inst_param(event, "event", 
EventLogEntry)\n insert_event_statement = self.prepare_insert_event(event) # from SqlEventLogStorage.py\n with self._connect() as conn:\n result = conn.execute(\n insert_event_statement.returning(\n SqlEventLogStorageTable.c.run_id, SqlEventLogStorageTable.c.id\n )\n )\n res = result.fetchone()\n result.close()\n conn.execute(\n """NOTIFY {channel}, %s; """.format(channel=CHANNEL_NAME),\n (res[0] + "_" + str(res[1]),),\n )\n\n if (\n event.is_dagster_event\n and (\n event.dagster_event.is_step_materialization\n or event.dagster_event.is_asset_observation\n )\n and event.dagster_event.asset_key\n ):\n self.store_asset(event)\n\n def store_asset_observation(self, event):\n # last_materialization_timestamp is updated upon observation or materialization\n # See store_asset method in SqlEventLogStorage for more details\n if self.has_secondary_index(ASSET_KEY_INDEX_COLS):\n with self.index_connection() as conn:\n conn.execute(\n db.dialects.postgresql.insert(AssetKeyTable)\n .values(\n asset_key=event.dagster_event.asset_key.to_string(),\n last_materialization_timestamp=utc_datetime_from_timestamp(event.timestamp),\n )\n .on_conflict_do_update(\n index_elements=[AssetKeyTable.c.asset_key],\n set_=dict(\n last_materialization_timestamp=utc_datetime_from_timestamp(\n event.timestamp\n ),\n ),\n )\n )\n\n def store_asset_materialization(self, event):\n # last_materialization_timestamp is updated upon observation or materialization\n # See store_asset method in SqlEventLogStorage for more details\n materialization = event.dagster_event.step_materialization_data.materialization\n if self.has_secondary_index(ASSET_KEY_INDEX_COLS):\n with self.index_connection() as conn:\n conn.execute(\n db.dialects.postgresql.insert(AssetKeyTable)\n .values(\n asset_key=event.dagster_event.asset_key.to_string(),\n last_materialization=serialize_dagster_namedtuple(materialization),\n last_materialization_timestamp=utc_datetime_from_timestamp(event.timestamp),\n last_run_id=event.run_id,\n tags=seven.json.dumps(materialization.tags)\n if materialization.tags\n else None,\n )\n .on_conflict_do_update(\n index_elements=[AssetKeyTable.c.asset_key],\n set_=dict(\n last_materialization=serialize_dagster_namedtuple(materialization),\n last_materialization_timestamp=utc_datetime_from_timestamp(\n event.timestamp\n ),\n last_run_id=event.run_id,\n tags=seven.json.dumps(materialization.tags)\n if materialization.tags\n else None,\n ),\n )\n )\n\n else:\n with self.index_connection() as conn:\n conn.execute(\n db.dialects.postgresql.insert(AssetKeyTable)\n .values(\n asset_key=event.dagster_event.asset_key.to_string(),\n last_materialization=serialize_dagster_namedtuple(materialization),\n last_run_id=event.run_id,\n )\n .on_conflict_do_update(\n index_elements=[AssetKeyTable.c.asset_key],\n set_=dict(\n last_materialization=serialize_dagster_namedtuple(materialization),\n last_run_id=event.run_id,\n ),\n )\n )\n\n def _connect(self):\n return create_pg_connection(self._engine, __file__, "event log")\n\n def run_connection(self, run_id=None):\n return self._connect()\n\n def index_connection(self):\n return self._connect()\n\n def has_secondary_index(self, name):\n if name not in self._secondary_index_cache:\n self._secondary_index_cache[name] = super(\n PostgresEventLogStorage, self\n ).has_secondary_index(name)\n return self._secondary_index_cache[name]\n\n def enable_secondary_index(self, name):\n super(PostgresEventLogStorage, self).enable_secondary_index(name)\n if name in self._secondary_index_cache:\n del 
self._secondary_index_cache[name]\n\n def watch(self, run_id, start_cursor, callback):\n if self._event_watcher is None:\n self._event_watcher = PostgresEventWatcher(self.postgres_url, self._engine)\n\n self._event_watcher.watch_run(run_id, start_cursor, callback)\n\n def end_watch(self, run_id, handler):\n if self._event_watcher is None:\n return\n\n self._event_watcher.unwatch_run(run_id, handler)\n\n def __del__(self):\n # Keep the inherent limitations of __del__ in Python in mind!\n self.dispose()\n\n def dispose(self):\n if not self._disposed:\n self._disposed = True\n if self._event_watcher:\n self._event_watcher.close()
\n\n\nPOLLING_CADENCE = 0.25\n\n\ndef watcher_thread(\n conn_string: str,\n engine: db.engine.Engine,\n handlers_dict: MutableMapping[str, List[CallbackAfterCursor]],\n dict_lock: threading.Lock,\n watcher_thread_exit: threading.Event,\n watcher_thread_started: threading.Event,\n):\n for notif in await_pg_notifications(\n conn_string,\n channels=[CHANNEL_NAME],\n timeout=POLLING_CADENCE,\n yield_on_timeout=True,\n exit_event=watcher_thread_exit,\n started_event=watcher_thread_started,\n ):\n if notif is None:\n if watcher_thread_exit.is_set():\n break\n else:\n run_id, index_str = notif.payload.split("_")\n with dict_lock:\n if run_id not in handlers_dict:\n continue\n\n index = int(index_str)\n with dict_lock:\n handlers = handlers_dict.get(run_id, [])\n\n with engine.connect() as conn:\n cursor_res = conn.execute(\n db.select([SqlEventLogStorageTable.c.event]).where(\n SqlEventLogStorageTable.c.id == index\n ),\n )\n dagster_event: EventLogEntry = deserialize_json_to_dagster_namedtuple(\n cursor_res.scalar()\n )\n\n for callback_with_cursor in handlers:\n if callback_with_cursor.start_cursor < index:\n try:\n callback_with_cursor.callback(dagster_event)\n except Exception:\n logging.exception(\n "Exception in callback for event watch on run %s.", run_id\n )\n\n\nclass PostgresEventWatcher:\n def __init__(self, conn_string: str, engine: db.engine.Engine):\n self._conn_string: str = check.str_param(conn_string, "conn_string")\n self._engine = engine\n self._handlers_dict: MutableMapping[str, List[CallbackAfterCursor]] = defaultdict(list)\n self._dict_lock: threading.Lock = threading.Lock()\n self._watcher_thread_exit: Optional[threading.Event] = None\n self._watcher_thread_started: Optional[threading.Event] = None\n self._watcher_thread: Optional[threading.Thread] = None\n\n def watch_run(\n self,\n run_id: str,\n start_cursor: int,\n callback: Callable[[EventLogEntry], None],\n start_timeout=15,\n ):\n check.str_param(run_id, "run_id")\n check.int_param(start_cursor, "start_cursor")\n check.callable_param(callback, "callback")\n if not self._watcher_thread:\n self._watcher_thread_exit = threading.Event()\n self._watcher_thread_started = threading.Event()\n\n self._watcher_thread = threading.Thread(\n target=watcher_thread,\n args=(\n self._conn_string,\n self._engine,\n self._handlers_dict,\n self._dict_lock,\n self._watcher_thread_exit,\n self._watcher_thread_started,\n ),\n name="postgres-event-watch",\n )\n self._watcher_thread.daemon = True\n self._watcher_thread.start()\n\n # Wait until the watcher thread is actually listening before returning\n self._watcher_thread_started.wait(start_timeout)\n if not self._watcher_thread_started.is_set():\n raise Exception("Watcher thread never started")\n\n with self._dict_lock:\n self._handlers_dict[run_id].append(CallbackAfterCursor(start_cursor + 1, callback))\n\n def unwatch_run(self, run_id: str, handler: Callable[[EventLogEntry], None]):\n check.str_param(run_id, "run_id")\n check.callable_param(handler, "handler")\n with self._dict_lock:\n if run_id in self._handlers_dict:\n self._handlers_dict[run_id] = [\n callback_with_cursor\n for callback_with_cursor in self._handlers_dict[run_id]\n if callback_with_cursor.callback != handler\n ]\n if not self._handlers_dict[run_id]:\n del self._handlers_dict[run_id]\n\n def close(self):\n if self._watcher_thread:\n self._watcher_thread_exit.set()\n self._watcher_thread.join()\n self._watcher_thread_exit = None\n self._watcher_thread = None\n
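This storage is normally instantiated from ``dagster.yaml`` as the class docstring explains, but the ``create_clean_storage`` helper shown above can be convenient for throwaway test databases. A sketch, assuming a reachable Postgres instance; the connection URL is a placeholder:

.. code-block:: python

    from dagster_postgres.event_log.event_log import PostgresEventLogStorage

    # Drops any existing event-log tables, then returns a freshly initialized storage.
    storage = PostgresEventLogStorage.create_clean_storage(
        "postgresql://dagster:dagster@localhost:5432/dagster_test"  # placeholder URL
    )

    # ... exercise the storage (e.g. store_event), then release the watcher and engine:
    storage.dispose()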
", "current_page_name": "_modules/dagster_postgres/event_log/event_log", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_postgres.event_log.event_log"}}, "run_storage": {"run_storage": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_postgres.run_storage.run_storage

\nimport sqlalchemy as db\n\nfrom dagster import check\nfrom dagster.core.storage.runs import (\n    DaemonHeartbeatsTable,\n    InstanceInfo,\n    RunStorageSqlMetadata,\n    SqlRunStorage,\n)\nfrom dagster.core.storage.sql import create_engine, run_alembic_upgrade, stamp_alembic_rev\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData, serialize_dagster_namedtuple\nfrom dagster.utils import utc_datetime_from_timestamp\n\nfrom ..utils import (\n    create_pg_connection,\n    pg_alembic_config,\n    pg_config,\n    pg_statement_timeout,\n    pg_url_from_config,\n    retry_pg_connection_fn,\n    retry_pg_creation_fn,\n)\n\n\n
[docs]class PostgresRunStorage(SqlRunStorage, ConfigurableClass):\n """Postgres-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagit`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n To use Postgres for run storage, you can add a block such as the following to your\n ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg.yaml\n :caption: dagster.yaml\n :lines: 1-10\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n """\n\n def __init__(self, postgres_url, should_autocreate_tables=True, inst_data=None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.postgres_url = postgres_url\n self.should_autocreate_tables = check.bool_param(\n should_autocreate_tables, "should_autocreate_tables"\n )\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.postgres_url,\n isolation_level="AUTOCOMMIT",\n poolclass=db.pool.NullPool,\n )\n\n self._index_migration_cache = {}\n table_names = retry_pg_connection_fn(lambda: db.inspect(self._engine).get_table_names())\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared with other storage classes)\n if self.should_autocreate_tables and "runs" not in table_names:\n retry_pg_creation_fn(self._init_db)\n self.migrate()\n self.optimize()\n\n elif "instance_info" not in table_names:\n InstanceInfo.create(self._engine)\n\n super().__init__()\n\n def _init_db(self):\n with self.connect() as conn:\n with conn.begin():\n RunStorageSqlMetadata.create_all(conn)\n # This revision may be shared by any other dagster storage classes using the same DB\n stamp_alembic_rev(pg_alembic_config(__file__), conn)\n\n def optimize_for_dagit(self, statement_timeout):\n # When running in dagit, hold 1 open connection and set statement_timeout\n self._engine = create_engine(\n self.postgres_url,\n isolation_level="AUTOCOMMIT",\n pool_size=1,\n connect_args={"options": pg_statement_timeout(statement_timeout)},\n )\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return pg_config()\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return PostgresRunStorage(\n inst_data=inst_data,\n postgres_url=pg_url_from_config(config_value),\n should_autocreate_tables=config_value.get("should_autocreate_tables", True),\n )\n\n @staticmethod\n def create_clean_storage(postgres_url, should_autocreate_tables=True):\n engine = create_engine(\n postgres_url, isolation_level="AUTOCOMMIT", poolclass=db.pool.NullPool\n )\n try:\n RunStorageSqlMetadata.drop_all(engine)\n finally:\n engine.dispose()\n return PostgresRunStorage(postgres_url, should_autocreate_tables)\n\n def connect(self):\n return create_pg_connection(\n self._engine,\n __file__,\n "run",\n )\n\n def upgrade(self):\n with self.connect() as conn:\n run_alembic_upgrade(pg_alembic_config(__file__), conn)\n\n def has_built_index(self, migration_name):\n if migration_name not in self._index_migration_cache:\n self._index_migration_cache[migration_name] = super(\n 
PostgresRunStorage, self\n ).has_built_index(migration_name)\n return self._index_migration_cache[migration_name]\n\n def mark_index_built(self, migration_name):\n super(PostgresRunStorage, self).mark_index_built(migration_name)\n if migration_name in self._index_migration_cache:\n del self._index_migration_cache[migration_name]\n\n def add_daemon_heartbeat(self, daemon_heartbeat):\n with self.connect() as conn:\n\n # insert or update if already present, using postgres specific on_conflict\n conn.execute(\n db.dialects.postgresql.insert(DaemonHeartbeatsTable)\n .values( # pylint: disable=no-value-for-parameter\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_type=daemon_heartbeat.daemon_type,\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_dagster_namedtuple(daemon_heartbeat),\n )\n .on_conflict_do_update(\n index_elements=[DaemonHeartbeatsTable.c.daemon_type],\n set_={\n "timestamp": utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n "daemon_id": daemon_heartbeat.daemon_id,\n "body": serialize_dagster_namedtuple(daemon_heartbeat),\n },\n )\n )
\n
", "current_page_name": "_modules/dagster_postgres/run_storage/run_storage", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_postgres.run_storage.run_storage"}}, "schedule_storage": {"schedule_storage": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_postgres.schedule_storage.schedule_storage

\nimport sqlalchemy as db\n\nfrom dagster import check\nfrom dagster.core.storage.schedules import ScheduleStorageSqlMetadata, SqlScheduleStorage\nfrom dagster.core.storage.sql import create_engine, run_alembic_upgrade, stamp_alembic_rev\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\n\nfrom ..utils import (\n    create_pg_connection,\n    pg_alembic_config,\n    pg_config,\n    pg_statement_timeout,\n    pg_url_from_config,\n    retry_pg_connection_fn,\n    retry_pg_creation_fn,\n)\n\n\n
[docs]class PostgresScheduleStorage(SqlScheduleStorage, ConfigurableClass):\n """Postgres-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagit`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n To use Postgres for schedule storage, you can add a block such as the following to your\n ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg.yaml\n :caption: dagster.yaml\n :lines: 23-32\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n """\n\n def __init__(self, postgres_url, should_autocreate_tables=True, inst_data=None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.postgres_url = postgres_url\n self.should_autocreate_tables = check.bool_param(\n should_autocreate_tables, "should_autocreate_tables"\n )\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.postgres_url, isolation_level="AUTOCOMMIT", poolclass=db.pool.NullPool\n )\n\n table_names = retry_pg_connection_fn(lambda: db.inspect(self._engine).get_table_names())\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared with other storage classes)\n missing_main_table = "schedules" not in table_names and "jobs" not in table_names\n if self.should_autocreate_tables and missing_main_table:\n retry_pg_creation_fn(self._init_db)\n\n super().__init__()\n\n def _init_db(self):\n with self.connect() as conn:\n with conn.begin():\n ScheduleStorageSqlMetadata.create_all(conn)\n stamp_alembic_rev(pg_alembic_config(__file__), conn)\n\n # mark all the data migrations as applied\n self.migrate()\n self.optimize()\n\n def optimize_for_dagit(self, statement_timeout):\n # When running in dagit, hold an open connection and set statement_timeout\n self._engine = create_engine(\n self.postgres_url,\n isolation_level="AUTOCOMMIT",\n pool_size=1,\n connect_args={"options": pg_statement_timeout(statement_timeout)},\n )\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return pg_config()\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return PostgresScheduleStorage(\n inst_data=inst_data,\n postgres_url=pg_url_from_config(config_value),\n should_autocreate_tables=config_value.get("should_autocreate_tables", True),\n )\n\n @staticmethod\n def create_clean_storage(postgres_url, should_autocreate_tables=True):\n engine = create_engine(\n postgres_url, isolation_level="AUTOCOMMIT", poolclass=db.pool.NullPool\n )\n try:\n ScheduleStorageSqlMetadata.drop_all(engine)\n finally:\n engine.dispose()\n return PostgresScheduleStorage(postgres_url, should_autocreate_tables)\n\n def connect(self, run_id=None): # pylint: disable=arguments-differ, unused-argument\n return create_pg_connection(self._engine, __file__, "schedule")\n\n def upgrade(self):\n alembic_config = pg_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_upgrade(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster_postgres/schedule_storage/schedule_storage", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_postgres.schedule_storage.schedule_storage"}}}, "dagster_prometheus": {"resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_prometheus.resources

\nimport prometheus_client\nfrom prometheus_client.exposition import default_handler\n\nfrom dagster import Field, check, resource\n\n\n
[docs]class PrometheusResource:\n """Integrates with Prometheus via the prometheus_client library."""\n\n def __init__(self, gateway, timeout):\n self.gateway = check.str_param(gateway, "gateway")\n self.timeout = check.opt_int_param(timeout, "timeout")\n self.registry = prometheus_client.CollectorRegistry()\n\n def push_to_gateway(self, job, grouping_key=None, handler=default_handler):\n """Push metrics to the given pushgateway.\n `job` is the job label to be attached to all pushed metrics\n `grouping_key` please see the pushgateway documentation for details.\n Defaults to None\n `handler` is an optional function which can be provided to perform\n requests to the 'gateway'.\n Defaults to None, in which case an http or https request\n will be carried out by a default handler.\n If not None, the argument must be a function which accepts\n the following arguments:\n url, method, timeout, headers, and content\n May be used to implement additional functionality not\n supported by the built-in default handler (such as SSL\n client certicates, and HTTP authentication mechanisms).\n 'url' is the URL for the request, the 'gateway' argument\n described earlier will form the basis of this URL.\n 'method' is the HTTP method which should be used when\n carrying out the request.\n 'timeout' requests not successfully completed after this\n many seconds should be aborted. If timeout is None, then\n the handler should not set a timeout.\n 'headers' is a list of ("header-name","header-value") tuples\n which must be passed to the pushgateway in the form of HTTP\n request headers.\n The function should raise an exception (e.g. IOError) on\n failure.\n 'content' is the data which should be used to form the HTTP\n Message Body.\n This overwrites all metrics with the same job and grouping_key.\n This uses the PUT HTTP method."""\n prometheus_client.push_to_gateway(\n gateway=self.gateway,\n job=job,\n registry=self.registry,\n grouping_key=grouping_key,\n timeout=self.timeout,\n handler=handler,\n )\n\n def pushadd_to_gateway(self, job, grouping_key=None, handler=default_handler):\n """PushAdd metrics to the given pushgateway.\n `job` is the job label to be attached to all pushed metrics\n `registry` is an instance of CollectorRegistry\n `grouping_key` please see the pushgateway documentation for details.\n Defaults to None\n `handler` is an optional function which can be provided to perform\n requests to the 'gateway'.\n Defaults to None, in which case an http or https request\n will be carried out by a default handler.\n See the 'prometheus_client.push_to_gateway' documentation\n for implementation requirements.\n This replaces metrics with the same name, job and grouping_key.\n This uses the POST HTTP method."""\n prometheus_client.pushadd_to_gateway(\n gateway=self.gateway,\n job=job,\n registry=self.registry,\n grouping_key=grouping_key,\n timeout=self.timeout,\n handler=handler,\n )\n\n def delete_from_gateway(self, job, grouping_key=None, handler=default_handler):\n """Delete metrics from the given pushgateway.\n `job` is the job label to be attached to all pushed metrics\n `grouping_key` please see the pushgateway documentation for details.\n Defaults to None\n `handler` is an optional function which can be provided to perform\n requests to the 'gateway'.\n Defaults to None, in which case an http or https request\n will be carried out by a default handler.\n See the 'prometheus_client.push_to_gateway' documentation\n for implementation requirements.\n This deletes metrics with the given job and 
grouping_key.\n This uses the DELETE HTTP method."""\n prometheus_client.delete_from_gateway(\n gateway=self.gateway,\n job=job,\n grouping_key=grouping_key,\n timeout=self.timeout,\n handler=handler,\n )
\n\n\n
[docs]@resource(\n {\n "gateway": Field(\n str,\n description="the url for your push gateway. Either of the form "\n "'http://pushgateway.local', or 'pushgateway.local'. "\n "Scheme defaults to 'http' if none is provided",\n ),\n "timeout": Field(\n int,\n default_value=30,\n is_required=False,\n description="is how long delete will attempt to connect before giving up. "\n "Defaults to 30s.",\n ),\n },\n description="""This resource is for sending metrics to a Prometheus Pushgateway.""",\n)\ndef prometheus_resource(context):\n return PrometheusResource(\n gateway=context.resource_config["gateway"], timeout=context.resource_config["timeout"]\n )
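A sketch of an op pushing a counter through this resource; the op/job names, metric, and gateway address are illustrative:

.. code-block:: python

    from prometheus_client import Counter

    from dagster import job, op
    from dagster_prometheus.resources import prometheus_resource

    @op(required_resource_keys={"prometheus"})
    def record_run(context):
        prom = context.resources.prometheus
        # Register the counter on the resource's CollectorRegistry, then push it.
        runs = Counter("my_runs_total", "Completed runs", registry=prom.registry)
        runs.inc()
        prom.push_to_gateway(job="my_dagster_job")

    @job(
        resource_defs={
            "prometheus": prometheus_resource.configured(
                {"gateway": "http://pushgateway.local:9091"}
            )
        }
    )
    def metrics_job():
        record_run()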
\n
", "current_page_name": "_modules/dagster_prometheus/resources", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_prometheus.resources"}}, "dagster_pyspark": {"resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_pyspark.resources

\nfrom dagster_spark.configs_spark import spark_config\nfrom dagster_spark.utils import flatten_dict\nfrom pyspark.sql import SparkSession\n\nfrom dagster import check, resource\n\n\ndef spark_session_from_config(spark_conf=None):\n    spark_conf = check.opt_dict_param(spark_conf, "spark_conf")\n    builder = SparkSession.builder\n    flat = flatten_dict(spark_conf)\n    for key, value in flat:\n        builder = builder.config(key, value)\n\n    return builder.getOrCreate()\n\n\nclass PySparkResource:\n    def __init__(self, spark_conf):\n        self._spark_session = spark_session_from_config(spark_conf)\n\n    @property\n    def spark_session(self):\n        return self._spark_session\n\n    @property\n    def spark_context(self):\n        return self.spark_session.sparkContext\n\n\n
[docs]@resource({"spark_conf": spark_config()})\ndef pyspark_resource(init_context):\n """This resource provides access to a PySpark SparkSession for executing PySpark code within Dagster.\n\n Example:\n\n .. code-block:: python\n\n @op(required_resource_keys={"pyspark"})\n def my_op(context):\n spark_session = context.resources.pyspark.spark_session\n dataframe = spark_session.read.json("examples/src/main/resources/people.json")\n\n my_pyspark_resource = pyspark_resource.configured(\n {"spark_conf": {"spark.executor.memory": "2g"}}\n )\n\n @job(resource_defs={"pyspark": my_pyspark_resource})\n def my_spark_job():\n my_op()\n\n """\n return PySparkResource(init_context.resource_config["spark_conf"])
\n
", "current_page_name": "_modules/dagster_pyspark/resources", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_pyspark.resources"}}, "dagster_shell": {"solids": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_shell.solids

\nimport os\n\nfrom dagster import (\n    Enum,\n    EnumValue,\n    Failure,\n    Field,\n    InputDefinition,\n    Noneable,\n    Nothing,\n    OutputDefinition,\n    Permissive,\n    check,\n    op,\n    solid,\n)\n\nfrom .utils import execute, execute_script_file\n\n\ndef shell_op_config():\n    return {\n        "env": Field(\n            Noneable(Permissive()),\n            is_required=False,\n            description="An optional dict of environment variables to pass to the subprocess.",\n        ),\n        "output_logging": Field(\n            Enum(\n                name="OutputType",\n                enum_values=[\n                    EnumValue("STREAM", description="Stream script stdout/stderr."),\n                    EnumValue(\n                        "BUFFER",\n                        description="Buffer shell script stdout/stderr, then log upon completion.",\n                    ),\n                    EnumValue("NONE", description="No logging"),\n                ],\n            ),\n            is_required=False,\n            default_value="BUFFER",\n        ),\n        "cwd": Field(\n            Noneable(str),\n            default_value=None,\n            is_required=False,\n            description="Working directory in which to execute shell script",\n        ),\n    }\n\n\ndef core_shell(dagster_decorator, decorator_name):\n    @dagster_decorator(\n        name=f"shell_{decorator_name}",\n        description=(\n            f"This {decorator_name} executes a shell command it receives as input.\\n\\n"\n            f"This {decorator_name} is suitable for uses where the command to execute is generated dynamically by "\n            f"upstream {decorator_name}. If you know the command to execute at pipeline construction time, "\n            f"consider `shell_command_{decorator_name}` instead."\n        ),\n        input_defs=[InputDefinition("shell_command", str)],\n        output_defs=[OutputDefinition(str, "result")],\n        config_schema=shell_op_config(),\n    )\n    def shell_fn(context, shell_command):\n        op_config = context.op_config.copy()\n        if not op_config.get("env"):\n            op_config["env"] = os.environ.copy()\n        output, return_code = execute(shell_command=shell_command, log=context.log, **op_config)\n\n        if return_code:\n            raise Failure(\n                description="Shell command execution failed with output: {output}".format(\n                    output=output\n                )\n            )\n\n        return output\n\n    return shell_fn\n\n\nshell_solid = core_shell(solid, "solid")\nshell_op = core_shell(op, "op")\n\n\n
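# A minimal usage sketch (user-side code): feeding a dynamically built command string into
# `shell_op`, which consumes it as its `shell_command` input. Op and job names are placeholders.
from dagster import job, op
from dagster_shell.solids import shell_op


@op
def build_command() -> str:
    return "ls -la /tmp"


@job
def dynamic_shell_job():
    shell_op(build_command())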
[docs]def create_shell_command_op(\n shell_command,\n name,\n description=None,\n required_resource_keys=None,\n tags=None,\n):\n """This function is a factory that constructs ops to execute a shell command.\n\n Note that you can only use ``shell_command_op`` if you know the command you'd like to execute\n at pipeline construction time. If you'd like to construct shell commands dynamically during\n pipeline execution and pass them between ops, you should use ``shell_op`` instead.\n\n Examples:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-shell/dagster_shell_tests/example_shell_command_op.py\n :language: python\n\n\n Args:\n shell_command (str): The shell command that the constructed op will execute.\n name (str): The name of the constructed op.\n description (Optional[str]): Human-readable description of this op.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by this op.\n Setting this ensures that resource spin up for the required resources will occur before\n the shell command is executed.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the op. Frameworks may\n expect and require certain metadata to be attached to a op. Users should generally\n not set metadata directly. Values that are not strings will be json encoded and must meet\n the criteria that `json.loads(json.dumps(value)) == value`.\n\n Raises:\n Failure: Raised when the shell command returns a non-zero exit code.\n\n Returns:\n OpDefinition: Returns the constructed op definition.\n """\n return core_create_shell_command(\n op,\n shell_command=shell_command,\n name=name,\n description=description,\n required_resource_keys=required_resource_keys,\n tags=tags,\n )
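# A minimal usage sketch standing in for the literalinclude above (not rendered on this page):
# constructing an op from a fixed command. The command and names are illustrative placeholders.
from dagster import job
from dagster_shell.solids import create_shell_command_op

echo_op = create_shell_command_op('echo "hello shell"', name="echo_op")


@job
def echo_job():
    echo_op()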
\n\n\n
[docs]def create_shell_command_solid(\n shell_command,\n name,\n description=None,\n required_resource_keys=None,\n tags=None,\n):\n """This function is a factory that constructs solids to execute a shell command.\n\n Note that you can only use ``shell_command_solid`` if you know the command you'd like to execute\n at pipeline construction time. If you'd like to construct shell commands dynamically during\n pipeline execution and pass them between solids, you should use ``shell_solid`` instead.\n\n Examples:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-shell/dagster_shell_tests/example_shell_command_solid.py\n :language: python\n\n\n Args:\n shell_command (str): The shell command that the constructed solid will execute.\n name (str): The name of the constructed solid.\n description (Optional[str]): Human-readable description of this solid.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by this solid.\n Setting this ensures that resource spin up for the required resources will occur before\n the shell command is executed.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the solid. Frameworks may\n expect and require certain metadata to be attached to a solid. Users should generally\n not set metadata directly. Values that are not strings will be json encoded and must meet\n the criteria that `json.loads(json.dumps(value)) == value`.\n\n Raises:\n Failure: Raised when the shell command returns a non-zero exit code.\n\n Returns:\n SolidDefinition: Returns the constructed solid definition.\n """\n return core_create_shell_command(\n solid,\n shell_command=shell_command,\n name=name,\n description=description,\n required_resource_keys=required_resource_keys,\n tags=tags,\n )
\n\n\ndef core_create_shell_command(\n dagster_decorator,\n shell_command,\n name,\n description=None,\n required_resource_keys=None,\n tags=None,\n):\n check.str_param(shell_command, "shell_command")\n name = check.str_param(name, "name")\n\n @dagster_decorator(\n name=name,\n description=description,\n input_defs=[InputDefinition("start", Nothing)],\n output_defs=[OutputDefinition(str, "result")],\n config_schema=shell_op_config(),\n required_resource_keys=required_resource_keys,\n tags=tags,\n )\n def _shell_fn(context):\n op_config = context.op_config.copy()\n if not op_config.get("env"):\n op_config["env"] = os.environ.copy()\n output, return_code = execute(shell_command=shell_command, log=context.log, **op_config)\n\n if return_code:\n raise Failure(\n description="Shell command execution failed with output: {output}".format(\n output=output\n )\n )\n\n return output\n\n return _shell_fn\n\n\n
[docs]def create_shell_script_op(\n shell_script_path, name="create_shell_script_op", input_defs=None, **kwargs\n):\n """This function is a factory which constructs an op that will execute a shell command read\n from a script file.\n\n Any kwargs passed to this function will be passed along to the underlying :func:`@op\n <dagster.op>` decorator. However, note that overriding ``config`` or ``output_defs`` is not\n supported.\n\n You might consider using :func:`@graph <dagster.graph>` to wrap this op\n in the cases where you'd like to configure the shell op with different config fields.\n\n\n Examples:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-shell/dagster_shell_tests/example_shell_script_op.py\n :language: python\n\n\n Args:\n shell_script_path (str): The script file to execute.\n name (str, optional): The name of this op. Defaults to "create_shell_script_op".\n input_defs (List[InputDefinition], optional): input definitions for the op. Defaults to\n a single Nothing input.\n\n Raises:\n Failure: Raised when the shell command returns a non-zero exit code.\n\n Returns:\n OpDefinition: Returns the constructed op definition.\n """\n return core_create_shell_script(\n dagster_decorator=op,\n decorator_name="op",\n shell_script_path=shell_script_path,\n name=name,\n input_defs=input_defs,\n **kwargs,\n )
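# A minimal usage sketch standing in for the literalinclude above (not rendered on this page):
# building an op from an existing script file. The script path and names are placeholders.
from dagster import job
from dagster_shell.solids import create_shell_script_op

hello_script_op = create_shell_script_op("scripts/hello.sh", name="hello_script_op")


@job
def hello_script_job():
    hello_script_op()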
\n\n\n
[docs]def create_shell_script_solid(\n shell_script_path, name="create_shell_script_solid", input_defs=None, **kwargs\n):\n """This function is a factory which constructs a solid that will execute a shell command read\n from a script file.\n\n Any kwargs passed to this function will be passed along to the underlying :func:`@solid\n <dagster.solid>` decorator. However, note that overriding ``config`` or ``output_defs`` is not\n supported.\n\n You might consider using :func:`@composite_solid <dagster.composite_solid>` to wrap this solid\n in the cases where you'd like to configure the shell solid with different config fields.\n\n\n Examples:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-shell/dagster_shell_tests/example_shell_script_solid.py\n :language: python\n\n\n Args:\n shell_script_path (str): The script file to execute.\n name (str, optional): The name of this solid. Defaults to "create_shell_script_solid".\n input_defs (List[InputDefinition], optional): input definitions for the solid. Defaults to\n a single Nothing input.\n\n Raises:\n Failure: Raised when the shell command returns a non-zero exit code.\n\n Returns:\n SolidDefinition: Returns the constructed solid definition.\n """\n return core_create_shell_script(\n dagster_decorator=solid,\n decorator_name="solid",\n shell_script_path=shell_script_path,\n name=name,\n input_defs=input_defs,\n **kwargs,\n )
\n\n\ndef core_create_shell_script(\n dagster_decorator,\n decorator_name,\n shell_script_path,\n name="create_shell_script_solid",\n input_defs=None,\n **kwargs,\n):\n check.str_param(shell_script_path, "shell_script_path")\n name = check.str_param(name, "name")\n check.opt_list_param(input_defs, "input_defs", of_type=InputDefinition)\n\n if "output_defs" in kwargs:\n raise TypeError(f"Overriding output_defs for shell {decorator_name} is not supported.")\n\n if "config" in kwargs:\n raise TypeError(f"Overriding config for shell {decorator_name} is not supported.")\n\n @dagster_decorator(\n name=name,\n description=kwargs.pop("description", f"A {decorator_name} to invoke a shell command."),\n input_defs=input_defs or [InputDefinition("start", Nothing)],\n output_defs=[OutputDefinition(str, "result")],\n config_schema=shell_op_config(),\n **kwargs,\n )\n def _shell_script_fn(context):\n op_config = context.op_config.copy()\n if not op_config.get("env"):\n op_config["env"] = os.environ.copy()\n output, return_code = execute_script_file(\n shell_script_path=shell_script_path, log=context.log, **op_config\n )\n\n if return_code:\n raise Failure(\n description="Shell command execution failed with output: {output}".format(\n output=output\n )\n )\n\n return output\n\n return _shell_script_fn\n
", "current_page_name": "_modules/dagster_shell/solids", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_shell.solids"}}, "dagster_slack": {"hooks": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_slack.hooks

\nfrom typing import Callable, Optional\n\nfrom dagster.core.definitions import failure_hook, success_hook\nfrom dagster.core.execution.context.hook import HookContext\n\n\ndef _default_status_message(context: HookContext, status: str) -> str:\n    return "Solid {solid_name} on pipeline {pipeline_name} {status}!\\nRun ID: {run_id}".format(\n        solid_name=context.solid.name,\n        pipeline_name=context.pipeline_name,\n        run_id=context.run_id,\n        status=status,\n    )\n\n\ndef _default_failure_message(context: HookContext) -> str:\n    return _default_status_message(context, status="failed")\n\n\ndef _default_success_message(context: HookContext) -> str:\n    return _default_status_message(context, status="succeeded")\n\n\n
[docs]def slack_on_failure(\n channel: str,\n message_fn: Callable[[HookContext], str] = _default_failure_message,\n dagit_base_url: Optional[str] = None,\n):\n """Create a hook on step failure events that will message the given Slack channel.\n\n Args:\n channel (str): The channel to send the message to (e.g. "#my_channel")\n message_fn (Optional(Callable[[HookContext], str])): Function which takes in the HookContext\n outputs the message you want to send.\n dagit_base_url: (Optional[str]): The base url of your Dagit instance. Specify this to allow\n messages to include deeplinks to the specific pipeline run that triggered the hook.\n\n Examples:\n .. code-block:: python\n\n @slack_on_failure("#foo", dagit_base_url="http://localhost:3000")\n @job(...)\n def my_job():\n pass\n\n .. code-block:: python\n\n def my_message_fn(context: HookContext) -> str:\n return f"Op {context.op} failed!"\n\n @op\n def an_op(context):\n pass\n\n @job(...)\n def my_job():\n an_op.with_hooks(hook_defs={slack_on_failure("#foo", my_message_fn)})\n\n """\n\n @failure_hook(required_resource_keys={"slack"})\n def _hook(context: HookContext):\n text = message_fn(context)\n if dagit_base_url:\n text += "\\n<{base_url}/instance/runs/{run_id}|View in Dagit>".format(\n base_url=dagit_base_url, run_id=context.run_id\n )\n\n context.resources.slack.chat_postMessage(channel=channel, text=text)\n\n return _hook
\n\n\n
[docs]def slack_on_success(\n channel: str,\n message_fn: Callable[[HookContext], str] = _default_success_message,\n dagit_base_url: Optional[str] = None,\n):\n """Create a hook on step success events that will message the given Slack channel.\n\n Args:\n channel (str): The channel to send the message to (e.g. "#my_channel")\n message_fn (Optional(Callable[[HookContext], str])): Function which takes in the HookContext\n outputs the message you want to send.\n dagit_base_url: (Optional[str]): The base url of your Dagit instance. Specify this to allow\n messages to include deeplinks to the specific pipeline run that triggered the hook.\n\n Examples:\n .. code-block:: python\n\n @slack_on_success("#foo", dagit_base_url="http://localhost:3000")\n @job(...)\n def my_job():\n pass\n\n .. code-block:: python\n\n def my_message_fn(context: HookContext) -> str:\n return f"Op {context.solid} worked!"\n\n @op\n def an_op(context):\n pass\n\n @job(...)\n def my_job():\n an_op.with_hooks(hook_defs={slack_on_success("#foo", my_message_fn)})\n\n """\n\n @success_hook(required_resource_keys={"slack"})\n def _hook(context: HookContext):\n text = message_fn(context)\n if dagit_base_url:\n text += "\\n<{base_url}/instance/runs/{run_id}|View in Dagit>".format(\n base_url=dagit_base_url, run_id=context.run_id\n )\n\n context.resources.slack.chat_postMessage(channel=channel, text=text)\n\n return _hook
\n
", "current_page_name": "_modules/dagster_slack/hooks", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_slack.hooks"}, "resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_slack.resources

\nfrom slack_sdk import WebClient\n\nfrom dagster import Field, StringSource, resource\n\n\n
[docs]@resource(\n {\n "token": Field(\n StringSource,\n description="""To configure access to the Slack API, you'll need an access\n token provisioned with access to your Slack workspace.\n\n Tokens are typically either user tokens or bot tokens. For programmatic posting\n to Slack from this resource, you probably want to provision and use a bot token.\n\n More in the Slack API documentation here: https://api.slack.com/docs/token-types\n """,\n )\n },\n description="This resource is for connecting to Slack",\n)\ndef slack_resource(context):\n """This resource is for connecting to Slack.\n\n The resource object is a `slack_sdk.WebClient`.\n\n By configuring this Slack resource, you can post messages to Slack from any Dagster solid:\n\n Examples:\n\n .. code-block:: python\n\n import os\n\n from dagster import job, op\n from dagster_slack import slack_resource\n\n\n @op(required_resource_keys={'slack'})\n def slack_op(context):\n context.resources.slack.chat_postMessage(channel='#noise', text=':wave: hey there!')\n\n @job(resource_defs={'slack': slack_resource})\n def slack_job():\n slack_op()\n\n slack_job.execute_in_process(\n run_config={'resources': {'slack': {'config': {'token': os.getenv('SLACK_TOKEN')}}}}\n )\n\n """\n return WebClient(context.resource_config.get("token"))
\n
", "current_page_name": "_modules/dagster_slack/resources", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_slack.resources"}, "sensors": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_slack.sensors

\nfrom typing import Any, Callable, Dict, List, Optional, Tuple, Union\n\nfrom slack_sdk import WebClient\n\nfrom dagster import DefaultSensorStatus\nfrom dagster.core.definitions import GraphDefinition, PipelineDefinition\nfrom dagster.core.definitions.run_status_sensor_definition import (\n    PipelineFailureSensorContext,\n    RunFailureSensorContext,\n    pipeline_failure_sensor,\n    run_failure_sensor,\n)\n\n\ndef _build_slack_blocks_and_text(\n    context: RunFailureSensorContext,\n    text_fn: Callable[[RunFailureSensorContext], str],\n    blocks_fn: Optional[Callable[[RunFailureSensorContext], List[Dict]]],\n    dagit_base_url: Optional[str],\n) -> Tuple[List[Dict[str, Any]], str]:\n    blocks: List[Dict[str, Any]] = [\n        {\n            "type": "section",\n            "text": {\n                "type": "mrkdwn",\n                "text": f'*Job "{context.pipeline_run.pipeline_name}" failed. `{context.pipeline_run.run_id.split("-")[0]}`*',\n            },\n        },\n    ]\n    main_body_text = text_fn(context)\n\n    if blocks_fn:\n        blocks.extend(blocks_fn(context))\n    else:\n        blocks.append(\n            {\n                "type": "section",\n                "text": {"type": "mrkdwn", "text": main_body_text},\n            },\n        )\n\n    if dagit_base_url:\n        blocks.append(\n            {\n                "type": "actions",\n                "elements": [\n                    {\n                        "type": "button",\n                        "text": {"type": "plain_text", "text": "View in Dagit"},\n                        "url": f"{dagit_base_url}/instance/runs/{context.pipeline_run.run_id}",\n                    }\n                ],\n            }\n        )\n    return blocks, main_body_text\n\n\ndef _default_failure_message_text_fn(context: PipelineFailureSensorContext) -> str:\n    return f"Error: ```{context.failure_event.message}```"\n\n\n
[docs]def make_slack_on_pipeline_failure_sensor(\n channel: str,\n slack_token: str,\n text_fn: Callable[[PipelineFailureSensorContext], str] = _default_failure_message_text_fn,\n blocks_fn: Optional[Callable[[PipelineFailureSensorContext], List[Dict]]] = None,\n pipeline_selection: Optional[List[str]] = None,\n name: Optional[str] = None,\n dagit_base_url: Optional[str] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n):\n """Create a sensor on pipeline failures that will message the given Slack channel.\n\n Args:\n channel (str): The channel to send the message to (e.g. "#my_channel")\n slack_token (str): The slack token.\n Tokens are typically either user tokens or bot tokens. More in the Slack API\n documentation here: https://api.slack.com/docs/token-types\n text_fn (Optional(Callable[[PipelineFailureSensorContext], str])): Function which\n takes in the ``PipelineFailureSensorContext`` and outputs the message you want to send.\n Defaults to a text message that contains error message, pipeline name, and run ID.\n The usage of the `text_fn` changes depending on whether you're using `blocks_fn`. If you\n are using `blocks_fn`, this is used as a fallback string to display in notifications. If\n you aren't, this is the main body text of the message. It can be formatted as plain text,\n or with mrkdwn.\n See more details in https://api.slack.com/methods/chat.postMessage#text_usage\n blocks_fn (Callable[[PipelineFailureSensorContext], List[Dict]]): Function which takes in\n the ``PipelineFailureSensorContext`` and outputs the message blocks you want to send.\n See information about Blocks in https://api.slack.com/reference/block-kit/blocks\n pipeline_selection (Optional[List[str]]): Names of the pipelines that will be monitored by\n this failure sensor. Defaults to None, which means the alert will be sent when any\n pipeline in the repository fails.\n name: (Optional[str]): The name of the sensor. Defaults to "slack_on_pipeline_failure".\n dagit_base_url: (Optional[str]): The base url of your Dagit instance. Specify this to allow\n messages to include deeplinks to the failed pipeline run.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n\n Examples:\n\n .. code-block:: python\n\n slack_on_pipeline_failure = make_slack_on_pipeline_failure_sensor(\n "#my_channel",\n os.getenv("MY_SLACK_TOKEN")\n )\n\n @repository\n def my_repo():\n return [my_pipeline + slack_on_pipeline_failure]\n\n .. code-block:: python\n\n def my_message_fn(context: PipelineFailureSensorContext) -> str:\n return "Pipeline {pipeline_name} failed! Error: {error}".format(\n pipeline_name=context.pipeline_run.pipeline_name,\n error=context.failure_event.message,\n )\n\n slack_on_pipeline_failure = make_slack_on_pipeline_failure_sensor(\n channel="#my_channel",\n slack_token=os.getenv("MY_SLACK_TOKEN"),\n message_fn=my_message_fn,\n dagit_base_url="http://mycoolsite.com",\n )\n\n\n """\n\n slack_client = WebClient(token=slack_token)\n\n @pipeline_failure_sensor(\n name=name, pipeline_selection=pipeline_selection, default_status=default_status\n )\n def slack_on_pipeline_failure(context: PipelineFailureSensorContext):\n\n blocks, main_body_text = _build_slack_blocks_and_text(\n context=context, text_fn=text_fn, blocks_fn=blocks_fn, dagit_base_url=dagit_base_url\n )\n\n slack_client.chat_postMessage(channel=channel, blocks=blocks, text=main_body_text)\n\n return slack_on_pipeline_failure
\n\n\n
[docs]def make_slack_on_run_failure_sensor(\n channel: str,\n slack_token: str,\n text_fn: Callable[[RunFailureSensorContext], str] = _default_failure_message_text_fn,\n blocks_fn: Optional[Callable[[RunFailureSensorContext], List[Dict]]] = None,\n name: Optional[str] = None,\n dagit_base_url: Optional[str] = None,\n job_selection: Optional[List[Union[PipelineDefinition, GraphDefinition]]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n):\n """Create a sensor on job failures that will message the given Slack channel.\n\n Args:\n channel (str): The channel to send the message to (e.g. "#my_channel")\n slack_token (str): The slack token.\n Tokens are typically either user tokens or bot tokens. More in the Slack API\n documentation here: https://api.slack.com/docs/token-types\n text_fn (Optional(Callable[[RunFailureSensorContext], str])): Function which\n takes in the ``RunFailureSensorContext`` and outputs the message you want to send.\n Defaults to a text message that contains error message, job name, and run ID.\n The usage of the `text_fn` changes depending on whether you're using `blocks_fn`. If you\n are using `blocks_fn`, this is used as a fallback string to display in notifications. If\n you aren't, this is the main body text of the message. It can be formatted as plain text,\n or with mrkdwn.\n See more details in https://api.slack.com/methods/chat.postMessage#text_usage\n blocks_fn (Callable[[RunFailureSensorContext], List[Dict]]): Function which takes in\n the ``RunFailureSensorContext`` and outputs the message blocks you want to send.\n See information about Blocks in https://api.slack.com/reference/block-kit/blocks\n name: (Optional[str]): The name of the sensor. Defaults to "slack_on_run_failure".\n dagit_base_url: (Optional[str]): The base url of your Dagit instance. Specify this to allow\n messages to include deeplinks to the failed job run.\n job_selection (Optional[List[Union[PipelineDefinition, GraphDefinition]]]): The jobs that\n will be monitored by this failure sensor. Defaults to None, which means the alert will\n be sent when any job in the repository fails.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n\n Examples:\n\n .. code-block:: python\n\n slack_on_run_failure = make_slack_on_run_failure_sensor(\n "#my_channel",\n os.getenv("MY_SLACK_TOKEN")\n )\n\n @repository\n def my_repo():\n return [my_job + slack_on_run_failure]\n\n .. code-block:: python\n\n def my_message_fn(context: RunFailureSensorContext) -> str:\n return (\n f"Job {context.pipeline_run.pipeline_name} failed!"\n f"Error: {context.failure_event.message}"\n )\n\n slack_on_run_failure = make_slack_on_run_failure_sensor(\n channel="#my_channel",\n slack_token=os.getenv("MY_SLACK_TOKEN"),\n message_fn=my_message_fn,\n dagit_base_url="http://mycoolsite.com",\n )\n\n\n """\n\n slack_client = WebClient(token=slack_token)\n\n @run_failure_sensor(name=name, job_selection=job_selection, default_status=default_status)\n def slack_on_run_failure(context: RunFailureSensorContext):\n blocks, main_body_text = _build_slack_blocks_and_text(\n context=context, text_fn=text_fn, blocks_fn=blocks_fn, dagit_base_url=dagit_base_url\n )\n\n slack_client.chat_postMessage(channel=channel, blocks=blocks, text=main_body_text)\n\n return slack_on_run_failure
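# A minimal usage sketch: a run-failure sensor with a custom message. Note that the custom
# message callback is passed via the `text_fn` parameter defined in the signature above.
# Channel, token, and URL values are illustrative placeholders.
import os

from dagster_slack.sensors import make_slack_on_run_failure_sensor


def my_text_fn(context):
    return f"Job {context.pipeline_run.pipeline_name} failed: {context.failure_event.message}"


slack_on_run_failure = make_slack_on_run_failure_sensor(
    channel="#alerts",
    slack_token=os.getenv("MY_SLACK_TOKEN"),
    text_fn=my_text_fn,
    dagit_base_url="http://localhost:3000",
)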
\n
", "current_page_name": "_modules/dagster_slack/sensors", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_slack.sensors"}}, "dagster_snowflake": {"resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_snowflake.resources

\nimport sys\nimport warnings\nfrom contextlib import closing, contextmanager\n\nfrom dagster import check, resource\n\nfrom .configs import define_snowflake_config\n\ntry:\n    import snowflake.connector\nexcept ImportError:\n    msg = (\n        "Could not import snowflake.connector. This could mean you have an incompatible version "\n        "of azure-storage-blob installed. dagster-snowflake requires azure-storage-blob<12.0.0; "\n        "this conflicts with dagster-azure which requires azure-storage-blob~=12.0.0 and is "\n        "incompatible with dagster-snowflake. Please uninstall dagster-azure and reinstall "\n        "dagster-snowflake to fix this error."\n    )\n    warnings.warn(msg)\n    raise\n\n\n
[docs]class SnowflakeConnection:\n def __init__(self, context): # pylint: disable=too-many-locals\n # Extract parameters from resource config. Note that we can't pass None values to\n # snowflake.connector.connect() because they will override the default values set within the\n # connector; remove them from the conn_args dict.\n self.connector = context.resource_config.get("connector", None)\n\n if self.connector == "sqlalchemy":\n self.conn_args = {\n k: context.resource_config.get(k)\n for k in (\n "account",\n "user",\n "password",\n "database",\n "schema",\n "role",\n "warehouse",\n "cache_column_metadata",\n "numpy",\n )\n if context.resource_config.get(k) is not None\n }\n\n else:\n self.conn_args = {\n k: context.resource_config.get(k)\n for k in (\n "account",\n "user",\n "password",\n "database",\n "schema",\n "role",\n "warehouse",\n "autocommit",\n "client_prefetch_threads",\n "client_session_keep_alive",\n "login_timeout",\n "network_timeout",\n "ocsp_response_cache_filename",\n "validate_default_parameters",\n "paramstyle",\n "timezone",\n )\n if context.resource_config.get(k) is not None\n }\n\n self.autocommit = self.conn_args.get("autocommit", False)\n self.log = context.log\n\n
[docs] @contextmanager\n def get_connection(self, raw_conn=True):\n if self.connector == "sqlalchemy":\n from snowflake.sqlalchemy import URL # pylint: disable=no-name-in-module,import-error\n from sqlalchemy import create_engine\n\n engine = create_engine(URL(**self.conn_args))\n conn = engine.raw_connection() if raw_conn else engine.connect()\n\n yield conn\n conn.close()\n engine.dispose()\n else:\n conn = snowflake.connector.connect(**self.conn_args)\n\n yield conn\n if not self.autocommit:\n conn.commit()\n conn.close()
\n\n
[docs] def execute_query(self, sql, parameters=None, fetch_results=False):\n check.str_param(sql, "sql")\n check.opt_dict_param(parameters, "parameters")\n check.bool_param(fetch_results, "fetch_results")\n\n with self.get_connection() as conn:\n with closing(conn.cursor()) as cursor:\n if sys.version_info[0] < 3:\n sql = sql.encode("utf-8")\n\n self.log.info("Executing query: " + sql)\n cursor.execute(sql, parameters) # pylint: disable=E1101\n if fetch_results:\n return cursor.fetchall() # pylint: disable=E1101
\n\n
[docs] def execute_queries(self, sql_queries, parameters=None, fetch_results=False):\n check.list_param(sql_queries, "sql_queries", of_type=str)\n check.opt_dict_param(parameters, "parameters")\n check.bool_param(fetch_results, "fetch_results")\n\n results = []\n with self.get_connection() as conn:\n with closing(conn.cursor()) as cursor:\n for sql in sql_queries:\n if sys.version_info[0] < 3:\n sql = sql.encode("utf-8")\n self.log.info("Executing query: " + sql)\n cursor.execute(sql, parameters) # pylint: disable=E1101\n if fetch_results:\n results.append(cursor.fetchall()) # pylint: disable=E1101\n\n return results if fetch_results else None
\n\n
[docs] def load_table_from_local_parquet(self, src, table):\n check.str_param(src, "src")\n check.str_param(table, "table")\n\n sql_queries = [\n "CREATE OR REPLACE TABLE {table} ( data VARIANT DEFAULT NULL);".format(table=table),\n "CREATE OR REPLACE FILE FORMAT parquet_format TYPE = 'parquet';",\n "PUT {src} @%{table};".format(src=src, table=table),\n "COPY INTO {table} FROM @%{table} FILE_FORMAT = (FORMAT_NAME = 'parquet_format');".format(\n table=table\n ),\n ]\n\n self.execute_queries(sql_queries)
\n\n\n
[docs]@resource(\n config_schema=define_snowflake_config(),\n description="This resource is for connecting to the Snowflake data warehouse",\n)\ndef snowflake_resource(context):\n """A resource for connecting to the Snowflake data warehouse.\n\n A simple example of loading data into Snowflake and subsequently querying that data is shown below:\n\n Examples:\n\n .. code-block:: python\n\n from dagster import job, op\n from dagster_snowflake import snowflake_resource\n\n @op(required_resource_keys={'snowflake'})\n def get_one(context):\n context.resources.snowflake.execute_query('SELECT 1')\n\n @job(resource_defs={'snowflake': snowflake_resource})\n def my_snowflake_job():\n get_one()\n\n my_snowflake_job.execute_in_process(\n run_config={\n 'resources': {\n 'snowflake': {\n 'config': {\n 'account': {'env': 'SNOWFLAKE_ACCOUNT'},\n 'user': {'env': 'SNOWFLAKE_USER'},\n 'password': {'env': 'SNOWFLAKE_PASSWORD'},\n 'database': {'env': 'SNOWFLAKE_DATABASE'},\n 'schema': {'env': 'SNOWFLAKE_SCHEMA'},\n 'warehouse': {'env': 'SNOWFLAKE_WAREHOUSE'},\n }\n }\n }\n }\n )\n\n """\n return SnowflakeConnection(context)
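# A minimal sketch (placeholder path and table name): staging a local Parquet file into a
# Snowflake table through the `load_table_from_local_parquet` helper shown above.
from dagster import op


@op(required_resource_keys={"snowflake"})
def load_events(context):
    context.resources.snowflake.load_table_from_local_parquet(
        src="file:///tmp/events.parquet", table="RAW_EVENTS"
    )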
\n\n\ndef _filter_password(args):\n """Remove password from connection args for logging"""\n return {k: v for k, v in args.items() if k != "password"}\n
", "current_page_name": "_modules/dagster_snowflake/resources", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_snowflake.resources"}, "solids": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_snowflake.solids

\nfrom dagster import InputDefinition, Nothing, check, op, solid\n\n\ndef _core_create_snowflake_command(dagster_decorator, decorator_name, sql, parameters=None):\n    check.str_param(sql, "sql")\n    check.opt_dict_param(parameters, "parameters")\n\n    @dagster_decorator(\n        name=f"snowflake_{decorator_name}",\n        input_defs=[InputDefinition("start", Nothing)],\n        required_resource_keys={"snowflake"},\n        tags={"kind": "sql", "sql": sql},\n    )\n    def snowflake_fn(context):\n        context.resources.snowflake.execute_query(sql=sql, parameters=parameters)\n\n    return snowflake_fn\n\n\ndef snowflake_solid_for_query(sql, parameters=None):\n    """This function is a solid factory that constructs solids to execute a snowflake query.\n\n    Note that you can only use `snowflake_solid_for_query` if you know the query you'd like to\n    execute at pipeline construction time. If you'd like to execute queries dynamically during\n    pipeline execution, you should manually execute those queries in your custom solid using the\n    snowflake resource.\n\n    Args:\n        sql (str): The sql query that will execute against the provided snowflake resource.\n        parameters (dict): The parameters for the sql query.\n\n    Returns:\n        SolidDefinition: Returns the constructed solid definition.\n    """\n    return _core_create_snowflake_command(solid, "solid", sql, parameters)\n\n\n
[docs]def snowflake_op_for_query(sql, parameters=None):\n """This function is an op factory that constructs an op to execute a snowflake query.\n\n Note that you can only use `snowflake_op_for_query` if you know the query you'd like to\n execute at graph construction time. If you'd like to execute queries dynamically during\n job execution, you should manually execute those queries in your custom op using the\n snowflake resource.\n\n Args:\n sql (str): The sql query that will execute against the provided snowflake resource.\n parameters (dict): The parameters for the sql query.\n\n Returns:\n OpDefinition: Returns the constructed op definition.\n """\n return _core_create_snowflake_command(op, "op", sql, parameters)
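# A minimal usage sketch: an op built from a fixed query, wired to `snowflake_resource`.
# The query and job name are illustrative placeholders; connection config is supplied at run
# time as shown in the `snowflake_resource` example.
from dagster import job
from dagster_snowflake.resources import snowflake_resource
from dagster_snowflake.solids import snowflake_op_for_query

count_rows_op = snowflake_op_for_query("SELECT COUNT(*) FROM my_table")


@job(resource_defs={"snowflake": snowflake_resource})
def snowflake_query_job():
    count_rows_op()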
\n
", "current_page_name": "_modules/dagster_snowflake/solids", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_snowflake.solids"}}, "dagster_spark": {"configs": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_spark.configs

\n"""Spark Configuration\n\nIn this file we define the key configuration parameters for submitting Spark jobs. Spark can be run\nin a variety of deployment contexts. See the Spark documentation at\nhttps://spark.apache.org/docs/latest/submitting-applications.html for a more in-depth summary of\nSpark deployment contexts and configuration.\n"""\nfrom dagster import Field, StringSource\n\nfrom .configs_spark import spark_config\nfrom .types import SparkDeployMode\n\n\n
[docs]def define_spark_config():\n """Spark configuration.\n\n See the Spark documentation for reference:\n https://spark.apache.org/docs/latest/submitting-applications.html\n """\n\n master_url = Field(\n StringSource,\n description="The master URL for the cluster (e.g. spark://23.195.26.187:7077)",\n is_required=True,\n )\n\n deploy_mode = Field(\n SparkDeployMode,\n description="""Whether to deploy your driver on the worker nodes (cluster) or locally as an\n external client (client) (default: client). A common deployment strategy is to submit your\n application from a gateway machine that is physically co-located with your worker machines\n (e.g. Master node in a standalone EC2 cluster). In this setup, client mode is appropriate.\n In client mode, the driver is launched directly within the spark-submit process which acts\n as a client to the cluster. The input and output of the application is attached to the\n console. Thus, this mode is especially suitable for applications that involve the REPL (e.g.\n Spark shell).""",\n is_required=False,\n )\n\n application_jar = Field(\n StringSource,\n description="""Path to a bundled jar including your application and all\n dependencies. The URL must be globally visible inside of your cluster, for\n instance, an hdfs:// path or a file:// path that is present on all nodes.\n """,\n is_required=True,\n )\n\n application_arguments = Field(\n StringSource,\n description="Arguments passed to the main method of your main class, if any",\n is_required=False,\n )\n\n spark_home = Field(\n StringSource,\n description="The path to your spark installation. Defaults to $SPARK_HOME at runtime if not provided.",\n is_required=False,\n )\n\n return {\n "master_url": master_url,\n "deploy_mode": deploy_mode,\n "application_jar": application_jar,\n "spark_conf": spark_config(),\n "spark_home": spark_home,\n "application_arguments": application_arguments,\n }
\n
", "current_page_name": "_modules/dagster_spark/configs", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_spark.configs"}, "ops": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_spark.ops

\nfrom dagster import InputDefinition, Nothing, OutputDefinition, check, op, solid\n\nfrom .configs import define_spark_config\n\n\ndef create_spark_solid(\n    name, main_class, description=None, required_resource_keys=frozenset(["spark"])\n):\n    return core_create_spark(\n        dagster_decorator=solid,\n        name=name,\n        main_class=main_class,\n        description=description,\n        required_resource_keys=required_resource_keys,\n    )\n\n\n
[docs]def create_spark_op(\n name, main_class, description=None, required_resource_keys=frozenset(["spark"])\n):\n return core_create_spark(\n dagster_decorator=op,\n name=name,\n main_class=main_class,\n description=description,\n required_resource_keys=required_resource_keys,\n )
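# A minimal usage sketch: a spark-submit op built with `create_spark_op`, wired to
# `spark_resource`, and configured with fields from `define_spark_config`. The main class,
# jar path, master URL, and the `spark_conf` subtree (assumed to accept standard keys such as
# spark.executor.memory) are illustrative placeholders.
from dagster import job
from dagster_spark.ops import create_spark_op
from dagster_spark.resources import spark_resource

submit_example_jar = create_spark_op(name="submit_example_jar", main_class="com.example.Main")


@job(resource_defs={"spark": spark_resource})
def spark_submit_job():
    submit_example_jar()


# Run-time config for the op (a subset of the fields declared by define_spark_config):
run_config = {
    "ops": {
        "submit_example_jar": {
            "config": {
                "master_url": "spark://localhost:7077",
                "deploy_mode": "client",
                "application_jar": "path/to/example.jar",
                "spark_conf": {"spark": {"executor": {"memory": "2g"}}},
            }
        }
    }
}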
\n\n\ndef core_create_spark(\n dagster_decorator,\n name,\n main_class,\n description=None,\n required_resource_keys=frozenset(["spark"]),\n):\n check.str_param(name, "name")\n check.str_param(main_class, "main_class")\n check.opt_str_param(description, "description", "A parameterized Spark job.")\n check.set_param(required_resource_keys, "required_resource_keys")\n\n @dagster_decorator(\n name=name,\n description=description,\n config_schema=define_spark_config(),\n input_defs=[InputDefinition("start", Nothing)],\n output_defs=[OutputDefinition(Nothing)],\n tags={"kind": "spark", "main_class": main_class},\n required_resource_keys=required_resource_keys,\n )\n def spark_solid(context): # pylint: disable=unused-argument\n context.resources.spark.run_spark_job(context.solid_config, main_class)\n\n return spark_solid\n
", "current_page_name": "_modules/dagster_spark/ops", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_spark.ops"}, "resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_spark.resources

\nimport os\nimport subprocess\n\nfrom dagster import check, resource\nfrom dagster.core.log_manager import DagsterLogManager\n\nfrom .types import SparkOpError\nfrom .utils import construct_spark_shell_command\n\n\nclass SparkResource:\n    def __init__(self, logger):\n        self.logger = check.inst_param(logger, "logger", DagsterLogManager)\n\n    def run_spark_job(self, config, main_class):\n        check.dict_param(config, "config")\n        check.str_param(main_class, "main_class")\n\n        # Extract parameters from config\n        (\n            master_url,\n            deploy_mode,\n            application_jar,\n            spark_conf,\n            application_arguments,\n            spark_home,\n        ) = [\n            config.get(k)\n            for k in (\n                "master_url",\n                "deploy_mode",\n                "application_jar",\n                "spark_conf",\n                "application_arguments",\n                "spark_home",\n            )\n        ]\n\n        if not os.path.exists(application_jar):\n            raise SparkOpError(\n                (\n                    "Application jar {} does not exist. A valid jar must be "\n                    "built before running this op.".format(application_jar)\n                )\n            )\n\n        spark_shell_cmd = construct_spark_shell_command(\n            application_jar=application_jar,\n            main_class=main_class,\n            master_url=master_url,\n            spark_conf=spark_conf,\n            deploy_mode=deploy_mode,\n            application_arguments=application_arguments,\n            spark_home=spark_home,\n        )\n        self.logger.info("Running spark-submit: " + " ".join(spark_shell_cmd))\n\n        retcode = subprocess.call(" ".join(spark_shell_cmd), shell=True)\n\n        if retcode != 0:\n            raise SparkOpError("Spark job failed. Please consult your logs.")\n\n\n
[docs]@resource\ndef spark_resource(context):\n """Provides a :py:class:`SparkResource` that runs spark-submit jobs, logging through the context's log manager."""\n return SparkResource(context.log)
\n
", "current_page_name": "_modules/dagster_spark/resources", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_spark.resources"}, "types": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_spark.types

\nfrom dagster import Enum, EnumValue\n\nSparkDeployModeCluster = EnumValue("cluster")\nSparkDeployModeClient = EnumValue("client")\nSparkDeployMode = Enum(\n    name="SparkDeployMode", enum_values=[SparkDeployModeCluster, SparkDeployModeClient]\n)\n\n\nclass SparkSolidError(Exception):\n    pass\n\n\n
[docs]class SparkOpError(SparkSolidError):\n pass
\n
", "current_page_name": "_modules/dagster_spark/types", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_spark.types"}, "utils": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_spark.utils

\nimport itertools\nimport os\n\nfrom dagster import check\n\nfrom .types import SparkOpError\n\n\ndef flatten_dict(d):\n    def _flatten_dict(d, result, key_path=None):\n        """Iterates an arbitrarily nested dictionary and yield dot-notation key:value tuples.\n\n        {'foo': {'bar': 3, 'baz': 1}, {'other': {'key': 1}} =>\n            [('foo.bar', 3), ('foo.baz', 1), ('other.key', 1)]\n\n        """\n        for k, v in d.items():\n            new_key_path = (key_path or []) + [k]\n            if isinstance(v, dict):\n                _flatten_dict(v, result, new_key_path)\n            else:\n                result.append((".".join(new_key_path), v))\n\n    result = []\n    if d is not None:\n        _flatten_dict(d, result)\n    return result\n\n\ndef parse_spark_config(spark_conf):\n    """For each key-value pair in spark conf, we need to pass to CLI in format:\n\n    --conf "key=value"\n    """\n\n    spark_conf_list = flatten_dict(spark_conf)\n    return format_for_cli(spark_conf_list)\n\n\ndef format_for_cli(spark_conf_list):\n    return list(\n        itertools.chain.from_iterable([("--conf", "{}={}".format(*c)) for c in spark_conf_list])\n    )\n\n\n
[docs]def construct_spark_shell_command(\n application_jar,\n main_class,\n master_url=None,\n spark_conf=None,\n deploy_mode=None,\n application_arguments=None,\n spark_home=None,\n):\n """Constructs the spark-submit command for a Spark job."""\n check.opt_str_param(master_url, "master_url")\n check.str_param(application_jar, "application_jar")\n spark_conf = check.opt_dict_param(spark_conf, "spark_conf")\n check.opt_str_param(deploy_mode, "deploy_mode")\n check.opt_str_param(application_arguments, "application_arguments")\n check.opt_str_param(spark_home, "spark_home")\n\n spark_home = spark_home if spark_home else os.environ.get("SPARK_HOME")\n if spark_home is None:\n raise SparkOpError(\n (\n "No spark home set. You must either pass spark_home in config or "\n "set $SPARK_HOME in your environment (got None)."\n )\n )\n\n master_url = ["--master", master_url] if master_url else []\n deploy_mode = ["--deploy-mode", deploy_mode] if deploy_mode else []\n\n spark_shell_cmd = (\n ["{}/bin/spark-submit".format(spark_home), "--class", main_class]\n + master_url\n + deploy_mode\n + parse_spark_config(spark_conf)\n + [application_jar]\n + [application_arguments]\n )\n return spark_shell_cmd
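# A quick illustration (assumed placeholder values, as it would appear in user code) of the
# command list the helper above produces; nested `spark_conf` entries are flattened into
# dotted `--conf` pairs.
from dagster_spark.utils import construct_spark_shell_command

cmd = construct_spark_shell_command(
    application_jar="build/example.jar",
    main_class="com.example.Main",
    master_url="local[*]",
    spark_conf={"spark": {"executor": {"memory": "2g"}}},
    deploy_mode="client",
    application_arguments="--input data.csv",
    spark_home="/opt/spark",
)
# cmd == ["/opt/spark/bin/spark-submit", "--class", "com.example.Main",
#         "--master", "local[*]", "--deploy-mode", "client",
#         "--conf", "spark.executor.memory=2g", "build/example.jar", "--input data.csv"]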
\n
", "current_page_name": "_modules/dagster_spark/utils", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_spark.utils"}}, "dagster_ssh": {"resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_ssh.resources

\nimport getpass\nimport os\nfrom io import StringIO\n\nimport paramiko\nfrom paramiko.config import SSH_PORT\nfrom sshtunnel import SSHTunnelForwarder\n\nfrom dagster import Field, StringSource, check, resource\nfrom dagster.utils import merge_dicts, mkdir_p\n\n\ndef key_from_str(key_str):\n    """Creates a paramiko SSH key from a string."""\n    check.str_param(key_str, "key_str")\n\n    # py2 StringIO doesn't support with\n    key_file = StringIO(key_str)\n    result = paramiko.RSAKey.from_private_key(key_file)\n    key_file.close()\n    return result\n\n\n
[docs]class SSHResource:\n """\n Resource for ssh remote execution using Paramiko.\n ref: https://github.com/paramiko/paramiko\n """\n\n def __init__(\n self,\n remote_host,\n remote_port,\n username=None,\n password=None,\n key_file=None,\n key_string=None,\n timeout=10,\n keepalive_interval=30,\n compress=True,\n no_host_key_check=True,\n allow_host_key_change=False,\n logger=None,\n ):\n self.remote_host = check.str_param(remote_host, "remote_host")\n self.remote_port = check.opt_int_param(remote_port, "remote_port")\n self.username = check.opt_str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.key_file = check.opt_str_param(key_file, "key_file")\n self.timeout = check.opt_int_param(timeout, "timeout")\n self.keepalive_interval = check.opt_int_param(keepalive_interval, "keepalive_interval")\n self.compress = check.opt_bool_param(compress, "compress")\n self.no_host_key_check = check.opt_bool_param(no_host_key_check, "no_host_key_check")\n self.allow_host_key_change = check.opt_bool_param(\n allow_host_key_change, "allow_host_key_change"\n )\n self.log = logger\n\n self.host_proxy = None\n\n # Create RSAKey object from private key string\n self.key_obj = key_from_str(key_string) if key_string is not None else None\n\n # Auto detecting username values from system\n if not self.username:\n logger.debug(\n "username to ssh to host: %s is not specified. Using system's default provided by"\n " getpass.getuser()" % self.remote_host\n )\n self.username = getpass.getuser()\n\n user_ssh_config_filename = os.path.expanduser("~/.ssh/config")\n if os.path.isfile(user_ssh_config_filename):\n ssh_conf = paramiko.SSHConfig()\n ssh_conf.parse(open(user_ssh_config_filename))\n host_info = ssh_conf.lookup(self.remote_host)\n if host_info and host_info.get("proxycommand"):\n self.host_proxy = paramiko.ProxyCommand(host_info.get("proxycommand"))\n\n if not (self.password or self.key_file):\n if host_info and host_info.get("identityfile"):\n self.key_file = host_info.get("identityfile")[0]\n\n def get_connection(self):\n """\n Opens a SSH connection to the remote host.\n\n :rtype: paramiko.client.SSHClient\n """\n client = paramiko.SSHClient()\n if not self.allow_host_key_change:\n self.log.warning(\n "Remote Identification Change is not verified. This won't protect against "\n "Man-In-The-Middle attacks"\n )\n client.load_system_host_keys()\n if self.no_host_key_check:\n self.log.warning(\n "No Host Key Verification. 
This won't protect against Man-In-The-Middle attacks"\n )\n # Default is RejectPolicy\n client.set_missing_host_key_policy(paramiko.AutoAddPolicy())\n\n if self.password and self.password.strip():\n client.connect(\n hostname=self.remote_host,\n username=self.username,\n password=self.password,\n key_filename=self.key_file,\n pkey=self.key_obj,\n timeout=self.timeout,\n compress=self.compress,\n port=self.remote_port,\n sock=self.host_proxy,\n look_for_keys=False,\n )\n else:\n client.connect(\n hostname=self.remote_host,\n username=self.username,\n key_filename=self.key_file,\n pkey=self.key_obj,\n timeout=self.timeout,\n compress=self.compress,\n port=self.remote_port,\n sock=self.host_proxy,\n )\n\n if self.keepalive_interval:\n client.get_transport().set_keepalive(self.keepalive_interval)\n\n return client\n\n def get_tunnel(self, remote_port, remote_host="localhost", local_port=None):\n check.int_param(remote_port, "remote_port")\n check.str_param(remote_host, "remote_host")\n check.opt_int_param(local_port, "local_port")\n\n if local_port is not None:\n local_bind_address = ("localhost", local_port)\n else:\n local_bind_address = ("localhost",)\n\n # Will prefer key string if specified, otherwise use the key file\n pkey = self.key_obj if self.key_obj else self.key_file\n\n if self.password and self.password.strip():\n client = SSHTunnelForwarder(\n self.remote_host,\n ssh_port=self.remote_port,\n ssh_username=self.username,\n ssh_password=self.password,\n ssh_pkey=pkey,\n ssh_proxy=self.host_proxy,\n local_bind_address=local_bind_address,\n remote_bind_address=(remote_host, remote_port),\n logger=self.log,\n )\n else:\n client = SSHTunnelForwarder(\n self.remote_host,\n ssh_port=self.remote_port,\n ssh_username=self.username,\n ssh_pkey=pkey,\n ssh_proxy=self.host_proxy,\n local_bind_address=local_bind_address,\n remote_bind_address=(remote_host, remote_port),\n host_pkey_directories=[],\n logger=self.log,\n )\n\n return client\n\n def sftp_get(self, remote_filepath, local_filepath):\n check.str_param(remote_filepath, "remote_filepath")\n check.str_param(local_filepath, "local_filepath")\n conn = self.get_connection()\n with conn.open_sftp() as sftp_client:\n local_folder = os.path.dirname(local_filepath)\n\n # Create intermediate directories if they don't exist\n mkdir_p(local_folder)\n\n self.log.info(\n "Starting to transfer from {0} to {1}".format(remote_filepath, local_filepath)\n )\n\n sftp_client.get(remote_filepath, local_filepath)\n\n conn.close()\n return local_filepath\n\n def sftp_put(self, remote_filepath, local_filepath, confirm=True):\n check.str_param(remote_filepath, "remote_filepath")\n check.str_param(local_filepath, "local_filepath")\n conn = self.get_connection()\n with conn.open_sftp() as sftp_client:\n self.log.info(\n "Starting to transfer file from {0} to {1}".format(local_filepath, remote_filepath)\n )\n\n sftp_client.put(local_filepath, remote_filepath, confirm=confirm)\n\n conn.close()\n return local_filepath
\n\n\n
[docs]@resource(\n {\n "remote_host": Field(\n StringSource, description="remote host to connect to", is_required=True\n ),\n "remote_port": Field(\n int,\n description="port of remote host to connect (Default is paramiko SSH_PORT)",\n is_required=False,\n default_value=SSH_PORT,\n ),\n "username": Field(\n StringSource, description="username to connect to the remote_host", is_required=False\n ),\n "password": Field(\n StringSource,\n description="password of the username to connect to the remote_host",\n is_required=False,\n ),\n "key_file": Field(\n StringSource,\n description="key file to use to connect to the remote_host.",\n is_required=False,\n ),\n "key_string": Field(\n StringSource,\n description="key string to use to connect to remote_host",\n is_required=False,\n ),\n "timeout": Field(\n int,\n description="timeout for the attempt to connect to the remote_host.",\n is_required=False,\n default_value=10,\n ),\n "keepalive_interval": Field(\n int,\n description="send a keepalive packet to remote host every keepalive_interval seconds",\n is_required=False,\n default_value=30,\n ),\n "compress": Field(bool, is_required=False, default_value=True),\n "no_host_key_check": Field(bool, is_required=False, default_value=True),\n "allow_host_key_change": Field(bool, is_required=False, default_value=False),\n }\n)\ndef ssh_resource(init_context):\n args = init_context.resource_config\n args = merge_dicts(init_context.resource_config, {"logger": init_context.log})\n return SSHResource(**args)
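# A minimal usage sketch: configuring `ssh_resource` and pulling a file from the remote host
# with `sftp_get`. Host name, credentials, and file paths are illustrative placeholders.
from dagster import job, op
from dagster_ssh.resources import ssh_resource


@op(required_resource_keys={"ssh"})
def fetch_report(context):
    context.resources.ssh.sftp_get("/remote/reports/latest.csv", "/tmp/latest.csv")


@job(
    resource_defs={
        "ssh": ssh_resource.configured(
            {
                "remote_host": "example.com",
                "username": "deploy",
                "key_file": "/home/deploy/.ssh/id_rsa",
            }
        )
    }
)
def fetch_report_job():
    fetch_report()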
\n
", "current_page_name": "_modules/dagster_ssh/resources", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_ssh.resources"}}, "dagster_twilio": {"resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_twilio.resources

\nfrom twilio.rest import Client\n\nfrom dagster import Field, StringSource, resource\n\n\n
[docs]@resource(\n {\n "account_sid": Field(StringSource, description="Twilio Account SID"),\n "auth_token": Field(StringSource, description="Twilio Auth Token"),\n },\n description="This resource is for connecting to Twilio",\n)\ndef twilio_resource(context):\n return Client(context.resource_config["account_sid"], context.resource_config["auth_token"])
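# A minimal usage sketch: sending an SMS through the configured `twilio_resource`
# (a `twilio.rest.Client`). Phone numbers and environment variable names are placeholders.
from dagster import job, op
from dagster_twilio.resources import twilio_resource


@op(required_resource_keys={"twilio"})
def send_alert(context):
    context.resources.twilio.messages.create(
        body="Dagster job finished", from_="+15551230000", to="+15557890000"
    )


@job(
    resource_defs={
        "twilio": twilio_resource.configured(
            {
                "account_sid": {"env": "TWILIO_ACCOUNT_SID"},
                "auth_token": {"env": "TWILIO_AUTH_TOKEN"},
            }
        )
    }
)
def alert_job():
    send_alert()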
\n
", "current_page_name": "_modules/dagster_twilio/resources", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_twilio.resources"}}, "dagstermill": {"context": {"alabaster_version": "0.7.12", "body": "

Source code for dagstermill.context

\nfrom typing import Any, Dict, Set\n\nfrom dagster import PipelineDefinition, PipelineRun, SolidDefinition, check\nfrom dagster.core.definitions.dependency import Node, NodeHandle\nfrom dagster.core.execution.context.compute import AbstractComputeExecutionContext\nfrom dagster.core.execution.context.system import PlanExecutionContext, StepExecutionContext\nfrom dagster.core.log_manager import DagsterLogManager\nfrom dagster.core.system_config.objects import ResolvedRunConfig\n\n\n
[docs]class DagstermillExecutionContext(AbstractComputeExecutionContext):\n """Dagstermill-specific execution context.\n\n Do not initialize directly: use :func:`dagstermill.get_context`.\n """\n\n def __init__(\n self,\n pipeline_context: PlanExecutionContext,\n pipeline_def: PipelineDefinition,\n resource_keys_to_init: Set[str],\n solid_name: str,\n solid_handle: NodeHandle,\n solid_config: Any = None,\n ):\n self._pipeline_context = check.inst_param(\n pipeline_context, "pipeline_context", PlanExecutionContext\n )\n self._pipeline_def = check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n self._resource_keys_to_init = check.set_param(\n resource_keys_to_init, "resource_keys_to_init", of_type=str\n )\n self.solid_name = check.str_param(solid_name, "solid_name")\n self.solid_handle = check.inst_param(solid_handle, "solid_handle", NodeHandle)\n self._solid_config = solid_config\n\n
[docs] def has_tag(self, key: str) -> bool:\n """Check if a logging tag is defined on the context.\n\n Args:\n key (str): The key to check.\n\n Returns:\n bool\n """\n check.str_param(key, "key")\n return self._pipeline_context.has_tag(key)
\n\n
[docs] def get_tag(self, key: str) -> str:\n """Get a logging tag defined on the context.\n\n Args:\n key (str): The key to get.\n\n Returns:\n str\n """\n check.str_param(key, "key")\n return self._pipeline_context.get_tag(key)
\n\n @property\n def run_id(self) -> str:\n """str: The run_id for the context."""\n return self._pipeline_context.run_id\n\n @property\n def run_config(self) -> Dict[str, Any]:\n """dict: The run_config for the context."""\n return self._pipeline_context.run_config\n\n @property\n def resolved_run_config(self) -> ResolvedRunConfig:\n """:class:`dagster.ResolvedRunConfig`: The resolved_run_config for the context"""\n return self._pipeline_context.resolved_run_config\n\n @property\n def logging_tags(self) -> Dict[str, str]:\n """dict: The logging tags for the context."""\n return self._pipeline_context.logging_tags\n\n @property\n def pipeline_name(self) -> str:\n return self._pipeline_context.pipeline_name\n\n @property\n def pipeline_def(self) -> PipelineDefinition:\n """:class:`dagster.PipelineDefinition`: The pipeline definition for the context.\n\n This will be a dagstermill-specific shim.\n """\n return self._pipeline_def\n\n @property\n def resources(self) -> Any:\n """collections.namedtuple: A dynamically-created type whose properties allow access to\n resources."""\n return self._pipeline_context.scoped_resources_builder.build(\n required_resource_keys=self._resource_keys_to_init,\n )\n\n @property\n def pipeline_run(self) -> PipelineRun:\n """:class:`dagster.PipelineRun`: The pipeline run for the context."""\n return self._pipeline_context.pipeline_run\n\n @property\n def log(self) -> DagsterLogManager:\n """:class:`dagster.DagsterLogManager`: The log manager for the context.\n\n Call, e.g., ``log.info()`` to log messages through the Dagster machinery.\n """\n return self._pipeline_context.log\n\n @property\n def solid_def(self) -> SolidDefinition:\n """:class:`dagster.SolidDefinition`: The solid definition for the context.\n\n In interactive contexts, this may be a dagstermill-specific shim, depending whether a\n solid definition was passed to ``dagstermill.get_context``.\n """\n return self.pipeline_def.solid_def_named(self.solid_name)\n\n @property\n def solid(self) -> Node:\n """:class:`dagster.Node`: The solid for the context.\n\n In interactive contexts, this may be a dagstermill-specific shim, depending whether a\n solid definition was passed to ``dagstermill.get_context``.\n """\n return self.pipeline_def.get_solid(self.solid_handle)\n\n @property\n def solid_config(self) -> Any:\n """collections.namedtuple: A dynamically-created type whose properties allow access to\n solid-specific config."""\n if self._solid_config:\n return self._solid_config\n\n solid_config = self.resolved_run_config.solids.get(self.solid_name)\n return solid_config.config if solid_config else None
\n\n\nclass DagstermillRuntimeExecutionContext(DagstermillExecutionContext):\n def __init__(\n self,\n pipeline_context: PlanExecutionContext,\n pipeline_def: PipelineDefinition,\n resource_keys_to_init: Set[str],\n solid_name: str,\n step_context: StepExecutionContext,\n solid_handle: NodeHandle,\n solid_config: Any = None,\n ):\n self._step_context = check.inst_param(step_context, "step_context", StepExecutionContext)\n super().__init__(\n pipeline_context,\n pipeline_def,\n resource_keys_to_init,\n solid_name,\n solid_handle,\n solid_config,\n )\n
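For interactive work, the context class above is normally obtained through :py:func:`dagstermill.get_context` rather than constructed directly, as its docstring notes. A rough sketch, assuming ``solid_config`` is the keyword that seeds ``context.solid_config``:

.. code-block:: python

    # In a notebook cell, during interactive development.
    import dagstermill

    # Assumed keyword: `solid_config` populates `context.solid_config` below.
    context = dagstermill.get_context(solid_config={"lr": 0.01})

    context.log.info(f"interactive run_id: {context.run_id}")
    learning_rate = context.solid_config["lr"]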
", "current_page_name": "_modules/dagstermill/context", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagstermill.context"}, "errors": {"alabaster_version": "0.7.12", "body": "

Source code for dagstermill.errors

\nfrom dagster.core.errors import DagsterError\n\n\n
[docs]class DagstermillError(DagsterError):\n """Base class for errors raised by dagstermill."""
\n
", "current_page_name": "_modules/dagstermill/errors", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagstermill.errors"}, "factory": {"alabaster_version": "0.7.12", "body": "

Source code for dagstermill.factory

\nimport copy\nimport os\nimport pickle\nimport sys\nimport tempfile\nimport uuid\nfrom typing import Any, Dict, List, Optional, Sequence, Set, Union\n\nimport nbformat\nimport papermill\nfrom papermill.engines import papermill_engines\nfrom papermill.iorw import load_notebook_node, write_ipynb\n\nfrom dagster import (\n    InputDefinition,\n    OpDefinition,\n    Output,\n    OutputDefinition,\n    SolidDefinition,\n    check,\n    seven,\n)\nfrom dagster.core.definitions.events import AssetMaterialization, Failure, RetryRequested\nfrom dagster.core.definitions.metadata import MetadataEntry, MetadataValue\nfrom dagster.core.definitions.reconstruct import ReconstructablePipeline\nfrom dagster.core.definitions.utils import validate_tags\nfrom dagster.core.execution.context.compute import SolidExecutionContext\nfrom dagster.core.execution.context.input import build_input_context\nfrom dagster.core.execution.context.system import StepExecutionContext\nfrom dagster.core.execution.plan.outputs import StepOutputHandle\nfrom dagster.core.storage.file_manager import FileHandle\nfrom dagster.serdes import pack_value\nfrom dagster.seven import get_system_temp_directory\nfrom dagster.utils import mkdir_p, safe_tempfile_path\nfrom dagster.utils.backcompat import rename_warning\nfrom dagster.utils.error import serializable_error_info_from_exc_info\n\nfrom .compat import ExecutionError\nfrom .engine import DagstermillEngine\nfrom .errors import DagstermillError\nfrom .translator import DagsterTranslator\n\n\n# https://github.com/nteract/papermill/blob/17d4bbb3960c30c263bca835e48baf34322a3530/papermill/parameterize.py\ndef _find_first_tagged_cell_index(nb, tag):\n    parameters_indices = []\n    for idx, cell in enumerate(nb.cells):\n        if tag in cell.metadata.tags:\n            parameters_indices.append(idx)\n    if not parameters_indices:\n        return -1\n    return parameters_indices[0]\n\n\n# This is based on papermill.parameterize.parameterize_notebook\n# Typically, papermill injects the injected-parameters cell *below* the parameters cell\n# but we want to *replace* the parameters cell, which is what this function does.\ndef replace_parameters(context, nb, parameters):\n    """Assigned parameters into the appropiate place in the input notebook\n\n    Args:\n        nb (NotebookNode): Executable notebook object\n        parameters (dict): Arbitrary keyword arguments to pass to the notebook parameters.\n    """\n    check.dict_param(parameters, "parameters")\n\n    # Copy the nb object to avoid polluting the input\n    nb = copy.deepcopy(nb)\n\n    # papermill method chooses translator based on kernel_name and language, but we just call the\n    # DagsterTranslator to generate parameter content based on the kernel_name\n    param_content = DagsterTranslator.codify(parameters)\n\n    newcell = nbformat.v4.new_code_cell(source=param_content)\n    newcell.metadata["tags"] = ["injected-parameters"]\n\n    param_cell_index = _find_first_tagged_cell_index(nb, "parameters")\n    injected_cell_index = _find_first_tagged_cell_index(nb, "injected-parameters")\n    if injected_cell_index >= 0:\n        # Replace the injected cell with a new version\n        before = nb.cells[:injected_cell_index]\n        after = nb.cells[injected_cell_index + 1 :]\n        check.int_value_param(param_cell_index, -1, "param_cell_index")\n        # We should have blown away the parameters cell if there is an injected-parameters cell\n    elif param_cell_index >= 0:\n        # Replace the parameter cell with the 
injected-parameters cell\n        before = nb.cells[:param_cell_index]\n        after = nb.cells[param_cell_index + 1 :]\n    else:\n        # Inject to the top of the notebook, presumably first cell includes dagstermill import\n        context.log.debug(\n            (\n                "Executing notebook with no tagged parameters cell: injecting boilerplate in first "\n                "cell."\n            )\n        )\n        before = []\n        after = nb.cells\n\n    nb.cells = before + [newcell] + after\n    nb.metadata.papermill["parameters"] = seven.json.dumps(parameters)\n\n    return nb\n\n\ndef get_papermill_parameters(step_context, inputs, output_log_path, compute_descriptor):\n    check.inst_param(step_context, "step_context", StepExecutionContext)\n    check.param_invariant(\n        isinstance(step_context.run_config, dict),\n        "step_context",\n        "StepExecutionContext must have valid run_config",\n    )\n    check.dict_param(inputs, "inputs", key_type=str)\n\n    run_id = step_context.run_id\n    temp_dir = get_system_temp_directory()\n    marshal_dir = os.path.normpath(os.path.join(temp_dir, "dagstermill", str(run_id), "marshal"))\n    mkdir_p(marshal_dir)\n\n    if not isinstance(step_context.pipeline, ReconstructablePipeline):\n        if compute_descriptor == "solid":\n            raise DagstermillError(\n                "Can't execute a dagstermill solid from a pipeline that is not reconstructable. "\n                "Use the reconstructable() function if executing from python"\n            )\n        else:\n            raise DagstermillError(\n                "Can't execute a dagstermill op from a job that is not reconstructable. "\n                "Use the reconstructable() function if executing from python"\n            )\n\n    dm_executable_dict = step_context.pipeline.to_dict()\n\n    dm_context_dict = {\n        "output_log_path": output_log_path,\n        "marshal_dir": marshal_dir,\n        "run_config": step_context.run_config,\n    }\n\n    dm_solid_handle_kwargs = step_context.solid_handle._asdict()\n    dm_step_key = step_context.step.key\n\n    parameters = {}\n\n    parameters["__dm_context"] = dm_context_dict\n    parameters["__dm_executable_dict"] = dm_executable_dict\n    parameters["__dm_pipeline_run_dict"] = pack_value(step_context.pipeline_run)\n    parameters["__dm_solid_handle_kwargs"] = dm_solid_handle_kwargs\n    parameters["__dm_instance_ref_dict"] = pack_value(step_context.instance.get_ref())\n    parameters["__dm_step_key"] = dm_step_key\n    parameters["__dm_input_names"] = list(inputs.keys())\n\n    return parameters\n\n\ndef _dm_compute(\n    dagster_factory_name,\n    name,\n    notebook_path,\n    output_notebook_name=None,\n    asset_key_prefix=None,\n    output_notebook=None,\n):\n    check.str_param(name, "name")\n    check.str_param(notebook_path, "notebook_path")\n    check.opt_str_param(output_notebook_name, "output_notebook_name")\n    check.opt_list_param(asset_key_prefix, "asset_key_prefix")\n    check.opt_str_param(output_notebook, "output_notebook")\n\n    def _t_fn(step_context, inputs):\n        check.inst_param(step_context, "step_context", SolidExecutionContext)\n        check.param_invariant(\n            isinstance(step_context.run_config, dict),\n            "context",\n            "StepExecutionContext must have valid run_config",\n        )\n\n        step_execution_context = step_context.get_step_execution_context()\n\n        with tempfile.TemporaryDirectory() as output_notebook_dir:\n            with 
safe_tempfile_path() as output_log_path:\n\n                prefix = str(uuid.uuid4())\n                parameterized_notebook_path = os.path.join(\n                    output_notebook_dir, f"{prefix}-inter.ipynb"\n                )\n\n                executed_notebook_path = os.path.join(output_notebook_dir, f"{prefix}-out.ipynb")\n\n                # Scaffold the registration here\n                nb = load_notebook_node(notebook_path)\n                compute_descriptor = (\n                    "solid" if dagster_factory_name == "define_dagstermill_solid" else "op"\n                )\n                nb_no_parameters = replace_parameters(\n                    step_execution_context,\n                    nb,\n                    get_papermill_parameters(\n                        step_execution_context, inputs, output_log_path, compute_descriptor\n                    ),\n                )\n                write_ipynb(nb_no_parameters, parameterized_notebook_path)\n\n                try:\n                    papermill_engines.register("dagstermill", DagstermillEngine)\n                    papermill.execute_notebook(\n                        input_path=parameterized_notebook_path,\n                        output_path=executed_notebook_path,\n                        engine_name="dagstermill",\n                        log_output=True,\n                    )\n\n                except Exception as ex:\n                    step_execution_context.log.warn(\n                        "Error when attempting to materialize executed notebook: {exc}".format(\n                            exc=str(serializable_error_info_from_exc_info(sys.exc_info()))\n                        )\n                    )\n                    # pylint: disable=no-member\n                    # compat:\n                    if isinstance(ex, ExecutionError) and (\n                        ex.ename == "RetryRequested" or ex.ename == "Failure"\n                    ):\n                        step_execution_context.log.warn(\n                            f"Encountered raised {ex.ename} in notebook. 
Use dagstermill.yield_event "\n                            "with RetryRequested or Failure to trigger their behavior."\n                        )\n\n                    raise\n\n            step_execution_context.log.debug(\n                "Notebook execution complete for {name} at {executed_notebook_path}.".format(\n                    name=name,\n                    executed_notebook_path=executed_notebook_path,\n                )\n            )\n            if output_notebook_name is not None:\n                # yield output notebook binary stream as a solid output\n                with open(executed_notebook_path, "rb") as fd:\n                    yield Output(fd.read(), output_notebook_name)\n\n            else:\n                # backcompat\n                executed_notebook_file_handle = None\n                try:\n                    # use binary mode when when moving the file since certain file_managers such as S3\n                    # may try to hash the contents\n                    with open(executed_notebook_path, "rb") as fd:\n                        executed_notebook_file_handle = step_context.resources.file_manager.write(\n                            fd, mode="wb", ext="ipynb"\n                        )\n                        executed_notebook_materialization_path = (\n                            executed_notebook_file_handle.path_desc\n                        )\n\n                    yield AssetMaterialization(\n                        asset_key=(asset_key_prefix + [f"{name}_output_notebook"]),\n                        description="Location of output notebook in file manager",\n                        metadata_entries=[\n                            MetadataEntry(\n                                "path",\n                                value=MetadataValue.path(executed_notebook_materialization_path),\n                            )\n                        ],\n                    )\n\n                except Exception:\n                    # if file manager writing errors, e.g. 
file manager is not provided, we throw a warning\n                    # and fall back to the previously stored temp executed notebook.\n                    step_context.log.warning(\n                        "Error when attempting to materialize executed notebook using file manager: "\n                        f"{str(serializable_error_info_from_exc_info(sys.exc_info()))}"\n                        f"\\nNow falling back to local: notebook execution was temporarily materialized at {executed_notebook_path}"\n                        "\\nIf you have supplied a file manager and expect to use it for materializing the "\n                        'notebook, please include "file_manager" in the `required_resource_keys` argument '\n                        f"to `{dagster_factory_name}`"\n                    )\n\n                if output_notebook is not None:\n                    yield Output(executed_notebook_file_handle, output_notebook)\n\n            # deferred import for perf\n            import scrapbook\n\n            output_nb = scrapbook.read_notebook(executed_notebook_path)\n\n            for (output_name, _) in step_execution_context.solid_def.output_dict.items():\n                data_dict = output_nb.scraps.data_dict\n                if output_name in data_dict:\n                    # read outputs that were passed out of process via io manager from `yield_result`\n                    step_output_handle = StepOutputHandle(\n                        step_key=step_execution_context.step.key, output_name=output_name\n                    )\n                    output_context = step_execution_context.get_output_context(step_output_handle)\n                    io_manager = step_execution_context.get_io_manager(step_output_handle)\n                    value = io_manager.load_input(\n                        build_input_context(upstream_output=output_context)\n                    )\n\n                    yield Output(value, output_name)\n\n            for key, value in output_nb.scraps.items():\n                if key.startswith("event-"):\n                    with open(value.data, "rb") as fd:\n                        event = pickle.loads(fd.read())\n                        if isinstance(event, (Failure, RetryRequested)):\n                            raise event\n                        else:\n                            yield event\n\n    return _t_fn\n\n\n
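The compute function above reads outputs and events back out of scrapbook scraps; the notebook-side half of that handshake is written with ``dagstermill.yield_result`` and ``dagstermill.yield_event``. A minimal sketch, where the output name and asset key are hypothetical:

.. code-block:: python

    # Notebook-side cell.
    import dagstermill
    from dagster import AssetMaterialization

    # Written to a scrap and re-emitted by the compute function above as the
    # output named "answer" (the output must be declared on the solid/op).
    dagstermill.yield_result(42, output_name="answer")

    # Serialized under an "event-" scrap key and re-yielded after execution.
    dagstermill.yield_event(AssetMaterialization(asset_key="notebook_summary"))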
[docs]def define_dagstermill_solid(\n name: str,\n notebook_path: str,\n input_defs: Optional[Sequence[InputDefinition]] = None,\n output_defs: Optional[Sequence[OutputDefinition]] = None,\n config_schema: Optional[Union[Any, Dict[str, Any]]] = None,\n required_resource_keys: Optional[Set[str]] = None,\n output_notebook: Optional[str] = None,\n output_notebook_name: Optional[str] = None,\n asset_key_prefix: Optional[Union[List[str], str]] = None,\n description: Optional[str] = None,\n tags: Optional[Dict[str, Any]] = None,\n):\n """Wrap a Jupyter notebook in a solid.\n\n Arguments:\n name (str): The name of the solid.\n notebook_path (str): Path to the backing notebook.\n input_defs (Optional[List[InputDefinition]]): The solid's inputs.\n output_defs (Optional[List[OutputDefinition]]): The solid's outputs. Your notebook should\n call :py:func:`~dagstermill.yield_result` to yield each of these outputs.\n required_resource_keys (Optional[Set[str]]): The string names of any required resources.\n output_notebook (Optional[str]): If set, will be used as the name of an injected output of\n type :py:class:`~dagster.FileHandle` that will point to the executed notebook (in\n addition to the :py:class:`~dagster.AssetMaterialization` that is always created). This\n respects the :py:class:`~dagster.core.storage.file_manager.FileManager` configured on\n the pipeline resources via the "file_manager" resource key, so, e.g.,\n if :py:class:`~dagster_aws.s3.s3_file_manager` is configured, the output will be a :\n py:class:`~dagster_aws.s3.S3FileHandle`.\n output_notebook_name: (Optional[str]): If set, will be used as the name of an injected output\n of type of :py:class:`~dagster.BufferedIOBase` that is the file object of the executed\n notebook (in addition to the :py:class:`~dagster.AssetMaterialization` that is always\n created). 
It allows the downstream solids to access the executed notebook via a file\n object.\n asset_key_prefix (Optional[Union[List[str], str]]): If set, will be used to prefix the\n asset keys for materialized notebooks.\n description (Optional[str]): If set, description used for solid.\n tags (Optional[Dict[str, str]]): If set, additional tags used to annotate solid.\n Dagster uses the tag keys `notebook_path` and `kind`, which cannot be\n overwritten by the user.\n\n Returns:\n :py:class:`~dagster.SolidDefinition`\n """\n check.str_param(name, "name")\n check.str_param(notebook_path, "notebook_path")\n input_defs = check.opt_list_param(input_defs, "input_defs", of_type=InputDefinition)\n output_defs = check.opt_list_param(output_defs, "output_defs", of_type=OutputDefinition)\n required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys", of_type=str\n )\n\n extra_output_defs = []\n if output_notebook_name is not None:\n required_resource_keys.add("output_notebook_io_manager")\n extra_output_defs.append(\n OutputDefinition(name=output_notebook_name, io_manager_key="output_notebook_io_manager")\n )\n # backcompact\n if output_notebook is not None:\n rename_warning(\n new_name="output_notebook_name", old_name="output_notebook", breaking_version="0.14.0"\n )\n required_resource_keys.add("file_manager")\n extra_output_defs.append(OutputDefinition(dagster_type=FileHandle, name=output_notebook))\n\n if isinstance(asset_key_prefix, str):\n asset_key_prefix = [asset_key_prefix]\n\n asset_key_prefix = check.opt_list_param(asset_key_prefix, "asset_key_prefix", of_type=str)\n\n default_description = f"This solid is backed by the notebook at {notebook_path}"\n description = check.opt_str_param(description, "description", default=default_description)\n\n user_tags = validate_tags(tags)\n if tags is not None:\n check.invariant(\n "notebook_path" not in tags,\n "user-defined solid tags contains the `notebook_path` key, but the `notebook_path` key is reserved for use by Dagster",\n )\n check.invariant(\n "kind" not in tags,\n "user-defined solid tags contains the `kind` key, but the `kind` key is reserved for use by Dagster",\n )\n default_tags = {"notebook_path": notebook_path, "kind": "ipynb"}\n\n return SolidDefinition(\n name=name,\n input_defs=input_defs,\n compute_fn=_dm_compute(\n "define_dagstermill_solid",\n name,\n notebook_path,\n output_notebook_name,\n asset_key_prefix=asset_key_prefix,\n output_notebook=output_notebook, # backcompact\n ),\n output_defs=output_defs + extra_output_defs,\n config_schema=config_schema,\n required_resource_keys=required_resource_keys,\n description=description,\n tags={**user_tags, **default_tags},\n )
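A usage sketch for the factory above. The notebook path and names are placeholders, and it assumes ``local_output_notebook_io_manager`` (defined later in this bundle) is importable from ``dagstermill``:

.. code-block:: python

    from dagster import ModeDefinition, OutputDefinition, pipeline
    from dagstermill import define_dagstermill_solid, local_output_notebook_io_manager

    # Hypothetical notebook path; the notebook calls dagstermill.yield_result
    # to populate "cleaned_path".
    clean_data = define_dagstermill_solid(
        name="clean_data",
        notebook_path="notebooks/clean_data.ipynb",
        output_defs=[OutputDefinition(str, name="cleaned_path")],
        output_notebook_name="clean_data_notebook",
    )


    @pipeline(
        mode_defs=[
            ModeDefinition(
                resource_defs={"output_notebook_io_manager": local_output_notebook_io_manager}
            )
        ]
    )
    def cleaning_pipeline():
        clean_data()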
\n\n\n
[docs]def define_dagstermill_op(\n    name: str,\n    notebook_path: str,\n    input_defs: Optional[Sequence[InputDefinition]] = None,\n    output_defs: Optional[Sequence[OutputDefinition]] = None,\n    config_schema: Optional[Union[Any, Dict[str, Any]]] = None,\n    required_resource_keys: Optional[Set[str]] = None,\n    output_notebook_name: Optional[str] = None,\n    asset_key_prefix: Optional[Union[List[str], str]] = None,\n    description: Optional[str] = None,\n    tags: Optional[Dict[str, Any]] = None,\n):\n    """Wrap a Jupyter notebook in an op.\n\n    Arguments:\n        name (str): The name of the op.\n        notebook_path (str): Path to the backing notebook.\n        input_defs (Optional[List[InputDefinition]]): The op's inputs.\n        output_defs (Optional[List[OutputDefinition]]): The op's outputs. Your notebook should\n            call :py:func:`~dagstermill.yield_result` to yield each of these outputs.\n        required_resource_keys (Optional[Set[str]]): The string names of any required resources.\n        output_notebook_name (Optional[str]): If set, will be used as the name of an injected output\n            of type :py:class:`~dagster.BufferedIOBase` that is the file object of the executed\n            notebook (in addition to the :py:class:`~dagster.AssetMaterialization` that is always\n            created). It allows downstream ops to access the executed notebook via a file\n            object.\n        asset_key_prefix (Optional[Union[List[str], str]]): If set, will be used to prefix the\n            asset keys for materialized notebooks.\n        description (Optional[str]): If set, description used for the op.\n        tags (Optional[Dict[str, str]]): If set, additional tags used to annotate the op.\n            Dagster uses the tag keys `notebook_path` and `kind`, which cannot be\n            overwritten by the user.\n\n    Returns:\n        :py:class:`~dagster.OpDefinition`\n    """\n    check.str_param(name, "name")\n    check.str_param(notebook_path, "notebook_path")\n    input_defs = check.opt_list_param(input_defs, "input_defs", of_type=InputDefinition)\n    output_defs = check.opt_list_param(output_defs, "output_defs", of_type=OutputDefinition)\n    required_resource_keys = check.opt_set_param(\n        required_resource_keys, "required_resource_keys", of_type=str\n    )\n\n    extra_output_defs = []\n    if output_notebook_name is not None:\n        required_resource_keys.add("output_notebook_io_manager")\n        extra_output_defs.append(\n            OutputDefinition(name=output_notebook_name, io_manager_key="output_notebook_io_manager")\n        )\n\n    if isinstance(asset_key_prefix, str):\n        asset_key_prefix = [asset_key_prefix]\n\n    asset_key_prefix = check.opt_list_param(asset_key_prefix, "asset_key_prefix", of_type=str)\n\n    default_description = f"This op is backed by the notebook at {notebook_path}"\n    description = check.opt_str_param(description, "description", default=default_description)\n\n    user_tags = validate_tags(tags)\n    if tags is not None:\n        check.invariant(\n            "notebook_path" not in tags,\n            "user-defined op tags contain the `notebook_path` key, but the `notebook_path` key is reserved for use by Dagster",\n        )\n        check.invariant(\n            "kind" not in tags,\n            "user-defined op tags contain the `kind` key, but the `kind` key is reserved for use by Dagster",\n        )\n    default_tags = {"notebook_path": notebook_path, "kind": "ipynb"}\n\n    return OpDefinition(\n        name=name,\n        input_defs=input_defs,\n        compute_fn=_dm_compute(\n            "define_dagstermill_op",\n            name,\n            notebook_path,\n            output_notebook_name,\n            asset_key_prefix=asset_key_prefix,\n        ),\n        output_defs=output_defs + extra_output_defs,\n        config_schema=config_schema,\n        required_resource_keys=required_resource_keys,\n        description=description,\n        tags={**user_tags, **default_tags},\n    )
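The op-flavored factory above is wired the same way, but into a ``@job``; the notebook path and op name below are placeholders:

.. code-block:: python

    from dagster import job
    from dagstermill import define_dagstermill_op, local_output_notebook_io_manager

    # Hypothetical notebook-backed op.
    train_model = define_dagstermill_op(
        name="train_model",
        notebook_path="notebooks/train_model.ipynb",
        output_notebook_name="train_model_output",
    )


    @job(resource_defs={"output_notebook_io_manager": local_output_notebook_io_manager})
    def training_job():
        train_model()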
\n
", "current_page_name": "_modules/dagstermill/factory", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagstermill.factory"}, "io_managers": {"alabaster_version": "0.7.12", "body": "

Source code for dagstermill.io_managers

\nimport os\nfrom pathlib import Path\nfrom typing import Any, List, Optional\n\nfrom dagster import check\nfrom dagster.config.field import Field\nfrom dagster.core.definitions.events import AssetKey\nfrom dagster.core.definitions.metadata import MetadataEntry, MetadataValue\nfrom dagster.core.execution.context.input import InputContext\nfrom dagster.core.execution.context.output import OutputContext\nfrom dagster.core.storage.io_manager import IOManager, io_manager\nfrom dagster.utils import mkdir_p\n\n\nclass OutputNotebookIOManager(IOManager):\n    def __init__(self, asset_key_prefix: Optional[List[str]] = None):\n        self.asset_key_prefix = asset_key_prefix if asset_key_prefix else []\n\n    def get_output_asset_key(self, context: OutputContext):\n        return AssetKey([*self.asset_key_prefix, f"{context.step_key}_output_notebook"])\n\n    def handle_output(self, context: OutputContext, obj: bytes):\n        raise NotImplementedError\n\n    def load_input(self, context: InputContext) -> Any:\n        raise NotImplementedError\n\n\nclass LocalOutputNotebookIOManager(OutputNotebookIOManager):\n    """Built-in IO Manager for handling output notebook."""\n\n    def __init__(self, base_dir: str, asset_key_prefix: Optional[List[str]] = None):\n        super(LocalOutputNotebookIOManager, self).__init__(asset_key_prefix=asset_key_prefix)\n        self.base_dir = base_dir\n        self.write_mode = "wb"\n        self.read_mode = "rb"\n\n    def _get_path(self, context: OutputContext) -> str:\n        """Automatically construct filepath."""\n        keys = context.get_run_scoped_output_identifier()\n        return str(Path(self.base_dir, *keys).with_suffix(".ipynb"))\n\n    def handle_output(self, context: OutputContext, obj: bytes):\n        """obj: bytes"""\n        check.inst_param(context, "context", OutputContext)\n\n        # the output notebook itself is stored at output_file_path\n        output_notebook_path = self._get_path(context)\n        mkdir_p(os.path.dirname(output_notebook_path))\n        with open(output_notebook_path, self.write_mode) as dest_file_obj:\n            dest_file_obj.write(obj)\n        yield MetadataEntry("path", value=MetadataValue.path(output_notebook_path))\n\n    def load_input(self, context) -> bytes:\n        check.inst_param(context, "context", InputContext)\n        # pass output notebook to downstream solids as File Object\n        with open(self._get_path(context.upstream_output), self.read_mode) as file_obj:\n            return file_obj.read()\n\n\n
[docs]@io_manager(\n config_schema={\n "asset_key_prefix": Field(str, is_required=False),\n "base_dir": Field(str, is_required=False),\n },\n)\ndef local_output_notebook_io_manager(init_context):\n """Built-in IO Manager that handles output notebooks."""\n return LocalOutputNotebookIOManager(\n base_dir=init_context.resource_config.get(\n "base_dir", init_context.instance.storage_directory()\n ),\n asset_key_prefix=init_context.resource_config.get("asset_key_prefix", []),\n )
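Both ``base_dir`` and ``asset_key_prefix`` above are optional; a hypothetical run config overriding them might look like:

.. code-block:: python

    # Hypothetical values; omit either key to fall back to the defaults above.
    run_config = {
        "resources": {
            "output_notebook_io_manager": {
                "config": {
                    "base_dir": "/tmp/dagstermill_notebooks",
                    "asset_key_prefix": "notebooks",
                }
            }
        }
    }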
\n
", "current_page_name": "_modules/dagstermill/io_managers", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagstermill.io_managers"}}} \ No newline at end of file +{"": {"index": {"alabaster_version": "0.7.12", "body": "

All modules for which code is available

\n", "current_page_name": "_modules/index", "customsidebar": null, "sidebars": ["globaltoc.html", "searchbox.html"], "title": "Overview: module code"}}, "dagster": {"config": {"config_schema": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.config.config_schema

\nfrom typing import TYPE_CHECKING, Any, Dict, List, Type, Union\n\nfrom typing_extensions import TypeAlias\n\nif TYPE_CHECKING:\n    from dagster.config.config_type import ConfigType\n    from dagster.config.field import Field\n\n# Eventually, the below `ConfigSchemaType` should be renamed to `ConfigSchema` and the class\n# definition should be dropped. The reason we don't do this now is that sphinx autodoc doesn't\n# support type aliases, so there is no good way to gracefully attach a docstring to this and have it\n# show up in the docs. See: https://github.com/sphinx-doc/sphinx/issues/8934\n#\n# Unfortunately mypy doesn't support recursive types, which would be used to properly define the\n# List/Dict elements of this union: `Dict[str, ConfigSchema]`, `List[ConfigSchema]`.\nConfigSchemaType: TypeAlias = Union[\n    Type[Union[bool, float, int, str]],\n    Type[Union[dict, list]],\n    "ConfigType",\n    "Field",\n    Dict[str, Any],\n    List[Any],\n]\n\n\n
[docs]class ConfigSchema:\n """This is a placeholder type. Any time that it appears in documentation, it means that any of\n the following types are acceptable:\n\n #. A Python scalar type that resolves to a Dagster config type\n (:py:class:`~python:int`, :py:class:`~python:float`, :py:class:`~python:bool`,\n or :py:class:`~python:str`). For example:\n\n * ``@op(config_schema=int)``\n * ``@op(config_schema=str)``\n\n #. A built-in python collection (:py:class:`~python:list`, or :py:class:`~python:dict`).\n :py:class:`~python:list` is exactly equivalent to :py:class:`~dagster.Array` [\n :py:class:`~dagster.Any` ] and :py:class:`~python:dict` is equivalent to\n :py:class:`~dagster.Permissive`. For example:\n\n * ``@op(config_schema=list)``\n * ``@op(config_schema=dict)``\n\n #. A Dagster config type:\n\n * :py:data:`~dagster.Any`\n * :py:class:`~dagster.Array`\n * :py:data:`~dagster.Bool`\n * :py:data:`~dagster.Enum`\n * :py:data:`~dagster.Float`\n * :py:data:`~dagster.Int`\n * :py:data:`~dagster.IntSource`\n * :py:data:`~dagster.Noneable`\n * :py:class:`~dagster.Permissive`\n * :py:class:`~dagster.Map`\n * :py:class:`~dagster.ScalarUnion`\n * :py:class:`~dagster.Selector`\n * :py:class:`~dagster.Shape`\n * :py:data:`~dagster.String`\n * :py:data:`~dagster.StringSource`\n\n\n #. A bare python dictionary, which will be automatically wrapped in\n :py:class:`~dagster.Shape`. Values of the dictionary are resolved recursively\n according to the same rules. For example:\n\n * ``{'some_config': str}`` is equivalent to ``Shape({'some_config: str})``.\n\n * ``{'some_config1': {'some_config2': str}}`` is equivalent to\n ``Shape({'some_config1: Shape({'some_config2: str})})``.\n\n #. A bare python list of length one, whose single element will be wrapped in a\n :py:class:`~dagster.Array` is resolved recursively according to the same\n rules. For example:\n\n * ``[str]`` is equivalent to ``Array[str]``.\n\n * ``[[str]]`` is equivalent to ``Array[Array[str]]``.\n\n * ``[{'some_config': str}]`` is equivalent to ``Array(Shape({'some_config: str}))``.\n\n #. An instance of :py:class:`~dagster.Field`.\n """\n\n def __init__(self):\n raise NotImplementedError(\n "ConfigSchema is a placeholder type and should not be instantiated."\n )
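A small illustration of the resolution rules above (bare dict to :py:class:`~dagster.Shape`, single-element bare list to :py:class:`~dagster.Array`); the op names and field names are arbitrary:

.. code-block:: python

    from dagster import Array, Field, Shape, op


    # A bare dict and an explicit Shape are interchangeable.
    @op(config_schema={"some_config": str})
    def dict_form(context):
        return context.op_config["some_config"]


    @op(config_schema=Shape({"some_config": Field(str)}))
    def shape_form(context):
        return context.op_config["some_config"]


    # A single-element bare list resolves to Array.
    @op(config_schema=[int])
    def list_form(context):
        return sum(context.op_config)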
\n
", "current_page_name": "_modules/dagster/config/config_schema", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.config.config_schema"}, "config_type": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.config.config_type

\nimport typing\nfrom enum import Enum as PythonEnum\nfrom typing import Dict, List, Optional, cast\n\nimport dagster._check as check\nfrom dagster.builtins import BuiltinEnum\nfrom dagster.config.config_schema import ConfigSchemaType\nfrom dagster.serdes import whitelist_for_serdes\n\n\n@whitelist_for_serdes\nclass ConfigTypeKind(PythonEnum):\n    ANY = "ANY"\n    SCALAR = "SCALAR"\n    ENUM = "ENUM"\n\n    SELECTOR = "SELECTOR"\n    STRICT_SHAPE = "STRICT_SHAPE"\n    PERMISSIVE_SHAPE = "PERMISSIVE_SHAPE"\n    SCALAR_UNION = "SCALAR_UNION"\n\n    MAP = "MAP"\n\n    # Closed generic types\n    ARRAY = "ARRAY"\n    NONEABLE = "NONEABLE"\n\n    @staticmethod\n    def has_fields(kind: "ConfigTypeKind") -> bool:\n        check.inst_param(kind, "kind", ConfigTypeKind)\n        return kind == ConfigTypeKind.SELECTOR or ConfigTypeKind.is_shape(kind)\n\n    @staticmethod\n    def is_closed_generic(kind: "ConfigTypeKind") -> bool:\n        check.inst_param(kind, "kind", ConfigTypeKind)\n        return (\n            kind == ConfigTypeKind.ARRAY\n            or kind == ConfigTypeKind.NONEABLE\n            or kind == ConfigTypeKind.SCALAR_UNION\n            or kind == ConfigTypeKind.MAP\n        )\n\n    @staticmethod\n    def is_shape(kind: "ConfigTypeKind") -> bool:\n        check.inst_param(kind, "kind", ConfigTypeKind)\n        return kind == ConfigTypeKind.STRICT_SHAPE or kind == ConfigTypeKind.PERMISSIVE_SHAPE\n\n    @staticmethod\n    def is_selector(kind: "ConfigTypeKind") -> bool:\n        check.inst_param(kind, "kind", ConfigTypeKind)\n        return kind == ConfigTypeKind.SELECTOR\n\n\nclass ConfigType:\n    """\n    The class backing DagsterTypes as they are used processing configuration data.\n    """\n\n    def __init__(\n        self,\n        key: str,\n        kind: ConfigTypeKind,\n        given_name: Optional[str] = None,\n        description: Optional[str] = None,\n        type_params: Optional[List["ConfigType"]] = None,\n    ):\n\n        self.key: str = check.str_param(key, "key")\n        self.kind: ConfigTypeKind = check.inst_param(kind, "kind", ConfigTypeKind)\n        self.given_name: Optional[str] = check.opt_str_param(given_name, "given_name")\n        self._description: Optional[str] = check.opt_str_param(description, "description")\n        self.type_params: Optional[List[ConfigType]] = (\n            check.list_param(type_params, "type_params", of_type=ConfigType)\n            if type_params\n            else None\n        )\n\n    @property\n    def description(self) -> Optional[str]:\n        return self._description\n\n    @staticmethod\n    def from_builtin_enum(builtin_enum: typing.Any) -> "ConfigType":\n        check.invariant(BuiltinEnum.contains(builtin_enum), "param must be member of BuiltinEnum")\n        return _CONFIG_MAP[builtin_enum]\n\n    def post_process(self, value):\n        """\n        Implement this in order to take a value provided by the user\n        and perform computation on it. This can be done to coerce data types,\n        fetch things from the environment (e.g. environment variables), or\n        to do custom validation. If the value is not valid, throw a\n        PostProcessingError. 
Otherwise return the coerced value.\n        """\n        return value\n\n\n@whitelist_for_serdes\nclass ConfigScalarKind(PythonEnum):\n    INT = "INT"\n    STRING = "STRING"\n    FLOAT = "FLOAT"\n    BOOL = "BOOL"\n\n\n# Scalars, Composites, Selectors, Lists, Optional, Any\n\n\nclass ConfigScalar(ConfigType):\n    def __init__(\n        self, key: str, given_name: Optional[str], scalar_kind: ConfigScalarKind, **kwargs: object\n    ):\n        self.scalar_kind = check.inst_param(scalar_kind, "scalar_kind", ConfigScalarKind)\n        super(ConfigScalar, self).__init__(\n            key, kind=ConfigTypeKind.SCALAR, given_name=given_name, **kwargs  # type: ignore\n        )\n\n\nclass BuiltinConfigScalar(ConfigScalar):\n    def __init__(self, scalar_kind, description=None):\n        super(BuiltinConfigScalar, self).__init__(\n            key=type(self).__name__,\n            given_name=type(self).__name__,\n            scalar_kind=scalar_kind,\n            description=description,\n        )\n\n\nclass Int(BuiltinConfigScalar):\n    def __init__(self):\n        super(Int, self).__init__(scalar_kind=ConfigScalarKind.INT, description="")\n\n\nclass String(BuiltinConfigScalar):\n    def __init__(self):\n        super(String, self).__init__(scalar_kind=ConfigScalarKind.STRING, description="")\n\n\nclass Bool(BuiltinConfigScalar):\n    def __init__(self):\n        super(Bool, self).__init__(scalar_kind=ConfigScalarKind.BOOL, description="")\n\n\nclass Float(BuiltinConfigScalar):\n    def __init__(self):\n        super(Float, self).__init__(scalar_kind=ConfigScalarKind.FLOAT, description="")\n\n    def post_process(self, value):\n        return float(value)\n\n\nclass Any(ConfigType):\n    def __init__(self):\n        super(Any, self).__init__(\n            key="Any",\n            given_name="Any",\n            kind=ConfigTypeKind.ANY,\n        )\n\n\n
[docs]class Noneable(ConfigType):\n """Defines a configuration type that is the union of ``NoneType`` and the type ``inner_type``.\n\n Args:\n inner_type (type):\n The type of the values that this configuration type can contain.\n\n **Examples:**\n\n .. code-block:: python\n\n config_schema={"name": Noneable(str)}\n\n config={"name": "Hello"} # Ok\n config={"name": None} # Ok\n config={} # Error\n """\n\n def __init__(self, inner_type: object):\n from .field import resolve_to_config_type\n\n self.inner_type = cast(ConfigType, resolve_to_config_type(inner_type))\n super(Noneable, self).__init__(\n key="Noneable.{inner_type}".format(inner_type=self.inner_type.key),\n kind=ConfigTypeKind.NONEABLE,\n type_params=[self.inner_type],\n )
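A sketch of the class above used in an op's config schema; the field name and fallback value are arbitrary:

.. code-block:: python

    from dagster import Noneable, op


    @op(config_schema={"name": Noneable(str)})
    def greet(context):
        # "name" must be present in config but may be explicitly null/None.
        name = context.op_config["name"]
        return f"hello, {name or 'anonymous'}"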
\n\n\n
[docs]class Array(ConfigType):\n """Defines an array (list) configuration type that contains values of type ``inner_type``.\n\n Args:\n inner_type (type):\n The type of the values that this configuration type can contain.\n """\n\n def __init__(self, inner_type):\n from .field import resolve_to_config_type\n\n self.inner_type = resolve_to_config_type(inner_type)\n super(Array, self).__init__(\n key="Array.{inner_type}".format(inner_type=self.inner_type.key),\n type_params=[self.inner_type],\n kind=ConfigTypeKind.ARRAY,\n )\n\n @property\n def description(self):\n return "List of {inner_type}".format(inner_type=self.key)
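For example, an op whose config carries a list of strings with a default; the field name and default are arbitrary:

.. code-block:: python

    from dagster import Array, Field, op


    @op(config_schema={"tags": Field(Array(str), default_value=["nightly"], is_required=False)})
    def tag_run(context):
        return list(context.op_config["tags"])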
\n\n\n
[docs]class EnumValue:\n    """Define an entry in a :py:class:`Enum`.\n\n    Args:\n        config_value (str):\n            The string representation of the config to accept when passed.\n        python_value (Optional[Any]):\n            The python value to convert the enum entry into. Defaults to the ``config_value``.\n        description (Optional[str]):\n            A human-readable description of the enum entry.\n\n    """\n\n    def __init__(\n        self,\n        config_value: str,\n        python_value: Optional[object] = None,\n        description: Optional[str] = None,\n    ):\n        self.config_value = check.str_param(config_value, "config_value")\n        self.python_value = config_value if python_value is None else python_value\n        self.description = check.opt_str_param(description, "description")
\n\n\n
[docs]class Enum(ConfigType):\n """Defines a enum configuration type that allows one of a defined set of possible values.\n\n Args:\n name (str):\n The name of the enum configuration type.\n enum_values (List[EnumValue]):\n The set of possible values for the enum configuration type.\n\n **Examples:**\n\n .. code-block:: python\n\n @op(\n config_schema=Field(\n Enum(\n 'CowboyType',\n [\n EnumValue('good'),\n EnumValue('bad'),\n EnumValue('ugly'),\n ]\n )\n )\n )\n def resolve_standoff(context):\n # ...\n """\n\n def __init__(self, name: str, enum_values: List[EnumValue]):\n check.str_param(name, "name")\n super(Enum, self).__init__(key=name, given_name=name, kind=ConfigTypeKind.ENUM)\n self.enum_values = check.list_param(enum_values, "enum_values", of_type=EnumValue)\n self._valid_python_values = {ev.python_value for ev in enum_values}\n check.invariant(len(self._valid_python_values) == len(enum_values))\n self._valid_config_values = {ev.config_value for ev in enum_values}\n check.invariant(len(self._valid_config_values) == len(enum_values))\n\n @property\n def config_values(self):\n return [ev.config_value for ev in self.enum_values]\n\n def is_valid_config_enum_value(self, config_value):\n return config_value in self._valid_config_values\n\n def post_process(self, value: typing.Any) -> typing.Any:\n if isinstance(value, PythonEnum):\n value = value.name\n\n for ev in self.enum_values:\n if ev.config_value == value:\n return ev.python_value\n\n check.failed(\n (\n "Should never reach this. config_value should be pre-validated. "\n "Got {config_value}"\n ).format(config_value=value)\n )\n\n
[docs] @classmethod\n def from_python_enum(cls, enum, name=None):\n """\n Create a Dagster enum corresponding to an existing Python enum.\n\n Args:\n enum (enum.EnumMeta):\n The class representing the enum.\n name (Optional[str]):\n The name for the enum. If not present, `enum.__name__` will be used.\n\n Example:\n\n .. code-block:: python\n\n class Color(enum.Enum):\n RED = enum.auto()\n GREEN = enum.auto()\n BLUE = enum.auto()\n\n @op(\n config_schema={"color": Field(Enum.from_python_enum(Color))}\n )\n def select_color(context):\n # ...\n """\n if name is None:\n name = enum.__name__\n return cls(name, [EnumValue(v.name, python_value=v) for v in enum])
\n\n\n
[docs]class ScalarUnion(ConfigType):\n """Defines a configuration type that accepts a scalar value OR a non-scalar value like a\n :py:class:`~dagster.List`, :py:class:`~dagster.Dict`, or :py:class:`~dagster.Selector`.\n\n This allows runtime scalars to be configured without a dictionary with the key ``value`` and\n instead just use the scalar value directly. However this still leaves the option to\n load scalars from a json or pickle file.\n\n Args:\n scalar_type (type):\n The scalar type of values that this configuration type can hold. For example,\n :py:class:`~python:int`, :py:class:`~python:float`, :py:class:`~python:bool`,\n or :py:class:`~python:str`.\n non_scalar_schema (ConfigSchema):\n The schema of a non-scalar Dagster configuration type. For example, :py:class:`List`,\n :py:class:`Dict`, or :py:class:`~dagster.Selector`.\n key (Optional[str]):\n The configuation type's unique key. If not set, then the key will be set to\n ``ScalarUnion.{scalar_type}-{non_scalar_schema}``.\n\n **Examples:**\n\n .. code-block:: yaml\n\n graph:\n transform_word:\n inputs:\n word:\n value: foobar\n\n\n becomes, optionally,\n\n\n .. code-block:: yaml\n\n graph:\n transform_word:\n inputs:\n word: foobar\n """\n\n def __init__(\n self,\n scalar_type: typing.Any,\n non_scalar_schema: ConfigSchemaType,\n _key: Optional[str] = None,\n ):\n from .field import resolve_to_config_type\n\n self.scalar_type = resolve_to_config_type(scalar_type)\n self.non_scalar_type = resolve_to_config_type(non_scalar_schema)\n\n check.param_invariant(self.scalar_type.kind == ConfigTypeKind.SCALAR, "scalar_type")\n check.param_invariant(\n self.non_scalar_type.kind\n in {ConfigTypeKind.STRICT_SHAPE, ConfigTypeKind.SELECTOR, ConfigTypeKind.ARRAY},\n "non_scalar_type",\n )\n\n # https://github.com/dagster-io/dagster/issues/2133\n key = check.opt_str_param(\n _key, "_key", "ScalarUnion.{}-{}".format(self.scalar_type.key, self.non_scalar_type.key)\n )\n\n super(ScalarUnion, self).__init__(\n key=key,\n kind=ConfigTypeKind.SCALAR_UNION,\n type_params=[self.scalar_type, self.non_scalar_type],\n )
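Echoing the YAML example above, a hypothetical ``transform_word`` op whose config accepts either the bare scalar or the shape form:

.. code-block:: python

    from dagster import ScalarUnion, op

    word_schema = ScalarUnion(
        scalar_type=str,
        non_scalar_schema={"value": str, "repeat": int},
    )


    @op(config_schema=word_schema)
    def transform_word(context):
        cfg = context.op_config
        if isinstance(cfg, str):  # scalar form
            return cfg
        return cfg["value"] * cfg["repeat"]  # shape form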
\n\n\nConfigAnyInstance = Any()\nConfigBoolInstance = Bool()\nConfigFloatInstance = Float()\nConfigIntInstance = Int()\nConfigStringInstance = String()\n\n_CONFIG_MAP: Dict[check.TypeOrTupleOfTypes, ConfigType] = {\n BuiltinEnum.ANY: ConfigAnyInstance,\n BuiltinEnum.BOOL: ConfigBoolInstance,\n BuiltinEnum.FLOAT: ConfigFloatInstance,\n BuiltinEnum.INT: ConfigIntInstance,\n BuiltinEnum.STRING: ConfigStringInstance,\n}\n\n\n_CONFIG_MAP_BY_NAME: Dict[str, ConfigType] = {\n "Any": ConfigAnyInstance,\n "Bool": ConfigBoolInstance,\n "Float": ConfigFloatInstance,\n "Int": ConfigIntInstance,\n "String": ConfigStringInstance,\n}\n\nALL_CONFIG_BUILTINS = set(_CONFIG_MAP.values())\n\n\ndef get_builtin_scalar_by_name(type_name: str):\n if type_name not in _CONFIG_MAP_BY_NAME:\n check.failed("Scalar {} is not supported".format(type_name))\n return _CONFIG_MAP_BY_NAME[type_name]\n
", "current_page_name": "_modules/dagster/config/config_type", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.config.config_type"}, "field": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.config.field

\nfrom typing import Any, Union, overload\n\nimport dagster._check as check\nfrom dagster.builtins import BuiltinEnum\nfrom dagster.config.config_schema import ConfigSchemaType\nfrom dagster.core.errors import DagsterInvalidConfigError, DagsterInvalidDefinitionError\nfrom dagster.serdes import serialize_value\nfrom dagster.utils import is_enum_value\nfrom dagster.utils.typing_api import is_closed_python_optional_type, is_typing_type\n\nfrom .config_type import Array, ConfigAnyInstance, ConfigType, ConfigTypeKind\nfrom .field_utils import FIELD_NO_DEFAULT_PROVIDED, Map, all_optional_type\n\n\ndef _is_config_type_class(obj):\n    return isinstance(obj, type) and issubclass(obj, ConfigType)\n\n\ndef helpful_list_error_string():\n    return "Please use a python list (e.g. [int]) or dagster.Array (e.g. Array(int)) instead."\n\n\nVALID_CONFIG_DESC = """\n1. A Python primitive type that resolve to dagster config\n   types: int, float, bool, str.\n\n2. A dagster config type: Int, Float, Bool, String, StringSource, Path, Any,\n   Array, Noneable, Selector, Shape, Permissive, etc.\n\n3. A bare python dictionary, which is wrapped in Shape. Any\n   values in the dictionary get resolved by the same rules, recursively.\n\n4. A bare python list of length one which itself is config type.\n   Becomes Array with list element as an argument.\n"""\n\n\n@overload\ndef resolve_to_config_type(dagster_type: Union[ConfigType, ConfigSchemaType]) -> ConfigType:\n    pass\n\n\n@overload\ndef resolve_to_config_type(dagster_type: object) -> Union[ConfigType, bool]:\n    pass\n\n\ndef resolve_to_config_type(dagster_type: object) -> Union[ConfigType, bool]:\n    from .field_utils import convert_fields_to_dict_type\n\n    # Short circuit if it's already a Config Type\n    if isinstance(dagster_type, ConfigType):\n        return dagster_type\n\n    if isinstance(dagster_type, dict):\n        # Dicts of the special form {type: value} are treated as Maps\n        # mapping from the type to value type, otherwise treat as dict type\n        if len(dagster_type) == 1:\n            key = list(dagster_type.keys())[0]\n            key_type = resolve_to_config_type(key)\n            if not isinstance(key, str):\n                if not key_type:\n                    raise DagsterInvalidDefinitionError(\n                        "Invalid key in map specification: {key} in map {collection}".format(\n                            key=repr(key), collection=dagster_type\n                        )\n                    )\n\n                if not key_type.kind == ConfigTypeKind.SCALAR:\n                    raise DagsterInvalidDefinitionError(\n                        "Non-scalar key in map specification: {key} in map {collection}".format(\n                            key=repr(key), collection=dagster_type\n                        )\n                    )\n\n                inner_type = resolve_to_config_type(dagster_type[key])\n\n                if not inner_type:\n                    raise DagsterInvalidDefinitionError(\n                        "Invalid value in map specification: {value} in map {collection}".format(\n                            value=repr(dagster_type[str]), collection=dagster_type\n                        )\n                    )\n                return Map(key_type, inner_type)\n        return convert_fields_to_dict_type(dagster_type)\n\n    if isinstance(dagster_type, list):\n        if len(dagster_type) != 1:\n            raise DagsterInvalidDefinitionError("Array specifications must only be of length 1")\n\n        
inner_type = resolve_to_config_type(dagster_type[0])\n\n        if not inner_type:\n            raise DagsterInvalidDefinitionError(\n                "Invalid member of array specification: {value} in list {the_list}".format(\n                    value=repr(dagster_type[0]), the_list=dagster_type\n                )\n            )\n        return Array(inner_type)\n\n    from dagster.core.types.dagster_type import DagsterType, List, ListType\n    from dagster.core.types.python_set import Set, _TypedPythonSet\n    from dagster.core.types.python_tuple import Tuple, _TypedPythonTuple\n\n    if _is_config_type_class(dagster_type):\n        check.param_invariant(\n            False,\n            "dagster_type",\n            f"Cannot pass config type class {dagster_type} to resolve_to_config_type. "\n            "This error usually occurs when you pass a dagster config type class instead of a class instance into "\n            'another dagster config type. E.g. "Noneable(Permissive)" should instead be "Noneable(Permissive())".',\n        )\n\n    if isinstance(dagster_type, type) and issubclass(dagster_type, DagsterType):\n        raise DagsterInvalidDefinitionError(\n            "You have passed a DagsterType class {dagster_type} to the config system. "\n            "The DagsterType and config schema systems are separate. "\n            "Valid config values are:\\n{desc}".format(\n                dagster_type=repr(dagster_type),\n                desc=VALID_CONFIG_DESC,\n            )\n        )\n\n    if is_closed_python_optional_type(dagster_type):\n        raise DagsterInvalidDefinitionError(\n            "Cannot use typing.Optional as a config type. If you want this field to be "\n            "optional, please use Field(<type>, is_required=False), and if you want this field to "\n            "be required, but accept a value of None, use dagster.Noneable(<type>)."\n        )\n\n    if is_typing_type(dagster_type):\n        raise DagsterInvalidDefinitionError(\n            (\n                "You have passed in {dagster_type} to the config system. Types from "\n                "the typing module in python are not allowed in the config system. "\n                "You must use types that are imported from dagster or primitive types "\n                "such as bool, int, etc."\n            ).format(dagster_type=dagster_type)\n        )\n\n    if dagster_type is List or isinstance(dagster_type, ListType):\n        raise DagsterInvalidDefinitionError(\n            "Cannot use List in the context of config. " + helpful_list_error_string()\n        )\n\n    if dagster_type is Set or isinstance(dagster_type, _TypedPythonSet):\n        raise DagsterInvalidDefinitionError(\n            "Cannot use Set in the context of a config field. " + helpful_list_error_string()\n        )\n\n    if dagster_type is Tuple or isinstance(dagster_type, _TypedPythonTuple):\n        raise DagsterInvalidDefinitionError(\n            "Cannot use Tuple in the context of a config field. " + helpful_list_error_string()\n        )\n\n    if isinstance(dagster_type, DagsterType):\n        raise DagsterInvalidDefinitionError(\n            (\n                "You have passed an instance of DagsterType {type_name} to the config "\n                "system (Repr of type: {dagster_type}). "\n                "The DagsterType and config schema systems are separate. 
"\n                "Valid config values are:\\n{desc}"\n            ).format(\n                type_name=dagster_type.display_name,\n                dagster_type=repr(dagster_type),\n                desc=VALID_CONFIG_DESC,\n            ),\n        )\n\n    # If we are passed here either:\n    #  1) We have been passed a python builtin\n    #  2) We have been a dagster wrapping type that needs to be convert its config variant\n    #     e.g. dagster.List\n    #  2) We have been passed an invalid thing. We return False to signify this. It is\n    #     up to callers to report a reasonable error.\n\n    from dagster.primitive_mapping import (\n        is_supported_config_python_builtin,\n        remap_python_builtin_for_config,\n    )\n\n    if BuiltinEnum.contains(dagster_type):\n        return ConfigType.from_builtin_enum(dagster_type)\n\n    if is_supported_config_python_builtin(dagster_type):\n        return remap_python_builtin_for_config(dagster_type)\n\n    if dagster_type is None:\n        return ConfigAnyInstance\n\n    # This means that this is an error and we are return False to a callsite\n    # We do the error reporting there because those callsites have more context\n    return False\n\n\ndef has_implicit_default(config_type):\n    if config_type.kind == ConfigTypeKind.NONEABLE:\n        return True\n\n    return all_optional_type(config_type)\n\n\n
[docs]class Field:\n """Defines the schema for a configuration field.\n\n Fields are used in config schema instead of bare types when one wants to add a description,\n a default value, or to mark it as not required.\n\n Config fields are parsed according to their schemas in order to yield values available at\n job execution time through the config system. Config fields can be set on ops, on\n loaders and materializers for custom, and on other pluggable components of the system, such as\n resources, loggers, and executors.\n\n\n Args:\n config (Any): The schema for the config. This value can be any of:\n\n 1. A Python primitive type that resolves to a Dagster config type\n (:py:class:`~python:int`, :py:class:`~python:float`, :py:class:`~python:bool`,\n :py:class:`~python:str`, or :py:class:`~python:list`).\n\n 2. A Dagster config type:\n\n * :py:data:`~dagster.Any`\n * :py:class:`~dagster.Array`\n * :py:data:`~dagster.Bool`\n * :py:data:`~dagster.Enum`\n * :py:data:`~dagster.Float`\n * :py:data:`~dagster.Int`\n * :py:data:`~dagster.IntSource`\n * :py:data:`~dagster.Noneable`\n * :py:class:`~dagster.Permissive`\n * :py:class:`~dagster.ScalarUnion`\n * :py:class:`~dagster.Selector`\n * :py:class:`~dagster.Shape`\n * :py:data:`~dagster.String`\n * :py:data:`~dagster.StringSource`\n\n 3. A bare python dictionary, which will be automatically wrapped in\n :py:class:`~dagster.Shape`. Values of the dictionary are resolved recursively\n according to the same rules.\n\n 4. A bare python list of length one which itself is config type.\n Becomes :py:class:`Array` with list element as an argument.\n\n default_value (Any):\n A default value for this field, conformant to the schema set by the ``dagster_type``\n argument. If a default value is provided, ``is_required`` should be ``False``.\n\n Note: for config types that do post processing such as Enum, this value must be\n the pre processed version, ie use ``ExampleEnum.VALUE.name`` instead of\n ``ExampleEnum.VALUE``\n\n is_required (bool):\n Whether the presence of this field is required. Defaults to true. If ``is_required``\n is ``True``, no default value should be provided.\n\n description (str):\n A human-readable description of this config field.\n\n Examples:\n\n .. code-block:: python\n\n @op(\n config_schema={\n 'word': Field(str, description='I am a word.'),\n 'repeats': Field(Int, default_value=1, is_required=False),\n }\n )\n def repeat_word(context):\n return context.op_config['word'] * context.op_config['repeats']\n """\n\n def _resolve_config_arg(self, config):\n if isinstance(config, ConfigType):\n return config\n\n config_type = resolve_to_config_type(config)\n if not config_type:\n raise DagsterInvalidDefinitionError(\n (\n "Attempted to pass {value_repr} to a Field that expects a valid "\n "dagster type usable in config (e.g. 
Dict, Int, String et al)."\n ).format(value_repr=repr(config))\n )\n return config_type\n\n def __init__(\n self,\n config,\n default_value=FIELD_NO_DEFAULT_PROVIDED,\n is_required=None,\n description=None,\n ):\n from .post_process import resolve_defaults\n from .validate import validate_config\n\n self.config_type = check.inst(self._resolve_config_arg(config), ConfigType)\n\n self.description = check.opt_str_param(description, "description")\n\n check.opt_bool_param(is_required, "is_required")\n\n if default_value != FIELD_NO_DEFAULT_PROVIDED:\n check.param_invariant(\n not (callable(default_value)), "default_value", "default_value cannot be a callable"\n )\n\n if is_required is True:\n check.param_invariant(\n default_value == FIELD_NO_DEFAULT_PROVIDED,\n "default_value",\n "required arguments should not specify default values",\n )\n\n self._default_value = default_value\n\n # check explicit default value\n if self.default_provided:\n if self.config_type.kind == ConfigTypeKind.ENUM and is_enum_value(default_value):\n raise DagsterInvalidDefinitionError(\n (\n "You have passed into a python enum value as the default value "\n "into of a config enum type {name}. You must pass in the underlying "\n "string represention as the default value. One of {value_set}."\n ).format(\n value_set=[ev.config_value for ev in self.config_type.enum_values], # type: ignore\n name=self.config_type.given_name,\n )\n )\n\n evr = validate_config(self.config_type, default_value)\n if not evr.success:\n raise DagsterInvalidConfigError(\n "Invalid default_value for Field.",\n evr.errors,\n default_value,\n )\n\n if is_required is None:\n is_optional = has_implicit_default(self.config_type) or self.default_provided\n is_required = not is_optional\n\n # on implicitly optional - set the default value\n # by resolving the defaults of the type\n if is_optional and not self.default_provided:\n evr = resolve_defaults(self.config_type, None)\n if not evr.success:\n raise DagsterInvalidConfigError(\n "Unable to resolve implicit default_value for Field.",\n evr.errors,\n None,\n )\n self._default_value = evr.value\n self._is_required = is_required\n\n @property\n def is_required(self) -> bool:\n return self._is_required\n\n @property\n def default_provided(self) -> bool:\n """Was a default value provided\n\n Returns:\n bool: Yes or no\n """\n return self._default_value != FIELD_NO_DEFAULT_PROVIDED\n\n @property\n def default_value(self) -> Any:\n check.invariant(self.default_provided, "Asking for default value when none was provided")\n return self._default_value\n\n @property\n def default_value_as_json_str(self) -> str:\n check.invariant(self.default_provided, "Asking for default value when none was provided")\n return serialize_value(self.default_value)\n\n def __repr__(self):\n return ("Field({config_type}, default={default}, is_required={is_required})").format(\n config_type=self.config_type,\n default="@"\n if self._default_value == FIELD_NO_DEFAULT_PROVIDED\n else self._default_value,\n is_required=self.is_required,\n )
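A short sketch (assuming the ``dagster`` package that ships this module) of the defaulting behaviour implemented in ``__init__`` above:

.. code-block:: python

    from dagster import Field, Noneable

    count = Field(int, default_value=3, is_required=False, description="How many times to repeat.")
    assert count.default_provided and count.default_value == 3

    # A Noneable field picks up an implicit default of None and becomes optional automatically.
    maybe_name = Field(Noneable(str))
    assert not maybe_name.is_required
    assert maybe_name.default_value is None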
\n\n\ndef check_opt_field_param(obj, param_name):\n return check.opt_inst_param(obj, param_name, Field)\n
", "current_page_name": "_modules/dagster/config/field", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.config.field"}, "field_utils": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.config.field_utils

\n# encoding: utf-8\nimport hashlib\nfrom typing import TYPE_CHECKING, Any, Dict, List\n\nimport dagster._check as check\nfrom dagster.core.errors import DagsterInvalidConfigDefinitionError\n\nfrom .config_type import Array, ConfigType, ConfigTypeKind\n\nif TYPE_CHECKING:\n    from dagster.config.field import Field\n\n\ndef all_optional_type(config_type: ConfigType) -> bool:\n    check.inst_param(config_type, "config_type", ConfigType)\n\n    if ConfigTypeKind.is_shape(config_type.kind):\n        for field in config_type.fields.values():  # type: ignore\n            if field.is_required:\n                return False\n        return True\n\n    if ConfigTypeKind.is_selector(config_type.kind):\n        if len(config_type.fields) == 1:  # type: ignore\n            for field in config_type.fields.values():  # type: ignore\n                if field.is_required:\n                    return False\n            return True\n\n    return False\n\n\nclass __FieldValueSentinel:\n    pass\n\n\nclass __InferOptionalCompositeFieldSentinel:\n    pass\n\n\nFIELD_NO_DEFAULT_PROVIDED = __FieldValueSentinel\n\nINFER_OPTIONAL_COMPOSITE_FIELD = __InferOptionalCompositeFieldSentinel\n\n\nclass _ConfigHasFields(ConfigType):\n    def __init__(self, fields, **kwargs):\n        self.fields = expand_fields_dict(fields)\n        super(_ConfigHasFields, self).__init__(**kwargs)\n\n\nFIELD_HASH_CACHE: Dict[str, Any] = {}\n\n\ndef _memoize_inst_in_field_cache(passed_cls, defined_cls, key):\n    if key in FIELD_HASH_CACHE:\n        return FIELD_HASH_CACHE[key]\n\n    defined_cls_inst = super(defined_cls, passed_cls).__new__(defined_cls)\n\n    FIELD_HASH_CACHE[key] = defined_cls_inst\n    return defined_cls_inst\n\n\ndef _add_hash(m, string):\n    m.update(string.encode("utf-8"))\n\n\ndef _compute_fields_hash(fields, description, field_aliases=None):\n\n    m = hashlib.sha1()  # so that hexdigest is 40, not 64 bytes\n    if description:\n        _add_hash(m, ":description: " + description)\n\n    for field_name in sorted(list(fields.keys())):\n        field = fields[field_name]\n        _add_hash(m, ":fieldname:" + field_name)\n        if field.default_provided:\n            _add_hash(m, ":default_value: " + field.default_value_as_json_str)\n        _add_hash(m, ":is_required: " + str(field.is_required))\n        _add_hash(m, ":type_key: " + field.config_type.key)\n        if field.description:\n            _add_hash(m, ":description: " + field.description)\n\n    field_aliases = check.opt_dict_param(\n        field_aliases, "field_aliases", key_type=str, value_type=str\n    )\n    for field_name in sorted(list(field_aliases.keys())):\n        field_alias = field_aliases[field_name]\n        _add_hash(m, ":fieldname: " + field_name)\n        _add_hash(m, ":fieldalias: " + field_alias)\n\n    return m.hexdigest()\n\n\ndef _define_shape_key_hash(fields, description, field_aliases):\n    return "Shape." + _compute_fields_hash(fields, description, field_aliases=field_aliases)\n\n\n
[docs]class Shape(_ConfigHasFields):\n """Schema for configuration data with string keys and typed values via :py:class:`Field`.\n\n Unlike :py:class:`Permissive`, unspecified fields are not allowed and will throw a\n :py:class:`~dagster.DagsterInvalidConfigError`.\n\n Args:\n fields (Dict[str, Field]):\n The specification of the config dict.\n field_aliases (Dict[str, str]):\n Maps a string key to an alias that can be used instead of the original key. For example,\n an entry {"solids": "ops"} means that someone could use "ops" instead of "solids" as a\n top level string key.\n """\n\n def __new__(\n cls,\n fields,\n description=None,\n field_aliases=None,\n ):\n return _memoize_inst_in_field_cache(\n cls,\n Shape,\n _define_shape_key_hash(expand_fields_dict(fields), description, field_aliases),\n )\n\n def __init__(\n self,\n fields,\n description=None,\n field_aliases=None,\n ):\n fields = expand_fields_dict(fields)\n super(Shape, self).__init__(\n kind=ConfigTypeKind.STRICT_SHAPE,\n key=_define_shape_key_hash(fields, description, field_aliases),\n description=description,\n fields=fields,\n )\n self.field_aliases = check.opt_dict_param(\n field_aliases, "field_aliases", key_type=str, value_type=str\n )
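For illustration (a hedged sketch, not from the source), the bare-dict shorthand used throughout these docs is equivalent to constructing the ``Shape`` explicitly:

.. code-block:: python

    from dagster import Field, Int, Shape, String, op

    # config_schema={"word": ..., "repeats": ...} would produce the same Shape implicitly.
    @op(config_schema=Shape({"word": Field(String), "repeats": Field(Int, default_value=1, is_required=False)}))
    def repeat_word(context):
        # Any key not declared in the Shape would fail config validation.
        return context.op_config["word"] * context.op_config["repeats"]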
\n\n\n
[docs]class Map(ConfigType):\n """Defines a config dict with arbitrary scalar keys and typed values.\n\n A map can contain arbitrary keys of the specified scalar type, each of which has\n type checked values. Unlike :py:class:`Shape` and :py:class:`Permissive`, scalar\n keys other than strings can be used, and unlike :py:class:`Permissive`, all\n values are type checked.\n\n Args:\n key_type (type):\n The type of keys this map can contain. Must be a scalar type.\n inner_type (type):\n The type of the values that this map type can contain.\n key_label_name (string):\n Optional name which describes the role of keys in the map.\n\n **Examples:**\n\n .. code-block:: python\n\n @op(config_schema=Field(Map(str, int)))\n def partially_specified_config(context) -> List:\n return sorted(list(context.op_config.items()))\n """\n\n def __init__(self, key_type, inner_type, key_label_name=None):\n from .field import resolve_to_config_type\n\n self.key_type = resolve_to_config_type(key_type)\n self.inner_type = resolve_to_config_type(inner_type)\n self.given_name = key_label_name\n\n check.inst_param(self.key_type, "key_type", ConfigType)\n check.inst_param(self.inner_type, "inner_type", ConfigType)\n check.param_invariant(\n self.key_type.kind == ConfigTypeKind.SCALAR, "key_type", "Key type must be a scalar"\n )\n check.opt_str_param(self.given_name, "name")\n\n super(Map, self).__init__(\n key="Map.{key_type}.{inner_type}{name_key}".format(\n key_type=self.key_type.key,\n inner_type=self.inner_type.key,\n name_key=f":name: {key_label_name}" if key_label_name else "",\n ),\n # We use the given name field to store the key label name\n # this is used elsewhere to give custom types names\n given_name=key_label_name,\n type_params=[self.key_type, self.inner_type],\n kind=ConfigTypeKind.MAP,\n )\n\n @property\n def key_label_name(self):\n return self.given_name
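To make the key/value typing concrete, a hedged execution sketch for a ``Map`` field (op and job names here are illustrative):

.. code-block:: python

    from dagster import Field, Map, job, op

    @op(config_schema=Field(Map(str, int)))
    def totals(context):
        return sum(context.op_config.values())

    @job
    def totals_job():
        totals()

    # Arbitrary string keys are allowed; every value must type-check as an int.
    result = totals_job.execute_in_process(
        run_config={"ops": {"totals": {"config": {"apples": 3, "pears": 4}}}}
    )
    assert result.output_for_node("totals") == 7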
\n\n\ndef _define_permissive_dict_key(fields, description):\n return (\n "Permissive." + _compute_fields_hash(fields, description=description)\n if fields\n else "Permissive"\n )\n\n\n
[docs]class Permissive(_ConfigHasFields):\n """Defines a config dict with a partially specified schema.\n\n A permissive dict allows partial specification of the config schema. Any fields with a\n specified schema will be type checked. Other fields will be allowed, but will be ignored by\n the type checker.\n\n Args:\n fields (Dict[str, Field]): The partial specification of the config dict.\n\n **Examples:**\n\n .. code-block:: python\n\n @op(config_schema=Field(Permissive({'required': Field(String)})))\n def map_config_op(context) -> List:\n return sorted(list(context.op_config.items()))\n """\n\n def __new__(cls, fields=None, description=None):\n return _memoize_inst_in_field_cache(\n cls,\n Permissive,\n _define_permissive_dict_key(\n expand_fields_dict(fields) if fields else None, description\n ),\n )\n\n def __init__(self, fields=None, description=None):\n fields = expand_fields_dict(fields) if fields else None\n super(Permissive, self).__init__(\n key=_define_permissive_dict_key(fields, description),\n kind=ConfigTypeKind.PERMISSIVE_SHAPE,\n fields=fields or dict(),\n description=description,\n )
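A hedged sketch of the pass-through behaviour described above (op and job names are illustrative):

.. code-block:: python

    from dagster import Field, Permissive, String, job, op

    @op(config_schema=Permissive({"required": Field(String)}))
    def partially_specified(context):
        return sorted(context.op_config.items())

    @job
    def permissive_job():
        partially_specified()

    # "required" is type checked; "extra" is accepted but not validated.
    result = permissive_job.execute_in_process(
        run_config={"ops": {"partially_specified": {"config": {"required": "yes", "extra": 1}}}}
    )
    assert dict(result.output_for_node("partially_specified")) == {"required": "yes", "extra": 1}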
\n\n\ndef _define_selector_key(fields, description):\n return "Selector." + _compute_fields_hash(fields, description=description)\n\n\n
[docs]class Selector(_ConfigHasFields):\n """Define a config field requiring the user to select one option.\n\n Selectors are used when you want to be able to present several different options in config but\n allow only one to be selected. For example, a single input might be read in from either a csv\n file or a parquet file, but not both at once.\n\n Note that in some other type systems this might be called an 'input union'.\n\n Functionally, a selector is like a :py:class:`Dict`, except that only one key from the dict can\n be specified in valid config.\n\n Args:\n fields (Dict[str, Field]): The fields from which the user must select.\n\n **Examples:**\n\n .. code-block:: python\n\n @op(\n config_schema=Field(\n Selector(\n {\n 'haw': {'whom': Field(String, default_value='honua', is_required=False)},\n 'cn': {'whom': Field(String, default_value='\u4e16\u754c', is_required=False)},\n 'en': {'whom': Field(String, default_value='world', is_required=False)},\n }\n ),\n is_required=False,\n default_value={'en': {'whom': 'world'}},\n )\n )\n def hello_world_with_default(context):\n if 'haw' in context.op_config:\n return 'Aloha {whom}!'.format(whom=context.op_config['haw']['whom'])\n if 'cn' in context.op_config:\n return '\u4f60\u597d\uff0c{whom}!'.format(whom=context.op_config['cn']['whom'])\n if 'en' in context.op_config:\n return 'Hello, {whom}!'.format(whom=context.op_config['en']['whom'])\n """\n\n def __new__(cls, fields, description=None):\n return _memoize_inst_in_field_cache(\n cls,\n Selector,\n _define_selector_key(expand_fields_dict(fields), description),\n )\n\n def __init__(self, fields, description=None):\n fields = expand_fields_dict(fields)\n super(Selector, self).__init__(\n key=_define_selector_key(fields, description),\n kind=ConfigTypeKind.SELECTOR,\n fields=fields,\n description=description,\n )
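Following the csv/parquet scenario in the docstring, a brief hedged sketch of how only one branch of a ``Selector`` may be supplied at run time:

.. code-block:: python

    from dagster import Field, Selector, String, job, op

    @op(
        config_schema=Selector(
            {
                "csv": {"path": Field(String)},
                "parquet": {"path": Field(String)},
            }
        )
    )
    def read_table(context):
        fmt, cfg = next(iter(context.op_config.items()))
        return f"reading {cfg['path']} as {fmt}"

    @job
    def read_job():
        read_table()

    # Supplying both "csv" and "parquet" here would fail config validation.
    read_job.execute_in_process(
        run_config={"ops": {"read_table": {"config": {"csv": {"path": "data.csv"}}}}}
    )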
\n\n\n# Config syntax expansion code below\n\n\ndef is_potential_field(potential_field: object) -> bool:\n from .field import Field, resolve_to_config_type\n\n return isinstance(potential_field, (Field, dict, list)) or bool(\n resolve_to_config_type(potential_field)\n )\n\n\ndef convert_fields_to_dict_type(fields: Dict[str, object]):\n return _convert_fields_to_dict_type(fields, fields, [])\n\n\ndef _convert_fields_to_dict_type(\n original_root: object, fields: Dict[str, object], stack: List[str]\n) -> Shape:\n return Shape(_expand_fields_dict(original_root, fields, stack))\n\n\ndef expand_fields_dict(fields: Dict[str, object]) -> Dict[str, "Field"]:\n return _expand_fields_dict(fields, fields, [])\n\n\ndef _expand_fields_dict(\n original_root: object, fields: Dict[str, object], stack: List[str]\n) -> Dict[str, "Field"]:\n check.dict_param(fields, "fields")\n return {\n name: _convert_potential_field(original_root, value, stack + [name])\n for name, value in fields.items()\n }\n\n\ndef expand_list(original_root: object, the_list: List[object], stack: List[str]) -> Array:\n\n if len(the_list) != 1:\n raise DagsterInvalidConfigDefinitionError(\n original_root, the_list, stack, "List must be of length 1"\n )\n\n inner_type = _convert_potential_type(original_root, the_list[0], stack)\n if not inner_type:\n raise DagsterInvalidConfigDefinitionError(\n original_root,\n the_list,\n stack,\n "List have a single item and contain a valid type i.e. [int]. Got item {}".format(\n repr(the_list[0])\n ),\n )\n\n return Array(inner_type)\n\n\ndef expand_map(original_root: object, the_dict: Dict[object, object], stack: List[str]) -> Map:\n\n if len(the_dict) != 1:\n raise DagsterInvalidConfigDefinitionError(\n original_root, the_dict, stack, "Map dict must be of length 1"\n )\n\n key = list(the_dict.keys())[0]\n key_type = _convert_potential_type(original_root, key, stack)\n if not key_type or not key_type.kind == ConfigTypeKind.SCALAR:\n raise DagsterInvalidConfigDefinitionError(\n original_root,\n the_dict,\n stack,\n "Map dict must have a scalar type as its only key. Got key {}".format(repr(key)),\n )\n\n inner_type = _convert_potential_type(original_root, the_dict[key], stack)\n if not inner_type:\n raise DagsterInvalidConfigDefinitionError(\n original_root,\n the_dict,\n stack,\n "Map must have a single value and contain a valid type i.e. {{str: int}}. 
Got item {}".format(\n repr(the_dict[key])\n ),\n )\n\n return Map(key_type, inner_type)\n\n\ndef convert_potential_field(potential_field: object) -> "Field":\n return _convert_potential_field(potential_field, potential_field, [])\n\n\ndef _convert_potential_type(original_root: object, potential_type, stack: List[str]):\n from .field import resolve_to_config_type\n\n if isinstance(potential_type, dict):\n # A dictionary, containing a single key which is a type (int, str, etc) and not a string is interpreted as a Map\n if len(potential_type) == 1:\n key = list(potential_type.keys())[0]\n if not isinstance(key, str) and _convert_potential_type(original_root, key, stack):\n return expand_map(original_root, potential_type, stack)\n\n # Otherwise, the dictionary is interpreted as a Shape\n return Shape(_expand_fields_dict(original_root, potential_type, stack))\n\n if isinstance(potential_type, list):\n return expand_list(original_root, potential_type, stack)\n\n return resolve_to_config_type(potential_type)\n\n\ndef _convert_potential_field(\n original_root: object, potential_field: object, stack: List[str]\n) -> "Field":\n from .field import Field\n\n if potential_field is None:\n raise DagsterInvalidConfigDefinitionError(\n original_root, potential_field, stack, reason="Fields cannot be None"\n )\n\n if not is_potential_field(potential_field):\n raise DagsterInvalidConfigDefinitionError(original_root, potential_field, stack)\n\n if isinstance(potential_field, Field):\n return potential_field\n\n return Field(_convert_potential_type(original_root, potential_field, stack))\n
", "current_page_name": "_modules/dagster/config/field_utils", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.config.field_utils"}}, "core": {"asset_defs": {"asset_group": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.asset_defs.asset_group

\nimport inspect\nimport os\nimport pkgutil\nimport warnings\nfrom collections import defaultdict\nfrom importlib import import_module\nfrom types import ModuleType\nfrom typing import (\n    Any,\n    Dict,\n    FrozenSet,\n    Generator,\n    Iterable,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster.core.definitions.dependency import NodeHandle\nfrom dagster.core.definitions.events import AssetKey\nfrom dagster.core.definitions.executor_definition import in_process_executor\nfrom dagster.core.errors import DagsterUnmetExecutorRequirementsError\nfrom dagster.core.execution.execute_in_process_result import ExecuteInProcessResult\nfrom dagster.core.selector.subset_selector import AssetSelectionData\nfrom dagster.core.storage.fs_io_manager import fs_io_manager\nfrom dagster.utils import merge_dicts\nfrom dagster.utils.backcompat import ExperimentalWarning\n\nfrom ..definitions.asset_layer import build_asset_selection_job\nfrom ..definitions.executor_definition import ExecutorDefinition\nfrom ..definitions.job_definition import JobDefinition\nfrom ..definitions.partition import PartitionsDefinition\nfrom ..definitions.resource_definition import ResourceDefinition\nfrom ..errors import DagsterInvalidDefinitionError\nfrom .assets import AssetsDefinition\nfrom .assets_job import build_assets_job\nfrom .source_asset import SourceAsset\n\nASSET_GROUP_BASE_JOB_PREFIX = "__ASSET_GROUP"\n\n\n
[docs]class AssetGroup:\n """Defines a group of assets, along with environment information in the\n form of resources and an executor.\n\n An AssetGroup can be provided to a :py:class:`RepositoryDefinition`. When\n provided to a repository, the constituent assets can be materialized from\n Dagit. The AssetGroup also provides an interface for creating jobs from\n subselections of assets, which can then be provided to a\n :py:class:`ScheduleDefinition` or :py:class:`SensorDefinition`.\n\n There can only be one AssetGroup per repository.\n\n Args:\n assets (Sequence[AssetsDefinition]): The set of software-defined assets\n to group.\n source_assets (Optional[Sequence[SourceAsset]]): The set of source\n assets that the software-defined may depend on.\n resource_defs (Optional[Mapping[str, ResourceDefinition]]): A\n dictionary of resource definitions. When the AssetGroup is\n constructed, if there are any unsatisfied resource requirements\n from the assets, it will result in an error. Note that the\n `root_manager` key is a reserved resource key, and will result in\n an error if provided by the user.\n executor_def (Optional[ExecutorDefinition]): The executor definition to\n use when re-materializing assets in this group.\n\n Examples:\n\n .. code-block:: python\n\n from dagster import AssetGroup, asset, AssetIn, AssetKey, SourceAsset, resource\n\n source_asset = SourceAsset("source")\n\n @asset(required_resource_keys={"foo"})\n def start_asset(context, source):\n ...\n\n @asset\n def next_asset(start_asset):\n ...\n\n @resource\n def foo_resource():\n ...\n\n asset_group = AssetGroup(\n assets=[start_asset, next_asset],\n source_assets=[source_asset],\n resource_defs={"foo": foo_resource},\n )\n ...\n\n """\n\n def __init__(\n self,\n assets: Sequence[AssetsDefinition],\n source_assets: Optional[Sequence[SourceAsset]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n executor_def: Optional[ExecutorDefinition] = None,\n ):\n check.sequence_param(assets, "assets", of_type=AssetsDefinition)\n\n source_assets = check.opt_sequence_param(\n source_assets, "source_assets", of_type=SourceAsset\n )\n resource_defs = check.opt_mapping_param(\n resource_defs, "resource_defs", key_type=str, value_type=ResourceDefinition\n )\n executor_def = check.opt_inst_param(executor_def, "executor_def", ExecutorDefinition)\n\n # In the case of collisions, merge_dicts takes values from the\n # dictionary latest in the list, so we place the user provided resource\n # defs after the defaults.\n resource_defs = merge_dicts({"io_manager": fs_io_manager}, resource_defs)\n\n _validate_resource_reqs_for_asset_group(\n asset_list=assets, source_assets=source_assets, resource_defs=resource_defs\n )\n\n self._assets = assets\n self._source_assets = source_assets\n self._resource_defs = resource_defs\n self._executor_def = executor_def\n\n @property\n def assets(self):\n return self._assets\n\n @property\n def source_assets(self):\n return self._source_assets\n\n @property\n def resource_defs(self):\n return self._resource_defs\n\n @property\n def executor_def(self):\n return self._executor_def\n\n @staticmethod\n def is_base_job_name(name) -> bool:\n return name.startswith(ASSET_GROUP_BASE_JOB_PREFIX)\n\n
[docs] def build_job(\n self,\n name: str,\n selection: Optional[Union[str, List[str]]] = None,\n executor_def: Optional[ExecutorDefinition] = None,\n tags: Optional[Dict[str, Any]] = None,\n description: Optional[str] = None,\n _asset_selection_data: Optional[AssetSelectionData] = None,\n ) -> JobDefinition:\n """Defines an executable job from the provided assets, resources, and executor.\n\n Args:\n name (str): The name to give the job.\n selection (Union[str, List[str]]): A single selection query or list of selection queries\n to execute. For example:\n\n - ``['some_asset_key']`` select ``some_asset_key`` itself.\n - ``['*some_asset_key']`` select ``some_asset_key`` and all its ancestors (upstream dependencies).\n - ``['*some_asset_key+++']`` select ``some_asset_key``, all its ancestors, and its descendants (downstream dependencies) within 3 levels down.\n - ``['*some_asset_key', 'other_asset_key_a', 'other_asset_key_b+']`` select ``some_asset_key`` and all its ancestors, ``other_asset_key_a`` itself, and ``other_asset_key_b`` and its direct child asset keys. When subselecting into a multi-asset, all of the asset keys in that multi-asset must be selected.\n\n executor_def (Optional[ExecutorDefinition]): The executor\n definition to use when executing the job. Defaults to the\n executor on the AssetGroup. If no executor was provided on the\n AssetGroup, then it defaults to :py:class:`multi_or_in_process_executor`.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for any execution of the job.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. These tag values may be overwritten\n tag values provided at invocation time.\n description (Optional[str]): A description of the job.\n\n Examples:\n\n .. code-block:: python\n\n from dagster import AssetGroup\n\n the_asset_group = AssetGroup(...)\n\n job_with_all_assets = the_asset_group.build_job()\n\n job_with_one_selection = the_asset_group.build_job(selection="some_asset")\n\n job_with_multiple_selections = the_asset_group.build_job(selection=["*some_asset", "other_asset++"])\n """\n\n from dagster.core.selector.subset_selector import parse_asset_selection\n\n check.str_param(name, "name")\n check.opt_inst_param(_asset_selection_data, "_asset_selection_data", AssetSelectionData)\n\n selected_asset_keys: FrozenSet[AssetKey] = frozenset()\n if isinstance(selection, str):\n selected_asset_keys = parse_asset_selection(self.assets, [selection])\n elif isinstance(selection, list):\n selection = check.opt_list_param(selection, "selection", of_type=str)\n selected_asset_keys = parse_asset_selection(self.assets, selection)\n elif isinstance(selection, FrozenSet):\n check.opt_set_param(selection, "selection", of_type=AssetKey)\n selected_asset_keys = selection\n\n executor_def = check.opt_inst_param(\n executor_def, "executor_def", ExecutorDefinition, self.executor_def\n )\n description = check.opt_str_param(description, "description", "")\n tags = check.opt_dict_param(tags, "tags", key_type=str)\n\n return build_asset_selection_job(\n name=name,\n assets=self.assets,\n source_assets=self.source_assets,\n executor_def=executor_def,\n resource_defs=self.resource_defs,\n description=description,\n tags=tags,\n asset_selection=selected_asset_keys,\n )
\n\n
[docs] def to_source_assets(self) -> Sequence[SourceAsset]:\n """\n Returns a list of source assets corresponding to all the non-source assets in this group.\n """\n return [\n source_asset\n for assets_def in self.assets\n for source_asset in assets_def.to_source_assets()\n ]
\n\n
[docs] @staticmethod\n def from_package_module(\n package_module: ModuleType,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n executor_def: Optional[ExecutorDefinition] = None,\n extra_source_assets: Optional[Sequence[SourceAsset]] = None,\n ) -> "AssetGroup":\n """\n Constructs an AssetGroup that includes all asset definitions and source assets in all\n sub-modules of the given package module.\n\n A package module is the result of importing a package.\n\n Args:\n package_module (ModuleType): The package module to look for assets inside.\n resource_defs (Optional[Mapping[str, ResourceDefinition]]): A dictionary of resource\n definitions to include on the returned asset group.\n executor_def (Optional[ExecutorDefinition]): An executor to include on the returned\n asset group.\n extra_source_assets (Optional[Sequence[SourceAsset]]): Source assets to include in the\n group in addition to the source assets found in the package.\n\n Returns:\n AssetGroup: An asset group with all the assets in the package.\n """\n return AssetGroup.from_modules(\n _find_modules_in_package(package_module),\n resource_defs=resource_defs,\n executor_def=executor_def,\n extra_source_assets=extra_source_assets,\n )
\n\n
[docs] @staticmethod\n def from_package_name(\n package_name: str,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n executor_def: Optional[ExecutorDefinition] = None,\n extra_source_assets: Optional[Sequence[SourceAsset]] = None,\n ) -> "AssetGroup":\n """\n Constructs an AssetGroup that includes all asset definitions and source assets in all\n sub-modules of the given package.\n\n Args:\n package_name (str): The name of a Python package to look for assets inside.\n resource_defs (Optional[Mapping[str, ResourceDefinition]]): A dictionary of resource\n definitions to include on the returned asset group.\n executor_def (Optional[ExecutorDefinition]): An executor to include on the returned\n asset group.\n extra_source_assets (Optional[Sequence[SourceAsset]]): Source assets to include in the\n group in addition to the source assets found in the package.\n\n Returns:\n AssetGroup: An asset group with all the assets in the package.\n """\n package_module = import_module(package_name)\n return AssetGroup.from_package_module(\n package_module,\n resource_defs=resource_defs,\n executor_def=executor_def,\n extra_source_assets=extra_source_assets,\n )
\n\n
[docs] @staticmethod\n def from_modules(\n modules: Iterable[ModuleType],\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n executor_def: Optional[ExecutorDefinition] = None,\n extra_source_assets: Optional[Sequence[SourceAsset]] = None,\n ) -> "AssetGroup":\n """\n Constructs an AssetGroup that includes all asset definitions and source assets in the given\n modules.\n\n Args:\n modules (Iterable[ModuleType]): The Python modules to look for assets inside.\n resource_defs (Optional[Mapping[str, ResourceDefinition]]): A dictionary of resource\n definitions to include on the returned asset group.\n executor_def (Optional[ExecutorDefinition]): An executor to include on the returned\n asset group.\n extra_source_assets (Optional[Sequence[SourceAsset]]): Source assets to include in the\n group in addition to the source assets found in the modules.\n\n Returns:\n AssetGroup: An asset group with all the assets defined in the given modules.\n """\n asset_ids: Set[int] = set()\n asset_keys: Dict[AssetKey, ModuleType] = dict()\n source_assets: List[SourceAsset] = list(\n check.opt_sequence_param(\n extra_source_assets, "extra_source_assets", of_type=SourceAsset\n )\n )\n assets: List[AssetsDefinition] = []\n for module in modules:\n for asset in _find_assets_in_module(module):\n if id(asset) not in asset_ids:\n asset_ids.add(id(asset))\n keys = asset.asset_keys if isinstance(asset, AssetsDefinition) else [asset.key]\n for key in keys:\n if key in asset_keys:\n modules_str = ", ".join(\n set([asset_keys[key].__name__, module.__name__])\n )\n raise DagsterInvalidDefinitionError(\n f"Asset key {key} is defined multiple times. Definitions found in modules: {modules_str}."\n )\n else:\n asset_keys[key] = module\n if isinstance(asset, SourceAsset):\n source_assets.append(asset)\n else:\n assets.append(asset)\n\n return AssetGroup(\n assets=assets,\n source_assets=source_assets,\n resource_defs=resource_defs,\n executor_def=executor_def,\n )
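A hedged usage sketch for the module-based constructors; the module names here are hypothetical:

.. code-block:: python

    from dagster import AssetGroup, repository

    from my_package import finance_assets, marketing_assets  # hypothetical modules containing @asset definitions

    asset_group = AssetGroup.from_modules([finance_assets, marketing_assets])

    @repository
    def my_repo():
        return [asset_group]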
\n\n
[docs] @staticmethod\n def from_current_module(\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n executor_def: Optional[ExecutorDefinition] = None,\n extra_source_assets: Optional[Sequence[SourceAsset]] = None,\n ) -> "AssetGroup":\n """\n Constructs an AssetGroup that includes all asset definitions and source assets in the module\n where this is called from.\n\n Args:\n resource_defs (Optional[Mapping[str, ResourceDefinition]]): A dictionary of resource\n definitions to include on the returned asset group.\n executor_def (Optional[ExecutorDefinition]): An executor to include on the returned\n asset group.\n extra_source_assets (Optional[Sequence[SourceAsset]]): Source assets to include in the\n group in addition to the source assets found in the module.\n\n Returns:\n AssetGroup: An asset group with all the assets defined in the module.\n """\n caller = inspect.stack()[1]\n module = inspect.getmodule(caller[0])\n if module is None:\n check.failed("Could not find a module for the caller")\n return AssetGroup.from_modules(\n [module], resource_defs, executor_def, extra_source_assets=extra_source_assets\n )
\n\n
[docs] def materialize(\n self, selection: Optional[Union[str, List[str]]] = None\n ) -> ExecuteInProcessResult:\n """\n Executes an in-process run that materializes all assets in the group.\n\n The execution proceeds serially, in a single thread. Only supported by AssetGroups that have\n no executor_def or that use the in-process executor.\n\n Args:\n selection (Union[str, List[str]]): A single selection query or list of selection queries\n for assets in the group. For example:\n\n - ``['some_asset_key']`` select ``some_asset_key`` itself.\n - ``['*some_asset_key']`` select ``some_asset_key`` and all its ancestors (upstream dependencies).\n - ``['*some_asset_key+++']`` select ``some_asset_key``, all its ancestors, and its descendants (downstream dependencies) within 3 levels down.\n - ``['*some_asset_key', 'other_asset_key_a', 'other_asset_key_b+']`` select ``some_asset_key`` and all its ancestors, ``other_asset_key_a`` itself, and ``other_asset_key_b`` and its direct child asset keys. When subselecting into a multi-asset, all of the asset keys in that multi-asset must be selected.\n\n Returns:\n ExecuteInProcessResult: The result of the execution.\n """\n if self.executor_def and self.executor_def is not in_process_executor:\n raise DagsterUnmetExecutorRequirementsError(\n "'materialize' can only be invoked on AssetGroups which have no executor or have "\n "the in_process_executor, but the AssetGroup had executor "\n f"'{self.executor_def.name}'"\n )\n\n return self.build_job(\n name="in_process_materialization_job", selection=selection\n ).execute_in_process()
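A minimal sketch of in-process materialization with two illustrative assets:

.. code-block:: python

    from dagster import AssetGroup, asset

    @asset
    def raw_numbers():
        return [1, 2, 3]

    @asset
    def summed(raw_numbers):
        return sum(raw_numbers)

    group = AssetGroup([raw_numbers, summed])

    # Materializes everything serially in a single process; a selection such as "*summed" could also be passed.
    result = group.materialize()
    assert result.success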
\n\n
[docs] def get_base_jobs(self) -> Sequence[JobDefinition]:\n """For internal use only."""\n with warnings.catch_warnings():\n warnings.simplefilter("ignore", category=ExperimentalWarning)\n\n assets_by_partitions_def: Dict[\n Optional[PartitionsDefinition], List[AssetsDefinition]\n ] = defaultdict(list)\n for assets_def in self.assets:\n assets_by_partitions_def[assets_def.partitions_def].append(assets_def)\n\n if len(assets_by_partitions_def.keys()) == 0 or assets_by_partitions_def.keys() == {\n None\n }:\n return [self.build_job(ASSET_GROUP_BASE_JOB_PREFIX)]\n else:\n unpartitioned_assets = assets_by_partitions_def.get(None, [])\n jobs = []\n\n # sort to ensure some stability in the ordering\n for i, (partitions_def, assets_with_partitions) in enumerate(\n sorted(assets_by_partitions_def.items(), key=lambda item: repr(item[0]))\n ):\n if partitions_def is not None:\n jobs.append(\n build_assets_job(\n f"{ASSET_GROUP_BASE_JOB_PREFIX}_{i}",\n assets=assets_with_partitions + unpartitioned_assets,\n source_assets=[*self.source_assets, *self.assets],\n resource_defs=self.resource_defs,\n executor_def=self.executor_def,\n )\n )\n\n return jobs
\n\n
[docs] def prefixed(self, key_prefix: str):\n """\n Returns an AssetGroup that's identical to this AssetGroup, but with prefixes on all the\n asset keys. The prefix is not added to source assets.\n\n Input asset keys that reference other assets within the group are "brought along" -\n i.e. prefixed as well.\n\n Example with a single asset:\n\n .. code-block:: python\n\n @asset\n def asset1():\n ...\n\n result = AssetGroup([asset1]).prefixed("my_prefix")\n assert result.assets[0].asset_key == AssetKey(["my_prefix", "asset1"])\n\n Example with dependencies within the list of assets:\n\n .. code-block:: python\n\n @asset\n def asset1():\n ...\n\n @asset\n def asset2(asset1):\n ...\n\n result = AssetGroup([asset1, asset2]).prefixed("my_prefix")\n assert result.assets[0].asset_key == AssetKey(["my_prefix", "asset1"])\n assert result.assets[1].asset_key == AssetKey(["my_prefix", "asset2"])\n assert result.assets[1].dependency_asset_keys == {AssetKey(["my_prefix", "asset1"])}\n\n Examples with input prefixes provided by source assets:\n\n .. code-block:: python\n\n asset1 = SourceAsset(AssetKey(["upstream_prefix", "asset1"]))\n\n @asset\n def asset2(asset1):\n ...\n\n result = AssetGroup([asset2], source_assets=[asset1]).prefixed("my_prefix")\n assert len(result.assets) == 1\n assert result.assets[0].asset_key == AssetKey(["my_prefix", "asset2"])\n assert result.assets[0].dependency_asset_keys == {AssetKey(["upstream_prefix", "asset1"])}\n assert result.source_assets[0].key == AssetKey(["upstream_prefix", "asset1"])\n """\n\n asset_keys = {\n asset_key for assets_def in self.assets for asset_key in assets_def.asset_keys\n }\n\n result_assets: List[AssetsDefinition] = []\n for assets_def in self.assets:\n output_asset_key_replacements = {\n asset_key: AssetKey([key_prefix] + asset_key.path)\n for asset_key in assets_def.asset_keys\n }\n input_asset_key_replacements = {}\n for dep_asset_key in assets_def.dependency_asset_keys:\n if dep_asset_key in asset_keys:\n input_asset_key_replacements[dep_asset_key] = AssetKey(\n (key_prefix, *dep_asset_key.path)\n )\n\n result_assets.append(\n assets_def.with_replaced_asset_keys(\n output_asset_key_replacements=output_asset_key_replacements,\n input_asset_key_replacements=input_asset_key_replacements,\n )\n )\n\n return AssetGroup(\n assets=result_assets,\n source_assets=self.source_assets,\n resource_defs=self.resource_defs,\n executor_def=self.executor_def,\n )
\n\n def __add__(self, other: "AssetGroup") -> "AssetGroup":\n check.inst_param(other, "other", AssetGroup)\n\n if self.resource_defs != other.resource_defs:\n raise DagsterInvalidDefinitionError(\n "Can't add asset groups together with different resource definition dictionaries"\n )\n\n if self.executor_def != other.executor_def:\n raise DagsterInvalidDefinitionError(\n "Can't add asset groups together with different executor definitions"\n )\n\n return AssetGroup(\n assets=self.assets + other.assets,\n source_assets=self.source_assets + other.source_assets,\n resource_defs=self.resource_defs,\n executor_def=self.executor_def,\n )\n\n def __eq__(self, other: object) -> bool:\n return (\n isinstance(other, AssetGroup)\n and self.assets == other.assets\n and self.source_assets == other.source_assets\n and self.resource_defs == other.resource_defs\n and self.executor_def == other.executor_def\n )
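Because ``__add__`` requires matching resource and executor definitions, combining two groups that both use the defaults looks like this (a sketch with hypothetical assets):

.. code-block:: python

    from dagster import AssetGroup, asset

    @asset
    def orders():
        return [1, 2]

    @asset
    def customers():
        return ["a", "b"]

    combined = AssetGroup([orders]) + AssetGroup([customers])
    names = {key.path[-1] for assets_def in combined.assets for key in assets_def.asset_keys}
    assert names == {"orders", "customers"}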
\n\n\ndef _find_assets_in_module(\n module: ModuleType,\n) -> Generator[Union[AssetsDefinition, SourceAsset], None, None]:\n """\n Finds assets in the given module and adds them to the given sets of assets and source assets.\n """\n for attr in dir(module):\n value = getattr(module, attr)\n if isinstance(value, (AssetsDefinition, SourceAsset)):\n yield value\n elif isinstance(value, list) and all(\n isinstance(el, (AssetsDefinition, SourceAsset)) for el in value\n ):\n yield from value\n\n\ndef _find_modules_in_package(package_module: ModuleType) -> Iterable[ModuleType]:\n yield package_module\n package_path = package_module.__file__\n if package_path:\n for _, modname, is_pkg in pkgutil.walk_packages([os.path.dirname(package_path)]):\n submodule = import_module(f"{package_module.__name__}.{modname}")\n if is_pkg:\n yield from _find_modules_in_package(submodule)\n else:\n yield submodule\n else:\n raise ValueError(\n f"Tried to find modules in package {package_module}, but its __file__ is None"\n )\n\n\ndef _validate_resource_reqs_for_asset_group(\n asset_list: Sequence[AssetsDefinition],\n source_assets: Sequence[SourceAsset],\n resource_defs: Mapping[str, ResourceDefinition],\n):\n present_resource_keys = set(resource_defs.keys())\n for asset_def in asset_list:\n provided_resource_keys = set(asset_def.resource_defs.keys())\n present_resource_keys = present_resource_keys.union(provided_resource_keys)\n\n required_resource_keys: Set[str] = set()\n for op_def in asset_def.node_def.iterate_solid_defs():\n required_resource_keys.update(set(op_def.required_resource_keys or {}))\n missing_resource_keys = list(set(required_resource_keys) - present_resource_keys)\n if missing_resource_keys:\n raise DagsterInvalidDefinitionError(\n f"AssetGroup is missing required resource keys for asset '{asset_def.node_def.name}'. "\n f"Missing resource keys: {missing_resource_keys}"\n )\n\n for output_name, asset_key in asset_def.asset_keys_by_output_name.items():\n output_def, _ = asset_def.node_def.resolve_output_to_origin(\n output_name, NodeHandle(name=asset_def.node_def.name, parent=None)\n )\n if output_def.io_manager_key and output_def.io_manager_key not in present_resource_keys:\n raise DagsterInvalidDefinitionError(\n f"Output '{output_def.name}' with AssetKey '{asset_key}' "\n f"requires io manager '{output_def.io_manager_key}' but was "\n f"not provided on asset group. Provided resources: {sorted(list(present_resource_keys))}"\n )\n\n for source_asset in source_assets:\n if source_asset.io_manager_key and source_asset.io_manager_key not in present_resource_keys:\n raise DagsterInvalidDefinitionError(\n f"SourceAsset with key {source_asset.key} requires io manager "\n f"with key '{source_asset.io_manager_key}', which was not "\n f"provided on AssetGroup. Provided keys: {sorted(list(present_resource_keys))}"\n )\n\n for resource_key, resource_def in resource_defs.items():\n resource_keys = set(resource_def.required_resource_keys)\n missing_resource_keys = sorted(list(set(resource_keys) - present_resource_keys))\n if missing_resource_keys:\n raise DagsterInvalidDefinitionError(\n "AssetGroup is missing required resource keys for resource '"\n f"{resource_key}'. Missing resource keys: {missing_resource_keys}"\n )\n
", "current_page_name": "_modules/dagster/core/asset_defs/asset_group", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.asset_defs.asset_group"}, "asset_in": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.asset_defs.asset_in

\nfrom typing import Any, Mapping, NamedTuple, Optional, Sequence\n\nimport dagster._check as check\nfrom dagster.core.definitions.events import AssetKey, CoerceableToAssetKey\n\n\n
[docs]class AssetIn(\n NamedTuple(\n "_AssetIn",\n [\n ("asset_key", Optional[AssetKey]),\n ("metadata", Optional[Mapping[str, Any]]),\n ("namespace", Optional[Sequence[str]]),\n ],\n )\n):\n def __new__(\n cls,\n asset_key: Optional[CoerceableToAssetKey] = None,\n metadata: Optional[Mapping[str, Any]] = None,\n namespace: Optional[Sequence[str]] = None,\n ):\n check.invariant(\n not (asset_key and namespace),\n ("Asset key and namespace cannot both be set on AssetIn"),\n )\n\n # if user inputs a single string, coerce to list\n namespace = [namespace] if isinstance(namespace, str) else namespace\n\n return super(AssetIn, cls).__new__(\n cls,\n asset_key=AssetKey.from_coerceable(asset_key) if asset_key is not None else None,\n metadata=check.opt_inst_param(metadata, "metadata", Mapping),\n namespace=check.opt_list_param(namespace, "namespace", str),\n )
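A brief hedged sketch of ``AssetIn`` used with the ``@asset`` decorator defined later in this package (asset and namespace names are illustrative):

.. code-block:: python

    from dagster import AssetIn, asset

    # The upstream asset lives under a different namespace than the argument name alone would imply.
    @asset(ins={"raw": AssetIn(namespace=["warehouse", "staging"])})
    def cleaned(raw):
        return [row for row in raw if row]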
\n
", "current_page_name": "_modules/dagster/core/asset_defs/asset_in", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.asset_defs.asset_in"}, "assets_job": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.asset_defs.assets_job

\nimport itertools\nfrom typing import AbstractSet, Any, Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast\n\nimport dagster._check as check\nfrom dagster.config import Shape\nfrom dagster.core.definitions.asset_layer import AssetLayer\nfrom dagster.core.definitions.config import ConfigMapping\nfrom dagster.core.definitions.dependency import (\n    DependencyDefinition,\n    IDependencyDefinition,\n    NodeHandle,\n    NodeInvocation,\n)\nfrom dagster.core.definitions.events import AssetKey\nfrom dagster.core.definitions.executor_definition import ExecutorDefinition\nfrom dagster.core.definitions.graph_definition import GraphDefinition\nfrom dagster.core.definitions.job_definition import JobDefinition\nfrom dagster.core.definitions.node_definition import NodeDefinition\nfrom dagster.core.definitions.output import OutputDefinition\nfrom dagster.core.definitions.partition import PartitionedConfig, PartitionsDefinition\nfrom dagster.core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster.core.definitions.resource_definition import ResourceDefinition\nfrom dagster.core.errors import DagsterInvalidDefinitionError\nfrom dagster.core.selector.subset_selector import AssetSelectionData\nfrom dagster.utils.backcompat import experimental\n\nfrom .asset_partitions import get_upstream_partitions_for_partition_range\nfrom .assets import AssetsDefinition\nfrom .source_asset import SourceAsset\n\n\n
[docs]@experimental\ndef build_assets_job(\n name: str,\n assets: Sequence[AssetsDefinition],\n source_assets: Optional[Sequence[Union[SourceAsset, AssetsDefinition]]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n description: Optional[str] = None,\n config: Optional[Union[ConfigMapping, Dict[str, Any], PartitionedConfig]] = None,\n tags: Optional[Dict[str, Any]] = None,\n executor_def: Optional[ExecutorDefinition] = None,\n _asset_selection_data: Optional[AssetSelectionData] = None,\n) -> JobDefinition:\n """Builds a job that materializes the given assets.\n\n The dependencies between the ops in the job are determined by the asset dependencies defined\n in the metadata on the provided asset nodes.\n\n Args:\n name (str): The name of the job.\n assets (List[AssetsDefinition]): A list of assets or\n multi-assets - usually constructed using the :py:func:`@asset` or :py:func:`@multi_asset`\n decorator.\n source_assets (Optional[Sequence[Union[SourceAsset, AssetsDefinition]]]): A list of\n assets that are not materialized by this job, but that assets in this job depend on.\n resource_defs (Optional[Dict[str, ResourceDefinition]]): Resource defs to be included in\n this job.\n description (Optional[str]): A description of the job.\n\n Examples:\n\n .. code-block:: python\n\n @asset\n def asset1():\n return 5\n\n @asset\n def asset2(asset1):\n return my_upstream_asset + 1\n\n my_assets_job = build_assets_job("my_assets_job", assets=[asset1, asset2])\n\n Returns:\n JobDefinition: A job that materializes the given assets.\n """\n\n check.str_param(name, "name")\n check.sequence_param(assets, "assets", of_type=AssetsDefinition)\n check.opt_sequence_param(\n source_assets, "source_assets", of_type=(SourceAsset, AssetsDefinition)\n )\n check.opt_str_param(description, "description")\n check.opt_inst_param(_asset_selection_data, "_asset_selection_data", AssetSelectionData)\n source_assets_by_key = build_source_assets_by_key(source_assets)\n\n deps, assets_defs_by_node_handle = build_deps(assets, source_assets_by_key.keys())\n partitioned_config = build_job_partitions_from_assets(assets, source_assets or [])\n resource_defs = check.opt_mapping_param(resource_defs, "resource_defs")\n\n graph = GraphDefinition(\n name=name,\n node_defs=[asset.node_def for asset in assets],\n dependencies=deps,\n description=description,\n input_mappings=None,\n output_mappings=None,\n config=None,\n )\n\n all_resource_defs = dict(resource_defs)\n for asset_def in assets:\n for resource_key, resource_def in asset_def.resource_defs.items():\n if (\n resource_key in all_resource_defs\n and all_resource_defs[resource_key] != resource_def\n ):\n raise DagsterInvalidDefinitionError(\n f"When attempting to build job, asset {asset_def.asset_key} had a conflicting version of the same resource key {resource_key}. 
Please resolve this conflict by giving different keys to each resource definition."\n )\n all_resource_defs[resource_key] = resource_def\n\n # turn any AssetsDefinitions into SourceAssets\n resolved_source_assets: List[SourceAsset] = []\n for asset in source_assets or []:\n if isinstance(asset, AssetsDefinition):\n resolved_source_assets += asset.to_source_assets()\n elif isinstance(asset, SourceAsset):\n resolved_source_assets.append(asset)\n\n return graph.to_job(\n resource_defs=all_resource_defs,\n config=config or partitioned_config,\n tags=tags,\n executor_def=executor_def,\n asset_layer=AssetLayer.from_graph_and_assets_node_mapping(\n graph, assets_defs_by_node_handle, resolved_source_assets\n ),\n _asset_selection_data=_asset_selection_data,\n )
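A short hedged sketch of ``build_assets_job`` with an upstream ``SourceAsset`` (names are illustrative); the dependency of ``event_counts`` on ``raw_events`` is inferred from the argument name:

.. code-block:: python

    from dagster import AssetKey, SourceAsset, asset, build_assets_job

    raw_events = SourceAsset(AssetKey("raw_events"))  # produced outside this job

    @asset
    def event_counts(raw_events):
        return len(raw_events)

    events_job = build_assets_job(
        "events_job",
        assets=[event_counts],
        source_assets=[raw_events],
    )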
\n\n\ndef build_job_partitions_from_assets(\n assets: Sequence[AssetsDefinition],\n source_assets: Sequence[Union[SourceAsset, AssetsDefinition]],\n) -> Optional[PartitionedConfig]:\n assets_with_partitions_defs = [assets_def for assets_def in assets if assets_def.partitions_def]\n\n if len(assets_with_partitions_defs) == 0:\n return None\n\n first_assets_with_partitions_def: AssetsDefinition = assets_with_partitions_defs[0]\n for assets_def in assets_with_partitions_defs:\n if assets_def.partitions_def != first_assets_with_partitions_def.partitions_def:\n first_asset_key = next(iter(assets_def.asset_keys)).to_string()\n second_asset_key = next(iter(first_assets_with_partitions_def.asset_keys)).to_string()\n raise DagsterInvalidDefinitionError(\n "When an assets job contains multiple partitions assets, they must have the "\n f"same partitions definitions, but asset '{first_asset_key}' and asset "\n f"'{second_asset_key}' have different partitions definitions. "\n )\n\n partitions_defs_by_asset_key: Dict[AssetKey, PartitionsDefinition] = {}\n asset: Union[AssetsDefinition, SourceAsset]\n for asset in itertools.chain.from_iterable([assets, source_assets]):\n if isinstance(asset, AssetsDefinition) and asset.partitions_def is not None:\n for asset_key in asset.asset_keys:\n partitions_defs_by_asset_key[asset_key] = asset.partitions_def\n elif isinstance(asset, SourceAsset) and asset.partitions_def is not None:\n partitions_defs_by_asset_key[asset.key] = asset.partitions_def\n\n def asset_partitions_for_job_partition(\n job_partition_key: str,\n ) -> Mapping[AssetKey, PartitionKeyRange]:\n return {\n asset_key: PartitionKeyRange(job_partition_key, job_partition_key)\n for assets_def in assets\n for asset_key in assets_def.asset_keys\n if assets_def.partitions_def\n }\n\n def run_config_for_partition_fn(partition_key: str) -> Dict[str, Any]:\n ops_config: Dict[str, Any] = {}\n asset_partitions_by_asset_key = asset_partitions_for_job_partition(partition_key)\n\n for assets_def in assets:\n outputs_dict: Dict[str, Dict[str, Any]] = {}\n if assets_def.partitions_def is not None:\n for output_name, asset_key in assets_def.asset_keys_by_output_name.items():\n asset_partition_key_range = asset_partitions_by_asset_key[asset_key]\n outputs_dict[output_name] = {\n "start": asset_partition_key_range.start,\n "end": asset_partition_key_range.end,\n }\n\n inputs_dict: Dict[str, Dict[str, Any]] = {}\n for input_name, in_asset_key in assets_def.asset_keys_by_input_name.items():\n upstream_partitions_def = partitions_defs_by_asset_key.get(in_asset_key)\n if assets_def.partitions_def is not None and upstream_partitions_def is not None:\n upstream_partition_key_range = get_upstream_partitions_for_partition_range(\n assets_def, upstream_partitions_def, in_asset_key, asset_partition_key_range\n )\n inputs_dict[input_name] = {\n "start": upstream_partition_key_range.start,\n "end": upstream_partition_key_range.end,\n }\n\n config_schema = assets_def.node_def.config_schema\n if (\n config_schema\n and isinstance(config_schema.config_type, Shape)\n and "assets" in config_schema.config_type.fields\n ):\n ops_config[assets_def.node_def.name] = {\n "config": {\n "assets": {\n "input_partitions": inputs_dict,\n "output_partitions": outputs_dict,\n }\n }\n }\n\n return {"ops": ops_config}\n\n return PartitionedConfig(\n partitions_def=cast(PartitionsDefinition, first_assets_with_partitions_def.partitions_def),\n run_config_for_partition_fn=lambda p: run_config_for_partition_fn(p.name),\n )\n\n\ndef 
build_source_assets_by_key(\n source_assets: Optional[Sequence[Union[SourceAsset, AssetsDefinition]]]\n) -> Mapping[AssetKey, Union[SourceAsset, OutputDefinition]]:\n source_assets_by_key: Dict[AssetKey, Union[SourceAsset, OutputDefinition]] = {}\n for asset_source in source_assets or []:\n if isinstance(asset_source, SourceAsset):\n source_assets_by_key[asset_source.key] = asset_source\n elif isinstance(asset_source, AssetsDefinition):\n for output_name, asset_key in asset_source.asset_keys_by_output_name.items():\n if asset_key:\n source_assets_by_key[asset_key] = asset_source.node_def.output_def_named(\n output_name\n )\n\n return source_assets_by_key\n\n\ndef build_deps(\n assets_defs: Sequence[AssetsDefinition], source_paths: AbstractSet[AssetKey]\n) -> Tuple[\n Dict[Union[str, NodeInvocation], Dict[str, IDependencyDefinition]],\n Mapping[NodeHandle, AssetsDefinition],\n]:\n node_outputs_by_asset: Dict[AssetKey, Tuple[NodeDefinition, str]] = {}\n assets_defs_by_node_handle: Dict[NodeHandle, AssetsDefinition] = {}\n\n for assets_def in assets_defs:\n for output_name, asset_key in assets_def.asset_keys_by_output_name.items():\n if asset_key in node_outputs_by_asset:\n raise DagsterInvalidDefinitionError(\n f"The same asset key was included for two definitions: '{asset_key.to_string()}'"\n )\n\n node_outputs_by_asset[asset_key] = (assets_def.node_def, output_name)\n\n deps: Dict[Union[str, NodeInvocation], Dict[str, IDependencyDefinition]] = {}\n # if the same graph/op is used in multiple assets_definitions, their invocations much have\n # different names. we keep track of definitions that share a name and add a suffix to their\n # invocations to solve this issue\n collisions: Dict[str, int] = {}\n for assets_def in assets_defs:\n node_name = assets_def.node_def.name\n if collisions.get(node_name):\n collisions[node_name] += 1\n alias = f"{node_name}_{collisions[node_name]}"\n node_key = NodeInvocation(node_name, alias)\n else:\n collisions[node_name] = 1\n alias = node_name\n node_key = node_name\n deps[node_key] = {}\n assets_defs_by_node_handle[NodeHandle(alias, parent=None)] = assets_def\n for input_name, asset_key in sorted(\n assets_def.asset_keys_by_input_name.items(), key=lambda input: input[0]\n ): # sort so that input definition order is deterministic\n if asset_key in node_outputs_by_asset:\n node_def, output_name = node_outputs_by_asset[asset_key]\n deps[node_key][input_name] = DependencyDefinition(node_def.name, output_name)\n elif asset_key not in source_paths:\n input_def = assets_def.node_def.input_def_named(input_name)\n if not input_def.dagster_type.is_nothing:\n raise DagsterInvalidDefinitionError(\n f"Input asset '{asset_key.to_string()}' for asset "\n f"'{next(iter(assets_def.asset_keys)).to_string()}' is not "\n "produced by any of the provided asset ops and is not one of the provided "\n "sources"\n )\n\n return deps, assets_defs_by_node_handle\n
", "current_page_name": "_modules/dagster/core/asset_defs/assets_job", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.asset_defs.assets_job"}, "decorators": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.asset_defs.decorators

\nimport warnings\nfrom typing import (\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n    overload,\n)\n\nimport dagster._check as check\nfrom dagster.builtins import Nothing\nfrom dagster.config import Field\nfrom dagster.core.decorator_utils import get_function_params, get_valid_name_permutations\nfrom dagster.core.definitions.decorators.op_decorator import _Op\nfrom dagster.core.definitions.events import AssetKey\nfrom dagster.core.definitions.input import In\nfrom dagster.core.definitions.output import Out\nfrom dagster.core.definitions.partition import PartitionsDefinition\nfrom dagster.core.definitions.resource_definition import ResourceDefinition\nfrom dagster.core.definitions.utils import NoValueSentinel\nfrom dagster.core.errors import DagsterInvalidDefinitionError\nfrom dagster.core.storage.io_manager import IOManagerDefinition\nfrom dagster.core.types.dagster_type import DagsterType\nfrom dagster.seven import funcsigs\nfrom dagster.utils.backcompat import ExperimentalWarning, experimental_decorator\n\nfrom .asset_in import AssetIn\nfrom .assets import AssetsDefinition\nfrom .partition_mapping import PartitionMapping\n\n\n@overload\ndef asset(\n    name: Callable[..., Any],\n) -> AssetsDefinition:\n    ...\n\n\n@overload\ndef asset(\n    name: Optional[str] = ...,\n    namespace: Optional[Sequence[str]] = ...,\n    ins: Optional[Mapping[str, AssetIn]] = ...,\n    non_argument_deps: Optional[Union[Set[AssetKey], Set[str]]] = ...,\n    metadata: Optional[Mapping[str, Any]] = ...,\n    description: Optional[str] = ...,\n    required_resource_keys: Optional[Set[str]] = ...,\n    resource_defs: Optional[Mapping[str, ResourceDefinition]] = ...,\n    io_manager_def: Optional[IOManagerDefinition] = ...,\n    io_manager_key: Optional[str] = ...,\n    compute_kind: Optional[str] = ...,\n    dagster_type: Optional[DagsterType] = ...,\n    partitions_def: Optional[PartitionsDefinition] = ...,\n    partition_mappings: Optional[Mapping[str, PartitionMapping]] = ...,\n    op_tags: Optional[Dict[str, Any]] = ...,\n) -> Callable[[Callable[..., Any]], AssetsDefinition]:\n    ...\n\n\n
[docs]@experimental_decorator\ndef asset(\n name: Optional[Union[Callable[..., Any], Optional[str]]] = None,\n namespace: Optional[Sequence[str]] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n non_argument_deps: Optional[Union[Set[AssetKey], Set[str]]] = None,\n metadata: Optional[Mapping[str, Any]] = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[Set[str]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n io_manager_def: Optional[IOManagerDefinition] = None,\n io_manager_key: Optional[str] = None,\n compute_kind: Optional[str] = None,\n dagster_type: Optional[DagsterType] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_mappings: Optional[Mapping[str, PartitionMapping]] = None,\n op_tags: Optional[Dict[str, Any]] = None,\n) -> Union[AssetsDefinition, Callable[[Callable[..., Any]], AssetsDefinition]]:\n """Create a definition for how to compute an asset.\n\n A software-defined asset is the combination of:\n 1. An asset key, e.g. the name of a table.\n 2. A function, which can be run to compute the contents of the asset.\n 3. A set of upstream assets that are provided as inputs to the function when computing the asset.\n\n Unlike an op, whose dependencies are determined by the graph it lives inside, an asset knows\n about the upstream assets it depends on. The upstream assets are inferred from the arguments\n to the decorated function. The name of the argument designates the name of the upstream asset.\n\n Args:\n name (Optional[str]): The name of the asset. If not provided, defaults to the name of the\n decorated function.\n namespace (Optional[Sequence[str]]): The namespace that the asset resides in. The namespace + the\n name forms the asset key.\n ins (Optional[Mapping[str, AssetIn]]): A dictionary that maps input names to their metadata\n and namespaces.\n non_argument_deps (Optional[Union[Set[AssetKey], Set[str]]]): Set of asset keys that are\n upstream dependencies, but do not pass an input to the asset.\n metadata (Optional[Dict[str, Any]]): A dict of metadata entries for the asset.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by the op.\n io_manager_key (Optional[str]): The resource key of the IOManager used\n for storing the output of the op as an asset, and for loading it in downstream ops (default: "io_manager"). Only one of io_manager_key and io_manager_def can be provided.\n io_manager_def (Optional[IOManagerDefinition]): The definition of the IOManager used for\n storing the output of the op as an asset, and for loading it in\n downstream ops. Only one of io_manager_def and io_manager_key can be provided.\n compute_kind (Optional[str]): A string to represent the kind of computation that produces\n the asset, e.g. "dbt" or "spark". It will be displayed in Dagit as a badge on the asset.\n dagster_type (Optional[DagsterType]): Allows specifying type validation functions that\n will be executed on the output of the decorated function after it runs.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the asset.\n partition_mappings (Optional[Mapping[str, PartitionMapping]]): Defines how to map partition\n keys for this asset to partition keys of upstream assets. 
Each key in the dictionary\n corresponds to one of the input assets, and each value is a PartitionMapping.\n If no entry is provided for a particular asset dependency, the partition mapping defaults\n to the default partition mapping for the partitions definition, which typically maps\n partition keys to the same partition keys in upstream assets.\n op_tags (Optional[Dict[str, Any]]): A dictionary of tags for the op that computes the asset.\n Frameworks may expect and require certain metadata to be attached to an op. Values that\n are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`.\n\n Examples:\n\n .. code-block:: python\n\n @asset\n def my_asset(my_upstream_asset: int) -> int:\n return my_upstream_asset + 1\n """\n if callable(name):\n return _Asset()(name)\n\n def inner(fn: Callable[..., Any]) -> AssetsDefinition:\n check.invariant(\n not (io_manager_key and io_manager_def),\n "Both io_manager_key and io_manager_def were provided to `@asset` decorator. Please provide one or the other. ",\n )\n return _Asset(\n name=cast(Optional[str], name), # (mypy bug that it can't infer name is Optional[str])\n namespace=namespace,\n ins=ins,\n non_argument_deps=_make_asset_keys(non_argument_deps),\n metadata=metadata,\n description=description,\n required_resource_keys=required_resource_keys,\n resource_defs=resource_defs,\n io_manager=io_manager_def or io_manager_key,\n compute_kind=check.opt_str_param(compute_kind, "compute_kind"),\n dagster_type=dagster_type,\n partitions_def=partitions_def,\n partition_mappings=partition_mappings,\n op_tags=op_tags,\n )(fn)\n\n return inner
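A hedged, illustrative sketch of the parameters documented above. The asset names, namespaces, and the ``warehouse_io_manager`` resource key are hypothetical, and ``AssetIn`` is assumed to accept the ``namespace`` keyword whose attribute is read by ``build_asset_ins`` further below.

.. code-block:: python

    from dagster import AssetIn, AssetKey, asset

    @asset(
        namespace=["analytics"],                          # full asset key: ["analytics", "cleaned_events"]
        ins={"raw_events": AssetIn(namespace=["raw"])},   # upstream key resolves to ["raw", "raw_events"]
        non_argument_deps={AssetKey("ingest_complete")},  # upstream dependency that passes no data
        io_manager_key="warehouse_io_manager",            # hypothetical storage resource key
        compute_kind="pandas",
    )
    def cleaned_events(raw_events):
        # drop empty records from the hypothetical upstream asset
        return [row for row in raw_events if row]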
\n\n\nclass _Asset:\n def __init__(\n self,\n name: Optional[str] = None,\n namespace: Optional[Sequence[str]] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n non_argument_deps: Optional[Set[AssetKey]] = None,\n metadata: Optional[Mapping[str, Any]] = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[Set[str]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n io_manager: Optional[Union[str, IOManagerDefinition]] = None,\n compute_kind: Optional[str] = None,\n dagster_type: Optional[DagsterType] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_mappings: Optional[Mapping[str, PartitionMapping]] = None,\n op_tags: Optional[Dict[str, Any]] = None,\n ):\n self.name = name\n # if user inputs a single string, coerce to list\n self.namespace = [namespace] if isinstance(namespace, str) else namespace\n self.ins = ins or {}\n self.non_argument_deps = non_argument_deps\n self.metadata = metadata\n self.description = description\n self.required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys"\n )\n self.io_manager = io_manager\n self.compute_kind = compute_kind\n self.dagster_type = dagster_type\n self.partitions_def = partitions_def\n self.partition_mappings = partition_mappings\n self.op_tags = op_tags\n self.resource_defs = dict(check.opt_mapping_param(resource_defs, "resource_defs"))\n\n def __call__(self, fn: Callable) -> AssetsDefinition:\n asset_name = self.name or fn.__name__\n\n asset_ins = build_asset_ins(fn, self.namespace, self.ins or {}, self.non_argument_deps)\n\n out_asset_key = AssetKey(list(filter(None, [*(self.namespace or []), asset_name])))\n with warnings.catch_warnings():\n warnings.simplefilter("ignore", category=ExperimentalWarning)\n\n if isinstance(self.io_manager, str):\n io_manager_key = cast(str, self.io_manager)\n elif self.io_manager is not None:\n io_manager_def = check.inst_param(\n self.io_manager, "io_manager", IOManagerDefinition\n )\n out_asset_resource_key = "__".join(out_asset_key.path)\n io_manager_key = f"{out_asset_resource_key}__io_manager"\n self.resource_defs[io_manager_key] = cast(ResourceDefinition, io_manager_def)\n else:\n io_manager_key = "io_manager"\n\n out = Out(\n metadata=self.metadata or {},\n io_manager_key=io_manager_key,\n dagster_type=self.dagster_type if self.dagster_type else NoValueSentinel,\n description=self.description,\n )\n\n required_resource_keys = set()\n for key in self.required_resource_keys:\n required_resource_keys.add(key)\n for key in self.resource_defs.keys():\n required_resource_keys.add(key)\n\n op = _Op(\n name="__".join(out_asset_key.path).replace("-", "_"),\n description=self.description,\n ins=dict(asset_ins.values()),\n out=out,\n required_resource_keys=required_resource_keys,\n tags={\n **({"kind": self.compute_kind} if self.compute_kind else {}),\n **(self.op_tags or {}),\n },\n config_schema={\n "assets": {\n "input_partitions": Field(dict, is_required=False),\n "output_partitions": Field(dict, is_required=False),\n }\n },\n )(fn)\n\n asset_keys_by_input_name = {\n input_name: asset_key for asset_key, (input_name, _) in asset_ins.items()\n }\n return AssetsDefinition(\n asset_keys_by_input_name=asset_keys_by_input_name,\n asset_keys_by_output_name={"result": out_asset_key},\n node_def=op,\n partitions_def=self.partitions_def,\n partition_mappings={\n asset_keys_by_input_name[input_name]: partition_mapping\n for input_name, partition_mapping in self.partition_mappings.items()\n }\n if 
self.partition_mappings\n else None,\n resource_defs=self.resource_defs,\n )\n\n\n
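One detail of the ``__call__`` logic above worth calling out: when an ``IOManagerDefinition`` is passed via ``io_manager_def``, it is registered under a resource key derived from the asset key rather than the default ``"io_manager"``. A minimal sketch, using a hypothetical stub io manager:

.. code-block:: python

    from dagster import IOManager, asset, io_manager

    @io_manager
    def stub_io_manager(_init_context):
        class _Stub(IOManager):
            def handle_output(self, context, obj):
                pass  # would persist `obj` somewhere

            def load_input(self, context):
                return None  # would load the stored value

        return _Stub()

    @asset(namespace=["analytics"], io_manager_def=stub_io_manager)
    def events():
        return [1, 2, 3]

    # Per the logic above, the io manager is registered under the generated resource key
    # "__".join(asset_key.path) + "__io_manager", i.e. "analytics__events__io_manager".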
[docs]@experimental_decorator\ndef multi_asset(\n outs: Dict[str, Out],\n name: Optional[str] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n non_argument_deps: Optional[Union[Set[AssetKey], Set[str]]] = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[Set[str]] = None,\n compute_kind: Optional[str] = None,\n internal_asset_deps: Optional[Mapping[str, Set[AssetKey]]] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_mappings: Optional[Mapping[str, PartitionMapping]] = None,\n op_tags: Optional[Dict[str, Any]] = None,\n can_subset: bool = False,\n) -> Callable[[Callable[..., Any]], AssetsDefinition]:\n """Create a combined definition of multiple assets that are computed using the same op and same\n upstream assets.\n\n Each argument to the decorated function references an upstream asset that this asset depends on.\n The name of the argument designates the name of the upstream asset.\n\n Args:\n name (Optional[str]): The name of the op.\n outs (Optional[Dict[str, Out]]): The Outs representing the produced assets.\n ins (Optional[Mapping[str, AssetIn]]): A dictionary that maps input names to their metadata\n and namespaces.\n non_argument_deps (Optional[Union[Set[AssetKey], Set[str]]]): Set of asset keys that are upstream dependencies,\n but do not pass an input to the multi_asset.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by the op.\n io_manager_key (Optional[str]): The resource key of the IOManager used for storing the\n output of the op as an asset, and for loading it in downstream ops\n (default: "io_manager").\n compute_kind (Optional[str]): A string to represent the kind of computation that produces\n the asset, e.g. "dbt" or "spark". It will be displayed in Dagit as a badge on the asset.\n internal_asset_deps (Optional[Mapping[str, Set[AssetKey]]]): By default, it is assumed\n that all assets produced by a multi_asset depend on all assets that are consumed by that\n multi asset. If this default is not correct, you can pass in a map of output names to a\n corrected set of AssetKeys that they depend on. Any AssetKeys in this list must be either\n used as input to the asset or produced within the op.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the assets.\n partition_mappings (Optional[Mapping[str, PartitionMapping]]): Defines how to map partition\n keys for this asset to partition keys of upstream assets. Each key in the dictionary\n corresponds to one of the input assets, and each value is a PartitionMapping.\n If no entry is provided for a particular asset dependency, the partition mapping defaults\n to the default partition mapping for the partitions definition, which typically maps\n partition keys to the same partition keys in upstream assets.\n op_tags (Optional[Dict[str, Any]]): A dictionary of tags for the op that computes the asset.\n Frameworks may expect and require certain metadata to be attached to an op. Values that\n are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`.\n can_subset (bool): If this asset's computation can emit a subset of the asset\n keys based on the context.selected_assets argument. Defaults to False.\n """\n\n check.invariant(\n all(out.asset_key is None or isinstance(out.asset_key, AssetKey) for out in outs.values()),\n "The asset_key argument for Outs supplied to a multi_asset must be a constant or None, not a function. 
",\n )\n asset_deps = check.opt_dict_param(\n internal_asset_deps, "internal_asset_deps", key_type=str, value_type=set\n )\n\n def inner(fn: Callable[..., Any]) -> AssetsDefinition:\n\n op_name = name or fn.__name__\n asset_ins = build_asset_ins(\n fn, None, ins or {}, non_argument_deps=_make_asset_keys(non_argument_deps)\n )\n\n # validate that the asset_deps make sense\n valid_asset_deps = set(asset_ins.keys())\n valid_asset_deps.update(\n cast(AssetKey, out.asset_key or AssetKey([name])) for name, out in outs.items()\n )\n for out_name, asset_keys in asset_deps.items():\n check.invariant(\n out_name in outs,\n f"Invalid out key '{out_name}' supplied to `internal_asset_deps` argument for multi-asset "\n f"{op_name}. Must be one of the outs for this multi-asset {list(outs.keys())}.",\n )\n invalid_asset_deps = asset_keys.difference(valid_asset_deps)\n check.invariant(\n not invalid_asset_deps,\n f"Invalid asset dependencies: {invalid_asset_deps} specified in `internal_asset_deps` "\n f"argument for multi-asset '{op_name}' on key '{out_name}'. Each specified asset key "\n "must be associated with an input to the asset or produced by this asset. Valid "\n f"keys: {valid_asset_deps}",\n )\n with warnings.catch_warnings():\n warnings.simplefilter("ignore", category=ExperimentalWarning)\n op = _Op(\n name=op_name,\n description=description,\n ins=dict(asset_ins.values()),\n out=outs,\n required_resource_keys=required_resource_keys,\n tags={\n **({"kind": compute_kind} if compute_kind else {}),\n **(op_tags or {}),\n },\n config_schema={\n "assets": {\n "input_partitions": Field(dict, is_required=False),\n "output_partitions": Field(dict, is_required=False),\n }\n },\n )(fn)\n\n asset_keys_by_input_name = {\n input_name: asset_key for asset_key, (input_name, _) in asset_ins.items()\n }\n asset_keys_by_output_name = {\n name: cast(AssetKey, out.asset_key or AssetKey([name])) for name, out in outs.items()\n }\n return AssetsDefinition(\n asset_keys_by_input_name=asset_keys_by_input_name,\n asset_keys_by_output_name=asset_keys_by_output_name,\n node_def=op,\n asset_deps={asset_keys_by_output_name[name]: asset_deps[name] for name in asset_deps},\n partitions_def=partitions_def,\n partition_mappings={\n asset_keys_by_input_name[input_name]: partition_mapping\n for input_name, partition_mapping in partition_mappings.items()\n }\n if partition_mappings\n else None,\n can_subset=can_subset,\n )\n\n return inner
\n\n\ndef build_asset_ins(\n fn: Callable,\n asset_namespace: Optional[Sequence[str]],\n asset_ins: Mapping[str, AssetIn],\n non_argument_deps: Optional[AbstractSet[AssetKey]],\n) -> Mapping[AssetKey, Tuple[str, In]]:\n """\n Creates a mapping from AssetKey to (name of input, In object)\n """\n\n non_argument_deps = check.opt_set_param(non_argument_deps, "non_argument_deps", AssetKey)\n\n params = get_function_params(fn)\n is_context_provided = len(params) > 0 and params[0].name in get_valid_name_permutations(\n "context"\n )\n input_params = params[1:] if is_context_provided else params\n non_var_input_param_names = [\n param.name\n for param in input_params\n if param.kind == funcsigs.Parameter.POSITIONAL_OR_KEYWORD\n ]\n has_kwargs = any(param.kind == funcsigs.Parameter.VAR_KEYWORD for param in input_params)\n\n all_input_names = set(non_var_input_param_names) | asset_ins.keys()\n\n if not has_kwargs:\n for in_key in asset_ins.keys():\n if in_key not in non_var_input_param_names:\n raise DagsterInvalidDefinitionError(\n f"Key '{in_key}' in provided ins dict does not correspond to any of the names "\n "of the arguments to the decorated function"\n )\n\n ins_by_asset_key: Dict[AssetKey, Tuple[str, In]] = {}\n for input_name in all_input_names:\n asset_key = None\n\n if input_name in asset_ins:\n asset_key = asset_ins[input_name].asset_key\n metadata = asset_ins[input_name].metadata or {}\n namespace = asset_ins[input_name].namespace\n else:\n metadata = {}\n namespace = None\n\n asset_key = asset_key or AssetKey(\n list(filter(None, [*(namespace or asset_namespace or []), input_name]))\n )\n\n ins_by_asset_key[asset_key] = (input_name.replace("-", "_"), In(metadata=metadata))\n\n for asset_key in non_argument_deps:\n stringified_asset_key = "_".join(asset_key.path).replace("-", "_")\n # mypy doesn't realize that Nothing is a valid type here\n ins_by_asset_key[asset_key] = (stringified_asset_key, In(cast(type, Nothing)))\n\n return ins_by_asset_key\n\n\ndef _make_asset_keys(deps: Optional[Union[Set[AssetKey], Set[str]]]) -> Optional[Set[AssetKey]]:\n """Convert all str items to AssetKey in the set."""\n if deps is None:\n return deps\n\n deps_asset_keys = {\n AssetKey.from_user_string(dep) if isinstance(dep, str) else dep for dep in deps\n }\n return deps_asset_keys\n
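For reference, a small sketch of the coercion ``_make_asset_keys`` performs, assuming (as elsewhere in Dagster) that ``AssetKey.from_user_string`` splits a string on ``/`` into a key path; the keys shown are hypothetical.

.. code-block:: python

    from dagster.core.definitions.events import AssetKey

    deps = _make_asset_keys({"raw/orders", AssetKey(["raw", "users"])})
    # -> {AssetKey(["raw", "orders"]), AssetKey(["raw", "users"])}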
", "current_page_name": "_modules/dagster/core/asset_defs/decorators", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.asset_defs.decorators"}, "source_asset": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.asset_defs.source_asset

\nfrom typing import NamedTuple, Optional, Sequence, Union\n\nimport dagster._check as check\nfrom dagster.core.definitions.events import AssetKey, CoerceableToAssetKey\nfrom dagster.core.definitions.metadata import (\n    MetadataEntry,\n    MetadataMapping,\n    MetadataUserInput,\n    PartitionMetadataEntry,\n    normalize_metadata,\n)\nfrom dagster.core.definitions.partition import PartitionsDefinition\n\n\n
[docs]class SourceAsset(\n NamedTuple(\n "_SourceAsset",\n [\n ("key", AssetKey),\n ("metadata_entries", Sequence[Union[MetadataEntry, PartitionMetadataEntry]]),\n ("io_manager_key", str),\n ("description", Optional[str]),\n ("partitions_def", Optional[PartitionsDefinition]),\n ],\n )\n):\n """A SourceAsset represents an asset that will be loaded by (but not updated by) Dagster.\n\n Attributes:\n key (Union[AssetKey, Sequence[str], str]): The key of the asset.\n metadata_entries (List[MetadataEntry]): Metadata associated with the asset.\n io_manager_key (str): The key for the IOManager that will be used to load the contents of\n the asset when it's used as an input to other assets inside a job.\n description (Optional[str]): The description of the asset.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the asset.\n """\n\n def __new__(\n cls,\n key: CoerceableToAssetKey,\n metadata: Optional[MetadataUserInput] = None,\n io_manager_key: str = "io_manager",\n description: Optional[str] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n ):\n\n metadata = check.opt_dict_param(metadata, "metadata", key_type=str)\n metadata_entries = normalize_metadata(metadata, [], allow_invalid=True)\n return super().__new__(\n cls,\n key=AssetKey.from_coerceable(key),\n metadata_entries=metadata_entries,\n io_manager_key=check.str_param(io_manager_key, "io_manager_key"),\n description=check.opt_str_param(description, "description"),\n partitions_def=check.opt_inst_param(\n partitions_def, "partitions_def", PartitionsDefinition\n ),\n )\n\n @property\n def metadata(self) -> MetadataMapping:\n # PartitionMetadataEntry (unstable API) case is unhandled\n return {entry.label: entry.entry_data for entry in self.metadata_entries} # type: ignore
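A short usage sketch; the key, resource key, and metadata values are hypothetical.

.. code-block:: python

    from dagster import AssetKey, SourceAsset

    # A table produced outside Dagster but consumed by software-defined assets inside it.
    raw_logs = SourceAsset(
        key=AssetKey(["raw", "logs"]),
        description="Log events landed by an external ingestion service.",
        io_manager_key="warehouse_io_manager",
        metadata={"owner": "data-platform"},
    )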
\n
", "current_page_name": "_modules/dagster/core/asset_defs/source_asset", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.asset_defs.source_asset"}}, "definitions": {"config": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.config

\nfrom typing import TYPE_CHECKING, Any, Callable, Dict, NamedTuple, Optional, Union, cast\n\nimport dagster._check as check\nfrom dagster.builtins import BuiltinEnum\nfrom dagster.config import ConfigType\nfrom dagster.config.post_process import resolve_defaults\nfrom dagster.config.validate import process_config, validate_config\nfrom dagster.core.definitions.definition_config_schema import IDefinitionConfigSchema\nfrom dagster.core.errors import DagsterInvalidConfigError\nfrom dagster.primitive_mapping import is_supported_config_python_builtin\n\nfrom .definition_config_schema import convert_user_facing_definition_config_schema\n\nif TYPE_CHECKING:\n    from .pipeline_definition import PipelineDefinition\n\n\ndef is_callable_valid_config_arg(config: Union[Callable[..., Any], Dict[str, Any]]) -> bool:\n    return BuiltinEnum.contains(config) or is_supported_config_python_builtin(config)\n\n\n
[docs]class ConfigMapping(\n NamedTuple(\n "_ConfigMapping",\n [\n ("config_fn", Callable[[Any], Any]),\n ("config_schema", IDefinitionConfigSchema),\n ("receive_processed_config_values", Optional[bool]),\n ],\n )\n):\n """Defines a config mapping for a graph (or job).\n\n By specifying a config mapping function, you can override the configuration for the child\n ops and graphs contained within a graph.\n\n Config mappings require the configuration schema to be specified as ``config_schema``, which will\n be exposed as the configuration schema for the graph, as well as a configuration mapping\n function, ``config_fn``, which maps the config provided to the composite solid to the config\n that will be provided to the child nodes.\n\n Args:\n config_fn (Callable[[dict], dict]): The function that will be called\n to map the graph config to a config appropriate for the child nodes.\n config_schema (ConfigSchema): The schema of the graph config.\n receive_processed_config_values (Optional[bool]): If true, config values provided to the config_fn\n will be converted to their dagster types before being passed in. For example, if this\n value is true, enum config passed to config_fn will be actual enums, while if false,\n then enum config passed to config_fn will be strings.\n """\n\n def __new__(\n cls,\n config_fn: Callable[[Any], Any],\n config_schema: Optional[Any] = None,\n receive_processed_config_values: Optional[bool] = None,\n ):\n return super(ConfigMapping, cls).__new__(\n cls,\n config_fn=check.callable_param(config_fn, "config_fn"),\n config_schema=convert_user_facing_definition_config_schema(config_schema),\n receive_processed_config_values=check.opt_bool_param(\n receive_processed_config_values, "receive_processed_config_values"\n ),\n )\n\n def resolve_from_unvalidated_config(self, config: Any) -> Any:\n """Validates config against outer config schema, and calls mapping against validated config."""\n\n receive_processed_config_values = check.opt_bool_param(\n self.receive_processed_config_values, "receive_processed_config_values", default=True\n )\n if receive_processed_config_values:\n outer_evr = process_config(\n self.config_schema.config_type,\n config,\n )\n else:\n outer_evr = validate_config(\n self.config_schema.config_type,\n config,\n )\n if not outer_evr.success:\n raise DagsterInvalidConfigError(\n "Error in config mapping ",\n outer_evr.errors,\n config,\n )\n\n outer_config = outer_evr.value\n if not receive_processed_config_values:\n outer_config = resolve_defaults(\n cast(ConfigType, self.config_schema.config_type),\n outer_config,\n ).value\n\n return self.config_fn(outer_config)\n\n def resolve_from_validated_config(self, config: Any) -> Any:\n if self.receive_processed_config_values is not None:\n check.failed(\n "`receive_processed_config_values` parameter has been set, but only applies to "\n "unvalidated config."\n )\n\n return self.config_fn(config)
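A hedged sketch of a config mapping attached to a graph. The op, graph, and config field names are hypothetical, and the mapped config is assumed to be keyed by child node name as described in the docstring above.

.. code-block:: python

    from dagster import ConfigMapping, graph, op

    @op(config_schema={"table": str})
    def load_table(context):
        context.log.info(f"loading {context.op_config['table']}")

    def simplified_config(outer):
        # Map the graph-level config onto the child op's config.
        return {"load_table": {"config": {"table": f"{outer['env']}_events"}}}

    @graph(config=ConfigMapping(config_fn=simplified_config, config_schema={"env": str}))
    def ingest():
        load_table()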
\n
", "current_page_name": "_modules/dagster/core/definitions/config", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.config"}, "configurable": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.configurable

\nfrom abc import ABC, abstractmethod\nfrom typing import Any, Callable, Dict, Optional, Union\n\nfrom dagster import Field\nfrom dagster import _check as check\nfrom dagster.config.evaluate_value_result import EvaluateValueResult\n\nfrom .definition_config_schema import (\n    ConfiguredDefinitionConfigSchema,\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\n\n\nclass ConfigurableDefinition(ABC):\n    @property\n    @abstractmethod\n    def config_schema(self) -> Optional[IDefinitionConfigSchema]:\n        raise NotImplementedError()\n\n    @property\n    def has_config_field(self) -> bool:\n        return self.config_schema is not None and bool(self.config_schema.as_field())\n\n    @property\n    def config_field(self) -> Optional[Field]:\n        return None if not self.config_schema else self.config_schema.as_field()\n\n    # getter for typed access\n    def get_config_field(self) -> Field:\n        field = self.config_field\n        if field is None:\n            check.failed("Must check has_config_Field before calling get_config_field")\n        return field\n\n    def apply_config_mapping(self, config: Any) -> EvaluateValueResult:\n        """\n        Applies user-provided config mapping functions to the given configuration and validates the\n        results against the respective config schema.\n\n        Expects incoming config to be validated and have fully-resolved values (StringSource values\n        resolved, Enum types hydrated, etc.) via process_config() during ResolvedRunConfig\n        construction and CompositeSolid config mapping.\n\n        Args:\n            config (Any): A validated and resolved configuration dictionary matching this object's\n            config_schema\n\n        Returns (EvaluateValueResult):\n            If successful, the value is a validated and resolved configuration dictionary for the\n            innermost wrapped object after applying the config mapping transformation function.\n        """\n        # If schema is on a mapped schema this is the innermost resource (base case),\n        # so we aren't responsible for validating against anything farther down.\n        # Returns an EVR for type consistency with config_mapping_fn.\n        return (\n            self.config_schema.resolve_config(config)\n            if isinstance(self.config_schema, ConfiguredDefinitionConfigSchema)\n            else EvaluateValueResult.for_value(config)\n        )\n\n\nclass AnonymousConfigurableDefinition(ConfigurableDefinition):\n    """An interface that makes the `configured` method not accept a name argument."""\n\n    def configured(\n        self,\n        config_or_config_fn: Any,\n        config_schema: Optional[Dict[str, Any]] = None,\n        description: Optional[str] = None,\n    ):\n        """\n        Wraps this object in an object of the same type that provides configuration to the inner\n        object.\n\n        Args:\n            config_or_config_fn (Union[Any, Callable[[Any], Any]]): Either (1) Run configuration\n                that fully satisfies this object's config schema or (2) A function that accepts run\n                configuration and returns run configuration that fully satisfies this object's\n                config schema.  In the latter case, config_schema must be specified.  
When\n                passing a function, it's easiest to use :py:func:`configured`.\n            config_schema (ConfigSchema): If config_or_config_fn is a function, the config schema\n                that its input must satisfy.\n            description (Optional[str]): Description of the new definition. If not specified,\n                inherits the description of the definition being configured.\n\n        Returns (ConfigurableDefinition): A configured version of this object.\n        """\n\n        new_config_schema = ConfiguredDefinitionConfigSchema(\n            self, convert_user_facing_definition_config_schema(config_schema), config_or_config_fn\n        )\n\n        return self.copy_for_configured(description, new_config_schema, config_or_config_fn)\n\n    @abstractmethod\n    def copy_for_configured(\n        self,\n        description: Optional[str],\n        config_schema: IDefinitionConfigSchema,\n        config_or_config_fn: Union[Any, Callable[[Any], Any]],\n    ):\n        raise NotImplementedError()\n\n\nclass NamedConfigurableDefinition(ConfigurableDefinition):\n    """An interface that makes the `configured` method require a positional `name` argument."""\n\n    def configured(\n        self,\n        config_or_config_fn: Any,\n        name: str,\n        config_schema: Optional[Dict[str, Any]] = None,\n        description: Optional[str] = None,\n    ):\n        """\n        Wraps this object in an object of the same type that provides configuration to the inner\n        object.\n\n        Args:\n            config_or_config_fn (Union[Any, Callable[[Any], Any]]): Either (1) Run configuration\n                that fully satisfies this object's config schema or (2) A function that accepts run\n                configuration and returns run configuration that fully satisfies this object's\n                config schema.  In the latter case, config_schema must be specified.  When\n                passing a function, it's easiest to use :py:func:`configured`.\n            name (str): Name of the new definition. This is a required argument, as this definition\n                type has a name uniqueness constraint.\n            config_schema (ConfigSchema): If config_or_config_fn is a function, the config schema\n                that its input must satisfy.\n            description (Optional[str]): Description of the new definition. 
If not specified,\n                inherits the description of the definition being configured.\n\n        Returns (ConfigurableDefinition): A configured version of this object.\n        """\n\n        name = check.str_param(name, "name")\n\n        new_config_schema = ConfiguredDefinitionConfigSchema(\n            self, convert_user_facing_definition_config_schema(config_schema), config_or_config_fn\n        )\n\n        return self.copy_for_configured(name, description, new_config_schema, config_or_config_fn)\n\n    @abstractmethod\n    def copy_for_configured(\n        self,\n        name: str,\n        description: Optional[str],\n        config_schema: IDefinitionConfigSchema,\n        config_or_config_fn: Union[Any, Callable[[Any], Any]],\n    ):\n        raise NotImplementedError()\n\n\ndef _check_configurable_param(configurable: ConfigurableDefinition) -> Any:\n    from dagster.core.definitions.composition import PendingNodeInvocation\n\n    check.param_invariant(\n        not isinstance(configurable, PendingNodeInvocation),\n        "configurable",\n        (\n            "You have invoked `configured` on a PendingNodeInvocation (an intermediate type), which is "\n            "produced by aliasing or tagging a solid definition. To configure a solid, you must "\n            "call `configured` on either a SolidDefinition and CompositeSolidDefinition. To fix "\n            "this error, make sure to call `configured` on the definition object *before* using "\n            "the `tag` or `alias` methods. For usage examples, see "\n            "https://docs.dagster.io/concepts/configuration/configured"\n        ),\n    )\n    check.inst_param(\n        configurable,\n        "configurable",\n        ConfigurableDefinition,\n        (\n            "Only the following types can be used with the `configured` method: ResourceDefinition, "\n            "ExecutorDefinition, CompositeSolidDefinition, SolidDefinition, and LoggerDefinition. "\n            "For usage examples of `configured`, see "\n            "https://docs.dagster.io/concepts/configuration/configured"\n        ),\n    )\n\n\ndef _is_named_configurable_param(configurable: ConfigurableDefinition) -> bool:\n    return isinstance(configurable, NamedConfigurableDefinition)\n\n\n
[docs]def configured(\n configurable: ConfigurableDefinition,\n config_schema: Optional[Dict[str, Any]] = None,\n **kwargs: Any,\n):\n """\n A decorator that makes it easy to create a function-configured version of an object.\n The following definition types can be configured using this function:\n\n * :py:class:`CompositeSolidDefinition`\n * :py:class:`ExecutorDefinition`\n * :py:class:`LoggerDefinition`\n * :py:class:`ResourceDefinition`\n * :py:class:`SolidDefinition`\n\n If the config that will be supplied to the object is constant, you may alternatively invoke this\n and call the result with a dict of config values to be curried. Examples of both strategies\n below.\n\n Args:\n configurable (ConfigurableDefinition): An object that can be configured.\n config_schema (ConfigSchema): The config schema that the inputs to the decorated function\n must satisfy.\n **kwargs: Arbitrary keyword arguments that will be passed to the initializer of the returned\n object.\n\n Returns:\n (Callable[[Union[Any, Callable[[Any], Any]]], ConfigurableDefinition])\n\n **Examples:**\n\n .. code-block:: python\n\n dev_s3 = configured(s3_resource, name="dev_s3")({'bucket': 'dev'})\n\n @configured(s3_resource)\n def dev_s3(_):\n return {'bucket': 'dev'}\n\n @configured(s3_resource, {'bucket_prefix': str})\n def dev_s3(config):\n return {'bucket': config['bucket_prefix'] + 'dev'}\n """\n _check_configurable_param(configurable)\n\n if _is_named_configurable_param(configurable):\n\n def _configured(config_or_config_fn):\n fn_name = config_or_config_fn.__name__ if callable(config_or_config_fn) else None\n name = kwargs.get("name") or fn_name\n return configurable.configured(\n config_or_config_fn=config_or_config_fn,\n name=name,\n config_schema=config_schema,\n **{k: v for k, v in kwargs.items() if k != "name"},\n )\n\n return _configured\n else:\n\n def _configured(config_or_config_fn):\n return configurable.configured(\n config_schema=config_schema, config_or_config_fn=config_or_config_fn, **kwargs\n )\n\n return _configured
\n
", "current_page_name": "_modules/dagster/core/definitions/configurable", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.configurable"}, "decorators": {"composite_solid_decorator": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.decorators.composite_solid_decorator

\nfrom functools import update_wrapper\nfrom typing import Any, Callable, List, Optional, Union, overload\n\nimport dagster._check as check\nfrom dagster.config.config_schema import ConfigSchemaType\nfrom dagster.core.decorator_utils import format_docstring_for_description\n\nfrom ..composition import do_composition, get_validated_config_mapping\nfrom ..input import InputDefinition\nfrom ..output import OutputDefinition\nfrom ..solid_definition import CompositeSolidDefinition\n\n\nclass _CompositeSolid:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        input_defs: Optional[List[InputDefinition]] = None,\n        output_defs: Optional[List[OutputDefinition]] = None,\n        description: Optional[str] = None,\n        config_schema: Optional[ConfigSchemaType] = None,\n        config_fn: Optional[Callable[[dict], dict]] = None,\n    ):\n        self.name = check.opt_str_param(name, "name")\n        self.input_defs = check.opt_list_param(input_defs, "input_defs", InputDefinition)\n        self.output_defs = check.opt_nullable_list_param(output_defs, "output", OutputDefinition)\n        self.description = check.opt_str_param(description, "description")\n\n        self.config_schema = config_schema  # gets validated in do_composition\n        self.config_fn = check.opt_callable_param(config_fn, "config_fn")\n\n    def __call__(self, fn: Callable[..., Any]):\n        check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        config_mapping = get_validated_config_mapping(\n            self.name, self.config_schema, self.config_fn, decorator_name="composite_solid"\n        )\n\n        (\n            input_mappings,\n            output_mappings,\n            dependencies,\n            solid_defs,\n            config_mapping,\n            positional_inputs,\n        ) = do_composition(\n            "@composite_solid",\n            self.name,\n            fn,\n            self.input_defs,\n            self.output_defs,\n            config_mapping,\n            ignore_output_from_composition_fn=False,\n        )\n\n        composite_def = CompositeSolidDefinition(\n            name=self.name,\n            input_mappings=input_mappings,\n            output_mappings=output_mappings,\n            dependencies=dependencies,\n            solid_defs=solid_defs,\n            description=self.description or format_docstring_for_description(fn),\n            config_mapping=config_mapping,\n            positional_inputs=positional_inputs,\n        )\n        update_wrapper(composite_def, fn)\n        return composite_def\n\n\n@overload\ndef composite_solid(\n    name: Callable[..., Any],\n) -> CompositeSolidDefinition:\n    ...\n\n\n@overload\ndef composite_solid(\n    name: Optional[str] = ...,\n    input_defs: Optional[List[InputDefinition]] = ...,\n    output_defs: Optional[List[OutputDefinition]] = ...,\n    description: Optional[str] = ...,\n    config_schema: Optional[ConfigSchemaType] = ...,\n    config_fn: Optional[Callable[[dict], dict]] = ...,\n) -> _CompositeSolid:\n    ...\n\n\n
[docs]def composite_solid(\n name: Optional[Union[Callable[..., Any], str]] = None,\n input_defs: Optional[List[InputDefinition]] = None,\n output_defs: Optional[List[OutputDefinition]] = None,\n description: Optional[str] = None,\n config_schema: Optional[ConfigSchemaType] = None,\n config_fn: Optional[Callable[[dict], dict]] = None,\n) -> Union[CompositeSolidDefinition, _CompositeSolid]:\n """Create a composite solid with the specified parameters from the decorated composition\n function.\n\n Using this decorator allows you to build up the dependency graph of the composite by writing a\n function that invokes solids and passes the output to other solids. This is similar to the use\n of the :py:func:`@pipeline <pipeline>` decorator, with the additional ability to remap inputs,\n outputs, and config across the composite boundary.\n\n Args:\n name (Optional[str]): Name for the new composite solid. Must be unique within any\n :py:class:`PipelineDefinition` using the solid.\n description (Optional[str]): Human-readable description of the new composite solid.\n input_defs (Optional[List[InputDefinition]]):\n Information about the inputs that this composite solid maps. Information provided here\n will be combined with what can be inferred from the function signature, with these\n explicit InputDefinitions taking precedence.\n\n Uses of inputs in the body of the decorated composition function will determine\n the :py:class:`InputMappings <InputMapping>` passed to the underlying\n :py:class:`CompositeSolidDefinition`.\n output_defs (Optional[List[OutputDefinition]]):\n Information about the outputs this composite solid maps. Information provided here\n will be combined with what can be inferred from the return type signature if there\n is only one OutputDefinition.\n\n Uses of these outputs in the body of the decorated composition function, as well as the\n return value of the decorated function, will be used to infer the appropriate set of\n :py:class:`OutputMappings <OutputMapping>` for the underlying\n :py:class:`CompositeSolidDefinition`.\n\n To map multiple outputs, return a dictionary from the composition function.\n config_schema (Optional[ConfigSchema]): If the `config_fn` argument is provided, this\n argument can be provided to set the schema for outer config that is passed to the\n `config_fn`. If `config_fn` is provided, but this argument is not provided, any config\n will be accepted.\n config_fn (Callable[[dict], dict]): By specifying a config mapping\n function, you can override the configuration for the child solids contained within this\n composite solid. ``config_fn``, maps the config provided to the\n composite solid to the config that will be provided to the child solids.\n\n If this argument is provided, the `config_schema` argument can also be provided to limit\n what config values can be passed to the composite solid.\n\n Examples:\n\n .. 
code-block:: python\n\n @lambda_solid\n def add_one(num: int) -> int:\n return num + 1\n\n @composite_solid\n def add_two(num: int) -> int:\n adder_1 = add_one.alias('adder_1')\n adder_2 = add_one.alias('adder_2')\n\n return adder_2(adder_1(num))\n\n """\n if callable(name):\n check.invariant(input_defs is None)\n check.invariant(output_defs is None)\n check.invariant(description is None)\n check.invariant(config_schema is None)\n check.invariant(config_fn is None)\n return _CompositeSolid()(name)\n\n return _CompositeSolid(\n name=name,\n input_defs=input_defs,\n output_defs=output_defs,\n description=description,\n config_schema=config_schema,\n config_fn=config_fn,\n )
\n
", "current_page_name": "_modules/dagster/core/definitions/decorators/composite_solid_decorator", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.decorators.composite_solid_decorator"}, "graph_decorator": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.decorators.graph_decorator

\nfrom functools import update_wrapper\nfrom typing import Any, Callable, Dict, List, Optional, Union, overload\n\nimport dagster._check as check\nfrom dagster.core.decorator_utils import format_docstring_for_description\n\nfrom ..config import ConfigMapping\nfrom ..graph_definition import GraphDefinition\nfrom ..input import GraphIn, InputDefinition\nfrom ..output import GraphOut, OutputDefinition\n\n\nclass _Graph:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        description: Optional[str] = None,\n        input_defs: Optional[List[InputDefinition]] = None,\n        output_defs: Optional[List[OutputDefinition]] = None,\n        ins: Optional[Dict[str, GraphIn]] = None,\n        out: Optional[Union[GraphOut, Dict[str, GraphOut]]] = None,\n        tags: Optional[Dict[str, Any]] = None,\n        config_mapping: Optional[ConfigMapping] = None,\n    ):\n        self.name = check.opt_str_param(name, "name")\n        self.description = check.opt_str_param(description, "description")\n        self.input_defs = check.opt_list_param(input_defs, "input_defs", of_type=InputDefinition)\n        self.did_pass_outputs = output_defs is not None or out is not None\n        self.output_defs = check.opt_nullable_list_param(\n            output_defs, "output_defs", of_type=OutputDefinition\n        )\n        self.ins = ins\n        self.out = out\n        self.tags = tags\n        self.config_mapping = check.opt_inst_param(config_mapping, "config_mapping", ConfigMapping)\n\n    def __call__(self, fn: Callable[..., Any]) -> GraphDefinition:\n        check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        if self.ins is not None:\n            input_defs = [inp.to_definition(name) for name, inp in self.ins.items()]\n        else:\n            input_defs = check.opt_list_param(\n                self.input_defs, "input_defs", of_type=InputDefinition\n            )\n\n        if self.out is None:\n            output_defs = self.output_defs\n        elif isinstance(self.out, GraphOut):\n            output_defs = [self.out.to_definition(name=None)]\n        else:\n            check.dict_param(self.out, "out", key_type=str, value_type=GraphOut)\n            output_defs = [out.to_definition(name=name) for name, out in self.out.items()]\n\n        from dagster.core.definitions.decorators.composite_solid_decorator import do_composition\n\n        (\n            input_mappings,\n            output_mappings,\n            dependencies,\n            solid_defs,\n            config_mapping,\n            positional_inputs,\n        ) = do_composition(\n            decorator_name="@graph",\n            graph_name=self.name,\n            fn=fn,\n            provided_input_defs=input_defs,\n            provided_output_defs=output_defs,\n            ignore_output_from_composition_fn=False,\n            config_mapping=self.config_mapping,\n        )\n\n        graph_def = GraphDefinition(\n            name=self.name,\n            dependencies=dependencies,\n            node_defs=solid_defs,\n            description=self.description or format_docstring_for_description(fn),\n            input_mappings=input_mappings,\n            output_mappings=output_mappings,\n            config=config_mapping,\n            positional_inputs=positional_inputs,\n            tags=self.tags,\n        )\n        update_wrapper(graph_def, fn)\n        return graph_def\n\n\n@overload\ndef graph(name: Callable[..., Any]) -> GraphDefinition:\n    ...\n\n\n@overload\ndef 
graph(\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    input_defs: Optional[List[InputDefinition]] = ...,\n    output_defs: Optional[List[OutputDefinition]] = ...,\n    ins: Optional[Dict[str, GraphIn]] = ...,\n    out: Optional[Union[GraphOut, Dict[str, GraphOut]]] = ...,\n    tags: Optional[Dict[str, Any]] = ...,\n    config: Optional[Union[ConfigMapping, Dict[str, Any]]] = ...,\n) -> _Graph:\n    ...\n\n\n
[docs]def graph(\n name: Optional[Union[Callable[..., Any], str]] = None,\n description: Optional[str] = None,\n input_defs: Optional[List[InputDefinition]] = None,\n output_defs: Optional[List[OutputDefinition]] = None,\n ins: Optional[Dict[str, GraphIn]] = None,\n out: Optional[Union[GraphOut, Dict[str, GraphOut]]] = None,\n tags: Optional[Dict[str, Any]] = None,\n config: Optional[Union[ConfigMapping, Dict[str, Any]]] = None,\n) -> Union[GraphDefinition, _Graph]:\n """Create a graph with the specified parameters from the decorated composition function.\n\n Using this decorator allows you to build up a dependency graph by writing a\n function that invokes ops (or other graphs) and passes the output to subsequent invocations.\n\n Args:\n name (Optional[str]):\n The name of the graph. Must be unique within any :py:class:`RepositoryDefinition` containing the graph.\n description (Optional[str]):\n A human-readable description of the graph.\n input_defs (Optional[List[InputDefinition]]):\n Information about the inputs that this graph maps. Information provided here\n will be combined with what can be inferred from the function signature, with these\n explicit InputDefinitions taking precedence.\n\n Uses of inputs in the body of the decorated composition function will determine\n the :py:class:`InputMappings <InputMapping>` passed to the underlying\n :py:class:`GraphDefinition`.\n output_defs (Optional[List[OutputDefinition]]):\n Output definitions for the graph. If not provided explicitly, these will be inferred from typehints.\n\n Uses of these outputs in the body of the decorated composition function, as well as the\n return value of the decorated function, will be used to infer the appropriate set of\n :py:class:`OutputMappings <OutputMapping>` for the underlying\n :py:class:`GraphDefinition`.\n\n To map multiple outputs, return a dictionary from the composition function.\n ins (Optional[Dict[str, GraphIn]]):\n Information about the inputs that this graph maps. Information provided here\n will be combined with what can be inferred from the function signature, with these\n explicit GraphIn taking precedence.\n out (Optional[Union[GraphOut, Dict[str, GraphOut]]]):\n Information about the outputs that this graph maps. Information provided here will be\n combined with what can be inferred from the return type signature if the function does\n not use yield.\n\n To map multiple outputs, return a dictionary from the composition function.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for any execution run of the graph.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag\n values provided at invocation time.\n """\n if callable(name):\n check.invariant(description is None)\n return _Graph()(name)\n\n config_mapping = None\n # Case 1: a dictionary of config is provided, convert to config mapping.\n if config is not None and not isinstance(config, ConfigMapping):\n config = check.dict_param(config, "config", key_type=str)\n config_mapping = ConfigMapping(config_fn=lambda _: config, config_schema=None)\n # Case 2: actual config mapping is provided.\n else:\n config_mapping = config\n\n return _Graph(\n name=name,\n description=description,\n input_defs=input_defs,\n output_defs=output_defs,\n ins=ins,\n out=out,\n tags=tags,\n config_mapping=config_mapping,\n )
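A hedged sketch of the plain-dict form of ``config`` handled in Case 1 above, where the dict is wrapped in a ``ConfigMapping`` whose ``config_fn`` always returns it. The op and graph names are hypothetical, and the dict is assumed to be keyed by child op name.

.. code-block:: python

    from dagster import graph, op

    @op(config_schema={"limit": int})
    def fetch(context):
        return list(range(context.op_config["limit"]))

    @op
    def total(numbers):
        return sum(numbers)

    @graph(tags={"team": "analytics"}, config={"fetch": {"config": {"limit": 10}}})
    def summarize():
        total(fetch())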
\n
", "current_page_name": "_modules/dagster/core/definitions/decorators/graph_decorator", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.decorators.graph_decorator"}, "hook_decorator": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.decorators.hook_decorator

\nfrom functools import update_wrapper\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Callable, List, Optional, Union, cast, overload\n\nimport dagster._check as check\nfrom dagster.core.errors import DagsterInvalidDefinitionError\n\nfrom ...decorator_utils import get_function_params, validate_expected_params\nfrom ..events import HookExecutionResult\nfrom ..hook_definition import HookDefinition\n\nif TYPE_CHECKING:\n    from dagster.core.events import DagsterEvent\n    from dagster.core.execution.context.hook import HookContext\n\n\ndef _validate_hook_fn_params(fn, expected_positionals):\n    params = get_function_params(fn)\n    missing_positional = validate_expected_params(params, expected_positionals)\n    if missing_positional:\n        raise DagsterInvalidDefinitionError(\n            "'{hook_name}' decorated function does not have required positional "\n            "parameter '{missing_param}'. Hook functions should only have keyword arguments "\n            "that match input names and a first positional parameter named 'context' and "\n            "a second positional parameter named 'event_list'.".format(\n                hook_name=fn.__name__, missing_param=missing_positional\n            )\n        )\n\n\nclass _Hook:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        required_resource_keys: Optional[AbstractSet[str]] = None,\n        decorated_fn: Optional[Callable[..., Any]] = None,\n    ):\n        self.name = check.opt_str_param(name, "name")\n        self.required_resource_keys = check.opt_set_param(\n            required_resource_keys, "required_resource_keys"\n        )\n        self.decorated_fn = check.opt_callable_param(decorated_fn, "decorated_fn")\n\n    def __call__(self, fn) -> HookDefinition:\n\n        check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        expected_positionals = ["context", "event_list"]\n\n        _validate_hook_fn_params(fn, expected_positionals)\n\n        hook_def = HookDefinition(\n            name=self.name or "",\n            hook_fn=fn,\n            required_resource_keys=self.required_resource_keys,\n            decorated_fn=self.decorated_fn or fn,\n        )\n        update_wrapper(cast(Callable[..., Any], hook_def), fn)\n        return hook_def\n\n\n@overload\ndef event_list_hook(\n    name: Callable[..., Any],\n) -> HookDefinition:\n    pass\n\n\n@overload\ndef event_list_hook(\n    name: Optional[str] = ...,\n    required_resource_keys: Optional[AbstractSet[str]] = ...,\n    decorated_fn: Optional[Callable[..., Any]] = ...,\n) -> _Hook:\n    pass\n\n\ndef event_list_hook(\n    name: Optional[Union[Callable[..., Any], str]] = None,\n    required_resource_keys: Optional[AbstractSet[str]] = None,\n    decorated_fn: Optional[Callable[..., Any]] = None,\n) -> Union[HookDefinition, _Hook]:\n    """Create a generic hook with the specified parameters from the decorated function.\n\n    This decorator is currently used internally by Dagster machinery to support success_hook and\n    failure_hook.\n\n    The user-defined hook function requires two parameters:\n    - A `context` object is passed as the first parameter. 
The context is an instance of\n        :py:class:`context <HookContext>`, and provides access to system\n        information, such as loggers (context.log), resources (context.resources), the solid\n        (context.solid) and its execution step (context.step) which triggers this hook.\n    - An `event_list` object is passed as the second paramter. It provides the full event list of the\n        associated execution step.\n\n    Args:\n        name (Optional[str]): The name of this hook.\n        required_resource_keys (Optional[AbstractSet[str]]): Keys for the resources required by the\n            hook.\n\n    Examples:\n\n        .. code-block:: python\n\n            @event_list_hook(required_resource_keys={'slack'})\n            def slack_on_materializations(context, event_list):\n                for event in event_list:\n                    if event.event_type == DagsterEventType.ASSET_MATERIALIZATION:\n                        message = '{solid} has materialized an asset {key}.'.format(\n                            solid=context.solid.name,\n                            key=event.asset_key\n                        )\n                        # send a slack message every time a materialization event occurs\n                        context.resources.slack.send_message(message)\n\n\n    """\n    # This case is for when decorator is used bare, without arguments.\n    # e.g. @event_list_hook versus @event_list_hook()\n    if callable(name):\n        check.invariant(required_resource_keys is None)\n        return _Hook()(name)\n\n    return _Hook(\n        name=name, required_resource_keys=required_resource_keys, decorated_fn=decorated_fn\n    )\n\n\nSuccessOrFailureHookFn = Callable[["HookContext"], Any]\n\n\n@overload\ndef success_hook(name: SuccessOrFailureHookFn) -> HookDefinition:\n    ...\n\n\n@overload\ndef success_hook(\n    name: Optional[str] = ...,\n    required_resource_keys: Optional[AbstractSet[str]] = ...,\n) -> Callable[[SuccessOrFailureHookFn], HookDefinition]:\n    ...\n\n\n
[docs]def success_hook(\n name: Optional[Union[SuccessOrFailureHookFn, str]] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n) -> Union[HookDefinition, Callable[[SuccessOrFailureHookFn], HookDefinition]]:\n """Create a hook on step success events with the specified parameters from the decorated function.\n\n Args:\n name (Optional[str]): The name of this hook.\n required_resource_keys (Optional[AbstractSet[str]]): Keys for the resources required by the\n hook.\n\n Examples:\n\n .. code-block:: python\n\n @success_hook(required_resource_keys={'slack'})\n def slack_message_on_success(context):\n message = 'op {} succeeded'.format(context.op.name)\n context.resources.slack.send_message(message)\n\n @success_hook\n def do_something_on_success(context):\n do_something()\n\n\n """\n\n def wrapper(fn: SuccessOrFailureHookFn) -> HookDefinition:\n\n check.callable_param(fn, "fn")\n\n expected_positionals = ["context"]\n _validate_hook_fn_params(fn, expected_positionals)\n\n if name is None or callable(name):\n _name = fn.__name__\n else:\n _name = name\n\n @event_list_hook(_name, required_resource_keys, decorated_fn=fn)\n def _success_hook(\n context: "HookContext", event_list: List["DagsterEvent"]\n ) -> HookExecutionResult:\n for event in event_list:\n if event.is_step_success:\n fn(context)\n return HookExecutionResult(hook_name=_name, is_skipped=False)\n\n # hook is skipped when fn didn't run\n return HookExecutionResult(hook_name=_name, is_skipped=True)\n\n return _success_hook\n\n # This case is for when decorator is used bare, without arguments, i.e. @success_hook\n if callable(name):\n check.invariant(required_resource_keys is None)\n return wrapper(name)\n\n return wrapper
\n\n\n@overload\ndef failure_hook(name: SuccessOrFailureHookFn) -> HookDefinition:\n ...\n\n\n@overload\ndef failure_hook(\n name: Optional[str] = ...,\n required_resource_keys: Optional[AbstractSet[str]] = ...,\n) -> Callable[[SuccessOrFailureHookFn], HookDefinition]:\n ...\n\n\n
[docs]def failure_hook(\n name: Optional[Union[SuccessOrFailureHookFn, str]] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n) -> Union[HookDefinition, Callable[[SuccessOrFailureHookFn], HookDefinition]]:\n """Create a hook on step failure events with the specified parameters from the decorated function.\n\n Args:\n name (Optional[str]): The name of this hook.\n required_resource_keys (Optional[AbstractSet[str]]): Keys for the resources required by the\n hook.\n\n Examples:\n\n .. code-block:: python\n\n @failure_hook(required_resource_keys={'slack'})\n def slack_message_on_failure(context):\n message = 'op {} failed'.format(context.op.name)\n context.resources.slack.send_message(message)\n\n @failure_hook\n def do_something_on_failure(context):\n do_something()\n\n\n """\n\n def wrapper(fn: Callable[["HookContext"], Any]) -> HookDefinition:\n check.callable_param(fn, "fn")\n\n expected_positionals = ["context"]\n _validate_hook_fn_params(fn, expected_positionals)\n\n if name is None or callable(name):\n _name = fn.__name__\n else:\n _name = name\n\n @event_list_hook(_name, required_resource_keys, decorated_fn=fn)\n def _failure_hook(\n context: "HookContext", event_list: List["DagsterEvent"]\n ) -> HookExecutionResult:\n for event in event_list:\n if event.is_step_failure:\n fn(context)\n return HookExecutionResult(hook_name=_name, is_skipped=False)\n\n # hook is skipped when fn didn't run\n return HookExecutionResult(hook_name=_name, is_skipped=True)\n\n return _failure_hook\n\n # This case is for when decorator is used bare, without arguments, i.e. @failure_hook\n if callable(name):\n check.invariant(required_resource_keys is None)\n return wrapper(name)\n\n return wrapper
\n
", "current_page_name": "_modules/dagster/core/definitions/decorators/hook_decorator", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.decorators.hook_decorator"}, "job_decorator": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.decorators.job_decorator

\nfrom functools import update_wrapper\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Mapping,\n    Optional,\n    Union,\n    overload,\n)\n\nimport dagster._check as check\nfrom dagster.core.decorator_utils import format_docstring_for_description\n\nfrom ..config import ConfigMapping\nfrom ..graph_definition import GraphDefinition\nfrom ..hook_definition import HookDefinition\nfrom ..job_definition import JobDefinition\nfrom ..logger_definition import LoggerDefinition\nfrom ..policy import RetryPolicy\nfrom ..resource_definition import ResourceDefinition\nfrom ..version_strategy import VersionStrategy\n\nif TYPE_CHECKING:\n    from ..executor_definition import ExecutorDefinition\n    from ..partition import PartitionedConfig, PartitionsDefinition\n\n\nclass _Job:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        description: Optional[str] = None,\n        tags: Optional[Dict[str, Any]] = None,\n        resource_defs: Optional[Dict[str, ResourceDefinition]] = None,\n        config: Optional[Union[ConfigMapping, Dict[str, Any], "PartitionedConfig"]] = None,\n        logger_defs: Optional[Dict[str, LoggerDefinition]] = None,\n        executor_def: Optional["ExecutorDefinition"] = None,\n        hooks: Optional[AbstractSet[HookDefinition]] = None,\n        op_retry_policy: Optional[RetryPolicy] = None,\n        version_strategy: Optional[VersionStrategy] = None,\n        partitions_def: Optional["PartitionsDefinition"] = None,\n        input_values: Optional[Mapping[str, object]] = None,\n    ):\n        self.name = name\n        self.description = description\n        self.tags = tags\n        self.resource_defs = resource_defs\n        self.config = config\n        self.logger_defs = logger_defs\n        self.executor_def = executor_def\n        self.hooks = hooks\n        self.op_retry_policy = op_retry_policy\n        self.version_strategy = version_strategy\n        self.partitions_def = partitions_def\n        self.input_values = input_values\n\n    def __call__(self, fn: Callable[..., Any]) -> JobDefinition:\n        check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        from dagster.core.definitions.decorators.composite_solid_decorator import do_composition\n\n        (\n            input_mappings,\n            output_mappings,\n            dependencies,\n            solid_defs,\n            config_mapping,\n            positional_inputs,\n        ) = do_composition(\n            decorator_name="@job",\n            graph_name=self.name,\n            fn=fn,\n            provided_input_defs=[],\n            provided_output_defs=[],\n            ignore_output_from_composition_fn=False,\n            config_mapping=None,\n        )\n\n        graph_def = GraphDefinition(\n            name=self.name,\n            dependencies=dependencies,\n            node_defs=solid_defs,\n            description=self.description or format_docstring_for_description(fn),\n            input_mappings=input_mappings,\n            output_mappings=output_mappings,\n            config=config_mapping,\n            positional_inputs=positional_inputs,\n            tags=self.tags,\n        )\n\n        job_def = graph_def.to_job(\n            description=self.description or format_docstring_for_description(fn),\n            resource_defs=self.resource_defs,\n            config=self.config,\n            tags=self.tags,\n            logger_defs=self.logger_defs,\n            
executor_def=self.executor_def,\n            hooks=self.hooks,\n            op_retry_policy=self.op_retry_policy,\n            version_strategy=self.version_strategy,\n            partitions_def=self.partitions_def,\n            input_values=self.input_values,\n        )\n        update_wrapper(job_def, fn)\n        return job_def\n\n\n@overload\ndef job(name: Callable[..., Any]) -> JobDefinition:\n    ...\n\n\n@overload\ndef job(\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    resource_defs: Optional[Dict[str, ResourceDefinition]] = ...,\n    config: Union[ConfigMapping, Dict[str, Any], "PartitionedConfig"] = ...,\n    tags: Optional[Dict[str, Any]] = ...,\n    logger_defs: Optional[Dict[str, LoggerDefinition]] = ...,\n    executor_def: Optional["ExecutorDefinition"] = ...,\n    hooks: Optional[AbstractSet[HookDefinition]] = ...,\n    op_retry_policy: Optional[RetryPolicy] = ...,\n    version_strategy: Optional[VersionStrategy] = ...,\n    partitions_def: Optional["PartitionsDefinition"] = ...,\n    input_values: Optional[Mapping[str, object]] = ...,\n) -> _Job:\n    ...\n\n\n
[docs]def job(\n name: Optional[Union[Callable[..., Any], str]] = None,\n description: Optional[str] = None,\n resource_defs: Optional[Dict[str, ResourceDefinition]] = None,\n config: Optional[Union[ConfigMapping, Dict[str, Any], "PartitionedConfig"]] = None,\n tags: Optional[Dict[str, Any]] = None,\n logger_defs: Optional[Dict[str, LoggerDefinition]] = None,\n executor_def: Optional["ExecutorDefinition"] = None,\n hooks: Optional[AbstractSet[HookDefinition]] = None,\n op_retry_policy: Optional[RetryPolicy] = None,\n version_strategy: Optional[VersionStrategy] = None,\n partitions_def: Optional["PartitionsDefinition"] = None,\n input_values: Optional[Mapping[str, object]] = None,\n) -> Union[JobDefinition, _Job]:\n """Creates a job with the specified parameters from the decorated graph/op invocation function.\n\n Using this decorator allows you to build an executable job by writing a function that invokes\n ops (or graphs).\n\n Args:\n name (Optional[str]):\n The name for the Job. Defaults to the name of this graph.\n resource_defs (Optional[Dict[str, ResourceDefinition]]):\n Resources that are required by this graph for execution.\n If not defined, `io_manager` will default to filesystem.\n config:\n Describes how the job is parameterized at runtime.\n\n If no value is provided, then the schema for the job's run config is a standard\n format based on its ops and resources.\n\n If a dictionary is provided, then it must conform to the standard config schema, and\n it will be used as the job's run config whenever the job is executed.\n The values provided will be viewable and editable in the Dagit playground, so be\n careful with secrets.\n\n If a :py:class:`ConfigMapping` object is provided, then the schema for the job's run config is\n determined by the config mapping, which should return\n configuration in the standard format to configure the job.\n\n If a :py:class:`PartitionedConfig` object is provided, then it defines a discrete set of config\n values that can parameterize the job, as well as a function for mapping those\n values to the base config. The values provided will be viewable and editable in the\n Dagit playground, so be careful with secrets.\n tags (Optional[Dict[str, Any]]):\n Arbitrary metadata for any execution of the Job.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag\n values provided at invocation time.\n logger_defs (Optional[Dict[str, LoggerDefinition]]):\n A dictionary of string logger identifiers to their implementations.\n executor_def (Optional[ExecutorDefinition]):\n How this Job will be executed. Defaults to :py:class:`multiprocess_executor`.\n op_retry_policy (Optional[RetryPolicy]): The default retry policy for all ops in this job.\n Only used if retry policy is not defined on the op definition or op invocation.\n version_strategy (Optional[VersionStrategy]):\n Defines how each op (and optionally, resource) in the job can be versioned. If\n provided, memoization will be enabled for this job.\n partitions_def (Optional[PartitionsDefinition]): Defines a discrete set of partition keys\n that can parameterize the job. 
If this argument is supplied, the config argument\n can't also be supplied.\n input_values (Optional[Mapping[str, Any]]):\n A dictionary that maps the top-level inputs of a job to Python objects.\n\n """\n if callable(name):\n check.invariant(description is None)\n return _Job()(name)\n\n return _Job(\n name=name,\n description=description,\n resource_defs=resource_defs,\n config=config,\n tags=tags,\n logger_defs=logger_defs,\n executor_def=executor_def,\n hooks=hooks,\n op_retry_policy=op_retry_policy,\n version_strategy=version_strategy,\n partitions_def=partitions_def,\n input_values=input_values,\n )
\n
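A minimal usage sketch of the ``@job`` decorator defined above; the op and job names here are illustrative assumptions, not part of this module.

.. code-block:: python

    from dagster import job, op

    @op
    def get_name():
        return "world"

    @op
    def greet(name: str):
        print(f"hello, {name}")

    # The decorated function invokes ops to build a graph; _Job.__call__ then
    # converts that graph into a JobDefinition via GraphDefinition.to_job.
    @job
    def greeting_job():
        greet(get_name())

    # Execute the job in-process, e.g. from a test or a script.
    result = greeting_job.execute_in_process()
    assert result.success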
", "current_page_name": "_modules/dagster/core/definitions/decorators/job_decorator", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.decorators.job_decorator"}, "op_decorator": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.decorators.op_decorator

\nfrom functools import update_wrapper\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Dict,\n    List,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    overload,\n)\n\nimport dagster._check as check\nfrom dagster.config.config_schema import ConfigSchemaType\nfrom dagster.core.decorator_utils import format_docstring_for_description\n\nfrom ....seven.typing import get_origin\nfrom ...errors import DagsterInvariantViolationError\nfrom ..inference import InferredOutputProps, infer_output_props\nfrom ..input import In, InputDefinition\nfrom ..output import Out, OutputDefinition\nfrom ..policy import RetryPolicy\nfrom ..solid_definition import SolidDefinition\nfrom .solid_decorator import (\n    DecoratedSolidFunction,\n    NoContextDecoratedSolidFunction,\n    resolve_checked_solid_fn_inputs,\n)\n\nif TYPE_CHECKING:\n    from ..op_definition import OpDefinition\n\n\nclass _Op:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        input_defs: Optional[Sequence[InputDefinition]] = None,\n        output_defs: Optional[Sequence[OutputDefinition]] = None,\n        description: Optional[str] = None,\n        required_resource_keys: Optional[Set[str]] = None,\n        config_schema: Optional[Union[Any, Dict[str, Any]]] = None,\n        tags: Optional[Dict[str, Any]] = None,\n        version: Optional[str] = None,\n        decorator_takes_context: Optional[bool] = True,\n        retry_policy: Optional[RetryPolicy] = None,\n        ins: Optional[Dict[str, In]] = None,\n        out: Optional[Union[Out, Dict[str, Out]]] = None,\n    ):\n        self.name = check.opt_str_param(name, "name")\n        self.input_defs = check.opt_nullable_sequence_param(\n            input_defs, "input_defs", of_type=InputDefinition\n        )\n        self.output_defs = output_defs\n        self.decorator_takes_context = check.bool_param(\n            decorator_takes_context, "decorator_takes_context"\n        )\n\n        self.description = check.opt_str_param(description, "description")\n\n        # these will be checked within SolidDefinition\n        self.required_resource_keys = required_resource_keys\n        self.tags = tags\n        self.version = version\n        self.retry_policy = retry_policy\n\n        # config will be checked within SolidDefinition\n        self.config_schema = config_schema\n\n        self.ins = check.opt_nullable_dict_param(ins, "ins", key_type=str, value_type=In)\n        self.out = out\n\n    def __call__(self, fn: Callable[..., Any]) -> "OpDefinition":\n        from ..op_definition import OpDefinition\n\n        if self.input_defs is not None and self.ins is not None:\n            check.failed("Values cannot be provided for both the 'input_defs' and 'ins' arguments")\n\n        if self.output_defs is not None and self.out is not None:\n            check.failed("Values cannot be provided for both the 'output_defs' and 'out' arguments")\n\n        inferred_out = infer_output_props(fn)\n\n        if self.ins is not None:\n            input_defs = [\n                inp.to_definition(name)\n                for name, inp in sorted(self.ins.items(), key=lambda input: input[0])\n            ]  # sort so that input definition order is deterministic\n        else:\n            input_defs = check.opt_list_param(\n                self.input_defs, "input_defs", of_type=InputDefinition\n            )\n\n        output_defs_from_out = _resolve_output_defs_from_outs(\n            inferred_out=inferred_out, out=self.out\n        )\n        
resolved_output_defs = (\n            output_defs_from_out if output_defs_from_out is not None else self.output_defs\n        )\n\n        if not self.name:\n            self.name = fn.__name__\n\n        if resolved_output_defs is None:\n            resolved_output_defs = [OutputDefinition.create_from_inferred(infer_output_props(fn))]\n        elif len(resolved_output_defs) == 1:\n            resolved_output_defs = [\n                resolved_output_defs[0].combine_with_inferred(infer_output_props(fn))\n            ]\n\n        compute_fn = (\n            DecoratedSolidFunction(decorated_fn=fn)\n            if self.decorator_takes_context\n            else NoContextDecoratedSolidFunction(decorated_fn=fn)\n        )\n\n        resolved_input_defs = resolve_checked_solid_fn_inputs(\n            decorator_name="@op",\n            fn_name=self.name,\n            compute_fn=compute_fn,\n            explicit_input_defs=input_defs,\n            exclude_nothing=True,\n        )\n\n        op_def = OpDefinition(\n            name=self.name,\n            input_defs=resolved_input_defs,\n            output_defs=resolved_output_defs,\n            compute_fn=compute_fn,\n            config_schema=self.config_schema,\n            description=self.description or format_docstring_for_description(fn),\n            required_resource_keys=self.required_resource_keys,\n            tags=self.tags,\n            version=self.version,\n            retry_policy=self.retry_policy,\n        )\n        update_wrapper(op_def, compute_fn.decorated_fn)\n        return op_def\n\n\ndef _resolve_output_defs_from_outs(\n    inferred_out: InferredOutputProps, out: Optional[Union[Out, dict]]\n) -> Optional[List[OutputDefinition]]:\n    if out is None:\n        return None\n    if isinstance(out, Out):\n        return [out.to_definition(inferred_out.annotation, name=None)]\n    else:\n        check.dict_param(out, "out", key_type=str, value_type=Out)\n\n        # If only a single entry has been provided to the out dict, then slurp the\n        # annotation into the entry.\n        if len(out) == 1:\n            name = list(out.keys())[0]\n            only_out = out[name]\n            return [only_out.to_definition(inferred_out.annotation, name)]\n\n        output_defs = []\n\n        # Introspection on type annotations is experimental, so checking\n        # metaclass is the best we can do.\n        if inferred_out.annotation and not get_origin(inferred_out.annotation) == tuple:\n            raise DagsterInvariantViolationError(\n                "Expected Tuple annotation for multiple outputs, but received non-tuple annotation."\n            )\n        if inferred_out.annotation and not len(inferred_out.annotation.__args__) == len(out):\n            raise DagsterInvariantViolationError(\n                "Expected Tuple annotation to have number of entries matching the "\n                f"number of outputs for more than one output. 
Expected {len(out)} "\n                f"outputs but annotation has {len(inferred_out.annotation.__args__)}."\n            )\n        for idx, (name, cur_out) in enumerate(out.items()):\n            annotation_type = (\n                inferred_out.annotation.__args__[idx] if inferred_out.annotation else None\n            )\n            output_defs.append(cur_out.to_definition(annotation_type, name=name))\n\n        return output_defs\n\n\n@overload\ndef op(name: Callable[..., Any]) -> SolidDefinition:\n    ...\n\n\n@overload\ndef op(\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    ins: Optional[Dict[str, In]] = ...,\n    out: Optional[Union[Out, Dict[str, Out]]] = ...,\n    config_schema: Optional[ConfigSchemaType] = ...,\n    required_resource_keys: Optional[Set[str]] = ...,\n    tags: Optional[Dict[str, Any]] = ...,\n    version: Optional[str] = ...,\n    retry_policy: Optional[RetryPolicy] = ...,\n    input_defs: Optional[List[InputDefinition]] = ...,\n    output_defs: Optional[List[OutputDefinition]] = ...,\n) -> _Op:\n    ...\n\n\n
[docs]def op(\n name: Optional[Union[Callable[..., Any], str]] = None,\n description: Optional[str] = None,\n ins: Optional[Dict[str, In]] = None,\n out: Optional[Union[Out, Dict[str, Out]]] = None,\n config_schema: Optional[ConfigSchemaType] = None,\n required_resource_keys: Optional[Set[str]] = None,\n tags: Optional[Dict[str, Any]] = None,\n version: Optional[str] = None,\n retry_policy: Optional[RetryPolicy] = None,\n input_defs: Optional[List[InputDefinition]] = None,\n output_defs: Optional[List[OutputDefinition]] = None,\n) -> Union[SolidDefinition, _Op]:\n """\n Create an op with the specified parameters from the decorated function.\n\n Ins and outs will be inferred from the type signature of the decorated function\n if not explicitly provided.\n\n The decorated function will be used as the op's compute function. The signature of the\n decorated function is more flexible than that of the ``compute_fn`` in the core API; it may:\n\n 1. Return a value. This value will be wrapped in an :py:class:`Output` and yielded by the compute function.\n 2. Return an :py:class:`Output`. This output will be yielded by the compute function.\n 3. Yield :py:class:`Output` or other :ref:`event objects <events>`. Same as default compute behavior.\n\n Note that options 1) and 2) are incompatible with yielding other events -- if you would like\n to decorate a function that yields events, it must also wrap its eventual output in an\n :py:class:`Output` and yield it.\n\n @op supports ``async def`` functions as well, including async generators when yielding multiple\n events or outputs. Note that async ops will generally be run on their own unless using a custom\n :py:class:`Executor` implementation that supports running them together.\n\n Args:\n name (Optional[str]): Name of op. Must be unique within any :py:class:`GraphDefinition`\n using the op.\n description (Optional[str]): Human-readable description of this op. If not provided, and\n the decorated function has a docstring, that docstring will be used as the description.\n ins (Optional[Dict[str, In]]):\n Information about the inputs to the op. Information provided here will be combined\n with what can be inferred from the function signature.\n out (Optional[Union[Out, Dict[str, Out]]]):\n Information about the op outputs. Information provided here will be combined with\n what can be inferred from the return type signature if the function does not use yield.\n config_schema (Optional[ConfigSchema]): The schema for the config. If set, Dagster will check\n that config provided for the op matches this schema and fail if it does not. If not\n set, Dagster will accept any config provided for the op.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by this op.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the op. Frameworks may\n expect and require certain metadata to be attached to an op. Values that are not strings\n will be json encoded and must meet the criteria that `json.loads(json.dumps(value)) == value`.\n version (Optional[str]): (Experimental) The version of the op's compute_fn. Two ops should have\n the same version if and only if they deterministically produce the same outputs when\n provided the same inputs.\n retry_policy (Optional[RetryPolicy]): The retry policy for this op.\n input_defs (Optional[List[InputDefinition]]):\n (legacy) Preserved to ease migration from :py:class:`solid`. 
Can be used in place of ins argument.\n output_defs (Optional[List[OutputDefinition]]):\n (legacy) Preserved to ease migration from :py:class:`solid`. Can be used in place of out argument.\n\n Examples:\n\n .. code-block:: python\n\n @op\n def hello_world():\n print('hello')\n\n @op\n def echo(msg: str) -> str:\n return msg\n\n @op(\n ins={'msg': In(str)},\n out=Out(str)\n )\n def echo_2(msg): # same as above\n return msg\n\n @op(\n out={'word': Out(), 'num': Out()}\n )\n def multi_out() -> Tuple[str, int]:\n return 'cool', 4\n """\n\n # This case is for when decorator is used bare, without arguments. e.g. @op versus @op()\n if callable(name):\n check.invariant(input_defs is None)\n check.invariant(output_defs is None)\n check.invariant(description is None)\n check.invariant(config_schema is None)\n check.invariant(required_resource_keys is None)\n check.invariant(tags is None)\n check.invariant(version is None)\n\n return _Op()(name)\n\n return _Op(\n name=name,\n description=description,\n input_defs=input_defs,\n output_defs=output_defs,\n config_schema=config_schema,\n required_resource_keys=required_resource_keys,\n tags=tags,\n version=version,\n retry_policy=retry_policy,\n ins=ins,\n out=out,\n )
\n
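A short sketch of how ``config_schema`` and ``ins``/``out`` passed to ``@op`` surface on the resulting op; the op and job names are illustrative assumptions, not part of this module.

.. code-block:: python

    from dagster import In, Out, job, op

    @op(
        config_schema={"multiplier": int},  # validated against the run config
        ins={"number": In(int)},            # merged with the function signature
        out=Out(int),
    )
    def scale(context, number):
        return number * context.op_config["multiplier"]

    @op
    def emit_three() -> int:
        return 3

    @job
    def scaling_job():
        scale(emit_three())

    # The run config supplied at execution time must satisfy the op's config schema.
    scaling_job.execute_in_process(
        run_config={"ops": {"scale": {"config": {"multiplier": 2}}}}
    )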
", "current_page_name": "_modules/dagster/core/definitions/decorators/op_decorator", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.decorators.op_decorator"}, "pipeline_decorator": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.decorators.pipeline_decorator

\nfrom functools import update_wrapper\nfrom typing import Any, Callable, Dict, List, Optional, Set, Union, overload\n\nimport dagster._check as check\nfrom dagster.config.config_schema import ConfigSchemaType\nfrom dagster.core.decorator_utils import format_docstring_for_description\nfrom dagster.core.definitions.policy import RetryPolicy\nfrom dagster.utils.backcompat import experimental_arg_warning\n\nfrom ..graph_definition import GraphDefinition\nfrom ..hook_definition import HookDefinition\nfrom ..input import InputDefinition\nfrom ..mode import ModeDefinition\nfrom ..output import OutputDefinition\nfrom ..pipeline_definition import PipelineDefinition\nfrom ..preset import PresetDefinition\nfrom ..version_strategy import VersionStrategy\n\n\nclass _Pipeline:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        mode_defs: Optional[List[ModeDefinition]] = None,\n        preset_defs: Optional[List[PresetDefinition]] = None,\n        description: Optional[str] = None,\n        tags: Optional[Dict[str, Any]] = None,\n        hook_defs: Optional[Set[HookDefinition]] = None,\n        input_defs: Optional[List[InputDefinition]] = None,\n        output_defs: Optional[List[OutputDefinition]] = None,\n        config_schema: Optional[ConfigSchemaType] = None,\n        config_fn: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,\n        solid_retry_policy: Optional[RetryPolicy] = None,\n        version_strategy: Optional[VersionStrategy] = None,\n    ):\n        self.name = check.opt_str_param(name, "name")\n        self.mode_definitions = check.opt_list_param(mode_defs, "mode_defs", ModeDefinition)\n        self.preset_definitions = check.opt_list_param(preset_defs, "preset_defs", PresetDefinition)\n        self.description = check.opt_str_param(description, "description")\n        self.tags = check.opt_dict_param(tags, "tags")\n        self.hook_defs = check.opt_set_param(hook_defs, "hook_defs", of_type=HookDefinition)\n        self.input_defs = check.opt_list_param(input_defs, "input_defs", of_type=InputDefinition)\n        self.did_pass_outputs = output_defs is not None\n        self.output_defs = check.opt_nullable_list_param(\n            output_defs, "output_defs", of_type=OutputDefinition\n        )\n        self.config_schema = config_schema\n        self.config_fn = check.opt_callable_param(config_fn, "config_fn")\n        self.solid_retry_policy = check.opt_inst_param(\n            solid_retry_policy, "solid_retry_policy", RetryPolicy\n        )\n        self.version_strategy = check.opt_inst_param(\n            version_strategy, "version_strategy", VersionStrategy\n        )\n\n    def __call__(self, fn: Callable[..., Any]) -> PipelineDefinition:\n        check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        from dagster.core.definitions.decorators.composite_solid_decorator import (\n            do_composition,\n            get_validated_config_mapping,\n        )\n\n        config_mapping = get_validated_config_mapping(\n            self.name, self.config_schema, self.config_fn, decorator_name="pipeline"\n        )\n\n        (\n            input_mappings,\n            output_mappings,\n            dependencies,\n            solid_defs,\n            config_mapping,\n            positional_inputs,\n        ) = do_composition(\n            "@pipeline",\n            self.name,\n            fn,\n            self.input_defs,\n            self.output_defs,\n            config_mapping,\n            
ignore_output_from_composition_fn=not self.did_pass_outputs,\n        )\n\n        pipeline_def = PipelineDefinition(\n            mode_defs=self.mode_definitions,\n            preset_defs=self.preset_definitions,\n            graph_def=GraphDefinition(\n                name=self.name,\n                description=None,  # put desc on the pipeline\n                dependencies=dependencies,\n                node_defs=solid_defs,\n                input_mappings=input_mappings,\n                output_mappings=output_mappings,\n                config=config_mapping,\n                positional_inputs=positional_inputs,\n            ),\n            tags=self.tags,\n            description=self.description or format_docstring_for_description(fn),\n            hook_defs=self.hook_defs,\n            solid_retry_policy=self.solid_retry_policy,\n            version_strategy=self.version_strategy,\n        )\n        update_wrapper(pipeline_def, fn)\n        return pipeline_def\n\n\n@overload\ndef pipeline(\n    name: Callable[..., Any],\n) -> PipelineDefinition:\n    ...\n\n\n@overload\ndef pipeline(\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    mode_defs: Optional[List[ModeDefinition]] = ...,\n    preset_defs: Optional[List[PresetDefinition]] = ...,\n    tags: Optional[Dict[str, Any]] = ...,\n    hook_defs: Optional[Set[HookDefinition]] = ...,\n    input_defs: Optional[List[InputDefinition]] = ...,\n    output_defs: Optional[List[OutputDefinition]] = ...,\n    config_schema: Optional[ConfigSchemaType] = ...,\n    config_fn: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = ...,\n    solid_retry_policy: Optional[RetryPolicy] = ...,\n    version_strategy: Optional[VersionStrategy] = ...,\n) -> _Pipeline:\n    pass\n\n\n
[docs]def pipeline(\n name: Optional[Union[Callable[..., Any], str]] = None,\n description: Optional[str] = None,\n mode_defs: Optional[List[ModeDefinition]] = None,\n preset_defs: Optional[List[PresetDefinition]] = None,\n tags: Optional[Dict[str, Any]] = None,\n hook_defs: Optional[Set[HookDefinition]] = None,\n input_defs: Optional[List[InputDefinition]] = None,\n output_defs: Optional[List[OutputDefinition]] = None,\n config_schema: Optional[ConfigSchemaType] = None,\n config_fn: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,\n solid_retry_policy: Optional[RetryPolicy] = None,\n version_strategy: Optional[VersionStrategy] = None,\n) -> Union[PipelineDefinition, _Pipeline]:\n """Create a pipeline with the specified parameters from the decorated composition function.\n\n Using this decorator allows you to build up the dependency graph of the pipeline by writing a\n function that invokes solids and passes the output to other solids.\n\n Args:\n name (Optional[str]): The name of the pipeline. Must be unique within any\n :py:class:`RepositoryDefinition` containing the pipeline.\n description (Optional[str]): A human-readable description of the pipeline.\n mode_defs (Optional[List[ModeDefinition]]): The set of modes in which this pipeline can\n operate. Modes are used to attach resources, custom loggers, custom system storage\n options, and custom executors to a pipeline. Modes can be used, e.g., to vary\n available resource and logging implementations between local test and production runs.\n preset_defs (Optional[List[PresetDefinition]]): A set of preset collections of configuration\n options that may be used to execute a pipeline. A preset consists of an environment\n dict, an optional subset of solids to execute, and a mode selection. Presets can be used\n to ship common combinations of options to pipeline end users in Python code, and can\n be selected by tools like Dagit.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for any execution run of the pipeline.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag\n values provided at invocation time.\n hook_defs (Optional[Set[HookDefinition]]): A set of hook definitions applied to the\n pipeline. When a hook is applied to a pipeline, it will be attached to all solid\n instances within the pipeline.\n solid_retry_policy (Optional[RetryPolicy]): The default retry policy for all solids in\n this pipeline. Only used if retry policy is not defined on the solid definition or\n solid invocation.\n version_strategy (Optional[VersionStrategy]): The version strategy to use with this\n pipeline. Providing a VersionStrategy will enable memoization on the pipeline.\n\n Example:\n\n .. 
code-block:: python\n\n @solid(output_defs=[OutputDefinition(int, "two"), OutputDefinition(int, "four")])\n def emit_two_four(_) -> int:\n yield Output(2, "two")\n yield Output(4, "four")\n\n\n @lambda_solid\n def add_one(num: int) -> int:\n return num + 1\n\n\n @lambda_solid\n def mult_two(num: int) -> int:\n return num * 2\n\n\n @pipeline\n def math_pipeline():\n two, four = emit_two_four()\n add_one(two)\n mult_two(four)\n """\n\n if input_defs is not None:\n experimental_arg_warning("input_defs", "pipeline")\n\n if output_defs is not None:\n experimental_arg_warning("output_defs", "pipeline")\n\n if config_schema is not None:\n experimental_arg_warning("config_schema", "pipeline")\n\n if config_fn is not None:\n experimental_arg_warning("config_fn", "pipeline")\n\n if callable(name):\n check.invariant(description is None)\n return _Pipeline()(name)\n\n return _Pipeline(\n name=name,\n mode_defs=mode_defs,\n preset_defs=preset_defs,\n description=description,\n tags=tags,\n hook_defs=hook_defs,\n input_defs=input_defs,\n output_defs=output_defs,\n config_schema=config_schema,\n config_fn=config_fn,\n solid_retry_policy=solid_retry_policy,\n version_strategy=version_strategy,\n )
\n
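A hedged sketch of the legacy ``@pipeline`` decorator with a ``ModeDefinition`` that binds a resource, as described in the docstring above; the solid, resource key, and hardcoded value are illustrative assumptions.

.. code-block:: python

    from dagster import ModeDefinition, ResourceDefinition, execute_pipeline, pipeline, solid

    @solid(required_resource_keys={"store"})
    def persist(context):
        # Uses whichever resource the selected mode binds to the "store" key.
        context.log.info(f"persisting via {context.resources.store}")

    local_mode = ModeDefinition(
        name="local",
        resource_defs={"store": ResourceDefinition.hardcoded_resource("in-memory store")},
    )

    @pipeline(mode_defs=[local_mode])
    def persist_pipeline():
        persist()

    # Execute using the "local" mode defined above.
    execute_pipeline(persist_pipeline, mode="local")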
", "current_page_name": "_modules/dagster/core/definitions/decorators/pipeline_decorator", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.decorators.pipeline_decorator"}, "repository_decorator": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.decorators.repository_decorator

\nfrom functools import update_wrapper\nfrom typing import Any, Callable, Optional, Union, overload\n\nimport dagster._check as check\nfrom dagster.core.errors import DagsterInvalidDefinitionError\n\nfrom ..graph_definition import GraphDefinition\nfrom ..partition import PartitionSetDefinition\nfrom ..pipeline_definition import PipelineDefinition\nfrom ..repository_definition import (\n    VALID_REPOSITORY_DATA_DICT_KEYS,\n    CachingRepositoryData,\n    RepositoryData,\n    RepositoryDefinition,\n)\nfrom ..schedule_definition import ScheduleDefinition\nfrom ..sensor_definition import SensorDefinition\n\n\nclass _Repository:\n    def __init__(self, name: Optional[str] = None, description: Optional[str] = None):\n        self.name = check.opt_str_param(name, "name")\n        self.description = check.opt_str_param(description, "description")\n\n    def __call__(self, fn: Callable[[], Any]) -> RepositoryDefinition:\n        from dagster.core.asset_defs import AssetGroup\n\n        check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        repository_definitions = fn()\n\n        repository_data: Union[CachingRepositoryData, RepositoryData]\n        if isinstance(repository_definitions, list):\n            bad_definitions = []\n            for i, definition in enumerate(repository_definitions):\n                if not (\n                    isinstance(definition, PipelineDefinition)\n                    or isinstance(definition, PartitionSetDefinition)\n                    or isinstance(definition, ScheduleDefinition)\n                    or isinstance(definition, SensorDefinition)\n                    or isinstance(definition, GraphDefinition)\n                    or isinstance(definition, AssetGroup)\n                ):\n                    bad_definitions.append((i, type(definition)))\n            if bad_definitions:\n                bad_definitions_str = ", ".join(\n                    [\n                        "value of type {type_} at index {i}".format(type_=type_, i=i)\n                        for i, type_ in bad_definitions\n                    ]\n                )\n                raise DagsterInvalidDefinitionError(\n                    "Bad return value from repository construction function: all elements of list "\n                    "must be of type JobDefinition, GraphDefinition, PipelineDefinition, "\n                    "PartitionSetDefinition, ScheduleDefinition, or SensorDefinition. 
"\n                    f"Got {bad_definitions_str}."\n                )\n            repository_data = CachingRepositoryData.from_list(repository_definitions)\n\n        elif isinstance(repository_definitions, dict):\n            if not set(repository_definitions.keys()).issubset(VALID_REPOSITORY_DATA_DICT_KEYS):\n                raise DagsterInvalidDefinitionError(\n                    "Bad return value from repository construction function: dict must not contain "\n                    "keys other than {{'pipelines', 'partition_sets', 'schedules', 'jobs'}}: found "\n                    "{bad_keys}".format(\n                        bad_keys=", ".join(\n                            [\n                                "'{key}'".format(key=key)\n                                for key in repository_definitions.keys()\n                                if key not in VALID_REPOSITORY_DATA_DICT_KEYS\n                            ]\n                        )\n                    )\n                )\n            repository_data = CachingRepositoryData.from_dict(repository_definitions)\n        elif isinstance(repository_definitions, RepositoryData):\n            repository_data = repository_definitions\n        else:\n            raise DagsterInvalidDefinitionError(\n                "Bad return value of type {type_} from repository construction function: must "\n                "return list, dict, or RepositoryData. See the @repository decorator docstring for "\n                "details and examples".format(type_=type(repository_definitions)),\n            )\n\n        repository_def = RepositoryDefinition(\n            name=self.name, description=self.description, repository_data=repository_data\n        )\n\n        update_wrapper(repository_def, fn)\n        return repository_def\n\n\n@overload\ndef repository(name: Callable[..., Any]) -> RepositoryDefinition:\n    ...\n\n\n@overload\ndef repository(name: Optional[str] = ..., description: Optional[str] = ...) -> _Repository:\n    ...\n\n\n
[docs]def repository(\n name: Optional[Union[str, Callable[..., Any]]] = None, description: Optional[str] = None\n) -> Union[RepositoryDefinition, _Repository]:\n """Create a repository from the decorated function.\n\n The decorated function should take no arguments and its return value should be one of:\n\n 1. ``List[Union[JobDefinition, PipelineDefinition, PartitionSetDefinition, ScheduleDefinition, SensorDefinition]]``.\n Use this form when you have no need to lazy load pipelines or other definitions. This is the\n typical use case.\n\n 2. A dict of the form:\n\n .. code-block:: python\n\n {\n 'jobs': Dict[str, Callable[[], JobDefinition]],\n 'pipelines': Dict[str, Callable[[], PipelineDefinition]],\n 'partition_sets': Dict[str, Callable[[], PartitionSetDefinition]],\n 'schedules': Dict[str, Callable[[], ScheduleDefinition]],\n 'sensors': Dict[str, Callable[[], SensorDefinition]]\n }\n\n This form is intended to allow definitions to be created lazily when accessed by name,\n which can be helpful for performance when there are many definitions in a repository, or\n when constructing the definitions is costly.\n\n 3. A :py:class:`RepositoryData`. Return this object if you need fine-grained\n control over the construction and indexing of definitions within the repository, e.g., to\n create definitions dynamically from .yaml files in a directory.\n\n Args:\n name (Optional[str]): The name of the repository. Defaults to the name of the decorated\n function.\n description (Optional[str]): A string description of the repository.\n\n Example:\n\n .. code-block:: python\n\n ######################################################################\n # A simple repository using the first form of the decorated function\n ######################################################################\n\n @op(config_schema={'n': Field(Int)})\n def return_n(context):\n return context.op_config['n']\n\n @job\n def simple_job():\n return_n()\n\n @job\n def some_job():\n ...\n\n @sensor(job=some_job)\n def some_sensor():\n if foo():\n yield RunRequest(\n run_key= ...,\n run_config={\n 'ops': {'return_n': {'config': {'n': bar()}}}\n }\n )\n\n @job\n def my_job():\n ...\n\n my_schedule = ScheduleDefinition(cron_schedule="0 0 * * *", job=my_job)\n\n @repository\n def simple_repository():\n return [simple_job, some_sensor, my_schedule]\n\n\n ######################################################################\n # A lazy-loaded repository\n ######################################################################\n\n def make_expensive_job():\n @job\n def expensive_job():\n for i in range(10000):\n return_n.alias(f'return_n_{i}')()\n\n return expensive_job\n\n def make_expensive_schedule():\n @job\n def other_expensive_job():\n for i in range(11000):\n return_n.alias(f'my_return_n_{i}')()\n\n return ScheduleDefinition(cron_schedule="0 0 * * *", job=other_expensive_job)\n\n @repository\n def lazy_loaded_repository():\n return {\n 'jobs': {'expensive_job': make_expensive_job},\n 'schedules': {'expensive_schedule': make_expensive_schedule}\n }\n\n\n ######################################################################\n # A complex repository that lazily constructs jobs from a directory\n # of files in a bespoke YAML format\n ######################################################################\n\n class ComplexRepositoryData(RepositoryData):\n def __init__(self, yaml_directory):\n self._yaml_directory = yaml_directory\n\n def get_all_pipelines(self):\n return [\n self._construct_job_def_from_yaml_file(\n 
self._yaml_file_for_job_name(file_name)\n )\n for file_name in os.listdir(self._yaml_directory)\n ]\n\n ...\n\n @repository\n def complex_repository():\n return ComplexRepositoryData('some_directory')\n\n """\n if callable(name):\n check.invariant(description is None)\n\n return _Repository()(name)\n\n return _Repository(name=name, description=description)
\n
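A small sketch contrasting the bare and parameterized forms of ``@repository`` handled by the ``callable(name)`` branch above; the op, job, and repository names are illustrative assumptions.

.. code-block:: python

    from dagster import job, op, repository

    @op
    def do_nothing():
        pass

    @job
    def my_job():
        do_nothing()

    # Bare form: the decorated function's name becomes the repository name.
    @repository
    def my_repo():
        return [my_job]

    # Parameterized form: name and description are supplied explicitly.
    @repository(name="prod_repo", description="Production jobs")
    def prod_definitions():
        return [my_job]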
", "current_page_name": "_modules/dagster/core/definitions/decorators/repository_decorator", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.decorators.repository_decorator"}, "schedule_decorator": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.decorators.schedule_decorator

\nimport copy\nimport datetime\nimport warnings\nfrom functools import update_wrapper\nfrom typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union, cast\n\nimport dagster._check as check\nfrom dagster.core.definitions.partition import (\n    PartitionScheduleDefinition,\n    PartitionSetDefinition,\n    ScheduleTimeBasedPartitionsDefinition,\n    ScheduleType,\n)\nfrom dagster.core.errors import (\n    DagsterInvalidDefinitionError,\n    ScheduleExecutionError,\n    user_code_error_boundary,\n)\nfrom dagster.utils import ensure_gen\nfrom dagster.utils.partitions import (\n    DEFAULT_DATE_FORMAT,\n    DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE,\n    DEFAULT_HOURLY_FORMAT_WITH_TIMEZONE,\n    DEFAULT_MONTHLY_FORMAT,\n    create_offset_partition_selector,\n)\n\nfrom ..graph_definition import GraphDefinition\nfrom ..mode import DEFAULT_MODE_NAME\nfrom ..pipeline_definition import PipelineDefinition\nfrom ..run_request import RunRequest, SkipReason\nfrom ..schedule_definition import (\n    DecoratedScheduleFunction,\n    DefaultScheduleStatus,\n    RawScheduleEvaluationFunction,\n    RunRequestIterator,\n    ScheduleDefinition,\n    ScheduleEvaluationContext,\n    is_context_provided,\n)\nfrom ..utils import validate_tags\n\nif TYPE_CHECKING:\n    from dagster import Partition\n\n# Error messages are long\n# pylint: disable=C0301\n\n\n
[docs]def schedule(\n cron_schedule: str,\n pipeline_name: Optional[str] = None,\n name: Optional[str] = None,\n tags: Optional[Dict[str, str]] = None,\n tags_fn: Optional[Callable[[ScheduleEvaluationContext], Optional[Dict[str, str]]]] = None,\n solid_selection: Optional[List[str]] = None,\n mode: Optional[str] = "default",\n should_execute: Optional[Callable[[ScheduleEvaluationContext], bool]] = None,\n environment_vars: Optional[Dict[str, str]] = None,\n execution_timezone: Optional[str] = None,\n description: Optional[str] = None,\n job: Optional[Union[PipelineDefinition, GraphDefinition]] = None,\n default_status: DefaultScheduleStatus = DefaultScheduleStatus.STOPPED,\n) -> Callable[[RawScheduleEvaluationFunction], ScheduleDefinition]:\n """\n Creates a schedule following the provided cron schedule and requests runs for the provided job.\n\n The decorated function takes in a :py:class:`~dagster.ScheduleEvaluationContext` as its only\n argument, and does one of the following:\n\n 1. Return a `RunRequest` object.\n 2. Return a list of `RunRequest` objects.\n 3. Return a `SkipReason` object, providing a descriptive message of why no runs were requested.\n 4. Return nothing (skipping without providing a reason).\n 5. Return a run config dictionary.\n 6. Yield a `SkipReason` or yield one or more `RunRequest` objects.\n\n Returns a :py:class:`~dagster.ScheduleDefinition`.\n\n Args:\n cron_schedule (str): A valid cron string specifying when the schedule will run, e.g.,\n ``'45 23 * * 6'`` for a schedule that runs at 11:45 PM every Saturday.\n pipeline_name (Optional[str]): (legacy) The name of the pipeline to execute when the\n schedule runs.\n name (Optional[str]): The name of the schedule to create.\n tags (Optional[Dict[str, str]]): A dictionary of tags (string key-value pairs) to attach\n to the scheduled runs.\n tags_fn (Optional[Callable[[ScheduleEvaluationContext], Optional[Dict[str, str]]]]): A function\n that generates tags to attach to the schedule's runs. Takes a\n :py:class:`~dagster.ScheduleEvaluationContext` and returns a dictionary of tags (string\n key-value pairs). You may set only one of ``tags`` and ``tags_fn``.\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The pipeline mode in which to execute this schedule.\n (Default: 'default')\n should_execute (Optional[Callable[[ScheduleEvaluationContext], bool]]): A function that runs at\n schedule execution time to determine whether a schedule should execute or skip. Takes a\n :py:class:`~dagster.ScheduleEvaluationContext` and returns a boolean (``True`` if the\n schedule should execute). Defaults to a function that always returns ``True``.\n environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing\n the schedule.\n execution_timezone (Optional[str]): Timezone in which the schedule should run.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n description (Optional[str]): A human-readable description of the schedule.\n job (Optional[Union[GraphDefinition, JobDefinition]]): The job that should execute when this\n schedule runs.\n default_status (DefaultScheduleStatus): Whether the schedule starts as running or not. 
The default\n status can be overridden from Dagit or via the GraphQL API.\n """\n\n def inner(fn: RawScheduleEvaluationFunction) -> ScheduleDefinition:\n check.callable_param(fn, "fn")\n\n schedule_name = name or fn.__name__\n\n validated_tags = None\n\n # perform upfront validation of schedule tags\n if tags_fn and tags:\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both tags_fn and tags as arguments"\n " to ScheduleDefinition. Must provide only one of the two."\n )\n elif tags:\n validated_tags = validate_tags(tags, allow_reserved_tags=False)\n\n def _wrapped_fn(context: ScheduleEvaluationContext) -> RunRequestIterator:\n if should_execute:\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the execution of should_execute for schedule {schedule_name}",\n ):\n if not should_execute(context):\n yield SkipReason(\n f"should_execute function for {schedule_name} returned false."\n )\n return\n\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the evaluation of schedule {schedule_name}",\n ):\n if is_context_provided(fn):\n result = fn(context)\n else:\n result = fn() # type: ignore\n\n if isinstance(result, dict):\n # this is the run-config based decorated function, wrap the evaluated run config\n # and tags in a RunRequest\n evaluated_run_config = copy.deepcopy(result)\n evaluated_tags = (\n validated_tags\n or (tags_fn and validate_tags(tags_fn(context), allow_reserved_tags=False))\n or None\n )\n yield RunRequest(\n run_key=None,\n run_config=evaluated_run_config,\n tags=evaluated_tags,\n )\n elif isinstance(result, list):\n yield from cast(List[RunRequest], result)\n else:\n # this is a run-request based decorated function\n yield from cast(RunRequestIterator, ensure_gen(result))\n\n has_context_arg = is_context_provided(fn)\n evaluation_fn = DecoratedScheduleFunction(\n decorated_fn=fn,\n wrapped_fn=_wrapped_fn,\n has_context_arg=has_context_arg,\n )\n\n schedule_def = ScheduleDefinition(\n name=schedule_name,\n cron_schedule=cron_schedule,\n pipeline_name=pipeline_name,\n solid_selection=solid_selection,\n mode=mode,\n environment_vars=environment_vars,\n execution_timezone=execution_timezone,\n description=description,\n execution_fn=evaluation_fn,\n job=job,\n default_status=default_status,\n )\n\n update_wrapper(schedule_def, wrapped=fn)\n\n return schedule_def\n\n return inner
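# Illustrative usage sketch for the @schedule decorator above (not part of this
# module); the op, job, and schedule names below are assumptions for the example.
from dagster import RunRequest, job, op, schedule

@op(config_schema={"date": str})
def process_date(context):
    context.log.info(context.op_config["date"])

@job
def my_job():
    process_date()

@schedule(cron_schedule="45 23 * * 6", job=my_job)
def saturday_schedule(context):
    # Build a run key and run config from the scheduled execution time.
    partition_date = context.scheduled_execution_time.strftime("%Y-%m-%d")
    return RunRequest(
        run_key=partition_date,
        run_config={"ops": {"process_date": {"config": {"date": partition_date}}}},
    )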
\n\n\ndef monthly_schedule(\n pipeline_name: Optional[str],\n start_date: datetime.datetime,\n name: Optional[str] = None,\n execution_day_of_month: int = 1,\n execution_time: datetime.time = datetime.time(0, 0),\n tags_fn_for_date: Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]] = None,\n solid_selection: Optional[List[str]] = None,\n mode: Optional[str] = "default",\n should_execute: Optional[Callable[["ScheduleEvaluationContext"], bool]] = None,\n environment_vars: Optional[Dict[str, str]] = None,\n end_date: Optional[datetime.datetime] = None,\n execution_timezone: Optional[str] = None,\n partition_months_offset: Optional[int] = 1,\n description: Optional[str] = None,\n default_status: DefaultScheduleStatus = DefaultScheduleStatus.STOPPED,\n) -> Callable[[Callable[[datetime.datetime], Dict[str, Any]]], PartitionScheduleDefinition]:\n """Create a partitioned schedule that runs monthly.\n\n The decorated function should accept a datetime object as its only argument. The datetime\n represents the date partition that it's meant to run on.\n\n The decorated function should return a run configuration dictionary, which will be used as\n configuration for the scheduled run.\n\n The decorator produces a :py:class:`~dagster.PartitionScheduleDefinition`.\n\n Args:\n pipeline_name (str): The name of the pipeline to execute when the schedule runs.\n start_date (datetime.datetime): The date from which to run the schedule.\n name (Optional[str]): The name of the schedule to create.\n execution_day_of_month (int): The day of the month on which to run the schedule (must be\n between 1 and 31).\n execution_time (datetime.time): The time at which to execute the schedule.\n tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A\n function that generates tags to attach to the schedules runs. Takes the date of the\n schedule run and returns a dictionary of tags (string key-value pairs).\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The pipeline mode in which to execute this schedule.\n (Default: 'default')\n should_execute (Optional[Callable[ScheduleEvaluationContext, bool]]): A function that runs at\n schedule execution tie to determine whether a schedule should execute or skip. Takes a\n :py:class:`~dagster.ScheduleEvaluationContext` and returns a boolean (``True`` if the\n schedule should execute). Defaults to a function that always returns ``True``.\n environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing\n the schedule.\n end_date (Optional[datetime.datetime]): The last time to run the schedule to, defaults to\n current time.\n execution_timezone (Optional[str]): Timezone in which the schedule should run.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n partition_months_offset (Optional[int]): How many months back to go when choosing the partition\n for a given schedule execution. For example, when partition_months_offset=1, the schedule\n that executes during month N will fill in the partition for month N-1.\n (Default: 1)\n description (Optional[str]): A human-readable description of the schedule.\n default_status (DefaultScheduleStatus): Whether the schedule starts as running or not. 
The default\n status can be overridden from Dagit or via the GraphQL API.\n """\n check.opt_str_param(name, "name")\n check.inst_param(start_date, "start_date", datetime.datetime)\n check.opt_inst_param(end_date, "end_date", datetime.datetime)\n check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")\n check.opt_nullable_list_param(solid_selection, "solid_selection", of_type=str)\n mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)\n check.opt_callable_param(should_execute, "should_execute")\n check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str)\n check.opt_str_param(pipeline_name, "pipeline_name")\n check.int_param(execution_day_of_month, "execution_day")\n check.inst_param(execution_time, "execution_time", datetime.time)\n check.opt_str_param(execution_timezone, "execution_timezone")\n check.opt_int_param(partition_months_offset, "partition_months_offset")\n check.opt_str_param(description, "description")\n check.inst_param(default_status, "default_status", DefaultScheduleStatus)\n\n if (\n start_date.day != 1\n or start_date.hour != 0\n or start_date.minute != 0\n or start_date.second != 0\n ):\n warnings.warn(\n "`start_date` must be at the beginning of the first day of the month for a monthly "\n "schedule. Use `execution_day_of_month` and `execution_time` to execute the schedule "\n "at a specific time within the month. For example, to run the schedule at 3AM on the "\n "23rd of each month starting in October, your schedule definition would look like:"\n """\n@monthly_schedule(\n start_date=datetime.datetime(2020, 10, 1),\n execution_day_of_month=23,\n execution_time=datetime.time(3, 0)\n):\ndef my_schedule_definition(_):\n ...\n"""\n )\n\n if execution_day_of_month <= 0 or execution_day_of_month > 31:\n raise DagsterInvalidDefinitionError(\n "`execution_day_of_month={}` is not valid for monthly schedule. 
Execution day must be "\n            "between 1 and 31".format(execution_day_of_month)\n        )\n\n    def inner(fn: Callable[[datetime.datetime], Dict[str, Any]]) -> PartitionScheduleDefinition:\n        check.callable_param(fn, "fn")\n\n        schedule_name = name or fn.__name__\n\n        tags_fn_for_partition_value: Callable[\n            ["Partition"], Optional[Dict[str, str]]\n        ] = lambda partition: {}\n        if tags_fn_for_date:\n            tags_fn = cast(\n                Callable[[datetime.datetime], Optional[Dict[str, str]]], tags_fn_for_date\n            )\n            tags_fn_for_partition_value = lambda partition: tags_fn(partition.value)\n\n        fmt = DEFAULT_MONTHLY_FORMAT\n\n        partitions_def = ScheduleTimeBasedPartitionsDefinition(\n            schedule_type=ScheduleType.MONTHLY,\n            start=start_date,\n            execution_day=execution_day_of_month,\n            execution_time=execution_time,\n            end=end_date,\n            fmt=fmt,\n            timezone=execution_timezone,\n            offset=partition_months_offset,\n        )\n\n        partition_set = PartitionSetDefinition(\n            name="{}_partitions".format(schedule_name),\n            pipeline_name=pipeline_name,  # type: ignore[arg-type]\n            run_config_fn_for_partition=lambda partition: fn(partition.value),\n            solid_selection=solid_selection,\n            tags_fn_for_partition=tags_fn_for_partition_value,\n            mode=mode,\n            partitions_def=partitions_def,\n        )\n\n        schedule_def = partition_set.create_schedule_definition(\n            schedule_name,\n            partitions_def.get_cron_schedule(),\n            should_execute=should_execute,\n            environment_vars=environment_vars,\n            partition_selector=create_offset_partition_selector(\n                execution_time_to_partition_fn=partitions_def.get_execution_time_to_partition_fn()\n            ),\n            execution_timezone=execution_timezone,\n            description=description,\n            decorated_fn=fn,\n            default_status=default_status,\n        )\n        update_wrapper(schedule_def, wrapped=fn)\n\n        return schedule_def\n\n    return inner\n\n\ndef weekly_schedule(\n    pipeline_name: Optional[str],\n    start_date: datetime.datetime,\n    name: Optional[str] = None,\n    execution_day_of_week: int = 0,\n    execution_time: datetime.time = datetime.time(0, 0),\n    tags_fn_for_date: Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]] = None,\n    solid_selection: Optional[List[str]] = None,\n    mode: Optional[str] = "default",\n    should_execute: Optional[Callable[["ScheduleEvaluationContext"], bool]] = None,\n    environment_vars: Optional[Dict[str, str]] = None,\n    end_date: Optional[datetime.datetime] = None,\n    execution_timezone: Optional[str] = None,\n    partition_weeks_offset: Optional[int] = 1,\n    description: Optional[str] = None,\n    default_status: DefaultScheduleStatus = DefaultScheduleStatus.STOPPED,\n) -> Callable[[Callable[[datetime.datetime], Dict[str, Any]]], PartitionScheduleDefinition]:\n    """Create a partitioned schedule that runs weekly.\n\n    The decorated function should accept a datetime object as its only argument. The datetime\n    represents the date partition that it's meant to run on.\n\n    The decorated function should return a run configuration dictionary, which will be used as\n    configuration for the scheduled run.\n\n    The decorator produces a :py:class:`~dagster.PartitionScheduleDefinition`.\n\n    Args:\n        pipeline_name (str): The name of the pipeline to execute when the schedule runs.\n        start_date (datetime.datetime): The date from which to run the schedule.\n        name (Optional[str]): The name of the schedule to create.\n        execution_day_of_week (int): The day of the week on which to run the schedule. 
Must be\n between 0 (Sunday) and 6 (Saturday).\n execution_time (datetime.time): The time at which to execute the schedule.\n tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A\n function that generates tags to attach to the schedules runs. Takes the date of the\n schedule run and returns a dictionary of tags (string key-value pairs).\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The pipeline mode in which to execute this schedule.\n (Default: 'default')\n should_execute (Optional[Callable[ScheduleEvaluationContext, bool]]): A function that runs at\n schedule execution tie to determine whether a schedule should execute or skip. Takes a\n :py:class:`~dagster.ScheduleEvaluationContext` and returns a boolean (``True`` if the\n schedule should execute). Defaults to a function that always returns ``True``.\n environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing\n the schedule.\n end_date (Optional[datetime.datetime]): The last time to run the schedule to, defaults to\n current time.\n execution_timezone (Optional[str]): Timezone in which the schedule should run.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n partition_weeks_offset (Optional[int]): How many weeks back to go when choosing the partition\n for a given schedule execution. For example, when partition_weeks_offset=1, the schedule\n that executes during week N will fill in the partition for week N-1.\n (Default: 1)\n description (Optional[str]): A human-readable description of the schedule.\n default_status (DefaultScheduleStatus): Whether the schedule starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n """\n check.opt_str_param(name, "name")\n check.inst_param(start_date, "start_date", datetime.datetime)\n check.opt_inst_param(end_date, "end_date", datetime.datetime)\n check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")\n check.opt_nullable_list_param(solid_selection, "solid_selection", of_type=str)\n mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)\n check.opt_callable_param(should_execute, "should_execute")\n check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str)\n check.opt_str_param(pipeline_name, "pipeline_name")\n check.int_param(execution_day_of_week, "execution_day_of_week")\n check.inst_param(execution_time, "execution_time", datetime.time)\n check.opt_str_param(execution_timezone, "execution_timezone")\n check.opt_int_param(partition_weeks_offset, "partition_weeks_offset")\n check.opt_str_param(description, "description")\n check.inst_param(default_status, "default_status", DefaultScheduleStatus)\n\n if start_date.hour != 0 or start_date.minute != 0 or start_date.second != 0:\n warnings.warn(\n "`start_date` must be at the beginning of a day for a weekly schedule. "\n "Use `execution_time` to execute the schedule at a specific time of day. 
For example, "\n "to run the schedule at 3AM each Tuesday starting on 10/20/2020, your schedule "\n "definition would look like:"\n """\n@weekly_schedule(\n start_date=datetime.datetime(2020, 10, 20),\n execution_day_of_week=1,\n execution_time=datetime.time(3, 0)\n):\ndef my_schedule_definition(_):\n ...\n"""\n )\n\n if execution_day_of_week < 0 or execution_day_of_week >= 7:\n raise DagsterInvalidDefinitionError(\n "`execution_day_of_week={}` is not valid for weekly schedule. Execution day must be "\n "between 0 [Sunday] and 6 [Saturday]".format(execution_day_of_week)\n )\n\n def inner(fn: Callable[[datetime.datetime], Dict[str, Any]]) -> PartitionScheduleDefinition:\n check.callable_param(fn, "fn")\n\n schedule_name = name or fn.__name__\n\n tags_fn_for_partition_value: Callable[\n ["Partition"], Optional[Dict[str, str]]\n ] = lambda partition: {}\n if tags_fn_for_date:\n tags_fn = cast(\n Callable[[datetime.datetime], Optional[Dict[str, str]]], tags_fn_for_date\n )\n tags_fn_for_partition_value = lambda partition: tags_fn(partition.value)\n\n fmt = DEFAULT_DATE_FORMAT\n\n partitions_def = ScheduleTimeBasedPartitionsDefinition(\n schedule_type=ScheduleType.WEEKLY,\n start=start_date,\n execution_time=execution_time,\n execution_day=execution_day_of_week,\n end=end_date,\n fmt=fmt,\n timezone=execution_timezone,\n offset=partition_weeks_offset,\n )\n\n partition_set = PartitionSetDefinition(\n name="{}_partitions".format(schedule_name),\n pipeline_name=pipeline_name, # type: ignore[arg-type]\n run_config_fn_for_partition=lambda partition: fn(partition.value),\n solid_selection=solid_selection,\n tags_fn_for_partition=tags_fn_for_partition_value,\n mode=mode,\n partitions_def=partitions_def,\n )\n\n schedule_def = partition_set.create_schedule_definition(\n schedule_name,\n partitions_def.get_cron_schedule(),\n should_execute=should_execute,\n environment_vars=environment_vars,\n partition_selector=create_offset_partition_selector(\n execution_time_to_partition_fn=partitions_def.get_execution_time_to_partition_fn(),\n ),\n execution_timezone=execution_timezone,\n description=description,\n decorated_fn=fn,\n default_status=default_status,\n )\n\n update_wrapper(schedule_def, wrapped=fn)\n return schedule_def\n\n return inner\n\n\ndef daily_schedule(\n pipeline_name: Optional[str],\n start_date: datetime.datetime,\n name: Optional[str] = None,\n execution_time: datetime.time = datetime.time(0, 0),\n tags_fn_for_date: Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]] = None,\n solid_selection: Optional[List[str]] = None,\n mode: Optional[str] = "default",\n should_execute: Optional[Callable[[ScheduleEvaluationContext], bool]] = None,\n environment_vars: Optional[Dict[str, str]] = None,\n end_date: Optional[datetime.datetime] = None,\n execution_timezone: Optional[str] = None,\n partition_days_offset: Optional[int] = 1,\n description: Optional[str] = None,\n default_status: DefaultScheduleStatus = DefaultScheduleStatus.STOPPED,\n) -> Callable[[Callable[[datetime.datetime], Dict[str, Any]]], PartitionScheduleDefinition]:\n """Create a partitioned schedule that runs daily.\n\n The decorated function should accept a datetime object as its only argument. 
The datetime\n represents the date partition that it's meant to run on.\n\n The decorated function should return a run configuration dictionary, which will be used as\n configuration for the scheduled run.\n\n The decorator produces a :py:class:`~dagster.PartitionScheduleDefinition`.\n\n Args:\n pipeline_name (str): The name of the pipeline to execute when the schedule runs.\n start_date (datetime.datetime): The date from which to run the schedule.\n name (Optional[str]): The name of the schedule to create.\n execution_time (datetime.time): The time at which to execute the schedule.\n tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A\n function that generates tags to attach to the schedules runs. Takes the date of the\n schedule run and returns a dictionary of tags (string key-value pairs).\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The pipeline mode in which to execute this schedule.\n (Default: 'default')\n should_execute (Optional[Callable[ScheduleEvaluationContext, bool]]): A function that runs at\n schedule execution tie to determine whether a schedule should execute or skip. Takes a\n :py:class:`~dagster.ScheduleEvaluationContext` and returns a boolean (``True`` if the\n schedule should execute). Defaults to a function that always returns ``True``.\n environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing\n the schedule.\n end_date (Optional[datetime.datetime]): The last time to run the schedule to, defaults to\n current time.\n execution_timezone (Optional[str]): Timezone in which the schedule should run.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n partition_days_offset (Optional[int]): How many days back to go when choosing the partition\n for a given schedule execution. For example, when partition_days_offset=1, the schedule\n that executes during day N will fill in the partition for day N-1.\n (Default: 1)\n description (Optional[str]): A human-readable description of the schedule.\n default_status (DefaultScheduleStatus): Whether the schedule starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n """\n check.opt_str_param(pipeline_name, "pipeline_name")\n check.inst_param(start_date, "start_date", datetime.datetime)\n check.opt_str_param(name, "name")\n check.inst_param(execution_time, "execution_time", datetime.time)\n check.opt_inst_param(end_date, "end_date", datetime.datetime)\n check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")\n check.opt_nullable_list_param(solid_selection, "solid_selection", of_type=str)\n mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)\n check.opt_callable_param(should_execute, "should_execute")\n check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str)\n check.opt_str_param(execution_timezone, "execution_timezone")\n check.opt_int_param(partition_days_offset, "partition_days_offset")\n check.opt_str_param(description, "description")\n check.inst_param(default_status, "default_status", DefaultScheduleStatus)\n\n if start_date.hour != 0 or start_date.minute != 0 or start_date.second != 0:\n warnings.warn(\n "`start_date` must be at the beginning of a day for a daily schedule. 
"\n "Use `execution_time` to execute the schedule at a specific time of day. For example, "\n "to run the schedule at 3AM each day starting on 10/20/2020, your schedule "\n "definition would look like:"\n """\n@daily_schedule(\n start_date=datetime.datetime(2020, 10, 20),\n execution_time=datetime.time(3, 0)\n):\ndef my_schedule_definition(_):\n ...\n"""\n )\n\n fmt = DEFAULT_DATE_FORMAT\n\n def inner(fn: Callable[[datetime.datetime], Dict[str, Any]]) -> PartitionScheduleDefinition:\n check.callable_param(fn, "fn")\n\n schedule_name = name or fn.__name__\n\n tags_fn_for_partition_value: Callable[\n ["Partition"], Optional[Dict[str, str]]\n ] = lambda partition: {}\n if tags_fn_for_date:\n tags_fn = cast(\n Callable[[datetime.datetime], Optional[Dict[str, str]]], tags_fn_for_date\n )\n tags_fn_for_partition_value = lambda partition: tags_fn(partition.value)\n\n partitions_def = ScheduleTimeBasedPartitionsDefinition(\n schedule_type=ScheduleType.DAILY,\n start=start_date,\n execution_time=execution_time,\n end=end_date,\n fmt=fmt,\n timezone=execution_timezone,\n offset=partition_days_offset,\n )\n\n partition_set = PartitionSetDefinition(\n name="{}_partitions".format(schedule_name),\n pipeline_name=pipeline_name, # type: ignore[arg-type]\n run_config_fn_for_partition=lambda partition: fn(partition.value),\n solid_selection=solid_selection,\n tags_fn_for_partition=tags_fn_for_partition_value,\n mode=mode,\n partitions_def=partitions_def,\n )\n\n schedule_def = partition_set.create_schedule_definition(\n schedule_name,\n partitions_def.get_cron_schedule(),\n should_execute=should_execute,\n environment_vars=environment_vars,\n partition_selector=create_offset_partition_selector(\n execution_time_to_partition_fn=partitions_def.get_execution_time_to_partition_fn(),\n ),\n execution_timezone=execution_timezone,\n description=description,\n decorated_fn=fn,\n default_status=default_status,\n )\n update_wrapper(schedule_def, wrapped=fn)\n return schedule_def\n\n return inner\n\n\ndef hourly_schedule(\n pipeline_name: Optional[str],\n start_date: datetime.datetime,\n name: Optional[str] = None,\n execution_time: datetime.time = datetime.time(0, 0),\n tags_fn_for_date: Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]] = None,\n solid_selection: Optional[List[str]] = None,\n mode: Optional[str] = "default",\n should_execute: Optional[Callable[[ScheduleEvaluationContext], bool]] = None,\n environment_vars: Optional[Dict[str, str]] = None,\n end_date: Optional[datetime.datetime] = None,\n execution_timezone: Optional[str] = None,\n partition_hours_offset: Optional[int] = 1,\n description: Optional[str] = None,\n default_status: DefaultScheduleStatus = DefaultScheduleStatus.STOPPED,\n) -> Callable[[Callable[[datetime.datetime], Dict[str, Any]]], PartitionScheduleDefinition]:\n """Create a partitioned schedule that runs hourly.\n\n The decorated function should accept a datetime object as its only argument. The datetime\n represents the date partition that it's meant to run on.\n\n The decorated function should return a run configuration dictionary, which will be used as\n configuration for the scheduled run.\n\n The decorator produces a :py:class:`~dagster.PartitionScheduleDefinition`.\n\n Args:\n pipeline_name (str): The name of the pipeline to execute when the schedule runs.\n start_date (datetime.datetime): The date from which to run the schedule.\n name (Optional[str]): The name of the schedule to create. 
By default, this will be the name\n of the decorated function.\n execution_time (datetime.time): The time at which to execute the schedule. Only the minutes\n component will be respected -- the hour should be 0, and will be ignored if it is not 0.\n tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A\n function that generates tags to attach to the schedules runs. Takes the date of the\n schedule run and returns a dictionary of tags (string key-value pairs).\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The pipeline mode in which to execute this schedule.\n (Default: 'default')\n should_execute (Optional[Callable[ScheduleEvaluationContext, bool]]): A function that runs at\n schedule execution tie to determine whether a schedule should execute or skip. Takes a\n :py:class:`~dagster.ScheduleEvaluationContext` and returns a boolean (``True`` if the\n schedule should execute). Defaults to a function that always returns ``True``.\n environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing\n the schedule.\n end_date (Optional[datetime.datetime]): The last time to run the schedule to, defaults to\n current time.\n execution_timezone (Optional[str]): Timezone in which the schedule should run.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n partition_hours_offset (Optional[int]): How many hours back to go when choosing the partition\n for a given schedule execution. For example, when partition_hours_offset=1, the schedule\n that executes during hour N will fill in the partition for hour N-1.\n (Default: 1)\n description (Optional[str]): A human-readable description of the schedule.\n default_status (DefaultScheduleStatus): Whether the schedule starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n """\n check.opt_str_param(name, "name")\n check.inst_param(start_date, "start_date", datetime.datetime)\n check.opt_inst_param(end_date, "end_date", datetime.datetime)\n check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")\n check.opt_nullable_list_param(solid_selection, "solid_selection", of_type=str)\n mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)\n check.opt_callable_param(should_execute, "should_execute")\n check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str)\n check.opt_str_param(pipeline_name, "pipeline_name")\n check.inst_param(execution_time, "execution_time", datetime.time)\n check.opt_str_param(execution_timezone, "execution_timezone")\n check.opt_int_param(partition_hours_offset, "partition_hours_offset")\n check.opt_str_param(description, "description")\n check.inst_param(default_status, "default_status", DefaultScheduleStatus)\n\n if start_date.minute != 0 or start_date.second != 0:\n warnings.warn(\n "`start_date` must be at the beginning of the hour for an hourly schedule. "\n "Use `execution_time` to execute the schedule at a specific time within the hour. 
For "\n "example, to run the schedule each hour at 15 minutes past the hour starting at 3AM "\n "on 10/20/2020, your schedule definition would look like:"\n """\n@hourly_schedule(\n start_date=datetime.datetime(2020, 10, 20, 3),\n execution_time=datetime.time(0, 15)\n):\ndef my_schedule_definition(_):\n ...\n"""\n )\n\n if execution_time.hour != 0:\n warnings.warn(\n "Hourly schedule {schedule_name} created with:\\n"\n "\\tschedule_time=datetime.time(hour={hour}, minute={minute}, ...)."\n "Since this is an hourly schedule, the hour parameter will be ignored and the schedule "\n "will run on the {minute} mark for the previous hour interval. Replace "\n "datetime.time(hour={hour}, minute={minute}, ...) with "\n "datetime.time(minute={minute}, ...) to fix this warning."\n )\n\n def inner(fn: Callable[[datetime.datetime], Dict[str, Any]]) -> PartitionScheduleDefinition:\n check.callable_param(fn, "fn")\n\n schedule_name = name or fn.__name__\n\n tags_fn_for_partition_value: Callable[\n ["Partition"], Optional[Dict[str, str]]\n ] = lambda partition: {}\n if tags_fn_for_date:\n tags_fn = cast(\n Callable[[datetime.datetime], Optional[Dict[str, str]]], tags_fn_for_date\n )\n tags_fn_for_partition_value = lambda partition: tags_fn(partition.value)\n\n fmt = (\n DEFAULT_HOURLY_FORMAT_WITH_TIMEZONE\n if execution_timezone\n else DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE\n )\n\n partitions_def = ScheduleTimeBasedPartitionsDefinition(\n schedule_type=ScheduleType.HOURLY,\n start=start_date,\n execution_time=execution_time,\n end=end_date,\n fmt=fmt,\n timezone=execution_timezone,\n offset=partition_hours_offset,\n )\n\n partition_set = PartitionSetDefinition(\n name="{}_partitions".format(schedule_name),\n pipeline_name=pipeline_name, # type: ignore[arg-type]\n run_config_fn_for_partition=lambda partition: fn(partition.value),\n solid_selection=solid_selection,\n tags_fn_for_partition=tags_fn_for_partition_value,\n mode=mode,\n partitions_def=partitions_def,\n )\n\n schedule_def = partition_set.create_schedule_definition(\n schedule_name,\n partitions_def.get_cron_schedule(),\n should_execute=should_execute,\n environment_vars=environment_vars,\n partition_selector=create_offset_partition_selector(\n execution_time_to_partition_fn=partitions_def.get_execution_time_to_partition_fn(),\n ),\n execution_timezone=execution_timezone,\n description=description,\n decorated_fn=fn,\n default_status=default_status,\n )\n\n update_wrapper(schedule_def, wrapped=fn)\n return schedule_def\n\n return inner\n
", "current_page_name": "_modules/dagster/core/definitions/decorators/schedule_decorator", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.decorators.schedule_decorator"}, "sensor_decorator": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.decorators.sensor_decorator

\nimport inspect\nfrom functools import update_wrapper\nfrom typing import TYPE_CHECKING, Callable, List, Optional, Sequence, Union\n\nimport dagster._check as check\nfrom dagster.core.definitions.sensor_definition import DefaultSensorStatus\n\nfrom ...errors import DagsterInvariantViolationError\nfrom ..events import AssetKey\nfrom ..graph_definition import GraphDefinition\nfrom ..job_definition import JobDefinition\nfrom ..sensor_definition import (\n    AssetMaterializationFunction,\n    AssetSensorDefinition,\n    RawSensorEvaluationFunction,\n    RunRequest,\n    SensorDefinition,\n    SkipReason,\n)\n\nif TYPE_CHECKING:\n    from ...events.log import EventLogEntry\n\n\n
[docs]def sensor(\n pipeline_name: Optional[str] = None,\n name: Optional[str] = None,\n solid_selection: Optional[List[str]] = None,\n mode: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[Union[GraphDefinition, JobDefinition]] = None,\n jobs: Optional[Sequence[Union[GraphDefinition, JobDefinition]]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n) -> Callable[[RawSensorEvaluationFunction], SensorDefinition]:\n """\n Creates a sensor where the decorated function is used as the sensor's evaluation function. The\n decorated function may:\n\n 1. Return a `RunRequest` object.\n 2. Return a list of `RunRequest` objects.\n 3. Return a `SkipReason` object, providing a descriptive message of why no runs were requested.\n 4. Return nothing (skipping without providing a reason)\n 5. Yield a `SkipReason` or yield one ore more `RunRequest` objects.\n\n Takes a :py:class:`~dagster.SensorEvaluationContext`.\n\n Args:\n pipeline_name (Optional[str]): (legacy) Name of the target pipeline. Cannot be used in\n conjunction with `job` or `jobs` parameters.\n name (Optional[str]): The name of the sensor. Defaults to the name of the decorated\n function.\n solid_selection (Optional[List[str]]): (legacy) A list of solid subselection (including single\n solid names) to execute for runs for this sensor e.g.\n ``['*some_solid+', 'other_solid']``.\n Cannot be used in conjunction with `job` or `jobs` parameters.\n mode (Optional[str]): (legacy) The mode to apply when executing runs for this sensor. Cannot be used\n in conjunction with `job` or `jobs` parameters.\n (default: 'default')\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[Union[GraphDefinition, JobDefinition]]): The job to be executed when the sensor fires.\n jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition]]]): (experimental) A list of jobs to be executed when the sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n """\n check.opt_str_param(name, "name")\n\n def inner(fn: RawSensorEvaluationFunction) -> SensorDefinition:\n check.callable_param(fn, "fn")\n\n sensor_def = SensorDefinition(\n name=name,\n pipeline_name=pipeline_name,\n evaluation_fn=fn,\n solid_selection=solid_selection,\n mode=mode,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n )\n\n update_wrapper(sensor_def, wrapped=fn)\n\n return sensor_def\n\n return inner
\n\n\n
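# --- Hedged usage sketch (not part of the rendered module source) ---
# A sensor that targets a job and yields either a RunRequest or a SkipReason, per the
# evaluation-function contract documented above. The job, op, and file path are
# hypothetical.
import os

from dagster import RunRequest, SkipReason, job, op, sensor


@op
def do_work():
    ...


@job
def my_job():
    do_work()


@sensor(job=my_job, minimum_interval_seconds=30)
def my_file_sensor(context):
    if os.path.exists("/tmp/trigger_file"):
        yield RunRequest(run_key="trigger_file", run_config={})
    else:
        yield SkipReason("No trigger file present.")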
[docs]def asset_sensor(\n asset_key: AssetKey,\n pipeline_name: Optional[str] = None,\n name: Optional[str] = None,\n solid_selection: Optional[List[str]] = None,\n mode: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[Union[GraphDefinition, JobDefinition]] = None,\n jobs: Optional[Sequence[Union[GraphDefinition, JobDefinition]]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n) -> Callable[[AssetMaterializationFunction,], AssetSensorDefinition,]:\n """\n Creates an asset sensor where the decorated function is used as the asset sensor's evaluation\n function. The decorated function may:\n\n 1. Return a `RunRequest` object.\n 2. Return a list of `RunRequest` objects.\n 3. Return a `SkipReason` object, providing a descriptive message of why no runs were requested.\n 4. Return nothing (skipping without providing a reason)\n 5. Yield a `SkipReason` or yield one ore more `RunRequest` objects.\n\n Takes a :py:class:`~dagster.SensorEvaluationContext` and an EventLogEntry corresponding to an\n AssetMaterialization event.\n\n Args:\n asset_key (AssetKey): The asset_key this sensor monitors.\n pipeline_name (Optional[str]): (legacy) Name of the target pipeline. Cannot be used in conjunction with `job` or `jobs` parameters.\n name (Optional[str]): The name of the sensor. Defaults to the name of the decorated\n function.\n solid_selection (Optional[List[str]]): (legacy) A list of solid subselection (including single\n solid names) to execute for runs for this sensor e.g.\n ``['*some_solid+', 'other_solid']``. Cannot be used in conjunction with `job` or `jobs`\n parameters.\n mode (Optional[str]): (legacy) The mode to apply when executing runs for this sensor. Cannot be used\n in conjunction with `job` or `jobs` parameters.\n (default: 'default')\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[Union[GraphDefinition, JobDefinition]]): The job to be executed when the sensor fires.\n jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition]]]): (experimental) A list of jobs to be executed when the sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n """\n\n check.opt_str_param(name, "name")\n\n def inner(fn: AssetMaterializationFunction) -> AssetSensorDefinition:\n check.callable_param(fn, "fn")\n sensor_name = name or fn.__name__\n\n def _wrapped_fn(context, event):\n result = fn(context, event)\n\n if inspect.isgenerator(result) or isinstance(result, list):\n for item in result:\n yield item\n elif isinstance(result, (RunRequest, SkipReason)):\n yield result\n\n elif result is not None:\n raise DagsterInvariantViolationError(\n (\n "Error in sensor {sensor_name}: Sensor unexpectedly returned output "\n "{result} of type {type_}. Should only return SkipReason or "\n "RunRequest objects."\n ).format(sensor_name=sensor_name, result=result, type_=type(result))\n )\n\n return AssetSensorDefinition(\n name=sensor_name,\n asset_key=asset_key,\n pipeline_name=pipeline_name,\n asset_materialization_fn=_wrapped_fn,\n solid_selection=solid_selection,\n mode=mode,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n )\n\n return inner
\n
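# --- Hedged usage sketch (not part of the rendered module source) ---
# An asset sensor that requests a run of a downstream job whenever the monitored asset
# is materialized. The asset key "my_table" and the job are hypothetical.
from dagster import AssetKey, RunRequest, asset_sensor, job, op


@op
def refresh_report():
    ...


@job
def report_job():
    refresh_report()


@asset_sensor(asset_key=AssetKey("my_table"), job=report_job)
def my_table_sensor(context, asset_event):
    # asset_event is the EventLogEntry for the AssetMaterialization, as described above.
    yield RunRequest(run_key=context.cursor, run_config={})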
", "current_page_name": "_modules/dagster/core/definitions/decorators/sensor_decorator", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.decorators.sensor_decorator"}, "solid_decorator": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.decorators.solid_decorator

\nfrom functools import lru_cache, update_wrapper\nfrom typing import (\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    List,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Union,\n    cast,\n    overload,\n)\n\nimport dagster._check as check\nfrom dagster.config.config_schema import ConfigSchemaType\nfrom dagster.core.decorator_utils import format_docstring_for_description\nfrom dagster.core.errors import DagsterInvalidDefinitionError\nfrom dagster.core.types.dagster_type import DagsterTypeKind\nfrom dagster.seven import funcsigs\n\nfrom ...decorator_utils import (\n    get_function_params,\n    get_valid_name_permutations,\n    param_is_var_keyword,\n    positional_arg_name_list,\n)\nfrom ..inference import infer_input_props, infer_output_props\nfrom ..input import InputDefinition\nfrom ..output import OutputDefinition\nfrom ..policy import RetryPolicy\nfrom ..solid_definition import SolidDefinition\n\n\nclass DecoratedSolidFunction(NamedTuple):\n    """Wrapper around the decorated solid function to provide commonly used util methods"""\n\n    decorated_fn: Callable[..., Any]\n\n    @lru_cache(maxsize=1)\n    def has_context_arg(self) -> bool:\n        return is_context_provided(get_function_params(self.decorated_fn))\n\n    @lru_cache(maxsize=1)\n    def _get_function_params(self) -> List[funcsigs.Parameter]:\n        return get_function_params(self.decorated_fn)\n\n    def positional_inputs(self) -> List[str]:\n        params = self._get_function_params()\n        input_args = params[1:] if self.has_context_arg() else params\n        return positional_arg_name_list(input_args)\n\n    def has_var_kwargs(self) -> bool:\n        params = self._get_function_params()\n        # var keyword arg has to be the last argument\n        return len(params) > 0 and param_is_var_keyword(params[-1])\n\n\nclass NoContextDecoratedSolidFunction(DecoratedSolidFunction):\n    """Wrapper around a decorated solid function, when the decorator does not permit a context\n    parameter (such as lambda_solid).\n    """\n\n    @lru_cache(maxsize=1)\n    def has_context_arg(self) -> bool:\n        return False\n\n\nclass _Solid:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        input_defs: Optional[Sequence[InputDefinition]] = None,\n        output_defs: Optional[Sequence[OutputDefinition]] = None,\n        description: Optional[str] = None,\n        required_resource_keys: Optional[AbstractSet[str]] = None,\n        config_schema: Optional[ConfigSchemaType] = None,\n        tags: Optional[Dict[str, Any]] = None,\n        version: Optional[str] = None,\n        decorator_takes_context: Optional[bool] = True,\n        retry_policy: Optional[RetryPolicy] = None,\n    ):\n        self.name = check.opt_str_param(name, "name")\n        self.input_defs = check.opt_list_param(input_defs, "input_defs", InputDefinition)\n        self.output_defs = check.opt_nullable_sequence_param(\n            output_defs, "output_defs", OutputDefinition\n        )\n        self.decorator_takes_context = check.bool_param(\n            decorator_takes_context, "decorator_takes_context"\n        )\n\n        self.description = check.opt_str_param(description, "description")\n\n        # these will be checked within SolidDefinition\n        self.required_resource_keys = required_resource_keys\n        self.tags = tags\n        self.version = version\n        self.retry_policy = retry_policy\n\n        # config will be checked within SolidDefinition\n        self.config_schema = config_schema\n\n   
 def __call__(self, fn: Callable[..., Any]) -> SolidDefinition:\n        check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        output_defs: Sequence[OutputDefinition]\n        if self.output_defs is None:\n            output_defs = [OutputDefinition.create_from_inferred(infer_output_props(fn))]\n        elif len(self.output_defs) == 1:\n            output_defs = [self.output_defs[0].combine_with_inferred(infer_output_props(fn))]\n        else:\n            output_defs = self.output_defs\n\n        compute_fn = (\n            DecoratedSolidFunction(decorated_fn=fn)\n            if self.decorator_takes_context\n            else NoContextDecoratedSolidFunction(decorated_fn=fn)\n        )\n\n        resolved_input_defs = resolve_checked_solid_fn_inputs(\n            decorator_name="@solid",\n            fn_name=self.name,\n            compute_fn=compute_fn,\n            explicit_input_defs=self.input_defs,\n            exclude_nothing=True,\n        )\n\n        solid_def = SolidDefinition(\n            name=self.name,\n            input_defs=resolved_input_defs,\n            output_defs=output_defs,\n            compute_fn=compute_fn,\n            config_schema=self.config_schema,\n            description=self.description or format_docstring_for_description(fn),\n            required_resource_keys=self.required_resource_keys,\n            tags=self.tags,\n            version=self.version,\n            retry_policy=self.retry_policy,\n        )\n        update_wrapper(solid_def, compute_fn.decorated_fn)\n        return solid_def\n\n\n@overload\ndef solid(name: Callable[..., Any]) -> SolidDefinition:\n    ...\n\n\n@overload\ndef solid(\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    input_defs: Optional[Sequence[InputDefinition]] = ...,\n    output_defs: Optional[Sequence[OutputDefinition]] = ...,\n    config_schema: Optional[ConfigSchemaType] = ...,\n    required_resource_keys: Optional[AbstractSet[str]] = ...,\n    tags: Optional[Dict[str, Any]] = ...,\n    version: Optional[str] = ...,\n    retry_policy: Optional[RetryPolicy] = ...,\n) -> Union[_Solid, SolidDefinition]:\n    ...\n\n\n
[docs]def solid(\n name: Optional[Union[Callable[..., Any], str]] = None,\n description: Optional[str] = None,\n input_defs: Optional[Sequence[InputDefinition]] = None,\n output_defs: Optional[Sequence[OutputDefinition]] = None,\n config_schema: Optional[ConfigSchemaType] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n tags: Optional[Dict[str, Any]] = None,\n version: Optional[str] = None,\n retry_policy: Optional[RetryPolicy] = None,\n) -> Union[_Solid, SolidDefinition]:\n """Create a solid with the specified parameters from the decorated function.\n\n This shortcut simplifies the core :class:`SolidDefinition` API by exploding arguments into\n kwargs of the decorated compute function and omitting additional parameters when they are not\n needed.\n\n Input and output definitions will be inferred from the type signature of the decorated function\n if not explicitly provided.\n\n The decorated function will be used as the solid's compute function. The signature of the\n decorated function is more flexible than that of the ``compute_fn`` in the core API; it may:\n\n 1. Return a value. This value will be wrapped in an :py:class:`Output` and yielded by the compute function.\n 2. Return an :py:class:`Output`. This output will be yielded by the compute function.\n 3. Yield :py:class:`Output` or other :ref:`event objects <events>`. Same as default compute behavior.\n\n Note that options 1) and 2) are incompatible with yielding other events -- if you would like\n to decorate a function that yields events, it must also wrap its eventual output in an\n :py:class:`Output` and yield it.\n\n @solid supports ``async def`` functions as well, including async generators when yielding multiple\n events or outputs. Note that async solids will generally be run on their own unless using a custom\n :py:class:`Executor` implementation that supports running them together.\n\n Args:\n name (Optional[str]): Name of solid. Must be unique within any :py:class:`PipelineDefinition`\n using the solid.\n description (Optional[str]): Human-readable description of this solid. If not provided, and\n the decorated function has docstring, that docstring will be used as the description.\n input_defs (Optional[List[InputDefinition]]):\n Information about the inputs to the solid. Information provided here will be combined\n with what can be inferred from the function signature, with these explicit InputDefinitions\n taking precedence.\n output_defs (Optional[List[OutputDefinition]]):\n Information about the solids outputs. Information provided here will be combined with\n what can be inferred from the return type signature if there is only one OutputDefinition\n and the function does not use yield.\n config_schema (Optional[ConfigSchema): The schema for the config. If set, Dagster will check\n that config provided for the solid matches this schema and fail if it does not. If not\n set, Dagster will accept any config provided for the solid.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by this solid.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the solid. Frameworks may\n expect and require certain metadata to be attached to a solid. Values that are not strings\n will be json encoded and must meet the criteria that `json.loads(json.dumps(value)) == value`.\n version (Optional[str]): (Experimental) The version of the solid's compute_fn. 
Two solids should have\n the same version if and only if they deterministically produce the same outputs when\n provided the same inputs.\n retry_policy (Optional[RetryPolicy]): The retry policy for this solid.\n\n\n Examples:\n\n .. code-block:: python\n\n @solid\n def hello_world():\n print('hello')\n\n @solid\n def hello_world():\n return {'foo': 'bar'}\n\n @solid\n def hello_world():\n return Output(value={'foo': 'bar'})\n\n @solid\n def hello_world():\n yield Output(value={'foo': 'bar'})\n\n @solid\n def hello_world(foo):\n return foo\n\n @solid(\n input_defs=[InputDefinition(name="foo", str)],\n output_defs=[OutputDefinition(str)]\n )\n def hello_world(foo):\n # explicitly type and name inputs and outputs\n return foo\n\n @solid\n def hello_world(foo: str) -> str:\n # same as above inferred from signature\n return foo\n\n @solid\n def hello_world(context, foo):\n context.log.info('log something')\n return foo\n\n @solid(\n config_schema={'str_value' : Field(str)}\n )\n def hello_world(context, foo):\n # context.solid_config is a dictionary with 'str_value' key\n return foo + context.solid_config['str_value']\n\n """\n # This case is for when decorator is used bare, without arguments. e.g. @solid versus @solid()\n if callable(name):\n check.invariant(input_defs is None)\n check.invariant(output_defs is None)\n check.invariant(description is None)\n check.invariant(config_schema is None)\n check.invariant(required_resource_keys is None)\n check.invariant(tags is None)\n check.invariant(version is None)\n\n return _Solid()(name)\n\n return _Solid(\n name=name,\n input_defs=input_defs,\n output_defs=output_defs,\n config_schema=config_schema,\n description=description,\n required_resource_keys=required_resource_keys,\n tags=tags,\n version=version,\n retry_policy=retry_policy,\n )
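# --- Hedged sketch (not part of the rendered module source) ---
# The docstring above notes that @solid also accepts ``async def`` compute functions;
# this is a minimal illustration, with asyncio.sleep standing in for real async work.
import asyncio

from dagster import solid


@solid
async def async_hello_world():
    await asyncio.sleep(0.1)
    return "hello"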
\n\n\ndef resolve_checked_solid_fn_inputs(\n decorator_name: str,\n fn_name: str,\n compute_fn: DecoratedSolidFunction,\n explicit_input_defs: List[InputDefinition],\n exclude_nothing: bool,\n) -> List[InputDefinition]:\n """\n Validate provided input definitions and infer the remaining from the type signature of the compute_fn.\n Returns the resolved set of InputDefinitions.\n\n Args:\n decorator_name (str): Name of the decorator that is wrapping the op/solid function.\n fn_name (str): Name of the decorated function.\n compute_fn (DecoratedSolidFunction): The decorated function, wrapped in the\n DecoratedSolidFunction wrapper.\n explicit_input_defs (List[InputDefinition]): The input definitions that were explicitly\n provided in the decorator.\n exclude_nothing (bool): True if Nothing type inputs should be excluded from compute_fn\n arguments.\n """\n\n if exclude_nothing:\n explicit_names = set(\n inp.name\n for inp in explicit_input_defs\n if not inp.dagster_type.kind == DagsterTypeKind.NOTHING\n )\n nothing_names = set(\n inp.name\n for inp in explicit_input_defs\n if inp.dagster_type.kind == DagsterTypeKind.NOTHING\n )\n else:\n explicit_names = set(inp.name for inp in explicit_input_defs)\n nothing_names = set()\n\n params = get_function_params(compute_fn.decorated_fn)\n\n input_args = params[1:] if compute_fn.has_context_arg() else params\n\n # Validate input arguments\n used_inputs = set()\n inputs_to_infer = set()\n has_kwargs = False\n\n for param in cast(List[funcsigs.Parameter], input_args):\n if param.kind == funcsigs.Parameter.VAR_KEYWORD:\n has_kwargs = True\n elif param.kind == funcsigs.Parameter.VAR_POSITIONAL:\n raise DagsterInvalidDefinitionError(\n f"{decorator_name} '{fn_name}' decorated function has positional vararg parameter "\n f"'{param}'. {decorator_name} decorated functions should only have keyword "\n "arguments that match input names and, if system information is required, a first "\n "positional parameter named 'context'."\n )\n\n else:\n if param.name not in explicit_names:\n if param.name in nothing_names:\n raise DagsterInvalidDefinitionError(\n f"{decorator_name} '{fn_name}' decorated function has parameter '{param.name}' that is "\n "one of the input_defs of type 'Nothing' which should not be included since "\n "no data will be passed for it. "\n )\n else:\n inputs_to_infer.add(param.name)\n\n else:\n used_inputs.add(param.name)\n\n undeclared_inputs = explicit_names - used_inputs\n if not has_kwargs and undeclared_inputs:\n undeclared_inputs_printed = ", '".join(undeclared_inputs)\n raise DagsterInvalidDefinitionError(\n f"{decorator_name} '{fn_name}' decorated function does not have parameter(s) "\n f"'{undeclared_inputs_printed}', which are in provided input_defs. 
{decorator_name} "\n "decorated functions should only have keyword arguments that match input names and, if "\n "system information is required, a first positional parameter named 'context'."\n )\n\n inferred_props = {\n inferred.name: inferred\n for inferred in infer_input_props(compute_fn.decorated_fn, compute_fn.has_context_arg())\n }\n input_defs = []\n for input_def in explicit_input_defs:\n if input_def.name in inferred_props:\n # combine any information missing on the explicit def that can be inferred\n input_defs.append(\n input_def.combine_with_inferred(\n inferred_props[input_def.name], decorator_name=decorator_name\n )\n )\n else:\n # pass through those that don't have any inference info, such as Nothing type inputs\n input_defs.append(input_def)\n\n # build defs from the inferred props for those without explicit entries\n input_defs.extend(\n InputDefinition.create_from_inferred(inferred, decorator_name=decorator_name)\n for inferred in inferred_props.values()\n if inferred.name in inputs_to_infer\n )\n\n return input_defs\n\n\ndef is_context_provided(params: List[funcsigs.Parameter]) -> bool:\n if len(params) == 0:\n return False\n return params[0].name in get_valid_name_permutations("context")\n\n\ndef lambda_solid(\n name: Optional[Union[str, Callable[..., Any]]] = None,\n description: Optional[str] = None,\n input_defs: Optional[List[InputDefinition]] = None,\n output_def: Optional[OutputDefinition] = None,\n) -> Union[_Solid, SolidDefinition]:\n """Create a simple solid from the decorated function.\n\n This shortcut allows the creation of simple solids that do not require\n configuration and whose implementations do not require a\n :py:class:`context <SolidExecutionContext>`.\n\n Lambda solids take any number of inputs and produce a single output.\n\n Inputs can be defined using :class:`InputDefinition` and passed to the ``input_defs`` argument\n of this decorator, or inferred from the type signature of the decorated function.\n\n The single output can be defined using :class:`OutputDefinition` and passed as the\n ``output_def`` argument of this decorator, or its type can be inferred from the type signature\n of the decorated function.\n\n The body of the decorated function should return a single value, which will be yielded as the\n solid's output.\n\n Args:\n name (str): Name of solid.\n description (str): Solid description.\n input_defs (List[InputDefinition]): List of input_defs.\n output_def (OutputDefinition): The output of the solid. Defaults to\n :class:`OutputDefinition() <OutputDefinition>`.\n\n Examples:\n\n .. code-block:: python\n\n @lambda_solid\n def hello_world():\n return 'hello'\n\n @lambda_solid(\n input_defs=[InputDefinition(name='foo', str)],\n output_def=OutputDefinition(str)\n )\n def hello_world(foo):\n # explicitly type and name inputs and outputs\n return foo\n\n @lambda_solid\n def hello_world(foo: str) -> str:\n # same as above inferred from signature\n return foo\n\n """\n if callable(name):\n check.invariant(input_defs is None)\n check.invariant(description is None)\n return _Solid(\n output_defs=[output_def] if output_def else None, decorator_takes_context=False\n )(name)\n\n return _Solid(\n name=name,\n input_defs=input_defs,\n output_defs=[output_def] if output_def else None,\n description=description,\n decorator_takes_context=False,\n )\n
", "current_page_name": "_modules/dagster/core/definitions/decorators/solid_decorator", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.decorators.solid_decorator"}}, "dependency": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.dependency

\nfrom abc import ABC, abstractmethod\nfrom collections import defaultdict\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    List,\n    NamedTuple,\n    Optional,\n    Tuple,\n    Type,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster.core.definitions.policy import RetryPolicy\nfrom dagster.core.errors import DagsterInvalidDefinitionError\nfrom dagster.serdes.serdes import (\n    DefaultNamedTupleSerializer,\n    WhitelistMap,\n    register_serdes_tuple_fallbacks,\n    whitelist_for_serdes,\n)\nfrom dagster.utils import frozentags\n\nfrom .hook_definition import HookDefinition\nfrom .input import FanInInputPointer, InputDefinition, InputMapping, InputPointer\nfrom .output import OutputDefinition\nfrom .utils import DEFAULT_OUTPUT, struct_to_string, validate_tags\n\nif TYPE_CHECKING:\n    from .composition import MappedInputPlaceholder\n    from .graph_definition import GraphDefinition\n    from .node_definition import NodeDefinition\n\n\n
[docs]class NodeInvocation(\n NamedTuple(\n "Node",\n [\n ("name", str),\n ("alias", Optional[str]),\n ("tags", Dict[str, Any]),\n ("hook_defs", AbstractSet[HookDefinition]),\n ("retry_policy", Optional[RetryPolicy]),\n ],\n )\n):\n """Identifies an instance of a node in a graph dependency structure.\n\n Args:\n name (str): Name of the solid of which this is an instance.\n alias (Optional[str]): Name specific to this instance of the solid. Necessary when there are\n multiple instances of the same solid.\n tags (Optional[Dict[str, Any]]): Optional tags values to extend or override those\n set on the solid definition.\n hook_defs (Optional[AbstractSet[HookDefinition]]): A set of hook definitions applied to the\n solid instance.\n\n Examples:\n\n In general, users should prefer not to construct this class directly or use the\n :py:class:`JobDefinition` API that requires instances of this class. Instead, use the\n :py:func:`@job <job>` API:\n\n .. code-block:: python\n\n from dagster import job\n\n @job\n def my_job():\n other_name = some_op.alias('other_name')\n some_graph(other_name(some_op))\n\n """\n\n def __new__(\n cls,\n name: str,\n alias: Optional[str] = None,\n tags: Optional[Dict[str, str]] = None,\n hook_defs: Optional[AbstractSet[HookDefinition]] = None,\n retry_policy: Optional[RetryPolicy] = None,\n ):\n return super().__new__(\n cls,\n name=check.str_param(name, "name"),\n alias=check.opt_str_param(alias, "alias"),\n tags=frozentags(check.opt_dict_param(tags, "tags", value_type=str, key_type=str)),\n hook_defs=frozenset(\n check.opt_set_param(hook_defs, "hook_defs", of_type=HookDefinition)\n ),\n retry_policy=check.opt_inst_param(retry_policy, "retry_policy", RetryPolicy),\n )
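# --- Hedged sketch (not part of the rendered module source) ---
# Constructing a graph whose dependency dictionary is keyed by NodeInvocation, aliasing
# the same op twice. All names are hypothetical; as the docstring notes, most users
# should prefer the @job / @graph APIs over building this structure directly.
from dagster import DependencyDefinition, GraphDefinition, NodeInvocation, op


@op
def emit():
    return 1


@op
def add_one(num):
    return num + 1


aliased_graph = GraphDefinition(
    name="aliased_graph",
    node_defs=[emit, add_one],
    dependencies={
        NodeInvocation("add_one", alias="first_add"): {"num": DependencyDefinition("emit")},
        NodeInvocation("add_one", alias="second_add"): {"num": DependencyDefinition("first_add")},
    },
)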
\n\n\nSolidInvocation = NodeInvocation\n\n\nclass Node:\n """\n Node invocation within a graph. Identified by its name inside the graph.\n """\n\n def __init__(\n self,\n name: str,\n definition: "NodeDefinition",\n graph_definition: "GraphDefinition",\n tags: Optional[Dict[str, str]] = None,\n hook_defs: Optional[AbstractSet[HookDefinition]] = None,\n retry_policy: Optional[RetryPolicy] = None,\n ):\n from .graph_definition import GraphDefinition\n from .solid_definition import NodeDefinition\n\n self.name = check.str_param(name, "name")\n self.definition = check.inst_param(definition, "definition", NodeDefinition)\n self.graph_definition = check.inst_param(\n graph_definition,\n "graph_definition",\n GraphDefinition,\n )\n self._additional_tags = validate_tags(tags)\n self._hook_defs = check.opt_set_param(hook_defs, "hook_defs", of_type=HookDefinition)\n self._retry_policy = check.opt_inst_param(retry_policy, "retry_policy", RetryPolicy)\n\n input_handles = {}\n for name, input_def in self.definition.input_dict.items():\n input_handles[name] = SolidInputHandle(self, input_def)\n\n self._input_handles = input_handles\n\n output_handles = {}\n for name, output_def in self.definition.output_dict.items():\n output_handles[name] = SolidOutputHandle(self, output_def)\n\n self._output_handles = output_handles\n\n def input_handles(self):\n return self._input_handles.values()\n\n def output_handles(self):\n return self._output_handles.values()\n\n def input_handle(self, name: str) -> "SolidInputHandle":\n check.str_param(name, "name")\n return self._input_handles[name]\n\n def output_handle(self, name: str) -> "SolidOutputHandle":\n check.str_param(name, "name")\n return self._output_handles[name]\n\n def has_input(self, name: str) -> bool:\n return self.definition.has_input(name)\n\n def input_def_named(self, name: str) -> InputDefinition:\n return self.definition.input_def_named(name)\n\n def has_output(self, name: str) -> bool:\n return self.definition.has_output(name)\n\n def output_def_named(self, name: str) -> OutputDefinition:\n return self.definition.output_def_named(name)\n\n @property\n def is_graph(self) -> bool:\n from .graph_definition import GraphDefinition\n\n return isinstance(self.definition, GraphDefinition)\n\n def describe_node(self) -> str:\n from .op_definition import OpDefinition\n from .solid_definition import CompositeSolidDefinition, SolidDefinition\n\n if isinstance(self.definition, CompositeSolidDefinition):\n return f"composite solid '{self.name}'"\n elif isinstance(self.definition, OpDefinition):\n return f"op '{self.name}'"\n elif isinstance(self.definition, SolidDefinition):\n return f"solid '{self.name}'"\n else:\n return f"graph '{self.name}'"\n\n @property\n def input_dict(self):\n return self.definition.input_dict\n\n @property\n def output_dict(self):\n return self.definition.output_dict\n\n @property\n def tags(self) -> frozentags:\n return self.definition.tags.updated_with(self._additional_tags)\n\n def container_maps_input(self, input_name: str) -> bool:\n return (\n self.graph_definition.input_mapping_for_pointer(InputPointer(self.name, input_name))\n is not None\n )\n\n def container_mapped_input(self, input_name: str) -> InputMapping:\n mapping = self.graph_definition.input_mapping_for_pointer(\n InputPointer(self.name, input_name)\n )\n if mapping is None:\n check.failed(\n f"container does not map input {input_name}, check container_maps_input first"\n )\n return mapping\n\n def container_maps_fan_in_input(self, input_name: str, fan_in_index: int) -> 
bool:\n return (\n self.graph_definition.input_mapping_for_pointer(\n FanInInputPointer(self.name, input_name, fan_in_index)\n )\n is not None\n )\n\n def container_mapped_fan_in_input(self, input_name: str, fan_in_index: int) -> InputMapping:\n mapping = self.graph_definition.input_mapping_for_pointer(\n FanInInputPointer(self.name, input_name, fan_in_index)\n )\n if mapping is None:\n check.failed(\n f"container does not map fan-in {input_name} idx {fan_in_index}, check "\n "container_maps_fan_in_input first"\n )\n\n return mapping\n\n @property\n def hook_defs(self) -> AbstractSet[HookDefinition]:\n return self._hook_defs\n\n @property\n def retry_policy(self) -> Optional[RetryPolicy]:\n return self._retry_policy\n\n\nclass NodeHandleSerializer(DefaultNamedTupleSerializer):\n @classmethod\n def value_to_storage_dict(\n cls,\n value: NamedTuple,\n whitelist_map: WhitelistMap,\n descent_path: str,\n ) -> Dict[str, Any]:\n storage = super().value_to_storage_dict(\n value,\n whitelist_map,\n descent_path,\n )\n # persist using legacy name SolidHandle\n storage["__class__"] = "SolidHandle"\n return storage\n\n\n@whitelist_for_serdes(serializer=NodeHandleSerializer)\nclass NodeHandle(\n # mypy does not yet support recursive types\n # NamedTuple("_NodeHandle", [("name", str), ("parent", Optional["NodeHandle"])])\n NamedTuple("_NodeHandle", [("name", str), ("parent", Any)])\n):\n """\n A structured object to identify nodes in the potentially recursive graph structure.\n """\n\n def __new__(cls, name: str, parent: Optional["NodeHandle"]):\n return super(NodeHandle, cls).__new__(\n cls,\n check.str_param(name, "name"),\n check.opt_inst_param(parent, "parent", NodeHandle),\n )\n\n def __str__(self):\n return self.to_string()\n\n @property\n def path(self) -> List[str]:\n """Return a list representation of the handle.\n\n Inverse of NodeHandle.from_path.\n\n Returns:\n List[str]:\n """\n path = []\n cur = self\n while cur:\n path.append(cur.name)\n cur = cur.parent\n path.reverse()\n return path\n\n def to_string(self) -> str:\n """Return a unique string representation of the handle.\n\n Inverse of NodeHandle.from_string.\n """\n return self.parent.to_string() + "." + self.name if self.parent else self.name\n\n def is_or_descends_from(self, handle: "NodeHandle") -> bool:\n """Check if the handle is or descends from another handle.\n\n Args:\n handle (NodeHandle): The handle to check against.\n\n Returns:\n bool:\n """\n check.inst_param(handle, "handle", NodeHandle)\n\n for idx in range(len(handle.path)):\n if idx >= len(self.path):\n return False\n if self.path[idx] != handle.path[idx]:\n return False\n return True\n\n def pop(self, ancestor: "NodeHandle") -> Optional["NodeHandle"]:\n """Return a copy of the handle with some of its ancestors pruned.\n\n Args:\n ancestor (NodeHandle): Handle to an ancestor of the current handle.\n\n Returns:\n NodeHandle:\n\n Example:\n\n .. 
code-block:: python\n\n handle = NodeHandle('baz', NodeHandle('bar', NodeHandle('foo', None)))\n ancestor = NodeHandle('bar', NodeHandle('foo', None))\n assert handle.pop(ancestor) == NodeHandle('baz', None)\n """\n\n check.inst_param(ancestor, "ancestor", NodeHandle)\n check.invariant(\n self.is_or_descends_from(ancestor),\n "Handle {handle} does not descend from {ancestor}".format(\n handle=self.to_string(), ancestor=ancestor.to_string()\n ),\n )\n\n return NodeHandle.from_path(self.path[len(ancestor.path) :])\n\n def with_ancestor(self, ancestor: "NodeHandle") -> Optional["NodeHandle"]:\n """Returns a copy of the handle with an ancestor grafted on.\n\n Args:\n ancestor (NodeHandle): Handle to the new ancestor.\n\n Returns:\n NodeHandle:\n\n Example:\n\n .. code-block:: python\n\n handle = NodeHandle('baz', NodeHandle('bar', NodeHandle('foo', None)))\n ancestor = NodeHandle('quux' None)\n assert handle.with_ancestor(ancestor) == NodeHandle(\n 'baz', NodeHandle('bar', NodeHandle('foo', NodeHandle('quux', None)))\n )\n """\n check.opt_inst_param(ancestor, "ancestor", NodeHandle)\n\n return NodeHandle.from_path((ancestor.path if ancestor else []) + self.path)\n\n @staticmethod\n def from_path(path: List[str]) -> "NodeHandle":\n check.list_param(path, "path", of_type=str)\n\n cur: Optional["NodeHandle"] = None\n while len(path) > 0:\n cur = NodeHandle(name=path.pop(0), parent=cur)\n\n if cur is None:\n check.failed(f"Invalid handle path {path}")\n\n return cur\n\n @staticmethod\n def from_string(handle_str: str) -> "NodeHandle":\n check.str_param(handle_str, "handle_str")\n\n path = handle_str.split(".")\n return NodeHandle.from_path(path)\n\n @classmethod\n def from_dict(cls, dict_repr: Dict[str, Any]) -> Optional["NodeHandle"]:\n """This method makes it possible to load a potentially nested NodeHandle after a\n roundtrip through json.loads(json.dumps(NodeHandle._asdict()))"""\n\n check.dict_param(dict_repr, "dict_repr", key_type=str)\n check.invariant(\n "name" in dict_repr, "Dict representation of NodeHandle must have a 'name' key"\n )\n check.invariant(\n "parent" in dict_repr, "Dict representation of NodeHandle must have a 'parent' key"\n )\n\n if isinstance(dict_repr["parent"], (list, tuple)):\n dict_repr["parent"] = NodeHandle.from_dict(\n {\n "name": dict_repr["parent"][0],\n "parent": dict_repr["parent"][1],\n }\n )\n\n return NodeHandle(**{k: dict_repr[k] for k in ["name", "parent"]})\n\n\nclass NodeInputHandle(\n NamedTuple("_NodeInputHandle", [("node_handle", NodeHandle), ("input_name", str)])\n):\n """\n A structured object to uniquely identify inputs in the potentially recursive graph structure.\n """\n\n\nclass NodeOutputHandle(\n NamedTuple("_NodeOutputHandle", [("node_handle", NodeHandle), ("output_name", str)])\n):\n """\n A structured object to uniquely identify outputs in the potentially recursive graph structure.\n """\n\n\n# previous name for NodeHandle was SolidHandle\nregister_serdes_tuple_fallbacks({"SolidHandle": NodeHandle})\n\n\nclass SolidInputHandle(\n NamedTuple("_SolidInputHandle", [("solid", Node), ("input_def", InputDefinition)])\n):\n def __new__(cls, solid: Node, input_def: InputDefinition):\n return super(SolidInputHandle, cls).__new__(\n cls,\n check.inst_param(solid, "solid", Node),\n check.inst_param(input_def, "input_def", InputDefinition),\n )\n\n def _inner_str(self) -> str:\n return struct_to_string(\n "SolidInputHandle",\n solid_name=self.solid.name,\n input_name=self.input_def.name,\n )\n\n def __str__(self):\n return self._inner_str()\n\n def 
__repr__(self):\n return self._inner_str()\n\n def __hash__(self):\n return hash((self.solid.name, self.input_def.name))\n\n def __eq__(self, other):\n return self.solid.name == other.solid.name and self.input_def.name == other.input_def.name\n\n @property\n def solid_name(self) -> str:\n return self.solid.name\n\n @property\n def input_name(self) -> str:\n return self.input_def.name\n\n\nclass SolidOutputHandle(\n NamedTuple("_SolidOutputHandle", [("solid", Node), ("output_def", OutputDefinition)])\n):\n def __new__(cls, solid: Node, output_def: OutputDefinition):\n return super(SolidOutputHandle, cls).__new__(\n cls,\n check.inst_param(solid, "solid", Node),\n check.inst_param(output_def, "output_def", OutputDefinition),\n )\n\n def _inner_str(self) -> str:\n return struct_to_string(\n "SolidOutputHandle",\n solid_name=self.solid.name,\n output_name=self.output_def.name,\n )\n\n def __str__(self):\n return self._inner_str()\n\n def __repr__(self):\n return self._inner_str()\n\n def __hash__(self):\n return hash((self.solid.name, self.output_def.name))\n\n def __eq__(self, other: Any):\n return self.solid.name == other.solid.name and self.output_def.name == other.output_def.name\n\n def describe(self) -> str:\n return f"{self.solid_name}:{self.output_def.name}"\n\n @property\n def solid_name(self) -> str:\n return self.solid.name\n\n @property\n def is_dynamic(self) -> bool:\n return self.output_def.is_dynamic\n\n\nclass DependencyType(Enum):\n DIRECT = "DIRECT"\n FAN_IN = "FAN_IN"\n DYNAMIC_COLLECT = "DYNAMIC_COLLECT"\n\n\nclass IDependencyDefinition(ABC): # pylint: disable=no-init\n @abstractmethod\n def get_solid_dependencies(self) -> List["DependencyDefinition"]:\n pass\n\n @abstractmethod\n def is_fan_in(self) -> bool:\n """The result passed to the corresponding input will be a List made from different solid outputs"""\n\n\n
[docs]class DependencyDefinition(\n NamedTuple(\n "_DependencyDefinition", [("node", str), ("output", str), ("description", Optional[str])]\n ),\n IDependencyDefinition,\n):\n """Represents an edge in the DAG of nodes (ops or graphs) forming a job.\n\n This object is used at the leaves of a dictionary structure that represents the complete\n dependency structure of a job whose keys represent the dependent node and dependent\n input, so this object only contains information about the dependee.\n\n Concretely, if the input named 'input' of op_b depends on the output named 'result' of\n op_a, and the output named 'other_result' of graph_a, the structure will look as follows:\n\n .. code-block:: python\n\n dependency_structure = {\n 'my_downstream_op': {\n 'input': DependencyDefinition('my_upstream_op', 'result')\n }\n 'my_downstream_op': {\n 'input': DependencyDefinition('my_upstream_graph', 'result')\n }\n }\n\n In general, users should prefer not to construct this class directly or use the\n :py:class:`JobDefinition` API that requires instances of this class. Instead, use the\n :py:func:`@job <job>` API:\n\n .. code-block:: python\n\n @job\n def the_job():\n node_b(node_a())\n\n\n Args:\n solid (str): (legacy) The name of the solid that is depended on, that is, from which the value\n passed between the two nodes originates.\n output (Optional[str]): The name of the output that is depended on. (default: "result")\n description (Optional[str]): Human-readable description of this dependency.\n node (str): The name of the node (op or graph) that is depended on, that is, from which the value\n passed between the two nodes originates.\n """\n\n def __new__(\n cls,\n solid: Optional[str] = None,\n output: str = DEFAULT_OUTPUT,\n description: Optional[str] = None,\n node: Optional[str] = None,\n ):\n if solid and node:\n raise DagsterInvalidDefinitionError(\n "Both ``node`` and legacy ``solid`` arguments provided to DependencyDefinition. Please use one or the other."\n )\n\n if not solid and not node:\n raise DagsterInvalidDefinitionError(\n "Expected node parameter to be str for DependencyDefinition"\n )\n\n node = node or solid\n return super(DependencyDefinition, cls).__new__(\n cls,\n check.str_param(node, "node"),\n check.str_param(output, "output"),\n check.opt_str_param(description, "description"),\n )\n\n def get_solid_dependencies(self) -> List["DependencyDefinition"]:\n return [self]\n\n def is_fan_in(self) -> bool:\n return False\n\n @property\n def solid(self) -> str:\n return self.node\n\n def get_op_dependencies(self) -> List["DependencyDefinition"]:\n return [self]
\n\n\n
[docs]class MultiDependencyDefinition(\n NamedTuple(\n "_MultiDependencyDefinition",\n [("dependencies", List[Union[DependencyDefinition, Type["MappedInputPlaceholder"]]])],\n ),\n IDependencyDefinition,\n):\n """Represents a fan-in edge in the DAG of op instances forming a job.\n\n This object is used only when an input of type ``List[T]`` is assembled by fanning-in multiple\n upstream outputs of type ``T``.\n\n This object is used at the leaves of a dictionary structure that represents the complete\n dependency structure of a job or pipeline whose keys represent the dependent ops or graphs and dependent\n input, so this object only contains information about the dependee.\n\n Concretely, if the input named 'input' of op_c depends on the outputs named 'result' of\n op_a and op_b, this structure will look as follows:\n\n .. code-block:: python\n\n dependency_structure = {\n 'op_c': {\n 'input': MultiDependencyDefinition(\n [\n DependencyDefinition('op_a', 'result'),\n DependencyDefinition('op_b', 'result')\n ]\n )\n }\n }\n\n In general, users should prefer not to construct this class directly or use the\n :py:class:`JobDefinition` API that requires instances of this class. Instead, use the\n :py:func:`@job <job>` API:\n\n .. code-block:: python\n\n @job\n def the_job():\n op_c(op_a(), op_b())\n\n Args:\n dependencies (List[Union[DependencyDefinition, Type[MappedInputPlaceHolder]]]): List of\n upstream dependencies fanned in to this input.\n """\n\n def __new__(\n cls,\n dependencies: List[Union[DependencyDefinition, Type["MappedInputPlaceholder"]]],\n ):\n from .composition import MappedInputPlaceholder\n\n deps = check.list_param(dependencies, "dependencies")\n seen = {}\n for dep in deps:\n if isinstance(dep, DependencyDefinition):\n key = dep.solid + ":" + dep.output\n if key in seen:\n raise DagsterInvalidDefinitionError(\n 'Duplicate dependencies on node "{dep.solid}" output "{dep.output}" '\n "used in the same MultiDependencyDefinition.".format(dep=dep)\n )\n seen[key] = True\n elif dep is MappedInputPlaceholder:\n pass\n else:\n check.failed("Unexpected dependencies entry {}".format(dep))\n\n return super(MultiDependencyDefinition, cls).__new__(cls, deps)\n\n def get_solid_dependencies(self) -> List[DependencyDefinition]:\n return [dep for dep in self.dependencies if isinstance(dep, DependencyDefinition)]\n\n def get_node_dependencies(self) -> List[DependencyDefinition]:\n return self.get_solid_dependencies()\n\n def is_fan_in(self) -> bool:\n return True\n\n def get_dependencies_and_mappings(self) -> List:\n return self.dependencies
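# --- Hedged sketch (not part of the rendered module source) ---
# Assembling the fan-in structure shown in the MultiDependencyDefinition docstring into
# a runnable graph: a List[int] input collects the outputs of two upstream ops. Op and
# graph names are hypothetical.
from typing import List

from dagster import (
    DependencyDefinition,
    GraphDefinition,
    MultiDependencyDefinition,
    op,
)


@op
def op_a():
    return 1


@op
def op_b():
    return 2


@op
def op_c(nums: List[int]):
    return sum(nums)


fan_in_graph = GraphDefinition(
    name="fan_in_graph",
    node_defs=[op_a, op_b, op_c],
    dependencies={
        "op_c": {
            "nums": MultiDependencyDefinition(
                [DependencyDefinition("op_a"), DependencyDefinition("op_b")]
            )
        }
    },
)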
\n\n\nclass DynamicCollectDependencyDefinition(\n NamedTuple("_DynamicCollectDependencyDefinition", [("solid_name", str), ("output_name", str)]),\n IDependencyDefinition,\n):\n def get_solid_dependencies(self) -> List[DependencyDefinition]:\n return [DependencyDefinition(self.solid_name, self.output_name)]\n\n def is_fan_in(self) -> bool:\n return True\n\n\nDepTypeAndOutputHandles = Tuple[\n DependencyType,\n Union[SolidOutputHandle, List[Union[SolidOutputHandle, Type["MappedInputPlaceholder"]]]],\n]\n\nInputToOutputHandleDict = Dict[SolidInputHandle, DepTypeAndOutputHandles]\n\n\ndef _create_handle_dict(\n solid_dict: Dict[str, Node],\n dep_dict: Dict[str, Dict[str, IDependencyDefinition]],\n) -> InputToOutputHandleDict:\n from .composition import MappedInputPlaceholder\n\n check.dict_param(solid_dict, "solid_dict", key_type=str, value_type=Node)\n check.two_dim_dict_param(dep_dict, "dep_dict", value_type=IDependencyDefinition)\n\n handle_dict: InputToOutputHandleDict = {}\n\n for solid_name, input_dict in dep_dict.items():\n from_solid = solid_dict[solid_name]\n for input_name, dep_def in input_dict.items():\n if isinstance(dep_def, MultiDependencyDefinition):\n handles: List[Union[SolidOutputHandle, Type[MappedInputPlaceholder]]] = []\n for inner_dep in dep_def.get_dependencies_and_mappings():\n if isinstance(inner_dep, DependencyDefinition):\n handles.append(solid_dict[inner_dep.solid].output_handle(inner_dep.output))\n elif inner_dep is MappedInputPlaceholder:\n handles.append(inner_dep)\n else:\n check.failed(\n "Unexpected MultiDependencyDefinition dependencies type {}".format(\n inner_dep\n )\n )\n\n handle_dict[from_solid.input_handle(input_name)] = (DependencyType.FAN_IN, handles)\n\n elif isinstance(dep_def, DependencyDefinition):\n handle_dict[from_solid.input_handle(input_name)] = (\n DependencyType.DIRECT,\n solid_dict[dep_def.solid].output_handle(dep_def.output),\n )\n elif isinstance(dep_def, DynamicCollectDependencyDefinition):\n handle_dict[from_solid.input_handle(input_name)] = (\n DependencyType.DYNAMIC_COLLECT,\n solid_dict[dep_def.solid_name].output_handle(dep_def.output_name),\n )\n\n else:\n check.failed(f"Unknown dependency type {dep_def}")\n\n return handle_dict\n\n\nclass DependencyStructure:\n @staticmethod\n def from_definitions(solids: Dict[str, Node], dep_dict: Dict[str, Any]):\n return DependencyStructure(list(dep_dict.keys()), _create_handle_dict(solids, dep_dict))\n\n def __init__(self, solid_names: List[str], handle_dict: InputToOutputHandleDict):\n self._solid_names = solid_names\n self._handle_dict = handle_dict\n\n # Building up a couple indexes here so that one can look up all the upstream output handles\n # or downstream input handles in O(1). 
Without this, this can become O(N^2) where N is solid\n # count during the GraphQL query in particular\n\n # solid_name => input_handle => list[output_handle]\n self._solid_input_index: dict = defaultdict(dict)\n\n # solid_name => output_handle => list[input_handle]\n self._solid_output_index: dict = defaultdict(lambda: defaultdict(list))\n\n # solid_name => dynamic output_handle that this solid will dupe for\n self._dynamic_fan_out_index: dict = {}\n\n # solid_name => set of dynamic output_handle this collects over\n self._collect_index: Dict[str, set] = defaultdict(set)\n\n for input_handle, (dep_type, output_handle_or_list) in self._handle_dict.items():\n if dep_type == DependencyType.FAN_IN:\n output_handle_list = []\n for handle in output_handle_or_list:\n if not isinstance(handle, SolidOutputHandle):\n continue\n\n if handle.is_dynamic:\n raise DagsterInvalidDefinitionError(\n "Currently, items in a fan-in dependency cannot be downstream of dynamic outputs. "\n f'Problematic dependency on dynamic output "{handle.describe()}".'\n )\n if self._dynamic_fan_out_index.get(handle.solid_name):\n raise DagsterInvalidDefinitionError(\n "Currently, items in a fan-in dependency cannot be downstream of dynamic outputs. "\n f'Problematic dependency on output "{handle.describe()}", downstream of '\n f'"{self._dynamic_fan_out_index[handle.solid_name].describe()}".'\n )\n\n output_handle_list.append(handle)\n elif dep_type == DependencyType.DIRECT:\n output_handle = cast(SolidOutputHandle, output_handle_or_list)\n\n if output_handle.is_dynamic:\n self._validate_and_set_fan_out(input_handle, output_handle)\n\n if self._dynamic_fan_out_index.get(output_handle.solid_name):\n self._validate_and_set_fan_out(\n input_handle, self._dynamic_fan_out_index[output_handle.solid_name]\n )\n\n output_handle_list = [output_handle]\n elif dep_type == DependencyType.DYNAMIC_COLLECT:\n output_handle = cast(SolidOutputHandle, output_handle_or_list)\n\n if output_handle.is_dynamic:\n self._validate_and_set_collect(input_handle, output_handle)\n\n elif self._dynamic_fan_out_index.get(output_handle.solid_name):\n self._validate_and_set_collect(\n input_handle,\n self._dynamic_fan_out_index[output_handle.solid_name],\n )\n else:\n check.failed(\n f"Unexpected dynamic fan in dep created {output_handle} -> {input_handle}"\n )\n\n output_handle_list = [output_handle]\n else:\n check.failed(f"Unexpected dep type {dep_type}")\n\n self._solid_input_index[input_handle.solid.name][input_handle] = output_handle_list\n for output_handle in output_handle_list:\n self._solid_output_index[output_handle.solid.name][output_handle].append(\n input_handle\n )\n\n def _validate_and_set_fan_out(\n self, input_handle: SolidInputHandle, output_handle: SolidOutputHandle\n ) -> Any:\n """Helper function for populating _dynamic_fan_out_index"""\n\n if not input_handle.solid.definition.input_supports_dynamic_output_dep(\n input_handle.input_name\n ):\n raise DagsterInvalidDefinitionError(\n f"{input_handle.solid.describe_node()} cannot be downstream of dynamic output "\n f'"{output_handle.describe()}" since input "{input_handle.input_name}" maps to a node '\n "that is already downstream of another dynamic output. 
Nodes cannot be downstream of more "\n "than one dynamic output"\n )\n\n if self._collect_index.get(input_handle.solid_name):\n raise DagsterInvalidDefinitionError(\n f"{input_handle.solid.describe_node()} cannot be both downstream of dynamic output "\n f"{output_handle.describe()} and collect over dynamic output "\n f"{list(self._collect_index[input_handle.solid_name])[0].describe()}."\n )\n\n if self._dynamic_fan_out_index.get(input_handle.solid_name) is None:\n self._dynamic_fan_out_index[input_handle.solid_name] = output_handle\n return\n\n if self._dynamic_fan_out_index[input_handle.solid_name] != output_handle:\n raise DagsterInvalidDefinitionError(\n f"{input_handle.solid.describe_node()} cannot be downstream of more than one dynamic output. "\n f'It is downstream of both "{output_handle.describe()}" and '\n f'"{self._dynamic_fan_out_index[input_handle.solid_name].describe()}"'\n )\n\n def _validate_and_set_collect(\n self,\n input_handle: SolidInputHandle,\n output_handle: SolidOutputHandle,\n ) -> None:\n if self._dynamic_fan_out_index.get(input_handle.solid_name):\n raise DagsterInvalidDefinitionError(\n f"{input_handle.solid.describe_node()} cannot both collect over dynamic output "\n f"{output_handle.describe()} and be downstream of the dynamic output "\n f"{self._dynamic_fan_out_index[input_handle.solid_name].describe()}."\n )\n\n self._collect_index[input_handle.solid_name].add(output_handle)\n\n # if the output is already fanned out\n if self._dynamic_fan_out_index.get(output_handle.solid_name):\n raise DagsterInvalidDefinitionError(\n f"{input_handle.solid.describe_node()} cannot be downstream of more than one dynamic output. "\n f'It is downstream of both "{output_handle.describe()}" and '\n f'"{self._dynamic_fan_out_index[output_handle.solid_name].describe()}"'\n )\n\n def all_upstream_outputs_from_solid(self, solid_name: str) -> List[SolidOutputHandle]:\n check.str_param(solid_name, "solid_name")\n\n # flatten out all outputs that feed into the inputs of this solid\n return [\n output_handle\n for output_handle_list in self._solid_input_index[solid_name].values()\n for output_handle in output_handle_list\n ]\n\n def input_to_upstream_outputs_for_solid(self, solid_name: str) -> Any:\n """\n Returns a Dict[SolidInputHandle, List[SolidOutputHandle]] that encodes\n where all the the inputs are sourced from upstream. 
Usually the\n List[SolidOutputHandle] will be a list of one, except for the\n multi-dependency case.\n """\n check.str_param(solid_name, "solid_name")\n return self._solid_input_index[solid_name]\n\n def output_to_downstream_inputs_for_solid(self, solid_name: str) -> Any:\n """\n Returns a Dict[SolidOutputHandle, List[SolidInputHandle]] that\n represents all the downstream inputs for each output in the\n dictionary\n """\n check.str_param(solid_name, "solid_name")\n return self._solid_output_index[solid_name]\n\n def has_direct_dep(self, solid_input_handle: SolidInputHandle) -> bool:\n check.inst_param(solid_input_handle, "solid_input_handle", SolidInputHandle)\n if solid_input_handle not in self._handle_dict:\n return False\n dep_type, _ = self._handle_dict[solid_input_handle]\n return dep_type == DependencyType.DIRECT\n\n def get_direct_dep(self, solid_input_handle: SolidInputHandle) -> SolidOutputHandle:\n check.inst_param(solid_input_handle, "solid_input_handle", SolidInputHandle)\n dep_type, dep = self._handle_dict[solid_input_handle]\n check.invariant(\n dep_type == DependencyType.DIRECT,\n f"Cannot call get_direct_dep when dep is not singular, got {dep_type}",\n )\n return cast(SolidOutputHandle, dep)\n\n def has_fan_in_deps(self, solid_input_handle: SolidInputHandle) -> bool:\n check.inst_param(solid_input_handle, "solid_input_handle", SolidInputHandle)\n if solid_input_handle not in self._handle_dict:\n return False\n dep_type, _ = self._handle_dict[solid_input_handle]\n return dep_type == DependencyType.FAN_IN\n\n def get_fan_in_deps(\n self, solid_input_handle: SolidInputHandle\n ) -> List[Union[SolidOutputHandle, Type["MappedInputPlaceholder"]]]:\n check.inst_param(solid_input_handle, "solid_input_handle", SolidInputHandle)\n dep_type, deps = self._handle_dict[solid_input_handle]\n check.invariant(\n dep_type == DependencyType.FAN_IN,\n f"Cannot call get_multi_dep when dep is not fan in, got {dep_type}",\n )\n return cast(List[Union[SolidOutputHandle, Type["MappedInputPlaceholder"]]], deps)\n\n def has_dynamic_fan_in_dep(self, solid_input_handle: SolidInputHandle) -> bool:\n check.inst_param(solid_input_handle, "solid_input_handle", SolidInputHandle)\n if solid_input_handle not in self._handle_dict:\n return False\n dep_type, _ = self._handle_dict[solid_input_handle]\n return dep_type == DependencyType.DYNAMIC_COLLECT\n\n def get_dynamic_fan_in_dep(self, solid_input_handle: SolidInputHandle) -> SolidOutputHandle:\n check.inst_param(solid_input_handle, "solid_input_handle", SolidInputHandle)\n dep_type, dep = self._handle_dict[solid_input_handle]\n check.invariant(\n dep_type == DependencyType.DYNAMIC_COLLECT,\n f"Cannot call get_dynamic_fan_in_dep when dep is not, got {dep_type}",\n )\n return cast(SolidOutputHandle, dep)\n\n def has_deps(self, solid_input_handle: SolidInputHandle) -> bool:\n check.inst_param(solid_input_handle, "solid_input_handle", SolidInputHandle)\n return solid_input_handle in self._handle_dict\n\n def get_deps_list(self, solid_input_handle: SolidInputHandle) -> List[SolidOutputHandle]:\n check.inst_param(solid_input_handle, "solid_input_handle", SolidInputHandle)\n check.invariant(self.has_deps(solid_input_handle))\n dep_type, handle_or_list = self._handle_dict[solid_input_handle]\n if dep_type == DependencyType.DIRECT:\n return [cast(SolidOutputHandle, handle_or_list)]\n elif dep_type == DependencyType.DYNAMIC_COLLECT:\n return [cast(SolidOutputHandle, handle_or_list)]\n elif dep_type == DependencyType.FAN_IN:\n return [handle for handle in 
handle_or_list if isinstance(handle, SolidOutputHandle)]\n else:\n check.failed(f"Unexpected dep type {dep_type}")\n\n def input_handles(self) -> List[SolidInputHandle]:\n return list(self._handle_dict.keys())\n\n def get_upstream_dynamic_handle_for_solid(self, solid_name: str) -> Any:\n return self._dynamic_fan_out_index.get(solid_name)\n\n def get_dependency_type(self, solid_input_handle: SolidInputHandle) -> Optional[DependencyType]:\n result = self._handle_dict.get(solid_input_handle)\n if result is None:\n return None\n dep_type, _ = result\n return dep_type\n\n def is_dynamic_mapped(self, solid_name: str) -> bool:\n return solid_name in self._dynamic_fan_out_index\n\n def has_dynamic_downstreams(self, solid_name: str) -> bool:\n for upstream_handle in self._dynamic_fan_out_index.values():\n if upstream_handle.solid_name == solid_name:\n return True\n\n return False\n
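Editor's note: the two per-node indexes noted in the constructor comment can be pictured with plain dictionaries. A simplified, self-contained illustration only — the handles here are stand-in tuples, not real ``SolidInputHandle``/``SolidOutputHandle`` objects:

.. code-block:: python

    from collections import defaultdict

    # node_name -> input_handle -> [output_handles]
    node_input_index = defaultdict(dict)
    # node_name -> output_handle -> [input_handles]
    node_output_index = defaultdict(lambda: defaultdict(list))

    # stand-in handles: (node_name, input/output name)
    input_handle = ("add_one", "num")
    output_handle = ("return_one", "result")

    node_input_index[input_handle[0]][input_handle] = [output_handle]
    node_output_index[output_handle[0]][output_handle].append(input_handle)

    # constant-time lookup of upstream outputs / downstream inputs by node name
    upstream = [oh for ohs in node_input_index["add_one"].values() for oh in ohs]
    assert upstream == [("return_one", "result")]
    assert node_output_index["return_one"][output_handle] == [("add_one", "num")]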
", "current_page_name": "_modules/dagster/core/definitions/dependency", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.dependency"}, "events": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.events

\nimport re\nimport warnings\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Generic,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    TypeVar,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nimport dagster.seven as seven\nfrom dagster.serdes import DefaultNamedTupleSerializer, whitelist_for_serdes\nfrom dagster.utils.backcompat import experimental_class_param_warning\n\nfrom .metadata import (\n    MetadataEntry,\n    MetadataValue,\n    PartitionMetadataEntry,\n    RawMetadataValue,\n    last_file_comp,\n    normalize_metadata,\n)\nfrom .utils import DEFAULT_OUTPUT, check_valid_name\n\nif TYPE_CHECKING:\n    from dagster.core.execution.context.output import OutputContext\n\nASSET_KEY_SPLIT_REGEX = re.compile("[^a-zA-Z0-9_]")\nASSET_KEY_STRUCTURED_DELIMITER = "."\n\n\ndef parse_asset_key_string(s: str) -> List[str]:\n    return list(filter(lambda x: x, re.split(ASSET_KEY_SPLIT_REGEX, s)))\n\n\n
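Editor's note: ``parse_asset_key_string`` simply splits on any character that is not alphanumeric or an underscore and drops empty segments. A small standalone illustration (logic copied from above; the inputs are hypothetical):

.. code-block:: python

    import re

    ASSET_KEY_SPLIT_REGEX = re.compile("[^a-zA-Z0-9_]")

    def parse_asset_key_string(s):
        return list(filter(lambda x: x, re.split(ASSET_KEY_SPLIT_REGEX, s)))

    assert parse_asset_key_string("warehouse.schema.events") == ["warehouse", "schema", "events"]
    assert parse_asset_key_string("a//b") == ["a", "b"]  # empty segments are dropped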
[docs]@whitelist_for_serdes\nclass AssetKey(NamedTuple("_AssetKey", [("path", List[str])])):\n """Object representing the structure of an asset key. Takes in a sanitized string, list of\n strings, or tuple of strings.\n\n Example usage:\n\n .. code-block:: python\n\n from dagster import op\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey('flat_asset_key'),\n metadata={"text_metadata": "Text-based metadata for this event"},\n )\n\n @op\n def structured_asset_key(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey(['parent', 'child', 'grandchild']),\n metadata={"text_metadata": "Text-based metadata for this event"},\n )\n\n @op\n def structured_asset_key_2(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey(('parent', 'child', 'grandchild')),\n metadata={"text_metadata": "Text-based metadata for this event"},\n )\n\n Args:\n path (Sequence[str]): String, list of strings, or tuple of strings. A list of strings\n represent the hierarchical structure of the asset_key.\n """\n\n def __new__(cls, path: Sequence[str]):\n if isinstance(path, str):\n path = [path]\n else:\n path = list(check.sequence_param(path, "path", of_type=str))\n\n check.invariant(\n all(len(seg) > 0 for seg in path), "Asset key segments must be non-empty strings."\n )\n return super(AssetKey, cls).__new__(cls, path=path)\n\n def __str__(self):\n return "AssetKey({})".format(self.path)\n\n def __repr__(self):\n return "AssetKey({})".format(self.path)\n\n def __hash__(self):\n return hash(tuple(self.path))\n\n def __eq__(self, other):\n if not isinstance(other, AssetKey):\n return False\n return self.to_string() == other.to_string()\n\n
[docs] def to_string(self, legacy: Optional[bool] = False) -> Optional[str]:\n """\n E.g. '["first_component", "second_component"]'\n """\n if not self.path:\n return None\n if legacy:\n return ASSET_KEY_STRUCTURED_DELIMITER.join(self.path)\n return seven.json.dumps(self.path)
\n\n
[docs] def to_user_string(self) -> str:\n """\n E.g. "first_component>second_component"\n """\n return ">".join(self.path)
\n\n @staticmethod\n def from_user_string(asset_key_string: str) -> "AssetKey":\n return AssetKey(asset_key_string.split(">"))\n\n @staticmethod\n def from_db_string(asset_key_string: Optional[str]) -> Optional["AssetKey"]:\n if not asset_key_string:\n return None\n if asset_key_string[0] == "[":\n # is a json string\n try:\n path = seven.json.loads(asset_key_string)\n except seven.JSONDecodeError:\n path = parse_asset_key_string(asset_key_string)\n else:\n path = parse_asset_key_string(asset_key_string)\n return AssetKey(path)\n\n @staticmethod\n def get_db_prefix(path: List[str], legacy: Optional[bool] = False):\n check.list_param(path, "path", of_type=str)\n if legacy:\n return ASSET_KEY_STRUCTURED_DELIMITER.join(path)\n return seven.json.dumps(path)[:-2] # strip trailing '"]' from json string\n\n @staticmethod\n def from_graphql_input(asset_key: Mapping[str, List[str]]) -> Optional["AssetKey"]:\n if asset_key and asset_key.get("path"):\n return AssetKey(asset_key["path"])\n return None\n\n @staticmethod\n def from_coerceable(arg: "CoerceableToAssetKey") -> "AssetKey":\n if isinstance(arg, AssetKey):\n return check.inst_param(arg, "arg", AssetKey)\n elif isinstance(arg, str):\n return AssetKey([arg])\n elif isinstance(arg, list):\n check.list_param(arg, "arg", of_type=str)\n return AssetKey(arg)\n else:\n check.tuple_param(arg, "arg", of_type=str)\n return AssetKey(arg)
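Editor's note: taken together, the string helpers round-trip as follows (the key path is hypothetical):

.. code-block:: python

    from dagster import AssetKey

    key = AssetKey(["warehouse", "events"])

    assert key.to_user_string() == "warehouse>events"
    assert AssetKey.from_user_string("warehouse>events") == key

    # to_string() emits the JSON-encoded path used for storage
    print(key.to_string())  # e.g. '["warehouse", "events"]'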
\n\n\nCoerceableToAssetKey = Union[AssetKey, str, Sequence[str]]\n\n\nDynamicAssetKey = Callable[["OutputContext"], Optional[AssetKey]]\n\n\n@whitelist_for_serdes\nclass AssetLineageInfo(\n NamedTuple("_AssetLineageInfo", [("asset_key", AssetKey), ("partitions", AbstractSet[str])])\n):\n def __new__(cls, asset_key, partitions=None):\n asset_key = check.inst_param(asset_key, "asset_key", AssetKey)\n partitions = check.opt_set_param(partitions, "partitions", str)\n return super(AssetLineageInfo, cls).__new__(cls, asset_key=asset_key, partitions=partitions)\n\n\nT = TypeVar("T")\n\n\n
[docs]class Output(Generic[T]):\n """Event corresponding to one of a op's outputs.\n\n Op compute functions must explicitly yield events of this type when they have more than\n one output, or when they also yield events of other types, or when defining a op using the\n :py:class:`OpDefinition` API directly.\n\n Outputs are values produced by ops that will be consumed by downstream ops in a job.\n They are type-checked at op boundaries when their corresponding :py:class:`Out`\n or the downstream :py:class:`In` is typed.\n\n Args:\n value (Any): The value returned by the compute function.\n output_name (Optional[str]): Name of the corresponding out. (default:\n "result")\n metadata_entries (Optional[Union[MetadataEntry, PartitionMetadataEntry]]):\n (Experimental) A set of metadata entries to attach to events related to this Output.\n metadata (Optional[Dict[str, Union[str, float, int, Dict, MetadataValue]]]):\n Arbitrary metadata about the failure. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __init__(\n self,\n value: T,\n output_name: Optional[str] = DEFAULT_OUTPUT,\n metadata_entries: Optional[Sequence[Union[MetadataEntry, PartitionMetadataEntry]]] = None,\n metadata: Optional[Dict[str, RawMetadataValue]] = None,\n ):\n\n metadata = check.opt_dict_param(metadata, "metadata", key_type=str)\n metadata_entries = check.opt_list_param(\n metadata_entries,\n "metadata_entries",\n of_type=(MetadataEntry, PartitionMetadataEntry),\n )\n self._value = value\n self._output_name = check.str_param(output_name, "output_name")\n self._metadata_entries = normalize_metadata(metadata, metadata_entries)\n\n @property\n def metadata_entries(self) -> List[Union[PartitionMetadataEntry, MetadataEntry]]:\n return self._metadata_entries\n\n @property\n def value(self) -> Any:\n return self._value\n\n @property\n def output_name(self) -> str:\n return self._output_name
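Editor's note: a hedged sketch of the multi-output case described above (op and output names are hypothetical):

.. code-block:: python

    from dagster import Out, Output, job, op

    @op(out={"total": Out(int), "count": Out(int)})
    def summarize():
        values = [1, 2, 3]
        # with more than one output, each value must be yielded as an explicit Output
        yield Output(sum(values), output_name="total", metadata={"n_values": len(values)})
        yield Output(len(values), output_name="count")

    @op
    def report(total, count):
        return f"{count} values, total {total}"

    @job
    def summary_job():
        total, count = summarize()
        report(total, count)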
\n\n\n
[docs]class DynamicOutput(Generic[T]):\n """\n Variant of :py:class:`Output <dagster.Output>` used to support\n dynamic mapping & collect. Each ``DynamicOutput`` produced by an op represents\n one item in a set that can be processed individually with ``map`` or gathered\n with ``collect``.\n\n Each ``DynamicOutput`` must have a unique ``mapping_key`` to distinguish it with it's set.\n\n Args:\n value (Any):\n The value returned by the compute function.\n mapping_key (str):\n The key that uniquely identifies this dynamic value relative to its peers.\n This key will be used to identify the downstream ops when mapped, ie\n ``mapped_op[example_mapping_key]``\n output_name (Optional[str]):\n Name of the corresponding :py:class:`DynamicOut` defined on the op.\n (default: "result")\n metadata_entries (Optional[Union[MetadataEntry, PartitionMetadataEntry]]):\n (Experimental) A set of metadata entries to attach to events related to this output.\n metadata (Optional[Dict[str, Union[str, float, int, Dict, MetadataValue]]]):\n Arbitrary metadata about the failure. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __init__(\n self,\n value: T,\n mapping_key: str,\n output_name: Optional[str] = DEFAULT_OUTPUT,\n metadata_entries: Optional[List[Union[PartitionMetadataEntry, MetadataEntry]]] = None,\n metadata: Optional[Dict[str, RawMetadataValue]] = None,\n ):\n\n metadata = check.opt_dict_param(metadata, "metadata", key_type=str)\n metadata_entries = check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=MetadataEntry\n )\n self._mapping_key = check_valid_name(check.str_param(mapping_key, "mapping_key"))\n self._output_name = check.str_param(output_name, "output_name")\n self._metadata_entries = normalize_metadata(metadata, metadata_entries)\n self._value = value\n\n @property\n def metadata_entries(self) -> List[Union[PartitionMetadataEntry, MetadataEntry]]:\n return self._metadata_entries\n\n @property\n def mapping_key(self) -> str:\n return self._mapping_key\n\n @property\n def value(self) -> T:\n return self._value\n\n @property\n def output_name(self) -> str:\n return self._output_name
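Editor's note: a minimal sketch of the map/collect pattern this event supports (op names and mapping keys are hypothetical):

.. code-block:: python

    from dagster import DynamicOut, DynamicOutput, job, op

    @op(out=DynamicOut())
    def fan_out():
        for idx in range(3):
            # each value carries a unique mapping_key
            yield DynamicOutput(idx, mapping_key=f"chunk_{idx}")

    @op
    def double(chunk):
        return chunk * 2

    @op
    def total(results):
        return sum(results)

    @job
    def dynamic_job():
        total(fan_out().map(double).collect())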
\n\n\n@whitelist_for_serdes\nclass AssetObservation(\n NamedTuple(\n "_AssetObservation",\n [\n ("asset_key", AssetKey),\n ("description", Optional[str]),\n ("metadata_entries", List[MetadataEntry]),\n ("partition", Optional[str]),\n ],\n )\n):\n """Event that captures metadata about an asset at a point in time.\n\n Args:\n asset_key (Union[str, List[str], AssetKey]): A key to identify the asset.\n metadata_entries (Optional[List[MetadataEntry]]): Arbitrary metadata about the asset.\n partition (Optional[str]): The name of a partition of the asset that the metadata\n corresponds to.\n metadata (Optional[Dict[str, Union[str, float, int, Dict, MetadataValue]]]):\n Arbitrary metadata about the asset. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __new__(\n cls,\n asset_key: Union[List[str], AssetKey, str],\n description: Optional[str] = None,\n metadata_entries: Optional[List[MetadataEntry]] = None,\n partition: Optional[str] = None,\n metadata: Optional[Dict[str, RawMetadataValue]] = None,\n ):\n if isinstance(asset_key, AssetKey):\n check.inst_param(asset_key, "asset_key", AssetKey)\n elif isinstance(asset_key, str):\n asset_key = AssetKey(parse_asset_key_string(asset_key))\n elif isinstance(asset_key, list):\n check.list_param(asset_key, "asset_key", of_type=str)\n asset_key = AssetKey(asset_key)\n else:\n check.tuple_param(asset_key, "asset_key", of_type=str)\n asset_key = AssetKey(asset_key)\n\n metadata = check.opt_dict_param(metadata, "metadata", key_type=str)\n metadata_entries = check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=MetadataEntry\n )\n\n return super(AssetObservation, cls).__new__(\n cls,\n asset_key=asset_key,\n description=check.opt_str_param(description, "description"),\n metadata_entries=cast(\n List[MetadataEntry], normalize_metadata(metadata, metadata_entries)\n ),\n partition=check.opt_str_param(partition, "partition"),\n )\n\n @property\n def label(self) -> str:\n return " ".join(self.asset_key.path)\n\n\n
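Editor's note: a hedged sketch of yielding an observation from an op (the asset key and metric are hypothetical; the op's regular output is still yielded so execution completes):

.. code-block:: python

    from dagster import AssetObservation, Output, op

    @op
    def observe_events_table():
        row_count = 42  # stand-in for a metric read from the asset
        yield AssetObservation(
            asset_key="warehouse_events",
            metadata={"row_count": row_count},
        )
        yield Output(row_count)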
[docs]@whitelist_for_serdes\nclass AssetMaterialization(\n NamedTuple(\n "_AssetMaterialization",\n [\n ("asset_key", AssetKey),\n ("description", Optional[str]),\n ("metadata_entries", List[Union[MetadataEntry, PartitionMetadataEntry]]),\n ("partition", Optional[str]),\n ("tags", Dict[str, str]),\n ],\n )\n):\n """Event indicating that an op has materialized an asset.\n\n Op compute functions may yield events of this type whenever they wish to indicate to the\n Dagster framework (and the end user) that they have produced a materialized value as a\n side effect of computation. Unlike outputs, asset materializations can not be passed to other\n ops, and their persistence is controlled by op logic, rather than by the Dagster\n framework.\n\n Op authors should use these events to organize metadata about the side effects of their\n computations, enabling tooling like the Assets dashboard in Dagit.\n\n Args:\n asset_key (Union[str, List[str], AssetKey]): A key to identify the materialized asset across job\n runs\n description (Optional[str]): A longer human-readable description of the materialized value.\n metadata_entries (Optional[List[Union[MetadataEntry, PartitionMetadataEntry]]]): Arbitrary metadata about the\n materialized value.\n partition (Optional[str]): The name of the partition that was materialized.\n tags (Optional[Dict[str, str]]): (Experimental) Tag metadata for a given asset\n materialization. Used for search and organization of the asset entry in the asset\n catalog in Dagit.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary metadata about the asset. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __new__(\n cls,\n asset_key: CoerceableToAssetKey,\n description: Optional[str] = None,\n metadata_entries: Optional[Sequence[Union[MetadataEntry, PartitionMetadataEntry]]] = None,\n partition: Optional[str] = None,\n tags: Optional[Mapping[str, str]] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n ):\n if isinstance(asset_key, AssetKey):\n check.inst_param(asset_key, "asset_key", AssetKey)\n elif isinstance(asset_key, str):\n asset_key = AssetKey(parse_asset_key_string(asset_key))\n elif isinstance(asset_key, list):\n check.sequence_param(asset_key, "asset_key", of_type=str)\n asset_key = AssetKey(asset_key)\n else:\n check.tuple_param(asset_key, "asset_key", of_type=str)\n asset_key = AssetKey(asset_key)\n\n if tags:\n experimental_class_param_warning("tags", "AssetMaterialization")\n\n metadata = check.opt_mapping_param(metadata, "metadata", key_type=str)\n metadata_entries = check.opt_sequence_param(\n metadata_entries, "metadata_entries", of_type=(MetadataEntry, PartitionMetadataEntry)\n )\n\n return super(AssetMaterialization, cls).__new__(\n cls,\n asset_key=asset_key,\n description=check.opt_str_param(description, "description"),\n metadata_entries=normalize_metadata(metadata, metadata_entries),\n partition=check.opt_str_param(partition, "partition"),\n tags=check.opt_dict_param(tags, "tags", key_type=str, value_type=str),\n )\n\n @property\n def label(self) -> str:\n return " ".join(self.asset_key.path)\n\n
[docs] @staticmethod\n def file(\n path: str,\n description: Optional[str] = None,\n asset_key: Optional[Union[str, List[str], AssetKey]] = None,\n ) -> "AssetMaterialization":\n """Static constructor for standard materializations corresponding to files on disk.\n\n Args:\n path (str): The path to the file.\n description (Optional[str]): A human-readable description of the materialization.\n """\n if not asset_key:\n asset_key = path\n\n return AssetMaterialization(\n asset_key=cast(Union[str, AssetKey, List[str]], asset_key),\n description=description,\n metadata_entries=[MetadataEntry("path", value=MetadataValue.path(path))],\n )
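Editor's note: a hedged sketch of an op reporting a materialization (the path and asset key are hypothetical; a regular Output is still yielded for the op's result):

.. code-block:: python

    from dagster import AssetMaterialization, MetadataValue, Output, op

    @op
    def write_daily_report():
        path = "/tmp/daily_report.csv"  # hypothetical location written by this op
        # ... write the file ...
        yield AssetMaterialization(
            asset_key=["reports", "daily"],
            description="Daily report written to disk",
            metadata={"path": MetadataValue.path(path)},
        )
        yield Output(path)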
\n\n\nclass MaterializationSerializer(DefaultNamedTupleSerializer):\n @classmethod\n def value_from_unpacked(cls, unpacked_dict, klass):\n # override the default `from_storage_dict` implementation in order to skip the deprecation\n # warning for historical Materialization events, loaded from event_log storage\n return Materialization(skip_deprecation_warning=True, **unpacked_dict)\n\n\n@whitelist_for_serdes(serializer=MaterializationSerializer)\nclass Materialization(\n NamedTuple(\n "_Materialization",\n [\n ("label", str),\n ("description", Optional[str]),\n ("metadata_entries", List[MetadataEntry]),\n ("asset_key", AssetKey),\n ("partition", Optional[str]),\n ("tags", Dict[str, str]),\n ],\n )\n):\n """Event indicating that an op has materialized a value.\n\n Solid compute functions may yield events of this type whenever they wish to indicate to the\n Dagster framework (and the end user) that they have produced a materialized value as a\n side effect of computation. Unlike outputs, materializations can not be passed to other ops,\n and their persistence is controlled by op logic, rather than by the Dagster framework.\n\n Solid authors should use these events to organize metadata about the side effects of their\n computations to enable downstream tooling like artifact catalogues and diff tools.\n\n Args:\n label (str): A short display name for the materialized value.\n description (Optional[str]): A longer human-radable description of the materialized value.\n metadata_entries (Optional[List[MetadataEntry]]): Arbitrary metadata about the\n materialized value.\n asset_key (Optional[Union[str, AssetKey]]): An optional parameter to identify the materialized asset\n across runs\n partition (Optional[str]): The name of the partition that was materialized.\n tags (Optional[Dict[str, str]]): (Experimental) Tag metadata for a given asset\n materialization. 
Used for search and organization of the asset entry in the asset\n catalog in Dagit.\n """\n\n def __new__(\n cls,\n label: Optional[str] = None,\n description: Optional[str] = None,\n metadata_entries: Optional[List[MetadataEntry]] = None,\n asset_key: Optional[Union[str, AssetKey]] = None,\n partition: Optional[str] = None,\n tags: Optional[Dict[str, str]] = None,\n skip_deprecation_warning: Optional[bool] = False,\n ):\n if asset_key and isinstance(asset_key, str):\n asset_key = AssetKey(parse_asset_key_string(asset_key))\n else:\n check.opt_inst_param(asset_key, "asset_key", AssetKey)\n\n asset_key = cast(AssetKey, asset_key)\n if not label:\n check.param_invariant(\n asset_key and asset_key.path,\n "label",\n "Either label or asset_key with a path must be provided",\n )\n label = asset_key.to_string()\n\n if not skip_deprecation_warning:\n warnings.warn("`Materialization` is deprecated; use `AssetMaterialization` instead.")\n\n metadata_entries = check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=MetadataEntry\n )\n\n return super(Materialization, cls).__new__(\n cls,\n label=check.str_param(label, "label"),\n description=check.opt_str_param(description, "description"),\n metadata_entries=check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=MetadataEntry\n ),\n asset_key=asset_key,\n partition=check.opt_str_param(partition, "partition"),\n tags=check.opt_dict_param(tags, "tags"),\n )\n\n @staticmethod\n def file(\n path: str,\n description: Optional[str] = None,\n asset_key: Optional[Union[str, AssetKey]] = None,\n ) -> "Materialization":\n """Static constructor for standard materializations corresponding to files on disk.\n\n Args:\n path (str): The path to the file.\n description (Optional[str]): A human-readable description of the materialization.\n """\n return Materialization(\n label=last_file_comp(path),\n description=description,\n metadata_entries=[MetadataEntry("path", value=MetadataValue.path(path))],\n asset_key=asset_key,\n )\n\n\n
[docs]@whitelist_for_serdes\nclass ExpectationResult(\n NamedTuple(\n "_ExpectationResult",\n [\n ("success", bool),\n ("label", Optional[str]),\n ("description", Optional[str]),\n ("metadata_entries", List[MetadataEntry]),\n ],\n )\n):\n """Event corresponding to a data quality test.\n\n Op compute functions may yield events of this type whenever they wish to indicate to the\n Dagster framework (and the end user) that a data quality test has produced a (positive or\n negative) result.\n\n Args:\n success (bool): Whether the expectation passed or not.\n label (Optional[str]): Short display name for expectation. Defaults to "result".\n description (Optional[str]): A longer human-readable description of the expectation.\n metadata_entries (Optional[List[MetadataEntry]]): Arbitrary metadata about the\n expectation.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary metadata about the failure. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __new__(\n cls,\n success: bool,\n label: Optional[str] = None,\n description: Optional[str] = None,\n metadata_entries: Optional[List[MetadataEntry]] = None,\n metadata: Optional[Dict[str, RawMetadataValue]] = None,\n ):\n metadata_entries = check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=MetadataEntry\n )\n metadata = check.opt_dict_param(metadata, "metadata", key_type=str)\n\n return super(ExpectationResult, cls).__new__(\n cls,\n success=check.bool_param(success, "success"),\n label=check.opt_str_param(label, "label", "result"),\n description=check.opt_str_param(description, "description"),\n metadata_entries=cast(\n List[MetadataEntry], normalize_metadata(metadata, metadata_entries)\n ),\n )
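Editor's note: a minimal sketch of an op emitting a data quality result alongside its output (names are hypothetical):

.. code-block:: python

    from dagster import ExpectationResult, Output, op

    @op
    def validate_rows(rows):
        yield ExpectationResult(
            success=len(rows) > 0,
            label="rows_nonempty",
            metadata={"row_count": len(rows)},
        )
        yield Output(rows)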
\n\n\n
[docs]@whitelist_for_serdes\nclass TypeCheck(\n NamedTuple(\n "_TypeCheck",\n [\n ("success", bool),\n ("description", Optional[str]),\n ("metadata_entries", List[MetadataEntry]),\n ],\n )\n):\n """Event corresponding to a successful typecheck.\n\n Events of this type should be returned by user-defined type checks when they need to encapsulate\n additional metadata about a type check's success or failure. (i.e., when using\n :py:func:`as_dagster_type`, :py:func:`@usable_as_dagster_type <dagster_type>`, or the underlying\n :py:func:`PythonObjectDagsterType` API.)\n\n Solid compute functions should generally avoid yielding events of this type to avoid confusion.\n\n Args:\n success (bool): ``True`` if the type check succeeded, ``False`` otherwise.\n description (Optional[str]): A human-readable description of the type check.\n metadata_entries (Optional[List[MetadataEntry]]): Arbitrary metadata about the\n type check.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary metadata about the failure. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __new__(\n cls,\n success: bool,\n description: Optional[str] = None,\n metadata_entries: Optional[List[MetadataEntry]] = None,\n metadata: Optional[Dict[str, RawMetadataValue]] = None,\n ):\n\n metadata_entries = check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=MetadataEntry\n )\n metadata = check.opt_dict_param(metadata, "metadata", key_type=str)\n\n return super(TypeCheck, cls).__new__(\n cls,\n success=check.bool_param(success, "success"),\n description=check.opt_str_param(description, "description"),\n metadata_entries=cast(\n List[MetadataEntry], normalize_metadata(metadata, metadata_entries)\n ),\n )
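Editor's note: a hedged sketch of returning ``TypeCheck`` from a user-defined type check (the type name and check logic are hypothetical):

.. code-block:: python

    from dagster import DagsterType, TypeCheck

    def positive_int_check(_context, value):
        if not isinstance(value, int):
            return TypeCheck(
                success=False,
                description=f"Expected int, got {type(value).__name__}",
            )
        return TypeCheck(success=value > 0, metadata={"value": value})

    PositiveInt = DagsterType(name="PositiveInt", type_check_fn=positive_int_check)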
\n\n\n
[docs]class Failure(Exception):\n """Event indicating op failure.\n\n Raise events of this type from within op compute functions or custom type checks in order to\n indicate an unrecoverable failure in user code to the Dagster machinery and return\n structured metadata about the failure.\n\n Args:\n description (Optional[str]): A human-readable description of the failure.\n metadata_entries (Optional[List[MetadataEntry]]): Arbitrary metadata about the\n failure.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary metadata about the failure. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __init__(\n self,\n description: Optional[str] = None,\n metadata_entries: Optional[List[MetadataEntry]] = None,\n metadata: Optional[Dict[str, RawMetadataValue]] = None,\n ):\n metadata_entries = check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=MetadataEntry\n )\n metadata = check.opt_dict_param(metadata, "metadata", key_type=str)\n\n super(Failure, self).__init__(description)\n self.description = check.opt_str_param(description, "description")\n self.metadata_entries = normalize_metadata(metadata, metadata_entries)
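Editor's note: a minimal sketch of raising ``Failure`` with structured metadata from op code (the file name and condition are hypothetical):

.. code-block:: python

    from dagster import Failure, op

    @op
    def load_settings():
        settings_path = "settings.yaml"  # hypothetical
        found = False                    # pretend the lookup failed
        if not found:
            raise Failure(
                description="Required settings file is missing",
                metadata={"settings_path": settings_path},
            )
        return settings_path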
\n\n\n
[docs]class RetryRequested(Exception):\n """\n An exception to raise from an op to indicate that it should be retried.\n\n Args:\n max_retries (Optional[int]):\n The max number of retries this step should attempt before failing\n seconds_to_wait (Optional[Union[float,int]]):\n Seconds to wait before restarting the step after putting the step in\n to the up_for_retry state\n\n Example:\n\n .. code-block:: python\n\n @op\n def flakes():\n try:\n flakey_operation()\n except Exception as e:\n raise RetryRequested(max_retries=3) from e\n """\n\n def __init__(\n self, max_retries: Optional[int] = 1, seconds_to_wait: Optional[Union[float, int]] = None\n ):\n super(RetryRequested, self).__init__()\n self.max_retries = check.int_param(max_retries, "max_retries")\n self.seconds_to_wait = check.opt_numeric_param(seconds_to_wait, "seconds_to_wait")
\n\n\nclass ObjectStoreOperationType(Enum):\n SET_OBJECT = "SET_OBJECT"\n GET_OBJECT = "GET_OBJECT"\n RM_OBJECT = "RM_OBJECT"\n CP_OBJECT = "CP_OBJECT"\n\n\nclass ObjectStoreOperation(\n NamedTuple(\n "_ObjectStoreOperation",\n [\n ("op", ObjectStoreOperationType),\n ("key", str),\n ("dest_key", Optional[str]),\n ("obj", Any),\n ("serialization_strategy_name", Optional[str]),\n ("object_store_name", Optional[str]),\n ("value_name", Optional[str]),\n ("version", Optional[str]),\n ("mapping_key", Optional[str]),\n ],\n )\n):\n """This event is used internally by Dagster machinery when values are written to and read from\n an ObjectStore.\n\n Users should not import this class or yield events of this type from user code.\n\n Args:\n op (ObjectStoreOperationType): The type of the operation on the object store.\n key (str): The key of the object on which the operation was performed.\n dest_key (Optional[str]): The destination key, if any, to which the object was copied.\n obj (Any): The object, if any, retrieved by the operation.\n serialization_strategy_name (Optional[str]): The name of the serialization strategy, if any,\n employed by the operation\n object_store_name (Optional[str]): The name of the object store that performed the\n operation.\n value_name (Optional[str]): The name of the input/output\n version (Optional[str]): (Experimental) The version of the stored data.\n mapping_key (Optional[str]): The mapping key when a dynamic output is used.\n """\n\n def __new__(\n cls,\n op: ObjectStoreOperationType,\n key: str,\n dest_key: Optional[str] = None,\n obj: Any = None,\n serialization_strategy_name: Optional[str] = None,\n object_store_name: Optional[str] = None,\n value_name: Optional[str] = None,\n version: Optional[str] = None,\n mapping_key: Optional[str] = None,\n ):\n return super(ObjectStoreOperation, cls).__new__(\n cls,\n op=op,\n key=check.str_param(key, "key"),\n dest_key=check.opt_str_param(dest_key, "dest_key"),\n obj=obj,\n serialization_strategy_name=check.opt_str_param(\n serialization_strategy_name, "serialization_strategy_name"\n ),\n object_store_name=check.opt_str_param(object_store_name, "object_store_name"),\n value_name=check.opt_str_param(value_name, "value_name"),\n version=check.opt_str_param(version, "version"),\n mapping_key=check.opt_str_param(mapping_key, "mapping_key"),\n )\n\n @classmethod\n def serializable(cls, inst, **kwargs):\n return cls(\n **dict(\n {\n "op": inst.op.value,\n "key": inst.key,\n "dest_key": inst.dest_key,\n "obj": None,\n "serialization_strategy_name": inst.serialization_strategy_name,\n "object_store_name": inst.object_store_name,\n "value_name": inst.value_name,\n "version": inst.version,\n },\n **kwargs,\n )\n )\n\n\nclass HookExecutionResult(\n NamedTuple("_HookExecutionResult", [("hook_name", str), ("is_skipped", bool)])\n):\n """This event is used internally to indicate the execution result of a hook, e.g. whether the\n user-defined hook function is skipped.\n\n Args:\n hook_name (str): The name of the hook.\n is_skipped (bool): ``False`` if the hook_fn is executed, ``True`` otheriwse.\n """\n\n def __new__(cls, hook_name: str, is_skipped: Optional[bool] = None):\n return super(HookExecutionResult, cls).__new__(\n cls,\n hook_name=check.str_param(hook_name, "hook_name"),\n is_skipped=cast(bool, check.opt_bool_param(is_skipped, "is_skipped", default=False)),\n )\n\n\nUserEvent = Union[Materialization, AssetMaterialization, AssetObservation, ExpectationResult]\n
", "current_page_name": "_modules/dagster/core/definitions/events", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.events"}, "executor_definition": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.executor_definition

\nfrom enum import Enum as PyEnum\nfrom functools import update_wrapper\nfrom typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union, overload\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster.builtins import Int\nfrom dagster.config import Field, Selector\nfrom dagster.config.config_schema import ConfigSchemaType\nfrom dagster.core.definitions.configurable import (\n    ConfiguredDefinitionConfigSchema,\n    NamedConfigurableDefinition,\n)\nfrom dagster.core.definitions.pipeline_base import IPipeline\nfrom dagster.core.definitions.reconstruct import ReconstructablePipeline\nfrom dagster.core.errors import DagsterUnmetExecutorRequirementsError\nfrom dagster.core.execution.retries import RetryMode, get_retries_config\n\nfrom .definition_config_schema import (\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\n\nif TYPE_CHECKING:\n    from dagster.core.executor.base import Executor\n    from dagster.core.executor.in_process import InProcessExecutor\n    from dagster.core.executor.init import InitExecutorContext\n    from dagster.core.executor.multiprocess import MultiprocessExecutor\n    from dagster.core.instance import DagsterInstance\n\n\nclass ExecutorRequirement(PyEnum):\n    """\n    An ExecutorDefinition can include a list of requirements that the system uses to\n    check whether the executor will be able to work for a particular job/pipeline execution.\n    """\n\n    # The passed in IPipeline must be reconstructable across process boundaries\n    RECONSTRUCTABLE_PIPELINE = "RECONSTRUCTABLE_PIPELINE"  # This needs to still exist for folks who may have written their own executor\n    RECONSTRUCTABLE_JOB = "RECONSTRUCTABLE_PIPELINE"\n\n    # The DagsterInstance must be loadable in a different process\n    NON_EPHEMERAL_INSTANCE = "NON_EPHEMERAL_INSTANCE"\n\n    # Any solid outputs on the pipeline must be persisted\n    PERSISTENT_OUTPUTS = "PERSISTENT_OUTPUTS"\n\n\ndef multiple_process_executor_requirements() -> List[ExecutorRequirement]:\n    return [\n        ExecutorRequirement.RECONSTRUCTABLE_JOB,\n        ExecutorRequirement.NON_EPHEMERAL_INSTANCE,\n        ExecutorRequirement.PERSISTENT_OUTPUTS,\n    ]\n\n\nExecutorConfig = Dict[str, object]\nExecutorCreationFunction: TypeAlias = Callable[["InitExecutorContext"], "Executor"]\nExecutorRequirementsFunction: TypeAlias = Callable[[ExecutorConfig], List[ExecutorRequirement]]\n\n\n
[docs]class ExecutorDefinition(NamedConfigurableDefinition):\n """\n Args:\n name (str): The name of the executor.\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data\n available in `init_context.executor_config`. If not set, Dagster will accept any config\n provided.\n requirements (Optional[List[ExecutorRequirement]]): Any requirements that must\n be met in order for the executor to be usable for a particular pipeline execution.\n executor_creation_fn(Optional[Callable]): Should accept an :py:class:`InitExecutorContext`\n and return an instance of :py:class:`Executor`\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by the\n executor.\n """\n\n def __init__(\n self,\n name: str,\n config_schema: Optional[ConfigSchemaType] = None,\n requirements: Union[\n ExecutorRequirementsFunction, Optional[List[ExecutorRequirement]]\n ] = None,\n executor_creation_fn: Optional[ExecutorCreationFunction] = None,\n description: Optional[str] = None,\n ):\n self._name = check.str_param(name, "name")\n self._requirements_fn: ExecutorRequirementsFunction\n if callable(requirements):\n self._requirements_fn = requirements\n else:\n requirements_lst = check.opt_list_param(\n requirements, "requirements", of_type=ExecutorRequirement\n )\n self._requirements_fn = lambda _: requirements_lst\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._executor_creation_fn = check.opt_callable_param(\n executor_creation_fn, "executor_creation_fn"\n )\n self._description = check.opt_str_param(description, "description")\n\n @property\n def name(self) -> str:\n return self._name\n\n @property\n def description(self) -> Optional[str]:\n return self._description\n\n @property\n def config_schema(self) -> IDefinitionConfigSchema:\n return self._config_schema\n\n def get_requirements(self, executor_config: Dict[str, object]) -> List[ExecutorRequirement]:\n return self._requirements_fn(executor_config)\n\n @property\n def executor_creation_fn(self) -> Optional[ExecutorCreationFunction]:\n return self._executor_creation_fn\n\n def copy_for_configured(self, name, description, config_schema, _) -> "ExecutorDefinition":\n return ExecutorDefinition(\n name=name,\n config_schema=config_schema,\n executor_creation_fn=self.executor_creation_fn,\n description=description or self.description,\n requirements=self._requirements_fn,\n )\n\n # Backcompat: Overrides configured method to provide name as a keyword argument.\n # If no name is provided, the name is pulled off of this ExecutorDefinition.\n
[docs] def configured(\n self,\n config_or_config_fn: Any,\n name: Optional[str] = None,\n config_schema: Optional[Dict[str, Any]] = None,\n description: Optional[str] = None,\n ):\n """\n Wraps this object in an object of the same type that provides configuration to the inner\n object.\n\n Args:\n config_or_config_fn (Union[Any, Callable[[Any], Any]]): Either (1) Run configuration\n that fully satisfies this object's config schema or (2) A function that accepts run\n configuration and returns run configuration that fully satisfies this object's\n config schema. In the latter case, config_schema must be specified. When\n passing a function, it's easiest to use :py:func:`configured`.\n name (Optional[str]): Name of the new definition. If not provided, the emitted\n definition will inherit the name of the `ExecutorDefinition` upon which this\n function is called.\n config_schema (Optional[ConfigSchema]): If config_or_config_fn is a function, the config\n schema that its input must satisfy. If not set, Dagster will accept any config\n provided.\n description (Optional[str]): Description of the new definition. If not specified,\n inherits the description of the definition being configured.\n\n Returns (ConfigurableDefinition): A configured version of this object.\n """\n\n name = check.opt_str_param(name, "name")\n\n new_config_schema = ConfiguredDefinitionConfigSchema(\n self, convert_user_facing_definition_config_schema(config_schema), config_or_config_fn\n )\n\n return self.copy_for_configured(\n name or self.name, description, new_config_schema, config_or_config_fn\n )
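Editor's note: a hedged sketch of deriving a named, pre-configured executor from the built-in ``multiprocess_executor`` and attaching it to a job (the chosen name and concurrency are hypothetical):

.. code-block:: python

    from dagster import job, multiprocess_executor, op

    four_proc_executor = multiprocess_executor.configured(
        {"max_concurrent": 4}, name="four_proc_executor"
    )

    @op
    def do_work():
        ...

    @job(executor_def=four_proc_executor)
    def configured_job():
        do_work()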
\n\n\n@overload\ndef executor(name: ExecutorCreationFunction) -> ExecutorDefinition:\n ...\n\n\n@overload\ndef executor(\n name: Optional[str] = ...,\n config_schema: Optional[ConfigSchemaType] = ...,\n requirements: Optional[Union[ExecutorRequirementsFunction, List[ExecutorRequirement]]] = ...,\n) -> "_ExecutorDecoratorCallable":\n ...\n\n\n
[docs]def executor(\n name: Union[ExecutorCreationFunction, Optional[str]] = None,\n config_schema: Optional[ConfigSchemaType] = None,\n requirements: Optional[Union[ExecutorRequirementsFunction, List[ExecutorRequirement]]] = None,\n) -> Union[ExecutorDefinition, "_ExecutorDecoratorCallable"]:\n """Define an executor.\n\n The decorated function should accept an :py:class:`InitExecutorContext` and return an instance\n of :py:class:`Executor`.\n\n Args:\n name (Optional[str]): The name of the executor.\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.executor_config`. If not set, Dagster will accept any config provided for.\n requirements (Optional[List[ExecutorRequirement]]): Any requirements that must\n be met in order for the executor to be usable for a particular pipeline execution.\n """\n if callable(name):\n check.invariant(config_schema is None)\n check.invariant(requirements is None)\n return _ExecutorDecoratorCallable()(name)\n\n return _ExecutorDecoratorCallable(\n name=name, config_schema=config_schema, requirements=requirements\n )
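Editor's note: a minimal sketch of a custom executor definition that simply reuses the stock in-process ``Executor`` implementation (an illustration only; the internal import paths mirror those used elsewhere in this module):

.. code-block:: python

    from dagster import executor
    from dagster.core.execution.retries import RetryMode
    from dagster.core.executor.in_process import InProcessExecutor

    @executor(name="my_in_process")
    def my_in_process_executor(_init_context):
        # the decorated function must return an Executor instance
        return InProcessExecutor(retries=RetryMode.ENABLED, marker_to_close=None)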
\n\n\nclass _ExecutorDecoratorCallable:\n def __init__(self, name=None, config_schema=None, requirements=None):\n self.name = check.opt_str_param(name, "name")\n self.config_schema = config_schema # type check in definition\n self.requirements = requirements\n\n def __call__(self, fn: ExecutorCreationFunction) -> ExecutorDefinition:\n check.callable_param(fn, "fn")\n\n if not self.name:\n self.name = fn.__name__\n\n executor_def = ExecutorDefinition(\n name=self.name,\n config_schema=self.config_schema,\n executor_creation_fn=fn,\n requirements=self.requirements,\n )\n\n update_wrapper(executor_def, wrapped=fn)\n\n return executor_def\n\n\ndef _core_in_process_executor_creation(config: ExecutorConfig) -> "InProcessExecutor":\n from dagster.core.executor.in_process import InProcessExecutor\n\n return InProcessExecutor(\n # shouldn't need to .get() here - issue with defaults in config setup\n retries=RetryMode.from_config(check.dict_elem(config, "retries")),\n marker_to_close=config.get("marker_to_close"),\n )\n\n\nIN_PROC_CONFIG = {\n "retries": get_retries_config(),\n "marker_to_close": Field(str, is_required=False),\n}\n\n\n
[docs]@executor(\n name="in_process",\n config_schema=IN_PROC_CONFIG,\n)\ndef in_process_executor(init_context):\n """The in-process executor executes all steps in a single process.\n\n For legacy pipelines, this will be the default executor. To select it explicitly,\n include the following top-level fragment in config:\n\n .. code-block:: yaml\n\n execution:\n in_process:\n\n Execution priority can be configured using the ``dagster/priority`` tag via solid/op metadata,\n where the higher the number the higher the priority. 0 is the default and both positive\n and negative numbers can be used.\n """\n return _core_in_process_executor_creation(init_context.executor_config)
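Editor's note: besides selecting it through run config as shown above, the executor can also be attached directly to a job (a sketch; the op is hypothetical):

.. code-block:: python

    from dagster import in_process_executor, job, op

    @op
    def single_step():
        return 1

    @job(executor_def=in_process_executor)
    def in_process_job():
        single_step()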
\n\n\n@executor(name="execute_in_process_executor")\ndef execute_in_process_executor(_) -> "InProcessExecutor":\n """Executor used by execute_in_process.\n\n Use of this executor triggers special behavior in the config system that ignores all incoming\n executor config. This is because someone might set executor config on a job, and when we foist\n this executor onto the job for `execute_in_process`, that config becomes nonsensical.\n """\n from dagster.core.executor.in_process import InProcessExecutor\n\n return InProcessExecutor(\n retries=RetryMode.ENABLED,\n marker_to_close=None,\n )\n\n\ndef _core_multiprocess_executor_creation(config: ExecutorConfig) -> "MultiprocessExecutor":\n from dagster.core.executor.multiprocess import MultiprocessExecutor\n\n # unpack optional selector\n start_method = None\n start_cfg: Dict[str, object] = {}\n start_selector = check.opt_dict_elem(config, "start_method")\n if start_selector:\n start_method, start_cfg = list(start_selector.items())[0]\n\n return MultiprocessExecutor(\n max_concurrent=check.int_elem(config, "max_concurrent"),\n retries=RetryMode.from_config(check.dict_elem(config, "retries")), # type: ignore\n start_method=start_method,\n explicit_forkserver_preload=check.opt_list_elem(start_cfg, "preload_modules", of_type=str),\n )\n\n\nMULTI_PROC_CONFIG = {\n "max_concurrent": Field(Int, is_required=False, default_value=0),\n "start_method": Field(\n Selector(\n {\n "spawn": {},\n "forkserver": {\n "preload_modules": Field(\n [str],\n is_required=False,\n description="Explicit modules to preload in the forkserver.",\n ),\n },\n # fork currently unsupported due to threads usage\n }\n ),\n is_required=False,\n description=(\n "Select how subprocesses are created. Defaults to spawn.\\n"\n "When forkserver is selected, set_forkserver_preload will be called with either:\\n"\n "* the preload_modules list if provided by config\\n"\n "* the module containing the Job if it was loaded from a module\\n"\n "* dagster\\n"\n "https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods"\n ),\n ),\n "retries": get_retries_config(),\n}\n\n\n
[docs]@executor(\n name="multiprocess",\n config_schema=MULTI_PROC_CONFIG,\n requirements=multiple_process_executor_requirements(),\n)\ndef multiprocess_executor(init_context):\n """The multiprocess executor executes each step in an individual process.\n\n Any job that does not specify custom executors will use the multiprocess_executor by default.\n For jobs or legacy pipelines, to configure the multiprocess executor, include a fragment such\n as the following in your run config:\n\n .. code-block:: yaml\n\n execution:\n config:\n multiprocess:\n max_concurrent: 4\n\n The ``max_concurrent`` arg is optional and tells the execution engine how many processes may run\n concurrently. By default, or if you set ``max_concurrent`` to be 0, this is the return value of\n :py:func:`python:multiprocessing.cpu_count`.\n\n Execution priority can be configured using the ``dagster/priority`` tag via solid/op metadata,\n where the higher the number the higher the priority. 0 is the default and both positive\n and negative numbers can be used.\n """\n return _core_multiprocess_executor_creation(init_context.executor_config)
\n\n\ndefault_executors = [in_process_executor, multiprocess_executor]\n\n\ndef check_cross_process_constraints(init_context: "InitExecutorContext") -> None:\n from dagster.core.executor.init import InitExecutorContext\n\n check.inst_param(init_context, "init_context", InitExecutorContext)\n requirements_lst = init_context.executor_def.get_requirements(init_context.executor_config)\n\n if ExecutorRequirement.RECONSTRUCTABLE_JOB in requirements_lst:\n _check_intra_process_pipeline(init_context.pipeline)\n\n if ExecutorRequirement.NON_EPHEMERAL_INSTANCE in requirements_lst:\n _check_non_ephemeral_instance(init_context.instance)\n\n\ndef _check_intra_process_pipeline(pipeline: IPipeline) -> None:\n from dagster.core.definitions import JobDefinition\n\n if not isinstance(pipeline, ReconstructablePipeline):\n target = "job" if isinstance(pipeline.get_definition(), JobDefinition) else "pipeline"\n raise DagsterUnmetExecutorRequirementsError(\n 'You have attempted to use an executor that uses multiple processes with the {target} "{name}" '\n "that is not reconstructable. {target_cap} must be loaded in a way that allows dagster to reconstruct "\n "them in a new process. This means: \\n"\n " * using the file, module, or repository.yaml arguments of dagit/dagster-graphql/dagster\\n"\n " * loading the {target} through the reconstructable() function\\n".format(\n target=target, name=pipeline.get_definition().name, target_cap=target.capitalize()\n )\n )\n\n\ndef _check_non_ephemeral_instance(instance: "DagsterInstance") -> None:\n if instance.is_ephemeral:\n raise DagsterUnmetExecutorRequirementsError(\n "You have attempted to use an executor that uses multiple processes with an "\n "ephemeral DagsterInstance. A non-ephemeral instance is needed to coordinate "\n "execution between multiple processes. You can configure your default instance "\n "via $DAGSTER_HOME or ensure a valid one is passed when invoking the python APIs. "\n "You can learn more about setting up a persistent DagsterInstance from the "\n "DagsterInstance docs here: https://docs.dagster.io/deployment/dagster-instance#default-local-behavior"\n )\n\n\ndef _get_default_executor_requirements(\n executor_config: ExecutorConfig,\n) -> List[ExecutorRequirement]:\n return multiple_process_executor_requirements() if "multiprocess" in executor_config else []\n\n\n@executor(\n name="multi_or_in_process_executor",\n config_schema=Field(\n Selector(\n {"multiprocess": MULTI_PROC_CONFIG, "in_process": IN_PROC_CONFIG},\n ),\n default_value={"multiprocess": {}},\n ),\n requirements=_get_default_executor_requirements,\n)\ndef multi_or_in_process_executor(init_context: "InitExecutorContext") -> "Executor":\n """The default executor for a job.\n\n This is the executor available by default on a :py:class:`JobDefinition`\n that does not provide custom executors. This executor has a multiprocessing-enabled mode, and a\n single-process mode. By default, multiprocessing mode is enabled. Switching between multiprocess\n mode and in-process mode can be achieved via config.\n\n .. code-block:: yaml\n\n execution:\n config:\n multiprocess:\n\n\n execution:\n config:\n in_process:\n\n When using the multiprocess mode, ``max_concurrent`` and ``retries`` can also be configured.\n\n\n multiprocess:\n config:\n max_concurrent: 4\n retries:\n enabled:\n\n The ``max_concurrent`` arg is optional and tells the execution engine how many processes may run\n concurrently. 
By default, or if you set ``max_concurrent`` to be 0, this is the return value of\n :py:func:`python:multiprocessing.cpu_count`.\n\n When using the in_process mode, only retries can be configured.\n\n Execution priority can be configured using the ``dagster/priority`` tag via solid metadata,\n where the higher the number the higher the priority. 0 is the default and both positive\n and negative numbers can be used.\n """\n if "multiprocess" in init_context.executor_config:\n return _core_multiprocess_executor_creation(\n check.dict_elem(init_context.executor_config, "multiprocess")\n )\n else:\n return _core_in_process_executor_creation(\n check.dict_elem(init_context.executor_config, "in_process")\n )\n
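For reference, a sketch of the two selector branches above expressed as Python run-config dictionaries (derived from the ``Selector`` schema in the decorator; the op and job names are illustrative):

.. code-block:: python

    from dagster import job, op

    @op
    def say_hello():
        return "hello"

    # With no executor_def supplied, a job uses multi_or_in_process_executor
    # and defaults to the multiprocess branch of the selector.
    @job
    def hello_job():
        say_hello()

    # Multiprocess branch, with the optional max_concurrent knob.
    MULTIPROCESS_CONFIG = {"execution": {"config": {"multiprocess": {"max_concurrent": 4}}}}

    # In-process branch: every step runs in a single process.
    IN_PROCESS_CONFIG = {"execution": {"config": {"in_process": {}}}}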
", "current_page_name": "_modules/dagster/core/definitions/executor_definition", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.executor_definition"}, "graph_definition": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.graph_definition

\nfrom collections import OrderedDict\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Set,\n    Tuple,\n    Union,\n    cast,\n)\n\nfrom toposort import CircularDependencyError, toposort_flatten\n\nimport dagster._check as check\nfrom dagster.config import Field, Shape\nfrom dagster.config.config_type import ConfigType\nfrom dagster.config.validate import validate_config\nfrom dagster.core.definitions.config import ConfigMapping\nfrom dagster.core.definitions.definition_config_schema import IDefinitionConfigSchema\nfrom dagster.core.definitions.policy import RetryPolicy\nfrom dagster.core.definitions.resource_definition import ResourceDefinition\nfrom dagster.core.definitions.utils import check_valid_name\nfrom dagster.core.errors import DagsterInvalidConfigError, DagsterInvalidDefinitionError\nfrom dagster.core.selector.subset_selector import AssetSelectionData\nfrom dagster.core.storage.io_manager import io_manager\nfrom dagster.core.types.dagster_type import (\n    DagsterType,\n    DagsterTypeKind,\n    construct_dagster_type_dictionary,\n)\nfrom dagster.utils import merge_dicts\n\nfrom .dependency import (\n    DependencyStructure,\n    IDependencyDefinition,\n    Node,\n    NodeHandle,\n    NodeInvocation,\n    SolidInputHandle,\n)\nfrom .hook_definition import HookDefinition\nfrom .input import FanInInputPointer, InputDefinition, InputMapping, InputPointer\nfrom .logger_definition import LoggerDefinition\nfrom .node_definition import NodeDefinition\nfrom .output import OutputDefinition, OutputMapping\nfrom .preset import PresetDefinition\nfrom .solid_container import create_execution_structure, validate_dependency_dict\nfrom .version_strategy import VersionStrategy\n\nif TYPE_CHECKING:\n    from dagster.core.execution.execute_in_process_result import ExecuteInProcessResult\n    from dagster.core.instance import DagsterInstance\n\n    from .asset_layer import AssetLayer\n    from .executor_definition import ExecutorDefinition\n    from .job_definition import JobDefinition\n    from .partition import PartitionedConfig, PartitionsDefinition\n    from .solid_definition import SolidDefinition\n\n\ndef _check_node_defs_arg(graph_name: str, node_defs: Optional[List[NodeDefinition]]):\n    node_defs = node_defs or []\n\n    if not isinstance(node_defs, list):\n        raise DagsterInvalidDefinitionError(\n            '"nodes" arg to "{name}" is not a list. Got {val}.'.format(\n                name=graph_name, val=repr(node_defs)\n            )\n        )\n    for node_def in node_defs:\n        if isinstance(node_def, NodeDefinition):\n            continue\n        elif callable(node_def):\n            raise DagsterInvalidDefinitionError(\n                """You have passed a lambda or function {func} into {name} that is\n                not a node. 
You have likely forgotten to annotate this function with\n                the @op or @graph decorators.\n                """.format(\n                    name=graph_name, func=node_def.__name__\n                )\n            )\n        else:\n            raise DagsterInvalidDefinitionError(\n                "Invalid item in node list: {item}".format(item=repr(node_def))\n            )\n\n    return node_defs\n\n\ndef _create_adjacency_lists(\n    solids: List[Node],\n    dep_structure: DependencyStructure,\n) -> Tuple[Dict[str, Set[Node]], Dict[str, Set[Node]]]:\n    visit_dict = {s.name: False for s in solids}\n    forward_edges: Dict[str, Set[Node]] = {s.name: set() for s in solids}\n    backward_edges: Dict[str, Set[Node]] = {s.name: set() for s in solids}\n\n    def visit(solid_name):\n        if visit_dict[solid_name]:\n            return\n\n        visit_dict[solid_name] = True\n\n        for output_handle in dep_structure.all_upstream_outputs_from_solid(solid_name):\n            forward_node = output_handle.solid.name\n            backward_node = solid_name\n            if forward_node in forward_edges:\n                forward_edges[forward_node].add(backward_node)\n                backward_edges[backward_node].add(forward_node)\n                visit(forward_node)\n\n    for s in solids:\n        visit(s.name)\n\n    return (forward_edges, backward_edges)\n\n\n
[docs]class GraphDefinition(NodeDefinition):\n """Defines a Dagster graph.\n\n A graph is made up of\n\n - Nodes, which can either be an op (the functional unit of computation), or another graph.\n - Dependencies, which determine how the values produced by nodes as outputs flow from\n one node to another. This tells Dagster how to arrange nodes into a directed, acyclic graph\n (DAG) of compute.\n\n End users should prefer the :func:`@graph <graph>` decorator. GraphDefinition is generally\n intended to be used by framework authors or for programmatically generated graphs.\n\n Args:\n name (str): The name of the graph. Must be unique within any :py:class:`GraphDefinition`\n or :py:class:`JobDefinition` containing the graph.\n description (Optional[str]): A human-readable description of the graph.\n node_defs (Optional[List[NodeDefinition]]): The set of ops / graphs used in this graph.\n dependencies (Optional[Dict[Union[str, NodeInvocation], Dict[str, DependencyDefinition]]]):\n A structure that declares the dependencies of each op's inputs on the outputs of other\n ops in the graph. Keys of the top level dict are either the string names of ops in the\n graph or, in the case of aliased ops, :py:class:`NodeInvocations <NodeInvocation>`.\n Values of the top level dict are themselves dicts, which map input names belonging to\n the op or aliased op to :py:class:`DependencyDefinitions <DependencyDefinition>`.\n input_mappings (Optional[List[InputMapping]]): Defines the inputs to the nested graph, and\n how they map to the inputs of its constituent ops.\n output_mappings (Optional[List[OutputMapping]]): Defines the outputs of the nested graph,\n and how they map from the outputs of its constituent ops.\n config (Optional[ConfigMapping]): Defines the config of the graph, and how its schema maps\n to the config of its constituent ops.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for any execution of the graph.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag\n values provided at invocation time.\n\n Examples:\n\n .. 
code-block:: python\n\n @op\n def return_one():\n return 1\n\n @op\n def add_one(num):\n return num + 1\n\n graph_def = GraphDefinition(\n name='basic',\n node_defs=[return_one, add_one],\n dependencies={'add_one': {'num': DependencyDefinition('return_one')}},\n )\n """\n\n def __init__(\n self,\n name: str,\n description: Optional[str] = None,\n node_defs: Optional[List[NodeDefinition]] = None,\n dependencies: Optional[\n Dict[Union[str, NodeInvocation], Dict[str, IDependencyDefinition]]\n ] = None,\n input_mappings: Optional[List[InputMapping]] = None,\n output_mappings: Optional[List[OutputMapping]] = None,\n config: Optional[ConfigMapping] = None,\n tags: Optional[Dict[str, Any]] = None,\n **kwargs,\n ):\n self._node_defs = _check_node_defs_arg(name, node_defs)\n self._dagster_type_dict = construct_dagster_type_dictionary(self._node_defs)\n self._dependencies = validate_dependency_dict(dependencies)\n self._dependency_structure, self._node_dict = create_execution_structure(\n self._node_defs, self._dependencies, graph_definition=self\n )\n\n # List[InputMapping]\n self._input_mappings, input_defs = _validate_in_mappings(\n check.opt_list_param(input_mappings, "input_mappings"),\n self._node_dict,\n self._dependency_structure,\n name,\n class_name=type(self).__name__,\n )\n # List[OutputMapping]\n self._output_mappings = _validate_out_mappings(\n check.opt_list_param(output_mappings, "output_mappings"),\n self._node_dict,\n self._dependency_structure,\n name,\n class_name=type(self).__name__,\n )\n\n self._config_mapping = check.opt_inst_param(config, "config", ConfigMapping)\n\n super(GraphDefinition, self).__init__(\n name=name,\n description=description,\n input_defs=input_defs,\n output_defs=[output_mapping.definition for output_mapping in self._output_mappings],\n tags=tags,\n **kwargs,\n )\n\n # must happen after base class construction as properties are assumed to be there\n # eager computation to detect cycles\n self.solids_in_topological_order = self._solids_in_topological_order()\n\n def _solids_in_topological_order(self):\n\n _forward_edges, backward_edges = _create_adjacency_lists(\n self.solids, self.dependency_structure\n )\n\n try:\n order = toposort_flatten(backward_edges)\n except CircularDependencyError as err:\n raise DagsterInvalidDefinitionError(str(err)) from err\n\n return [self.solid_named(solid_name) for solid_name in order]\n\n @property\n def node_type_str(self) -> str:\n return "graph"\n\n @property\n def is_graph_job_op_node(self) -> bool:\n return True\n\n @property\n def solids(self) -> List[Node]:\n return list(set(self._node_dict.values()))\n\n @property\n def node_dict(self) -> Dict[str, Node]:\n return self._node_dict\n\n @property\n def node_defs(self) -> List[NodeDefinition]:\n return self._node_defs\n\n def has_solid_named(self, name: str) -> bool:\n check.str_param(name, "name")\n return name in self._node_dict\n\n def solid_named(self, name: str) -> Node:\n check.str_param(name, "name")\n check.invariant(\n name in self._node_dict,\n "{graph_name} has no solid named {name}.".format(graph_name=self._name, name=name),\n )\n\n return self._node_dict[name]\n\n def get_solid(self, handle: NodeHandle) -> Node:\n check.inst_param(handle, "handle", NodeHandle)\n current = handle\n lineage = []\n while current:\n lineage.append(current.name)\n current = current.parent\n\n name = lineage.pop()\n solid = self.solid_named(name)\n while lineage:\n name = lineage.pop()\n solid = solid.definition.solid_named(name)\n\n return solid\n\n def iterate_node_defs(self) 
-> Iterator[NodeDefinition]:\n yield self\n for outer_node_def in self._node_defs:\n yield from outer_node_def.iterate_node_defs()\n\n def iterate_solid_defs(self) -> Iterator["SolidDefinition"]:\n for outer_node_def in self._node_defs:\n yield from outer_node_def.iterate_solid_defs()\n\n @property\n def input_mappings(self) -> List[InputMapping]:\n return self._input_mappings\n\n @property\n def output_mappings(self) -> List[OutputMapping]:\n return self._output_mappings\n\n @property\n def config_mapping(self) -> Optional[ConfigMapping]:\n return self._config_mapping\n\n @property\n def has_config_mapping(self) -> bool:\n return self._config_mapping is not None\n\n def all_dagster_types(self) -> Iterable[DagsterType]:\n return self._dagster_type_dict.values()\n\n def has_dagster_type(self, name):\n check.str_param(name, "name")\n return name in self._dagster_type_dict\n\n def dagster_type_named(self, name):\n check.str_param(name, "name")\n return self._dagster_type_dict[name]\n\n def get_input_mapping(self, input_name: str) -> InputMapping:\n\n check.str_param(input_name, "input_name")\n for mapping in self._input_mappings:\n if mapping.definition.name == input_name:\n return mapping\n check.failed(f"Could not find input mapping {input_name}")\n\n def input_mapping_for_pointer(\n self, pointer: Union[InputPointer, FanInInputPointer]\n ) -> Optional[InputMapping]:\n check.inst_param(pointer, "pointer", (InputPointer, FanInInputPointer))\n\n for mapping in self._input_mappings:\n if mapping.maps_to == pointer:\n return mapping\n return None\n\n def get_output_mapping(self, output_name: str) -> OutputMapping:\n check.str_param(output_name, "output_name")\n for mapping in self._output_mappings:\n if mapping.definition.name == output_name:\n return mapping\n check.failed(f"Could not find output mapping {output_name}")\n\n def resolve_output_to_origin(\n self, output_name: str, handle: NodeHandle\n ) -> Tuple[OutputDefinition, NodeHandle]:\n check.str_param(output_name, "output_name")\n check.inst_param(handle, "handle", NodeHandle)\n\n mapping = self.get_output_mapping(output_name)\n check.invariant(mapping, "Can only resolve outputs for valid output names")\n mapped_solid = self.solid_named(mapping.maps_from.solid_name)\n return mapped_solid.definition.resolve_output_to_origin(\n mapping.maps_from.output_name,\n NodeHandle(mapped_solid.name, handle),\n )\n\n def default_value_for_input(self, input_name: str) -> Any:\n check.str_param(input_name, "input_name")\n\n # base case\n if self.input_def_named(input_name).has_default_value:\n return self.input_def_named(input_name).default_value\n\n mapping = self.get_input_mapping(input_name)\n check.invariant(mapping, "Can only resolve inputs for valid input names")\n mapped_solid = self.solid_named(mapping.maps_to.solid_name)\n\n return mapped_solid.definition.default_value_for_input(mapping.maps_to.input_name)\n\n def input_has_default(self, input_name: str) -> bool:\n check.str_param(input_name, "input_name")\n\n # base case\n if self.input_def_named(input_name).has_default_value:\n return True\n\n mapping = self.get_input_mapping(input_name)\n check.invariant(mapping, "Can only resolve inputs for valid input names")\n mapped_solid = self.solid_named(mapping.maps_to.solid_name)\n\n return mapped_solid.definition.input_has_default(mapping.maps_to.input_name)\n\n @property\n def dependencies(self) -> Dict[Union[str, NodeInvocation], Dict[str, IDependencyDefinition]]:\n return self._dependencies\n\n @property\n def dependency_structure(self) -> 
DependencyStructure:\n return self._dependency_structure\n\n @property\n def config_schema(self) -> Optional[IDefinitionConfigSchema]:\n return self.config_mapping.config_schema if self.config_mapping is not None else None\n\n def input_supports_dynamic_output_dep(self, input_name: str) -> bool:\n mapping = self.get_input_mapping(input_name)\n target_node = mapping.maps_to.solid_name\n # check if input mapped to solid which is downstream of another dynamic output within\n if self.dependency_structure.is_dynamic_mapped(target_node):\n return False\n\n # check if input mapped to solid which starts new dynamic downstream\n if self.dependency_structure.has_dynamic_downstreams(target_node):\n return False\n\n return self.solid_named(target_node).definition.input_supports_dynamic_output_dep(\n mapping.maps_to.input_name\n )\n\n def copy_for_configured(\n self,\n name: str,\n description: Optional[str],\n config_schema: Any,\n config_or_config_fn: Any,\n ):\n if not self.has_config_mapping:\n raise DagsterInvalidDefinitionError(\n "Only graphs utilizing config mapping can be pre-configured. The graph "\n '"{graph_name}" does not have a config mapping, and thus has nothing to be '\n "configured.".format(graph_name=self.name)\n )\n config_mapping = cast(ConfigMapping, self.config_mapping)\n return GraphDefinition(\n name=name,\n description=check.opt_str_param(description, "description", default=self.description),\n node_defs=self._node_defs,\n dependencies=self._dependencies,\n input_mappings=self._input_mappings,\n output_mappings=self._output_mappings,\n config=ConfigMapping(\n config_mapping.config_fn,\n config_schema=config_schema,\n receive_processed_config_values=config_mapping.receive_processed_config_values,\n ),\n )\n\n def node_names(self):\n return list(self._node_dict.keys())\n\n
[docs] def to_job(\n self,\n name: Optional[str] = None,\n description: Optional[str] = None,\n resource_defs: Optional[Dict[str, ResourceDefinition]] = None,\n config: Optional[Union[ConfigMapping, Dict[str, Any], "PartitionedConfig"]] = None,\n tags: Optional[Dict[str, Any]] = None,\n logger_defs: Optional[Dict[str, LoggerDefinition]] = None,\n executor_def: Optional["ExecutorDefinition"] = None,\n hooks: Optional[AbstractSet[HookDefinition]] = None,\n op_retry_policy: Optional[RetryPolicy] = None,\n version_strategy: Optional[VersionStrategy] = None,\n op_selection: Optional[List[str]] = None,\n partitions_def: Optional["PartitionsDefinition"] = None,\n asset_layer: Optional["AssetLayer"] = None,\n input_values: Optional[Mapping[str, object]] = None,\n _asset_selection_data: Optional[AssetSelectionData] = None,\n ) -> "JobDefinition":\n """\n Make this graph into an executable Job by providing the remaining components required for execution.\n\n Args:\n name (Optional[str]):\n The name for the Job. Defaults to the name of this graph.\n resource_defs (Optional[Dict[str, ResourceDefinition]]):\n Resources that are required by this graph for execution.\n If not defined, `io_manager` will default to filesystem.\n config:\n Describes how the job is parameterized at runtime.\n\n If no value is provided, then the schema for the job's run config is a standard\n format based on its solids and resources.\n\n If a dictionary is provided, then it must conform to the standard config schema, and\n it will be used as the job's run config for the job whenever the job is executed.\n The values provided will be viewable and editable in the Dagit playground, so be\n careful with secrets.\n\n If a :py:class:`ConfigMapping` object is provided, then the schema for the job's run config is\n determined by the config mapping, and the config mapping's function should return\n run config in the standard format to configure the job.\n\n If a :py:class:`PartitionedConfig` object is provided, then it defines a discrete set of config\n values that can parameterize the job, as well as a function for mapping those\n values to the base config. The values provided will be viewable and editable in the\n Dagit playground, so be careful with secrets.\n tags (Optional[Dict[str, Any]]):\n Arbitrary metadata for any execution of the Job.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag\n values provided at invocation time.\n logger_defs (Optional[Dict[str, LoggerDefinition]]):\n A dictionary of string logger identifiers to their implementations.\n executor_def (Optional[ExecutorDefinition]):\n How this Job will be executed. Defaults to :py:class:`multi_or_in_process_executor`,\n which can be switched between multi-process and in-process modes of execution. The\n default mode of execution is multi-process.\n op_retry_policy (Optional[RetryPolicy]): The default retry policy for all ops in this job.\n Only used if retry policy is not defined on the op definition or op invocation.\n version_strategy (Optional[VersionStrategy]):\n Defines how each solid (and optionally, resource) in the job can be versioned. If\n provided, memoization will be enabled for this job.\n partitions_def (Optional[PartitionsDefinition]): Defines a discrete set of partition\n keys that can parameterize the job. 
If this argument is supplied, the config\n argument can't also be supplied.\n asset_layer (Optional[AssetLayer]): Top level information about the assets this job\n will produce. Generally should not be set manually.\n input_values (Optional[Mapping[str, Any]]):\n A dictionary that maps python objects to the top-level inputs of a job.\n\n Returns:\n JobDefinition\n """\n from .executor_definition import ExecutorDefinition, multi_or_in_process_executor\n from .job_definition import JobDefinition\n from .partition import PartitionedConfig, PartitionsDefinition\n\n job_name = check_valid_name(name or self.name)\n\n tags = check.opt_dict_param(tags, "tags", key_type=str)\n executor_def = check.opt_inst_param(\n executor_def, "executor_def", ExecutorDefinition, default=multi_or_in_process_executor\n )\n input_values = check.opt_mapping_param(input_values, "input_values")\n\n if resource_defs and "io_manager" in resource_defs:\n resource_defs_with_defaults = resource_defs\n else:\n resource_defs_with_defaults = merge_dicts(\n {"io_manager": default_job_io_manager}, resource_defs or {}\n )\n\n hooks = check.opt_set_param(hooks, "hooks", of_type=HookDefinition)\n op_retry_policy = check.opt_inst_param(op_retry_policy, "op_retry_policy", RetryPolicy)\n op_selection = check.opt_list_param(op_selection, "op_selection", of_type=str)\n presets = []\n config_mapping = None\n partitioned_config = None\n\n if partitions_def:\n check.inst_param(partitions_def, "partitions_def", PartitionsDefinition)\n check.invariant(\n config is None, "Can't supply both the 'config' and 'partitions_def' arguments"\n )\n partitioned_config = PartitionedConfig(partitions_def, lambda _: {})\n\n if isinstance(config, ConfigMapping):\n config_mapping = config\n elif isinstance(config, PartitionedConfig):\n partitioned_config = config\n elif isinstance(config, dict):\n presets = [PresetDefinition(name="default", run_config=config)]\n # Using config mapping here is a trick to make it so that the preset will be used even\n # when no config is supplied for the job.\n config_mapping = _config_mapping_with_default_value(\n self._get_config_schema(resource_defs_with_defaults, executor_def, logger_defs),\n config,\n job_name,\n self.name,\n )\n elif config is not None:\n check.failed(\n f"config param must be a ConfigMapping, a PartitionedConfig, or a dictionary, but "\n f"is an object of type {type(config)}"\n )\n\n return JobDefinition(\n name=job_name,\n description=description or self.description,\n graph_def=self,\n resource_defs=resource_defs_with_defaults,\n logger_defs=logger_defs,\n executor_def=executor_def,\n config_mapping=config_mapping,\n partitioned_config=partitioned_config,\n preset_defs=presets,\n tags=tags,\n hook_defs=hooks,\n version_strategy=version_strategy,\n op_retry_policy=op_retry_policy,\n asset_layer=asset_layer,\n _input_values=input_values,\n _subset_selection_data=_asset_selection_data,\n ).get_job_def_for_subset_selection(op_selection)
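A short sketch of building a graph and turning it into a job with ``to_job`` (the op names and tag values are illustrative):

.. code-block:: python

    from dagster import DependencyDefinition, GraphDefinition, op

    @op
    def return_one():
        return 1

    @op
    def add_one(num):
        return num + 1

    basic = GraphDefinition(
        name="basic",
        node_defs=[return_one, add_one],
        dependencies={"add_one": {"num": DependencyDefinition("return_one")}},
    )

    # Supply the remaining pieces (name, tags, and optionally resource_defs,
    # executor_def, config, ...) to obtain an executable JobDefinition.
    basic_job = basic.to_job(name="basic_job", tags={"team": "docs"})

    result = basic_job.execute_in_process()
    assert result.success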
\n\n def coerce_to_job(self):\n # attempt to coerce a Graph into a Job, raising a useful error if it doesn't work\n try:\n return self.to_job()\n except DagsterInvalidDefinitionError as err:\n raise DagsterInvalidDefinitionError(\n f"Failed attempting to coerce Graph {self.name} into a Job. "\n "Use to_job instead, passing the required information."\n ) from err\n\n def _get_config_schema(\n self,\n resource_defs: Optional[Dict[str, ResourceDefinition]],\n executor_def: "ExecutorDefinition",\n logger_defs: Optional[Dict[str, LoggerDefinition]],\n ) -> ConfigType:\n from .job_definition import JobDefinition\n\n return (\n JobDefinition(\n name=self.name,\n graph_def=self,\n resource_defs=resource_defs,\n executor_def=executor_def,\n logger_defs=logger_defs,\n )\n .get_run_config_schema("default")\n .run_config_schema_type\n )\n\n
[docs] def execute_in_process(\n self,\n run_config: Any = None,\n instance: Optional["DagsterInstance"] = None,\n resources: Optional[Dict[str, Any]] = None,\n raise_on_error: bool = True,\n op_selection: Optional[List[str]] = None,\n run_id: Optional[str] = None,\n input_values: Optional[Mapping[str, object]] = None,\n ) -> "ExecuteInProcessResult":\n """\n Execute this graph in-process, collecting results in-memory.\n\n Args:\n run_config (Optional[Dict[str, Any]]):\n Run config to provide to execution. The configuration for the underlying graph\n should exist under the "ops" key.\n instance (Optional[DagsterInstance]):\n The instance to execute against, an ephemeral one will be used if none provided.\n resources (Optional[Dict[str, Any]]):\n The resources needed if any are required. Can provide resource instances directly,\n or resource definitions.\n raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n Defaults to ``True``.\n op_selection (Optional[List[str]]): A list of op selection queries (including single op\n names) to execute. For example:\n * ``['some_op']``: selects ``some_op`` itself.\n * ``['*some_op']``: select ``some_op`` and all its ancestors (upstream dependencies).\n * ``['*some_op+++']``: select ``some_op``, all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n * ``['*some_op', 'other_op_a', 'other_op_b+']``: select ``some_op`` and all its\n ancestors, ``other_op_a`` itself, and ``other_op_b`` and its direct child ops.\n input_values (Optional[Mapping[str, Any]]):\n A dictionary that maps python objects to the top-level inputs of the graph.\n\n Returns:\n :py:class:`~dagster.ExecuteInProcessResult`\n """\n from dagster.core.execution.build_resources import wrap_resources_for_execution\n from dagster.core.execution.execute_in_process import core_execute_in_process\n from dagster.core.instance import DagsterInstance\n\n from .executor_definition import execute_in_process_executor\n from .job_definition import JobDefinition\n\n instance = check.opt_inst_param(instance, "instance", DagsterInstance)\n resources = check.opt_dict_param(resources, "resources", key_type=str)\n input_values = check.opt_mapping_param(input_values, "input_values")\n\n resource_defs = wrap_resources_for_execution(resources)\n\n ephemeral_job = JobDefinition(\n name=self._name,\n graph_def=self,\n executor_def=execute_in_process_executor,\n resource_defs=resource_defs,\n _input_values=input_values,\n ).get_job_def_for_subset_selection(op_selection)\n\n run_config = run_config if run_config is not None else {}\n op_selection = check.opt_list_param(op_selection, "op_selection", str)\n\n return core_execute_in_process(\n node=self,\n ephemeral_pipeline=ephemeral_job,\n run_config=run_config,\n instance=instance,\n output_capturing_enabled=True,\n raise_on_error=raise_on_error,\n run_id=run_id,\n )
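A minimal sketch of in-process graph execution with plain resource values (the op, graph, and resource names are illustrative):

.. code-block:: python

    from dagster import graph, op

    @op(required_resource_keys={"greeting"})
    def emit_greeting(context):
        return context.resources.greeting

    @graph
    def greeting_graph():
        emit_greeting()

    # Plain values are wrapped into resource definitions for the ephemeral
    # in-process job; an ephemeral instance is used since none is passed.
    result = greeting_graph.execute_in_process(resources={"greeting": "hello"})
    assert result.success
    assert result.output_for_node("emit_greeting") == "hello"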
\n\n @property\n def parent_graph_def(self) -> Optional["GraphDefinition"]:\n return None\n\n @property\n def is_subselected(self) -> bool:\n return False
\n\n\nclass SubselectedGraphDefinition(GraphDefinition):\n """Defines a subselected graph.\n\n Args:\n parent_graph_def (GraphDefinition): The parent graph that this current graph is subselected\n from. This is used for tracking where the subselected graph originally comes from.\n Note that we allow subselecting a subselected graph, and this field refers to the direct\n parent graph of the current subselection, rather than the original root graph.\n node_defs (Optional[List[NodeDefinition]]): A list of all top level nodes in the graph. A\n node can be an op or a graph that contains other nodes.\n dependencies (Optional[Dict[Union[str, NodeInvocation], Dict[str, IDependencyDefinition]]]):\n A structure that declares the dependencies of each op's inputs on the outputs of other\n ops in the subselected graph. Keys of the top level dict are either the string names of\n ops in the graph or, in the case of aliased solids, :py:class:`NodeInvocations <NodeInvocation>`.\n Values of the top level dict are themselves dicts, which map input names belonging to\n the op or aliased op to :py:class:`DependencyDefinitions <DependencyDefinition>`.\n input_mappings (Optional[List[InputMapping]]): Define the inputs to the nested graph, and\n how they map to the inputs of its constituent ops.\n output_mappings (Optional[List[OutputMapping]]): Define the outputs of the nested graph, and\n how they map from the outputs of its constituent ops.\n """\n\n def __init__(\n self,\n parent_graph_def: GraphDefinition,\n node_defs: Optional[List[NodeDefinition]],\n dependencies: Optional[Dict[Union[str, NodeInvocation], Dict[str, IDependencyDefinition]]],\n input_mappings: Optional[List[InputMapping]],\n output_mappings: Optional[List[OutputMapping]],\n ):\n self._parent_graph_def = check.inst_param(\n parent_graph_def, "parent_graph_def", GraphDefinition\n )\n super(SubselectedGraphDefinition, self).__init__(\n name=parent_graph_def.name, # should we create special name for subselected graphs\n node_defs=node_defs,\n dependencies=dependencies,\n input_mappings=input_mappings,\n output_mappings=output_mappings,\n config=parent_graph_def.config_mapping,\n tags=parent_graph_def.tags,\n )\n\n @property\n def parent_graph_def(self) -> GraphDefinition:\n return self._parent_graph_def\n\n def get_top_level_omitted_nodes(self) -> List[Node]:\n return [\n solid for solid in self.parent_graph_def.solids if not self.has_solid_named(solid.name)\n ]\n\n @property\n def is_subselected(self) -> bool:\n return True\n\n\ndef _validate_in_mappings(\n input_mappings: List[InputMapping],\n solid_dict: Dict[str, Node],\n dependency_structure: DependencyStructure,\n name: str,\n class_name: str,\n) -> Tuple[List[InputMapping], Iterable[InputDefinition]]:\n from .composition import MappedInputPlaceholder\n\n input_def_dict: Dict[str, InputDefinition] = OrderedDict()\n mapping_keys = set()\n\n for mapping in input_mappings:\n # handle incorrect objects passed in as mappings\n if not isinstance(mapping, InputMapping):\n if isinstance(mapping, InputDefinition):\n raise DagsterInvalidDefinitionError(\n "In {class_name} '{name}' you passed an InputDefinition "\n "named '{input_name}' directly in to input_mappings. Return "\n "an InputMapping by calling mapping_to on the InputDefinition.".format(\n name=name, input_name=mapping.name, class_name=class_name\n )\n )\n else:\n raise DagsterInvalidDefinitionError(\n "In {class_name} '{name}' received unexpected type '{type}' in input_mappings. 
"\n "Provide an OutputMapping using InputDefinition(...).mapping_to(...)".format(\n type=type(mapping), name=name, class_name=class_name\n )\n )\n\n if input_def_dict.get(mapping.definition.name):\n if input_def_dict[mapping.definition.name] != mapping.definition:\n raise DagsterInvalidDefinitionError(\n "In {class_name} '{name}' multiple input mappings with same "\n "definition name but different definitions".format(\n name=name, class_name=class_name\n ),\n )\n else:\n input_def_dict[mapping.definition.name] = mapping.definition\n\n target_solid = solid_dict.get(mapping.maps_to.solid_name)\n if target_solid is None:\n raise DagsterInvalidDefinitionError(\n "In {class_name} '{name}' input mapping references solid "\n "'{solid_name}' which it does not contain.".format(\n name=name, solid_name=mapping.maps_to.solid_name, class_name=class_name\n )\n )\n if not target_solid.has_input(mapping.maps_to.input_name):\n raise DagsterInvalidDefinitionError(\n "In {class_name} '{name}' input mapping to solid '{mapping.maps_to.solid_name}' "\n "which contains no input named '{mapping.maps_to.input_name}'".format(\n name=name, mapping=mapping, class_name=class_name\n )\n )\n\n target_input = target_solid.input_def_named(mapping.maps_to.input_name)\n solid_input_handle = SolidInputHandle(target_solid, target_input)\n\n if mapping.maps_to_fan_in:\n maps_to = cast(FanInInputPointer, mapping.maps_to)\n if not dependency_structure.has_fan_in_deps(solid_input_handle):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' input mapping target "\n f'"{maps_to.solid_name}.{maps_to.input_name}" (index {maps_to.fan_in_index} of fan-in) '\n f"is not a MultiDependencyDefinition."\n )\n inner_deps = dependency_structure.get_fan_in_deps(solid_input_handle)\n if (maps_to.fan_in_index >= len(inner_deps)) or (\n inner_deps[maps_to.fan_in_index] is not MappedInputPlaceholder\n ):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' input mapping target "\n f'"{maps_to.solid_name}.{maps_to.input_name}" index {maps_to.fan_in_index} in '\n f"the MultiDependencyDefinition is not a MappedInputPlaceholder"\n )\n mapping_keys.add(f"{maps_to.solid_name}.{maps_to.input_name}.{maps_to.fan_in_index}")\n target_type = target_input.dagster_type.get_inner_type_for_fan_in()\n fan_in_msg = " (index {} of fan-in)".format(maps_to.fan_in_index)\n else:\n if dependency_structure.has_deps(solid_input_handle):\n raise DagsterInvalidDefinitionError(\n "In {class_name} '{name}' input mapping target "\n '"{mapping.maps_to.solid_name}.{mapping.maps_to.input_name}" '\n "is already satisfied by output".format(\n name=name, mapping=mapping, class_name=class_name\n )\n )\n\n mapping_keys.add(\n "{mapping.maps_to.solid_name}.{mapping.maps_to.input_name}".format(mapping=mapping)\n )\n target_type = target_input.dagster_type\n fan_in_msg = ""\n\n if (\n # no need to check mapping type for graphs because users can't specify ins/out type on graphs\n class_name not in (GraphDefinition.__name__, SubselectedGraphDefinition.__name__)\n and target_type != mapping.definition.dagster_type\n ):\n raise DagsterInvalidDefinitionError(\n "In {class_name} '{name}' input "\n "'{mapping.definition.name}' of type {mapping.definition.dagster_type.display_name} maps to "\n "{mapping.maps_to.solid_name}.{mapping.maps_to.input_name}{fan_in_msg} of different type "\n "{target_type.display_name}. 
InputMapping source and "\n "destination must have the same type.".format(\n mapping=mapping,\n name=name,\n target_type=target_type,\n class_name=class_name,\n fan_in_msg=fan_in_msg,\n )\n )\n\n for input_handle in dependency_structure.input_handles():\n if dependency_structure.has_fan_in_deps(input_handle):\n for idx, dep in enumerate(dependency_structure.get_fan_in_deps(input_handle)):\n if dep is MappedInputPlaceholder:\n mapping_str = (\n "{input_handle.solid_name}.{input_handle.input_name}.{idx}".format(\n input_handle=input_handle, idx=idx\n )\n )\n if mapping_str not in mapping_keys:\n raise DagsterInvalidDefinitionError(\n "Unsatisfied MappedInputPlaceholder at index {idx} in "\n "MultiDependencyDefinition for '{input_handle.solid_name}.{input_handle.input_name}'".format(\n input_handle=input_handle, idx=idx\n )\n )\n\n return input_mappings, input_def_dict.values()\n\n\ndef _validate_out_mappings(\n output_mappings: List[OutputMapping],\n solid_dict: Dict[str, Node],\n dependency_structure: DependencyStructure,\n name: str,\n class_name: str,\n) -> List[OutputMapping]:\n for mapping in output_mappings:\n if isinstance(mapping, OutputMapping):\n\n target_solid = solid_dict.get(mapping.maps_from.solid_name)\n if target_solid is None:\n raise DagsterInvalidDefinitionError(\n "In {class_name} '{name}' output mapping references node "\n "'{solid_name}' which it does not contain.".format(\n name=name, solid_name=mapping.maps_from.solid_name, class_name=class_name\n )\n )\n if not target_solid.has_output(mapping.maps_from.output_name):\n raise DagsterInvalidDefinitionError(\n "In {class_name} {name} output mapping from {described_node} "\n "which contains no output named '{mapping.maps_from.output_name}'".format(\n described_node=target_solid.describe_node(),\n name=name,\n mapping=mapping,\n class_name=class_name,\n )\n )\n\n target_output = target_solid.output_def_named(mapping.maps_from.output_name)\n\n if (\n mapping.definition.dagster_type.kind != DagsterTypeKind.ANY\n and (target_output.dagster_type != mapping.definition.dagster_type)\n and class_name != "GraphDefinition"\n ):\n raise DagsterInvalidDefinitionError(\n "In {class_name} '{name}' output "\n "'{mapping.definition.name}' of type {mapping.definition.dagster_type.display_name} "\n "maps from {mapping.maps_from.solid_name}.{mapping.maps_from.output_name} of different type "\n "{target_output.dagster_type.display_name}. OutputMapping source "\n "and destination must have the same type.".format(\n class_name=class_name,\n mapping=mapping,\n name=name,\n target_output=target_output,\n )\n )\n\n if target_output.is_dynamic and not mapping.definition.is_dynamic:\n raise DagsterInvalidDefinitionError(\n f'In {class_name} "{name}" can not map from {target_output.__class__.__name__} '\n f'"{target_output.name}" to {mapping.definition.__class__.__name__} '\n f'"{mapping.definition.name}". Definition types must align.'\n )\n\n dynamic_handle = dependency_structure.get_upstream_dynamic_handle_for_solid(\n target_solid.name\n )\n if dynamic_handle and not mapping.definition.is_dynamic:\n raise DagsterInvalidDefinitionError(\n f'In {class_name} "{name}" output "{mapping.definition.name}" mapping from '\n f"{target_solid.describe_node()} must be a DynamicOutputDefinition since it is "\n f'downstream of dynamic output "{dynamic_handle.describe()}".'\n )\n\n elif isinstance(mapping, OutputDefinition):\n raise DagsterInvalidDefinitionError(\n "You passed an OutputDefinition named '{output_name}' directly "\n "in to output_mappings. 
Return an OutputMapping by calling "\n "mapping_from on the OutputDefinition.".format(output_name=mapping.name)\n )\n else:\n raise DagsterInvalidDefinitionError(\n "Received unexpected type '{type}' in output_mappings. "\n "Provide an OutputMapping using OutputDefinition(...).mapping_from(...)".format(\n type=type(mapping)\n )\n )\n return output_mappings\n\n\ndef _config_mapping_with_default_value(\n inner_schema: ConfigType,\n default_config: Dict[str, Any],\n job_name: str,\n graph_name: str,\n) -> ConfigMapping:\n if not isinstance(inner_schema, Shape):\n check.failed("Only Shape (dictionary) config_schema allowed on Job ConfigMapping")\n\n def config_fn(x):\n return x\n\n updated_fields = {}\n field_aliases = inner_schema.field_aliases\n for name, field in inner_schema.fields.items():\n if name in default_config:\n updated_fields[name] = Field(\n config=field.config_type,\n default_value=default_config[name],\n description=field.description,\n )\n elif name in field_aliases and field_aliases[name] in default_config:\n updated_fields[name] = Field(\n config=field.config_type,\n default_value=default_config[field_aliases[name]],\n description=field.description,\n )\n else:\n updated_fields[name] = field\n\n config_schema = Shape(\n fields=updated_fields,\n description="run config schema with default values from default_config",\n field_aliases=inner_schema.field_aliases,\n )\n\n config_evr = validate_config(config_schema, default_config)\n if not config_evr.success:\n raise DagsterInvalidConfigError(\n f"Error in config when building job '{job_name}' from graph '{graph_name}' ",\n config_evr.errors,\n default_config,\n )\n\n return ConfigMapping(\n config_fn=config_fn, config_schema=config_schema, receive_processed_config_values=False\n )\n\n\n@io_manager(\n description="The default io manager for Jobs. Uses filesystem but switches to in-memory when invoked through execute_in_process."\n)\ndef default_job_io_manager(init_context):\n from dagster.core.storage.fs_io_manager import PickledObjectFilesystemIOManager\n\n return PickledObjectFilesystemIOManager(base_dir=init_context.instance.storage_directory())\n
", "current_page_name": "_modules/dagster/core/definitions/graph_definition", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.graph_definition"}, "hook_definition": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.hook_definition

\nfrom typing import AbstractSet, Any, Callable, NamedTuple, Optional\n\nimport dagster._check as check\n\nfrom ..decorator_utils import get_function_params\nfrom ..errors import DagsterInvalidInvocationError\nfrom .utils import check_valid_name\n\n\n
[docs]class HookDefinition(\n NamedTuple(\n "_HookDefinition",\n [\n ("name", str),\n ("hook_fn", Callable),\n ("required_resource_keys", AbstractSet[str]),\n ("decorated_fn", Optional[Callable]),\n ],\n )\n):\n """Define a hook which can be triggered during a op execution (e.g. a callback on the step\n execution failure event during a op execution).\n\n Args:\n name (str): The name of this hook.\n hook_fn (Callable): The callback function that will be triggered.\n required_resource_keys (Optional[AbstractSet[str]]): Keys for the resources required by the\n hook.\n """\n\n def __new__(\n cls,\n name: str,\n hook_fn: Callable[..., Any],\n required_resource_keys: Optional[AbstractSet[str]] = None,\n decorated_fn: Optional[Callable[..., Any]] = None,\n ):\n return super(HookDefinition, cls).__new__(\n cls,\n name=check_valid_name(name),\n hook_fn=check.callable_param(hook_fn, "hook_fn"),\n required_resource_keys=frozenset(\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n ),\n decorated_fn=check.opt_callable_param(decorated_fn, "decorated_fn"),\n )\n\n def __call__(self, *args, **kwargs):\n """This is invoked when the hook is used as a decorator.\n\n We currently support hooks to decorate the following:\n\n - PipelineDefinition: when the hook decorates a job definition, it will be added to\n all the op invocations within the job.\n\n Example:\n .. code-block:: python\n\n @success_hook\n def slack_message_on_success(_):\n ...\n\n @slack_message_on_success\n @job\n def a_job():\n foo(bar())\n\n """\n from ..execution.context.hook import HookContext\n from .graph_definition import GraphDefinition\n from .hook_invocation import hook_invocation_result\n from .pipeline_definition import PipelineDefinition\n\n if len(args) > 0 and isinstance(args[0], (PipelineDefinition, GraphDefinition)):\n # when it decorates a pipeline, we apply this hook to all the solid invocations within\n # the pipeline.\n return args[0].with_hooks({self})\n else:\n if not self.decorated_fn:\n raise DagsterInvalidInvocationError(\n "Only hook definitions created using one of the hook decorators can be invoked."\n )\n fxn_args = get_function_params(self.decorated_fn)\n # If decorated fxn has two arguments, then this is an event list hook fxn, and parameter\n # names are always context and event_list\n if len(fxn_args) == 2:\n context_arg_name = fxn_args[0].name\n event_list_arg_name = fxn_args[1].name\n if len(args) + len(kwargs) != 2:\n raise DagsterInvalidInvocationError(\n "Decorated function expects two parameters, context and event_list, but "\n f"{len(args) + len(kwargs)} were provided."\n )\n if args:\n context = check.opt_inst_param(args[0], "context", HookContext)\n event_list = check.opt_list_param(\n args[1] if len(args) > 1 else kwargs[event_list_arg_name],\n event_list_arg_name,\n )\n else:\n if context_arg_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Could not find expected argument '{context_arg_name}'. Provided "\n f"kwargs: {list(kwargs.keys())}"\n )\n if event_list_arg_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Could not find expected argument '{event_list_arg_name}'. 
Provided "\n f"kwargs: {list(kwargs.keys())}"\n )\n context = check.opt_inst_param(\n kwargs[context_arg_name], context_arg_name, HookContext\n )\n event_list = check.opt_list_param(\n kwargs[event_list_arg_name], event_list_arg_name\n )\n return hook_invocation_result(self, context, event_list)\n else:\n context_arg_name = fxn_args[0].name\n if len(args) + len(kwargs) != 1:\n raise DagsterInvalidInvocationError(\n f"Decorated function expects one parameter, {context_arg_name}, but "\n f"{len(args) + len(kwargs)} were provided."\n )\n if args:\n context = check.opt_inst_param(args[0], context_arg_name, HookContext)\n else:\n if context_arg_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Could not find expected argument '{context_arg_name}'. Provided "\n f"kwargs: {list(kwargs.keys())}"\n )\n context = check.opt_inst_param(\n kwargs[context_arg_name], context_arg_name, HookContext\n )\n return hook_invocation_result(self, context)
\n
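As the docstring above describes, a hook definition can itself be applied as a decorator on a job; a brief sketch (the hook, op, and job names are illustrative):

.. code-block:: python

    from dagster import job, op, success_hook

    @success_hook
    def announce_success(context):
        # Fires after each successful step; HookContext exposes a logger.
        context.log.info("step succeeded")

    @op
    def do_work():
        return 1

    # Decorating the job adds the hook to every op invocation inside it.
    @announce_success
    @job
    def hooked_job():
        do_work()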
", "current_page_name": "_modules/dagster/core/definitions/hook_definition", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.hook_definition"}, "input": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.input

\nfrom types import FunctionType\nfrom typing import TYPE_CHECKING, Any, Callable, Mapping, NamedTuple, Optional, Set, Type, Union\n\nimport dagster._check as check\nfrom dagster.core.definitions.events import AssetKey\nfrom dagster.core.definitions.metadata import MetadataEntry, normalize_metadata\nfrom dagster.core.errors import DagsterError, DagsterInvalidDefinitionError\nfrom dagster.core.types.dagster_type import (\n    BuiltinScalarDagsterType,\n    DagsterType,\n    resolve_dagster_type,\n)\nfrom dagster.utils.backcompat import experimental_arg_warning\n\nfrom .inference import InferredInputProps\nfrom .utils import NoValueSentinel, check_valid_name\n\nif TYPE_CHECKING:\n    from dagster.core.execution.context.input import InputContext\n\n\n# unfortunately since type_check functions need TypeCheckContext which is only available\n# at runtime, we can only check basic types before runtime\ndef _check_default_value(input_name, dagster_type, default_value):\n    if default_value is not NoValueSentinel:\n        if dagster_type.is_nothing:\n            raise DagsterInvalidDefinitionError(\n                "Setting a default_value is invalid on InputDefinitions of type Nothing"\n            )\n\n        if isinstance(dagster_type, BuiltinScalarDagsterType):\n            type_check = dagster_type.type_check_scalar_value(default_value)\n            if not type_check.success:\n                raise DagsterInvalidDefinitionError(\n                    (\n                        "Type check failed for the default_value of InputDefinition "\n                        "{input_name} of type {dagster_type}. "\n                        "Received value {value} of type {type}"\n                    ).format(\n                        input_name=input_name,\n                        dagster_type=dagster_type.display_name,\n                        value=default_value,\n                        type=type(default_value),\n                    ),\n                )\n\n    return default_value\n\n\n
[docs]class InputDefinition:\n """Defines an argument to a solid's compute function.\n\n Inputs may flow from previous solids' outputs, or be stubbed using config. They may optionally\n be typed using the Dagster type system.\n\n Args:\n name (str): Name of the input.\n dagster_type (Optional[Union[Type, DagsterType]]]): The type of this input.\n Users should provide the Python type of the objects that they expect to be passed for\n this input, or a :py:class:`DagsterType` that defines a runtime check that they want\n to be run on this input. Defaults to :py:class:`Any`.\n description (Optional[str]): Human-readable description of the input.\n default_value (Optional[Any]): The default value to use if no input is provided.\n root_manager_key (Optional[str]): (Experimental) The resource key for the\n :py:class:`RootInputManager` used for loading this input when it is not connected to an\n upstream output.\n metadata (Optional[Dict[str, Any]]): A dict of metadata for the input.\n asset_key (Optional[Union[AssetKey, InputContext -> AssetKey]]): (Experimental) An AssetKey\n (or function that produces an AssetKey from the InputContext) which should be associated\n with this InputDefinition. Used for tracking lineage information through Dagster.\n asset_partitions (Optional[Union[Set[str], InputContext -> Set[str]]]): (Experimental) A\n set of partitions of the given asset_key (or a function that produces this list of\n partitions from the InputContext) which should be associated with this InputDefinition.\n """\n\n def __init__(\n self,\n name=None,\n dagster_type=None,\n description=None,\n default_value=NoValueSentinel,\n root_manager_key=None,\n metadata=None,\n asset_key=None,\n asset_partitions=None,\n # when adding new params, make sure to update combine_with_inferred below\n ):\n self._name = check_valid_name(name) if name else None\n\n self._type_not_set = dagster_type is None\n self._dagster_type = check.inst(resolve_dagster_type(dagster_type), DagsterType)\n\n self._description = check.opt_str_param(description, "description")\n\n self._default_value = _check_default_value(self._name, self._dagster_type, default_value)\n\n if root_manager_key:\n experimental_arg_warning("root_manager_key", "InputDefinition.__init__")\n\n self._root_manager_key = check.opt_str_param(root_manager_key, "root_manager_key")\n\n self._metadata = check.opt_dict_param(metadata, "metadata", key_type=str)\n self._metadata_entries = check.is_list(\n normalize_metadata(self._metadata, [], allow_invalid=True), MetadataEntry\n )\n\n if asset_key:\n experimental_arg_warning("asset_key", "InputDefinition.__init__")\n\n if not callable(asset_key):\n check.opt_inst_param(asset_key, "asset_key", AssetKey)\n\n self._asset_key = asset_key\n\n if asset_partitions:\n experimental_arg_warning("asset_partitions", "InputDefinition.__init__")\n check.param_invariant(\n asset_key is not None,\n "asset_partitions",\n 'Cannot specify "asset_partitions" argument without also specifying "asset_key"',\n )\n if callable(asset_partitions):\n self._asset_partitions_fn = asset_partitions\n elif asset_partitions is not None:\n asset_partitions = check.opt_set_param(asset_partitions, "asset_partitions", str)\n self._asset_partitions_fn = lambda _: asset_partitions\n else:\n self._asset_partitions_fn = None\n\n @property\n def name(self):\n return self._name\n\n @property\n def dagster_type(self):\n return self._dagster_type\n\n @property\n def description(self):\n return self._description\n\n @property\n def has_default_value(self):\n 
return self._default_value is not NoValueSentinel\n\n @property\n def default_value(self):\n check.invariant(self.has_default_value, "Can only fetch default_value if has_default_value")\n return self._default_value\n\n @property\n def root_manager_key(self):\n return self._root_manager_key\n\n @property\n def metadata(self):\n return self._metadata\n\n @property\n def is_asset(self):\n return self._asset_key is not None\n\n @property\n def metadata_entries(self):\n return self._metadata_entries\n\n @property\n def hardcoded_asset_key(self) -> Optional[AssetKey]:\n if not callable(self._asset_key):\n return self._asset_key\n else:\n return None\n\n def get_asset_key(self, context) -> Optional[AssetKey]:\n """Get the AssetKey associated with this InputDefinition for the given\n :py:class:`InputContext` (if any).\n\n Args:\n context (InputContext): The InputContext that this InputDefinition is being evaluated\n in\n """\n if callable(self._asset_key):\n return self._asset_key(context)\n else:\n return self.hardcoded_asset_key\n\n def get_asset_partitions(self, context) -> Optional[Set[str]]:\n """Get the set of partitions that this solid will read from this InputDefinition for the given\n :py:class:`InputContext` (if any).\n\n Args:\n context (InputContext): The InputContext that this InputDefinition is being evaluated\n in\n """\n if self._asset_partitions_fn is None:\n return None\n\n return self._asset_partitions_fn(context)\n\n def mapping_to(self, solid_name, input_name, fan_in_index=None):\n """Create an input mapping to an input of a child solid.\n\n In a CompositeSolidDefinition, you can use this helper function to construct\n an :py:class:`InputMapping` to the input of a child solid.\n\n Args:\n solid_name (str): The name of the child solid to which to map this input.\n input_name (str): The name of the child solid' input to which to map this input.\n fan_in_index (Optional[int]): The index in to a fanned in input, else None\n\n Examples:\n\n .. 
code-block:: python\n\n input_mapping = InputDefinition('composite_input', Int).mapping_to(\n 'child_solid', 'int_input'\n )\n """\n check.str_param(solid_name, "solid_name")\n check.str_param(input_name, "input_name")\n check.opt_int_param(fan_in_index, "fan_in_index")\n\n if fan_in_index is not None:\n maps_to = FanInInputPointer(solid_name, input_name, fan_in_index)\n else:\n maps_to = InputPointer(solid_name, input_name)\n return InputMapping(self, maps_to)\n\n @staticmethod\n def create_from_inferred(\n inferred: InferredInputProps, decorator_name: str\n ) -> "InputDefinition":\n return InputDefinition(\n name=inferred.name,\n dagster_type=_checked_inferred_type(inferred, decorator_name),\n description=inferred.description,\n default_value=inferred.default_value,\n )\n\n def combine_with_inferred(\n self, inferred: InferredInputProps, decorator_name: str\n ) -> "InputDefinition":\n """\n Return a new InputDefinition that merges this ones properties with those inferred from type signature.\n This can update: dagster_type, description, and default_value if they are not set.\n """\n\n check.invariant(\n self.name == inferred.name,\n f"InferredInputProps name {inferred.name} did not align with InputDefinition name {self.name}",\n )\n\n dagster_type = self._dagster_type\n if self._type_not_set:\n dagster_type = _checked_inferred_type(inferred, decorator_name=decorator_name)\n\n description = self._description\n if description is None and inferred.description is not None:\n description = inferred.description\n\n default_value = self._default_value\n if not self.has_default_value:\n default_value = inferred.default_value\n\n return InputDefinition(\n name=self.name,\n dagster_type=dagster_type,\n description=description,\n default_value=default_value,\n root_manager_key=self._root_manager_key,\n metadata=self._metadata,\n asset_key=self._asset_key,\n asset_partitions=self._asset_partitions_fn,\n )
\n\n\ndef _checked_inferred_type(inferred: InferredInputProps, decorator_name: str) -> DagsterType:\n try:\n resolved_type = resolve_dagster_type(inferred.annotation)\n except DagsterError as e:\n raise DagsterInvalidDefinitionError(\n f"Problem using type '{inferred.annotation}' from type annotation for argument "\n f"'{inferred.name}', correct the issue or explicitly set the dagster_type on "\n "your InputDefinition."\n ) from e\n\n if resolved_type.is_nothing:\n raise DagsterInvalidDefinitionError(\n f"Input parameter {inferred.name} is annotated with {resolved_type.display_name} "\n "which is a type that represents passing no data. This type must be used "\n f"via InputDefinition and no parameter should be included in the {decorator_name} decorated function."\n )\n return resolved_type\n\n\nclass InputPointer(NamedTuple("_InputPointer", [("solid_name", str), ("input_name", str)])):\n def __new__(cls, solid_name: str, input_name: str):\n return super(InputPointer, cls).__new__(\n cls,\n check.str_param(solid_name, "solid_name"),\n check.str_param(input_name, "input_name"),\n )\n\n\nclass FanInInputPointer(\n NamedTuple(\n "_FanInInputPointer", [("solid_name", str), ("input_name", str), ("fan_in_index", int)]\n )\n):\n def __new__(cls, solid_name: str, input_name: str, fan_in_index: int):\n return super(FanInInputPointer, cls).__new__(\n cls,\n check.str_param(solid_name, "solid_name"),\n check.str_param(input_name, "input_name"),\n check.int_param(fan_in_index, "fan_in_index"),\n )\n\n\n
[docs]class InputMapping(\n NamedTuple(\n "_InputMapping",\n [("definition", InputDefinition), ("maps_to", Union[InputPointer, FanInInputPointer])],\n )\n):\n """Defines an input mapping for a composite solid.\n\n Args:\n definition (InputDefinition): Defines the input to the composite solid.\n solid_name (str): The name of the child solid onto which to map the input.\n input_name (str): The name of the input to the child solid onto which to map the input.\n """\n\n def __new__(cls, definition: InputDefinition, maps_to: Union[InputPointer, FanInInputPointer]):\n return super(InputMapping, cls).__new__(\n cls,\n check.inst_param(definition, "definition", InputDefinition),\n check.inst_param(maps_to, "maps_to", (InputPointer, FanInInputPointer)),\n )\n\n @property\n def maps_to_fan_in(self):\n return isinstance(self.maps_to, FanInInputPointer)\n\n def describe(self) -> str:\n idx = self.maps_to.fan_in_index if isinstance(self.maps_to, FanInInputPointer) else ""\n return f"{self.definition.name} -> {self.maps_to.solid_name}:{self.maps_to.input_name}{idx}"
\n\n\n
[docs]class In(\n NamedTuple(\n "_In",\n [\n ("dagster_type", Union[DagsterType, Type[NoValueSentinel]]),\n ("description", Optional[str]),\n ("default_value", Any),\n ("root_manager_key", Optional[str]),\n ("metadata", Optional[Mapping[str, Any]]),\n ("asset_key", Optional[Union[AssetKey, Callable[["InputContext"], AssetKey]]]),\n ("asset_partitions", Optional[Union[Set[str], Callable[["InputContext"], Set[str]]]]),\n ],\n )\n):\n """\n Defines an argument to an op's compute function.\n\n Inputs may flow from previous op's outputs, or be stubbed using config. They may optionally\n be typed using the Dagster type system.\n\n Args:\n dagster_type (Optional[Union[Type, DagsterType]]]):\n The type of this input. Should only be set if the correct type can not\n be inferred directly from the type signature of the decorated function.\n description (Optional[str]): Human-readable description of the input.\n default_value (Optional[Any]): The default value to use if no input is provided.\n root_manager_key (Optional[str]): (Experimental) The resource key for the\n :py:class:`RootInputManager` used for loading this input when it is not connected to an\n upstream output.\n metadata (Optional[Dict[str, Any]]): A dict of metadata for the input.\n asset_key (Optional[Union[AssetKey, InputContext -> AssetKey]]): (Experimental) An AssetKey\n (or function that produces an AssetKey from the InputContext) which should be associated\n with this In. Used for tracking lineage information through Dagster.\n asset_partitions (Optional[Union[Set[str], InputContext -> Set[str]]]): (Experimental) A\n set of partitions of the given asset_key (or a function that produces this list of\n partitions from the InputContext) which should be associated with this In.\n """\n\n def __new__(\n cls,\n dagster_type: Union[Type, DagsterType] = NoValueSentinel,\n description: Optional[str] = None,\n default_value: Any = NoValueSentinel,\n root_manager_key: Optional[str] = None,\n metadata: Optional[Mapping[str, Any]] = None,\n asset_key: Optional[Union[AssetKey, Callable[["InputContext"], AssetKey]]] = None,\n asset_partitions: Optional[Union[Set[str], Callable[["InputContext"], Set[str]]]] = None,\n ):\n return super(In, cls).__new__(\n cls,\n dagster_type=NoValueSentinel\n if dagster_type is NoValueSentinel\n else resolve_dagster_type(dagster_type),\n description=check.opt_str_param(description, "description"),\n default_value=default_value,\n root_manager_key=check.opt_str_param(root_manager_key, "root_manager_key"),\n metadata=check.opt_dict_param(metadata, "metadata", key_type=str),\n asset_key=check.opt_inst_param(asset_key, "asset_key", (AssetKey, FunctionType)),\n asset_partitions=asset_partitions,\n )\n\n @staticmethod\n def from_definition(input_def: InputDefinition):\n return In(\n dagster_type=input_def.dagster_type,\n description=input_def.description,\n default_value=input_def._default_value, # pylint: disable=protected-access\n root_manager_key=input_def.root_manager_key,\n metadata=input_def.metadata,\n asset_key=input_def._asset_key, # pylint: disable=protected-access\n asset_partitions=input_def._asset_partitions_fn, # pylint: disable=protected-access\n )\n\n def to_definition(self, name: str) -> InputDefinition:\n dagster_type = self.dagster_type if self.dagster_type is not NoValueSentinel else None\n return InputDefinition(\n name=name,\n dagster_type=dagster_type,\n description=self.description,\n default_value=self.default_value,\n root_manager_key=self.root_manager_key,\n metadata=self.metadata,\n 
asset_key=self.asset_key,\n asset_partitions=self.asset_partitions,\n )
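In application code, ``In`` is supplied through the ``ins`` argument of ``@op``; a minimal sketch:

.. code-block:: python

    from dagster import In, op

    @op(ins={"number": In(dagster_type=int, description="The value to double.")})
    def double(number):
        return number * 2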
\n\n\n
[docs]class GraphIn(NamedTuple("_GraphIn", [("description", Optional[str])])):\n """\n Represents information about an input that a graph maps.\n\n Args:\n description (Optional[str]): Human-readable description of the input.\n """\n\n def __new__(cls, description=None):\n return super(GraphIn, cls).__new__(cls, description=description)\n\n def to_definition(self, name: str) -> InputDefinition:\n return InputDefinition(name=name, description=self.description)
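``GraphIn`` plays the same role for graphs via the ``ins`` argument of ``@graph``; a sketch that reuses the ``double`` op from the previous example:

.. code-block:: python

    from dagster import GraphIn, graph

    @graph(ins={"number": GraphIn(description="Value forwarded to double.")})
    def doubling_graph(number):
        double(number)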
\n
", "current_page_name": "_modules/dagster/core/definitions/input", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.input"}, "job_definition": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.job_definition

\nfrom functools import update_wrapper\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    FrozenSet,\n    List,\n    Mapping,\n    Optional,\n    Tuple,\n    Type,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster.core.definitions.composition import MappedInputPlaceholder\nfrom dagster.core.definitions.dependency import (\n    DependencyDefinition,\n    DynamicCollectDependencyDefinition,\n    IDependencyDefinition,\n    MultiDependencyDefinition,\n    Node,\n    NodeHandle,\n    NodeInvocation,\n    SolidOutputHandle,\n)\nfrom dagster.core.definitions.events import AssetKey\nfrom dagster.core.definitions.node_definition import NodeDefinition\nfrom dagster.core.definitions.policy import RetryPolicy\nfrom dagster.core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvalidSubsetError,\n)\nfrom dagster.core.selector.subset_selector import (\n    AssetSelectionData,\n    LeafNodeSelection,\n    OpSelectionData,\n    parse_op_selection,\n)\nfrom dagster.core.utils import str_format_set\nfrom dagster.utils import merge_dicts\n\nfrom .asset_layer import AssetLayer, build_asset_selection_job\nfrom .config import ConfigMapping\nfrom .executor_definition import ExecutorDefinition\nfrom .graph_definition import GraphDefinition, SubselectedGraphDefinition\nfrom .hook_definition import HookDefinition\nfrom .logger_definition import LoggerDefinition\nfrom .mode import ModeDefinition\nfrom .partition import PartitionSetDefinition, PartitionedConfig, PartitionsDefinition\nfrom .pipeline_definition import PipelineDefinition\nfrom .preset import PresetDefinition\nfrom .resource_definition import ResourceDefinition\nfrom .run_request import RunRequest\nfrom .version_strategy import VersionStrategy\n\nif TYPE_CHECKING:\n    from dagster.core.execution.execute_in_process_result import ExecuteInProcessResult\n    from dagster.core.instance import DagsterInstance\n    from dagster.core.snap import PipelineSnapshot\n\n\n
[docs]class JobDefinition(PipelineDefinition):\n def __init__(\n self,\n graph_def: GraphDefinition,\n resource_defs: Optional[Dict[str, ResourceDefinition]] = None,\n executor_def: Optional[ExecutorDefinition] = None,\n logger_defs: Optional[Dict[str, LoggerDefinition]] = None,\n config_mapping: Optional[ConfigMapping] = None,\n partitioned_config: Optional[PartitionedConfig] = None,\n name: Optional[str] = None,\n description: Optional[str] = None,\n preset_defs: Optional[List[PresetDefinition]] = None,\n tags: Optional[Dict[str, Any]] = None,\n hook_defs: Optional[AbstractSet[HookDefinition]] = None,\n op_retry_policy: Optional[RetryPolicy] = None,\n version_strategy: Optional[VersionStrategy] = None,\n _subset_selection_data: Optional[Union[OpSelectionData, AssetSelectionData]] = None,\n asset_layer: Optional[AssetLayer] = None,\n _input_values: Optional[Mapping[str, object]] = None,\n ):\n\n # Exists for backcompat - JobDefinition is implemented as a single-mode pipeline.\n mode_def = ModeDefinition(\n resource_defs=resource_defs,\n logger_defs=logger_defs,\n executor_defs=[executor_def] if executor_def else None,\n _config_mapping=config_mapping,\n _partitioned_config=partitioned_config,\n )\n\n self._cached_partition_set: Optional["PartitionSetDefinition"] = None\n self._subset_selection_data = check.opt_inst_param(\n _subset_selection_data,\n "_subset_selection_data",\n (OpSelectionData, AssetSelectionData),\n )\n self._input_values: Mapping[str, object] = check.opt_mapping_param(\n _input_values, "_input_values"\n )\n for input_name in sorted(list(self._input_values.keys())):\n if not graph_def.has_input(input_name):\n job_name = name or graph_def.name\n raise DagsterInvalidDefinitionError(\n f"Error when constructing JobDefinition '{job_name}': Input value provided for key '{input_name}', but job has no top-level input with that name."\n )\n\n super(JobDefinition, self).__init__(\n name=name,\n description=description,\n mode_defs=[mode_def],\n preset_defs=preset_defs,\n tags=tags,\n hook_defs=hook_defs,\n solid_retry_policy=op_retry_policy,\n graph_def=graph_def,\n version_strategy=version_strategy,\n asset_layer=asset_layer,\n )\n\n @property\n def target_type(self) -> str:\n return "job"\n\n @property\n def is_job(self) -> bool:\n return True\n\n def describe_target(self):\n return f"{self.target_type} '{self.name}'"\n\n @property\n def executor_def(self) -> ExecutorDefinition:\n return self.get_mode_definition().executor_defs[0]\n\n @property\n def resource_defs(self) -> Mapping[str, ResourceDefinition]:\n return self.get_mode_definition().resource_defs\n\n @property\n def partitioned_config(self) -> Optional[PartitionedConfig]:\n return self.get_mode_definition().partitioned_config\n\n @property\n def config_mapping(self) -> Optional[ConfigMapping]:\n return self.get_mode_definition().config_mapping\n\n @property\n def loggers(self) -> Mapping[str, LoggerDefinition]:\n return self.get_mode_definition().loggers\n\n
[docs] def execute_in_process(\n self,\n run_config: Optional[Dict[str, Any]] = None,\n instance: Optional["DagsterInstance"] = None,\n partition_key: Optional[str] = None,\n raise_on_error: bool = True,\n op_selection: Optional[List[str]] = None,\n asset_selection: Optional[List[AssetKey]] = None,\n run_id: Optional[str] = None,\n input_values: Optional[Mapping[str, object]] = None,\n ) -> "ExecuteInProcessResult":\n """\n Execute the Job in-process, gathering results in-memory.\n\n The `executor_def` on the Job will be ignored, and replaced with the in-process executor.\n If using the default `io_manager`, it will switch from filesystem to in-memory.\n\n\n Args:\n run_config (Optional[Dict[str, Any]]:\n The configuration for the run\n instance (Optional[DagsterInstance]):\n The instance to execute against, an ephemeral one will be used if none provided.\n partition_key: (Optional[str])\n The string partition key that specifies the run config to execute. Can only be used\n to select run config for jobs with partitioned config.\n raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n Defaults to ``True``.\n op_selection (Optional[List[str]]): A list of op selection queries (including single op\n names) to execute. For example:\n * ``['some_op']``: selects ``some_op`` itself.\n * ``['*some_op']``: select ``some_op`` and all its ancestors (upstream dependencies).\n * ``['*some_op+++']``: select ``some_op``, all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n * ``['*some_op', 'other_op_a', 'other_op_b+']``: select ``some_op`` and all its\n ancestors, ``other_op_a`` itself, and ``other_op_b`` and its direct child ops.\n input_values (Optional[Mapping[str, Any]]):\n A dictionary that maps python objects to the top-level inputs of the job. Input values provided here will override input values that have been provided to the job directly.\n Returns:\n :py:class:`~dagster.ExecuteInProcessResult`\n\n """\n from dagster.core.definitions.executor_definition import execute_in_process_executor\n from dagster.core.execution.execute_in_process import core_execute_in_process\n\n run_config = check.opt_dict_param(run_config, "run_config")\n op_selection = check.opt_list_param(op_selection, "op_selection", str)\n asset_selection = check.opt_list_param(asset_selection, "asset_selection", AssetKey)\n\n check.invariant(\n not (op_selection and asset_selection),\n "op_selection and asset_selection cannot both be provided as args to execute_in_process",\n )\n\n partition_key = check.opt_str_param(partition_key, "partition_key")\n input_values = check.opt_mapping_param(input_values, "input_values")\n\n # Combine provided input values at execute_in_process with input values\n # provided to the definition. 
Input values provided at\n # execute_in_process will override those provided on the definition.\n input_values = merge_dicts(self._input_values, input_values)\n\n resource_defs = dict(self.resource_defs)\n logger_defs = dict(self.loggers)\n ephemeral_job = JobDefinition(\n name=self._name,\n graph_def=self._graph_def,\n resource_defs=_swap_default_io_man(resource_defs, self),\n executor_def=execute_in_process_executor,\n logger_defs=logger_defs,\n hook_defs=self.hook_defs,\n config_mapping=self.config_mapping,\n partitioned_config=self.partitioned_config,\n tags=self.tags,\n op_retry_policy=self._solid_retry_policy,\n version_strategy=self.version_strategy,\n asset_layer=self.asset_layer,\n _input_values=input_values,\n )\n\n ephemeral_job = ephemeral_job.get_job_def_for_subset_selection(\n op_selection, frozenset(asset_selection) if asset_selection else None\n )\n\n tags = None\n if partition_key:\n if not self.partitioned_config:\n check.failed(\n f"Provided partition key `{partition_key}` for job `{self._name}` without a partitioned config"\n )\n check.invariant(\n not run_config,\n "Cannot provide both run_config and partition_key arguments to `execute_in_process`",\n )\n partition_set = self.get_partition_set_def()\n if not partition_set:\n check.failed(\n f"Provided partition key `{partition_key}` for job `{self._name}` without a partitioned config"\n )\n\n partition = partition_set.get_partition(partition_key)\n run_config = partition_set.run_config_for_partition(partition)\n tags = partition_set.tags_for_partition(partition)\n\n return core_execute_in_process(\n node=self._graph_def,\n ephemeral_pipeline=ephemeral_job,\n run_config=run_config,\n instance=instance,\n output_capturing_enabled=True,\n raise_on_error=raise_on_error,\n run_tags=tags,\n run_id=run_id,\n asset_selection=frozenset(asset_selection),\n )
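A minimal sketch of calling ``execute_in_process`` on a job built with ``@job`` (the op and config names are illustrative):

.. code-block:: python

    from dagster import job, op

    @op(config_schema={"date": str})
    def emit_date(context):
        return context.op_config["date"]

    @job
    def my_job():
        emit_date()

    result = my_job.execute_in_process(
        run_config={"ops": {"emit_date": {"config": {"date": "2022-01-01"}}}}
    )
    assert result.success
    assert result.output_for_node("emit_date") == "2022-01-01"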
\n\n @property\n def op_selection_data(self) -> Optional[OpSelectionData]:\n return (\n self._subset_selection_data\n if isinstance(self._subset_selection_data, OpSelectionData)\n else None\n )\n\n @property\n def asset_selection_data(self) -> Optional[AssetSelectionData]:\n return (\n self._subset_selection_data\n if isinstance(self._subset_selection_data, AssetSelectionData)\n else None\n )\n\n def get_job_def_for_subset_selection(\n self,\n op_selection: Optional[List[str]] = None,\n asset_selection: Optional[FrozenSet[AssetKey]] = None,\n ):\n check.invariant(\n not (op_selection and asset_selection),\n "op_selection and asset_selection cannot both be provided as args to execute_in_process",\n )\n if op_selection:\n return self._get_job_def_for_op_selection(op_selection)\n if asset_selection: # asset_selection:\n return self._get_job_def_for_asset_selection(asset_selection)\n else:\n return self\n\n def _get_job_def_for_asset_selection(\n self,\n asset_selection: Optional[FrozenSet[AssetKey]] = None,\n ) -> "JobDefinition":\n asset_selection = check.opt_set_param(asset_selection, "asset_selection", AssetKey)\n\n for asset in asset_selection:\n nonexistent_assets = [\n asset for asset in asset_selection if asset not in self.asset_layer.asset_keys\n ]\n nonexistent_asset_strings = [\n asset_str\n for asset_str in (asset.to_string() for asset in nonexistent_assets)\n if asset_str\n ]\n if nonexistent_assets:\n raise DagsterInvalidSubsetError(\n "Assets provided in asset_selection argument "\n f"{', '.join(nonexistent_asset_strings)} do not exist in parent asset group or job."\n )\n asset_selection_data = AssetSelectionData(\n asset_selection=asset_selection,\n parent_job_def=self,\n )\n\n check.invariant(\n self.asset_layer._assets_defs != None, # pylint:disable=protected-access\n "Asset layer must have _asset_defs argument defined",\n )\n\n new_job = build_asset_selection_job(\n name=self.name,\n assets=self.asset_layer._assets_defs, # pylint:disable=protected-access\n source_assets=self.asset_layer._source_asset_defs, # pylint:disable=protected-access\n executor_def=self.executor_def,\n resource_defs=self.resource_defs,\n description=self.description,\n tags=self.tags,\n asset_selection=asset_selection,\n asset_selection_data=asset_selection_data,\n )\n return new_job\n\n def _get_job_def_for_op_selection(\n self,\n op_selection: Optional[List[str]] = None,\n ) -> "JobDefinition":\n if not op_selection:\n return self\n\n op_selection = check.opt_list_param(op_selection, "op_selection", str)\n\n resolved_op_selection_dict = parse_op_selection(self, op_selection)\n\n try:\n sub_graph = get_subselected_graph_definition(self.graph, resolved_op_selection_dict)\n\n return JobDefinition(\n name=self.name,\n description=self.description,\n resource_defs=dict(self.resource_defs),\n logger_defs=dict(self.loggers),\n executor_def=self.executor_def,\n config_mapping=self.config_mapping,\n partitioned_config=self.partitioned_config,\n preset_defs=self.preset_defs,\n tags=self.tags,\n hook_defs=self.hook_defs,\n op_retry_policy=self._solid_retry_policy,\n graph_def=sub_graph,\n version_strategy=self.version_strategy,\n _subset_selection_data=OpSelectionData(\n op_selection=op_selection,\n resolved_op_selection=set(\n resolved_op_selection_dict.keys()\n ), # equivalent to solids_to_execute. 
currently only gets top level nodes.\n parent_job_def=self, # used by pipeline snapshot lineage\n ),\n # TODO: subset this structure.\n # https://github.com/dagster-io/dagster/issues/7541\n asset_layer=self.asset_layer,\n )\n except DagsterInvalidDefinitionError as exc:\n # This handles the case when you construct a subset such that an unsatisfied\n # input cannot be loaded from config. Instead of throwing a DagsterInvalidDefinitionError,\n # we re-raise a DagsterInvalidSubsetError.\n raise DagsterInvalidSubsetError(\n f"The attempted subset {str_format_set(resolved_op_selection_dict)} for graph "\n f"{self.graph.name} results in an invalid graph."\n ) from exc\n\n def get_partition_set_def(self) -> Optional["PartitionSetDefinition"]:\n\n mode = self.get_mode_definition()\n if not mode.partitioned_config:\n return None\n\n if not self._cached_partition_set:\n\n tags_fn = mode.partitioned_config.tags_for_partition_fn\n if not tags_fn:\n tags_fn = lambda _: {}\n self._cached_partition_set = PartitionSetDefinition(\n job_name=self.name,\n name=f"{self.name}_partition_set",\n partitions_def=mode.partitioned_config.partitions_def,\n run_config_fn_for_partition=mode.partitioned_config.run_config_for_partition_fn,\n tags_fn_for_partition=tags_fn,\n mode=mode.name,\n )\n\n return self._cached_partition_set\n\n @property\n def partitions_def(self) -> Optional[PartitionsDefinition]:\n mode = self.get_mode_definition()\n if not mode.partitioned_config:\n return None\n\n return mode.partitioned_config.partitions_def\n\n def run_request_for_partition(self, partition_key: str, run_key: Optional[str]) -> RunRequest:\n partition_set = self.get_partition_set_def()\n if not partition_set:\n check.failed("Called run_request_for_partition on a non-partitioned job")\n\n partition = partition_set.get_partition(partition_key)\n run_config = partition_set.run_config_for_partition(partition)\n tags = partition_set.tags_for_partition(partition)\n return RunRequest(run_key=run_key, run_config=run_config, tags=tags)\n\n
[docs] def with_hooks(self, hook_defs: AbstractSet[HookDefinition]) -> "JobDefinition":\n """Apply a set of hooks to all op instances within the job."""\n\n hook_defs = check.set_param(hook_defs, "hook_defs", of_type=HookDefinition)\n\n job_def = JobDefinition(\n name=self.name,\n graph_def=self._graph_def,\n resource_defs=dict(self.resource_defs),\n logger_defs=dict(self.loggers),\n executor_def=self.executor_def,\n partitioned_config=self.partitioned_config,\n config_mapping=self.config_mapping,\n preset_defs=self.preset_defs,\n tags=self.tags,\n hook_defs=hook_defs | self.hook_defs,\n description=self._description,\n op_retry_policy=self._solid_retry_policy,\n asset_layer=self.asset_layer,\n _subset_selection_data=self._subset_selection_data,\n )\n\n update_wrapper(job_def, self, updated=())\n\n return job_def
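A sketch of attaching a hook to every op in an existing job with ``with_hooks`` (the hook body is illustrative):

.. code-block:: python

    from dagster import failure_hook, job, op

    @failure_hook
    def notify_on_failure(context):
        # Illustrative only -- send an alert, page someone, etc.
        print(f"op {context.op.name} failed")

    @op
    def flaky_op():
        ...

    @job
    def my_job():
        flaky_op()

    my_job_with_alerts = my_job.with_hooks({notify_on_failure})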
\n\n def get_parent_pipeline_snapshot(self) -> Optional["PipelineSnapshot"]:\n if self.op_selection_data:\n return self.op_selection_data.parent_job_def.get_pipeline_snapshot()\n elif self.asset_selection_data:\n return self.asset_selection_data.parent_job_def.get_pipeline_snapshot()\n else:\n return None\n\n def has_direct_input_value(self, input_name: str) -> bool:\n return input_name in self._input_values\n\n def get_direct_input_value(self, input_name: str) -> object:\n if input_name not in self._input_values:\n raise DagsterInvalidInvocationError(\n f"On job '{self.name}', attempted to retrieve input value for input named '{input_name}', but no value was provided. Provided input values: {sorted(list(self._input_values.keys()))}"\n )\n return self._input_values[input_name]
\n\n\ndef _swap_default_io_man(resources: Dict[str, ResourceDefinition], job: PipelineDefinition):\n """\n Used to create the user facing experience of the default io_manager\n switching to in-memory when using execute_in_process.\n """\n from dagster.core.storage.mem_io_manager import mem_io_manager\n\n from .graph_definition import default_job_io_manager\n\n if (\n # pylint: disable=comparison-with-callable\n resources.get("io_manager") in [default_job_io_manager]\n and job.version_strategy is None\n ):\n updated_resources = dict(resources)\n updated_resources["io_manager"] = mem_io_manager\n return updated_resources\n\n return resources\n\n\ndef _dep_key_of(node: Node) -> NodeInvocation:\n return NodeInvocation(\n name=node.definition.name,\n alias=node.name,\n tags=node.tags,\n hook_defs=node.hook_defs,\n retry_policy=node.retry_policy,\n )\n\n\ndef get_subselected_graph_definition(\n graph: GraphDefinition,\n resolved_op_selection_dict: Dict,\n parent_handle: Optional[NodeHandle] = None,\n) -> SubselectedGraphDefinition:\n deps: Dict[\n Union[str, NodeInvocation],\n Dict[str, IDependencyDefinition],\n ] = {}\n\n selected_nodes: List[Tuple[str, NodeDefinition]] = []\n\n for node in graph.solids_in_topological_order:\n node_handle = NodeHandle(node.name, parent=parent_handle)\n # skip if the node isn't selected\n if node.name not in resolved_op_selection_dict:\n continue\n\n # rebuild graph if any nodes inside the graph are selected\n if node.is_graph and resolved_op_selection_dict[node.name] is not LeafNodeSelection:\n definition = get_subselected_graph_definition(\n node.definition,\n resolved_op_selection_dict[node.name],\n parent_handle=node_handle,\n )\n # use definition if the node as a whole is selected. this includes selecting the entire graph\n else:\n definition = node.definition\n selected_nodes.append((node.name, definition))\n\n # build dependencies for the node. 
we do it for both cases because nested graphs can have\n # inputs and outputs too\n deps[_dep_key_of(node)] = {}\n for input_handle in node.input_handles():\n if graph.dependency_structure.has_direct_dep(input_handle):\n output_handle = graph.dependency_structure.get_direct_dep(input_handle)\n if output_handle.solid.name in resolved_op_selection_dict:\n deps[_dep_key_of(node)][input_handle.input_def.name] = DependencyDefinition(\n solid=output_handle.solid.name, output=output_handle.output_def.name\n )\n elif graph.dependency_structure.has_dynamic_fan_in_dep(input_handle):\n output_handle = graph.dependency_structure.get_dynamic_fan_in_dep(input_handle)\n if output_handle.solid.name in resolved_op_selection_dict:\n deps[_dep_key_of(node)][\n input_handle.input_def.name\n ] = DynamicCollectDependencyDefinition(\n solid_name=output_handle.solid.name,\n output_name=output_handle.output_def.name,\n )\n elif graph.dependency_structure.has_fan_in_deps(input_handle):\n output_handles = graph.dependency_structure.get_fan_in_deps(input_handle)\n multi_dependencies = [\n DependencyDefinition(\n solid=output_handle.solid.name, output=output_handle.output_def.name\n )\n for output_handle in output_handles\n if (\n isinstance(output_handle, SolidOutputHandle)\n and output_handle.solid.name in resolved_op_selection_dict\n )\n ]\n deps[_dep_key_of(node)][input_handle.input_def.name] = MultiDependencyDefinition(\n cast(\n List[Union[DependencyDefinition, Type[MappedInputPlaceholder]]],\n multi_dependencies,\n )\n )\n # else input is unconnected\n\n # filter out unselected input/output mapping\n new_input_mappings = list(\n filter(\n lambda input_mapping: input_mapping.maps_to.solid_name\n in [name for name, _ in selected_nodes],\n graph._input_mappings, # pylint: disable=protected-access\n )\n )\n new_output_mappings = list(\n filter(\n lambda output_mapping: output_mapping.maps_from.solid_name\n in [name for name, _ in selected_nodes],\n graph._output_mappings, # pylint: disable=protected-access\n )\n )\n\n return SubselectedGraphDefinition(\n parent_graph_def=graph,\n dependencies=deps,\n node_defs=[definition for _, definition in selected_nodes],\n input_mappings=new_input_mappings,\n output_mappings=new_output_mappings,\n )\n\n\ndef get_direct_input_values_from_job(target: PipelineDefinition) -> Mapping[str, Any]:\n if target.is_job:\n return cast(JobDefinition, target)._input_values # pylint: disable=protected-access\n else:\n return {}\n
", "current_page_name": "_modules/dagster/core/definitions/job_definition", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.job_definition"}, "logger_definition": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.logger_definition

\nimport logging\nfrom typing import TYPE_CHECKING, Any, Callable, Optional, Union\n\nimport dagster._check as check\nfrom dagster.core.definitions.config import is_callable_valid_config_arg\nfrom dagster.core.definitions.configurable import AnonymousConfigurableDefinition\nfrom dagster.core.errors import DagsterInvalidInvocationError\n\nfrom ..decorator_utils import get_function_params\nfrom .definition_config_schema import convert_user_facing_definition_config_schema\n\nif TYPE_CHECKING:\n    from dagster.core.definitions import JobDefinition, PipelineDefinition\n    from dagster.core.execution.context.logger import InitLoggerContext, UnboundInitLoggerContext\n\n    InitLoggerFunction = Callable[[InitLoggerContext], logging.Logger]\n\n\n
[docs]class LoggerDefinition(AnonymousConfigurableDefinition):\n """Core class for defining loggers.\n\n Loggers are job-scoped logging handlers, which will be automatically invoked whenever\n dagster messages are logged from within a job.\n\n Args:\n logger_fn (Callable[[InitLoggerContext], logging.Logger]): User-provided function to\n instantiate the logger. This logger will be automatically invoked whenever the methods\n on ``context.log`` are called from within job/pipeline compute logic.\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.logger_config`. If not set, Dagster will accept any config provided.\n description (Optional[str]): A human-readable description of this logger.\n """\n\n def __init__(\n self,\n logger_fn: "InitLoggerFunction",\n config_schema: Any = None,\n description: Optional[str] = None,\n ):\n self._logger_fn = check.callable_param(logger_fn, "logger_fn")\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._description = check.opt_str_param(description, "description")\n\n def __call__(self, *args, **kwargs):\n from dagster.core.execution.context.logger import UnboundInitLoggerContext\n\n from .logger_invocation import logger_invocation_result\n\n if len(args) == 0 and len(kwargs) == 0:\n raise DagsterInvalidInvocationError(\n "Logger initialization function has context argument, but no context argument was "\n "provided when invoking."\n )\n if len(args) + len(kwargs) > 1:\n raise DagsterInvalidInvocationError(\n "Initialization of logger received multiple arguments. Only a first "\n "positional context parameter should be provided when invoking."\n )\n\n context_param_name = get_function_params(self.logger_fn)[0].name\n\n if args:\n context = check.opt_inst_param(\n args[0],\n context_param_name,\n UnboundInitLoggerContext,\n default=UnboundInitLoggerContext(logger_config=None, pipeline_def=None),\n )\n return logger_invocation_result(self, context)\n else:\n if context_param_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Logger initialization expected argument '{context_param_name}'."\n )\n context = check.opt_inst_param(\n kwargs[context_param_name],\n context_param_name,\n UnboundInitLoggerContext,\n default=UnboundInitLoggerContext(logger_config=None, pipeline_def=None),\n )\n\n return logger_invocation_result(self, context)\n\n @property\n def logger_fn(self) -> "InitLoggerFunction":\n return self._logger_fn\n\n @property\n def config_schema(self) -> Any:\n return self._config_schema\n\n @property\n def description(self) -> Optional[str]:\n return self._description\n\n def copy_for_configured(\n self, description: Optional[str], config_schema: Any, _\n ) -> "LoggerDefinition":\n return LoggerDefinition(\n config_schema=config_schema,\n description=description or self.description,\n logger_fn=self.logger_fn,\n )
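A ``LoggerDefinition`` can be constructed directly from an initialization function; a minimal sketch:

.. code-block:: python

    import logging

    from dagster import LoggerDefinition

    def _init_console_logger(init_context):
        logger_ = logging.getLogger("my_console_logger")
        logger_.setLevel(init_context.logger_config["log_level"])
        logger_.addHandler(logging.StreamHandler())
        return logger_

    console_logger = LoggerDefinition(
        logger_fn=_init_console_logger,
        config_schema={"log_level": str},
        description="A console logger configured by log level.",
    )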
\n\n\n
[docs]def logger(\n config_schema: Any = None, description: Optional[str] = None\n) -> Union["LoggerDefinition", Callable[["InitLoggerFunction"], "LoggerDefinition"]]:\n """Define a logger.\n\n The decorated function should accept an :py:class:`InitLoggerContext` and return an instance of\n :py:class:`python:logging.Logger`. This function will become the ``logger_fn`` of an underlying\n :py:class:`LoggerDefinition`.\n\n Args:\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.logger_config`. If not set, Dagster will accept any config provided.\n description (Optional[str]): A human-readable description of the logger.\n """\n # This case is for when decorator is used bare, without arguments.\n # E.g. @logger versus @logger()\n if callable(config_schema) and not is_callable_valid_config_arg(config_schema):\n return LoggerDefinition(logger_fn=config_schema)\n\n def _wrap(logger_fn: "InitLoggerFunction") -> "LoggerDefinition":\n return LoggerDefinition(\n logger_fn=logger_fn,\n config_schema=config_schema,\n description=description,\n )\n\n return _wrap
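The same logger expressed with the ``@logger`` decorator; a minimal sketch:

.. code-block:: python

    import logging

    from dagster import logger

    @logger(config_schema={"log_level": str}, description="A console logger configured by log level.")
    def console_logger(init_context):
        logger_ = logging.getLogger("console_logger")
        logger_.setLevel(init_context.logger_config["log_level"])
        logger_.addHandler(logging.StreamHandler())
        return logger_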
\n\n\n
[docs]def build_init_logger_context(\n logger_config: Any = None,\n pipeline_def: Optional["PipelineDefinition"] = None,\n job_def: Optional["JobDefinition"] = None,\n) -> "UnboundInitLoggerContext":\n """Builds logger initialization context from provided parameters.\n\n This function can be used to provide the context argument to the invocation of a logger\n definition.\n\n Note that you may only specify one of pipeline_def and job_def.\n\n Args:\n logger_config (Any): The config to provide during initialization of logger.\n pipeline_def (Optional[PipelineDefinition]): The pipeline definition that the logger will be\n used with.\n job_def (Optional[JobDefinition]): The job definition that the logger will be used with.\n\n Examples:\n .. code-block:: python\n\n context = build_init_logger_context()\n logger_to_init(context)\n """\n from dagster.core.definitions import JobDefinition, PipelineDefinition\n from dagster.core.execution.context.logger import UnboundInitLoggerContext\n\n check.opt_inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n check.opt_inst_param(job_def, "job_def", JobDefinition)\n\n check.invariant(\n not (pipeline_def and job_def),\n "In build_init_logger_context, you may only specify one of the pipeline_def and job_def parameters, not both.",\n )\n\n return UnboundInitLoggerContext(\n logger_config=logger_config, pipeline_def=pipeline_def or job_def\n )
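Combining ``build_init_logger_context`` with direct invocation of a logger definition, e.g. in a test; this sketch assumes the ``console_logger`` defined above:

.. code-block:: python

    import logging

    context = build_init_logger_context(logger_config={"log_level": "INFO"})
    instance = console_logger(context)
    assert isinstance(instance, logging.Logger)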
\n
", "current_page_name": "_modules/dagster/core/definitions/logger_definition", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.logger_definition"}, "metadata": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.metadata

\nimport functools\nimport os\nimport re\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Dict,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nimport dagster.seven as seven\nfrom dagster.core.errors import DagsterInvalidMetadata\nfrom dagster.serdes import whitelist_for_serdes\nfrom dagster.utils.backcompat import (\n    canonicalize_backcompat_args,\n    deprecation_warning,\n    experimental,\n    experimental_class_warning,\n)\n\nfrom .table import TableColumn, TableColumnConstraints, TableConstraints, TableRecord, TableSchema\n\nif TYPE_CHECKING:\n    from dagster.core.definitions.events import AssetKey\n\nRawMetadataValue = Union[\n    "MetadataValue",\n    dict,\n    float,\n    int,\n    list,\n    str,\n]\n\nMetadataMapping = Mapping[str, "MetadataValue"]\nMetadataUserInput = Mapping[str, RawMetadataValue]\n\n\ndef last_file_comp(path: str) -> str:\n    return os.path.basename(os.path.normpath(path))\n\n\n# ########################\n# ##### NORMALIZATION\n# ########################\n\n\ndef normalize_metadata(\n    metadata: Mapping[str, RawMetadataValue],\n    metadata_entries: Sequence[Union["MetadataEntry", "PartitionMetadataEntry"]],\n    allow_invalid: bool = False,\n) -> List[Union["MetadataEntry", "PartitionMetadataEntry"]]:\n    if metadata and metadata_entries:\n        raise DagsterInvalidMetadata(\n            "Attempted to provide both `metadata` and `metadata_entries` arguments to an event. "\n            "Must provide only one of the two."\n        )\n\n    if metadata_entries:\n        deprecation_warning(\n            'Argument "metadata_entries"',\n            "0.15.0",\n            additional_warn_txt="Use argument `metadata` instead. The `MetadataEntry` `description` attribute is also deprecated-- argument `metadata` takes a label: value dictionary.",\n            stacklevel=4,  # to get the caller of `normalize_metadata`\n        )\n        return check.list_param(\n            metadata_entries, "metadata_entries", (MetadataEntry, PartitionMetadataEntry)\n        )\n\n    # This is a stopgap measure to deal with unsupported metadata values, which occur when we try\n    # to convert arbitrary metadata (on e.g. OutputDefinition) to a MetadataValue, which is required\n    # for serialization. 
This will cause unsupported values to be silently replaced with a\n    # string placeholder.\n    elif allow_invalid:\n        metadata_entries = []\n        for k, v in metadata.items():\n            try:\n                metadata_entries.append(package_metadata_value(k, v))\n            except DagsterInvalidMetadata:\n                deprecation_warning(\n                    "Support for arbitrary metadata values",\n                    "0.15.0",\n                    additional_warn_txt=f"In the future, all user-supplied metadata values must be one of {RawMetadataValue}",\n                    stacklevel=4,  # to get the caller of `normalize_metadata`\n                )\n                metadata_entries.append(\n                    MetadataEntry.text(f"[{v.__class__.__name__}] (unserializable)", k)\n                )\n        return metadata_entries\n\n    return [\n        package_metadata_value(k, v)\n        for k, v in check.opt_dict_param(metadata, "metadata", key_type=str).items()\n    ]\n\n\ndef normalize_metadata_value(raw_value: RawMetadataValue):\n    from dagster.core.definitions.events import AssetKey\n\n    if isinstance(raw_value, MetadataValue):\n        return raw_value\n    elif isinstance(raw_value, str):\n        return MetadataValue.text(raw_value)\n    elif isinstance(raw_value, float):\n        return MetadataValue.float(raw_value)\n    elif isinstance(raw_value, bool):\n        return MetadataValue.bool(raw_value)\n    elif isinstance(raw_value, int):\n        return MetadataValue.int(raw_value)\n    elif isinstance(raw_value, dict):\n        return MetadataValue.json(raw_value)\n    elif isinstance(raw_value, os.PathLike):\n        return MetadataValue.path(raw_value)\n    elif isinstance(raw_value, AssetKey):\n        return MetadataValue.asset(raw_value)\n    elif isinstance(raw_value, TableSchema):\n        return MetadataValue.table_schema(raw_value)\n\n    raise DagsterInvalidMetadata(\n        f"Its type was {type(raw_value)}. Consider wrapping the value with the appropriate "\n        "MetadataValue type."\n    )\n\n\ndef package_metadata_value(label: str, raw_value: RawMetadataValue) -> "MetadataEntry":\n    check.str_param(label, "label")\n\n    if isinstance(raw_value, (MetadataEntry, PartitionMetadataEntry)):\n        raise DagsterInvalidMetadata(\n            f"Expected a metadata value, found an instance of {raw_value.__class__.__name__}. Consider "\n            "instead using a MetadataValue wrapper for the value."\n        )\n    try:\n        value = normalize_metadata_value(raw_value)\n    except DagsterInvalidMetadata as e:\n        raise DagsterInvalidMetadata(\n            f'Could not resolve the metadata value for "{label}" to a known type. {e}'\n        ) from None\n    return MetadataEntry(label=label, value=value)\n\n\n# ########################\n# ##### METADATA VALUE\n# ########################\n\n\n
[docs]class MetadataValue:\n """Utility class to wrap metadata values passed into Dagster events so that they can be\n displayed in Dagit and other tooling.\n\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "my_text_label": "hello",\n "dashboard_url": MetadataValue.url("http://mycoolsite.com/my_dashboard"),\n "num_rows": 0,\n },\n )\n """\n\n
[docs] @staticmethod\n def text(text: str) -> "TextMetadataValue":\n """Static constructor for a metadata value wrapping text as\n :py:class:`TextMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "my_text_label": MetadataValue.text("hello")\n },\n )\n\n Args:\n text (str): The text string for a metadata entry.\n """\n return TextMetadataValue(text)
\n\n
[docs] @staticmethod\n def url(url: str) -> "UrlMetadataValue":\n """Static constructor for a metadata value wrapping a URL as\n :py:class:`UrlMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield AssetMaterialization(\n asset_key="my_dashboard",\n metadata={\n "dashboard_url": MetadataValue.url("http://mycoolsite.com/my_dashboard"),\n }\n )\n\n\n Args:\n url (str): The URL for a metadata entry.\n """\n return UrlMetadataValue(url)
\n\n
[docs] @staticmethod\n def path(path: Union[str, os.PathLike]) -> "PathMetadataValue":\n """Static constructor for a metadata value wrapping a path as\n :py:class:`PathMetadataValue`. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "filepath": MetadataValue.path("path/to/file"),\n }\n )\n\n Args:\n path (str): The path for a metadata entry.\n """\n return PathMetadataValue(path)
\n\n
[docs] @staticmethod\n def json(data: Dict[str, Any]) -> "JsonMetadataValue":\n """Static constructor for a metadata value wrapping JSON data as\n :py:class:`JsonMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield ExpectationResult(\n success=not missing_things,\n label="is_present",\n metadata={\n "about my dataset": MetadataValue.json({"missing_columns": missing_things})\n },\n )\n\n Args:\n data (Dict[str, Any]): The JSON data for a metadata entry.\n """\n return JsonMetadataValue(data)
\n\n
[docs] @staticmethod\n def md(data: str) -> "MarkdownMetadataValue":\n """Static constructor for a metadata value wrapping markdown data as\n :py:class:`MarkdownMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events. For example:\n\n\n .. code-block:: python\n\n @op\n def emit_metadata(context, md_str):\n yield AssetMaterialization(\n asset_key="info",\n metadata={\n 'Details': MetadataValue.md(md_str)\n },\n )\n\n Args:\n data (str): The markdown for a metadata entry.\n """\n return MarkdownMetadataValue(data)
\n\n
[docs] @staticmethod\n def python_artifact(python_artifact: Callable) -> "PythonArtifactMetadataValue":\n """Static constructor for a metadata value wrapping a python artifact as\n :py:class:`PythonArtifactMetadataValue`. Can be used as the value type for the\n `metadata` parameter for supported events. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "class": MetadataValue.python_artifact(MyClass),\n "function": MetadataValue.python_artifact(my_function),\n }\n )\n\n Args:\n python_artifact (Callable): The python class or function for a metadata entry.\n """\n check.callable_param(python_artifact, "python_artifact")\n return PythonArtifactMetadataValue(python_artifact.__module__, python_artifact.__name__)
\n\n
[docs] @staticmethod\n def float(value: float) -> "FloatMetadataValue":\n """Static constructor for a metadata value wrapping a float as\n :py:class:`FloatMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "size (bytes)": MetadataValue.float(calculate_bytes(df)),\n }\n )\n\n Args:\n value (float): The float value for a metadata entry.\n """\n\n return FloatMetadataValue(value)
\n\n
[docs] @staticmethod\n def int(value: int) -> "IntMetadataValue":\n """Static constructor for a metadata value wrapping an int as\n :py:class:`IntMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "number of rows": MetadataValue.int(len(df)),\n },\n )\n\n Args:\n value (int): The int value for a metadata entry.\n """\n\n return IntMetadataValue(value)
\n\n
[docs] @staticmethod\n def bool(value: bool) -> "BoolMetadataValue":\n """Static constructor for a metadata value wrapping a bool as\n :py:class:`BoolMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "num rows > 1000": MetadataValue.bool(len(df) > 1000),\n },\n )\n\n Args:\n value (bool): The bool value for a metadata entry.\n """\n\n return BoolMetadataValue(value)
\n\n @staticmethod\n def pipeline_run(run_id: str) -> "DagsterPipelineRunMetadataValue":\n check.str_param(run_id, "run_id")\n return DagsterPipelineRunMetadataValue(run_id)\n\n
[docs] @staticmethod\n def dagster_run(run_id: str) -> "DagsterPipelineRunMetadataValue":\n """Static constructor for a metadata value wrapping a reference to a Dagster run.\n\n Args:\n run_id (str): The ID of the run.\n """\n return MetadataValue.pipeline_run(run_id)
\n\n
[docs] @staticmethod\n def asset(asset_key: "AssetKey") -> "DagsterAssetMetadataValue":\n """Static constructor for a metadata value referencing a Dagster asset, by key.\n\n For example:\n\n .. code-block:: python\n\n @op\n def validate_table(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey("my_table"),\n metadata={\n "Related asset": MetadataValue.asset(AssetKey('my_other_table')),\n },\n )\n\n Args:\n asset_key (AssetKey): The asset key referencing the asset.\n """\n\n from dagster.core.definitions.events import AssetKey\n\n check.inst_param(asset_key, "asset_key", AssetKey)\n return DagsterAssetMetadataValue(asset_key)
\n\n
[docs] @staticmethod\n @experimental\n def table(\n records: List[TableRecord], schema: Optional[TableSchema] = None\n ) -> "TableMetadataValue":\n """Static constructor for a metadata value wrapping arbitrary tabular data as\n :py:class:`TableMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield ExpectationResult(\n success=not has_errors,\n label="is_valid",\n metadata={\n "errors": MetadataValue.table(\n records=[\n TableRecord(code="invalid-data-type", row=2, col="name"),\n ],\n schema=TableSchema(\n columns=[\n TableColumn(name="code", type="string"),\n TableColumn(name="row", type="int"),\n TableColumn(name="col", type="string"),\n ]\n )\n ),\n },\n )\n\n Args:\n records (List[TableRecord]): The data as a list of records (i.e. rows).\n schema (Optional[TableSchema]): A schema for the table.\n """\n return TableMetadataValue(records, schema)
\n\n
[docs] @staticmethod\n def table_schema(\n schema: TableSchema,\n ) -> "TableSchemaMetadataValue":\n """Static constructor for a metadata value wrapping a table schema as\n :py:class:`TableSchemaMetadataValue`. Can be used as the value type\n for the `metadata` parameter for supported events. For example:\n\n .. code-block:: python\n\n schema = TableSchema(\n columns = [\n TableColumn(name="id", type="int"),\n TableColumn(name="status", type="bool"),\n ]\n )\n\n DagsterType(\n type_check_fn=some_validation_fn,\n name='MyTable',\n metadata={\n 'my_table_schema': MetadataValue.table_schema(schema),\n }\n )\n\n Args:\n schema (TableSchema): The table schema for a metadata entry.\n """\n return TableSchemaMetadataValue(schema)
\n\n\n# ########################\n# ##### METADATA VALUE TYPES\n# ########################\n\n# NOTE: We have `type: ignore` in a few places below because mypy complains about an instance method\n# (e.g. `text`) overriding a static method on the superclass of the same name. This is not a concern\n# for us because these static methods should never be called on instances.\n\n# NOTE: `XMetadataValue` classes are serialized with a storage name of `XMetadataEntryData` to\n# maintain backward compatibility. See docstring of `whitelist_for_serdes` for more info.\n\n\n
[docs]@whitelist_for_serdes(storage_name="TextMetadataEntryData")\nclass TextMetadataValue( # type: ignore\n NamedTuple(\n "_TextMetadataValue",\n [\n ("text", Optional[str]),\n ],\n ),\n MetadataValue,\n):\n """Container class for text metadata entry data.\n\n Args:\n text (Optional[str]): The text data.\n """\n\n def __new__(cls, text: Optional[str]):\n return super(TextMetadataValue, cls).__new__(\n cls, check.opt_str_param(text, "text", default="")\n )
\n\n\n
[docs]@whitelist_for_serdes(storage_name="UrlMetadataEntryData")\nclass UrlMetadataValue( # type: ignore\n NamedTuple(\n "_UrlMetadataValue",\n [\n ("url", Optional[str]),\n ],\n ),\n MetadataValue,\n):\n """Container class for URL metadata entry data.\n\n Args:\n url (Optional[str]): The URL as a string.\n """\n\n def __new__(cls, url: Optional[str]):\n return super(UrlMetadataValue, cls).__new__(\n cls, check.opt_str_param(url, "url", default="")\n )
\n\n\n
[docs]@whitelist_for_serdes(storage_name="PathMetadataEntryData")\nclass PathMetadataValue( # type: ignore\n NamedTuple("_PathMetadataValue", [("path", Optional[str])]), MetadataValue\n):\n """Container class for path metadata entry data.\n\n Args:\n path (Optional[str]): The path as a string or conforming to os.PathLike.\n """\n\n def __new__(cls, path: Optional[Union[str, os.PathLike]]):\n return super(PathMetadataValue, cls).__new__(\n cls, check.opt_path_param(path, "path", default="")\n )
\n\n\n
[docs]@whitelist_for_serdes(storage_name="JsonMetadataEntryData")\nclass JsonMetadataValue(\n NamedTuple(\n "_JsonMetadataValue",\n [\n ("data", Dict[str, Any]),\n ],\n ),\n MetadataValue,\n):\n """Container class for JSON metadata entry data.\n\n Args:\n data (Dict[str, Any]): The JSON data.\n """\n\n def __new__(cls, data: Optional[Dict[str, Any]]):\n data = check.opt_dict_param(data, "data", key_type=str)\n try:\n # check that the value is JSON serializable\n seven.dumps(data)\n except TypeError:\n raise DagsterInvalidMetadata("Value is a dictionary but is not JSON serializable.")\n return super(JsonMetadataValue, cls).__new__(cls, data)
\n\n\n
[docs]@whitelist_for_serdes(storage_name="MarkdownMetadataEntryData")\nclass MarkdownMetadataValue(\n NamedTuple(\n "_MarkdownMetadataValue",\n [\n ("md_str", Optional[str]),\n ],\n ),\n MetadataValue,\n):\n """Container class for markdown metadata entry data.\n\n Args:\n md_str (Optional[str]): The markdown as a string.\n """\n\n def __new__(cls, md_str: Optional[str]):\n return super(MarkdownMetadataValue, cls).__new__(\n cls, check.opt_str_param(md_str, "md_str", default="")\n )
\n\n\n
[docs]@whitelist_for_serdes(storage_name="PythonArtifactMetadataEntryData")\nclass PythonArtifactMetadataValue(\n NamedTuple(\n "_PythonArtifactMetadataValue",\n [\n ("module", str),\n ("name", str),\n ],\n ),\n MetadataValue,\n):\n """Container class for python artifact metadata entry data.\n\n Args:\n module (str): The module where the python artifact can be found\n name (str): The name of the python artifact\n """\n\n def __new__(cls, module: str, name: str):\n return super(PythonArtifactMetadataValue, cls).__new__(\n cls, check.str_param(module, "module"), check.str_param(name, "name")\n )
\n\n\n
[docs]@whitelist_for_serdes(storage_name="FloatMetadataEntryData")\nclass FloatMetadataValue(\n NamedTuple(\n "_FloatMetadataValue",\n [\n ("value", Optional[float]),\n ],\n ),\n MetadataValue,\n):\n """Container class for float metadata entry data.\n\n Args:\n value (Optional[float]): The float value.\n """\n\n def __new__(cls, value: Optional[float]):\n return super(FloatMetadataValue, cls).__new__(cls, check.opt_float_param(value, "value"))
\n\n\n
[docs]@whitelist_for_serdes(storage_name="IntMetadataEntryData")\nclass IntMetadataValue(\n NamedTuple(\n "_IntMetadataValue",\n [\n ("value", Optional[int]),\n ],\n ),\n MetadataValue,\n):\n """Container class for int metadata entry data.\n\n Args:\n value (Optional[int]): The int value.\n """\n\n def __new__(cls, value: Optional[int]):\n return super(IntMetadataValue, cls).__new__(cls, check.opt_int_param(value, "value"))
\n\n\n@whitelist_for_serdes(storage_name="BoolMetadataEntryData")\nclass BoolMetadataValue(\n NamedTuple("_BoolMetadataValue", [("value", Optional[bool])]),\n MetadataValue,\n):\n """Container class for bool metadata entry data.\n\n Args:\n value (Optional[bool]): The bool value.\n """\n\n def __new__(cls, value: Optional[bool]):\n return super(BoolMetadataValue, cls).__new__(cls, check.opt_bool_param(value, "value"))\n\n\n
[docs]@whitelist_for_serdes(storage_name="DagsterPipelineRunMetadataEntryData")\nclass DagsterPipelineRunMetadataValue(\n NamedTuple(\n "_DagsterPipelineRunMetadataValue",\n [\n ("run_id", str),\n ],\n ),\n MetadataValue,\n):\n """Representation of a dagster pipeline run.\n\n Args:\n run_id (str): The pipeline run id\n """\n\n def __new__(cls, run_id: str):\n return super(DagsterPipelineRunMetadataValue, cls).__new__(\n cls, check.str_param(run_id, "run_id")\n )
\n\n\n
[docs]@whitelist_for_serdes(storage_name="DagsterAssetMetadataEntryData")\nclass DagsterAssetMetadataValue(\n NamedTuple("_DagsterAssetMetadataValue", [("asset_key", "AssetKey")]), MetadataValue\n):\n """Representation of a dagster asset.\n\n Args:\n asset_key (AssetKey): The dagster asset key\n """\n\n def __new__(cls, asset_key: "AssetKey"):\n from dagster.core.definitions.events import AssetKey\n\n return super(DagsterAssetMetadataValue, cls).__new__(\n cls, check.inst_param(asset_key, "asset_key", AssetKey)\n )
\n\n\n
[docs]@experimental\n@whitelist_for_serdes(storage_name="TableMetadataEntryData")\nclass TableMetadataValue(\n NamedTuple(\n "_TableMetadataValue",\n [\n ("records", List[TableRecord]),\n ("schema", TableSchema),\n ],\n ),\n MetadataValue,\n):\n """Container class for table metadata entry data.\n\n Args:\n records (TableRecord): The data as a list of records (i.e. rows).\n schema (Optional[TableSchema]): A schema for the table.\n """\n\n @staticmethod\n def infer_column_type(value):\n if isinstance(value, bool):\n return "bool"\n elif isinstance(value, int):\n return "int"\n elif isinstance(value, float):\n return "float"\n else:\n return "string"\n\n def __new__(cls, records: List[TableRecord], schema: Optional[TableSchema]):\n\n check.list_param(records, "records", of_type=TableRecord)\n check.opt_inst_param(schema, "schema", TableSchema)\n\n if len(records) == 0:\n schema = check.not_none(schema, "schema must be provided if records is empty")\n else:\n columns = set(records[0].data.keys())\n for record in records[1:]:\n check.invariant(\n set(record.data.keys()) == columns, "All records must have the same fields"\n )\n schema = schema or TableSchema(\n columns=[\n TableColumn(name=k, type=TableMetadataValue.infer_column_type(v))\n for k, v in records[0].data.items()\n ]\n )\n\n return super(TableMetadataValue, cls).__new__(\n cls,\n records,\n schema,\n )
\n\n\n
[docs]@whitelist_for_serdes(storage_name="TableSchemaMetadataEntryData")\nclass TableSchemaMetadataValue(\n NamedTuple("_TableSchemaMetadataValue", [("schema", TableSchema)]), MetadataValue\n):\n """Representation of a schema for arbitrary tabular data.\n\n Args:\n schema (TableSchema): The dictionary containing the schema representation.\n """\n\n def __new__(cls, schema: TableSchema):\n return super(TableSchemaMetadataValue, cls).__new__(\n cls, check.inst_param(schema, "schema", TableSchema)\n )
\n\n\n# ########################\n# ##### METADATA ENTRY\n# ########################\n\n\ndef deprecated_metadata_entry_constructor(fn):\n @functools.wraps(fn)\n def wrapper(*args, **kwargs):\n deprecation_warning(\n f"Function `MetadataEntry.{fn.__name__}`",\n "0.15.0",\n additional_warn_txt=re.sub(\n r"\\n\\s*",\n " ",\n """\n The recommended way to supply metadata is to pass a `Dict[str,\n MetadataValue]` to the `metadata` keyword argument. To construct `MetadataEntry`\n directly, call constructor and pass a `MetadataValue`: `MetadataEntry(label="foo",\n value=MetadataValue.text("bar")",\n """,\n ),\n )\n return fn(*args, **kwargs)\n\n return wrapper\n\n\n# NOTE: This would better be implemented as a generic with `MetadataValue` set as a\n# typevar, but as of 2022-01-25 mypy does not support generics on NamedTuple.\n#\n# NOTE: This currently stores value in the `entry_data` NamedTuple attribute. In the next release,\n# we will change the name of the NamedTuple property to `value`, and need to implement custom\n# serialization so that it continues to be saved as `entry_data` for backcompat purposes.\n
[docs]@whitelist_for_serdes(storage_name="EventMetadataEntry")\nclass MetadataEntry(\n NamedTuple(\n "_MetadataEntry",\n [\n ("label", str),\n ("description", Optional[str]),\n ("entry_data", MetadataValue),\n ],\n ),\n):\n """The standard structure for describing metadata for Dagster events.\n\n Lists of objects of this type can be passed as arguments to Dagster events and will be displayed\n in Dagit and other tooling.\n\n Should be yielded from within an IO manager to append metadata for a given input/output event.\n For other event types, passing a dict with `MetadataValue` values to the `metadata` argument\n is preferred.\n\n Args:\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n value (MetadataValue): Typed metadata entry data. The different types allow\n for customized display in tools like dagit.\n """\n\n def __new__(\n cls,\n label: str,\n description: Optional[str] = None,\n entry_data: Optional["RawMetadataValue"] = None,\n value: Optional["RawMetadataValue"] = None,\n ):\n if description is not None:\n deprecation_warning(\n 'The "description" attribute on "MetadataEntry"',\n "0.15.0",\n )\n value = cast(\n RawMetadataValue,\n canonicalize_backcompat_args(\n new_val=value,\n new_arg="value",\n old_val=entry_data,\n old_arg="entry_data",\n breaking_version="0.15.0",\n ),\n )\n value = normalize_metadata_value(value)\n\n return super(MetadataEntry, cls).__new__(\n cls,\n check.str_param(label, "label"),\n check.opt_str_param(description, "description"),\n check.inst_param(value, "value", MetadataValue),\n )\n\n @property\n def value(self):\n """Alias of `entry_data`."""\n return self.entry_data\n\n
[docs] @staticmethod\n @deprecated_metadata_entry_constructor\n def text(text: Optional[str], label: str, description: Optional[str] = None) -> "MetadataEntry":\n """Static constructor for a metadata entry containing text as\n :py:class:`TextMetadataValue`. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata_entries=[\n MetadataEntry.text("Text-based metadata for this event", "text_metadata")\n ],\n )\n\n Args:\n text (Optional[str]): The text of this metadata entry.\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n """\n return MetadataEntry(label, description, TextMetadataValue(text))
\n\n
[docs] @staticmethod\n @deprecated_metadata_entry_constructor\n def url(url: Optional[str], label: str, description: Optional[str] = None) -> "MetadataEntry":\n """Static constructor for a metadata entry containing a URL as\n :py:class:`UrlMetadataValue`. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield AssetMaterialization(\n asset_key="my_dashboard",\n metadata_entries=[\n MetadataEntry.url(\n "http://mycoolsite.com/my_dashboard", label="dashboard_url"\n ),\n ],\n )\n\n Args:\n url (Optional[str]): The URL contained by this metadata entry.\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n """\n return MetadataEntry(label, description, UrlMetadataValue(url))
\n\n
[docs] @staticmethod\n @deprecated_metadata_entry_constructor\n def path(path: Optional[str], label: str, description: Optional[str] = None) -> "MetadataEntry":\n """Static constructor for a metadata entry containing a path as\n :py:class:`PathMetadataValue`. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata_entries=[MetadataEntry.path("path/to/file", label="filepath")],\n )\n\n Args:\n path (Optional[str]): The path contained by this metadata entry.\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n """\n return MetadataEntry(label, description, PathMetadataValue(path))
\n\n
[docs] @staticmethod\n @deprecated_metadata_entry_constructor\n def fspath(\n path: Optional[str], label: Optional[str] = None, description: Optional[str] = None\n ) -> "MetadataEntry":\n """Static constructor for a metadata entry containing a filesystem path as\n :py:class:`PathMetadataValue`. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata_entries=[MetadataEntry.fspath("path/to/file")],\n )\n\n Args:\n path (Optional[str]): The path contained by this metadata entry.\n label (Optional[str]): Short display label for this metadata entry. Defaults to the\n base name of the path.\n description (Optional[str]): A human-readable description of this metadata entry.\n """\n if not label:\n path = cast(str, check.str_param(path, "path"))\n label = last_file_comp(path)\n\n return MetadataEntry.path(path, label, description)
\n\n
[docs] @staticmethod\n @deprecated_metadata_entry_constructor\n def json(\n data: Optional[Dict[str, Any]],\n label: str,\n description: Optional[str] = None,\n ) -> "MetadataEntry":\n """Static constructor for a metadata entry containing JSON data as\n :py:class:`JsonMetadataValue`. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield ExpectationResult(\n success=not missing_things,\n label="is_present",\n metadata_entries=[\n MetadataEntry.json(\n label="metadata", data={"missing_columns": missing_things},\n )\n ],\n )\n\n Args:\n data (Optional[Dict[str, Any]]): The JSON data contained by this metadata entry.\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n """\n return MetadataEntry(label, description, JsonMetadataValue(data))
\n\n
[docs] @staticmethod\n @deprecated_metadata_entry_constructor\n def md(md_str: Optional[str], label: str, description: Optional[str] = None) -> "MetadataEntry":\n """Static constructor for a metadata entry containing markdown data as\n :py:class:`MarkdownMetadataValue`. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context, md_str):\n yield AssetMaterialization(\n asset_key="info",\n metadata_entries=[MetadataEntry.md(md_str=md_str)],\n )\n\n Args:\n md_str (Optional[str]): The markdown contained by this metadata entry.\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n """\n return MetadataEntry(label, description, MarkdownMetadataValue(md_str))
\n\n @staticmethod\n @deprecated_metadata_entry_constructor\n def python_artifact(\n python_artifact: Callable[..., Any], label: str, description: Optional[str] = None\n ) -> "MetadataEntry":\n check.callable_param(python_artifact, "python_artifact")\n return MetadataEntry(\n label,\n description,\n PythonArtifactMetadataValue(python_artifact.__module__, python_artifact.__name__),\n )\n\n
[docs] @staticmethod\n @deprecated_metadata_entry_constructor\n def float(\n value: Optional[float], label: str, description: Optional[str] = None\n ) -> "MetadataEntry":\n """Static constructor for a metadata entry containing float as\n :py:class:`FloatMetadataValue`. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata_entries=[MetadataEntry.float(calculate_bytes(df), "size (bytes)")],\n )\n\n Args:\n value (Optional[float]): The float value contained by this metadata entry.\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n """\n\n return MetadataEntry(label, description, FloatMetadataValue(value))
\n\n
[docs] @staticmethod\n @deprecated_metadata_entry_constructor\n def int(value: Optional[int], label: str, description: Optional[str] = None) -> "MetadataEntry":\n """Static constructor for a metadata entry containing int as\n :py:class:`IntMetadataValue`. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata_entries=[MetadataEntry.int(len(df), "number of rows")],\n )\n\n Args:\n value (Optional[int]): The int value contained by this metadata entry.\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n """\n\n return MetadataEntry(label, description, IntMetadataValue(value))
\n\n @staticmethod\n @deprecated_metadata_entry_constructor\n def pipeline_run(run_id: str, label: str, description: Optional[str] = None) -> "MetadataEntry":\n check.str_param(run_id, "run_id")\n return MetadataEntry(label, description, DagsterPipelineRunMetadataValue(run_id))\n\n
[docs] @staticmethod\n @deprecated_metadata_entry_constructor\n def asset(\n asset_key: "AssetKey", label: str, description: Optional[str] = None\n ) -> "MetadataEntry":\n """Static constructor for a metadata entry referencing a Dagster asset, by key.\n\n For example:\n\n .. code-block:: python\n\n @op\n def validate_table(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey("my_table"),\n metadata_entries=[\n MetadataEntry.asset(AssetKey('my_other_table'), "Related asset"),\n ],\n )\n\n Args:\n asset_key (AssetKey): The asset key referencing the asset.\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n """\n\n from dagster.core.definitions.events import AssetKey\n\n check.inst_param(asset_key, "asset_key", AssetKey)\n return MetadataEntry(label, description, DagsterAssetMetadataValue(asset_key))
\n\n
[docs] @staticmethod\n @deprecated_metadata_entry_constructor\n @experimental\n def table(\n records: List[TableRecord],\n label: str,\n description: Optional[str] = None,\n schema: Optional[TableSchema] = None,\n ) -> "MetadataEntry":\n """Static constructor for a metadata entry containing tabular data as\n :py:class:`TableMetadataValue`. For example:\n\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield ExpectationResult(\n success=not has_errors,\n label="is_valid",\n metadata_entries=[\n MetadataEntry.table(\n label="errors",\n records=[\n TableRecord(code="invalid-data-type", row=2, col="name")\n ],\n schema=TableSchema(\n columns=[\n TableColumn(name="code", type="string"),\n TableColumn(name="row", type="int"),\n TableColumn(name="col", type="string"),\n ]\n )\n ),\n ],\n )\n\n Args:\n records (List[TableRecord]): The data as a list of records (i.e. rows).\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n schema (Optional[TableSchema]): A schema for the table. If none is provided, one will be\n automatically generated by examining the first record. The schema will include as columns all\n field names present in the first record, with a type of `"string"`, `"int"`,\n `"bool"` or `"float"` inferred from the first record's values. If a value does\n not directly match one of the above types, it will be treated as a string.\n """\n return MetadataEntry(label, description, TableMetadataValue(records, schema))
\n\n
[docs] @staticmethod\n @deprecated_metadata_entry_constructor\n def table_schema(\n schema: TableSchema, label: str, description: Optional[str] = None\n ) -> "MetadataEntry":\n """Static constructor for a metadata entry containing a table schema as\n :py:class:`TableSchemaMetadataValue`. For example:\n\n .. code-block:: python\n\n schema = TableSchema(\n columns = [\n TableColumn(name="id", type="int"),\n TableColumn(name="status", type="bool"),\n ]\n )\n\n DagsterType(\n type_check_fn=some_validation_fn,\n name='MyTable',\n metadata_entries=[\n MetadataEntry.table_schema(\n schema,\n label='schema',\n )\n ]\n )\n\n Args:\n schema (TableSchema): The table schema for a metadata entry.\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n """\n return MetadataEntry(\n label,\n description,\n TableSchemaMetadataValue(schema),\n )
\n\n\nclass PartitionMetadataEntry(\n NamedTuple(\n "_PartitionMetadataEntry",\n [\n ("partition", str),\n ("entry", "MetadataEntry"),\n ],\n )\n):\n """Event containing an :py:class:`MetadataEntry` and the name of a partition that the entry\n applies to.\n\n This can be yielded or returned in place of MetadataEntries for cases where you are trying\n to associate metadata more precisely.\n """\n\n def __new__(cls, partition: str, entry: MetadataEntry):\n experimental_class_warning("PartitionMetadataEntry")\n return super(PartitionMetadataEntry, cls).__new__(\n cls,\n check.str_param(partition, "partition"),\n check.inst_param(entry, "entry", MetadataEntry),\n )\n
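A minimal construction sketch for ``PartitionMetadataEntry``; the partition name and entry are illustrative, and the class is experimental, so construction emits a warning:

.. code-block:: python

    from dagster import MetadataEntry
    from dagster.core.definitions.metadata import PartitionMetadataEntry

    # Attach a row-count entry to one specific partition
    entry = PartitionMetadataEntry(
        partition="2022-01-01",
        entry=MetadataEntry("row_count", value=100),
    )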
", "current_page_name": "_modules/dagster/core/definitions/metadata", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "table": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.metadata.table

\nfrom typing import Any, Dict, List, NamedTuple, Optional, Type, Union, cast\n\nimport dagster._check as check\nfrom dagster.serdes.serdes import DefaultNamedTupleSerializer, whitelist_for_serdes\nfrom dagster.utils.backcompat import experimental\n\n# ########################\n# ##### TABLE RECORD\n# ########################\n\n\nclass _TableRecordSerializer(DefaultNamedTupleSerializer):\n    @classmethod\n    def value_from_unpacked(\n        cls,\n        unpacked_dict: Dict[str, Any],\n        klass: Type,\n    ):\n        return klass(**unpacked_dict["data"])\n\n\n
[docs]@experimental\n@whitelist_for_serdes(serializer=_TableRecordSerializer)\nclass TableRecord(NamedTuple("TableRecord", [("data", Dict[str, Union[str, int, float, bool]])])):\n """Represents one record in a table. All passed keyword arguments are treated as field key/value\n pairs in the record. Field keys are arbitrary strings-- field values must be strings, integers,\n floats, or bools.\n """\n\n def __new__(cls, **data):\n check.dict_param(\n data,\n "data",\n value_type=(str, float, int, bool, type(None)),\n additional_message="Record fields must be one of types: (str, float, int, bool)",\n )\n return super(TableRecord, cls).__new__(cls, data=data)
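A minimal sketch of constructing a ``TableRecord``; the field names and values are illustrative:

.. code-block:: python

    from dagster import TableRecord

    # Keyword arguments become the record's fields
    record = TableRecord(code="invalid-data-type", row=2, col="name")
    assert record.data == {"code": "invalid-data-type", "row": 2, "col": "name"}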
\n\n\n# ########################\n# ##### TABLE SCHEMA\n# ########################\n\n\n
[docs]@whitelist_for_serdes\nclass TableSchema(\n NamedTuple(\n "TableSchema",\n [\n ("columns", List["TableColumn"]),\n ("constraints", "TableConstraints"),\n ],\n )\n):\n """Representation of a schema for tabular data. Schema is composed of two parts:\n\n - A required list of columns (`TableColumn`). Each column specifies a\n `name`, `type`, set of `constraints`, and (optional) `description`. `type`\n defaults to `string` if unspecified. Column constraints\n (`TableColumnConstraints`) consist of boolean properties `unique` and\n `nullable`, as well as a list of strings `other` containing string\n descriptions of all additional constraints (e.g. `"<= 5"`).\n - An optional list of table-level constraints (`TableConstraints`). A\n table-level constraint cannot be expressed in terms of a single column,\n e.g. col a > col b. Presently, all table-level constraints must be\n expressed as strings under the `other` attribute of a `TableConstraints`\n object.\n\n .. code-block:: python\n\n # example schema\n TableSchema(\n constraints = TableConstraints(\n other = [\n "foo > bar",\n ],\n ),\n columns = [\n TableColumn(\n name = "foo",\n type = "string",\n description = "Foo description",\n constraints = TableColumnConstraints(\n required = True,\n other = [\n "starts with the letter 'a'",\n ],\n ),\n ),\n TableColumn(\n name = "bar",\n type = "string",\n ),\n TableColumn(\n name = "baz",\n type = "custom_type",\n constraints = TableColumnConstraints(\n unique = True,\n )\n ),\n ],\n )\n\n Args:\n columns (List[TableColumn]): The columns of the table.\n constraints (Optional[TableConstraints]): The constraints of the table.\n """\n\n def __new__(\n cls,\n columns: List["TableColumn"],\n constraints: Optional["TableConstraints"] = None,\n ):\n return super(TableSchema, cls).__new__(\n cls,\n columns=check.list_param(columns, "columns", of_type=TableColumn),\n constraints=check.opt_inst_param(\n constraints, "constraints", TableConstraints, default=_DEFAULT_TABLE_CONSTRAINTS\n ),\n )
\n\n\n# ########################\n# ##### TABLE CONSTRAINTS\n# ########################\n\n\n
[docs]@whitelist_for_serdes\nclass TableConstraints(\n NamedTuple(\n "TableConstraints",\n [\n ("other", List[str]),\n ],\n )\n):\n """Descriptor for "table-level" constraints. Presently only one property,\n `other` is supported. This contains strings describing arbitrary\n table-level constraints. A table-level constraint is a constraint defined\n in terms of multiple columns (e.g. col_A > col_B) or in terms of rows.\n\n Args:\n other (List[str]): Descriptions of arbitrary table-level constraints.\n """\n\n def __new__(\n cls,\n other: List[str],\n ):\n return super(TableConstraints, cls).__new__(\n cls,\n other=check.list_param(other, "other", of_type=str),\n )
\n\n\n_DEFAULT_TABLE_CONSTRAINTS = TableConstraints(other=[])\n\n# ########################\n# ##### TABLE COLUMN\n# ########################\n\n\n
[docs]@whitelist_for_serdes\nclass TableColumn(\n NamedTuple(\n "TableColumn",\n [\n ("name", str),\n ("type", str),\n ("description", Optional[str]),\n ("constraints", "TableColumnConstraints"),\n ],\n )\n):\n """Descriptor for a table column. The only property that must be specified\n by the user is `name`. If no `type` is specified, `string` is assumed. If\n no `constraints` are specified, the column is assumed to be nullable\n (i.e. `required = False`) and have no other constraints beyond the data type.\n\n Args:\n name (str): The name of the column.\n type (Optional[str]): The type of the column. Can be an arbitrary\n string. Defaults to `"string"`.\n description (Optional[str]): Description of this column. Defaults to `None`.\n constraints (Optional[TableColumnConstraints]): Column-level constraints.\n If unspecified, column is nullable with no constraints.\n """\n\n def __new__(\n cls,\n name: str,\n type: str = "string", # pylint: disable=redefined-builtin\n description: Optional[str] = None,\n constraints: Optional["TableColumnConstraints"] = None,\n ):\n return super(TableColumn, cls).__new__(\n cls,\n name=check.str_param(name, "name"),\n type=check.str_param(type, "type"),\n description=check.opt_str_param(description, "description"),\n constraints=cast(\n "TableColumnConstraints",\n check.opt_inst_param(\n constraints,\n "constraints",\n TableColumnConstraints,\n default=_DEFAULT_TABLE_COLUMN_CONSTRAINTS,\n ),\n ),\n )
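A minimal sketch of the defaults described above; the column names and constraints are illustrative:

.. code-block:: python

    from dagster import TableColumn, TableColumnConstraints

    # Only `name` is required; `type` defaults to "string" and the column is nullable
    status_column = TableColumn(name="status")
    assert status_column.type == "string"

    # Constraints are opted into explicitly
    id_column = TableColumn(
        name="id",
        type="int",
        constraints=TableColumnConstraints(nullable=False, unique=True),
    )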
\n\n\n# ########################\n# ##### TABLE COLUMN CONSTRAINTS\n# ########################\n\n\n
[docs]@whitelist_for_serdes\nclass TableColumnConstraints(\n NamedTuple(\n "TableColumnConstraints",\n [\n ("nullable", bool),\n ("unique", bool),\n ("other", Optional[List[str]]),\n ],\n )\n):\n """Descriptor for a table column's constraints. Nullability and uniqueness are specified with\n boolean properties. All other constraints are described using arbitrary strings under the\n `other` property.\n\n Args:\n nullable (Optional[bool]): If true, this column can hold null values.\n unique (Optional[bool]): If true, all values in this column must be unique.\n other (List[str]): Descriptions of arbitrary column-level constraints\n not expressible by the predefined properties.\n """\n\n def __new__(\n cls,\n nullable: bool = True,\n unique: bool = False,\n other: Optional[List[str]] = None,\n ):\n return super(TableColumnConstraints, cls).__new__(\n cls,\n nullable=check.bool_param(nullable, "nullable"),\n unique=check.bool_param(unique, "unique"),\n other=check.opt_list_param(other, "other"),\n )
\n\n\n_DEFAULT_TABLE_COLUMN_CONSTRAINTS = TableColumnConstraints()\n
", "current_page_name": "_modules/dagster/core/definitions/metadata/table", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}, {"link": "../", "title": "dagster.core.definitions.metadata"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.metadata.table"}, "title": "dagster.core.definitions.metadata"}, "mode": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.mode

\nfrom typing import TYPE_CHECKING, Dict, List, NamedTuple, Optional\n\nimport dagster._check as check\nfrom dagster.core.definitions.executor_definition import ExecutorDefinition, default_executors\nfrom dagster.loggers import default_loggers\nfrom dagster.utils.merger import merge_dicts\n\nfrom .config import ConfigMapping\nfrom .logger_definition import LoggerDefinition\nfrom .resource_definition import ResourceDefinition\nfrom .utils import check_valid_name\n\nDEFAULT_MODE_NAME = "default"\n\nif TYPE_CHECKING:\n    from .partition import PartitionedConfig\n\n\n
[docs]class ModeDefinition(\n NamedTuple(\n "_ModeDefinition",\n [\n ("name", str),\n ("resource_defs", Dict[str, ResourceDefinition]),\n ("loggers", Dict[str, LoggerDefinition]),\n ("executor_defs", List[ExecutorDefinition]),\n ("description", Optional[str]),\n ("config_mapping", Optional[ConfigMapping]),\n ("partitioned_config", Optional["PartitionedConfig"]),\n ],\n )\n):\n """Define a mode in which a pipeline can operate.\n\n A mode provides pipelines with a set of resource implementations, loggers, system storages,\n and executors.\n\n Args:\n name (Optional[str]): The name of the mode. Must be unique within the\n :py:class:`PipelineDefinition` to which the mode is attached. (default: "default").\n resource_defs (Optional[Dict[str, ResourceDefinition]]): A dictionary of string resource\n keys to their implementations. Individual solids may require resources to be present by\n these keys.\n logger_defs (Optional[Dict[str, LoggerDefinition]]): A dictionary of string logger\n identifiers to their implementations.\n executor_defs (Optional[List[ExecutorDefinition]]): The set of executors available when\n executing in this mode. By default, this will be the 'in_process' and 'multiprocess'\n executors (:py:data:`~dagster.default_executors`).\n description (Optional[str]): A human-readable description of the mode.\n _config_mapping (Optional[ConfigMapping]): Only for internal use.\n _partitions (Optional[PartitionedConfig]): Only for internal use.\n """\n\n def __new__(\n cls,\n name: Optional[str] = None,\n resource_defs: Optional[Dict[str, ResourceDefinition]] = None,\n logger_defs: Optional[Dict[str, LoggerDefinition]] = None,\n executor_defs: Optional[List[ExecutorDefinition]] = None,\n description: Optional[str] = None,\n _config_mapping: Optional[ConfigMapping] = None,\n _partitioned_config: Optional["PartitionedConfig"] = None,\n ):\n\n from .partition import PartitionedConfig\n\n resource_defs = check.opt_dict_param(\n resource_defs, "resource_defs", key_type=str, value_type=ResourceDefinition\n )\n\n for key in resource_defs:\n if not key.isidentifier():\n check.failed(f"Resource key '{key}' must be a valid Python identifier.")\n\n if resource_defs and "io_manager" in resource_defs:\n resource_defs_with_defaults = resource_defs\n else:\n from dagster.core.storage.mem_io_manager import mem_io_manager\n\n resource_defs_with_defaults = merge_dicts(\n {"io_manager": mem_io_manager}, resource_defs or {}\n )\n\n return super(ModeDefinition, cls).__new__(\n cls,\n name=check_valid_name(name) if name else DEFAULT_MODE_NAME,\n resource_defs=resource_defs_with_defaults,\n loggers=(\n check.opt_dict_param(\n logger_defs, "logger_defs", key_type=str, value_type=LoggerDefinition\n )\n or default_loggers()\n ),\n executor_defs=check.list_param(\n executor_defs if executor_defs else default_executors,\n "executor_defs",\n of_type=ExecutorDefinition,\n ),\n description=check.opt_str_param(description, "description"),\n config_mapping=check.opt_inst_param(_config_mapping, "_config_mapping", ConfigMapping),\n partitioned_config=check.opt_inst_param(\n _partitioned_config, "_partitioned_config", PartitionedConfig\n ),\n )\n\n @property\n def resource_key_set(self):\n return frozenset(self.resource_defs.keys())\n\n @staticmethod\n def from_resources(resources, name=None):\n check.dict_param(resources, "resources", key_type=str)\n\n return ModeDefinition(\n name=name,\n resource_defs={\n resource_name: ResourceDefinition.hardcoded_resource(resource)\n for resource_name, resource in resources.items()\n 
},\n )
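A minimal sketch of attaching a mode to a legacy pipeline; the resource key, hardcoded resource value, and names are illustrative:

.. code-block:: python

    from dagster import ModeDefinition, ResourceDefinition, pipeline, solid

    @solid(required_resource_keys={"warehouse"})
    def store_results(context):
        return context.resources.warehouse

    dev_mode = ModeDefinition(
        name="dev",
        resource_defs={"warehouse": ResourceDefinition.hardcoded_resource({"uri": "sqlite://"})},
    )

    @pipeline(mode_defs=[dev_mode])
    def my_pipeline():
        store_results()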
\n
", "current_page_name": "_modules/dagster/core/definitions/mode", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.mode"}, "op_definition": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.op_definition

\nfrom typing import Dict\n\nfrom .input import In\nfrom .output import Out\nfrom .solid_definition import SolidDefinition\n\n\n
[docs]class OpDefinition(SolidDefinition):\n """\n Defines an op, the functional unit of user-defined computation.\n\n For more details on what a op is, refer to the\n `Ops Overview <../../concepts/ops-jobs-graphs/ops>`_ .\n\n End users should prefer the :func:`@op <op>` decorator. OpDefinition is generally intended to be\n used by framework authors or for programatically generated ops.\n\n Args:\n name (str): Name of the op. Must be unique within any :py:class:`GraphDefinition` or\n :py:class:`JobDefinition` that contains the op.\n input_defs (List[InputDefinition]): Inputs of the op.\n compute_fn (Callable): The core of the op, the function that performs the actual\n computation. The signature of this function is determined by ``input_defs``, and\n optionally, an injected first argument, ``context``, a collection of information\n provided by the system.\n\n This function will be coerced into a generator or an async generator, which must yield\n one :py:class:`Output` for each of the op's ``output_defs``, and additionally may\n yield other types of Dagster events, including :py:class:`AssetMaterialization` and\n :py:class:`ExpectationResult`.\n output_defs (List[OutputDefinition]): Outputs of the op.\n config_schema (Optional[ConfigSchema): The schema for the config. If set, Dagster will check\n that the config provided for the op matches this schema and will fail if it does not. If\n not set, Dagster will accept any config provided for the op.\n description (Optional[str]): Human-readable description of the op.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the op. Frameworks may\n expect and require certain metadata to be attached to a op. Users should generally\n not set metadata directly. Values that are not strings will be json encoded and must meet\n the criteria that `json.loads(json.dumps(value)) == value`.\n required_resource_keys (Optional[Set[str]]): Set of resources handles required by this op.\n version (Optional[str]): (Experimental) The version of the op's compute_fn. Two ops should\n have the same version if and only if they deterministically produce the same outputs\n when provided the same inputs.\n retry_policy (Optional[RetryPolicy]): The retry policy for this op.\n\n\n Examples:\n .. code-block:: python\n\n def _add_one(_context, inputs):\n yield Output(inputs["num"] + 1)\n\n OpDefinition(\n name="add_one",\n input_defs=[InputDefinition("num", Int)],\n output_defs=[OutputDefinition(Int)], # default name ("result")\n compute_fn=_add_one,\n )\n """\n\n @property\n def node_type_str(self) -> str:\n return "op"\n\n @property\n def is_graph_job_op_node(self) -> bool:\n return True\n\n @property\n def ins(self) -> Dict[str, In]:\n return {input_def.name: In.from_definition(input_def) for input_def in self.input_defs}\n\n @property\n def outs(self) -> Dict[str, Out]:\n return {output_def.name: Out.from_definition(output_def) for output_def in self.output_defs}
\n
", "current_page_name": "_modules/dagster/core/definitions/op_definition", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.op_definition"}, "output": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.output

\nimport warnings\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    List,\n    NamedTuple,\n    Optional,\n    Type,\n    TypeVar,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster.core.definitions.events import AssetKey, DynamicAssetKey\nfrom dagster.core.definitions.metadata import MetadataEntry, MetadataUserInput, normalize_metadata\nfrom dagster.core.errors import DagsterError, DagsterInvalidDefinitionError\nfrom dagster.core.types.dagster_type import (\n    DagsterType,\n    is_dynamic_output_annotation,\n    resolve_dagster_type,\n)\nfrom dagster.utils.backcompat import experimental_arg_warning\n\nfrom .inference import InferredOutputProps\nfrom .input import NoValueSentinel\nfrom .utils import DEFAULT_OUTPUT, check_valid_name\n\nif TYPE_CHECKING:\n    from dagster.core.definitions.partition import PartitionsDefinition\n    from dagster.core.execution.context.output import OutputContext\n\nTOut = TypeVar("TOut", bound="OutputDefinition")\n\n\n
[docs]class OutputDefinition:\n """Defines an output from a solid's compute function.\n\n Solids can have multiple outputs, in which case outputs cannot be anonymous.\n\n Many solids have only one output, in which case the user can provide a single output definition\n that will be given the default name, "result".\n\n Output definitions may be typed using the Dagster type system.\n\n Args:\n dagster_type (Optional[Union[Type, DagsterType]]]): The type of this output.\n Users should provide the Python type of the objects that they expect the solid to yield\n for this output, or a :py:class:`DagsterType` that defines a runtime check that they\n want to be run on this output. Defaults to :py:class:`Any`.\n name (Optional[str]): Name of the output. (default: "result")\n description (Optional[str]): Human-readable description of the output.\n is_required (Optional[bool]): Whether the presence of this field is required. (default: True)\n io_manager_key (Optional[str]): The resource key of the IOManager used for storing this\n output and loading it in downstream steps (default: "io_manager").\n metadata (Optional[Dict[str, Any]]): A dict of the metadata for the output.\n For example, users can provide a file path if the data object will be stored in a\n filesystem, or provide information of a database table when it is going to load the data\n into the table.\n asset_key (Optional[AssetKey]]): (Experimental) An AssetKey which should be associated\n with this OutputDefinition. Used for tracking lineage information through Dagster.\n asset_partitions (Optional[Union[Set[str], OutputContext -> Set[str]]]): (Experimental) A\n set of partitions of the given asset_key (or a function that produces this list of\n partitions from the OutputContext) which should be associated with this OutputDefinition.\n """\n\n def __init__(\n self,\n dagster_type=None,\n name: Optional[str] = None,\n description: Optional[str] = None,\n is_required: bool = True,\n io_manager_key: Optional[str] = None,\n metadata: Optional[MetadataUserInput] = None,\n asset_key: Optional[Union[AssetKey, DynamicAssetKey]] = None,\n asset_partitions: Optional[\n Union[AbstractSet[str], Callable[["OutputContext"], AbstractSet[str]]]\n ] = None,\n asset_partitions_def: Optional["PartitionsDefinition"] = None\n # make sure new parameters are updated in combine_with_inferred below\n ):\n from dagster.core.definitions.partition import PartitionsDefinition\n\n self._name = check_valid_name(check.opt_str_param(name, "name", DEFAULT_OUTPUT))\n self._type_not_set = dagster_type is None\n self._dagster_type = resolve_dagster_type(dagster_type)\n self._description = check.opt_str_param(description, "description")\n self._is_required = check.bool_param(is_required, "is_required")\n self._io_manager_key = check.opt_str_param(\n io_manager_key,\n "io_manager_key",\n default="io_manager",\n )\n self._metadata = check.opt_dict_param(metadata, "metadata", key_type=str)\n self._metadata_entries = check.is_list(\n normalize_metadata(self._metadata, [], allow_invalid=True), MetadataEntry\n )\n\n if asset_key:\n experimental_arg_warning("asset_key", "OutputDefinition.__init__")\n\n if callable(asset_key):\n warnings.warn(\n "Passing a function as the `asset_key` argument to `Out` or `OutputDefinition` is "\n "deprecated behavior and will be removed in version 0.15.0."\n )\n else:\n check.opt_inst_param(asset_key, "asset_key", AssetKey)\n\n self._asset_key = asset_key\n\n if asset_partitions:\n experimental_arg_warning("asset_partitions", 
"OutputDefinition.__init__")\n check.param_invariant(\n asset_key is not None,\n "asset_partitions",\n 'Cannot specify "asset_partitions" argument without also specifying "asset_key"',\n )\n\n self._asset_partitions_fn: Optional[Callable[["OutputContext"], AbstractSet[str]]]\n if callable(asset_partitions):\n self._asset_partitions_fn = asset_partitions\n elif asset_partitions is not None:\n asset_partitions = check.opt_set_param(asset_partitions, "asset_partitions", str)\n\n def _fn(_context: "OutputContext") -> AbstractSet[str]:\n return cast(AbstractSet[str], asset_partitions) # mypy bug?\n\n self._asset_partitions_fn = _fn\n else:\n self._asset_partitions_fn = None\n\n if asset_partitions_def:\n experimental_arg_warning("asset_partitions_def", "OutputDefinition.__init__")\n self._asset_partitions_def = check.opt_inst_param(\n asset_partitions_def, "asset_partition_def", PartitionsDefinition\n )\n\n @property\n def name(self):\n return self._name\n\n @property\n def dagster_type(self) -> DagsterType:\n return self._dagster_type\n\n @property\n def description(self) -> Optional[str]:\n return self._description\n\n @property\n def is_required(self) -> bool:\n return self._is_required\n\n @property\n def io_manager_key(self) -> str:\n return self._io_manager_key\n\n @property\n def optional(self) -> bool:\n return not self.is_required\n\n @property\n def metadata(self) -> MetadataUserInput:\n return self._metadata\n\n @property\n def metadata_entries(self) -> List[MetadataEntry]:\n return self._metadata_entries\n\n @property\n def is_dynamic(self) -> bool:\n return False\n\n @property\n def is_asset(self) -> bool:\n return self._asset_key is not None\n\n @property\n def asset_partitions_def(self) -> Optional["PartitionsDefinition"]:\n return self._asset_partitions_def\n\n @property\n def hardcoded_asset_key(self) -> Optional[AssetKey]:\n if not callable(self._asset_key):\n return self._asset_key\n else:\n return None\n\n def get_asset_key(self, context: "OutputContext") -> Optional[AssetKey]:\n """Get the AssetKey associated with this OutputDefinition for the given\n :py:class:`OutputContext` (if any).\n\n Args:\n context (OutputContext): The OutputContext that this OutputDefinition is being evaluated\n in\n """\n if callable(self._asset_key):\n return self._asset_key(context)\n else:\n return self.hardcoded_asset_key\n\n def get_asset_partitions(self, context: "OutputContext") -> Optional[AbstractSet[str]]:\n """Get the set of partitions associated with this OutputDefinition for the given\n :py:class:`OutputContext` (if any).\n\n Args:\n context (OutputContext): The OutputContext that this OutputDefinition is being evaluated\n in\n """\n if self._asset_partitions_fn is None:\n return None\n\n return self._asset_partitions_fn(context)\n\n def mapping_from(self, solid_name: str, output_name: Optional[str] = None) -> "OutputMapping":\n """Create an output mapping from an output of a child solid.\n\n In a CompositeSolidDefinition, you can use this helper function to construct\n an :py:class:`OutputMapping` from the output of a child solid.\n\n Args:\n solid_name (str): The name of the child solid from which to map this output.\n output_name (str): The name of the child solid's output from which to map this output.\n\n Examples:\n\n .. 
code-block:: python\n\n output_mapping = OutputDefinition(Int).mapping_from('child_solid')\n """\n return OutputMapping(self, OutputPointer(solid_name, output_name))\n\n @staticmethod\n def create_from_inferred(inferred: InferredOutputProps) -> "OutputDefinition":\n if is_dynamic_output_annotation(inferred.annotation):\n return DynamicOutputDefinition(\n dagster_type=_checked_inferred_type(inferred.annotation),\n description=inferred.description,\n )\n else:\n return OutputDefinition(\n dagster_type=_checked_inferred_type(inferred.annotation),\n description=inferred.description,\n )\n\n def combine_with_inferred(self: TOut, inferred: InferredOutputProps) -> TOut:\n dagster_type = self.dagster_type\n if self._type_not_set:\n dagster_type = _checked_inferred_type(inferred.annotation)\n if self.description is None:\n description = inferred.description\n else:\n description = self.description\n\n return self.__class__(\n name=self.name,\n dagster_type=dagster_type,\n description=description,\n is_required=self.is_required,\n io_manager_key=self.io_manager_key,\n metadata=self._metadata,\n asset_key=self._asset_key,\n asset_partitions=self._asset_partitions_fn,\n asset_partitions_def=self.asset_partitions_def,\n )
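A minimal sketch of declaring a named, typed output on a solid; the solid name, output name, and description are illustrative:

.. code-block:: python

    from dagster import Int, Output, OutputDefinition, solid

    @solid(output_defs=[OutputDefinition(Int, name="total", description="The incremented value")])
    def add_one(_, num: int):
        yield Output(num + 1, output_name="total")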
\n\n\ndef _checked_inferred_type(inferred: Any) -> DagsterType:\n try:\n return resolve_dagster_type(inferred)\n except DagsterError as e:\n raise DagsterInvalidDefinitionError(\n f"Problem using type '{inferred}' from return type annotation, correct the issue "\n "or explicitly set the dagster_type on your OutputDefinition."\n ) from e\n\n\nclass DynamicOutputDefinition(OutputDefinition):\n """\n Variant of :py:class:`OutputDefinition <dagster.OutputDefinition>` for an\n output that will dynamically alter the graph at runtime.\n\n When using in a composition function such as :py:func:`@pipeline <dagster.pipeline>`,\n dynamic outputs must be used with either\n\n * ``map`` - clone downstream solids for each separate :py:class:`DynamicOutput`\n * ``collect`` - gather across all :py:class:`DynamicOutput` in to a list\n\n Uses the same constructor as :py:class:`OutputDefinition <dagster.OutputDefinition>`\n\n .. code-block:: python\n\n @solid(\n config_schema={\n "path": Field(str, default_value=file_relative_path(__file__, "sample"))\n },\n output_defs=[DynamicOutputDefinition(str)],\n )\n def files_in_directory(context):\n path = context.solid_config["path"]\n dirname, _, filenames = next(os.walk(path))\n for file in filenames:\n yield DynamicOutput(os.path.join(dirname, file), mapping_key=_clean(file))\n\n @pipeline\n def process_directory():\n files = files_in_directory()\n\n # use map to invoke a solid on each dynamic output\n file_results = files.map(process_file)\n\n # use collect to gather the results in to a list\n summarize_directory(file_results.collect())\n """\n\n @property\n def is_dynamic(self) -> bool:\n return True\n\n\nclass OutputPointer(NamedTuple("_OutputPointer", [("solid_name", str), ("output_name", str)])):\n def __new__(cls, solid_name: str, output_name: Optional[str] = None):\n return super(OutputPointer, cls).__new__(\n cls,\n check.str_param(solid_name, "solid_name"),\n check.opt_str_param(output_name, "output_name", DEFAULT_OUTPUT),\n )\n\n @property\n def node_name(self):\n return self.solid_name\n\n\n
[docs]class OutputMapping(\n NamedTuple("_OutputMapping", [("definition", OutputDefinition), ("maps_from", OutputPointer)])\n):\n """Defines an output mapping for a composite solid.\n\n Args:\n definition (OutputDefinition): Defines the output of the composite solid.\n solid_name (str): The name of the child solid from which to map the output.\n output_name (str): The name of the child solid's output from which to map the output.\n """\n\n def __new__(cls, definition: OutputDefinition, maps_from: OutputPointer):\n return super(OutputMapping, cls).__new__(\n cls,\n check.inst_param(definition, "definition", OutputDefinition),\n check.inst_param(maps_from, "maps_from", OutputPointer),\n )
\n\n\n
[docs]class Out(\n NamedTuple(\n "_Out",\n [\n ("dagster_type", Union[DagsterType, Type[NoValueSentinel]]),\n ("description", Optional[str]),\n ("is_required", bool),\n ("io_manager_key", str),\n ("metadata", Optional[MetadataUserInput]),\n ("asset_key", Optional[Union[AssetKey, DynamicAssetKey]]),\n (\n "asset_partitions",\n Optional[Union[AbstractSet[str], Callable[["OutputContext"], AbstractSet[str]]]],\n ),\n ("asset_partitions_def", Optional["PartitionsDefinition"]),\n ],\n )\n):\n """\n Defines an output from an op's compute function.\n\n Ops can have multiple outputs, in which case outputs cannot be anonymous.\n\n Many ops have only one output, in which case the user can provide a single output definition\n that will be given the default name, "result".\n\n Outs may be typed using the Dagster type system.\n\n Args:\n dagster_type (Optional[Union[Type, DagsterType]]]):\n The type of this output. Should only be set if the correct type can not\n be inferred directly from the type signature of the decorated function.\n description (Optional[str]): Human-readable description of the output.\n is_required (bool): Whether the presence of this field is required. (default: True)\n io_manager_key (Optional[str]): The resource key of the output manager used for this output.\n (default: "io_manager").\n metadata (Optional[Dict[str, Any]]): A dict of the metadata for the output.\n For example, users can provide a file path if the data object will be stored in a\n filesystem, or provide information of a database table when it is going to load the data\n into the table.\n asset_key (Optional[AssetKey]): (Experimental) An AssetKey which should be associated\n with this Out. Used for tracking lineage information through Dagster.\n asset_partitions (Optional[Union[Set[str], OutputContext -> Set[str]]]): (Experimental) A\n set of partitions of the given asset_key (or a function that produces this list of\n partitions from the OutputContext) which should be associated with this Out.\n """\n\n def __new__(\n cls,\n dagster_type: Union[Type, DagsterType] = NoValueSentinel,\n description: Optional[str] = None,\n is_required: bool = True,\n io_manager_key: Optional[str] = None,\n metadata: Optional[MetadataUserInput] = None,\n asset_key: Optional[AssetKey] = None,\n asset_partitions: Optional[\n Union[AbstractSet[str], Callable[["OutputContext"], AbstractSet[str]]]\n ] = None,\n asset_partitions_def: Optional["PartitionsDefinition"] = None,\n # make sure new parameters are updated in combine_with_inferred below\n ):\n if asset_partitions_def:\n experimental_arg_warning("asset_partitions_definition", "Out.__new__")\n return super(Out, cls).__new__(\n cls,\n dagster_type=NoValueSentinel\n if dagster_type is NoValueSentinel\n else resolve_dagster_type(dagster_type),\n description=description,\n is_required=check.bool_param(is_required, "is_required"),\n io_manager_key=check.opt_str_param(\n io_manager_key, "io_manager_key", default="io_manager"\n ),\n metadata=metadata,\n asset_key=asset_key,\n asset_partitions=asset_partitions,\n asset_partitions_def=asset_partitions_def,\n )\n\n @staticmethod\n def from_definition(output_def: "OutputDefinition"):\n return Out(\n dagster_type=output_def.dagster_type,\n description=output_def.description,\n is_required=output_def.is_required,\n io_manager_key=output_def.io_manager_key,\n metadata=output_def.metadata,\n asset_key=output_def._asset_key, # type: ignore # pylint: disable=protected-access\n asset_partitions=output_def._asset_partitions_fn, # pylint: 
disable=protected-access\n asset_partitions_def=output_def.asset_partitions_def, # pylint: disable=protected-access\n )\n\n def to_definition(self, annotation_type: type, name: Optional[str]) -> "OutputDefinition":\n dagster_type = (\n self.dagster_type if self.dagster_type is not NoValueSentinel else annotation_type\n )\n\n return OutputDefinition(\n dagster_type=dagster_type,\n name=name,\n description=self.description,\n is_required=self.is_required,\n io_manager_key=self.io_manager_key,\n metadata=self.metadata,\n asset_key=self.asset_key,\n asset_partitions=self.asset_partitions,\n asset_partitions_def=self.asset_partitions_def,\n )
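A minimal sketch of the op-level equivalent using ``Out``; the op name, output name, and description are illustrative:

.. code-block:: python

    from dagster import Out, Output, op

    @op(out={"total": Out(int, description="Sum of the two inputs")})
    def add(x: int, y: int):
        yield Output(x + y, output_name="total")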
\n\n\n
[docs]class DynamicOut(Out):\n """\n Variant of :py:class:`Out <dagster.Out>` for an output that will dynamically alter the graph at\n runtime.\n\n When using in a composition function such as :py:func:`@graph <dagster.graph>`,\n dynamic outputs must be used with either\n\n * ``map`` - clone downstream ops for each separate :py:class:`DynamicOut`\n * ``collect`` - gather across all :py:class:`DynamicOut` in to a list\n\n Uses the same constructor as :py:class:`Out <dagster.Out>`\n\n .. code-block:: python\n\n @op(\n config_schema={\n "path": Field(str, default_value=file_relative_path(__file__, "sample"))\n },\n out=DynamicOut(str),\n )\n def files_in_directory(context):\n path = context.op_config["path"]\n dirname, _, filenames = next(os.walk(path))\n for file in filenames:\n yield DynamicOutput(os.path.join(dirname, file), mapping_key=_clean(file))\n\n @job\n def process_directory():\n files = files_in_directory()\n\n # use map to invoke an op on each dynamic output\n file_results = files.map(process_file)\n\n # use collect to gather the results in to a list\n summarize_directory(file_results.collect())\n """\n\n def to_definition(self, annotation_type: type, name: Optional[str]) -> "OutputDefinition":\n dagster_type = (\n self.dagster_type if self.dagster_type is not NoValueSentinel else annotation_type\n )\n\n return DynamicOutputDefinition(\n dagster_type=dagster_type,\n name=name,\n description=self.description,\n is_required=self.is_required,\n io_manager_key=self.io_manager_key,\n metadata=self.metadata,\n asset_key=self.asset_key,\n asset_partitions=self.asset_partitions,\n )
\n\n\n
[docs]class GraphOut(NamedTuple("_GraphOut", [("description", Optional[str])])):\n """\n Represents information about the outputs that a graph maps.\n\n Args:\n description (Optional[str]): Human-readable description of the output.\n """\n\n def __new__(cls, description: Optional[str] = None):\n return super(GraphOut, cls).__new__(cls, description=description)\n\n def to_definition(self, name: Optional[str]) -> "OutputDefinition":\n return OutputDefinition(name=name, description=self.description)
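A minimal sketch of mapping a graph's return value through ``GraphOut``; the op and graph names are illustrative:

.. code-block:: python

    from dagster import GraphOut, graph, op

    @op
    def return_one() -> int:
        return 1

    @op
    def add_one(arg: int) -> int:
        return arg + 1

    @graph(out={"total": GraphOut(description="The incremented value")})
    def one_plus_one():
        return add_one(return_one())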
\n
", "current_page_name": "_modules/dagster/core/definitions/output", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.output"}, "partition": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.partition

\nimport copy\nimport inspect\nfrom abc import ABC, abstractmethod\nfrom datetime import datetime, time, timedelta\nfrom enum import Enum\nfrom typing import Any, Callable, Dict, Generic, List, NamedTuple, Optional, TypeVar, Union, cast\n\nimport pendulum\nfrom dateutil.relativedelta import relativedelta\n\nimport dagster._check as check\nfrom dagster.serdes import whitelist_for_serdes\n\nfrom ...core.definitions.utils import validate_tags\nfrom ...seven.compat.pendulum import PendulumDateTime, to_timezone\nfrom ...utils import frozenlist, merge_dicts\nfrom ...utils.schedules import schedule_execution_time_iterator\nfrom ..decorator_utils import get_function_params\nfrom ..errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n    DagsterUnknownPartitionError,\n    ScheduleExecutionError,\n    user_code_error_boundary,\n)\nfrom ..storage.pipeline_run import PipelineRun\nfrom .mode import DEFAULT_MODE_NAME\nfrom .run_request import RunRequest, SkipReason\nfrom .schedule_definition import (\n    DefaultScheduleStatus,\n    ScheduleDefinition,\n    ScheduleEvaluationContext,\n)\nfrom .utils import check_valid_name, validate_tags\n\nDEFAULT_DATE_FORMAT = "%Y-%m-%d"\n\nT = TypeVar("T")\n\n\n
[docs]class Partition(Generic[T]):\n """\n A Partition represents a single slice of the entire set of a job's possible work. It consists\n of a value, which is an object that represents that partition, and an optional name, which is\n used to label the partition in a human-readable way.\n\n Args:\n value (Any): The object for this partition\n name (str): Name for this partition\n """\n\n def __init__(self, value: T, name: Optional[str] = None):\n self._value = value\n self._name = cast(str, check.opt_str_param(name, "name", str(value)))\n\n @property\n def value(self) -> T:\n return self._value\n\n @property\n def name(self) -> str:\n return self._name\n\n def __eq__(self, other) -> bool:\n return (\n isinstance(other, Partition) and self.value == other.value and self.name == other.name\n )
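A minimal sketch of constructing a ``Partition`` whose value is a datetime and whose name is the formatted date; the date is illustrative:

.. code-block:: python

    from datetime import datetime

    from dagster import Partition

    date_partition = Partition(value=datetime(2022, 1, 1), name="2022-01-01")
    assert date_partition.name == "2022-01-01"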
\n\n\ndef schedule_partition_range(\n start: datetime,\n end: Optional[datetime],\n cron_schedule: str,\n fmt: str,\n timezone: Optional[str],\n execution_time_to_partition_fn: Callable,\n current_time: Optional[datetime],\n) -> List[Partition[datetime]]:\n if end and start > end:\n raise DagsterInvariantViolationError(\n 'Selected date range start "{start}" is after date range end "{end}'.format(\n start=start.strftime(fmt),\n end=end.strftime(fmt),\n )\n )\n\n tz = timezone if timezone else "UTC"\n\n _current_time = current_time if current_time else pendulum.now(tz)\n\n # Coerce to the definition timezone\n _start = (\n to_timezone(start, tz)\n if isinstance(start, PendulumDateTime)\n else pendulum.instance(start, tz=tz)\n )\n _current_time = (\n to_timezone(_current_time, tz)\n if isinstance(_current_time, PendulumDateTime)\n else pendulum.instance(_current_time, tz=tz)\n )\n\n # The end partition time should be before the last partition that\n # executes before the current time\n end_partition_time = execution_time_to_partition_fn(_current_time)\n\n # The partition set has an explicit end time that represents the end of the partition range\n if end:\n _end = (\n to_timezone(end, tz)\n if isinstance(end, PendulumDateTime)\n else pendulum.instance(end, tz=tz)\n )\n\n # If the explicit end time is before the last partition time,\n # update the end partition time\n end_partition_time = min(_end, end_partition_time)\n\n end_timestamp = end_partition_time.timestamp()\n\n partitions: List[Partition[datetime]] = []\n for next_time in schedule_execution_time_iterator(_start.timestamp(), cron_schedule, tz):\n\n partition_time = execution_time_to_partition_fn(next_time)\n\n if partition_time.timestamp() > end_timestamp:\n break\n\n if partition_time.timestamp() < _start.timestamp():\n continue\n\n partitions.append(Partition(value=partition_time, name=partition_time.strftime(fmt)))\n\n return partitions\n\n\n@whitelist_for_serdes\nclass ScheduleType(Enum):\n HOURLY = "HOURLY"\n DAILY = "DAILY"\n WEEKLY = "WEEKLY"\n MONTHLY = "MONTHLY"\n\n @property\n def ordinal(self):\n return {"HOURLY": 1, "DAILY": 2, "WEEKLY": 3, "MONTHLY": 4}[self.value]\n\n @property\n def delta(self):\n if self == ScheduleType.HOURLY:\n return timedelta(hours=1)\n elif self == ScheduleType.DAILY:\n return timedelta(days=1)\n elif self == ScheduleType.WEEKLY:\n return timedelta(weeks=1)\n elif self == ScheduleType.MONTHLY:\n return relativedelta(months=1)\n else:\n check.failed(f"Unexpected ScheduleType {self}")\n\n def __gt__(self, other):\n return self.ordinal > other.ordinal\n\n def __lt__(self, other):\n return self.ordinal < other.ordinal\n\n\n
[docs]class PartitionsDefinition(ABC, Generic[T]):\n @abstractmethod\n def get_partitions(self, current_time: Optional[datetime] = None) -> List[Partition[T]]:\n ...\n\n def __str__(self) -> str:\n joined_keys = ", ".join([f"'{key}'" for key in self.get_partition_keys()])\n return joined_keys\n\n def get_partition_keys(self, current_time: Optional[datetime] = None) -> List[str]:\n return [partition.name for partition in self.get_partitions(current_time)]\n\n def get_default_partition_mapping(self):\n from dagster.core.asset_defs.partition_mapping import IdentityPartitionMapping\n\n return IdentityPartitionMapping()
\n\n\n
[docs]class StaticPartitionsDefinition(\n PartitionsDefinition[str],\n): # pylint: disable=unsubscriptable-object\n def __init__(self, partition_keys: List[str]):\n check.list_param(partition_keys, "partition_keys", of_type=str)\n\n # Dagit selects partition ranges following the format '2022-01-13...2022-01-14'\n # "..." is an invalid substring in partition keys\n if any(["..." in partition_key for partition_key in partition_keys]):\n raise DagsterInvalidDefinitionError("'...' is an invalid substring in a partition key")\n\n self._partitions = [Partition(key) for key in partition_keys]\n\n def get_partitions(\n self, current_time: Optional[datetime] = None # pylint: disable=unused-argument\n ) -> List[Partition[str]]:\n return self._partitions\n\n def __hash__(self):\n return hash(self.__repr__())\n\n def __eq__(self, other) -> bool:\n return (\n isinstance(other, StaticPartitionsDefinition)\n and self._partitions == other.get_partitions()\n )\n\n def __repr__(self) -> str:\n return f"{type(self).__name__}(partition_keys={[p.name for p in self._partitions]})"
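A minimal sketch of a static partition set; the partition keys are illustrative:

.. code-block:: python

    from dagster import StaticPartitionsDefinition

    color_partitions = StaticPartitionsDefinition(["red", "green", "blue"])
    assert color_partitions.get_partition_keys() == ["red", "green", "blue"]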
\n\n\nclass ScheduleTimeBasedPartitionsDefinition(\n PartitionsDefinition[datetime], # pylint: disable=unsubscriptable-object\n NamedTuple(\n "_ScheduleTimeBasedPartitionsDefinition",\n [\n ("schedule_type", ScheduleType),\n ("start", datetime),\n ("execution_time", time),\n ("execution_day", Optional[int]),\n ("end", Optional[datetime]),\n ("fmt", str),\n ("timezone", Optional[str]),\n ("offset", Optional[int]),\n ],\n ),\n):\n """Computes the partitions backwards from the scheduled execution times"""\n\n def __new__( # pylint: disable=arguments-differ\n cls,\n schedule_type: ScheduleType,\n start: datetime,\n execution_time: Optional[time] = None,\n execution_day: Optional[int] = None,\n end: Optional[datetime] = None,\n fmt: Optional[str] = None,\n timezone: Optional[str] = None,\n offset: Optional[int] = None,\n ):\n if end is not None:\n check.invariant(\n start <= end,\n f'Selected date range start "{start}" '\n f'is after date range end "{end}"'.format(\n start=start.strftime(fmt) if fmt is not None else start,\n end=cast(datetime, end).strftime(fmt) if fmt is not None else end,\n ),\n )\n if schedule_type in [ScheduleType.HOURLY, ScheduleType.DAILY]:\n check.invariant(\n not execution_day,\n f'Execution day should not be provided for schedule type "{schedule_type}"',\n )\n elif schedule_type is ScheduleType.WEEKLY:\n execution_day = execution_day if execution_day is not None else 0\n check.invariant(\n execution_day is not None and 0 <= execution_day <= 6,\n f'Execution day "{execution_day}" must be between 0 and 6 for '\n f'schedule type "{schedule_type}"',\n )\n elif schedule_type is ScheduleType.MONTHLY:\n execution_day = execution_day if execution_day is not None else 1\n check.invariant(\n execution_day is not None and 1 <= execution_day <= 31,\n f'Execution day "{execution_day}" must be between 1 and 31 for '\n f'schedule type "{schedule_type}"',\n )\n\n return super(ScheduleTimeBasedPartitionsDefinition, cls).__new__(\n cls,\n check.inst_param(schedule_type, "schedule_type", ScheduleType),\n check.inst_param(start, "start", datetime),\n check.opt_inst_param(execution_time, "execution_time", time, time(0, 0)),\n check.opt_int_param(\n execution_day,\n "execution_day",\n ),\n check.opt_inst_param(end, "end", datetime),\n cast(str, check.opt_str_param(fmt, "fmt", default=DEFAULT_DATE_FORMAT)),\n check.opt_str_param(timezone, "timezone", default="UTC"),\n check.opt_int_param(offset, "offset", default=1),\n )\n\n def get_partitions(self, current_time: Optional[datetime] = None) -> List[Partition[datetime]]:\n check.opt_inst_param(current_time, "current_time", datetime)\n\n return schedule_partition_range(\n start=self.start,\n end=self.end,\n cron_schedule=self.get_cron_schedule(),\n fmt=self.fmt,\n timezone=self.timezone,\n execution_time_to_partition_fn=self.get_execution_time_to_partition_fn(),\n current_time=current_time,\n )\n\n def get_cron_schedule(self) -> str:\n return get_cron_schedule(self.schedule_type, self.execution_time, self.execution_day)\n\n def get_execution_time_to_partition_fn(self) -> Callable[[datetime], datetime]:\n if self.schedule_type is ScheduleType.HOURLY:\n # Using subtract(minutes=d.minute) here instead of .replace(minute=0) because on\n # pendulum 1, replace(minute=0) sometimes changes the timezone:\n # >>> a = create_pendulum_time(2021, 11, 7, 0, 0, tz="US/Central")\n #\n # >>> a.add(hours=1)\n # <Pendulum [2021-11-07T01:00:00-05:00]>\n # >>> a.add(hours=1).replace(minute=0)\n # <Pendulum [2021-11-07T01:00:00-06:00]>\n return lambda d: 
pendulum.instance(d).subtract(hours=self.offset, minutes=d.minute)\n elif self.schedule_type is ScheduleType.DAILY:\n return (\n lambda d: pendulum.instance(d).replace(hour=0, minute=0).subtract(days=self.offset)\n )\n elif self.schedule_type is ScheduleType.WEEKLY:\n execution_day = cast(int, self.execution_day)\n day_difference = (execution_day - (self.start.weekday() + 1)) % 7\n return (\n lambda d: pendulum.instance(d)\n .replace(hour=0, minute=0)\n .subtract(\n weeks=self.offset,\n days=day_difference,\n )\n )\n elif self.schedule_type is ScheduleType.MONTHLY:\n execution_day = cast(int, self.execution_day)\n return (\n lambda d: pendulum.instance(d)\n .replace(hour=0, minute=0)\n .subtract(months=self.offset, days=execution_day - 1)\n )\n else:\n check.assert_never(self.schedule_type)\n\n\nclass DynamicPartitionsDefinition(\n PartitionsDefinition,\n NamedTuple(\n "_DynamicPartitionsDefinition",\n [("partition_fn", Callable[[Optional[datetime]], Union[List[Partition], List[str]]])],\n ),\n):\n def __new__( # pylint: disable=arguments-differ\n cls, partition_fn: Callable[[Optional[datetime]], Union[List[Partition], List[str]]]\n ):\n return super(DynamicPartitionsDefinition, cls).__new__(\n cls, check.callable_param(partition_fn, "partition_fn")\n )\n\n def get_partitions(self, current_time: Optional[datetime] = None) -> List[Partition]:\n partitions = self.partition_fn(current_time)\n if all(isinstance(partition, Partition) for partition in partitions):\n return cast(List[Partition], partitions)\n else:\n return [Partition(p) for p in partitions]\n\n\n
[docs]class PartitionSetDefinition(Generic[T]):\n """\n Defines a partition set, representing the set of slices making up an axis of a pipeline\n\n Args:\n name (str): Name for this partition set\n pipeline_name (str): The name of the pipeline definition\n partition_fn (Optional[Callable[void, List[Partition]]]): User-provided function to define\n the set of valid partition objects.\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute with this partition. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The mode to apply when executing this partition. (default: 'default')\n run_config_fn_for_partition (Callable[[Partition], Any]): A\n function that takes a :py:class:`~dagster.Partition` and returns the run\n configuration that parameterizes the execution for this partition.\n tags_fn_for_partition (Callable[[Partition], Optional[dict[str, str]]]): A function that\n takes a :py:class:`~dagster.Partition` and returns a list of key value pairs that will\n be added to the generated run for this partition.\n partitions_def (Optional[PartitionsDefinition]): A set of parameters used to construct the set\n of valid partition objects.\n """\n\n def __init__(\n self,\n name: str,\n pipeline_name: Optional[str] = None,\n partition_fn: Optional[Callable[..., Union[List[Partition[T]], List[str]]]] = None,\n solid_selection: Optional[List[str]] = None,\n mode: Optional[str] = None,\n run_config_fn_for_partition: Callable[[Partition[T]], Any] = lambda _partition: {},\n tags_fn_for_partition: Callable[\n [Partition[T]], Optional[Dict[str, str]]\n ] = lambda _partition: {},\n partitions_def: Optional[\n PartitionsDefinition[T] # pylint: disable=unsubscriptable-object\n ] = None,\n job_name: Optional[str] = None,\n ):\n check.invariant(\n partition_fn is not None or partitions_def is not None,\n "One of `partition_fn` or `partitions_def` must be supplied.",\n )\n check.invariant(\n not (partition_fn and partitions_def),\n "Only one of `partition_fn` or `partitions_def` must be supplied.",\n )\n check.invariant(\n (pipeline_name or job_name) and not (pipeline_name and job_name),\n "Exactly one one of `job_name` and `pipeline_name` must be supplied.",\n )\n\n _wrap_partition_fn = None\n\n if partition_fn is not None:\n partition_fn_param_count = len(inspect.signature(partition_fn).parameters)\n\n def _wrap_partition(x: Union[str, Partition]) -> Partition:\n if isinstance(x, Partition):\n return x\n if isinstance(x, str):\n return Partition(x)\n raise DagsterInvalidDefinitionError(\n "Expected <Partition> | <str>, received {type}".format(type=type(x))\n )\n\n def _wrap_partition_fn(current_time=None) -> List[Partition]:\n if not current_time:\n current_time = pendulum.now("UTC")\n\n check.callable_param(partition_fn, "partition_fn") # type: ignore\n\n if partition_fn_param_count == 1:\n obj_list = cast(\n Callable[..., List[Union[Partition[T], str]]],\n partition_fn,\n )(current_time)\n else:\n obj_list = partition_fn() # type: ignore\n\n return [_wrap_partition(obj) for obj in obj_list]\n\n self._name = check_valid_name(name)\n self._pipeline_name = check.opt_str_param(pipeline_name, "pipeline_name")\n self._job_name = check.opt_str_param(job_name, "job_name")\n self._partition_fn = _wrap_partition_fn\n self._solid_selection = check.opt_nullable_list_param(\n solid_selection, "solid_selection", of_type=str\n )\n self._mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)\n self._user_defined_run_config_fn_for_partition = 
check.callable_param(\n run_config_fn_for_partition, "run_config_fn_for_partition"\n )\n self._user_defined_tags_fn_for_partition = check.callable_param(\n tags_fn_for_partition, "tags_fn_for_partition"\n )\n check.opt_inst_param(partitions_def, "partitions_def", PartitionsDefinition)\n if partitions_def is not None:\n self._partitions_def = partitions_def\n else:\n if partition_fn is None:\n check.failed("One of `partition_fn` or `partitions_def` must be supplied.")\n self._partitions_def = DynamicPartitionsDefinition(partition_fn=_wrap_partition_fn)\n\n @property\n def name(self):\n return self._name\n\n @property\n def pipeline_name(self):\n return self._pipeline_name\n\n @property\n def job_name(self):\n return self._job_name\n\n @property\n def pipeline_or_job_name(self) -> str:\n # one is guaranteed to be set\n return cast(str, self._pipeline_name or self._job_name)\n\n @property\n def solid_selection(self):\n return self._solid_selection\n\n @property\n def mode(self):\n return self._mode\n\n def run_config_for_partition(self, partition: Partition[T]) -> Dict[str, Any]:\n return copy.deepcopy(self._user_defined_run_config_fn_for_partition(partition))\n\n def tags_for_partition(self, partition: Partition[T]) -> Dict[str, str]:\n user_tags = validate_tags(\n self._user_defined_tags_fn_for_partition(partition), allow_reserved_tags=False\n )\n tags = merge_dicts(user_tags, PipelineRun.tags_for_partition_set(self, partition))\n\n return tags\n\n
[docs] def get_partitions(self, current_time: Optional[datetime] = None) -> List[Partition[T]]:\n """Return the set of known partitions.\n\n Arguments:\n current_time (Optional[datetime]): The evaluation time for the partition function, which\n is passed through to the ``partition_fn`` (if it accepts a parameter). Defaults to\n the current time in UTC.\n """\n return self._partitions_def.get_partitions(current_time)
\n\n def get_partition(self, name: str) -> Partition[T]:\n for partition in self.get_partitions():\n if partition.name == name:\n return partition\n\n raise DagsterUnknownPartitionError(f"Could not find a partition with key `{name}`")\n\n def get_partition_names(self, current_time: Optional[datetime] = None) -> List[str]:\n return [part.name for part in self.get_partitions(current_time)]\n\n
[docs] def create_schedule_definition(\n self,\n schedule_name,\n cron_schedule,\n partition_selector,\n should_execute=None,\n environment_vars=None,\n execution_timezone=None,\n description=None,\n decorated_fn=None,\n job=None,\n default_status=DefaultScheduleStatus.STOPPED,\n ):\n """Create a ScheduleDefinition from a PartitionSetDefinition.\n\n Arguments:\n schedule_name (str): The name of the schedule.\n cron_schedule (str): A valid cron string for the schedule\n partition_selector (Callable[ScheduleEvaluationContext, PartitionSetDefinition], Union[Partition, List[Partition]]):\n Function that determines the partition to use at a given execution time. Can return\n either a single Partition or a list of Partitions. For time-based partition sets,\n will likely be either `identity_partition_selector` or a selector returned by\n `create_offset_partition_selector`.\n should_execute (Optional[function]): Function that runs at schedule execution time that\n determines whether a schedule should execute. Defaults to a function that always returns\n ``True``.\n environment_vars (Optional[dict]): The environment variables to set for the schedule.\n execution_timezone (Optional[str]): Timezone in which the schedule should run.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n description (Optional[str]): A human-readable description of the schedule.\n default_status (DefaultScheduleStatus): Whether the schedule starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n\n Returns:\n PartitionScheduleDefinition: The generated PartitionScheduleDefinition for the partition\n selector\n """\n\n check.str_param(schedule_name, "schedule_name")\n check.str_param(cron_schedule, "cron_schedule")\n check.opt_callable_param(should_execute, "should_execute")\n check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str)\n check.callable_param(partition_selector, "partition_selector")\n check.opt_str_param(execution_timezone, "execution_timezone")\n check.opt_str_param(description, "description")\n check.inst_param(default_status, "default_status", DefaultScheduleStatus)\n\n def _execution_fn(context):\n check.inst_param(context, "context", ScheduleEvaluationContext)\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the execution of partition_selector for schedule {schedule_name}",\n ):\n selector_result = partition_selector(context, self)\n\n if isinstance(selector_result, SkipReason):\n yield selector_result\n return\n\n selected_partitions = (\n selector_result\n if isinstance(selector_result, (frozenlist, list))\n else [selector_result]\n )\n\n check.is_list(selected_partitions, of_type=Partition)\n\n if not selected_partitions:\n yield SkipReason("Partition selector returned an empty list of partitions.")\n return\n\n partition_names = self.get_partition_names(context.scheduled_execution_time)\n\n missing_partition_names = [\n partition.name\n for partition in selected_partitions\n if partition.name not in partition_names\n ]\n\n if missing_partition_names:\n yield SkipReason(\n "Partition selector returned partition"\n + ("s" if len(missing_partition_names) > 1 else "")\n + f" not in the partition set: {', '.join(missing_partition_names)}."\n )\n return\n\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the execution of should_execute for 
schedule {schedule_name}",\n ):\n if should_execute and not should_execute(context):\n yield SkipReason(\n "should_execute function for {schedule_name} returned false.".format(\n schedule_name=schedule_name\n )\n )\n return\n\n for selected_partition in selected_partitions:\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the execution of run_config_fn for schedule {schedule_name}",\n ):\n run_config = self.run_config_for_partition(selected_partition)\n\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the execution of tags_fn for schedule {schedule_name}",\n ):\n tags = self.tags_for_partition(selected_partition)\n yield RunRequest(\n run_key=selected_partition.name if len(selected_partitions) > 0 else None,\n run_config=run_config,\n tags=tags,\n )\n\n return PartitionScheduleDefinition(\n name=schedule_name,\n cron_schedule=cron_schedule,\n pipeline_name=self._pipeline_name,\n tags_fn=None,\n solid_selection=self._solid_selection,\n mode=self._mode,\n should_execute=None,\n environment_vars=environment_vars,\n partition_set=self,\n execution_timezone=execution_timezone,\n execution_fn=_execution_fn,\n description=description,\n decorated_fn=decorated_fn,\n job=job,\n default_status=default_status,\n )
\n\n\n
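# Illustrative sketch (not part of the library): how a PartitionSetDefinition and\n# create_schedule_definition fit together. The names below ("my_pipeline", the solid and config\n# keys, _example_partition_set_schedule) are hypothetical.\ndef _example_partition_set_schedule():\n    partition_set = PartitionSetDefinition(\n        name="date_partition_set",\n        pipeline_name="my_pipeline",\n        partition_fn=lambda: ["2022-01-01", "2022-01-02"],\n        run_config_fn_for_partition=lambda partition: {\n            "solids": {"process_date": {"config": {"date": partition.name}}}\n        },\n    )\n    # Select the most recent known partition at schedule evaluation time.\n    return partition_set.create_schedule_definition(\n        schedule_name="date_partition_schedule",\n        cron_schedule="0 0 * * *",\n        partition_selector=lambda context, partition_set_def: partition_set_def.get_partitions(\n            context.scheduled_execution_time\n        )[-1],\n    )\n\n\n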
[docs]class PartitionScheduleDefinition(ScheduleDefinition):\n __slots__ = ["_partition_set"]\n\n def __init__(\n self,\n name,\n cron_schedule,\n pipeline_name,\n tags_fn,\n solid_selection,\n mode,\n should_execute,\n environment_vars,\n partition_set,\n run_config_fn=None,\n execution_timezone=None,\n execution_fn=None,\n description=None,\n decorated_fn=None,\n job=None,\n default_status=DefaultScheduleStatus.STOPPED,\n ):\n super(PartitionScheduleDefinition, self).__init__(\n name=check_valid_name(name),\n cron_schedule=cron_schedule,\n pipeline_name=pipeline_name,\n run_config_fn=run_config_fn,\n tags_fn=tags_fn,\n solid_selection=solid_selection,\n mode=mode,\n should_execute=should_execute,\n environment_vars=environment_vars,\n execution_timezone=execution_timezone,\n execution_fn=execution_fn,\n description=description,\n job=job,\n default_status=default_status,\n )\n self._partition_set = check.inst_param(\n partition_set, "partition_set", PartitionSetDefinition\n )\n self._decorated_fn = check.opt_callable_param(decorated_fn, "decorated_fn")\n\n def __call__(self, *args, **kwargs):\n if not self._decorated_fn:\n raise DagsterInvalidInvocationError(\n "Only partition schedules created using one of the partition schedule decorators "\n "can be directly invoked."\n )\n if len(args) == 0 and len(kwargs) == 0:\n raise DagsterInvalidInvocationError(\n "Schedule decorated function has date argument, but no date argument was "\n "provided when invoking."\n )\n if len(args) + len(kwargs) > 1:\n raise DagsterInvalidInvocationError(\n "Schedule invocation received multiple arguments. Only a first "\n "positional date parameter should be provided when invoking."\n )\n\n date_param_name = get_function_params(self._decorated_fn)[0].name\n\n if args:\n date = check.opt_inst_param(args[0], date_param_name, datetime)\n else:\n if date_param_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Schedule invocation expected argument '{date_param_name}'."\n )\n date = check.opt_inst_param(kwargs[date_param_name], date_param_name, datetime)\n\n return self._decorated_fn(date)\n\n def get_partition_set(self):\n return self._partition_set
\n\n\n
[docs]class PartitionedConfig(Generic[T]):\n """Defines a way of configuring a job where the job can be run on one of a discrete set of\n partitions, and each partition corresponds to run configuration for the job.\n\n Setting PartitionedConfig as the config for a job allows you to launch backfills for that job\n and view the run history across partitions.\n """\n\n def __init__(\n self,\n partitions_def: PartitionsDefinition[T], # pylint: disable=unsubscriptable-object\n run_config_for_partition_fn: Callable[[Partition[T]], Dict[str, Any]],\n decorated_fn: Optional[Callable[..., Dict[str, Any]]] = None,\n tags_for_partition_fn: Optional[Callable[[Partition[T]], Dict[str, str]]] = None,\n ):\n self._partitions = check.inst_param(partitions_def, "partitions_def", PartitionsDefinition)\n self._run_config_for_partition_fn = check.callable_param(\n run_config_for_partition_fn, "run_config_for_partition_fn"\n )\n self._decorated_fn = decorated_fn\n self._tags_for_partition_fn = check.opt_callable_param(\n tags_for_partition_fn, "tags_for_partition_fn"\n )\n\n @property\n def partitions_def(self) -> PartitionsDefinition[T]: # pylint: disable=unsubscriptable-object\n return self._partitions\n\n @property\n def run_config_for_partition_fn(self) -> Callable[[Partition[T]], Dict[str, Any]]:\n return self._run_config_for_partition_fn\n\n @property\n def tags_for_partition_fn(self) -> Optional[Callable[[Partition[T]], Dict[str, str]]]:\n return self._tags_for_partition_fn\n\n def get_partition_keys(self, current_time: Optional[datetime] = None) -> List[str]:\n return [partition.name for partition in self.partitions_def.get_partitions(current_time)]\n\n
[docs] def get_run_config_for_partition_key(self, partition_key: str) -> Dict[str, Any]:\n """Generates the run config corresponding to a partition key.\n\n Args:\n partition_key (str): the key for a partition that should be used to generate a run config.\n """\n partitions = self.partitions_def.get_partitions()\n partition = [p for p in partitions if p.name == partition_key]\n if len(partition) == 0:\n raise DagsterInvalidInvocationError(f"No partition for partition key {partition_key}.")\n return self.run_config_for_partition_fn(partition[0])
\n\n def __call__(self, *args, **kwargs):\n if self._decorated_fn is None:\n raise DagsterInvalidInvocationError(\n "Only PartitionedConfig objects created using one of the partitioned config "\n "decorators can be directly invoked."\n )\n else:\n return self._decorated_fn(*args, **kwargs)
\n\n\n
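# Illustrative sketch (not part of the library): constructing a PartitionedConfig directly,\n# without the decorators defined below. The partition keys and run-config shape are hypothetical.\ndef _example_partitioned_config():\n    config = PartitionedConfig(\n        partitions_def=StaticPartitionsDefinition(["us", "eu", "apac"]),\n        run_config_for_partition_fn=lambda partition: {\n            "ops": {"load_region": {"config": {"region": partition.name}}}\n        },\n    )\n    # config.get_partition_keys() -> ["us", "eu", "apac"]\n    # config.get_run_config_for_partition_key("eu") -> {"ops": {"load_region": {"config": {"region": "eu"}}}}\n    return config\n\n\n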
[docs]def static_partitioned_config(\n    partition_keys: List[str],\n    tags_for_partition_fn: Optional[Callable[[str], Dict[str, str]]] = None,\n) -> Callable[[Callable[[str], Dict[str, Any]]], PartitionedConfig]:\n    """Creates a static partitioned config for a job.\n\n    The provided partition_keys is a static list of strings identifying the set of partitions. The\n    list of partitions is static, so while the run config returned by the decorated function may\n    change over time, the list of valid partition keys does not.\n\n    This has performance advantages over `dynamic_partitioned_config` in terms of loading different\n    partition views in Dagit.\n\n    The decorated function takes in a partition key and returns a valid run config for a particular\n    target job.\n\n    Args:\n        partition_keys (List[str]): A list of valid partition keys, which serve as the range of\n            values that can be provided to the decorated run config function.\n\n    Returns:\n        PartitionedConfig\n    """\n    check.list_param(partition_keys, "partition_keys", str)\n\n    def inner(fn: Callable[[str], Dict[str, Any]]) -> PartitionedConfig:\n        check.callable_param(fn, "fn")\n\n        def _run_config_wrapper(partition: Partition[T]) -> Dict[str, Any]:\n            return fn(partition.name)\n\n        def _tag_wrapper(partition: Partition[T]) -> Dict[str, str]:\n            return tags_for_partition_fn(partition.name) if tags_for_partition_fn else {}\n\n        return PartitionedConfig(\n            partitions_def=StaticPartitionsDefinition(partition_keys),\n            run_config_for_partition_fn=_run_config_wrapper,\n            decorated_fn=fn,\n            tags_for_partition_fn=_tag_wrapper,\n        )\n\n    return inner
\n\n\n
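# Illustrative sketch (not part of the library): the decorated function receives a partition key\n# and returns the run config for that partition. The op name and config shape are hypothetical.\n@static_partitioned_config(partition_keys=["us", "eu", "apac"])\ndef _example_region_config(partition_key: str):\n    return {"ops": {"load_region": {"config": {"region": partition_key}}}}\n# _example_region_config is now a PartitionedConfig that can be set as a job's config.\n\n\n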
[docs]def dynamic_partitioned_config(\n partition_fn: Callable[[Optional[datetime]], List[str]],\n tags_for_partition_fn: Optional[Callable[[str], Dict[str, str]]] = None,\n) -> Callable[[Callable[[str], Dict[str, Any]]], PartitionedConfig]:\n """Creates a dynamic partitioned config for a job.\n\n The provided partition_fn returns a list of strings identifying the set of partitions, given\n an optional datetime argument (representing the current time). The list of partitions returned\n may change over time.\n\n The decorated function takes in a partition key and returns a valid run config for a particular\n target job.\n\n Args:\n partition_fn (Callable[[datetime.datetime], Sequence[str]]): A function that generates a\n list of valid partition keys, which serve as the range of values that can be provided\n to the decorated run config function.\n\n Returns:\n PartitionedConfig\n """\n check.callable_param(partition_fn, "partition_fn")\n\n def inner(fn: Callable[[str], Dict[str, Any]]) -> PartitionedConfig:\n def _run_config_wrapper(partition: Partition[T]) -> Dict[str, Any]:\n return fn(partition.name)\n\n def _tag_wrapper(partition: Partition[T]) -> Dict[str, str]:\n return tags_for_partition_fn(partition.name) if tags_for_partition_fn else {}\n\n return PartitionedConfig(\n partitions_def=DynamicPartitionsDefinition(partition_fn),\n run_config_for_partition_fn=_run_config_wrapper,\n decorated_fn=fn,\n tags_for_partition_fn=_tag_wrapper,\n )\n\n return inner
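\n\n\n# Illustrative sketch (not part of the library): the partition function returns whatever keys\n# currently exist, so the key set may change between evaluations. The helper name\n# _list_available_dates and the config shape are hypothetical.\ndef _list_available_dates(_current_time):\n    return ["2022-01-01", "2022-01-02"]\n\n\n@dynamic_partitioned_config(partition_fn=_list_available_dates)\ndef _example_date_config(partition_key: str):\n    return {"ops": {"process_date": {"config": {"date": partition_key}}}}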
\n\n\ndef get_cron_schedule(\n    schedule_type: ScheduleType,\n    time_of_day: time = time(0, 0),\n    execution_day: Optional[int] = None,\n) -> str:\n    minute = time_of_day.minute\n    hour = time_of_day.hour\n\n    if schedule_type is ScheduleType.HOURLY:\n        return f"{minute} * * * *"\n    elif schedule_type is ScheduleType.DAILY:\n        return f"{minute} {hour} * * *"\n    elif schedule_type is ScheduleType.WEEKLY:\n        return f"{minute} {hour} * * {execution_day if execution_day is not None else 0}"\n    elif schedule_type is ScheduleType.MONTHLY:\n        return f"{minute} {hour} {execution_day if execution_day is not None else 1} * *"\n    else:\n        check.assert_never(schedule_type)\n
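\n\n\n# Illustrative examples (not part of the library) of the cron strings produced above:\n#     get_cron_schedule(ScheduleType.HOURLY, time(minute=45))                 -> "45 * * * *"\n#     get_cron_schedule(ScheduleType.DAILY, time(hour=2, minute=30))          -> "30 2 * * *"\n#     get_cron_schedule(ScheduleType.WEEKLY, time(0, 0), execution_day=1)     -> "0 0 * * 1"\n#     get_cron_schedule(ScheduleType.MONTHLY, time(0, 0), execution_day=15)   -> "0 0 15 * *"\n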
", "current_page_name": "_modules/dagster/core/definitions/partition", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.partition"}, "partitioned_schedule": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.partitioned_schedule

\nfrom datetime import time\nfrom typing import Optional, Union, cast\n\nimport dagster._check as check\n\nfrom .job_definition import JobDefinition\nfrom .partition import (\n    Partition,\n    PartitionSetDefinition,\n    PartitionedConfig,\n    ScheduleType,\n    get_cron_schedule,\n)\nfrom .run_request import SkipReason\nfrom .schedule_definition import (\n    DefaultScheduleStatus,\n    ScheduleDefinition,\n    ScheduleEvaluationContext,\n)\nfrom .time_window_partitions import TimeWindow, TimeWindowPartitionsDefinition\n\n\n
[docs]def build_schedule_from_partitioned_job(\n job: JobDefinition,\n description: Optional[str] = None,\n name: Optional[str] = None,\n minute_of_hour: Optional[int] = None,\n hour_of_day: Optional[int] = None,\n day_of_week: Optional[int] = None,\n day_of_month: Optional[int] = None,\n default_status: DefaultScheduleStatus = DefaultScheduleStatus.STOPPED,\n) -> ScheduleDefinition:\n """\n Creates a schedule from a time window-partitioned job.\n\n The schedule executes at the cadence specified by the partitioning of the given job.\n """\n check.invariant(len(job.mode_definitions) == 1, "job must only have one mode")\n check.invariant(\n job.mode_definitions[0].partitioned_config is not None, "job must be a partitioned job"\n )\n check.invariant(\n not (day_of_week and day_of_month),\n "Cannot provide both day_of_month and day_of_week parameter to build_schedule_from_partitioned_job.",\n )\n\n partitioned_config = cast(PartitionedConfig, job.mode_definitions[0].partitioned_config)\n partition_set = cast(PartitionSetDefinition, job.get_partition_set_def())\n\n check.inst(partitioned_config.partitions_def, TimeWindowPartitionsDefinition)\n partitions_def = cast(TimeWindowPartitionsDefinition, partitioned_config.partitions_def)\n\n minute_of_hour = cast(\n int,\n check.opt_int_param(minute_of_hour, "minute_of_hour", default=partitions_def.minute_offset),\n )\n\n if partitions_def.schedule_type == ScheduleType.HOURLY:\n check.invariant(hour_of_day is None, "Cannot set hour parameter with hourly partitions.")\n\n hour_of_day = cast(\n int, check.opt_int_param(hour_of_day, "hour_of_day", default=partitions_def.hour_offset)\n )\n execution_time = time(minute=minute_of_hour, hour=hour_of_day)\n\n if partitions_def.schedule_type == ScheduleType.DAILY:\n check.invariant(\n day_of_week is None, "Cannot set day of week parameter with daily partitions."\n )\n check.invariant(\n day_of_month is None, "Cannot set day of month parameter with daily partitions."\n )\n\n if partitions_def.schedule_type == ScheduleType.MONTHLY:\n default = partitions_def.day_offset or 1\n execution_day = check.opt_int_param(day_of_month, "day_of_month", default=default)\n elif partitions_def.schedule_type == ScheduleType.WEEKLY:\n default = partitions_def.day_offset or 0\n execution_day = check.opt_int_param(day_of_week, "day_of_week", default=default)\n else:\n execution_day = 0\n\n cron_schedule = get_cron_schedule(partitions_def.schedule_type, execution_time, execution_day)\n\n schedule_def = partition_set.create_schedule_definition(\n schedule_name=check.opt_str_param(name, "name", f"{job.name}_schedule"),\n cron_schedule=cron_schedule,\n partition_selector=latest_window_partition_selector,\n execution_timezone=partitions_def.timezone,\n description=description,\n job=job,\n default_status=default_status,\n )\n\n return schedule_def
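\n\n\n# Illustrative sketch (not part of the library): given a job whose config comes from a daily\n# time-window partitioning, this builds a schedule that targets the latest complete partition.\n# The job name and offsets below are hypothetical.\n#\n#     daily_schedule = build_schedule_from_partitioned_job(\n#         my_daily_partitioned_job,\n#         hour_of_day=1,\n#         minute_of_hour=30,\n#     )\n#     # For daily partitions this resolves to the cron string "30 1 * * *".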
\n\n\nschedule_from_partitions = build_schedule_from_partitioned_job\n\n\ndef latest_window_partition_selector(\n context: ScheduleEvaluationContext, partition_set_def: PartitionSetDefinition[TimeWindow]\n) -> Union[SkipReason, Partition[TimeWindow]]:\n """Creates a selector for partitions that are time windows. Selects the latest partition that\n exists as of the schedule tick time.\n """\n partitions = partition_set_def.get_partitions(context.scheduled_execution_time)\n if len(partitions) == 0:\n return SkipReason()\n else:\n return partitions[-1]\n
", "current_page_name": "_modules/dagster/core/definitions/partitioned_schedule", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.partitioned_schedule"}, "pipeline_definition": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.pipeline_definition

\nfrom functools import update_wrapper\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    FrozenSet,\n    Iterator,\n    List,\n    Optional,\n    Set,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster.core.definitions.policy import RetryPolicy\nfrom dagster.core.definitions.resource_definition import ResourceDefinition\nfrom dagster.core.definitions.solid_definition import NodeDefinition\nfrom dagster.core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidSubsetError,\n    DagsterInvariantViolationError,\n)\nfrom dagster.core.storage.io_manager import IOManagerDefinition\nfrom dagster.core.storage.output_manager import IOutputManagerDefinition\nfrom dagster.core.storage.root_input_manager import (\n    IInputManagerDefinition,\n    RootInputManagerDefinition,\n)\nfrom dagster.core.storage.tags import MEMOIZED_RUN_TAG\nfrom dagster.core.types.dagster_type import DagsterType, DagsterTypeKind\nfrom dagster.core.utils import str_format_set\nfrom dagster.utils import frozentags, merge_dicts\nfrom dagster.utils.backcompat import experimental_class_warning\n\nfrom .asset_layer import AssetLayer\nfrom .dependency import (\n    DependencyDefinition,\n    DependencyStructure,\n    DynamicCollectDependencyDefinition,\n    IDependencyDefinition,\n    MultiDependencyDefinition,\n    Node,\n    NodeHandle,\n    NodeInvocation,\n    SolidInputHandle,\n)\nfrom .graph_definition import GraphDefinition, SubselectedGraphDefinition\nfrom .hook_definition import HookDefinition\nfrom .mode import ModeDefinition\nfrom .node_definition import NodeDefinition\nfrom .preset import PresetDefinition\nfrom .utils import validate_tags\nfrom .version_strategy import VersionStrategy\n\nif TYPE_CHECKING:\n    from dagster.core.definitions.partition import PartitionSetDefinition\n    from dagster.core.execution.execute_in_process_result import ExecuteInProcessResult\n    from dagster.core.host_representation import PipelineIndex\n    from dagster.core.instance import DagsterInstance\n    from dagster.core.snap import ConfigSchemaSnapshot, PipelineSnapshot\n\n    from .run_config_schema import RunConfigSchema\n\n\n
[docs]class PipelineDefinition:\n """Defines a Dagster pipeline.\n\n A pipeline is made up of\n\n - Solids, each of which is a single functional unit of data computation.\n - Dependencies, which determine how the values produced by solids as their outputs flow from\n one solid to another. This tells Dagster how to arrange solids, and potentially multiple\n aliased instances of solids, into a directed, acyclic graph (DAG) of compute.\n - Modes, which can be used to attach resources, custom loggers, custom system storage\n options, and custom executors to a pipeline, and to switch between them.\n - Presets, which can be used to ship common combinations of pipeline config options in Python\n code, and to switch between them.\n\n Args:\n solid_defs (List[SolidDefinition]): The set of solids used in this pipeline.\n name (str): The name of the pipeline. Must be unique within any\n :py:class:`RepositoryDefinition` containing the pipeline.\n description (Optional[str]): A human-readable description of the pipeline.\n dependencies (Optional[Dict[Union[str, NodeInvocation], Dict[str, DependencyDefinition]]]):\n A structure that declares the dependencies of each solid's inputs on the outputs of\n other solids in the pipeline. Keys of the top level dict are either the string names of\n solids in the pipeline or, in the case of aliased solids,\n :py:class:`NodeInvocations <NodeInvocation>`. Values of the top level dict are\n themselves dicts, which map input names belonging to the solid or aliased solid to\n :py:class:`DependencyDefinitions <DependencyDefinition>`.\n mode_defs (Optional[List[ModeDefinition]]): The set of modes in which this pipeline can\n operate. Modes are used to attach resources, custom loggers, custom system storage\n options, and custom executors to a pipeline. Modes can be used, e.g., to vary available\n resource and logging implementations between local test and production runs.\n preset_defs (Optional[List[PresetDefinition]]): A set of preset collections of configuration\n options that may be used to execute a pipeline. A preset consists of an environment\n dict, an optional subset of solids to execute, and a mode selection. Presets can be used\n to ship common combinations of options to pipeline end users in Python code, and can\n be selected by tools like Dagit.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for any execution run of the pipeline.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag\n values provided at invocation time.\n hook_defs (Optional[AbstractSet[HookDefinition]]): A set of hook definitions applied to the\n pipeline. When a hook is applied to a pipeline, it will be attached to all solid\n instances within the pipeline.\n solid_retry_policy (Optional[RetryPolicy]): The default retry policy for all solids in\n this pipeline. Only used if retry policy is not defined on the solid definition or\n solid invocation.\n asset_layer (Optional[AssetLayer]): Structured object containing all definition-time asset\n information for this pipeline.\n\n\n _parent_pipeline_def (INTERNAL ONLY): Used for tracking pipelines created using solid subsets.\n\n Examples:\n\n .. 
code-block:: python\n\n @solid\n def return_one(_):\n return 1\n\n\n @solid(input_defs=[InputDefinition('num')], required_resource_keys={'op'})\n def apply_op(context, num):\n return context.resources.op(num)\n\n @resource(config_schema=Int)\n def adder_resource(init_context):\n return lambda x: x + init_context.resource_config\n\n\n add_mode = ModeDefinition(\n name='add_mode',\n resource_defs={'op': adder_resource},\n description='Mode that adds things',\n )\n\n\n add_three_preset = PresetDefinition(\n name='add_three_preset',\n run_config={'resources': {'op': {'config': 3}}},\n mode='add_mode',\n )\n\n\n pipeline_def = PipelineDefinition(\n name='basic',\n solid_defs=[return_one, apply_op],\n dependencies={'apply_op': {'num': DependencyDefinition('return_one')}},\n mode_defs=[add_mode],\n preset_defs=[add_three_preset],\n )\n """\n\n def __init__(\n self,\n solid_defs: Optional[List[NodeDefinition]] = None,\n name: Optional[str] = None,\n description: Optional[str] = None,\n dependencies: Optional[\n Dict[Union[str, NodeInvocation], Dict[str, IDependencyDefinition]]\n ] = None,\n mode_defs: Optional[List[ModeDefinition]] = None,\n preset_defs: Optional[List[PresetDefinition]] = None,\n tags: Optional[Dict[str, Any]] = None,\n hook_defs: Optional[AbstractSet[HookDefinition]] = None,\n solid_retry_policy: Optional[RetryPolicy] = None,\n graph_def=None,\n _parent_pipeline_def=None, # https://github.com/dagster-io/dagster/issues/2115\n version_strategy: Optional[VersionStrategy] = None,\n asset_layer: Optional[AssetLayer] = None,\n ):\n # If a graph is specificed directly use it\n if check.opt_inst_param(graph_def, "graph_def", GraphDefinition):\n self._graph_def = graph_def\n self._name = name or graph_def.name\n\n # Otherwise fallback to legacy construction\n else:\n if name is None:\n check.failed("name must be set provided")\n self._name = name\n\n if solid_defs is None:\n check.failed("solid_defs must be provided")\n\n self._graph_def = GraphDefinition(\n name=name,\n dependencies=dependencies,\n node_defs=solid_defs,\n input_mappings=None,\n output_mappings=None,\n config=None,\n description=None,\n )\n\n # tags and description can exist on graph as well, but since\n # same graph may be in multiple pipelines/jobs, keep separate layer\n self._description = check.opt_str_param(description, "description")\n self._tags = validate_tags(tags)\n\n self._current_level_node_defs = self._graph_def.node_defs\n\n mode_definitions = check.opt_list_param(mode_defs, "mode_defs", of_type=ModeDefinition)\n\n if not mode_definitions:\n mode_definitions = [ModeDefinition()]\n\n self._mode_definitions = mode_definitions\n\n seen_modes = set()\n for mode_def in mode_definitions:\n if mode_def.name in seen_modes:\n raise DagsterInvalidDefinitionError(\n (\n 'Two modes seen with the name "{mode_name}" in "{pipeline_name}". '\n "Modes must have unique names."\n ).format(mode_name=mode_def.name, pipeline_name=self.name)\n )\n seen_modes.add(mode_def.name)\n\n self._hook_defs = check.opt_set_param(hook_defs, "hook_defs", of_type=HookDefinition)\n self._solid_retry_policy = check.opt_inst_param(\n solid_retry_policy, "solid_retry_policy", RetryPolicy\n )\n\n self._preset_defs = check.opt_list_param(preset_defs, "preset_defs", PresetDefinition)\n self._preset_dict: Dict[str, PresetDefinition] = {}\n for preset in self._preset_defs:\n if preset.name in self._preset_dict:\n raise DagsterInvalidDefinitionError(\n (\n 'Two PresetDefinitions seen with the name "{name}" in "{pipeline_name}". 
'\n "PresetDefinitions must have unique names."\n ).format(name=preset.name, pipeline_name=self.name)\n )\n if preset.mode not in seen_modes:\n raise DagsterInvalidDefinitionError(\n (\n 'PresetDefinition "{name}" in "{pipeline_name}" '\n 'references mode "{mode}" which is not defined.'\n ).format(name=preset.name, pipeline_name=self.name, mode=preset.mode)\n )\n self._preset_dict[preset.name] = preset\n\n self._asset_layer = check.opt_inst_param(\n asset_layer, "asset_layer", AssetLayer, default=AssetLayer.from_graph(self.graph)\n )\n\n self._resource_requirements = {\n mode_def.name: _checked_resource_reqs_for_mode(\n self._graph_def,\n mode_def,\n self._current_level_node_defs,\n self._graph_def._dagster_type_dict,\n self._graph_def.node_dict,\n self._hook_defs,\n self._graph_def._dependency_structure,\n self._asset_layer,\n )\n for mode_def in self._mode_definitions\n }\n\n # Recursively explore all nodes in the this pipeline\n self._all_node_defs = _build_all_node_defs(self._current_level_node_defs)\n self._parent_pipeline_def = check.opt_inst_param(\n _parent_pipeline_def, "_parent_pipeline_def", PipelineDefinition\n )\n self._cached_run_config_schemas: Dict[str, "RunConfigSchema"] = {}\n self._cached_external_pipeline = None\n\n self.version_strategy = check.opt_inst_param(\n version_strategy, "version_strategy", VersionStrategy\n )\n\n if self.version_strategy is not None:\n experimental_class_warning("VersionStrategy")\n\n @property\n def name(self):\n return self._name\n\n @property\n def target_type(self):\n return "pipeline"\n\n @property\n def is_job(self) -> bool:\n return False\n\n def describe_target(self):\n return f"{self.target_type} '{self.name}'"\n\n @property\n def tags(self):\n return frozentags(**merge_dicts(self._graph_def.tags, self._tags))\n\n @property\n def description(self):\n return self._description\n\n @property\n def graph(self):\n return self._graph_def\n\n @property\n def dependency_structure(self):\n return self._graph_def.dependency_structure\n\n @property\n def dependencies(self):\n return self._graph_def.dependencies\n\n def get_run_config_schema(self, mode: Optional[str] = None) -> "RunConfigSchema":\n check.str_param(mode, "mode")\n\n mode_def = self.get_mode_definition(mode)\n\n if mode_def.name in self._cached_run_config_schemas:\n return self._cached_run_config_schemas[mode_def.name]\n\n self._cached_run_config_schemas[mode_def.name] = _create_run_config_schema(\n self,\n mode_def,\n self._resource_requirements[mode_def.name],\n )\n return self._cached_run_config_schemas[mode_def.name]\n\n @property\n def mode_definitions(self) -> List[ModeDefinition]:\n return self._mode_definitions\n\n @property\n def preset_defs(self) -> List[PresetDefinition]:\n return self._preset_defs\n\n def _get_mode_definition(self, mode: str) -> Optional[ModeDefinition]:\n check.str_param(mode, "mode")\n for mode_definition in self._mode_definitions:\n if mode_definition.name == mode:\n return mode_definition\n\n return None\n\n def get_default_mode(self) -> ModeDefinition:\n return self._mode_definitions[0]\n\n @property\n def is_single_mode(self) -> bool:\n return len(self._mode_definitions) == 1\n\n @property\n def is_multi_mode(self) -> bool:\n return len(self._mode_definitions) > 1\n\n def is_using_memoization(self, run_tags: Dict[str, str]) -> bool:\n tags = merge_dicts(self.tags, run_tags)\n # If someone provides a false value for memoized run tag, then they are intentionally\n # switching off memoization.\n if tags.get(MEMOIZED_RUN_TAG) == "false":\n return 
False\n return (\n MEMOIZED_RUN_TAG in tags and tags.get(MEMOIZED_RUN_TAG) == "true"\n ) or self.version_strategy is not None\n\n def has_mode_definition(self, mode: str) -> bool:\n check.str_param(mode, "mode")\n return bool(self._get_mode_definition(mode))\n\n def get_default_mode_name(self) -> str:\n return self._mode_definitions[0].name\n\n def get_mode_definition(self, mode: Optional[str] = None) -> ModeDefinition:\n check.opt_str_param(mode, "mode")\n if mode is None:\n check.invariant(self.is_single_mode)\n return self.get_default_mode()\n\n mode_def = self._get_mode_definition(mode)\n\n if mode_def is None:\n check.failed(\n "Could not find mode {mode} in pipeline {name}".format(mode=mode, name=self.name),\n )\n\n return mode_def\n\n @property\n def available_modes(self) -> List[str]:\n return [mode_def.name for mode_def in self._mode_definitions]\n\n def get_required_resource_defs_for_mode(self, mode: str) -> Dict[str, ResourceDefinition]:\n return {\n resource_key: resource\n for resource_key, resource in self.get_mode_definition(mode).resource_defs.items()\n if resource_key in self._resource_requirements[mode]\n }\n\n @property\n def all_node_defs(self) -> List[NodeDefinition]:\n return list(self._all_node_defs.values())\n\n @property\n def top_level_solid_defs(self) -> List[NodeDefinition]:\n return self._current_level_node_defs\n\n def solid_def_named(self, name: str) -> NodeDefinition:\n check.str_param(name, "name")\n\n check.invariant(name in self._all_node_defs, "{} not found".format(name))\n return self._all_node_defs[name]\n\n def has_solid_def(self, name: str) -> bool:\n check.str_param(name, "name")\n return name in self._all_node_defs\n\n def get_solid(self, handle):\n return self._graph_def.get_solid(handle)\n\n def has_solid_named(self, name):\n return self._graph_def.has_solid_named(name)\n\n def solid_named(self, name):\n return self._graph_def.solid_named(name)\n\n @property\n def solids(self):\n return self._graph_def.solids\n\n @property\n def solids_in_topological_order(self):\n return self._graph_def.solids_in_topological_order\n\n def all_dagster_types(self):\n return self._graph_def.all_dagster_types()\n\n def has_dagster_type(self, name):\n return self._graph_def.has_dagster_type(name)\n\n def dagster_type_named(self, name):\n return self._graph_def.dagster_type_named(name)\n\n def get_pipeline_subset_def(\n self, solids_to_execute: Optional[AbstractSet[str]]\n ) -> "PipelineDefinition":\n return (\n self if solids_to_execute is None else _get_pipeline_subset_def(self, solids_to_execute)\n )\n\n def has_preset(self, name: str) -> bool:\n check.str_param(name, "name")\n return name in self._preset_dict\n\n def get_preset(self, name: str) -> PresetDefinition:\n check.str_param(name, "name")\n if name not in self._preset_dict:\n raise DagsterInvariantViolationError(\n (\n 'Could not find preset for "{name}". 
Available presets '\n 'for pipeline "{pipeline_name}" are {preset_names}.'\n ).format(\n name=name,\n preset_names=list(self._preset_dict.keys()),\n pipeline_name=self.name,\n )\n )\n\n return self._preset_dict[name]\n\n def get_pipeline_snapshot(self) -> "PipelineSnapshot":\n return self.get_pipeline_index().pipeline_snapshot\n\n def get_pipeline_snapshot_id(self) -> str:\n return self.get_pipeline_index().pipeline_snapshot_id\n\n def get_pipeline_index(self) -> "PipelineIndex":\n from dagster.core.host_representation import PipelineIndex\n from dagster.core.snap import PipelineSnapshot\n\n return PipelineIndex(\n PipelineSnapshot.from_pipeline_def(self), self.get_parent_pipeline_snapshot()\n )\n\n def get_config_schema_snapshot(self) -> "ConfigSchemaSnapshot":\n return self.get_pipeline_snapshot().config_schema_snapshot\n\n @property\n def is_subset_pipeline(self) -> bool:\n return False\n\n @property\n def parent_pipeline_def(self) -> Optional["PipelineDefinition"]:\n return None\n\n def get_parent_pipeline_snapshot(self) -> Optional["PipelineSnapshot"]:\n return None\n\n @property\n def solids_to_execute(self) -> Optional[FrozenSet[str]]:\n return None\n\n @property\n def hook_defs(self) -> AbstractSet[HookDefinition]:\n return self._hook_defs\n\n @property\n def asset_layer(self) -> AssetLayer:\n return self._asset_layer\n\n def get_all_hooks_for_handle(self, handle: NodeHandle) -> FrozenSet[HookDefinition]:\n """Gather all the hooks for the given solid from all places possibly attached with a hook.\n\n A hook can be attached to any of the following objects\n * Solid (solid invocation)\n * PipelineDefinition\n\n Args:\n handle (NodeHandle): The solid's handle\n\n Returns:\n FrozenSet[HookDefinition]\n """\n check.inst_param(handle, "handle", NodeHandle)\n hook_defs: AbstractSet[HookDefinition] = set()\n\n current = handle\n lineage = []\n while current:\n lineage.append(current.name)\n current = current.parent\n\n # hooks on top-level solid\n name = lineage.pop()\n solid = self._graph_def.solid_named(name)\n hook_defs = hook_defs.union(solid.hook_defs)\n\n # hooks on non-top-level solids\n while lineage:\n name = lineage.pop()\n solid = solid.definition.solid_named(name)\n hook_defs = hook_defs.union(solid.hook_defs)\n\n # hooks applied to a pipeline definition will run on every solid\n hook_defs = hook_defs.union(self.hook_defs)\n\n return frozenset(hook_defs)\n\n def get_retry_policy_for_handle(self, handle: NodeHandle) -> Optional[RetryPolicy]:\n solid = self.get_solid(handle)\n\n if solid.retry_policy:\n return solid.retry_policy\n elif solid.definition.retry_policy:\n return solid.definition.retry_policy\n\n # could be expanded to look in composite_solid / graph containers\n else:\n return self._solid_retry_policy\n\n def with_hooks(self, hook_defs: AbstractSet[HookDefinition]) -> "PipelineDefinition":\n """Apply a set of hooks to all solid instances within the pipeline."""\n\n hook_defs = check.set_param(hook_defs, "hook_defs", of_type=HookDefinition)\n\n pipeline_def = PipelineDefinition(\n name=self.name,\n graph_def=self._graph_def,\n mode_defs=self.mode_definitions,\n preset_defs=self.preset_defs,\n tags=self.tags,\n hook_defs=hook_defs | self.hook_defs,\n description=self._description,\n solid_retry_policy=self._solid_retry_policy,\n _parent_pipeline_def=self._parent_pipeline_def,\n )\n\n update_wrapper(pipeline_def, self, updated=())\n\n return pipeline_def\n\n # make Callable for decorator reference updates\n def __call__(self, *args, **kwargs):\n if self.is_job:\n msg = 
(\n f"Attempted to call job '{self.name}' directly. Jobs should be invoked by "\n "using an execution API function (e.g. `job.execute_in_process`)."\n )\n else:\n msg = (\n f"Attempted to call pipeline '{self.name}' directly. Pipelines should be invoked by "\n "using an execution API function (e.g. `execute_pipeline`)."\n )\n raise DagsterInvariantViolationError(msg)
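\n\n\n# Illustrative sketch (not part of the library): working with the PipelineDefinition built in\n# the class docstring above. `my_success_hook` is a hypothetical HookDefinition.\n#\n#     pipeline_with_hooks = pipeline_def.with_hooks({my_success_hook})\n#     subset = pipeline_def.get_pipeline_subset_def({"return_one"})\n#     subset.solids_to_execute  # -> frozenset({"return_one"})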
\n\n\nclass PipelineSubsetDefinition(PipelineDefinition):\n @property\n def solids_to_execute(self):\n return frozenset(self._graph_def.node_names())\n\n @property\n def solid_selection(self) -> List[str]:\n # we currently don't pass the real solid_selection (the solid query list) down here.\n # so in the short-term, to make the call sites cleaner, we will convert the solids to execute\n # to a list\n return self._graph_def.node_names()\n\n @property\n def parent_pipeline_def(self) -> PipelineDefinition:\n return self._parent_pipeline_def\n\n def get_parent_pipeline_snapshot(self) -> Optional["PipelineSnapshot"]:\n return self._parent_pipeline_def.get_pipeline_snapshot()\n\n @property\n def is_subset_pipeline(self) -> bool:\n return True\n\n def get_pipeline_subset_def(\n self, solids_to_execute: Optional[AbstractSet[str]]\n ) -> "PipelineSubsetDefinition":\n raise DagsterInvariantViolationError("Pipeline subsets may not be subset again.")\n\n\ndef _dep_key_of(solid: Node) -> NodeInvocation:\n return NodeInvocation(\n name=solid.definition.name,\n alias=solid.name,\n tags=solid.tags,\n hook_defs=solid.hook_defs,\n retry_policy=solid.retry_policy,\n )\n\n\ndef _get_pipeline_subset_def(\n pipeline_def: PipelineDefinition,\n solids_to_execute: AbstractSet[str],\n) -> "PipelineSubsetDefinition":\n """\n Build a pipeline which is a subset of another pipeline.\n Only includes the solids which are in solids_to_execute.\n """\n\n check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n check.set_param(solids_to_execute, "solids_to_execute", of_type=str)\n graph = pipeline_def.graph\n for solid_name in solids_to_execute:\n if not graph.has_solid_named(solid_name):\n raise DagsterInvalidSubsetError(\n "{target_type} {pipeline_name} has no {node_type} named {name}.".format(\n target_type=pipeline_def.target_type,\n pipeline_name=pipeline_def.name,\n name=solid_name,\n node_type="ops" if pipeline_def.is_job else "solids",\n ),\n )\n\n # go in topo order to ensure deps dict is ordered\n solids = list(\n filter(lambda solid: solid.name in solids_to_execute, graph.solids_in_topological_order)\n )\n\n deps: Dict[\n Union[str, NodeInvocation],\n Dict[str, IDependencyDefinition],\n ] = {_dep_key_of(solid): {} for solid in solids}\n\n for solid in solids:\n for input_handle in solid.input_handles():\n if graph.dependency_structure.has_direct_dep(input_handle):\n output_handle = pipeline_def.dependency_structure.get_direct_dep(input_handle)\n if output_handle.solid.name in solids_to_execute:\n deps[_dep_key_of(solid)][input_handle.input_def.name] = DependencyDefinition(\n solid=output_handle.solid.name, output=output_handle.output_def.name\n )\n elif graph.dependency_structure.has_dynamic_fan_in_dep(input_handle):\n output_handle = graph.dependency_structure.get_dynamic_fan_in_dep(input_handle)\n if output_handle.solid.name in solids_to_execute:\n deps[_dep_key_of(solid)][\n input_handle.input_def.name\n ] = DynamicCollectDependencyDefinition(\n solid_name=output_handle.solid.name,\n output_name=output_handle.output_def.name,\n )\n elif graph.dependency_structure.has_fan_in_deps(input_handle):\n output_handles = graph.dependency_structure.get_fan_in_deps(input_handle)\n deps[_dep_key_of(solid)][input_handle.input_def.name] = MultiDependencyDefinition(\n [\n DependencyDefinition(\n solid=output_handle.solid.name, output=output_handle.output_def.name\n )\n for output_handle in output_handles\n if output_handle.solid.name in solids_to_execute\n ]\n )\n # else input is unconnected\n\n try:\n 
sub_pipeline_def = PipelineSubsetDefinition(\n name=pipeline_def.name, # should we change the name for subsetted pipeline?\n solid_defs=list({solid.definition for solid in solids}),\n mode_defs=pipeline_def.mode_definitions,\n dependencies=deps,\n _parent_pipeline_def=pipeline_def,\n tags=pipeline_def.tags,\n hook_defs=pipeline_def.hook_defs,\n )\n\n return sub_pipeline_def\n except DagsterInvalidDefinitionError as exc:\n # This handles the case when you construct a subset such that an unsatisfied\n # input cannot be loaded from config. Instead of throwing a DagsterInvalidDefinitionError,\n # we re-raise a DagsterInvalidSubsetError.\n raise DagsterInvalidSubsetError(\n f"The attempted subset {str_format_set(solids_to_execute)} for {pipeline_def.target_type} "\n f"{pipeline_def.name} results in an invalid {pipeline_def.target_type}"\n ) from exc\n\n\ndef _iterate_all_nodes(root_node_dict: Dict[str, Node]) -> Iterator[Node]:\n for node in root_node_dict.values():\n yield node\n if node.is_graph:\n yield from _iterate_all_nodes(node.definition.ensure_graph_def().node_dict)\n\n\ndef _checked_resource_reqs_for_mode(\n top_level_graph_def: GraphDefinition,\n mode_def: ModeDefinition,\n node_defs: List[NodeDefinition],\n dagster_type_dict: Dict[str, DagsterType],\n root_node_dict: Dict[str, Node],\n pipeline_hook_defs: AbstractSet[HookDefinition],\n dependency_structure: DependencyStructure,\n asset_layer: AssetLayer,\n) -> Set[str]:\n """\n Calculate the resource requirements for the pipeline in this mode and ensure they are\n provided by the mode.\n\n We combine these operations in to one traversal to allow for raising excpetions that provide\n as much context as possible about where the unsatisfied resource requirement came from.\n """\n resource_reqs: Set[str] = set()\n mode_output_managers = set(\n key\n for key, resource_def in mode_def.resource_defs.items()\n if isinstance(resource_def, IOutputManagerDefinition)\n )\n mode_resources = set(mode_def.resource_defs.keys())\n for node_def in node_defs:\n for solid_def in node_def.iterate_solid_defs():\n for required_resource in solid_def.required_resource_keys:\n resource_reqs.add(required_resource)\n if required_resource not in mode_resources:\n error_msg = _get_missing_resource_error_msg(\n resource_type="resource",\n resource_key=required_resource,\n descriptor=solid_def.describe_node(),\n mode_def=mode_def,\n resource_defs_of_type=mode_resources,\n )\n raise DagsterInvalidDefinitionError(error_msg)\n\n for output_def in solid_def.output_defs:\n resource_reqs.add(output_def.io_manager_key)\n if output_def.io_manager_key not in mode_resources:\n error_msg = _get_missing_resource_error_msg(\n resource_type="IO manager",\n resource_key=output_def.io_manager_key,\n descriptor=f"output '{output_def.name}' of {solid_def.describe_node()}",\n mode_def=mode_def,\n resource_defs_of_type=mode_output_managers,\n )\n raise DagsterInvalidDefinitionError(error_msg)\n\n resource_reqs.update(\n _checked_type_resource_reqs_for_mode(\n mode_def,\n dagster_type_dict,\n )\n )\n\n # Validate unsatisfied inputs can be materialized from config\n resource_reqs.update(\n _checked_input_resource_reqs_for_mode(\n top_level_graph_def, dependency_structure, root_node_dict, mode_def, asset_layer\n )\n )\n\n for node in _iterate_all_nodes(root_node_dict):\n for hook_def in node.hook_defs:\n for required_resource in hook_def.required_resource_keys:\n resource_reqs.add(required_resource)\n if required_resource not in mode_resources:\n error_msg = 
_get_missing_resource_error_msg(\n resource_type="resource",\n resource_key=required_resource,\n descriptor=f"hook '{hook_def.name}'",\n mode_def=mode_def,\n resource_defs_of_type=mode_resources,\n )\n raise DagsterInvalidDefinitionError(error_msg)\n\n for hook_def in pipeline_hook_defs:\n for required_resource in hook_def.required_resource_keys:\n resource_reqs.add(required_resource)\n if required_resource not in mode_resources:\n error_msg = _get_missing_resource_error_msg(\n resource_type="resource",\n resource_key=required_resource,\n descriptor=f"hook '{hook_def.name}'",\n mode_def=mode_def,\n resource_defs_of_type=mode_resources,\n )\n raise DagsterInvalidDefinitionError(error_msg)\n\n for resource_key, resource in mode_def.resource_defs.items():\n for required_resource in resource.required_resource_keys:\n if required_resource not in mode_resources:\n error_msg = _get_missing_resource_error_msg(\n resource_type="resource",\n resource_key=required_resource,\n descriptor=f"resource at key '{resource_key}'",\n mode_def=mode_def,\n resource_defs_of_type=mode_resources,\n )\n raise DagsterInvalidDefinitionError(error_msg)\n\n # Finally, recursively add any resources that the set of required resources require\n while True:\n new_resources: Set[str] = set()\n for resource_key in resource_reqs:\n resource = mode_def.resource_defs[resource_key]\n new_resources.update(resource.required_resource_keys - resource_reqs)\n\n if not len(new_resources):\n break\n\n resource_reqs.update(new_resources)\n\n return resource_reqs\n\n\ndef _checked_type_resource_reqs_for_mode(\n mode_def: ModeDefinition,\n dagster_type_dict: Dict[str, DagsterType],\n) -> Set[str]:\n """\n Calculate all the resource requirements related to DagsterTypes for this mode and ensure the\n mode provides those resources.\n """\n\n resource_reqs = set()\n mode_resources = set(mode_def.resource_defs.keys())\n for dagster_type in dagster_type_dict.values():\n for required_resource in dagster_type.required_resource_keys:\n resource_reqs.add(required_resource)\n if required_resource not in mode_resources:\n error_msg = _get_missing_resource_error_msg(\n resource_type="resource",\n resource_key=required_resource,\n descriptor=f"type '{dagster_type.display_name}'",\n mode_def=mode_def,\n resource_defs_of_type=mode_resources,\n )\n raise DagsterInvalidDefinitionError(error_msg)\n if dagster_type.loader:\n for required_resource in dagster_type.loader.required_resource_keys():\n resource_reqs.add(required_resource)\n if required_resource not in mode_resources:\n error_msg = _get_missing_resource_error_msg(\n resource_type="resource",\n resource_key=required_resource,\n descriptor=f"the loader on type '{dagster_type.display_name}'",\n mode_def=mode_def,\n resource_defs_of_type=mode_resources,\n )\n raise DagsterInvalidDefinitionError(error_msg)\n if dagster_type.materializer:\n for required_resource in dagster_type.materializer.required_resource_keys():\n resource_reqs.add(required_resource)\n if required_resource not in mode_resources:\n error_msg = _get_missing_resource_error_msg(\n resource_type="resource",\n resource_key=required_resource,\n descriptor=f"the materializer on type '{dagster_type.display_name}'",\n mode_def=mode_def,\n resource_defs_of_type=mode_resources,\n )\n raise DagsterInvalidDefinitionError(error_msg)\n\n return resource_reqs\n\n\ndef _checked_input_resource_reqs_for_mode(\n top_level_graph_def: GraphDefinition,\n dependency_structure: DependencyStructure,\n node_dict: Dict[str, Node],\n mode_def: ModeDefinition,\n 
asset_layer: AssetLayer,\n outer_dependency_structures: Optional[List[DependencyStructure]] = None,\n outer_solids: Optional[List[Node]] = None,\n parent_node_handle: Optional[NodeHandle] = None,\n) -> Set[str]:\n outer_dependency_structures = check.opt_list_param(\n outer_dependency_structures, "outer_dependency_structures", DependencyStructure\n )\n outer_solids = check.opt_list_param(outer_solids, "outer_solids", Node)\n\n resource_reqs = set()\n mode_root_input_managers = set(\n key\n for key, resource_def in mode_def.resource_defs.items()\n if isinstance(resource_def, RootInputManagerDefinition)\n )\n mode_io_managers = set(\n key\n for key, resource_def in mode_def.resource_defs.items()\n if isinstance(resource_def, IOManagerDefinition)\n )\n\n for node in node_dict.values():\n node_handle = NodeHandle(name=node.name, parent=parent_node_handle)\n if node.is_graph:\n graph_def = node.definition.ensure_graph_def()\n # check inner solids\n resource_reqs.update(\n _checked_input_resource_reqs_for_mode(\n top_level_graph_def=top_level_graph_def,\n dependency_structure=graph_def.dependency_structure,\n node_dict=graph_def.node_dict,\n mode_def=mode_def,\n asset_layer=asset_layer,\n outer_dependency_structures=outer_dependency_structures\n + [dependency_structure],\n outer_solids=outer_solids + [node],\n parent_node_handle=node_handle,\n )\n )\n for handle in node.input_handles():\n source_output_handles = None\n if dependency_structure.has_deps(handle):\n # input is connected to outputs from the same dependency structure\n source_output_handles = dependency_structure.get_deps_list(handle)\n else:\n # input is connected to outputs from outer dependency structure, e.g. first solids\n # in a composite\n curr_node = node\n curr_handle = handle\n curr_index = len(outer_solids) - 1\n\n # Checks to see if input is mapped to an outer dependency structure\n while curr_index >= 0 and curr_node.container_maps_input(curr_handle.input_name):\n curr_handle = SolidInputHandle(\n solid=outer_solids[curr_index],\n input_def=curr_node.container_mapped_input(\n curr_handle.input_name\n ).definition,\n )\n\n if outer_dependency_structures[curr_index].has_deps(curr_handle):\n source_output_handles = outer_dependency_structures[\n curr_index\n ].get_deps_list(curr_handle)\n break\n\n curr_node = outer_solids[curr_index]\n curr_index -= 1\n\n if source_output_handles:\n # input is connected to source output handles within the graph\n for source_output_handle in source_output_handles:\n output_manager_key = source_output_handle.output_def.io_manager_key\n output_manager_def = mode_def.resource_defs[output_manager_key]\n if not isinstance(output_manager_def, IInputManagerDefinition):\n raise DagsterInvalidDefinitionError(\n f'Input "{handle.input_def.name}" of {node.describe_node()} is '\n f'connected to output "{source_output_handle.output_def.name}" '\n f"of {source_output_handle.solid.describe_node()}. That output does not "\n "have an output "\n f"manager that knows how to load inputs, so we don't know how "\n f"to load the input. 
To address this, assign an IOManager to "\n f"the upstream output."\n )\n else:\n # input is not connected to upstream output\n input_def = handle.input_def\n input_asset_key = asset_layer.asset_key_for_input(node_handle, input_def.name)\n\n # Input is not nothing, not resolvable by config, and isn't\n # mapped from a top-level output.\n if not _is_input_resolvable(\n top_level_graph_def, input_def, node, outer_solids, input_asset_key\n ):\n raise DagsterInvalidDefinitionError(\n f"Input '{input_def.name}' of {node.describe_node()} "\n "has no upstream output, no default value, and no "\n "dagster type loader. Must provide a value to this "\n "input via either a direct input value mapped from the "\n "top-level graph, or a root input manager key. To "\n "learn more, see the docs for unconnected inputs: "\n "https://docs.dagster.io/concepts/io-management/unconnected-inputs#unconnected-inputs."\n )\n\n # If a root manager is provided, it's always used. I.e. it has priority over\n # the other ways of loading unsatisfied inputs - dagster type loaders and\n # default values.\n if input_def.root_manager_key:\n resource_reqs.add(input_def.root_manager_key)\n if input_def.root_manager_key not in mode_def.resource_defs:\n error_msg = _get_missing_resource_error_msg(\n resource_type="root input manager",\n resource_key=input_def.root_manager_key,\n descriptor=f"unsatisfied input '{input_def.name}' of {node.describe_node()}",\n mode_def=mode_def,\n resource_defs_of_type=mode_root_input_managers,\n )\n raise DagsterInvalidDefinitionError(error_msg)\n # If a root manager is not provided, but the input is associated with an asset, we\n # can load the input using the io_manager for that asset\n elif input_asset_key:\n io_manager_key = asset_layer.io_manager_key_for_asset(input_asset_key)\n resource_reqs.add(io_manager_key)\n if io_manager_key not in mode_def.resource_defs:\n error_msg = _get_missing_resource_error_msg(\n resource_type="io_manager",\n resource_key=io_manager_key,\n descriptor=f"unsatisfied input '{input_def.name}' of {node.describe_node()} with asset key {input_asset_key}",\n mode_def=mode_def,\n resource_defs_of_type=mode_io_managers,\n )\n raise DagsterInvalidDefinitionError(error_msg)\n\n return resource_reqs\n\n\ndef _is_input_resolvable(graph_def, input_def, node, upstream_nodes, input_asset_key):\n # If input is not loadable via config, check if loadable via top-level input (meaning it is mapped all the way up the graph composition).\n if (\n not input_def.dagster_type.loader\n and not input_def.dagster_type.kind == DagsterTypeKind.NOTHING\n and not input_def.root_manager_key\n and not input_def.has_default_value\n and not input_asset_key\n ):\n return _is_input_resolved_from_top_level(graph_def, input_def, node, upstream_nodes)\n else:\n return True\n\n\ndef _is_input_resolved_from_top_level(graph_def, input_def, node, upstream_nodes):\n from dagster.core.definitions.input import InputPointer\n\n input_name = input_def.name\n node_name = node.name\n\n # Upstream nodes are in order of composition, with the top-level graph\n # being first.\n upstream_nodes = upstream_nodes[::-1]\n for upstream_node in upstream_nodes:\n input_mapping = upstream_node.definition.input_mapping_for_pointer(\n InputPointer(solid_name=node_name, input_name=input_name)\n )\n if not input_mapping:\n return False\n else:\n input_name = input_mapping.definition.name\n node_name = upstream_node.name\n\n top_level_mapping = graph_def.input_mapping_for_pointer(\n InputPointer(solid_name=node_name, 
input_name=input_name)\n )\n return bool(top_level_mapping)\n\n\ndef _get_missing_resource_error_msg(\n resource_type, resource_key, descriptor, mode_def, resource_defs_of_type\n):\n if mode_def.name == "default":\n return (\n f"{resource_type} key '{resource_key}' is required by "\n f"{descriptor}, but is not provided. Provide a {resource_type} for key '{resource_key}', "\n f"or change '{resource_key}' to one of the provided {resource_type} keys: "\n f"{sorted(resource_defs_of_type)}."\n )\n else:\n return (\n f"{resource_type} key '{resource_key}' is required by "\n f"{descriptor}, but is not provided by mode '{mode_def.name}'. "\n f"In mode '{mode_def.name}', provide a {resource_type} for key '{resource_key}', "\n f"or change '{resource_key}' to one of the provided root input managers keys: {sorted(resource_defs_of_type)}."\n )\n\n\ndef _build_all_node_defs(node_defs: List[NodeDefinition]) -> Dict[str, NodeDefinition]:\n all_defs: Dict[str, NodeDefinition] = {}\n for current_level_node_def in node_defs:\n for node_def in current_level_node_def.iterate_node_defs():\n if node_def.name in all_defs:\n if all_defs[node_def.name] != node_def:\n raise DagsterInvalidDefinitionError(\n 'Detected conflicting node definitions with the same name "{name}"'.format(\n name=node_def.name\n )\n )\n else:\n all_defs[node_def.name] = node_def\n\n return all_defs\n\n\ndef _create_run_config_schema(\n pipeline_def: PipelineDefinition,\n mode_definition: ModeDefinition,\n required_resources: Set[str],\n) -> "RunConfigSchema":\n from .job_definition import get_direct_input_values_from_job\n from .run_config import (\n RunConfigSchemaCreationData,\n construct_config_type_dictionary,\n define_run_config_schema_type,\n )\n from .run_config_schema import RunConfigSchema\n\n # When executing with a subset pipeline, include the missing solids\n # from the original pipeline as ignored to allow execution with\n # run config that is valid for the original\n if isinstance(pipeline_def.graph, SubselectedGraphDefinition):\n ignored_solids = pipeline_def.graph.get_top_level_omitted_nodes()\n elif pipeline_def.is_subset_pipeline:\n if pipeline_def.parent_pipeline_def is None:\n check.failed("Unexpected subset pipeline state")\n\n ignored_solids = [\n solid\n for solid in pipeline_def.parent_pipeline_def.graph.solids\n if not pipeline_def.has_solid_named(solid.name)\n ]\n else:\n ignored_solids = []\n\n run_config_schema_type = define_run_config_schema_type(\n RunConfigSchemaCreationData(\n pipeline_name=pipeline_def.name,\n solids=pipeline_def.graph.solids,\n graph_def=pipeline_def.graph,\n dependency_structure=pipeline_def.graph.dependency_structure,\n mode_definition=mode_definition,\n logger_defs=mode_definition.loggers,\n ignored_solids=ignored_solids,\n required_resources=required_resources,\n is_using_graph_job_op_apis=pipeline_def.is_job,\n direct_inputs=get_direct_input_values_from_job(pipeline_def),\n asset_layer=pipeline_def.asset_layer,\n )\n )\n\n if mode_definition.config_mapping:\n outer_config_type = mode_definition.config_mapping.config_schema.config_type\n else:\n outer_config_type = run_config_schema_type\n\n if outer_config_type is None:\n check.failed("Unexpected outer_config_type value of None")\n\n config_type_dict_by_name, config_type_dict_by_key = construct_config_type_dictionary(\n pipeline_def.all_node_defs,\n outer_config_type,\n )\n\n return RunConfigSchema(\n run_config_schema_type=run_config_schema_type,\n config_type_dict_by_name=config_type_dict_by_name,\n 
config_type_dict_by_key=config_type_dict_by_key,\n config_mapping=mode_definition.config_mapping,\n )\n
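# --- Editor's note: the checks above report missing root input managers and IO managers
# for unsatisfied inputs. The sketch below is illustrative only (not part of this module)
# and assumes the public @root_input_manager decorator and the root_manager_key argument
# to InputDefinition from this Dagster release; all other names are hypothetical.
from dagster import InputDefinition, ModeDefinition, pipeline, root_input_manager, solid


@root_input_manager
def numbers_loader(_context):
    # Stands in for loading the otherwise-unconnected input from config, a file, etc.
    return [1, 2, 3]


@solid(input_defs=[InputDefinition("numbers", root_manager_key="numbers_loader")])
def total(_context, numbers):
    return sum(numbers)


# Binding "numbers_loader" in resource_defs satisfies the requirement collected by
# _checked_input_resource_reqs_for_mode; omitting it raises the
# DagsterInvalidDefinitionError built by _get_missing_resource_error_msg above.
@pipeline(mode_defs=[ModeDefinition(resource_defs={"numbers_loader": numbers_loader})])
def total_pipeline():
    total()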
", "current_page_name": "_modules/dagster/core/definitions/pipeline_definition", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.pipeline_definition"}, "policy": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.policy

\nfrom enum import Enum\nfrom random import random\nfrom typing import NamedTuple, Optional\n\nimport dagster._check as check\nfrom dagster.core.errors import DagsterInvalidDefinitionError\n\n\n
[docs]class Backoff(Enum):\n """\n A modifier for delay as a function of attempt number.\n\n LINEAR: `attempt_num * delay`\n EXPONENTIAL: `((2 ^ attempt_num) - 1) * delay`\n """\n\n LINEAR = "LINEAR"\n EXPONENTIAL = "EXPONENTIAL"
\n\n\n
[docs]class Jitter(Enum):\n """A randomizing modifier for delay, applied after backoff calculation.\n\n FULL: between 0 and the calculated delay based on backoff: `random() * backoff_delay`\n PLUS_MINUS: +/- the delay: `backoff_delay + ((2 * (random() * delay)) - delay)`\n """\n\n FULL = "FULL"\n PLUS_MINUS = "PLUS_MINUS"
\n\n\n
[docs]class RetryPolicy(\n NamedTuple(\n "_RetryPolicy",\n [\n ("max_retries", int),\n ("delay", Optional[check.Numeric]),\n # declarative time modulation to allow calc without running user function\n ("backoff", Optional[Backoff]),\n ("jitter", Optional[Jitter]),\n ],\n ),\n):\n """\n A declarative policy for when to request retries when an exception occurs during op execution.\n\n Args:\n max_retries (int):\n The maximum number of retries to attempt. Defaults to 1.\n delay (Optional[Union[int,float]]):\n The time in seconds to wait between the retry being requested and the next attempt\n being started. This unit of time can be modulated as a function of attempt number\n with backoff and randomly with jitter.\n backoff (Optional[Backoff]):\n A modifier for delay as a function of retry attempt number.\n jitter (Optional[Jitter]):\n A randomizing modifier for delay, applied after backoff calculation.\n """\n\n def __new__(\n cls,\n max_retries: int = 1,\n delay: Optional[check.Numeric] = None,\n backoff: Optional[Backoff] = None,\n jitter: Optional[Jitter] = None,\n ):\n if backoff is not None and delay is None:\n raise DagsterInvalidDefinitionError(\n "Can not set backoff on RetryPolicy without also setting delay"\n )\n\n if jitter is not None and delay is None:\n raise DagsterInvalidDefinitionError(\n "Can not set jitter on RetryPolicy without also setting delay"\n )\n\n return super().__new__(\n cls,\n max_retries=check.int_param(max_retries, "max_retries"),\n delay=check.opt_numeric_param(delay, "delay"),\n backoff=check.opt_inst_param(backoff, "backoff", Backoff),\n jitter=check.opt_inst_param(jitter, "jitter", Jitter),\n )\n\n def calculate_delay(self, attempt_num: int) -> check.Numeric:\n return calculate_delay(\n attempt_num=attempt_num,\n backoff=self.backoff,\n jitter=self.jitter,\n base_delay=self.delay or 0,\n )
\n\n\ndef calculate_delay(attempt_num, backoff, jitter, base_delay):\n if backoff is Backoff.EXPONENTIAL:\n calc_delay = ((2**attempt_num) - 1) * base_delay\n elif backoff is Backoff.LINEAR:\n calc_delay = base_delay * attempt_num\n elif backoff is None:\n calc_delay = base_delay\n else:\n check.assert_never(backoff)\n\n if jitter is Jitter.FULL:\n calc_delay = random() * calc_delay\n elif jitter is Jitter.PLUS_MINUS:\n calc_delay = calc_delay + ((2 * (random() * base_delay)) - base_delay)\n elif jitter is None:\n pass\n else:\n check.assert_never(jitter)\n\n return calc_delay\n
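# --- Editor's note: an illustrative usage sketch for the definitions above; it is not
# part of this module and relies only on names defined in this file.
example_policy = RetryPolicy(
    max_retries=3,
    delay=0.2,
    backoff=Backoff.EXPONENTIAL,
    jitter=Jitter.FULL,
)

# With EXPONENTIAL backoff the base delay for attempt 2 is ((2 ** 2) - 1) * 0.2 = 0.6s;
# FULL jitter then scales it by random(), so the result falls in [0.0, 0.6).
example_delay = example_policy.calculate_delay(attempt_num=2)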
", "current_page_name": "_modules/dagster/core/definitions/policy", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.policy"}, "preset": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.preset

\nfrom typing import Dict, List, NamedTuple, Optional\n\nimport pkg_resources\nimport yaml\n\nimport dagster._check as check\nfrom dagster.core.definitions.utils import config_from_files, config_from_yaml_strings\nfrom dagster.core.errors import DagsterInvariantViolationError\nfrom dagster.utils.merger import deep_merge_dicts\n\nfrom .mode import DEFAULT_MODE_NAME\nfrom .utils import check_valid_name\n\n\n
[docs]class PresetDefinition(\n NamedTuple(\n "_PresetDefinition",\n [\n ("name", str),\n ("run_config", Optional[Dict[str, object]]),\n ("solid_selection", Optional[List[str]]),\n ("mode", str),\n ("tags", Dict[str, str]),\n ],\n )\n):\n """Defines a preset configuration in which a pipeline can execute.\n\n Presets can be used in Dagit to load predefined configurations into the tool.\n\n Presets may also be used from the Python API (in a script, or in test) as follows:\n\n .. code-block:: python\n\n execute_pipeline(pipeline_def, preset='example_preset')\n\n Presets may also be used with the command line tools:\n\n .. code-block:: shell\n\n $ dagster pipeline execute example_pipeline --preset example_preset\n\n Args:\n name (str): The name of this preset. Must be unique in the presets defined on a given\n pipeline.\n run_config (Optional[dict]): A dict representing the config to set with the preset.\n This is equivalent to the ``run_config`` argument to :py:func:`execute_pipeline`.\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute with the preset. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The mode to apply when executing this preset. (default: 'default')\n tags (Optional[Dict[str, Any]]): The tags to apply when executing this preset.\n """\n\n def __new__(\n cls,\n name: str,\n run_config: Optional[Dict[str, object]] = None,\n solid_selection: Optional[List[str]] = None,\n mode: Optional[str] = None,\n tags: Optional[Dict[str, object]] = None,\n ):\n\n return super(PresetDefinition, cls).__new__(\n cls,\n name=check_valid_name(name),\n run_config=run_config,\n solid_selection=check.opt_nullable_list_param(\n solid_selection, "solid_selection", of_type=str\n ),\n mode=check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME),\n tags=check.opt_dict_param(tags, "tags", key_type=str),\n )\n\n
[docs] @staticmethod\n def from_files(name, config_files=None, solid_selection=None, mode=None, tags=None):\n """Static constructor for presets from YAML files.\n\n Args:\n name (str): The name of this preset. Must be unique in the presets defined on a given\n pipeline.\n config_files (Optional[List[str]]): List of paths or glob patterns for yaml files\n to load and parse as the run config for this preset.\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute with the preset. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The mode to apply when executing this preset. (default:\n 'default')\n tags (Optional[Dict[str, Any]]): The tags to apply when executing this preset.\n\n Returns:\n PresetDefinition: A PresetDefinition constructed from the provided YAML files.\n\n Raises:\n DagsterInvariantViolationError: When one of the YAML files is invalid and has a parse\n error.\n """\n check.str_param(name, "name")\n config_files = check.opt_list_param(config_files, "config_files")\n solid_selection = check.opt_nullable_list_param(\n solid_selection, "solid_selection", of_type=str\n )\n mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)\n\n merged = config_from_files(config_files)\n\n return PresetDefinition(name, merged, solid_selection, mode, tags)
\n\n
[docs] @staticmethod\n def from_yaml_strings(name, yaml_strings=None, solid_selection=None, mode=None, tags=None):\n """Static constructor for presets from YAML strings.\n\n Args:\n name (str): The name of this preset. Must be unique in the presets defined on a given\n pipeline.\n yaml_strings (Optional[List[str]]): List of yaml strings to parse as the environment\n config for this preset.\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute with the preset. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The mode to apply when executing this preset. (default:\n 'default')\n tags (Optional[Dict[str, Any]]): The tags to apply when executing this preset.\n\n Returns:\n PresetDefinition: A PresetDefinition constructed from the provided YAML strings\n\n Raises:\n DagsterInvariantViolationError: When one of the YAML documents is invalid and has a\n parse error.\n """\n check.str_param(name, "name")\n yaml_strings = check.opt_list_param(yaml_strings, "yaml_strings", of_type=str)\n solid_selection = check.opt_nullable_list_param(\n solid_selection, "solid_selection", of_type=str\n )\n mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)\n\n merged = config_from_yaml_strings(yaml_strings)\n\n return PresetDefinition(name, merged, solid_selection, mode, tags)
\n\n
[docs] @staticmethod\n def from_pkg_resources(\n name, pkg_resource_defs=None, solid_selection=None, mode=None, tags=None\n ):\n """Load a preset from a package resource, using :py:func:`pkg_resources.resource_string`.\n\n Example:\n\n .. code-block:: python\n\n PresetDefinition.from_pkg_resources(\n name='local',\n mode='local',\n pkg_resource_defs=[\n ('dagster_examples.airline_demo.environments', 'local_base.yaml'),\n ('dagster_examples.airline_demo.environments', 'local_warehouse.yaml'),\n ],\n )\n\n\n Args:\n name (str): The name of this preset. Must be unique in the presets defined on a given\n pipeline.\n pkg_resource_defs (Optional[List[(str, str)]]): List of pkg_resource modules/files to\n load as the run config for this preset.\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute with this partition. e.g.\n ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): The mode to apply when executing this preset. (default:\n 'default')\n tags (Optional[Dict[str, Any]]): The tags to apply when executing this preset.\n\n Returns:\n PresetDefinition: A PresetDefinition constructed from the provided YAML strings\n\n Raises:\n DagsterInvariantViolationError: When one of the YAML documents is invalid and has a\n parse error.\n """\n pkg_resource_defs = check.opt_list_param(\n pkg_resource_defs, "pkg_resource_defs", of_type=tuple\n )\n\n try:\n yaml_strings = [\n pkg_resources.resource_string(*pkg_resource_def).decode("utf-8")\n for pkg_resource_def in pkg_resource_defs\n ]\n except (ModuleNotFoundError, FileNotFoundError, UnicodeDecodeError) as err:\n raise DagsterInvariantViolationError(\n "Encountered error attempting to parse yaml. Loading YAMLs from "\n f"package resources {pkg_resource_defs} "\n f'on preset "{name}".'\n ) from err\n\n return PresetDefinition.from_yaml_strings(name, yaml_strings, solid_selection, mode, tags)
\n\n
[docs] def get_environment_yaml(self):\n """Get the environment dict set on a preset as YAML.\n\n Returns:\n str: The environment dict as YAML.\n """\n return yaml.dump(self.run_config or {}, default_flow_style=False)
\n\n
[docs] def with_additional_config(self, run_config):\n """Return a new PresetDefinition with additional config merged into the existing config."""\n\n check.opt_nullable_dict_param(run_config, "run_config")\n if run_config is None:\n return self\n else:\n initial_config = self.run_config or {}\n return PresetDefinition(\n name=self.name,\n solid_selection=self.solid_selection,\n mode=self.mode,\n tags=self.tags,\n run_config=deep_merge_dicts(initial_config, run_config),\n )
\n
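# --- Editor's note: an illustrative sketch of building and extending a preset; it is not
# part of this module, and the run-config keys shown are hypothetical examples.
dev_preset = PresetDefinition.from_yaml_strings(
    "dev",
    yaml_strings=["solids:\n  my_solid:\n    config:\n      verbose: true\n"],
)

# with_additional_config deep-merges the new dict into the existing run_config and
# returns a new PresetDefinition; the original preset is left unchanged.
verbose_dev_preset = dev_preset.with_additional_config(
    {"loggers": {"console": {"config": {"log_level": "DEBUG"}}}}
)
print(verbose_dev_preset.get_environment_yaml())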
", "current_page_name": "_modules/dagster/core/definitions/preset", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.preset"}, "reconstruct": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.reconstruct

\nimport inspect\nimport os\nimport sys\nfrom functools import lru_cache\nfrom typing import TYPE_CHECKING, Any, Dict, FrozenSet, List, NamedTuple, Optional, Union, overload\n\nimport dagster._check as check\nimport dagster.seven as seven\nfrom dagster.core.code_pointer import (\n    CodePointer,\n    CustomPointer,\n    FileCodePointer,\n    ModuleCodePointer,\n    get_python_file_from_target,\n)\nfrom dagster.core.definitions.events import AssetKey\nfrom dagster.core.errors import DagsterInvariantViolationError\nfrom dagster.core.origin import (\n    DEFAULT_DAGSTER_ENTRY_POINT,\n    PipelinePythonOrigin,\n    RepositoryPythonOrigin,\n)\nfrom dagster.core.selector import parse_solid_selection\nfrom dagster.serdes import pack_value, unpack_value, whitelist_for_serdes\nfrom dagster.utils import frozenlist, make_readonly_value\nfrom dagster.utils.backcompat import experimental\n\nfrom .pipeline_base import IPipeline\n\nif TYPE_CHECKING:\n    from dagster.core.asset_defs.asset_group import AssetGroup\n\n    from .graph_definition import GraphDefinition\n    from .pipeline_definition import PipelineDefinition\n    from .repository_definition import RepositoryDefinition\n\n\ndef get_ephemeral_repository_name(pipeline_name: str) -> str:\n    check.str_param(pipeline_name, "pipeline_name")\n    return "__repository__{pipeline_name}".format(pipeline_name=pipeline_name)\n\n\n@whitelist_for_serdes\nclass ReconstructableRepository(\n    NamedTuple(\n        "_ReconstructableRepository",\n        [\n            ("pointer", CodePointer),\n            ("container_image", Optional[str]),\n            ("executable_path", Optional[str]),\n            ("entry_point", List[str]),\n            ("container_context", Optional[Dict[str, Any]]),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        pointer,\n        container_image=None,\n        executable_path=None,\n        entry_point=None,\n        container_context=None,\n    ):\n        return super(ReconstructableRepository, cls).__new__(\n            cls,\n            pointer=check.inst_param(pointer, "pointer", CodePointer),\n            container_image=check.opt_str_param(container_image, "container_image"),\n            executable_path=check.opt_str_param(executable_path, "executable_path"),\n            entry_point=(\n                frozenlist(check.list_param(entry_point, "entry_point", of_type=str))\n                if entry_point != None\n                else DEFAULT_DAGSTER_ENTRY_POINT\n            ),\n            container_context=(\n                make_readonly_value(check.opt_dict_param(container_context, "container_context"))\n                if container_context != None\n                else None\n            ),\n        )\n\n    @lru_cache(maxsize=1)\n    def get_definition(self):\n        return repository_def_from_pointer(self.pointer)\n\n    def get_reconstructable_pipeline(self, name):\n        return ReconstructablePipeline(self, name)\n\n    @classmethod\n    def for_file(\n        cls, file, fn_name, working_directory=None, container_image=None, container_context=None\n    ):\n        if not working_directory:\n            working_directory = os.getcwd()\n        return cls(\n            FileCodePointer(file, fn_name, working_directory),\n            container_image=container_image,\n            container_context=container_context,\n        )\n\n    @classmethod\n    def for_module(\n        cls, module, fn_name, working_directory=None, container_image=None, container_context=None\n    ):\n        return cls(\n           
 ModuleCodePointer(module, fn_name, working_directory),\n            container_image=container_image,\n            container_context=container_context,\n        )\n\n    def get_python_origin(self):\n        return RepositoryPythonOrigin(\n            executable_path=self.executable_path if self.executable_path else sys.executable,\n            code_pointer=self.pointer,\n            container_image=self.container_image,\n            entry_point=self.entry_point,\n            container_context=self.container_context,\n        )\n\n    def get_python_origin_id(self):\n        return self.get_python_origin().get_id()\n\n\n
[docs]@whitelist_for_serdes\nclass ReconstructablePipeline(\n NamedTuple(\n "_ReconstructablePipeline",\n [\n ("repository", ReconstructableRepository),\n ("pipeline_name", str),\n ("solid_selection_str", Optional[str]),\n ("solids_to_execute", Optional[FrozenSet[str]]),\n ("asset_selection", Optional[FrozenSet[AssetKey]]),\n ],\n ),\n IPipeline,\n):\n """Defines a reconstructable pipeline. When your pipeline/job must cross process boundaries,\n Dagster must know how to reconstruct the pipeline/job on the other side of the process boundary.\n\n Args:\n repository (ReconstructableRepository): The reconstructable representation of the repository\n the pipeline/job belongs to.\n pipeline_name (str): The name of the pipeline/job.\n solid_selection_str (Optional[str]): The string value of a comma separated list of user-input\n solid/op selection. None if no selection is specified, i.e. the entire pipeline/job will\n be run.\n solids_to_execute (Optional[FrozenSet[str]]): A set of solid/op names to execute. None if no selection\n is specified, i.e. the entire pipeline/job will be run.\n asset_selection (Optional[FrozenSet[AssetKey]]) A set of assets to execute. None if no selection\n is specified, i.e. the entire job will be run.\n """\n\n def __new__(\n cls,\n repository,\n pipeline_name,\n solid_selection_str=None,\n solids_to_execute=None,\n asset_selection=None,\n ):\n check.opt_set_param(solids_to_execute, "solids_to_execute", of_type=str)\n check.opt_set_param(asset_selection, "asset_selection", AssetKey)\n return super(ReconstructablePipeline, cls).__new__(\n cls,\n repository=check.inst_param(repository, "repository", ReconstructableRepository),\n pipeline_name=check.str_param(pipeline_name, "pipeline_name"),\n solid_selection_str=check.opt_str_param(solid_selection_str, "solid_selection_str"),\n solids_to_execute=solids_to_execute,\n asset_selection=asset_selection,\n )\n\n @property\n def solid_selection(self) -> Optional[List[str]]:\n return seven.json.loads(self.solid_selection_str) if self.solid_selection_str else None\n\n @lru_cache(maxsize=1)\n def get_definition(self):\n from dagster.core.definitions.job_definition import JobDefinition\n\n defn = self.repository.get_definition().get_pipeline(self.pipeline_name)\n\n if isinstance(defn, JobDefinition):\n return (\n self.repository.get_definition()\n .get_pipeline(self.pipeline_name)\n .get_job_def_for_subset_selection(self.solid_selection, self.asset_selection)\n )\n\n check.invariant(\n self.asset_selection == None, "Asset selection cannot be provided with a pipeline"\n )\n return (\n self.repository.get_definition().get_pipeline(self.pipeline_name)\n # pipelines use post-resolved selection\n .get_pipeline_subset_def(self.solids_to_execute)\n )\n\n def get_reconstructable_repository(self):\n return self.repository\n\n def _subset_for_execution(\n self,\n solids_to_execute: Optional[Optional[FrozenSet[str]]],\n solid_selection: Optional[List[str]],\n asset_selection: Optional[FrozenSet[AssetKey]],\n ) -> "ReconstructablePipeline":\n # no selection\n if solid_selection is None and solids_to_execute is None and asset_selection is None:\n return ReconstructablePipeline(\n repository=self.repository,\n pipeline_name=self.pipeline_name,\n )\n\n from dagster.core.definitions import JobDefinition, PipelineDefinition\n\n pipeline_def = self.get_definition()\n if isinstance(pipeline_def, JobDefinition):\n # jobs use pre-resolved selection\n # when subselecting a job\n # * job subselection depend on solid_selection rather than 
solids_to_execute\n # * we'll resolve the op selection later in the stack\n if solid_selection is None:\n # when the pre-resolution info is unavailable (e.g. subset from existing run),\n # we need to fill the solid_selection in order to pass the value down to deeper stack.\n solid_selection = list(solids_to_execute) if solids_to_execute else None\n return ReconstructablePipeline(\n repository=self.repository,\n pipeline_name=self.pipeline_name,\n solid_selection_str=seven.json.dumps(solid_selection) if solid_selection else None,\n solids_to_execute=None,\n asset_selection=asset_selection,\n )\n elif isinstance(pipeline_def, PipelineDefinition):\n # when subselecting a pipeline\n # * pipeline subselection depend on solids_to_excute rather than solid_selection\n # * we resolve a list of solid selection queries to a frozenset of qualified solid names\n # e.g. ['foo_solid+'] to {'foo_solid', 'bar_solid'}\n if solid_selection and solids_to_execute is None:\n # when post-resolution query is unavailable, resolve the query\n solids_to_execute = parse_solid_selection(pipeline_def, solid_selection)\n return ReconstructablePipeline(\n repository=self.repository,\n pipeline_name=self.pipeline_name,\n solid_selection_str=seven.json.dumps(solid_selection) if solid_selection else None,\n solids_to_execute=frozenset(solids_to_execute) if solids_to_execute else None,\n )\n else:\n raise Exception(f"Unexpected pipeline/job type {pipeline_def.__class__.__name__}")\n\n def subset_for_execution(\n self,\n solid_selection: Optional[List[str]] = None,\n asset_selection: Optional[FrozenSet[AssetKey]] = None,\n ) -> "ReconstructablePipeline":\n # take a list of unresolved selection queries\n check.opt_list_param(solid_selection, "solid_selection", of_type=str)\n check.opt_set_param(asset_selection, "asset_selection", of_type=AssetKey)\n\n check.invariant(\n not (solid_selection and asset_selection),\n "solid_selection and asset_selection cannot both be provided as arguments",\n )\n\n return self._subset_for_execution(\n solids_to_execute=None, solid_selection=solid_selection, asset_selection=asset_selection\n )\n\n def subset_for_execution_from_existing_pipeline(\n self,\n solids_to_execute: Optional[FrozenSet[str]] = None,\n asset_selection: Optional[FrozenSet[AssetKey]] = None,\n ) -> "ReconstructablePipeline":\n # take a frozenset of resolved solid names from an existing pipeline\n # so there's no need to parse the selection\n\n check.invariant(\n not (solids_to_execute and asset_selection),\n "solids_to_execute and asset_selection cannot both be provided as arguments",\n )\n\n check.opt_set_param(solids_to_execute, "solids_to_execute", of_type=str)\n check.opt_set_param(asset_selection, "asset_selection", of_type=AssetKey)\n\n return self._subset_for_execution(\n solids_to_execute=solids_to_execute,\n solid_selection=None,\n asset_selection=asset_selection,\n )\n\n def describe(self):\n return '"{name}" in repository ({repo})'.format(\n repo=self.repository.pointer.describe, name=self.pipeline_name\n )\n\n @staticmethod\n def for_file(python_file, fn_name):\n return bootstrap_standalone_recon_pipeline(\n FileCodePointer(python_file, fn_name, os.getcwd())\n )\n\n @staticmethod\n def for_module(module, fn_name):\n return bootstrap_standalone_recon_pipeline(ModuleCodePointer(module, fn_name, os.getcwd()))\n\n def to_dict(self):\n return pack_value(self)\n\n @staticmethod\n def from_dict(val):\n check.dict_param(val, "val")\n\n inst = unpack_value(val)\n check.invariant(\n isinstance(inst, 
ReconstructablePipeline),\n "Deserialized object is not an instance of ReconstructablePipeline, got {type}".format(\n type=type(inst)\n ),\n )\n return inst\n\n def get_python_origin(self):\n return PipelinePythonOrigin(self.pipeline_name, self.repository.get_python_origin())\n\n def get_python_origin_id(self):\n return self.get_python_origin().get_id()\n\n
[docs] def get_module(self) -> Optional[str]:\n """Return the module the pipeline is found in, if the origin is a module code pointer."""\n pointer = self.get_python_origin().get_repo_pointer()\n if isinstance(pointer, ModuleCodePointer):\n return pointer.module\n\n return None
\n\n\n
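# --- Editor's note: a small illustrative sketch of for_module/get_module above; it is
# not part of this module. "my_package.defs" and "define_my_job" are hypothetical, and
# for_module resolves the code pointer eagerly, so that module must be importable.
recon_job = ReconstructablePipeline.for_module("my_package.defs", "define_my_job")

# get_module() returns the module name because the origin is a ModuleCodePointer;
# for a pipeline built with for_file it returns None instead.
assert recon_job.get_module() == "my_package.defs"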
[docs]def reconstructable(target):\n """\n Create a :py:class:`~dagster.core.definitions.reconstructable.ReconstructablePipeline` from a\n function that returns a :py:class:`~dagster.PipelineDefinition`/:py:class:`~dagster.JobDefinition`,\n or a function decorated with :py:func:`@pipeline <dagster.pipeline>`/:py:func:`@job <dagster.job>`.\n\n When your pipeline/job must cross process boundaries, e.g., for execution on multiple nodes or\n in different systems (like ``dagstermill``), Dagster must know how to reconstruct the pipeline/job\n on the other side of the process boundary.\n\n Passing a job created with ``~dagster.GraphDefinition.to_job`` to ``reconstructable()``,\n requires you to wrap that job's definition in a module-scoped function, and pass that function\n instead:\n\n .. code-block:: python\n\n from dagster import graph, reconstructable\n\n @graph\n def my_graph():\n ...\n\n def define_my_job():\n return my_graph.to_job()\n\n reconstructable(define_my_job)\n\n This function implements a very conservative strategy for reconstruction, so that its behavior\n is easy to predict, but as a consequence it is not able to reconstruct certain kinds of pipelines\n or jobs, such as those defined by lambdas, in nested scopes (e.g., dynamically within a method\n call), or in interactive environments such as the Python REPL or Jupyter notebooks.\n\n If you need to reconstruct objects constructed in these ways, you should use\n :py:func:`~dagster.reconstructable.build_reconstructable_job` instead, which allows you to\n specify your own reconstruction strategy.\n\n Examples:\n\n .. code-block:: python\n\n from dagster import job, reconstructable\n\n @job\n def foo_job():\n ...\n\n reconstructable_foo_job = reconstructable(foo_job)\n\n\n @graph\n def foo():\n ...\n\n def make_bar_job():\n return foo.to_job()\n\n reconstructable_bar_job = reconstructable(make_bar_job)\n """\n from dagster.core.definitions import JobDefinition, PipelineDefinition\n\n if not seven.is_function_or_decorator_instance_of(target, PipelineDefinition):\n if isinstance(target, JobDefinition):\n raise DagsterInvariantViolationError(\n "Reconstructable target was not a function returning a job definition, or a job "\n "definition produced by a decorated function. If your job was constructed using "\n "``GraphDefinition.to_job``, you must wrap the ``to_job`` call in a function at "\n "module scope, ie not within any other functions. "\n "To learn more, check out the docs on ``reconstructable``: "\n "https://docs.dagster.io/_apidocs/execution#dagster.reconstructable"\n )\n raise DagsterInvariantViolationError(\n "Reconstructable target should be a function or definition produced "\n "by a decorated function, got {type}.".format(type=type(target)),\n )\n\n if seven.is_lambda(target):\n raise DagsterInvariantViolationError(\n "Reconstructable target can not be a lambda. Use a function or "\n "decorated function defined at module scope instead, or use "\n "build_reconstructable_job."\n )\n\n if seven.qualname_differs(target):\n raise DagsterInvariantViolationError(\n 'Reconstructable target "{target.__name__}" has a different '\n '__qualname__ "{target.__qualname__}" indicating it is not '\n "defined at module scope. 
Use a function or decorated function "\n "defined at module scope instead, or use build_reconstructable_job.".format(\n target=target\n )\n )\n\n try:\n if (\n hasattr(target, "__module__")\n and hasattr(target, "__name__")\n and inspect.getmodule(target).__name__ != "__main__"\n ):\n return ReconstructablePipeline.for_module(target.__module__, target.__name__)\n except:\n pass\n\n python_file = get_python_file_from_target(target)\n if not python_file:\n raise DagsterInvariantViolationError(\n "reconstructable() can not reconstruct jobs or pipelines defined in interactive "\n "environments like <stdin>, IPython, or Jupyter notebooks. "\n "Use a pipeline defined in a module or file instead, or use build_reconstructable_job."\n )\n\n pointer = FileCodePointer(\n python_file=python_file, fn_name=target.__name__, working_directory=os.getcwd()\n )\n\n return bootstrap_standalone_recon_pipeline(pointer)
\n\n\n
[docs]@experimental\ndef build_reconstructable_job(\n reconstructor_module_name,\n reconstructor_function_name,\n reconstructable_args=None,\n reconstructable_kwargs=None,\n reconstructor_working_directory=None,\n):\n """\n Create a :py:class:`dagster.core.definitions.reconstructable.ReconstructablePipeline`.\n\n When your job must cross process boundaries, e.g., for execution on multiple nodes or in\n different systems (like ``dagstermill``), Dagster must know how to reconstruct the job\n on the other side of the process boundary.\n\n This function allows you to use the strategy of your choice for reconstructing jobs, so\n that you can reconstruct certain kinds of jobs that are not supported by\n :py:func:`~dagster.reconstructable`, such as those defined by lambdas, in nested scopes (e.g.,\n dynamically within a method call), or in interactive environments such as the Python REPL or\n Jupyter notebooks.\n\n If you need to reconstruct jobs constructed in these ways, use this function instead of\n :py:func:`~dagster.reconstructable`.\n\n Args:\n reconstructor_module_name (str): The name of the module containing the function to use to\n reconstruct the job.\n reconstructor_function_name (str): The name of the function to use to reconstruct the\n job.\n reconstructable_args (Tuple): Args to the function to use to reconstruct the job.\n Values of the tuple must be JSON serializable.\n reconstructable_kwargs (Dict[str, Any]): Kwargs to the function to use to reconstruct the\n job. Values of the dict must be JSON serializable.\n\n Examples:\n\n .. code-block:: python\n\n # module: mymodule\n\n from dagster import JobDefinition, job, build_reconstructable_job\n\n class JobFactory:\n def make_job(*args, **kwargs):\n\n @job\n def _job(...):\n ...\n\n return _job\n\n def reconstruct_job(*args):\n factory = JobFactory()\n return factory.make_job(*args)\n\n factory = JobFactory()\n\n foo_job_args = (...,...)\n\n foo_job_kwargs = {...:...}\n\n foo_job = factory.make_job(*foo_job_args, **foo_job_kwargs)\n\n reconstructable_foo_job = build_reconstructable_job(\n 'mymodule',\n 'reconstruct_job',\n foo_job_args,\n foo_job_kwargs,\n )\n """\n check.str_param(reconstructor_module_name, "reconstructor_module_name")\n check.str_param(reconstructor_function_name, "reconstructor_function_name")\n check.opt_str_param(\n reconstructor_working_directory, "reconstructor_working_directory", os.getcwd()\n )\n\n reconstructable_args = list(check.opt_tuple_param(reconstructable_args, "reconstructable_args"))\n reconstructable_kwargs = list(\n (\n [key, value]\n for key, value in check.opt_dict_param(\n reconstructable_kwargs, "reconstructable_kwargs", key_type=str\n ).items()\n )\n )\n\n reconstructor_pointer = ModuleCodePointer(\n reconstructor_module_name,\n reconstructor_function_name,\n working_directory=reconstructor_working_directory,\n )\n\n pointer = CustomPointer(reconstructor_pointer, reconstructable_args, reconstructable_kwargs)\n\n pipeline_def = pipeline_def_from_pointer(pointer)\n\n return ReconstructablePipeline(\n repository=ReconstructableRepository(pointer), # creates ephemeral repo\n pipeline_name=pipeline_def.name,\n )
\n\n\n# back compat, in case users have imported these directly\nbuild_reconstructable_pipeline = build_reconstructable_job\nbuild_reconstructable_target = build_reconstructable_job\n\n\ndef bootstrap_standalone_recon_pipeline(pointer):\n # So this actually bootstraps the pipeline for the sole\n # purpose of getting the pipeline name. If we changed ReconstructablePipeline\n # to get the pipeline on demand in order to get its name, we could avoid this.\n pipeline_def = pipeline_def_from_pointer(pointer)\n return ReconstructablePipeline(\n repository=ReconstructableRepository(pointer), # creates ephemeral repo\n pipeline_name=pipeline_def.name,\n )\n\n\ndef _check_is_loadable(definition):\n from dagster.core.asset_defs import AssetGroup\n\n from .graph_definition import GraphDefinition\n from .pipeline_definition import PipelineDefinition\n from .repository_definition import RepositoryDefinition\n\n if not isinstance(\n definition, (PipelineDefinition, RepositoryDefinition, GraphDefinition, AssetGroup)\n ):\n raise DagsterInvariantViolationError(\n (\n "Loadable attributes must be either a JobDefinition, GraphDefinition, "\n f"PipelineDefinition, AssetGroup, or RepositoryDefinition. Got {repr(definition)}."\n )\n )\n return definition\n\n\ndef load_def_in_module(module_name, attribute, working_directory):\n return def_from_pointer(CodePointer.from_module(module_name, attribute, working_directory))\n\n\ndef load_def_in_package(package_name, attribute, working_directory):\n return def_from_pointer(\n CodePointer.from_python_package(package_name, attribute, working_directory)\n )\n\n\ndef load_def_in_python_file(python_file, attribute, working_directory):\n return def_from_pointer(CodePointer.from_python_file(python_file, attribute, working_directory))\n\n\ndef def_from_pointer(\n pointer: CodePointer,\n) -> Union["PipelineDefinition", "RepositoryDefinition", "GraphDefinition"]:\n target = pointer.load_target()\n\n from dagster.core.asset_defs.asset_group import AssetGroup\n\n from .graph_definition import GraphDefinition\n from .pipeline_definition import PipelineDefinition\n from .repository_definition import RepositoryDefinition\n\n if isinstance(\n target, (PipelineDefinition, RepositoryDefinition, GraphDefinition, AssetGroup)\n ) or not callable(target):\n return _check_is_loadable(target)\n\n # if it's a function, invoke it - otherwise we are pointing to an\n # artifact in module scope, likely decorator output\n\n if seven.get_arg_names(target):\n raise DagsterInvariantViolationError(\n "Error invoking function at {target} with no arguments. "\n "Reconstructable target must be callable with no arguments".format(\n target=pointer.describe()\n )\n )\n\n return _check_is_loadable(target())\n\n\ndef pipeline_def_from_pointer(pointer: CodePointer) -> "PipelineDefinition":\n from .pipeline_definition import PipelineDefinition\n\n target = def_from_pointer(pointer)\n\n if isinstance(target, PipelineDefinition):\n return target\n\n raise DagsterInvariantViolationError(\n "CodePointer ({str}) must resolve to a JobDefinition (or PipelineDefinition for legacy code). 
"\n "Received a {type}".format(str=pointer.describe(), type=type(target))\n )\n\n\n@overload\n# NOTE: mypy can't handle these overloads but pyright can\ndef repository_def_from_target_def( # type: ignore\n target: Union["RepositoryDefinition", "PipelineDefinition", "GraphDefinition", "AssetGroup"]\n) -> "RepositoryDefinition":\n ...\n\n\n@overload\ndef repository_def_from_target_def(target: object) -> None:\n ...\n\n\ndef repository_def_from_target_def(target: object) -> Optional["RepositoryDefinition"]:\n from dagster.core.asset_defs.asset_group import AssetGroup\n\n from .graph_definition import GraphDefinition\n from .pipeline_definition import PipelineDefinition\n from .repository_definition import CachingRepositoryData, RepositoryDefinition\n\n # special case - we can wrap a single pipeline in a repository\n if isinstance(target, (PipelineDefinition, GraphDefinition)):\n # consider including pipeline name in generated repo name\n return RepositoryDefinition(\n name=get_ephemeral_repository_name(target.name),\n repository_data=CachingRepositoryData.from_list([target]),\n )\n elif isinstance(target, AssetGroup):\n return RepositoryDefinition(\n name="__repository__", repository_data=CachingRepositoryData.from_list([target])\n )\n elif isinstance(target, RepositoryDefinition):\n return target\n else:\n return None\n\n\ndef repository_def_from_pointer(pointer: CodePointer) -> "RepositoryDefinition":\n target = def_from_pointer(pointer)\n repo_def = repository_def_from_target_def(target)\n if not repo_def:\n raise DagsterInvariantViolationError(\n "CodePointer ({str}) must resolve to a "\n "RepositoryDefinition, JobDefinition, or PipelineDefinition. "\n "Received a {type}".format(str=pointer.describe(), type=type(target))\n )\n return repo_def\n
", "current_page_name": "_modules/dagster/core/definitions/reconstruct", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.reconstruct"}, "repository_definition": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.repository_definition

\nimport warnings\nfrom abc import ABC, abstractmethod\nfrom inspect import isfunction\nfrom types import FunctionType\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Dict,\n    Generic,\n    List,\n    Mapping,\n    Optional,\n    Type,\n    TypeVar,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster.core.asset_defs.source_asset import SourceAsset\nfrom dagster.core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\nfrom dagster.utils import merge_dicts\n\nfrom .events import AssetKey\nfrom .graph_definition import GraphDefinition, SubselectedGraphDefinition\nfrom .job_definition import JobDefinition\nfrom .partition import PartitionScheduleDefinition, PartitionSetDefinition\nfrom .pipeline_definition import PipelineDefinition\nfrom .schedule_definition import ScheduleDefinition\nfrom .sensor_definition import SensorDefinition\nfrom .utils import check_valid_name\n\nif TYPE_CHECKING:\n    from dagster.core.asset_defs.asset_group import AssetGroup\n\nVALID_REPOSITORY_DATA_DICT_KEYS = {\n    "pipelines",\n    "partition_sets",\n    "schedules",\n    "sensors",\n    "jobs",\n}\n\nRepositoryLevelDefinition = TypeVar(\n    "RepositoryLevelDefinition",\n    PipelineDefinition,\n    JobDefinition,\n    PartitionSetDefinition,\n    ScheduleDefinition,\n    SensorDefinition,\n)\n\n\nclass _CacheingDefinitionIndex(Generic[RepositoryLevelDefinition]):\n    def __init__(\n        self,\n        definition_class: Type[RepositoryLevelDefinition],\n        definition_class_name: str,\n        definition_kind: str,\n        definitions: Mapping[\n            str, Union[RepositoryLevelDefinition, Callable[[], RepositoryLevelDefinition]]\n        ],\n        validation_fn: Callable[[RepositoryLevelDefinition], RepositoryLevelDefinition],\n        lazy_definitions_fn: Optional[Callable[[], List[RepositoryLevelDefinition]]] = None,\n    ):\n        """\n        Args:\n            definitions: A dictionary of definition names to definitions or functions that load\n                definitions.\n            lazy_definitions_fn: A function for loading a list of definitions whose names are not\n                even known until loaded.\n\n        """\n\n        for key, definition in definitions.items():\n            check.invariant(\n                isinstance(definition, definition_class) or callable(definition),\n                "Bad definition for {definition_kind} {key}: must be {definition_class_name} or "\n                "callable, got {type_}".format(\n                    definition_kind=definition_kind,\n                    key=key,\n                    definition_class_name=definition_class_name,\n                    type_=type(definition),\n                ),\n            )\n\n        self._definition_class: Type[RepositoryLevelDefinition] = definition_class\n        self._definition_class_name = definition_class_name\n        self._definition_kind = definition_kind\n        self._validation_fn: Callable[\n            [RepositoryLevelDefinition], RepositoryLevelDefinition\n        ] = validation_fn\n\n        self._definitions: Mapping[\n            str, Union[RepositoryLevelDefinition, Callable[[], RepositoryLevelDefinition]]\n        ] = definitions\n        self._definition_cache: Dict[str, RepositoryLevelDefinition] = {}\n        self._definition_names: Optional[List[str]] = None\n\n        self._lazy_definitions_fn: Callable[\n            [], List[RepositoryLevelDefinition]\n        ] = lazy_definitions_fn or (lambda: [])\n   
     self._lazy_definitions: Optional[List[RepositoryLevelDefinition]] = None\n\n        self._all_definitions: Optional[List[RepositoryLevelDefinition]] = None\n\n    def _get_lazy_definitions(self) -> List[RepositoryLevelDefinition]:\n        if self._lazy_definitions is None:\n            self._lazy_definitions = self._lazy_definitions_fn()\n            for definition in self._lazy_definitions:\n                self._validate_and_cache_definition(definition, definition.name)\n\n        return self._lazy_definitions\n\n    def get_definition_names(self) -> List[str]:\n        if self._definition_names:\n            return self._definition_names\n\n        lazy_names = []\n        for definition in self._get_lazy_definitions():\n            strict_definition = self._definitions.get(definition.name)\n            if strict_definition:\n                check.invariant(\n                    strict_definition == definition,\n                    f"Duplicate definition found for {definition.name}",\n                )\n            else:\n                lazy_names.append(definition.name)\n\n        self._definition_names = list(self._definitions.keys()) + lazy_names\n        return self._definition_names\n\n    def has_definition(self, definition_name: str) -> bool:\n        check.str_param(definition_name, "definition_name")\n\n        return definition_name in self.get_definition_names()\n\n    def get_all_definitions(self) -> List[RepositoryLevelDefinition]:\n        if self._all_definitions is not None:\n            return self._all_definitions\n\n        self._all_definitions = list(\n            sorted(\n                map(self.get_definition, self.get_definition_names()),\n                key=lambda definition: definition.name,\n            )\n        )\n        return self._all_definitions\n\n    def get_definition(self, definition_name: str) -> RepositoryLevelDefinition:\n        check.str_param(definition_name, "definition_name")\n\n        if not self.has_definition(definition_name):\n            raise DagsterInvariantViolationError(\n                "Could not find {definition_kind} '{definition_name}'. 
Found: "\n                "{found_names}.".format(\n                    definition_kind=self._definition_kind,\n                    definition_name=definition_name,\n                    found_names=", ".join(\n                        [\n                            "'{found_name}'".format(found_name=found_name)\n                            for found_name in self.get_definition_names()\n                        ]\n                    ),\n                )\n            )\n\n        if definition_name in self._definition_cache:\n            return self._definition_cache[definition_name]\n\n        definition_source = self._definitions[definition_name]\n\n        if isinstance(definition_source, self._definition_class):\n            self._definition_cache[definition_name] = self._validation_fn(definition_source)\n            return definition_source\n        else:\n            definition = cast(Callable, definition_source)()\n            self._validate_and_cache_definition(definition, definition_name)\n            return definition\n\n    def _validate_and_cache_definition(\n        self, definition: RepositoryLevelDefinition, definition_dict_key: str\n    ):\n        check.invariant(\n            isinstance(definition, self._definition_class),\n            "Bad constructor for {definition_kind} {definition_name}: must return "\n            "{definition_class_name}, got value of type {type_}".format(\n                definition_kind=self._definition_kind,\n                definition_name=definition_dict_key,\n                definition_class_name=self._definition_class_name,\n                type_=type(definition),\n            ),\n        )\n        check.invariant(\n            definition.name == definition_dict_key,\n            "Bad constructor for {definition_kind} '{definition_name}': name in "\n            "{definition_class_name} does not match: got '{definition_def_name}'".format(\n                definition_kind=self._definition_kind,\n                definition_name=definition_dict_key,\n                definition_class_name=self._definition_class_name,\n                definition_def_name=definition.name,\n            ),\n        )\n        self._definition_cache[definition_dict_key] = self._validation_fn(definition)\n\n\n
[docs]class RepositoryData(ABC):\n """\n Users should usually rely on the :py:func:`@repository <repository>` decorator to create new\n repositories, which will in turn call the static constructors on this class. However, users may\n subclass :py:class:`RepositoryData` for fine-grained control over access to and lazy creation\n of repository members.\n """\n\n @abstractmethod\n def get_all_pipelines(self) -> List[PipelineDefinition]:\n """Return all pipelines/jobs in the repository as a list.\n\n Returns:\n List[PipelineDefinition]: All pipelines/jobs in the repository.\n """\n\n def get_all_jobs(self) -> List[JobDefinition]:\n """Return all jobs in the repository as a list.\n\n Returns:\n List[JobDefinition]: All jobs in the repository.\n """\n return [job for job in self.get_all_pipelines() if isinstance(job, JobDefinition)]\n\n def get_pipeline_names(self) -> List[str]:\n """Get the names of all pipelines/jobs in the repository.\n\n Returns:\n List[str]\n """\n return [pipeline_def.name for pipeline_def in self.get_all_pipelines()]\n\n def get_job_names(self) -> List[str]:\n """Get the names of all jobs in the repository.\n\n Returns:\n List[str]\n """\n return [job_def.name for job_def in self.get_all_jobs()]\n\n def has_pipeline(self, pipeline_name: str) -> bool:\n """Check if a pipeline/job with a given name is present in the repository.\n\n Args:\n pipeline_name (str): The name of the pipeline/job.\n\n Returns:\n bool\n """\n return pipeline_name in self.get_pipeline_names()\n\n def has_job(self, job_name: str) -> bool:\n """Check if a job with a given name is present in the repository.\n\n Args:\n job_name (str): The name of the job.\n\n Returns:\n bool\n """\n return job_name in self.get_job_names()\n\n def get_pipeline(self, pipeline_name) -> PipelineDefinition:\n """Get a pipeline/job by name.\n\n Args:\n pipeline_name (str): Name of the pipeline/job to retrieve.\n\n Returns:\n PipelineDefinition: The pipeline/job definition corresponding to the given name.\n """\n pipelines_with_name = [\n pipeline for pipeline in self.get_all_pipelines() if pipeline.name == pipeline_name\n ]\n if not pipelines_with_name:\n raise DagsterInvariantViolationError(\n f"Could not find pipeline/job {pipeline_name} in repository"\n )\n return pipelines_with_name[0]\n\n def get_job(self, job_name: str) -> JobDefinition:\n """Get a job by name.\n\n Args:\n job_name (str): Name of the job to retrieve.\n\n Returns:\n JobDefinition: The job definition corresponding to the given name.\n """\n match = next(job for job in self.get_all_jobs() if job.name == job_name)\n if match is None:\n raise DagsterInvariantViolationError(f"Could not find job {job_name} in repository")\n return match\n\n def get_partition_set_names(self):\n """Get the names of all partition sets in the repository.\n\n Returns:\n List[str]\n """\n return [partition_set.name for partition_set in self.get_all_partition_sets()]\n\n def has_partition_set(self, partition_set_name: str) -> bool:\n """Check if a partition set with a given name is present in the repository.\n\n Args:\n partition_set_name (str): The name of the partition set.\n\n Returns:\n bool\n """\n return partition_set_name in self.get_partition_set_names()\n\n def get_all_partition_sets(self) -> List[PartitionSetDefinition]:\n """Return all partition sets in the repository as a list.\n\n Returns:\n List[PartitionSetDefinition]: All partition sets in the repository.\n """\n return []\n\n def get_partition_set(self, partition_set_name: str) -> PartitionSetDefinition:\n """Get a 
partition set by name.\n\n Args:\n partition_set_name (str): Name of the partition set to retrieve.\n\n Returns:\n PartitionSetDefinition: The partition set definition corresponding to the given name.\n """\n partition_sets_with_name = [\n partition_set\n for partition_set in self.get_all_partition_sets()\n if partition_set.name == partition_set_name\n ]\n if not partition_sets_with_name:\n raise DagsterInvariantViolationError(\n f"Could not find partition set {partition_set_name} in repository"\n )\n return partition_sets_with_name[0]\n\n def get_schedule_names(self) -> List[str]:\n """Get the names of all schedules in the repository.\n\n Returns:\n List[str]\n """\n return [schedule.name for schedule in self.get_all_schedules()]\n\n def get_all_schedules(self) -> List[ScheduleDefinition]:\n """Return all schedules in the repository as a list.\n\n Returns:\n List[ScheduleDefinition]: All pipelines in the repository.\n """\n return []\n\n def get_schedule(self, schedule_name: str) -> ScheduleDefinition:\n """Get a schedule by name.\n\n args:\n schedule_name (str): name of the schedule to retrieve.\n\n Returns:\n ScheduleDefinition: The schedule definition corresponding to the given name.\n """\n schedules_with_name = [\n schedule for schedule in self.get_all_schedules() if schedule.name == schedule_name\n ]\n if not schedules_with_name:\n raise DagsterInvariantViolationError(\n f"Could not find schedule {schedule_name} in repository"\n )\n return schedules_with_name[0]\n\n def has_schedule(self, schedule_name: str) -> bool:\n return schedule_name in self.get_schedule_names()\n\n def get_all_sensors(self) -> List[SensorDefinition]:\n return []\n\n def get_sensor_names(self) -> List[str]:\n return [sensor.name for sensor in self.get_all_sensors()]\n\n def get_sensor(self, sensor_name: str) -> SensorDefinition:\n sensors_with_name = [\n sensor for sensor in self.get_all_sensors() if sensor.name == sensor_name\n ]\n if not sensors_with_name:\n raise DagsterInvariantViolationError(\n f"Could not find sensor {sensor_name} in repository"\n )\n return sensors_with_name[0]\n\n def has_sensor(self, sensor_name: str) -> bool:\n return sensor_name in self.get_sensor_names()\n\n def get_source_assets_by_key(self) -> Mapping[AssetKey, SourceAsset]:\n return {}\n\n def load_all_definitions(self):\n # force load of all lazy constructed code artifacts\n self.get_all_pipelines()\n self.get_all_jobs()\n self.get_all_partition_sets()\n self.get_all_schedules()\n self.get_all_sensors()\n self.get_source_assets_by_key()
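# --- Editor's note: a minimal illustrative RepositoryData subclass; it is not part of
# this module. Only get_all_pipelines is abstract, so every other accessor falls back to
# the defaults above. The @repository usage assumes the decorated function may return a
# RepositoryData instance, and the job shown is hypothetical.
from dagster import job, op, repository


@op
def ping():
    return "pong"


@job
def ping_job():
    ping()


class StaticRepositoryData(RepositoryData):
    def get_all_pipelines(self):
        # JobDefinition subclasses PipelineDefinition, so jobs are returned here and
        # surfaced by get_all_jobs() via its isinstance filter.
        return [ping_job]


@repository
def my_repository():
    return StaticRepositoryData()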
\n\n\nT = TypeVar("T")\nResolvable = Callable[[], T]\n\n\nclass CachingRepositoryData(RepositoryData):\n """Default implementation of RepositoryData used by the :py:func:`@repository <repository>` decorator."""\n\n _all_jobs: Optional[List[JobDefinition]]\n _all_pipelines: Optional[List[PipelineDefinition]]\n\n def __init__(\n self,\n pipelines: Mapping[str, Union[PipelineDefinition, Resolvable[PipelineDefinition]]],\n jobs: Mapping[str, Union[JobDefinition, Resolvable[JobDefinition]]],\n partition_sets: Mapping[\n str, Union[PartitionSetDefinition, Resolvable[PartitionSetDefinition]]\n ],\n schedules: Mapping[str, Union[ScheduleDefinition, Resolvable[ScheduleDefinition]]],\n sensors: Mapping[str, Union[SensorDefinition, Resolvable[SensorDefinition]]],\n source_assets: Mapping[AssetKey, SourceAsset],\n ):\n """Constructs a new CachingRepositoryData object.\n\n You may pass pipeline, job, partition_set, and schedule definitions directly, or you may pass\n callables with no arguments that will be invoked to lazily construct definitions when\n accessed by name. This can be helpful for performance when there are many definitions in a\n repository, or when constructing the definitions is costly.\n\n Note that when lazily constructing a definition, the name of the definition must match its\n key in its dictionary index, or a :py:class:`DagsterInvariantViolationError` will be thrown\n at retrieval time.\n\n Args:\n pipelines (Mapping[str, Union[PipelineDefinition, Callable[[], PipelineDefinition]]]):\n The pipeline definitions belonging to the repository.\n jobs (Mapping[str, Union[JobDefinition, Callable[[], JobDefinition]]]):\n The job definitions belonging to the repository.\n partition_sets (Mapping[str, Union[PartitionSetDefinition, Callable[[], PartitionSetDefinition]]]):\n The partition sets belonging to the repository.\n schedules (Mapping[str, Union[ScheduleDefinition, Callable[[], ScheduleDefinition]]]):\n The schedules belonging to the repository.\n sensors (Mapping[str, Union[SensorDefinition, Callable[[], SensorDefinition]]]):\n The sensors belonging to a repository.\n source_assets (Mapping[AssetKey, SourceAsset]): The source assets belonging to a repository.\n """\n check.mapping_param(\n pipelines, "pipelines", key_type=str, value_type=(PipelineDefinition, FunctionType)\n )\n check.mapping_param(jobs, "jobs", key_type=str, value_type=(JobDefinition, FunctionType))\n check.mapping_param(\n partition_sets,\n "partition_sets",\n key_type=str,\n value_type=(PartitionSetDefinition, FunctionType),\n )\n check.mapping_param(\n schedules, "schedules", key_type=str, value_type=(ScheduleDefinition, FunctionType)\n )\n check.mapping_param(\n sensors, "sensors", key_type=str, value_type=(SensorDefinition, FunctionType)\n )\n check.mapping_param(\n source_assets, "source_assets", key_type=AssetKey, value_type=SourceAsset\n )\n\n self._pipelines = _CacheingDefinitionIndex(\n PipelineDefinition,\n "PipelineDefinition",\n "pipeline",\n pipelines,\n self._validate_pipeline,\n )\n\n self._jobs = _CacheingDefinitionIndex(\n JobDefinition,\n "JobDefinition",\n "job",\n jobs,\n self._validate_job,\n )\n\n self._schedules = _CacheingDefinitionIndex(\n ScheduleDefinition,\n "ScheduleDefinition",\n "schedule",\n schedules,\n self._validate_schedule,\n )\n schedule_partition_sets = [\n schedule.get_partition_set()\n for schedule in self._schedules.get_all_definitions()\n if isinstance(schedule, PartitionScheduleDefinition)\n ]\n self._source_assets = source_assets\n\n def 
load_partition_sets_from_pipelines() -> List[PartitionSetDefinition]:\n job_partition_sets = []\n for pipeline in self.get_all_pipelines():\n if isinstance(pipeline, JobDefinition):\n job_partition_set = pipeline.get_partition_set_def()\n\n if job_partition_set:\n # should only return a partition set if this was constructed using the job\n # API, with a partitioned config\n job_partition_sets.append(job_partition_set)\n\n return job_partition_sets\n\n self._partition_sets = _CacheingDefinitionIndex(\n PartitionSetDefinition,\n "PartitionSetDefinition",\n "partition set",\n merge_dicts(\n {partition_set.name: partition_set for partition_set in schedule_partition_sets},\n partition_sets,\n ),\n self._validate_partition_set,\n load_partition_sets_from_pipelines,\n )\n self._sensors = _CacheingDefinitionIndex(\n SensorDefinition,\n "SensorDefinition",\n "sensor",\n sensors,\n self._validate_sensor,\n )\n # load all sensors to force validation\n self._sensors.get_all_definitions()\n\n self._all_pipelines = None\n self._all_jobs = None\n\n @staticmethod\n def from_dict(repository_definitions: Dict[str, Dict[str, Any]]) -> "CachingRepositoryData":\n """Static constructor.\n\n Args:\n repository_definition (Dict[str, Dict[str, ...]]): A dict of the form:\n\n {\n 'pipelines': Dict[str, Callable[[], PipelineDefinition]],\n 'jobs': Dict[str, Callable[[], JobDefinition]],\n 'partition_sets': Dict[str, Callable[[], PartitionSetDefinition]],\n 'schedules': Dict[str, Callable[[], ScheduleDefinition]]\n }\n\n This form is intended to allow definitions to be created lazily when accessed by name,\n which can be helpful for performance when there are many definitions in a repository, or\n when constructing the definitions is costly.\n """\n check.dict_param(repository_definitions, "repository_definitions", key_type=str)\n check.invariant(\n set(repository_definitions.keys()).issubset(VALID_REPOSITORY_DATA_DICT_KEYS),\n "Bad dict: must not contain keys other than {{{valid_keys}}}: found {bad_keys}.".format(\n valid_keys=", ".join(\n ["'{key}'".format(key=key) for key in VALID_REPOSITORY_DATA_DICT_KEYS]\n ),\n bad_keys=", ".join(\n [\n "'{key}'"\n for key in repository_definitions.keys()\n if key not in VALID_REPOSITORY_DATA_DICT_KEYS\n ]\n ),\n ),\n )\n\n for key in VALID_REPOSITORY_DATA_DICT_KEYS:\n if key not in repository_definitions:\n repository_definitions[key] = {}\n\n duplicate_keys = set(repository_definitions["schedules"].keys()).intersection(\n set(repository_definitions["sensors"].keys())\n )\n if duplicate_keys:\n raise DagsterInvalidDefinitionError(\n f"Duplicate definitions between schedules and sensors found for keys: {', '.join(duplicate_keys)}"\n )\n\n # merge jobs in to pipelines while they are just implemented as pipelines\n for key, job in repository_definitions["jobs"].items():\n if key in repository_definitions["pipelines"]:\n raise DagsterInvalidDefinitionError(\n f'Conflicting entries for name {key} in "jobs" and "pipelines".'\n )\n\n if isinstance(job, GraphDefinition):\n repository_definitions["jobs"][key] = job.coerce_to_job()\n elif not isinstance(job, JobDefinition) and not isfunction(job):\n raise DagsterInvalidDefinitionError(\n f"Object mapped to {key} is not an instance of JobDefinition or GraphDefinition."\n )\n\n return CachingRepositoryData(**repository_definitions, source_assets={})\n\n @classmethod\n def from_list(\n cls,\n repository_definitions: List[\n Union[\n PipelineDefinition,\n PartitionSetDefinition,\n ScheduleDefinition,\n SensorDefinition,\n "AssetGroup",\n 
GraphDefinition,\n ]\n ],\n ) -> "CachingRepositoryData":\n """Static constructor.\n\n Args:\n repository_definitions (List[Union[PipelineDefinition, PartitionSetDefinition, ScheduleDefinition, SensorDefinition, AssetGroup, GraphDefinition]]):\n Use this constructor when you have no need to lazy load pipelines/jobs or other\n definitions.\n """\n from dagster.core.asset_defs import AssetGroup\n\n pipelines_or_jobs: Dict[str, Union[PipelineDefinition, JobDefinition]] = {}\n coerced_graphs: Dict[str, JobDefinition] = {}\n partition_sets: Dict[str, PartitionSetDefinition] = {}\n schedules: Dict[str, ScheduleDefinition] = {}\n sensors: Dict[str, SensorDefinition] = {}\n source_assets: Dict[AssetKey, SourceAsset] = {}\n combined_asset_group = None\n for definition in repository_definitions:\n if isinstance(definition, PipelineDefinition):\n if (\n definition.name in pipelines_or_jobs\n and pipelines_or_jobs[definition.name] != definition\n ):\n raise DagsterInvalidDefinitionError(\n "Duplicate {target_type} definition found for {target}".format(\n target_type=definition.target_type, target=definition.describe_target()\n )\n )\n if AssetGroup.is_base_job_name(definition.name):\n raise DagsterInvalidDefinitionError(\n f"Attempted to provide job called {definition.name} to repository, which "\n "is a reserved name. Please rename the job."\n )\n pipelines_or_jobs[definition.name] = definition\n elif isinstance(definition, PartitionSetDefinition):\n if definition.name in partition_sets:\n raise DagsterInvalidDefinitionError(\n "Duplicate partition set definition found for partition set "\n "{partition_set_name}".format(partition_set_name=definition.name)\n )\n partition_sets[definition.name] = definition\n elif isinstance(definition, SensorDefinition):\n if definition.name in sensors or definition.name in schedules:\n raise DagsterInvalidDefinitionError(\n f"Duplicate definition found for {definition.name}"\n )\n sensors[definition.name] = definition\n elif isinstance(definition, ScheduleDefinition):\n if definition.name in sensors or definition.name in schedules:\n raise DagsterInvalidDefinitionError(\n f"Duplicate definition found for {definition.name}"\n )\n schedules[definition.name] = definition\n if isinstance(definition, PartitionScheduleDefinition):\n partition_set_def = definition.get_partition_set()\n if (\n partition_set_def.name in partition_sets\n and partition_set_def != partition_sets[partition_set_def.name]\n ):\n raise DagsterInvalidDefinitionError(\n "Duplicate partition set definition found for partition set "\n "{partition_set_name}".format(partition_set_name=partition_set_def.name)\n )\n partition_sets[partition_set_def.name] = partition_set_def\n elif isinstance(definition, GraphDefinition):\n coerced = definition.coerce_to_job()\n if coerced.name in pipelines_or_jobs:\n raise DagsterInvalidDefinitionError(\n "Duplicate {target_type} definition found for graph '{name}'".format(\n target_type=coerced.target_type, name=coerced.name\n )\n )\n pipelines_or_jobs[coerced.name] = coerced\n coerced_graphs[coerced.name] = coerced\n\n elif isinstance(definition, AssetGroup):\n if combined_asset_group:\n combined_asset_group += definition\n else:\n combined_asset_group = definition\n else:\n check.failed(f"Unexpected repository entry {definition}")\n\n if combined_asset_group:\n for job_def in combined_asset_group.get_base_jobs():\n pipelines_or_jobs[job_def.name] = job_def\n\n source_assets = {\n source_asset.key: source_asset\n for source_asset in combined_asset_group.source_assets\n 
}\n\n for name, sensor_def in sensors.items():\n if sensor_def.has_loadable_targets():\n targets = sensor_def.load_targets()\n for target in targets:\n _process_and_validate_target(\n sensor_def, coerced_graphs, pipelines_or_jobs, target\n )\n\n for name, schedule_def in schedules.items():\n if schedule_def.has_loadable_target():\n target = schedule_def.load_target()\n _process_and_validate_target(\n schedule_def, coerced_graphs, pipelines_or_jobs, target\n )\n\n pipelines: Dict[str, PipelineDefinition] = {}\n jobs: Dict[str, JobDefinition] = {}\n for name, pipeline_or_job in pipelines_or_jobs.items():\n if isinstance(pipeline_or_job, JobDefinition):\n jobs[name] = pipeline_or_job\n else:\n pipelines[name] = pipeline_or_job\n\n return CachingRepositoryData(\n pipelines=pipelines,\n jobs=jobs,\n partition_sets=partition_sets,\n schedules=schedules,\n sensors=sensors,\n source_assets=source_assets,\n )\n\n def get_pipeline_names(self) -> List[str]:\n """Get the names of all pipelines/jobs in the repository.\n\n Returns:\n List[str]\n """\n return self._pipelines.get_definition_names() + self.get_job_names()\n\n def get_job_names(self) -> List[str]:\n """Get the names of all jobs in the repository.\n\n Returns:\n List[str]\n """\n return self._jobs.get_definition_names()\n\n def has_pipeline(self, pipeline_name: str) -> bool:\n """Check if a pipeline/job with a given name is present in the repository.\n\n Args:\n pipeline_name (str): The name of the pipeline/job.\n\n Returns:\n bool\n """\n check.str_param(pipeline_name, "pipeline_name")\n\n return self._pipelines.has_definition(pipeline_name) or self._jobs.has_definition(\n pipeline_name\n )\n\n def has_job(self, job_name: str) -> bool:\n """Check if a job with a given name is present in the repository.\n\n Args:\n job_name (str): The name of the job.\n\n Returns:\n bool\n """\n check.str_param(job_name, "job_name")\n return self._jobs.has_definition(job_name)\n\n def get_all_pipelines(self) -> List[PipelineDefinition]:\n """Return all pipelines/jobs in the repository as a list.\n\n Note that this will construct any pipeline/job that has not yet been constructed.\n\n Returns:\n List[PipelineDefinition]: All pipelines/jobs in the repository.\n """\n if self._all_pipelines is not None:\n return self._all_pipelines\n\n self._all_jobs = self._jobs.get_all_definitions()\n pipelines: List[PipelineDefinition] = [\n *self._pipelines.get_all_definitions(),\n *self._all_jobs,\n ]\n self._check_solid_defs(pipelines)\n self._all_pipelines = pipelines\n return self._all_pipelines\n\n def get_all_jobs(self) -> List[JobDefinition]:\n """Return all jobs in the repository as a list.\n\n Note that this will construct any job that has not yet been constructed.\n\n Returns:\n List[JobDefinition]: All jobs in the repository.\n """\n if self._all_jobs is not None:\n return self._all_jobs\n\n # _check_solid_defs enforces that pipeline and graph definition names are\n # unique within a repository. 
Loads pipelines in the line below to enforce\n # pipeline/job/graph uniqueness.\n self.get_all_pipelines()\n\n # The `get_all_pipelines` call ensures _all_jobs is set.\n return cast(List[JobDefinition], self._all_jobs)\n\n def get_pipeline(self, pipeline_name: str) -> PipelineDefinition:\n """Get a pipeline/job by name.\n\n If this pipeline/job has not yet been constructed, only this pipeline/job is constructed, and will\n be cached for future calls.\n\n Args:\n pipeline_name (str): Name of the pipeline/job to retrieve.\n\n Returns:\n PipelineDefinition: The pipeline/job definition corresponding to the given name.\n """\n\n check.str_param(pipeline_name, "pipeline_name")\n\n if self._jobs.has_definition(pipeline_name):\n return self._jobs.get_definition(pipeline_name)\n else:\n return self._pipelines.get_definition(pipeline_name)\n\n def get_job(self, job_name: str) -> JobDefinition:\n """Get a job by name.\n\n If this job has not yet been constructed, only this job is constructed, and will\n be cached for future calls.\n\n Args:\n job_name (str): Name of the job to retrieve.\n\n Returns:\n JobDefinition: The job definition corresponding to the given name.\n """\n\n check.str_param(job_name, "job_name")\n return self._jobs.get_definition(job_name)\n\n def get_partition_set_names(self) -> List[str]:\n """Get the names of all partition sets in the repository.\n\n Returns:\n List[str]\n """\n return self._partition_sets.get_definition_names()\n\n def has_partition_set(self, partition_set_name: str) -> bool:\n """Check if a partition set with a given name is present in the repository.\n\n Args:\n partition_set_name (str): The name of the partition set.\n\n Returns:\n bool\n """\n check.str_param(partition_set_name, "partition_set_name")\n return self._partition_sets.has_definition(partition_set_name)\n\n def get_all_partition_sets(self) -> List[PartitionSetDefinition]:\n """Return all partition sets in the repository as a list.\n\n Note that this will construct any partition set that has not yet been constructed.\n\n Returns:\n List[PartitionSetDefinition]: All partition sets in the repository.\n """\n return self._partition_sets.get_all_definitions()\n\n def get_partition_set(self, partition_set_name: str) -> PartitionSetDefinition:\n """Get a partition set by name.\n\n If this partition set has not yet been constructed, only this partition set is constructed,\n and will be cached for future calls.\n\n Args:\n partition_set_name (str): Name of the partition set to retrieve.\n\n Returns:\n PartitionSetDefinition: The partition set definition corresponding to the given name.\n """\n\n check.str_param(partition_set_name, "partition_set_name")\n\n return self._partition_sets.get_definition(partition_set_name)\n\n def get_schedule_names(self) -> List[str]:\n """Get the names of all schedules in the repository.\n\n Returns:\n List[str]\n """\n return self._schedules.get_definition_names()\n\n def get_all_schedules(self) -> List[ScheduleDefinition]:\n """Return all schedules in the repository as a list.\n\n Note that this will construct any schedule that has not yet been constructed.\n\n Returns:\n List[ScheduleDefinition]: All schedules in the repository.\n """\n return self._schedules.get_all_definitions()\n\n def get_schedule(self, schedule_name: str) -> ScheduleDefinition:\n """Get a schedule by name.\n\n If this schedule has not yet been constructed, only this schedule is constructed, and will\n be cached for future calls.\n\n Args:\n schedule_name (str): Name of the schedule to retrieve.\n\n 
Returns:\n ScheduleDefinition: The schedule definition corresponding to the given name.\n """\n\n check.str_param(schedule_name, "schedule_name")\n\n return self._schedules.get_definition(schedule_name)\n\n def has_schedule(self, schedule_name: str) -> bool:\n check.str_param(schedule_name, "schedule_name")\n\n return self._schedules.has_definition(schedule_name)\n\n def get_all_sensors(self) -> List[SensorDefinition]:\n return self._sensors.get_all_definitions()\n\n def get_sensor_names(self) -> List[str]:\n return self._sensors.get_definition_names()\n\n def get_sensor(self, sensor_name: str) -> SensorDefinition:\n return self._sensors.get_definition(sensor_name)\n\n def has_sensor(self, sensor_name: str) -> bool:\n return self._sensors.has_definition(sensor_name)\n\n def get_source_assets_by_key(self) -> Mapping[AssetKey, SourceAsset]:\n return self._source_assets\n\n def _check_solid_defs(self, pipelines: List[PipelineDefinition]) -> None:\n solid_defs = {}\n solid_to_pipeline = {}\n for pipeline in pipelines:\n for solid_def in [*pipeline.all_node_defs, pipeline.graph]:\n # skip checks for subselected graphs because they don't have their own names\n if isinstance(solid_def, SubselectedGraphDefinition):\n break\n\n if solid_def.name not in solid_defs:\n solid_defs[solid_def.name] = solid_def\n solid_to_pipeline[solid_def.name] = pipeline.name\n\n if solid_defs[solid_def.name] is not solid_def:\n first_name, second_name = sorted(\n [solid_to_pipeline[solid_def.name], pipeline.name]\n )\n raise DagsterInvalidDefinitionError(\n (\n f"Conflicting definitions found in repository with name '{solid_def.name}'. "\n "Op/Graph/Solid definition names must be unique within a "\n f"repository. {solid_def.__class__.__name__} is defined in {pipeline.target_type} "\n f"'{first_name}' and in {pipeline.target_type} '{second_name}'."\n )\n )\n\n def _validate_pipeline(self, pipeline: PipelineDefinition) -> PipelineDefinition:\n return pipeline\n\n def _validate_job(self, job: JobDefinition) -> JobDefinition:\n return job\n\n def _validate_schedule(self, schedule: ScheduleDefinition) -> ScheduleDefinition:\n pipelines = self.get_pipeline_names()\n\n if schedule.pipeline_name not in pipelines:\n raise DagsterInvalidDefinitionError(\n f'ScheduleDefinition "{schedule.name}" targets job/pipeline "{schedule.pipeline_name}" '\n "which was not found in this repository."\n )\n\n return schedule\n\n def _validate_sensor(self, sensor: SensorDefinition) -> SensorDefinition:\n pipelines = self.get_pipeline_names()\n if len(sensor.targets) == 0:\n # skip validation when the sensor does not target a pipeline\n return sensor\n\n for target in sensor.targets:\n if target.pipeline_name not in pipelines:\n raise DagsterInvalidDefinitionError(\n f'SensorDefinition "{sensor.name}" targets job/pipeline "{target.pipeline_name}" '\n "which was not found in this repository."\n )\n\n return sensor\n\n def _validate_partition_set(\n self, partition_set: PartitionSetDefinition\n ) -> PartitionSetDefinition:\n return partition_set\n\n\n
[docs]class RepositoryDefinition:\n """Define a repository that contains a group of definitions.\n\n Users should typically not create objects of this class directly. Instead, use the\n :py:func:`@repository` decorator.\n\n Args:\n name (str): The name of the repository.\n repository_data (RepositoryData): Contains the definitions making up the repository.\n description (Optional[str]): A string description of the repository.\n """\n\n def __init__(\n self,\n name,\n repository_data,\n description=None,\n ):\n self._name = check_valid_name(name)\n self._description = check.opt_str_param(description, "description")\n self._repository_data = check.inst_param(repository_data, "repository_data", RepositoryData)\n\n @property\n def name(self) -> str:\n return self._name\n\n @property\n def description(self) -> Optional[str]:\n return self._description\n\n def load_all_definitions(self):\n # force load of all lazy constructed code artifacts\n self._repository_data.load_all_definitions()\n\n @property\n def pipeline_names(self) -> List[str]:\n """List[str]: Names of all pipelines/jobs in the repository"""\n return self._repository_data.get_pipeline_names()\n\n @property\n def job_names(self) -> List[str]:\n """List[str]: Names of all jobs in the repository"""\n return self._repository_data.get_job_names()\n\n def has_pipeline(self, name: str) -> bool:\n """Check if a pipeline/job with a given name is present in the repository.\n\n Args:\n name (str): The name of the pipeline/job.\n\n Returns:\n bool\n """\n return self._repository_data.has_pipeline(name)\n\n def get_pipeline(self, name: str) -> PipelineDefinition:\n """Get a pipeline/job by name.\n\n If this pipeline/job is present in the lazily evaluated dictionary passed to the\n constructor, but has not yet been constructed, only this pipeline/job is constructed, and will\n be cached for future calls.\n\n Args:\n name (str): Name of the pipeline/job to retrieve.\n\n Returns:\n PipelineDefinition: The pipeline/job definition corresponding to the given name.\n """\n return self._repository_data.get_pipeline(name)\n\n def get_all_pipelines(self) -> List[PipelineDefinition]:\n """Return all pipelines/jobs in the repository as a list.\n\n Note that this will construct any pipeline/job in the lazily evaluated dictionary that\n has not yet been constructed.\n\n Returns:\n List[PipelineDefinition]: All pipelines/jobs in the repository.\n """\n return self._repository_data.get_all_pipelines()\n\n
[docs] def has_job(self, name: str) -> bool:\n """Check if a job with a given name is present in the repository.\n\n Args:\n name (str): The name of the job.\n\n Returns:\n bool\n """\n return self._repository_data.has_job(name)
\n\n
[docs] def get_job(self, name: str) -> JobDefinition:\n """Get a job by name.\n\n If this job is present in the lazily evaluated dictionary passed to the\n constructor, but has not yet been constructed, only this job is constructed, and\n will be cached for future calls.\n\n Args:\n name (str): Name of the job to retrieve.\n\n Returns:\n JobDefinition: The job definition corresponding to\n the given name.\n """\n return self._repository_data.get_job(name)
\n\n
[docs] def get_all_jobs(self) -> List[JobDefinition]:\n """Return all jobs in the repository as a list.\n\n Note that this will construct any job in the lazily evaluated dictionary that has\n not yet been constructed.\n\n Returns:\n List[JobDefinition]: All jobs in the repository.\n """\n return self._repository_data.get_all_jobs()
\n\n @property\n def partition_set_defs(self) -> List[PartitionSetDefinition]:\n return self._repository_data.get_all_partition_sets()\n\n def get_partition_set_def(self, name: str) -> PartitionSetDefinition:\n return self._repository_data.get_partition_set(name)\n\n @property\n def schedule_defs(self) -> List[ScheduleDefinition]:\n return self._repository_data.get_all_schedules()\n\n def get_schedule_def(self, name: str) -> ScheduleDefinition:\n return self._repository_data.get_schedule(name)\n\n def has_schedule_def(self, name: str) -> bool:\n return self._repository_data.has_schedule(name)\n\n @property\n def sensor_defs(self) -> List[SensorDefinition]:\n return self._repository_data.get_all_sensors()\n\n def get_sensor_def(self, name: str) -> SensorDefinition:\n return self._repository_data.get_sensor(name)\n\n def has_sensor_def(self, name: str) -> bool:\n return self._repository_data.has_sensor(name)\n\n @property\n def source_assets_by_key(self) -> Dict[AssetKey, SourceAsset]:\n return self._repository_data.get_source_assets_by_key()\n\n # If definition comes from the @repository decorator, then the __call__ method will be\n # overwritten. Therefore, we want to maintain the call-ability of repository definitions.\n def __call__(self, *args, **kwargs):\n return self
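# ------------------------------------------------------------------------
# Illustrative usage sketch (not part of the dagster source). It shows how the
# accessors above are typically exercised against a RepositoryDefinition built
# with the @repository decorator; the op/job/repository names are made up.

from dagster import job, op, repository


@op
def do_something():
    pass


@job
def do_it_all():
    do_something()


@repository
def my_repository():
    return [do_it_all]


assert my_repository.has_job("do_it_all")
assert "do_it_all" in my_repository.job_names
do_it_all_def = my_repository.get_job("do_it_all")  # a JobDefinition, cached for later calls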
\n\n\ndef _process_and_validate_target(\n schedule_or_sensor_def: Union[SensorDefinition, ScheduleDefinition],\n coerced_graphs: Dict[str, JobDefinition],\n pipelines_or_jobs: Dict[str, PipelineDefinition],\n target: Union[GraphDefinition, PipelineDefinition],\n):\n # This function modifies the state of coerced_graphs.\n targeter = (\n f"schedule '{schedule_or_sensor_def.name}'"\n if isinstance(schedule_or_sensor_def, ScheduleDefinition)\n else f"sensor '{schedule_or_sensor_def.name}'"\n )\n if isinstance(target, GraphDefinition):\n if target.name not in coerced_graphs:\n # Since this is a graph that we have to coerce, it is not possible for it to be\n # the same definition by reference equality\n if target.name in pipelines_or_jobs:\n dupe_target_type = pipelines_or_jobs[target.name].target_type\n warnings.warn(\n _get_error_msg_for_target_conflict(\n targeter, "graph", target.name, dupe_target_type\n )\n )\n elif coerced_graphs[target.name].graph != target:\n warnings.warn(\n _get_error_msg_for_target_conflict(targeter, "graph", target.name, "graph")\n )\n coerced_job = target.coerce_to_job()\n coerced_graphs[target.name] = coerced_job\n pipelines_or_jobs[target.name] = coerced_job\n else:\n if target.name in pipelines_or_jobs and pipelines_or_jobs[target.name] != target:\n dupe_target_type = (\n pipelines_or_jobs[target.name].target_type\n if target.name not in coerced_graphs\n else "graph"\n )\n warnings.warn(\n _get_error_msg_for_target_conflict(\n targeter, target.target_type, target.name, dupe_target_type\n )\n )\n pipelines_or_jobs[target.name] = target\n\n\ndef _get_error_msg_for_target_conflict(targeter, target_type, target_name, dupe_target_type):\n return f"{targeter} targets {target_type} '{target_name}', but a different {dupe_target_type} with the same name was provided. The {target_type} provided to {targeter} will override the existing {dupe_target_type}, but in Dagster 0.15.0, this will result in an error. Disambiguate between these by providing a separate name to one of them."\n
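# ------------------------------------------------------------------------
# Illustrative usage sketch (not part of the dagster source). CachingRepositoryData
# backs the dict form of @repository described in from_dict above: values are
# zero-argument callables invoked lazily, and each key must match the name of the
# definition its callable returns. The job/op names here are made up.

from dagster import job, op, repository


@op
def expensive_op():
    pass


def make_expensive_job():
    @job
    def expensive_job():
        expensive_op()

    return expensive_job


@repository
def lazy_repository():
    # "expensive_job" is not constructed until it is first requested by name.
    return {"jobs": {"expensive_job": make_expensive_job}}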
", "current_page_name": "_modules/dagster/core/definitions/repository_definition", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.repository_definition"}, "resource_definition": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.resource_definition

\nfrom functools import update_wrapper\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    List,\n    Optional,\n    Union,\n    cast,\n    overload,\n)\n\nimport dagster._check as check\nfrom dagster.config.config_schema import ConfigSchemaType\nfrom dagster.core.decorator_utils import format_docstring_for_description\nfrom dagster.core.definitions.config import is_callable_valid_config_arg\nfrom dagster.core.definitions.configurable import AnonymousConfigurableDefinition\nfrom dagster.core.errors import DagsterInvalidDefinitionError, DagsterInvalidInvocationError\nfrom dagster.seven import funcsigs\nfrom dagster.utils.backcompat import experimental_arg_warning\n\nfrom ..decorator_utils import (\n    get_function_params,\n    is_required_param,\n    positional_arg_name_list,\n    validate_expected_params,\n)\nfrom .definition_config_schema import (\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\nfrom .resource_invocation import resource_invocation_result\n\n# pylint: disable=unused-import\nfrom .scoped_resources_builder import (  # type: ignore\n    IContainsGenerator,\n    Resources,\n    ScopedResourcesBuilder,\n)\n\nif TYPE_CHECKING:\n    from dagster.core.execution.resources_init import InitResourceContext\n\n\ndef is_context_provided(params: List[funcsigs.Parameter]) -> bool:\n    return len(params) >= 1\n\n\n
[docs]class ResourceDefinition(AnonymousConfigurableDefinition):\n """Core class for defining resources.\n\n Resources are scoped ways to make external resources (like database connections) available to\n ops during job execution and to clean up after execution resolves.\n\n If resource_fn yields once rather than returning (in the manner of functions decorable with\n :py:func:`@contextlib.contextmanager <python:contextlib.contextmanager>`) then the body of the\n function after the yield will be run after execution resolves, allowing users to write their\n own teardown/cleanup logic.\n\n Depending on your executor, resources may be instantiated and cleaned up more than once in a\n job execution.\n\n Args:\n resource_fn (Callable[[InitResourceContext], Any]): User-provided function to instantiate\n the resource, which will be made available to executions keyed on the\n ``context.resources`` object.\n config_schema (Optional[ConfigSchema]): The schema for the config. If set, Dagster will check\n that config provided for the resource matches this schema and fail if it does not. If\n not set, Dagster will accept any config provided for the resource.\n description (Optional[str]): A human-readable description of the resource.\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by this\n resource. A DagsterInvariantViolationError will be raised during initialization if\n dependencies are cyclic.\n version (Optional[str]): (Experimental) The version of the resource's definition fn. Two\n wrapped resource functions should only have the same version if they produce the same\n resource definition when provided with the same inputs.\n """\n\n def __init__(\n self,\n resource_fn: Callable[["InitResourceContext"], Any],\n config_schema: Optional[Union[Any, ConfigSchemaType]] = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n ):\n self._resource_fn = check.callable_param(resource_fn, "resource_fn")\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._description = check.opt_str_param(description, "description")\n self._required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys"\n )\n self._version = check.opt_str_param(version, "version")\n if version:\n experimental_arg_warning("version", "ResourceDefinition.__init__")\n\n @property\n def resource_fn(self) -> Callable[..., Any]:\n return self._resource_fn\n\n @property\n def config_schema(self) -> IDefinitionConfigSchema:\n return self._config_schema\n\n @property\n def description(self) -> Optional[str]:\n return self._description\n\n @property\n def version(self) -> Optional[str]:\n return self._version\n\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n return self._required_resource_keys\n\n
[docs] @staticmethod\n def none_resource(description: Optional[str] = None) -> "ResourceDefinition":\n """A helper function that returns a none resource.\n\n Args:\n description ([Optional[str]]): The description of the resource. Defaults to None.\n\n Returns:\n [ResourceDefinition]: A resource that does nothing.\n """\n return ResourceDefinition.hardcoded_resource(value=None, description=description)
\n\n
[docs] @staticmethod\n def hardcoded_resource(value: Any, description: Optional[str] = None) -> "ResourceDefinition":\n """A helper function that creates a ``ResourceDefinition`` with a hardcoded object.\n\n Args:\n value (Any): The value that will be accessible via context.resources.resource_name.\n description ([Optional[str]]): The description of the resource. Defaults to None.\n\n Returns:\n [ResourceDefinition]: A hardcoded resource.\n """\n return ResourceDefinition(resource_fn=lambda _init_context: value, description=description)
\n\n
[docs] @staticmethod\n def mock_resource(description: Optional[str] = None) -> "ResourceDefinition":\n """A helper function that creates a ``ResourceDefinition`` which wraps a ``mock.MagicMock``.\n\n Args:\n description ([Optional[str]]): The description of the resource. Defaults to None.\n\n Returns:\n [ResourceDefinition]: A resource that creates the magic methods automatically and helps\n you mock existing resources.\n """\n from unittest import mock\n\n return ResourceDefinition(\n resource_fn=lambda _init_context: mock.MagicMock(), description=description\n )
\n\n @staticmethod\n def string_resource(description: Optional[str] = None) -> "ResourceDefinition":\n return ResourceDefinition(\n resource_fn=lambda init_context: init_context.resource_config,\n config_schema=str,\n description=description,\n )\n\n def copy_for_configured(\n self, description: Optional[str], config_schema: IDefinitionConfigSchema, _\n ) -> "ResourceDefinition":\n return ResourceDefinition(\n config_schema=config_schema,\n description=description or self.description,\n resource_fn=self.resource_fn,\n required_resource_keys=self.required_resource_keys,\n version=self.version,\n )\n\n def __call__(self, *args, **kwargs):\n from dagster.core.execution.resources_init import InitResourceContext\n\n context_provided = is_context_provided(get_function_params(self.resource_fn))\n\n if context_provided:\n if len(args) + len(kwargs) == 0:\n raise DagsterInvalidInvocationError(\n "Resource initialization function has context argument, but no context was provided "\n "when invoking."\n )\n if len(args) + len(kwargs) > 1:\n raise DagsterInvalidInvocationError(\n "Initialization of resource received multiple arguments. Only a first "\n "positional context parameter should be provided when invoking."\n )\n\n context_param_name = get_function_params(self.resource_fn)[0].name\n\n if args:\n check.opt_inst_param(args[0], context_param_name, InitResourceContext)\n return resource_invocation_result(self, args[0])\n else:\n if context_param_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Resource initialization expected argument '{context_param_name}'."\n )\n check.opt_inst_param(\n kwargs[context_param_name], context_param_name, InitResourceContext\n )\n\n return resource_invocation_result(self, kwargs[context_param_name])\n else:\n return resource_invocation_result(self, None)
\n\n\nclass _ResourceDecoratorCallable:\n def __init__(\n self,\n config_schema: Optional[Dict[str, Any]] = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n ):\n self.config_schema = config_schema # checked by underlying definition\n self.description = check.opt_str_param(description, "description")\n self.version = check.opt_str_param(version, "version")\n self.required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys"\n )\n\n def __call__(self, resource_fn: Callable[["InitResourceContext"], Any]):\n check.callable_param(resource_fn, "resource_fn")\n\n any_name = ["*"] if is_context_provided(get_function_params(resource_fn)) else []\n\n params = get_function_params(resource_fn)\n\n missing_positional = validate_expected_params(params, any_name)\n if missing_positional:\n raise DagsterInvalidDefinitionError(\n f"@resource decorated function '{resource_fn.__name__}' expects a single "\n "positional argument."\n )\n\n extras = params[len(any_name) :]\n\n required_extras = list(filter(is_required_param, extras))\n if required_extras:\n raise DagsterInvalidDefinitionError(\n f"@resource decorated function '{resource_fn.__name__}' expects only a single positional required argument. "\n f"Got required extra params {', '.join(positional_arg_name_list(required_extras))}"\n )\n\n resource_def = ResourceDefinition(\n resource_fn=resource_fn,\n config_schema=self.config_schema,\n description=self.description or format_docstring_for_description(resource_fn),\n version=self.version,\n required_resource_keys=self.required_resource_keys,\n )\n\n update_wrapper(resource_def, wrapped=resource_fn)\n\n return resource_def\n\n\n@overload\ndef resource(config_schema=Callable[["InitResourceContext"], Any]) -> ResourceDefinition:\n ...\n\n\n@overload\ndef resource(\n config_schema: Optional[ConfigSchemaType] = ...,\n description: Optional[str] = ...,\n required_resource_keys: Optional[AbstractSet[str]] = ...,\n version: Optional[str] = ...,\n) -> Callable[[Callable[["InitResourceContext"], Any]], "ResourceDefinition"]:\n ...\n\n\n
[docs]def resource(\n config_schema: Union[Callable[["InitResourceContext"], Any], Optional[ConfigSchemaType]] = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n) -> Union[\n Callable[[Callable[["InitResourceContext"], Any]], "ResourceDefinition"], "ResourceDefinition"\n]:\n """Define a resource.\n\n The decorated function should accept an :py:class:`InitResourceContext` and return an instance of\n the resource. This function will become the ``resource_fn`` of an underlying\n :py:class:`ResourceDefinition`.\n\n If the decorated function yields once rather than returning (in the manner of functions\n decorable with :py:func:`@contextlib.contextmanager <python:contextlib.contextmanager>`) then\n the body of the function after the yield will be run after execution resolves, allowing users\n to write their own teardown/cleanup logic.\n\n Args:\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.resource_config`. If not set, Dagster will accept any config provided.\n description(Optional[str]): A human-readable description of the resource.\n version (Optional[str]): (Experimental) The version of a resource function. Two wrapped\n resource functions should only have the same version if they produce the same resource\n definition when provided with the same inputs.\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by this resource.\n """\n\n # This case is for when decorator is used bare, without arguments.\n # E.g. @resource versus @resource()\n if callable(config_schema) and not is_callable_valid_config_arg(config_schema):\n return _ResourceDecoratorCallable()(config_schema) # type: ignore\n\n def _wrap(resource_fn: Callable[["InitResourceContext"], Any]) -> "ResourceDefinition":\n return _ResourceDecoratorCallable(\n config_schema=cast(Optional[Dict[str, Any]], config_schema),\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n )(resource_fn)\n\n return _wrap
\n\n\n
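# ------------------------------------------------------------------------
# Illustrative usage sketch (not part of the dagster source). It shows the @resource
# decorator defined above with a config_schema and a yield-based teardown, wired into
# a job via resource_defs; FakeClient is a made-up stand-in for a real connection.

from dagster import job, op, resource


class FakeClient:
    def __init__(self, conn_string):
        self.conn_string = conn_string

    def close(self):
        pass


@resource(config_schema={"conn_string": str})
def client_resource(init_context):
    client = FakeClient(init_context.resource_config["conn_string"])
    try:
        yield client  # available to ops as context.resources.client
    finally:
        client.close()  # the body after the yield runs after execution resolves


@op(required_resource_keys={"client"})
def use_client(context):
    context.log.info(f"connected with {context.resources.client.conn_string}")


@job(resource_defs={"client": client_resource})
def client_job():
    use_client()


# Config for the resource is supplied under resources.<key>.config at launch time, e.g.
# client_job.execute_in_process(
#     run_config={"resources": {"client": {"config": {"conn_string": "..."}}}}
# )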
[docs]def make_values_resource(**kwargs: Any) -> ResourceDefinition:\n """A helper function that creates a ``ResourceDefinition`` to take in user-defined values.\n\n This is useful for sharing values between ops.\n\n Args:\n **kwargs: Arbitrary keyword arguments that will be passed to the config schema of the\n returned resource definition. If not set, Dagster will accept any config provided for\n the resource.\n\n For example:\n\n .. code-block:: python\n\n @op(required_resource_keys={"globals"})\n def my_op(context):\n print(context.resources.globals["my_str_var"])\n\n @job(resource_defs={"globals": make_values_resource(my_str_var=str, my_int_var=int)})\n def my_job():\n my_op()\n\n Returns:\n ResourceDefinition: A resource that passes in user-defined values.\n """\n\n return ResourceDefinition(\n resource_fn=lambda init_context: init_context.resource_config,\n config_schema=kwargs or Any,\n )
\n
", "current_page_name": "_modules/dagster/core/definitions/resource_definition", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.resource_definition"}, "run_request": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.run_request

\nfrom enum import Enum\nfrom typing import Any, Mapping, NamedTuple, Optional\n\nimport dagster._check as check\nfrom dagster.core.storage.pipeline_run import PipelineRun, PipelineRunStatus\nfrom dagster.serdes.serdes import register_serdes_enum_fallbacks, whitelist_for_serdes\nfrom dagster.utils.error import SerializableErrorInfo\n\n\n@whitelist_for_serdes\nclass InstigatorType(Enum):\n    SCHEDULE = "SCHEDULE"\n    SENSOR = "SENSOR"\n\n\nregister_serdes_enum_fallbacks({"JobType": InstigatorType})\n# for internal backcompat\nJobType = InstigatorType\n\n\n
[docs]@whitelist_for_serdes\nclass SkipReason(NamedTuple("_SkipReason", [("skip_message", Optional[str])])):\n """\n Represents a skipped evaluation, where no runs are requested. May contain a message to indicate\n why no runs were requested.\n\n Attributes:\n skip_message (Optional[str]): A message displayed in dagit for why this evaluation resulted\n in no requested runs.\n """\n\n def __new__(cls, skip_message: Optional[str] = None):\n return super(SkipReason, cls).__new__(\n cls,\n skip_message=check.opt_str_param(skip_message, "skip_message"),\n )
\n\n\n
[docs]@whitelist_for_serdes\nclass RunRequest(\n NamedTuple(\n "_RunRequest",\n [\n ("run_key", Optional[str]),\n ("run_config", Mapping[str, Any]),\n ("tags", Mapping[str, str]),\n ("job_name", Optional[str]),\n ],\n )\n):\n """\n Represents all the information required to launch a single run. Must be returned by a\n SensorDefinition or ScheduleDefinition's evaluation function for a run to be launched.\n\n Attributes:\n run_key (str | None): A string key to identify this launched run. For sensors, ensures that\n only one run is created per run key across all sensor evaluations. For schedules,\n ensures that one run is created per tick, across failure recoveries. Passing in a `None`\n value means that a run will always be launched per evaluation.\n run_config (Optional[Dict]): The config that parameterizes the run execution to\n be launched, as a dict.\n tags (Optional[Dict[str, str]]): A dictionary of tags (string key-value pairs) to attach\n to the launched run.\n job_name (Optional[str]): (Experimental) The name of the job this run request will launch.\n Required for sensors that target multiple jobs.\n """\n\n def __new__(\n cls,\n run_key: Optional[str],\n run_config: Optional[Mapping[str, Any]] = None,\n tags: Optional[Mapping[str, str]] = None,\n job_name: Optional[str] = None,\n ):\n return super(RunRequest, cls).__new__(\n cls,\n run_key=check.opt_str_param(run_key, "run_key"),\n run_config=check.opt_dict_param(run_config, "run_config", key_type=str),\n tags=check.opt_dict_param(tags, "tags", key_type=str, value_type=str),\n job_name=check.opt_str_param(job_name, "job_name"),\n )
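# ------------------------------------------------------------------------
# Illustrative usage sketch (not part of the dagster source). A sensor evaluation
# function yields the RunRequest / SkipReason objects defined above; my_job and the
# list_new_files helper are made up for the example.

from dagster import RunRequest, SkipReason, job, op, sensor


@op(config_schema={"filename": str})
def process_file(context):
    context.log.info(context.op_config["filename"])


@job
def my_job():
    process_file()


def list_new_files():
    # Hypothetical helper; in practice this would poll an external system.
    return []


@sensor(job=my_job)
def my_file_sensor(_context):
    filenames = list_new_files()
    if not filenames:
        yield SkipReason("No new files found.")
        return
    for filename in filenames:
        # run_key de-duplicates: at most one run is launched per key across evaluations.
        yield RunRequest(
            run_key=filename,
            run_config={"ops": {"process_file": {"config": {"filename": filename}}}},
        )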
\n\n\n@whitelist_for_serdes\nclass PipelineRunReaction(\n NamedTuple(\n "_PipelineRunReaction",\n [\n ("pipeline_run", Optional[PipelineRun]),\n ("error", Optional[SerializableErrorInfo]),\n ("run_status", Optional[PipelineRunStatus]),\n ],\n )\n):\n """\n Represents a request that reacts to an existing pipeline run. On success, it will report logs\n back to the run.\n\n Attributes:\n pipeline_run (Optional[PipelineRun]): The pipeline run that originates this reaction.\n error (Optional[SerializableErrorInfo]): The error raised in user code, if any.\n run_status (Optional[PipelineRunStatus]): The run status that triggered the reaction.\n """\n\n def __new__(\n cls,\n pipeline_run: Optional[PipelineRun],\n error: Optional[SerializableErrorInfo] = None,\n run_status: Optional[PipelineRunStatus] = None,\n ):\n return super(PipelineRunReaction, cls).__new__(\n cls,\n pipeline_run=check.opt_inst_param(pipeline_run, "pipeline_run", PipelineRun),\n error=check.opt_inst_param(error, "error", SerializableErrorInfo),\n run_status=check.opt_inst_param(run_status, "run_status", PipelineRunStatus),\n )\n
", "current_page_name": "_modules/dagster/core/definitions/run_request", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.run_request"}, "run_status_sensor_definition": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.run_status_sensor_definition

\nimport warnings\nfrom datetime import datetime\nfrom typing import Any, Callable, List, NamedTuple, Optional, Union, cast\n\nimport pendulum\n\nimport dagster._check as check\nfrom dagster.core.definitions import GraphDefinition, PipelineDefinition\nfrom dagster.core.definitions.sensor_definition import (\n    DefaultSensorStatus,\n    PipelineRunReaction,\n    SensorDefinition,\n    SensorEvaluationContext,\n    SkipReason,\n    is_context_provided,\n)\nfrom dagster.core.errors import (\n    DagsterInvalidInvocationError,\n    RunStatusSensorExecutionError,\n    user_code_error_boundary,\n)\nfrom dagster.core.events import PIPELINE_RUN_STATUS_TO_EVENT_TYPE, DagsterEvent\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.storage.pipeline_run import DagsterRun, PipelineRun, PipelineRunStatus, RunsFilter\nfrom dagster.serdes import (\n    deserialize_json_to_dagster_namedtuple,\n    serialize_dagster_namedtuple,\n    whitelist_for_serdes,\n)\nfrom dagster.serdes.errors import DeserializationError\nfrom dagster.serdes.serdes import register_serdes_tuple_fallbacks\nfrom dagster.seven import JSONDecodeError\nfrom dagster.utils import utc_datetime_from_timestamp\nfrom dagster.utils.error import serializable_error_info_from_exc_info\n\nfrom ..decorator_utils import get_function_params\n\n\n@whitelist_for_serdes\nclass RunStatusSensorCursor(\n    NamedTuple(\n        "_RunStatusSensorCursor",\n        [("record_id", int), ("update_timestamp", str)],\n    )\n):\n    def __new__(cls, record_id, update_timestamp):\n\n        return super(RunStatusSensorCursor, cls).__new__(\n            cls,\n            record_id=check.int_param(record_id, "record_id"),\n            update_timestamp=check.str_param(update_timestamp, "update_timestamp"),\n        )\n\n    @staticmethod\n    def is_valid(json_str: str) -> bool:\n        try:\n            obj = deserialize_json_to_dagster_namedtuple(json_str)\n            return isinstance(obj, RunStatusSensorCursor)\n        except (JSONDecodeError, DeserializationError):\n            return False\n\n    def to_json(self) -> str:\n        return serialize_dagster_namedtuple(cast(NamedTuple, self))\n\n    @staticmethod\n    def from_json(json_str: str) -> tuple:\n        return deserialize_json_to_dagster_namedtuple(json_str)\n\n\n# handle backcompat\nregister_serdes_tuple_fallbacks({"PipelineSensorCursor": RunStatusSensorCursor})\n\n\n
[docs]class RunStatusSensorContext(\n NamedTuple(\n "_RunStatusSensorContext",\n [\n ("sensor_name", str),\n ("dagster_run", DagsterRun),\n ("dagster_event", DagsterEvent),\n ("instance", DagsterInstance),\n ],\n )\n):\n """The ``context`` object available to a decorated function of ``run_status_sensor``.\n\n Attributes:\n sensor_name (str): the name of the sensor.\n dagster_run (DagsterRun): the run of the job or pipeline.\n dagster_event (DagsterEvent): the event associated with the job or pipeline run status.\n instance (DagsterInstance): the current instance.\n """\n\n def __new__(cls, sensor_name, dagster_run, dagster_event, instance):\n\n return super(RunStatusSensorContext, cls).__new__(\n cls,\n sensor_name=check.str_param(sensor_name, "sensor_name"),\n dagster_run=check.inst_param(dagster_run, "dagster_run", DagsterRun),\n dagster_event=check.inst_param(dagster_event, "dagster_event", DagsterEvent),\n instance=check.inst_param(instance, "instance", DagsterInstance),\n )\n\n def for_pipeline_failure(self):\n return PipelineFailureSensorContext(\n sensor_name=self.sensor_name,\n dagster_run=self.dagster_run,\n dagster_event=self.dagster_event,\n instance=self.instance,\n )\n\n
[docs] def for_run_failure(self):\n """Converts RunStatusSensorContext to RunFailureSensorContext."""\n return RunFailureSensorContext(\n sensor_name=self.sensor_name,\n dagster_run=self.dagster_run,\n dagster_event=self.dagster_event,\n instance=self.instance,\n )
\n\n @property\n def pipeline_run(self) -> PipelineRun:\n warnings.warn(\n "`RunStatusSensorContext.pipeline_run` is deprecated as of 0.13.0; use "\n "`RunStatusSensorContext.dagster_run` instead."\n )\n return self.dagster_run
\n\n\n
[docs]class PipelineFailureSensorContext(RunStatusSensorContext):\n """The ``context`` object available to a decorated function of ``pipeline_failure_sensor``.\n\n Attributes:\n sensor_name (str): the name of the sensor.\n pipeline_run (PipelineRun): the failed pipeline run.\n failure_event (DagsterEvent): the pipeline failure event.\n """\n\n @property\n def failure_event(self):\n return self.dagster_event
\n\n\n
[docs]class RunFailureSensorContext(RunStatusSensorContext):\n """The ``context`` object available to a decorated function of ``run_failure_sensor``.\n\n Attributes:\n sensor_name (str): the name of the sensor.\n pipeline_run (PipelineRun): the failed pipeline run.\n failure_event (DagsterEvent): the pipeline failure event.\n """\n\n @property\n def failure_event(self):\n return self.dagster_event
\n\n\n
[docs]def build_run_status_sensor_context(\n sensor_name: str,\n dagster_event: DagsterEvent,\n dagster_instance: DagsterInstance,\n dagster_run: DagsterRun,\n) -> RunStatusSensorContext:\n """\n Builds run status sensor context from provided parameters.\n\n This function can be used to provide the context argument when directly invoking a function\n decorated with `@run_status_sensor` or `@run_failure_sensor`, such as when writing unit tests.\n\n Args:\n sensor_name (str): The name of the sensor the context is being constructed for.\n dagster_event (DagsterEvent): A DagsterEvent with the same event type as the one that\n triggers the run_status_sensor\n dagster_instance (DagsterInstance): The dagster instance configured for the context.\n dagster_run (DagsterRun): DagsterRun object from running a job\n\n Examples:\n .. code-block:: python\n\n instance = DagsterInstance.ephemeral()\n result = my_job.execute_in_process(instance=instance)\n\n dagster_run = result.dagster_run\n dagster_event = result.get_job_success_event() # or get_job_failure_event()\n\n context = build_run_status_sensor_context(\n sensor_name="run_status_sensor_to_invoke",\n dagster_instance=instance,\n dagster_run=dagster_run,\n dagster_event=dagster_event,\n )\n run_status_sensor_to_invoke(context)\n """\n\n return RunStatusSensorContext(\n sensor_name=sensor_name,\n instance=dagster_instance,\n dagster_run=dagster_run,\n dagster_event=dagster_event,\n )
\n\n\n
[docs]def pipeline_failure_sensor(\n name: Optional[Union[Callable[..., Any], str]] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n pipeline_selection: Optional[List[str]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n) -> Callable[\n [Callable[[PipelineFailureSensorContext], Union[SkipReason, PipelineRunReaction]]],\n SensorDefinition,\n]:\n """\n Creates a sensor that reacts to pipeline failure events, where the decorated function will be\n run when a pipeline run fails.\n\n Takes a :py:class:`~dagster.PipelineFailureSensorContext`.\n\n Args:\n name (Optional[str]): The name of the pipeline failure sensor. Defaults to the name of the\n decorated function.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n pipeline_selection (Optional[List[str]]): Names of the pipelines that will be monitored by\n this failure sensor. Defaults to None, which means the alert will be sent when any\n pipeline in the repository fails.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n """\n\n def inner(\n fn: Callable[[PipelineFailureSensorContext], Union[SkipReason, PipelineRunReaction]]\n ) -> SensorDefinition:\n check.callable_param(fn, "fn")\n if name is None or callable(name):\n sensor_name = fn.__name__\n else:\n sensor_name = name\n\n @run_status_sensor(\n pipeline_run_status=PipelineRunStatus.FAILURE,\n pipeline_selection=pipeline_selection,\n name=sensor_name,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n default_status=default_status,\n )\n def _pipeline_failure_sensor(context: RunStatusSensorContext):\n fn(context.for_pipeline_failure())\n\n return _pipeline_failure_sensor\n\n # This case is for when decorator is used bare, without arguments, i.e. @pipeline_failure_sensor\n if callable(name):\n return inner(name)\n\n return inner
\n\n\n
[docs]def run_failure_sensor(\n name: Optional[Union[Callable[..., Any], str]] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job_selection: Optional[List[Union[PipelineDefinition, GraphDefinition]]] = None,\n pipeline_selection: Optional[List[str]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n) -> Callable[\n [Callable[[RunFailureSensorContext], Union[SkipReason, PipelineRunReaction]]],\n SensorDefinition,\n]:\n """\n Creates a sensor that reacts to job failure events, where the decorated function will be\n run when a run fails.\n\n Takes a :py:class:`~dagster.RunFailureSensorContext`.\n\n Args:\n name (Optional[str]): The name of the job failure sensor. Defaults to the name of the\n decorated function.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job_selection (Optional[List[Union[JobDefinition, GraphDefinition]]]): The jobs that\n will be monitored by this failure sensor. Defaults to None, which means the alert will\n be sent when any job in the repository fails.\n pipeline_selection (Optional[List[str]]): (legacy) Names of the pipelines that will be monitored by\n this sensor. Defaults to None, which means the alert will be sent when any pipeline in\n the repository fails.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n """\n\n def inner(\n fn: Callable[[RunFailureSensorContext], Union[SkipReason, PipelineRunReaction]]\n ) -> SensorDefinition:\n check.callable_param(fn, "fn")\n if name is None or callable(name):\n sensor_name = fn.__name__\n else:\n sensor_name = name\n\n @run_status_sensor(\n pipeline_run_status=PipelineRunStatus.FAILURE,\n name=sensor_name,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job_selection=job_selection,\n pipeline_selection=pipeline_selection,\n default_status=default_status,\n )\n def _run_failure_sensor(context: RunStatusSensorContext):\n fn(context.for_run_failure())\n\n return _run_failure_sensor\n\n # This case is for when decorator is used bare, without arguments, i.e. @pipeline_failure_sensor\n if callable(name):\n return inner(name)\n\n return inner
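# ------------------------------------------------------------------------
# Illustrative usage sketch (not part of the dagster source). The decorator above can
# be used bare; `notify` is a made-up stand-in for a real alerting mechanism.

from dagster import RunFailureSensorContext, run_failure_sensor


def notify(message):
    print(message)  # hypothetical delivery (email, Slack, ...)


@run_failure_sensor
def report_job_failure(context: RunFailureSensorContext):
    notify(
        f"Job '{context.dagster_run.pipeline_name}' failed: {context.failure_event.message}"
    )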
\n\n\n
[docs]class RunStatusSensorDefinition(SensorDefinition):\n """\n Define a sensor that reacts to a given status of pipeline execution, where the decorated\n function will be evaluated when a run is at the given status.\n\n Args:\n name (str): The name of the sensor. Defaults to the name of the decorated function.\n pipeline_run_status (PipelineRunStatus): The status of a run which will be\n monitored by the sensor.\n run_status_sensor_fn (Callable[[RunStatusSensorContext], Union[SkipReason, PipelineRunReaction]]): The core\n evaluation function for the sensor. Takes a :py:class:`~dagster.RunStatusSensorContext`.\n pipeline_selection (Optional[List[str]]): (legacy) Names of the pipelines that will be monitored by\n this sensor. Defaults to None, which means the alert will be sent when any pipeline in\n the repository fails.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job_selection (Optional[List[Union[JobDefinition, GraphDefinition]]]): The jobs that\n will be monitored by this sensor. Defaults to None, which means the alert will be sent\n when any job in the repository fails.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n """\n\n def __init__(\n self,\n name: str,\n pipeline_run_status: PipelineRunStatus,\n run_status_sensor_fn: Callable[\n [RunStatusSensorContext], Union[SkipReason, PipelineRunReaction]\n ],\n pipeline_selection: Optional[List[str]] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job_selection: Optional[List[Union[PipelineDefinition, GraphDefinition]]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n ):\n\n from dagster.core.storage.event_log.base import EventRecordsFilter, RunShardedEventsCursor\n\n check.str_param(name, "name")\n check.inst_param(pipeline_run_status, "pipeline_run_status", PipelineRunStatus)\n check.callable_param(run_status_sensor_fn, "run_status_sensor_fn")\n check.opt_list_param(pipeline_selection, "pipeline_selection", str)\n check.opt_int_param(minimum_interval_seconds, "minimum_interval_seconds")\n check.opt_str_param(description, "description")\n check.opt_list_param(job_selection, "job_selection", (PipelineDefinition, GraphDefinition))\n check.inst_param(default_status, "default_status", DefaultSensorStatus)\n\n self._run_status_sensor_fn = check.callable_param(\n run_status_sensor_fn, "run_status_sensor_fn"\n )\n event_type = PIPELINE_RUN_STATUS_TO_EVENT_TYPE[pipeline_run_status]\n\n def _wrapped_fn(context: SensorEvaluationContext):\n # initiate the cursor to (most recent event id, current timestamp) when:\n # * it's the first time starting the sensor\n # * or, the cursor isn't in valid format (backcompt)\n if context.cursor is None or not RunStatusSensorCursor.is_valid(context.cursor):\n most_recent_event_records = list(\n context.instance.get_event_records(\n EventRecordsFilter(event_type=event_type), ascending=False, limit=1\n )\n )\n most_recent_event_id = (\n most_recent_event_records[0].storage_id\n if len(most_recent_event_records) == 1\n else -1\n )\n\n new_cursor = RunStatusSensorCursor(\n update_timestamp=pendulum.now("UTC").isoformat(),\n record_id=most_recent_event_id,\n )\n context.update_cursor(new_cursor.to_json())\n yield SkipReason(f"Initiating {name}. 
Set cursor to {new_cursor}")\n return\n\n record_id, update_timestamp = RunStatusSensorCursor.from_json(context.cursor)\n\n # Fetch events after the cursor id\n # * we move the cursor forward to the latest visited event's id to avoid revisits\n # * when the daemon is down, bc we persist the cursor info, we can go back to where we\n # left and backfill alerts for the qualified events (up to 5 at a time) during the downtime\n # Note: this is a cross-run query which requires extra handling in sqlite, see details in SqliteEventLogStorage.\n event_records = context.instance.get_event_records(\n EventRecordsFilter(\n after_cursor=RunShardedEventsCursor(\n id=record_id,\n run_updated_after=cast(datetime, pendulum.parse(update_timestamp)),\n ),\n event_type=event_type,\n ),\n ascending=True,\n limit=5,\n )\n\n for event_record in event_records:\n event_log_entry = event_record.event_log_entry\n storage_id = event_record.storage_id\n\n # get run info\n run_records = context.instance.get_run_records(\n filters=RunsFilter(run_ids=[event_log_entry.run_id])\n )\n\n # skip if we couldn't find the right run\n if len(run_records) != 1:\n # bc we couldn't find the run, we use the event timestamp as the approximate\n # run update timestamp\n approximate_update_timestamp = utc_datetime_from_timestamp(\n event_log_entry.timestamp\n )\n context.update_cursor(\n RunStatusSensorCursor(\n record_id=storage_id,\n update_timestamp=approximate_update_timestamp.isoformat(),\n ).to_json()\n )\n continue\n\n pipeline_run = run_records[0].pipeline_run\n update_timestamp = run_records[0].update_timestamp\n\n # skip if any of of the followings happens:\n if (\n # the pipeline does not have a repository (manually executed)\n not pipeline_run.external_pipeline_origin\n or\n # the pipeline does not belong to the current repository\n pipeline_run.external_pipeline_origin.external_repository_origin.repository_name\n != context.repository_name\n or\n # if pipeline is not selected\n (pipeline_selection and pipeline_run.pipeline_name not in pipeline_selection)\n or\n # if job not selected\n (\n job_selection\n and pipeline_run.pipeline_name not in map(lambda x: x.name, job_selection)\n )\n ):\n context.update_cursor(\n RunStatusSensorCursor(\n record_id=storage_id, update_timestamp=update_timestamp.isoformat()\n ).to_json()\n )\n continue\n\n serializable_error = None\n\n try:\n with user_code_error_boundary(\n RunStatusSensorExecutionError,\n lambda: f'Error occurred during the execution sensor "{name}".',\n ):\n # one user code invocation maps to one failure event\n run_status_sensor_fn(\n RunStatusSensorContext(\n sensor_name=name,\n dagster_run=pipeline_run,\n dagster_event=event_log_entry.dagster_event,\n instance=context.instance,\n )\n )\n except RunStatusSensorExecutionError as run_status_sensor_execution_error:\n # When the user code errors, we report error to the sensor tick not the original run.\n serializable_error = serializable_error_info_from_exc_info(\n run_status_sensor_execution_error.original_exc_info\n )\n\n context.update_cursor(\n RunStatusSensorCursor(\n record_id=storage_id, update_timestamp=update_timestamp.isoformat()\n ).to_json()\n )\n\n # Yield PipelineRunReaction to indicate the execution success/failure.\n # The sensor machinery would\n # * report back to the original run if success\n # * update cursor and job state\n yield PipelineRunReaction(\n pipeline_run=pipeline_run,\n run_status=pipeline_run_status,\n error=serializable_error,\n )\n\n super(RunStatusSensorDefinition, self).__init__(\n 
name=name,\n evaluation_fn=_wrapped_fn,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n default_status=default_status,\n )\n\n def __call__(self, *args, **kwargs):\n if is_context_provided(self._run_status_sensor_fn):\n if len(args) + len(kwargs) == 0:\n raise DagsterInvalidInvocationError(\n "Run status sensor function expected context argument, but no context argument "\n "was provided when invoking."\n )\n if len(args) + len(kwargs) > 1:\n raise DagsterInvalidInvocationError(\n "Run status sensor invocation received multiple arguments. Only a first "\n "positional context parameter should be provided when invoking."\n )\n\n context_param_name = get_function_params(self._run_status_sensor_fn)[0].name\n\n if args:\n context = check.opt_inst_param(args[0], context_param_name, RunStatusSensorContext)\n else:\n if context_param_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Run status sensor invocation expected argument '{context_param_name}'."\n )\n context = check.opt_inst_param(\n kwargs[context_param_name], context_param_name, RunStatusSensorContext\n )\n\n if not context:\n raise DagsterInvalidInvocationError(\n "Context must be provided for direct invocation of run status sensor."\n )\n\n return self._run_status_sensor_fn(context)\n\n else:\n if len(args) + len(kwargs) > 0:\n raise DagsterInvalidInvocationError(\n "Run status sensor decorated function has no arguments, but arguments were "\n "provided to invocation."\n )\n\n return self._run_status_sensor_fn()
\n\n\n
[docs]def run_status_sensor(\n pipeline_run_status: PipelineRunStatus,\n pipeline_selection: Optional[List[str]] = None,\n name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job_selection: Optional[List[Union[PipelineDefinition, GraphDefinition]]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n) -> Callable[\n [Callable[[RunStatusSensorContext], Union[SkipReason, PipelineRunReaction]]],\n RunStatusSensorDefinition,\n]:\n """\n Creates a sensor that reacts to a given status of pipeline execution, where the decorated\n function will be run when a pipeline is at the given status.\n\n Takes a :py:class:`~dagster.RunStatusSensorContext`.\n\n Args:\n pipeline_run_status (PipelineRunStatus): The status of pipeline execution which will be\n monitored by the sensor.\n pipeline_selection (Optional[List[str]]): Names of the pipelines that will be monitored by\n this sensor. Defaults to None, which means the alert will be sent when any pipeline in\n the repository fails.\n name (Optional[str]): The name of the sensor. Defaults to the name of the decorated function.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job_selection (Optional[List[Union[PipelineDefinition, GraphDefinition]]]): Jobs that will\n be monitored by this sensor. Defaults to None, which means the alert will be sent when\n any job in the repository fails.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n """\n\n def inner(\n fn: Callable[["RunStatusSensorContext"], Union[SkipReason, PipelineRunReaction]]\n ) -> RunStatusSensorDefinition:\n\n check.callable_param(fn, "fn")\n sensor_name = name or fn.__name__\n\n def _wrapped_fn(context: RunStatusSensorContext):\n fn(context)\n\n return RunStatusSensorDefinition(\n name=sensor_name,\n pipeline_run_status=pipeline_run_status,\n run_status_sensor_fn=_wrapped_fn,\n pipeline_selection=pipeline_selection,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job_selection=job_selection,\n default_status=default_status,\n )\n\n return inner
\n
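A minimal usage sketch for the ``run_status_sensor`` decorator above, assuming ``run_status_sensor``, ``RunStatusSensorContext``, and ``PipelineRunStatus`` are importable from the top-level ``dagster`` package; ``send_alert`` is a hypothetical notification helper, not part of the library:

.. code-block:: python

    from dagster import PipelineRunStatus, RunStatusSensorContext, run_status_sensor

    def send_alert(message: str) -> None:  # hypothetical notification helper
        print(message)

    @run_status_sensor(pipeline_run_status=PipelineRunStatus.FAILURE)
    def alert_on_run_failure(context: RunStatusSensorContext):
        # context.dagster_run and context.dagster_event describe the run that
        # reached the monitored status
        send_alert(f"Run {context.dagster_run.run_id} failed")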
", "current_page_name": "_modules/dagster/core/definitions/run_status_sensor_definition", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.run_status_sensor_definition"}, "schedule_definition": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.schedule_definition

\nimport copy\nfrom contextlib import ExitStack\nfrom datetime import datetime\nfrom enum import Enum\nfrom typing import Any, Callable, Dict, Iterator, List, NamedTuple, Optional, TypeVar, Union, cast\n\nimport pendulum\nfrom typing_extensions import TypeGuard\n\nimport dagster._check as check\n\nfrom ...serdes import whitelist_for_serdes\nfrom ...utils import ensure_gen, merge_dicts\nfrom ...utils.schedules import is_valid_cron_string\nfrom ..decorator_utils import get_function_params\nfrom ..errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n    ScheduleExecutionError,\n    user_code_error_boundary,\n)\nfrom ..instance import DagsterInstance\nfrom ..instance.ref import InstanceRef\nfrom ..storage.pipeline_run import PipelineRun\nfrom .graph_definition import GraphDefinition\nfrom .mode import DEFAULT_MODE_NAME\nfrom .pipeline_definition import PipelineDefinition\nfrom .run_request import RunRequest, SkipReason\nfrom .target import DirectTarget, RepoRelativeTarget\nfrom .utils import check_valid_name, validate_tags\n\nT = TypeVar("T")\n\n\n@whitelist_for_serdes\nclass DefaultScheduleStatus(Enum):\n    RUNNING = "RUNNING"\n    STOPPED = "STOPPED"\n\n\n
[docs]class ScheduleEvaluationContext:\n """Schedule-specific execution context.\n\n An instance of this class is made available as the first argument to various ScheduleDefinition\n functions. It is passed as the first argument to ``run_config_fn``, ``tags_fn``,\n and ``should_execute``.\n\n Attributes:\n instance_ref (Optional[InstanceRef]): The serialized instance configured to run the schedule\n scheduled_execution_time (datetime):\n The time in which the execution was scheduled to happen. May differ slightly\n from both the actual execution time and the time at which the run config is computed.\n Not available in all schedulers - currently only set in deployments using\n DagsterDaemonScheduler.\n """\n\n __slots__ = ["_instance_ref", "_scheduled_execution_time", "_exit_stack", "_instance"]\n\n def __init__(\n self, instance_ref: Optional[InstanceRef], scheduled_execution_time: Optional[datetime]\n ):\n self._exit_stack = ExitStack()\n self._instance = None\n\n self._instance_ref = check.opt_inst_param(instance_ref, "instance_ref", InstanceRef)\n self._scheduled_execution_time = check.opt_inst_param(\n scheduled_execution_time, "scheduled_execution_time", datetime\n )\n\n def __enter__(self):\n return self\n\n def __exit__(self, _exception_type, _exception_value, _traceback):\n self._exit_stack.close()\n\n @property\n def instance(self) -> "DagsterInstance":\n # self._instance_ref should only ever be None when this ScheduleEvaluationContext was\n # constructed under test.\n if not self._instance_ref:\n raise DagsterInvariantViolationError(\n "Attempted to initialize dagster instance, but no instance reference was provided."\n )\n if not self._instance:\n self._instance = self._exit_stack.enter_context(\n DagsterInstance.from_ref(self._instance_ref)\n )\n return cast(DagsterInstance, self._instance)\n\n @property\n def scheduled_execution_time(self) -> Optional[datetime]:\n return self._scheduled_execution_time
\n\n\n# Preserve ScheduleExecutionContext for backcompat so type annotations don't break.\nScheduleExecutionContext = ScheduleEvaluationContext\n\nRunConfig = Dict[str, Any]\nRunRequestIterator = Iterator[Union[RunRequest, SkipReason]]\n\nScheduleEvaluationFunctionReturn = Union[\n RunRequest, SkipReason, RunConfig, RunRequestIterator, List[RunRequest]\n]\nRawScheduleEvaluationFunction = Union[\n Callable[[ScheduleEvaluationContext], ScheduleEvaluationFunctionReturn],\n Callable[[], ScheduleEvaluationFunctionReturn],\n]\n\nRunConfigEvaluationFunction = Union[\n Callable[[ScheduleEvaluationContext], RunConfig],\n Callable[[], RunConfig],\n]\n\n\nclass DecoratedScheduleFunction(NamedTuple):\n """Wrapper around the decorated schedule function. Keeps track of both to better support the\n optimal return value for direct invocation of the evaluation function"""\n\n decorated_fn: RawScheduleEvaluationFunction\n wrapped_fn: Callable[[ScheduleEvaluationContext], RunRequestIterator]\n has_context_arg: bool\n\n\ndef is_context_provided(\n fn: Union[Callable[[ScheduleEvaluationContext], T], Callable[[], T]]\n) -> TypeGuard[Callable[[ScheduleEvaluationContext], T]]:\n return len(get_function_params(fn)) == 1\n\n\n
[docs]def build_schedule_context(\n instance: Optional[DagsterInstance] = None, scheduled_execution_time: Optional[datetime] = None\n) -> ScheduleEvaluationContext:\n """Builds schedule execution context using the provided parameters.\n\n The instance provided to ``build_schedule_context`` must be persistent;\n DagsterInstance.ephemeral() will result in an error.\n\n Args:\n instance (Optional[DagsterInstance]): The dagster instance configured to run the schedule.\n scheduled_execution_time (datetime): The time in which the execution was scheduled to\n happen. May differ slightly from both the actual execution time and the time at which\n the run config is computed.\n\n Examples:\n\n .. code-block:: python\n\n context = build_schedule_context(instance)\n daily_schedule.evaluate_tick(context)\n\n """\n\n check.opt_inst_param(instance, "instance", DagsterInstance)\n return ScheduleEvaluationContext(\n instance_ref=instance.get_ref() if instance and instance.is_persistent else None,\n scheduled_execution_time=check.opt_inst_param(\n scheduled_execution_time, "scheduled_execution_time", datetime\n ),\n )
\n\n\n@whitelist_for_serdes\nclass ScheduleExecutionData(NamedTuple):\n run_requests: Optional[List[RunRequest]]\n skip_message: Optional[str]\n\n\n
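A sketch of testing a schedule with ``build_schedule_context``, assuming the ``@schedule``, ``@op``, and ``@job`` decorators defined elsewhere in the library; direct invocation of a decorated schedule simply returns its run config, while ``evaluate_tick`` (shown in the docstring above) exercises the full wrapped evaluation against a persistent instance:

.. code-block:: python

    from dagster import build_schedule_context, job, op, schedule

    @op
    def say_hello():
        return "hello"

    @job
    def hello_job():
        say_hello()

    @schedule(cron_schedule="0 0 * * *", job=hello_job)
    def daily_hello_schedule(context):
        # the run config for each scheduled run
        return {"ops": {}}

    # Direct invocation returns the evaluated run config
    assert daily_hello_schedule(build_schedule_context()) == {"ops": {}}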
[docs]class ScheduleDefinition:\n """Define a schedule that targets a job\n\n Args:\n name (Optional[str]): The name of the schedule to create. Defaults to the job name plus\n "_schedule".\n cron_schedule (str): A valid cron string specifying when the schedule will run, e.g.,\n '45 23 * * 6' for a schedule that runs at 11:45 PM every Saturday.\n pipeline_name (Optional[str]): (legacy) The name of the pipeline to execute when the schedule runs.\n execution_fn (Callable[ScheduleEvaluationContext]): The core evaluation function for the\n schedule, which is run at an interval to determine whether a run should be launched or\n not. Takes a :py:class:`~dagster.ScheduleEvaluationContext`.\n\n This function must return a generator, which must yield either a single SkipReason\n or one or more RunRequest objects.\n run_config (Optional[Dict]): The config that parameterizes this execution,\n as a dict.\n run_config_fn (Optional[Callable[[ScheduleEvaluationContext], [Dict]]]): A function that\n takes a ScheduleEvaluationContext object and returns the run configuration that\n parameterizes this execution, as a dict. You may set only one of ``run_config``,\n ``run_config_fn``, and ``execution_fn``.\n tags (Optional[Dict[str, str]]): A dictionary of tags (string key-value pairs) to attach\n to the scheduled runs.\n tags_fn (Optional[Callable[[ScheduleEvaluationContext], Optional[Dict[str, str]]]]): A\n function that generates tags to attach to the schedules runs. Takes a\n :py:class:`~dagster.ScheduleEvaluationContext` and returns a dictionary of tags (string\n key-value pairs). You may set only one of ``tags``, ``tags_fn``, and ``execution_fn``.\n solid_selection (Optional[List[str]]): A list of solid subselection (including single\n solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``\n mode (Optional[str]): (legacy) The mode to apply when executing this schedule. (default: 'default')\n should_execute (Optional[Callable[[ScheduleEvaluationContext], bool]]): A function that runs\n at schedule execution time to determine whether a schedule should execute or skip. Takes\n a :py:class:`~dagster.ScheduleEvaluationContext` and returns a boolean (``True`` if the\n schedule should execute). Defaults to a function that always returns ``True``.\n environment_vars (Optional[dict[str, str]]): The environment variables to set for the\n schedule\n execution_timezone (Optional[str]): Timezone in which the schedule should run.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n description (Optional[str]): A human-readable description of the schedule.\n job (Optional[Union[GraphDefinition, JobDefinition]]): The job that should execute when this\n schedule runs.\n default_status (DefaultScheduleStatus): Whether the schedule starts as running or not. 
The default\n status can be overridden from Dagit or via the GraphQL API.\n """\n\n def __init__(\n self,\n name: Optional[str] = None,\n cron_schedule: Optional[str] = None,\n pipeline_name: Optional[str] = None,\n run_config: Optional[Any] = None,\n run_config_fn: Optional[RunConfigEvaluationFunction] = None,\n tags: Optional[Dict[str, str]] = None,\n tags_fn: Optional[Callable[..., Optional[Dict[str, str]]]] = None,\n solid_selection: Optional[List[Any]] = None,\n mode: Optional[str] = "default",\n should_execute: Optional[Callable[..., bool]] = None,\n environment_vars: Optional[Dict[str, str]] = None,\n execution_timezone: Optional[str] = None,\n execution_fn: Optional[\n Union[Callable[[ScheduleEvaluationContext], Any], DecoratedScheduleFunction]\n ] = None,\n description: Optional[str] = None,\n job: Optional[Union[GraphDefinition, PipelineDefinition]] = None,\n default_status: DefaultScheduleStatus = DefaultScheduleStatus.STOPPED,\n ):\n\n self._cron_schedule = check.str_param(cron_schedule, "cron_schedule")\n\n if not is_valid_cron_string(self._cron_schedule):\n raise DagsterInvalidDefinitionError(\n f"Found invalid cron schedule '{self._cron_schedule}' for schedule '{name}''. "\n "Dagster recognizes standard cron expressions consisting of 5 fields."\n )\n\n if job is not None:\n self._target: Union[DirectTarget, RepoRelativeTarget] = DirectTarget(job)\n else:\n self._target = RepoRelativeTarget(\n pipeline_name=check.str_param(pipeline_name, "pipeline_name"),\n mode=check.opt_str_param(mode, "mode") or DEFAULT_MODE_NAME,\n solid_selection=check.opt_nullable_list_param(\n solid_selection, "solid_selection", of_type=str\n ),\n )\n\n if name:\n self._name = check_valid_name(name)\n elif pipeline_name:\n self._name = pipeline_name + "_schedule"\n elif job:\n self._name = job.name + "_schedule"\n\n self._description = check.opt_str_param(description, "description")\n\n self._environment_vars = check.opt_dict_param(\n environment_vars, "environment_vars", key_type=str, value_type=str\n )\n self._execution_timezone = check.opt_str_param(execution_timezone, "execution_timezone")\n\n if execution_fn and (run_config_fn or tags_fn or should_execute or tags or run_config):\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both execution_fn and individual run_config/tags arguments "\n "to ScheduleDefinition. Must provide only one of the two."\n )\n elif execution_fn:\n self._execution_fn: Optional[\n Union[Callable[..., Any], DecoratedScheduleFunction]\n ] = None\n if isinstance(execution_fn, DecoratedScheduleFunction):\n self._execution_fn = execution_fn\n else:\n self._execution_fn = check.opt_callable_param(execution_fn, "execution_fn")\n self._run_config_fn = None\n else:\n\n if run_config_fn and run_config:\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both run_config_fn and run_config as arguments"\n " to ScheduleDefinition. Must provide only one of the two."\n )\n\n # pylint: disable=unused-argument\n def _default_run_config_fn(context: ScheduleEvaluationContext) -> RunConfig:\n return check.opt_dict_param(run_config, "run_config")\n\n self._run_config_fn = check.opt_callable_param(\n run_config_fn, "run_config_fn", default=_default_run_config_fn\n )\n\n if tags_fn and tags:\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both tags_fn and tags as arguments"\n " to ScheduleDefinition. 
Must provide only one of the two."\n )\n elif tags:\n tags = validate_tags(tags, allow_reserved_tags=False)\n tags_fn = lambda _context: tags\n else:\n tags_fn = check.opt_callable_param(\n tags_fn, "tags_fn", default=lambda _context: cast(Dict[str, str], {})\n )\n\n should_execute = check.opt_callable_param(\n should_execute, "should_execute", default=lambda _context: True\n )\n\n def _execution_fn(context):\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the execution of should_execute for schedule {name}",\n ):\n if not should_execute(context):\n yield SkipReason(\n "should_execute function for {schedule_name} returned false.".format(\n schedule_name=name\n )\n )\n return\n\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the execution of run_config_fn for schedule {name}",\n ):\n run_config_fn = check.not_none(self._run_config_fn)\n evaluated_run_config = copy.deepcopy(\n run_config_fn(context)\n if is_context_provided(run_config_fn)\n else run_config_fn() # type: ignore\n )\n\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the execution of tags_fn for schedule {name}",\n ):\n evaluated_tags = validate_tags(tags_fn(context), allow_reserved_tags=False)\n\n yield RunRequest(\n run_key=None,\n run_config=evaluated_run_config,\n tags=evaluated_tags,\n )\n\n self._execution_fn = _execution_fn\n\n if self._execution_timezone:\n try:\n # Verify that the timezone can be loaded\n pendulum.tz.timezone(self._execution_timezone)\n except Exception as e:\n raise DagsterInvalidDefinitionError(\n f"Invalid execution timezone {self._execution_timezone} for {name}"\n ) from e\n\n self._default_status = check.inst_param(\n default_status, "default_status", DefaultScheduleStatus\n )\n\n def __call__(self, *args, **kwargs):\n from .decorators.schedule_decorator import DecoratedScheduleFunction\n\n if not isinstance(self._execution_fn, DecoratedScheduleFunction):\n raise DagsterInvalidInvocationError(\n "Schedule invocation is only supported for schedules created via the schedule "\n "decorators."\n )\n result = None\n if self._execution_fn.has_context_arg:\n if len(args) == 0 and len(kwargs) == 0:\n raise DagsterInvalidInvocationError(\n "Schedule decorated function has context argument, but no context argument was "\n "provided when invoking."\n )\n if len(args) + len(kwargs) > 1:\n raise DagsterInvalidInvocationError(\n "Schedule invocation received multiple arguments. 
Only a first "\n "positional context parameter should be provided when invoking."\n )\n\n context_param_name = get_function_params(self._execution_fn.decorated_fn)[0].name\n\n if args:\n context = check.opt_inst_param(\n args[0], context_param_name, ScheduleEvaluationContext\n )\n else:\n if context_param_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Schedule invocation expected argument '{context_param_name}'."\n )\n context = check.opt_inst_param(\n kwargs[context_param_name], context_param_name, ScheduleEvaluationContext\n )\n\n context = context if context else build_schedule_context()\n\n result = self._execution_fn.decorated_fn(context) # type: ignore\n else:\n if len(args) + len(kwargs) > 0:\n raise DagsterInvalidInvocationError(\n "Decorated schedule function takes no arguments, but arguments were provided."\n )\n result = self._execution_fn.decorated_fn() # type: ignore\n\n if isinstance(result, dict):\n return copy.deepcopy(result)\n else:\n return result\n\n @property\n def name(self) -> str:\n return self._name\n\n @property\n def pipeline_name(self) -> str:\n return self._target.pipeline_name\n\n @property\n def solid_selection(self) -> Optional[List[Any]]:\n return self._target.solid_selection\n\n @property\n def mode(self) -> str:\n return self._target.mode\n\n @property\n def description(self) -> Optional[str]:\n return self._description\n\n @property\n def cron_schedule(self) -> str:\n return self._cron_schedule\n\n @property\n def environment_vars(self) -> Dict[str, str]:\n return self._environment_vars\n\n @property\n def execution_timezone(self) -> Optional[str]:\n return self._execution_timezone\n\n @property\n def job(self) -> Union[GraphDefinition, PipelineDefinition]:\n if isinstance(self._target, DirectTarget):\n return self._target.target\n raise DagsterInvalidDefinitionError("No job was provided to ScheduleDefinition.")\n\n def evaluate_tick(self, context: "ScheduleEvaluationContext") -> ScheduleExecutionData:\n """Evaluate schedule using the provided context.\n\n Args:\n context (ScheduleEvaluationContext): The context with which to evaluate this schedule.\n Returns:\n ScheduleExecutionData: Contains list of run requests, or skip message if present.\n\n """\n\n check.inst_param(context, "context", ScheduleEvaluationContext)\n execution_fn: Callable[[ScheduleEvaluationContext], "ScheduleEvaluationFunctionReturn"]\n if isinstance(self._execution_fn, DecoratedScheduleFunction):\n execution_fn = self._execution_fn.wrapped_fn\n else:\n execution_fn = cast(\n Callable[[ScheduleExecutionContext], "ScheduleEvaluationFunctionReturn"],\n self._execution_fn,\n )\n\n result = list(ensure_gen(execution_fn(context)))\n\n skip_message: Optional[str] = None\n\n run_requests: List[RunRequest] = []\n if not result or result == [None]:\n run_requests = []\n skip_message = "Schedule function returned an empty result"\n elif len(result) == 1:\n item = check.inst(result[0], (SkipReason, RunRequest))\n if isinstance(item, RunRequest):\n run_requests = [item]\n skip_message = None\n elif isinstance(item, SkipReason):\n run_requests = []\n skip_message = item.skip_message\n else:\n # NOTE: mypy is not correctly reading this cast-- not sure why\n # (pyright reads it fine). 
Hence the type-ignores below.\n result = cast(List[RunRequest], check.is_list(result, of_type=RunRequest)) # type: ignore\n check.invariant(\n not any(not request.run_key for request in result), # type: ignore\n "Schedules that return multiple RunRequests must specify a run_key in each RunRequest",\n )\n run_requests = result # type: ignore\n skip_message = None\n\n # clone all the run requests with the required schedule tags\n run_requests_with_schedule_tags = [\n RunRequest(\n run_key=request.run_key,\n run_config=request.run_config,\n tags=merge_dicts(request.tags, PipelineRun.tags_for_schedule(self)),\n )\n for request in run_requests\n ]\n\n return ScheduleExecutionData(\n run_requests=run_requests_with_schedule_tags, skip_message=skip_message\n )\n\n def has_loadable_target(self):\n return isinstance(self._target, DirectTarget)\n\n def load_target(self) -> Union[GraphDefinition, PipelineDefinition]:\n if isinstance(self._target, DirectTarget):\n return self._target.load()\n\n check.failed("Target is not loadable")\n\n @property\n def default_status(self) -> DefaultScheduleStatus:\n return self._default_status
\n
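A sketch constructing ``ScheduleDefinition`` directly against a job, using only parameters shown in the constructor above (``@op`` and ``@job`` are assumed from elsewhere in the library); when ``name`` is omitted it defaults to the job name plus ``"_schedule"``:

.. code-block:: python

    from dagster import ScheduleDefinition, job, op

    @op
    def process():
        ...

    @job
    def nightly_job():
        process()

    nightly_schedule = ScheduleDefinition(
        job=nightly_job,                      # resolves to a DirectTarget
        cron_schedule="45 23 * * 6",          # 11:45 PM every Saturday
        run_config={},
        execution_timezone="America/Los_Angeles",
    )
    # nightly_schedule.name == "nightly_job_schedule"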
", "current_page_name": "_modules/dagster/core/definitions/schedule_definition", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.schedule_definition"}, "sensor_definition": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.sensor_definition

\nimport inspect\nfrom contextlib import ExitStack\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Iterator,\n    List,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import TypeGuard\n\nimport dagster._check as check\nfrom dagster.core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n)\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.instance.ref import InstanceRef\nfrom dagster.serdes import whitelist_for_serdes\n\nfrom ..decorator_utils import get_function_params\nfrom .events import AssetKey\nfrom .graph_definition import GraphDefinition\nfrom .job_definition import JobDefinition\nfrom .mode import DEFAULT_MODE_NAME\nfrom .pipeline_definition import PipelineDefinition\nfrom .run_request import PipelineRunReaction, RunRequest, SkipReason\nfrom .target import DirectTarget, RepoRelativeTarget\nfrom .utils import check_valid_name\n\nif TYPE_CHECKING:\n    from dagster.core.events.log import EventLogEntry\n\n\n@whitelist_for_serdes\nclass DefaultSensorStatus(Enum):\n    RUNNING = "RUNNING"\n    STOPPED = "STOPPED"\n\n\nDEFAULT_SENSOR_DAEMON_INTERVAL = 30\n\n\n
[docs]class SensorEvaluationContext:\n """Sensor execution context.\n\n An instance of this class is made available as the first argument to the evaluation function\n on SensorDefinition.\n\n Attributes:\n instance_ref (Optional[InstanceRef]): The serialized instance configured to run the schedule\n cursor (Optional[str]): The cursor, passed back from the last sensor evaluation via\n the cursor attribute of SkipReason and RunRequest\n last_completion_time (float): DEPRECATED The last time that the sensor was evaluated (UTC).\n last_run_key (str): DEPRECATED The run key of the RunRequest most recently created by this\n sensor. Use the preferred `cursor` attribute instead.\n repository_name (Optional[str]): The name of the repository that the sensor belongs to.\n instance (Optional[DagsterInstance]): The deserialized instance can also be passed in\n directly (primarily useful in testing contexts).\n """\n\n def __init__(\n self,\n instance_ref: Optional[InstanceRef],\n last_completion_time: Optional[float],\n last_run_key: Optional[str],\n cursor: Optional[str],\n repository_name: Optional[str],\n instance: Optional[DagsterInstance] = None,\n ):\n self._exit_stack = ExitStack()\n self._instance_ref = check.opt_inst_param(instance_ref, "instance_ref", InstanceRef)\n self._last_completion_time = check.opt_float_param(\n last_completion_time, "last_completion_time"\n )\n self._last_run_key = check.opt_str_param(last_run_key, "last_run_key")\n self._cursor = check.opt_str_param(cursor, "cursor")\n self._repository_name = check.opt_str_param(repository_name, "repository_name")\n self._instance = check.opt_inst_param(instance, "instance", DagsterInstance)\n\n def __enter__(self):\n return self\n\n def __exit__(self, _exception_type, _exception_value, _traceback):\n self._exit_stack.close()\n\n @property\n def instance(self) -> DagsterInstance:\n # self._instance_ref should only ever be None when this SensorEvaluationContext was\n # constructed under test.\n if not self._instance:\n if not self._instance_ref:\n raise DagsterInvariantViolationError(\n "Attempted to initialize dagster instance, but no instance reference was provided."\n )\n self._instance = self._exit_stack.enter_context(\n DagsterInstance.from_ref(self._instance_ref)\n )\n return cast(DagsterInstance, self._instance)\n\n @property\n def last_completion_time(self) -> Optional[float]:\n return self._last_completion_time\n\n @property\n def last_run_key(self) -> Optional[str]:\n return self._last_run_key\n\n @property\n def cursor(self) -> Optional[str]:\n """The cursor value for this sensor, which was set in an earlier sensor evaluation."""\n return self._cursor\n\n def update_cursor(self, cursor: Optional[str]) -> None:\n """Updates the cursor value for this sensor, which will be provided on the context for the\n next sensor evaluation.\n\n This can be used to keep track of progress and avoid duplicate work across sensor\n evaluations.\n\n Args:\n cursor (Optional[str]):\n """\n self._cursor = check.opt_str_param(cursor, "cursor")\n\n @property\n def repository_name(self) -> Optional[str]:\n return self._repository_name
\n\n\n# Preserve SensorExecutionContext for backcompat so type annotations don't break.\nSensorExecutionContext = SensorEvaluationContext\n\nRawSensorEvaluationFunctionReturn = Union[\n Iterator[Union[SkipReason, RunRequest]], List[RunRequest], SkipReason, RunRequest\n]\nRawSensorEvaluationFunction = Union[\n Callable[[], RawSensorEvaluationFunctionReturn],\n Callable[[SensorEvaluationContext], RawSensorEvaluationFunctionReturn],\n]\nSensorEvaluationFunction = Callable[\n [SensorEvaluationContext], Iterator[Union[SkipReason, RunRequest]]\n]\n\n\ndef is_context_provided(\n fn: "RawSensorEvaluationFunction",\n) -> TypeGuard[Callable[[SensorEvaluationContext], "RawSensorEvaluationFunctionReturn"]]:\n return len(get_function_params(fn)) == 1\n\n\n
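A sketch of the cursor pattern that ``SensorEvaluationContext`` supports, assuming ``SensorEvaluationContext`` is importable from the top-level ``dagster`` package: read ``context.cursor`` to resume where the previous evaluation left off, yield ``RunRequest`` or ``SkipReason`` values, and call ``update_cursor`` to persist progress. The "new items" lookup is a placeholder for a real external query:

.. code-block:: python

    from dagster import RunRequest, SensorEvaluationContext, SkipReason

    def cursor_based_evaluation_fn(context: SensorEvaluationContext):
        last_seen = int(context.cursor) if context.cursor else 0

        # placeholder for a real lookup of work items newer than the cursor
        new_item_ids = [last_seen + 1, last_seen + 2]

        if not new_item_ids:
            yield SkipReason(f"No new items after {last_seen}")
            return

        for item_id in new_item_ids:
            yield RunRequest(run_key=str(item_id), run_config={})

        # persist progress so the next evaluation starts after the newest item
        context.update_cursor(str(max(new_item_ids)))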
[docs]class SensorDefinition:\n """Define a sensor that initiates a set of runs based on some external state\n\n Args:\n evaluation_fn (Callable[[SensorEvaluationContext]]): The core evaluation function for the\n sensor, which is run at an interval to determine whether a run should be launched or\n not. Takes a :py:class:`~dagster.SensorEvaluationContext`.\n\n This function must return a generator, which must yield either a single SkipReason\n or one or more RunRequest objects.\n name (Optional[str]): The name of the sensor to create. Defaults to name of evaluation_fn\n pipeline_name (Optional[str]): (legacy) The name of the pipeline to execute when the sensor\n fires. Cannot be used in conjunction with `job` or `jobs` parameters.\n solid_selection (Optional[List[str]]): (legacy) A list of solid subselection (including single\n solid names) to execute when the sensor runs. e.g. ``['*some_solid+', 'other_solid']``.\n Cannot be used in conjunction with `job` or `jobs` parameters.\n mode (Optional[str]): (legacy) The mode to apply when executing runs triggered by this\n sensor. Cannot be used in conjunction with `job` or `jobs` parameters. (default:\n 'default')\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[GraphDefinition, JobDefinition]): The job to execute when this sensor fires.\n jobs (Optional[Sequence[GraphDefinition, JobDefinition]]): (experimental) A list of jobs to execute when this sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n """\n\n def __init__(\n self,\n name: Optional[str] = None,\n evaluation_fn: Optional[RawSensorEvaluationFunction] = None,\n pipeline_name: Optional[str] = None,\n solid_selection: Optional[List[Any]] = None,\n mode: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[Union[GraphDefinition, JobDefinition]] = None,\n jobs: Optional[Sequence[Union[GraphDefinition, JobDefinition]]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n ):\n if evaluation_fn is None:\n raise DagsterInvalidDefinitionError("Must provide evaluation_fn to SensorDefinition.")\n\n if job and jobs:\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both job and jobs to SensorDefinition. Must provide only one "\n "of the two."\n )\n\n job_param_name = "job" if job else "jobs"\n jobs = jobs if jobs else [job] if job else None\n\n if pipeline_name and jobs:\n raise DagsterInvalidDefinitionError(\n f"Attempted to provide both pipeline_name and {job_param_name} to "\n "SensorDefinition. Must provide only one of the two."\n )\n if solid_selection and jobs:\n raise DagsterInvalidDefinitionError(\n f"Attempted to provide solid_selection and {job_param_name} to SensorDefinition. "\n "The solid_selection argument is incompatible with jobs."\n )\n if mode and jobs:\n raise DagsterInvalidDefinitionError(\n f"Attempted to provide mode and {job_param_name} to SensorDefinition. 
"\n "The mode argument is incompatible with jobs."\n )\n\n targets: Optional[List[Union[RepoRelativeTarget, DirectTarget]]] = None\n if pipeline_name:\n targets = [\n RepoRelativeTarget(\n pipeline_name=check.str_param(pipeline_name, "pipeline_name"),\n mode=check.opt_str_param(mode, "mode") or DEFAULT_MODE_NAME,\n solid_selection=check.opt_nullable_list_param(\n solid_selection, "solid_selection", of_type=str\n ),\n )\n ]\n elif job:\n targets = [DirectTarget(job)]\n elif jobs:\n targets = [DirectTarget(job) for job in jobs]\n\n if name:\n self._name = check_valid_name(name)\n else:\n self._name = evaluation_fn.__name__\n\n self._raw_fn: RawSensorEvaluationFunction = check.callable_param(\n evaluation_fn, "evaluation_fn"\n )\n self._evaluation_fn: SensorEvaluationFunction = wrap_sensor_evaluation(\n self._name, evaluation_fn\n )\n self._min_interval = check.opt_int_param(\n minimum_interval_seconds, "minimum_interval_seconds", DEFAULT_SENSOR_DAEMON_INTERVAL\n )\n self._description = check.opt_str_param(description, "description")\n self._targets = check.opt_list_param(targets, "targets", (DirectTarget, RepoRelativeTarget))\n self._default_status = check.inst_param(\n default_status, "default_status", DefaultSensorStatus\n )\n\n def __call__(self, *args, **kwargs):\n\n if is_context_provided(self._raw_fn):\n if len(args) + len(kwargs) == 0:\n raise DagsterInvalidInvocationError(\n "Sensor evaluation function expected context argument, but no context argument "\n "was provided when invoking."\n )\n if len(args) + len(kwargs) > 1:\n raise DagsterInvalidInvocationError(\n "Sensor invocation received multiple arguments. Only a first "\n "positional context parameter should be provided when invoking."\n )\n\n context_param_name = get_function_params(self._raw_fn)[0].name\n\n if args:\n context = check.opt_inst_param(args[0], context_param_name, SensorEvaluationContext)\n else:\n if context_param_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Sensor invocation expected argument '{context_param_name}'."\n )\n context = check.opt_inst_param(\n kwargs[context_param_name], context_param_name, SensorEvaluationContext\n )\n\n context = context if context else build_sensor_context()\n\n return self._raw_fn(context)\n\n else:\n if len(args) + len(kwargs) > 0:\n raise DagsterInvalidInvocationError(\n "Sensor decorated function has no arguments, but arguments were provided to "\n "invocation."\n )\n\n return self._raw_fn() # type: ignore [TypeGuard limitation]\n\n @property\n def name(self) -> str:\n return self._name\n\n @property\n def description(self) -> Optional[str]:\n return self._description\n\n @property\n def minimum_interval_seconds(self) -> Optional[int]:\n return self._min_interval\n\n @property\n def targets(self) -> List[Union[DirectTarget, RepoRelativeTarget]]:\n return self._targets\n\n @property\n def job(self) -> Union[PipelineDefinition, GraphDefinition]:\n if self._targets:\n if len(self._targets) == 1 and isinstance(self._targets[0], DirectTarget):\n return self._targets[0].target\n elif len(self._targets) > 1:\n raise DagsterInvalidDefinitionError(\n "Job property not available when SensorDefinition has multiple jobs."\n )\n raise DagsterInvalidDefinitionError("No job was provided to SensorDefinition.")\n\n def evaluate_tick(self, context: "SensorEvaluationContext") -> "SensorExecutionData":\n """Evaluate sensor using the provided context.\n\n Args:\n context (SensorEvaluationContext): The context with which to evaluate this sensor.\n Returns:\n SensorExecutionData: 
Contains list of run requests, or skip message if present.\n\n """\n\n context = check.inst_param(context, "context", SensorEvaluationContext)\n result = list(self._evaluation_fn(context))\n\n skip_message: Optional[str] = None\n\n run_requests: List[RunRequest]\n pipeline_run_reactions: List[PipelineRunReaction]\n if not result or result == [None]:\n run_requests = []\n pipeline_run_reactions = []\n skip_message = "Sensor function returned an empty result"\n elif len(result) == 1:\n item = result[0]\n check.inst(item, (SkipReason, RunRequest, PipelineRunReaction))\n run_requests = [item] if isinstance(item, RunRequest) else []\n pipeline_run_reactions = (\n [cast(PipelineRunReaction, item)] if isinstance(item, PipelineRunReaction) else []\n )\n skip_message = item.skip_message if isinstance(item, SkipReason) else None\n else:\n check.is_list(result, (SkipReason, RunRequest, PipelineRunReaction))\n has_skip = any(map(lambda x: isinstance(x, SkipReason), result))\n has_run_request = any(map(lambda x: isinstance(x, RunRequest), result))\n has_run_reaction = any(map(lambda x: isinstance(x, PipelineRunReaction), result))\n\n if has_skip:\n if has_run_request:\n check.failed(\n "Expected a single SkipReason or one or more RunRequests: received both "\n "RunRequest and SkipReason"\n )\n elif has_run_reaction:\n check.failed(\n "Expected a single SkipReason or one or more PipelineRunReaction: "\n "received both PipelineRunReaction and SkipReason"\n )\n else:\n check.failed("Expected a single SkipReason: received multiple SkipReasons")\n\n if has_run_request:\n run_requests = cast(List[RunRequest], result)\n pipeline_run_reactions = []\n\n else:\n # only run reactions\n run_requests = []\n pipeline_run_reactions = cast(List[PipelineRunReaction], result)\n\n self.check_valid_run_requests(run_requests)\n\n return SensorExecutionData(\n run_requests,\n skip_message,\n context.cursor,\n pipeline_run_reactions,\n )\n\n def has_loadable_targets(self) -> bool:\n for target in self._targets:\n if isinstance(target, DirectTarget):\n return True\n return False\n\n def load_targets(self) -> List[Union[PipelineDefinition, GraphDefinition]]:\n targets = []\n for target in self._targets:\n if isinstance(target, DirectTarget):\n targets.append(target.load())\n return targets\n\n def check_valid_run_requests(self, run_requests: List[RunRequest]):\n has_multiple_targets = len(self._targets) > 1\n target_names = [target.pipeline_name for target in self._targets]\n\n if run_requests and not self._targets:\n raise Exception(\n f"Error in sensor {self._name}: Sensor evaluation function returned a RunRequest "\n "for a sensor lacking a specified target (pipeline_name, job, or jobs). Targets "\n "can be specified by providing job, jobs, or pipeline_name to the @sensor "\n "decorator."\n )\n\n for run_request in run_requests:\n if run_request.job_name is None and has_multiple_targets:\n raise Exception(\n f"Error in sensor {self._name}: Sensor returned a RunRequest that did not "\n f"specify job_name for the requested run. Expected one of: {target_names}"\n )\n elif run_request.job_name and run_request.job_name not in target_names:\n raise Exception(\n f"Error in sensor {self._name}: Sensor returned a RunRequest with job_name "\n f"{run_request.job_name}. 
Expected one of: {target_names}"\n )\n\n @property\n def _target(self) -> Optional[Union[DirectTarget, RepoRelativeTarget]]:\n return self._targets[0] if self._targets else None\n\n @property\n def pipeline_name(self) -> Optional[str]:\n return self._target.pipeline_name if self._target else None\n\n @property\n def solid_selection(self) -> Optional[List[Any]]:\n return self._target.solid_selection if self._target else None\n\n @property\n def mode(self) -> Optional[str]:\n return self._target.mode if self._target else None\n\n @property\n def default_status(self) -> DefaultSensorStatus:\n return self._default_status
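A sketch constructing ``SensorDefinition`` directly, using only constructor parameters shown above (``@op`` and ``@job`` are assumed from elsewhere in the library); the evaluation function yields ``RunRequest`` objects targeting the provided job:

.. code-block:: python

    from dagster import RunRequest, SensorDefinition, job, op

    @op
    def handle_new_data():
        ...

    @job
    def ingest_job():
        handle_new_data()

    def check_for_new_data(context):
        # placeholder condition; a real sensor would inspect external state here
        yield RunRequest(run_key="only_once", run_config={})

    new_data_sensor = SensorDefinition(
        name="new_data_sensor",
        evaluation_fn=check_for_new_data,
        job=ingest_job,
        minimum_interval_seconds=60,
    )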
\n\n\n@whitelist_for_serdes\nclass SensorExecutionData(\n NamedTuple(\n "_SensorExecutionData",\n [\n ("run_requests", Optional[List[RunRequest]]),\n ("skip_message", Optional[str]),\n ("cursor", Optional[str]),\n ("pipeline_run_reactions", Optional[List[PipelineRunReaction]]),\n ],\n )\n):\n def __new__(\n cls,\n run_requests: Optional[List[RunRequest]] = None,\n skip_message: Optional[str] = None,\n cursor: Optional[str] = None,\n pipeline_run_reactions: Optional[List[PipelineRunReaction]] = None,\n ):\n check.opt_list_param(run_requests, "run_requests", RunRequest)\n check.opt_str_param(skip_message, "skip_message")\n check.opt_str_param(cursor, "cursor")\n check.opt_list_param(pipeline_run_reactions, "pipeline_run_reactions", PipelineRunReaction)\n check.invariant(\n not (run_requests and skip_message), "Found both skip data and run request data"\n )\n return super(SensorExecutionData, cls).__new__(\n cls,\n run_requests=run_requests,\n skip_message=skip_message,\n cursor=cursor,\n pipeline_run_reactions=pipeline_run_reactions,\n )\n\n\ndef wrap_sensor_evaluation(\n sensor_name: str,\n fn: RawSensorEvaluationFunction,\n) -> SensorEvaluationFunction:\n def _wrapped_fn(context: SensorEvaluationContext):\n if is_context_provided(fn):\n result = fn(context)\n else:\n result = fn() # type: ignore\n\n if inspect.isgenerator(result) or isinstance(result, list):\n for item in result:\n yield item\n elif isinstance(result, (SkipReason, RunRequest)):\n yield result\n\n elif result is not None:\n raise Exception(\n (\n "Error in sensor {sensor_name}: Sensor unexpectedly returned output "\n "{result} of type {type_}. Should only return SkipReason or "\n "RunRequest objects."\n ).format(sensor_name=sensor_name, result=result, type_=type(result))\n )\n\n return _wrapped_fn\n\n\n
[docs]def build_sensor_context(\n instance: Optional[DagsterInstance] = None,\n cursor: Optional[str] = None,\n repository_name: Optional[str] = None,\n) -> SensorEvaluationContext:\n """Builds sensor execution context using the provided parameters.\n\n This function can be used to provide a context to the invocation of a sensor definition.If\n provided, the dagster instance must be persistent; DagsterInstance.ephemeral() will result in an\n error.\n\n Args:\n instance (Optional[DagsterInstance]): The dagster instance configured to run the sensor.\n cursor (Optional[str]): A cursor value to provide to the evaluation of the sensor.\n repository_name (Optional[str]): The name of the repository that the sensor belongs to.\n\n Examples:\n\n .. code-block:: python\n\n context = build_sensor_context()\n my_sensor(context)\n\n """\n\n check.opt_inst_param(instance, "instance", DagsterInstance)\n check.opt_str_param(cursor, "cursor")\n check.opt_str_param(repository_name, "repository_name")\n return SensorEvaluationContext(\n instance_ref=None,\n last_completion_time=None,\n last_run_key=None,\n cursor=cursor,\n repository_name=repository_name,\n instance=instance,\n )
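A sketch of direct sensor invocation with ``build_sensor_context``, assuming the ``@sensor``, ``@op``, and ``@job`` decorators from elsewhere in the library; the returned context also works as a context manager, as ``SensorEvaluationContext`` above defines ``__enter__``/``__exit__``:

.. code-block:: python

    from dagster import RunRequest, build_sensor_context, job, op, sensor

    @op
    def noop():
        ...

    @job
    def target_job():
        noop()

    @sensor(job=target_job)
    def my_sensor(context):
        return RunRequest(run_key=context.cursor, run_config={})

    # build_sensor_context supplies a SensorEvaluationContext for direct invocation
    with build_sensor_context(cursor="42") as context:
        request = my_sensor(context)
        assert request.run_key == "42"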
\n\n\nAssetMaterializationFunctionReturn = Union[\n Iterator[Union[RunRequest, SkipReason]], List[RunRequest], RunRequest, SkipReason\n]\nAssetMaterializationFunction = Callable[\n ["SensorExecutionContext", "EventLogEntry"],\n AssetMaterializationFunctionReturn,\n]\n\n\n
[docs]class AssetSensorDefinition(SensorDefinition):\n """Define an asset sensor that initiates a set of runs based on the materialization of a given\n asset.\n\n Args:\n name (str): The name of the sensor to create.\n asset_key (AssetKey): The asset_key this sensor monitors.\n pipeline_name (Optional[str]): (legacy) The name of the pipeline to execute when the sensor\n fires. Cannot be used in conjunction with `job` or `jobs` parameters.\n asset_materialization_fn (Callable[[SensorEvaluationContext, EventLogEntry], Union[Iterator[Union[RunRequest, SkipReason]], RunRequest, SkipReason]]): The core\n evaluation function for the sensor, which is run at an interval to determine whether a\n run should be launched or not. Takes a :py:class:`~dagster.SensorEvaluationContext` and\n an EventLogEntry corresponding to an AssetMaterialization event.\n\n This function must return a generator, which must yield either a single SkipReason\n or one or more RunRequest objects.\n solid_selection (Optional[List[str]]): (legacy) A list of solid subselection (including single\n solid names) to execute when the sensor runs. e.g. ``['*some_solid+', 'other_solid']``.\n Cannot be used in conjunction with `job` or `jobs` parameters.\n mode (Optional[str]): (legacy) The mode to apply when executing runs triggered by this sensor.\n (default: 'default').\n Cannot be used in conjunction with `job` or `jobs` parameters.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[Union[GraphDefinition, JobDefinition]]): The job object to target with this sensor.\n jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition]]]): (experimental) A list of jobs to be executed when the sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. 
The default\n status can be overridden from Dagit or via the GraphQL API.\n """\n\n def __init__(\n self,\n name: str,\n asset_key: AssetKey,\n pipeline_name: Optional[str],\n asset_materialization_fn: Callable[\n ["SensorExecutionContext", "EventLogEntry"],\n RawSensorEvaluationFunctionReturn,\n ],\n solid_selection: Optional[List[str]] = None,\n mode: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[Union[GraphDefinition, JobDefinition]] = None,\n jobs: Optional[Sequence[Union[GraphDefinition, JobDefinition]]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n ):\n self._asset_key = check.inst_param(asset_key, "asset_key", AssetKey)\n\n from dagster.core.events import DagsterEventType\n from dagster.core.storage.event_log.base import EventRecordsFilter\n\n def _wrap_asset_fn(materialization_fn):\n def _fn(context):\n after_cursor = None\n if context.cursor:\n try:\n after_cursor = int(context.cursor)\n except ValueError:\n after_cursor = None\n\n event_records = context.instance.get_event_records(\n EventRecordsFilter(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n asset_key=self._asset_key,\n after_cursor=after_cursor,\n ),\n ascending=False,\n limit=1,\n )\n\n if not event_records:\n return\n\n event_record = event_records[0]\n result = materialization_fn(context, event_record.event_log_entry)\n if inspect.isgenerator(result) or isinstance(result, list):\n for item in result:\n yield item\n elif isinstance(result, (SkipReason, RunRequest)):\n yield result\n context.update_cursor(str(event_record.storage_id))\n\n return _fn\n\n super(AssetSensorDefinition, self).__init__(\n name=check_valid_name(name),\n pipeline_name=pipeline_name,\n evaluation_fn=_wrap_asset_fn(\n check.callable_param(asset_materialization_fn, "asset_materialization_fn"),\n ),\n solid_selection=solid_selection,\n mode=mode,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n )\n\n @property\n def asset_key(self):\n return self._asset_key
\n
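A sketch constructing ``AssetSensorDefinition`` directly, using only the constructor parameters shown above (``AssetKey``, ``@op``, and ``@job`` are assumed from the wider library); the materialization function receives the context and the ``EventLogEntry`` for the matched ``AssetMaterialization`` event:

.. code-block:: python

    from dagster import AssetKey, AssetSensorDefinition, RunRequest, job, op

    @op
    def refresh_report():
        ...

    @job
    def report_job():
        refresh_report()

    def on_table_materialized(context, event_log_entry):
        # event_log_entry is the materialization event for the monitored asset key
        yield RunRequest(run_key=str(event_log_entry.timestamp), run_config={})

    table_sensor = AssetSensorDefinition(
        name="table_sensor",
        asset_key=AssetKey("my_table"),
        pipeline_name=None,
        asset_materialization_fn=on_table_materialized,
        job=report_job,
    )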
", "current_page_name": "_modules/dagster/core/definitions/sensor_definition", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.sensor_definition"}, "solid_definition": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.solid_definition

\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    FrozenSet,\n    Iterator,\n    List,\n    Optional,\n    Sequence,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster.config.config_schema import ConfigSchemaType\nfrom dagster.core.definitions.dependency import NodeHandle\nfrom dagster.core.definitions.policy import RetryPolicy\nfrom dagster.core.errors import DagsterInvalidDefinitionError, DagsterInvalidInvocationError\nfrom dagster.core.types.dagster_type import DagsterType\nfrom dagster.utils.backcompat import experimental_arg_warning\n\nfrom ..decorator_utils import get_function_params\nfrom .config import ConfigMapping\nfrom .definition_config_schema import (\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\nfrom .dependency import IDependencyDefinition, NodeHandle, NodeInvocation\nfrom .graph_definition import GraphDefinition\nfrom .input import InputDefinition, InputMapping\nfrom .node_definition import NodeDefinition\nfrom .output import OutputDefinition, OutputMapping\nfrom .solid_invocation import solid_invocation_result\n\nif TYPE_CHECKING:\n    from .decorators.solid_decorator import DecoratedSolidFunction\n\n\n
[docs]class SolidDefinition(NodeDefinition):\n """\n The definition of a Solid that performs a user-defined computation.\n\n For more details on what a solid is, refer to the\n `Solid Overview <../../overview/solids-pipelines/solids>`_ .\n\n End users should prefer the :func:`@solid <solid>` and :func:`@lambda_solid <lambda_solid>`\n decorators. SolidDefinition is generally intended to be used by framework authors.\n\n Args:\n name (str): Name of the solid. Must be unique within any :py:class:`PipelineDefinition`\n using the solid.\n input_defs (List[InputDefinition]): Inputs of the solid.\n compute_fn (Callable): The core of the solid, the function that does the actual\n computation. The signature of this function is determined by ``input_defs``, and\n optionally, an injected first argument, ``context``, a collection of information provided\n by the system.\n\n This function will be coerced into a generator or an async generator, which must yield\n one :py:class:`Output` for each of the solid's ``output_defs``, and additionally may\n yield other types of Dagster events, including :py:class:`Materialization` and\n :py:class:`ExpectationResult`.\n output_defs (List[OutputDefinition]): Outputs of the solid.\n config_schema (Optional[ConfigSchema): The schema for the config. If set, Dagster will check\n that config provided for the solid matches this schema and fail if it does not. If not\n set, Dagster will accept any config provided for the solid.\n description (Optional[str]): Human-readable description of the solid.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the solid. Frameworks may\n expect and require certain metadata to be attached to a solid. Users should generally\n not set metadata directly. Values that are not strings will be json encoded and must meet\n the criteria that `json.loads(json.dumps(value)) == value`.\n required_resource_keys (Optional[Set[str]]): Set of resources handles required by this\n solid.\n version (Optional[str]): (Experimental) The version of the solid's compute_fn. Two solids should have\n the same version if and only if they deterministically produce the same outputs when\n provided the same inputs.\n retry_policy (Optional[RetryPolicy]): The retry policy for this solid.\n\n\n Examples:\n .. 
code-block:: python\n\n def _add_one(_context, inputs):\n yield Output(inputs["num"] + 1)\n\n SolidDefinition(\n name="add_one",\n input_defs=[InputDefinition("num", Int)],\n output_defs=[OutputDefinition(Int)], # default name ("result")\n compute_fn=_add_one,\n )\n """\n\n def __init__(\n self,\n name: str,\n input_defs: Sequence[InputDefinition],\n compute_fn: Union[Callable[..., Any], "DecoratedSolidFunction"],\n output_defs: Sequence[OutputDefinition],\n config_schema: Optional[Union[ConfigSchemaType, IDefinitionConfigSchema]] = None,\n description: Optional[str] = None,\n tags: Optional[Dict[str, str]] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n retry_policy: Optional[RetryPolicy] = None,\n ):\n from .decorators.solid_decorator import DecoratedSolidFunction\n\n if isinstance(compute_fn, DecoratedSolidFunction):\n self._compute_fn: Union[Callable[..., Any], DecoratedSolidFunction] = compute_fn\n else:\n compute_fn = cast(Callable[..., Any], compute_fn)\n self._compute_fn = check.callable_param(compute_fn, "compute_fn")\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._required_resource_keys = frozenset(\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n )\n self._version = check.opt_str_param(version, "version")\n if version:\n experimental_arg_warning("version", "SolidDefinition.__init__")\n self._retry_policy = check.opt_inst_param(retry_policy, "retry_policy", RetryPolicy)\n\n positional_inputs = (\n self._compute_fn.positional_inputs()\n if isinstance(self._compute_fn, DecoratedSolidFunction)\n else None\n )\n\n super(SolidDefinition, self).__init__(\n name=name,\n input_defs=check.list_param(input_defs, "input_defs", InputDefinition),\n output_defs=check.list_param(output_defs, "output_defs", OutputDefinition),\n description=description,\n tags=check.opt_dict_param(tags, "tags", key_type=str),\n positional_inputs=positional_inputs,\n )\n\n def __call__(self, *args, **kwargs) -> Any:\n from ..execution.context.invocation import UnboundSolidExecutionContext\n from .composition import is_in_composition\n from .decorators.solid_decorator import DecoratedSolidFunction\n\n if is_in_composition():\n return super(SolidDefinition, self).__call__(*args, **kwargs)\n else:\n node_label = self.node_type_str # string "solid" for solids, "op" for ops\n\n if not isinstance(self.compute_fn, DecoratedSolidFunction):\n raise DagsterInvalidInvocationError(\n f"Attemped to invoke {node_label} that was not constructed using the `@{node_label}` "\n f"decorator. 
Only {node_label}s constructed using the `@{node_label}` decorator can be "\n "directly invoked."\n )\n if self.compute_fn.has_context_arg():\n if len(args) + len(kwargs) == 0:\n raise DagsterInvalidInvocationError(\n f"Compute function of {node_label} '{self.name}' has context argument, but no context "\n "was provided when invoking."\n )\n if len(args) > 0:\n if args[0] is not None and not isinstance(\n args[0], UnboundSolidExecutionContext\n ):\n raise DagsterInvalidInvocationError(\n f"Compute function of {node_label} '{self.name}' has context argument, "\n "but no context was provided when invoking."\n )\n context = args[0]\n return solid_invocation_result(self, context, *args[1:], **kwargs)\n # Context argument is provided under kwargs\n else:\n context_param_name = get_function_params(self.compute_fn.decorated_fn)[0].name\n if context_param_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Compute function of {node_label} '{self.name}' has context argument "\n f"'{context_param_name}', but no value for '{context_param_name}' was "\n f"found when invoking. Provided kwargs: {kwargs}"\n )\n context = kwargs[context_param_name]\n kwargs_sans_context = {\n kwarg: val\n for kwarg, val in kwargs.items()\n if not kwarg == context_param_name\n }\n return solid_invocation_result(self, context, *args, **kwargs_sans_context)\n\n else:\n if len(args) > 0 and isinstance(args[0], UnboundSolidExecutionContext):\n raise DagsterInvalidInvocationError(\n f"Compute function of {node_label} '{self.name}' has no context argument, but "\n "context was provided when invoking."\n )\n return solid_invocation_result(self, None, *args, **kwargs)\n\n @property\n def node_type_str(self) -> str:\n return "solid"\n\n @property\n def is_graph_job_op_node(self) -> bool:\n return False\n\n @property\n def compute_fn(self) -> Union[Callable[..., Any], "DecoratedSolidFunction"]:\n return self._compute_fn\n\n @property\n def config_schema(self) -> IDefinitionConfigSchema:\n return self._config_schema\n\n @property\n def required_resource_keys(self) -> Optional[FrozenSet[str]]:\n return frozenset(self._required_resource_keys)\n\n @property\n def version(self) -> Optional[str]:\n return self._version\n\n def all_dagster_types(self) -> Iterator[DagsterType]:\n yield from self.all_input_output_types()\n\n def iterate_node_defs(self) -> Iterator[NodeDefinition]:\n yield self\n\n def iterate_solid_defs(self) -> Iterator["SolidDefinition"]:\n yield self\n\n def resolve_output_to_origin(\n self, output_name: str, handle: NodeHandle\n ) -> Tuple[OutputDefinition, NodeHandle]:\n return self.output_def_named(output_name), handle\n\n def input_has_default(self, input_name: str) -> InputDefinition:\n return self.input_def_named(input_name).has_default_value\n\n def default_value_for_input(self, input_name: str) -> InputDefinition:\n return self.input_def_named(input_name).default_value\n\n def input_supports_dynamic_output_dep(self, input_name: str) -> bool:\n return True\n\n def copy_for_configured(\n self,\n name: str,\n description: Optional[str],\n config_schema: IDefinitionConfigSchema,\n config_or_config_fn: Any,\n ) -> "SolidDefinition":\n return SolidDefinition(\n name=name,\n input_defs=self.input_defs,\n compute_fn=self.compute_fn,\n output_defs=self.output_defs,\n config_schema=config_schema,\n description=description or self.description,\n tags=self.tags,\n required_resource_keys=self.required_resource_keys,\n version=self.version,\n retry_policy=self.retry_policy,\n )\n\n @property\n def retry_policy(self) 
-> Optional[RetryPolicy]:\n return self._retry_policy
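The direct-invocation checks above are easier to follow next to a concrete example. A minimal sketch, assuming the `@op` decorator and the `build_op_context` helper from this release line (the op names are made up):

.. code-block:: python

    from dagster import build_op_context, op

    @op
    def double(num: int) -> int:
        return num * 2

    @op
    def log_and_emit(context):
        context.log.info("running log_and_emit")
        return 1

    # An op whose compute function takes no context argument can be invoked bare.
    assert double(3) == 6

    # An op that declares a context argument must be handed one, per the checks above.
    assert log_and_emit(build_op_context()) == 1

Invoking a decorated op this way routes through `solid_invocation_result` rather than a full execution plan, which is what makes it usable in plain unit tests.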
\n\n\n
[docs]class CompositeSolidDefinition(GraphDefinition):\n    """The core unit of composition and abstraction, composite solids allow you to\n    define a solid from a graph of solids.\n\n    In the same way you would refactor a block of code into a function to deduplicate, organize,\n    or manage complexity - you can refactor solids in a pipeline into a composite solid.\n\n    Args:\n        name (str): The name of this composite solid. Must be unique within any\n            :py:class:`PipelineDefinition` using the solid.\n        solid_defs (List[Union[SolidDefinition, CompositeSolidDefinition]]): The set of solid\n            definitions used in this composite solid. Composites may be arbitrarily nested.\n        input_mappings (Optional[List[InputMapping]]): Define the inputs to the composite solid,\n            and how they map to the inputs of its constituent solids.\n        output_mappings (Optional[List[OutputMapping]]): Define the outputs of the composite solid,\n            and how they map from the outputs of its constituent solids.\n        config_mapping (Optional[ConfigMapping]): By specifying a config mapping, you can override\n            the configuration for the child solids contained within this composite solid. Config\n            mappings require both a configuration field to be specified, which is exposed as the\n            configuration for the composite solid, and a configuration mapping function, which\n            is called to map the configuration of the composite solid into the configuration that\n            is applied to any child solids.\n        dependencies (Optional[Dict[Union[str, NodeInvocation], Dict[str, DependencyDefinition]]]):\n            A structure that declares where each solid gets its inputs. The keys at the top\n            level dict are either string names of solids or NodeInvocations. The values\n            are dicts that map input names to DependencyDefinitions.\n        description (Optional[str]): Human readable description of this composite solid.\n        tags (Optional[Dict[str, Any]]): Arbitrary metadata for the solid. Frameworks may\n            expect and require certain metadata to be attached to a solid. Users should generally\n            not set metadata directly. Values that are not strings will be json encoded and must meet\n            the criteria that `json.loads(json.dumps(value)) == value`.\n        positional_inputs (Optional[List[str]]): The positional order of the inputs if it\n            differs from the order of the input mappings\n\n    Examples:\n\n        .. 
code-block:: python\n\n @lambda_solid\n def add_one(num: int) -> int:\n return num + 1\n\n add_two = CompositeSolidDefinition(\n 'add_two',\n solid_defs=[add_one],\n dependencies={\n NodeInvocation('add_one', 'adder_1'): {},\n NodeInvocation('add_one', 'adder_2'): {'num': DependencyDefinition('adder_1')},\n },\n input_mappings=[InputDefinition('num', Int).mapping_to('adder_1', 'num')],\n output_mappings=[OutputDefinition(Int).mapping_from('adder_2')],\n )\n """\n\n def __init__(\n self,\n name: str,\n solid_defs: List[NodeDefinition],\n input_mappings: Optional[List[InputMapping]] = None,\n output_mappings: Optional[List[OutputMapping]] = None,\n config_mapping: Optional[ConfigMapping] = None,\n dependencies: Optional[\n Dict[Union[str, NodeInvocation], Dict[str, IDependencyDefinition]]\n ] = None,\n description: Optional[str] = None,\n tags: Optional[Dict[str, str]] = None,\n positional_inputs: Optional[List[str]] = None,\n ):\n _check_io_managers_on_composite_solid(name, input_mappings, output_mappings)\n\n super(CompositeSolidDefinition, self).__init__(\n name=name,\n description=description,\n node_defs=solid_defs,\n dependencies=dependencies,\n tags=tags,\n positional_inputs=positional_inputs,\n input_mappings=input_mappings,\n output_mappings=output_mappings,\n config=config_mapping,\n )\n\n def all_dagster_types(self) -> Iterator[DagsterType]:\n yield from self.all_input_output_types()\n\n for node_def in self._node_defs:\n yield from node_def.all_dagster_types()\n\n def copy_for_configured(\n self,\n name: str,\n description: Optional[str],\n config_schema: IDefinitionConfigSchema,\n config_or_config_fn: Any,\n ) -> "CompositeSolidDefinition":\n config_mapping = self._config_mapping\n if config_mapping is None:\n raise DagsterInvalidDefinitionError(\n "Only composite solids utilizing config mapping can be pre-configured. The "\n 'composite solid "{graph_name}" does not have a config mapping, and thus has '\n "nothing to be configured.".format(graph_name=self.name)\n )\n\n return CompositeSolidDefinition(\n name=name,\n solid_defs=self._node_defs,\n input_mappings=self.input_mappings,\n output_mappings=self.output_mappings,\n config_mapping=ConfigMapping(\n config_mapping.config_fn,\n config_schema=config_schema,\n receive_processed_config_values=config_mapping.receive_processed_config_values,\n ),\n dependencies=self.dependencies,\n description=description or self.description,\n tags=self.tags,\n positional_inputs=self.positional_inputs,\n )\n\n @property\n def node_type_str(self):\n return "composite solid"\n\n @property\n def is_graph_job_op_node(self) -> bool:\n return False
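The `config_mapping` argument described in the docstring above is easiest to see in code. The sketch below is illustrative rather than taken from this module: the solid name `scale`, the `factor` key, and the outer config schema are all hypothetical, and it assumes the legacy `@solid` and `ConfigMapping` APIs documented elsewhere on this page.

.. code-block:: python

    from dagster import (
        CompositeSolidDefinition,
        ConfigMapping,
        InputDefinition,
        OutputDefinition,
        solid,
    )

    @solid(config_schema={"factor": int})
    def scale(context, num: int) -> int:
        return num * context.solid_config["factor"]

    def _config_fn(outer_config):
        # Map the composite solid's config onto its child solid's config.
        return {"scale": {"config": {"factor": outer_config["factor"]}}}

    scaled = CompositeSolidDefinition(
        "scaled",
        solid_defs=[scale],
        config_mapping=ConfigMapping(config_fn=_config_fn, config_schema={"factor": int}),
        input_mappings=[InputDefinition("num", int).mapping_to("scale", "num")],
        output_mappings=[OutputDefinition(int).mapping_from("scale")],
    )

With this mapping in place, configuring the composite solid with `{"factor": 3}` is translated into config for the child `scale` solid.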
\n\n\ndef _check_io_managers_on_composite_solid(\n    name: str,\n    input_mappings: Optional[List[InputMapping]],\n    output_mappings: Optional[List[OutputMapping]],\n):\n    # Ban root_manager_key on composite solids\n    if input_mappings:\n        for input_mapping in input_mappings:\n            input_def = input_mapping.definition\n            if input_def.root_manager_key:\n                raise DagsterInvalidDefinitionError(\n                    "Root input manager cannot be set on a composite solid: "\n                    f'root_manager_key "{input_def.root_manager_key}" '\n                    f'is set on InputDefinition "{input_def.name}" of composite solid "{name}". '\n                )\n    # Ban io_manager_key on composite solids\n    if output_mappings:\n        for output_mapping in output_mappings:\n            output_def = output_mapping.definition\n            if output_def.io_manager_key != "io_manager":\n                raise DagsterInvalidDefinitionError(\n                    "IO manager cannot be set on a composite solid: "\n                    f'io_manager_key "{output_def.io_manager_key}" '\n                    f'is set on OutputDefinition "{output_def.name}" of composite solid "{name}". '\n                )\n
", "current_page_name": "_modules/dagster/core/definitions/solid_definition", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.solid_definition"}, "time_window_partitions": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.time_window_partitions

\nfrom datetime import datetime, time\nfrom typing import Any, Callable, Dict, List, NamedTuple, Optional, Union, cast\n\nimport pendulum\n\nimport dagster._check as check\nfrom dagster.utils.partitions import DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE\nfrom dagster.utils.schedules import schedule_execution_time_iterator\n\nfrom .partition import (\n    DEFAULT_DATE_FORMAT,\n    Partition,\n    PartitionedConfig,\n    PartitionsDefinition,\n    ScheduleType,\n    get_cron_schedule,\n)\n\n\nclass TimeWindow(NamedTuple):\n    """An interval that is closed at the start and open at the end"""\n\n    start: datetime\n    end: datetime\n\n\n
[docs]class TimeWindowPartitionsDefinition(\n PartitionsDefinition[TimeWindow], # pylint: disable=unsubscriptable-object\n NamedTuple(\n "_TimeWindowPartitions",\n [\n ("schedule_type", ScheduleType),\n ("start", datetime),\n ("timezone", str),\n ("fmt", str),\n ("end_offset", int),\n ("minute_offset", int),\n ("hour_offset", int),\n ("day_offset", Optional[int]),\n ],\n ),\n):\n def __new__( # pylint: disable=arguments-differ\n cls,\n schedule_type: ScheduleType,\n start: Union[datetime, str],\n timezone: Optional[str],\n fmt: str,\n end_offset: int,\n minute_offset: int = 0,\n hour_offset: int = 0,\n day_offset: Optional[int] = None,\n ):\n if isinstance(start, str):\n start_dt = datetime.strptime(start, fmt)\n else:\n start_dt = start\n\n return super(TimeWindowPartitionsDefinition, cls).__new__(\n cls,\n schedule_type,\n start_dt,\n timezone or "UTC",\n fmt,\n end_offset,\n minute_offset,\n hour_offset,\n day_offset,\n )\n\n def get_partitions(\n self, current_time: Optional[datetime] = None\n ) -> List[Partition[TimeWindow]]:\n current_timestamp = (\n pendulum.instance(current_time, tz=self.timezone)\n if current_time\n else pendulum.now(self.timezone)\n ).timestamp()\n\n time_of_day = time(self.hour_offset, self.minute_offset)\n\n start_timestamp = pendulum.instance(self.start, tz=self.timezone).timestamp()\n iterator = schedule_execution_time_iterator(\n start_timestamp=start_timestamp,\n cron_schedule=get_cron_schedule(\n schedule_type=self.schedule_type,\n time_of_day=time_of_day,\n execution_day=self.day_offset,\n ),\n execution_timezone=self.timezone,\n )\n\n partitions: List[Partition[TimeWindow]] = []\n prev_time = next(iterator)\n while prev_time.timestamp() < start_timestamp:\n prev_time = next(iterator)\n\n end_offset = self.end_offset\n partitions_past_current_time = 0\n while True:\n next_time = next(iterator)\n if (\n next_time.timestamp() <= current_timestamp\n or partitions_past_current_time < end_offset\n ):\n partitions.append(\n Partition(\n value=TimeWindow(prev_time, next_time),\n name=prev_time.strftime(self.fmt),\n )\n )\n\n if next_time.timestamp() > current_timestamp:\n partitions_past_current_time += 1\n else:\n break\n\n prev_time = next_time\n\n if end_offset < 0:\n partitions = partitions[:end_offset]\n\n return partitions\n\n def __str__(self) -> str:\n partition_def_str = f"{self.schedule_type.value.capitalize()}, starting {self.start.strftime(self.fmt)} {self.timezone}."\n if self.end_offset != 0:\n partition_def_str += f" End offsetted by {self.end_offset} partition{'' if self.end_offset == 1 else 's'}."\n return partition_def_str\n\n def time_window_for_partition_key(self, partition_key: str) -> TimeWindow:\n start = self.start_time_for_partition_key(partition_key)\n time_of_day = time(self.hour_offset, self.minute_offset)\n iterator = schedule_execution_time_iterator(\n start_timestamp=start.timestamp(),\n cron_schedule=get_cron_schedule(\n schedule_type=self.schedule_type,\n time_of_day=time_of_day,\n execution_day=self.day_offset,\n ),\n execution_timezone=self.timezone,\n )\n\n return TimeWindow(next(iterator), next(iterator))\n\n def start_time_for_partition_key(self, partition_key: str) -> datetime:\n return pendulum.instance(datetime.strptime(partition_key, self.fmt), tz=self.timezone)\n\n def get_default_partition_mapping(self):\n from dagster.core.asset_defs.time_window_partition_mapping import TimeWindowPartitionMapping\n\n return TimeWindowPartitionMapping()
\n\n\n
[docs]class DailyPartitionsDefinition(TimeWindowPartitionsDefinition):\n def __new__(\n cls,\n start_date: Union[datetime, str],\n minute_offset: int = 0,\n hour_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n ):\n """A set of daily partitions.\n\n The first partition in the set will start at the start_date at midnight. The last partition\n in the set will end before the current time, unless the end_offset argument is set to a\n positive number. If minute_offset and/or hour_offset are used, the start and end times of\n each partition will be hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions. Can\n provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n\n .. code-block:: python\n DailyPartitionsDefinition(start_date="2022-03-12")\n # creates partitions (2022-03-12-00:00, 2022-03-13-00:00), (2022-03-13-00:00, 2022-03-14-00:00), ...\n\n DailyPartitionsDefinition(start_date="2022-03-12", minute_offset=15, hour_offset=16)\n # creates partitions (2022-03-12-16:15, 2022-03-13-16:15), (2022-03-13-16:15, 2022-03-14-16:15), ...\n """\n _fmt = fmt or DEFAULT_DATE_FORMAT\n\n return super(DailyPartitionsDefinition, cls).__new__(\n cls,\n schedule_type=ScheduleType.DAILY,\n start=start_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n timezone=timezone,\n fmt=_fmt,\n end_offset=end_offset,\n )
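Because `DailyPartitionsDefinition` is a `TimeWindowPartitionsDefinition` with a daily cadence, the helper methods defined above can be used to inspect the generated windows. A small sketch; the `current_time` value is arbitrary and only fixes the result for illustration:

.. code-block:: python

    from datetime import datetime
    from dagster import DailyPartitionsDefinition

    partitions_def = DailyPartitionsDefinition(start_date="2022-03-12")

    # Windows that have closed as of current_time: "2022-03-12", "2022-03-13", "2022-03-14"
    names = [p.name for p in partitions_def.get_partitions(datetime(2022, 3, 15))]

    # The half-open time window backing a single partition key (UTC by default)
    window = partitions_def.time_window_for_partition_key("2022-03-12")
    # window.start == 2022-03-12 00:00, window.end == 2022-03-13 00:00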
\n\n\ndef wrap_time_window_tags_fn(\n tags_fn: Optional[Callable[[datetime, datetime], Dict[str, str]]]\n) -> Callable[[Partition], Dict[str, str]]:\n def _tag_wrapper(partition: Partition) -> Dict[str, str]:\n if not tags_fn:\n return {}\n return tags_fn(cast(datetime, partition.value[0]), cast(datetime, partition.value[1]))\n\n return _tag_wrapper\n\n\n
[docs]def daily_partitioned_config(\n start_date: Union[datetime, str],\n minute_offset: int = 0,\n hour_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n tags_for_partition_fn: Optional[Callable[[datetime, datetime], Dict[str, str]]] = None,\n) -> Callable[[Callable[[datetime, datetime], Dict[str, Any]]], PartitionedConfig]:\n """Defines run config over a set of daily partitions.\n\n The decorated function should accept a start datetime and end datetime, which represent the bounds\n of the date partition the config should delineate.\n\n The decorated function should return a run config dictionary.\n\n The resulting object created by this decorator can be provided to the config argument of a Job.\n The first partition in the set will start at the start_date at midnight. The last partition in\n the set will end before the current time, unless the end_offset argument is set to a positive\n number. If minute_offset and/or hour_offset are used, the start and end times of each partition\n will be hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions. Can\n provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n\n .. code-block:: python\n\n @daily_partitioned_config(start_date="2022-03-12")\n # creates partitions (2022-03-12-00:00, 2022-03-13-00:00), (2022-03-13-00:00, 2022-03-14-00:00), ...\n\n @daily_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=16)\n # creates partitions (2022-03-12-16:15, 2022-03-13-16:15), (2022-03-13-16:15, 2022-03-14-16:15), ...\n """\n\n def inner(fn: Callable[[datetime, datetime], Dict[str, Any]]) -> PartitionedConfig:\n check.callable_param(fn, "fn")\n\n return PartitionedConfig(\n run_config_for_partition_fn=lambda partition: fn(\n partition.value[0], partition.value[1]\n ),\n partitions_def=DailyPartitionsDefinition(\n start_date=start_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n timezone=timezone,\n fmt=fmt,\n end_offset=end_offset,\n ),\n decorated_fn=fn,\n tags_for_partition_fn=wrap_time_window_tags_fn(tags_for_partition_fn),\n )\n\n return inner
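The docstring above only sketches the decorator call, so the following example shows the full wiring of a daily partitioned config into a job. The op and job names are made up; it assumes the `@op` and `@job` APIs from this version of Dagster:

.. code-block:: python

    from datetime import datetime
    from dagster import daily_partitioned_config, job, op

    @op(config_schema={"start": str, "end": str})
    def ingest_window(context):
        context.log.info(f"ingesting {context.op_config['start']} -> {context.op_config['end']}")

    @daily_partitioned_config(start_date="2022-03-12")
    def my_daily_config(start: datetime, end: datetime):
        # Return run config for the half-open window [start, end).
        return {"ops": {"ingest_window": {"config": {"start": start.isoformat(), "end": end.isoformat()}}}}

    @job(config=my_daily_config)
    def ingest_job():
        ingest_window()

A backfill or a launchpad run for a given partition then calls `my_daily_config` with that partition's time window to build the run config.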
\n\n\n
[docs]class HourlyPartitionsDefinition(TimeWindowPartitionsDefinition):\n    def __new__(\n        cls,\n        start_date: Union[datetime, str],\n        minute_offset: int = 0,\n        timezone: Optional[str] = None,\n        fmt: Optional[str] = None,\n        end_offset: int = 0,\n    ):\n        """A set of hourly partitions.\n\n        The first partition in the set will start on the start_date at midnight. The last partition\n        in the set will end before the current time, unless the end_offset argument is set to a\n        positive number. If minute_offset is provided, the start and end times of each partition\n        will be minute_offset past the hour.\n\n        Args:\n            start_date (Union[datetime.datetime, str]): The first date in the set of partitions. Can\n                provide in either a datetime or string format.\n            minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n                to 0.\n            fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d-%H:%M`.\n            timezone (Optional[str]): The timezone in which each date should exist.\n                Supported strings for timezones are the ones provided by the\n                `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n            end_offset (int): Extends the partition set by a number of partitions equal to the value\n                passed. If end_offset is 0 (the default), the last partition ends before the current\n                time. If end_offset is 1, the second-to-last partition ends before the current time,\n                and so on.\n\n        .. code-block:: python\n\n            HourlyPartitionsDefinition(start_date=datetime(2022, 3, 12))\n            # creates partitions (2022-03-12-00:00, 2022-03-12-01:00), (2022-03-12-01:00, 2022-03-12-02:00), ...\n\n            HourlyPartitionsDefinition(start_date=datetime(2022, 3, 12), minute_offset=15)\n            # creates partitions (2022-03-12-00:15, 2022-03-12-01:15), (2022-03-12-01:15, 2022-03-12-02:15), ...\n        """\n        _fmt = fmt or DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE\n\n        return super(HourlyPartitionsDefinition, cls).__new__(\n            cls,\n            schedule_type=ScheduleType.HOURLY,\n            start=start_date,\n            minute_offset=minute_offset,\n            timezone=timezone,\n            fmt=_fmt,\n            end_offset=end_offset,\n        )
\n\n\n
[docs]def hourly_partitioned_config(\n    start_date: Union[datetime, str],\n    minute_offset: int = 0,\n    timezone: Optional[str] = None,\n    fmt: Optional[str] = None,\n    end_offset: int = 0,\n    tags_for_partition_fn: Optional[Callable[[datetime, datetime], Dict[str, str]]] = None,\n) -> Callable[[Callable[[datetime, datetime], Dict[str, Any]]], PartitionedConfig]:\n    """Defines run config over a set of hourly partitions.\n\n    The decorated function should accept a start datetime and end datetime, which represent the date\n    partition the config should delineate.\n\n    The decorated function should return a run config dictionary.\n\n    The resulting object created by this decorator can be provided to the config argument of a Job.\n    The first partition in the set will start at the start_date at midnight. The last partition in\n    the set will end before the current time, unless the end_offset argument is set to a positive\n    number. If minute_offset is provided, the start and end times of each partition will be\n    minute_offset past the hour.\n\n    Args:\n        start_date (Union[datetime.datetime, str]): The first date in the set of partitions. Can\n            provide in either a datetime or string format.\n        minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n            to 0.\n        fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d-%H:%M`.\n        timezone (Optional[str]): The timezone in which each date should exist.\n            Supported strings for timezones are the ones provided by the\n            `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n        end_offset (int): Extends the partition set by a number of partitions equal to the value\n            passed. If end_offset is 0 (the default), the last partition ends before the current\n            time. If end_offset is 1, the second-to-last partition ends before the current time,\n            and so on.\n\n    .. code-block:: python\n\n        @hourly_partitioned_config(start_date=datetime(2022, 3, 12))\n        # creates partitions (2022-03-12-00:00, 2022-03-12-01:00), (2022-03-12-01:00, 2022-03-12-02:00), ...\n\n        @hourly_partitioned_config(start_date=datetime(2022, 3, 12), minute_offset=15)\n        # creates partitions (2022-03-12-00:15, 2022-03-12-01:15), (2022-03-12-01:15, 2022-03-12-02:15), ...\n    """\n\n    def inner(fn: Callable[[datetime, datetime], Dict[str, Any]]) -> PartitionedConfig:\n        check.callable_param(fn, "fn")\n\n        return PartitionedConfig(\n            run_config_for_partition_fn=lambda partition: fn(\n                partition.value[0], partition.value[1]\n            ),\n            partitions_def=HourlyPartitionsDefinition(\n                start_date=start_date,\n                minute_offset=minute_offset,\n                timezone=timezone,\n                fmt=fmt,\n                end_offset=end_offset,\n            ),\n            decorated_fn=fn,\n            tags_for_partition_fn=wrap_time_window_tags_fn(tags_for_partition_fn),\n        )\n\n    return inner
\n\n\n
[docs]class MonthlyPartitionsDefinition(TimeWindowPartitionsDefinition):\n    def __new__(\n        cls,\n        start_date: Union[datetime, str],\n        minute_offset: int = 0,\n        hour_offset: int = 0,\n        day_offset: int = 1,\n        timezone: Optional[str] = None,\n        fmt: Optional[str] = None,\n        end_offset: int = 0,\n    ):\n        """A set of monthly partitions.\n\n        The first partition in the set will start at the soonest first of the month after start_date\n        at midnight. The last partition in the set will end before the current time, unless the\n        end_offset argument is set to a positive number. If day_offset is provided, the start and\n        end date of each partition will be day_offset. If minute_offset and/or hour_offset are used,\n        the start and end times of each partition will be hour_offset:minute_offset of each day.\n\n        Args:\n            start_date (Union[datetime.datetime, str]): The first date in the set of partitions will be\n                midnight the soonest first of the month following start_date. Can provide in either a\n                datetime or string format.\n            minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n                to 0.\n            hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n            day_offset (int): Day of the month to "split" the partition. Defaults to 1.\n            timezone (Optional[str]): The timezone in which each date should exist.\n                Supported strings for timezones are the ones provided by the\n                `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n            fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n            end_offset (int): Extends the partition set by a number of partitions equal to the value\n                passed. If end_offset is 0 (the default), the last partition ends before the current\n                time. If end_offset is 1, the second-to-last partition ends before the current time,\n                and so on.\n\n        .. code-block:: python\n\n            MonthlyPartitionsDefinition(start_date="2022-03-12")\n            # creates partitions (2022-04-01-00:00, 2022-05-01-00:00), (2022-05-01-00:00, 2022-06-01-00:00), ...\n\n            MonthlyPartitionsDefinition(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=5)\n            # creates partitions (2022-04-05-03:15, 2022-05-05-03:15), (2022-05-05-03:15, 2022-06-05-03:15), ...\n        """\n        _fmt = fmt or DEFAULT_DATE_FORMAT\n\n        return super(MonthlyPartitionsDefinition, cls).__new__(\n            cls,\n            schedule_type=ScheduleType.MONTHLY,\n            start=start_date,\n            minute_offset=minute_offset,\n            hour_offset=hour_offset,\n            day_offset=day_offset,\n            timezone=timezone,\n            fmt=_fmt,\n            end_offset=end_offset,\n        )
\n\n\n
[docs]def monthly_partitioned_config(\n    start_date: Union[datetime, str],\n    minute_offset: int = 0,\n    hour_offset: int = 0,\n    day_offset: int = 1,\n    timezone: Optional[str] = None,\n    fmt: Optional[str] = None,\n    end_offset: int = 0,\n    tags_for_partition_fn: Optional[Callable[[datetime, datetime], Dict[str, str]]] = None,\n) -> Callable[[Callable[[datetime, datetime], Dict[str, Any]]], PartitionedConfig]:\n    """Defines run config over a set of monthly partitions.\n\n    The decorated function should accept a start datetime and end datetime, which represent the date\n    partition the config should delineate.\n\n    The decorated function should return a run config dictionary.\n\n    The resulting object created by this decorator can be provided to the config argument of a Job.\n    The first partition in the set will start at midnight on the soonest first of the month after\n    start_date. The last partition in the set will end before the current time, unless the\n    end_offset argument is set to a positive number. If day_offset is provided, the start and end\n    date of each partition will be day_offset. If minute_offset and/or hour_offset are used, the\n    start and end times of each partition will be hour_offset:minute_offset of each day.\n\n    Args:\n        start_date (Union[datetime.datetime, str]): The first date in the set of partitions will be\n            midnight the soonest first of the month following start_date. Can provide in either a\n            datetime or string format.\n        minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n            to 0.\n        hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n        day_offset (int): Day of the month to "split" the partition. Defaults to 1.\n        timezone (Optional[str]): The timezone in which each date should exist.\n            Supported strings for timezones are the ones provided by the\n            `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n        fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n        end_offset (int): Extends the partition set by a number of partitions equal to the value\n            passed. If end_offset is 0 (the default), the last partition ends before the current\n            time. If end_offset is 1, the second-to-last partition ends before the current time,\n            and so on.\n\n    .. code-block:: python\n\n        @monthly_partitioned_config(start_date="2022-03-12")\n        # creates partitions (2022-04-01-00:00, 2022-05-01-00:00), (2022-05-01-00:00, 2022-06-01-00:00), ...\n\n        @monthly_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=5)\n        # creates partitions (2022-04-05-03:15, 2022-05-05-03:15), (2022-05-05-03:15, 2022-06-05-03:15), ...\n    """\n\n    def inner(fn: Callable[[datetime, datetime], Dict[str, Any]]) -> PartitionedConfig:\n        check.callable_param(fn, "fn")\n\n        return PartitionedConfig(\n            run_config_for_partition_fn=lambda partition: fn(\n                partition.value[0], partition.value[1]\n            ),\n            partitions_def=MonthlyPartitionsDefinition(\n                start_date=start_date,\n                minute_offset=minute_offset,\n                hour_offset=hour_offset,\n                day_offset=day_offset,\n                timezone=timezone,\n                fmt=fmt,\n                end_offset=end_offset,\n            ),\n            decorated_fn=fn,\n            tags_for_partition_fn=wrap_time_window_tags_fn(tags_for_partition_fn),\n        )\n\n    return inner
\n\n\n
[docs]class WeeklyPartitionsDefinition(TimeWindowPartitionsDefinition):\n def __new__(\n cls,\n start_date: Union[datetime, str],\n minute_offset: int = 0,\n hour_offset: int = 0,\n day_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n ):\n """Defines a set of weekly partitions.\n\n The first partition in the set will start at the start_date. The last partition in the set will\n end before the current time, unless the end_offset argument is set to a positive number. If\n day_offset is provided, the start and end date of each partition will be day of the week\n corresponding to day_offset (0 indexed with Sunday as the start of the week). If\n minute_offset and/or hour_offset are used, the start and end times of each partition will be\n hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions will\n Sunday at midnight following start_date. Can provide in either a datetime or string\n format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n day_offset (int): Day of the week to "split" the partition. Defaults to 0 (Sunday).\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n\n .. code-block:: python\n\n WeeklyPartitionsDefinition(start_date="2022-03-12")\n # creates partitions (2022-03-13-00:00, 2022-03-20-00:00), (2022-03-20-00:00, 2022-03-27-00:00), ...\n\n WeeklyPartitionsDefinition(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=6)\n # creates partitions (2022-03-12-03:15, 2022-03-19-03:15), (2022-03-19-03:15, 2022-03-26-03:15), ...\n """\n _fmt = fmt or DEFAULT_DATE_FORMAT\n\n return super(WeeklyPartitionsDefinition, cls).__new__(\n cls,\n schedule_type=ScheduleType.WEEKLY,\n start=start_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n day_offset=day_offset,\n timezone=timezone,\n fmt=_fmt,\n end_offset=end_offset,\n )
\n\n\n
[docs]def weekly_partitioned_config(\n start_date: Union[datetime, str],\n minute_offset: int = 0,\n hour_offset: int = 0,\n day_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n tags_for_partition_fn: Optional[Callable[[datetime, datetime], Dict[str, str]]] = None,\n) -> Callable[[Callable[[datetime, datetime], Dict[str, Any]]], PartitionedConfig]:\n """Defines run config over a set of weekly partitions.\n\n The decorated function should accept a start datetime and end datetime, which represent the date\n partition the config should delineate.\n\n The decorated function should return a run config dictionary.\n\n The resulting object created by this decorator can be provided to the config argument of a Job.\n The first partition in the set will start at the start_date. The last partition in the set will\n end before the current time, unless the end_offset argument is set to a positive number. If\n day_offset is provided, the start and end date of each partition will be day of the week\n corresponding to day_offset (0 indexed with Sunday as the start of the week). If\n minute_offset and/or hour_offset are used, the start and end times of each partition will be\n hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions will\n Sunday at midnight following start_date. Can provide in either a datetime or string\n format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n day_offset (int): Day of the week to "split" the partition. Defaults to 0 (Sunday).\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n\n .. code-block:: python\n\n @weekly_partitioned_config(start_date="2022-03-12")\n # creates partitions (2022-03-13-00:00, 2022-03-20-00:00), (2022-03-20-00:00, 2022-03-27-00:00), ...\n\n @weekly_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=6)\n # creates partitions (2022-03-12-03:15, 2022-03-19-03:15), (2022-03-19-03:15, 2022-03-26-03:15), ...\n """\n\n def inner(fn: Callable[[datetime, datetime], Dict[str, Any]]) -> PartitionedConfig:\n check.callable_param(fn, "fn")\n\n return PartitionedConfig(\n run_config_for_partition_fn=lambda partition: fn(\n partition.value[0], partition.value[1]\n ),\n partitions_def=WeeklyPartitionsDefinition(\n start_date=start_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n day_offset=day_offset,\n timezone=timezone,\n fmt=fmt,\n end_offset=end_offset,\n ),\n decorated_fn=fn,\n tags_for_partition_fn=wrap_time_window_tags_fn(tags_for_partition_fn),\n )\n\n return inner
\n
", "current_page_name": "_modules/dagster/core/definitions/time_window_partitions", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.time_window_partitions"}, "utils": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.utils

\nimport keyword\nimport os\nimport re\nfrom glob import glob\nfrom typing import Any, Dict, List, Optional, Tuple\n\nimport pkg_resources\nimport yaml\n\nimport dagster._check as check\nimport dagster.seven as seven\nfrom dagster.core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\nfrom dagster.core.storage.tags import check_reserved_tags\nfrom dagster.utils import frozentags\nfrom dagster.utils.yaml_utils import merge_yaml_strings, merge_yamls\n\nDEFAULT_OUTPUT = "result"\n\nDISALLOWED_NAMES = set(\n    [\n        "context",\n        "conf",\n        "config",\n        "meta",\n        "arg_dict",\n        "dict",\n        "input_arg_dict",\n        "output_arg_dict",\n        "int",\n        "str",\n        "float",\n        "bool",\n        "input",\n        "output",\n        "type",\n    ]\n    + list(keyword.kwlist)  # just disallow all python keywords\n)\n\nVALID_NAME_REGEX_STR = r"^[A-Za-z0-9_]+$"\nVALID_NAME_REGEX = re.compile(VALID_NAME_REGEX_STR)\n\n\nclass NoValueSentinel:\n    """Sentinel value to distinguish unset from None"""\n\n\ndef has_valid_name_chars(name):\n    return bool(VALID_NAME_REGEX.match(name))\n\n\ndef check_valid_name(name: str):\n    check.str_param(name, "name")\n    if name in DISALLOWED_NAMES:\n        raise DagsterInvalidDefinitionError(\n            f'"{name}" is not a valid name in Dagster. It conflicts with a Dagster or python reserved keyword.'\n        )\n\n    if not has_valid_name_chars(name):\n        raise DagsterInvalidDefinitionError(\n            f'"{name}" is not a valid name in Dagster. Names must be in regex {VALID_NAME_REGEX_STR}.'\n        )\n\n    check.invariant(is_valid_name(name))\n    return name\n\n\ndef is_valid_name(name):\n    check.str_param(name, "name")\n\n    return name not in DISALLOWED_NAMES and has_valid_name_chars(name)\n\n\ndef _kv_str(key, value):\n    return '{key}="{value}"'.format(key=key, value=repr(value))\n\n\ndef struct_to_string(name, **kwargs):\n    # Sort the kwargs to ensure consistent representations across Python versions\n    props_str = ", ".join([_kv_str(key, value) for key, value in sorted(kwargs.items())])\n    return "{name}({props_str})".format(name=name, props_str=props_str)\n\n\ndef validate_tags(tags: Optional[Dict[str, Any]], allow_reserved_tags=True) -> Dict[str, str]:\n    valid_tags = {}\n    for key, value in check.opt_dict_param(tags, "tags", key_type=str).items():\n        if not isinstance(value, str):\n            valid = False\n            err_reason = 'Could not JSON encode value "{}"'.format(value)\n            try:\n                str_val = seven.json.dumps(value)\n                err_reason = 'JSON encoding "{json}" of value "{val}" is not equivalent to original value'.format(\n                    json=str_val, val=value\n                )\n\n                valid = seven.json.loads(str_val) == value\n            except Exception:\n                pass\n\n            if not valid:\n                raise DagsterInvalidDefinitionError(\n                    'Invalid value for tag "{key}", {err_reason}. Tag values must be strings '\n                    "or meet the constraint that json.loads(json.dumps(value)) == value.".format(\n                        key=key, err_reason=err_reason\n                    )\n                )\n\n            valid_tags[key] = str_val\n        else:\n            valid_tags[key] = value\n\n    if not allow_reserved_tags:\n        check_reserved_tags(valid_tags)\n\n    return frozentags(valid_tags)\n\n\n
[docs]def config_from_files(config_files: List[str]) -> Dict[str, Any]:\n """Constructs run config from YAML files.\n\n Args:\n config_files (List[str]): List of paths or glob patterns for yaml files\n to load and parse as the run config.\n\n Returns:\n Dict[str, Any]: A run config dictionary constructed from provided YAML files.\n\n Raises:\n FileNotFoundError: When a config file produces no results\n DagsterInvariantViolationError: When one of the YAML files is invalid and has a parse\n error.\n """\n config_files = check.opt_list_param(config_files, "config_files")\n\n filenames = []\n for file_glob in config_files or []:\n globbed_files = glob(file_glob)\n if not globbed_files:\n raise DagsterInvariantViolationError(\n 'File or glob pattern "{file_glob}" for "config_files" '\n "produced no results.".format(file_glob=file_glob)\n )\n\n filenames += [os.path.realpath(globbed_file) for globbed_file in globbed_files]\n\n try:\n run_config = merge_yamls(filenames)\n except yaml.YAMLError as err:\n raise DagsterInvariantViolationError(\n f"Encountered error attempting to parse yaml. Parsing files {filenames} "\n f"loaded by file/patterns {config_files}."\n ) from err\n\n return run_config
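As a usage sketch, assuming a job object `my_job` and YAML files at the hypothetical paths shown, the helper can feed `execute_in_process` directly:

.. code-block:: python

    from dagster import config_from_files

    run_config = config_from_files(
        [
            "run_config/base.yaml",            # hypothetical path
            "run_config/environments/*.yaml",  # glob patterns are also accepted
        ]
    )
    result = my_job.execute_in_process(run_config=run_config)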
\n\n\n
[docs]def config_from_yaml_strings(yaml_strings: List[str]) -> Dict[str, Any]:\n """Static constructor for run configs from YAML strings.\n\n Args:\n yaml_strings (List[str]): List of yaml strings to parse as the run config.\n\n Returns:\n Dict[Str, Any]: A run config dictionary constructed from the provided yaml strings\n\n Raises:\n DagsterInvariantViolationError: When one of the YAML documents is invalid and has a\n parse error.\n """\n yaml_strings = check.list_param(yaml_strings, "yaml_strings", of_type=str)\n\n try:\n run_config = merge_yaml_strings(yaml_strings)\n except yaml.YAMLError as err:\n raise DagsterInvariantViolationError(\n f"Encountered error attempting to parse yaml. Parsing YAMLs {yaml_strings} "\n ) from err\n\n return run_config
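The same pattern works for in-memory YAML documents, which are merged into a single run config dictionary. The keys below are hypothetical, and the example assumes the helper is exported from the top-level `dagster` package like its siblings:

.. code-block:: python

    from dagster import config_from_yaml_strings

    run_config = config_from_yaml_strings(
        [
            "ops:\n  my_op:\n    config:\n      limit: 10\n",
            "resources:\n  io_manager:\n    config:\n      base_dir: /tmp/dagster\n",
        ]
    )
    # run_config now contains both the "ops" and "resources" sections.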
\n\n\n
[docs]def config_from_pkg_resources(pkg_resource_defs: List[Tuple[str, str]]) -> Dict[str, Any]:\n """Load a run config from a package resource, using :py:func:`pkg_resources.resource_string`.\n\n Example:\n\n .. code-block:: python\n\n config_from_pkg_resources(\n pkg_resource_defs=[\n ('dagster_examples.airline_demo.environments', 'local_base.yaml'),\n ('dagster_examples.airline_demo.environments', 'local_warehouse.yaml'),\n ],\n )\n\n\n Args:\n pkg_resource_defs (List[(str, str)]): List of pkg_resource modules/files to\n load as the run config.\n\n Returns:\n Dict[Str, Any]: A run config dictionary constructed from the provided yaml strings\n\n Raises:\n DagsterInvariantViolationError: When one of the YAML documents is invalid and has a\n parse error.\n """\n pkg_resource_defs = check.list_param(pkg_resource_defs, "pkg_resource_defs", of_type=tuple)\n\n try:\n yaml_strings = [\n pkg_resources.resource_string(*pkg_resource_def).decode("utf-8")\n for pkg_resource_def in pkg_resource_defs\n ]\n except (ModuleNotFoundError, FileNotFoundError, UnicodeDecodeError) as err:\n raise DagsterInvariantViolationError(\n "Encountered error attempting to parse yaml. Loading YAMLs from "\n f"package resources {pkg_resource_defs}."\n ) from err\n\n return config_from_yaml_strings(yaml_strings=yaml_strings)
\n
", "current_page_name": "_modules/dagster/core/definitions/utils", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.utils"}, "version_strategy": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.definitions.version_strategy

\nimport hashlib\nimport inspect\nfrom typing import TYPE_CHECKING, Any, NamedTuple, Optional\n\nif TYPE_CHECKING:\n    from .op_definition import OpDefinition\n    from .resource_definition import ResourceDefinition\n    from .solid_definition import SolidDefinition\n\n\nclass OpVersionContext(NamedTuple):\n    """Provides execution-time information for computing the version for an op.\n    Attributes:\n        op_def (OpDefinition): The definition of the op to compute a version for.\n        op_config (Any): The parsed config to be passed to the op during execution.\n    """\n\n    op_def: "OpDefinition"\n    op_config: Any\n\n    @property\n    def solid_def(self) -> "SolidDefinition":\n        return self.op_def\n\n    @property\n    def solid_config(self) -> Any:\n        return self.op_config\n\n\nSolidVersionContext = OpVersionContext\n\n\nclass ResourceVersionContext(NamedTuple):\n    """Version-specific resource context.\n\n    Attributes:\n        resource_def (ResourceDefinition): The definition of the resource whose version will be computed.\n        resource_config (Any): The parsed config to be passed to the resource during execution.\n    """\n\n    resource_def: "ResourceDefinition"\n    resource_config: Any\n\n\n
[docs]class VersionStrategy:\n    """Abstract class for defining a strategy to version solids and resources.\n\n    When subclassing, `get_solid_version` must be implemented, and `get_resource_version` can be\n    optionally implemented.\n\n    `get_solid_version` should ingest a SolidVersionContext, and `get_resource_version` should ingest a\n    ResourceVersionContext. From that, each synthesizes a unique string called a `version`, which will\n    be tagged to outputs of that solid in the pipeline. Providing a `VersionStrategy` instance to a\n    job will enable memoization on that job, such that only steps whose outputs do not have an\n    up-to-date version will run.\n    """\n\n    def get_solid_version(self, context: SolidVersionContext) -> str:\n        pass\n\n    def get_op_version(self, context: OpVersionContext) -> str:\n        return self.get_solid_version(context)\n\n    def get_resource_version(\n        self, context: ResourceVersionContext  # pylint: disable=unused-argument\n    ) -> Optional[str]:\n        return None
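A hedged sketch of a custom strategy, assuming the `version_strategy` argument on `@job` from this release line; the class name and the choice to hash the op name plus its parsed config are illustrative only:

.. code-block:: python

    import hashlib
    from dagster import VersionStrategy, job, op

    class NameAndConfigVersionStrategy(VersionStrategy):
        # Hypothetical strategy: an op only re-runs when its definition name
        # or its parsed configuration changes.
        def get_solid_version(self, context):
            raw = f"{context.solid_def.name}:{context.solid_config}"
            return hashlib.sha1(raw.encode("utf-8")).hexdigest()

    @op
    def emit_number():
        return 1

    @job(version_strategy=NameAndConfigVersionStrategy())
    def memoized_job():
        emit_number()

Memoization also requires outputs to be stored by an IO manager that can report whether a versioned output already exists, so this sketch only covers the versioning half.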
\n\n\n
[docs]class SourceHashVersionStrategy(VersionStrategy):\n def _get_source_hash(self, fn):\n code_as_str = inspect.getsource(fn)\n return hashlib.sha1(code_as_str.encode("utf-8")).hexdigest()\n\n def get_op_version(self, context: OpVersionContext) -> str:\n compute_fn = context.op_def.compute_fn\n if callable(compute_fn):\n return self._get_source_hash(compute_fn)\n else:\n return self._get_source_hash(compute_fn.decorated_fn)\n\n def get_resource_version(self, context: ResourceVersionContext) -> Optional[str]:\n return self._get_source_hash(context.resource_def.resource_fn)
\n
", "current_page_name": "_modules/dagster/core/definitions/version_strategy", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.definitions.version_strategy"}}, "errors": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.errors

\n"""Core Dagster error classes.\n\nAll errors thrown by the Dagster framework inherit from :py:class:`~dagster.DagsterError`. Users\nshould not subclass this base class for their own exceptions.\n\nThere is another exception base class, :py:class:`~dagster.DagsterUserCodeExecutionError`, which is\nused by the framework in concert with the :py:func:`~dagster.core.errors.user_code_error_boundary`.\n\nDagster uses this construct to wrap user code into which it calls. User code can perform arbitrary\ncomputations and may itself throw exceptions. The error boundary catches these user code-generated\nexceptions, and then reraises them wrapped in a subclass of\n:py:class:`~dagster.DagsterUserCodeExecutionError`.\n\nThe wrapped exceptions include additional context for the original exceptions, injected by the\nDagster runtime.\n"""\n\nimport sys\nfrom contextlib import contextmanager\n\nimport dagster._check as check\nfrom dagster.utils.interrupts import raise_interrupts_as\n\n\nclass DagsterExecutionInterruptedError(BaseException):\n    """\n    Pipeline execution was interrupted during the execution process.\n\n    Just like KeyboardInterrupt this inherits from BaseException\n    as to not be accidentally caught by code that catches Exception\n    and thus prevent the interpreter from exiting.\n    """\n\n\n
[docs]class DagsterError(Exception):\n """Base class for all errors thrown by the Dagster framework.\n\n Users should not subclass this base class for their own exceptions."""\n\n @property\n def is_user_code_error(self):\n """Returns true if this error is attributable to user code."""\n return False
\n\n\n
[docs]class DagsterInvalidDefinitionError(DagsterError):\n """Indicates that the rules for a definition have been violated by the user."""
\n\n\nclass DagsterInvalidSubsetError(DagsterError):\n """Indicates that a subset of a pipeline is invalid because either:\n - One or more ops in the specified subset do not exist on the job.'\n - The subset produces an invalid job.\n """\n\n\nCONFIG_ERROR_VERBIAGE = """\nThis value can be a:\n - Field\n - Python primitive types that resolve to dagster config types\n - int, float, bool, str, list.\n - A dagster config type: Int, Float, Bool, Array, Optional, Selector, Shape, Permissive, Map\n - A bare python dictionary, which is wrapped in Field(Shape(...)). Any values\n in the dictionary get resolved by the same rules, recursively.\n - A python list with a single entry that can resolve to a type, e.g. [int]\n"""\n\n\n
[docs]class DagsterInvalidConfigDefinitionError(DagsterError):\n """Indicates that you have attempted to construct a config with an invalid value\n\n Acceptable values for config types are any of:\n 1. A Python primitive type that resolves to a Dagster config type\n (:py:class:`~python:int`, :py:class:`~python:float`, :py:class:`~python:bool`,\n :py:class:`~python:str`, or :py:class:`~python:list`).\n\n 2. A Dagster config type: :py:data:`~dagster.Int`, :py:data:`~dagster.Float`,\n :py:data:`~dagster.Bool`, :py:data:`~dagster.String`,\n :py:data:`~dagster.StringSource`, :py:data:`~dagster.Any`,\n :py:class:`~dagster.Array`, :py:data:`~dagster.Noneable`, :py:data:`~dagster.Enum`,\n :py:class:`~dagster.Selector`, :py:class:`~dagster.Shape`, or\n :py:class:`~dagster.Permissive`.\n\n 3. A bare python dictionary, which will be automatically wrapped in\n :py:class:`~dagster.Shape`. Values of the dictionary are resolved recursively\n according to the same rules.\n\n 4. A bare python list of length one which itself is config type.\n Becomes :py:class:`Array` with list element as an argument.\n\n 5. An instance of :py:class:`~dagster.Field`.\n """\n\n def __init__(self, original_root, current_value, stack, reason=None, **kwargs):\n self.original_root = original_root\n self.current_value = current_value\n self.stack = stack\n super(DagsterInvalidConfigDefinitionError, self).__init__(\n (\n "Error defining config. Original value passed: {original_root}. "\n "{stack_str}{current_value} "\n "cannot be resolved.{reason_str}" + CONFIG_ERROR_VERBIAGE\n ).format(\n original_root=repr(original_root),\n stack_str="Error at stack path :" + ":".join(stack) + ". " if stack else "",\n current_value=repr(current_value),\n reason_str=" Reason: {reason}.".format(reason=reason) if reason else "",\n ),\n **kwargs,\n )
\n\n\n
[docs]class DagsterInvariantViolationError(DagsterError):\n """Indicates the user has violated a well-defined invariant that can only be enforced\n at runtime."""
\n\n\n
[docs]class DagsterExecutionStepNotFoundError(DagsterError):\n """Thrown when the user specifies execution step keys that do not exist."""\n\n def __init__(self, *args, **kwargs):\n self.step_keys = check.list_param(kwargs.pop("step_keys"), "step_keys", str)\n super(DagsterExecutionStepNotFoundError, self).__init__(*args, **kwargs)
\n\n\nclass DagsterExecutionPlanSnapshotNotFoundError(DagsterError):\n """Thrown when an expected execution plan snapshot could not be found on a PipelineRun."""\n\n\n
[docs]class DagsterRunNotFoundError(DagsterError):\n """Thrown when a run cannot be found in run storage."""\n\n def __init__(self, *args, **kwargs):\n self.invalid_run_id = check.str_param(kwargs.pop("invalid_run_id"), "invalid_run_id")\n super(DagsterRunNotFoundError, self).__init__(*args, **kwargs)
\n\n\n
[docs]class DagsterStepOutputNotFoundError(DagsterError):\n """Indicates that previous step outputs required for an execution step to proceed are not\n available."""\n\n def __init__(self, *args, **kwargs):\n self.step_key = check.str_param(kwargs.pop("step_key"), "step_key")\n self.output_name = check.str_param(kwargs.pop("output_name"), "output_name")\n super(DagsterStepOutputNotFoundError, self).__init__(*args, **kwargs)
\n\n\n@contextmanager\ndef raise_execution_interrupts():\n with raise_interrupts_as(DagsterExecutionInterruptedError):\n yield\n\n\n
[docs]@contextmanager\ndef user_code_error_boundary(error_cls, msg_fn, log_manager=None, **kwargs):\n    """\n    Wraps the execution of user-space code in an error boundary. This places a uniform\n    policy around any user code invoked by the framework. This ensures that all user\n    errors are wrapped in an exception derived from DagsterUserCodeExecutionError,\n    and that the original stack trace of the user error is preserved, so that it\n    can be reported without confusing framework code in the stack trace, if a\n    tool author wishes to do so.\n\n    Examples:\n\n    .. code-block:: python\n\n        with user_code_error_boundary(\n            # Pass a class that inherits from DagsterUserCodeExecutionError\n            DagsterExecutionStepExecutionError,\n            # Pass a function that produces a message\n            lambda: "Error occurred during step execution",\n        ):\n            call_user_provided_function()\n\n    """\n    check.callable_param(msg_fn, "msg_fn")\n    check.class_param(error_cls, "error_cls", superclass=DagsterUserCodeExecutionError)\n\n    with raise_execution_interrupts():\n        if log_manager:\n            log_manager.begin_python_log_capture()\n        try:\n            yield\n        except DagsterError as de:\n            # The system has thrown an error that is part of the user-framework contract\n            raise de\n        except Exception as e:  # pylint: disable=W0703\n            # An exception has been thrown by user code and computation should cease\n            # with the error reported further up the stack\n            raise error_cls(\n                msg_fn(), user_exception=e, original_exc_info=sys.exc_info(), **kwargs\n            ) from e\n        finally:\n            if log_manager:\n                log_manager.end_python_log_capture()
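For tool authors, the boundary is typically paired with a `try`/`except` on `DagsterUserCodeExecutionError` so the original user error can be surfaced cleanly. A hypothetical sketch; the step and op names and the `run_user_compute` helper are not part of this module:

.. code-block:: python

    from dagster.core.errors import (
        DagsterExecutionStepExecutionError,
        DagsterUserCodeExecutionError,
        user_code_error_boundary,
    )

    def run_user_compute(user_fn):
        try:
            with user_code_error_boundary(
                DagsterExecutionStepExecutionError,
                lambda: "Error occurred while executing user compute",
                # Extra kwargs are forwarded to the error class constructor.
                step_key="my_step",
                op_name="my_op",
                op_def_name="my_op",
            ):
                return user_fn()
        except DagsterUserCodeExecutionError as err:
            # err.user_exception and err.original_exc_info preserve the original failure.
            print(f"user code failed: {err.user_exception!r}")
            raise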
\n\n\n
[docs]class DagsterUserCodeExecutionError(DagsterError):\n    """\n    This is the base class for any exception that is meant to wrap an\n    :py:class:`~python:Exception` thrown by user code. It wraps that underlying user exception.\n    The ``original_exc_info`` argument to the constructor is meant to be a tuple of the type\n    returned by :py:func:`sys.exc_info <python:sys.exc_info>` at the call site of the constructor.\n\n    Users should not subclass this base class for their own exceptions and should instead throw\n    freely from user code. User exceptions will be automatically wrapped and rethrown.\n    """\n\n    def __init__(self, *args, **kwargs):\n        # original_exc_info should be gotten from a sys.exc_info() call at the\n        # callsite inside of the exception handler. this will allow consuming\n        # code to *re-raise* the user error in its original format\n        # for cleaner error reporting that does not have framework code in it\n        user_exception = check.inst_param(kwargs.pop("user_exception"), "user_exception", Exception)\n        original_exc_info = check.tuple_param(kwargs.pop("original_exc_info"), "original_exc_info")\n\n        check.invariant(original_exc_info[0] is not None)\n\n        super(DagsterUserCodeExecutionError, self).__init__(args[0], *args[1:], **kwargs)\n\n        self.user_exception = check.opt_inst_param(user_exception, "user_exception", Exception)\n        self.original_exc_info = original_exc_info\n\n    @property\n    def is_user_code_error(self):\n        return True
\n\n\n
[docs]class DagsterTypeCheckError(DagsterUserCodeExecutionError):\n    """Indicates an error in the op type system at runtime. E.g. an op receives an\n    unexpected input, or produces an output that does not match the type of the output definition.\n    """
\n\n\nclass DagsterExecutionLoadInputError(DagsterUserCodeExecutionError):\n """Indicates an error occurred while loading an input for a step."""\n\n def __init__(self, *args, **kwargs):\n self.step_key = check.str_param(kwargs.pop("step_key"), "step_key")\n self.input_name = check.str_param(kwargs.pop("input_name"), "input_name")\n super(DagsterExecutionLoadInputError, self).__init__(*args, **kwargs)\n\n\nclass DagsterExecutionHandleOutputError(DagsterUserCodeExecutionError):\n """Indicates an error occurred while handling an output for a step."""\n\n def __init__(self, *args, **kwargs):\n self.step_key = check.str_param(kwargs.pop("step_key"), "step_key")\n self.output_name = check.str_param(kwargs.pop("output_name"), "output_name")\n super(DagsterExecutionHandleOutputError, self).__init__(*args, **kwargs)\n\n\n
[docs]class DagsterExecutionStepExecutionError(DagsterUserCodeExecutionError):\n """Indicates an error occurred while executing the body of an execution step."""\n\n def __init__(self, *args, **kwargs):\n self.step_key = check.str_param(kwargs.pop("step_key"), "step_key")\n self.op_name = check.str_param(kwargs.pop("op_name"), "op_name")\n self.op_def_name = check.str_param(kwargs.pop("op_def_name"), "op_def_name")\n super(DagsterExecutionStepExecutionError, self).__init__(*args, **kwargs)
\n\n\n
[docs]class DagsterResourceFunctionError(DagsterUserCodeExecutionError):\n """\n Indicates an error occurred while executing the body of the ``resource_fn`` in a\n :py:class:`~dagster.ResourceDefinition` during resource initialization.\n """
\n\n\n
[docs]class DagsterConfigMappingFunctionError(DagsterUserCodeExecutionError):\n """\n Indicates that an unexpected error occurred while executing the body of a config mapping\n function defined in a :py:class:`~dagster.JobDefinition` or `~dagster.GraphDefinition` during\n config parsing.\n """
\n\n\nclass DagsterTypeLoadingError(DagsterUserCodeExecutionError):\n    """\n    Indicates that an unexpected error occurred while executing the body of a type load\n    function defined in a :py:class:`~dagster.DagsterTypeLoader` during loading of a custom type.\n    """\n\n\nclass DagsterTypeMaterializationError(DagsterUserCodeExecutionError):\n    """\n    Indicates that an unexpected error occurred while executing the body of an output\n    materialization function defined in a :py:class:`~dagster.DagsterTypeMaterializer` during\n    materialization of a custom type.\n    """\n\n\n
[docs]class DagsterUnknownResourceError(DagsterError, AttributeError):\n    # inherits from AttributeError as it is raised within a __getattr__ call... used to support\n    # object hasattr method\n    """Indicates that an unknown resource was accessed in the body of an execution step. This often\n    happens when a resource is accessed in the compute function of an op without first supplying the\n    op with the correct `required_resource_keys` argument.\n    """\n\n    def __init__(self, resource_name, *args, **kwargs):\n        self.resource_name = check.str_param(resource_name, "resource_name")\n        msg = (\n            "Unknown resource `{resource_name}`. Specify `{resource_name}` as a required resource "\n            "on the compute / config function that accessed it."\n        ).format(resource_name=resource_name)\n        super(DagsterUnknownResourceError, self).__init__(msg, *args, **kwargs)
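The fix suggested by the error message above is to declare the resource on the op. A minimal sketch, with a made-up resource key `my_conn`:

.. code-block:: python

    from dagster import job, op, resource

    @resource
    def my_conn(_init_context):
        return "connection-handle"

    @op(required_resource_keys={"my_conn"})
    def uses_conn(context):
        # Without the required_resource_keys declaration above, accessing
        # context.resources.my_conn would raise DagsterUnknownResourceError.
        return context.resources.my_conn

    @job(resource_defs={"my_conn": my_conn})
    def conn_job():
        uses_conn()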
\n\n\nclass DagsterInvalidInvocationError(DagsterError):\n """\n Indicates that an error has occurred when an op has been invoked, but before the actual\n core compute has been reached.\n """\n\n\n
[docs]class DagsterInvalidConfigError(DagsterError):\n """Thrown when provided config is invalid (does not type check against the relevant config\n schema)."""\n\n def __init__(self, preamble, errors, config_value, *args, **kwargs):\n from dagster.config.errors import EvaluationError\n\n check.str_param(preamble, "preamble")\n self.errors = check.list_param(errors, "errors", of_type=EvaluationError)\n self.config_value = config_value\n\n error_msg = preamble\n error_messages = []\n\n for i_error, error in enumerate(self.errors):\n error_messages.append(error.message)\n error_msg += "\\n Error {i_error}: {error_message}".format(\n i_error=i_error + 1, error_message=error.message\n )\n\n self.message = error_msg\n self.error_messages = error_messages\n\n super(DagsterInvalidConfigError, self).__init__(error_msg, *args, **kwargs)
\n\n\n
[docs]class DagsterUnmetExecutorRequirementsError(DagsterError):\n """Indicates the resolved executor is incompatible with the state of other systems\n such as the :py:class:`~dagster.core.instance.DagsterInstance` or system storage configuration.\n """
\n\n\n
[docs]class DagsterSubprocessError(DagsterError):\n """An exception has occurred in one or more of the child processes dagster manages.\n This error forwards the message and stack trace for all of the collected errors.\n """\n\n def __init__(self, *args, **kwargs):\n from dagster.utils.error import SerializableErrorInfo\n\n self.subprocess_error_infos = check.list_param(\n kwargs.pop("subprocess_error_infos"), "subprocess_error_infos", SerializableErrorInfo\n )\n super(DagsterSubprocessError, self).__init__(*args, **kwargs)
\n\n\nclass DagsterUserCodeUnreachableError(DagsterError):\n """Dagster was unable to reach a user code server to fetch information about user code."""\n\n\nclass DagsterUserCodeProcessError(DagsterError):\n """An exception has occurred in a user code process that the host process raising this error\n was communicating with."""\n\n @staticmethod\n def from_error_info(error_info):\n from dagster.utils.error import SerializableErrorInfo\n\n check.inst_param(error_info, "error_info", SerializableErrorInfo)\n return DagsterUserCodeProcessError(\n error_info.to_string(), user_code_process_error_infos=[error_info]\n )\n\n def __init__(self, *args, **kwargs):\n from dagster.utils.error import SerializableErrorInfo\n\n self.user_code_process_error_infos = check.list_param(\n kwargs.pop("user_code_process_error_infos"),\n "user_code_process_error_infos",\n SerializableErrorInfo,\n )\n super(DagsterUserCodeProcessError, self).__init__(*args, **kwargs)\n\n\nclass DagsterRepositoryLocationNotFoundError(DagsterError):\n pass\n\n\nclass DagsterRepositoryLocationLoadError(DagsterError):\n def __init__(self, *args, **kwargs):\n from dagster.utils.error import SerializableErrorInfo\n\n self.load_error_infos = check.list_param(\n kwargs.pop("load_error_infos"),\n "load_error_infos",\n SerializableErrorInfo,\n )\n super(DagsterRepositoryLocationLoadError, self).__init__(*args, **kwargs)\n\n\nclass DagsterLaunchFailedError(DagsterError):\n """Indicates an error while attempting to launch a pipeline run."""\n\n def __init__(self, *args, **kwargs):\n from dagster.utils.error import SerializableErrorInfo\n\n self.serializable_error_info = check.opt_inst_param(\n kwargs.pop("serializable_error_info", None),\n "serializable_error_info",\n SerializableErrorInfo,\n )\n super(DagsterLaunchFailedError, self).__init__(*args, **kwargs)\n\n\nclass DagsterBackfillFailedError(DagsterError):\n """Indicates an error while attempting to launch a backfill."""\n\n def __init__(self, *args, **kwargs):\n from dagster.utils.error import SerializableErrorInfo\n\n self.serializable_error_info = check.opt_inst_param(\n kwargs.pop("serializable_error_info", None),\n "serializable_error_info",\n SerializableErrorInfo,\n )\n super(DagsterBackfillFailedError, self).__init__(*args, **kwargs)\n\n\nclass DagsterRunAlreadyExists(DagsterError):\n """Indicates that a pipeline run already exists in a run storage."""\n\n\nclass DagsterSnapshotDoesNotExist(DagsterError):\n """Indicates you attempted to create a pipeline run with a nonexistent snapshot id"""\n\n\nclass DagsterRunConflict(DagsterError):\n """Indicates that a conflicting pipeline run exists in a run storage."""\n\n\n
[docs]class DagsterTypeCheckDidNotPass(DagsterError):\n """Indicates that a type check failed.\n\n This is raised when ``raise_on_error`` is ``True`` in calls to the synchronous job and\n graph execution APIs (e.g. `graph.execute_in_process()`, `job.execute_in_process()` -- typically\n within a test), and a :py:class:`~dagster.DagsterType`'s type check fails by returning either\n ``False`` or an instance of :py:class:`~dagster.TypeCheck` whose ``success`` member is ``False``.\n """\n\n def __init__(self, description=None, metadata_entries=None, dagster_type=None):\n from dagster import DagsterType, MetadataEntry\n\n super(DagsterTypeCheckDidNotPass, self).__init__(description)\n self.description = check.opt_str_param(description, "description")\n self.metadata_entries = check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=MetadataEntry\n )\n self.dagster_type = check.opt_inst_param(dagster_type, "dagster_type", DagsterType)
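A small sketch of the scenario the docstring describes, using a contrived `DagsterType` whose check always fails (all names here are illustrative): with the default `raise_on_error=True`, the failed check surfaces as `DagsterTypeCheckDidNotPass`.

from dagster import DagsterType, DagsterTypeCheckDidNotPass, Out, job, op

# A type whose check always fails, purely to trigger the error.
AlwaysFails = DagsterType(name="AlwaysFails", type_check_fn=lambda _context, _value: False)

@op(out=Out(AlwaysFails))
def emit_value():
    return 1

@job
def type_check_demo_job():
    emit_value()

try:
    type_check_demo_job.execute_in_process()  # raise_on_error defaults to True
except DagsterTypeCheckDidNotPass as exc:
    print(exc.dagster_type.display_name)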
\n\n\n
[docs]class DagsterEventLogInvalidForRun(DagsterError):\n """Raised when the event logs for a historical run are malformed or invalid."""\n\n def __init__(self, run_id):\n self.run_id = check.str_param(run_id, "run_id")\n super(DagsterEventLogInvalidForRun, self).__init__(\n "Event logs invalid for run id {}".format(run_id)\n )
\n\n\nclass ScheduleExecutionError(DagsterUserCodeExecutionError):\n """Errors raised in a user process during the execution of schedule."""\n\n\nclass SensorExecutionError(DagsterUserCodeExecutionError):\n """Errors raised in a user process during the execution of a sensor (or its job)."""\n\n\nclass PartitionExecutionError(DagsterUserCodeExecutionError):\n """Errors raised during the execution of user-provided functions of a partition set schedule."""\n\n\nclass DagsterInvalidAssetKey(DagsterError):\n """Error raised by invalid asset key"""\n\n\nclass DagsterInvalidMetadata(DagsterError):\n """Error raised by invalid metadata parameters"""\n\n\nclass HookExecutionError(DagsterUserCodeExecutionError):\n """Error raised during the execution of a user-defined hook."""\n\n\nclass RunStatusSensorExecutionError(DagsterUserCodeExecutionError):\n """Error raised during the execution of a user-defined run status sensor."""\n\n\nclass DagsterImportError(DagsterError):\n """Import error raised while importing user-code."""\n\n\nclass JobError(DagsterUserCodeExecutionError):\n """Errors raised during the execution of user-provided functions for a defined Job."""\n\n\nclass DagsterUnknownStepStateError(DagsterError):\n """When job execution completes with steps in an unknown state"""\n\n\nclass DagsterObjectStoreError(DagsterError):\n """Errors during an object store operation."""\n\n\nclass DagsterInvalidPropertyError(DagsterError):\n """Indicates that an invalid property was accessed. May often happen by accessing a property\n that no longer exists after breaking changes."""\n\n\nclass DagsterHomeNotSetError(DagsterError):\n """\n The user has tried to use a command that requires an instance or invoke DagsterInstance.get()\n without setting DAGSTER_HOME env var.\n """\n\n\nclass DagsterUnknownPartitionError(DagsterError):\n """\n The user has tried to access run config for a partition name that does not exist.\n """\n
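As an illustration of the last condition above (a sketch, not part of this module): calling `DagsterInstance.get()` without `DAGSTER_HOME` set is the situation `DagsterHomeNotSetError` describes.

import os

from dagster import DagsterInstance
from dagster.core.errors import DagsterHomeNotSetError

os.environ.pop("DAGSTER_HOME", None)  # make sure the variable is unset for the demonstration
try:
    DagsterInstance.get()
except DagsterHomeNotSetError as exc:
    print(exc)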
", "current_page_name": "_modules/dagster/core/errors", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.errors"}, "events": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.events

\n"""Structured representations of system events."""\nimport logging\nimport os\nfrom enum import Enum\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Dict, List, NamedTuple, Optional, Union, cast\n\nimport dagster._check as check\nfrom dagster.core.definitions import (\n    AssetKey,\n    AssetMaterialization,\n    AssetObservation,\n    ExpectationResult,\n    HookDefinition,\n    Materialization,\n    MetadataEntry,\n    NodeHandle,\n)\nfrom dagster.core.definitions.events import AssetLineageInfo, ObjectStoreOperationType\nfrom dagster.core.definitions.metadata import MetadataValue\nfrom dagster.core.errors import DagsterError, HookExecutionError\nfrom dagster.core.execution.context.hook import HookContext\nfrom dagster.core.execution.context.system import (\n    IPlanContext,\n    IStepContext,\n    PlanExecutionContext,\n    PlanOrchestrationContext,\n    StepExecutionContext,\n)\nfrom dagster.core.execution.plan.handle import ResolvedFromDynamicStepHandle, StepHandle\nfrom dagster.core.execution.plan.inputs import StepInputData\nfrom dagster.core.execution.plan.objects import StepFailureData, StepRetryData, StepSuccessData\nfrom dagster.core.execution.plan.outputs import StepOutputData\nfrom dagster.core.log_manager import DagsterLogManager\nfrom dagster.core.storage.pipeline_run import PipelineRunStatus\nfrom dagster.serdes import register_serdes_tuple_fallbacks, whitelist_for_serdes\nfrom dagster.utils.error import SerializableErrorInfo, serializable_error_info_from_exc_info\nfrom dagster.utils.timing import format_duration\n\nif TYPE_CHECKING:\n    from dagster.core.definitions.events import ObjectStoreOperation\n    from dagster.core.execution.plan.plan import ExecutionPlan\n    from dagster.core.execution.plan.step import ExecutionStep, StepKind\n\nEventSpecificData = Union[\n    StepOutputData,\n    StepFailureData,\n    StepSuccessData,\n    "StepMaterializationData",\n    "StepExpectationResultData",\n    StepInputData,\n    "EngineEventData",\n    "HookErroredData",\n    StepRetryData,\n    "PipelineFailureData",\n    "PipelineCanceledData",\n    "ObjectStoreOperationResultData",\n    "HandledOutputData",\n    "LoadedInputData",\n    "ComputeLogsCaptureData",\n    "AssetObservationData",\n    "AssetMaterializationPlannedData",\n]\n\n\n
[docs]class DagsterEventType(Enum):\n """The types of events that may be yielded by solid and pipeline execution."""\n\n STEP_OUTPUT = "STEP_OUTPUT"\n STEP_INPUT = "STEP_INPUT"\n STEP_FAILURE = "STEP_FAILURE"\n STEP_START = "STEP_START"\n STEP_SUCCESS = "STEP_SUCCESS"\n STEP_SKIPPED = "STEP_SKIPPED"\n\n STEP_UP_FOR_RETRY = "STEP_UP_FOR_RETRY" # "failed" but want to retry\n STEP_RESTARTED = "STEP_RESTARTED"\n\n ASSET_MATERIALIZATION = "ASSET_MATERIALIZATION"\n ASSET_MATERIALIZATION_PLANNED = "ASSET_MATERIALIZATION_PLANNED"\n ASSET_OBSERVATION = "ASSET_OBSERVATION"\n STEP_EXPECTATION_RESULT = "STEP_EXPECTATION_RESULT"\n\n # We want to display RUN_* events in dagit and in our LogManager output, but in order to\n # support backcompat for our storage layer, we need to keep the persisted value to be strings\n # of the form "PIPELINE_*". We may have user code that pass in the DagsterEventType\n # enum values into storage APIs (like get_event_records, which takes in an EventRecordsFilter).\n RUN_ENQUEUED = "PIPELINE_ENQUEUED"\n RUN_DEQUEUED = "PIPELINE_DEQUEUED"\n RUN_STARTING = "PIPELINE_STARTING" # Launch is happening, execution hasn't started yet\n RUN_START = "PIPELINE_START" # Execution has started\n RUN_SUCCESS = "PIPELINE_SUCCESS"\n RUN_FAILURE = "PIPELINE_FAILURE"\n RUN_CANCELING = "PIPELINE_CANCELING"\n RUN_CANCELED = "PIPELINE_CANCELED"\n\n # Keep these legacy enum values around, to keep back-compatability for user code that might be\n # using these constants to filter event records\n PIPELINE_ENQUEUED = RUN_ENQUEUED\n PIPELINE_DEQUEUED = RUN_DEQUEUED\n PIPELINE_STARTING = RUN_STARTING\n PIPELINE_START = RUN_START\n PIPELINE_SUCCESS = RUN_SUCCESS\n PIPELINE_FAILURE = RUN_FAILURE\n PIPELINE_CANCELING = RUN_CANCELING\n PIPELINE_CANCELED = RUN_CANCELED\n\n OBJECT_STORE_OPERATION = "OBJECT_STORE_OPERATION"\n ASSET_STORE_OPERATION = "ASSET_STORE_OPERATION"\n LOADED_INPUT = "LOADED_INPUT"\n HANDLED_OUTPUT = "HANDLED_OUTPUT"\n\n ENGINE_EVENT = "ENGINE_EVENT"\n\n HOOK_COMPLETED = "HOOK_COMPLETED"\n HOOK_ERRORED = "HOOK_ERRORED"\n HOOK_SKIPPED = "HOOK_SKIPPED"\n\n ALERT_START = "ALERT_START"\n ALERT_SUCCESS = "ALERT_SUCCESS"\n ALERT_FAILURE = "ALERT_FAILURE"\n\n LOGS_CAPTURED = "LOGS_CAPTURED"
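The comments above note that the `PIPELINE_*` members are retained for back-compatibility and that the persisted values stay as `"PIPELINE_*"` strings. Because each legacy name is assigned an existing member's value, Python's `Enum` treats it as an alias, which this short sketch demonstrates:

from dagster import DagsterEventType

# Legacy names are aliases of the RUN_* members, not distinct members.
assert DagsterEventType.PIPELINE_SUCCESS is DagsterEventType.RUN_SUCCESS
# The stored value keeps the historical "PIPELINE_*" string for storage back-compatibility.
assert DagsterEventType.RUN_SUCCESS.value == "PIPELINE_SUCCESS"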
\n\n\nEVENT_TYPE_VALUE_TO_DISPLAY_STRING = {\n "PIPELINE_ENQUEUED": "RUN_ENQUEUED",\n "PIPELINE_DEQUEUED": "RUN_DEQUEUED",\n "PIPELINE_STARTING": "RUN_STARTING",\n "PIPELINE_START": "RUN_START",\n "PIPELINE_SUCCESS": "RUN_SUCCESS",\n "PIPELINE_FAILURE": "RUN_FAILURE",\n "PIPELINE_CANCELING": "RUN_CANCELING",\n "PIPELINE_CANCELED": "RUN_CANCELED",\n}\n\nSTEP_EVENTS = {\n DagsterEventType.STEP_INPUT,\n DagsterEventType.STEP_START,\n DagsterEventType.STEP_OUTPUT,\n DagsterEventType.STEP_FAILURE,\n DagsterEventType.STEP_SUCCESS,\n DagsterEventType.STEP_SKIPPED,\n DagsterEventType.ASSET_MATERIALIZATION,\n DagsterEventType.ASSET_OBSERVATION,\n DagsterEventType.STEP_EXPECTATION_RESULT,\n DagsterEventType.OBJECT_STORE_OPERATION,\n DagsterEventType.HANDLED_OUTPUT,\n DagsterEventType.LOADED_INPUT,\n DagsterEventType.STEP_RESTARTED,\n DagsterEventType.STEP_UP_FOR_RETRY,\n}\n\nFAILURE_EVENTS = {\n DagsterEventType.RUN_FAILURE,\n DagsterEventType.STEP_FAILURE,\n DagsterEventType.RUN_CANCELED,\n}\n\nPIPELINE_EVENTS = {\n DagsterEventType.RUN_ENQUEUED,\n DagsterEventType.RUN_DEQUEUED,\n DagsterEventType.RUN_STARTING,\n DagsterEventType.RUN_START,\n DagsterEventType.RUN_SUCCESS,\n DagsterEventType.RUN_FAILURE,\n DagsterEventType.RUN_CANCELING,\n DagsterEventType.RUN_CANCELED,\n}\n\nHOOK_EVENTS = {\n DagsterEventType.HOOK_COMPLETED,\n DagsterEventType.HOOK_ERRORED,\n DagsterEventType.HOOK_SKIPPED,\n}\n\nALERT_EVENTS = {\n DagsterEventType.ALERT_START,\n DagsterEventType.ALERT_SUCCESS,\n DagsterEventType.ALERT_FAILURE,\n}\n\n\nEVENT_TYPE_TO_PIPELINE_RUN_STATUS = {\n DagsterEventType.RUN_START: PipelineRunStatus.STARTED,\n DagsterEventType.RUN_SUCCESS: PipelineRunStatus.SUCCESS,\n DagsterEventType.RUN_FAILURE: PipelineRunStatus.FAILURE,\n DagsterEventType.RUN_ENQUEUED: PipelineRunStatus.QUEUED,\n DagsterEventType.RUN_STARTING: PipelineRunStatus.STARTING,\n DagsterEventType.RUN_CANCELING: PipelineRunStatus.CANCELING,\n DagsterEventType.RUN_CANCELED: PipelineRunStatus.CANCELED,\n}\n\nPIPELINE_RUN_STATUS_TO_EVENT_TYPE = {v: k for k, v in EVENT_TYPE_TO_PIPELINE_RUN_STATUS.items()}\n\nASSET_EVENTS = {\n DagsterEventType.ASSET_MATERIALIZATION,\n DagsterEventType.ASSET_OBSERVATION,\n DagsterEventType.ASSET_MATERIALIZATION_PLANNED,\n}\n\n\ndef _assert_type(\n method: str, expected_type: DagsterEventType, actual_type: DagsterEventType\n) -> None:\n check.invariant(\n expected_type == actual_type,\n (\n "{method} only callable when event_type is {expected_type}, called on {actual_type}"\n ).format(method=method, expected_type=expected_type, actual_type=actual_type),\n )\n\n\ndef _validate_event_specific_data(\n event_type: DagsterEventType, event_specific_data: Optional["EventSpecificData"]\n) -> Optional["EventSpecificData"]:\n\n if event_type == DagsterEventType.STEP_OUTPUT:\n check.inst_param(event_specific_data, "event_specific_data", StepOutputData)\n elif event_type == DagsterEventType.STEP_FAILURE:\n check.inst_param(event_specific_data, "event_specific_data", StepFailureData)\n elif event_type == DagsterEventType.STEP_SUCCESS:\n check.inst_param(event_specific_data, "event_specific_data", StepSuccessData)\n elif event_type == DagsterEventType.ASSET_MATERIALIZATION:\n check.inst_param(event_specific_data, "event_specific_data", StepMaterializationData)\n elif event_type == DagsterEventType.STEP_EXPECTATION_RESULT:\n check.inst_param(event_specific_data, "event_specific_data", StepExpectationResultData)\n elif event_type == DagsterEventType.STEP_INPUT:\n check.inst_param(event_specific_data, 
"event_specific_data", StepInputData)\n elif event_type == DagsterEventType.ENGINE_EVENT:\n check.inst_param(event_specific_data, "event_specific_data", EngineEventData)\n elif event_type == DagsterEventType.HOOK_ERRORED:\n check.inst_param(event_specific_data, "event_specific_data", HookErroredData)\n elif event_type == DagsterEventType.ASSET_MATERIALIZATION_PLANNED:\n check.inst_param(\n event_specific_data, "event_specific_data", AssetMaterializationPlannedData\n )\n\n return event_specific_data\n\n\ndef log_step_event(step_context: IStepContext, event: "DagsterEvent") -> None:\n event_type = DagsterEventType(event.event_type_value)\n log_level = logging.ERROR if event_type in FAILURE_EVENTS else logging.DEBUG\n\n step_context.log.log_dagster_event(\n level=log_level,\n msg=event.message or f"{event_type} for step {step_context.step.key}",\n dagster_event=event,\n )\n\n\ndef log_pipeline_event(pipeline_context: IPlanContext, event: "DagsterEvent") -> None:\n event_type = DagsterEventType(event.event_type_value)\n log_level = logging.ERROR if event_type in FAILURE_EVENTS else logging.DEBUG\n\n pipeline_context.log.log_dagster_event(\n level=log_level,\n msg=event.message or f"{event_type} for pipeline {pipeline_context.pipeline_name}",\n dagster_event=event,\n )\n\n\ndef log_resource_event(log_manager: DagsterLogManager, event: "DagsterEvent") -> None:\n event_specific_data = cast(EngineEventData, event.event_specific_data)\n\n log_level = logging.ERROR if event_specific_data.error else logging.DEBUG\n log_manager.log_dagster_event(level=log_level, msg=event.message or "", dagster_event=event)\n\n\n
[docs]@whitelist_for_serdes\nclass DagsterEvent(\n NamedTuple(\n "_DagsterEvent",\n [\n ("event_type_value", str),\n ("pipeline_name", str),\n ("step_handle", Optional[Union[StepHandle, ResolvedFromDynamicStepHandle]]),\n ("solid_handle", Optional[NodeHandle]),\n ("step_kind_value", Optional[str]),\n ("logging_tags", Optional[Dict[str, str]]),\n ("event_specific_data", Optional["EventSpecificData"]),\n ("message", Optional[str]),\n ("pid", Optional[int]),\n ("step_key", Optional[str]),\n ],\n )\n):\n """Events yielded by solid and pipeline execution.\n\n Users should not instantiate this class.\n\n Attributes:\n event_type_value (str): Value for a DagsterEventType.\n pipeline_name (str)\n solid_handle (NodeHandle)\n step_kind_value (str): Value for a StepKind.\n logging_tags (Dict[str, str])\n event_specific_data (Any): Type must correspond to event_type_value.\n message (str)\n pid (int)\n step_key (Optional[str]): DEPRECATED\n """\n\n @staticmethod\n def from_step(\n event_type: "DagsterEventType",\n step_context: IStepContext,\n event_specific_data: Optional["EventSpecificData"] = None,\n message: Optional[str] = None,\n ) -> "DagsterEvent":\n\n event = DagsterEvent(\n event_type_value=check.inst_param(event_type, "event_type", DagsterEventType).value,\n pipeline_name=step_context.pipeline_name,\n step_handle=step_context.step.handle,\n solid_handle=step_context.step.solid_handle,\n step_kind_value=step_context.step.kind.value,\n logging_tags=step_context.logging_tags,\n event_specific_data=_validate_event_specific_data(event_type, event_specific_data),\n message=check.opt_str_param(message, "message"),\n pid=os.getpid(),\n )\n\n log_step_event(step_context, event)\n\n return event\n\n @staticmethod\n def from_pipeline(\n event_type: DagsterEventType,\n pipeline_context: IPlanContext,\n message: Optional[str] = None,\n event_specific_data: Optional["EventSpecificData"] = None,\n step_handle: Optional[Union[StepHandle, ResolvedFromDynamicStepHandle]] = None,\n ) -> "DagsterEvent":\n check.opt_inst_param(\n step_handle, "step_handle", (StepHandle, ResolvedFromDynamicStepHandle)\n )\n\n event = DagsterEvent(\n event_type_value=check.inst_param(event_type, "event_type", DagsterEventType).value,\n pipeline_name=pipeline_context.pipeline_name,\n message=check.opt_str_param(message, "message"),\n event_specific_data=_validate_event_specific_data(event_type, event_specific_data),\n step_handle=step_handle,\n pid=os.getpid(),\n )\n\n log_pipeline_event(pipeline_context, event)\n\n return event\n\n @staticmethod\n def from_resource(\n pipeline_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n message: Optional[str] = None,\n event_specific_data: Optional["EngineEventData"] = None,\n ) -> "DagsterEvent":\n\n event = DagsterEvent(\n DagsterEventType.ENGINE_EVENT.value,\n pipeline_name=pipeline_name,\n message=check.opt_str_param(message, "message"),\n event_specific_data=_validate_event_specific_data(\n DagsterEventType.ENGINE_EVENT, event_specific_data\n ),\n step_handle=execution_plan.step_handle_for_single_step_plans(),\n pid=os.getpid(),\n )\n log_resource_event(log_manager, event)\n return event\n\n @staticmethod\n def asset_materialization_planned(\n pipeline_name: str,\n asset_key: AssetKey,\n log_manager: DagsterLogManager,\n ) -> "DagsterEvent":\n event = DagsterEvent(\n event_type_value=DagsterEventType.ASSET_MATERIALIZATION_PLANNED.value,\n pipeline_name=pipeline_name,\n message=f"{pipeline_name} intends to materialize asset {asset_key.to_string()}",\n 
event_specific_data=AssetMaterializationPlannedData(asset_key),\n )\n log_level = logging.DEBUG\n log_manager.log_dagster_event(level=log_level, msg=event.message or "", dagster_event=event)\n return event\n\n def __new__(\n cls,\n event_type_value: str,\n pipeline_name: str,\n step_handle: Optional[Union[StepHandle, ResolvedFromDynamicStepHandle]] = None,\n solid_handle: Optional[NodeHandle] = None,\n step_kind_value: Optional[str] = None,\n logging_tags: Optional[Dict[str, str]] = None,\n event_specific_data: Optional["EventSpecificData"] = None,\n message: Optional[str] = None,\n pid: Optional[int] = None,\n # legacy\n step_key: Optional[str] = None,\n ):\n event_type_value, event_specific_data = _handle_back_compat(\n event_type_value, event_specific_data\n )\n\n # old events may contain solid_handle but not step_handle\n if solid_handle is not None and step_handle is None:\n step_handle = StepHandle(solid_handle)\n\n # Legacy events may have step_key set directly, preserve those to stay in sync\n # with legacy execution plan snapshots.\n if step_handle is not None and step_key is None:\n step_key = step_handle.to_key()\n\n return super(DagsterEvent, cls).__new__(\n cls,\n check.str_param(event_type_value, "event_type_value"),\n check.str_param(pipeline_name, "pipeline_name"),\n check.opt_inst_param(\n step_handle, "step_handle", (StepHandle, ResolvedFromDynamicStepHandle)\n ),\n check.opt_inst_param(solid_handle, "solid_handle", NodeHandle),\n check.opt_str_param(step_kind_value, "step_kind_value"),\n check.opt_dict_param(logging_tags, "logging_tags"),\n _validate_event_specific_data(DagsterEventType(event_type_value), event_specific_data),\n check.opt_str_param(message, "message"),\n check.opt_int_param(pid, "pid"),\n check.opt_str_param(step_key, "step_key"),\n )\n\n @property\n def solid_name(self) -> str:\n check.invariant(self.solid_handle is not None)\n solid_handle = cast(NodeHandle, self.solid_handle)\n return solid_handle.name\n\n @property\n def event_type(self) -> DagsterEventType:\n """DagsterEventType: The type of this event."""\n return DagsterEventType(self.event_type_value)\n\n @property\n def is_step_event(self) -> bool:\n return self.event_type in STEP_EVENTS\n\n @property\n def is_hook_event(self) -> bool:\n return self.event_type in HOOK_EVENTS\n\n @property\n def is_alert_event(self) -> bool:\n return self.event_type in ALERT_EVENTS\n\n @property\n def step_kind(self) -> "StepKind":\n from dagster.core.execution.plan.step import StepKind\n\n return StepKind(self.step_kind_value)\n\n @property\n def is_step_success(self) -> bool:\n return self.event_type == DagsterEventType.STEP_SUCCESS\n\n @property\n def is_successful_output(self) -> bool:\n return self.event_type == DagsterEventType.STEP_OUTPUT\n\n @property\n def is_step_start(self) -> bool:\n return self.event_type == DagsterEventType.STEP_START\n\n @property\n def is_step_failure(self) -> bool:\n return self.event_type == DagsterEventType.STEP_FAILURE\n\n @property\n def is_step_skipped(self) -> bool:\n return self.event_type == DagsterEventType.STEP_SKIPPED\n\n @property\n def is_step_up_for_retry(self) -> bool:\n return self.event_type == DagsterEventType.STEP_UP_FOR_RETRY\n\n @property\n def is_step_restarted(self) -> bool:\n return self.event_type == DagsterEventType.STEP_RESTARTED\n\n @property\n def is_pipeline_success(self) -> bool:\n return self.event_type == DagsterEventType.RUN_SUCCESS\n\n @property\n def is_pipeline_failure(self) -> bool:\n return self.event_type == DagsterEventType.RUN_FAILURE\n\n 
@property\n def is_failure(self) -> bool:\n return self.event_type in FAILURE_EVENTS\n\n @property\n def is_pipeline_event(self) -> bool:\n return self.event_type in PIPELINE_EVENTS\n\n @property\n def is_engine_event(self) -> bool:\n return self.event_type == DagsterEventType.ENGINE_EVENT\n\n @property\n def is_handled_output(self) -> bool:\n return self.event_type == DagsterEventType.HANDLED_OUTPUT\n\n @property\n def is_loaded_input(self) -> bool:\n return self.event_type == DagsterEventType.LOADED_INPUT\n\n @property\n def is_step_materialization(self) -> bool:\n return self.event_type == DagsterEventType.ASSET_MATERIALIZATION\n\n @property\n def is_expectation_result(self) -> bool:\n return self.event_type == DagsterEventType.STEP_EXPECTATION_RESULT\n\n @property\n def is_asset_observation(self) -> bool:\n return self.event_type == DagsterEventType.ASSET_OBSERVATION\n\n @property\n def is_asset_materialization_planned(self) -> bool:\n return self.event_type == DagsterEventType.ASSET_MATERIALIZATION_PLANNED\n\n @property\n def asset_key(self) -> Optional[AssetKey]:\n if self.event_type == DagsterEventType.ASSET_MATERIALIZATION:\n return self.step_materialization_data.materialization.asset_key\n elif self.event_type == DagsterEventType.ASSET_OBSERVATION:\n return self.asset_observation_data.asset_observation.asset_key\n elif self.event_type == DagsterEventType.ASSET_MATERIALIZATION_PLANNED:\n return self.asset_materialization_planned_data.asset_key\n else:\n return None\n\n @property\n def partition(self) -> Optional[str]:\n if self.event_type == DagsterEventType.ASSET_MATERIALIZATION:\n return self.step_materialization_data.materialization.partition\n elif self.event_type == DagsterEventType.ASSET_OBSERVATION:\n return self.asset_observation_data.asset_observation.partition\n else:\n return None\n\n @property\n def step_input_data(self) -> "StepInputData":\n _assert_type("step_input_data", DagsterEventType.STEP_INPUT, self.event_type)\n return cast(StepInputData, self.event_specific_data)\n\n @property\n def step_output_data(self) -> StepOutputData:\n _assert_type("step_output_data", DagsterEventType.STEP_OUTPUT, self.event_type)\n return cast(StepOutputData, self.event_specific_data)\n\n @property\n def step_success_data(self) -> "StepSuccessData":\n _assert_type("step_success_data", DagsterEventType.STEP_SUCCESS, self.event_type)\n return cast(StepSuccessData, self.event_specific_data)\n\n @property\n def step_failure_data(self) -> "StepFailureData":\n _assert_type("step_failure_data", DagsterEventType.STEP_FAILURE, self.event_type)\n return cast(StepFailureData, self.event_specific_data)\n\n @property\n def step_retry_data(self) -> "StepRetryData":\n _assert_type("step_retry_data", DagsterEventType.STEP_UP_FOR_RETRY, self.event_type)\n return cast(StepRetryData, self.event_specific_data)\n\n @property\n def step_materialization_data(self) -> "StepMaterializationData":\n _assert_type(\n "step_materialization_data", DagsterEventType.ASSET_MATERIALIZATION, self.event_type\n )\n return cast(StepMaterializationData, self.event_specific_data)\n\n @property\n def asset_observation_data(self) -> "AssetObservationData":\n _assert_type("asset_observation_data", DagsterEventType.ASSET_OBSERVATION, self.event_type)\n return cast(AssetObservationData, self.event_specific_data)\n\n @property\n def asset_materialization_planned_data(self) -> "AssetMaterializationPlannedData":\n _assert_type(\n "asset_materialization_planned",\n DagsterEventType.ASSET_MATERIALIZATION_PLANNED,\n self.event_type,\n 
)\n return cast(AssetMaterializationPlannedData, self.event_specific_data)\n\n @property\n def step_expectation_result_data(self) -> "StepExpectationResultData":\n _assert_type(\n "step_expectation_result_data",\n DagsterEventType.STEP_EXPECTATION_RESULT,\n self.event_type,\n )\n return cast(StepExpectationResultData, self.event_specific_data)\n\n @property\n def pipeline_failure_data(self) -> "PipelineFailureData":\n _assert_type("pipeline_failure_data", DagsterEventType.RUN_FAILURE, self.event_type)\n return cast(PipelineFailureData, self.event_specific_data)\n\n @property\n def engine_event_data(self) -> "EngineEventData":\n _assert_type("engine_event_data", DagsterEventType.ENGINE_EVENT, self.event_type)\n return cast(EngineEventData, self.event_specific_data)\n\n @property\n def hook_completed_data(self) -> Optional["EventSpecificData"]:\n _assert_type("hook_completed_data", DagsterEventType.HOOK_COMPLETED, self.event_type)\n return self.event_specific_data\n\n @property\n def hook_errored_data(self) -> "HookErroredData":\n _assert_type("hook_errored_data", DagsterEventType.HOOK_ERRORED, self.event_type)\n return cast(HookErroredData, self.event_specific_data)\n\n @property\n def hook_skipped_data(self) -> Optional["EventSpecificData"]:\n _assert_type("hook_skipped_data", DagsterEventType.HOOK_SKIPPED, self.event_type)\n return self.event_specific_data\n\n @property\n def logs_captured_data(self):\n _assert_type("logs_captured_data", DagsterEventType.LOGS_CAPTURED, self.event_type)\n return self.event_specific_data\n\n @staticmethod\n def step_output_event(\n step_context: StepExecutionContext, step_output_data: StepOutputData\n ) -> "DagsterEvent":\n\n output_def = step_context.solid.output_def_named(\n step_output_data.step_output_handle.output_name\n )\n\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_OUTPUT,\n step_context=step_context,\n event_specific_data=step_output_data,\n message='Yielded output "{output_name}"{mapping_clause} of type "{output_type}".{type_check_clause}'.format(\n output_name=step_output_data.step_output_handle.output_name,\n output_type=output_def.dagster_type.display_name,\n type_check_clause=(\n " Warning! 
Type check failed."\n if not step_output_data.type_check_data.success\n else " (Type check passed)."\n )\n if step_output_data.type_check_data\n else " (No type check).",\n mapping_clause=f' mapping key "{step_output_data.step_output_handle.mapping_key}"'\n if step_output_data.step_output_handle.mapping_key\n else "",\n ),\n )\n\n @staticmethod\n def step_failure_event(\n step_context: IStepContext, step_failure_data: "StepFailureData"\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_FAILURE,\n step_context=step_context,\n event_specific_data=step_failure_data,\n message='Execution of step "{step_key}" failed.'.format(step_key=step_context.step.key),\n )\n\n @staticmethod\n def step_retry_event(\n step_context: IStepContext, step_retry_data: "StepRetryData"\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_UP_FOR_RETRY,\n step_context=step_context,\n event_specific_data=step_retry_data,\n message='Execution of step "{step_key}" failed and has requested a retry{wait_str}.'.format(\n step_key=step_context.step.key,\n wait_str=" in {n} seconds".format(n=step_retry_data.seconds_to_wait)\n if step_retry_data.seconds_to_wait\n else "",\n ),\n )\n\n @staticmethod\n def step_input_event(\n step_context: StepExecutionContext, step_input_data: "StepInputData"\n ) -> "DagsterEvent":\n input_def = step_context.solid_def.input_def_named(step_input_data.input_name)\n\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_INPUT,\n step_context=step_context,\n event_specific_data=step_input_data,\n message='Got input "{input_name}" of type "{input_type}".{type_check_clause}'.format(\n input_name=step_input_data.input_name,\n input_type=input_def.dagster_type.display_name,\n type_check_clause=(\n " Warning! 
Type check failed."\n if not step_input_data.type_check_data.success\n else " (Type check passed)."\n )\n if step_input_data.type_check_data\n else " (No type check).",\n ),\n )\n\n @staticmethod\n def step_start_event(step_context: IStepContext) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_START,\n step_context=step_context,\n message='Started execution of step "{step_key}".'.format(\n step_key=step_context.step.key\n ),\n )\n\n @staticmethod\n def step_restarted_event(step_context: IStepContext, previous_attempts: int) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_RESTARTED,\n step_context=step_context,\n message='Started re-execution (attempt # {n}) of step "{step_key}".'.format(\n step_key=step_context.step.key, n=previous_attempts + 1\n ),\n )\n\n @staticmethod\n def step_success_event(\n step_context: IStepContext, success: "StepSuccessData"\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_SUCCESS,\n step_context=step_context,\n event_specific_data=success,\n message='Finished execution of step "{step_key}" in {duration}.'.format(\n step_key=step_context.step.key,\n duration=format_duration(success.duration_ms),\n ),\n )\n\n @staticmethod\n def step_skipped_event(step_context: IStepContext) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_SKIPPED,\n step_context=step_context,\n message='Skipped execution of step "{step_key}".'.format(\n step_key=step_context.step.key\n ),\n )\n\n @staticmethod\n def asset_materialization(\n step_context: IStepContext,\n materialization: Union[AssetMaterialization, Materialization],\n asset_lineage: Optional[List[AssetLineageInfo]] = None,\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n step_context=step_context,\n event_specific_data=StepMaterializationData(materialization, asset_lineage),\n message=materialization.description\n if materialization.description\n else "Materialized value{label_clause}.".format(\n label_clause=" {label}".format(label=materialization.label)\n if materialization.label\n else ""\n ),\n )\n\n @staticmethod\n def asset_observation(\n step_context: IStepContext, observation: AssetObservation\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.ASSET_OBSERVATION,\n step_context=step_context,\n event_specific_data=AssetObservationData(observation),\n )\n\n @staticmethod\n def step_expectation_result(\n step_context: IStepContext, expectation_result: ExpectationResult\n ) -> "DagsterEvent":\n def _msg():\n if expectation_result.description:\n return expectation_result.description\n\n return "Expectation{label_clause} {result_verb}".format(\n label_clause=" " + expectation_result.label if expectation_result.label else "",\n result_verb="passed" if expectation_result.success else "failed",\n )\n\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_EXPECTATION_RESULT,\n step_context=step_context,\n event_specific_data=StepExpectationResultData(expectation_result),\n message=_msg(),\n )\n\n @staticmethod\n def pipeline_start(pipeline_context: IPlanContext) -> "DagsterEvent":\n return DagsterEvent.from_pipeline(\n DagsterEventType.RUN_START,\n pipeline_context,\n message='Started execution of run for "{pipeline_name}".'.format(\n pipeline_name=pipeline_context.pipeline_name\n ),\n )\n\n @staticmethod\n def pipeline_success(pipeline_context: IPlanContext) -> 
"DagsterEvent":\n return DagsterEvent.from_pipeline(\n DagsterEventType.RUN_SUCCESS,\n pipeline_context,\n message='Finished execution of run for "{pipeline_name}".'.format(\n pipeline_name=pipeline_context.pipeline_name\n ),\n )\n\n @staticmethod\n def pipeline_failure(\n pipeline_context_or_name: Union[IPlanContext, str],\n context_msg: str,\n error_info: Optional[SerializableErrorInfo] = None,\n ) -> "DagsterEvent":\n check.str_param(context_msg, "context_msg")\n if isinstance(pipeline_context_or_name, IPlanContext):\n return DagsterEvent.from_pipeline(\n DagsterEventType.RUN_FAILURE,\n pipeline_context_or_name,\n message='Execution of run for "{pipeline_name}" failed. {context_msg}'.format(\n pipeline_name=pipeline_context_or_name.pipeline_name,\n context_msg=context_msg,\n ),\n event_specific_data=PipelineFailureData(error_info),\n )\n else:\n # when the failure happens trying to bring up context, the pipeline_context hasn't been\n # built and so can't use from_pipeline\n check.str_param(pipeline_context_or_name, "pipeline_name")\n event = DagsterEvent(\n event_type_value=DagsterEventType.RUN_FAILURE.value,\n pipeline_name=pipeline_context_or_name,\n event_specific_data=PipelineFailureData(error_info),\n message='Execution of run for "{pipeline_name}" failed. {context_msg}'.format(\n pipeline_name=pipeline_context_or_name,\n context_msg=context_msg,\n ),\n pid=os.getpid(),\n )\n return event\n\n @staticmethod\n def pipeline_canceled(\n pipeline_context: IPlanContext, error_info: Optional[SerializableErrorInfo] = None\n ) -> "DagsterEvent":\n return DagsterEvent.from_pipeline(\n DagsterEventType.RUN_CANCELED,\n pipeline_context,\n message='Execution of run for "{pipeline_name}" canceled.'.format(\n pipeline_name=pipeline_context.pipeline_name\n ),\n event_specific_data=PipelineCanceledData(\n check.opt_inst_param(error_info, "error_info", SerializableErrorInfo)\n ),\n )\n\n @staticmethod\n def resource_init_start(\n pipeline_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n resource_keys: AbstractSet[str],\n ) -> "DagsterEvent":\n\n return DagsterEvent.from_resource(\n pipeline_name=pipeline_name,\n execution_plan=execution_plan,\n log_manager=log_manager,\n message="Starting initialization of resources [{}].".format(\n ", ".join(sorted(resource_keys))\n ),\n event_specific_data=EngineEventData(metadata_entries=[], marker_start="resources"),\n )\n\n @staticmethod\n def resource_init_success(\n pipeline_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n resource_instances: Dict[str, Any],\n resource_init_times: Dict[str, str],\n ) -> "DagsterEvent":\n\n metadata_entries = []\n for key in resource_instances.keys():\n metadata_entries.extend(\n [\n MetadataEntry(\n key,\n value=MetadataValue.python_artifact(resource_instances[key].__class__),\n ),\n MetadataEntry(f"{key}:init_time", value=resource_init_times[key]),\n ]\n )\n\n return DagsterEvent.from_resource(\n pipeline_name=pipeline_name,\n execution_plan=execution_plan,\n log_manager=log_manager,\n message="Finished initialization of resources [{}].".format(\n ", ".join(sorted(resource_init_times.keys()))\n ),\n event_specific_data=EngineEventData(\n metadata_entries=metadata_entries,\n marker_end="resources",\n ),\n )\n\n @staticmethod\n def resource_init_failure(\n pipeline_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n resource_keys: AbstractSet[str],\n error: SerializableErrorInfo,\n ) -> "DagsterEvent":\n\n return 
DagsterEvent.from_resource(\n pipeline_name=pipeline_name,\n execution_plan=execution_plan,\n log_manager=log_manager,\n message="Initialization of resources [{}] failed.".format(", ".join(resource_keys)),\n event_specific_data=EngineEventData(\n metadata_entries=[],\n marker_end="resources",\n error=error,\n ),\n )\n\n @staticmethod\n def resource_teardown_failure(\n pipeline_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n resource_keys: AbstractSet[str],\n error: SerializableErrorInfo,\n ) -> "DagsterEvent":\n\n return DagsterEvent.from_resource(\n pipeline_name=pipeline_name,\n execution_plan=execution_plan,\n log_manager=log_manager,\n message="Teardown of resources [{}] failed.".format(", ".join(resource_keys)),\n event_specific_data=EngineEventData(\n metadata_entries=[],\n marker_start=None,\n marker_end=None,\n error=error,\n ),\n )\n\n @staticmethod\n def engine_event(\n pipeline_context: IPlanContext,\n message: str,\n event_specific_data: Optional["EngineEventData"] = None,\n step_handle: Optional[Union[StepHandle, ResolvedFromDynamicStepHandle]] = None,\n ) -> "DagsterEvent":\n return DagsterEvent.from_pipeline(\n DagsterEventType.ENGINE_EVENT,\n pipeline_context,\n message,\n event_specific_data=event_specific_data,\n step_handle=step_handle,\n )\n\n @staticmethod\n def object_store_operation(\n step_context: IStepContext, object_store_operation_result: "ObjectStoreOperation"\n ) -> "DagsterEvent":\n\n object_store_name = (\n "{object_store_name} ".format(\n object_store_name=object_store_operation_result.object_store_name\n )\n if object_store_operation_result.object_store_name\n else ""\n )\n\n serialization_strategy_modifier = (\n " using {serialization_strategy_name}".format(\n serialization_strategy_name=object_store_operation_result.serialization_strategy_name\n )\n if object_store_operation_result.serialization_strategy_name\n else ""\n )\n\n value_name = object_store_operation_result.value_name\n\n if (\n ObjectStoreOperationType(object_store_operation_result.op)\n == ObjectStoreOperationType.SET_OBJECT\n ):\n message = (\n "Stored intermediate object for output {value_name} in "\n "{object_store_name}object store{serialization_strategy_modifier}."\n ).format(\n value_name=value_name,\n object_store_name=object_store_name,\n serialization_strategy_modifier=serialization_strategy_modifier,\n )\n elif (\n ObjectStoreOperationType(object_store_operation_result.op)\n == ObjectStoreOperationType.GET_OBJECT\n ):\n message = (\n "Retrieved intermediate object for input {value_name} in "\n "{object_store_name}object store{serialization_strategy_modifier}."\n ).format(\n value_name=value_name,\n object_store_name=object_store_name,\n serialization_strategy_modifier=serialization_strategy_modifier,\n )\n elif (\n ObjectStoreOperationType(object_store_operation_result.op)\n == ObjectStoreOperationType.CP_OBJECT\n ):\n message = (\n "Copied intermediate object for input {value_name} from {key} to {dest_key}"\n ).format(\n value_name=value_name,\n key=object_store_operation_result.key,\n dest_key=object_store_operation_result.dest_key,\n )\n else:\n message = ""\n\n return DagsterEvent.from_step(\n DagsterEventType.OBJECT_STORE_OPERATION,\n step_context,\n event_specific_data=ObjectStoreOperationResultData(\n op=object_store_operation_result.op,\n value_name=value_name,\n address=object_store_operation_result.key,\n metadata_entries=[\n MetadataEntry(\n "key", value=MetadataValue.path(object_store_operation_result.key)\n ),\n ],\n 
version=object_store_operation_result.version,\n mapping_key=object_store_operation_result.mapping_key,\n ),\n message=message,\n )\n\n @staticmethod\n def handled_output(\n step_context: IStepContext,\n output_name: str,\n manager_key: str,\n message_override: Optional[str] = None,\n metadata_entries: Optional[List[MetadataEntry]] = None,\n ) -> "DagsterEvent":\n message = f'Handled output "{output_name}" using IO manager "{manager_key}"'\n return DagsterEvent.from_step(\n event_type=DagsterEventType.HANDLED_OUTPUT,\n step_context=step_context,\n event_specific_data=HandledOutputData(\n output_name=output_name,\n manager_key=manager_key,\n metadata_entries=metadata_entries if metadata_entries else [],\n ),\n message=message_override or message,\n )\n\n @staticmethod\n def loaded_input(\n step_context: IStepContext,\n input_name: str,\n manager_key: str,\n upstream_output_name: Optional[str] = None,\n upstream_step_key: Optional[str] = None,\n message_override: Optional[str] = None,\n metadata_entries: Optional[List[MetadataEntry]] = None,\n ) -> "DagsterEvent":\n\n message = f'Loaded input "{input_name}" using input manager "{manager_key}"'\n if upstream_output_name:\n message += f', from output "{upstream_output_name}" of step ' f'"{upstream_step_key}"'\n\n return DagsterEvent.from_step(\n event_type=DagsterEventType.LOADED_INPUT,\n step_context=step_context,\n event_specific_data=LoadedInputData(\n input_name=input_name,\n manager_key=manager_key,\n upstream_output_name=upstream_output_name,\n upstream_step_key=upstream_step_key,\n metadata_entries=metadata_entries if metadata_entries else [],\n ),\n message=message_override or message,\n )\n\n @staticmethod\n def hook_completed(\n step_context: StepExecutionContext, hook_def: HookDefinition\n ) -> "DagsterEvent":\n event_type = DagsterEventType.HOOK_COMPLETED\n\n event = DagsterEvent(\n event_type_value=event_type.value,\n pipeline_name=step_context.pipeline_name,\n step_handle=step_context.step.handle,\n solid_handle=step_context.step.solid_handle,\n step_kind_value=step_context.step.kind.value,\n logging_tags=step_context.logging_tags,\n message=(\n 'Finished the execution of hook "{hook_name}" triggered for "{solid_name}".'\n ).format(hook_name=hook_def.name, solid_name=step_context.solid.name),\n )\n\n step_context.log.log_dagster_event(\n level=logging.DEBUG, msg=event.message or "", dagster_event=event\n )\n\n return event\n\n @staticmethod\n def hook_errored(\n step_context: StepExecutionContext, error: HookExecutionError\n ) -> "DagsterEvent":\n event_type = DagsterEventType.HOOK_ERRORED\n\n event = DagsterEvent(\n event_type_value=event_type.value,\n pipeline_name=step_context.pipeline_name,\n step_handle=step_context.step.handle,\n solid_handle=step_context.step.solid_handle,\n step_kind_value=step_context.step.kind.value,\n logging_tags=step_context.logging_tags,\n event_specific_data=_validate_event_specific_data(\n event_type,\n HookErroredData(\n error=serializable_error_info_from_exc_info(error.original_exc_info)\n ),\n ),\n )\n\n step_context.log.log_dagster_event(level=logging.ERROR, msg=str(error), dagster_event=event)\n\n return event\n\n @staticmethod\n def hook_skipped(\n step_context: StepExecutionContext, hook_def: HookDefinition\n ) -> "DagsterEvent":\n event_type = DagsterEventType.HOOK_SKIPPED\n\n event = DagsterEvent(\n event_type_value=event_type.value,\n pipeline_name=step_context.pipeline_name,\n step_handle=step_context.step.handle,\n solid_handle=step_context.step.solid_handle,\n 
step_kind_value=step_context.step.kind.value,\n logging_tags=step_context.logging_tags,\n message=(\n 'Skipped the execution of hook "{hook_name}". It did not meet its triggering '\n 'condition during the execution of "{solid_name}".'\n ).format(hook_name=hook_def.name, solid_name=step_context.solid.name),\n )\n\n step_context.log.log_dagster_event(\n level=logging.DEBUG, msg=event.message or "", dagster_event=event\n )\n\n return event\n\n @staticmethod\n def capture_logs(pipeline_context: IPlanContext, log_key: str, steps: List["ExecutionStep"]):\n step_keys = [step.key for step in steps]\n if len(step_keys) == 1:\n message = f"Started capturing logs for step: {step_keys[0]}."\n else:\n message = f"Started capturing logs in process (pid: {os.getpid()})."\n\n if isinstance(pipeline_context, StepExecutionContext):\n return DagsterEvent.from_step(\n DagsterEventType.LOGS_CAPTURED,\n pipeline_context,\n message=message,\n event_specific_data=ComputeLogsCaptureData(\n step_keys=step_keys,\n log_key=log_key,\n ),\n )\n\n return DagsterEvent.from_pipeline(\n DagsterEventType.LOGS_CAPTURED,\n pipeline_context,\n message=message,\n event_specific_data=ComputeLogsCaptureData(\n step_keys=step_keys,\n log_key=log_key,\n ),\n )
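To show how the boolean properties and typed accessors above are typically consumed (a sketch that assumes the in-process execution result exposes its event stream via `all_events`; the op and job names are illustrative):

from dagster import job, op

@op
def boom():
    raise Exception("kaboom")

@job
def failing_demo_job():
    boom()

result = failing_demo_job.execute_in_process(raise_on_error=False)
for event in result.all_events:
    if event.is_step_failure:
        # step_failure_data carries a SerializableErrorInfo describing the exception
        print(event.step_key, event.step_failure_data.error.message)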
\n\n\ndef get_step_output_event(\n events: List[DagsterEvent], step_key: str, output_name: Optional[str] = "result"\n) -> Optional["DagsterEvent"]:\n check.list_param(events, "events", of_type=DagsterEvent)\n check.str_param(step_key, "step_key")\n check.str_param(output_name, "output_name")\n for event in events:\n if (\n event.event_type == DagsterEventType.STEP_OUTPUT\n and event.step_key == step_key\n and event.step_output_data.output_name == output_name\n ):\n return event\n return None\n\n\n@whitelist_for_serdes\nclass AssetObservationData(\n NamedTuple("_AssetObservation", [("asset_observation", AssetObservation)])\n):\n def __new__(cls, asset_observation: AssetObservation):\n return super(AssetObservationData, cls).__new__(\n cls,\n asset_observation=check.inst_param(\n asset_observation, "asset_observation", AssetObservation\n ),\n )\n\n\n@whitelist_for_serdes\nclass StepMaterializationData(\n NamedTuple(\n "_StepMaterializationData",\n [\n ("materialization", Union[Materialization, AssetMaterialization]),\n ("asset_lineage", List[AssetLineageInfo]),\n ],\n )\n):\n def __new__(\n cls,\n materialization: Union[Materialization, AssetMaterialization],\n asset_lineage: Optional[List[AssetLineageInfo]] = None,\n ):\n return super(StepMaterializationData, cls).__new__(\n cls,\n materialization=check.inst_param(\n materialization, "materialization", (Materialization, AssetMaterialization)\n ),\n asset_lineage=check.opt_list_param(\n asset_lineage, "asset_lineage", of_type=AssetLineageInfo\n ),\n )\n\n\n@whitelist_for_serdes\nclass AssetMaterializationPlannedData(\n NamedTuple("_AssetMaterializationPlannedData", [("asset_key", AssetKey)])\n):\n def __new__(cls, asset_key: AssetKey):\n return super(AssetMaterializationPlannedData, cls).__new__(\n cls, asset_key=check.inst_param(asset_key, "asset_key", AssetKey)\n )\n\n\n@whitelist_for_serdes\nclass StepExpectationResultData(\n NamedTuple(\n "_StepExpectationResultData",\n [\n ("expectation_result", ExpectationResult),\n ],\n )\n):\n def __new__(cls, expectation_result: ExpectationResult):\n return super(StepExpectationResultData, cls).__new__(\n cls,\n expectation_result=check.inst_param(\n expectation_result, "expectation_result", ExpectationResult\n ),\n )\n\n\n@whitelist_for_serdes\nclass ObjectStoreOperationResultData(\n NamedTuple(\n "_ObjectStoreOperationResultData",\n [\n ("op", ObjectStoreOperationType),\n ("value_name", Optional[str]),\n ("metadata_entries", List[MetadataEntry]),\n ("address", Optional[str]),\n ("version", Optional[str]),\n ("mapping_key", Optional[str]),\n ],\n )\n):\n def __new__(\n cls,\n op: ObjectStoreOperationType,\n value_name: Optional[str] = None,\n metadata_entries: Optional[List[MetadataEntry]] = None,\n address: Optional[str] = None,\n version: Optional[str] = None,\n mapping_key: Optional[str] = None,\n ):\n return super(ObjectStoreOperationResultData, cls).__new__(\n cls,\n op=cast(ObjectStoreOperationType, check.str_param(op, "op")),\n value_name=check.opt_str_param(value_name, "value_name"),\n metadata_entries=check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=MetadataEntry\n ),\n address=check.opt_str_param(address, "address"),\n version=check.opt_str_param(version, "version"),\n mapping_key=check.opt_str_param(mapping_key, "mapping_key"),\n )\n\n\n@whitelist_for_serdes\nclass EngineEventData(\n NamedTuple(\n "_EngineEventData",\n [\n ("metadata_entries", List[MetadataEntry]),\n ("error", Optional[SerializableErrorInfo]),\n ("marker_start", Optional[str]),\n ("marker_end", 
Optional[str]),\n ],\n )\n):\n # serdes log\n # * added optional error\n # * added marker_start / marker_end\n #\n def __new__(\n cls,\n metadata_entries: Optional[List[MetadataEntry]] = None,\n error: Optional[SerializableErrorInfo] = None,\n marker_start: Optional[str] = None,\n marker_end: Optional[str] = None,\n ):\n return super(EngineEventData, cls).__new__(\n cls,\n metadata_entries=check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=MetadataEntry\n ),\n error=check.opt_inst_param(error, "error", SerializableErrorInfo),\n marker_start=check.opt_str_param(marker_start, "marker_start"),\n marker_end=check.opt_str_param(marker_end, "marker_end"),\n )\n\n @staticmethod\n def in_process(\n pid: int, step_keys_to_execute: Optional[List[str]] = None, marker_end: Optional[str] = None\n ) -> "EngineEventData":\n return EngineEventData(\n metadata_entries=[MetadataEntry("pid", value=str(pid))]\n + (\n [MetadataEntry("step_keys", value=str(step_keys_to_execute))]\n if step_keys_to_execute\n else []\n ),\n marker_end=marker_end,\n )\n\n @staticmethod\n def multiprocess(\n pid: int, step_keys_to_execute: Optional[List[str]] = None\n ) -> "EngineEventData":\n return EngineEventData(\n metadata_entries=[MetadataEntry("pid", value=str(pid))]\n + (\n [MetadataEntry("step_keys", value=str(step_keys_to_execute))]\n if step_keys_to_execute\n else []\n )\n )\n\n @staticmethod\n def interrupted(steps_interrupted: List[str]) -> "EngineEventData":\n return EngineEventData(\n metadata_entries=[MetadataEntry("steps_interrupted", value=str(steps_interrupted))]\n )\n\n @staticmethod\n def engine_error(error: SerializableErrorInfo) -> "EngineEventData":\n return EngineEventData(metadata_entries=[], error=error)\n\n\n@whitelist_for_serdes\nclass PipelineFailureData(\n NamedTuple(\n "_PipelineFailureData",\n [\n ("error", Optional[SerializableErrorInfo]),\n ],\n )\n):\n def __new__(cls, error: Optional[SerializableErrorInfo]):\n return super(PipelineFailureData, cls).__new__(\n cls, error=check.opt_inst_param(error, "error", SerializableErrorInfo)\n )\n\n\n@whitelist_for_serdes\nclass PipelineCanceledData(\n NamedTuple(\n "_PipelineCanceledData",\n [\n ("error", Optional[SerializableErrorInfo]),\n ],\n )\n):\n def __new__(cls, error: Optional[SerializableErrorInfo]):\n return super(PipelineCanceledData, cls).__new__(\n cls, error=check.opt_inst_param(error, "error", SerializableErrorInfo)\n )\n\n\n@whitelist_for_serdes\nclass HookErroredData(\n NamedTuple(\n "_HookErroredData",\n [\n ("error", SerializableErrorInfo),\n ],\n )\n):\n def __new__(cls, error: SerializableErrorInfo):\n return super(HookErroredData, cls).__new__(\n cls, error=check.inst_param(error, "error", SerializableErrorInfo)\n )\n\n\n@whitelist_for_serdes\nclass HandledOutputData(\n NamedTuple(\n "_HandledOutputData",\n [\n ("output_name", str),\n ("manager_key", str),\n ("metadata_entries", List[MetadataEntry]),\n ],\n )\n):\n def __new__(\n cls,\n output_name: str,\n manager_key: str,\n metadata_entries: Optional[List[MetadataEntry]] = None,\n ):\n return super(HandledOutputData, cls).__new__(\n cls,\n output_name=check.str_param(output_name, "output_name"),\n manager_key=check.str_param(manager_key, "manager_key"),\n metadata_entries=check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=MetadataEntry\n ),\n )\n\n\n@whitelist_for_serdes\nclass LoadedInputData(\n NamedTuple(\n "_LoadedInputData",\n [\n ("input_name", str),\n ("manager_key", str),\n ("upstream_output_name", Optional[str]),\n ("upstream_step_key", 
Optional[str]),\n ("metadata_entries", Optional[List[MetadataEntry]]),\n ],\n )\n):\n def __new__(\n cls,\n input_name: str,\n manager_key: str,\n upstream_output_name: Optional[str] = None,\n upstream_step_key: Optional[str] = None,\n metadata_entries: Optional[List[MetadataEntry]] = None,\n ):\n return super(LoadedInputData, cls).__new__(\n cls,\n input_name=check.str_param(input_name, "input_name"),\n manager_key=check.str_param(manager_key, "manager_key"),\n upstream_output_name=check.opt_str_param(upstream_output_name, "upstream_output_name"),\n upstream_step_key=check.opt_str_param(upstream_step_key, "upstream_step_key"),\n metadata_entries=check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=MetadataEntry\n ),\n )\n\n\n@whitelist_for_serdes\nclass ComputeLogsCaptureData(\n NamedTuple(\n "_ComputeLogsCaptureData",\n [\n ("log_key", str),\n ("step_keys", List[str]),\n ],\n )\n):\n def __new__(cls, log_key, step_keys):\n return super(ComputeLogsCaptureData, cls).__new__(\n cls,\n log_key=check.str_param(log_key, "log_key"),\n step_keys=check.opt_list_param(step_keys, "step_keys", of_type=str),\n )\n\n\n###################################################################################################\n# THE GRAVEYARD\n#\n# -|- -|- -|-\n# | | |\n# _-'~~~~~`-_ . _-'~~~~~`-_ _-'~~~~~`-_\n# .' '. .' '. .' '.\n# | R I P | | R I P | | R I P |\n# | | | | | |\n# | Synthetic | | Asset | | Pipeline |\n# | Process | | Store | | Init |\n# | Events | | Operations | | Failures |\n# | | | | | |\n###################################################################################################\n\n# Keep these around to prevent issues like https://github.com/dagster-io/dagster/issues/3533\n@whitelist_for_serdes\nclass AssetStoreOperationData(NamedTuple):\n op: str\n step_key: str\n output_name: str\n asset_store_key: str\n\n\n@whitelist_for_serdes\nclass AssetStoreOperationType(Enum):\n SET_ASSET = "SET_ASSET"\n GET_ASSET = "GET_ASSET"\n\n\n@whitelist_for_serdes\nclass PipelineInitFailureData(NamedTuple):\n error: SerializableErrorInfo\n\n\ndef _handle_back_compat(event_type_value, event_specific_data):\n # transform old specific process events in to engine events\n if event_type_value == "PIPELINE_PROCESS_START":\n return DagsterEventType.ENGINE_EVENT.value, EngineEventData([])\n elif event_type_value == "PIPELINE_PROCESS_STARTED":\n return DagsterEventType.ENGINE_EVENT.value, EngineEventData([])\n elif event_type_value == "PIPELINE_PROCESS_EXITED":\n return DagsterEventType.ENGINE_EVENT.value, EngineEventData([])\n\n # changes asset store ops in to get/set asset\n elif event_type_value == "ASSET_STORE_OPERATION":\n if event_specific_data.op in ("GET_ASSET", AssetStoreOperationType.GET_ASSET):\n return (\n DagsterEventType.LOADED_INPUT.value,\n LoadedInputData(\n event_specific_data.output_name, event_specific_data.asset_store_key\n ),\n )\n if event_specific_data.op in ("SET_ASSET", AssetStoreOperationType.SET_ASSET):\n return (\n DagsterEventType.HANDLED_OUTPUT.value,\n HandledOutputData(\n event_specific_data.output_name, event_specific_data.asset_store_key, []\n ),\n )\n\n # previous name for ASSET_MATERIALIZATION was STEP_MATERIALIZATION\n if event_type_value == "STEP_MATERIALIZATION":\n return DagsterEventType.ASSET_MATERIALIZATION.value, event_specific_data\n\n # transform PIPELINE_INIT_FAILURE to PIPELINE_FAILURE\n if event_type_value == "PIPELINE_INIT_FAILURE":\n return DagsterEventType.PIPELINE_FAILURE.value, PipelineFailureData(\n event_specific_data.error\n )\n\n return 
event_type_value, event_specific_data\n\n\nregister_serdes_tuple_fallbacks(\n {\n "PipelineProcessStartedData": None,\n "PipelineProcessExitedData": None,\n "PipelineProcessStartData": None,\n }\n)\n
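The ``EngineEventData`` static constructors earlier in this module (``in_process``, ``multiprocess``, ``interrupted``, ``engine_error``) simply pack structured metadata entries into the event payload. A minimal usage sketch (the pid and step key below are invented):

.. code-block:: python

    from dagster.core.events import EngineEventData

    # in_process attaches "pid" (and optionally "step_keys") metadata entries.
    data = EngineEventData.in_process(pid=1234, step_keys_to_execute=["my_step"])
    assert any(entry.label == "pid" for entry in data.metadata_entries)
    assert data.error is None and data.marker_start is None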
", "current_page_name": "_modules/dagster/core/events", "customsidebar": null, "log": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.events.log

\nfrom typing import Any, Dict, NamedTuple, Optional, Union\n\nimport dagster._check as check\nfrom dagster.core.errors import DagsterInvariantViolationError\nfrom dagster.core.events import DagsterEvent\nfrom dagster.core.utils import coerce_valid_log_level\nfrom dagster.serdes.serdes import (\n    DefaultNamedTupleSerializer,\n    WhitelistMap,\n    deserialize_json_to_dagster_namedtuple,\n    register_serdes_tuple_fallbacks,\n    serialize_dagster_namedtuple,\n    whitelist_for_serdes,\n)\nfrom dagster.utils.error import SerializableErrorInfo\nfrom dagster.utils.log import (\n    JsonEventLoggerHandler,\n    StructuredLoggerHandler,\n    StructuredLoggerMessage,\n    construct_single_handler_logger,\n)\n\n\nclass EventLogEntrySerializer(DefaultNamedTupleSerializer):\n    @classmethod\n    def value_to_storage_dict(\n        cls,\n        value: NamedTuple,\n        whitelist_map: WhitelistMap,\n        descent_path: str,\n    ) -> Dict[str, Any]:\n        storage_dict = super().value_to_storage_dict(value, whitelist_map, descent_path)\n        # include an empty string for the message field to allow older versions of dagster to load the events\n        storage_dict["message"] = ""\n        return storage_dict\n\n\n
[docs]@whitelist_for_serdes(serializer=EventLogEntrySerializer)\nclass EventLogEntry(\n NamedTuple(\n "_EventLogEntry",\n [\n ("error_info", Optional[SerializableErrorInfo]),\n ("level", Union[str, int]),\n ("user_message", str),\n ("run_id", str),\n ("timestamp", float),\n ("step_key", Optional[str]),\n ("pipeline_name", Optional[str]),\n ("dagster_event", Optional[DagsterEvent]),\n ],\n )\n):\n """Entries in the event log.\n\n These entries may originate from the logging machinery (DagsterLogManager/context.log), from\n framework events (e.g. EngineEvent), or they may correspond to events yielded by user code\n (e.g. Output).\n\n Args:\n error_info (Optional[SerializableErrorInfo]): Error info for an associated exception, if\n any, as generated by serializable_error_info_from_exc_info and friends.\n level (Union[str, int]): The Python log level at which to log this event. Note that\n framework and user code events are also logged to Python logging. This value may be an\n integer or a (case-insensitive) string member of PYTHON_LOGGING_LEVELS_NAMES.\n user_message (str): For log messages, this is the user-generated message.\n run_id (str): The id of the run which generated this event.\n timestamp (float): The Unix timestamp of this event.\n step_key (Optional[str]): The step key for the step which generated this event. Some events\n are generated outside of a step context.\n job_name (Optional[str]): The job which generated this event. Some events are\n generated outside of a job context.\n dagster_event (Optional[DagsterEvent]): For framework and user events, the associated\n structured event.\n pipeline_name (Optional[str]): (legacy) The pipeline which generated this event. Some events are\n generated outside of a pipeline context.\n """\n\n def __new__(\n cls,\n error_info,\n level,\n user_message,\n run_id,\n timestamp,\n step_key=None,\n pipeline_name=None,\n dagster_event=None,\n job_name=None,\n ):\n if pipeline_name and job_name:\n raise DagsterInvariantViolationError(\n "Provided both `pipeline_name` and `job_name` parameters to `EventLogEntry` "\n "initialization. 
Please provide only one or the other."\n )\n\n pipeline_name = pipeline_name or job_name\n return super(EventLogEntry, cls).__new__(\n cls,\n check.opt_inst_param(error_info, "error_info", SerializableErrorInfo),\n coerce_valid_log_level(level),\n check.str_param(user_message, "user_message"),\n check.str_param(run_id, "run_id"),\n check.float_param(timestamp, "timestamp"),\n check.opt_str_param(step_key, "step_key"),\n check.opt_str_param(pipeline_name, "pipeline_name"),\n check.opt_inst_param(dagster_event, "dagster_event", DagsterEvent),\n )\n\n @property\n def is_dagster_event(self) -> bool:\n return bool(self.dagster_event)\n\n @property\n def job_name(self) -> Optional[str]:\n return self.pipeline_name\n\n def get_dagster_event(self) -> DagsterEvent:\n if not isinstance(self.dagster_event, DagsterEvent):\n check.failed(\n "Not a dagster event, check is_dagster_event before calling get_dagster_event",\n )\n\n return self.dagster_event\n\n def to_json(self):\n return serialize_dagster_namedtuple(self)\n\n @staticmethod\n def from_json(json_str):\n return deserialize_json_to_dagster_namedtuple(json_str)\n\n @property\n def dagster_event_type(self):\n return self.dagster_event.event_type if self.dagster_event else None\n\n @property\n def message(self) -> str:\n """\n Return the message from the structured DagsterEvent if present, fallback to user_message\n """\n\n if self.is_dagster_event:\n msg = self.get_dagster_event().message\n if msg is not None:\n return msg\n\n return self.user_message
\n\n\ndef construct_event_record(logger_message: StructuredLoggerMessage) -> EventLogEntry:\n check.inst_param(logger_message, "logger_message", StructuredLoggerMessage)\n\n return EventLogEntry(\n level=logger_message.level,\n user_message=logger_message.meta["orig_message"],\n run_id=logger_message.meta["run_id"],\n timestamp=logger_message.record.created,\n step_key=logger_message.meta.get("step_key"),\n job_name=logger_message.meta.get("pipeline_name"),\n dagster_event=logger_message.meta.get("dagster_event"),\n error_info=None,\n )\n\n\ndef construct_event_logger(event_record_callback):\n """\n Callback receives a stream of event_records. Piggybacks on the logging machinery.\n """\n check.callable_param(event_record_callback, "event_record_callback")\n\n return construct_single_handler_logger(\n "event-logger",\n "debug",\n StructuredLoggerHandler(\n lambda logger_message: event_record_callback(construct_event_record(logger_message))\n ),\n )\n\n\ndef construct_json_event_logger(json_path):\n """Record a stream of event records to json"""\n check.str_param(json_path, "json_path")\n return construct_single_handler_logger(\n "json-event-record-logger",\n "debug",\n JsonEventLoggerHandler(\n json_path,\n lambda record: construct_event_record(\n StructuredLoggerMessage(\n name=record.name,\n message=record.msg,\n level=record.levelno,\n meta=record.dagster_meta,\n record=record,\n )\n ),\n ),\n )\n\n\nregister_serdes_tuple_fallbacks(\n {\n # These were originally distinguished from each other but ended up being empty subclasses\n # of EventLogEntry -- instead of using the subclasses we were relying on\n # EventLogEntry.is_dagster_event to distinguish events that originate in the logging\n # machinery from events that are yielded by user code\n "DagsterEventRecord": EventLogEntry,\n "LogMessageRecord": EventLogEntry,\n # renamed EventRecord -> EventLogEntry\n "EventRecord": EventLogEntry,\n }\n)\n
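``EventLogEntry`` is what the event log storage persists; ``to_json`` / ``from_json`` round-trip it through the serdes machinery, with the extra empty ``message`` field written only for older readers. A sketch of constructing and serializing an entry by hand (field values are invented, and the round trip is assumed to drop the legacy ``message`` key on load):

.. code-block:: python

    import time

    from dagster.core.events.log import EventLogEntry

    # A log-machinery entry with no structured DagsterEvent attached.
    entry = EventLogEntry(
        error_info=None,
        level="INFO",  # coerced by coerce_valid_log_level
        user_message="hello from user code",
        run_id="not-a-real-run-id",
        timestamp=time.time(),
    )
    assert not entry.is_dagster_event
    assert entry.message == "hello from user code"

    # Round-trip through the whitelisted serializer.
    restored = EventLogEntry.from_json(entry.to_json())
    assert restored.user_message == entry.user_message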
", "current_page_name": "_modules/dagster/core/events/log", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}, {"link": "../", "title": "dagster.core.events"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.events.log"}, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.events"}, "execution": {"api": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.execution.api

\nimport sys\nfrom contextlib import contextmanager\nfrom typing import Any, Dict, FrozenSet, Iterator, List, Mapping, Optional, Tuple, Union\n\nimport dagster._check as check\nfrom dagster.core.definitions import IPipeline, JobDefinition, PipelineDefinition\nfrom dagster.core.definitions.pipeline_base import InMemoryPipeline\nfrom dagster.core.definitions.pipeline_definition import PipelineSubsetDefinition\nfrom dagster.core.definitions.reconstruct import ReconstructablePipeline\nfrom dagster.core.errors import DagsterExecutionInterruptedError, DagsterInvariantViolationError\nfrom dagster.core.events import DagsterEvent, EngineEventData\nfrom dagster.core.execution.context.system import PlanOrchestrationContext\nfrom dagster.core.execution.plan.execute_plan import inner_plan_execution_iterator\nfrom dagster.core.execution.plan.outputs import StepOutputHandle\nfrom dagster.core.execution.plan.plan import ExecutionPlan\nfrom dagster.core.execution.plan.state import KnownExecutionState\nfrom dagster.core.execution.retries import RetryMode\nfrom dagster.core.instance import DagsterInstance, InstanceRef\nfrom dagster.core.selector import parse_step_selection\nfrom dagster.core.storage.pipeline_run import PipelineRun, PipelineRunStatus\nfrom dagster.core.system_config.objects import ResolvedRunConfig\nfrom dagster.core.telemetry import log_repo_stats, telemetry_wrapper\nfrom dagster.core.utils import str_format_set\nfrom dagster.utils import merge_dicts\nfrom dagster.utils.error import serializable_error_info_from_exc_info\nfrom dagster.utils.interrupts import capture_interrupts\n\nfrom .context_creation_pipeline import (\n    ExecutionContextManager,\n    PlanExecutionContextManager,\n    PlanOrchestrationContextManager,\n    orchestration_context_event_generator,\n    scoped_pipeline_context,\n)\nfrom .results import PipelineExecutionResult\n\n## Brief guide to the execution APIs\n# | function name               | operates over      | sync  | supports    | creates new PipelineRun |\n# |                             |                    |       | reexecution | in instance             |\n# | --------------------------- | ------------------ | ----- | ----------- | ----------------------- |\n# | execute_pipeline_iterator   | IPipeline          | async | no          | yes                     |\n# | execute_pipeline            | IPipeline          | sync  | no          | yes                     |\n# | execute_run_iterator        | PipelineRun        | async | (1)         | no                      |\n# | execute_run                 | PipelineRun        | sync  | (1)         | no                      |\n# | execute_plan_iterator       | ExecutionPlan      | async | (2)         | no                      |\n# | execute_plan                | ExecutionPlan      | sync  | (2)         | no                      |\n# | reexecute_pipeline          | IPipeline          | sync  | yes         | yes                     |\n# | reexecute_pipeline_iterator | IPipeline          | async | yes         | yes                     |\n#\n# Notes on reexecution support:\n# (1) The appropriate bits must be set on the PipelineRun passed to this function. 
Specifically,\n#     parent_run_id and root_run_id must be set and consistent, and if a solids_to_execute or\n#     step_keys_to_execute are set they must be consistent with the parent and root runs.\n# (2) As for (1), but the ExecutionPlan passed must also agree in all relevant bits.\n\n\ndef execute_run_iterator(\n    pipeline: IPipeline,\n    pipeline_run: PipelineRun,\n    instance: DagsterInstance,\n    resume_from_failure: bool = False,\n) -> Iterator[DagsterEvent]:\n    check.inst_param(pipeline, "pipeline", IPipeline)\n    check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n    check.inst_param(instance, "instance", DagsterInstance)\n\n    if pipeline_run.status == PipelineRunStatus.CANCELED:\n        # This can happen if the run was force-terminated while it was starting\n        def gen_execute_on_cancel():\n            yield instance.report_engine_event(\n                "Not starting execution since the run was canceled before execution could start",\n                pipeline_run,\n            )\n\n        return gen_execute_on_cancel()\n\n    if not resume_from_failure:\n        if pipeline_run.status not in (PipelineRunStatus.NOT_STARTED, PipelineRunStatus.STARTING):\n            if instance.run_monitoring_enabled:\n                # This can happen if the pod was unexpectedly restarted by the cluster - ignore it since\n                # the run monitoring daemon will also spin up a new pod\n                def gen_ignore_duplicate_run_worker():\n                    yield instance.report_engine_event(\n                        "Ignoring a duplicate run that was started from somewhere other than the run monitor daemon",\n                        pipeline_run,\n                    )\n\n                return gen_ignore_duplicate_run_worker()\n            elif pipeline_run.is_finished:\n\n                def gen_ignore_duplicate_run_worker():\n                    yield instance.report_engine_event(\n                        "Ignoring a run worker that started after the run had already finished.",\n                        pipeline_run,\n                    )\n\n                return gen_ignore_duplicate_run_worker()\n            else:\n\n                def gen_fail_restarted_run_worker():\n                    yield instance.report_engine_event(\n                        f"{pipeline_run.pipeline_name} ({pipeline_run.run_id}) started "\n                        f"a new run worker while the run was already in state {pipeline_run.status}. "\n                        "This most frequently happens when the run worker unexpectedly stops and is "\n                        "restarted by the cluster. 
Marking the run as failed.",\n                        pipeline_run,\n                    )\n                    yield instance.report_run_failed(pipeline_run)\n\n                return gen_fail_restarted_run_worker()\n\n    else:\n        check.invariant(\n            pipeline_run.status == PipelineRunStatus.STARTED\n            or pipeline_run.status == PipelineRunStatus.STARTING,\n            desc="Run of {} ({}) in state {}, expected STARTED or STARTING because it's "\n            "resuming from a run worker failure".format(\n                pipeline_run.pipeline_name, pipeline_run.run_id, pipeline_run.status\n            ),\n        )\n\n    if pipeline_run.solids_to_execute or pipeline_run.asset_selection:\n        pipeline_def = pipeline.get_definition()\n        if isinstance(pipeline_def, PipelineSubsetDefinition):\n            check.invariant(\n                pipeline_run.solids_to_execute == pipeline.solids_to_execute,\n                "Cannot execute PipelineRun with solids_to_execute {solids_to_execute} that conflicts "\n                "with pipeline subset {pipeline_solids_to_execute}.".format(\n                    pipeline_solids_to_execute=str_format_set(pipeline.solids_to_execute),\n                    solids_to_execute=str_format_set(pipeline_run.solids_to_execute),\n                ),\n            )\n        else:\n            # when `execute_run_iterator` is directly called, the sub pipeline hasn't been created\n            # note that when we receive the solids to execute via PipelineRun, it won't support\n            # solid selection query syntax\n            pipeline = pipeline.subset_for_execution_from_existing_pipeline(\n                frozenset(pipeline_run.solids_to_execute)\n                if pipeline_run.solids_to_execute\n                else None,\n                asset_selection=pipeline_run.asset_selection,\n            )\n\n    execution_plan = _get_execution_plan_from_run(pipeline, pipeline_run, instance)\n\n    return iter(\n        ExecuteRunWithPlanIterable(\n            execution_plan=execution_plan,\n            iterator=pipeline_execution_iterator,\n            execution_context_manager=PlanOrchestrationContextManager(\n                context_event_generator=orchestration_context_event_generator,\n                pipeline=pipeline,\n                execution_plan=execution_plan,\n                pipeline_run=pipeline_run,\n                instance=instance,\n                run_config=pipeline_run.run_config,\n                raise_on_error=False,\n                executor_defs=None,\n                output_capture=None,\n                resume_from_failure=resume_from_failure,\n            ),\n        )\n    )\n\n\ndef execute_run(\n    pipeline: IPipeline,\n    pipeline_run: PipelineRun,\n    instance: DagsterInstance,\n    raise_on_error: bool = False,\n) -> PipelineExecutionResult:\n    """Executes an existing pipeline run synchronously.\n\n    Synchronous version of execute_run_iterator.\n\n    Args:\n        pipeline (IPipeline): The pipeline to execute.\n        pipeline_run (PipelineRun): The run to execute\n        instance (DagsterInstance): The instance in which the run has been created.\n        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n            Defaults to ``False``.\n\n    Returns:\n        PipelineExecutionResult: The result of the execution.\n    """\n    if isinstance(pipeline, PipelineDefinition):\n        if isinstance(pipeline, JobDefinition):\n            error = "execute_run 
requires a reconstructable job but received job definition directly instead."\n        else:\n            error = (\n                "execute_run requires a reconstructable pipeline but received pipeline definition "\n                "directly instead."\n            )\n        raise DagsterInvariantViolationError(\n            f"{error} To support hand-off to other processes please wrap your definition in "\n            "a call to reconstructable(). Learn more about reconstructable here: https://docs.dagster.io/_apidocs/execution#dagster.reconstructable"\n        )\n\n    check.inst_param(pipeline, "pipeline", IPipeline)\n    check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n    check.inst_param(instance, "instance", DagsterInstance)\n\n    if pipeline_run.status == PipelineRunStatus.CANCELED:\n        message = "Not starting execution since the run was canceled before execution could start"\n        instance.report_engine_event(\n            message,\n            pipeline_run,\n        )\n        raise DagsterInvariantViolationError(message)\n\n    check.invariant(\n        pipeline_run.status == PipelineRunStatus.NOT_STARTED\n        or pipeline_run.status == PipelineRunStatus.STARTING,\n        desc="Run {} ({}) in state {}, expected NOT_STARTED or STARTING".format(\n            pipeline_run.pipeline_name, pipeline_run.run_id, pipeline_run.status\n        ),\n    )\n    pipeline_def = pipeline.get_definition()\n    if pipeline_run.solids_to_execute or pipeline_run.asset_selection:\n        if isinstance(pipeline_def, PipelineSubsetDefinition):\n            check.invariant(\n                pipeline_run.solids_to_execute == pipeline.solids_to_execute,\n                "Cannot execute PipelineRun with solids_to_execute {solids_to_execute} that "\n                "conflicts with pipeline subset {pipeline_solids_to_execute}.".format(\n                    pipeline_solids_to_execute=str_format_set(pipeline.solids_to_execute),\n                    solids_to_execute=str_format_set(pipeline_run.solids_to_execute),\n                ),\n            )\n        else:\n            # when `execute_run` is directly called, the sub pipeline hasn't been created\n            # note that when we receive the solids to execute via PipelineRun, it won't support\n            # solid selection query syntax\n            pipeline = pipeline.subset_for_execution_from_existing_pipeline(\n                frozenset(pipeline_run.solids_to_execute)\n                if pipeline_run.solids_to_execute\n                else None,\n                pipeline_run.asset_selection,\n            )\n\n    execution_plan = _get_execution_plan_from_run(pipeline, pipeline_run, instance)\n    output_capture: Optional[Dict[StepOutputHandle, Any]] = {}\n\n    _execute_run_iterable = ExecuteRunWithPlanIterable(\n        execution_plan=execution_plan,\n        iterator=pipeline_execution_iterator,\n        execution_context_manager=PlanOrchestrationContextManager(\n            context_event_generator=orchestration_context_event_generator,\n            pipeline=pipeline,\n            execution_plan=execution_plan,\n            pipeline_run=pipeline_run,\n            instance=instance,\n            run_config=pipeline_run.run_config,\n            raise_on_error=raise_on_error,\n            executor_defs=None,\n            output_capture=output_capture,\n        ),\n    )\n    event_list = list(_execute_run_iterable)\n\n    return PipelineExecutionResult(\n        pipeline.get_definition(),\n        pipeline_run.run_id,\n        
event_list,\n        lambda: scoped_pipeline_context(\n            execution_plan,\n            pipeline,\n            pipeline_run.run_config,\n            pipeline_run,\n            instance,\n        ),\n        output_capture=output_capture,\n    )\n\n\n
[docs]def execute_pipeline_iterator(\n pipeline: Union[PipelineDefinition, IPipeline],\n run_config: Optional[dict] = None,\n mode: Optional[str] = None,\n preset: Optional[str] = None,\n tags: Optional[Dict[str, Any]] = None,\n solid_selection: Optional[List[str]] = None,\n instance: Optional[DagsterInstance] = None,\n) -> Iterator[DagsterEvent]:\n """Execute a pipeline iteratively.\n\n Rather than package up the result of running a pipeline into a single object, like\n :py:func:`execute_pipeline`, this function yields the stream of events resulting from pipeline\n execution.\n\n This is intended to allow the caller to handle these events on a streaming basis in whatever\n way is appropriate.\n\n Parameters:\n pipeline (Union[IPipeline, PipelineDefinition]): The pipeline to execute.\n run_config (Optional[dict]): The configuration that parametrizes this run,\n as a dict.\n mode (Optional[str]): The name of the pipeline mode to use. You may not set both ``mode``\n and ``preset``.\n preset (Optional[str]): The name of the pipeline preset to use. You may not set both\n ``mode`` and ``preset``.\n tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to pipeline\n logs.\n solid_selection (Optional[List[str]]): A list of solid selection queries (including single\n solid names) to execute. For example:\n\n - ``['some_solid']``: selects ``some_solid`` itself.\n - ``['*some_solid']``: select ``some_solid`` and all its ancestors (upstream dependencies).\n - ``['*some_solid+++']``: select ``some_solid``, all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n - ``['*some_solid', 'other_solid_a', 'other_solid_b+']``: select ``some_solid`` and all its\n ancestors, ``other_solid_a`` itself, and ``other_solid_b`` and its direct child solids.\n instance (Optional[DagsterInstance]): The instance to execute against. If this is ``None``,\n an ephemeral instance will be used, and no artifacts will be persisted from the run.\n\n Returns:\n Iterator[DagsterEvent]: The stream of events resulting from pipeline execution.\n """\n\n with ephemeral_instance_if_missing(instance) as execute_instance:\n (\n pipeline,\n run_config,\n mode,\n tags,\n solids_to_execute,\n solid_selection,\n ) = _check_execute_pipeline_args(\n pipeline=pipeline,\n run_config=run_config,\n mode=mode,\n preset=preset,\n tags=tags,\n solid_selection=solid_selection,\n )\n\n pipeline_run = execute_instance.create_run_for_pipeline(\n pipeline_def=pipeline.get_definition(),\n run_config=run_config,\n mode=mode,\n solid_selection=solid_selection,\n solids_to_execute=solids_to_execute,\n tags=tags,\n )\n\n return execute_run_iterator(pipeline, pipeline_run, execute_instance)
\n\n\n@contextmanager\ndef ephemeral_instance_if_missing(\n instance: Optional[DagsterInstance],\n) -> Iterator[DagsterInstance]:\n if instance:\n yield instance\n else:\n with DagsterInstance.ephemeral() as ephemeral_instance:\n yield ephemeral_instance\n\n\n
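As a usage sketch for ``execute_pipeline_iterator`` above (the solid and pipeline names are invented), the iterator form lets a caller react to each event as it is emitted rather than waiting for a ``PipelineExecutionResult``:

.. code-block:: python

    from dagster import execute_pipeline_iterator, pipeline, solid

    @solid
    def emit_number(_):
        return 1

    @pipeline
    def numbers_pipeline():
        emit_number()

    # Events stream as execution proceeds on an ephemeral instance.
    for event in execute_pipeline_iterator(numbers_pipeline):
        print(event.event_type_value, event.message)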
[docs]def execute_pipeline(\n pipeline: Union[PipelineDefinition, IPipeline],\n run_config: Optional[dict] = None,\n mode: Optional[str] = None,\n preset: Optional[str] = None,\n tags: Optional[Dict[str, Any]] = None,\n solid_selection: Optional[List[str]] = None,\n instance: Optional[DagsterInstance] = None,\n raise_on_error: bool = True,\n) -> PipelineExecutionResult:\n """Execute a pipeline synchronously.\n\n Users will typically call this API when testing pipeline execution, or running standalone\n scripts.\n\n Parameters:\n pipeline (Union[IPipeline, PipelineDefinition]): The pipeline to execute.\n run_config (Optional[dict]): The configuration that parametrizes this run,\n as a dict.\n mode (Optional[str]): The name of the pipeline mode to use. You may not set both ``mode``\n and ``preset``.\n preset (Optional[str]): The name of the pipeline preset to use. You may not set both\n ``mode`` and ``preset``.\n tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to pipeline\n logs.\n instance (Optional[DagsterInstance]): The instance to execute against. If this is ``None``,\n an ephemeral instance will be used, and no artifacts will be persisted from the run.\n raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n Defaults to ``True``, since this is the most useful behavior in test.\n solid_selection (Optional[List[str]]): A list of solid selection queries (including single\n solid names) to execute. For example:\n\n - ``['some_solid']``: selects ``some_solid`` itself.\n - ``['*some_solid']``: select ``some_solid`` and all its ancestors (upstream dependencies).\n - ``['*some_solid+++']``: select ``some_solid``, all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n - ``['*some_solid', 'other_solid_a', 'other_solid_b+']``: select ``some_solid`` and all its\n ancestors, ``other_solid_a`` itself, and ``other_solid_b`` and its direct child solids.\n\n Returns:\n :py:class:`PipelineExecutionResult`: The result of pipeline execution.\n\n For the asynchronous version, see :py:func:`execute_pipeline_iterator`.\n """\n\n with ephemeral_instance_if_missing(instance) as execute_instance:\n return _logged_execute_pipeline(\n pipeline,\n instance=execute_instance,\n run_config=run_config,\n mode=mode,\n preset=preset,\n tags=tags,\n solid_selection=solid_selection,\n raise_on_error=raise_on_error,\n )
\n\n\n@telemetry_wrapper\ndef _logged_execute_pipeline(\n pipeline: Union[IPipeline, PipelineDefinition],\n instance: DagsterInstance,\n run_config: Optional[dict] = None,\n mode: Optional[str] = None,\n preset: Optional[str] = None,\n tags: Optional[Dict[str, Any]] = None,\n solid_selection: Optional[List[str]] = None,\n raise_on_error: bool = True,\n) -> PipelineExecutionResult:\n check.inst_param(instance, "instance", DagsterInstance)\n (\n pipeline,\n run_config,\n mode,\n tags,\n solids_to_execute,\n solid_selection,\n ) = _check_execute_pipeline_args(\n pipeline=pipeline,\n run_config=run_config,\n mode=mode,\n preset=preset,\n tags=tags,\n solid_selection=solid_selection,\n )\n\n log_repo_stats(instance=instance, pipeline=pipeline, source="execute_pipeline")\n\n pipeline_run = instance.create_run_for_pipeline(\n pipeline_def=pipeline.get_definition(),\n run_config=run_config,\n mode=mode,\n solid_selection=solid_selection,\n solids_to_execute=solids_to_execute,\n tags=tags,\n pipeline_code_origin=(\n pipeline.get_python_origin() if isinstance(pipeline, ReconstructablePipeline) else None\n ),\n )\n\n return execute_run(\n pipeline,\n pipeline_run,\n instance,\n raise_on_error=raise_on_error,\n )\n\n\n
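A minimal, self-contained sketch of the synchronous ``execute_pipeline`` entry point (the solids and pipeline below are invented for illustration):

.. code-block:: python

    from dagster import execute_pipeline, pipeline, solid

    @solid
    def get_name(_):
        return "dagster"

    @solid
    def greet(context, name: str):
        context.log.info(f"Hello, {name}!")

    @pipeline
    def greeting_pipeline():
        greet(get_name())

    if __name__ == "__main__":
        result = execute_pipeline(greeting_pipeline)
        assert result.success
        assert result.result_for_solid("get_name").output_value() == "dagster"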
[docs]def reexecute_pipeline(\n pipeline: Union[IPipeline, PipelineDefinition],\n parent_run_id: str,\n run_config: Optional[dict] = None,\n step_selection: Optional[List[str]] = None,\n mode: Optional[str] = None,\n preset: Optional[str] = None,\n tags: Optional[Dict[str, Any]] = None,\n instance: Optional[DagsterInstance] = None,\n raise_on_error: bool = True,\n) -> PipelineExecutionResult:\n """Reexecute an existing pipeline run.\n\n Users will typically call this API when testing pipeline reexecution, or running standalone\n scripts.\n\n Parameters:\n pipeline (Union[IPipeline, PipelineDefinition]): The pipeline to execute.\n parent_run_id (str): The id of the previous run to reexecute. The run must exist in the\n instance.\n run_config (Optional[dict]): The configuration that parametrizes this run,\n as a dict.\n solid_selection (Optional[List[str]]): A list of solid selection queries (including single\n solid names) to execute. For example:\n\n - ``['some_solid']``: selects ``some_solid`` itself.\n - ``['*some_solid']``: select ``some_solid`` and all its ancestors (upstream dependencies).\n - ``['*some_solid+++']``: select ``some_solid``, all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n - ``['*some_solid', 'other_solid_a', 'other_solid_b+']``: select ``some_solid`` and all its\n ancestors, ``other_solid_a`` itself, and ``other_solid_b`` and its direct child solids.\n\n mode (Optional[str]): The name of the pipeline mode to use. You may not set both ``mode``\n and ``preset``.\n preset (Optional[str]): The name of the pipeline preset to use. You may not set both\n ``mode`` and ``preset``.\n tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to pipeline\n logs.\n instance (Optional[DagsterInstance]): The instance to execute against. 
If this is ``None``,\n an ephemeral instance will be used, and no artifacts will be persisted from the run.\n raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n Defaults to ``True``, since this is the most useful behavior in test.\n\n Returns:\n :py:class:`PipelineExecutionResult`: The result of pipeline execution.\n\n For the asynchronous version, see :py:func:`reexecute_pipeline_iterator`.\n """\n\n check.opt_list_param(step_selection, "step_selection", of_type=str)\n\n check.str_param(parent_run_id, "parent_run_id")\n\n with ephemeral_instance_if_missing(instance) as execute_instance:\n (pipeline, run_config, mode, tags, _, _) = _check_execute_pipeline_args(\n pipeline=pipeline,\n run_config=run_config,\n mode=mode,\n preset=preset,\n tags=tags,\n )\n\n parent_pipeline_run = execute_instance.get_run_by_id(parent_run_id)\n if parent_pipeline_run is None:\n check.failed(\n "No parent run with id {parent_run_id} found in instance.".format(\n parent_run_id=parent_run_id\n ),\n )\n\n execution_plan: Optional[ExecutionPlan] = None\n # resolve step selection DSL queries using parent execution information\n if step_selection:\n execution_plan = _resolve_reexecute_step_selection(\n execute_instance,\n pipeline,\n mode,\n run_config,\n parent_pipeline_run,\n step_selection,\n )\n\n if parent_pipeline_run.asset_selection:\n pipeline = pipeline.subset_for_execution(\n solid_selection=None, asset_selection=parent_pipeline_run.asset_selection\n )\n\n pipeline_run = execute_instance.create_run_for_pipeline(\n pipeline_def=pipeline.get_definition(),\n execution_plan=execution_plan,\n run_config=run_config,\n mode=mode,\n tags=tags,\n solid_selection=parent_pipeline_run.solid_selection,\n asset_selection=parent_pipeline_run.asset_selection,\n solids_to_execute=parent_pipeline_run.solids_to_execute,\n root_run_id=parent_pipeline_run.root_run_id or parent_pipeline_run.run_id,\n parent_run_id=parent_pipeline_run.run_id,\n )\n\n return execute_run(\n pipeline,\n pipeline_run,\n execute_instance,\n raise_on_error=raise_on_error,\n )
\n\n\n
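Because ``reexecute_pipeline`` looks up the parent run and its event logs, both runs must target the same instance. A sketch, with invented names, of re-running a single step of a finished parent run (the step key is assumed to equal the solid name):

.. code-block:: python

    from dagster import (
        DagsterInstance,
        execute_pipeline,
        pipeline,
        reexecute_pipeline,
        solid,
    )

    @solid
    def do_work(_):
        return 1

    @pipeline
    def work_pipeline():
        do_work()

    instance = DagsterInstance.ephemeral()
    parent_result = execute_pipeline(work_pipeline, instance=instance)

    result = reexecute_pipeline(
        work_pipeline,
        parent_run_id=parent_result.run_id,
        step_selection=["do_work"],
        instance=instance,
    )
    assert result.success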
[docs]def reexecute_pipeline_iterator(\n pipeline: Union[IPipeline, PipelineDefinition],\n parent_run_id: str,\n run_config: Optional[dict] = None,\n step_selection: Optional[List[str]] = None,\n mode: Optional[str] = None,\n preset: Optional[str] = None,\n tags: Optional[Dict[str, Any]] = None,\n instance: Optional[DagsterInstance] = None,\n) -> Iterator[DagsterEvent]:\n """Reexecute a pipeline iteratively.\n\n Rather than package up the result of running a pipeline into a single object, like\n :py:func:`reexecute_pipeline`, this function yields the stream of events resulting from pipeline\n reexecution.\n\n This is intended to allow the caller to handle these events on a streaming basis in whatever\n way is appropriate.\n\n Parameters:\n pipeline (Union[IPipeline, PipelineDefinition]): The pipeline to execute.\n parent_run_id (str): The id of the previous run to reexecute. The run must exist in the\n instance.\n run_config (Optional[dict]): The configuration that parametrizes this run,\n as a dict.\n solid_selection (Optional[List[str]]): A list of solid selection queries (including single\n solid names) to execute. For example:\n\n - ``['some_solid']``: selects ``some_solid`` itself.\n - ``['*some_solid']``: select ``some_solid`` and all its ancestors (upstream dependencies).\n - ``['*some_solid+++']``: select ``some_solid``, all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n - ``['*some_solid', 'other_solid_a', 'other_solid_b+']``: select ``some_solid`` and all its\n ancestors, ``other_solid_a`` itself, and ``other_solid_b`` and its direct child solids.\n\n mode (Optional[str]): The name of the pipeline mode to use. You may not set both ``mode``\n and ``preset``.\n preset (Optional[str]): The name of the pipeline preset to use. You may not set both\n ``mode`` and ``preset``.\n tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to pipeline\n logs.\n instance (Optional[DagsterInstance]): The instance to execute against. 
If this is ``None``,\n an ephemeral instance will be used, and no artifacts will be persisted from the run.\n\n Returns:\n Iterator[DagsterEvent]: The stream of events resulting from pipeline reexecution.\n """\n\n check.opt_list_param(step_selection, "step_selection", of_type=str)\n\n check.str_param(parent_run_id, "parent_run_id")\n\n with ephemeral_instance_if_missing(instance) as execute_instance:\n (pipeline, run_config, mode, tags, _, _) = _check_execute_pipeline_args(\n pipeline=pipeline,\n run_config=run_config,\n mode=mode,\n preset=preset,\n tags=tags,\n solid_selection=None,\n )\n parent_pipeline_run = execute_instance.get_run_by_id(parent_run_id)\n if parent_pipeline_run is None:\n check.failed(\n "No parent run with id {parent_run_id} found in instance.".format(\n parent_run_id=parent_run_id\n ),\n )\n\n execution_plan: Optional[ExecutionPlan] = None\n # resolve step selection DSL queries using parent execution information\n if step_selection:\n execution_plan = _resolve_reexecute_step_selection(\n execute_instance,\n pipeline,\n mode,\n run_config,\n parent_pipeline_run,\n step_selection,\n )\n\n pipeline_run = execute_instance.create_run_for_pipeline(\n pipeline_def=pipeline.get_definition(),\n run_config=run_config,\n execution_plan=execution_plan,\n mode=mode,\n tags=tags,\n solid_selection=parent_pipeline_run.solid_selection,\n solids_to_execute=parent_pipeline_run.solids_to_execute,\n root_run_id=parent_pipeline_run.root_run_id or parent_pipeline_run.run_id,\n parent_run_id=parent_pipeline_run.run_id,\n )\n\n return execute_run_iterator(pipeline, pipeline_run, execute_instance)
\n\n\ndef execute_plan_iterator(\n execution_plan: ExecutionPlan,\n pipeline: IPipeline,\n pipeline_run: PipelineRun,\n instance: DagsterInstance,\n retry_mode: Optional[RetryMode] = None,\n run_config: Optional[Mapping[str, object]] = None,\n) -> Iterator[DagsterEvent]:\n check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n check.inst_param(pipeline, "pipeline", IPipeline)\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n check.inst_param(instance, "instance", DagsterInstance)\n retry_mode = check.opt_inst_param(retry_mode, "retry_mode", RetryMode, RetryMode.DISABLED)\n run_config = check.opt_mapping_param(run_config, "run_config")\n\n return iter(\n ExecuteRunWithPlanIterable(\n execution_plan=execution_plan,\n iterator=inner_plan_execution_iterator,\n execution_context_manager=PlanExecutionContextManager(\n pipeline=pipeline,\n retry_mode=retry_mode,\n execution_plan=execution_plan,\n run_config=run_config,\n pipeline_run=pipeline_run,\n instance=instance,\n ),\n )\n )\n\n\ndef execute_plan(\n execution_plan: ExecutionPlan,\n pipeline: IPipeline,\n instance: DagsterInstance,\n pipeline_run: PipelineRun,\n run_config: Optional[Dict] = None,\n retry_mode: Optional[RetryMode] = None,\n) -> List[DagsterEvent]:\n """This is the entry point of dagster-graphql executions. For the dagster CLI entry point, see\n execute_pipeline() above.\n """\n check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n check.inst_param(pipeline, "pipeline", IPipeline)\n check.inst_param(instance, "instance", DagsterInstance)\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n run_config = check.opt_dict_param(run_config, "run_config")\n check.opt_inst_param(retry_mode, "retry_mode", RetryMode)\n\n return list(\n execute_plan_iterator(\n execution_plan=execution_plan,\n pipeline=pipeline,\n run_config=run_config,\n pipeline_run=pipeline_run,\n instance=instance,\n retry_mode=retry_mode,\n )\n )\n\n\ndef _check_pipeline(pipeline: Union[PipelineDefinition, IPipeline]) -> IPipeline:\n # backcompat\n if isinstance(pipeline, PipelineDefinition):\n pipeline = InMemoryPipeline(pipeline)\n\n check.inst_param(pipeline, "pipeline", IPipeline)\n return pipeline\n\n\ndef _get_execution_plan_from_run(\n pipeline: IPipeline, pipeline_run: PipelineRun, instance: DagsterInstance\n) -> ExecutionPlan:\n if (\n # need to rebuild execution plan so it matches the subsetted graph\n pipeline.solids_to_execute is None\n and pipeline_run.execution_plan_snapshot_id\n ):\n execution_plan_snapshot = instance.get_execution_plan_snapshot(\n pipeline_run.execution_plan_snapshot_id\n )\n if execution_plan_snapshot.can_reconstruct_plan:\n return ExecutionPlan.rebuild_from_snapshot(\n pipeline_run.pipeline_name,\n execution_plan_snapshot,\n )\n return create_execution_plan(\n pipeline,\n run_config=pipeline_run.run_config,\n mode=pipeline_run.mode,\n step_keys_to_execute=pipeline_run.step_keys_to_execute,\n instance_ref=instance.get_ref() if instance.is_persistent else None,\n )\n\n\ndef create_execution_plan(\n pipeline: Union[IPipeline, PipelineDefinition],\n run_config: Optional[Mapping[str, object]] = None,\n mode: Optional[str] = None,\n step_keys_to_execute: Optional[List[str]] = None,\n known_state: Optional[KnownExecutionState] = None,\n instance_ref: Optional[InstanceRef] = None,\n tags: Optional[Dict[str, str]] = None,\n) -> ExecutionPlan:\n pipeline = _check_pipeline(pipeline)\n pipeline_def = pipeline.get_definition()\n check.inst_param(pipeline_def, "pipeline_def", 
PipelineDefinition)\n run_config = check.opt_mapping_param(run_config, "run_config", key_type=str)\n mode = check.opt_str_param(mode, "mode", default=pipeline_def.get_default_mode_name())\n check.opt_nullable_list_param(step_keys_to_execute, "step_keys_to_execute", of_type=str)\n check.opt_inst_param(instance_ref, "instance_ref", InstanceRef)\n tags = check.opt_dict_param(tags, "tags", key_type=str, value_type=str)\n\n resolved_run_config = ResolvedRunConfig.build(pipeline_def, run_config, mode=mode)\n\n return ExecutionPlan.build(\n pipeline,\n resolved_run_config,\n step_keys_to_execute=step_keys_to_execute,\n known_state=known_state,\n instance_ref=instance_ref,\n tags=tags,\n )\n\n\ndef pipeline_execution_iterator(\n pipeline_context: PlanOrchestrationContext, execution_plan: ExecutionPlan\n) -> Iterator[DagsterEvent]:\n """A complete execution of a pipeline. Yields pipeline start, success,\n and failure events.\n\n Args:\n pipeline_context (PlanOrchestrationContext):\n execution_plan (ExecutionPlan):\n """\n\n # TODO: restart event?\n if not pipeline_context.resume_from_failure:\n yield DagsterEvent.pipeline_start(pipeline_context)\n\n pipeline_exception_info = None\n pipeline_canceled_info = None\n failed_steps = []\n generator_closed = False\n try:\n for event in pipeline_context.executor.execute(pipeline_context, execution_plan):\n if event.is_step_failure:\n failed_steps.append(event.step_key)\n\n yield event\n except GeneratorExit:\n # Shouldn't happen, but avoid runtime-exception in case this generator gets GC-ed\n # (see https://amir.rachum.com/blog/2017/03/03/generator-cleanup/).\n generator_closed = True\n pipeline_exception_info = serializable_error_info_from_exc_info(sys.exc_info())\n if pipeline_context.raise_on_error:\n raise\n except (KeyboardInterrupt, DagsterExecutionInterruptedError):\n pipeline_canceled_info = serializable_error_info_from_exc_info(sys.exc_info())\n if pipeline_context.raise_on_error:\n raise\n except BaseException:\n pipeline_exception_info = serializable_error_info_from_exc_info(sys.exc_info())\n if pipeline_context.raise_on_error:\n raise # finally block will run before this is re-raised\n finally:\n if pipeline_canceled_info:\n reloaded_run = pipeline_context.instance.get_run_by_id(pipeline_context.run_id)\n if reloaded_run and reloaded_run.status == PipelineRunStatus.CANCELING:\n event = DagsterEvent.pipeline_canceled(pipeline_context, pipeline_canceled_info)\n elif reloaded_run and reloaded_run.status == PipelineRunStatus.CANCELED:\n # This happens if the run was force-terminated but was still able to send\n # a cancellation request\n event = DagsterEvent.engine_event(\n pipeline_context,\n "Computational resources were cleaned up after the run was forcibly marked as canceled.",\n EngineEventData(),\n )\n elif pipeline_context.instance.run_will_resume(pipeline_context.run_id):\n event = DagsterEvent.engine_event(\n pipeline_context,\n "Execution was interrupted unexpectedly. "\n "No user initiated termination request was found, not treating as failure because run will be resumed.",\n EngineEventData(),\n )\n else:\n event = DagsterEvent.pipeline_failure(\n pipeline_context,\n "Execution was interrupted unexpectedly. 
"\n "No user initiated termination request was found, treating as failure.",\n pipeline_canceled_info,\n )\n elif pipeline_exception_info:\n event = DagsterEvent.pipeline_failure(\n pipeline_context,\n "An exception was thrown during execution.",\n pipeline_exception_info,\n )\n elif failed_steps:\n event = DagsterEvent.pipeline_failure(\n pipeline_context,\n "Steps failed: {}.".format(failed_steps),\n )\n else:\n event = DagsterEvent.pipeline_success(pipeline_context)\n if not generator_closed:\n yield event\n\n\nclass ExecuteRunWithPlanIterable:\n """Utility class to consolidate execution logic.\n\n This is a class and not a function because, e.g., in constructing a `scoped_pipeline_context`\n for `PipelineExecutionResult`, we need to pull out the `pipeline_context` after we're done\n yielding events. This broadly follows a pattern we make use of in other places,\n cf. `dagster.utils.EventGenerationManager`.\n """\n\n def __init__(self, execution_plan, iterator, execution_context_manager):\n self.execution_plan = check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n self.iterator = check.callable_param(iterator, "iterator")\n self.execution_context_manager = check.inst_param(\n execution_context_manager, "execution_context_manager", ExecutionContextManager\n )\n\n self.pipeline_context = None\n\n def __iter__(self):\n # Since interrupts can't be raised at arbitrary points safely, delay them until designated\n # checkpoints during the execution.\n # To be maximally certain that interrupts are always caught during an execution process,\n # you can safely add an additional `with capture_interrupts()` at the very beginning of the\n # process that performs the execution.\n with capture_interrupts():\n yield from self.execution_context_manager.prepare_context()\n self.pipeline_context = self.execution_context_manager.get_context()\n generator_closed = False\n try:\n if self.pipeline_context: # False if we had a pipeline init failure\n yield from self.iterator(\n execution_plan=self.execution_plan,\n pipeline_context=self.pipeline_context,\n )\n except GeneratorExit:\n # Shouldn't happen, but avoid runtime-exception in case this generator gets GC-ed\n # (see https://amir.rachum.com/blog/2017/03/03/generator-cleanup/).\n generator_closed = True\n raise\n finally:\n for event in self.execution_context_manager.shutdown_context():\n if not generator_closed:\n yield event\n\n\ndef _check_execute_pipeline_args(\n pipeline: Union[PipelineDefinition, IPipeline],\n run_config: Optional[dict],\n mode: Optional[str],\n preset: Optional[str],\n tags: Optional[Dict[str, Any]],\n solid_selection: Optional[List[str]] = None,\n) -> Tuple[\n IPipeline,\n Optional[dict],\n Optional[str],\n Dict[str, Any],\n Optional[FrozenSet[str]],\n Optional[List[str]],\n]:\n pipeline = _check_pipeline(pipeline)\n pipeline_def = pipeline.get_definition()\n check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n\n run_config = check.opt_dict_param(run_config, "run_config")\n check.opt_str_param(mode, "mode")\n check.opt_str_param(preset, "preset")\n check.invariant(\n not (mode is not None and preset is not None),\n "You may set only one of `mode` (got {mode}) or `preset` (got {preset}).".format(\n mode=mode, preset=preset\n ),\n )\n\n tags = check.opt_dict_param(tags, "tags", key_type=str)\n check.opt_list_param(solid_selection, "solid_selection", of_type=str)\n\n if preset is not None:\n pipeline_preset = pipeline_def.get_preset(preset)\n\n if pipeline_preset.run_config is not None:\n 
check.invariant(\n (not run_config) or (pipeline_preset.run_config == run_config),\n "The environment set in preset '{preset}' does not agree with the environment "\n "passed in the `run_config` argument.".format(preset=preset),\n )\n\n run_config = pipeline_preset.run_config\n\n # load solid_selection from preset\n if pipeline_preset.solid_selection is not None:\n check.invariant(\n solid_selection is None or solid_selection == pipeline_preset.solid_selection,\n "The solid_selection set in preset '{preset}', {preset_subset}, does not agree with "\n "the `solid_selection` argument: {solid_selection}".format(\n preset=preset,\n preset_subset=pipeline_preset.solid_selection,\n solid_selection=solid_selection,\n ),\n )\n solid_selection = pipeline_preset.solid_selection\n\n check.invariant(\n mode is None or mode == pipeline_preset.mode,\n "Mode {mode} does not agree with the mode set in preset '{preset}': "\n "('{preset_mode}')".format(preset=preset, preset_mode=pipeline_preset.mode, mode=mode),\n )\n\n mode = pipeline_preset.mode\n\n tags = merge_dicts(pipeline_preset.tags, tags)\n\n if mode is not None:\n if not pipeline_def.has_mode_definition(mode):\n raise DagsterInvariantViolationError(\n (\n "You have attempted to execute pipeline {name} with mode {mode}. "\n "Available modes: {modes}"\n ).format(\n name=pipeline_def.name,\n mode=mode,\n modes=pipeline_def.available_modes,\n )\n )\n else:\n if pipeline_def.is_multi_mode:\n raise DagsterInvariantViolationError(\n (\n "Pipeline {name} has multiple modes (Available modes: {modes}) and you have "\n "attempted to execute it without specifying a mode. Set "\n "mode property on the PipelineRun object."\n ).format(name=pipeline_def.name, modes=pipeline_def.available_modes)\n )\n mode = pipeline_def.get_default_mode_name()\n\n tags = merge_dicts(pipeline_def.tags, tags)\n\n # generate pipeline subset from the given solid_selection\n if solid_selection:\n pipeline = pipeline.subset_for_execution(solid_selection)\n\n return (\n pipeline,\n run_config,\n mode,\n tags,\n pipeline.solids_to_execute,\n solid_selection,\n )\n\n\ndef _resolve_reexecute_step_selection(\n instance: DagsterInstance,\n pipeline: IPipeline,\n mode: Optional[str],\n run_config: Optional[dict],\n parent_pipeline_run: PipelineRun,\n step_selection: List[str],\n) -> ExecutionPlan:\n if parent_pipeline_run.solid_selection:\n pipeline = pipeline.subset_for_execution(parent_pipeline_run.solid_selection, None)\n\n parent_logs = instance.all_logs(parent_pipeline_run.run_id)\n parent_plan = create_execution_plan(\n pipeline,\n parent_pipeline_run.run_config,\n mode,\n known_state=KnownExecutionState.derive_from_logs(parent_logs),\n )\n step_keys_to_execute = parse_step_selection(parent_plan.get_all_step_deps(), step_selection)\n execution_plan = create_execution_plan(\n pipeline,\n run_config,\n mode,\n step_keys_to_execute=list(step_keys_to_execute),\n known_state=KnownExecutionState.for_reexecution(parent_logs, step_keys_to_execute),\n tags=parent_pipeline_run.tags,\n )\n return execution_plan\n
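``create_execution_plan`` accepts either an ``IPipeline`` or a bare ``PipelineDefinition`` (which ``_check_pipeline`` wraps in an ``InMemoryPipeline``). A small sketch, with an invented pipeline, of building a plan and inspecting its step keys:

.. code-block:: python

    from dagster import pipeline, solid
    from dagster.core.execution.api import create_execution_plan

    @solid
    def my_solid(_):
        return 1

    @pipeline
    def my_pipeline():
        my_solid()

    plan = create_execution_plan(my_pipeline)
    # get_all_step_deps maps each step key to the step keys it depends on.
    print(list(plan.get_all_step_deps().keys()))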
", "current_page_name": "_modules/dagster/core/execution/api", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.execution.api"}, "build_resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.execution.build_resources

\nfrom contextlib import contextmanager\nfrom typing import Any, Dict, Generator, Optional, cast\n\nimport dagster._check as check\nfrom dagster.config.validate import process_config\nfrom dagster.core.definitions.resource_definition import (\n    ResourceDefinition,\n    Resources,\n    ScopedResourcesBuilder,\n)\nfrom dagster.core.definitions.run_config import define_resource_dictionary_cls\nfrom dagster.core.errors import DagsterInvalidConfigError\nfrom dagster.core.execution.resources_init import resource_initialization_manager\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.log_manager import DagsterLogManager\nfrom dagster.core.storage.io_manager import IOManager, IOManagerDefinition\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.core.system_config.objects import ResourceConfig, config_map_resources\n\nfrom .api import ephemeral_instance_if_missing\nfrom .context_creation_pipeline import initialize_console_manager\n\n\ndef _get_mapped_resource_config(\n    resource_defs: Dict[str, ResourceDefinition], resource_config: Dict[str, Any]\n) -> Dict[str, ResourceConfig]:\n    resource_config_schema = define_resource_dictionary_cls(\n        resource_defs, set(resource_defs.keys())\n    )\n    config_evr = process_config(resource_config_schema, resource_config)\n    if not config_evr.success:\n        raise DagsterInvalidConfigError(\n            "Error in config for resources ",\n            config_evr.errors,\n            resource_config,\n        )\n    config_value = cast(Dict[str, Any], config_evr.value)\n    return config_map_resources(resource_defs, config_value)\n\n\n
[docs]@contextmanager\ndef build_resources(\n resources: Dict[str, Any],\n instance: Optional[DagsterInstance] = None,\n resource_config: Optional[Dict[str, Any]] = None,\n pipeline_run: Optional[PipelineRun] = None,\n log_manager: Optional[DagsterLogManager] = None,\n) -> Generator[Resources, None, None]:\n """Context manager that yields resources using provided resource definitions and run config.\n\n This API allows for using resources in an independent context. Resources will be initialized\n with the provided run config, and optionally, pipeline_run. The resulting resources will be\n yielded on a dictionary keyed identically to that provided for `resource_defs`. Upon exiting the\n context, resources will also be torn down safely.\n\n Args:\n resources (Dict[str, Any]): Resource instances or definitions to build. All\n required resource dependencies to a given resource must be contained within this\n dictionary, or the resource build will fail.\n instance (Optional[DagsterInstance]): The dagster instance configured to instantiate\n resources on.\n resource_config (Optional[Dict[str, Any]]): A dict representing the config to be\n provided to each resource during initialization and teardown.\n pipeline_run (Optional[PipelineRun]): The pipeline run to provide during resource\n initialization and teardown. If the provided resources require either the `pipeline_run`\n or `run_id` attributes of the provided context during resource initialization and/or\n teardown, this must be provided, or initialization will fail.\n log_manager (Optional[DagsterLogManager]): Log Manager to use during resource\n initialization. Defaults to system log manager.\n\n Examples:\n\n .. code-block:: python\n\n from dagster import resource, build_resources\n\n @resource\n def the_resource():\n return "foo"\n\n with build_resources(resources={"from_def": the_resource, "from_val": "bar"}) as resources:\n assert resources.from_def == "foo"\n assert resources.from_val == "bar"\n\n """\n\n resources = check.dict_param(resources, "resource_defs", key_type=str)\n instance = check.opt_inst_param(instance, "instance", DagsterInstance)\n resource_config = check.opt_dict_param(resource_config, "resource_config", key_type=str)\n log_manager = check.opt_inst_param(log_manager, "log_manager", DagsterLogManager)\n resource_defs = wrap_resources_for_execution(resources)\n mapped_resource_config = _get_mapped_resource_config(resource_defs, resource_config)\n\n with ephemeral_instance_if_missing(instance) as dagster_instance:\n resources_manager = resource_initialization_manager(\n resource_defs=resource_defs,\n resource_configs=mapped_resource_config,\n log_manager=log_manager if log_manager else initialize_console_manager(pipeline_run),\n execution_plan=None,\n pipeline_run=pipeline_run,\n resource_keys_to_init=set(resource_defs.keys()),\n instance=dagster_instance,\n emit_persistent_events=False,\n pipeline_def_for_backwards_compat=None,\n )\n try:\n list(resources_manager.generate_setup_events())\n instantiated_resources = check.inst(\n resources_manager.get_object(), ScopedResourcesBuilder\n )\n yield instantiated_resources.build(\n set(instantiated_resources.resource_instance_dict.keys())\n )\n finally:\n list(resources_manager.generate_teardown_events())
\n\n\ndef wrap_resources_for_execution(\n resources: Optional[Dict[str, Any]] = None\n) -> Dict[str, ResourceDefinition]:\n resources = check.opt_dict_param(resources, "resources", key_type=str)\n resource_defs = {}\n # Wrap instantiated resource values in a resource definition.\n # If an instantiated IO manager is provided, wrap it in an IO manager definition.\n for resource_key, resource in resources.items():\n if isinstance(resource, ResourceDefinition):\n resource_defs[resource_key] = resource\n elif isinstance(resource, IOManager):\n resource_defs[resource_key] = IOManagerDefinition.hardcoded_io_manager(resource)\n else:\n resource_defs[resource_key] = ResourceDefinition.hardcoded_resource(resource)\n\n return resource_defs\n
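``wrap_resources_for_execution`` is what lets ``build_resources`` accept raw values and instantiated IO managers alongside ``ResourceDefinition`` objects. A sketch of the wrapping behavior, with an invented IO manager class and a placeholder token value:

.. code-block:: python

    from dagster import IOManager, IOManagerDefinition, ResourceDefinition
    from dagster.core.execution.build_resources import wrap_resources_for_execution

    class MyIOManager(IOManager):
        def handle_output(self, context, obj):
            pass

        def load_input(self, context):
            return None

    defs = wrap_resources_for_execution(
        {"io_manager": MyIOManager(), "api_token": "not-a-real-token"}
    )
    assert isinstance(defs["io_manager"], IOManagerDefinition)
    assert isinstance(defs["api_token"], ResourceDefinition)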
", "current_page_name": "_modules/dagster/core/execution/build_resources", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.execution.build_resources"}, "context": {"compute": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.execution.context.compute

\nfrom abc import ABC, abstractmethod\nfrom typing import AbstractSet, Any, Dict, Iterator, List, Mapping, Optional, cast\n\nimport dagster._check as check\nfrom dagster.core.definitions.dependency import Node, NodeHandle\nfrom dagster.core.definitions.events import (\n    AssetKey,\n    AssetMaterialization,\n    AssetObservation,\n    ExpectationResult,\n    Materialization,\n    UserEvent,\n)\nfrom dagster.core.definitions.job_definition import JobDefinition\nfrom dagster.core.definitions.mode import ModeDefinition\nfrom dagster.core.definitions.op_definition import OpDefinition\nfrom dagster.core.definitions.pipeline_definition import PipelineDefinition\nfrom dagster.core.definitions.solid_definition import SolidDefinition\nfrom dagster.core.definitions.step_launcher import StepLauncher\nfrom dagster.core.definitions.time_window_partitions import TimeWindow\nfrom dagster.core.errors import DagsterInvalidPropertyError\nfrom dagster.core.events import DagsterEvent\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.log_manager import DagsterLogManager\nfrom dagster.core.storage.pipeline_run import DagsterRun, PipelineRun\nfrom dagster.utils.forked_pdb import ForkedPdb\n\nfrom .system import StepExecutionContext\n\n\nclass AbstractComputeExecutionContext(ABC):  # pylint: disable=no-init\n    """Base class for solid context implemented by SolidExecutionContext and DagstermillExecutionContext"""\n\n    @abstractmethod\n    def has_tag(self, key) -> bool:\n        """Implement this method to check if a logging tag is set."""\n\n    @abstractmethod\n    def get_tag(self, key: str) -> Optional[str]:\n        """Implement this method to get a logging tag."""\n\n    @property\n    @abstractmethod\n    def run_id(self) -> str:\n        """The run id for the context."""\n\n    @property\n    @abstractmethod\n    def solid_def(self) -> SolidDefinition:\n        """The solid definition corresponding to the execution step being executed."""\n\n    @property\n    @abstractmethod\n    def solid(self) -> Node:\n        """The solid corresponding to the execution step being executed."""\n\n    @property\n    @abstractmethod\n    def pipeline_def(self) -> PipelineDefinition:\n        """The pipeline being executed."""\n\n    @property\n    @abstractmethod\n    def pipeline_run(self) -> PipelineRun:\n        """The PipelineRun object corresponding to the execution."""\n\n    @property\n    @abstractmethod\n    def resources(self) -> Any:\n        """Resources available in the execution context."""\n\n    @property\n    @abstractmethod\n    def log(self) -> DagsterLogManager:\n        """The log manager available in the execution context."""\n\n    @property\n    @abstractmethod\n    def solid_config(self) -> Any:\n        """The parsed config specific to this solid."""\n\n    @property\n    def op_config(self) -> Any:\n        return self.solid_config\n\n\n
[docs]class SolidExecutionContext(AbstractComputeExecutionContext):\n """The ``context`` object that can be made available as the first argument to a solid's compute\n function.\n\n The context object provides system information such as resources, config, and logging to a\n solid's compute function. Users should not instantiate this object directly.\n\n Example:\n\n .. code-block:: python\n\n @solid\n def hello_world(context: SolidExecutionContext):\n context.log.info("Hello, world!")\n\n """\n\n __slots__ = ["_step_execution_context"]\n\n def __init__(self, step_execution_context: StepExecutionContext):\n self._step_execution_context = check.inst_param(\n step_execution_context,\n "step_execution_context",\n StepExecutionContext,\n )\n self._pdb: Optional[ForkedPdb] = None\n self._events: List[DagsterEvent] = []\n self._output_metadata: Dict[str, Any] = {}\n\n @property\n def solid_config(self) -> Any:\n return self._step_execution_context.op_config\n\n @property\n def op_config(self) -> Any:\n return self.solid_config\n\n @property\n def pipeline_run(self) -> PipelineRun:\n """PipelineRun: The current pipeline run"""\n return self._step_execution_context.pipeline_run\n\n @property\n def run(self) -> DagsterRun:\n """DagsterRun: The current run"""\n return cast(DagsterRun, self.pipeline_run)\n\n @property\n def instance(self) -> DagsterInstance:\n """DagsterInstance: The current Dagster instance"""\n return self._step_execution_context.instance\n\n @property\n def pdb(self) -> ForkedPdb:\n """dagster.utils.forked_pdb.ForkedPdb: Gives access to pdb debugging from within the op.\n\n Example:\n\n .. code-block:: python\n\n @op\n def debug(context):\n context.pdb.set_trace()\n\n """\n if self._pdb is None:\n self._pdb = ForkedPdb()\n\n return self._pdb\n\n @property\n def file_manager(self):\n """Deprecated access to the file manager.\n\n :meta private:\n """\n raise DagsterInvalidPropertyError(\n "You have attempted to access the file manager which has been moved to resources in 0.10.0. "\n "Please access it via `context.resources.file_manager` instead."\n )\n\n @property\n def resources(self) -> Any:\n """Resources: The currently available resources."""\n return self._step_execution_context.resources\n\n @property\n def step_launcher(self) -> Optional[StepLauncher]:\n """Optional[StepLauncher]: The current step launcher, if any."""\n return self._step_execution_context.step_launcher\n\n @property\n def run_id(self) -> str:\n """str: The id of the current execution's run."""\n return self._step_execution_context.run_id\n\n @property\n def run_config(self) -> Mapping[str, object]:\n """dict: The run config for the current execution."""\n return self._step_execution_context.run_config\n\n @property\n def pipeline_def(self) -> PipelineDefinition:\n """PipelineDefinition: The currently executing pipeline."""\n return self._step_execution_context.pipeline_def\n\n @property\n def job_def(self) -> JobDefinition:\n """JobDefinition: The currently executing job."""\n return cast(\n JobDefinition,\n check.inst(\n self.pipeline_def,\n JobDefinition,\n "Accessing job_def inside a legacy pipeline. 
Use pipeline_def instead.",\n ),\n )\n\n @property\n def pipeline_name(self) -> str:\n """str: The name of the currently executing pipeline."""\n return self._step_execution_context.pipeline_name\n\n @property\n def job_name(self) -> str:\n """str: The name of the currently executing job."""\n return self.pipeline_name\n\n @property\n def mode_def(self) -> ModeDefinition:\n """ModeDefinition: The mode of the current execution."""\n return self._step_execution_context.mode_def\n\n @property\n def log(self) -> DagsterLogManager:\n """DagsterLogManager: The log manager available in the execution context."""\n return self._step_execution_context.log\n\n @property\n def solid_handle(self) -> NodeHandle:\n """NodeHandle: The current solid's handle.\n\n :meta private:\n """\n return self._step_execution_context.solid_handle\n\n @property\n def op_handle(self) -> NodeHandle:\n """NodeHandle: The current op's handle.\n\n :meta private:\n """\n return self.solid_handle\n\n @property\n def solid(self) -> Node:\n """Solid: The current solid object.\n\n :meta private:\n\n """\n return self._step_execution_context.pipeline_def.get_solid(self.solid_handle)\n\n @property\n def op(self) -> Node:\n """Solid: The current op object.\n\n :meta private:\n\n """\n return self.solid\n\n @property\n def solid_def(self) -> SolidDefinition:\n """SolidDefinition: The current solid definition."""\n return self._step_execution_context.pipeline_def.get_solid(self.solid_handle).definition\n\n @property\n def op_def(self) -> OpDefinition:\n """OpDefinition: The current op definition."""\n return cast(\n OpDefinition,\n check.inst(\n self.solid_def,\n OpDefinition,\n "Called op_def on a legacy solid. Use solid_def instead.",\n ),\n )\n\n @property\n def has_partition_key(self) -> bool:\n """Whether the current run is a partitioned run"""\n return self._step_execution_context.has_partition_key\n\n @property\n def partition_key(self) -> str:\n """The partition key for the current run.\n\n Raises an error if the current run is not a partitioned run.\n """\n return self._step_execution_context.partition_key\n\n @property\n def partition_time_window(self) -> str:\n """The partition time window for the current run.\n\n Raises an error if the current run is not a partitioned run, or if the job's partition\n definition is not a TimeWindowPartitionsDefinition.\n """\n return self._step_execution_context.partition_time_window\n\n @property\n def selected_asset_keys(self) -> AbstractSet[AssetKey]:\n return self.job_def.asset_layer.asset_keys_for_node(self.solid_handle)\n\n @property\n def selected_output_names(self) -> AbstractSet[str]:\n # map selected asset keys to the output names they correspond to\n selected_asset_keys = self.selected_asset_keys\n selected_outputs = set()\n for output_name in self.op.output_dict.keys():\n asset_info = self.job_def.asset_layer.asset_info_for_output(\n self.solid_handle, output_name\n )\n if asset_info and asset_info.key in selected_asset_keys:\n selected_outputs.add(output_name)\n return selected_outputs\n\n def asset_key_for_output(self, output_name: str = "result") -> AssetKey:\n asset_output_info = self.pipeline_def.asset_layer.asset_info_for_output(\n node_handle=self.op_handle, output_name=output_name\n )\n if asset_output_info is None:\n check.failed(f"Output '{output_name}' has no asset")\n else:\n return asset_output_info.key\n\n
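As a quick orientation for the properties above, here is a minimal sketch of an op that reads its config, run id, and log manager from the context; it assumes ``build_op_context`` accepts an ``op_config`` keyword for direct invocation in a test.

.. code-block:: python

    from dagster import build_op_context, op

    @op(config_schema={"greeting": str})
    def greet(context):
        # op_config, run_id, and log are all supplied by the execution context
        context.log.info(f"run {context.run_id}: {context.op_config['greeting']}")
        return context.op_config["greeting"]

    # Direct invocation with a built context, outside of any job
    assert greet(build_op_context(op_config={"greeting": "hello"})) == "hello"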
[docs] def output_asset_partition_key(self, output_name: str = "result") -> str:\n """Returns the asset partition key for the given output. Defaults to "result", which is the\n name of the default output.\n """\n return self._step_execution_context.asset_partition_key_for_output(output_name)
\n\n
[docs] def output_asset_partitions_time_window(self, output_name: str = "result") -> TimeWindow:\n """The time window for the partitions of the output asset.\n\n Raises an error if either of the following are true:\n - The output asset has no partitioning.\n - The output asset is not partitioned with a TimeWindowPartitionsDefinition.\n """\n return self._step_execution_context.asset_partitions_time_window_for_output(output_name)
\n\n
[docs] def has_tag(self, key: str) -> bool:\n """Check if a logging tag is set.\n\n Args:\n key (str): The tag to check.\n\n Returns:\n bool: Whether the tag is set.\n """\n return self._step_execution_context.has_tag(key)
\n\n
[docs] def get_tag(self, key: str) -> Optional[str]:\n """Get a logging tag.\n\n Args:\n key (str): The tag to get.\n\n Returns:\n Optional[str]: The value of the tag, if present.\n """\n return self._step_execution_context.get_tag(key)
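``has_tag`` and ``get_tag`` read tags from the current run, so they are typically exercised inside a job execution. A minimal sketch, assuming tags declared on the job are carried onto its runs:

.. code-block:: python

    from dagster import job, op

    @op
    def read_team_tag(context):
        # Guard with has_tag so the op also works on runs without the tag
        team = context.get_tag("team") if context.has_tag("team") else "unknown"
        context.log.info(f"running for team: {team}")

    @job(tags={"team": "data-eng"})
    def tagged_job():
        read_team_tag()

    assert tagged_job.execute_in_process().success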
\n\n def has_events(self) -> bool:\n return bool(self._events)\n\n
[docs] def consume_events(self) -> Iterator[DagsterEvent]:\n """Pops and yields all user-generated events that have been recorded from this context.\n\n If consume_events has not yet been called, this will yield all logged events since the beginning of the op's computation. If consume_events has been called, it will yield all events since the last time consume_events was called. Designed for internal use. Users should never need to invoke this method.\n """\n events = self._events\n self._events = []\n yield from events
\n\n
[docs] def log_event(self, event: UserEvent) -> None:\n """Log an AssetMaterialization, AssetObservation, or ExpectationResult from within the body of an op.\n\n Events logged with this method will appear in the list of DagsterEvents, as well as the event log.\n\n Args:\n event (Union[AssetMaterialization, Materialization, AssetObservation, ExpectationResult]): The event to log.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import op, AssetMaterialization\n\n @op\n def log_materialization(context):\n context.log_event(AssetMaterialization("foo"))\n """\n\n if isinstance(event, (AssetMaterialization, Materialization)):\n self._events.append(\n DagsterEvent.asset_materialization(\n self._step_execution_context,\n event,\n self._step_execution_context.get_input_lineage(),\n )\n )\n elif isinstance(event, AssetObservation):\n self._events.append(DagsterEvent.asset_observation(self._step_execution_context, event))\n elif isinstance(event, ExpectationResult):\n self._events.append(\n DagsterEvent.step_expectation_result(self._step_execution_context, event)\n )\n else:\n check.failed("Unexpected event {event}".format(event=event))
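Beyond materializations, the same method logs observations and expectation results. A short sketch; the table name and row count here are placeholders:

.. code-block:: python

    from dagster import AssetObservation, ExpectationResult, op

    @op
    def validate_table(context):
        row_count = 42  # placeholder for a real computation
        context.log_event(
            ExpectationResult(success=row_count > 0, description="table is non-empty")
        )
        context.log_event(
            AssetObservation(asset_key="my_table", metadata={"row_count": row_count})
        )
        return row_count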
\n\n
[docs] def add_output_metadata(\n self,\n metadata: Mapping[str, Any],\n output_name: Optional[str] = None,\n mapping_key: Optional[str] = None,\n ) -> None:\n """Add metadata to one of the outputs of an op.\n\n This can only be used once per output in the body of an op. Using this method with the same output_name more than once within an op will result in an error.\n\n Args:\n metadata (Mapping[str, Any]): The metadata to attach to the output\n output_name (Optional[str]): The name of the output to attach metadata to. If there is only one output on the op, then this argument does not need to be provided. The metadata will automatically be attached to the only output.\n mapping_key (Optional[str]): The mapping key of the output to attach the metadata to, for use when the output is dynamic.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import Out, op\n from typing import Tuple\n\n @op\n def add_metadata(context):\n context.add_output_metadata({"foo": "bar"})\n return 5 # Since the default output is called "result", metadata will be attached to the output "result".\n\n @op(out={"a": Out(), "b": Out()})\n def add_metadata_two_outputs(context) -> Tuple[str, int]:\n context.add_output_metadata({"foo": "bar"}, output_name="b")\n context.add_output_metadata({"baz": "bat"}, output_name="a")\n\n return ("dog", 5)\n\n """\n metadata = check.dict_param(metadata, "metadata", key_type=str)\n output_name = check.opt_str_param(output_name, "output_name")\n mapping_key = check.opt_str_param(mapping_key, "mapping_key")\n\n self._step_execution_context.add_output_metadata(\n metadata=metadata, output_name=output_name, mapping_key=mapping_key\n )
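The ``mapping_key`` argument scopes metadata to a single entry of a dynamic output. A sketch, assuming the metadata for a mapping key must be added before the corresponding ``DynamicOutput`` is yielded:

.. code-block:: python

    from dagster import DynamicOut, DynamicOutput, op

    @op(out=DynamicOut())
    def fan_out(context):
        for i in range(3):
            # Attach metadata to this specific dynamic output entry
            context.add_output_metadata({"index": i}, mapping_key=str(i))
            yield DynamicOutput(i, mapping_key=str(i))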
\n\n def get_output_metadata(\n self, output_name: str, mapping_key: Optional[str] = None\n ) -> Optional[Mapping[str, Any]]:\n return self._step_execution_context.get_output_metadata(\n output_name=output_name, mapping_key=mapping_key\n )\n\n def get_step_execution_context(self) -> StepExecutionContext:\n """Allows advanced users (e.g. framework authors) to punch through to the underlying\n step execution context.\n\n :meta private:\n\n Returns:\n StepExecutionContext: The underlying system context.\n """\n return self._step_execution_context\n\n @property\n def retry_number(self) -> int:\n """\n Which retry attempt is currently executing i.e. 0 for initial attempt, 1 for first retry, etc.\n """\n\n return self._step_execution_context.previous_attempt_count\n\n def describe_op(self):\n return self._step_execution_context.describe_op()\n\n
[docs] def get_mapping_key(self) -> Optional[str]:\n """\n Which mapping_key this execution is for if downstream of a DynamicOutput, otherwise None.\n """\n return self._step_execution_context.step.get_mapping_key()
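``get_mapping_key`` is only meaningful for ops that run downstream of a ``DynamicOutput``. A minimal sketch using dynamic mapping:

.. code-block:: python

    from dagster import DynamicOut, DynamicOutput, job, op

    @op(out=DynamicOut())
    def emit_chunks():
        for name in ["a", "b"]:
            yield DynamicOutput(name, mapping_key=name)

    @op
    def process_chunk(context, chunk):
        # Returns the mapping key when downstream of a DynamicOutput, otherwise None
        context.log.info(f"mapping key {context.get_mapping_key()}: {chunk}")

    @job
    def dynamic_job():
        emit_chunks().map(process_chunk)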
\n\n\n
[docs]class OpExecutionContext(SolidExecutionContext):\n pass
\n
", "current_page_name": "_modules/dagster/core/execution/context/compute", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.execution.context.compute"}, "hook": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.execution.context.hook

\nimport warnings\nfrom typing import AbstractSet, Any, Dict, Optional, Set, Union\n\nimport dagster._check as check\n\nfrom ...definitions.composition import PendingNodeInvocation\nfrom ...definitions.decorators.graph_decorator import graph\nfrom ...definitions.dependency import Node\nfrom ...definitions.hook_definition import HookDefinition\nfrom ...definitions.mode import ModeDefinition\nfrom ...definitions.op_definition import OpDefinition\nfrom ...definitions.resource_definition import IContainsGenerator, Resources\nfrom ...definitions.solid_definition import SolidDefinition\nfrom ...errors import DagsterInvalidPropertyError, DagsterInvariantViolationError\nfrom ...log_manager import DagsterLogManager\nfrom ..plan.step import ExecutionStep\nfrom .system import StepExecutionContext\n\n\ndef _property_msg(prop_name: str, method_name: str) -> str:\n    return (\n        f"The {prop_name} {method_name} is not set when a `HookContext` is constructed from "\n        "`build_hook_context`."\n    )\n\n\ndef _check_property_on_test_context(\n    context: "HookContext", attr_str: str, user_facing_name: str, param_on_builder: str\n):\n    """Check if attribute is not None on context. If none, error, and point user in direction of\n    how to specify the parameter on the context object."""\n\n    value = getattr(context, attr_str)\n    if value is None:\n        raise DagsterInvalidPropertyError(\n            f"Attribute '{user_facing_name}' was not provided when "\n            f"constructing context. Provide a value for the '{param_on_builder}' parameter on "\n            "'build_hook_context'. To learn more, check out the testing hooks section of Dagster's "\n            "concepts docs: https://docs.dagster.io/concepts/ops-jobs-graphs/op-hooks#testing-hooks"\n        )\n    else:\n        return value\n\n\n
[docs]class HookContext:\n """The ``context`` object available to a hook function on an DagsterEvent.\n\n Attributes:\n log (DagsterLogManager): Centralized log dispatch from user code.\n hook_def (HookDefinition): The hook that the context object belongs to.\n solid (Solid): The solid instance associated with the hook.\n op (Op): The op instance associated with the hook.\n step_key (str): The key for the step where this hook is being triggered.\n required_resource_keys (Set[str]): Resources required by this hook.\n resources (Resources): Resources available in the hook context.\n solid_config (Any): The parsed config specific to this solid.\n op_config (Any): The parsed config specific to this op.\n pipeline_name (str): The name of the pipeline where this hook is being triggered.\n job_name (str): The name of the job where this hook is being triggered.\n run_id (str): The id of the run where this hook is being triggered.\n mode_def (ModeDefinition): The mode with which the pipeline is being run.\n op_exception (Optional[BaseException]): The thrown exception in a failed op.\n op_output_values (Dict): Computed output values in an op.\n """\n\n def __init__(\n self,\n step_execution_context: StepExecutionContext,\n hook_def: HookDefinition,\n ):\n self._step_execution_context = step_execution_context\n self._hook_def = check.inst_param(hook_def, "hook_def", HookDefinition)\n self._required_resource_keys = hook_def.required_resource_keys\n self._resources = step_execution_context.scoped_resources_builder.build(\n self._required_resource_keys\n )\n\n @property\n def pipeline_name(self) -> str:\n return self.job_name\n\n @property\n def job_name(self) -> str:\n return self._step_execution_context.job_name\n\n @property\n def run_id(self) -> str:\n return self._step_execution_context.run_id\n\n @property\n def hook_def(self) -> HookDefinition:\n return self._hook_def\n\n @property\n def solid(self) -> Node:\n return self.op\n\n @property\n def op(self) -> Node:\n return self._step_execution_context.solid\n\n @property\n def step(self) -> ExecutionStep:\n warnings.warn(\n "The step property of HookContext has been deprecated, and will be removed "\n "in a future release."\n )\n return self._step_execution_context.step\n\n @property\n def step_key(self) -> str:\n return self._step_execution_context.step.key\n\n @property\n def mode_def(self) -> Optional[ModeDefinition]:\n return self._step_execution_context.mode_def\n\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n return self._required_resource_keys\n\n @property\n def resources(self) -> "Resources":\n return self._resources\n\n @property\n def solid_config(self) -> Any:\n solid_config = self._step_execution_context.resolved_run_config.solids.get(\n str(self._step_execution_context.step.solid_handle)\n )\n return solid_config.config if solid_config else None\n\n @property\n def op_config(self) -> Any:\n return self.solid_config\n\n # Because of the fact that we directly use the log manager of the step, if a user calls\n # hook_context.log.with_tags, then they will end up mutating the step's logging tags as well.\n # This is not problematic because the hook only runs after the step has been completed.\n @property\n def log(self) -> DagsterLogManager:\n return self._step_execution_context.log\n\n @property\n def solid_exception(self) -> Optional[BaseException]:\n """The thrown exception in a failed solid.\n\n Returns:\n Optional[BaseException]: the exception object, None if the solid execution succeeds.\n """\n return 
self.op_exception\n\n @property\n def op_exception(self):\n return self._step_execution_context.step_exception\n\n @property\n def solid_output_values(self) -> Dict[str, Union[Any, Dict[str, Any]]]:\n """The computed output values.\n\n Returns a dictionary where keys are output names and the values are:\n * the output values in the normal case\n * a dictionary from mapping key to corresponding value in the mapped case\n """\n results: Dict[str, Union[Any, Dict[str, Any]]] = {}\n captured = self._step_execution_context.step_output_capture\n\n if captured is None:\n check.failed("Outputs were unexpectedly not captured for hook")\n\n # make the returned values more user-friendly\n for step_output_handle, value in captured.items():\n if step_output_handle.mapping_key:\n if results.get(step_output_handle.output_name) is None:\n results[step_output_handle.output_name] = {\n step_output_handle.mapping_key: value\n }\n else:\n results[step_output_handle.output_name][step_output_handle.mapping_key] = value\n else:\n results[step_output_handle.output_name] = value\n\n return results\n\n @property\n def op_output_values(self):\n return self.solid_output_values
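A typical use of these attributes is a pair of hooks that report on an op after it runs. In the sketch below, the ``slack`` resource and its ``send`` method are hypothetical stand-ins for a real notification client:

.. code-block:: python

    from dagster import HookContext, failure_hook, success_hook

    @success_hook(required_resource_keys={"slack"})
    def notify_on_success(context: HookContext):
        # "slack" is a hypothetical resource; op_output_values maps output names to values
        context.resources.slack.send(f"{context.op.name} succeeded: {context.op_output_values}")

    @failure_hook
    def log_on_failure(context: HookContext):
        context.log.error(f"{context.op.name} failed: {context.op_exception}")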
\n\n\nclass UnboundHookContext(HookContext):\n def __init__(\n self,\n resources: Dict[str, Any],\n mode_def: Optional[ModeDefinition],\n op: Optional[Union[SolidDefinition, PendingNodeInvocation]],\n run_id: Optional[str],\n job_name: Optional[str],\n op_exception: Optional[Exception],\n ): # pylint: disable=super-init-not-called\n from ..build_resources import build_resources\n from ..context_creation_pipeline import initialize_console_manager\n\n self._mode_def = mode_def\n\n self._op = None\n if op is not None:\n\n @graph(name="hook_context_container")\n def temp_graph():\n op()\n\n self._op = temp_graph.solids[0]\n\n # Open resource context manager\n self._resources_cm = build_resources(resources)\n self._resources = self._resources_cm.__enter__() # pylint: disable=no-member\n self._resources_contain_cm = isinstance(self._resources, IContainsGenerator)\n\n self._run_id = run_id\n self._job_name = job_name\n self._op_exception = op_exception\n\n self._log = initialize_console_manager(None)\n\n self._cm_scope_entered = False\n\n def __enter__(self):\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc):\n self._resources_cm.__exit__(*exc) # pylint: disable=no-member\n\n def __del__(self):\n if self._resources_contain_cm and not self._cm_scope_entered:\n self._resources_cm.__exit__(None, None, None) # pylint: disable=no-member\n\n @property\n def job_name(self) -> str:\n return self.pipeline_name\n\n @property\n def run_id(self) -> str:\n return _check_property_on_test_context(\n self, attr_str="_run_id", user_facing_name="run_id", param_on_builder="run_id"\n )\n\n @property\n def hook_def(self) -> HookDefinition:\n raise DagsterInvalidPropertyError(_property_msg("hook_def", "property"))\n\n @property\n def op(self) -> Node:\n return _check_property_on_test_context(\n self, attr_str="_op", user_facing_name="op", param_on_builder="op"\n )\n\n @property\n def step(self) -> ExecutionStep:\n raise DagsterInvalidPropertyError(_property_msg("step", "property"))\n\n @property\n def step_key(self) -> str:\n raise DagsterInvalidPropertyError(_property_msg("step_key", "property"))\n\n @property\n def mode_def(self) -> Optional[ModeDefinition]:\n return self._mode_def\n\n @property\n def required_resource_keys(self) -> Set[str]:\n raise DagsterInvalidPropertyError(_property_msg("hook_def", "property"))\n\n @property\n def resources(self) -> "Resources":\n if self._resources_contain_cm and not self._cm_scope_entered:\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access "\n "resources outside of context manager scope. You can use the following syntax to "\n "open a context manager: `with build_hook_context(...) 
as context:`"\n )\n return self._resources\n\n @property\n def solid_config(self) -> Any:\n raise DagsterInvalidPropertyError(_property_msg("solid_config", "property"))\n\n @property\n def log(self) -> DagsterLogManager:\n return self._log\n\n @property\n def op_exception(self) -> Optional[BaseException]:\n return self._op_exception\n\n @property\n def solid_output_values(self) -> Dict[str, Union[Any, Dict[str, Any]]]:\n """The computed output values.\n\n Returns a dictionary where keys are output names and the values are:\n * the output values in the normal case\n * a dictionary from mapping key to corresponding value in the mapped case\n """\n raise DagsterInvalidPropertyError(_property_msg("solid_output_values", "method"))\n\n\nclass BoundHookContext(HookContext):\n def __init__(\n self,\n hook_def: HookDefinition,\n resources: Resources,\n op: Optional[Node],\n mode_def: Optional[ModeDefinition],\n log_manager: DagsterLogManager,\n run_id: Optional[str],\n job_name: Optional[str],\n op_exception: Optional[Exception],\n ): # pylint: disable=super-init-not-called\n self._hook_def = hook_def\n self._resources = resources\n self._op = op\n self._mode_def = mode_def\n self._log_manager = log_manager\n self._run_id = run_id\n self._job_name = job_name\n self._op_exception = op_exception\n\n @property\n def job_name(self) -> str:\n return _check_property_on_test_context(\n self, attr_str="_job_name", user_facing_name="job_name", param_on_builder="job_name"\n )\n\n @property\n def run_id(self) -> str:\n return _check_property_on_test_context(\n self, attr_str="_run_id", user_facing_name="run_id", param_on_builder="run_id"\n )\n\n @property\n def hook_def(self) -> HookDefinition:\n return self._hook_def\n\n @property\n def op(self) -> Node:\n return _check_property_on_test_context(\n self, attr_str="_op", user_facing_name="op", param_on_builder="op"\n )\n\n @property\n def step(self) -> ExecutionStep:\n raise DagsterInvalidPropertyError(_property_msg("step", "property"))\n\n @property\n def step_key(self) -> str:\n raise DagsterInvalidPropertyError(_property_msg("step_key", "property"))\n\n @property\n def mode_def(self) -> Optional[ModeDefinition]:\n return self._mode_def\n\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n return self._hook_def.required_resource_keys\n\n @property\n def resources(self) -> "Resources":\n return self._resources\n\n @property\n def solid_config(self) -> Any:\n raise DagsterInvalidPropertyError(_property_msg("solid_config", "property"))\n\n @property\n def log(self) -> DagsterLogManager:\n return self._log_manager\n\n @property\n def op_exception(self):\n return self._op_exception\n\n @property\n def solid_output_values(self) -> Dict[str, Union[Any, Dict[str, Any]]]:\n """The computed output values.\n\n Returns a dictionary where keys are output names and the values are:\n * the output values in the normal case\n * a dictionary from mapping key to corresponding value in the mapped case\n """\n raise DagsterInvalidPropertyError(_property_msg("solid_output_values", "method"))\n\n\n
[docs]def build_hook_context(\n resources: Optional[Dict[str, Any]] = None,\n mode_def: Optional[ModeDefinition] = None,\n solid: Optional[Union[SolidDefinition, PendingNodeInvocation]] = None,\n op: Optional[Union[OpDefinition, PendingNodeInvocation]] = None,\n run_id: Optional[str] = None,\n job_name: Optional[str] = None,\n op_exception: Optional[Exception] = None,\n) -> UnboundHookContext:\n """Builds hook context from provided parameters.\n\n ``build_hook_context`` can be used as either a function or a context manager. If there is a\n provided resource to ``build_hook_context`` that is a context manager, then it must be used as a\n context manager. This function can be used to provide the context argument to the invocation of\n a hook definition.\n\n Args:\n resources (Optional[Dict[str, Any]]): The resources to provide to the context. These can\n either be values or resource definitions.\n mode_def (Optional[ModeDefinition]): The mode definition used with the context.\n op (Optional[OpDefinition, PendingNodeInvocation]): The op definition which the\n hook may be associated with.\n solid (Optional[SolidDefinition, PendingNodeInvocation]): (legacy) The solid definition which the\n hook may be associated with.\n run_id (Optional[str]): The id of the run in which the hook is invoked (provided for mocking purposes).\n job_name (Optional[str]): The name of the job in which the hook is used (provided for mocking purposes).\n op_exception (Optional[Exception]): The exception that caused the hook to be triggered.\n\n Examples:\n .. code-block:: python\n\n context = build_hook_context()\n hook_to_invoke(context)\n\n with build_hook_context(resources={"foo": context_manager_resource}) as context:\n hook_to_invoke(context)\n """\n check.invariant(not (solid and op), "cannot set both `solid` and `op` on `build_hook_context`.")\n\n op = check.opt_inst_param(op, "op", (OpDefinition, PendingNodeInvocation))\n solid = check.opt_inst_param(solid, "solid", (SolidDefinition, PendingNodeInvocation))\n op = op or solid\n\n return UnboundHookContext(\n resources=check.opt_dict_param(resources, "resources", key_type=str),\n mode_def=check.opt_inst_param(mode_def, "mode_def", ModeDefinition),\n op=op,\n run_id=check.opt_str_param(run_id, "run_id"),\n job_name=check.opt_str_param(job_name, "job_name"),\n op_exception=check.opt_inst_param(op_exception, "op_exception", Exception),\n )
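Hooks can be unit-tested by invoking them with a context built here; the sketch below supplies a synthetic exception to a failure hook:

.. code-block:: python

    from dagster import build_hook_context, failure_hook, op

    @failure_hook
    def alert_on_failure(context):
        context.log.error(f"{context.op.name} raised: {context.op_exception}")

    @op
    def my_op():
        ...

    # The built context carries the op and the exception provided for the test
    alert_on_failure(build_hook_context(op=my_op, op_exception=ValueError("boom")))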
\n
", "current_page_name": "_modules/dagster/core/execution/context/hook", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.execution.context.hook"}, "init": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.execution.context.init

\nfrom typing import Any, Dict, Optional, Union\n\nimport dagster._check as check\nfrom dagster.core.definitions.pipeline_definition import PipelineDefinition\nfrom dagster.core.definitions.resource_definition import (\n    IContainsGenerator,\n    ResourceDefinition,\n    Resources,\n)\nfrom dagster.core.errors import DagsterInvariantViolationError\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.log_manager import DagsterLogManager\nfrom dagster.core.storage.pipeline_run import PipelineRun\n\n\n
[docs]class InitResourceContext:\n """Resource-specific initialization context.\n\n Attributes:\n resource_config (Any): The configuration data provided by the run config. The schema\n for this data is defined by the ``config_field`` argument to\n :py:class:`ResourceDefinition`.\n resource_def (ResourceDefinition): The definition of the resource currently being\n constructed.\n log_manager (DagsterLogManager): The log manager for this run of the job or pipeline\n resources (ScopedResources): The resources that are available to the resource that we are\n initalizing.\n dagster_run (Optional[PipelineRun]): The dagster run to use. When initializing resources\n outside of execution context, this will be None.\n run_id (Optional[str]): The id for this run of the job or pipeline. When initializing resources\n outside of execution context, this will be None.\n pipeline_run (Optional[PipelineRun]): (legacy) The dagster run to use. When initializing resources\n outside of execution context, this will be None.\n\n """\n\n def __init__(\n self,\n resource_config: Any,\n resources: Resources,\n resource_def: Optional[ResourceDefinition] = None,\n instance: Optional[DagsterInstance] = None,\n dagster_run: Optional[PipelineRun] = None,\n pipeline_run: Optional[PipelineRun] = None,\n log_manager: Optional[DagsterLogManager] = None,\n pipeline_def_for_backwards_compat: Optional[PipelineDefinition] = None,\n ):\n\n if dagster_run and pipeline_run:\n raise DagsterInvariantViolationError(\n "Provided both ``dagster_run`` and ``pipeline_run`` to InitResourceContext "\n "initialization. Please provide one or the other."\n )\n self._resource_config = resource_config\n self._resource_def = resource_def\n self._log_manager = log_manager\n self._instance = instance\n self._resources = resources\n\n self._pipeline_def_for_backwards_compat = pipeline_def_for_backwards_compat\n self._dagster_run = dagster_run or pipeline_run\n\n @property\n def resource_config(self) -> Any:\n return self._resource_config\n\n @property\n def resource_def(self) -> Optional[ResourceDefinition]:\n return self._resource_def\n\n @property\n def resources(self) -> Resources:\n return self._resources\n\n @property\n def instance(self) -> Optional[DagsterInstance]:\n return self._instance\n\n @property\n def pipeline_def_for_backwards_compat(self) -> Optional[PipelineDefinition]:\n return self._pipeline_def_for_backwards_compat\n\n @property\n def dagster_run(self) -> Optional[PipelineRun]:\n return self._dagster_run\n\n @property\n def pipeline_run(self) -> Optional[PipelineRun]:\n return self.dagster_run\n\n @property\n def log(self) -> Optional[DagsterLogManager]:\n return self._log_manager\n\n # backcompat: keep around this property from when InitResourceContext used to be a NamedTuple\n @property\n def log_manager(self) -> Optional[DagsterLogManager]:\n return self._log_manager\n\n @property\n def run_id(self) -> Optional[str]:\n return self.pipeline_run.run_id if self.pipeline_run else None\n\n def replace_config(self, config: Any) -> "InitResourceContext":\n return InitResourceContext(\n resource_config=config,\n resources=self.resources,\n instance=self.instance,\n resource_def=self.resource_def,\n pipeline_run=self.pipeline_run,\n log_manager=self.log,\n )
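A resource function receives this context as its only argument; ``resource_config`` carries the validated config and ``log`` a run-scoped log manager. A minimal sketch, where the returned dict stands in for a real client object:

.. code-block:: python

    from dagster import resource

    @resource(config_schema={"base_url": str})
    def api_client(init_context):
        # resource_config has already been validated against config_schema
        init_context.log.info(f"connecting to {init_context.resource_config['base_url']}")
        return {"base_url": init_context.resource_config["base_url"]}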
\n\n\nclass UnboundInitResourceContext(InitResourceContext):\n """Resource initialization context outputted by ``build_init_resource_context``.\n\n Represents a context whose config has not yet been validated against a resource definition,\n hence the inability to access the `resource_def` attribute. When an instance of\n ``UnboundInitResourceContext`` is passed to a resource invocation, config is validated,\n and it is subsumed into an `InitResourceContext`, which contains the resource_def validated\n against.\n """\n\n def __init__(\n self,\n resource_config: Any,\n resources: Optional[Union[Resources, Dict[str, Any]]],\n instance: Optional[DagsterInstance],\n ):\n from dagster.core.execution.api import ephemeral_instance_if_missing\n from dagster.core.execution.build_resources import build_resources\n from dagster.core.execution.context_creation_pipeline import initialize_console_manager\n\n self._instance_provided = (\n check.opt_inst_param(instance, "instance", DagsterInstance) is not None\n )\n # Construct ephemeral instance if missing\n self._instance_cm = ephemeral_instance_if_missing(instance)\n # Pylint can't infer that the ephemeral_instance context manager has an __enter__ method,\n # so ignore lint error\n instance = self._instance_cm.__enter__() # pylint: disable=no-member\n\n # If we are provided with a Resources instance, then we do not need to initialize\n if isinstance(resources, Resources):\n self._resources_cm = None\n else:\n self._resources_cm = build_resources(\n check.opt_dict_param(resources, "resources", key_type=str), instance=instance\n )\n resources = self._resources_cm.__enter__() # pylint: disable=no-member\n self._resources_contain_cm = isinstance(resources, IContainsGenerator)\n\n self._cm_scope_entered = False\n super(UnboundInitResourceContext, self).__init__(\n resource_config=resource_config,\n resources=resources,\n resource_def=None,\n instance=instance,\n pipeline_run=None,\n log_manager=initialize_console_manager(None),\n pipeline_def_for_backwards_compat=None,\n )\n\n def __enter__(self):\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc):\n if self._resources_cm:\n self._resources_cm.__exit__(*exc) # pylint: disable=no-member\n if self._instance_provided:\n self._instance_cm.__exit__(*exc) # pylint: disable=no-member\n\n def __del__(self):\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n self._resources_cm.__exit__(None, None, None) # pylint: disable=no-member\n if self._instance_provided and not self._cm_scope_entered:\n self._instance_cm.__exit__(None, None, None) # pylint: disable=no-member\n\n @property\n def resource_config(self) -> Any:\n return self._resource_config\n\n @property\n def resource_def(self) -> Optional[ResourceDefinition]:\n raise DagsterInvariantViolationError(\n "UnboundInitLoggerContext has not been validated against a logger definition."\n )\n\n @property\n def resources(self) -> Resources:\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access "\n "resources outside of context manager scope. You can use the following syntax to "\n "open a context manager: `with build_init_resource_context(...) 
as context:`"\n )\n return self._resources\n\n @property\n def instance(self) -> Optional[DagsterInstance]:\n return self._instance\n\n @property\n def pipeline_def_for_backwards_compat(self) -> Optional[PipelineDefinition]:\n return None\n\n @property\n def pipeline_run(self) -> Optional[PipelineRun]:\n return None\n\n @property\n def log(self) -> Optional[DagsterLogManager]:\n return self._log_manager\n\n # backcompat: keep around this property from when InitResourceContext used to be a NamedTuple\n @property\n def log_manager(self) -> Optional[DagsterLogManager]:\n return self._log_manager\n\n @property\n def run_id(self) -> Optional[str]:\n return None\n\n\n
[docs]def build_init_resource_context(\n config: Optional[Dict[str, Any]] = None,\n resources: Optional[Dict[str, Any]] = None,\n instance: Optional[DagsterInstance] = None,\n) -> InitResourceContext:\n """Builds resource initialization context from provided parameters.\n\n ``build_init_resource_context`` can be used as either a function or context manager. If there is a\n provided resource to ``build_init_resource_context`` that is a context manager, then it must be\n used as a context manager. This function can be used to provide the context argument to the\n invocation of a resource.\n\n Args:\n resources (Optional[Dict[str, Any]]): The resources to provide to the context. These can be\n either values or resource definitions.\n config (Optional[Any]): The resource config to provide to the context.\n instance (Optional[DagsterInstance]): The dagster instance configured for the context.\n Defaults to DagsterInstance.ephemeral().\n\n Examples:\n .. code-block:: python\n\n context = build_init_resource_context()\n resource_to_init(context)\n\n with build_init_resource_context(\n resources={"foo": context_manager_resource}\n ) as context:\n resource_to_init(context)\n\n """\n return UnboundInitResourceContext(\n resource_config=check.opt_dict_param(config, "config", key_type=str),\n instance=check.opt_inst_param(instance, "instance", DagsterInstance),\n resources=check.opt_dict_param(resources, "resources", key_type=str),\n )
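Because config passed here is validated against the resource's ``config_schema`` at invocation time, resources are straightforward to unit-test. A minimal sketch:

.. code-block:: python

    from dagster import build_init_resource_context, resource

    @resource(config_schema={"retries": int})
    def retry_policy(init_context):
        return init_context.resource_config["retries"]

    # Config is validated against config_schema when the resource is invoked
    assert retry_policy(build_init_resource_context(config={"retries": 3})) == 3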
\n
", "current_page_name": "_modules/dagster/core/execution/context/init", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.execution.context.init"}, "input": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.execution.context.input

\nfrom typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Sequence, Union, cast\n\nimport dagster._check as check\nfrom dagster.core.definitions.events import AssetKey, AssetObservation\nfrom dagster.core.definitions.metadata import MetadataEntry, PartitionMetadataEntry\nfrom dagster.core.definitions.op_definition import OpDefinition\nfrom dagster.core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster.core.definitions.time_window_partitions import (\n    TimeWindow,\n    TimeWindowPartitionsDefinition,\n)\nfrom dagster.core.errors import DagsterInvariantViolationError\n\nif TYPE_CHECKING:\n    from dagster.core.definitions.resource_definition import Resources\n    from dagster.core.definitions.solid_definition import SolidDefinition\n    from dagster.core.events import DagsterEvent\n    from dagster.core.execution.context.system import StepExecutionContext\n    from dagster.core.log_manager import DagsterLogManager\n    from dagster.core.types.dagster_type import DagsterType\n\n    from .output import OutputContext\n\n\n
[docs]class InputContext:\n """\n The ``context`` object available to the load_input method of :py:class:`RootInputManager`.\n\n Attributes:\n name (Optional[str]): The name of the input that we're loading.\n pipeline_name (Optional[str]): The name of the pipeline.\n solid_def (Optional[SolidDefinition]): The definition of the solid that's loading the input.\n config (Optional[Any]): The config attached to the input that we're loading.\n metadata (Optional[Dict[str, Any]]): A dict of metadata that is assigned to the\n InputDefinition that we're loading for.\n upstream_output (Optional[OutputContext]): Info about the output that produced the object\n we're loading.\n dagster_type (Optional[DagsterType]): The type of this input.\n log (Optional[DagsterLogManager]): The log manager to use for this input.\n resource_config (Optional[Dict[str, Any]]): The config associated with the resource that\n initializes the RootInputManager.\n resources (Optional[Resources]): The resources required by the resource that initializes the\n input manager. If using the :py:func:`@root_input_manager` decorator, these resources\n correspond to those requested with the `required_resource_keys` parameter.\n op_def (Optional[OpDefinition]): The definition of the op that's loading the input.\n """\n\n def __init__(\n self,\n name: Optional[str] = None,\n pipeline_name: Optional[str] = None,\n solid_def: Optional["SolidDefinition"] = None,\n config: Optional[Any] = None,\n metadata: Optional[Dict[str, Any]] = None,\n upstream_output: Optional["OutputContext"] = None,\n dagster_type: Optional["DagsterType"] = None,\n log_manager: Optional["DagsterLogManager"] = None,\n resource_config: Optional[Dict[str, Any]] = None,\n resources: Optional[Union["Resources", Dict[str, Any]]] = None,\n step_context: Optional["StepExecutionContext"] = None,\n op_def: Optional["OpDefinition"] = None,\n ):\n from dagster.core.definitions.resource_definition import IContainsGenerator, Resources\n from dagster.core.execution.build_resources import build_resources\n\n self._name = name\n self._pipeline_name = pipeline_name\n check.invariant(\n solid_def is None or op_def is None, "Can't provide both a solid_def and an op_def arg"\n )\n self._solid_def = solid_def or op_def\n self._config = config\n self._metadata = metadata\n self._upstream_output = upstream_output\n self._dagster_type = dagster_type\n self._log = log_manager\n self._resource_config = resource_config\n self._step_context = step_context\n\n if isinstance(resources, Resources):\n self._resources_cm = None\n self._resources = resources\n else:\n self._resources_cm = build_resources(\n check.opt_dict_param(resources, "resources", key_type=str)\n )\n self._resources = self._resources_cm.__enter__() # pylint: disable=no-member\n self._resources_contain_cm = isinstance(self._resources, IContainsGenerator)\n self._cm_scope_entered = False\n\n self._events: List["DagsterEvent"] = []\n self._observations: List[AssetObservation] = []\n self._metadata_entries: List[Union[MetadataEntry, PartitionMetadataEntry]] = []\n\n def __enter__(self):\n if self._resources_cm:\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc):\n if self._resources_cm:\n self._resources_cm.__exit__(*exc) # pylint: disable=no-member\n\n def __del__(self):\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n self._resources_cm.__exit__(None, None, None) # pylint: disable=no-member\n\n @property\n def has_input_name(self) -> bool:\n """If we're the 
InputContext is being used to load the result of a run from outside the run,\n then it won't have an input name."""\n return self._name is not None\n\n @property\n def name(self) -> str:\n if self._name is None:\n raise DagsterInvariantViolationError(\n "Attempting to access name, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._name\n\n @property\n def pipeline_name(self) -> str:\n if self._pipeline_name is None:\n raise DagsterInvariantViolationError(\n "Attempting to access pipeline_name, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._pipeline_name\n\n @property\n def solid_def(self) -> "SolidDefinition":\n if self._solid_def is None:\n raise DagsterInvariantViolationError(\n "Attempting to access solid_def, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._solid_def\n\n @property\n def op_def(self) -> "OpDefinition":\n if self._solid_def is None:\n raise DagsterInvariantViolationError(\n "Attempting to access op_def, "\n "but it was not provided when constructing the InputContext"\n )\n\n return cast(OpDefinition, self._solid_def)\n\n @property\n def config(self) -> Any:\n return self._config\n\n @property\n def metadata(self) -> Optional[Dict[str, Any]]:\n return self._metadata\n\n @property\n def upstream_output(self) -> Optional["OutputContext"]:\n return self._upstream_output\n\n @property\n def dagster_type(self) -> "DagsterType":\n if self._dagster_type is None:\n raise DagsterInvariantViolationError(\n "Attempting to access dagster_type, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._dagster_type\n\n @property\n def log(self) -> "DagsterLogManager":\n if self._log is None:\n raise DagsterInvariantViolationError(\n "Attempting to access log, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._log\n\n @property\n def resource_config(self) -> Optional[Dict[str, Any]]:\n return self._resource_config\n\n @property\n def resources(self) -> Any:\n if self._resources is None:\n raise DagsterInvariantViolationError(\n "Attempting to access resources, "\n "but it was not provided when constructing the InputContext"\n )\n\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access "\n "resources outside of context manager scope. You can use the following syntax to "\n "open a context manager: `with build_input_context(...) 
as context:`"\n )\n return self._resources\n\n @property\n def has_asset_key(self) -> bool:\n return (\n self._step_context is not None\n and self._name is not None\n and self._step_context.pipeline_def.asset_layer.asset_key_for_input(\n node_handle=self.step_context.solid_handle, input_name=self._name\n )\n is not None\n )\n\n @property\n def asset_key(self) -> AssetKey:\n result = self.step_context.pipeline_def.asset_layer.asset_key_for_input(\n node_handle=self.step_context.solid_handle, input_name=self.name\n )\n if result is None:\n raise DagsterInvariantViolationError(\n "Attempting to access asset_key, but no asset is associated with this input"\n )\n\n return result\n\n @property\n def step_context(self) -> "StepExecutionContext":\n if self._step_context is None:\n raise DagsterInvariantViolationError(\n "Attempting to access step_context, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._step_context\n\n @property\n def has_partition_key(self) -> bool:\n """Whether the current run is a partitioned run"""\n return self.step_context.has_partition_key\n\n @property\n def partition_key(self) -> str:\n """The partition key for the current run.\n\n Raises an error if the current run is not a partitioned run.\n """\n return self.step_context.partition_key\n\n @property\n def has_asset_partitions(self) -> bool:\n if self._step_context is not None:\n return self._step_context.has_asset_partitions_for_input(self.name)\n else:\n return False\n\n @property\n def asset_partition_key(self) -> str:\n """The partition key for input asset.\n\n Raises an error if the input asset has no partitioning, or if the run covers a partition\n range for the input asset.\n """\n return self.step_context.asset_partition_key_for_input(self.name)\n\n @property\n def asset_partition_key_range(self) -> PartitionKeyRange:\n """The partition key range for input asset.\n\n Raises an error if the input asset has no partitioning.\n """\n return self.step_context.asset_partition_key_range_for_input(self.name)\n\n @property\n def asset_partitions_time_window(self) -> TimeWindow:\n """The time window for the partitions of the input asset.\n\n Raises an error if either of the following are true:\n - The input asset has no partitioning.\n - The input asset is not partitioned with a TimeWindowPartitionsDefinition.\n """\n if self.upstream_output is None:\n check.failed("InputContext needs upstream_output to get asset_partitions_time_window")\n\n if self.upstream_output.asset_info is None:\n raise ValueError(\n "Tried to get asset partitions for an output that does not correspond to a "\n "partitioned asset."\n )\n\n asset_info = self.upstream_output.asset_info\n\n if not isinstance(asset_info.partitions_def, TimeWindowPartitionsDefinition):\n raise ValueError(\n "Tried to get asset partitions for an input that correponds to a partitioned "\n "asset that is not partitioned with a TimeWindowPartitionsDefinition."\n )\n\n partitions_def: TimeWindowPartitionsDefinition = asset_info.partitions_def\n\n partition_key_range = self.asset_partition_key_range\n return TimeWindow(\n partitions_def.time_window_for_partition_key(partition_key_range.start).start,\n partitions_def.time_window_for_partition_key(partition_key_range.end).end,\n )\n\n
[docs] def get_identifier(self) -> List[str]:\n """Utility method to get a collection of identifiers that as a whole represent a unique\n step input.\n\n If not using memoization, the unique identifier collection consists of\n\n - ``run_id``: the id of the run which generates the input.\n Note: This method also handles the re-execution memoization logic. If the step that\n generates the input is skipped in the re-execution, the ``run_id`` will be the id\n of its parent run.\n - ``step_key``: the key for a compute step.\n - ``name``: the name of the output. (default: 'result').\n\n If using memoization, the ``version`` corresponding to the step output is used in place of\n the ``run_id``.\n\n Returns:\n List[str, ...]: A list of identifiers, i.e. (run_id or version), step_key, and output_name\n """\n if self.upstream_output is None:\n raise DagsterInvariantViolationError(\n "InputContext.upstream_output not defined. " "Cannot compute an identifier"\n )\n\n return self.upstream_output.get_identifier()
\n\n def get_asset_identifier(self) -> Sequence[str]:\n if self.asset_key is not None:\n if self.has_asset_partitions:\n return self.asset_key.path + [self.asset_partition_key]\n else:\n return self.asset_key.path\n else:\n check.failed("Can't get asset identifier for an input with no asset key")\n\n
[docs] def consume_events(self) -> Iterator["DagsterEvent"]:\n """Pops and yields all user-generated events that have been recorded from this context.\n\n If consume_events has not yet been called, this will yield all logged events since the call to `handle_input`. If consume_events has been called, it will yield all events since the last time consume_events was called. Designed for internal use. Users should never need to invoke this method.\n """\n\n events = self._events\n self._events = []\n yield from events
\n\n
[docs] def add_input_metadata(\n self,\n metadata: Dict[str, Any],\n description: Optional[str] = None,\n ) -> None:\n """Accepts a dictionary of metadata. Metadata entries will appear on the LOADED_INPUT event.\n If the input is an asset, metadata will be attached to an asset observation.\n\n The asset observation will be yielded from the run and appear in the event log.\n Only valid if the context has an asset key.\n """\n from dagster.core.definitions.metadata import normalize_metadata\n from dagster.core.events import DagsterEvent\n\n metadata = check.dict_param(metadata, "metadata", key_type=str)\n self._metadata_entries.extend(normalize_metadata(metadata, []))\n if self.has_asset_key:\n check.opt_str_param(description, "description")\n\n observation = AssetObservation(\n asset_key=self.asset_key,\n description=description,\n partition=self.asset_partition_key if self.has_asset_partitions else None,\n metadata=metadata,\n )\n self._observations.append(observation)\n if self._step_context:\n self._events.append(DagsterEvent.asset_observation(self._step_context, observation))
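For example, an IO manager can record how much data it loaded for each input. The sketch below is illustrative; in practice the class would be attached to a job through an ``io_manager`` resource key:

.. code-block:: python

    from dagster import IOManager

    class TrackingIOManager(IOManager):
        def __init__(self):
            self._values = {}

        def handle_output(self, context, obj):
            self._values[context.name] = obj

        def load_input(self, context):
            value = self._values[context.upstream_output.name]
            # Appears on the LOADED_INPUT event, and as an AssetObservation if the
            # input corresponds to an asset.
            context.add_input_metadata({"size_chars": len(str(value))})
            return value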
\n\n
[docs] def get_observations(\n self,\n ) -> List[AssetObservation]:\n """Retrieve the list of user-generated asset observations that were observed via the context.\n\n User-generated events that were yielded will not appear in this list.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import IOManager, build_input_context, AssetObservation\n\n class MyIOManager(IOManager):\n def load_input(self, context, obj):\n ...\n\n def test_load_input():\n mgr = MyIOManager()\n context = build_input_context()\n mgr.load_input(context)\n observations = context.get_observations()\n ...\n """\n return self._observations
\n\n def consume_metadata_entries(self) -> List[Union[MetadataEntry, PartitionMetadataEntry]]:\n result = self._metadata_entries\n self._metadata_entries = []\n return result
\n\n\n
[docs]def build_input_context(\n name: Optional[str] = None,\n config: Optional[Any] = None,\n metadata: Optional[Dict[str, Any]] = None,\n upstream_output: Optional["OutputContext"] = None,\n dagster_type: Optional["DagsterType"] = None,\n resource_config: Optional[Dict[str, Any]] = None,\n resources: Optional[Dict[str, Any]] = None,\n op_def: Optional[OpDefinition] = None,\n step_context: Optional["StepExecutionContext"] = None,\n) -> "InputContext":\n """Builds input context from provided parameters.\n\n ``build_input_context`` can be used as either a function, or a context manager. If resources\n that are also context managers are provided, then ``build_input_context`` must be used as a\n context manager.\n\n Args:\n name (Optional[str]): The name of the input that we're loading.\n config (Optional[Any]): The config attached to the input that we're loading.\n metadata (Optional[Dict[str, Any]]): A dict of metadata that is assigned to the\n InputDefinition that we're loading for.\n upstream_output (Optional[OutputContext]): Info about the output that produced the object\n we're loading.\n dagster_type (Optional[DagsterType]): The type of this input.\n resource_config (Optional[Dict[str, Any]]): The resource config to make available from the\n input context. This usually corresponds to the config provided to the resource that\n loads the input manager.\n resources (Optional[Dict[str, Any]]): The resources to make available from the context.\n For a given key, you can provide either an actual instance of an object, or a resource\n definition.\n asset_key (Optional[AssetKey]): The asset key attached to the InputDefinition.\n op_def (Optional[OpDefinition]): The definition of the op that's loading the input.\n step_context (Optional[StepExecutionContext]): For internal use.\n\n Examples:\n\n .. code-block:: python\n\n build_input_context()\n\n with build_input_context(resources={"foo": context_manager_resource}) as context:\n do_something\n """\n from dagster.core.execution.context.output import OutputContext\n from dagster.core.execution.context.system import StepExecutionContext\n from dagster.core.execution.context_creation_pipeline import initialize_console_manager\n from dagster.core.types.dagster_type import DagsterType\n\n name = check.opt_str_param(name, "name")\n metadata = check.opt_dict_param(metadata, "metadata", key_type=str)\n upstream_output = check.opt_inst_param(upstream_output, "upstream_output", OutputContext)\n dagster_type = check.opt_inst_param(dagster_type, "dagster_type", DagsterType)\n resource_config = check.opt_dict_param(resource_config, "resource_config", key_type=str)\n resources = check.opt_dict_param(resources, "resources", key_type=str)\n op_def = check.opt_inst_param(op_def, "op_def", OpDefinition)\n step_context = check.opt_inst_param(step_context, "step_context", StepExecutionContext)\n\n return InputContext(\n name=name,\n pipeline_name=None,\n config=config,\n metadata=metadata,\n upstream_output=upstream_output,\n dagster_type=dagster_type,\n log_manager=initialize_console_manager(None),\n resource_config=resource_config,\n resources=resources,\n step_context=step_context,\n op_def=op_def,\n )
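A common use is unit-testing an IO manager's ``load_input`` with a hand-built context; the sketch below also assumes ``build_output_context(name=...)`` for supplying the upstream output info:

.. code-block:: python

    from dagster import IOManager, build_input_context, build_output_context

    class UpstreamNameIOManager(IOManager):
        def handle_output(self, context, obj):
            pass

        def load_input(self, context):
            # upstream_output describes the output that produced the value being loaded
            return context.upstream_output.name

    context = build_input_context(upstream_output=build_output_context(name="result"))
    assert UpstreamNameIOManager().load_input(context) == "result"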
\n
", "current_page_name": "_modules/dagster/core/execution/context/input", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.execution.context.input"}, "invocation": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.execution.context.invocation

\n# pylint: disable=super-init-not-called\nfrom typing import AbstractSet, Any, Dict, List, Mapping, NamedTuple, Optional, Set, Union, cast\n\nimport dagster._check as check\nfrom dagster.config import Shape\nfrom dagster.core.definitions.composition import PendingNodeInvocation\nfrom dagster.core.definitions.dependency import Node, NodeHandle\nfrom dagster.core.definitions.events import (\n    AssetMaterialization,\n    AssetObservation,\n    ExpectationResult,\n    Materialization,\n    UserEvent,\n)\nfrom dagster.core.definitions.hook_definition import HookDefinition\nfrom dagster.core.definitions.mode import ModeDefinition\nfrom dagster.core.definitions.op_definition import OpDefinition\nfrom dagster.core.definitions.pipeline_definition import PipelineDefinition\nfrom dagster.core.definitions.resource_definition import (\n    IContainsGenerator,\n    Resources,\n    ScopedResourcesBuilder,\n)\nfrom dagster.core.definitions.solid_definition import SolidDefinition\nfrom dagster.core.definitions.step_launcher import StepLauncher\nfrom dagster.core.errors import (\n    DagsterInvalidConfigError,\n    DagsterInvalidInvocationError,\n    DagsterInvalidPropertyError,\n    DagsterInvariantViolationError,\n)\nfrom dagster.core.execution.build_resources import build_resources\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.log_manager import DagsterLogManager\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.core.types.dagster_type import DagsterType\nfrom dagster.utils import merge_dicts\nfrom dagster.utils.forked_pdb import ForkedPdb\n\nfrom .compute import OpExecutionContext\nfrom .system import StepExecutionContext, TypeCheckContext\n\n\ndef _property_msg(prop_name: str, method_name: str) -> str:\n    return (\n        f"The {prop_name} {method_name} is not set on the context when a solid is directly invoked."\n    )\n\n\nclass UnboundSolidExecutionContext(OpExecutionContext):\n    """The ``context`` object available as the first argument to a solid's compute function when\n    being invoked directly. 
Can also be used as a context manager.\n    """\n\n    def __init__(\n        self,\n        solid_config: Any,\n        resources_dict: Optional[Dict[str, Any]],\n        resources_config: Dict[str, Any],\n        instance: Optional[DagsterInstance],\n        partition_key: Optional[str],\n        mapping_key: Optional[str],\n    ):  # pylint: disable=super-init-not-called\n        from dagster.core.execution.api import ephemeral_instance_if_missing\n        from dagster.core.execution.context_creation_pipeline import initialize_console_manager\n\n        self._solid_config = solid_config\n        self._mapping_key = mapping_key\n\n        self._instance_provided = (\n            check.opt_inst_param(instance, "instance", DagsterInstance) is not None\n        )\n        # Construct ephemeral instance if missing\n        self._instance_cm = ephemeral_instance_if_missing(instance)\n        # Pylint can't infer that the ephemeral_instance context manager has an __enter__ method,\n        # so ignore lint error\n        self._instance = self._instance_cm.__enter__()  # pylint: disable=no-member\n\n        self._resources_config = resources_config\n        # Open resource context manager\n        self._resources_contain_cm = False\n        self._resources_cm = build_resources(\n            resources=check.opt_dict_param(resources_dict, "resources_dict", key_type=str),\n            instance=instance,\n            resource_config=resources_config,\n        )\n        self._resources = self._resources_cm.__enter__()  # pylint: disable=no-member\n        self._resources_contain_cm = isinstance(self._resources, IContainsGenerator)\n\n        self._log = initialize_console_manager(None)\n        self._pdb: Optional[ForkedPdb] = None\n        self._cm_scope_entered = False\n        self._partition_key = partition_key\n        self._user_events: List[UserEvent] = []\n        self._output_metadata: Dict[str, Any] = {}\n\n    def __enter__(self):\n        self._cm_scope_entered = True\n        return self\n\n    def __exit__(self, *exc):\n        self._resources_cm.__exit__(*exc)  # pylint: disable=no-member\n        if self._instance_provided:\n            self._instance_cm.__exit__(*exc)  # pylint: disable=no-member\n\n    def __del__(self):\n        if self._resources_contain_cm and not self._cm_scope_entered:\n            self._resources_cm.__exit__(None, None, None)  # pylint: disable=no-member\n        if self._instance_provided and not self._cm_scope_entered:\n            self._instance_cm.__exit__(None, None, None)  # pylint: disable=no-member\n\n    @property\n    def solid_config(self) -> Any:\n        return self._solid_config\n\n    @property\n    def resources(self) -> Resources:\n        if self._resources_contain_cm and not self._cm_scope_entered:\n            raise DagsterInvariantViolationError(\n                "At least one provided resource is a generator, but attempting to access "\n                "resources outside of context manager scope. You can use the following syntax to "\n                "open a context manager: `with build_solid_context(...) 
as context:`"\n            )\n        return self._resources\n\n    @property\n    def pipeline_run(self) -> PipelineRun:\n        raise DagsterInvalidPropertyError(_property_msg("pipeline_run", "property"))\n\n    @property\n    def instance(self) -> DagsterInstance:\n        return self._instance\n\n    @property\n    def pdb(self) -> ForkedPdb:\n        """dagster.utils.forked_pdb.ForkedPdb: Gives access to pdb debugging from within the solid.\n\n        Example:\n\n        .. code-block:: python\n\n            @solid\n            def debug_solid(context):\n                context.pdb.set_trace()\n\n        """\n        if self._pdb is None:\n            self._pdb = ForkedPdb()\n\n        return self._pdb\n\n    @property\n    def step_launcher(self) -> Optional[StepLauncher]:\n        raise DagsterInvalidPropertyError(_property_msg("step_launcher", "property"))\n\n    @property\n    def run_id(self) -> str:\n        """str: Hard-coded value to indicate that we are directly invoking solid."""\n        return "EPHEMERAL"\n\n    @property\n    def run_config(self) -> dict:\n        raise DagsterInvalidPropertyError(_property_msg("run_config", "property"))\n\n    @property\n    def pipeline_def(self) -> PipelineDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("pipeline_def", "property"))\n\n    @property\n    def pipeline_name(self) -> str:\n        raise DagsterInvalidPropertyError(_property_msg("pipeline_name", "property"))\n\n    @property\n    def mode_def(self) -> ModeDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("mode_def", "property"))\n\n    @property\n    def log(self) -> DagsterLogManager:\n        """DagsterLogManager: A console manager constructed for this context."""\n        return self._log\n\n    @property\n    def solid_handle(self) -> NodeHandle:\n        raise DagsterInvalidPropertyError(_property_msg("solid_handle", "property"))\n\n    @property\n    def solid(self) -> Node:\n        raise DagsterInvalidPropertyError(_property_msg("solid", "property"))\n\n    @property\n    def solid_def(self) -> SolidDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("solid_def", "property"))\n\n    @property\n    def has_partition_key(self) -> bool:\n        return self._partition_key is not None\n\n    @property\n    def partition_key(self) -> str:\n        if self._partition_key:\n            return self._partition_key\n        else:\n            check.failed("Tried to access partition_key for a non-partitioned run")\n\n    def has_tag(self, key: str) -> bool:\n        raise DagsterInvalidPropertyError(_property_msg("has_tag", "method"))\n\n    def get_tag(self, key: str) -> str:\n        raise DagsterInvalidPropertyError(_property_msg("get_tag", "method"))\n\n    def get_step_execution_context(self) -> StepExecutionContext:\n        raise DagsterInvalidPropertyError(_property_msg("get_step_execution_context", "methods"))\n\n    def bind(\n        self, solid_def_or_invocation: Union[SolidDefinition, PendingNodeInvocation]\n    ) -> "BoundSolidExecutionContext":\n\n        solid_def = (\n            solid_def_or_invocation\n            if isinstance(solid_def_or_invocation, SolidDefinition)\n            else solid_def_or_invocation.node_def.ensure_solid_def()\n        )\n\n        _validate_resource_requirements(self.resources, solid_def)\n\n        solid_config = _resolve_bound_config(self.solid_config, solid_def)\n\n        return BoundSolidExecutionContext(\n            solid_def=solid_def,\n            
solid_config=solid_config,\n            resources=self.resources,\n            resources_config=self._resources_config,\n            instance=self.instance,\n            log_manager=self.log,\n            pdb=self.pdb,\n            tags=solid_def_or_invocation.tags\n            if isinstance(solid_def_or_invocation, PendingNodeInvocation)\n            else None,\n            hook_defs=solid_def_or_invocation.hook_defs\n            if isinstance(solid_def_or_invocation, PendingNodeInvocation)\n            else None,\n            alias=solid_def_or_invocation.given_alias\n            if isinstance(solid_def_or_invocation, PendingNodeInvocation)\n            else None,\n            user_events=self._user_events,\n            output_metadata=self._output_metadata,\n            mapping_key=self._mapping_key,\n        )\n\n    def get_events(self) -> List[UserEvent]:\n        """Retrieve the list of user-generated events that were logged via the context.\n\n        **Examples:**\n\n        .. code-block:: python\n\n            from dagster import op, build_op_context, AssetMaterialization, ExpectationResult\n\n            @op\n            def my_op(context):\n                ...\n\n            def test_my_op():\n                context = build_op_context()\n                my_op(context)\n                all_user_events = context.get_events()\n                materializations = [event for event in all_user_events if isinstance(event, AssetMaterialization)]\n                expectation_results = [event for event in all_user_events if isinstance(event, ExpectationResult)]\n                ...\n        """\n\n        return self._user_events\n\n    def get_output_metadata(\n        self, output_name: str, mapping_key: Optional[str] = None\n    ) -> Optional[Mapping[str, Any]]:\n        """Retrieve metadata that was logged for an output and mapping_key, if it exists.\n\n        If metadata cannot be found for the particular output_name/mapping_key combination, None will be returned.\n\n        Args:\n            output_name (str): The name of the output to retrieve logged metadata for.\n            mapping_key (Optional[str]): The mapping key to retrieve metadata for (only applies when using dynamic outputs).\n\n        Returns:\n            Optional[Mapping[str, Any]]: The metadata values present for the output_name/mapping_key combination, if present.\n        """\n        metadata = self._output_metadata.get(output_name)\n        if mapping_key and metadata:\n            return metadata.get(mapping_key)\n        return metadata\n\n    def get_mapping_key(self) -> Optional[str]:\n        return self._mapping_key\n\n\ndef _validate_resource_requirements(resources: "Resources", solid_def: SolidDefinition) -> None:\n    """Validate correctness of resources against required resource keys"""\n\n    resources_dict = resources._asdict()  # type: ignore[attr-defined]\n\n    required_resource_keys: AbstractSet[str] = solid_def.required_resource_keys or set()\n    for resource_key in required_resource_keys:\n        if resource_key not in resources_dict:\n            raise DagsterInvalidInvocationError(\n                f'{solid_def.node_type_str} "{solid_def.name}" requires resource "{resource_key}", but no resource '\n                "with that key was found on the context."\n            )\n\n\ndef _resolve_bound_config(solid_config: Any, solid_def: SolidDefinition) -> Any:\n    """Validate config against config schema, and return validated config."""\n    from dagster.config.validate import 
process_config\n\n    # Config processing system expects the top level config schema to be a dictionary, but solid\n    # config schema can be scalar. Thus, we wrap it in another layer of indirection.\n    outer_config_shape = Shape({"config": solid_def.get_config_field()})\n    config_evr = process_config(\n        outer_config_shape, {"config": solid_config} if solid_config else {}\n    )\n    if not config_evr.success:\n        raise DagsterInvalidConfigError(\n            f"Error in config for {solid_def.node_type_str} ",\n            config_evr.errors,\n            solid_config,\n        )\n    validated_config = cast(Dict, config_evr.value).get("config")\n    mapped_config_evr = solid_def.apply_config_mapping({"config": validated_config})\n    if not mapped_config_evr.success:\n        raise DagsterInvalidConfigError(\n            f"Error in config for {solid_def.node_type_str} ",\n            mapped_config_evr.errors,\n            solid_config,\n        )\n    validated_config = cast(Dict, mapped_config_evr.value).get("config")\n    return validated_config\n\n\nclass BoundSolidExecutionContext(OpExecutionContext):\n    """The solid execution context that is passed to the compute function during invocation.\n\n    This context is bound to a specific solid definition, for which the resources and config have\n    been validated.\n    """\n\n    def __init__(\n        self,\n        solid_def: SolidDefinition,\n        solid_config: Any,\n        resources: "Resources",\n        resources_config: Dict[str, Any],\n        instance: DagsterInstance,\n        log_manager: DagsterLogManager,\n        pdb: Optional[ForkedPdb],\n        tags: Optional[Dict[str, str]],\n        hook_defs: Optional[AbstractSet[HookDefinition]],\n        alias: Optional[str],\n        user_events: List[UserEvent],\n        output_metadata: Dict[str, Any],\n        mapping_key: Optional[str],\n    ):\n        self._solid_def = solid_def\n        self._solid_config = solid_config\n        self._resources = resources\n        self._instance = instance\n        self._log = log_manager\n        self._pdb = pdb\n        self._tags = merge_dicts(self._solid_def.tags, tags) if tags else self._solid_def.tags\n        self._hook_defs = hook_defs\n        self._alias = alias if alias else self._solid_def.name\n        self._resources_config = resources_config\n        self._user_events: List[UserEvent] = user_events\n        self._seen_outputs: Dict[str, Union[str, Set[str]]] = {}\n        self._output_metadata: Dict[str, Any] = output_metadata\n        self._mapping_key = mapping_key\n\n    @property\n    def solid_config(self) -> Any:\n        return self._solid_config\n\n    @property\n    def resources(self) -> Resources:\n        return self._resources\n\n    @property\n    def pipeline_run(self) -> PipelineRun:\n        raise DagsterInvalidPropertyError(_property_msg("pipeline_run", "property"))\n\n    @property\n    def instance(self) -> DagsterInstance:\n        return self._instance\n\n    @property\n    def pdb(self) -> ForkedPdb:\n        """dagster.utils.forked_pdb.ForkedPdb: Gives access to pdb debugging from within the solid.\n\n        Example:\n\n        .. 
code-block:: python\n\n            @solid\n            def debug_solid(context):\n                context.pdb.set_trace()\n\n        """\n        if self._pdb is None:\n            self._pdb = ForkedPdb()\n\n        return self._pdb\n\n    @property\n    def step_launcher(self) -> Optional[StepLauncher]:\n        raise DagsterInvalidPropertyError(_property_msg("step_launcher", "property"))\n\n    @property\n    def run_id(self) -> str:\n        """str: Hard-coded value to indicate that we are directly invoking solid."""\n        return "EPHEMERAL"\n\n    @property\n    def run_config(self) -> dict:\n        run_config = {}\n        if self._solid_config:\n            run_config["solids"] = {self._solid_def.name: {"config": self._solid_config}}\n        run_config["resources"] = self._resources_config\n        return run_config\n\n    @property\n    def pipeline_def(self) -> PipelineDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("pipeline_def", "property"))\n\n    @property\n    def pipeline_name(self) -> str:\n        raise DagsterInvalidPropertyError(_property_msg("pipeline_name", "property"))\n\n    @property\n    def mode_def(self) -> ModeDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("mode_def", "property"))\n\n    @property\n    def log(self) -> DagsterLogManager:\n        """DagsterLogManager: A console manager constructed for this context."""\n        return self._log\n\n    @property\n    def solid_handle(self) -> NodeHandle:\n        raise DagsterInvalidPropertyError(_property_msg("solid_handle", "property"))\n\n    @property\n    def solid(self) -> Node:\n        raise DagsterInvalidPropertyError(_property_msg("solid", "property"))\n\n    @property\n    def solid_def(self) -> SolidDefinition:\n        return self._solid_def\n\n    def has_tag(self, key: str) -> bool:\n        return key in self._tags\n\n    def get_tag(self, key: str) -> str:\n        return self._tags.get(key)\n\n    @property\n    def alias(self) -> str:\n        return self._alias\n\n    def get_step_execution_context(self) -> StepExecutionContext:\n        raise DagsterInvalidPropertyError(_property_msg("get_step_execution_context", "methods"))\n\n    def for_type(self, dagster_type: DagsterType) -> TypeCheckContext:\n        resources = cast(NamedTuple, self.resources)\n        return TypeCheckContext(\n            self.run_id, self.log, ScopedResourcesBuilder(resources._asdict()), dagster_type\n        )\n\n    def get_mapping_key(self) -> Optional[str]:\n        return self._mapping_key\n\n    def describe_op(self):\n        if isinstance(self.solid_def, OpDefinition):\n            return f'op "{self.solid_def.name}"'\n\n        return f'solid "{self.solid_def.name}"'\n\n    def log_event(self, event: UserEvent) -> None:\n\n        check.inst_param(\n            event,\n            "event",\n            (AssetMaterialization, AssetObservation, ExpectationResult, Materialization),\n        )\n        self._user_events.append(event)\n\n    def observe_output(self, output_name: str, mapping_key: Optional[str] = None) -> None:\n        if mapping_key:\n            if output_name not in self._seen_outputs:\n                self._seen_outputs[output_name] = set()\n            cast(Set[str], self._seen_outputs[output_name]).add(mapping_key)\n        else:\n            self._seen_outputs[output_name] = "seen"\n\n    def has_seen_output(self, output_name: str, mapping_key: Optional[str] = None) -> bool:\n        if mapping_key:\n            return (\n                
output_name in self._seen_outputs and mapping_key in self._seen_outputs[output_name]\n            )\n        return output_name in self._seen_outputs\n\n    def add_output_metadata(\n        self,\n        metadata: Mapping[str, Any],\n        output_name: Optional[str] = None,\n        mapping_key: Optional[str] = None,\n    ) -> None:\n        """Add metadata to one of the outputs of an op.\n\n        This can only be used once per output in the body of an op. Using this method with the same output_name more than once within an op will result in an error.\n\n        Args:\n            metadata (Mapping[str, Any]): The metadata to attach to the output\n            output_name (Optional[str]): The name of the output to attach metadata to. If there is only one output on the op, then this argument does not need to be provided. The metadata will automatically be attached to the only output.\n\n        **Examples:**\n\n        .. code-block:: python\n\n            from dagster import Out, op\n            from typing import Tuple\n\n            @op\n            def add_metadata(context):\n                context.add_output_metadata({"foo", "bar"})\n                return 5 # Since the default output is called "result", metadata will be attached to the output "result".\n\n            @op(out={"a": Out(), "b": Out()})\n            def add_metadata_two_outputs(context) -> Tuple[str, int]:\n                context.add_output_metadata({"foo": "bar"}, output_name="b")\n                context.add_output_metadata({"baz": "bat"}, output_name="a")\n\n                return ("dog", 5)\n\n        """\n        metadata = check.dict_param(metadata, "metadata", key_type=str)\n        output_name = check.opt_str_param(output_name, "output_name")\n        mapping_key = check.opt_str_param(mapping_key, "mapping_key")\n\n        if output_name is None and len(self.solid_def.output_defs) == 1:\n            output_def = self.solid_def.output_defs[0]\n            output_name = output_def.name\n        elif output_name is None:\n            raise DagsterInvariantViolationError(\n                "Attempted to log metadata without providing output_name, but multiple outputs exist. Please provide an output_name to the invocation of `context.add_output_metadata`."\n            )\n        else:\n            output_def = self.solid_def.output_def_named(output_name)\n\n        if self.has_seen_output(output_name, mapping_key):\n            output_desc = (\n                f"output '{output_def.name}'"\n                if not mapping_key\n                else f"output '{output_def.name}' with mapping_key '{mapping_key}'"\n            )\n            raise DagsterInvariantViolationError(\n                f"In {self.solid_def.node_type_str} '{self.solid_def.name}', attempted to log output metadata for {output_desc} which has already been yielded. Metadata must be logged before the output is yielded."\n            )\n        if output_def.is_dynamic and not mapping_key:\n            raise DagsterInvariantViolationError(\n                f"In {self.solid_def.node_type_str} '{self.solid_def.name}', attempted to log metadata for dynamic output '{output_def.name}' without providing a mapping key. 
When logging metadata for a dynamic output, it is necessary to provide a mapping key."\n            )\n\n        output_name = output_def.name\n        if output_name in self._output_metadata:\n            if not mapping_key or mapping_key in self._output_metadata[output_name]:\n                raise DagsterInvariantViolationError(\n                    f"In {self.solid_def.node_type_str} '{self.solid_def.name}', attempted to log metadata for output '{output_name}' more than once."\n                )\n        if mapping_key:\n            if output_name not in self._output_metadata:\n                self._output_metadata[output_name] = {}\n            self._output_metadata[output_name][mapping_key] = metadata\n        else:\n            self._output_metadata[output_name] = metadata\n\n\n
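``get_output_metadata`` has no example in its docstring above, so here is a minimal sketch (the op and test names are hypothetical) of reading back metadata that was logged inside a directly-invoked op.

.. code-block:: python

    from dagster import op, build_op_context

    @op
    def emits_metadata(context):
        # attaches metadata to the default output, which is named "result"
        context.add_output_metadata({"row_count": 100})
        return 5

    def test_emits_metadata():
        context = build_op_context()
        assert emits_metadata(context) == 5
        # metadata recorded during direct invocation is visible on the context afterwards
        assert context.get_output_metadata("result") == {"row_count": 100}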
[docs]def build_op_context(\n resources: Optional[Dict[str, Any]] = None,\n op_config: Any = None,\n resources_config: Optional[Dict[str, Any]] = None,\n instance: Optional[DagsterInstance] = None,\n config: Any = None,\n partition_key: Optional[str] = None,\n mapping_key: Optional[str] = None,\n) -> OpExecutionContext:\n """Builds op execution context from provided parameters.\n\n ``op`` is currently built on top of `solid`, and thus this function creates a `SolidExecutionContext`.\n ``build_op_context`` can be used as either a function or context manager. If there is a\n provided resource that is a context manager, then ``build_op_context`` must be used as a\n context manager. This function can be used to provide the context argument when directly\n invoking a op.\n\n Args:\n resources (Optional[Dict[str, Any]]): The resources to provide to the context. These can be\n either values or resource definitions.\n config (Optional[Any]): The op config to provide to the context.\n instance (Optional[DagsterInstance]): The dagster instance configured for the context.\n Defaults to DagsterInstance.ephemeral().\n mapping_key (Optional[str]): A key representing the mapping key from an upstream dynamic output. Can be accessed using ``context.get_mapping_key()``.\n\n Examples:\n .. code-block:: python\n\n context = build_op_context()\n op_to_invoke(context)\n\n with build_op_context(resources={"foo": context_manager_resource}) as context:\n op_to_invoke(context)\n """\n\n if op_config and config:\n raise DagsterInvalidInvocationError(\n "Attempted to invoke ``build_op_context`` with both ``op_config``, and its "\n "legacy version, ``config``. Please provide one or the other."\n )\n\n op_config = op_config if op_config else config\n return build_solid_context(\n resources=resources,\n resources_config=resources_config,\n solid_config=op_config,\n instance=instance,\n partition_key=partition_key,\n mapping_key=mapping_key,\n )
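As a slightly fuller illustration of the parameters above, the following hypothetical test supplies both a plain-value resource and op config through ``build_op_context``; the ``FakeApi`` class and op name are assumptions, and config access via ``context.op_config`` is assumed to behave as on the op execution context.

.. code-block:: python

    from dagster import op, build_op_context

    @op(required_resource_keys={"api"}, config_schema={"limit": int})
    def fetch_rows(context):
        return context.resources.api.fetch(limit=context.op_config["limit"])

    class FakeApi:
        def fetch(self, limit):
            return list(range(limit))

    def test_fetch_rows():
        context = build_op_context(
            resources={"api": FakeApi()},  # plain values are accepted alongside resource definitions
            op_config={"limit": 3},
        )
        assert fetch_rows(context) == [0, 1, 2]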
\n\n\n
[docs]def build_solid_context(\n resources: Optional[Dict[str, Any]] = None,\n solid_config: Any = None,\n resources_config: Optional[Dict[str, Any]] = None,\n instance: Optional[DagsterInstance] = None,\n config: Any = None,\n partition_key: Optional[str] = None,\n mapping_key: Optional[str] = None,\n) -> UnboundSolidExecutionContext:\n """Builds solid execution context from provided parameters.\n\n ``build_solid_context`` can be used as either a function or context manager. If there is a\n provided resource that is a context manager, then ``build_solid_context`` must be used as a\n context manager. This function can be used to provide the context argument when directly\n invoking a solid.\n\n Args:\n resources (Optional[Dict[str, Any]]): The resources to provide to the context. These can be\n either values or resource definitions.\n solid_config (Optional[Any]): The solid config to provide to the context. The value provided\n here will be available as ``context.solid_config``.\n resources_config (Optional[Dict[str, Any]]): Configuration for any resource definitions\n provided to the resources arg. The configuration under a specific key should match the\n resource under a specific key in the resources dictionary.\n instance (Optional[DagsterInstance]): The dagster instance configured for the context.\n Defaults to DagsterInstance.ephemeral().\n\n Examples:\n .. code-block:: python\n\n context = build_solid_context()\n solid_to_invoke(context)\n\n with build_solid_context(resources={"foo": context_manager_resource}) as context:\n solid_to_invoke(context)\n """\n\n if solid_config and config:\n raise DagsterInvalidInvocationError(\n "Attempted to invoke ``build_solid_context`` with both ``solid_config``, and its "\n "legacy version, ``config``. Please provide one or the other."\n )\n\n solid_config = solid_config if solid_config else config\n\n return UnboundSolidExecutionContext(\n resources_dict=check.opt_dict_param(resources, "resources", key_type=str),\n resources_config=check.opt_dict_param(resources_config, "resources_config", key_type=str),\n solid_config=solid_config,\n instance=check.opt_inst_param(instance, "instance", DagsterInstance),\n partition_key=check.opt_str_param(partition_key, "partition_key"),\n mapping_key=check.opt_str_param(mapping_key, "mapping_key"),\n )
\n
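Because ``build_solid_context`` must be used as a context manager whenever a provided resource is itself a context manager (for example, a resource function that yields), a sketch of that case may be useful. The resource and solid below are hypothetical.

.. code-block:: python

    from dagster import resource, solid, build_solid_context

    @resource
    def connection_resource(_init_context):
        conn = {"open": True}  # stand-in for a real connection object
        try:
            yield conn
        finally:
            conn["open"] = False  # teardown runs when the enclosing context manager exits

    @solid(required_resource_keys={"conn"})
    def uses_connection(context):
        return context.resources.conn["open"]

    def test_uses_connection():
        # the generator resource forces context-manager usage of build_solid_context
        with build_solid_context(resources={"conn": connection_resource}) as context:
            assert uses_connection(context) is True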
", "current_page_name": "_modules/dagster/core/execution/context/invocation", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.execution.context.invocation"}, "logger": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.execution.context.logger

\nfrom typing import Any, Optional\n\nimport dagster._check as check\nfrom dagster.core.definitions.job_definition import JobDefinition\nfrom dagster.core.definitions.logger_definition import LoggerDefinition\nfrom dagster.core.definitions.pipeline_definition import PipelineDefinition\nfrom dagster.core.errors import DagsterInvariantViolationError\n\nfrom .output import RUN_ID_PLACEHOLDER\n\n\n
[docs]class InitLoggerContext:\n """Logger-specific initialization context.\n\n An instance of this class is made available as the first argument to the ``logger_fn`` decorated\n by :py:func:`@logger <logger>` or set on a :py:class:`LoggerDefinition`.\n\n Users should not instantiate this class.\n\n Attributes:\n logger_config (Any): The configuration data provided by the run config. The\n schema for this data is defined by ``config_schema`` on the :py:class:`LoggerDefinition`\n pipeline_def (Optional[PipelineDefinition]): The pipeline/job definition currently being executed.\n logger_def (Optional[LoggerDefinition]): The logger definition for the logger being constructed.\n run_id (str): The ID for this run of the pipeline.\n """\n\n def __init__(\n self,\n logger_config: Any,\n logger_def: Optional[LoggerDefinition] = None,\n pipeline_def: Optional[PipelineDefinition] = None,\n run_id: Optional[str] = None,\n ):\n self._logger_config = logger_config\n self._pipeline_def = check.opt_inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n self._logger_def = check.opt_inst_param(logger_def, "logger_def", LoggerDefinition)\n self._run_id = check.opt_str_param(run_id, "run_id")\n\n @property\n def logger_config(self) -> Any:\n return self._logger_config\n\n @property\n def pipeline_def(self) -> Optional[PipelineDefinition]:\n return self._pipeline_def\n\n @property\n def job_def(self) -> Optional[JobDefinition]:\n if not self._pipeline_def:\n return None\n if not isinstance(self._pipeline_def, JobDefinition):\n raise DagsterInvariantViolationError(\n "Attempted to access the .job_def property on an InitLoggerContext that was "\n "initialized with a PipelineDefinition. Please use .pipeline_def instead."\n )\n return self._pipeline_def\n\n @property\n def logger_def(self) -> Optional[LoggerDefinition]:\n return self._logger_def\n\n @property\n def run_id(self) -> Optional[str]:\n return self._run_id
\n\n\nclass UnboundInitLoggerContext(InitLoggerContext):\n """Logger initialization context outputted by ``build_init_logger_context``.\n\n Represents a context whose config has not yet been validated against a logger definition, hence\n the inability to access the `logger_def` attribute. When an instance of\n ``UnboundInitLoggerContext`` is passed to ``LoggerDefinition.initialize``, config is validated,\n and it is subsumed into an `InitLoggerContext`, which contains the logger_def validated against.\n """\n\n def __init__(self, logger_config: Any, pipeline_def: Optional[PipelineDefinition]):\n super(UnboundInitLoggerContext, self).__init__(\n logger_config, logger_def=None, pipeline_def=pipeline_def, run_id=None\n )\n\n @property\n def logger_def(self) -> LoggerDefinition:\n raise DagsterInvariantViolationError(\n "UnboundInitLoggerContext has not been validated against a logger definition."\n )\n\n @property\n def run_id(self) -> Optional[str]:\n return RUN_ID_PLACEHOLDER\n
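For context on how ``InitLoggerContext`` is consumed, the sketch below defines a simple custom logger whose ``logger_fn`` reads validated config off the init context. The config keys and logger name are assumptions chosen for illustration.

.. code-block:: python

    import logging

    from dagster import logger

    @logger(config_schema={"log_level": str, "name": str})
    def console_logger(init_context):
        """logger_fn: receives an InitLoggerContext and returns a stdlib logging.Logger."""
        level = init_context.logger_config["log_level"]
        name = init_context.logger_config["name"]

        klass = logging.getLoggerClass()
        logger_ = klass(name, level=level)
        logger_.addHandler(logging.StreamHandler())
        return logger_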
", "current_page_name": "_modules/dagster/core/execution/context/logger", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.execution.context.logger"}, "output": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.execution.context.output

\nimport warnings\nfrom typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Sequence, Union, cast\n\nimport dagster._check as check\nfrom dagster.core.definitions.asset_layer import AssetOutputInfo\nfrom dagster.core.definitions.events import (\n    AssetKey,\n    AssetMaterialization,\n    AssetObservation,\n    Materialization,\n    MetadataEntry,\n    PartitionMetadataEntry,\n)\nfrom dagster.core.definitions.op_definition import OpDefinition\nfrom dagster.core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster.core.definitions.solid_definition import SolidDefinition\nfrom dagster.core.definitions.time_window_partitions import TimeWindow\nfrom dagster.core.errors import DagsterInvariantViolationError\nfrom dagster.core.execution.plan.utils import build_resources_for_manager\n\nif TYPE_CHECKING:\n    from dagster.core.definitions import PipelineDefinition\n    from dagster.core.definitions.resource_definition import Resources\n    from dagster.core.events import DagsterEvent\n    from dagster.core.execution.context.system import StepExecutionContext\n    from dagster.core.execution.plan.outputs import StepOutputHandle\n    from dagster.core.execution.plan.plan import ExecutionPlan\n    from dagster.core.log_manager import DagsterLogManager\n    from dagster.core.system_config.objects import ResolvedRunConfig\n    from dagster.core.types.dagster_type import DagsterType\n\nRUN_ID_PLACEHOLDER = "__EPHEMERAL_RUN_ID"\n\n\n
[docs]class OutputContext:\n """\n The context object that is available to the `handle_output` method of an :py:class:`IOManager`.\n\n Attributes:\n step_key (Optional[str]): The step_key for the compute step that produced the output.\n name (Optional[str]): The name of the output that produced the output.\n pipeline_name (Optional[str]): The name of the pipeline definition.\n run_id (Optional[str]): The id of the run that produced the output.\n metadata (Optional[Dict[str, Any]]): A dict of the metadata that is assigned to the\n OutputDefinition that produced the output.\n mapping_key (Optional[str]): The key that identifies a unique mapped output. None for regular outputs.\n config (Optional[Any]): The configuration for the output.\n solid_def (Optional[SolidDefinition]): The definition of the solid that produced the output.\n dagster_type (Optional[DagsterType]): The type of this output.\n log (Optional[DagsterLogManager]): The log manager to use for this output.\n version (Optional[str]): (Experimental) The version of the output.\n resource_config (Optional[Dict[str, Any]]): The config associated with the resource that\n initializes the RootInputManager.\n resources (Optional[Resources]): The resources required by the output manager, specified by the\n `required_resource_keys` parameter.\n op_def (Optional[OpDefinition]): The definition of the op that produced the output.\n asset_info: Optional[AssetOutputInfo]: (Experimental) Asset info corresponding to the\n output.\n """\n\n def __init__(\n self,\n step_key: Optional[str] = None,\n name: Optional[str] = None,\n pipeline_name: Optional[str] = None,\n run_id: Optional[str] = None,\n metadata: Optional[Dict[str, Any]] = None,\n mapping_key: Optional[str] = None,\n config: Optional[Any] = None,\n solid_def: Optional["SolidDefinition"] = None,\n dagster_type: Optional["DagsterType"] = None,\n log_manager: Optional["DagsterLogManager"] = None,\n version: Optional[str] = None,\n resource_config: Optional[Dict[str, Any]] = None,\n resources: Optional[Union["Resources", Dict[str, Any]]] = None,\n step_context: Optional["StepExecutionContext"] = None,\n op_def: Optional["OpDefinition"] = None,\n asset_info: Optional[AssetOutputInfo] = None,\n warn_on_step_context_use: bool = False,\n ):\n from dagster.core.definitions.resource_definition import IContainsGenerator, Resources\n from dagster.core.execution.build_resources import build_resources\n\n self._step_key = step_key\n self._name = name\n self._pipeline_name = pipeline_name\n self._run_id = run_id\n self._metadata = metadata\n self._mapping_key = mapping_key\n self._config = config\n check.invariant(\n solid_def is None or op_def is None, "Can't provide both a solid_def and an op_def arg"\n )\n self._solid_def = solid_def or op_def\n self._dagster_type = dagster_type\n self._log = log_manager\n self._version = version\n self._resource_config = resource_config\n self._step_context = step_context\n self._asset_info = asset_info\n self._warn_on_step_context_use = warn_on_step_context_use\n\n if isinstance(resources, Resources):\n self._resources_cm = None\n self._resources = resources\n else:\n self._resources_cm = build_resources(\n check.opt_dict_param(resources, "resources", key_type=str)\n )\n self._resources = self._resources_cm.__enter__() # pylint: disable=no-member\n self._resources_contain_cm = isinstance(self._resources, IContainsGenerator)\n self._cm_scope_entered = False\n\n self._events: List["DagsterEvent"] = []\n self._user_events: List[Union[AssetMaterialization, 
AssetObservation, Materialization]] = []\n self._metadata_entries: Optional[List[Union[MetadataEntry, PartitionMetadataEntry]]] = None\n\n def __enter__(self):\n if self._resources_cm:\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc):\n if self._resources_cm:\n self._resources_cm.__exit__(*exc) # pylint: disable=no-member\n\n def __del__(self):\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n self._resources_cm.__exit__(None, None, None) # pylint: disable=no-member\n\n @property\n def step_key(self) -> str:\n if self._step_key is None:\n raise DagsterInvariantViolationError(\n "Attempting to access step_key, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._step_key\n\n @property\n def name(self) -> str:\n if self._name is None:\n raise DagsterInvariantViolationError(\n "Attempting to access name, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._name\n\n @property\n def pipeline_name(self) -> str:\n if self._pipeline_name is None:\n raise DagsterInvariantViolationError(\n "Attempting to access pipeline_name, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._pipeline_name\n\n @property\n def run_id(self) -> str:\n if self._run_id is None:\n raise DagsterInvariantViolationError(\n "Attempting to access run_id, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._run_id\n\n @property\n def metadata(self) -> Optional[Dict[str, Any]]:\n return self._metadata\n\n @property\n def mapping_key(self) -> Optional[str]:\n return self._mapping_key\n\n @property\n def config(self) -> Any:\n return self._config\n\n @property\n def solid_def(self) -> "SolidDefinition":\n if self._solid_def is None:\n raise DagsterInvariantViolationError(\n "Attempting to access solid_def, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._solid_def\n\n @property\n def op_def(self) -> "OpDefinition":\n if self._solid_def is None:\n raise DagsterInvariantViolationError(\n "Attempting to access op_def, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return cast(OpDefinition, self._solid_def)\n\n @property\n def dagster_type(self) -> "DagsterType":\n if self._dagster_type is None:\n raise DagsterInvariantViolationError(\n "Attempting to access dagster_type, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._dagster_type\n\n @property\n def log(self) -> "DagsterLogManager":\n if self._log is None:\n raise DagsterInvariantViolationError(\n "Attempting to access log, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._log\n\n @property\n def version(self) -> Optional[str]:\n return self._version\n\n @property\n def resource_config(self) -> Optional[Dict[str, Any]]:\n return self._resource_config\n\n @property\n def resources(self) -> Any:\n if self._resources is None:\n raise DagsterInvariantViolationError(\n "Attempting to access resources, "\n "but it was not provided when constructing the OutputContext"\n )\n\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access "\n "resources outside of context manager scope. You can use the following syntax to "\n "open a context manager: `with build_output_context(...) 
as context:`"\n )\n return self._resources\n\n @property\n def asset_info(self) -> Optional[AssetOutputInfo]:\n return self._asset_info\n\n @property\n def has_asset_key(self) -> bool:\n return self._asset_info is not None\n\n @property\n def asset_key(self) -> AssetKey:\n if self._asset_info is None:\n raise DagsterInvariantViolationError(\n "Attempting to access asset_key, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._asset_info.key\n\n @property\n def step_context(self) -> "StepExecutionContext":\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.step_context"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n if self._step_context is None:\n raise DagsterInvariantViolationError(\n "Attempting to access step_context, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._step_context\n\n @property\n def has_partition_key(self) -> bool:\n """Whether the current run is a partitioned run"""\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.has_partition_key"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n return self.step_context.has_partition_key\n\n @property\n def partition_key(self) -> str:\n """The partition key for the current run.\n\n Raises an error if the current run is not a partitioned run.\n """\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.partition_key"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n return self.step_context.partition_key\n\n @property\n def has_asset_partitions(self) -> bool:\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.has_asset_partitions"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n if self._step_context is not None:\n return self._step_context.has_asset_partitions_for_output(self.name)\n else:\n return False\n\n @property\n def asset_partition_key(self) -> str:\n """The partition key for output asset.\n\n Raises an error if the output asset has no partitioning, or if the run covers a partition\n range for the output asset.\n """\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.asset_partition_key"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n return self.step_context.asset_partition_key_for_output(self.name)\n\n @property\n def asset_partition_key_range(self) -> PartitionKeyRange:\n """The partition key range for output asset.\n\n Raises an error if the output asset has no partitioning.\n """\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using 
InputContext.upstream_output.asset_partition_key_range"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n return self.step_context.asset_partition_key_range_for_output(self.name)\n\n @property\n def asset_partitions_time_window(self) -> TimeWindow:\n """The time window for the partitions of the output asset.\n\n Raises an error if either of the following are true:\n - The output asset has no partitioning.\n - The output asset is not partitioned with a TimeWindowPartitionsDefinition.\n """\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.asset_partitions_time_window"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n return self.step_context.asset_partitions_time_window_for_output(self.name)\n\n
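The partition-related properties above (``has_asset_partitions``, ``asset_partition_key``, and related) are typically consulted inside ``handle_output``. The IO manager below is a hypothetical sketch of that pattern; it assumes the output object is a pandas DataFrame and that a ``/data`` directory exists.

.. code-block:: python

    from dagster import IOManager

    class PartitionedCSVIOManager(IOManager):
        """Hypothetical manager that writes each asset partition to its own CSV path."""

        def handle_output(self, context, obj):
            if context.has_asset_partitions:
                # one file per partition of the output asset
                path = "/data/{}/{}.csv".format(
                    "/".join(context.asset_key.path), context.asset_partition_key
                )
            else:
                path = "/data/{}.csv".format("/".join(context.get_identifier()))
            obj.to_csv(path)

        def load_input(self, context):
            raise NotImplementedError("left out of this sketch")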
[docs] def get_run_scoped_output_identifier(self) -> List[str]:\n """Utility method to get a collection of identifiers that as a whole represent a unique\n step output.\n\n The unique identifier collection consists of\n\n - ``run_id``: the id of the run which generates the output.\n Note: This method also handles the re-execution memoization logic. If the step that\n generates the output is skipped in the re-execution, the ``run_id`` will be the id\n of its parent run.\n - ``step_key``: the key for a compute step.\n - ``name``: the name of the output. (default: 'result').\n\n Returns:\n List[str, ...]: A list of identifiers, i.e. run id, step key, and output name\n """\n\n warnings.warn(\n "`OutputContext.get_run_scoped_output_identifier` is deprecated. Use "\n "`OutputContext.get_identifier` instead."\n )\n # if run_id is None and this is a re-execution, it means we failed to find its source run id\n check.invariant(\n self.run_id is not None,\n "Unable to find the run scoped output identifier: run_id is None on OutputContext.",\n )\n check.invariant(\n self.step_key is not None,\n "Unable to find the run scoped output identifier: step_key is None on OutputContext.",\n )\n check.invariant(\n self.name is not None,\n "Unable to find the run scoped output identifier: name is None on OutputContext.",\n )\n run_id = cast(str, self.run_id)\n step_key = cast(str, self.step_key)\n name = cast(str, self.name)\n\n if self.mapping_key:\n return [run_id, step_key, name, self.mapping_key]\n\n return [run_id, step_key, name]
\n\n
[docs] def get_identifier(self) -> List[str]:\n """Utility method to get a collection of identifiers that as a whole represent a unique\n step output.\n\n If not using memoization, the unique identifier collection consists of\n\n - ``run_id``: the id of the run which generates the output.\n Note: This method also handles the re-execution memoization logic. If the step that\n generates the output is skipped in the re-execution, the ``run_id`` will be the id\n of its parent run.\n - ``step_key``: the key for a compute step.\n - ``name``: the name of the output. (default: 'result').\n\n If using memoization, the ``version`` corresponding to the step output is used in place of\n the ``run_id``.\n\n Returns:\n List[str, ...]: A list of identifiers, i.e. (run_id or version), step_key, and output_name\n """\n version = self.version\n step_key = self.step_key\n name = self.name\n if version is not None:\n check.invariant(\n self.mapping_key is None,\n f"Mapping key and version both provided for output '{name}' of step '{step_key}'. "\n "Dynamic mapping is not supported when using versioning.",\n )\n identifier = ["versioned_outputs", version, step_key, name]\n else:\n run_id = self.run_id\n identifier = [run_id, step_key, name]\n if self.mapping_key:\n identifier.append(self.mapping_key)\n\n return identifier
\n\n def get_output_identifier(self) -> List[str]:\n warnings.warn(\n "`OutputContext.get_output_identifier` is deprecated. Use "\n "`OutputContext.get_identifier` instead."\n )\n\n return self.get_identifier()\n\n def get_asset_identifier(self) -> Sequence[str]:\n if self.asset_key is not None:\n if self.has_asset_partitions:\n return self.asset_key.path + [self.asset_partition_key]\n else:\n return self.asset_key.path\n else:\n check.failed("Can't get asset output identifier for an output with no asset key")\n\n def get_asset_output_identifier(self) -> Sequence[str]:\n warnings.warn(\n "`OutputContext.get_asset_output_identifier` is deprecated. Use "\n "`OutputContext.get_asset_identifier` instead."\n )\n\n return self.get_asset_identifier()\n\n
[docs] def log_event(\n self, event: Union[AssetObservation, AssetMaterialization, Materialization]\n ) -> None:\n """Log an AssetMaterialization or AssetObservation from within the body of an io manager's `handle_output` method.\n\n Events logged with this method will appear in the event log.\n\n Args:\n event (Union[AssetMaterialization, Materialization, AssetObservation]): The event to log.\n\n Examples:\n\n .. code-block:: python\n\n from dagster import IOManager, AssetMaterialization\n\n class MyIOManager(IOManager):\n def handle_output(self, context, obj):\n context.log_event(AssetMaterialization("foo"))\n """\n from dagster.core.events import DagsterEvent\n\n if isinstance(event, (AssetMaterialization, Materialization)):\n if self._step_context:\n self._events.append(\n DagsterEvent.asset_materialization(\n self._step_context,\n event,\n self._step_context.get_input_lineage(),\n )\n )\n self._user_events.append(event)\n elif isinstance(event, AssetObservation):\n if self._step_context:\n self._events.append(DagsterEvent.asset_observation(self._step_context, event))\n self._user_events.append(event)\n else:\n check.failed("Unexpected event {event}".format(event=event))
\n\n
[docs] def consume_events(self) -> Iterator["DagsterEvent"]:\n """Pops and yields all user-generated events that have been recorded from this context.\n\n If consume_events has not yet been called, this will yield all logged events since the call to `handle_output`. If consume_events has been called, it will yield all events since the last time consume_events was called. Designed for internal use. Users should never need to invoke this method.\n """\n\n events = self._events\n self._events = []\n yield from events
\n\n
[docs] def get_logged_events(\n self,\n ) -> List[Union[AssetMaterialization, Materialization, AssetObservation]]:\n """Retrieve the list of user-generated events that were logged via the context.\n\n\n User-generated events that were yielded will not appear in this list.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import IOManager, build_output_context, AssetMaterialization\n\n class MyIOManager(IOManager):\n def handle_output(self, context, obj):\n ...\n\n def test_handle_output():\n mgr = MyIOManager()\n context = build_output_context()\n mgr.handle_output(context)\n all_user_events = context.get_logged_events()\n materializations = [event for event in all_user_events if isinstance(event, AssetMaterialization)]\n ...\n """\n\n return self._user_events
\n\n
[docs] def add_output_metadata(self, metadata: Dict[str, Any]) -> None:\n """Add a dictionary of metadata to the handled output.\n\n Metadata entries added will show up in the HANDLED_OUTPUT and ASSET_MATERIALIZATION events for the run.\n\n Args:\n metadata (Dict[str, Any]): A metadata dictionary to log\n\n Examples:\n\n .. code-block:: python\n\n from dagster import IOManager\n\n class MyIOManager(IOManager):\n def handle_output(self, context, obj):\n context.add_output_metadata({"foo": "bar"})\n """\n from dagster.core.definitions.metadata import normalize_metadata\n\n self._metadata_entries = normalize_metadata(metadata, [])
\n\n
[docs] def get_logged_metadata_entries(\n self,\n ) -> List[Union[MetadataEntry, PartitionMetadataEntry]]:\n """Get the list of metadata entries that have been logged for use with this output."""\n return self._metadata_entries or []
\n\n
[docs] def consume_logged_metadata_entries(\n self,\n ) -> List[Union[MetadataEntry, PartitionMetadataEntry]]:\n """Pops and returns all user-generated metadata entries that have been recorded from this context.\n\n If consume_logged_metadata_entries has not yet been called, this will return all metadata entries logged since the call to `handle_output`. If consume_logged_metadata_entries has been called, it will return all metadata entries logged since the last time consume_logged_metadata_entries was called. Designed for internal use. Users should never need to invoke this method.\n """\n result = self._metadata_entries\n self._metadata_entries = []\n return result or []
\n\n\ndef get_output_context(\n execution_plan: "ExecutionPlan",\n pipeline_def: "PipelineDefinition",\n resolved_run_config: "ResolvedRunConfig",\n step_output_handle: "StepOutputHandle",\n run_id: Optional[str],\n log_manager: Optional["DagsterLogManager"],\n step_context: Optional["StepExecutionContext"],\n resources: Optional["Resources"],\n version: Optional[str],\n warn_on_step_context_use: bool = False,\n) -> "OutputContext":\n """\n Args:\n run_id (str): The run ID of the run that produced the output, not necessarily the run that\n the context will be used in.\n """\n\n step = execution_plan.get_step_by_key(step_output_handle.step_key)\n # get config\n solid_config = resolved_run_config.solids[step.solid_handle.to_string()]\n outputs_config = solid_config.outputs\n\n if outputs_config:\n output_config = outputs_config.get_output_manager_config(step_output_handle.output_name)\n else:\n output_config = None\n\n step_output = execution_plan.get_step_output(step_output_handle)\n output_def = pipeline_def.get_solid(step_output.solid_handle).output_def_named(step_output.name)\n\n io_manager_key = output_def.io_manager_key\n resource_config = resolved_run_config.resources[io_manager_key].config\n\n node_handle = execution_plan.get_step_by_key(step.key).solid_handle\n asset_info = pipeline_def.asset_layer.asset_info_for_output(\n node_handle=node_handle, output_name=step_output.name\n )\n\n if step_context:\n check.invariant(\n not resources,\n "Expected either resources or step context to be set, but "\n "received both. If step context is provided, resources for IO manager will be "\n "retrieved off of that.",\n )\n resources = build_resources_for_manager(io_manager_key, step_context)\n\n return OutputContext(\n step_key=step_output_handle.step_key,\n name=step_output_handle.output_name,\n pipeline_name=pipeline_def.name,\n run_id=run_id,\n metadata=output_def.metadata,\n mapping_key=step_output_handle.mapping_key,\n config=output_config,\n solid_def=pipeline_def.get_solid(step.solid_handle).definition,\n dagster_type=output_def.dagster_type,\n log_manager=log_manager,\n version=version,\n step_context=step_context,\n resource_config=resource_config,\n resources=resources,\n asset_info=asset_info,\n warn_on_step_context_use=warn_on_step_context_use,\n )\n\n\ndef step_output_version(\n pipeline_def: "PipelineDefinition",\n execution_plan: "ExecutionPlan",\n resolved_run_config: "ResolvedRunConfig",\n step_output_handle: "StepOutputHandle",\n) -> Optional[str]:\n from dagster.core.execution.resolve_versions import resolve_step_output_versions\n\n step_output_versions = resolve_step_output_versions(\n pipeline_def, execution_plan, resolved_run_config\n )\n return (\n step_output_versions[step_output_handle]\n if step_output_handle in step_output_versions\n else None\n )\n\n\n
[docs]def build_output_context(\n step_key: Optional[str] = None,\n name: Optional[str] = None,\n metadata: Optional[Dict[str, Any]] = None,\n run_id: Optional[str] = None,\n mapping_key: Optional[str] = None,\n config: Optional[Any] = None,\n dagster_type: Optional["DagsterType"] = None,\n version: Optional[str] = None,\n resource_config: Optional[Dict[str, Any]] = None,\n resources: Optional[Dict[str, Any]] = None,\n solid_def: Optional[SolidDefinition] = None,\n op_def: Optional[OpDefinition] = None,\n asset_key: Optional[Union[AssetKey, str]] = None,\n) -> "OutputContext":\n """Builds output context from provided parameters.\n\n ``build_output_context`` can be used as either a function, or a context manager. If resources\n that are also context managers are provided, then ``build_output_context`` must be used as a\n context manager.\n\n Args:\n step_key (Optional[str]): The step_key for the compute step that produced the output.\n name (Optional[str]): The name of the output that produced the output.\n metadata (Optional[Dict[str, Any]]): A dict of the metadata that is assigned to the\n OutputDefinition that produced the output.\n mapping_key (Optional[str]): The key that identifies a unique mapped output. None for regular outputs.\n config (Optional[Any]): The configuration for the output.\n dagster_type (Optional[DagsterType]): The type of this output.\n version (Optional[str]): (Experimental) The version of the output.\n resource_config (Optional[Dict[str, Any]]): The resource config to make available from the\n input context. This usually corresponds to the config provided to the resource that\n loads the output manager.\n resources (Optional[Resources]): The resources to make available from the context.\n For a given key, you can provide either an actual instance of an object, or a resource\n definition.\n solid_def (Optional[SolidDefinition]): The definition of the solid that produced the output.\n op_def (Optional[OpDefinition]): The definition of the op that produced the output.\n asset_key: Optional[Union[AssetKey, Sequence[str], str]]: The asset key corresponding to the\n output.\n\n Examples:\n\n .. 
code-block:: python\n\n build_output_context()\n\n with build_output_context(resources={"foo": context_manager_resource}) as context:\n do_something\n\n """\n from dagster.core.execution.context_creation_pipeline import initialize_console_manager\n from dagster.core.types.dagster_type import DagsterType\n\n step_key = check.opt_str_param(step_key, "step_key")\n name = check.opt_str_param(name, "name")\n metadata = check.opt_dict_param(metadata, "metadata", key_type=str)\n run_id = check.opt_str_param(run_id, "run_id", default=RUN_ID_PLACEHOLDER)\n mapping_key = check.opt_str_param(mapping_key, "mapping_key")\n dagster_type = check.opt_inst_param(dagster_type, "dagster_type", DagsterType)\n version = check.opt_str_param(version, "version")\n resource_config = check.opt_dict_param(resource_config, "resource_config", key_type=str)\n resources = check.opt_dict_param(resources, "resources", key_type=str)\n solid_def = check.opt_inst_param(solid_def, "solid_def", SolidDefinition)\n op_def = check.opt_inst_param(op_def, "op_def", OpDefinition)\n asset_key = AssetKey.from_coerceable(asset_key) if asset_key else None\n\n return OutputContext(\n step_key=step_key,\n name=name,\n pipeline_name=None,\n run_id=run_id,\n metadata=metadata,\n mapping_key=mapping_key,\n config=config,\n solid_def=solid_def,\n dagster_type=dagster_type,\n log_manager=initialize_console_manager(None),\n version=version,\n resource_config=resource_config,\n resources=resources,\n step_context=None,\n op_def=op_def,\n asset_info=AssetOutputInfo(key=asset_key) if asset_key else None,\n )
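Complementing the short example above, here is a hypothetical unit test that exercises ``handle_output`` through ``build_output_context`` and then inspects what was logged on the context; the manager class, asset key, and identifiers are assumptions.

.. code-block:: python

    from dagster import AssetMaterialization, IOManager, build_output_context

    class AuditingIOManager(IOManager):
        """Hypothetical manager that records metadata and a materialization for each output."""

        def handle_output(self, context, obj):
            context.add_output_metadata({"size": len(obj)})
            context.log_event(AssetMaterialization(asset_key="my_table"))

        def load_input(self, context):
            raise NotImplementedError("not needed for this test")

    def test_handle_output():
        manager = AuditingIOManager()
        context = build_output_context(name="result", step_key="my_step", run_id="my_run")
        manager.handle_output(context, [1, 2, 3])

        # one metadata entry labeled "size" and one logged materialization
        assert [entry.label for entry in context.get_logged_metadata_entries()] == ["size"]
        assert len(context.get_logged_events()) == 1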
\n
", "current_page_name": "_modules/dagster/core/execution/context/output", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.execution.context.output"}, "system": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.execution.context.system

\n"""\nThis module contains the execution context objects that are internal to the system.\nNot every property on these should be exposed to random Jane or Joe dagster user\nso we have a different layer of objects that encode the explicit public API\nin the user_context module\n"""\nfrom abc import ABC, abstractmethod\nfrom collections import defaultdict\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster.core.definitions.events import AssetKey, AssetLineageInfo\nfrom dagster.core.definitions.hook_definition import HookDefinition\nfrom dagster.core.definitions.job_definition import JobDefinition\nfrom dagster.core.definitions.mode import ModeDefinition\nfrom dagster.core.definitions.op_definition import OpDefinition\nfrom dagster.core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster.core.definitions.pipeline_base import IPipeline\nfrom dagster.core.definitions.pipeline_definition import PipelineDefinition\nfrom dagster.core.definitions.policy import RetryPolicy\nfrom dagster.core.definitions.reconstruct import ReconstructablePipeline\nfrom dagster.core.definitions.resource_definition import ScopedResourcesBuilder\nfrom dagster.core.definitions.solid_definition import SolidDefinition\nfrom dagster.core.definitions.step_launcher import StepLauncher\nfrom dagster.core.definitions.time_window_partitions import (\n    TimeWindow,\n    TimeWindowPartitionsDefinition,\n)\nfrom dagster.core.errors import DagsterInvariantViolationError\nfrom dagster.core.execution.plan.handle import ResolvedFromDynamicStepHandle, StepHandle\nfrom dagster.core.execution.plan.outputs import StepOutputHandle\nfrom dagster.core.execution.plan.step import ExecutionStep\nfrom dagster.core.execution.retries import RetryMode\nfrom dagster.core.executor.base import Executor\nfrom dagster.core.log_manager import DagsterLogManager\nfrom dagster.core.storage.io_manager import IOManager\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.core.storage.tags import PARTITION_NAME_TAG\nfrom dagster.core.system_config.objects import ResolvedRunConfig\nfrom dagster.core.types.dagster_type import DagsterType\n\nfrom .input import InputContext\nfrom .output import OutputContext, get_output_context\n\nif TYPE_CHECKING:\n    from dagster.core.definitions.dependency import Node, NodeHandle\n    from dagster.core.definitions.resource_definition import Resources\n    from dagster.core.events import DagsterEvent\n    from dagster.core.execution.plan.plan import ExecutionPlan\n    from dagster.core.execution.plan.state import KnownExecutionState\n    from dagster.core.instance import DagsterInstance\n\n    from .hook import HookContext\n\n\ndef is_iterable(obj: Any) -> bool:\n    try:\n        iter(obj)\n    except:\n        return False\n    return True\n\n\nclass IPlanContext(ABC):\n    """Context interface to represent run information that does not require access to user code.\n\n    The information available via this interface is accessible to the system throughout a run.\n    """\n\n    @property\n    @abstractmethod\n    def plan_data(self) -> "PlanData":\n        raise NotImplementedError()\n\n    @property\n    def pipeline(self) -> IPipeline:\n        return self.plan_data.pipeline\n\n    @property\n    def pipeline_run(self) -> PipelineRun:\n        return self.plan_data.pipeline_run\n\n    @property\n    def run_id(self) -> 
str:\n        return self.pipeline_run.run_id\n\n    @property\n    def run_config(self) -> Mapping[str, object]:\n        return self.pipeline_run.run_config\n\n    @property\n    def pipeline_name(self) -> str:\n        return self.pipeline_run.pipeline_name\n\n    @property\n    def job_name(self) -> str:\n        return self.pipeline_name\n\n    @property\n    def instance(self) -> "DagsterInstance":\n        return self.plan_data.instance\n\n    @property\n    def raise_on_error(self) -> bool:\n        return self.plan_data.raise_on_error\n\n    @property\n    def retry_mode(self) -> RetryMode:\n        return self.plan_data.retry_mode\n\n    @property\n    def execution_plan(self):\n        return self.plan_data.execution_plan\n\n    @property\n    @abstractmethod\n    def output_capture(self) -> Optional[Dict[StepOutputHandle, Any]]:\n        raise NotImplementedError()\n\n    @property\n    def log(self) -> DagsterLogManager:\n        raise NotImplementedError()\n\n    @property\n    def logging_tags(self) -> Dict[str, str]:\n        return self.log.logging_metadata.to_tags()\n\n    def has_tag(self, key: str) -> bool:\n        check.str_param(key, "key")\n        return key in self.log.logging_metadata.pipeline_tags\n\n    def get_tag(self, key: str) -> Optional[str]:\n        check.str_param(key, "key")\n        return self.log.logging_metadata.pipeline_tags.get(key)\n\n\nclass PlanData(NamedTuple):\n    """The data about a run that is available during both orchestration and execution.\n\n    This object does not contain any information that requires access to user code, such as the\n    pipeline definition and resources.\n    """\n\n    pipeline: IPipeline\n    pipeline_run: PipelineRun\n    instance: "DagsterInstance"\n    execution_plan: "ExecutionPlan"\n    raise_on_error: bool = False\n    retry_mode: RetryMode = RetryMode.DISABLED\n\n\nclass ExecutionData(NamedTuple):\n    """The data that is available to the system during execution.\n\n    This object contains information that requires access to user code, such as the pipeline\n    definition and resources.\n    """\n\n    scoped_resources_builder: ScopedResourcesBuilder\n    resolved_run_config: ResolvedRunConfig\n    pipeline_def: PipelineDefinition\n    mode_def: ModeDefinition\n\n\nclass IStepContext(IPlanContext):\n    """Interface to represent data to be available during either step orchestration or execution."""\n\n    @property\n    @abstractmethod\n    def step(self) -> ExecutionStep:\n        raise NotImplementedError()\n\n    @property\n    @abstractmethod\n    def solid_handle(self) -> "NodeHandle":\n        raise NotImplementedError()\n\n\nclass PlanOrchestrationContext(IPlanContext):\n    """Context for the orchestration of a run.\n\n    This context assumes inability to run user code directly.\n    """\n\n    def __init__(\n        self,\n        plan_data: PlanData,\n        log_manager: DagsterLogManager,\n        executor: Executor,\n        output_capture: Optional[Dict[StepOutputHandle, Any]],\n        resume_from_failure: bool = False,\n    ):\n        self._plan_data = plan_data\n        self._log_manager = log_manager\n        self._executor = executor\n        self._output_capture = output_capture\n        self._resume_from_failure = resume_from_failure\n\n    @property\n    def plan_data(self) -> PlanData:\n        return self._plan_data\n\n    @property\n    def reconstructable_pipeline(self) -> ReconstructablePipeline:\n        if not isinstance(self.pipeline, ReconstructablePipeline):\n          
  raise DagsterInvariantViolationError(\n                "reconstructable_pipeline property must be a ReconstructablePipeline"\n            )\n        return self.pipeline\n\n    @property\n    def log(self) -> DagsterLogManager:\n        return self._log_manager\n\n    @property\n    def executor(self) -> Executor:\n        return self._executor\n\n    @property\n    def output_capture(self) -> Optional[Dict[StepOutputHandle, Any]]:\n        return self._output_capture\n\n    def for_step(self, step: ExecutionStep) -> "IStepContext":\n        return StepOrchestrationContext(\n            plan_data=self.plan_data,\n            log_manager=self._log_manager.with_tags(**step.logging_tags),\n            executor=self.executor,\n            step=step,\n            output_capture=self.output_capture,\n        )\n\n    @property\n    def resume_from_failure(self) -> bool:\n        return self._resume_from_failure\n\n\nclass StepOrchestrationContext(PlanOrchestrationContext, IStepContext):\n    """Context for the orchestration of a step.\n\n    This context assumes inability to run user code directly. Thus, it does not include any resource\n    information.\n    """\n\n    def __init__(self, plan_data, log_manager, executor, step, output_capture):\n        super(StepOrchestrationContext, self).__init__(\n            plan_data, log_manager, executor, output_capture\n        )\n        self._step = step\n\n    @property\n    def step(self) -> ExecutionStep:\n        return self._step\n\n    @property\n    def solid_handle(self) -> "NodeHandle":\n        return self.step.solid_handle\n\n\nclass PlanExecutionContext(IPlanContext):\n    """Context for the execution of a plan.\n\n    This context assumes that user code can be run directly, and thus includes resource and\n    information.\n    """\n\n    def __init__(\n        self,\n        plan_data: PlanData,\n        execution_data: ExecutionData,\n        log_manager: DagsterLogManager,\n        output_capture: Optional[Dict[StepOutputHandle, Any]] = None,\n    ):\n        self._plan_data = plan_data\n        self._execution_data = execution_data\n        self._log_manager = log_manager\n        self._output_capture = output_capture\n\n    @property\n    def plan_data(self) -> PlanData:\n        return self._plan_data\n\n    @property\n    def output_capture(self) -> Optional[Dict[StepOutputHandle, Any]]:\n        return self._output_capture\n\n    def for_step(\n        self,\n        step: ExecutionStep,\n        known_state: Optional["KnownExecutionState"] = None,\n    ) -> IStepContext:\n\n        return StepExecutionContext(\n            plan_data=self.plan_data,\n            execution_data=self._execution_data,\n            log_manager=self._log_manager.with_tags(**step.logging_tags),\n            step=step,\n            output_capture=self.output_capture,\n            known_state=known_state,\n        )\n\n    @property\n    def pipeline_def(self) -> PipelineDefinition:\n        return self._execution_data.pipeline_def\n\n    @property\n    def resolved_run_config(self) -> ResolvedRunConfig:\n        return self._execution_data.resolved_run_config\n\n    @property\n    def scoped_resources_builder(self) -> ScopedResourcesBuilder:\n        return self._execution_data.scoped_resources_builder\n\n    @property\n    def log(self) -> DagsterLogManager:\n        return self._log_manager\n\n    @property\n    def partition_key(self) -> str:\n        tags = self._plan_data.pipeline_run.tags\n        check.invariant(\n            PARTITION_NAME_TAG 
in tags, "Tried to access partition_key for a non-partitioned run"\n        )\n        return tags[PARTITION_NAME_TAG]\n\n    @property\n    def partition_time_window(self) -> str:\n        pipeline_def = self._execution_data.pipeline_def\n        if not isinstance(pipeline_def, JobDefinition):\n            check.failed(\n                # isinstance(pipeline_def, JobDefinition),\n                "Can only call 'partition_time_window', when using jobs, not legacy pipelines",\n            )\n        partitions_def = pipeline_def.partitions_def\n\n        if not isinstance(partitions_def, TimeWindowPartitionsDefinition):\n            check.failed(\n                f"Expected a TimeWindowPartitionsDefinition, but instead found {type(partitions_def)}",\n            )\n\n        # mypy thinks partitions_def is <nothing> here because ????\n        return partitions_def.time_window_for_partition_key(self.partition_key)  # type: ignore\n\n    @property\n    def has_partition_key(self) -> bool:\n        return PARTITION_NAME_TAG in self._plan_data.pipeline_run.tags\n\n    def for_type(self, dagster_type: DagsterType) -> "TypeCheckContext":\n        return TypeCheckContext(\n            self.run_id, self.log, self._execution_data.scoped_resources_builder, dagster_type\n        )\n\n\nclass StepExecutionContext(PlanExecutionContext, IStepContext):\n    """Context for the execution of a step.\n\n    This context assumes that user code can be run directly, and thus includes resource and\n    information.\n    """\n\n    def __init__(\n        self,\n        plan_data: PlanData,\n        execution_data: ExecutionData,\n        log_manager: DagsterLogManager,\n        step: ExecutionStep,\n        output_capture: Optional[Dict[StepOutputHandle, Any]],\n        known_state: Optional["KnownExecutionState"],\n    ):\n        from dagster.core.execution.resources_init import get_required_resource_keys_for_step\n\n        super(StepExecutionContext, self).__init__(\n            plan_data=plan_data,\n            execution_data=execution_data,\n            log_manager=log_manager,\n            output_capture=output_capture,\n        )\n        self._step = step\n        self._required_resource_keys = get_required_resource_keys_for_step(\n            plan_data.pipeline.get_definition(),\n            step,\n            plan_data.execution_plan,\n        )\n        self._resources = execution_data.scoped_resources_builder.build(\n            self._required_resource_keys\n        )\n        self._known_state = known_state\n        self._input_lineage: List[AssetLineageInfo] = []\n\n        resources_iter = cast(Iterable, self._resources)\n\n        step_launcher_resources = [\n            resource for resource in resources_iter if isinstance(resource, StepLauncher)\n        ]\n\n        self._step_launcher: Optional[StepLauncher] = None\n        if len(step_launcher_resources) > 1:\n            raise DagsterInvariantViolationError(\n                "Multiple required resources for {described_op} have inherited StepLauncher"\n                "There should be at most one step launcher resource per {node_type}.".format(\n                    described_op=self.describe_op(), node_type=self.solid_def.node_type_str\n                )\n            )\n        elif len(step_launcher_resources) == 1:\n            self._step_launcher = step_launcher_resources[0]\n\n        self._step_exception: Optional[BaseException] = None\n\n        self._step_output_capture: Optional[Dict[StepOutputHandle, Any]] = None\n        # Enable 
step output capture if there are any hooks which will receive them.\n        # Expect in the future that hooks may control whether or not they get outputs,\n        # but for now presence of any will cause output capture.\n        if self.pipeline_def.get_all_hooks_for_handle(self.solid_handle):\n            self._step_output_capture = {}\n\n        self._output_metadata: Dict[str, Any] = {}\n        self._seen_outputs: Dict[str, Union[str, Set[str]]] = {}\n\n    @property\n    def step(self) -> ExecutionStep:\n        return self._step\n\n    @property\n    def solid_handle(self) -> "NodeHandle":\n        return self.step.solid_handle\n\n    @property\n    def required_resource_keys(self) -> Set[str]:\n        return self._required_resource_keys\n\n    @property\n    def resources(self) -> "Resources":\n        return self._resources\n\n    @property\n    def step_launcher(self) -> Optional[StepLauncher]:\n        return self._step_launcher\n\n    @property\n    def solid_def(self) -> SolidDefinition:\n        return self.solid.definition.ensure_solid_def()\n\n    @property\n    def pipeline_def(self) -> PipelineDefinition:\n        return self._execution_data.pipeline_def\n\n    @property\n    def mode_def(self) -> ModeDefinition:\n        return self._execution_data.mode_def\n\n    @property\n    def solid(self) -> "Node":\n        return self.pipeline_def.get_solid(self._step.solid_handle)\n\n    @property\n    def solid_retry_policy(self) -> Optional[RetryPolicy]:\n        return self.pipeline_def.get_retry_policy_for_handle(self.solid_handle)\n\n    def describe_op(self):\n        if isinstance(self.solid_def, OpDefinition):\n            return f'op "{str(self.solid_handle)}"'\n\n        return f'solid "{str(self.solid_handle)}"'\n\n    def get_io_manager(self, step_output_handle) -> IOManager:\n        step_output = self.execution_plan.get_step_output(step_output_handle)\n        io_manager_key = (\n            self.pipeline_def.get_solid(step_output.solid_handle)\n            .output_def_named(step_output.name)\n            .io_manager_key\n        )\n\n        output_manager = getattr(self.resources, io_manager_key)\n        return check.inst(output_manager, IOManager)\n\n    def get_output_context(self, step_output_handle) -> OutputContext:\n        return get_output_context(\n            self.execution_plan,\n            self.pipeline_def,\n            self.resolved_run_config,\n            step_output_handle,\n            self._get_source_run_id(step_output_handle),\n            log_manager=self.log,\n            step_context=self,\n            resources=None,\n            version=self.execution_plan.get_version_for_step_output_handle(step_output_handle),\n        )\n\n    def for_input_manager(\n        self,\n        name: str,\n        config: Any,\n        metadata: Any,\n        dagster_type: DagsterType,\n        source_handle: Optional[StepOutputHandle] = None,\n        resource_config: Any = None,\n        resources: Optional["Resources"] = None,\n        artificial_output_context: Optional["OutputContext"] = None,\n    ) -> InputContext:\n        if source_handle and artificial_output_context:\n            check.failed("Cannot specify both source_handle and artificial_output_context.")\n\n        upstream_output: Optional[OutputContext] = None\n\n        if source_handle is not None:\n            version = self.execution_plan.get_version_for_step_output_handle(source_handle)\n\n            # NOTE: this is using downstream step_context for upstream OutputContext. 
step_context\n            # will be set to None for 0.15 release.\n            upstream_output = get_output_context(\n                self.execution_plan,\n                self.pipeline_def,\n                self.resolved_run_config,\n                source_handle,\n                self._get_source_run_id(source_handle),\n                log_manager=self.log,\n                step_context=self,\n                resources=None,\n                version=version,\n                warn_on_step_context_use=True,\n            )\n        else:\n            upstream_output = artificial_output_context\n\n        return InputContext(\n            pipeline_name=self.pipeline_def.name,\n            name=name,\n            solid_def=self.solid_def,\n            config=config,\n            metadata=metadata,\n            upstream_output=upstream_output,\n            dagster_type=dagster_type,\n            log_manager=self.log,\n            step_context=self,\n            resource_config=resource_config,\n            resources=resources,\n        )\n\n    def for_hook(self, hook_def: HookDefinition) -> "HookContext":\n        from .hook import HookContext\n\n        return HookContext(self, hook_def)\n\n    def get_known_state(self) -> "KnownExecutionState":\n        if not self._known_state:\n            check.failed(\n                "Attempted to access KnownExecutionState but it was not provided at context creation"\n            )\n        return self._known_state\n\n    def can_load(\n        self,\n        step_output_handle: StepOutputHandle,\n    ) -> bool:\n        # can load from upstream in the same run\n        if step_output_handle in self.get_known_state().ready_outputs:\n            return True\n\n        if (\n            self._should_load_from_previous_runs(step_output_handle)\n            # should and can load from a previous run\n            and self._get_source_run_id_from_logs(step_output_handle)\n        ):\n            return True\n\n        return False\n\n    def observe_output(self, output_name: str, mapping_key: Optional[str] = None) -> None:\n        if mapping_key:\n            if output_name not in self._seen_outputs:\n                self._seen_outputs[output_name] = set()\n            cast(Set[str], self._seen_outputs[output_name]).add(mapping_key)\n        else:\n            self._seen_outputs[output_name] = "seen"\n\n    def has_seen_output(self, output_name: str, mapping_key: Optional[str] = None) -> bool:\n        if mapping_key:\n            return (\n                output_name in self._seen_outputs and mapping_key in self._seen_outputs[output_name]\n            )\n        return output_name in self._seen_outputs\n\n    def add_output_metadata(\n        self,\n        metadata: Mapping[str, Any],\n        output_name: Optional[str] = None,\n        mapping_key: Optional[str] = None,\n    ) -> None:\n\n        if output_name is None and len(self.solid_def.output_defs) == 1:\n            output_def = self.solid_def.output_defs[0]\n            output_name = output_def.name\n        elif output_name is None:\n            raise DagsterInvariantViolationError(\n                "Attempted to log metadata without providing output_name, but multiple outputs exist. 
Please provide an output_name to the invocation of `context.add_output_metadata`."\n            )\n        else:\n            output_def = self.solid_def.output_def_named(output_name)\n\n        if self.has_seen_output(output_name, mapping_key):\n            output_desc = (\n                f"output '{output_def.name}'"\n                if not mapping_key\n                else f"output '{output_def.name}' with mapping_key '{mapping_key}'"\n            )\n            raise DagsterInvariantViolationError(\n                f"In {self.solid_def.node_type_str} '{self.solid.name}', attempted to log output metadata for {output_desc} which has already been yielded. Metadata must be logged before the output is yielded."\n            )\n        if output_def.is_dynamic and not mapping_key:\n            raise DagsterInvariantViolationError(\n                f"In {self.solid_def.node_type_str} '{self.solid.name}', attempted to log metadata for dynamic output '{output_def.name}' without providing a mapping key. When logging metadata for a dynamic output, it is necessary to provide a mapping key."\n            )\n\n        if output_name in self._output_metadata:\n            if not mapping_key or mapping_key in self._output_metadata[output_name]:\n                raise DagsterInvariantViolationError(\n                    f"In {self.solid_def.node_type_str} '{self.solid.name}', attempted to log metadata for output '{output_name}' more than once."\n                )\n        if mapping_key:\n            if not output_name in self._output_metadata:\n                self._output_metadata[output_name] = {}\n            self._output_metadata[output_name][mapping_key] = metadata\n\n        else:\n            self._output_metadata[output_name] = metadata\n\n    def get_output_metadata(\n        self, output_name: str, mapping_key: Optional[str] = None\n    ) -> Optional[Mapping[str, Any]]:\n        metadata = self._output_metadata.get(output_name)\n        if mapping_key and metadata:\n            return metadata.get(mapping_key)\n        return metadata\n\n    def _get_source_run_id_from_logs(self, step_output_handle: StepOutputHandle) -> Optional[str]:\n        from dagster.core.events import DagsterEventType\n\n        # walk through event logs to find the right run_id based on the run lineage\n        run_group = self.instance.get_run_group(self.run_id)\n        if run_group is None:\n            check.failed(f"Failed to load run group {self.run_id}")\n\n        _, runs = run_group\n        run_id_to_parent_run_id = {run.run_id: run.parent_run_id for run in runs}\n        source_run_id = self.pipeline_run.parent_run_id\n        while source_run_id:\n            # note: this would cost N db calls where N = number of parent runs\n            step_output_record = self.instance.all_logs(\n                source_run_id, of_type=DagsterEventType.STEP_OUTPUT\n            )\n            # if the parent run has yielded an StepOutput event for the given step output,\n            # we find the source run id\n            for r in step_output_record:\n                if r.dagster_event.step_output_data.step_output_handle == step_output_handle:\n                    return source_run_id\n            # else, keep looking backwards\n            source_run_id = run_id_to_parent_run_id.get(source_run_id)\n\n        # When a fixed path is provided via io manager, it's able to run step subset using an execution\n        # plan when the ascendant outputs were not previously created by dagster-controlled\n        # computations. 
for example, in backfills, with fixed path io manager, we allow users to\n        # "re-execute" runs with steps where the outputs weren't previously stored by dagster.\n\n        # Warn about this special case because it will also reach here when all previous runs have\n        # skipped yielding this output. From the logs, we have no easy way to differentiate the fixed\n        # path case and the skipping case, until we record the skipping info in KnownExecutionState,\n        # i.e. resolve https://github.com/dagster-io/dagster/issues/3511\n        self.log.warn(\n            f"No previously stored outputs found for source {step_output_handle}. "\n            "This is either because you are using an IO Manager that does not depend on run ID, "\n            "or because all the previous runs have skipped the output in conditional execution."\n        )\n        return None\n\n    def _should_load_from_previous_runs(self, step_output_handle: StepOutputHandle) -> bool:\n        # should not load if not a re-execution\n        if self.pipeline_run.parent_run_id is None:\n            return False\n        # should not load if re-executing the entire pipeline\n        if self.pipeline_run.step_keys_to_execute is None:\n            return False\n\n        # should not load if the entire dynamic step is being executed in the current run\n        handle = StepHandle.parse_from_key(step_output_handle.step_key)\n        if (\n            isinstance(handle, ResolvedFromDynamicStepHandle)\n            and handle.unresolved_form.to_key() in self.pipeline_run.step_keys_to_execute\n        ):\n            return False\n\n        # should not load if this step is being executed in the current run\n        return step_output_handle.step_key not in self.pipeline_run.step_keys_to_execute\n\n    def _get_source_run_id(self, step_output_handle: StepOutputHandle) -> Optional[str]:\n        if self._should_load_from_previous_runs(step_output_handle):\n            return self._get_source_run_id_from_logs(step_output_handle)\n        else:\n            return self.pipeline_run.run_id\n\n    def capture_step_exception(self, exception: BaseException):\n        self._step_exception = check.inst_param(exception, "exception", BaseException)\n\n    @property\n    def step_exception(self) -> Optional[BaseException]:\n        return self._step_exception\n\n    @property\n    def step_output_capture(self) -> Optional[Dict[StepOutputHandle, Any]]:\n        return self._step_output_capture\n\n    @property\n    def previous_attempt_count(self) -> int:\n        return self.get_known_state().get_retry_state().get_attempt_count(self._step.key)\n\n    @property\n    def op_config(self) -> Any:\n        solid_config = self.resolved_run_config.solids.get(str(self.solid_handle))\n        return solid_config.config if solid_config else None\n\n    def has_asset_partitions_for_input(self, input_name: str) -> bool:\n        op_config = self.op_config\n\n        if is_iterable(op_config) and "assets" in op_config:\n            all_input_asset_partitions = op_config["assets"].get("input_partitions")\n            if all_input_asset_partitions is not None:\n                this_input_asset_partitions = all_input_asset_partitions.get(input_name)\n                if this_input_asset_partitions is not None:\n                    return True\n\n        return False\n\n    def asset_partition_key_range_for_input(self, input_name: str) -> PartitionKeyRange:\n        op_config = self.op_config\n        if is_iterable(op_config) and "assets" in 
op_config:\n            all_input_asset_partitions = op_config["assets"].get("input_partitions")\n            if all_input_asset_partitions is not None:\n                this_input_asset_partitions = all_input_asset_partitions.get(input_name)\n                if this_input_asset_partitions is not None:\n                    return PartitionKeyRange(\n                        this_input_asset_partitions["start"], this_input_asset_partitions["end"]\n                    )\n\n        check.failed("The input has no asset partitions")\n\n    def asset_partition_key_for_input(self, input_name: str) -> str:\n        start, end = self.asset_partition_key_range_for_input(input_name)\n        if start == end:\n            return start\n        else:\n            check.failed(\n                f"Tried to access partition key for input '{input_name}' of step '{self.step.key}', "\n                f"but the step input has a partition range: '{start}' to '{end}'."\n            )\n\n    def has_asset_partitions_for_output(self, output_name: str) -> bool:\n        op_config = self.op_config\n        if is_iterable(op_config) and "assets" in op_config:\n            all_output_asset_partitions = op_config["assets"].get("output_partitions")\n            if all_output_asset_partitions is not None:\n                this_output_asset_partitions = all_output_asset_partitions.get(output_name)\n                if this_output_asset_partitions is not None:\n                    return True\n\n        return False\n\n    def asset_partition_key_range_for_output(self, output_name: str) -> PartitionKeyRange:\n        op_config = self.op_config\n        if is_iterable(op_config) and "assets" in op_config:\n            all_output_asset_partitions = op_config["assets"].get("output_partitions")\n            if all_output_asset_partitions is not None:\n                this_output_asset_partitions = all_output_asset_partitions.get(output_name)\n                if this_output_asset_partitions is not None:\n                    return PartitionKeyRange(\n                        this_output_asset_partitions["start"], this_output_asset_partitions["end"]\n                    )\n\n        check.failed("The output has no asset partitions")\n\n    def asset_partition_key_for_output(self, output_name: str) -> str:\n        start, end = self.asset_partition_key_range_for_output(output_name)\n        if start == end:\n            return start\n        else:\n            check.failed(\n                f"Tried to access partition key for output '{output_name}' of step '{self.step.key}', "\n                f"but the step output has a partition range: '{start}' to '{end}'."\n            )\n\n    def asset_partitions_time_window_for_output(self, output_name: str) -> TimeWindow:\n        """The time window for the partitions of the asset correponding to the given output.\n\n        Raises an error if either of the following are true:\n        - The output asset has no partitioning.\n        - The output asset is not partitioned with a TimeWindowPartitionsDefinition.\n        """\n        asset_info = self.pipeline_def.asset_layer.asset_info_for_output(\n            self.solid_handle, output_name\n        )\n        partitions_def = asset_info.partitions_def if asset_info else None\n\n        if not partitions_def:\n            raise ValueError(\n                "Tried to get asset partitions for an output that does not correspond to a "\n                "partitioned asset."\n            )\n\n        if not isinstance(partitions_def, 
TimeWindowPartitionsDefinition):\n            raise ValueError(\n                "Tried to get asset partitions for an output that correponds to a partitioned "\n                "asset that is not partitioned with a TimeWindowPartitionsDefinition."\n            )\n        partition_key_range = self.asset_partition_key_range_for_output(output_name)\n        return TimeWindow(\n            # mypy thinks partitions_def is <nothing> here because ????\n            partitions_def.time_window_for_partition_key(partition_key_range.start).start,  # type: ignore\n            partitions_def.time_window_for_partition_key(partition_key_range.end).end,  # type: ignore\n        )\n\n    def get_input_lineage(self) -> List[AssetLineageInfo]:\n        if not self._input_lineage:\n\n            for step_input in self.step.step_inputs:\n                input_def = self.solid_def.input_def_named(step_input.name)\n                dagster_type = input_def.dagster_type\n\n                if dagster_type.is_nothing:\n                    continue\n\n                self._input_lineage.extend(step_input.source.get_asset_lineage(self, input_def))\n\n        self._input_lineage = _dedup_asset_lineage(self._input_lineage)\n\n        return self._input_lineage\n\n\ndef _dedup_asset_lineage(asset_lineage: List[AssetLineageInfo]) -> List[AssetLineageInfo]:\n    """Method to remove duplicate specifications of the same Asset/Partition pair from the lineage\n    information. Duplicates can occur naturally when calculating transitive dependencies from solids\n    with multiple Outputs, which in turn have multiple Inputs (because each Output of the solid will\n    inherit all dependencies from all of the solid Inputs).\n    """\n    key_partition_mapping: Dict[AssetKey, Set[str]] = defaultdict(set)\n\n    for lineage_info in asset_lineage:\n        if not lineage_info.partitions:\n            key_partition_mapping[lineage_info.asset_key] |= set()\n        for partition in lineage_info.partitions:\n            key_partition_mapping[lineage_info.asset_key].add(partition)\n    return [\n        AssetLineageInfo(asset_key=asset_key, partitions=partitions)\n        for asset_key, partitions in key_partition_mapping.items()\n    ]\n\n\n
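For orientation only: the asset-partition helpers above (``has_asset_partitions_for_input``, ``asset_partition_key_range_for_output``, and friends) parse a nested shape out of ``op_config``. The sketch below mirrors that shape with made-up input and output names; in practice this config is populated by the framework rather than written by hand.

.. code-block:: python

    from dagster.core.definitions.partition_key_range import PartitionKeyRange

    # Shape expected under op_config, per the lookups above (illustrative values).
    op_config = {
        "assets": {
            "input_partitions": {
                "upstream_table": {"start": "2022-01-01", "end": "2022-01-03"},
            },
            "output_partitions": {
                "my_table": {"start": "2022-01-01", "end": "2022-01-03"},
            },
        }
    }

    # asset_partition_key_range_for_output("my_table") reduces to this construction:
    window = op_config["assets"]["output_partitions"]["my_table"]
    key_range = PartitionKeyRange(window["start"], window["end"])
    assert key_range.start == "2022-01-01" and key_range.end == "2022-01-03"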
[docs]class TypeCheckContext:\n """The ``context`` object available to a type check function on a DagsterType.\n\n Attributes:\n log (DagsterLogManager): Centralized log dispatch from user code.\n resources (Any): An object whose attributes contain the resources available to this op.\n run_id (str): The id of this job run.\n """\n\n def __init__(\n self,\n run_id: str,\n log_manager: DagsterLogManager,\n scoped_resources_builder: ScopedResourcesBuilder,\n dagster_type: DagsterType,\n ):\n self._run_id = run_id\n self._log = log_manager\n self._resources = scoped_resources_builder.build(dagster_type.required_resource_keys)\n\n @property\n def resources(self) -> "Resources":\n return self._resources\n\n @property\n def run_id(self) -> str:\n return self._run_id\n\n @property\n def log(self) -> DagsterLogManager:\n return self._log
\n
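A minimal sketch (assumed example, not from the module above) of where ``TypeCheckContext`` shows up in user code: it is the ``context`` argument passed to a ``DagsterType``'s ``type_check_fn``.

.. code-block:: python

    from dagster import DagsterType, TypeCheck, check_dagster_type

    def positive_int_check(context, value):
        # context is a TypeCheckContext: log, resources, and run_id are available.
        context.log.info(f"type-checking a value in run {context.run_id}")
        return TypeCheck(
            success=isinstance(value, int) and value > 0,
            description="value must be a positive int",
        )

    PositiveInt = DagsterType(name="PositiveInt", type_check_fn=positive_int_check)

    assert check_dagster_type(PositiveInt, 3).success
    assert not check_dagster_type(PositiveInt, -1).success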
", "current_page_name": "_modules/dagster/core/execution/context/system", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.execution.context.system"}}, "execute_in_process_result": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.execution.execute_in_process_result

\nfrom typing import Any, Dict, List, Optional, Union, cast\n\nimport dagster._check as check\nfrom dagster.core.definitions import NodeDefinition, NodeHandle\nfrom dagster.core.definitions.events import AssetMaterialization, AssetObservation, Materialization\nfrom dagster.core.definitions.utils import DEFAULT_OUTPUT\nfrom dagster.core.errors import DagsterError, DagsterInvariantViolationError\nfrom dagster.core.events import (\n    AssetObservationData,\n    DagsterEvent,\n    DagsterEventType,\n    StepMaterializationData,\n)\nfrom dagster.core.execution.plan.outputs import StepOutputHandle\nfrom dagster.core.storage.pipeline_run import DagsterRun\n\n\n
[docs]class ExecuteInProcessResult:\n def __init__(\n self,\n node_def: NodeDefinition,\n all_events: List[DagsterEvent],\n dagster_run: DagsterRun,\n output_capture: Optional[Dict[StepOutputHandle, Any]],\n ):\n self._node_def = node_def\n\n # If top-level result, no handle will be provided\n self._handle = NodeHandle(node_def.name, parent=None)\n self._event_list = all_events\n self._dagster_run = dagster_run\n\n self._output_capture = check.opt_dict_param(\n output_capture, "output_capture", key_type=StepOutputHandle\n )\n\n @property\n def success(self) -> bool:\n """bool: Whether execution was successful."""\n return self._dagster_run.is_success\n\n @property\n def all_node_events(self) -> List[DagsterEvent]:\n """List[DagsterEvent]: All dagster events from the in-process execution."""\n\n step_events = []\n\n for node_name in self._node_def.ensure_graph_def().node_dict.keys():\n handle = NodeHandle(node_name, None)\n step_events += _filter_events_by_handle(self._event_list, handle)\n\n return step_events\n\n @property\n def all_events(self) -> List[DagsterEvent]:\n """List[DagsterEvent]: All dagster events emitted during in-process execution."""\n\n return self._event_list\n\n @property\n def run_id(self) -> str:\n """str: The run id for the executed run"""\n return self._dagster_run.run_id\n\n @property\n def dagster_run(self) -> DagsterRun:\n """DagsterRun: the DagsterRun object for the completed execution."""\n return self._dagster_run\n\n
[docs] def events_for_node(self, node_name: str) -> List[DagsterEvent]:\n """Retrieves all dagster events for a specific node.\n\n Args:\n node_name (str): The name of the node for which events should be retrieved.\n\n Returns:\n List[DagsterEvent]: A list of all dagster events associated with the provided node name.\n """\n check.str_param(node_name, "node_name")\n\n return _filter_events_by_handle(self._event_list, NodeHandle.from_string(node_name))
\n\n def asset_materializations_for_node(\n self, node_name\n ) -> List[Union[Materialization, AssetMaterialization]]:\n return [\n cast(StepMaterializationData, event.event_specific_data).materialization\n for event in self.events_for_node(node_name)\n if event.event_type_value == DagsterEventType.ASSET_MATERIALIZATION.value\n ]\n\n def asset_observations_for_node(self, node_name) -> List[AssetObservation]:\n return [\n cast(AssetObservationData, event.event_specific_data).asset_observation\n for event in self.events_for_node(node_name)\n if event.event_type_value == DagsterEventType.ASSET_OBSERVATION.value\n ]\n\n
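A usage sketch for the two accessors above; the op and job names are made up.

.. code-block:: python

    from dagster import AssetMaterialization, job, op

    @op
    def emit_table(context):
        context.log_event(AssetMaterialization(asset_key="my_table"))
        return 1

    @job
    def my_job():
        emit_table()

    result = my_job.execute_in_process()
    [materialization] = result.asset_materializations_for_node("emit_table")
    assert materialization.asset_key.to_string() == '["my_table"]'
    assert result.asset_observations_for_node("emit_table") == []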
[docs] def output_value(self, output_name: str = DEFAULT_OUTPUT) -> Any:\n """Retrieves output of top-level job, if an output is returned.\n\n If the top-level job has no output, calling this method will result in a\n DagsterInvariantViolationError.\n\n Args:\n output_name (Optional[str]): The name of the output to retrieve. Defaults to `result`,\n the default output name in dagster.\n\n Returns:\n Any: The value of the retrieved output.\n """\n\n check.str_param(output_name, "output_name")\n\n graph_def = self._node_def.ensure_graph_def()\n if not graph_def.has_output(output_name) and len(graph_def.output_mappings) == 0:\n raise DagsterInvariantViolationError(\n f"Attempted to retrieve top-level outputs for '{graph_def.name}', which has no outputs."\n )\n elif not graph_def.has_output(output_name):\n raise DagsterInvariantViolationError(\n f"Could not find top-level output '{output_name}' in '{graph_def.name}'."\n )\n # Resolve the first layer of mapping\n output_mapping = graph_def.get_output_mapping(output_name)\n mapped_node = graph_def.solid_named(output_mapping.maps_from.solid_name)\n origin_output_def, origin_handle = mapped_node.definition.resolve_output_to_origin(\n output_mapping.maps_from.output_name,\n NodeHandle(mapped_node.name, None),\n )\n\n # Get output from origin node\n return _filter_outputs_by_handle(\n self._output_capture, origin_handle, origin_output_def.name\n )
\n\n
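A usage sketch for ``output_value``: when a graph maps an inner op's output to a top-level output (named ``result`` by default), that value can be read back after an in-process run. Names below are illustrative.

.. code-block:: python

    from dagster import graph, op

    @op
    def return_five() -> int:
        return 5

    @op
    def add_one(x: int) -> int:
        return x + 1

    @graph
    def add_one_to_five():
        return add_one(return_five())

    result = add_one_to_five.execute_in_process()
    assert result.success
    assert result.output_value() == 6  # the top-level output mapped from add_one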
[docs] def output_for_node(self, node_str: str, output_name: Optional[str] = DEFAULT_OUTPUT) -> Any:\n """Retrieves output value with a particular name from the in-process run of the job.\n\n Args:\n node_str (str): Name of the op/graph whose output should be retrieved. If the intended\n graph/op is nested within another graph, the syntax is `outer_graph.inner_node`.\n output_name (Optional[str]): Name of the output on the op/graph to retrieve. Defaults to\n `result`, the default output name in dagster.\n\n Returns:\n Any: The value of the retrieved output.\n """\n\n # resolve handle of node that node_str is referring to\n target_handle = NodeHandle.from_string(node_str)\n target_node_def = self._node_def.ensure_graph_def().get_solid(target_handle).definition\n origin_output_def, origin_handle = target_node_def.resolve_output_to_origin(\n output_name, NodeHandle.from_string(node_str)\n )\n\n # retrieve output value from resolved handle\n return _filter_outputs_by_handle(\n self._output_capture, origin_handle, origin_output_def.name\n )
\n\n
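A sketch of the dotted ``outer_graph.inner_node`` syntax described above; names are illustrative.

.. code-block:: python

    from dagster import graph, job, op

    @op
    def emit() -> int:
        return 10

    @graph
    def inner():
        return emit()

    @job
    def outer():
        inner()

    result = outer.execute_in_process()
    assert result.output_for_node("inner") == 10        # via the graph's output mapping
    assert result.output_for_node("inner.emit") == 10   # the nested op addressed directly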
[docs] def get_job_success_event(self):\n """Returns a DagsterEvent with type DagsterEventType.PIPELINE_SUCCESS if it occurred during\n execution.\n """\n events = list(\n filter(\n lambda event: event.event_type == DagsterEventType.PIPELINE_SUCCESS, self.all_events\n )\n )\n\n if len(events) == 0:\n raise DagsterError("No event of type DagsterEventType.PIPELINE_SUCCESS found.")\n\n return events[0]
\n\n
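A sketch of inspecting run-level events after an in-process execution (names illustrative); ``raise_on_error=False`` keeps a failure from raising so the failure event can be examined instead.

.. code-block:: python

    from dagster import job, op

    @op
    def flaky():
        return "ok"

    @job
    def tiny_job():
        flaky()

    result = tiny_job.execute_in_process(raise_on_error=False)
    if result.success:
        event = result.get_job_success_event()
        assert event.event_type_value == "PIPELINE_SUCCESS"
    else:
        event = result.get_job_failure_event()
        assert event.event_type_value == "PIPELINE_FAILURE"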
[docs] def get_job_failure_event(self):\n """Returns a DagsterEvent with type DagsterEventType.PIPELINE_FAILURE if it occurred during\n execution.\n """\n events = list(\n filter(\n lambda event: event.event_type == DagsterEventType.PIPELINE_FAILURE, self.all_events\n )\n )\n\n if len(events) == 0:\n raise DagsterError("No event of type DagsterEventType.PIPELINE_FAILURE found.")\n\n return events[0]
\n\n\ndef _filter_events_by_handle(\n event_list: List[DagsterEvent], handle: NodeHandle\n) -> List[DagsterEvent]:\n step_events = []\n for event in event_list:\n if event.is_step_event:\n event_handle = cast(\n NodeHandle, event.solid_handle\n ) # step events are guaranteed to have a node handle.\n if event_handle.is_or_descends_from(handle):\n step_events.append(event)\n\n return step_events\n\n\ndef _filter_outputs_by_handle(\n output_dict: Dict[StepOutputHandle, Any],\n node_handle: NodeHandle,\n output_name: str,\n) -> Any:\n mapped_outputs = {}\n step_key = str(node_handle)\n output_found = False\n for step_output_handle, value in output_dict.items():\n\n # For the mapped output case, where step keys are in the format\n # "step_key[upstream_mapped_output_name]" within the step output handle.\n if (\n step_output_handle.step_key.startswith(f"{step_key}[")\n and step_output_handle.output_name == output_name\n ):\n output_found = True\n key_start = step_output_handle.step_key.find("[")\n key_end = step_output_handle.step_key.find("]")\n upstream_mapped_output_name = step_output_handle.step_key[key_start + 1 : key_end]\n mapped_outputs[upstream_mapped_output_name] = value\n\n # For all other cases, search for exact match.\n elif (\n step_key == step_output_handle.step_key\n and step_output_handle.output_name == output_name\n ):\n output_found = True\n if not step_output_handle.mapping_key:\n return output_dict[step_output_handle]\n mapped_outputs[step_output_handle.mapping_key] = value\n\n if not output_found:\n raise DagsterInvariantViolationError(\n f"No outputs found for output '{output_name}' from node '{node_handle}'."\n )\n return mapped_outputs\n
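``_filter_outputs_by_handle`` returns a dict keyed by mapping key whenever outputs were produced per mapping key (dynamic outputs, or mapped steps whose step keys take the bracketed ``step_key[mapping_key]`` form). A sketch with made-up names, using a dynamic output:

.. code-block:: python

    from dagster import DynamicOut, DynamicOutput, graph, op

    @op(out=DynamicOut(int))
    def fan_out():
        for i in range(3):
            yield DynamicOutput(i, mapping_key=str(i))

    @graph
    def dynamic_graph():
        fan_out()

    result = dynamic_graph.execute_in_process()
    # Mapped outputs come back keyed by mapping key.
    assert result.output_for_node("fan_out") == {"0": 0, "1": 1, "2": 2}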
", "current_page_name": "_modules/dagster/core/execution/execute_in_process_result", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.execution.execute_in_process_result"}, "results": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.execution.results

\nfrom collections import defaultdict\n\nimport dagster._check as check\nfrom dagster.core.definitions import GraphDefinition, Node, NodeHandle, PipelineDefinition\nfrom dagster.core.definitions.utils import DEFAULT_OUTPUT\nfrom dagster.core.errors import DagsterInvariantViolationError\nfrom dagster.core.events import DagsterEvent, DagsterEventType\nfrom dagster.core.execution.plan.outputs import StepOutputHandle\nfrom dagster.core.execution.plan.step import StepKind\nfrom dagster.core.execution.plan.utils import build_resources_for_manager\n\n\ndef _construct_events_by_step_key(event_list):\n    events_by_step_key = defaultdict(list)\n    for event in event_list:\n        events_by_step_key[event.step_key].append(event)\n\n    return dict(events_by_step_key)\n\n\nclass GraphExecutionResult:\n    def __init__(\n        self,\n        container,\n        event_list,\n        reconstruct_context,\n        pipeline_def,\n        handle=None,\n        output_capture=None,\n    ):\n        self.container = check.inst_param(container, "container", GraphDefinition)\n        self.event_list = check.list_param(event_list, "step_event_list", of_type=DagsterEvent)\n        self.reconstruct_context = check.callable_param(reconstruct_context, "reconstruct_context")\n        self.pipeline_def = check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n        self.handle = check.opt_inst_param(handle, "handle", NodeHandle)\n        self.output_capture = check.opt_dict_param(\n            output_capture, "output_capture", key_type=StepOutputHandle\n        )\n        self._events_by_step_key = _construct_events_by_step_key(event_list)\n\n    @property\n    def success(self):\n        """bool: Whether all steps in the execution were successful."""\n        return all([not event.is_failure for event in self.event_list])\n\n    @property\n    def step_event_list(self):\n        """List[DagsterEvent] The full list of events generated by steps in the execution.\n\n        Excludes events generated by the pipeline lifecycle, e.g., ``PIPELINE_START``.\n        """\n        return [event for event in self.event_list if event.is_step_event]\n\n    @property\n    def events_by_step_key(self):\n        return self._events_by_step_key\n\n    def result_for_solid(self, name):\n        """Get the result of a top level solid.\n\n        Args:\n            name (str): The name of the top-level solid or aliased solid for which to retrieve the\n                result.\n\n        Returns:\n            Union[CompositeSolidExecutionResult, SolidExecutionResult]: The result of the solid\n            execution within the pipeline.\n        """\n        if not self.container.has_solid_named(name):\n            raise DagsterInvariantViolationError(\n                "Tried to get result for solid '{name}' in '{container}'. No such top level "\n                "solid.".format(name=name, container=self.container.name)\n            )\n\n        return self.result_for_handle(NodeHandle(name, None))\n\n    def output_for_solid(self, handle_str, output_name=DEFAULT_OUTPUT):\n        """Get the output of a solid by its solid handle string and output name.\n\n        Args:\n            handle_str (str): The string handle for the solid.\n            output_name (str): Optional. 
The name of the output, default to DEFAULT_OUTPUT.\n\n        Returns:\n            The output value for the handle and output_name.\n        """\n        check.str_param(handle_str, "handle_str")\n        check.str_param(output_name, "output_name")\n        return self.result_for_handle(NodeHandle.from_string(handle_str)).output_value(output_name)\n\n    @property\n    def solid_result_list(self):\n        """List[Union[CompositeSolidExecutionResult, SolidExecutionResult]]: The results for each\n        top level solid."""\n        return [self.result_for_solid(solid.name) for solid in self.container.solids]\n\n    def _result_for_handle(self, solid, handle):\n        if not solid:\n            raise DagsterInvariantViolationError(\n                "Can not find solid handle {handle_str}.".format(handle_str=handle.to_string())\n            )\n\n        events_by_kind = defaultdict(list)\n\n        if solid.is_graph:\n            events = []\n            for event in self.event_list:\n                if event.is_step_event:\n                    if event.solid_handle.is_or_descends_from(handle.with_ancestor(self.handle)):\n                        events_by_kind[event.step_kind].append(event)\n                        events.append(event)\n\n            return CompositeSolidExecutionResult(\n                solid,\n                events,\n                events_by_kind,\n                self.reconstruct_context,\n                self.pipeline_def,\n                handle=handle.with_ancestor(self.handle),\n                output_capture=self.output_capture,\n            )\n        else:\n            for event in self.event_list:\n                if event.is_step_event:\n                    if event.solid_handle.is_or_descends_from(handle.with_ancestor(self.handle)):\n                        events_by_kind[event.step_kind].append(event)\n\n            return SolidExecutionResult(\n                solid,\n                events_by_kind,\n                self.reconstruct_context,\n                self.pipeline_def,\n                output_capture=self.output_capture,\n            )\n\n    def result_for_handle(self, handle):\n        """Get the result of a solid by its solid handle.\n\n        This allows indexing into top-level solids to retrieve the results of children of\n        composite solids.\n\n        Args:\n            handle (Union[str,NodeHandle]): The handle for the solid.\n\n        Returns:\n            Union[CompositeSolidExecutionResult, SolidExecutionResult]: The result of the given\n            solid.\n        """\n        if isinstance(handle, str):\n            handle = NodeHandle.from_string(handle)\n        else:\n            check.inst_param(handle, "handle", NodeHandle)\n\n        solid = self.container.get_solid(handle)\n\n        return self._result_for_handle(solid, handle)\n\n\n
[docs]class PipelineExecutionResult(GraphExecutionResult):\n """The result of executing a pipeline.\n\n Returned by :py:func:`execute_pipeline`. Users should not instantiate this class directly.\n """\n\n def __init__(\n self,\n pipeline_def,\n run_id,\n event_list,\n reconstruct_context,\n output_capture=None,\n ):\n self.run_id = check.str_param(run_id, "run_id")\n check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n\n super(PipelineExecutionResult, self).__init__(\n container=pipeline_def.graph,\n event_list=event_list,\n reconstruct_context=reconstruct_context,\n pipeline_def=pipeline_def,\n output_capture=output_capture,\n )
\n\n\n
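A sketch of the legacy pipeline-style accessors defined on ``GraphExecutionResult``; the solid and pipeline names are made up.

.. code-block:: python

    from dagster import execute_pipeline, pipeline, solid

    @solid
    def return_one():
        return 1

    @pipeline
    def my_pipeline():
        return_one()

    result = execute_pipeline(my_pipeline)  # a PipelineExecutionResult
    assert result.success
    assert result.result_for_solid("return_one").output_value() == 1
    assert result.output_for_solid("return_one") == 1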
[docs]class CompositeSolidExecutionResult(GraphExecutionResult):\n """Execution result for a composite solid in a pipeline.\n\n Users should not instantiate this class directly.\n """\n\n def __init__(\n self,\n solid,\n event_list,\n step_events_by_kind,\n reconstruct_context,\n pipeline_def,\n handle=None,\n output_capture=None,\n ):\n check.inst_param(solid, "solid", Node)\n check.invariant(\n solid.is_graph,\n desc="Tried to instantiate a CompositeSolidExecutionResult with a noncomposite solid",\n )\n self.solid = solid\n self.step_events_by_kind = check.dict_param(\n step_events_by_kind, "step_events_by_kind", key_type=StepKind, value_type=list\n )\n self.output_capture = check.opt_dict_param(\n output_capture, "output_capture", key_type=StepOutputHandle\n )\n super(CompositeSolidExecutionResult, self).__init__(\n container=solid.definition,\n event_list=event_list,\n reconstruct_context=reconstruct_context,\n pipeline_def=pipeline_def,\n handle=handle,\n output_capture=output_capture,\n )\n\n def output_values_for_solid(self, name):\n check.str_param(name, "name")\n return self.result_for_solid(name).output_values\n\n def output_values_for_handle(self, handle_str):\n check.str_param(handle_str, "handle_str")\n\n return self.result_for_handle(handle_str).output_values\n\n def output_value_for_solid(self, name, output_name=DEFAULT_OUTPUT):\n check.str_param(name, "name")\n check.str_param(output_name, "output_name")\n\n return self.result_for_solid(name).output_value(output_name)\n\n def output_value_for_handle(self, handle_str, output_name=DEFAULT_OUTPUT):\n check.str_param(handle_str, "handle_str")\n check.str_param(output_name, "output_name")\n\n return self.result_for_handle(handle_str).output_value(output_name)\n\n @property\n def output_values(self):\n values = {}\n\n for output_name in self.solid.definition.output_dict:\n output_mapping = self.solid.definition.get_output_mapping(output_name)\n\n inner_solid_values = self._result_for_handle(\n self.solid.definition.solid_named(output_mapping.maps_from.solid_name),\n NodeHandle(output_mapping.maps_from.solid_name, None),\n ).output_values\n\n if inner_solid_values is not None: # may be None if inner solid was skipped\n if output_mapping.maps_from.output_name in inner_solid_values:\n values[output_name] = inner_solid_values[output_mapping.maps_from.output_name]\n\n return values\n\n def output_value(self, output_name=DEFAULT_OUTPUT):\n check.str_param(output_name, "output_name")\n\n if not self.solid.definition.has_output(output_name):\n raise DagsterInvariantViolationError(\n "Output '{output_name}' not defined in composite solid '{solid}': "\n "{outputs_clause}. If you were expecting this output to be present, you may "\n "be missing an output_mapping from an inner solid to its enclosing composite "\n "solid.".format(\n output_name=output_name,\n solid=self.solid.name,\n outputs_clause="found outputs {output_names}".format(\n output_names=str(list(self.solid.definition.output_dict.keys()))\n )\n if self.solid.definition.output_dict\n else "no output mappings were defined",\n )\n )\n\n output_mapping = self.solid.definition.get_output_mapping(output_name)\n\n return self._result_for_handle(\n self.solid.definition.solid_named(output_mapping.maps_from.solid_name),\n NodeHandle(output_mapping.maps_from.solid_name, None),\n ).output_value(output_mapping.maps_from.output_name)
\n\n\n
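A sketch of drilling into a composite solid's results with a dotted handle string; names are illustrative.

.. code-block:: python

    from dagster import composite_solid, execute_pipeline, pipeline, solid

    @solid
    def return_two():
        return 2

    @composite_solid
    def wrap():
        return return_two()

    @pipeline
    def wrapped_pipeline():
        wrap()

    result = execute_pipeline(wrapped_pipeline)
    composite_result = result.result_for_solid("wrap")      # CompositeSolidExecutionResult
    assert composite_result.output_value() == 2             # resolved through the output mapping
    assert result.output_for_solid("wrap.return_two") == 2  # inner solid addressed directly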
[docs]class SolidExecutionResult:\n """Execution result for a leaf solid in a pipeline.\n\n Users should not instantiate this class.\n """\n\n def __init__(\n self, solid, step_events_by_kind, reconstruct_context, pipeline_def, output_capture=None\n ):\n check.inst_param(solid, "solid", Node)\n check.invariant(\n not solid.is_graph,\n desc="Tried to instantiate a SolidExecutionResult with a composite solid",\n )\n self.solid = solid\n self.step_events_by_kind = check.dict_param(\n step_events_by_kind, "step_events_by_kind", key_type=StepKind, value_type=list\n )\n self.reconstruct_context = check.callable_param(reconstruct_context, "reconstruct_context")\n self.output_capture = check.opt_dict_param(output_capture, "output_capture")\n self.pipeline_def = check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n\n @property\n def compute_input_event_dict(self):\n """Dict[str, DagsterEvent]: All events of type ``STEP_INPUT``, keyed by input name."""\n return {se.event_specific_data.input_name: se for se in self.input_events_during_compute}\n\n @property\n def input_events_during_compute(self):\n """List[DagsterEvent]: All events of type ``STEP_INPUT``."""\n return self._compute_steps_of_type(DagsterEventType.STEP_INPUT)\n\n
[docs] def get_output_event_for_compute(self, output_name="result"):\n """The ``STEP_OUTPUT`` event for the given output name.\n\n Throws if not present.\n\n Args:\n output_name (Optional[str]): The name of the output. (default: 'result')\n\n Returns:\n DagsterEvent: The corresponding event.\n """\n events = self.get_output_events_for_compute(output_name)\n check.invariant(\n len(events) == 1, "Multiple output events returned, use get_output_events_for_compute"\n )\n return events[0]
\n\n @property\n def compute_output_events_dict(self):\n """Dict[str, List[DagsterEvent]]: All events of type ``STEP_OUTPUT``, keyed by output name"""\n results = defaultdict(list)\n for se in self.output_events_during_compute:\n results[se.step_output_data.output_name].append(se)\n\n return dict(results)\n\n
[docs] def get_output_events_for_compute(self, output_name="result"):\n """The ``STEP_OUTPUT`` events for the given output name.\n\n Throws if not present.\n\n Args:\n output_name (Optional[str]): The name of the output. (default: 'result')\n\n Returns:\n List[DagsterEvent]: The corresponding events.\n """\n return self.compute_output_events_dict[output_name]
\n\n @property\n def output_events_during_compute(self):\n """List[DagsterEvent]: All events of type ``STEP_OUTPUT``."""\n return self._compute_steps_of_type(DagsterEventType.STEP_OUTPUT)\n\n @property\n def compute_step_events(self):\n """List[DagsterEvent]: All events generated by execution of the solid compute function."""\n return self.step_events_by_kind.get(StepKind.COMPUTE, [])\n\n @property\n def step_events(self):\n return self.compute_step_events\n\n @property\n def materializations_during_compute(self):\n """List[Materialization]: All materializations yielded by the solid."""\n return [\n mat_event.event_specific_data.materialization\n for mat_event in self.materialization_events_during_compute\n ]\n\n @property\n def materialization_events_during_compute(self):\n """List[DagsterEvent]: All events of type ``ASSET_MATERIALIZATION``."""\n return self._compute_steps_of_type(DagsterEventType.ASSET_MATERIALIZATION)\n\n @property\n def expectation_events_during_compute(self):\n """List[DagsterEvent]: All events of type ``STEP_EXPECTATION_RESULT``."""\n return self._compute_steps_of_type(DagsterEventType.STEP_EXPECTATION_RESULT)\n\n def _compute_steps_of_type(self, dagster_event_type):\n return list(\n filter(lambda se: se.event_type == dagster_event_type, self.compute_step_events)\n )\n\n @property\n def expectation_results_during_compute(self):\n """List[ExpectationResult]: All expectation results yielded by the solid"""\n return [\n expt_event.event_specific_data.expectation_result\n for expt_event in self.expectation_events_during_compute\n ]\n\n
[docs] def get_step_success_event(self):\n """DagsterEvent: The ``STEP_SUCCESS`` event, throws if not present."""\n for step_event in self.compute_step_events:\n if step_event.event_type == DagsterEventType.STEP_SUCCESS:\n return step_event\n\n check.failed("Step success not found for solid {}".format(self.solid.name))
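# --- Illustrative usage sketch (not part of the module above) ---
# A hedged example of the event accessors above: a solid that yields an
# AssetMaterialization and an ExpectationResult alongside its Output, then
# reads those events back from its SolidExecutionResult. The solid and
# pipeline names are made up for illustration.
from dagster import (
    AssetMaterialization,
    ExpectationResult,
    Output,
    execute_pipeline,
    pipeline,
    solid,
)


@solid
def write_table(_):
    yield AssetMaterialization(asset_key="my_table", description="wrote my_table")
    yield ExpectationResult(success=True, label="row_count_nonzero")
    yield Output(42)


@pipeline
def events_pipeline():
    write_table()


if __name__ == "__main__":
    solid_result = execute_pipeline(events_pipeline).result_for_solid("write_table")

    # STEP_SUCCESS is present on a successful step; materializations and
    # expectation results are unpacked from their corresponding events.
    assert solid_result.get_step_success_event() is not None
    print([m.asset_key for m in solid_result.materializations_during_compute])
    print([e.success for e in solid_result.expectation_results_during_compute])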
\n\n @property\n def compute_step_failure_event(self):\n """DagsterEvent: The ``STEP_FAILURE`` event, throws if it did not fail."""\n if self.success:\n raise DagsterInvariantViolationError(\n "Cannot call compute_step_failure_event if successful"\n )\n\n step_failure_events = self._compute_steps_of_type(DagsterEventType.STEP_FAILURE)\n check.invariant(len(step_failure_events) == 1)\n return step_failure_events[0]\n\n @property\n def success(self):\n """bool: Whether solid execution was successful."""\n any_success = False\n for step_event in self.compute_step_events:\n if step_event.event_type == DagsterEventType.STEP_FAILURE:\n return False\n if step_event.event_type == DagsterEventType.STEP_SUCCESS:\n any_success = True\n\n return any_success\n\n @property\n def skipped(self):\n """bool: Whether solid execution was skipped."""\n return all(\n [\n step_event.event_type == DagsterEventType.STEP_SKIPPED\n for step_event in self.compute_step_events\n ]\n )\n\n @property\n def output_values(self):\n """Union[None, Dict[str, Union[Any, Dict[str, Any]]]: The computed output values.\n\n Returns ``None`` if execution did not succeed.\n\n Returns a dictionary where keys are output names and the values are:\n * the output values in the normal case\n * a dictionary from mapping key to corresponding value in the mapped case\n\n Note that accessing this property will reconstruct the pipeline context (including, e.g.,\n resources) to retrieve materialized output values.\n """\n if not self.success or not self.compute_step_events:\n return None\n\n results = {}\n with self.reconstruct_context() as context:\n for compute_step_event in self.compute_step_events:\n if compute_step_event.is_successful_output:\n output = compute_step_event.step_output_data\n step = context.execution_plan.get_step_by_key(compute_step_event.step_key)\n value = self._get_value(context.for_step(step), output)\n check.invariant(\n not (output.mapping_key and step.get_mapping_key()),\n "Not set up to handle mapped outputs downstream of mapped steps",\n )\n mapping_key = output.mapping_key or step.get_mapping_key()\n if mapping_key:\n if results.get(output.output_name) is None:\n results[output.output_name] = {mapping_key: value}\n else:\n results[output.output_name][mapping_key] = value\n else:\n results[output.output_name] = value\n\n return results\n\n
[docs] def output_value(self, output_name=DEFAULT_OUTPUT):\n """Get a computed output value.\n\n Note that calling this method will reconstruct the pipeline context (including, e.g.,\n resources) to retrieve materialized output values.\n\n Args:\n output_name(str): The output name for which to retrieve the value. (default: 'result')\n\n Returns:\n Union[None, Any, Dict[str, Any]]: ``None`` if execution did not succeed, the output value\n in the normal case, and a dict of mapping keys to values in the mapped case.\n """\n check.str_param(output_name, "output_name")\n\n if not self.solid.definition.has_output(output_name):\n raise DagsterInvariantViolationError(\n "Output '{output_name}' not defined in solid '{solid}': found outputs "\n "{output_names}".format(\n output_name=output_name,\n solid=self.solid.name,\n output_names=str(list(self.solid.definition.output_dict.keys())),\n )\n )\n\n if not self.success:\n return None\n\n with self.reconstruct_context() as context:\n found = False\n result = None\n for compute_step_event in self.compute_step_events:\n if (\n compute_step_event.is_successful_output\n and compute_step_event.step_output_data.output_name == output_name\n ):\n found = True\n output = compute_step_event.step_output_data\n step = context.execution_plan.get_step_by_key(compute_step_event.step_key)\n value = self._get_value(context.for_step(step), output)\n check.invariant(\n not (output.mapping_key and step.get_mapping_key()),\n "Not set up to handle mapped outputs downstream of mapped steps",\n )\n mapping_key = output.mapping_key or step.get_mapping_key()\n if mapping_key:\n if result is None:\n result = {mapping_key: value}\n else:\n result[\n mapping_key\n ] = value # pylint:disable=unsupported-assignment-operation\n else:\n result = value\n\n if found:\n return result\n\n raise DagsterInvariantViolationError(\n (\n "Did not find result {output_name} in solid {self.solid.name} "\n "execution result"\n ).format(output_name=output_name, self=self)\n )
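# --- Illustrative usage sketch (not part of the module above) ---
# A small example, assuming the legacy multi-output solid API, of reading
# computed values back through output_value() and output_values. As documented
# above, both accessors reconstruct the pipeline context (including resources)
# in order to load the materialized values via the IO manager.
from dagster import Output, OutputDefinition, execute_pipeline, pipeline, solid


@solid(
    output_defs=[
        OutputDefinition(int, name="total"),
        OutputDefinition(int, name="count"),
    ]
)
def summarize(_):
    yield Output(10, output_name="total")
    yield Output(2, output_name="count")


@pipeline
def summary_pipeline():
    summarize()


if __name__ == "__main__":
    solid_result = execute_pipeline(summary_pipeline).result_for_solid("summarize")

    print(solid_result.output_value("total"))  # 10
    print(solid_result.output_values)  # {"total": 10, "count": 2}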
\n\n def _get_value(self, context, step_output_data):\n step_output_handle = step_output_data.step_output_handle\n # output capture dictionary will only have values in the in process case, but will not have\n # values from steps launched via step launcher.\n if self.output_capture and step_output_handle in self.output_capture:\n return self.output_capture[step_output_handle]\n manager = context.get_io_manager(step_output_handle)\n manager_key = context.execution_plan.get_manager_key(step_output_handle, self.pipeline_def)\n res = manager.load_input(\n context.for_input_manager(\n name=None,\n config=None,\n metadata=None,\n dagster_type=self.solid.output_def_named(step_output_data.output_name).dagster_type,\n source_handle=step_output_handle,\n resource_config=context.resolved_run_config.resources[manager_key].config,\n resources=build_resources_for_manager(manager_key, context),\n )\n )\n return res\n\n @property\n def failure_data(self):\n """Union[None, StepFailureData]: Any data corresponding to this step's failure, if it\n failed."""\n for step_event in self.compute_step_events:\n if step_event.event_type == DagsterEventType.STEP_FAILURE:\n return step_event.step_failure_data\n\n @property\n def retry_attempts(self) -> int:\n """Number of times this step retried"""\n count = 0\n for step_event in self.compute_step_events:\n if step_event.event_type == DagsterEventType.STEP_RESTARTED:\n count += 1\n return count
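# --- Illustrative usage sketch (not part of the module above) ---
# A minimal failure-handling example. Passing raise_on_error=False keeps
# execute_pipeline from re-raising, so the failure can be inspected through
# the properties above (success, failure_data, retry_attempts). The solid
# name is made up for illustration.
from dagster import execute_pipeline, pipeline, solid


@solid
def always_fails(_):
    raise Exception("boom")


@pipeline
def failing_pipeline():
    always_fails()


if __name__ == "__main__":
    result = execute_pipeline(failing_pipeline, raise_on_error=False)
    solid_result = result.result_for_solid("always_fails")

    assert not solid_result.success
    print(solid_result.failure_data.error.message)  # includes "boom"
    print(solid_result.retry_attempts)  # 0 -- no retry policy configured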
\n
", "current_page_name": "_modules/dagster/core/execution/results", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.execution.results"}, "validate_run_config": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.execution.validate_run_config

\nfrom typing import Any, Dict, Optional, cast\n\nimport dagster._check as check\nfrom dagster.core.definitions import JobDefinition, PipelineDefinition\nfrom dagster.core.system_config.objects import ResolvedRunConfig\n\n\n
[docs]def validate_run_config(\n job_def: Optional[JobDefinition] = None,\n run_config: Optional[Dict[str, Any]] = None,\n mode: Optional[str] = None,\n pipeline_def: Optional[PipelineDefinition] = None,\n) -> Dict[str, Any]:\n """Function to validate a provided run config blob against a given job. For legacy APIs, a\n pipeline/mode can also be passed in.\n\n If validation is successful, this function will return a dictionary representation of the\n validated config actually used during execution.\n\n Args:\n job_def (Union[PipelineDefinition, JobDefinition]): The job definition to validate run\n config against\n run_config (Optional[Dict[str, Any]]): The run config to validate\n mode (str): The mode of the pipeline to validate against (different modes may require\n different config)\n pipeline_def (PipelineDefinition): The pipeline definition to validate run config against.\n\n Returns:\n Dict[str, Any]: A dictionary representation of the validated config.\n """\n\n job_def = check.opt_inst_param(job_def, "job_def", (JobDefinition, PipelineDefinition))\n pipeline_def = check.opt_inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n run_config = check.opt_dict_param(run_config, "run_config", key_type=str)\n\n if job_def and pipeline_def:\n check.failed("Cannot specify both a job_def and a pipeline_def")\n\n pipeline_or_job_def = pipeline_def or job_def\n\n if pipeline_or_job_def is None:\n check.failed("Must specify at least one of job_def and pipeline_def")\n\n pipeline_or_job_def = cast(PipelineDefinition, pipeline_def or job_def)\n mode = check.opt_str_param(mode, "mode", default=pipeline_or_job_def.get_default_mode_name())\n\n return ResolvedRunConfig.build(pipeline_or_job_def, run_config, mode=mode).to_dict()
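# --- Illustrative usage sketch (not part of the module above) ---
# A hedged example of validate_run_config with the job/op APIs; the op, job,
# and config values are made up, and validate_run_config is assumed to be
# importable from the top-level dagster package in this release. On success
# the fully resolved config (including defaults) is returned as a dict;
# invalid config raises DagsterInvalidConfigError.
from dagster import job, op, validate_run_config


@op(config_schema={"greeting": str})
def greet(context):
    context.log.info(context.op_config["greeting"])


@job
def greeting_job():
    greet()


if __name__ == "__main__":
    validated = validate_run_config(
        greeting_job,
        run_config={"ops": {"greet": {"config": {"greeting": "hello"}}}},
    )
    # Dictionary representation of the validated config actually used at runtime.
    print(validated)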
\n
", "current_page_name": "_modules/dagster/core/execution/validate_run_config", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.execution.validate_run_config"}}, "executor": {"base": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.executor.base

\nfrom abc import ABC, abstractmethod\n\nfrom dagster.core.execution.retries import RetryMode\n\n\n
[docs]class Executor(ABC): # pylint: disable=no-init\n
[docs] @abstractmethod\n def execute(self, plan_context, execution_plan):\n """\n For the given plan context and execution plan, orchestrate a series of sub-plan executions in a way that satisfies the whole plan being executed.\n\n Args:\n plan_context (PlanOrchestrationContext): The plan's orchestration context.\n execution_plan (ExecutionPlan): The plan to execute.\n\n Returns:\n A stream of Dagster events.\n """
\n\n @property\n @abstractmethod\n def retries(self) -> RetryMode:\n """\n Whether retries are enabled or disabled for this instance of the executor.\n\n Executors should allow this to be controlled via configuration if possible.\n\n Returns: RetryMode\n """
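# --- Illustrative subclass sketch (not part of the module above) ---
# A minimal, hedged sketch of what a concrete Executor must provide: an
# execute() generator that emits DagsterEvents for the given plan, and a
# retries property. This no-op executor yields no events and runs no steps;
# a real implementation would orchestrate step execution here (in-process,
# in subprocesses, on a cluster, etc.).
from dagster.core.execution.retries import RetryMode
from dagster.core.executor.base import Executor


class NoOpExecutor(Executor):
    def execute(self, plan_context, execution_plan):
        # A real executor would iterate over execution_plan, launch each step,
        # and yield the resulting DagsterEvents back to the caller.
        yield from ()

    @property
    def retries(self) -> RetryMode:
        # Retries are not supported by this sketch; expose that via RetryMode.
        return RetryMode.DISABLED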
\n
", "current_page_name": "_modules/dagster/core/executor/base", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.executor.base"}, "init": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.executor.init

\nfrom typing import Dict, NamedTuple\n\nimport dagster._check as check\nfrom dagster.core.definitions import ExecutorDefinition, IPipeline\nfrom dagster.core.instance import DagsterInstance\n\n\n
[docs]class InitExecutorContext(\n NamedTuple(\n "InitExecutorContext",\n [\n ("job", IPipeline),\n ("executor_def", ExecutorDefinition),\n ("executor_config", Dict[str, object]),\n ("instance", DagsterInstance),\n ],\n )\n):\n """Executor-specific initialization context.\n\n Attributes:\n job (IPipeline): The job to be executed.\n executor_def (ExecutorDefinition): The definition of the executor currently being\n constructed.\n executor_config (dict): The parsed config passed to the executor.\n instance (DagsterInstance): The current instance.\n """\n\n def __new__(\n cls,\n job: IPipeline,\n executor_def: ExecutorDefinition,\n executor_config: Dict[str, object],\n instance: DagsterInstance,\n ):\n return super(InitExecutorContext, cls).__new__(\n cls,\n job=check.inst_param(job, "job", IPipeline),\n executor_def=check.inst_param(executor_def, "executor_def", ExecutorDefinition),\n executor_config=check.dict_param(executor_config, "executor_config", key_type=str),\n instance=check.inst_param(instance, "instance", DagsterInstance),\n )\n\n @property\n def pipeline(self) -> IPipeline:\n return self.job
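# --- Illustrative usage sketch (not part of the module above) ---
# A hedged sketch of where InitExecutorContext shows up in user code: the
# function decorated with @executor receives it and typically reads
# executor_config (and, if needed, instance or job) to construct an Executor.
# NoOpExecutor mirrors the sketch on the Executor page and stands in for any
# concrete Executor subclass; the names and config keys are made up.
from dagster import Field, executor
from dagster.core.execution.retries import RetryMode
from dagster.core.executor.base import Executor


class NoOpExecutor(Executor):
    def execute(self, plan_context, execution_plan):
        yield from ()

    @property
    def retries(self) -> RetryMode:
        return RetryMode.DISABLED


@executor(name="noop_executor", config_schema={"verbose": Field(bool, default_value=False)})
def noop_executor(init_context):
    # init_context is an InitExecutorContext: executor_config holds the parsed
    # config, executor_def the definition being constructed, and instance/job
    # expose the current DagsterInstance and the target being executed.
    if init_context.executor_config["verbose"]:
        print("Constructing NoOpExecutor on instance", init_context.instance)
    return NoOpExecutor()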
\n
", "current_page_name": "_modules/dagster/core/executor/init", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.executor.init"}}, "instance": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.instance

\nimport logging\nimport logging.config\nimport os\nimport sys\nimport time\nimport warnings\nimport weakref\nfrom collections import defaultdict\nfrom contextlib import ExitStack\nfrom enum import Enum\nfrom tempfile import TemporaryDirectory\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Dict,\n    Generic,\n    Iterable,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    TypeVar,\n    Union,\n    cast,\n)\n\nimport yaml\n\nimport dagster._check as check\nfrom dagster.core.definitions.events import AssetKey\nfrom dagster.core.definitions.pipeline_base import InMemoryPipeline\nfrom dagster.core.definitions.pipeline_definition import (\n    PipelineDefinition,\n    PipelineSubsetDefinition,\n)\nfrom dagster.core.errors import (\n    DagsterHomeNotSetError,\n    DagsterInvariantViolationError,\n    DagsterRunAlreadyExists,\n    DagsterRunConflict,\n)\nfrom dagster.core.storage.pipeline_run import (\n    IN_PROGRESS_RUN_STATUSES,\n    DagsterRun,\n    JobBucket,\n    PipelineRun,\n    PipelineRunStatsSnapshot,\n    PipelineRunStatus,\n    RunPartitionData,\n    RunRecord,\n    RunsFilter,\n    TagBucket,\n)\nfrom dagster.core.storage.tags import PARENT_RUN_ID_TAG, RESUME_RETRY_TAG, ROOT_RUN_ID_TAG\nfrom dagster.core.system_config.objects import ResolvedRunConfig\nfrom dagster.core.utils import str_format_list\nfrom dagster.serdes import ConfigurableClass\nfrom dagster.seven import get_current_datetime_in_utc\nfrom dagster.utils import merge_dicts, traced\nfrom dagster.utils.backcompat import experimental_functionality_warning\nfrom dagster.utils.error import serializable_error_info_from_exc_info\n\nfrom .config import (\n    DAGSTER_CONFIG_YAML_FILENAME,\n    DEFAULT_LOCAL_CODE_SERVER_STARTUP_TIMEOUT,\n    is_dagster_home_set,\n)\nfrom .ref import InstanceRef\n\n# 'airflow_execution_date' and 'is_airflow_ingest_pipeline' are hardcoded tags used in the\n# airflow ingestion logic (see: dagster_pipeline_factory.py). 
'airflow_execution_date' stores the\n# 'execution_date' used in Airflow operator execution and 'is_airflow_ingest_pipeline' determines\n# whether 'airflow_execution_date' is needed.\n# https://github.com/dagster-io/dagster/issues/2403\nAIRFLOW_EXECUTION_DATE_STR = "airflow_execution_date"\nIS_AIRFLOW_INGEST_PIPELINE_STR = "is_airflow_ingest_pipeline"\n\nif TYPE_CHECKING:\n    from dagster.core.debug import DebugRunPayload\n    from dagster.core.events import DagsterEvent, DagsterEventType\n    from dagster.core.events.log import EventLogEntry\n    from dagster.core.execution.plan.resume_retry import ReexecutionStrategy\n    from dagster.core.execution.stats import RunStepKeyStatsSnapshot\n    from dagster.core.host_representation import (\n        ExternalPipeline,\n        ExternalSensor,\n        HistoricalPipeline,\n        RepositoryLocation,\n    )\n    from dagster.core.launcher import RunLauncher\n    from dagster.core.run_coordinator import RunCoordinator\n    from dagster.core.scheduler import Scheduler\n    from dagster.core.scheduler.instigation import InstigatorState, InstigatorTick, TickStatus\n    from dagster.core.snap import ExecutionPlanSnapshot, PipelineSnapshot\n    from dagster.core.storage.compute_log_manager import ComputeLogManager\n    from dagster.core.storage.event_log import EventLogStorage\n    from dagster.core.storage.event_log.base import AssetRecord, EventLogRecord, EventRecordsFilter\n    from dagster.core.storage.root import LocalArtifactStorage\n    from dagster.core.storage.runs import RunStorage\n    from dagster.core.storage.schedules import ScheduleStorage\n    from dagster.core.workspace.workspace import IWorkspace\n    from dagster.daemon.types import DaemonHeartbeat\n\n\ndef _check_run_equality(\n    pipeline_run: PipelineRun, candidate_run: PipelineRun\n) -> Dict[str, Tuple[Any, Any]]:\n    field_diff = {}\n    for field in pipeline_run._fields:\n        expected_value = getattr(pipeline_run, field)\n        candidate_value = getattr(candidate_run, field)\n        if expected_value != candidate_value:\n            field_diff[field] = (expected_value, candidate_value)\n\n    return field_diff\n\n\ndef _format_field_diff(field_diff: Dict[str, Tuple[Any, Any]]) -> str:\n    return "\\n".join(\n        [\n            (\n                "    {field_name}:\\n"\n                + "        Expected: {expected_value}\\n"\n                + "        Received: {candidate_value}"\n            ).format(\n                field_name=field_name,\n                expected_value=expected_value,\n                candidate_value=candidate_value,\n            )\n            for field_name, (\n                expected_value,\n                candidate_value,\n            ) in field_diff.items()\n        ]\n    )\n\n\nclass _EventListenerLogHandler(logging.Handler):\n    def __init__(self, instance):\n        self._instance = instance\n        super(_EventListenerLogHandler, self).__init__()\n\n    def emit(self, record):\n        from dagster.core.events import EngineEventData\n        from dagster.core.events.log import StructuredLoggerMessage, construct_event_record\n\n        event = construct_event_record(\n            StructuredLoggerMessage(\n                name=record.name,\n                message=record.msg,\n                level=record.levelno,\n                meta=record.dagster_meta,\n                record=record,\n            )\n        )\n\n        try:\n            self._instance.handle_new_event(event)\n        except Exception as e:\n            
sys.stderr.write(f"Exception while writing logger call to event log: {str(e)}\\n")\n            if event.dagster_event:\n                # Swallow user-generated log failures so that the entire step/run doesn't fail, but\n                # raise failures writing system-generated log events since they are the source of\n                # truth for the state of the run\n                raise\n            elif event.run_id:\n                self._instance.report_engine_event(\n                    "Exception while writing logger call to event log",\n                    pipeline_name=event.pipeline_name,\n                    run_id=event.run_id,\n                    step_key=event.step_key,\n                    engine_event_data=EngineEventData(\n                        error=serializable_error_info_from_exc_info(sys.exc_info()),\n                    ),\n                )\n\n\nclass InstanceType(Enum):\n    PERSISTENT = "PERSISTENT"\n    EPHEMERAL = "EPHEMERAL"\n\n\nT_DagsterInstance = TypeVar("T_DagsterInstance", bound="DagsterInstance")\n\n\nclass MayHaveInstanceWeakref(Generic[T_DagsterInstance]):\n    """Mixin for classes that can have a weakref back to a Dagster instance."""\n\n    def __init__(self):\n        self._instance_weakref: Optional[weakref.ReferenceType[T_DagsterInstance]] = None\n\n    @property\n    def _instance(self) -> T_DagsterInstance:\n        instance = (\n            self._instance_weakref()\n            # Backcompat with custom subclasses that don't call super().__init__()\n            # in their own __init__ implementations\n            if (hasattr(self, "_instance_weakref") and self._instance_weakref is not None)\n            else None\n        )\n        return cast(T_DagsterInstance, instance)\n\n    def register_instance(self, instance: T_DagsterInstance):\n        check.invariant(\n            # Backcompat with custom subclasses that don't call super().__init__()\n            # in their own __init__ implementations\n            (not hasattr(self, "_instance_weakref") or self._instance_weakref is None),\n            "Must only call initialize once",\n        )\n\n        # Store a weakref to avoid a circular reference / enable GC\n        self._instance_weakref = weakref.ref(instance)\n\n\n
[docs]class DagsterInstance:\n """Core abstraction for managing Dagster's access to storage and other resources.\n\n Use DagsterInstance.get() to grab the current DagsterInstance which will load based on\n the values in the ``dagster.yaml`` file in ``$DAGSTER_HOME``.\n\n Alternatively, DagsterInstance.ephemeral() can use used which provides a set of\n transient in-memory components.\n\n Configuration of this class should be done by setting values in ``$DAGSTER_HOME/dagster.yaml``.\n For example, to use Postgres for run and event log storage, you can write a ``dagster.yaml``\n such as the following:\n\n .. literalinclude:: ../../../../../examples/docs_snippets/docs_snippets/deploying/postgres_dagster.yaml\n :caption: dagster.yaml\n :language: YAML\n\n Args:\n instance_type (InstanceType): Indicates whether the instance is ephemeral or persistent.\n Users should not attempt to set this value directly or in their ``dagster.yaml`` files.\n local_artifact_storage (LocalArtifactStorage): The local artifact storage is used to\n configure storage for any artifacts that require a local disk, such as schedules, or\n when using the filesystem system storage to manage files and intermediates. By default,\n this will be a :py:class:`dagster.core.storage.root.LocalArtifactStorage`. Configurable\n in ``dagster.yaml`` using the :py:class:`~dagster.serdes.ConfigurableClass`\n machinery.\n run_storage (RunStorage): The run storage is used to store metadata about ongoing and past\n pipeline runs. By default, this will be a\n :py:class:`dagster.core.storage.runs.SqliteRunStorage`. Configurable in ``dagster.yaml``\n using the :py:class:`~dagster.serdes.ConfigurableClass` machinery.\n event_storage (EventLogStorage): Used to store the structured event logs generated by\n pipeline runs. By default, this will be a\n :py:class:`dagster.core.storage.event_log.SqliteEventLogStorage`. Configurable in\n ``dagster.yaml`` using the :py:class:`~dagster.serdes.ConfigurableClass` machinery.\n compute_log_manager (ComputeLogManager): The compute log manager handles stdout and stderr\n logging for solid compute functions. By default, this will be a\n :py:class:`dagster.core.storage.local_compute_log_manager.LocalComputeLogManager`.\n Configurable in ``dagster.yaml`` using the\n :py:class:`~dagster.serdes.ConfigurableClass` machinery.\n run_coordinator (RunCoordinator): A runs coordinator may be used to manage the execution\n of pipeline runs.\n run_launcher (Optional[RunLauncher]): Optionally, a run launcher may be used to enable\n a Dagster instance to launch pipeline runs, e.g. on a remote Kubernetes cluster, in\n addition to running them locally.\n settings (Optional[Dict]): Specifies certain per-instance settings,\n such as feature flags. 
These are set in the ``dagster.yaml`` under a set of whitelisted\n keys.\n ref (Optional[InstanceRef]): Used by internal machinery to pass instances across process\n boundaries.\n """\n\n _PROCESS_TEMPDIR: Optional[TemporaryDirectory] = None\n _EXIT_STACK = None\n\n def __init__(\n self,\n instance_type: InstanceType,\n local_artifact_storage: "LocalArtifactStorage",\n run_storage: "RunStorage",\n event_storage: "EventLogStorage",\n compute_log_manager: "ComputeLogManager",\n run_coordinator: "RunCoordinator",\n run_launcher: "RunLauncher",\n scheduler: Optional["Scheduler"] = None,\n schedule_storage: Optional["ScheduleStorage"] = None,\n settings: Optional[Dict[str, Any]] = None,\n ref: Optional[InstanceRef] = None,\n ):\n from dagster.core.launcher import RunLauncher\n from dagster.core.run_coordinator import RunCoordinator\n from dagster.core.scheduler import Scheduler\n from dagster.core.storage.compute_log_manager import ComputeLogManager\n from dagster.core.storage.event_log import EventLogStorage\n from dagster.core.storage.root import LocalArtifactStorage\n from dagster.core.storage.runs import RunStorage\n from dagster.core.storage.schedules import ScheduleStorage\n\n self._instance_type = check.inst_param(instance_type, "instance_type", InstanceType)\n self._local_artifact_storage = check.inst_param(\n local_artifact_storage, "local_artifact_storage", LocalArtifactStorage\n )\n self._event_storage = check.inst_param(event_storage, "event_storage", EventLogStorage)\n self._event_storage.register_instance(self)\n\n self._run_storage = check.inst_param(run_storage, "run_storage", RunStorage)\n self._run_storage.register_instance(self)\n\n self._compute_log_manager = check.inst_param(\n compute_log_manager, "compute_log_manager", ComputeLogManager\n )\n self._compute_log_manager.register_instance(self)\n self._scheduler = check.opt_inst_param(scheduler, "scheduler", Scheduler)\n\n self._schedule_storage = check.opt_inst_param(\n schedule_storage, "schedule_storage", ScheduleStorage\n )\n if self._schedule_storage:\n self._schedule_storage.register_instance(self)\n\n self._run_coordinator = check.inst_param(run_coordinator, "run_coordinator", RunCoordinator)\n self._run_coordinator.register_instance(self)\n\n self._run_launcher = check.inst_param(run_launcher, "run_launcher", RunLauncher)\n self._run_launcher.register_instance(self)\n\n self._settings = check.opt_dict_param(settings, "settings")\n\n self._ref = check.opt_inst_param(ref, "ref", InstanceRef)\n\n self._subscribers: Dict[str, List[Callable]] = defaultdict(list)\n\n run_monitoring_enabled = self.run_monitoring_settings.get("enabled", False)\n if run_monitoring_enabled and not self.run_launcher.supports_check_run_worker_health:\n run_monitoring_enabled = False\n warnings.warn(\n "The configured run launcher does not support run monitoring, disabling it.",\n )\n self._run_monitoring_enabled = run_monitoring_enabled\n if self.run_monitoring_enabled and self.run_monitoring_max_resume_run_attempts:\n check.invariant(\n self.run_launcher.supports_resume_run,\n "The configured run launcher does not support resuming runs. "\n "Set max_resume_run_attempts to 0 to use run monitoring. 
Any runs with a failed run "\n "worker will be marked as failed, but will not be resumed.",\n )\n\n # ctors\n\n @staticmethod\n def ephemeral(\n tempdir: Optional[str] = None, preload: Optional[List["DebugRunPayload"]] = None\n ) -> "DagsterInstance":\n from dagster.core.launcher.sync_in_memory_run_launcher import SyncInMemoryRunLauncher\n from dagster.core.run_coordinator import DefaultRunCoordinator\n from dagster.core.storage.event_log import InMemoryEventLogStorage\n from dagster.core.storage.noop_compute_log_manager import NoOpComputeLogManager\n from dagster.core.storage.root import LocalArtifactStorage\n from dagster.core.storage.runs import InMemoryRunStorage\n\n if tempdir is None:\n tempdir = DagsterInstance.temp_storage()\n\n return DagsterInstance(\n instance_type=InstanceType.EPHEMERAL,\n local_artifact_storage=LocalArtifactStorage(tempdir),\n run_storage=InMemoryRunStorage(preload=preload),\n event_storage=InMemoryEventLogStorage(preload=preload),\n compute_log_manager=NoOpComputeLogManager(),\n run_coordinator=DefaultRunCoordinator(),\n run_launcher=SyncInMemoryRunLauncher(),\n )\n\n @staticmethod\n def get() -> "DagsterInstance":\n dagster_home_path = os.getenv("DAGSTER_HOME")\n\n if not dagster_home_path:\n raise DagsterHomeNotSetError(\n (\n "The environment variable $DAGSTER_HOME is not set. \\n"\n "Dagster requires this environment variable to be set to an existing directory in your filesystem. "\n "This directory is used to store metadata across sessions, or load the dagster.yaml "\n "file which can configure storing metadata in an external database.\\n"\n "You can resolve this error by exporting the environment variable. For example, you can run the following command in your shell or include it in your shell configuration file:\\n"\n '\\texport DAGSTER_HOME=~"/dagster_home"\\n'\n "or PowerShell\\n"\n "$env:DAGSTER_HOME = ($home + '\\\\dagster_home')"\n "or batch"\n "set DAGSTER_HOME=%UserProfile%/dagster_home"\n "Alternatively, DagsterInstance.ephemeral() can be used for a transient instance.\\n"\n )\n )\n\n dagster_home_path = os.path.expanduser(dagster_home_path)\n\n if not os.path.isabs(dagster_home_path):\n raise DagsterInvariantViolationError(\n (\n '$DAGSTER_HOME "{}" must be an absolute path. Dagster requires this '\n "environment variable to be set to an existing directory in your filesystem."\n ).format(dagster_home_path)\n )\n\n if not (os.path.exists(dagster_home_path) and os.path.isdir(dagster_home_path)):\n raise DagsterInvariantViolationError(\n (\n '$DAGSTER_HOME "{}" is not a directory or does not exist. 
Dagster requires this '\n "environment variable to be set to an existing directory in your filesystem"\n ).format(dagster_home_path)\n )\n\n return DagsterInstance.from_config(dagster_home_path)\n\n @staticmethod\n def local_temp(tempdir=None, overrides=None) -> "DagsterInstance":\n if tempdir is None:\n tempdir = DagsterInstance.temp_storage()\n\n return DagsterInstance.from_ref(InstanceRef.from_dir(tempdir, overrides=overrides))\n\n @staticmethod\n def from_config(\n config_dir: str,\n config_filename: str = DAGSTER_CONFIG_YAML_FILENAME,\n ) -> "DagsterInstance":\n instance_ref = InstanceRef.from_dir(config_dir, config_filename=config_filename)\n return DagsterInstance.from_ref(instance_ref)\n\n @staticmethod\n def from_ref(instance_ref: InstanceRef) -> "DagsterInstance":\n check.inst_param(instance_ref, "instance_ref", InstanceRef)\n\n # DagsterInstance doesn't implement ConfigurableClass, but we may still sometimes want to\n # have custom subclasses of DagsterInstance. This machinery allows for those custom\n # subclasses to receive additional keyword arguments passed through the config YAML.\n klass = instance_ref.custom_instance_class or DagsterInstance\n kwargs = instance_ref.custom_instance_class_config\n\n return klass( # type: ignore\n instance_type=InstanceType.PERSISTENT,\n local_artifact_storage=instance_ref.local_artifact_storage,\n run_storage=instance_ref.run_storage,\n event_storage=instance_ref.event_storage,\n compute_log_manager=instance_ref.compute_log_manager,\n schedule_storage=instance_ref.schedule_storage,\n scheduler=instance_ref.scheduler,\n run_coordinator=instance_ref.run_coordinator,\n run_launcher=instance_ref.run_launcher,\n settings=instance_ref.settings,\n ref=instance_ref,\n **kwargs,\n )\n\n # flags\n\n @property\n def is_persistent(self) -> bool:\n return self._instance_type == InstanceType.PERSISTENT\n\n @property\n def is_ephemeral(self) -> bool:\n return self._instance_type == InstanceType.EPHEMERAL\n\n def get_ref(self) -> InstanceRef:\n if self._ref:\n return self._ref\n\n check.failed(\n "Attempted to prepare an ineligible DagsterInstance ({inst_type}) for cross "\n "process communication.{dagster_home_msg}".format(\n inst_type=self._instance_type,\n dagster_home_msg="\\nDAGSTER_HOME environment variable is not set, set it to "\n "a directory on the filesystem for dagster to use for storage and cross "\n "process coordination."\n if os.getenv("DAGSTER_HOME") is None\n else "",\n )\n )\n\n @property\n def root_directory(self) -> str:\n return self._local_artifact_storage.base_dir\n\n @staticmethod\n def temp_storage() -> str:\n from dagster.core.test_utils import environ\n\n if DagsterInstance._PROCESS_TEMPDIR is None:\n DagsterInstance._EXIT_STACK = ExitStack()\n DagsterInstance._EXIT_STACK.enter_context(\n environ({"DAGSTER_TELEMETRY_DISABLED": "yes"})\n )\n DagsterInstance._PROCESS_TEMPDIR = TemporaryDirectory()\n return cast(TemporaryDirectory, DagsterInstance._PROCESS_TEMPDIR).name\n\n def _info(self, component):\n # ConfigurableClass may not have inst_data if it's a direct instantiation\n # which happens for ephemeral instances\n if isinstance(component, ConfigurableClass) and component.inst_data:\n return component.inst_data.info_dict()\n if type(component) is dict:\n return component\n return component.__class__.__name__\n\n def _info_str_for_component(self, component_name, component):\n return yaml.dump(\n {component_name: self._info(component)}, default_flow_style=False, sort_keys=False\n )\n\n def info_dict(self):\n\n settings = 
self._settings if self._settings else {}\n\n ret = {\n "local_artifact_storage": self._info(self._local_artifact_storage),\n "run_storage": self._info(self._run_storage),\n "event_log_storage": self._info(self._event_storage),\n "compute_logs": self._info(self._compute_log_manager),\n "schedule_storage": self._info(self._schedule_storage),\n "scheduler": self._info(self._scheduler),\n "run_coordinator": self._info(self._run_coordinator),\n "run_launcher": self._info(self._run_launcher),\n }\n ret.update(\n {\n settings_key: self._info(settings_value)\n for settings_key, settings_value in settings.items()\n }\n )\n\n return ret\n\n def info_str(self) -> str:\n return yaml.dump(self.info_dict(), default_flow_style=False, sort_keys=False)\n\n def schema_str(self) -> str:\n def _schema_dict(alembic_version):\n if not alembic_version:\n return None\n db_revision, head_revision = alembic_version\n return {\n "current": db_revision,\n "latest": head_revision,\n }\n\n return yaml.dump(\n {\n "schema": {\n "event_log_storage": _schema_dict(self._event_storage.alembic_version()),\n "run_storage": _schema_dict(self._event_storage.alembic_version()),\n "schedule_storage": _schema_dict(self._event_storage.alembic_version()),\n }\n },\n default_flow_style=False,\n sort_keys=False,\n )\n\n @property\n def run_storage(self) -> "RunStorage":\n return self._run_storage\n\n @property\n def event_log_storage(self) -> "EventLogStorage":\n return self._event_storage\n\n # schedule storage\n\n @property\n def schedule_storage(self) -> Optional["ScheduleStorage"]:\n return self._schedule_storage\n\n @property\n def scheduler(self) -> Optional["Scheduler"]:\n return self._scheduler\n\n @property\n def scheduler_class(self) -> Optional[str]:\n return self.scheduler.__class__.__name__ if self.scheduler else None\n\n # run coordinator\n\n @property\n def run_coordinator(self) -> "RunCoordinator":\n return self._run_coordinator\n\n # run launcher\n\n @property\n def run_launcher(self) -> "RunLauncher":\n return self._run_launcher\n\n # compute logs\n\n @property\n def compute_log_manager(self) -> "ComputeLogManager":\n return self._compute_log_manager\n\n def get_settings(self, settings_key: str) -> Any:\n check.str_param(settings_key, "settings_key")\n if self._settings and settings_key in self._settings:\n return self._settings.get(settings_key)\n return {}\n\n @property\n def telemetry_enabled(self) -> bool:\n if self.is_ephemeral:\n return False\n\n dagster_telemetry_enabled_default = True\n\n telemetry_settings = self.get_settings("telemetry")\n\n if not telemetry_settings:\n return dagster_telemetry_enabled_default\n\n if "enabled" in telemetry_settings:\n return telemetry_settings["enabled"]\n elif "experimental_dagit" in telemetry_settings:\n return telemetry_settings["experimental_dagit"]\n else:\n return dagster_telemetry_enabled_default\n\n # run monitoring\n\n @property\n def run_monitoring_enabled(self) -> bool:\n return self._run_monitoring_enabled\n\n @property\n def run_monitoring_settings(self) -> Dict:\n return self.get_settings("run_monitoring")\n\n @property\n def run_monitoring_start_timeout_seconds(self) -> int:\n return self.run_monitoring_settings.get("start_timeout_seconds", 180)\n\n @property\n def code_server_settings(self) -> Dict:\n return self.get_settings("code_servers")\n\n @property\n def code_server_process_startup_timeout(self) -> int:\n return self.code_server_settings.get(\n "local_startup_timeout", DEFAULT_LOCAL_CODE_SERVER_STARTUP_TIMEOUT\n )\n\n @property\n def 
run_monitoring_max_resume_run_attempts(self) -> int:\n default_max_resume_run_attempts = 3 if self.run_launcher.supports_resume_run else 0\n return self.run_monitoring_settings.get(\n "max_resume_run_attempts", default_max_resume_run_attempts\n )\n\n @property\n def run_monitoring_poll_interval_seconds(self) -> int:\n return self.run_monitoring_settings.get("poll_interval_seconds", 120)\n\n @property\n def cancellation_thread_poll_interval_seconds(self) -> int:\n return self.get_settings("run_monitoring").get(\n "cancellation_thread_poll_interval_seconds", 10\n )\n\n # python logs\n\n @property\n def managed_python_loggers(self) -> List[str]:\n python_log_settings = self.get_settings("python_logs") or {}\n return python_log_settings.get("managed_python_loggers", [])\n\n @property\n def python_log_level(self) -> Optional[str]:\n python_log_settings = self.get_settings("python_logs") or {}\n return python_log_settings.get("python_log_level")\n\n def upgrade(self, print_fn=None):\n from dagster.core.storage.migration.utils import upgrading_instance\n\n with upgrading_instance(self):\n\n if print_fn:\n print_fn("Updating run storage...")\n self._run_storage.upgrade()\n self._run_storage.migrate(print_fn)\n\n if print_fn:\n print_fn("Updating event storage...")\n self._event_storage.upgrade()\n self._event_storage.reindex_assets(print_fn=print_fn)\n\n if print_fn:\n print_fn("Updating schedule storage...")\n self._schedule_storage.upgrade()\n self._schedule_storage.migrate(print_fn)\n\n def optimize_for_dagit(self, statement_timeout):\n if self._schedule_storage:\n self._schedule_storage.optimize_for_dagit(statement_timeout=statement_timeout)\n self._run_storage.optimize_for_dagit(statement_timeout=statement_timeout)\n self._event_storage.optimize_for_dagit(statement_timeout=statement_timeout)\n\n def reindex(self, print_fn=lambda _: None):\n print_fn("Checking for reindexing...")\n self._event_storage.reindex_events(print_fn)\n self._event_storage.reindex_assets(print_fn)\n self._run_storage.optimize(print_fn)\n self._schedule_storage.optimize(print_fn)\n print_fn("Done.")\n\n def dispose(self):\n self._run_storage.dispose()\n self.run_coordinator.dispose()\n self._run_launcher.dispose()\n self._event_storage.dispose()\n self._compute_log_manager.dispose()\n\n # run storage\n @traced\n def get_run_by_id(self, run_id: str) -> Optional[PipelineRun]:\n return self._run_storage.get_run_by_id(run_id)\n\n @traced\n def get_pipeline_snapshot(self, snapshot_id: str) -> "PipelineSnapshot":\n return self._run_storage.get_pipeline_snapshot(snapshot_id)\n\n @traced\n def has_pipeline_snapshot(self, snapshot_id: str) -> bool:\n return self._run_storage.has_pipeline_snapshot(snapshot_id)\n\n @traced\n def has_snapshot(self, snapshot_id: str) -> bool:\n return self._run_storage.has_snapshot(snapshot_id)\n\n @traced\n def get_historical_pipeline(self, snapshot_id: str) -> "HistoricalPipeline":\n from dagster.core.host_representation import HistoricalPipeline\n\n snapshot = self._run_storage.get_pipeline_snapshot(snapshot_id)\n parent_snapshot = (\n self._run_storage.get_pipeline_snapshot(snapshot.lineage_snapshot.parent_snapshot_id)\n if snapshot.lineage_snapshot\n else None\n )\n return HistoricalPipeline(snapshot, snapshot_id, parent_snapshot)\n\n @traced\n def has_historical_pipeline(self, snapshot_id: str) -> bool:\n return self._run_storage.has_pipeline_snapshot(snapshot_id)\n\n @traced\n def get_execution_plan_snapshot(self, snapshot_id: str) -> "ExecutionPlanSnapshot":\n return 
self._run_storage.get_execution_plan_snapshot(snapshot_id)\n\n @traced\n def get_run_stats(self, run_id: str) -> PipelineRunStatsSnapshot:\n return self._event_storage.get_stats_for_run(run_id)\n\n @traced\n def get_run_step_stats(self, run_id, step_keys=None) -> List["RunStepKeyStatsSnapshot"]:\n return self._event_storage.get_step_stats_for_run(run_id, step_keys)\n\n @traced\n def get_run_tags(self) -> List[Tuple[str, Set[str]]]:\n return self._run_storage.get_run_tags()\n\n @traced\n def get_run_group(self, run_id: str) -> Optional[Tuple[str, Iterable[PipelineRun]]]:\n return self._run_storage.get_run_group(run_id)\n\n def create_run_for_pipeline(\n self,\n pipeline_def,\n execution_plan=None,\n run_id=None,\n run_config=None,\n mode=None,\n solids_to_execute=None,\n status=None,\n tags=None,\n root_run_id=None,\n parent_run_id=None,\n solid_selection=None,\n asset_selection=None,\n external_pipeline_origin=None,\n pipeline_code_origin=None,\n ):\n from dagster.core.execution.api import create_execution_plan\n from dagster.core.execution.plan.plan import ExecutionPlan\n from dagster.core.snap import snapshot_from_execution_plan\n\n check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n check.opt_inst_param(execution_plan, "execution_plan", ExecutionPlan)\n\n # note that solids_to_execute is required to execute the solid subset, which is the\n # frozenset version of the previous solid_subset.\n # solid_selection is not required and will not be converted to solids_to_execute here.\n # i.e. this function doesn't handle solid queries.\n # solid_selection is only used to pass the user queries further down.\n check.opt_set_param(solids_to_execute, "solids_to_execute", of_type=str)\n check.opt_list_param(solid_selection, "solid_selection", of_type=str)\n check.opt_set_param(asset_selection, "asset_selection", of_type=AssetKey)\n\n if solids_to_execute:\n if isinstance(pipeline_def, PipelineSubsetDefinition):\n # for the case when pipeline_def is created by IPipeline or ExternalPipeline\n check.invariant(\n solids_to_execute == pipeline_def.solids_to_execute,\n "Cannot create a PipelineRun from pipeline subset {pipeline_solids_to_execute} "\n "that conflicts with solids_to_execute arg {solids_to_execute}".format(\n pipeline_solids_to_execute=str_format_list(pipeline_def.solids_to_execute),\n solids_to_execute=str_format_list(solids_to_execute),\n ),\n )\n else:\n # for cases when `create_run_for_pipeline` is directly called\n pipeline_def = pipeline_def.get_pipeline_subset_def(\n solids_to_execute=solids_to_execute\n )\n\n step_keys_to_execute = None\n\n if execution_plan:\n step_keys_to_execute = execution_plan.step_keys_to_execute\n\n else:\n execution_plan = create_execution_plan(\n pipeline=InMemoryPipeline(pipeline_def),\n run_config=run_config,\n mode=mode,\n instance_ref=self.get_ref() if self.is_persistent else None,\n tags=tags,\n )\n\n return self.create_run(\n pipeline_name=pipeline_def.name,\n run_id=run_id,\n run_config=run_config,\n mode=check.opt_str_param(mode, "mode", default=pipeline_def.get_default_mode_name()),\n solid_selection=solid_selection,\n asset_selection=asset_selection,\n solids_to_execute=solids_to_execute,\n step_keys_to_execute=step_keys_to_execute,\n status=status,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n pipeline_snapshot=pipeline_def.get_pipeline_snapshot(),\n execution_plan_snapshot=snapshot_from_execution_plan(\n execution_plan,\n pipeline_def.get_pipeline_snapshot_id(),\n ),\n 
parent_pipeline_snapshot=pipeline_def.get_parent_pipeline_snapshot(),\n external_pipeline_origin=external_pipeline_origin,\n pipeline_code_origin=pipeline_code_origin,\n )\n\n def _construct_run_with_snapshots(\n self,\n pipeline_name,\n run_id,\n run_config,\n mode,\n solids_to_execute,\n step_keys_to_execute,\n status,\n tags,\n root_run_id,\n parent_run_id,\n pipeline_snapshot,\n execution_plan_snapshot,\n parent_pipeline_snapshot,\n asset_selection=None,\n solid_selection=None,\n external_pipeline_origin=None,\n pipeline_code_origin=None,\n ):\n\n # https://github.com/dagster-io/dagster/issues/2403\n if tags and IS_AIRFLOW_INGEST_PIPELINE_STR in tags:\n if AIRFLOW_EXECUTION_DATE_STR not in tags:\n tags[AIRFLOW_EXECUTION_DATE_STR] = get_current_datetime_in_utc().isoformat()\n\n check.invariant(\n not (not pipeline_snapshot and execution_plan_snapshot),\n "It is illegal to have an execution plan snapshot and not have a pipeline snapshot. "\n "It is possible to have no execution plan snapshot since we persist runs "\n "that do not successfully compile execution plans in the scheduled case.",\n )\n\n pipeline_snapshot_id = (\n self._ensure_persisted_pipeline_snapshot(pipeline_snapshot, parent_pipeline_snapshot)\n if pipeline_snapshot\n else None\n )\n\n execution_plan_snapshot_id = (\n self._ensure_persisted_execution_plan_snapshot(\n execution_plan_snapshot, pipeline_snapshot_id, step_keys_to_execute\n )\n if execution_plan_snapshot and pipeline_snapshot_id\n else None\n )\n\n return DagsterRun(\n pipeline_name=pipeline_name,\n run_id=run_id,\n run_config=run_config,\n mode=mode,\n asset_selection=asset_selection,\n solid_selection=solid_selection,\n solids_to_execute=solids_to_execute,\n step_keys_to_execute=step_keys_to_execute,\n status=status,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n pipeline_snapshot_id=pipeline_snapshot_id,\n execution_plan_snapshot_id=execution_plan_snapshot_id,\n external_pipeline_origin=external_pipeline_origin,\n pipeline_code_origin=pipeline_code_origin,\n )\n\n def _ensure_persisted_pipeline_snapshot(self, pipeline_snapshot, parent_pipeline_snapshot):\n from dagster.core.snap import PipelineSnapshot, create_pipeline_snapshot_id\n\n check.inst_param(pipeline_snapshot, "pipeline_snapshot", PipelineSnapshot)\n check.opt_inst_param(parent_pipeline_snapshot, "parent_pipeline_snapshot", PipelineSnapshot)\n\n if pipeline_snapshot.lineage_snapshot:\n if not self._run_storage.has_pipeline_snapshot(\n pipeline_snapshot.lineage_snapshot.parent_snapshot_id\n ):\n check.invariant(\n create_pipeline_snapshot_id(parent_pipeline_snapshot)\n == pipeline_snapshot.lineage_snapshot.parent_snapshot_id,\n "Parent pipeline snapshot id out of sync with passed parent pipeline snapshot",\n )\n\n returned_pipeline_snapshot_id = self._run_storage.add_pipeline_snapshot(\n parent_pipeline_snapshot\n )\n check.invariant(\n pipeline_snapshot.lineage_snapshot.parent_snapshot_id\n == returned_pipeline_snapshot_id\n )\n\n pipeline_snapshot_id = create_pipeline_snapshot_id(pipeline_snapshot)\n if not self._run_storage.has_pipeline_snapshot(pipeline_snapshot_id):\n returned_pipeline_snapshot_id = self._run_storage.add_pipeline_snapshot(\n pipeline_snapshot\n )\n check.invariant(pipeline_snapshot_id == returned_pipeline_snapshot_id)\n\n return pipeline_snapshot_id\n\n def _ensure_persisted_execution_plan_snapshot(\n self, execution_plan_snapshot, pipeline_snapshot_id, step_keys_to_execute\n ):\n from dagster.core.snap.execution_plan_snapshot import (\n 
ExecutionPlanSnapshot,\n create_execution_plan_snapshot_id,\n )\n\n check.inst_param(execution_plan_snapshot, "execution_plan_snapshot", ExecutionPlanSnapshot)\n check.str_param(pipeline_snapshot_id, "pipeline_snapshot_id")\n check.opt_nullable_list_param(step_keys_to_execute, "step_keys_to_execute", of_type=str)\n\n check.invariant(\n execution_plan_snapshot.pipeline_snapshot_id == pipeline_snapshot_id,\n (\n "Snapshot mismatch: Snapshot ID in execution plan snapshot is "\n '"{ep_pipeline_snapshot_id}" and snapshot_id created in memory is '\n '"{pipeline_snapshot_id}"'\n ).format(\n ep_pipeline_snapshot_id=execution_plan_snapshot.pipeline_snapshot_id,\n pipeline_snapshot_id=pipeline_snapshot_id,\n ),\n )\n\n execution_plan_snapshot_id = create_execution_plan_snapshot_id(execution_plan_snapshot)\n\n if not self._run_storage.has_execution_plan_snapshot(execution_plan_snapshot_id):\n returned_execution_plan_snapshot_id = self._run_storage.add_execution_plan_snapshot(\n execution_plan_snapshot\n )\n\n check.invariant(execution_plan_snapshot_id == returned_execution_plan_snapshot_id)\n\n return execution_plan_snapshot_id\n\n def _log_asset_materialization_planned_events(self, pipeline_run, execution_plan_snapshot):\n from dagster.core.events import DagsterEvent\n from dagster.core.execution.context_creation_pipeline import initialize_console_manager\n\n pipeline_name = pipeline_run.pipeline_name\n\n for step in execution_plan_snapshot.steps:\n if step.key in execution_plan_snapshot.step_keys_to_execute:\n for output in step.outputs:\n asset_key = output.properties.asset_key\n if asset_key:\n # Logs and stores asset_materialization_planned event\n DagsterEvent.asset_materialization_planned(\n pipeline_name, asset_key, initialize_console_manager(pipeline_run, self)\n )\n\n def create_run(\n self,\n pipeline_name,\n run_id,\n run_config,\n mode,\n solids_to_execute,\n step_keys_to_execute,\n status,\n tags,\n root_run_id,\n parent_run_id,\n pipeline_snapshot,\n execution_plan_snapshot,\n parent_pipeline_snapshot,\n asset_selection=None,\n solid_selection=None,\n external_pipeline_origin=None,\n pipeline_code_origin=None,\n ):\n\n pipeline_run = self._construct_run_with_snapshots(\n pipeline_name=pipeline_name,\n run_id=run_id,\n run_config=run_config,\n mode=mode,\n asset_selection=asset_selection,\n solid_selection=solid_selection,\n solids_to_execute=solids_to_execute,\n step_keys_to_execute=step_keys_to_execute,\n status=status,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n pipeline_snapshot=pipeline_snapshot,\n execution_plan_snapshot=execution_plan_snapshot,\n parent_pipeline_snapshot=parent_pipeline_snapshot,\n external_pipeline_origin=external_pipeline_origin,\n pipeline_code_origin=pipeline_code_origin,\n )\n\n pipeline_run = self._run_storage.add_run(pipeline_run)\n\n if execution_plan_snapshot:\n self._log_asset_materialization_planned_events(pipeline_run, execution_plan_snapshot)\n\n return pipeline_run\n\n def create_reexecuted_run(\n self,\n parent_run: PipelineRun,\n repo_location: "RepositoryLocation",\n external_pipeline: "ExternalPipeline",\n strategy: "ReexecutionStrategy",\n extra_tags: Optional[Dict[str, Any]] = None,\n run_config: Optional[Dict[str, Any]] = None,\n mode: Optional[str] = None,\n use_parent_run_tags: bool = False,\n ) -> PipelineRun:\n from dagster.core.execution.plan.resume_retry import (\n ReexecutionStrategy,\n get_retry_steps_from_parent_run,\n )\n from dagster.core.host_representation import ExternalPipeline, 
RepositoryLocation\n\n check.inst_param(parent_run, "parent_run", PipelineRun)\n check.inst_param(repo_location, "repo_location", RepositoryLocation)\n check.inst_param(external_pipeline, "external_pipeline", ExternalPipeline)\n check.inst_param(strategy, "strategy", ReexecutionStrategy)\n check.opt_dict_param(extra_tags, "extra_tags", key_type=str)\n check.opt_dict_param(run_config, "run_config", key_type=str)\n check.opt_str_param(mode, "mode")\n\n check.bool_param(use_parent_run_tags, "use_parent_run_tags")\n\n root_run_id = parent_run.root_run_id or parent_run.run_id\n parent_run_id = parent_run.run_id\n\n tags = merge_dicts(\n external_pipeline.tags,\n # these can differ from external_pipeline.tags if tags were added at launch time\n parent_run.tags if use_parent_run_tags else {},\n extra_tags or {},\n {\n PARENT_RUN_ID_TAG: parent_run_id,\n ROOT_RUN_ID_TAG: root_run_id,\n },\n )\n\n mode = cast(str, mode if mode is not None else parent_run.mode)\n run_config = run_config if run_config is not None else parent_run.run_config\n\n if strategy == ReexecutionStrategy.FROM_FAILURE:\n check.invariant(\n parent_run.status == PipelineRunStatus.FAILURE,\n "Cannot reexecute from failure a run that is not failed",\n )\n\n step_keys_to_execute, known_state = get_retry_steps_from_parent_run(\n self, parent_run=parent_run\n )\n tags[RESUME_RETRY_TAG] = "true"\n elif strategy == ReexecutionStrategy.ALL_STEPS:\n step_keys_to_execute = None\n known_state = None\n else:\n raise DagsterInvariantViolationError(f"Unknown reexecution strategy: {strategy}")\n\n external_execution_plan = repo_location.get_external_execution_plan(\n external_pipeline,\n run_config,\n mode=mode,\n step_keys_to_execute=step_keys_to_execute,\n known_state=known_state,\n instance=self,\n )\n\n return self.create_run(\n pipeline_name=parent_run.pipeline_name,\n run_id=None,\n run_config=run_config,\n mode=mode,\n solids_to_execute=parent_run.solids_to_execute,\n step_keys_to_execute=step_keys_to_execute,\n status=PipelineRunStatus.NOT_STARTED,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n pipeline_snapshot=external_pipeline.pipeline_snapshot,\n execution_plan_snapshot=external_execution_plan.execution_plan_snapshot,\n parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot,\n solid_selection=parent_run.solid_selection,\n asset_selection=parent_run.asset_selection,\n external_pipeline_origin=external_pipeline.get_external_origin(),\n pipeline_code_origin=external_pipeline.get_python_origin(),\n )\n\n def register_managed_run(\n self,\n pipeline_name,\n run_id,\n run_config,\n mode,\n solids_to_execute,\n step_keys_to_execute,\n tags,\n root_run_id,\n parent_run_id,\n pipeline_snapshot,\n execution_plan_snapshot,\n parent_pipeline_snapshot,\n solid_selection=None,\n ):\n # The usage of this method is limited to dagster-airflow, specifically in Dagster\n # Operators that are executed in Airflow. Because a common workflow in Airflow is to\n # retry dags from arbitrary tasks, we need any node to be capable of creating a\n # PipelineRun.\n #\n # The try-except DagsterRunAlreadyExists block handles the race when multiple "root" tasks\n # simultaneously execute self._run_storage.add_run(pipeline_run). 
When this happens, only\n # one task succeeds in creating the run, while the others get DagsterRunAlreadyExists\n # error; at this point, the failed tasks try again to fetch the existing run.\n # https://github.com/dagster-io/dagster/issues/2412\n\n pipeline_run = self._construct_run_with_snapshots(\n pipeline_name=pipeline_name,\n run_id=run_id,\n run_config=run_config,\n mode=mode,\n solid_selection=solid_selection,\n solids_to_execute=solids_to_execute,\n step_keys_to_execute=step_keys_to_execute,\n status=PipelineRunStatus.MANAGED,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n pipeline_snapshot=pipeline_snapshot,\n execution_plan_snapshot=execution_plan_snapshot,\n parent_pipeline_snapshot=parent_pipeline_snapshot,\n )\n\n def get_run():\n candidate_run = self.get_run_by_id(pipeline_run.run_id)\n\n field_diff = _check_run_equality(pipeline_run, candidate_run)\n\n if field_diff:\n raise DagsterRunConflict(\n "Found conflicting existing run with same id {run_id}. Runs differ in:"\n "\\n{field_diff}".format(\n run_id=pipeline_run.run_id,\n field_diff=_format_field_diff(field_diff),\n ),\n )\n return candidate_run\n\n if self.has_run(pipeline_run.run_id):\n return get_run()\n\n try:\n return self._run_storage.add_run(pipeline_run)\n except DagsterRunAlreadyExists:\n return get_run()\n\n @traced\n def add_run(self, pipeline_run: PipelineRun):\n return self._run_storage.add_run(pipeline_run)\n\n @traced\n def add_snapshot(self, snapshot, snapshot_id=None):\n return self._run_storage.add_snapshot(snapshot, snapshot_id)\n\n @traced\n def handle_run_event(self, run_id: str, event: "DagsterEvent"):\n return self._run_storage.handle_run_event(run_id, event)\n\n @traced\n def add_run_tags(self, run_id: str, new_tags: Dict[str, str]):\n return self._run_storage.add_run_tags(run_id, new_tags)\n\n @traced\n def has_run(self, run_id: str) -> bool:\n return self._run_storage.has_run(run_id)\n\n @traced\n def get_runs(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> Iterable[PipelineRun]:\n return self._run_storage.get_runs(filters, cursor, limit, bucket_by)\n\n @traced\n def get_runs_count(self, filters: Optional[RunsFilter] = None) -> int:\n return self._run_storage.get_runs_count(filters)\n\n @traced\n def get_run_groups(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Dict[str, Dict[str, Union[Iterable[PipelineRun], int]]]:\n return self._run_storage.get_run_groups(filters=filters, cursor=cursor, limit=limit)\n\n @traced\n def get_run_records(\n self,\n filters: Optional[RunsFilter] = None,\n limit: Optional[int] = None,\n order_by: Optional[str] = None,\n ascending: bool = False,\n cursor: Optional[str] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> List[RunRecord]:\n """Return a list of run records stored in the run storage, sorted by the given column in given order.\n\n Args:\n filters (Optional[RunsFilter]): the filter by which to filter runs.\n limit (Optional[int]): Number of results to get. Defaults to infinite.\n order_by (Optional[str]): Name of the column to sort by. Defaults to id.\n ascending (Optional[bool]): Sort the result in ascending order if True, descending\n otherwise. 
Defaults to descending.\n\n Returns:\n List[RunRecord]: List of run records stored in the run storage.\n """\n return self._run_storage.get_run_records(\n filters, limit, order_by, ascending, cursor, bucket_by\n )\n\n @property\n def supports_bucket_queries(self):\n return self._run_storage.supports_bucket_queries\n\n @traced\n def get_run_partition_data(\n self, partition_set_name: str, job_name: str, repository_label: str\n ) -> List[RunPartitionData]:\n """Get run partition data for a given partitioned job."""\n return self._run_storage.get_run_partition_data(\n partition_set_name, job_name, repository_label\n )\n\n def wipe(self):\n self._run_storage.wipe()\n self._event_storage.wipe()\n\n @traced\n def delete_run(self, run_id: str):\n self._run_storage.delete_run(run_id)\n self._event_storage.delete_events(run_id)\n\n # event storage\n @traced\n def logs_after(\n self,\n run_id,\n cursor,\n of_type: Optional["DagsterEventType"] = None,\n limit: Optional[int] = None,\n ):\n return self._event_storage.get_logs_for_run(\n run_id,\n cursor=cursor,\n of_type=of_type,\n limit=limit,\n )\n\n @traced\n def all_logs(\n self, run_id, of_type: Optional[Union["DagsterEventType", Set["DagsterEventType"]]] = None\n ):\n return self._event_storage.get_logs_for_run(run_id, of_type=of_type)\n\n @traced\n def get_records_for_run(\n self,\n run_id: str,\n cursor: Optional[str] = None,\n of_type: Optional[Union["DagsterEventType", Set["DagsterEventType"]]] = None,\n limit: Optional[int] = None,\n ):\n return self._event_storage.get_records_for_run(run_id, cursor, of_type, limit)\n\n def watch_event_logs(self, run_id, cursor, cb):\n return self._event_storage.watch(run_id, cursor, cb)\n\n def end_watch_event_logs(self, run_id, cb):\n return self._event_storage.end_watch(run_id, cb)\n\n # asset storage\n\n @traced\n def all_asset_keys(self):\n return self._event_storage.all_asset_keys()\n\n @traced\n def get_asset_keys(self, prefix=None, limit=None, cursor=None):\n return self._event_storage.get_asset_keys(prefix=prefix, limit=limit, cursor=cursor)\n\n @traced\n def has_asset_key(self, asset_key: AssetKey) -> bool:\n return self._event_storage.has_asset_key(asset_key)\n\n @traced\n def get_latest_materialization_events(\n self, asset_keys: Sequence[AssetKey]\n ) -> Mapping[AssetKey, Optional["EventLogEntry"]]:\n return self._event_storage.get_latest_materialization_events(asset_keys)\n\n @traced\n def get_event_records(\n self,\n event_records_filter: Optional["EventRecordsFilter"] = None,\n limit: Optional[int] = None,\n ascending: bool = False,\n ) -> Iterable["EventLogRecord"]:\n """Return a list of event records stored in the event log storage.\n\n Args:\n event_records_filter (Optional[EventRecordsFilter]): the filter by which to filter event\n records.\n limit (Optional[int]): Number of results to get. Defaults to infinite.\n ascending (Optional[bool]): Sort the result in ascending order if True, descending\n otherwise. 
Defaults to descending.\n\n Returns:\n List[EventLogRecord]: List of event log records stored in the event log storage.\n """\n return self._event_storage.get_event_records(event_records_filter, limit, ascending)\n\n @traced\n def get_asset_records(\n self, asset_keys: Optional[Sequence[AssetKey]] = None\n ) -> Iterable["AssetRecord"]:\n return self._event_storage.get_asset_records(asset_keys)\n\n @traced\n def events_for_asset_key(\n self,\n asset_key,\n partitions=None,\n before_cursor=None,\n after_cursor=None,\n cursor=None,\n before_timestamp=None,\n limit=None,\n ascending=False,\n ):\n check.inst_param(asset_key, "asset_key", AssetKey)\n\n warnings.warn(\n """\nThe method `events_for_asset_key` on DagsterInstance has been deprecated as of `0.12.0` in favor of\nthe method `get_event_records`. The method `get_event_records` takes in an `EventRecordsFilter`\nargument that allows for filtering by asset key and asset key partitions. The return value is a\nlist of `EventLogRecord` objects, each of which contains a storage_id and an event log entry.\n\nExample:\nrecords = instance.get_event_records(\n EventRecordsFilter(\n asset_key=asset_key,\n asset_partitions=partitions,\n after_cursor=after_cursor,\n ),\n)\n"""\n )\n\n return self._event_storage.get_asset_events(\n asset_key,\n partitions,\n before_cursor,\n after_cursor,\n limit,\n before_timestamp=before_timestamp,\n ascending=ascending,\n include_cursor=True,\n cursor=cursor,\n )\n\n @traced\n def run_ids_for_asset_key(self, asset_key):\n check.inst_param(asset_key, "asset_key", AssetKey)\n return self._event_storage.get_asset_run_ids(asset_key)\n\n @traced\n def wipe_assets(self, asset_keys):\n check.list_param(asset_keys, "asset_keys", of_type=AssetKey)\n for asset_key in asset_keys:\n self._event_storage.wipe_asset(asset_key)\n\n @traced\n def get_materialization_count_by_partition(\n self, asset_keys: Sequence[AssetKey]\n ) -> Mapping[AssetKey, Mapping[str, int]]:\n return self._event_storage.get_materialization_count_by_partition(asset_keys)\n\n # event subscriptions\n\n def _get_yaml_python_handlers(self):\n if self._settings:\n logging_config = self.get_settings("python_logs").get("dagster_handler_config", {})\n\n if logging_config:\n experimental_functionality_warning("Handling yaml-defined logging configuration")\n\n # Handlers can only be retrieved from dictConfig configuration if they are attached\n # to a logger. 
We add a dummy logger to the configuration that allows us to access user\n # defined handlers.\n handler_names = logging_config.get("handlers", {}).keys()\n\n dagster_dummy_logger_name = "dagster_dummy_logger"\n\n processed_dict_conf = {\n "version": 1,\n "disable_existing_loggers": False,\n "loggers": {dagster_dummy_logger_name: {"handlers": handler_names}},\n }\n processed_dict_conf.update(logging_config)\n\n logging.config.dictConfig(processed_dict_conf)\n\n dummy_logger = logging.getLogger(dagster_dummy_logger_name)\n return dummy_logger.handlers\n return []\n\n def _get_event_log_handler(self):\n event_log_handler = _EventListenerLogHandler(self)\n event_log_handler.setLevel(10)\n return event_log_handler\n\n def get_handlers(self):\n handlers = [self._get_event_log_handler()]\n handlers.extend(self._get_yaml_python_handlers())\n return handlers\n\n def store_event(self, event):\n self._event_storage.store_event(event)\n\n def handle_new_event(self, event):\n run_id = event.run_id\n\n self._event_storage.store_event(event)\n\n if event.is_dagster_event and event.dagster_event.is_pipeline_event:\n self._run_storage.handle_run_event(run_id, event.dagster_event)\n\n for sub in self._subscribers[run_id]:\n sub(event)\n\n def add_event_listener(self, run_id, cb):\n self._subscribers[run_id].append(cb)\n\n
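# Editor's sketch (not part of the original module): _get_yaml_python_handlers above
# reads the ``python_logs.dagster_handler_config`` block of dagster.yaml and routes its
# ``handlers`` entries through ``logging.config.dictConfig`` via a dummy logger. A minimal
# dagster.yaml block it could consume, assuming standard dictConfig handler syntax:
_EXAMPLE_PYTHON_LOGS_YAML = """
python_logs:
  dagster_handler_config:
    handlers:
      console_handler:
        class: logging.StreamHandler
        level: INFO
"""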
[docs] def report_engine_event(\n        self,\n        message,\n        pipeline_run=None,\n        engine_event_data=None,\n        cls=None,\n        step_key=None,\n        pipeline_name=None,\n        run_id=None,\n    ):\n        """\n        Report an EngineEvent that occurred outside of a pipeline execution context.\n        """\n        from dagster.core.events import DagsterEvent, DagsterEventType, EngineEventData\n        from dagster.core.events.log import EventLogEntry\n\n        check.opt_class_param(cls, "cls")\n        check.str_param(message, "message")\n        check.opt_inst_param(pipeline_run, "pipeline_run", PipelineRun)\n        check.opt_str_param(run_id, "run_id")\n        check.opt_str_param(pipeline_name, "pipeline_name")\n\n        check.invariant(\n            pipeline_run or (pipeline_name and run_id),\n            "Must include either pipeline_run or pipeline_name and run_id",\n        )\n\n        run_id = run_id if run_id else pipeline_run.run_id\n        pipeline_name = pipeline_name if pipeline_name else pipeline_run.pipeline_name\n\n        engine_event_data = check.opt_inst_param(\n            engine_event_data,\n            "engine_event_data",\n            EngineEventData,\n            EngineEventData([]),\n        )\n\n        if cls:\n            message = "[{}] {}".format(cls.__name__, message)\n\n        log_level = logging.INFO\n        if engine_event_data and engine_event_data.error:\n            log_level = logging.ERROR\n\n        dagster_event = DagsterEvent(\n            event_type_value=DagsterEventType.ENGINE_EVENT.value,\n            pipeline_name=pipeline_name,\n            message=message,\n            event_specific_data=engine_event_data,\n            step_key=step_key,\n        )\n        event_record = EventLogEntry(\n            user_message="",\n            level=log_level,\n            pipeline_name=pipeline_name,\n            run_id=run_id,\n            error_info=None,\n            timestamp=time.time(),\n            step_key=step_key,\n            dagster_event=dagster_event,\n        )\n\n        self.handle_new_event(event_record)\n        return dagster_event
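# Usage sketch (editor's addition): per the invariant above, report_engine_event needs
# either a PipelineRun or an explicit pipeline_name/run_id pair. ``instance`` and ``run``
# are assumed to be an existing DagsterInstance and PipelineRun.
def _example_report_engine_event(instance, run):
    # With the run object in hand:
    instance.report_engine_event("Run worker restarted.", pipeline_run=run)
    # Or with only identifiers:
    instance.report_engine_event(
        "External cleanup completed.",
        pipeline_name=run.pipeline_name,
        run_id=run.run_id,
    )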
\n\n    def report_run_canceling(self, run, message=None):\n\n        from dagster.core.events import DagsterEvent, DagsterEventType\n        from dagster.core.events.log import EventLogEntry\n\n        check.inst_param(run, "run", PipelineRun)\n        message = check.opt_str_param(\n            message,\n            "message",\n            "Sending run termination request.",\n        )\n        canceling_event = DagsterEvent(\n            event_type_value=DagsterEventType.PIPELINE_CANCELING.value,\n            pipeline_name=run.pipeline_name,\n            message=message,\n        )\n\n        event_record = EventLogEntry(\n            user_message="",\n            level=logging.INFO,\n            pipeline_name=run.pipeline_name,\n            run_id=run.run_id,\n            error_info=None,\n            timestamp=time.time(),\n            dagster_event=canceling_event,\n        )\n\n        self.handle_new_event(event_record)\n\n    def report_run_canceled(\n        self,\n        pipeline_run,\n        message=None,\n    ):\n        from dagster.core.events import DagsterEvent, DagsterEventType\n        from dagster.core.events.log import EventLogEntry\n\n        check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n\n        message = check.opt_str_param(\n            message,\n            "message",\n            "This run has been marked as canceled from outside the execution context.",\n        )\n\n        dagster_event = DagsterEvent(\n            event_type_value=DagsterEventType.PIPELINE_CANCELED.value,\n            pipeline_name=pipeline_run.pipeline_name,\n            message=message,\n        )\n        event_record = EventLogEntry(\n            user_message="",\n            level=logging.ERROR,\n            pipeline_name=pipeline_run.pipeline_name,\n            run_id=pipeline_run.run_id,\n            error_info=None,\n            timestamp=time.time(),\n            dagster_event=dagster_event,\n        )\n\n        self.handle_new_event(event_record)\n        return dagster_event\n\n    def report_run_failed(self, pipeline_run, message=None):\n        from dagster.core.events import DagsterEvent, DagsterEventType\n        from dagster.core.events.log import EventLogEntry\n\n        check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n\n        message = check.opt_str_param(\n            message,\n            "message",\n            "This run has been marked as failed from outside the execution context.",\n        )\n\n        dagster_event = DagsterEvent(\n            event_type_value=DagsterEventType.PIPELINE_FAILURE.value,\n            pipeline_name=pipeline_run.pipeline_name,\n            message=message,\n        )\n        event_record = EventLogEntry(\n            user_message="",\n            level=logging.ERROR,\n            pipeline_name=pipeline_run.pipeline_name,\n            run_id=pipeline_run.run_id,\n            error_info=None,\n            timestamp=time.time(),\n            dagster_event=dagster_event,\n        )\n\n        self.handle_new_event(event_record)\n        return dagster_event\n\n    # directories\n\n    def file_manager_directory(self, run_id):\n        return self._local_artifact_storage.file_manager_dir(run_id)\n\n    def storage_directory(self):\n        return self._local_artifact_storage.storage_dir\n\n    def schedules_directory(self):\n        return self._local_artifact_storage.schedules_dir\n\n    # Runs coordinator\n\n
[docs] def submit_run(self, run_id, workspace: "IWorkspace") -> PipelineRun:\n """Submit a pipeline run to the coordinator.\n\n This method delegates to the ``RunCoordinator``, configured on the instance, and will\n call its implementation of ``RunCoordinator.submit_run()`` to send the run to the\n coordinator for execution. Runs should be created in the instance (e.g., by calling\n ``DagsterInstance.create_run()``) *before* this method is called, and\n should be in the ``PipelineRunStatus.NOT_STARTED`` state. They also must have a non-null\n ExternalPipelineOrigin.\n\n Args:\n run_id (str): The id of the run.\n """\n\n from dagster.core.host_representation import ExternalPipelineOrigin\n from dagster.core.origin import PipelinePythonOrigin\n from dagster.core.run_coordinator import SubmitRunContext\n\n run = self.get_run_by_id(run_id)\n if run is None:\n raise DagsterInvariantViolationError(\n f"Could not load run {run_id} that was passed to submit_run"\n )\n\n check.inst(\n run.external_pipeline_origin,\n ExternalPipelineOrigin,\n "External pipeline origin must be set for submitted runs",\n )\n check.inst(\n run.pipeline_code_origin,\n PipelinePythonOrigin,\n "Python origin must be set for submitted runs",\n )\n\n try:\n submitted_run = self._run_coordinator.submit_run(\n SubmitRunContext(run, workspace=workspace)\n )\n except:\n from dagster.core.events import EngineEventData\n\n error = serializable_error_info_from_exc_info(sys.exc_info())\n self.report_engine_event(\n error.message,\n run,\n EngineEventData.engine_error(error),\n )\n self.report_run_failed(run)\n raise\n\n return submitted_run
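# Usage sketch (editor's addition): as the docstring above notes, the run must already
# exist in the instance (e.g. via ``DagsterInstance.create_run()``), be in the NOT_STARTED
# state, and carry an ExternalPipelineOrigin. ``instance``, ``run`` and ``workspace`` are
# assumed to be set up elsewhere.
def _example_submit_existing_run(instance, run, workspace):
    # Delegates to the configured RunCoordinator (e.g. the QueuedRunCoordinator).
    return instance.submit_run(run.run_id, workspace)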
\n\n # Run launcher\n\n
[docs] def launch_run(self, run_id: str, workspace: "IWorkspace"):\n        """Launch a pipeline run.\n\n        This method is typically called using `instance.submit_run` rather than being invoked\n        directly. This method delegates to the ``RunLauncher``, if any, configured on the instance,\n        and will call its implementation of ``RunLauncher.launch_run()`` to begin the execution of\n        the specified run. Runs should be created in the instance (e.g., by calling\n        ``DagsterInstance.create_run()``) *before* this method is called, and should be in the\n        ``PipelineRunStatus.NOT_STARTED`` state.\n\n        Args:\n            run_id (str): The id of the run to launch.\n        """\n        from dagster.core.events import DagsterEvent, DagsterEventType, EngineEventData\n        from dagster.core.events.log import EventLogEntry\n        from dagster.core.launcher import LaunchRunContext\n\n        run = self.get_run_by_id(run_id)\n        if run is None:\n            raise DagsterInvariantViolationError(\n                f"Could not load run {run_id} that was passed to launch_run"\n            )\n\n        launch_started_event = DagsterEvent(\n            event_type_value=DagsterEventType.PIPELINE_STARTING.value,\n            pipeline_name=run.pipeline_name,\n        )\n\n        event_record = EventLogEntry(\n            user_message="",\n            level=logging.INFO,\n            pipeline_name=run.pipeline_name,\n            run_id=run.run_id,\n            error_info=None,\n            timestamp=time.time(),\n            dagster_event=launch_started_event,\n        )\n\n        self.handle_new_event(event_record)\n\n        run = self.get_run_by_id(run_id)\n        if run is None:\n            check.failed(f"Failed to reload run {run_id}")\n\n        try:\n            self._run_launcher.launch_run(LaunchRunContext(pipeline_run=run, workspace=workspace))\n        except:\n            error = serializable_error_info_from_exc_info(sys.exc_info())\n            self.report_engine_event(\n                error.message,\n                run,\n                EngineEventData.engine_error(error),\n            )\n            self.report_run_failed(run)\n            raise\n\n        return run
\n\n
[docs] def resume_run(self, run_id: str, workspace: "IWorkspace", attempt_number: int):\n        """Resume a pipeline run.\n\n        This method should be called on runs which have already been launched, but whose run workers\n        have died.\n\n        Args:\n            run_id (str): The id of the run to resume.\n        """\n        from dagster.core.events import EngineEventData\n        from dagster.core.launcher import ResumeRunContext\n        from dagster.daemon.monitoring import RESUME_RUN_LOG_MESSAGE\n\n        run = self.get_run_by_id(run_id)\n        if run is None:\n            raise DagsterInvariantViolationError(\n                f"Could not load run {run_id} that was passed to resume_run"\n            )\n        if run.status not in IN_PROGRESS_RUN_STATUSES:\n            raise DagsterInvariantViolationError(\n                f"Run {run_id} is not in a state that can be resumed"\n            )\n\n        self.report_engine_event(\n            RESUME_RUN_LOG_MESSAGE,\n            run,\n        )\n\n        try:\n            self._run_launcher.resume_run(\n                ResumeRunContext(\n                    pipeline_run=run,\n                    workspace=workspace,\n                    resume_attempt_number=attempt_number,\n                )\n            )\n        except:\n            error = serializable_error_info_from_exc_info(sys.exc_info())\n            self.report_engine_event(\n                error.message,\n                run,\n                EngineEventData.engine_error(error),\n            )\n            self.report_run_failed(run)\n            raise\n\n        return run
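# Usage sketch (editor's addition): resume_run is normally driven by the run monitoring
# daemon rather than called by hand; how the attempt number is chosen here is an
# assumption for illustration. ``instance`` and ``workspace`` are assumed to be a
# DagsterInstance and an IWorkspace.
def _example_resume_dead_run(instance, run_id, workspace):
    attempt_number = instance.count_resume_run_attempts(run_id) + 1
    return instance.resume_run(run_id, workspace, attempt_number)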
\n\n def count_resume_run_attempts(self, run_id: str):\n from dagster.daemon.monitoring import count_resume_run_attempts\n\n return count_resume_run_attempts(self, run_id)\n\n def run_will_resume(self, run_id: str):\n if not self.run_monitoring_enabled:\n return False\n return self.count_resume_run_attempts(run_id) < self.run_monitoring_max_resume_run_attempts\n\n # Scheduler\n\n def start_schedule(self, external_schedule):\n return self._scheduler.start_schedule(self, external_schedule)\n\n def stop_schedule(self, schedule_origin_id, schedule_selector_id, external_schedule):\n return self._scheduler.stop_schedule(\n self, schedule_origin_id, schedule_selector_id, external_schedule\n )\n\n def scheduler_debug_info(self):\n from dagster.core.definitions.run_request import InstigatorType\n from dagster.core.scheduler import SchedulerDebugInfo\n\n errors = []\n\n schedules = []\n for schedule_state in self.all_instigator_state(instigator_type=InstigatorType.SCHEDULE):\n schedule_info = {\n schedule_state.instigator_name: {\n "status": schedule_state.status.value,\n "cron_schedule": schedule_state.instigator_data.cron_schedule,\n "schedule_origin_id": schedule_state.instigator_origin_id,\n "repository_origin_id": schedule_state.repository_origin_id,\n }\n }\n\n schedules.append(yaml.safe_dump(schedule_info, default_flow_style=False))\n\n return SchedulerDebugInfo(\n scheduler_config_info=self._info_str_for_component("Scheduler", self.scheduler),\n scheduler_info=self.scheduler.debug_info(),\n schedule_storage=schedules,\n errors=errors,\n )\n\n # Schedule / Sensor Storage\n\n def start_sensor(self, external_sensor: "ExternalSensor"):\n from dagster.core.definitions.run_request import InstigatorType\n from dagster.core.scheduler.instigation import (\n InstigatorState,\n InstigatorStatus,\n SensorInstigatorData,\n )\n\n stored_state = self.get_instigator_state(\n external_sensor.get_external_origin_id(), external_sensor.selector_id\n )\n\n computed_state = external_sensor.get_current_instigator_state(stored_state)\n if computed_state.is_running:\n return computed_state\n\n if not stored_state:\n return self.add_instigator_state(\n InstigatorState(\n external_sensor.get_external_origin(),\n InstigatorType.SENSOR,\n InstigatorStatus.RUNNING,\n SensorInstigatorData(min_interval=external_sensor.min_interval_seconds),\n )\n )\n else:\n return self.update_instigator_state(stored_state.with_status(InstigatorStatus.RUNNING))\n\n def stop_sensor(\n self,\n instigator_origin_id: str,\n selector_id: str,\n external_sensor: Optional["ExternalSensor"],\n ):\n from dagster.core.definitions.run_request import InstigatorType\n from dagster.core.scheduler.instigation import (\n InstigatorState,\n InstigatorStatus,\n SensorInstigatorData,\n )\n\n stored_state = self.get_instigator_state(instigator_origin_id, selector_id)\n if external_sensor:\n computed_state = external_sensor.get_current_instigator_state(stored_state)\n else:\n computed_state = stored_state\n\n if not computed_state.is_running:\n return computed_state\n\n if not stored_state:\n assert external_sensor\n return self.add_instigator_state(\n InstigatorState(\n external_sensor.get_external_origin(),\n InstigatorType.SENSOR,\n InstigatorStatus.STOPPED,\n SensorInstigatorData(min_interval=external_sensor.min_interval_seconds),\n )\n )\n else:\n return self.update_instigator_state(stored_state.with_status(InstigatorStatus.STOPPED))\n\n @traced\n def all_instigator_state(\n self, repository_origin_id=None, repository_selector_id=None, 
instigator_type=None\n ):\n return self._schedule_storage.all_instigator_state(\n repository_origin_id, repository_selector_id, instigator_type\n )\n\n @traced\n def get_instigator_state(self, origin_id: str, selector_id: str) -> Optional["InstigatorState"]:\n if not self._schedule_storage:\n check.failed("Schedule storage not available")\n return self._schedule_storage.get_instigator_state(origin_id, selector_id)\n\n def add_instigator_state(self, state: "InstigatorState") -> "InstigatorState":\n if not self._schedule_storage:\n check.failed("Schedule storage not available")\n return self._schedule_storage.add_instigator_state(state)\n\n def update_instigator_state(self, state: "InstigatorState") -> "InstigatorState":\n if not self._schedule_storage:\n check.failed("Schedule storage not available")\n return self._schedule_storage.update_instigator_state(state)\n\n def delete_instigator_state(self, origin_id, selector_id):\n return self._schedule_storage.delete_instigator_state(origin_id, selector_id)\n\n @property\n def supports_batch_tick_queries(self):\n return self._schedule_storage and self._schedule_storage.supports_batch_queries\n\n @traced\n def get_batch_ticks(\n self,\n selector_ids: Sequence[str],\n limit: Optional[int] = None,\n statuses: Optional[Sequence["TickStatus"]] = None,\n ) -> Mapping[str, Iterable["InstigatorTick"]]:\n if not self._schedule_storage:\n return {}\n return self._schedule_storage.get_batch_ticks(selector_ids, limit, statuses)\n\n @traced\n def get_tick(self, origin_id, selector_id, timestamp):\n matches = self._schedule_storage.get_ticks(\n origin_id, selector_id, before=timestamp + 1, after=timestamp - 1, limit=1\n )\n return matches[0] if len(matches) else None\n\n @traced\n def get_ticks(self, origin_id, selector_id, before=None, after=None, limit=None, statuses=None):\n return self._schedule_storage.get_ticks(\n origin_id, selector_id, before=before, after=after, limit=limit, statuses=statuses\n )\n\n def create_tick(self, tick_data):\n return self._schedule_storage.create_tick(tick_data)\n\n def update_tick(self, tick):\n return self._schedule_storage.update_tick(tick)\n\n def purge_ticks(self, origin_id, selector_id, tick_status, before):\n self._schedule_storage.purge_ticks(origin_id, selector_id, tick_status, before)\n\n def wipe_all_schedules(self):\n if self._scheduler:\n self._scheduler.wipe(self)\n\n self._schedule_storage.wipe()\n\n def logs_path_for_schedule(self, schedule_origin_id):\n return self._scheduler.get_logs_path(self, schedule_origin_id)\n\n def __enter__(self):\n return self\n\n def __exit__(self, exception_type, exception_value, traceback):\n self.dispose()\n if DagsterInstance._EXIT_STACK:\n DagsterInstance._EXIT_STACK.close()\n\n # dagster daemon\n
[docs] def add_daemon_heartbeat(self, daemon_heartbeat: "DaemonHeartbeat"):\n """Called on a regular interval by the daemon"""\n self._run_storage.add_daemon_heartbeat(daemon_heartbeat)
\n\n
[docs] def get_daemon_heartbeats(self) -> Dict[str, "DaemonHeartbeat"]:\n """Latest heartbeats of all daemon types"""\n return self._run_storage.get_daemon_heartbeats()
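# Usage sketch (editor's addition): the heartbeat APIs above let callers check which
# daemons have reported in. ``instance`` is assumed to be a DagsterInstance.
def _example_reporting_daemons(instance):
    heartbeats = instance.get_daemon_heartbeats()  # Dict[str, DaemonHeartbeat]
    return sorted(heartbeats)  # daemon type names that have heartbeated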
\n\n def wipe_daemon_heartbeats(self):\n self._run_storage.wipe_daemon_heartbeats()\n\n def get_required_daemon_types(self):\n from dagster.core.run_coordinator import QueuedRunCoordinator\n from dagster.core.scheduler import DagsterDaemonScheduler\n from dagster.daemon.daemon import (\n BackfillDaemon,\n MonitoringDaemon,\n SchedulerDaemon,\n SensorDaemon,\n )\n from dagster.daemon.run_coordinator.queued_run_coordinator_daemon import (\n QueuedRunCoordinatorDaemon,\n )\n\n if self.is_ephemeral:\n return []\n\n daemons = [SensorDaemon.daemon_type(), BackfillDaemon.daemon_type()]\n if isinstance(self.scheduler, DagsterDaemonScheduler):\n daemons.append(SchedulerDaemon.daemon_type())\n if isinstance(self.run_coordinator, QueuedRunCoordinator):\n daemons.append(QueuedRunCoordinatorDaemon.daemon_type())\n if self.run_monitoring_enabled:\n daemons.append(MonitoringDaemon.daemon_type())\n return daemons\n\n # backfill\n def get_backfills(self, status=None, cursor=None, limit=None):\n return self._run_storage.get_backfills(status=status, cursor=cursor, limit=limit)\n\n def get_backfill(self, backfill_id):\n return self._run_storage.get_backfill(backfill_id)\n\n def add_backfill(self, partition_backfill):\n self._run_storage.add_backfill(partition_backfill)\n\n def update_backfill(self, partition_backfill):\n return self._run_storage.update_backfill(partition_backfill)\n\n @property\n def should_start_background_run_thread(self) -> bool:\n """\n Gate on an experimental feature to start a thread that monitors for if the run should be canceled.\n """\n return False
\n\n\ndef is_dagit_telemetry_enabled(instance):\n telemetry_settings = instance.get_settings("telemetry")\n if not telemetry_settings:\n return False\n\n if "experimental_dagit" in telemetry_settings:\n return telemetry_settings["experimental_dagit"]\n else:\n return False\n
", "current_page_name": "_modules/dagster/core/instance", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "ref": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.instance.ref

\nimport os\nfrom typing import Dict, NamedTuple, Optional\n\nimport yaml\n\nimport dagster._check as check\nfrom dagster.serdes import ConfigurableClassData, class_from_code_pointer, whitelist_for_serdes\n\nfrom .config import DAGSTER_CONFIG_YAML_FILENAME, dagster_instance_config\n\n\ndef _runs_directory(base):\n    return os.path.join(base, "history", "")\n\n\ndef compute_logs_directory(base):\n    return os.path.join(base, "storage")\n\n\ndef _event_logs_directory(base):\n    return os.path.join(base, "history", "runs", "")\n\n\ndef _schedule_directory(base):\n    return os.path.join(base, "schedules")\n\n\ndef configurable_class_data(config_field):\n    return ConfigurableClassData(\n        check.str_elem(config_field, "module"),\n        check.str_elem(config_field, "class"),\n        yaml.dump(check.opt_dict_elem(config_field, "config"), default_flow_style=False),\n    )\n\n\ndef configurable_class_data_or_default(config_value, field_name, default):\n    return (\n        configurable_class_data(config_value[field_name])\n        if config_value.get(field_name)\n        else default\n    )\n\n\n
[docs]@whitelist_for_serdes\nclass InstanceRef(\n NamedTuple(\n "_InstanceRef",\n [\n ("local_artifact_storage_data", ConfigurableClassData),\n ("run_storage_data", ConfigurableClassData),\n ("event_storage_data", ConfigurableClassData),\n ("compute_logs_data", ConfigurableClassData),\n ("schedule_storage_data", Optional[ConfigurableClassData]),\n ("scheduler_data", Optional[ConfigurableClassData]),\n ("run_coordinator_data", Optional[ConfigurableClassData]),\n ("run_launcher_data", Optional[ConfigurableClassData]),\n ("settings", Dict[str, object]),\n ("custom_instance_class_data", Optional[ConfigurableClassData]),\n ],\n )\n):\n """Serializable representation of a :py:class:`DagsterInstance`.\n\n Users should not instantiate this class directly.\n """\n\n def __new__(\n cls,\n local_artifact_storage_data: ConfigurableClassData,\n run_storage_data: ConfigurableClassData,\n event_storage_data: ConfigurableClassData,\n compute_logs_data: ConfigurableClassData,\n schedule_storage_data: Optional[ConfigurableClassData],\n scheduler_data: Optional[ConfigurableClassData],\n run_coordinator_data: Optional[ConfigurableClassData],\n run_launcher_data: Optional[ConfigurableClassData],\n settings: Dict[str, object],\n custom_instance_class_data: Optional[ConfigurableClassData] = None,\n ):\n return super(cls, InstanceRef).__new__(\n cls,\n local_artifact_storage_data=check.inst_param(\n local_artifact_storage_data, "local_artifact_storage_data", ConfigurableClassData\n ),\n run_storage_data=check.inst_param(\n run_storage_data, "run_storage_data", ConfigurableClassData\n ),\n event_storage_data=check.inst_param(\n event_storage_data, "event_storage_data", ConfigurableClassData\n ),\n compute_logs_data=check.inst_param(\n compute_logs_data, "compute_logs_data", ConfigurableClassData\n ),\n schedule_storage_data=check.opt_inst_param(\n schedule_storage_data, "schedule_storage_data", ConfigurableClassData\n ),\n scheduler_data=check.opt_inst_param(\n scheduler_data, "scheduler_data", ConfigurableClassData\n ),\n run_coordinator_data=check.opt_inst_param(\n run_coordinator_data, "run_coordinator_data", ConfigurableClassData\n ),\n run_launcher_data=check.opt_inst_param(\n run_launcher_data, "run_launcher_data", ConfigurableClassData\n ),\n settings=check.opt_dict_param(settings, "settings", key_type=str),\n custom_instance_class_data=check.opt_inst_param(\n custom_instance_class_data,\n "instance_class",\n ConfigurableClassData,\n ),\n )\n\n @staticmethod\n def config_defaults(base_dir):\n return {\n "local_artifact_storage": ConfigurableClassData(\n "dagster.core.storage.root",\n "LocalArtifactStorage",\n yaml.dump({"base_dir": base_dir}, default_flow_style=False),\n ),\n "run_storage": ConfigurableClassData(\n "dagster.core.storage.runs",\n "SqliteRunStorage",\n yaml.dump({"base_dir": _runs_directory(base_dir)}, default_flow_style=False),\n ),\n "event_log_storage": ConfigurableClassData(\n "dagster.core.storage.event_log",\n "SqliteEventLogStorage",\n yaml.dump({"base_dir": _event_logs_directory(base_dir)}, default_flow_style=False),\n ),\n "compute_logs": ConfigurableClassData(\n "dagster.core.storage.local_compute_log_manager",\n "LocalComputeLogManager",\n yaml.dump({"base_dir": compute_logs_directory(base_dir)}, default_flow_style=False),\n ),\n "schedule_storage": ConfigurableClassData(\n "dagster.core.storage.schedules",\n "SqliteScheduleStorage",\n yaml.dump({"base_dir": _schedule_directory(base_dir)}, default_flow_style=False),\n ),\n "scheduler": ConfigurableClassData(\n 
"dagster.core.scheduler",\n "DagsterDaemonScheduler",\n yaml.dump({}),\n ),\n "run_coordinator": ConfigurableClassData(\n "dagster.core.run_coordinator", "DefaultRunCoordinator", yaml.dump({})\n ),\n "run_launcher": ConfigurableClassData(\n "dagster",\n "DefaultRunLauncher",\n yaml.dump({}),\n ),\n }\n\n @staticmethod\n def from_dir(base_dir, config_filename=DAGSTER_CONFIG_YAML_FILENAME, overrides=None):\n overrides = check.opt_dict_param(overrides, "overrides")\n config_value, custom_instance_class = dagster_instance_config(\n base_dir, config_filename=config_filename, overrides=overrides\n )\n\n if custom_instance_class:\n config_keys = set(custom_instance_class.config_schema().keys())\n custom_instance_class_config = {\n key: val for key, val in config_value.items() if key in config_keys\n }\n custom_instance_class_data = ConfigurableClassData(\n config_value["instance_class"]["module"],\n config_value["instance_class"]["class"],\n yaml.dump(custom_instance_class_config, default_flow_style=False),\n )\n defaults = custom_instance_class.config_defaults(base_dir)\n else:\n custom_instance_class_data = None\n defaults = InstanceRef.config_defaults(base_dir)\n\n local_artifact_storage_data = configurable_class_data_or_default(\n config_value, "local_artifact_storage", defaults["local_artifact_storage"]\n )\n\n run_storage_data = configurable_class_data_or_default(\n config_value, "run_storage", defaults["run_storage"]\n )\n\n event_storage_data = configurable_class_data_or_default(\n config_value, "event_log_storage", defaults["event_log_storage"]\n )\n\n compute_logs_data = configurable_class_data_or_default(\n config_value,\n "compute_logs",\n defaults["compute_logs"],\n )\n\n schedule_storage_data = configurable_class_data_or_default(\n config_value, "schedule_storage", defaults["schedule_storage"]\n )\n\n scheduler_data = configurable_class_data_or_default(\n config_value, "scheduler", defaults["scheduler"]\n )\n\n run_coordinator_data = configurable_class_data_or_default(\n config_value,\n "run_coordinator",\n defaults["run_coordinator"],\n )\n\n run_launcher_data = configurable_class_data_or_default(\n config_value,\n "run_launcher",\n defaults["run_launcher"],\n )\n\n settings_keys = {"telemetry", "python_logs", "run_monitoring", "code_servers"}\n settings = {key: config_value.get(key) for key in settings_keys if config_value.get(key)}\n\n return InstanceRef(\n local_artifact_storage_data=local_artifact_storage_data,\n run_storage_data=run_storage_data,\n event_storage_data=event_storage_data,\n compute_logs_data=compute_logs_data,\n schedule_storage_data=schedule_storage_data,\n scheduler_data=scheduler_data,\n run_coordinator_data=run_coordinator_data,\n run_launcher_data=run_launcher_data,\n settings=settings,\n custom_instance_class_data=custom_instance_class_data,\n )\n\n @staticmethod\n def from_dict(instance_ref_dict):\n def value_for_ref_item(k, v):\n if v is None:\n return None\n if k == "settings":\n return v\n return ConfigurableClassData(*v)\n\n return InstanceRef(**{k: value_for_ref_item(k, v) for k, v in instance_ref_dict.items()})\n\n @property\n def local_artifact_storage(self):\n return self.local_artifact_storage_data.rehydrate()\n\n @property\n def run_storage(self):\n return self.run_storage_data.rehydrate()\n\n @property\n def event_storage(self):\n return self.event_storage_data.rehydrate()\n\n @property\n def compute_log_manager(self):\n return self.compute_logs_data.rehydrate()\n\n @property\n def schedule_storage(self):\n return 
self.schedule_storage_data.rehydrate() if self.schedule_storage_data else None\n\n @property\n def scheduler(self):\n return self.scheduler_data.rehydrate() if self.scheduler_data else None\n\n @property\n def run_coordinator(self):\n return self.run_coordinator_data.rehydrate() if self.run_coordinator_data else None\n\n @property\n def run_launcher(self):\n return self.run_launcher_data.rehydrate() if self.run_launcher_data else None\n\n @property\n def custom_instance_class(self):\n return (\n class_from_code_pointer(\n self.custom_instance_class_data.module_name,\n self.custom_instance_class_data.class_name,\n )\n if self.custom_instance_class_data\n else None\n )\n\n @property\n def custom_instance_class_config(self):\n return (\n self.custom_instance_class_data.config_dict if self.custom_instance_class_data else {}\n )\n\n def to_dict(self):\n return self._asdict()
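# Usage sketch (editor's addition): InstanceRef.from_dir resolves dagster.yaml in an
# instance directory into serializable ConfigurableClassData entries, and the properties
# above rehydrate them into live objects. ``base_dir`` is assumed to be a
# DAGSTER_HOME-style directory containing a dagster.yaml (or nothing, for the defaults).
def _example_rehydrate_storages(base_dir):
    from dagster.core.instance.ref import InstanceRef

    ref = InstanceRef.from_dir(base_dir)
    return ref.run_storage, ref.event_storage  # e.g. Sqlite-backed storages by default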
\n
", "current_page_name": "_modules/dagster/core/instance/ref", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}, {"link": "../", "title": "dagster.core.instance"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.instance.ref"}, "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.instance"}, "launcher": {"base": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.launcher.base

\nfrom abc import ABC, abstractmethod\nfrom enum import Enum\nfrom typing import NamedTuple, Optional\n\nfrom dagster.core.instance import MayHaveInstanceWeakref\nfrom dagster.core.origin import PipelinePythonOrigin\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.core.workspace.workspace import IWorkspace\nfrom dagster.serdes import whitelist_for_serdes\n\n\nclass LaunchRunContext(NamedTuple):\n    """\n    Context available within a run launcher's launch_run call.\n    """\n\n    pipeline_run: PipelineRun\n    workspace: Optional[IWorkspace]\n\n    @property\n    def pipeline_code_origin(self) -> Optional[PipelinePythonOrigin]:\n        return self.pipeline_run.pipeline_code_origin\n\n\nclass ResumeRunContext(NamedTuple):\n    """\n    Context available within a run launcher's resume_run call.\n    """\n\n    pipeline_run: PipelineRun\n    workspace: Optional[IWorkspace]\n    resume_attempt_number: Optional[int] = None\n\n    @property\n    def pipeline_code_origin(self) -> Optional[PipelinePythonOrigin]:\n        return self.pipeline_run.pipeline_code_origin\n\n\n@whitelist_for_serdes\nclass WorkerStatus(Enum):\n    RUNNING = "RUNNING"\n    NOT_FOUND = "NOT_FOUND"\n    FAILED = "FAILED"\n    SUCCESS = "SUCCESS"\n    UNKNOWN = "UNKNOWN"\n\n\nclass CheckRunHealthResult(NamedTuple):\n    """\n    Result of a check_run_worker_health call.\n    """\n\n    status: WorkerStatus\n    msg: Optional[str] = None\n\n    def __str__(self) -> str:\n        return f"{self.status.value}: '{self.msg}'"\n\n\n
[docs]class RunLauncher(ABC, MayHaveInstanceWeakref):\n    @abstractmethod\n    def launch_run(self, context: LaunchRunContext) -> None:\n        """Launch a run.\n\n        This method should begin the execution of the specified run, and may emit engine events.\n        Runs should be created in the instance (e.g., by calling\n        ``DagsterInstance.create_run()``) *before* this method is called, and\n        should be in the ``PipelineRunStatus.STARTING`` state. Typically, this method will\n        not be invoked directly, but should be invoked through ``DagsterInstance.launch_run()``.\n\n        Args:\n            context (LaunchRunContext): information about the launch - every run launcher\n                will need the PipelineRun, and some run launchers may need information from the\n                IWorkspace from which the run was launched.\n        """\n\n    @abstractmethod\n    def terminate(self, run_id):\n        """\n        Terminates a process.\n\n        Returns False if the process was already terminated. Returns True if\n        the process was alive and was successfully terminated.\n        """\n\n    def dispose(self):\n        """\n        Do any resource cleanup that should happen when the DagsterInstance is\n        cleaning itself up.\n        """\n\n    def join(self, timeout=30):\n        pass\n\n    @property\n    def supports_check_run_worker_health(self):\n        """\n        Whether the run launcher supports check_run_worker_health.\n        """\n        return False\n\n    def check_run_worker_health(self, run: PipelineRun) -> CheckRunHealthResult:\n        raise NotImplementedError(\n            "This run launcher does not support run monitoring. Please disable it on your instance."\n        )\n\n    @property\n    def supports_resume_run(self):\n        """\n        Whether the run launcher supports resume_run.\n        """\n        return False\n\n    def resume_run(self, context: ResumeRunContext) -> None:\n        raise NotImplementedError(\n            "This run launcher does not support resuming runs. If using "\n            "run monitoring, set max_resume_run_attempts to 0."\n        )
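# Editor's sketch (not part of the original module): a minimal RunLauncher subclass only
# has to implement launch_run and terminate; the health-check and resume hooks above are
# opt-in via supports_check_run_worker_health and supports_resume_run. RunLauncher and
# LaunchRunContext are the classes defined above; a real launcher would typically also be
# a ConfigurableClass so it can be selected from dagster.yaml.
class _ExampleInMemoryRunLauncher(RunLauncher):
    """Illustrative launcher that records launches instead of starting real work."""

    def __init__(self):
        self._launched_run_ids = set()
        super().__init__()

    def launch_run(self, context: LaunchRunContext) -> None:
        # A real implementation would start a process or container for the run here.
        self._launched_run_ids.add(context.pipeline_run.run_id)

    def terminate(self, run_id):
        # False if there was nothing to terminate, True if termination succeeded.
        if run_id not in self._launched_run_ids:
            return False
        self._launched_run_ids.discard(run_id)
        return True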
\n
", "current_page_name": "_modules/dagster/core/launcher/base", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.launcher.base"}, "default_run_launcher": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.launcher.default_run_launcher

\nimport time\nfrom typing import cast\n\nimport dagster.seven as seven\nfrom dagster import Bool, Field\nfrom dagster import _check as check\nfrom dagster.core.errors import DagsterInvariantViolationError, DagsterLaunchFailedError\nfrom dagster.core.host_representation.grpc_server_registry import ProcessGrpcServerRegistry\nfrom dagster.core.host_representation.repository_location import GrpcServerRepositoryLocation\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.core.storage.tags import GRPC_INFO_TAG\nfrom dagster.grpc.client import DagsterGrpcClient\nfrom dagster.grpc.types import CancelExecutionRequest, ExecuteExternalPipelineArgs, StartRunResult\nfrom dagster.serdes import ConfigurableClass, deserialize_as, deserialize_json_to_dagster_namedtuple\nfrom dagster.utils import merge_dicts\n\nfrom .base import LaunchRunContext, RunLauncher\n\n\n
[docs]class DefaultRunLauncher(RunLauncher, ConfigurableClass):\n """Launches runs against running GRPC servers."""\n\n def __init__(self, inst_data=None, wait_for_processes=False):\n self._inst_data = inst_data\n\n # Whether to wait for any processes that were used to launch runs to finish\n # before disposing of this launcher. Primarily useful for test cleanup where\n # we want to make sure that resources used by the test are cleaned up before\n # the test ends.\n self._wait_for_processes = check.bool_param(wait_for_processes, "wait_for_processes")\n\n self._run_ids = set()\n\n self._locations_to_wait_for = []\n\n super().__init__()\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {"wait_for_processes": Field(Bool, is_required=False)}\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return DefaultRunLauncher(\n inst_data=inst_data, wait_for_processes=config_value.get("wait_for_processes", False)\n )\n\n @staticmethod\n def launch_run_from_grpc_client(instance, run, grpc_client):\n instance.add_run_tags(\n run.run_id,\n {\n GRPC_INFO_TAG: seven.json.dumps(\n merge_dicts(\n {"host": grpc_client.host},\n (\n {"port": grpc_client.port}\n if grpc_client.port\n else {"socket": grpc_client.socket}\n ),\n ({"use_ssl": True} if grpc_client.use_ssl else {}),\n )\n )\n },\n )\n\n res = deserialize_as(\n grpc_client.start_run(\n ExecuteExternalPipelineArgs(\n pipeline_origin=run.external_pipeline_origin,\n pipeline_run_id=run.run_id,\n instance_ref=instance.get_ref(),\n )\n ),\n StartRunResult,\n )\n if not res.success:\n raise (\n DagsterLaunchFailedError(\n res.message, serializable_error_info=res.serializable_error_info\n )\n )\n\n def launch_run(self, context: LaunchRunContext) -> None:\n run = context.pipeline_run\n\n check.inst_param(run, "run", PipelineRun)\n\n if not context.workspace:\n raise DagsterInvariantViolationError(\n "DefaultRunLauncher requires a workspace to be included in its LaunchRunContext"\n )\n\n external_pipeline_origin = check.not_none(run.external_pipeline_origin)\n repository_location = context.workspace.get_repository_location(\n external_pipeline_origin.external_repository_origin.repository_location_origin.location_name\n )\n\n check.inst(\n repository_location,\n GrpcServerRepositoryLocation,\n "DefaultRunLauncher: Can't launch runs for pipeline not loaded from a GRPC server",\n )\n\n DefaultRunLauncher.launch_run_from_grpc_client(\n self._instance, run, cast(GrpcServerRepositoryLocation, repository_location).client\n )\n\n self._run_ids.add(run.run_id)\n\n if self._wait_for_processes:\n self._locations_to_wait_for.append(repository_location)\n\n def _get_grpc_client_for_termination(self, run_id):\n if not self._instance:\n return None\n\n run = self._instance.get_run_by_id(run_id)\n if not run or run.is_finished:\n return None\n\n tags = run.tags\n\n if GRPC_INFO_TAG not in tags:\n return None\n\n grpc_info = seven.json.loads(tags.get(GRPC_INFO_TAG))\n\n return DagsterGrpcClient(\n port=grpc_info.get("port"),\n socket=grpc_info.get("socket"),\n host=grpc_info.get("host"),\n use_ssl=bool(grpc_info.get("use_ssl", False)),\n )\n\n def terminate(self, run_id):\n check.str_param(run_id, "run_id")\n if not self._instance:\n return False\n\n run = self._instance.get_run_by_id(run_id)\n if not run:\n return False\n\n client = self._get_grpc_client_for_termination(run_id)\n\n if not client:\n self._instance.report_engine_event(\n message="Unable to get grpc client to send termination request 
to.",\n pipeline_run=run,\n cls=self.__class__,\n )\n return False\n\n self._instance.report_run_canceling(run)\n res = deserialize_json_to_dagster_namedtuple(\n client.cancel_execution(CancelExecutionRequest(run_id=run_id))\n )\n return res.success\n\n def join(self, timeout=30):\n # If this hasn't been initialized at all, we can just do a noop\n if not self._instance:\n return\n\n total_time = 0\n interval = 0.01\n\n while True:\n active_run_ids = [\n run_id\n for run_id in self._run_ids\n if (\n self._instance.get_run_by_id(run_id)\n and not self._instance.get_run_by_id(run_id).is_finished\n )\n ]\n\n if len(active_run_ids) == 0:\n return\n\n if total_time >= timeout:\n raise Exception(\n "Timed out waiting for these runs to finish: {active_run_ids}".format(\n active_run_ids=repr(active_run_ids)\n )\n )\n\n total_time += interval\n time.sleep(interval)\n interval = interval * 2\n\n def dispose(self):\n if not self._wait_for_processes:\n return\n\n for location in self._locations_to_wait_for:\n if isinstance(location, GrpcServerRepositoryLocation) and isinstance(\n location.grpc_server_registry, ProcessGrpcServerRegistry\n ):\n location.grpc_server_registry.wait_for_processes()
\n
", "current_page_name": "_modules/dagster/core/launcher/default_run_launcher", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.launcher.default_run_launcher"}}, "log_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.log_manager

\nimport datetime\nimport logging\nfrom typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Union\n\nimport dagster._check as check\nfrom dagster.core.utils import coerce_valid_log_level, make_new_run_id\nfrom dagster.utils.log import get_dagster_logger\n\nif TYPE_CHECKING:\n    from dagster import DagsterInstance, PipelineRun\n    from dagster.core.events import DagsterEvent\n\nDAGSTER_META_KEY = "dagster_meta"\n\n\nclass DagsterMessageProps(\n    NamedTuple(\n        "_DagsterMessageProps",\n        [\n            ("orig_message", Optional[str]),\n            ("log_message_id", Optional[str]),\n            ("log_timestamp", Optional[str]),\n            ("dagster_event", Optional[Any]),\n        ],\n    )\n):\n    """Internal class used to represent specific attributes about a logged message"""\n\n    def __new__(\n        cls,\n        orig_message: str,\n        log_message_id: Optional[str] = None,\n        log_timestamp: Optional[str] = None,\n        dagster_event: Optional["DagsterEvent"] = None,\n    ):\n        return super().__new__(\n            cls,\n            orig_message=check.str_param(orig_message, "orig_message"),\n            log_message_id=check.opt_str_param(\n                log_message_id, "log_message_id", default=make_new_run_id()\n            ),\n            log_timestamp=check.opt_str_param(\n                log_timestamp, "log_timestamp", default=datetime.datetime.utcnow().isoformat()\n            ),\n            dagster_event=dagster_event,\n        )\n\n    @property\n    def error_str(self) -> Optional[str]:\n        if self.dagster_event is None:\n            return None\n\n        event_specific_data = self.dagster_event.event_specific_data\n        if not event_specific_data:\n            return None\n\n        error = getattr(event_specific_data, "error", None)\n        if error:\n            return "\\n\\n" + getattr(event_specific_data, "error_display_string", error.to_string())\n        return None\n\n    @property\n    def pid(self) -> Optional[str]:\n        if self.dagster_event is None or self.dagster_event.pid is None:\n            return None\n        return str(self.dagster_event.pid)\n\n    @property\n    def step_key(self) -> Optional[str]:\n        if self.dagster_event is None:\n            return None\n        return self.dagster_event.step_key\n\n    @property\n    def event_type_value(self) -> Optional[str]:\n        if self.dagster_event is None:\n            return None\n        return self.dagster_event.event_type_value\n\n\nclass DagsterLoggingMetadata(\n    NamedTuple(\n        "_DagsterLoggingMetadata",\n        [\n            ("run_id", Optional[str]),\n            ("pipeline_name", Optional[str]),\n            ("pipeline_tags", Dict[str, str]),\n            ("step_key", Optional[str]),\n            ("solid_name", Optional[str]),\n            ("resource_name", Optional[str]),\n            ("resource_fn_name", Optional[str]),\n        ],\n    )\n):\n    """Internal class used to represent the context in which a given message was logged (i.e. 
the\n    step, pipeline run, resource, etc.)\n    """\n\n    def __new__(\n        cls,\n        run_id: Optional[str] = None,\n        pipeline_name: Optional[str] = None,\n        pipeline_tags: Optional[Dict[str, str]] = None,\n        step_key: Optional[str] = None,\n        solid_name: Optional[str] = None,\n        resource_name: Optional[str] = None,\n        resource_fn_name: Optional[str] = None,\n    ):\n        return super().__new__(\n            cls,\n            run_id=run_id,\n            pipeline_name=pipeline_name,\n            pipeline_tags=pipeline_tags or {},\n            step_key=step_key,\n            solid_name=solid_name,\n            resource_name=resource_name,\n            resource_fn_name=resource_fn_name,\n        )\n\n    @property\n    def log_source(self):\n        if self.resource_name is None:\n            return self.pipeline_name or "system"\n        return f"resource:{self.resource_name}"\n\n    def to_tags(self) -> Dict[str, str]:\n        # converts all values into strings\n        return {k: str(v) for k, v in self._asdict().items()}\n\n\ndef construct_log_string(\n    logging_metadata: DagsterLoggingMetadata, message_props: DagsterMessageProps\n) -> str:\n\n    from dagster.core.events import EVENT_TYPE_VALUE_TO_DISPLAY_STRING\n\n    event_type_str = (\n        EVENT_TYPE_VALUE_TO_DISPLAY_STRING[message_props.event_type_value]\n        if message_props.event_type_value in EVENT_TYPE_VALUE_TO_DISPLAY_STRING\n        else message_props.event_type_value\n    )\n    return " - ".join(\n        filter(\n            None,\n            (\n                logging_metadata.log_source,\n                logging_metadata.run_id,\n                message_props.pid,\n                logging_metadata.step_key,\n                event_type_str,\n                message_props.orig_message,\n            ),\n        )\n    ) + (message_props.error_str or "")\n\n\ndef get_dagster_meta_dict(\n    logging_metadata: DagsterLoggingMetadata, dagster_message_props: DagsterMessageProps\n) -> Dict[str, Any]:\n    # combine all dagster meta information into a single dictionary\n    meta_dict = {\n        **logging_metadata._asdict(),\n        **dagster_message_props._asdict(),\n    }\n    # step-level events can be logged from a pipeline context. for these cases, pull the step\n    # key from the underlying DagsterEvent\n    if meta_dict["step_key"] is None:\n        meta_dict["step_key"] = dagster_message_props.step_key\n\n    return meta_dict\n\n\nclass DagsterLogHandler(logging.Handler):\n    """Internal class used to turn regular logs into Dagster logs by adding Dagster-specific\n    metadata (such as pipeline_name or step_key), as well as reformatting the underlying message.\n\n    Note: The `loggers` argument will be populated with the set of @loggers supplied to the current\n    pipeline run. 
These essentially work as handlers (they do not create their own log messages,\n    they simply re-log messages that are created from context.log.x() calls), which is why they are\n    referenced from within this handler class.\n    """\n\n    def __init__(\n        self,\n        logging_metadata: DagsterLoggingMetadata,\n        loggers: List[logging.Logger],\n        handlers: List[logging.Handler],\n    ):\n        self._logging_metadata = logging_metadata\n        self._loggers = loggers\n        self._handlers = handlers\n        self._should_capture = True\n        super().__init__()\n\n    @property\n    def logging_metadata(self):\n        return self._logging_metadata\n\n    def with_tags(self, **new_tags):\n        return DagsterLogHandler(\n            logging_metadata=self.logging_metadata._replace(**new_tags),\n            loggers=self._loggers,\n            handlers=self._handlers,\n        )\n\n    def _extract_extra(self, record: logging.LogRecord) -> Dict[str, Any]:\n        """In the logging.Logger log() implementation, the elements of the `extra` dictionary\n        argument are smashed into the __dict__ of the underlying logging.LogRecord.\n        This function figures out what the original `extra` values of the log call were by\n        comparing the set of attributes in the received record to those of a default record.\n        """\n        ref_attrs = list(logging.makeLogRecord({}).__dict__.keys()) + ["message", "asctime"]\n        return {k: v for k, v in record.__dict__.items() if k not in ref_attrs}\n\n    def _convert_record(self, record: logging.LogRecord) -> logging.LogRecord:\n        # we store the originating DagsterEvent in the DAGSTER_META_KEY field, if applicable\n        dagster_meta = getattr(record, DAGSTER_META_KEY, None)\n\n        # generate some properties for this specific record\n        dagster_message_props = DagsterMessageProps(\n            orig_message=record.getMessage(), dagster_event=dagster_meta\n        )\n\n        # set the dagster meta info for the record\n        setattr(\n            record,\n            DAGSTER_META_KEY,\n            get_dagster_meta_dict(self._logging_metadata, dagster_message_props),\n        )\n\n        # update the message to be formatted like other dagster logs\n        record.msg = construct_log_string(self._logging_metadata, dagster_message_props)\n        record.args = ()\n\n        return record\n\n    def filter(self, record: logging.LogRecord) -> bool:\n        """If you list multiple levels of a python logging hierarchy as managed loggers, and do not\n        set the propagate attribute to False, this will result in that record getting logged\n        multiple times, as the DagsterLogHandler will be invoked at each level of the hierarchy as\n        the message is propagated. 
This filter prevents this from happening.\n        """\n        return self._should_capture and not isinstance(\n            getattr(record, DAGSTER_META_KEY, None), dict\n        )\n\n    def emit(self, record: logging.LogRecord):\n        """For any received record, add Dagster metadata, and have handlers handle it"""\n\n        try:\n            # to prevent the potential for infinite loops in which a handler produces log messages\n            # which are then captured and then handled by that same handler (etc.), do not capture\n            # any log messages while one is currently being emitted\n            self._should_capture = False\n            dagster_record = self._convert_record(record)\n            # built-in handlers\n            for handler in self._handlers:\n                if dagster_record.levelno >= handler.level:\n                    handler.handle(dagster_record)\n            # user-defined @loggers\n            for logger in self._loggers:\n                logger.log(\n                    dagster_record.levelno,\n                    dagster_record.msg,\n                    exc_info=dagster_record.exc_info,\n                    extra=self._extract_extra(record),\n                )\n        finally:\n            self._should_capture = True\n\n\n
[docs]class DagsterLogManager(logging.Logger):\n """Centralized dispatch for logging from user code.\n\n Handles the construction of uniform structured log messages and passes them through to the\n underlying loggers/handlers.\n\n An instance of the log manager is made available to ops as ``context.log``. Users should not\n initialize instances of the log manager directly. To configure custom loggers, set the\n ``logger_defs`` argument in an `@job` decorator or when calling the `to_job()` method on a\n :py:class:`GraphDefinition`.\n\n The log manager inherits standard convenience methods like those exposed by the Python standard\n library :py:mod:`python:logging` module (i.e., within the body of an op,\n ``context.log.{debug, info, warning, warn, error, critical, fatal}``).\n\n The underlying integer API can also be called directly using, e.g.\n ``context.log.log(5, msg)``, and the log manager will delegate to the ``log`` method\n defined on each of the loggers it manages.\n\n User-defined custom log levels are not supported, and calls to, e.g.,\n ``context.log.trace`` or ``context.log.notice`` will result in hard exceptions **at runtime**.\n """\n\n def __init__(\n self,\n dagster_handler: DagsterLogHandler,\n level: int = logging.NOTSET,\n managed_loggers: Optional[List[logging.Logger]] = None,\n ):\n super().__init__(name="dagster", level=coerce_valid_log_level(level))\n self._managed_loggers = check.opt_list_param(\n managed_loggers, "managed_loggers", of_type=logging.Logger\n )\n self._dagster_handler = dagster_handler\n self.addHandler(dagster_handler)\n\n @classmethod\n def create(\n cls,\n loggers: List[logging.Logger],\n handlers: Optional[List[logging.Handler]] = None,\n instance: Optional["DagsterInstance"] = None,\n pipeline_run: Optional["PipelineRun"] = None,\n ) -> "DagsterLogManager":\n """Create a DagsterLogManager with a set of subservient loggers."""\n\n handlers = check.opt_list_param(handlers, "handlers", of_type=logging.Handler)\n\n managed_loggers = [get_dagster_logger()]\n python_log_level = logging.NOTSET\n\n if instance:\n handlers += instance.get_handlers()\n managed_loggers += [\n logging.getLogger(lname) if lname != "root" else logging.getLogger()\n for lname in instance.managed_python_loggers\n ]\n if instance.python_log_level is not None:\n python_log_level = coerce_valid_log_level(instance.python_log_level)\n\n # set all loggers to the declared logging level\n for logger in managed_loggers:\n logger.setLevel(python_log_level)\n\n if pipeline_run:\n logging_metadata = DagsterLoggingMetadata(\n run_id=pipeline_run.run_id,\n pipeline_name=pipeline_run.pipeline_name,\n pipeline_tags=pipeline_run.tags,\n )\n else:\n logging_metadata = DagsterLoggingMetadata()\n\n return cls(\n dagster_handler=DagsterLogHandler(\n logging_metadata=logging_metadata,\n loggers=loggers,\n handlers=handlers,\n ),\n level=python_log_level,\n managed_loggers=managed_loggers,\n )\n\n @property\n def logging_metadata(self) -> DagsterLoggingMetadata:\n return self._dagster_handler.logging_metadata\n\n def begin_python_log_capture(self):\n for logger in self._managed_loggers:\n logger.addHandler(self._dagster_handler)\n\n def end_python_log_capture(self):\n for logger in self._managed_loggers:\n logger.removeHandler(self._dagster_handler)\n\n def log_dagster_event(self, level: Union[str, int], msg: str, dagster_event: "DagsterEvent"):\n """Log a DagsterEvent at the given level. 
Attributes about the context it was logged in\n (such as the solid name or pipeline name) will be automatically attached to the created record.\n\n Args:\n level (str, int): either a string representing the desired log level ("INFO", "WARN"),\n or an integer level such as logging.INFO or logging.DEBUG.\n msg (str): message describing the event\n dagster_event (DagsterEvent): DagsterEvent that will be logged\n """\n self.log(level=level, msg=msg, extra={DAGSTER_META_KEY: dagster_event})\n\n def log(self, level, msg, *args, **kwargs):\n """Log a message at the given level. Attributes about the context it was logged in (such as\n the solid name or pipeline name) will be automatically attached to the created record.\n\n Args:\n level (str, int): either a string representing the desired log level ("INFO", "WARN"),\n or an integer level such as logging.INFO or logging.DEBUG.\n msg (str): the message to be logged\n *args: the logged message will be msg % args\n """\n level = coerce_valid_log_level(level)\n # log DagsterEvents regardless of level\n if self.isEnabledFor(level) or ("extra" in kwargs and DAGSTER_META_KEY in kwargs["extra"]):\n self._log(level, msg, args, **kwargs)\n\n def with_tags(self, **new_tags):\n """Add new tags in "new_tags" to the set of tags attached to this log manager instance, and\n return a new DagsterLogManager with the merged set of tags.\n\n Args:\n new_tags (Dict[str,str]): Dictionary of tags\n\n Returns:\n DagsterLogManager: a new DagsterLogManager namedtuple with updated tags for the same\n run ID and loggers.\n """\n return DagsterLogManager(\n dagster_handler=self._dagster_handler.with_tags(**new_tags),\n managed_loggers=self._managed_loggers,\n level=self.level,\n )
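# Usage sketch (editor's addition): inside an op, the DagsterLogManager described above
# is available as ``context.log`` and supports the standard logging levels.
from dagster import op


@op
def _example_logging_op(context):
    context.log.info("starting work")           # routed through managed loggers/handlers
    context.log.warning("something looks off")  # custom levels (e.g. .trace) would raise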
\n
", "current_page_name": "_modules/dagster/core/log_manager", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.log_manager"}, "run_coordinator": {"default_run_coordinator": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.run_coordinator.default_run_coordinator

\nimport dagster._check as check\nfrom dagster.core.storage.pipeline_run import PipelineRun, PipelineRunStatus\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\n\nfrom .base import RunCoordinator, SubmitRunContext\n\n\n
[docs]class DefaultRunCoordinator(RunCoordinator, ConfigurableClass):\n """Immediately send runs to the run launcher."""\n\n def __init__(self, inst_data=None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n super().__init__()\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {}\n\n @classmethod\n def from_config_value(cls, inst_data, config_value):\n return cls(inst_data=inst_data, **config_value)\n\n def submit_run(self, context: SubmitRunContext) -> PipelineRun:\n pipeline_run = context.pipeline_run\n check.invariant(pipeline_run.status == PipelineRunStatus.NOT_STARTED)\n\n self._instance.launch_run(pipeline_run.run_id, context.workspace)\n run = self._instance.get_run_by_id(pipeline_run.run_id)\n if run is None:\n check.failed(f"Failed to reload run {pipeline_run.run_id}")\n return run\n\n def cancel_run(self, run_id):\n return self._instance.run_launcher.terminate(run_id)
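# Editor's sketch (not part of the original module): run coordinators are selected in
# dagster.yaml using the module/class/config layout consumed by configurable_class_data
# in InstanceRef. DefaultRunCoordinator above is the default registration; swapping in
# the QueuedRunCoordinator (next section) enables daemon-based queueing. The tag
# key/value below are hypothetical.
_EXAMPLE_RUN_COORDINATOR_YAML = """
run_coordinator:
  module: dagster.core.run_coordinator
  class: QueuedRunCoordinator
  config:
    max_concurrent_runs: 25
    tag_concurrency_limits:
      - key: "team"
        value: "data-eng"
        limit: 10
"""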
\n
", "current_page_name": "_modules/dagster/core/run_coordinator/default_run_coordinator", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.run_coordinator.default_run_coordinator"}, "queued_run_coordinator": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.run_coordinator.queued_run_coordinator

\nimport logging\nimport time\nfrom typing import Any, Dict, List, NamedTuple, Optional\n\nfrom dagster import DagsterEvent, DagsterEventType, IntSource, String\nfrom dagster import _check as check\nfrom dagster.builtins import Bool\nfrom dagster.config import Field\nfrom dagster.config.config_type import Array, Noneable, ScalarUnion\nfrom dagster.config.field_utils import Shape\nfrom dagster.core.events.log import EventLogEntry\nfrom dagster.core.storage.pipeline_run import PipelineRun, PipelineRunStatus\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\n\nfrom .base import RunCoordinator, SubmitRunContext\n\n\nclass RunQueueConfig(\n    NamedTuple(\n        "_RunQueueConfig",\n        [("max_concurrent_runs", int), ("tag_concurrency_limits", Optional[List[Dict[str, Any]]])],\n    )\n):\n    pass\n\n\n
[docs]class QueuedRunCoordinator(RunCoordinator, ConfigurableClass):\n """\n Enqueues runs via the run storage, to be deqeueued by the Dagster Daemon process. Requires\n the Dagster Daemon process to be alive in order for runs to be launched.\n """\n\n def __init__(\n self,\n max_concurrent_runs=None,\n tag_concurrency_limits=None,\n dequeue_interval_seconds=None,\n inst_data=None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self._max_concurrent_runs = check.opt_int_param(\n max_concurrent_runs, "max_concurrent_runs", 10\n )\n check.invariant(\n self._max_concurrent_runs >= -1,\n "Negative values other than -1 (which disables the limit) for max_concurrent_runs are disallowed.",\n )\n self._tag_concurrency_limits = check.opt_list_param(\n tag_concurrency_limits,\n "tag_concurrency_limits",\n )\n self._dequeue_interval_seconds = check.opt_int_param(\n dequeue_interval_seconds, "dequeue_interval_seconds", 5\n )\n\n super().__init__()\n\n @property\n def inst_data(self):\n return self._inst_data\n\n def get_run_queue_config(self):\n return RunQueueConfig(\n max_concurrent_runs=self._max_concurrent_runs,\n tag_concurrency_limits=self._tag_concurrency_limits,\n )\n\n @property\n def dequeue_interval_seconds(self):\n return self._dequeue_interval_seconds\n\n @classmethod\n def config_type(cls):\n return {\n "max_concurrent_runs": Field(\n config=IntSource,\n is_required=False,\n description="The maximum number of runs that are allowed to be in progress at once. "\n "Defaults to 10. Set to -1 to disable the limit. Set to 0 to stop any runs from launching. "\n "Any other negative values are disallowed.",\n ),\n "tag_concurrency_limits": Field(\n config=Noneable(\n Array(\n Shape(\n {\n "key": String,\n "value": Field(\n ScalarUnion(\n scalar_type=String,\n non_scalar_schema=Shape({"applyLimitPerUniqueValue": Bool}),\n ),\n is_required=False,\n ),\n "limit": Field(int),\n }\n )\n )\n ),\n is_required=False,\n description="A set of limits that are applied to runs with particular tags. "\n "If a value is set, the limit is applied to only that key-value pair. "\n "If no value is set, the limit is applied across all values of that key. 
"\n "If the value is set to a dict with `applyLimitPerUniqueValue: true`, the limit "\n "will apply to the number of unique values for that key.",\n ),\n "dequeue_interval_seconds": Field(\n config=IntSource,\n is_required=False,\n description="The interval in seconds at which the Dagster Daemon "\n "should periodically check the run queue for new runs to launch.",\n ),\n }\n\n @classmethod\n def from_config_value(cls, inst_data, config_value):\n return cls(\n inst_data=inst_data,\n max_concurrent_runs=config_value.get("max_concurrent_runs"),\n tag_concurrency_limits=config_value.get("tag_concurrency_limits"),\n dequeue_interval_seconds=config_value.get("dequeue_interval_seconds"),\n )\n\n def submit_run(self, context: SubmitRunContext) -> PipelineRun:\n pipeline_run = context.pipeline_run\n check.invariant(pipeline_run.status == PipelineRunStatus.NOT_STARTED)\n\n enqueued_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_ENQUEUED.value,\n pipeline_name=pipeline_run.pipeline_name,\n )\n event_record = EventLogEntry(\n user_message="",\n level=logging.INFO,\n pipeline_name=pipeline_run.pipeline_name,\n run_id=pipeline_run.run_id,\n error_info=None,\n timestamp=time.time(),\n dagster_event=enqueued_event,\n )\n self._instance.handle_new_event(event_record)\n\n run = self._instance.get_run_by_id(pipeline_run.run_id)\n if run is None:\n check.failed(f"Failed to reload run {pipeline_run.run_id}")\n return run\n\n def cancel_run(self, run_id):\n run = self._instance.get_run_by_id(run_id)\n if not run:\n return False\n # NOTE: possible race condition if the dequeuer acts on this run at the same time\n # https://github.com/dagster-io/dagster/issues/3323\n if run.status == PipelineRunStatus.QUEUED:\n self._instance.report_run_canceling(\n run,\n message="Canceling run from the queue.",\n )\n self._instance.report_run_canceled(run)\n return True\n else:\n return self._instance.run_launcher.terminate(run_id)
\n
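# ---------------------------------------------------------------------------
# Illustrative config sketch (editorial addition, not part of this module).
# A value that satisfies QueuedRunCoordinator.config_type() above, written as
# the equivalent Python dict; in practice this normally lives under
# ``run_coordinator: config:`` in dagster.yaml. Tag keys and values are
# hypothetical.
example_queued_run_coordinator_config = {
    "max_concurrent_runs": 25,
    "dequeue_interval_seconds": 5,
    "tag_concurrency_limits": [
        # at most 2 in-progress runs carrying the tag team=data-eng
        {"key": "team", "value": "data-eng", "limit": 2},
        # at most 1 in-progress run per unique value of the "user" tag
        {"key": "user", "value": {"applyLimitPerUniqueValue": True}, "limit": 1},
        # no value given: the limit applies across all runs tagged with "database"
        {"key": "database", "limit": 4},
    ],
}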
", "current_page_name": "_modules/dagster/core/run_coordinator/queued_run_coordinator", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.run_coordinator.queued_run_coordinator"}}, "scheduler": {"scheduler": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.scheduler.scheduler

\nimport abc\nimport os\nfrom typing import List, NamedTuple, Optional\n\nimport dagster._check as check\nfrom dagster.config import Field\nfrom dagster.config.source import IntSource\nfrom dagster.core.definitions.run_request import InstigatorType\nfrom dagster.core.errors import DagsterError\nfrom dagster.core.host_representation import ExternalSchedule\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.scheduler.instigation import (\n    InstigatorState,\n    InstigatorStatus,\n    ScheduleInstigatorData,\n)\nfrom dagster.serdes import ConfigurableClass\nfrom dagster.seven import get_current_datetime_in_utc\nfrom dagster.utils import mkdir_p\n\n\nclass DagsterSchedulerError(DagsterError):\n    """Base class for all Dagster Scheduler errors"""\n\n\nclass DagsterScheduleDoesNotExist(DagsterSchedulerError):\n    """Errors raised when fetching a schedule."""\n\n\nclass SchedulerDebugInfo(\n    NamedTuple(\n        "SchedulerDebugInfo",\n        [\n            ("errors", List[str]),\n            ("scheduler_config_info", str),\n            ("scheduler_info", str),\n            ("schedule_storage", List[str]),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        errors: List[str],\n        scheduler_config_info: str,\n        scheduler_info: str,\n        schedule_storage: List[str],\n    ):\n        return super(SchedulerDebugInfo, cls).__new__(\n            cls,\n            errors=check.list_param(errors, "errors", of_type=str),\n            scheduler_config_info=check.str_param(scheduler_config_info, "scheduler_config_info"),\n            scheduler_info=check.str_param(scheduler_info, "scheduler_info"),\n            schedule_storage=check.list_param(schedule_storage, "schedule_storage", of_type=str),\n        )\n\n\n
[docs]class Scheduler(abc.ABC):\n """Abstract base class for a scheduler. This component is responsible for interfacing with\n an external system such as cron to ensure scheduled repeated execution according.\n """\n\n def start_schedule(\n self, instance: DagsterInstance, external_schedule: ExternalSchedule\n ) -> InstigatorState:\n """\n Updates the status of the given schedule to `InstigatorStatus.RUNNING` in schedule storage,\n\n This should not be overridden by subclasses.\n\n Args:\n instance (DagsterInstance): The current instance.\n external_schedule (ExternalSchedule): The schedule to start\n\n """\n\n check.inst_param(instance, "instance", DagsterInstance)\n check.inst_param(external_schedule, "external_schedule", ExternalSchedule)\n\n stored_state = instance.get_instigator_state(\n external_schedule.get_external_origin_id(), external_schedule.selector_id\n )\n computed_state = external_schedule.get_current_instigator_state(stored_state)\n if computed_state.is_running:\n return computed_state\n\n new_instigator_data = ScheduleInstigatorData(\n external_schedule.cron_schedule,\n get_current_datetime_in_utc().timestamp(),\n )\n\n if not stored_state:\n started_state = InstigatorState(\n external_schedule.get_external_origin(),\n InstigatorType.SCHEDULE,\n InstigatorStatus.RUNNING,\n new_instigator_data,\n )\n instance.add_instigator_state(started_state)\n else:\n started_state = stored_state.with_status(InstigatorStatus.RUNNING).with_data(\n new_instigator_data\n )\n instance.update_instigator_state(started_state)\n return started_state\n\n def stop_schedule(\n self,\n instance: DagsterInstance,\n schedule_origin_id: str,\n schedule_selector_id: str,\n external_schedule: Optional[ExternalSchedule],\n ) -> InstigatorState:\n """\n Updates the status of the given schedule to `InstigatorStatus.STOPPED` in schedule storage,\n\n This should not be overridden by subclasses.\n\n Args:\n schedule_origin_id (string): The id of the schedule target to stop running.\n """\n\n check.str_param(schedule_origin_id, "schedule_origin_id")\n check.opt_inst_param(external_schedule, "external_schedule", ExternalSchedule)\n\n stored_state = instance.get_instigator_state(schedule_origin_id, schedule_selector_id)\n\n if not external_schedule:\n computed_state = stored_state\n else:\n computed_state = external_schedule.get_current_instigator_state(stored_state)\n\n if computed_state and not computed_state.is_running:\n return computed_state\n\n if not stored_state:\n assert external_schedule\n stopped_state = InstigatorState(\n external_schedule.get_external_origin(),\n InstigatorType.SCHEDULE,\n InstigatorStatus.STOPPED,\n ScheduleInstigatorData(\n external_schedule.cron_schedule,\n ),\n )\n instance.add_instigator_state(stopped_state)\n else:\n stopped_state = stored_state.with_status(InstigatorStatus.STOPPED).with_data(\n ScheduleInstigatorData(\n cron_schedule=computed_state.instigator_data.cron_schedule,\n )\n )\n instance.update_instigator_state(stopped_state)\n\n return stopped_state\n\n @abc.abstractmethod\n def debug_info(self):\n """Returns debug information about the scheduler"""\n\n @abc.abstractmethod\n def get_logs_path(self, instance, schedule_origin_id):\n """Get path to store logs for schedule\n\n Args:\n schedule_origin_id (string): The id of the schedule target to retrieve the log path for\n """
\n\n\nDEFAULT_MAX_CATCHUP_RUNS = 5\n\n\n
[docs]class DagsterDaemonScheduler(Scheduler, ConfigurableClass):\n """Default scheduler implementation that submits runs from the `dagster-daemon`\n long-lived process. Periodically checks each running schedule for execution times that don't\n have runs yet and launches them.\n """\n\n def __init__(\n self, max_catchup_runs=DEFAULT_MAX_CATCHUP_RUNS, max_tick_retries=0, inst_data=None\n ):\n self.max_catchup_runs = check.opt_int_param(\n max_catchup_runs, "max_catchup_runs", DEFAULT_MAX_CATCHUP_RUNS\n )\n self.max_tick_retries = check.opt_int_param(max_tick_retries, "max_tick_retries", 0)\n self._inst_data = inst_data\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {\n "max_catchup_runs": Field(\n IntSource,\n is_required=False,\n default_value=DEFAULT_MAX_CATCHUP_RUNS,\n description="""For partitioned schedules, controls the maximum number of past\n partitions for each schedule that will be considered when looking for missing\n runs . Generally this parameter will only come into play if the scheduler\n falls behind or launches after experiencing downtime. This parameter will not be checked for\n schedules without partition sets (for example, schedules created using the @schedule\n decorator) - only the most recent execution time will be considered for those schedules.\n\n Note that no matter what this value is, the scheduler will never launch a run from a time\n before the schedule was turned on (even if the start_date on the schedule is earlier) - if\n you want to launch runs for earlier partitions, launch a backfill.\n """,\n ),\n "max_tick_retries": Field(\n IntSource,\n default_value=0,\n is_required=False,\n description="For each schedule tick that raises an error, how many times to retry that tick",\n ),\n }\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return DagsterDaemonScheduler(inst_data=inst_data, **config_value)\n\n def debug_info(self):\n return ""\n\n def wipe(self, instance):\n pass\n\n def _get_or_create_logs_directory(self, instance, schedule_origin_id):\n check.inst_param(instance, "instance", DagsterInstance)\n check.str_param(schedule_origin_id, "schedule_origin_id")\n\n logs_directory = os.path.join(instance.schedules_directory(), "logs", schedule_origin_id)\n if not os.path.isdir(logs_directory):\n mkdir_p(logs_directory)\n\n return logs_directory\n\n def get_logs_path(self, instance, schedule_origin_id):\n check.inst_param(instance, "instance", DagsterInstance)\n check.str_param(schedule_origin_id, "schedule_origin_id")\n\n logs_directory = self._get_or_create_logs_directory(instance, schedule_origin_id)\n return os.path.join(logs_directory, "scheduler.log")
\n
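# ---------------------------------------------------------------------------
# Illustrative config sketch (editorial addition, not part of this module).
# A value accepted by DagsterDaemonScheduler.from_config_value above; the
# equivalent normally lives under ``scheduler: config:`` in dagster.yaml.
example_scheduler_config = {
    "max_catchup_runs": 3,  # bound how many missed partitioned ticks get launched
    "max_tick_retries": 1,  # retry a failing schedule tick once before giving up
}
# DagsterDaemonScheduler.from_config_value(None, example_scheduler_config) would
# construct a scheduler with these limits.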
", "current_page_name": "_modules/dagster/core/scheduler/scheduler", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.scheduler.scheduler"}}, "storage": {"compute_log_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.compute_log_manager

\nfrom abc import ABC, abstractmethod\nfrom contextlib import contextmanager\nfrom enum import Enum\nfrom typing import NamedTuple, Optional\n\nfrom rx import Observable\n\nimport dagster._check as check\nfrom dagster.core.instance import MayHaveInstanceWeakref\nfrom dagster.core.storage.pipeline_run import PipelineRun\n\nMAX_BYTES_FILE_READ = 33554432  # 32 MB\nMAX_BYTES_CHUNK_READ = 4194304  # 4 MB\n\n\nclass ComputeIOType(Enum):\n    STDOUT = "stdout"\n    STDERR = "stderr"\n\n\nclass ComputeLogFileData(\n    NamedTuple(\n        "ComputeLogFileData",\n        [\n            ("path", str),\n            ("data", Optional[str]),\n            ("cursor", int),\n            ("size", int),\n            ("download_url", Optional[str]),\n        ],\n    )\n):\n    """Representation of a chunk of compute execution log data"""\n\n    def __new__(\n        cls, path: str, data: Optional[str], cursor: int, size: int, download_url: Optional[str]\n    ):\n        return super(ComputeLogFileData, cls).__new__(\n            cls,\n            path=check.str_param(path, "path"),\n            data=check.opt_str_param(data, "data"),\n            cursor=check.int_param(cursor, "cursor"),\n            size=check.int_param(size, "size"),\n            download_url=check.opt_str_param(download_url, "download_url"),\n        )\n\n\n
[docs]class ComputeLogManager(ABC, MayHaveInstanceWeakref):\n """Abstract base class for storing unstructured compute logs (stdout/stderr) from the compute\n steps of pipeline solids."""\n\n @contextmanager\n def watch(self, pipeline_run, step_key=None):\n """\n Watch the stdout/stderr for a given execution for a given run_id / step_key and persist it.\n\n Args:\n pipeline_run (PipelineRun): The pipeline run config\n step_key (Optional[String]): The step_key for a compute step\n """\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n check.opt_str_param(step_key, "step_key")\n\n if not self.enabled(pipeline_run, step_key):\n yield\n return\n\n self.on_watch_start(pipeline_run, step_key)\n with self._watch_logs(pipeline_run, step_key):\n yield\n self.on_watch_finish(pipeline_run, step_key)\n\n @contextmanager\n @abstractmethod\n def _watch_logs(self, pipeline_run, step_key=None):\n """\n Method to watch the stdout/stderr logs for a given run_id / step_key. Kept separate from\n blessed `watch` method, which triggers all the start/finish hooks that are necessary to\n implement the different remote implementations.\n\n Args:\n pipeline_run (PipelineRun): The pipeline run config\n step_key (Optional[String]): The step_key for a compute step\n """\n\n def get_local_path(self, run_id, key, io_type):\n """Get the local path of the logfile for a given execution step. This determines the\n location on the local filesystem to which stdout/stderr will be rerouted.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. `solid_invocation.compute`)\n io_type (ComputeIOType): Flag indicating the I/O type, either ComputeIOType.STDOUT or\n ComputeIOType.STDERR\n\n Returns:\n str\n """\n\n @abstractmethod\n def is_watch_completed(self, run_id, key):\n """Flag indicating when computation for a given execution step has completed.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. `solid_invocation.compute`)\n\n Returns:\n Boolean\n """\n\n @abstractmethod\n def on_watch_start(self, pipeline_run, step_key):\n """Hook called when starting to watch compute logs.\n\n Args:\n pipeline_run (PipelineRun): The pipeline run config\n step_key (Optional[String]): The step_key for a compute step\n """\n\n @abstractmethod\n def on_watch_finish(self, pipeline_run, step_key):\n """Hook called when computation for a given execution step is finished.\n\n Args:\n pipeline_run (PipelineRun): The pipeline run config\n step_key (Optional[String]): The step_key for a compute step\n """\n\n @abstractmethod\n def download_url(self, run_id, key, io_type):\n """Get a URL where the logs can be downloaded.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. `solid_invocation.compute`)\n io_type (ComputeIOType): Flag indicating the I/O type, either stdout or stderr\n\n Returns:\n String\n """\n\n @abstractmethod\n def read_logs_file(self, run_id, key, io_type, cursor=0, max_bytes=MAX_BYTES_FILE_READ):\n """Get compute log data for a given compute step.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. 
`solid_invocation.compute`)\n io_type (ComputeIOType): Flag indicating the I/O type, either stdout or stderr\n cursor (Optional[Int]): Starting cursor (byte) of log file\n max_bytes (Optional[Int]): Maximum number of bytes to be read and returned\n\n Returns:\n ComputeLogFileData\n """\n\n def enabled(self, _pipeline_run, _step_key):\n """Hook for disabling compute log capture.\n\n Args:\n _step_key (Optional[String]): The step_key for a compute step\n\n Returns:\n Boolean\n """\n return True\n\n @abstractmethod\n def on_subscribe(self, subscription):\n """Hook for managing streaming subscriptions for log data from `dagit`\n\n Args:\n subscription (ComputeLogSubscription): subscription object which manages when to send\n back data to the subscriber\n """\n\n def on_unsubscribe(self, subscription):\n pass\n\n def observable(self, run_id, key, io_type, cursor=None):\n """Return an Observable which streams back log data from the execution logs for a given\n compute step.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. `solid_invocation.compute`)\n io_type (ComputeIOType): Flag indicating the I/O type, either stdout or stderr\n cursor (Optional[Int]): Starting cursor (byte) of log file\n\n Returns:\n Observable\n """\n check.str_param(run_id, "run_id")\n check.str_param(key, "key")\n check.inst_param(io_type, "io_type", ComputeIOType)\n check.opt_str_param(cursor, "cursor")\n\n if cursor:\n cursor = int(cursor)\n else:\n cursor = 0\n\n subscription = ComputeLogSubscription(self, run_id, key, io_type, cursor)\n self.on_subscribe(subscription)\n return Observable.create(subscription) # pylint: disable=E1101\n\n def dispose(self):\n pass
\n\n\nclass ComputeLogSubscription:\n """Observable object that generates ComputeLogFileData objects as compute step execution logs\n are written\n """\n\n def __init__(self, manager, run_id, key, io_type, cursor):\n self.manager = manager\n self.run_id = run_id\n self.key = key\n self.io_type = io_type\n self.cursor = cursor\n self.observer = None\n\n def __call__(self, observer):\n self.observer = observer\n self.fetch()\n if self.manager.is_watch_completed(self.run_id, self.key):\n self.complete()\n return self\n\n def dispose(self):\n # called when the connection gets closed, allowing the observer to get GC'ed\n if self.observer and callable(getattr(self.observer, "dispose", None)):\n self.observer.dispose()\n self.observer = None\n self.manager.on_unsubscribe(self)\n\n def fetch(self):\n if not self.observer:\n return\n\n should_fetch = True\n while should_fetch:\n update = self.manager.read_logs_file(\n self.run_id,\n self.key,\n self.io_type,\n self.cursor,\n max_bytes=MAX_BYTES_CHUNK_READ,\n )\n if not self.cursor or update.cursor != self.cursor:\n self.observer.on_next(update)\n self.cursor = update.cursor\n should_fetch = update.data and len(update.data.encode("utf-8")) >= MAX_BYTES_CHUNK_READ\n\n def complete(self):\n if not self.observer:\n return\n self.observer.on_completed()\n
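# ---------------------------------------------------------------------------
# Illustrative subclass sketch (editorial addition, not part of this module).
# It shows which abstract methods a concrete ComputeLogManager must provide by
# implementing each one as a no-op; the class name is hypothetical, and dagster
# itself ships the real local and no-op implementations elsewhere.
class SilentComputeLogManager(ComputeLogManager):
    """Captures nothing; every hook is a no-op."""

    @contextmanager
    def _watch_logs(self, pipeline_run, step_key=None):
        yield  # no stdout/stderr redirection

    def is_watch_completed(self, run_id, key):
        return True

    def on_watch_start(self, pipeline_run, step_key):
        pass

    def on_watch_finish(self, pipeline_run, step_key):
        pass

    def download_url(self, run_id, key, io_type):
        return None

    def read_logs_file(self, run_id, key, io_type, cursor=0, max_bytes=MAX_BYTES_FILE_READ):
        # return an empty chunk so subscribers terminate immediately
        return ComputeLogFileData(path="", data=None, cursor=cursor, size=0, download_url=None)

    def on_subscribe(self, subscription):
        pass

    def enabled(self, _pipeline_run, _step_key):
        return False  # disable capture entirely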
", "current_page_name": "_modules/dagster/core/storage/compute_log_manager", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.compute_log_manager"}, "event_log": {"base": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.event_log.base

\nimport base64\nimport warnings\nfrom abc import ABC, abstractmethod\nfrom datetime import datetime\nfrom enum import Enum\nfrom typing import (\n    Callable,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster.core.assets import AssetDetails\nfrom dagster.core.definitions.events import AssetKey\nfrom dagster.core.events import DagsterEventType\nfrom dagster.core.events.log import EventLogEntry\nfrom dagster.core.execution.stats import (\n    RunStepKeyStatsSnapshot,\n    build_run_stats_from_events,\n    build_run_step_stats_from_events,\n)\nfrom dagster.core.instance import MayHaveInstanceWeakref\nfrom dagster.core.storage.pipeline_run import PipelineRunStatsSnapshot\nfrom dagster.serdes import whitelist_for_serdes\nfrom dagster.seven import json\n\n\n
[docs]class RunShardedEventsCursor(NamedTuple):\n """Pairs an id-based event log cursor with a timestamp-based run cursor, for improved\n performance on run-sharded event log storages (e.g. the default SqliteEventLogStorage). For\n run-sharded storages, the id field is ignored, since they may not be unique across shards\n """\n\n id: int\n run_updated_after: datetime
\n\n\n
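# ---------------------------------------------------------------------------
# Illustrative usage sketch (editorial addition). Against a run-sharded storage
# such as the default SqliteEventLogStorage, pairing the id cursor with a
# run-updated timestamp lets the storage prune shards by run update time. The
# values are hypothetical; ``datetime`` is already imported above.
example_sharded_cursor = RunShardedEventsCursor(
    id=512, run_updated_after=datetime(2022, 1, 1)
)
# This can be passed as ``after_cursor=example_sharded_cursor`` in the
# EventRecordsFilter defined further down in this module.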
[docs]class EventLogRecord(NamedTuple):\n """Internal representation of an event record, as stored in a\n :py:class:`~dagster.core.storage.event_log.EventLogStorage`.\n """\n\n storage_id: int\n event_log_entry: EventLogEntry
\n\n\nclass EventLogConnection(NamedTuple):\n records: List[EventLogRecord]\n cursor: str\n has_more: bool\n\n\nclass EventLogCursorType(Enum):\n OFFSET = "OFFSET"\n STORAGE_ID = "STORAGE_ID"\n\n\nclass EventLogCursor(NamedTuple):\n """Representation of an event record cursor, keeping track of the log query state"""\n\n cursor_type: EventLogCursorType\n value: int\n\n def is_offset_cursor(self) -> bool:\n return self.cursor_type == EventLogCursorType.OFFSET\n\n def is_id_cursor(self) -> bool:\n return self.cursor_type == EventLogCursorType.STORAGE_ID\n\n def offset(self) -> int:\n check.invariant(self.cursor_type == EventLogCursorType.OFFSET)\n return max(0, int(self.value))\n\n def storage_id(self) -> int:\n check.invariant(self.cursor_type == EventLogCursorType.STORAGE_ID)\n return int(self.value)\n\n def __str__(self):\n return self.to_string()\n\n def to_string(self) -> str:\n raw = json.dumps({"type": self.cursor_type.value, "value": self.value})\n return base64.b64encode(bytes(raw, encoding="utf-8")).decode("utf-8")\n\n @staticmethod\n def parse(cursor_str: str) -> "EventLogCursor":\n raw = json.loads(base64.b64decode(cursor_str).decode("utf-8"))\n return EventLogCursor(EventLogCursorType(raw["type"]), raw["value"])\n\n @staticmethod\n def from_offset(offset: int) -> "EventLogCursor":\n return EventLogCursor(EventLogCursorType.OFFSET, offset)\n\n @staticmethod\n def from_storage_id(storage_id: int) -> "EventLogCursor":\n return EventLogCursor(EventLogCursorType.STORAGE_ID, storage_id)\n\n\nclass AssetEntry(\n NamedTuple(\n "_AssetEntry",\n [\n ("asset_key", AssetKey),\n ("last_materialization", Optional[EventLogEntry]),\n ("last_run_id", Optional[str]),\n ("asset_details", Optional[AssetDetails]),\n ],\n )\n):\n def __new__(\n cls,\n asset_key: AssetKey,\n last_materialization: Optional[EventLogEntry] = None,\n last_run_id: Optional[str] = None,\n asset_details: Optional[AssetDetails] = None,\n ):\n return super(AssetEntry, cls).__new__(\n cls,\n asset_key=check.inst_param(asset_key, "asset_key", AssetKey),\n last_materialization=check.opt_inst_param(\n last_materialization, "last_materialization", EventLogEntry\n ),\n last_run_id=check.opt_str_param(last_run_id, "last_run_id"),\n asset_details=check.opt_inst_param(asset_details, "asset_details", AssetDetails),\n )\n\n\nclass AssetRecord(NamedTuple):\n storage_id: int\n asset_entry: AssetEntry\n\n\n
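# ---------------------------------------------------------------------------
# Illustrative round-trip (editorial addition). Cursors serialize to opaque
# base64-encoded JSON strings, so callers can hand them back unchanged across
# paginated queries. Values are hypothetical.
_id_cursor = EventLogCursor.from_storage_id(1042)
_cursor_str = _id_cursor.to_string()  # opaque base64 string
assert EventLogCursor.parse(_cursor_str).storage_id() == 1042

# Legacy integer offsets are wrapped the same way:
_offset_cursor = EventLogCursor.from_offset(7)
assert _offset_cursor.is_offset_cursor() and _offset_cursor.offset() == 7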
[docs]@whitelist_for_serdes\nclass EventRecordsFilter(\n NamedTuple(\n "_EventRecordsFilter",\n [\n ("event_type", Optional[DagsterEventType]),\n ("asset_key", Optional[AssetKey]),\n ("asset_partitions", Optional[List[str]]),\n ("after_cursor", Optional[Union[int, RunShardedEventsCursor]]),\n ("before_cursor", Optional[Union[int, RunShardedEventsCursor]]),\n ("after_timestamp", Optional[float]),\n ("before_timestamp", Optional[float]),\n ],\n )\n):\n """Defines a set of filter fields for fetching a set of event log entries or event log records.\n\n Args:\n event_type (Optional[DagsterEventType]): Filter argument for dagster event type\n asset_key (Optional[AssetKey]): Asset key for which to get asset materialization event\n entries / records.\n asset_partitions (Optional[List[str]]): Filter parameter such that only asset\n materialization events with a partition value matching one of the provided values. Only\n valid when the `asset_key` parameter is provided.\n after_cursor (Optional[Union[int, RunShardedEventsCursor]]): Filter parameter such that only\n records with storage_id greater than the provided value are returned. Using a\n run-sharded events cursor will result in a significant performance gain when run against\n a SqliteEventLogStorage implementation (which is run-sharded)\n before_cursor (Optional[Union[int, RunShardedEventsCursor]]): Filter parameter such that\n records with storage_id less than the provided value are returned. Using a run-sharded\n events cursor will result in a significant performance gain when run against\n a SqliteEventLogStorage implementation (which is run-sharded)\n after_timestamp (Optional[float]): Filter parameter such that only event records for\n events with timestamp greater than the provided value are returned.\n before_timestamp (Optional[float]): Filter parameter such that only event records for\n events with timestamp less than the provided value are returned.\n """\n\n def __new__(\n cls,\n event_type: Optional[DagsterEventType] = None,\n asset_key: Optional[AssetKey] = None,\n asset_partitions: Optional[List[str]] = None,\n after_cursor: Optional[Union[int, RunShardedEventsCursor]] = None,\n before_cursor: Optional[Union[int, RunShardedEventsCursor]] = None,\n after_timestamp: Optional[float] = None,\n before_timestamp: Optional[float] = None,\n ):\n check.opt_list_param(asset_partitions, "asset_partitions", of_type=str)\n event_type = check.opt_inst_param(event_type, "event_type", DagsterEventType)\n if not event_type:\n warnings.warn(\n "The use of `EventRecordsFilter` without an event type is deprecated and will "\n "begin erroring starting in 0.15.0"\n )\n\n return super(EventRecordsFilter, cls).__new__(\n cls,\n event_type=event_type,\n asset_key=check.opt_inst_param(asset_key, "asset_key", AssetKey),\n asset_partitions=asset_partitions,\n after_cursor=check.opt_inst_param(\n after_cursor, "after_cursor", (int, RunShardedEventsCursor)\n ),\n before_cursor=check.opt_inst_param(\n before_cursor, "before_cursor", (int, RunShardedEventsCursor)\n ),\n after_timestamp=check.opt_float_param(after_timestamp, "after_timestamp"),\n before_timestamp=check.opt_float_param(before_timestamp, "before_timestamp"),\n )
\n\n\n
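# ---------------------------------------------------------------------------
# Illustrative usage sketch (editorial addition). Builds a filter for the
# latest materializations of one asset; AssetKey and DagsterEventType are
# already imported above, and the asset key name and timestamp are
# hypothetical.
example_filter = EventRecordsFilter(
    event_type=DagsterEventType.ASSET_MATERIALIZATION,
    asset_key=AssetKey("my_table"),
    after_timestamp=1640995200.0,  # only events after 2022-01-01 00:00 UTC
)
# A filter like this is typically passed to
# ``EventLogStorage.get_event_records(example_filter, limit=10, ascending=False)``
# to page through matching records, newest first.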
[docs]class EventLogStorage(ABC, MayHaveInstanceWeakref):\n """Abstract base class for storing structured event logs from pipeline runs.\n\n Note that event log storages using SQL databases as backing stores should implement\n :py:class:`~dagster.core.storage.event_log.SqlEventLogStorage`.\n\n Users should not directly instantiate concrete subclasses of this class; they are instantiated\n by internal machinery when ``dagit`` and ``dagster-graphql`` load, based on the values in the\n ``dagster.yaml`` file in ``$DAGSTER_HOME``. Configuration of concrete subclasses of this class\n should be done by setting values in that file.\n """\n\n def get_logs_for_run(\n self,\n run_id: str,\n cursor: Optional[Union[str, int]] = None,\n of_type: Optional[Union[DagsterEventType, Set[DagsterEventType]]] = None,\n limit: Optional[int] = None,\n ) -> Iterable[EventLogEntry]:\n """Get all of the logs corresponding to a run.\n\n Args:\n run_id (str): The id of the run for which to fetch logs.\n cursor (Optional[Union[str, int]]): Cursor value to track paginated queries. Legacy\n support for integer offset cursors.\n of_type (Optional[DagsterEventType]): the dagster event type to filter the logs.\n limit (Optional[int]): Max number of records to return.\n """\n if isinstance(cursor, int):\n cursor = EventLogCursor.from_offset(cursor + 1).to_string()\n records = self.get_records_for_run(run_id, cursor, of_type, limit).records\n return [record.event_log_entry for record in records]\n\n @abstractmethod\n def get_records_for_run(\n self,\n run_id: str,\n cursor: Optional[str] = None,\n of_type: Optional[Union[DagsterEventType, Set[DagsterEventType]]] = None,\n limit: Optional[int] = None,\n ) -> EventLogConnection:\n """Get all of the event log records corresponding to a run.\n\n Args:\n run_id (str): The id of the run for which to fetch logs.\n cursor (Optional[str]): Cursor value to track paginated queries.\n of_type (Optional[DagsterEventType]): the dagster event type to filter the logs.\n limit (Optional[int]): Max number of records to return.\n """\n\n def get_stats_for_run(self, run_id: str) -> PipelineRunStatsSnapshot:\n """Get a summary of events that have ocurred in a run."""\n return build_run_stats_from_events(run_id, self.get_logs_for_run(run_id))\n\n def get_step_stats_for_run(self, run_id: str, step_keys=None) -> List[RunStepKeyStatsSnapshot]:\n """Get per-step stats for a pipeline run."""\n logs = self.get_logs_for_run(run_id)\n if step_keys:\n logs = [\n event\n for event in logs\n if event.is_dagster_event and event.get_dagster_event().step_key in step_keys\n ]\n\n return build_run_step_stats_from_events(run_id, logs)\n\n @abstractmethod\n def store_event(self, event: EventLogEntry):\n """Store an event corresponding to a pipeline run.\n\n Args:\n event (EventLogEntry): The event to store.\n """\n\n @abstractmethod\n def delete_events(self, run_id: str):\n """Remove events for a given run id"""\n\n @abstractmethod\n def upgrade(self):\n """This method should perform any schema migrations necessary to bring an\n out-of-date instance of the storage up to date.\n """\n\n @abstractmethod\n def reindex_events(self, print_fn: Callable = lambda _: None, force: bool = False):\n """Call this method to run any data migrations across the event_log tables."""\n\n @abstractmethod\n def reindex_assets(self, print_fn: Callable = lambda _: None, force: bool = False):\n """Call this method to run any data migrations across the asset tables."""\n\n @abstractmethod\n def wipe(self):\n """Clear the log 
storage."""\n\n @abstractmethod\n def watch(self, run_id: str, cursor: str, callback: Callable):\n """Call this method to start watching."""\n\n @abstractmethod\n def end_watch(self, run_id: str, handler: Callable):\n """Call this method to stop watching."""\n\n @property\n @abstractmethod\n def is_persistent(self) -> bool:\n """bool: Whether the storage is persistent."""\n\n def dispose(self):\n """Explicit lifecycle management."""\n\n def optimize_for_dagit(self, statement_timeout: int):\n """Allows for optimizing database connection / use in the context of a long lived dagit process"""\n\n @abstractmethod\n def get_event_records(\n self,\n event_records_filter: Optional[EventRecordsFilter] = None,\n limit: Optional[int] = None,\n ascending: bool = False,\n ) -> Iterable[EventLogRecord]:\n pass\n\n @abstractmethod\n def get_asset_records(\n self, asset_keys: Optional[Sequence[AssetKey]] = None\n ) -> Iterable[AssetRecord]:\n pass\n\n @abstractmethod\n def has_asset_key(self, asset_key: AssetKey) -> bool:\n pass\n\n @abstractmethod\n def all_asset_keys(self) -> Iterable[AssetKey]:\n pass\n\n def get_asset_keys(\n self,\n prefix: Optional[List[str]] = None,\n limit: Optional[int] = None,\n cursor: Optional[str] = None,\n ) -> Iterable[AssetKey]:\n # base implementation of get_asset_keys, using the existing `all_asset_keys` and doing the\n # filtering in-memory\n asset_keys = sorted(self.all_asset_keys(), key=str)\n if prefix:\n asset_keys = [\n asset_key for asset_key in asset_keys if asset_key.path[: len(prefix)] == prefix\n ]\n if cursor:\n cursor_asset = AssetKey.from_db_string(cursor)\n if cursor_asset and cursor_asset in asset_keys:\n idx = asset_keys.index(cursor_asset)\n asset_keys = asset_keys[idx + 1 :]\n if limit:\n asset_keys = asset_keys[:limit]\n return asset_keys\n\n @abstractmethod\n def get_latest_materialization_events(\n self, asset_keys: Sequence[AssetKey]\n ) -> Mapping[AssetKey, Optional[EventLogEntry]]:\n pass\n\n @abstractmethod\n def get_asset_events(\n self,\n asset_key: AssetKey,\n partitions: Optional[List[str]] = None,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n limit: Optional[int] = None,\n ascending: bool = False,\n include_cursor: bool = False,\n before_timestamp=None,\n cursor: Optional[int] = None, # deprecated\n ) -> Union[Iterable[EventLogEntry], Iterable[Tuple[int, EventLogEntry]]]:\n pass\n\n @abstractmethod\n def get_asset_run_ids(self, asset_key: AssetKey) -> Iterable[str]:\n pass\n\n @abstractmethod\n def wipe_asset(self, asset_key: AssetKey):\n """Remove asset index history from event log for given asset_key"""\n\n @abstractmethod\n def get_materialization_count_by_partition(\n self, asset_keys: Sequence[AssetKey]\n ) -> Mapping[AssetKey, Mapping[str, int]]:\n pass\n\n def alembic_version(self):\n return None
\n\n\ndef extract_asset_events_cursor(cursor, before_cursor, after_cursor, ascending):\n    if cursor:\n        warnings.warn(\n            "Parameter `cursor` is deprecated for `get_asset_events`. Use `before_cursor` or `after_cursor` instead"\n        )\n        if ascending and after_cursor is None:\n            after_cursor = cursor\n        if not ascending and before_cursor is None:\n            before_cursor = cursor\n\n    if after_cursor is not None:\n        try:\n            after_cursor = int(after_cursor)\n        except ValueError:\n            after_cursor = None\n\n    if before_cursor is not None:\n        try:\n            before_cursor = int(before_cursor)\n        except ValueError:\n            before_cursor = None\n\n    return before_cursor, after_cursor\n
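# ---------------------------------------------------------------------------
# Worked example of the deprecated-cursor shim above (editorial addition,
# values hypothetical): a legacy ``cursor`` only fills whichever bound the
# sort direction needs, and emits a deprecation warning while doing so.
assert extract_asset_events_cursor(
    cursor=100, before_cursor=None, after_cursor=None, ascending=False
) == (100, None)  # descending: cursor behaves as before_cursor
assert extract_asset_events_cursor(
    cursor=100, before_cursor=None, after_cursor=None, ascending=True
) == (None, 100)  # ascending: cursor behaves as after_cursor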
", "current_page_name": "_modules/dagster/core/storage/event_log/base", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.event_log.base"}, "sql_event_log": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.event_log.sql_event_log

\nimport logging\nimport warnings\nfrom abc import abstractmethod\nfrom collections import OrderedDict\nfrom datetime import datetime\nfrom typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, cast\n\nimport pendulum\nimport sqlalchemy as db\n\nimport dagster._check as check\nimport dagster.seven as seven\nfrom dagster.core.assets import AssetDetails\nfrom dagster.core.definitions.events import AssetKey, AssetMaterialization\nfrom dagster.core.errors import DagsterEventLogInvalidForRun\nfrom dagster.core.events import DagsterEventType\nfrom dagster.core.events.log import EventLogEntry\nfrom dagster.core.execution.stats import build_run_step_stats_from_events\nfrom dagster.serdes import (\n    deserialize_as,\n    deserialize_json_to_dagster_namedtuple,\n    serialize_dagster_namedtuple,\n)\nfrom dagster.serdes.errors import DeserializationError\nfrom dagster.utils import datetime_as_float, utc_datetime_from_naive, utc_datetime_from_timestamp\n\nfrom ..pipeline_run import PipelineRunStatsSnapshot\nfrom .base import (\n    AssetEntry,\n    AssetRecord,\n    EventLogConnection,\n    EventLogCursor,\n    EventLogRecord,\n    EventLogStorage,\n    EventRecordsFilter,\n    RunShardedEventsCursor,\n    extract_asset_events_cursor,\n)\nfrom .migration import ASSET_DATA_MIGRATIONS, ASSET_KEY_INDEX_COLS, EVENT_LOG_DATA_MIGRATIONS\nfrom .schema import AssetKeyTable, SecondaryIndexMigrationTable, SqlEventLogStorageTable\n\nMIN_ASSET_ROWS = 25\n\n\n
[docs]class SqlEventLogStorage(EventLogStorage):\n """Base class for SQL backed event log storages.\n\n Distinguishes between run-based connections and index connections in order to support run-level\n sharding, while maintaining the ability to do cross-run queries\n """\n\n @abstractmethod\n def run_connection(self, run_id):\n """Context manager yielding a connection to access the event logs for a specific run.\n\n Args:\n run_id (Optional[str]): Enables those storages which shard based on run_id, e.g.,\n SqliteEventLogStorage, to connect appropriately.\n """\n\n @abstractmethod\n def index_connection(self):\n """Context manager yielding a connection to access cross-run indexed tables.\n\n Args:\n run_id (Optional[str]): Enables those storages which shard based on run_id, e.g.,\n SqliteEventLogStorage, to connect appropriately.\n """\n\n @abstractmethod\n def upgrade(self):\n """This method should perform any schema migrations necessary to bring an\n out-of-date instance of the storage up to date.\n """\n\n def prepare_insert_event(self, event):\n """Helper method for preparing the event log SQL insertion statement. Abstracted away to\n have a single place for the logical table representation of the event, while having a way\n for SQL backends to implement different execution implementations for `store_event`. See\n the `dagster-postgres` implementation which overrides the generic SQL implementation of\n `store_event`.\n """\n\n dagster_event_type = None\n asset_key_str = None\n partition = None\n step_key = event.step_key\n\n if event.is_dagster_event:\n dagster_event_type = event.dagster_event.event_type_value\n step_key = event.dagster_event.step_key\n if event.dagster_event.asset_key:\n check.inst_param(event.dagster_event.asset_key, "asset_key", AssetKey)\n asset_key_str = event.dagster_event.asset_key.to_string()\n if event.dagster_event.partition:\n partition = event.dagster_event.partition\n\n # https://stackoverflow.com/a/54386260/324449\n return SqlEventLogStorageTable.insert().values( # pylint: disable=no-value-for-parameter\n run_id=event.run_id,\n event=serialize_dagster_namedtuple(event),\n dagster_event_type=dagster_event_type,\n # Postgres requires a datetime that is in UTC but has no timezone info set\n # in order to be stored correctly\n timestamp=datetime.utcfromtimestamp(event.timestamp),\n step_key=step_key,\n asset_key=asset_key_str,\n partition=partition,\n )\n\n def has_asset_key_index_cols(self):\n with self.index_connection() as conn:\n column_names = [x.get("name") for x in db.inspect(conn).get_columns(AssetKeyTable.name)]\n return "last_materialization_timestamp" in column_names\n\n def store_asset_event(self, event):\n check.inst_param(event, "event", EventLogEntry)\n if not event.is_dagster_event or not event.dagster_event.asset_key:\n return\n\n # We switched to storing the entire event record of the last materialization instead of just\n # the AssetMaterialization object, so that we have access to metadata like timestamp,\n # pipeline, run_id, etc.\n #\n # This should make certain asset queries way more performant, without having to do extra\n # queries against the event log.\n #\n # This should be accompanied by a schema change in 0.12.0, renaming `last_materialization`\n # to `last_materialization_event`, for clarity. 
For now, we should do some back-compat.\n #\n # https://github.com/dagster-io/dagster/issues/3945\n\n values = self._get_asset_entry_values(event, self.has_asset_key_index_cols())\n insert_statement = AssetKeyTable.insert().values(\n asset_key=event.dagster_event.asset_key.to_string(), **values\n )\n update_statement = (\n AssetKeyTable.update()\n .values(**values)\n .where(\n AssetKeyTable.c.asset_key == event.dagster_event.asset_key.to_string(),\n )\n )\n\n with self.index_connection() as conn:\n try:\n conn.execute(insert_statement)\n except db.exc.IntegrityError:\n conn.execute(update_statement)\n\n def _get_asset_entry_values(self, event, has_asset_key_index_cols):\n # The AssetKeyTable contains a `last_materialization_timestamp` column that is exclusively\n # used to determine if an asset exists (last materialization timestamp > wipe timestamp).\n # This column is used nowhere else, and as of AssetObservation/AssetMaterializationPlanned\n # event creation, we want to extend this functionality to ensure that assets with any event\n # (observation, materialization, or materialization planned) yielded with timestamp\n # > wipe timestamp display in Dagit.\n\n # As of the following PRs, we update last_materialization_timestamp to store the timestamp\n # of the latest asset observation, materialization, or materialization_planned that has occurred.\n # https://github.com/dagster-io/dagster/pull/6885\n # https://github.com/dagster-io/dagster/pull/7319\n\n entry_values: Dict[str, Any] = {}\n if event.dagster_event.is_step_materialization:\n entry_values.update(\n {\n "last_materialization": serialize_dagster_namedtuple(event),\n "last_run_id": event.run_id,\n }\n )\n if has_asset_key_index_cols:\n materialization = event.dagster_event.step_materialization_data.materialization\n entry_values.update(\n {\n "last_materialization_timestamp": utc_datetime_from_timestamp(\n event.timestamp\n ),\n "tags": seven.json.dumps(materialization.tags)\n if materialization.tags\n else None,\n }\n )\n elif event.dagster_event.is_asset_materialization_planned:\n # The AssetKeyTable also contains a `last_run_id` column that is updated upon asset\n # materialization. This column was not being used until the below PR. 
This new change\n # writes to the column upon `ASSET_MATERIALIZATION_PLANNED` events to fetch the last\n # run id for a set of assets in one roundtrip call to event log storage.\n # https://github.com/dagster-io/dagster/pull/7319\n entry_values.update({"last_run_id": event.run_id})\n if has_asset_key_index_cols:\n entry_values.update(\n {\n "last_materialization_timestamp": utc_datetime_from_timestamp(\n event.timestamp\n ),\n }\n )\n elif event.dagster_event.is_asset_observation:\n if has_asset_key_index_cols:\n entry_values.update(\n {\n "last_materialization_timestamp": utc_datetime_from_timestamp(\n event.timestamp\n ),\n }\n )\n\n return entry_values\n\n def store_event(self, event):\n """Store an event corresponding to a pipeline run.\n\n Args:\n event (EventLogEntry): The event to store.\n """\n check.inst_param(event, "event", EventLogEntry)\n insert_event_statement = self.prepare_insert_event(event)\n run_id = event.run_id\n\n with self.run_connection(run_id) as conn:\n conn.execute(insert_event_statement)\n\n if (\n event.is_dagster_event\n and (\n event.dagster_event.is_step_materialization\n or event.dagster_event.is_asset_observation\n or event.dagster_event.is_asset_materialization_planned\n )\n and event.dagster_event.asset_key\n ):\n self.store_asset_event(event)\n\n def get_records_for_run(\n self,\n run_id,\n cursor=None,\n of_type=None,\n limit=None,\n ) -> EventLogConnection:\n """Get all of the logs corresponding to a run.\n\n Args:\n run_id (str): The id of the run for which to fetch logs.\n cursor (Optional[int]): Zero-indexed logs will be returned starting from cursor + 1,\n i.e., if cursor is -1, all logs will be returned. (default: -1)\n of_type (Optional[DagsterEventType]): the dagster event type to filter the logs.\n limit (Optional[int]): the maximum number of events to fetch\n """\n check.str_param(run_id, "run_id")\n check.opt_str_param(cursor, "cursor")\n\n check.invariant(\n not of_type\n or isinstance(of_type, DagsterEventType)\n or isinstance(of_type, (frozenset, set))\n )\n\n dagster_event_types = (\n {of_type}\n if isinstance(of_type, DagsterEventType)\n else check.opt_set_param(of_type, "dagster_event_type", of_type=DagsterEventType)\n )\n\n query = (\n db.select([SqlEventLogStorageTable.c.id, SqlEventLogStorageTable.c.event])\n .where(SqlEventLogStorageTable.c.run_id == run_id)\n .order_by(SqlEventLogStorageTable.c.id.asc())\n )\n if dagster_event_types:\n query = query.where(\n SqlEventLogStorageTable.c.dagster_event_type.in_(\n [dagster_event_type.value for dagster_event_type in dagster_event_types]\n )\n )\n\n # adjust 0 based index cursor to SQL offset\n if cursor is not None:\n cursor_obj = EventLogCursor.parse(cursor)\n if cursor_obj.is_offset_cursor():\n query = query.offset(cursor_obj.offset())\n elif cursor_obj.is_id_cursor():\n query = query.where(SqlEventLogStorageTable.c.id > cursor_obj.storage_id())\n\n if limit:\n query = query.limit(limit)\n\n with self.run_connection(run_id) as conn:\n results = conn.execute(query).fetchall()\n\n last_record_id = None\n try:\n records = []\n for (\n record_id,\n json_str,\n ) in results:\n records.append(\n EventLogRecord(\n storage_id=record_id,\n event_log_entry=deserialize_as(json_str, EventLogEntry),\n )\n )\n last_record_id = record_id\n except (seven.JSONDecodeError, DeserializationError) as err:\n raise DagsterEventLogInvalidForRun(run_id=run_id) from err\n\n if last_record_id is not None:\n next_cursor = EventLogCursor.from_storage_id(last_record_id).to_string()\n elif cursor:\n # record fetch 
returned no new logs, return the same cursor\n next_cursor = cursor\n else:\n # rely on the fact that all storage ids will be positive integers\n next_cursor = EventLogCursor.from_storage_id(-1).to_string()\n\n return EventLogConnection(\n records=records,\n cursor=next_cursor,\n has_more=bool(limit and len(results) == limit),\n )\n\n def get_stats_for_run(self, run_id):\n check.str_param(run_id, "run_id")\n\n query = (\n db.select(\n [\n SqlEventLogStorageTable.c.dagster_event_type,\n db.func.count().label("n_events_of_type"),\n db.func.max(SqlEventLogStorageTable.c.timestamp).label("last_event_timestamp"),\n ]\n )\n .where(\n db.and_(\n SqlEventLogStorageTable.c.run_id == run_id,\n SqlEventLogStorageTable.c.dagster_event_type != None,\n )\n )\n .group_by("dagster_event_type")\n )\n\n with self.run_connection(run_id) as conn:\n results = conn.execute(query).fetchall()\n\n try:\n counts = {}\n times = {}\n for result in results:\n (dagster_event_type, n_events_of_type, last_event_timestamp) = result\n check.invariant(dagster_event_type is not None)\n counts[dagster_event_type] = n_events_of_type\n times[dagster_event_type] = last_event_timestamp\n\n enqueued_time = times.get(DagsterEventType.PIPELINE_ENQUEUED.value, None)\n launch_time = times.get(DagsterEventType.PIPELINE_STARTING.value, None)\n start_time = times.get(DagsterEventType.PIPELINE_START.value, None)\n end_time = times.get(\n DagsterEventType.PIPELINE_SUCCESS.value,\n times.get(\n DagsterEventType.PIPELINE_FAILURE.value,\n times.get(DagsterEventType.PIPELINE_CANCELED.value, None),\n ),\n )\n\n return PipelineRunStatsSnapshot(\n run_id=run_id,\n steps_succeeded=counts.get(DagsterEventType.STEP_SUCCESS.value, 0),\n steps_failed=counts.get(DagsterEventType.STEP_FAILURE.value, 0),\n materializations=counts.get(DagsterEventType.ASSET_MATERIALIZATION.value, 0),\n expectations=counts.get(DagsterEventType.STEP_EXPECTATION_RESULT.value, 0),\n enqueued_time=datetime_as_float(enqueued_time) if enqueued_time else None,\n launch_time=datetime_as_float(launch_time) if launch_time else None,\n start_time=datetime_as_float(start_time) if start_time else None,\n end_time=datetime_as_float(end_time) if end_time else None,\n )\n except (seven.JSONDecodeError, DeserializationError) as err:\n raise DagsterEventLogInvalidForRun(run_id=run_id) from err\n\n def get_step_stats_for_run(self, run_id, step_keys=None):\n check.str_param(run_id, "run_id")\n check.opt_list_param(step_keys, "step_keys", of_type=str)\n\n # Originally, this was two different queries:\n # 1) one query which aggregated top-level step stats by grouping by event type / step_key in\n # a single query, using pure SQL (e.g. start_time, end_time, status, attempt counts).\n # 2) one query which fetched all the raw events for a specific event type and then inspected\n # the deserialized event object to aggregate stats derived from sequences of events.\n # (e.g. marker events, materializations, expectations resuls, attempts timing, etc.)\n #\n # For simplicity, we now just do the second type of query and derive the stats in Python\n # from the raw events. This has the benefit of being easier to read and also the benefit of\n # being able to share code with the in-memory event log storage implementation. 
We may\n # choose to revisit this in the future, especially if we are able to do JSON-column queries\n # in SQL as a way of bypassing the serdes layer in all cases.\n raw_event_query = (\n db.select([SqlEventLogStorageTable.c.event])\n .where(SqlEventLogStorageTable.c.run_id == run_id)\n .where(SqlEventLogStorageTable.c.step_key != None)\n .where(\n SqlEventLogStorageTable.c.dagster_event_type.in_(\n [\n DagsterEventType.STEP_START.value,\n DagsterEventType.STEP_SUCCESS.value,\n DagsterEventType.STEP_SKIPPED.value,\n DagsterEventType.STEP_FAILURE.value,\n DagsterEventType.STEP_RESTARTED.value,\n DagsterEventType.ASSET_MATERIALIZATION.value,\n DagsterEventType.STEP_EXPECTATION_RESULT.value,\n DagsterEventType.STEP_RESTARTED.value,\n DagsterEventType.STEP_UP_FOR_RETRY.value,\n DagsterEventType.ENGINE_EVENT.value,\n ]\n )\n )\n .order_by(SqlEventLogStorageTable.c.id.asc())\n )\n if step_keys:\n raw_event_query = raw_event_query.where(\n SqlEventLogStorageTable.c.step_key.in_(step_keys)\n )\n\n with self.run_connection(run_id) as conn:\n results = conn.execute(raw_event_query).fetchall()\n\n try:\n records = [\n check.inst_param(\n deserialize_json_to_dagster_namedtuple(json_str), "event", EventLogEntry\n )\n for (json_str,) in results\n ]\n return build_run_step_stats_from_events(run_id, records)\n except (seven.JSONDecodeError, DeserializationError) as err:\n raise DagsterEventLogInvalidForRun(run_id=run_id) from err\n\n def _apply_migration(self, migration_name, migration_fn, print_fn, force):\n if self.has_secondary_index(migration_name):\n if not force:\n if print_fn:\n print_fn(f"Skipping already applied data migration: {migration_name}")\n return\n if print_fn:\n print_fn(f"Starting data migration: {migration_name}")\n migration_fn()(self, print_fn)\n self.enable_secondary_index(migration_name)\n if print_fn:\n print_fn(f"Finished data migration: {migration_name}")\n\n def reindex_events(self, print_fn=None, force=False):\n """Call this method to run any data migrations across the event_log table"""\n for migration_name, migration_fn in EVENT_LOG_DATA_MIGRATIONS.items():\n self._apply_migration(migration_name, migration_fn, print_fn, force)\n\n def reindex_assets(self, print_fn=None, force=False):\n """Call this method to run any data migrations across the asset_keys table"""\n for migration_name, migration_fn in ASSET_DATA_MIGRATIONS.items():\n self._apply_migration(migration_name, migration_fn, print_fn, force)\n\n def wipe(self):\n """Clears the event log storage."""\n # Should be overridden by SqliteEventLogStorage and other storages that shard based on\n # run_id\n\n # https://stackoverflow.com/a/54386260/324449\n with self.run_connection(run_id=None) as conn:\n conn.execute(SqlEventLogStorageTable.delete()) # pylint: disable=no-value-for-parameter\n conn.execute(AssetKeyTable.delete()) # pylint: disable=no-value-for-parameter\n\n with self.index_connection() as conn:\n conn.execute(SqlEventLogStorageTable.delete()) # pylint: disable=no-value-for-parameter\n conn.execute(AssetKeyTable.delete()) # pylint: disable=no-value-for-parameter\n\n def delete_events(self, run_id):\n with self.run_connection(run_id) as conn:\n self.delete_events_for_run(conn, run_id)\n\n def delete_events_for_run(self, conn, run_id):\n check.str_param(run_id, "run_id")\n\n delete_statement = (\n SqlEventLogStorageTable.delete().where( # pylint: disable=no-value-for-parameter\n SqlEventLogStorageTable.c.run_id == run_id\n )\n )\n removed_asset_key_query = (\n db.select([SqlEventLogStorageTable.c.asset_key])\n 
.where(SqlEventLogStorageTable.c.run_id == run_id)\n .where(SqlEventLogStorageTable.c.asset_key != None)\n .group_by(SqlEventLogStorageTable.c.asset_key)\n )\n\n removed_asset_keys = [\n AssetKey.from_db_string(row[0])\n for row in conn.execute(removed_asset_key_query).fetchall()\n ]\n conn.execute(delete_statement)\n if len(removed_asset_keys) > 0:\n keys_to_check = []\n keys_to_check.extend([key.to_string() for key in removed_asset_keys])\n keys_to_check.extend([key.to_string(legacy=True) for key in removed_asset_keys])\n remaining_asset_keys = [\n AssetKey.from_db_string(row[0])\n for row in conn.execute(\n db.select([SqlEventLogStorageTable.c.asset_key])\n .where(SqlEventLogStorageTable.c.asset_key.in_(keys_to_check))\n .group_by(SqlEventLogStorageTable.c.asset_key)\n )\n ]\n to_remove = set(removed_asset_keys) - set(remaining_asset_keys)\n if to_remove:\n keys_to_remove = []\n keys_to_remove.extend([key.to_string() for key in to_remove])\n keys_to_remove.extend([key.to_string(legacy=True) for key in to_remove])\n conn.execute(\n AssetKeyTable.delete().where( # pylint: disable=no-value-for-parameter\n AssetKeyTable.c.asset_key.in_(keys_to_remove)\n )\n )\n\n @property\n def is_persistent(self):\n return True\n\n def update_event_log_record(self, record_id, event):\n """Utility method for migration scripts to update SQL representation of event records."""\n check.int_param(record_id, "record_id")\n check.inst_param(event, "event", EventLogEntry)\n dagster_event_type = None\n asset_key_str = None\n if event.is_dagster_event:\n dagster_event_type = event.dagster_event.event_type_value\n if event.dagster_event.asset_key:\n check.inst_param(event.dagster_event.asset_key, "asset_key", AssetKey)\n asset_key_str = event.dagster_event.asset_key.to_string()\n\n with self.run_connection(run_id=event.run_id) as conn:\n conn.execute(\n SqlEventLogStorageTable.update() # pylint: disable=no-value-for-parameter\n .where(SqlEventLogStorageTable.c.id == record_id)\n .values(\n event=serialize_dagster_namedtuple(event),\n dagster_event_type=dagster_event_type,\n timestamp=datetime.utcfromtimestamp(event.timestamp),\n step_key=event.step_key,\n asset_key=asset_key_str,\n )\n )\n\n def get_event_log_table_data(self, run_id, record_id):\n """Utility method to test representation of the record in the SQL table. Returns all of\n the columns stored in the event log storage (as opposed to the deserialized `EventLogEntry`).\n This allows checking that certain fields are extracted to support performant lookups (e.g.\n extracting `step_key` for fast filtering)"""\n with self.run_connection(run_id=run_id) as conn:\n query = (\n db.select([SqlEventLogStorageTable])\n .where(SqlEventLogStorageTable.c.id == record_id)\n .order_by(SqlEventLogStorageTable.c.id.asc())\n )\n return conn.execute(query).fetchone()\n\n def has_secondary_index(self, name):\n """This method uses a checkpoint migration table to see if summary data has been constructed\n in a secondary index table. 
Can be used to checkpoint event_log data migrations.\n """\n query = (\n db.select([1])\n .where(SecondaryIndexMigrationTable.c.name == name)\n .where(SecondaryIndexMigrationTable.c.migration_completed != None)\n .limit(1)\n )\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n return len(results) > 0\n\n def enable_secondary_index(self, name):\n """This method marks an event_log data migration as complete, to indicate that a summary\n data migration is complete.\n """\n query = (\n SecondaryIndexMigrationTable.insert().values( # pylint: disable=no-value-for-parameter\n name=name,\n migration_completed=datetime.now(),\n )\n )\n with self.index_connection() as conn:\n try:\n conn.execute(query)\n except db.exc.IntegrityError:\n conn.execute(\n SecondaryIndexMigrationTable.update() # pylint: disable=no-value-for-parameter\n .where(SecondaryIndexMigrationTable.c.name == name)\n .values(migration_completed=datetime.now())\n )\n\n def _apply_filter_to_query(\n self,\n query,\n event_records_filter=None,\n asset_details=None,\n apply_cursor_filters=True,\n ):\n if not event_records_filter:\n return query\n\n if event_records_filter.event_type:\n query = query.where(\n SqlEventLogStorageTable.c.dagster_event_type\n == event_records_filter.event_type.value\n )\n\n if event_records_filter.asset_key:\n query = query.where(\n db.or_(\n SqlEventLogStorageTable.c.asset_key\n == event_records_filter.asset_key.to_string(),\n SqlEventLogStorageTable.c.asset_key\n == event_records_filter.asset_key.to_string(legacy=True),\n )\n )\n\n if event_records_filter.asset_partitions:\n query = query.where(\n SqlEventLogStorageTable.c.partition.in_(event_records_filter.asset_partitions)\n )\n\n if asset_details and asset_details.last_wipe_timestamp:\n query = query.where(\n SqlEventLogStorageTable.c.timestamp\n > datetime.utcfromtimestamp(asset_details.last_wipe_timestamp)\n )\n\n if apply_cursor_filters:\n # allow the run-sharded sqlite implementation to disable this cursor filtering so that\n # it can implement its own custom cursor logic, as cursor ids are not unique across run\n # shards\n if event_records_filter.before_cursor is not None:\n before_cursor_id = (\n event_records_filter.before_cursor.id\n if isinstance(event_records_filter.before_cursor, RunShardedEventsCursor)\n else event_records_filter.before_cursor\n )\n before_query = db.select([SqlEventLogStorageTable.c.id]).where(\n SqlEventLogStorageTable.c.id == before_cursor_id\n )\n query = query.where(SqlEventLogStorageTable.c.id < before_query)\n\n if event_records_filter.after_cursor is not None:\n after_cursor_id = (\n event_records_filter.after_cursor.id\n if isinstance(event_records_filter.after_cursor, RunShardedEventsCursor)\n else event_records_filter.after_cursor\n )\n query = query.where(SqlEventLogStorageTable.c.id > after_cursor_id)\n\n if event_records_filter.before_timestamp:\n query = query.where(\n SqlEventLogStorageTable.c.timestamp\n < datetime.utcfromtimestamp(event_records_filter.before_timestamp)\n )\n\n if event_records_filter.after_timestamp:\n query = query.where(\n SqlEventLogStorageTable.c.timestamp\n > datetime.utcfromtimestamp(event_records_filter.after_timestamp)\n )\n\n return query\n\n def get_event_records(\n self,\n event_records_filter: Optional[EventRecordsFilter] = None,\n limit: Optional[int] = None,\n ascending: bool = False,\n ) -> Iterable[EventLogRecord]:\n """Returns a list of (record_id, record)."""\n check.opt_inst_param(event_records_filter, "event_records_filter", 
EventRecordsFilter)\n check.opt_int_param(limit, "limit")\n check.bool_param(ascending, "ascending")\n\n if not event_records_filter:\n warnings.warn(\n "The use of `get_event_records` without an `EventRecordsFilter` is deprecated and "\n "will begin erroring starting in 0.15.0"\n )\n\n query = db.select([SqlEventLogStorageTable.c.id, SqlEventLogStorageTable.c.event])\n if event_records_filter and event_records_filter.asset_key:\n asset_details = next(iter(self._get_assets_details([event_records_filter.asset_key])))\n else:\n asset_details = None\n\n query = self._apply_filter_to_query(\n query=query,\n event_records_filter=event_records_filter,\n asset_details=asset_details,\n )\n if limit:\n query = query.limit(limit)\n\n if ascending:\n query = query.order_by(SqlEventLogStorageTable.c.id.asc())\n else:\n query = query.order_by(SqlEventLogStorageTable.c.id.desc())\n\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n event_records = []\n for row_id, json_str in results:\n try:\n event_record = deserialize_json_to_dagster_namedtuple(json_str)\n if not isinstance(event_record, EventLogEntry):\n logging.warning(\n "Could not resolve event record as EventLogEntry for id `%s`.", row_id\n )\n continue\n else:\n event_records.append(\n EventLogRecord(storage_id=row_id, event_log_entry=event_record)\n )\n except seven.JSONDecodeError:\n logging.warning("Could not parse event record id `%s`.", row_id)\n\n return event_records\n\n def _construct_asset_record_from_row(self, row, last_materialization: Optional[EventLogEntry]):\n asset_key = AssetKey.from_db_string(row[1])\n if asset_key:\n return AssetRecord(\n storage_id=row[0],\n asset_entry=AssetEntry(\n asset_key=asset_key,\n last_materialization=last_materialization,\n last_run_id=row[3],\n asset_details=AssetDetails.from_db_string(row[4]),\n ),\n )\n\n def _get_latest_materializations(\n self, raw_asset_rows\n ) -> Mapping[AssetKey, Optional[EventLogEntry]]:\n # Given a list of raw asset rows, returns a mapping of asset key to latest asset materialization\n # event log entry. 
Fetches backcompat EventLogEntry records when the last_materialization\n # in the raw asset row is an AssetMaterialization.\n to_backcompat_fetch = set()\n results: Dict[AssetKey, Optional[EventLogEntry]] = {}\n for row in raw_asset_rows:\n asset_key = AssetKey.from_db_string(row[1])\n if not asset_key:\n continue\n event_or_materialization = (\n deserialize_json_to_dagster_namedtuple(row[2]) if row[2] else None\n )\n if isinstance(event_or_materialization, EventLogEntry):\n results[asset_key] = event_or_materialization\n else:\n to_backcompat_fetch.add(asset_key)\n\n latest_event_subquery = (\n db.select(\n [\n SqlEventLogStorageTable.c.asset_key,\n db.func.max(SqlEventLogStorageTable.c.timestamp).label("timestamp"),\n ]\n )\n .where(\n db.and_(\n SqlEventLogStorageTable.c.asset_key.in_(\n [asset_key.to_string() for asset_key in to_backcompat_fetch]\n ),\n SqlEventLogStorageTable.c.dagster_event_type\n == DagsterEventType.ASSET_MATERIALIZATION.value,\n )\n )\n .group_by(SqlEventLogStorageTable.c.asset_key)\n .alias("latest_materializations")\n )\n backcompat_query = db.select(\n [SqlEventLogStorageTable.c.asset_key, SqlEventLogStorageTable.c.event]\n ).select_from(\n latest_event_subquery.join(\n SqlEventLogStorageTable,\n db.and_(\n SqlEventLogStorageTable.c.asset_key == latest_event_subquery.c.asset_key,\n SqlEventLogStorageTable.c.timestamp == latest_event_subquery.c.timestamp,\n ),\n )\n )\n with self.index_connection() as conn:\n event_rows = conn.execute(backcompat_query).fetchall()\n\n for row in event_rows:\n asset_key = AssetKey.from_db_string(row[0])\n if asset_key:\n results[asset_key] = cast(\n EventLogEntry, deserialize_json_to_dagster_namedtuple(row[1])\n )\n return results\n\n def get_asset_records(\n self, asset_keys: Optional[Sequence[AssetKey]] = None\n ) -> Iterable[AssetRecord]:\n rows = self._fetch_asset_rows(asset_keys=asset_keys)\n latest_materializations = self._get_latest_materializations(rows)\n\n asset_records: List[AssetRecord] = []\n for row in rows:\n asset_key = AssetKey.from_db_string(row[1])\n if asset_key:\n asset_records.append(\n self._construct_asset_record_from_row(\n row, latest_materializations.get(asset_key)\n )\n )\n\n return asset_records\n\n def has_asset_key(self, asset_key: AssetKey) -> bool:\n check.inst_param(asset_key, "asset_key", AssetKey)\n rows = self._fetch_asset_rows(asset_keys=[asset_key])\n return bool(rows)\n\n def all_asset_keys(self):\n rows = self._fetch_asset_rows()\n asset_keys = [AssetKey.from_db_string(row[1]) for row in sorted(rows, key=lambda x: x[1])]\n return [asset_key for asset_key in asset_keys if asset_key]\n\n def get_asset_keys(\n self,\n prefix: Optional[List[str]] = None,\n limit: Optional[int] = None,\n cursor: Optional[str] = None,\n ) -> Iterable[AssetKey]:\n rows = self._fetch_asset_rows(prefix=prefix, limit=limit, cursor=cursor)\n asset_keys = [AssetKey.from_db_string(row[1]) for row in sorted(rows, key=lambda x: x[1])]\n return [asset_key for asset_key in asset_keys if asset_key]\n\n def get_latest_materialization_events(\n self, asset_keys: Sequence[AssetKey]\n ) -> Mapping[AssetKey, Optional[EventLogEntry]]:\n check.list_param(asset_keys, "asset_keys", AssetKey)\n rows = self._fetch_asset_rows(asset_keys=asset_keys)\n return self._get_latest_materializations(rows)\n\n def _fetch_asset_rows(self, asset_keys=None, prefix=None, limit=None, cursor=None):\n # fetches rows containing asset_key, last_materialization, and asset_details from the DB,\n # applying the filters specified in the arguments.\n #\n # 
Differs from _fetch_raw_asset_rows, in that it loops through to make sure enough rows are\n # returned to satisfy the limit.\n #\n # returns a list of rows where each row is a tuple of serialized asset_key, materialization,\n # and asset_details\n should_query = True\n current_cursor = cursor\n if self.has_secondary_index(ASSET_KEY_INDEX_COLS):\n # if we have migrated, we can limit using SQL\n fetch_limit = limit\n else:\n # if we haven't migrated, overfetch in case the first N results are wiped\n fetch_limit = max(limit, MIN_ASSET_ROWS) if limit else None\n result = []\n\n while should_query:\n rows, has_more, current_cursor = self._fetch_raw_asset_rows(\n asset_keys=asset_keys, prefix=prefix, limit=fetch_limit, cursor=current_cursor\n )\n result.extend(rows)\n should_query = bool(has_more) and bool(limit) and len(result) < cast(int, limit)\n\n is_partial_query = bool(asset_keys) or bool(prefix) or bool(limit) or bool(cursor)\n if not is_partial_query and self._can_mark_assets_as_migrated(rows):\n self.enable_secondary_index(ASSET_KEY_INDEX_COLS)\n\n return result[:limit] if limit else result\n\n def _fetch_raw_asset_rows(self, asset_keys=None, prefix=None, limit=None, cursor=None):\n # fetches rows containing asset_key, last_materialization, and asset_details from the DB,\n # applying the filters specified in the arguments. Does not guarantee that the number of\n # rows returned will match the limit specified. This helper function is used to fetch a\n # chunk of asset key rows, which may or may not be wiped.\n #\n # Returns a tuple of (rows, has_more, cursor), where each row is a tuple of serialized\n # asset_key, materialization, and asset_details\n # TODO update comment\n\n columns = [\n AssetKeyTable.c.id,\n AssetKeyTable.c.asset_key,\n AssetKeyTable.c.last_materialization,\n AssetKeyTable.c.last_run_id,\n AssetKeyTable.c.asset_details,\n ]\n if self.has_asset_key_index_cols():\n columns.extend(\n [\n AssetKeyTable.c.wipe_timestamp,\n AssetKeyTable.c.last_materialization_timestamp,\n AssetKeyTable.c.tags,\n ]\n )\n\n is_partial_query = bool(asset_keys) or bool(prefix) or bool(limit) or bool(cursor)\n if self.has_asset_key_index_cols() and not is_partial_query:\n # if the schema has been migrated, fetch the last_materialization_timestamp to see if\n # we can lazily migrate the data table\n columns.append(AssetKeyTable.c.last_materialization_timestamp)\n columns.append(AssetKeyTable.c.wipe_timestamp)\n\n query = db.select(columns).order_by(AssetKeyTable.c.asset_key.asc())\n query = self._apply_asset_filter_to_query(query, asset_keys, prefix, limit, cursor)\n\n if self.has_secondary_index(ASSET_KEY_INDEX_COLS):\n query = query.where(\n db.or_(\n AssetKeyTable.c.wipe_timestamp == None,\n AssetKeyTable.c.last_materialization_timestamp > AssetKeyTable.c.wipe_timestamp,\n )\n )\n with self.index_connection() as conn:\n rows = conn.execute(query).fetchall()\n\n return rows, False, None\n\n with self.index_connection() as conn:\n rows = conn.execute(query).fetchall()\n\n wiped_timestamps_by_asset_key = {}\n row_by_asset_key = OrderedDict()\n\n for row in rows:\n asset_key = AssetKey.from_db_string(row[1])\n if not asset_key:\n continue\n asset_details = AssetDetails.from_db_string(row[4])\n if not asset_details or not asset_details.last_wipe_timestamp:\n row_by_asset_key[asset_key] = row\n continue\n materialization_or_event = (\n deserialize_json_to_dagster_namedtuple(row[2]) if row[2] else None\n )\n if isinstance(materialization_or_event, EventLogEntry):\n if 
asset_details.last_wipe_timestamp > materialization_or_event.timestamp:\n # this asset has not been materialized since being wiped, skip\n continue\n else:\n # add the key\n row_by_asset_key[asset_key] = row\n else:\n row_by_asset_key[asset_key] = row\n wiped_timestamps_by_asset_key[asset_key] = asset_details.last_wipe_timestamp\n\n if wiped_timestamps_by_asset_key:\n materialization_times = self._fetch_backcompat_materialization_times(\n wiped_timestamps_by_asset_key.keys()\n )\n for asset_key, wiped_timestamp in wiped_timestamps_by_asset_key.items():\n materialization_time = materialization_times.get(asset_key)\n if not materialization_time or utc_datetime_from_naive(\n materialization_time\n ) < utc_datetime_from_timestamp(wiped_timestamp):\n # remove rows that have not been materialized since being wiped\n row_by_asset_key.pop(asset_key)\n\n has_more = limit and len(rows) == limit\n new_cursor = rows[-1][0] if rows else None\n\n return row_by_asset_key.values(), has_more, new_cursor\n\n def _fetch_backcompat_materialization_times(self, asset_keys):\n # fetches the latest materialization timestamp for the given asset_keys. Uses the (slower)\n # raw event log table.\n backcompat_query = (\n db.select(\n [\n SqlEventLogStorageTable.c.asset_key,\n db.func.max(SqlEventLogStorageTable.c.timestamp),\n ]\n )\n .where(\n SqlEventLogStorageTable.c.asset_key.in_(\n [asset_key.to_string() for asset_key in asset_keys]\n )\n )\n .group_by(SqlEventLogStorageTable.c.asset_key)\n .order_by(db.func.max(SqlEventLogStorageTable.c.timestamp).asc())\n )\n with self.index_connection() as conn:\n backcompat_rows = conn.execute(backcompat_query).fetchall()\n return {AssetKey.from_db_string(row[0]): row[1] for row in backcompat_rows}\n\n def _can_mark_assets_as_migrated(self, rows):\n if not self.has_asset_key_index_cols():\n return False\n\n if self.has_secondary_index(ASSET_KEY_INDEX_COLS):\n # we have already migrated\n return False\n\n for row in rows:\n if not _get_from_row(row, "last_materialization_timestamp"):\n return False\n\n if _get_from_row(row, "asset_details") and not _get_from_row(row, "wipe_timestamp"):\n return False\n\n return True\n\n def _apply_asset_filter_to_query(\n self,\n query,\n asset_keys=None,\n prefix=None,\n limit=None,\n cursor=None,\n ):\n if asset_keys:\n query = query.where(\n AssetKeyTable.c.asset_key.in_([asset_key.to_string() for asset_key in asset_keys])\n )\n\n if prefix:\n prefix_str = seven.dumps(prefix)[:-1]\n query = query.where(AssetKeyTable.c.asset_key.startswith(prefix_str))\n\n if cursor:\n query = query.where(AssetKeyTable.c.asset_key > cursor)\n\n if limit:\n query = query.limit(limit)\n return query\n\n def _get_assets_details(self, asset_keys: Sequence[AssetKey]):\n check.list_param(asset_keys, "asset_key", AssetKey)\n rows = None\n with self.index_connection() as conn:\n rows = conn.execute(\n db.select([AssetKeyTable.c.asset_key, AssetKeyTable.c.asset_details]).where(\n AssetKeyTable.c.asset_key.in_(\n [asset_key.to_string() for asset_key in asset_keys]\n ),\n )\n ).fetchall()\n\n asset_key_to_details = {\n row[0]: (deserialize_json_to_dagster_namedtuple(row[1]) if row[1] else None)\n for row in rows\n }\n\n # returns a list of the corresponding asset_details to provided asset_keys\n return [\n asset_key_to_details.get(asset_key.to_string(), None) for asset_key in asset_keys\n ]\n\n def _add_assets_wipe_filter_to_query(\n self, query, assets_details: Sequence[str], asset_keys: Sequence[AssetKey]\n ):\n check.invariant(\n len(assets_details) == 
len(asset_keys),\n "asset_details and asset_keys must be the same length",\n )\n for i in range(len(assets_details)):\n asset_key, asset_details = asset_keys[i], assets_details[i]\n if asset_details and asset_details.last_wipe_timestamp: # type: ignore[attr-defined]\n asset_key_in_row = db.or_(\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(),\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(legacy=True),\n )\n # If asset key is in row, keep the row if the timestamp > wipe timestamp, else remove the row.\n # If asset key is not in row, keep the row.\n query = query.where(\n db.or_(\n db.and_(\n asset_key_in_row,\n SqlEventLogStorageTable.c.timestamp\n > datetime.utcfromtimestamp(asset_details.last_wipe_timestamp), # type: ignore[attr-defined]\n ),\n db.not_(asset_key_in_row),\n )\n )\n\n return query\n\n def get_asset_events(\n self,\n asset_key,\n partitions=None,\n before_cursor=None,\n after_cursor=None,\n limit=None,\n ascending=False,\n include_cursor=False, # deprecated\n before_timestamp=None,\n cursor=None, # deprecated\n ):\n check.inst_param(asset_key, "asset_key", AssetKey)\n check.opt_list_param(partitions, "partitions", of_type=str)\n before_cursor, after_cursor = extract_asset_events_cursor(\n cursor, before_cursor, after_cursor, ascending\n )\n event_records = self.get_event_records(\n EventRecordsFilter(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n asset_key=asset_key,\n asset_partitions=partitions,\n before_cursor=before_cursor,\n after_cursor=after_cursor,\n before_timestamp=before_timestamp,\n ),\n limit=limit,\n ascending=ascending,\n )\n if include_cursor:\n return [tuple([record.storage_id, record.event_log_entry]) for record in event_records]\n else:\n return [record.event_log_entry for record in event_records]\n\n def get_asset_run_ids(self, asset_key):\n check.inst_param(asset_key, "asset_key", AssetKey)\n query = (\n db.select(\n [SqlEventLogStorageTable.c.run_id, db.func.max(SqlEventLogStorageTable.c.timestamp)]\n )\n .where(\n db.or_(\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(),\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(legacy=True),\n )\n )\n .group_by(\n SqlEventLogStorageTable.c.run_id,\n )\n .order_by(db.func.max(SqlEventLogStorageTable.c.timestamp).desc())\n )\n\n asset_keys = [asset_key]\n asset_details = self._get_assets_details(asset_keys)\n query = self._add_assets_wipe_filter_to_query(query, asset_details, asset_keys)\n\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n return [run_id for (run_id, _timestamp) in results]\n\n def _asset_materialization_from_json_column(self, json_str):\n if not json_str:\n return None\n\n # We switched to storing the entire event record of the last materialization instead of just\n # the AssetMaterialization object, so that we have access to metadata like timestamp,\n # pipeline, run_id, etc.\n #\n # This should make certain asset queries way more performant, without having to do extra\n # queries against the event log.\n #\n # This should be accompanied by a schema change in 0.12.0, renaming `last_materialization`\n # to `last_materialization_event`, for clarity. 
For now, we should do some back-compat.\n #\n # https://github.com/dagster-io/dagster/issues/3945\n\n event_or_materialization = deserialize_json_to_dagster_namedtuple(json_str)\n if isinstance(event_or_materialization, AssetMaterialization):\n return event_or_materialization\n\n if (\n not isinstance(event_or_materialization, EventLogEntry)\n or not event_or_materialization.is_dagster_event\n or not event_or_materialization.dagster_event.asset_key\n ):\n return None\n\n return event_or_materialization.dagster_event.step_materialization_data.materialization\n\n def wipe_asset(self, asset_key):\n check.inst_param(asset_key, "asset_key", AssetKey)\n\n wipe_timestamp = pendulum.now("UTC").timestamp()\n\n if self.has_asset_key_index_cols():\n with self.index_connection() as conn:\n conn.execute(\n AssetKeyTable.update() # pylint: disable=no-value-for-parameter\n .where(\n db.or_(\n AssetKeyTable.c.asset_key == asset_key.to_string(),\n AssetKeyTable.c.asset_key == asset_key.to_string(legacy=True),\n )\n )\n .values(\n asset_details=serialize_dagster_namedtuple(\n AssetDetails(last_wipe_timestamp=wipe_timestamp)\n ),\n wipe_timestamp=utc_datetime_from_timestamp(wipe_timestamp),\n last_run_id=None,\n )\n )\n\n else:\n with self.index_connection() as conn:\n conn.execute(\n AssetKeyTable.update() # pylint: disable=no-value-for-parameter\n .where(\n db.or_(\n AssetKeyTable.c.asset_key == asset_key.to_string(),\n AssetKeyTable.c.asset_key == asset_key.to_string(legacy=True),\n )\n )\n .values(\n asset_details=serialize_dagster_namedtuple(\n AssetDetails(last_wipe_timestamp=wipe_timestamp)\n ),\n last_run_id=None,\n )\n )\n\n def get_materialization_count_by_partition(\n self, asset_keys: Sequence[AssetKey]\n ) -> Mapping[AssetKey, Mapping[str, int]]:\n check.list_param(asset_keys, "asset_keys", AssetKey)\n\n query = (\n db.select(\n [\n SqlEventLogStorageTable.c.asset_key,\n SqlEventLogStorageTable.c.partition,\n db.func.count(SqlEventLogStorageTable.c.id),\n ]\n )\n .where(\n db.and_(\n db.or_(\n SqlEventLogStorageTable.c.asset_key.in_(\n [asset_key.to_string() for asset_key in asset_keys]\n ),\n SqlEventLogStorageTable.c.asset_key.in_(\n [asset_key.to_string(legacy=True) for asset_key in asset_keys]\n ),\n ),\n SqlEventLogStorageTable.c.partition != None,\n )\n )\n .group_by(SqlEventLogStorageTable.c.asset_key, SqlEventLogStorageTable.c.partition)\n )\n\n assets_details = self._get_assets_details(asset_keys)\n query = self._add_assets_wipe_filter_to_query(query, assets_details, asset_keys)\n\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n materialization_count_by_partition: Dict[AssetKey, Dict[str, int]] = {\n asset_key: {} for asset_key in asset_keys\n }\n for row in results:\n asset_key = AssetKey.from_db_string(row[0])\n if asset_key:\n materialization_count_by_partition[asset_key][row[1]] = row[2]\n\n return materialization_count_by_partition
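\n\n\n# Illustrative usage sketch: a minimal example of how a caller might query the\n# storage above through a DagsterInstance, assuming $DAGSTER_HOME points at a\n# configured instance. The asset key below is hypothetical.\ndef _example_query_recent_materializations():\n    from dagster import AssetKey, DagsterEventType, DagsterInstance\n    from dagster.core.storage.event_log.base import EventRecordsFilter\n\n    instance = DagsterInstance.get()\n    records = instance.get_event_records(\n        EventRecordsFilter(\n            event_type=DagsterEventType.ASSET_MATERIALIZATION,\n            asset_key=AssetKey(["my", "asset"]),\n        ),\n        limit=10,\n    )\n    # Each EventLogRecord pairs a monotonically increasing storage id with the\n    # deserialized EventLogEntry, mirroring get_event_records above.\n    return [(record.storage_id, record.event_log_entry) for record in records]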
\n\n\ndef _get_from_row(row, column):\n """utility function for extracting a column from a sqlalchemy row proxy, since '_asdict' is not\n supported in sqlalchemy 1.3"""\n if not row.has_key(column):\n return None\n return row[column]\n
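\n\n# A rough equivalent for newer SQLAlchemy, shown for comparison (an assumption:\n# SQLAlchemy 1.4+, where rows expose a _mapping view instead of has_key):\ndef _get_from_row_mapping(row, column):\n    """Return a column value from a SQLAlchemy 1.4+ Row if present, else None."""\n    return row._mapping[column] if column in row._mapping else None\n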
", "current_page_name": "_modules/dagster/core/storage/event_log/sql_event_log", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.event_log.sql_event_log"}, "sqlite": {"consolidated_sqlite_event_log": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.event_log.sqlite.consolidated_sqlite_event_log

\nimport logging\nimport os\nfrom collections import defaultdict\nfrom contextlib import contextmanager\n\nfrom sqlalchemy.pool import NullPool\nfrom watchdog.events import PatternMatchingEventHandler\nfrom watchdog.observers import Observer\n\nimport dagster._check as check\nfrom dagster.config.source import StringSource\nfrom dagster.core.storage.event_log.base import EventLogCursor\nfrom dagster.core.storage.pipeline_run import PipelineRunStatus\nfrom dagster.core.storage.sql import (\n    check_alembic_revision,\n    create_engine,\n    get_alembic_config,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster.core.storage.sqlite import create_db_conn_string\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster.utils import mkdir_p\n\nfrom ..schema import SqlEventLogStorageMetadata\nfrom ..sql_event_log import SqlEventLogStorage\n\nSQLITE_EVENT_LOG_FILENAME = "event_log"\n\n\n
[docs]class ConsolidatedSqliteEventLogStorage(SqlEventLogStorage, ConfigurableClass):\n    """SQLite-backed consolidated event log storage intended for test cases only.\n\n    Users should not directly instantiate this class; it is instantiated by internal machinery when\n    ``dagit`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n    ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n    To explicitly specify the consolidated SQLite event log storage, you can add a block such as\n    the following to your ``dagster.yaml``:\n\n    .. code-block:: YAML\n\n        event_log_storage:\n          module: dagster.core.storage.event_log\n          class: ConsolidatedSqliteEventLogStorage\n          config:\n            base_dir: /path/to/dir\n\n    The ``base_dir`` param tells the event log storage where on disk to store the database.\n    """\n\n    def __init__(self, base_dir, inst_data=None):\n        self._base_dir = check.str_param(base_dir, "base_dir")\n        self._conn_string = create_db_conn_string(base_dir, SQLITE_EVENT_LOG_FILENAME)\n        self._secondary_index_cache = {}\n        self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n        self._watchers = defaultdict(dict)\n        self._obs = None\n\n        if not os.path.exists(self.get_db_path()):\n            self._init_db()\n\n        super().__init__()\n\n    @property\n    def inst_data(self):\n        return self._inst_data\n\n    @classmethod\n    def config_type(cls):\n        return {"base_dir": StringSource}\n\n    @staticmethod\n    def from_config_value(inst_data, config_value):\n        return ConsolidatedSqliteEventLogStorage(inst_data=inst_data, **config_value)\n\n    def _init_db(self):\n        mkdir_p(self._base_dir)\n        engine = create_engine(self._conn_string, poolclass=NullPool)\n        alembic_config = get_alembic_config(__file__)\n\n        should_mark_indexes = False\n        with engine.connect() as connection:\n            db_revision, head_revision = check_alembic_revision(alembic_config, connection)\n            if not (db_revision and head_revision):\n                SqlEventLogStorageMetadata.create_all(engine)\n                engine.execute("PRAGMA journal_mode=WAL;")\n                stamp_alembic_rev(alembic_config, connection)\n                should_mark_indexes = True\n\n        if should_mark_indexes:\n            # mark all secondary indexes\n            self.reindex_events()\n            self.reindex_assets()\n\n    @contextmanager\n    def _connect(self):\n        engine = create_engine(self._conn_string, poolclass=NullPool)\n        conn = engine.connect()\n        try:\n            yield conn\n        finally:\n            conn.close()\n\n    def run_connection(self, run_id):\n        return self._connect()\n\n    def index_connection(self):\n        return self._connect()\n\n    def get_db_path(self):\n        return os.path.join(self._base_dir, "{}.db".format(SQLITE_EVENT_LOG_FILENAME))\n\n    def upgrade(self):\n        alembic_config = get_alembic_config(__file__)\n        with self._connect() as conn:\n            run_alembic_upgrade(alembic_config, conn)\n\n    def has_secondary_index(self, name):\n        if name not in self._secondary_index_cache:\n            self._secondary_index_cache[name] = super(\n                ConsolidatedSqliteEventLogStorage, self\n            ).has_secondary_index(name)\n        return self._secondary_index_cache[name]\n\n    def enable_secondary_index(self, name):\n        super(ConsolidatedSqliteEventLogStorage, self).enable_secondary_index(name)\n        if name in self._secondary_index_cache:\n            del self._secondary_index_cache[name]\n\n    def watch(self, run_id, cursor, callback):\n        if not self._obs:\n            self._obs = Observer()\n            self._obs.start()\n            self._obs.schedule(\n                ConsolidatedSqliteEventLogStorageWatchdog(self), self._base_dir, True\n            )\n\n        self._watchers[run_id][callback] = cursor\n\n    def on_modified(self):\n        keys = [\n            (run_id, callback)\n            
for run_id, callback_dict in self._watchers.items()\n for callback, _ in callback_dict.items()\n ]\n for run_id, callback in keys:\n cursor = self._watchers[run_id][callback]\n\n # fetch events\n connection = self.get_records_for_run(run_id, cursor)\n\n # update cursor\n if connection.cursor:\n self._watchers[run_id][callback] = connection.cursor\n\n for record in connection.records:\n status = None\n try:\n status = callback(\n record.event_log_entry,\n str(EventLogCursor.from_storage_id(record.storage_id)),\n )\n except Exception:\n logging.exception("Exception in callback for event watch on run %s.", run_id)\n\n if (\n status == PipelineRunStatus.SUCCESS\n or status == PipelineRunStatus.FAILURE\n or status == PipelineRunStatus.CANCELED\n ):\n self.end_watch(run_id, callback)\n\n def end_watch(self, run_id, handler):\n if run_id in self._watchers and handler in self._watchers[run_id]:\n del self._watchers[run_id][handler]\n\n def dispose(self):\n if self._obs:\n self._obs.stop()\n self._obs.join(timeout=15)
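\n\n\n# Illustrative sketch of the callback contract implied by watch() and\n# on_modified() above: the storage invokes the callback with an EventLogEntry\n# and a string cursor, and stops watching once the callback returns a terminal\n# run status. The specific event-type check below is only an assumption for\n# demonstration purposes.\ndef _example_watch_callback(event_log_entry, cursor_str):\n    from dagster.core.events import DagsterEventType\n\n    print(f"event {event_log_entry.dagster_event_type} at cursor {cursor_str}")\n    if event_log_entry.dagster_event_type == DagsterEventType.PIPELINE_SUCCESS:\n        return PipelineRunStatus.SUCCESS\n    return None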
\n\n\nclass ConsolidatedSqliteEventLogStorageWatchdog(PatternMatchingEventHandler):\n def __init__(self, event_log_storage, **kwargs):\n self._event_log_storage = check.inst_param(\n event_log_storage, "event_log_storage", ConsolidatedSqliteEventLogStorage\n )\n self._log_path = event_log_storage.get_db_path()\n super(ConsolidatedSqliteEventLogStorageWatchdog, self).__init__(\n patterns=[self._log_path], **kwargs\n )\n\n def on_modified(self, event):\n check.invariant(event.src_path == self._log_path)\n self._event_log_storage.on_modified()\n
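\n\n# A minimal, test-style sketch: point the consolidated storage at a temporary\n# directory and exercise a couple of read APIs. No events are written here, so\n# the asset query returns an empty list.\ndef _example_consolidated_storage_smoke_test():\n    import tempfile\n\n    with tempfile.TemporaryDirectory() as tmpdir:\n        storage = ConsolidatedSqliteEventLogStorage(base_dir=tmpdir)\n        assert storage.is_persistent\n        assert storage.all_asset_keys() == []\n        storage.dispose()\n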
", "current_page_name": "_modules/dagster/core/storage/event_log/sqlite/consolidated_sqlite_event_log", "customsidebar": null, "parents": [{"link": "../../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.event_log.sqlite.consolidated_sqlite_event_log"}, "sqlite_event_log": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.event_log.sqlite.sqlite_event_log

\nimport glob\nimport logging\nimport os\nimport sqlite3\nimport threading\nimport time\nfrom collections import defaultdict\nfrom contextlib import contextmanager\nfrom typing import Iterable, Optional\n\nimport sqlalchemy as db\nfrom sqlalchemy.pool import NullPool\nfrom tqdm import tqdm\nfrom watchdog.events import PatternMatchingEventHandler\nfrom watchdog.observers import Observer\n\nimport dagster._check as check\nimport dagster.seven as seven\nfrom dagster.config.source import StringSource\nfrom dagster.core.events import DagsterEventType\nfrom dagster.core.events.log import EventLogEntry\nfrom dagster.core.storage.event_log.base import EventLogCursor, EventLogRecord, EventRecordsFilter\nfrom dagster.core.storage.pipeline_run import PipelineRunStatus, RunsFilter\nfrom dagster.core.storage.sql import (\n    check_alembic_revision,\n    create_engine,\n    get_alembic_config,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster.core.storage.sqlite import create_db_conn_string\nfrom dagster.serdes import (\n    ConfigurableClass,\n    ConfigurableClassData,\n    deserialize_json_to_dagster_namedtuple,\n)\nfrom dagster.utils import mkdir_p\n\nfrom ..schema import SqlEventLogStorageMetadata, SqlEventLogStorageTable\nfrom ..sql_event_log import RunShardedEventsCursor, SqlEventLogStorage\n\nINDEX_SHARD_NAME = "index"\n\n\n
[docs]class SqliteEventLogStorage(SqlEventLogStorage, ConfigurableClass):\n """SQLite-backed event log storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagit`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n This is the default event log storage when none is specified in the ``dagster.yaml``.\n\n To explicitly specify SQLite for event log storage, you can add a block such as the following\n to your ``dagster.yaml``:\n\n .. code-block:: YAML\n\n event_log_storage:\n module: dagster.core.storage.event_log\n class: SqliteEventLogStorage\n config:\n base_dir: /path/to/dir\n\n The ``base_dir`` param tells the event log storage where on disk to store the databases. To\n improve concurrent performance, event logs are stored in a separate SQLite database for each\n run.\n """\n\n def __init__(self, base_dir, inst_data=None):\n """Note that idempotent initialization of the SQLite database is done on a per-run_id\n basis in the body of connect, since each run is stored in a separate database."""\n self._base_dir = os.path.abspath(check.str_param(base_dir, "base_dir"))\n mkdir_p(self._base_dir)\n\n self._obs = None\n\n self._watchers = defaultdict(dict)\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n # Used to ensure that each run ID attempts to initialize its DB the first time it connects,\n # ensuring that the database will be created if it doesn't exist\n self._initialized_dbs = set()\n\n # Ensure that multiple threads (like the event log watcher) interact safely with each other\n self._db_lock = threading.Lock()\n\n if not os.path.exists(self.path_for_shard(INDEX_SHARD_NAME)):\n conn_string = self.conn_string_for_shard(INDEX_SHARD_NAME)\n engine = create_engine(conn_string, poolclass=NullPool)\n self._initdb(engine)\n self.reindex_events()\n self.reindex_assets()\n\n super().__init__()\n\n def upgrade(self):\n all_run_ids = self.get_all_run_ids()\n print( # pylint: disable=print-call\n f"Updating event log storage for {len(all_run_ids)} runs on disk..."\n )\n alembic_config = get_alembic_config(__file__)\n if all_run_ids:\n for run_id in tqdm(all_run_ids):\n with self.run_connection(run_id) as conn:\n run_alembic_upgrade(alembic_config, conn, run_id)\n\n print("Updating event log storage for index db on disk...") # pylint: disable=print-call\n with self.index_connection() as conn:\n run_alembic_upgrade(alembic_config, conn, "index")\n\n self._initialized_dbs = set()\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {"base_dir": StringSource}\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return SqliteEventLogStorage(inst_data=inst_data, **config_value)\n\n def get_all_run_ids(self):\n all_filenames = glob.glob(os.path.join(self._base_dir, "*.db"))\n return [\n os.path.splitext(os.path.basename(filename))[0]\n for filename in all_filenames\n if os.path.splitext(os.path.basename(filename))[0] != INDEX_SHARD_NAME\n ]\n\n def path_for_shard(self, run_id):\n return os.path.join(self._base_dir, "{run_id}.db".format(run_id=run_id))\n\n def conn_string_for_shard(self, shard_name):\n check.str_param(shard_name, "shard_name")\n return create_db_conn_string(self._base_dir, shard_name)\n\n def _initdb(self, engine):\n alembic_config = get_alembic_config(__file__)\n\n retry_limit = 10\n\n 
while True:\n try:\n\n with engine.connect() as connection:\n db_revision, head_revision = check_alembic_revision(alembic_config, connection)\n\n if not (db_revision and head_revision):\n SqlEventLogStorageMetadata.create_all(engine)\n engine.execute("PRAGMA journal_mode=WAL;")\n stamp_alembic_rev(alembic_config, connection)\n\n break\n except (db.exc.DatabaseError, sqlite3.DatabaseError, sqlite3.OperationalError) as exc:\n # This is SQLite-specific handling for concurrency issues that can arise when\n # multiple processes (e.g. the dagit process and user code process) contend with\n # each other to init the db. When we hit the following errors, we know that another\n # process is on the case and we should retry.\n err_msg = str(exc)\n\n if not (\n "table asset_keys already exists" in err_msg\n or "table secondary_indexes already exists" in err_msg\n or "table event_logs already exists" in err_msg\n or "database is locked" in err_msg\n or "table alembic_version already exists" in err_msg\n or "UNIQUE constraint failed: alembic_version.version_num" in err_msg\n ):\n raise\n\n if retry_limit == 0:\n raise\n else:\n logging.info(\n "SqliteEventLogStorage._initdb: Encountered apparent concurrent init, "\n "retrying (%s retries left). Exception: %s",\n retry_limit,\n err_msg,\n )\n time.sleep(0.2)\n retry_limit -= 1\n\n @contextmanager\n def _connect(self, shard):\n with self._db_lock:\n check.str_param(shard, "shard")\n\n conn_string = self.conn_string_for_shard(shard)\n engine = create_engine(conn_string, poolclass=NullPool)\n\n if not shard in self._initialized_dbs:\n self._initdb(engine)\n self._initialized_dbs.add(shard)\n\n conn = engine.connect()\n\n try:\n yield conn\n finally:\n conn.close()\n engine.dispose()\n\n def run_connection(self, run_id=None):\n return self._connect(run_id)\n\n def index_connection(self):\n return self._connect(INDEX_SHARD_NAME)\n\n def store_event(self, event):\n """\n Overridden method to replicate asset events in a central assets.db sqlite shard, enabling\n cross-run asset queries.\n\n Args:\n event (EventLogEntry): The event to store.\n """\n check.inst_param(event, "event", EventLogEntry)\n insert_event_statement = self.prepare_insert_event(event)\n run_id = event.run_id\n\n with self.run_connection(run_id) as conn:\n conn.execute(insert_event_statement)\n\n if event.is_dagster_event and event.dagster_event.asset_key:\n check.invariant(\n event.dagster_event_type == DagsterEventType.ASSET_MATERIALIZATION\n or event.dagster_event_type == DagsterEventType.ASSET_OBSERVATION\n or event.dagster_event_type == DagsterEventType.ASSET_MATERIALIZATION_PLANNED,\n "Can only store asset materializations, materialization_planned, and observations in index database",\n )\n\n # mirror the event in the cross-run index database\n with self.index_connection() as conn:\n conn.execute(insert_event_statement)\n\n if (\n event.dagster_event.is_step_materialization\n or event.dagster_event.is_asset_observation\n or event.dagster_event.is_asset_materialization_planned\n ):\n self.store_asset_event(event)\n\n def get_event_records(\n self,\n event_records_filter: Optional[EventRecordsFilter] = None,\n limit: Optional[int] = None,\n ascending: bool = False,\n ) -> Iterable[EventLogRecord]:\n """Overridden method to enable cross-run event queries in sqlite.\n\n The record id in sqlite does not auto increment cross runs, so instead of fetching events\n after record id, we only fetch events whose runs updated after update_timestamp.\n """\n check.opt_inst_param(event_records_filter, 
"event_records_filter", EventRecordsFilter)\n check.opt_int_param(limit, "limit")\n check.bool_param(ascending, "ascending")\n\n is_asset_query = event_records_filter and (\n event_records_filter.event_type == DagsterEventType.ASSET_MATERIALIZATION\n or event_records_filter.event_type == DagsterEventType.ASSET_OBSERVATION\n )\n if is_asset_query:\n # asset materializations and observations get mirrored into the index shard, so no\n # custom run shard-aware cursor logic needed\n return super(SqliteEventLogStorage, self).get_event_records(\n event_records_filter=event_records_filter, limit=limit, ascending=ascending\n )\n\n query = db.select([SqlEventLogStorageTable.c.id, SqlEventLogStorageTable.c.event])\n if event_records_filter and event_records_filter.asset_key:\n asset_details = next(iter(self._get_assets_details([event_records_filter.asset_key])))\n else:\n asset_details = None\n\n if (\n event_records_filter\n and event_records_filter.after_cursor != None\n and not isinstance(event_records_filter.after_cursor, RunShardedEventsCursor)\n ):\n raise Exception(\n """\n Called `get_event_records` on a run-sharded event log storage with a cursor that\n is not run-aware. Add a RunShardedEventsCursor to your query filter\n or switch your instance configuration to use a non-run-sharded event log storage\n (e.g. PostgresEventLogStorage, ConsolidatedSqliteEventLogStorage)\n """\n )\n\n query = self._apply_filter_to_query(\n query=query,\n event_records_filter=event_records_filter,\n asset_details=asset_details,\n apply_cursor_filters=False, # run-sharded cursor filters don't really make sense\n )\n if limit:\n query = query.limit(limit)\n if ascending:\n query = query.order_by(SqlEventLogStorageTable.c.timestamp.asc())\n else:\n query = query.order_by(SqlEventLogStorageTable.c.timestamp.desc())\n\n # workaround for the run-shard sqlite to enable cross-run queries: get a list of run_ids\n # whose events may qualify the query, and then open run_connection per run_id at a time.\n run_updated_after = (\n event_records_filter.after_cursor.run_updated_after\n if event_records_filter\n and isinstance(event_records_filter.after_cursor, RunShardedEventsCursor)\n else None\n )\n run_records = self._instance.get_run_records(\n filters=RunsFilter(updated_after=run_updated_after),\n order_by="update_timestamp",\n ascending=ascending,\n )\n\n event_records = []\n for run_record in run_records:\n run_id = run_record.pipeline_run.run_id\n with self.run_connection(run_id) as conn:\n results = conn.execute(query).fetchall()\n\n for row_id, json_str in results:\n try:\n event_record = deserialize_json_to_dagster_namedtuple(json_str)\n if not isinstance(event_record, EventLogEntry):\n logging.warning(\n "Could not resolve event record as EventLogEntry for id `%s`.", row_id\n )\n continue\n else:\n event_records.append(\n EventLogRecord(storage_id=row_id, event_log_entry=event_record)\n )\n if limit and len(event_records) >= limit:\n break\n except seven.JSONDecodeError:\n logging.warning("Could not parse event record id `%s`.", row_id)\n\n if limit and len(event_records) >= limit:\n break\n\n return event_records[:limit]\n\n def delete_events(self, run_id):\n with self.run_connection(run_id) as conn:\n self.delete_events_for_run(conn, run_id)\n\n # delete the mirrored event in the cross-run index database\n with self.index_connection() as conn:\n self.delete_events_for_run(conn, run_id)\n\n def wipe(self):\n # should delete all the run-sharded dbs as well as the index db\n for filename in (\n 
glob.glob(os.path.join(self._base_dir, "*.db"))\n + glob.glob(os.path.join(self._base_dir, "*.db-wal"))\n + glob.glob(os.path.join(self._base_dir, "*.db-shm"))\n ):\n os.unlink(filename)\n\n self._initialized_dbs = set()\n\n def _delete_mirrored_events_for_asset_key(self, asset_key):\n with self.index_connection() as conn:\n conn.execute(\n SqlEventLogStorageTable.delete().where( # pylint: disable=no-value-for-parameter\n db.or_(\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(),\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(legacy=True),\n )\n )\n )\n\n def wipe_asset(self, asset_key):\n # default implementation will update the event_logs in the sharded dbs, and the asset_key\n # table in the asset shard, but will not remove the mirrored event_log events in the asset\n # shard\n super(SqliteEventLogStorage, self).wipe_asset(asset_key)\n self._delete_mirrored_events_for_asset_key(asset_key)\n\n def watch(self, run_id, cursor, callback):\n if not self._obs:\n self._obs = Observer()\n self._obs.start()\n\n watchdog = SqliteEventLogStorageWatchdog(self, run_id, callback, cursor)\n self._watchers[run_id][callback] = (\n watchdog,\n self._obs.schedule(watchdog, self._base_dir, True),\n )\n\n def end_watch(self, run_id, handler):\n if handler in self._watchers[run_id]:\n event_handler, watch = self._watchers[run_id][handler]\n self._obs.remove_handler_for_watch(event_handler, watch)\n del self._watchers[run_id][handler]\n\n def dispose(self):\n if self._obs:\n self._obs.stop()\n self._obs.join(timeout=15)\n\n def alembic_version(self):\n alembic_config = get_alembic_config(__file__)\n with self.index_connection() as conn:\n return check_alembic_revision(alembic_config, conn)
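\n\n\n# Illustrative sketch of a cross-run query against this run-sharded storage.\n# As enforced in get_event_records above, a plain integer after_cursor is\n# rejected here; callers pass a RunShardedEventsCursor instead. The instance\n# argument and datetime are hypothetical, and the cursor keyword names mirror\n# the attribute accesses in get_event_records above.\ndef _example_cross_run_query(instance, updated_after_datetime):\n    records_filter = EventRecordsFilter(\n        event_type=DagsterEventType.STEP_SUCCESS,\n        after_cursor=RunShardedEventsCursor(id=0, run_updated_after=updated_after_datetime),\n    )\n    return instance.get_event_records(records_filter, limit=25, ascending=True)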
\n\n\nclass SqliteEventLogStorageWatchdog(PatternMatchingEventHandler):\n def __init__(self, event_log_storage, run_id, callback, cursor, **kwargs):\n self._event_log_storage = check.inst_param(\n event_log_storage, "event_log_storage", SqliteEventLogStorage\n )\n self._run_id = check.str_param(run_id, "run_id")\n self._cb = check.callable_param(callback, "callback")\n self._log_path = event_log_storage.path_for_shard(run_id)\n self._cursor = cursor\n super(SqliteEventLogStorageWatchdog, self).__init__(patterns=[self._log_path], **kwargs)\n\n def _process_log(self):\n connection = self._event_log_storage.get_records_for_run(self._run_id, self._cursor)\n if connection.cursor:\n self._cursor = connection.cursor\n for record in connection.records:\n status = None\n try:\n status = self._cb(\n record.event_log_entry, str(EventLogCursor.from_storage_id(record.storage_id))\n )\n except Exception:\n logging.exception("Exception in callback for event watch on run %s.", self._run_id)\n\n if (\n status == PipelineRunStatus.SUCCESS\n or status == PipelineRunStatus.FAILURE\n or status == PipelineRunStatus.CANCELED\n ):\n self._event_log_storage.end_watch(self._run_id, self._cb)\n\n def on_modified(self, event):\n check.invariant(event.src_path == self._log_path)\n self._process_log()\n
", "current_page_name": "_modules/dagster/core/storage/event_log/sqlite/sqlite_event_log", "customsidebar": null, "parents": [{"link": "../../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.event_log.sqlite.sqlite_event_log"}}}, "file_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.file_manager

\nimport io\nimport os\nimport shutil\nimport uuid\nfrom abc import ABC, abstractmethod\nfrom contextlib import contextmanager\nfrom typing import BinaryIO, Optional, TextIO, Union\n\nimport dagster._check as check\nfrom dagster.config import Field\nfrom dagster.config.source import StringSource\nfrom dagster.core.definitions.resource_definition import resource\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.utils import mkdir_p\n\nfrom .temp_file_manager import TempfileManager\n\n\n# pylint: disable=no-init\n
[docs]class FileHandle(ABC):\n """A reference to a file as manipulated by a FileManager\n\n Subclasses may handle files that are resident on the local file system, in an object store, or\n in any arbitrary place where a file can be stored.\n\n This exists to handle the very common case where you wish to write a computation that reads,\n transforms, and writes files, but where you also want the same code to work in local development\n as well as on a cluster where the files will be stored in a globally available object store\n such as S3.\n """\n\n @property\n @abstractmethod\n def path_desc(self) -> str:\n """A representation of the file path for display purposes only."""\n raise NotImplementedError()
\n\n\n
[docs]class LocalFileHandle(FileHandle):\n """A reference to a file on a local filesystem."""\n\n def __init__(self, path: str):\n self._path = check.str_param(path, "path")\n\n @property\n def path(self) -> str:\n """The file's path."""\n return self._path\n\n @property\n def path_desc(self) -> str:\n """A representation of the file path for display purposes only."""\n return self._path
\n\n\n
[docs]class FileManager(ABC): # pylint: disable=no-init\n """Base class for all file managers in dagster.\n\n The file manager is an interface that can be implemented by resources to provide abstract\n access to a file system such as local disk, S3, or other cloud storage.\n\n For examples of usage, see the documentation of the concrete file manager implementations.\n """\n\n
[docs] @abstractmethod\n def copy_handle_to_local_temp(self, file_handle: FileHandle) -> str:\n """Copy a file represented by a file handle to a temp file.\n\n In an implementation built around an object store such as S3, this method would be expected\n to download the file from S3 to local filesystem in a location assigned by the standard\n library's :py:mod:`python:tempfile` module.\n\n Temp files returned by this method are *not* guaranteed to be reusable across solid\n boundaries. For files that must be available across solid boundaries, use the\n :py:meth:`~dagster.core.storage.file_manager.FileManager.read`,\n :py:meth:`~dagster.core.storage.file_manager.FileManager.read_data`,\n :py:meth:`~dagster.core.storage.file_manager.FileManager.write`, and\n :py:meth:`~dagster.core.storage.file_manager.FileManager.write_data` methods.\n\n Args:\n file_handle (FileHandle): The handle to the file to make available as a local temp file.\n\n Returns:\n str: Path to the local temp file.\n """\n raise NotImplementedError()
\n\n
[docs] @abstractmethod\n def delete_local_temp(self):\n """Delete all local temporary files created by previous calls to\n :py:meth:`~dagster.core.storage.file_manager.FileManager.copy_handle_to_local_temp`.\n\n Should typically only be called by framework implementors.\n """\n raise NotImplementedError()
\n\n
[docs] @abstractmethod\n def read(self, file_handle: FileHandle, mode: str = "rb") -> Union[TextIO, BinaryIO]:\n """Return a file-like stream for the file handle.\n\n This may incur an expensive network call for file managers backed by object stores\n such as S3.\n\n Args:\n file_handle (FileHandle): The file handle to make available as a stream.\n mode (str): The mode in which to open the file. Default: ``"rb"``.\n\n Returns:\n Union[TextIO, BinaryIO]: A file-like stream.\n """\n raise NotImplementedError()
\n\n
[docs] @abstractmethod\n def read_data(self, file_handle: FileHandle) -> bytes:\n """Return the bytes for a given file handle. This may incur an expensive network\n call for file managers backed by object stores such as s3.\n\n Args:\n file_handle (FileHandle): The file handle for which to return bytes.\n\n Returns:\n bytes: Bytes for a given file handle.\n """\n raise NotImplementedError()
\n\n
[docs]    @abstractmethod\n    def write(\n        self, file_obj: Union[TextIO, BinaryIO], mode: str = "wb", ext: Optional[str] = None\n    ) -> FileHandle:\n        """Write the bytes contained within the given file object into the file manager.\n\n        Args:\n            file_obj (Union[TextIO, BinaryIO]): A file-like object.\n            mode (Optional[str]): The mode in which to write the file into the file manager.\n                Default: ``"wb"``.\n            ext (Optional[str]): For file managers that support file extensions, the extension with\n                which to write the file. Default: ``None``.\n\n        Returns:\n            FileHandle: A handle to the newly created file.\n        """\n        raise NotImplementedError()
\n\n
[docs] @abstractmethod\n def write_data(self, data: bytes, ext: Optional[str] = None) -> FileHandle:\n """Write raw bytes into the file manager.\n\n Args:\n data (bytes): The bytes to write into the file manager.\n ext (Optional[str]): For file managers that support file extensions, the extension with\n which to write the file. Default: ``None``.\n\n Returns:\n FileHandle: A handle to the newly created file.\n """\n raise NotImplementedError()
\n\n\n
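\n# A compact, hypothetical in-memory FileManager, sketched to show how the\n# abstract methods above fit together; it is illustrative only and keeps blobs\n# in a dict rather than persisting them anywhere.\nclass _InMemoryFileHandle(FileHandle):\n    def __init__(self, key: str):\n        self._key = key\n\n    @property\n    def path_desc(self) -> str:\n        return "memory://" + self._key\n\n\nclass _InMemoryFileManager(FileManager):\n    def __init__(self):\n        self._blobs = {}\n        self._temp_file_manager = TempfileManager()\n\n    def copy_handle_to_local_temp(self, file_handle):\n        temp_file_obj = self._temp_file_manager.tempfile()\n        temp_file_obj.write(self._blobs[file_handle.path_desc])\n        temp_file_obj.flush()\n        return temp_file_obj.name\n\n    def delete_local_temp(self):\n        self._temp_file_manager.close()\n\n    def read(self, file_handle, mode="rb"):\n        data = self._blobs[file_handle.path_desc]\n        return io.BytesIO(data) if "b" in mode else io.StringIO(data.decode("utf8"))\n\n    def read_data(self, file_handle):\n        return self._blobs[file_handle.path_desc]\n\n    def write(self, file_obj, mode="wb", ext=None):\n        data = file_obj.read()\n        return self.write_data(data if isinstance(data, bytes) else data.encode("utf8"), ext=ext)\n\n    def write_data(self, data, ext=None):\n        handle = _InMemoryFileHandle(str(uuid.uuid4()) + (("." + ext) if ext else ""))\n        self._blobs[handle.path_desc] = data\n        return handle\n\n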
[docs]@resource(config_schema={"base_dir": Field(StringSource, is_required=False)})\ndef local_file_manager(init_context):\n    """FileManager that provides abstract access to a local filesystem.\n\n    By default, files will be stored in `<local_artifact_storage>/storage/file_manager` where\n    `<local_artifact_storage>` can be configured in the ``dagster.yaml`` file in ``$DAGSTER_HOME``.\n\n    Implements the :py:class:`~dagster.core.storage.file_manager.FileManager` API.\n\n    Examples:\n\n    .. code-block:: python\n\n        import tempfile\n\n        from dagster import ModeDefinition, local_file_manager, pipeline, solid\n\n\n        @solid(required_resource_keys={"file_manager"})\n        def write_files(context):\n            fh_1 = context.resources.file_manager.write_data(b"foo")\n\n            with tempfile.NamedTemporaryFile("w+") as fd:\n                fd.write("bar")\n                fd.seek(0)\n                fh_2 = context.resources.file_manager.write(fd, mode="w", ext=".txt")\n\n            return (fh_1, fh_2)\n\n\n        @solid(required_resource_keys={"file_manager"})\n        def read_files(context, file_handles):\n            fh_1, fh_2 = file_handles\n            assert context.resources.file_manager.read_data(fh_2) == b"bar"\n            fd = context.resources.file_manager.read(fh_1, mode="r")\n            assert fd.read() == "foo"\n            fd.close()\n\n\n        @pipeline(mode_defs=[ModeDefinition(resource_defs={"file_manager": local_file_manager})])\n        def files_pipeline():\n            read_files(write_files())\n\n    Or to specify the file directory:\n\n    .. code-block:: python\n\n        @pipeline(\n            mode_defs=[\n                ModeDefinition(\n                    resource_defs={\n                        "file_manager": local_file_manager.configured({"base_dir": "/my/base/dir"})\n                    }\n                )\n            ]\n        )\n        def files_pipeline():\n            read_files(write_files())\n\n    """\n\n    return LocalFileManager(\n        base_dir=init_context.resource_config.get(\n            "base_dir", os.path.join(init_context.instance.storage_directory(), "file_manager")\n        )\n    )
\n\n\ndef check_file_like_obj(obj):\n check.invariant(obj and hasattr(obj, "read") and hasattr(obj, "write"))\n\n\nclass LocalFileManager(FileManager):\n def __init__(self, base_dir):\n self.base_dir = base_dir\n self._base_dir_ensured = False\n self._temp_file_manager = TempfileManager()\n\n @staticmethod\n def for_instance(instance, run_id):\n check.inst_param(instance, "instance", DagsterInstance)\n return LocalFileManager(instance.file_manager_directory(run_id))\n\n def ensure_base_dir_exists(self):\n if self._base_dir_ensured:\n return\n\n mkdir_p(self.base_dir)\n\n self._base_dir_ensured = True\n\n def copy_handle_to_local_temp(self, file_handle):\n check.inst_param(file_handle, "file_handle", FileHandle)\n with self.read(file_handle, "rb") as handle_obj:\n temp_file_obj = self._temp_file_manager.tempfile()\n temp_file_obj.write(handle_obj.read())\n temp_name = temp_file_obj.name\n temp_file_obj.close()\n return temp_name\n\n @contextmanager\n def read(self, file_handle, mode="rb"):\n check.inst_param(file_handle, "file_handle", LocalFileHandle)\n check.str_param(mode, "mode")\n check.param_invariant(mode in {"r", "rb"}, "mode")\n\n encoding = None if mode == "rb" else "utf8"\n with open(file_handle.path, mode, encoding=encoding) as file_obj:\n yield file_obj\n\n def read_data(self, file_handle):\n with self.read(file_handle, mode="rb") as file_obj:\n return file_obj.read()\n\n def write_data(self, data, ext=None):\n check.inst_param(data, "data", bytes)\n return self.write(io.BytesIO(data), mode="wb", ext=ext)\n\n def write(self, file_obj, mode="wb", ext=None):\n check_file_like_obj(file_obj)\n check.opt_str_param(ext, "ext")\n\n self.ensure_base_dir_exists()\n\n dest_file_path = os.path.join(\n self.base_dir, str(uuid.uuid4()) + (("." + ext) if ext is not None else "")\n )\n\n encoding = None if "b" in mode else "utf8"\n with open(dest_file_path, mode, encoding=encoding) as dest_file_obj:\n shutil.copyfileobj(file_obj, dest_file_obj)\n return LocalFileHandle(dest_file_path)\n\n def delete_local_temp(self):\n self._temp_file_manager.close()\n
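\n\n# A short, illustrative round trip through LocalFileManager outside of any\n# pipeline; the temporary directory below stands in for a configured base_dir.\ndef _example_local_file_manager_round_trip():\n    import tempfile\n\n    with tempfile.TemporaryDirectory() as tmpdir:\n        manager = LocalFileManager(base_dir=tmpdir)\n        handle = manager.write_data(b"hello")\n        assert manager.read_data(handle) == b"hello"\n        with manager.read(handle, mode="rb") as stream:\n            assert stream.read() == b"hello"\n        manager.delete_local_temp()\n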
", "current_page_name": "_modules/dagster/core/storage/file_manager", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.file_manager"}, "fs_io_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.fs_io_manager

\nimport os\nimport pickle\nfrom typing import Union\n\nimport dagster._check as check\nfrom dagster.config import Field\nfrom dagster.config.source import StringSource\nfrom dagster.core.definitions.events import AssetKey, AssetMaterialization\nfrom dagster.core.definitions.metadata import MetadataEntry, MetadataValue\nfrom dagster.core.errors import DagsterInvariantViolationError\nfrom dagster.core.execution.context.input import InputContext\nfrom dagster.core.execution.context.output import OutputContext\nfrom dagster.core.storage.io_manager import IOManager, io_manager\nfrom dagster.core.storage.memoizable_io_manager import MemoizableIOManager\nfrom dagster.utils import PICKLE_PROTOCOL, mkdir_p\nfrom dagster.utils.backcompat import experimental\n\n\n
[docs]@io_manager(config_schema={"base_dir": Field(StringSource, is_required=False)})\ndef fs_io_manager(init_context):\n """Built-in filesystem IO manager that stores and retrieves values using pickling.\n\n Allows users to specify a base directory where all the step outputs will be stored. By\n default, step outputs will be stored in the directory specified by local_artifact_storage in\n your dagster.yaml file (which will be a temporary directory if not explicitly set).\n\n Serializes and deserializes output values using pickling and automatically constructs\n the filepaths for ops and assets.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at "<base_dir>/<asset_key>". If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n So, with a base directory of "/my/base/path", an asset with key\n `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory\n with path "/my/base/path/one/two/".\n\n Example usage:\n\n 1. Specify a job-level IO manager using the reserved resource key ``"io_manager"``,\n which will set the given IO manager on all ops in a job.\n\n .. code-block:: python\n\n from dagster import fs_io_manager, job, op\n\n @op\n def op_a():\n # create df ...\n return df\n\n @op\n def op_b(df):\n return df[:5]\n\n @job(\n resource_defs={\n "io_manager": fs_io_manager.configured({"base_dir": "/my/base/path"})\n }\n )\n def job():\n op_b(op_a())\n\n\n 2. Specify IO manager on :py:class:`Out`, which allows the user to set different IO managers on\n different step outputs.\n\n .. code-block:: python\n\n from dagster import fs_io_manager, job, op, Out\n\n @op(out=Out(io_manager_key="my_io_manager"))\n def op_a():\n # create df ...\n return df\n\n @op\n def op_b(df):\n return df[:5]\n\n @job(resource_defs={"my_io_manager": fs_io_manager})\n def job():\n op_b(op_a())\n\n """\n base_dir = init_context.resource_config.get(\n "base_dir", init_context.instance.storage_directory()\n )\n\n return PickledObjectFilesystemIOManager(base_dir=base_dir)
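\n\n\n# A small sketch of the on-disk layout described in the fs_io_manager docstring\n# above: op outputs are keyed by run ID, step key, and output name, while assets\n# map to a stable path built from their key. The argument names here are\n# hypothetical.\ndef _example_fs_io_manager_paths(base_dir, run_id, step_key, output_name, asset_key_path):\n    op_output_path = os.path.join(base_dir, run_id, step_key, output_name)\n    asset_path = os.path.join(base_dir, *asset_key_path)\n    return op_output_path, asset_path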
\n\n\nclass PickledObjectFilesystemIOManager(MemoizableIOManager):\n """Built-in filesystem IO manager that stores and retrieves values using pickling.\n\n Args:\n base_dir (Optional[str]): base directory where all the step outputs which use this object\n manager will be stored in.\n """\n\n def __init__(self, base_dir=None):\n self.base_dir = check.opt_str_param(base_dir, "base_dir")\n self.write_mode = "wb"\n self.read_mode = "rb"\n\n def _get_path(self, context: Union[InputContext, OutputContext]) -> str:\n """Automatically construct filepath."""\n if context.has_asset_key:\n path = context.get_asset_identifier()\n else:\n path = context.get_identifier()\n\n return os.path.join(self.base_dir, *path)\n\n def has_output(self, context):\n filepath = self._get_path(context)\n\n return os.path.exists(filepath)\n\n def handle_output(self, context, obj):\n """Pickle the data and store the object to a file.\n\n This method omits the AssetMaterialization event so assets generated by it won't be tracked\n by the Asset Catalog.\n """\n check.inst_param(context, "context", OutputContext)\n\n filepath = self._get_path(context)\n\n # Ensure path exists\n mkdir_p(os.path.dirname(filepath))\n\n with open(filepath, self.write_mode) as write_obj:\n try:\n pickle.dump(obj, write_obj, PICKLE_PROTOCOL)\n except (AttributeError, RecursionError, ImportError, pickle.PicklingError) as e:\n executor = context.step_context.pipeline_def.mode_definitions[0].executor_defs[0]\n\n if isinstance(e, RecursionError):\n # if obj can't be pickled because of RecursionError then __str__() will also\n # throw a RecursionError\n obj_repr = f"{obj.__class__} exceeds recursion limit and"\n else:\n obj_repr = obj.__str__()\n\n raise DagsterInvariantViolationError(\n f"Object {obj_repr} is not picklable. You are currently using the "\n f"fs_io_manager and the {executor.name}. You will need to use a different "\n "io manager to continue using this output. 
For example, you can use the "\n "mem_io_manager with the in_process_executor.\\n"\n "For more information on io managers, visit "\n "https://docs.dagster.io/concepts/io-management/io-managers \\n"\n "For more information on executors, visit "\n "https://docs.dagster.io/deployment/executors#overview"\n ) from e\n\n context.add_output_metadata({"path": MetadataValue.path(os.path.abspath(filepath))})\n\n def load_input(self, context):\n """Unpickle the file and load it into a data object."""\n check.inst_param(context, "context", InputContext)\n\n filepath = self._get_path(context)\n context.add_input_metadata({"path": MetadataValue.path(os.path.abspath(filepath))})\n\n with open(filepath, self.read_mode) as read_obj:\n return pickle.load(read_obj)\n\n\nclass CustomPathPickledObjectFilesystemIOManager(IOManager):\n """Built-in filesystem IO manager that stores and retrieves values using pickling and\n allows users to specify a file path for each output.\n\n Args:\n base_dir (Optional[str]): base directory where all the step outputs which use this IO\n manager will be stored.\n """\n\n def __init__(self, base_dir=None):\n self.base_dir = check.opt_str_param(base_dir, "base_dir")\n self.write_mode = "wb"\n self.read_mode = "rb"\n\n def _get_path(self, path):\n return os.path.join(self.base_dir, path)\n\n def handle_output(self, context, obj):\n """Pickle the data and store the object to a custom file path.\n\n This method emits an AssetMaterialization event so the assets will be tracked by the\n Asset Catalog.\n """\n check.inst_param(context, "context", OutputContext)\n metadata = context.metadata\n path = check.str_param(metadata.get("path"), "metadata.path")\n\n filepath = self._get_path(path)\n\n # Ensure path exists\n mkdir_p(os.path.dirname(filepath))\n context.log.debug(f"Writing file at: {filepath}")\n\n with open(filepath, self.write_mode) as write_obj:\n pickle.dump(obj, write_obj, PICKLE_PROTOCOL)\n\n return AssetMaterialization(\n asset_key=AssetKey([context.pipeline_name, context.step_key, context.name]),\n metadata_entries=[\n MetadataEntry("path", value=MetadataValue.path(os.path.abspath(filepath)))\n ],\n )\n\n def load_input(self, context):\n """Unpickle the file from a given file path and load it into a data object."""\n check.inst_param(context, "context", InputContext)\n metadata = context.upstream_output.metadata\n path = check.str_param(metadata.get("path"), "metadata.path")\n filepath = self._get_path(path)\n context.log.debug(f"Loading file from: {filepath}")\n\n with open(filepath, self.read_mode) as read_obj:\n return pickle.load(read_obj)\n\n\n
[docs]@io_manager(config_schema={"base_dir": Field(StringSource, is_required=True)})\n@experimental\ndef custom_path_fs_io_manager(init_context):\n """Built-in IO manager that allows users to customize the output file path per output definition.\n\n It requires users to specify a base directory where all the step outputs will be stored. It\n serializes and deserializes output values (assets) using pickling and stores the pickled object\n in the user-provided file paths.\n\n Example usage:\n\n .. code-block:: python\n\n from dagster import Out, custom_path_fs_io_manager, job, op\n\n @op(out=Out(metadata={"path": "path/to/sample_output"}))\n def sample_data(df):\n return df[:5]\n\n my_custom_path_fs_io_manager = custom_path_fs_io_manager.configured(\n {"base_dir": "path/to/basedir"}\n )\n\n @job(resource_defs={"io_manager": my_custom_path_fs_io_manager})\n def my_job():\n sample_data()\n\n """\n\n return CustomPathPickledObjectFilesystemIOManager(\n base_dir=init_context.resource_config.get("base_dir")\n )
\n
", "current_page_name": "_modules/dagster/core/storage/fs_io_manager", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.fs_io_manager"}, "io_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.io_manager

\nfrom abc import abstractmethod\nfrom functools import update_wrapper\nfrom typing import Optional, Set\n\nimport dagster._check as check\nfrom dagster.core.definitions.config import is_callable_valid_config_arg\nfrom dagster.core.definitions.definition_config_schema import (\n    convert_user_facing_definition_config_schema,\n)\nfrom dagster.core.definitions.events import AssetKey\nfrom dagster.core.definitions.resource_definition import ResourceDefinition\nfrom dagster.core.storage.input_manager import InputManager\nfrom dagster.core.storage.output_manager import IOutputManagerDefinition, OutputManager\nfrom dagster.core.storage.root_input_manager import IInputManagerDefinition\n\n\n
[docs]class IOManagerDefinition(ResourceDefinition, IInputManagerDefinition, IOutputManagerDefinition):\n """Definition of an IO manager resource.\n\n IOManagers are used to store op outputs and load them as inputs to downstream ops.\n\n An IOManagerDefinition is a :py:class:`ResourceDefinition` whose `resource_fn` returns an\n :py:class:`IOManager`.\n\n The easiest way to create an IOManagerDefinition is with the :py:func:`@io_manager <io_manager>`\n decorator.\n """\n\n def __init__(\n self,\n resource_fn=None,\n config_schema=None,\n description=None,\n required_resource_keys=None,\n version=None,\n input_config_schema=None,\n output_config_schema=None,\n ):\n self._input_config_schema = convert_user_facing_definition_config_schema(\n input_config_schema\n )\n # Unlike other configurable objects, whose config schemas default to Any, output_config_schema\n # defaults to None. This is because IOManager input / output config shares config\n # namespace with dagster type loaders and materializers. The absence of provided\n # output_config_schema means that we should fall back to using the materializer that\n # corresponds to the output dagster type.\n self._output_config_schema = (\n convert_user_facing_definition_config_schema(output_config_schema)\n if output_config_schema is not None\n else None\n )\n super(IOManagerDefinition, self).__init__(\n resource_fn=resource_fn,\n config_schema=config_schema,\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n )\n\n @property\n def input_config_schema(self):\n return self._input_config_schema\n\n @property\n def output_config_schema(self):\n return self._output_config_schema\n\n def copy_for_configured(self, description, config_schema, _):\n return IOManagerDefinition(\n config_schema=config_schema,\n description=description or self.description,\n resource_fn=self.resource_fn,\n required_resource_keys=self.required_resource_keys,\n input_config_schema=self.input_config_schema,\n output_config_schema=self.output_config_schema,\n )\n\n
[docs] @staticmethod\n def hardcoded_io_manager(value, description=None):\n """A helper function that creates an ``IOManagerDefinition`` with a hardcoded IOManager.\n\n Args:\n value (Any): A hardcoded IO Manager which helps mock the definition.\n description (Optional[str]): The description of the IO Manager. Defaults to None.\n\n Returns:\n IOManagerDefinition: A resource definition that returns the hardcoded IO Manager.\n """\n check.inst_param(value, "value", IOManager)\n return IOManagerDefinition(resource_fn=lambda _init_context: value, description=description)
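A hedged sketch of using ``hardcoded_io_manager`` to swap a stub IO manager into a job under test. ``StubIOManager``, the ops, and the job name are all illustrative; the only API relied on is the static method above plus the context ``get_identifier`` helpers shown elsewhere in this module.

.. code-block:: python

    from dagster import IOManager, IOManagerDefinition, job, op


    class StubIOManager(IOManager):
        # Hypothetical stand-in that keeps outputs in a dict instead of on disk.
        def __init__(self):
            self.values = {}

        def handle_output(self, context, obj):
            self.values[tuple(context.get_identifier())] = obj

        def load_input(self, context):
            return self.values[tuple(context.get_identifier())]


    @op
    def emit_number():
        return 1


    @op
    def add_one(x):
        return x + 1


    @job(
        resource_defs={
            "io_manager": IOManagerDefinition.hardcoded_io_manager(StubIOManager())
        }
    )
    def my_test_job():
        add_one(emit_number())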
\n\n\n
[docs]class IOManager(InputManager, OutputManager):\n """\n Base class for user-provided IO managers.\n\n IOManagers are used to store op outputs and load them as inputs to downstream ops.\n\n Extend this class to handle how objects are loaded and stored. Users should implement\n ``handle_output`` to store an object and ``load_input`` to retrieve an object.\n """\n\n
[docs] @abstractmethod\n def load_input(self, context):\n """User-defined method that loads an input to an op.\n\n Args:\n context (InputContext): The input context, which describes the input that's being loaded\n and the upstream output that's being loaded from.\n\n Returns:\n Any: The data object.\n """
\n\n
[docs] @abstractmethod\n def handle_output(self, context, obj):\n """User-defined method that stores an output of an op.\n\n Args:\n context (OutputContext): The context of the step output that produces this object.\n obj (Any): The object, returned by the op, to be stored.\n """
\n\n
[docs] def get_output_asset_key(self, _context) -> Optional[AssetKey]:\n """User-defined method that associates outputs handled by this IOManager with a particular\n AssetKey.\n\n Args:\n context (OutputContext): The context of the step output that produces this object.\n """\n return None
\n\n
[docs] def get_output_asset_partitions(self, _context) -> Set[str]:\n """User-defined method that associates outputs handled by this IOManager with a set of\n partitions of an AssetKey.\n\n Args:\n context (OutputContext): The context of the step output that produces this object.\n """\n return set()
\n\n
[docs] def get_input_asset_key(self, context) -> Optional[AssetKey]:\n """User-defined method that associates inputs loaded by this IOManager with a particular\n AssetKey.\n\n Args:\n context (InputContext): The input context, which describes the input that's being loaded\n and the upstream output that's being loaded from.\n """\n return self.get_output_asset_key(context.upstream_output)
\n\n
[docs] def get_input_asset_partitions(self, context) -> Set[str]:\n """User-defined method that associates inputs loaded by this IOManager with a set of\n partitions of an AssetKey.\n\n Args:\n context (InputContext): The input context, which describes the input that's being loaded\n and the upstream output that's being loaded from.\n """\n return self.get_output_asset_partitions(context.upstream_output)
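A hedged sketch of overriding ``get_output_asset_key`` so that outputs handled by a custom IO manager are associated with asset keys. The class, the ``"example"`` prefix, and the in-memory storage are illustrative choices, not part of the API above.

.. code-block:: python

    from dagster import AssetKey, IOManager, io_manager


    class AssetAwareIOManager(IOManager):
        # Illustrative only: keep outputs in memory and tag each one with an AssetKey
        # derived from the step key and output name.
        def __init__(self):
            self._values = {}

        def handle_output(self, context, obj):
            self._values[tuple(context.get_identifier())] = obj

        def load_input(self, context):
            return self._values[tuple(context.get_identifier())]

        def get_output_asset_key(self, context):
            # Associate every output with an asset under an "example" prefix.
            return AssetKey(["example", context.step_key, context.name])


    @io_manager
    def asset_aware_io_manager(_):
        return AssetAwareIOManager()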
\n\n\n
[docs]def io_manager(\n config_schema=None,\n description=None,\n output_config_schema=None,\n input_config_schema=None,\n required_resource_keys=None,\n version=None,\n):\n """\n Define an IO manager.\n\n IOManagers are used to store op outputs and load them as inputs to downstream ops.\n\n The decorated function should accept an :py:class:`InitResourceContext` and return an\n :py:class:`IOManager`.\n\n Args:\n config_schema (Optional[ConfigSchema]): The schema for the resource config. Configuration\n data available in `init_context.resource_config`. If not set, Dagster will accept any\n config provided.\n description(Optional[str]): A human-readable description of the resource.\n output_config_schema (Optional[ConfigSchema]): The schema for per-output config. If not set,\n no per-output configuration will be allowed.\n input_config_schema (Optional[ConfigSchema]): The schema for per-input config. If not set,\n Dagster will accept any config provided.\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by the object\n manager.\n version (Optional[str]): (Experimental) The version of a resource function. Two wrapped\n resource functions should only have the same version if they produce the same resource\n definition when provided with the same inputs.\n\n **Examples:**\n\n .. code-block:: python\n\n class MyIOManager(IOManager):\n def handle_output(self, context, obj):\n write_csv("some/path")\n\n def load_input(self, context):\n return read_csv("some/path")\n\n @io_manager\n def my_io_manager(init_context):\n return MyIOManager()\n\n @op(out=Out(io_manager_key="my_io_manager_key"))\n def my_op(_):\n return do_stuff()\n\n @job(resource_defs={"my_io_manager_key": my_io_manager})\n def my_job():\n my_op()\n\n """\n if callable(config_schema) and not is_callable_valid_config_arg(config_schema):\n return _IOManagerDecoratorCallable()(config_schema)\n\n def _wrap(resource_fn):\n return _IOManagerDecoratorCallable(\n config_schema=config_schema,\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n output_config_schema=output_config_schema,\n input_config_schema=input_config_schema,\n )(resource_fn)\n\n return _wrap
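The decorator above also accepts a resource-level ``config_schema``. A minimal sketch, assuming a hypothetical ``prefix`` option, of how validated config reaches the decorated function through ``init_context.resource_config`` and how ``.configured`` bakes values in:

.. code-block:: python

    from dagster import Field, IOManager, StringSource, io_manager


    class PrefixedIOManager(IOManager):
        # Hypothetical manager that namespaces in-memory values under a configured prefix.
        def __init__(self, prefix):
            self.prefix = prefix
            self._values = {}

        def handle_output(self, context, obj):
            self._values[(self.prefix, *context.get_identifier())] = obj

        def load_input(self, context):
            return self._values[(self.prefix, *context.get_identifier())]


    @io_manager(config_schema={"prefix": Field(StringSource, default_value="dev")})
    def prefixed_io_manager(init_context):
        # Config validated against config_schema is available on the init context.
        return PrefixedIOManager(prefix=init_context.resource_config["prefix"])


    # .configured() returns a new definition with the config value baked in.
    prod_io_manager = prefixed_io_manager.configured({"prefix": "prod"})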
\n\n\nclass _IOManagerDecoratorCallable:\n def __init__(\n self,\n config_schema=None,\n description=None,\n required_resource_keys=None,\n version=None,\n output_config_schema=None,\n input_config_schema=None,\n ):\n # type validation happens in IOManagerDefinition\n self.config_schema = config_schema\n self.description = description\n self.required_resource_keys = required_resource_keys\n self.version = version\n self.output_config_schema = output_config_schema\n self.input_config_schema = input_config_schema\n\n def __call__(self, fn):\n check.callable_param(fn, "fn")\n\n io_manager_def = IOManagerDefinition(\n resource_fn=fn,\n config_schema=self.config_schema,\n description=self.description,\n required_resource_keys=self.required_resource_keys,\n version=self.version,\n output_config_schema=self.output_config_schema,\n input_config_schema=self.input_config_schema,\n )\n\n update_wrapper(io_manager_def, wrapped=fn)\n\n return io_manager_def\n
", "current_page_name": "_modules/dagster/core/storage/io_manager", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.io_manager"}, "local_compute_log_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.local_compute_log_manager

\nimport hashlib\nimport os\nimport sys\nfrom collections import defaultdict\nfrom contextlib import contextmanager\n\nfrom watchdog.events import PatternMatchingEventHandler\nfrom watchdog.observers.polling import PollingObserver\n\nfrom dagster import Field, Float, StringSource\nfrom dagster import _check as check\nfrom dagster.core.execution.compute_logs import mirror_stream_to_file\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster.utils import ensure_dir, touch_file\n\nfrom .compute_log_manager import (\n    MAX_BYTES_FILE_READ,\n    ComputeIOType,\n    ComputeLogFileData,\n    ComputeLogManager,\n    ComputeLogSubscription,\n)\n\nDEFAULT_WATCHDOG_POLLING_TIMEOUT = 2.5\n\nIO_TYPE_EXTENSION = {ComputeIOType.STDOUT: "out", ComputeIOType.STDERR: "err"}\n\nMAX_FILENAME_LENGTH = 255\n\n\n
[docs]class LocalComputeLogManager(ComputeLogManager, ConfigurableClass):\n """Stores copies of stdout & stderr for each compute step locally on disk."""\n\n def __init__(self, base_dir, polling_timeout=None, inst_data=None):\n self._base_dir = base_dir\n self._polling_timeout = check.opt_float_param(\n polling_timeout, "polling_timeout", DEFAULT_WATCHDOG_POLLING_TIMEOUT\n )\n self._subscription_manager = LocalComputeLogSubscriptionManager(self)\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @contextmanager\n def _watch_logs(self, pipeline_run, step_key=None):\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n check.opt_str_param(step_key, "step_key")\n\n key = self.get_key(pipeline_run, step_key)\n outpath = self.get_local_path(pipeline_run.run_id, key, ComputeIOType.STDOUT)\n errpath = self.get_local_path(pipeline_run.run_id, key, ComputeIOType.STDERR)\n with mirror_stream_to_file(sys.stdout, outpath):\n with mirror_stream_to_file(sys.stderr, errpath):\n yield\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @property\n def polling_timeout(self):\n return self._polling_timeout\n\n @classmethod\n def config_type(cls):\n return {\n "base_dir": StringSource,\n "polling_timeout": Field(Float, is_required=False),\n }\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return LocalComputeLogManager(inst_data=inst_data, **config_value)\n\n def _run_directory(self, run_id):\n return os.path.join(self._base_dir, run_id, "compute_logs")\n\n def get_local_path(self, run_id, key, io_type):\n check.inst_param(io_type, "io_type", ComputeIOType)\n return self._get_local_path(run_id, key, IO_TYPE_EXTENSION[io_type])\n\n def complete_artifact_path(self, run_id, key):\n return self._get_local_path(run_id, key, "complete")\n\n def _get_local_path(self, run_id, key, extension):\n filename = "{}.{}".format(key, extension)\n if len(filename) > MAX_FILENAME_LENGTH:\n filename = "{}.{}".format(hashlib.md5(key.encode("utf-8")).hexdigest(), extension)\n return os.path.join(self._run_directory(run_id), filename)\n\n def read_logs_file(self, run_id, key, io_type, cursor=0, max_bytes=MAX_BYTES_FILE_READ):\n path = self.get_local_path(run_id, key, io_type)\n\n if not os.path.exists(path) or not os.path.isfile(path):\n return ComputeLogFileData(path=path, data=None, cursor=0, size=0, download_url=None)\n\n # See: https://docs.python.org/2/library/stdtypes.html#file.tell for Windows behavior\n with open(path, "rb") as f:\n f.seek(cursor, os.SEEK_SET)\n data = f.read(max_bytes)\n cursor = f.tell()\n stats = os.fstat(f.fileno())\n\n # local download path\n download_url = self.download_url(run_id, key, io_type)\n return ComputeLogFileData(\n path=path,\n data=data.decode("utf-8"),\n cursor=cursor,\n size=stats.st_size,\n download_url=download_url,\n )\n\n def is_watch_completed(self, run_id, key):\n return os.path.exists(self.complete_artifact_path(run_id, key))\n\n def on_watch_start(self, pipeline_run, step_key):\n pass\n\n def get_key(self, pipeline_run, step_key):\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n check.opt_str_param(step_key, "step_key")\n return step_key or pipeline_run.pipeline_name\n\n def on_watch_finish(self, pipeline_run, step_key=None):\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n check.opt_str_param(step_key, "step_key")\n key = self.get_key(pipeline_run, step_key)\n touchpath = self.complete_artifact_path(pipeline_run.run_id, key)\n touch_file(touchpath)\n\n def 
download_url(self, run_id, key, io_type):\n check.inst_param(io_type, "io_type", ComputeIOType)\n return "/download/{}/{}/{}".format(run_id, key, io_type.value)\n\n def on_subscribe(self, subscription):\n self._subscription_manager.add_subscription(subscription)\n\n def on_unsubscribe(self, subscription):\n self._subscription_manager.remove_subscription(subscription)\n\n def dispose(self):\n self._subscription_manager.dispose()
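A hedged sketch of reading captured stdout back through ``read_logs_file`` with its cursor. The base directory, run id, and step key are placeholders; in practice the manager is constructed from the ``compute_logs`` section of ``dagster.yaml`` rather than by hand.

.. code-block:: python

    from dagster.core.storage.compute_log_manager import ComputeIOType
    from dagster.core.storage.local_compute_log_manager import LocalComputeLogManager

    # Illustrative values only.
    manager = LocalComputeLogManager(base_dir="/tmp/dagster/compute_logs_example")
    run_id = "11111111-1111-1111-1111-111111111111"  # placeholder run id
    step_key = "my_op"  # placeholder step key

    # Read stdout for the step from the beginning of the file; data is None if the
    # log file has not been created yet.
    chunk = manager.read_logs_file(run_id, step_key, ComputeIOType.STDOUT, cursor=0)
    print(chunk.data)

    # The returned cursor can be passed back in to continue from where the last read stopped.
    next_chunk = manager.read_logs_file(run_id, step_key, ComputeIOType.STDOUT, cursor=chunk.cursor)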
\n\n\nclass LocalComputeLogSubscriptionManager:\n def __init__(self, manager):\n self._manager = manager\n self._subscriptions = defaultdict(list)\n self._watchers = {}\n self._observer = None\n\n def _watch_key(self, run_id, key):\n return "{}:{}".format(run_id, key)\n\n def add_subscription(self, subscription):\n check.inst_param(subscription, "subscription", ComputeLogSubscription)\n if self._manager.is_watch_completed(subscription.run_id, subscription.key):\n subscription.fetch()\n subscription.complete()\n else:\n watch_key = self._watch_key(subscription.run_id, subscription.key)\n self._subscriptions[watch_key].append(subscription)\n self.watch(subscription.run_id, subscription.key)\n\n def remove_subscription(self, subscription):\n check.inst_param(subscription, "subscription", ComputeLogSubscription)\n watch_key = self._watch_key(subscription.run_id, subscription.key)\n if subscription in self._subscriptions[watch_key]:\n self._subscriptions[watch_key].remove(subscription)\n subscription.complete()\n\n def remove_all_subscriptions(self, run_id, step_key):\n watch_key = self._watch_key(run_id, step_key)\n for subscription in self._subscriptions.pop(watch_key, []):\n subscription.complete()\n\n def watch(self, run_id, step_key):\n watch_key = self._watch_key(run_id, step_key)\n if watch_key in self._watchers:\n return\n\n update_paths = [\n self._manager.get_local_path(run_id, step_key, ComputeIOType.STDOUT),\n self._manager.get_local_path(run_id, step_key, ComputeIOType.STDERR),\n ]\n complete_paths = [self._manager.complete_artifact_path(run_id, step_key)]\n directory = os.path.dirname(\n self._manager.get_local_path(run_id, step_key, ComputeIOType.STDERR)\n )\n\n if not self._observer:\n self._observer = PollingObserver(self._manager.polling_timeout)\n self._observer.start()\n\n ensure_dir(directory)\n\n self._watchers[watch_key] = self._observer.schedule(\n LocalComputeLogFilesystemEventHandler(\n self, run_id, step_key, update_paths, complete_paths\n ),\n str(directory),\n )\n\n def notify_subscriptions(self, run_id, step_key):\n watch_key = self._watch_key(run_id, step_key)\n for subscription in self._subscriptions[watch_key]:\n subscription.fetch()\n\n def unwatch(self, run_id, step_key, handler):\n watch_key = self._watch_key(run_id, step_key)\n if watch_key in self._watchers:\n self._observer.remove_handler_for_watch(handler, self._watchers[watch_key])\n del self._watchers[watch_key]\n\n def dispose(self):\n if self._observer:\n self._observer.stop()\n self._observer.join(15)\n\n\nclass LocalComputeLogFilesystemEventHandler(PatternMatchingEventHandler):\n def __init__(self, manager, run_id, key, update_paths, complete_paths):\n self.manager = manager\n self.run_id = run_id\n self.key = key\n self.update_paths = update_paths\n self.complete_paths = complete_paths\n patterns = update_paths + complete_paths\n super(LocalComputeLogFilesystemEventHandler, self).__init__(patterns=patterns)\n\n def on_created(self, event):\n if event.src_path in self.complete_paths:\n self.manager.remove_all_subscriptions(self.run_id, self.key)\n self.manager.unwatch(self.run_id, self.key, self)\n\n def on_modified(self, event):\n if event.src_path in self.update_paths:\n self.manager.notify_subscriptions(self.run_id, self.key)\n
", "current_page_name": "_modules/dagster/core/storage/local_compute_log_manager", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.local_compute_log_manager"}, "mem_io_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.mem_io_manager

\nfrom dagster.core.storage.io_manager import IOManager, io_manager\n\n\nclass InMemoryIOManager(IOManager):\n    def __init__(self):\n        self.values = {}\n\n    def handle_output(self, context, obj):\n        keys = tuple(context.get_identifier())\n        self.values[keys] = obj\n\n    def load_input(self, context):\n        keys = tuple(context.get_identifier())\n        return self.values[keys]\n\n\n
[docs]@io_manager\ndef mem_io_manager(_):\n """Built-in IO manager that stores and retrieves values in memory."""\n\n return InMemoryIOManager()
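A short usage sketch for the in-memory manager above. The ops and job name are illustrative; because values never leave process memory, this pairing is most useful with in-process execution and in tests where nothing should be written to disk.

.. code-block:: python

    from dagster import job, mem_io_manager, op


    @op
    def return_one():
        return 1


    @op
    def plus_one(x):
        return x + 1


    # Intermediate values are kept in memory for the duration of the run.
    @job(resource_defs={"io_manager": mem_io_manager})
    def in_memory_job():
        plus_one(return_one())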
\n
", "current_page_name": "_modules/dagster/core/storage/mem_io_manager", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.mem_io_manager"}, "memoizable_io_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.memoizable_io_manager

\nimport os\nimport pickle\nfrom abc import abstractmethod\nfrom typing import Union\n\nimport dagster._check as check\nfrom dagster.config import Field\nfrom dagster.config.source import StringSource\nfrom dagster.core.errors import DagsterInvariantViolationError\nfrom dagster.core.execution.context.input import InputContext\nfrom dagster.core.execution.context.output import OutputContext\nfrom dagster.core.storage.io_manager import IOManager, io_manager\nfrom dagster.utils import PICKLE_PROTOCOL, mkdir_p\nfrom dagster.utils.backcompat import experimental\n\n\n
[docs]class MemoizableIOManager(IOManager):\n """\n Base class for IO managers that support memoized execution. Users should implement\n the ``load_input`` and ``handle_output`` methods described in the ``IOManager`` API, and the\n ``has_output`` method, which returns a boolean representing whether a data object can be found.\n """\n\n
[docs] @abstractmethod\n def has_output(self, context: OutputContext) -> bool:\n """The user-defined method that returns whether data exists given the metadata.\n\n Args:\n context (OutputContext): The context of the step performing this check.\n\n Returns:\n bool: True if there is data present that matches the provided context. False otherwise.\n """
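A hedged sketch of a minimal ``MemoizableIOManager``. The class name and on-disk layout are illustrative (they loosely mirror the versioned filesystem manager defined below); only the three methods required by the base class are implemented.

.. code-block:: python

    import os
    import pickle

    from dagster.core.storage.memoizable_io_manager import MemoizableIOManager


    class SketchVersionedIOManager(MemoizableIOManager):
        # Illustrative only: key each output by step key, output name, and version.
        def __init__(self, base_dir):
            self._base_dir = base_dir

        def _path(self, context):
            return os.path.join(self._base_dir, context.step_key, context.name, context.version)

        def handle_output(self, context, obj):
            path = self._path(context)
            os.makedirs(os.path.dirname(path), exist_ok=True)
            with open(path, "wb") as f:
                pickle.dump(obj, f)

        def load_input(self, context):
            # Inputs are read from the path of the upstream output that produced them.
            with open(self._path(context.upstream_output), "rb") as f:
                return pickle.load(f)

        def has_output(self, context):
            # Memoized execution skips the step when this returns True.
            return os.path.exists(self._path(context))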
\n\n\nclass VersionedPickledObjectFilesystemIOManager(MemoizableIOManager):\n def __init__(self, base_dir=None):\n self.base_dir = check.opt_str_param(base_dir, "base_dir")\n self.write_mode = "wb"\n self.read_mode = "rb"\n\n def _get_path(self, context: Union[InputContext, OutputContext]) -> str:\n output_context: OutputContext\n\n if isinstance(context, OutputContext):\n output_context = context\n else:\n if context.upstream_output is None:\n raise DagsterInvariantViolationError(\n "Missing value of InputContext.upstream_output. "\n "Cannot compute the input path."\n )\n\n output_context = context.upstream_output\n\n # automatically construct filepath\n step_key = check.str_param(output_context.step_key, "context.step_key")\n output_name = check.str_param(output_context.name, "context.name")\n version = check.str_param(output_context.version, "context.version")\n\n return os.path.join(self.base_dir, step_key, output_name, version)\n\n def handle_output(self, context, obj):\n """Pickle the data with the associated version, and store the object to a file.\n\n This method omits the AssetMaterialization event so assets generated by it won't be tracked\n by the Asset Catalog.\n """\n\n filepath = self._get_path(context)\n\n context.log.debug(f"Writing file at: {filepath}")\n\n # Ensure path exists\n mkdir_p(os.path.dirname(filepath))\n\n with open(filepath, self.write_mode) as write_obj:\n pickle.dump(obj, write_obj, PICKLE_PROTOCOL)\n\n def load_input(self, context):\n """Unpickle the file and Load it to a data object."""\n\n filepath = self._get_path(context)\n\n context.log.debug(f"Loading file from: {filepath}")\n\n with open(filepath, self.read_mode) as read_obj:\n return pickle.load(read_obj)\n\n def has_output(self, context):\n """Returns true if data object exists with the associated version, False otherwise."""\n\n filepath = self._get_path(context)\n\n context.log.debug(f"Checking for file at: {filepath}")\n\n return os.path.exists(filepath) and not os.path.isdir(filepath)\n\n\n@io_manager(config_schema={"base_dir": Field(StringSource, is_required=False)})\n@experimental\ndef versioned_filesystem_io_manager(init_context):\n """Filesystem IO manager that utilizes versioning of stored objects.\n\n It requires users to specify a base directory where all the step outputs will be stored in. It\n serializes and deserializes output values (assets) using pickling and automatically constructs\n the filepaths for the assets using the provided directory, and the version for a provided step\n output.\n """\n return VersionedPickledObjectFilesystemIOManager(\n base_dir=init_context.resource_config.get(\n "base_dir", os.path.join(init_context.instance.storage_directory(), "versioned_outputs")\n )\n )\n
", "current_page_name": "_modules/dagster/core/storage/memoizable_io_manager", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.memoizable_io_manager"}, "pipeline_run": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.pipeline_run

\nimport warnings\nfrom datetime import datetime\nfrom enum import Enum\nfrom typing import TYPE_CHECKING, Any, Dict, FrozenSet, List, Mapping, NamedTuple, Optional, Type\n\nimport dagster._check as check\nfrom dagster.core.definitions.events import AssetKey\nfrom dagster.core.origin import PipelinePythonOrigin\nfrom dagster.core.storage.tags import PARENT_RUN_ID_TAG, ROOT_RUN_ID_TAG\nfrom dagster.core.utils import make_new_run_id\nfrom dagster.serdes.serdes import (\n    DefaultNamedTupleSerializer,\n    EnumSerializer,\n    WhitelistMap,\n    register_serdes_enum_fallbacks,\n    register_serdes_tuple_fallbacks,\n    replace_storage_keys,\n    unpack_inner_value,\n    whitelist_for_serdes,\n)\n\nfrom .tags import (\n    BACKFILL_ID_TAG,\n    PARTITION_NAME_TAG,\n    PARTITION_SET_TAG,\n    REPOSITORY_LABEL_TAG,\n    RESUME_RETRY_TAG,\n    SCHEDULE_NAME_TAG,\n    SENSOR_NAME_TAG,\n)\n\nif TYPE_CHECKING:\n    from dagster.core.host_representation.origin import ExternalPipelineOrigin\n\n\nclass DagsterRunStatusSerializer(EnumSerializer):\n    @classmethod\n    def value_from_storage_str(cls, storage_str: str, klass: Type) -> Enum:\n        return getattr(klass, storage_str)\n\n    @classmethod\n    def value_to_storage_str(\n        cls, value: Enum, whitelist_map: WhitelistMap, descent_path: str\n    ) -> str:\n        enum_value = value.value\n        # Store DagsterRunStatus with backcompat name PipelineRunStatus\n        backcompat_name = "PipelineRunStatus"\n        return ".".join([backcompat_name, enum_value])\n\n\n
[docs]@whitelist_for_serdes(serializer=DagsterRunStatusSerializer)\nclass DagsterRunStatus(Enum):\n """The status of pipeline execution."""\n\n QUEUED = "QUEUED"\n NOT_STARTED = "NOT_STARTED"\n MANAGED = "MANAGED"\n STARTING = "STARTING"\n STARTED = "STARTED"\n SUCCESS = "SUCCESS"\n FAILURE = "FAILURE"\n CANCELING = "CANCELING"\n CANCELED = "CANCELED"
\n\n\nPipelineRunStatus = DagsterRunStatus\nregister_serdes_enum_fallbacks({"PipelineRunStatus": DagsterRunStatus})\n\n# These statuses that indicate a run may be using compute resources\nIN_PROGRESS_RUN_STATUSES = [\n PipelineRunStatus.STARTING,\n PipelineRunStatus.STARTED,\n PipelineRunStatus.CANCELING,\n]\n\n# This serves as an explicit list of run statuses that indicate that the run is not using compute\n# resources. This and the enum above should cover all run statuses.\nNON_IN_PROGRESS_RUN_STATUSES = [\n PipelineRunStatus.QUEUED,\n PipelineRunStatus.NOT_STARTED,\n PipelineRunStatus.SUCCESS,\n PipelineRunStatus.FAILURE,\n PipelineRunStatus.MANAGED,\n PipelineRunStatus.CANCELED,\n]\n\n\n@whitelist_for_serdes\nclass PipelineRunStatsSnapshot(\n NamedTuple(\n "_PipelineRunStatsSnapshot",\n [\n ("run_id", str),\n ("steps_succeeded", int),\n ("steps_failed", int),\n ("materializations", int),\n ("expectations", int),\n ("enqueued_time", Optional[float]),\n ("launch_time", Optional[float]),\n ("start_time", Optional[float]),\n ("end_time", Optional[float]),\n ],\n )\n):\n def __new__(\n cls,\n run_id: str,\n steps_succeeded: int,\n steps_failed: int,\n materializations: int,\n expectations: int,\n enqueued_time: Optional[float],\n launch_time: Optional[float],\n start_time: Optional[float],\n end_time: Optional[float],\n ):\n return super(PipelineRunStatsSnapshot, cls).__new__(\n cls,\n run_id=check.str_param(run_id, "run_id"),\n steps_succeeded=check.int_param(steps_succeeded, "steps_succeeded"),\n steps_failed=check.int_param(steps_failed, "steps_failed"),\n materializations=check.int_param(materializations, "materializations"),\n expectations=check.int_param(expectations, "expectations"),\n enqueued_time=check.opt_float_param(enqueued_time, "enqueued_time"),\n launch_time=check.opt_float_param(launch_time, "launch_time"),\n start_time=check.opt_float_param(start_time, "start_time"),\n end_time=check.opt_float_param(end_time, "end_time"),\n )\n\n\nclass DagsterRunSerializer(DefaultNamedTupleSerializer):\n @classmethod\n def value_from_storage_dict(\n cls,\n storage_dict,\n klass,\n args_for_class,\n whitelist_map,\n descent_path,\n ):\n # unpack all stored fields\n unpacked_dict = {\n key: unpack_inner_value(value, whitelist_map, f"{descent_path}.{key}")\n for key, value in storage_dict.items()\n }\n # called by the serdes layer, delegates to helper method with expanded kwargs\n return pipeline_run_from_storage(**unpacked_dict)\n\n @classmethod\n def value_to_storage_dict(\n cls,\n value: NamedTuple,\n whitelist_map: WhitelistMap,\n descent_path: str,\n ) -> Dict[str, Any]:\n storage = super().value_to_storage_dict(\n value,\n whitelist_map,\n descent_path,\n )\n # persist using legacy name PipelineRun\n storage["__class__"] = "PipelineRun"\n return storage\n\n\ndef pipeline_run_from_storage(\n pipeline_name=None,\n run_id=None,\n run_config=None,\n mode=None,\n asset_selection=None,\n solid_selection=None,\n solids_to_execute=None,\n step_keys_to_execute=None,\n status=None,\n tags=None,\n root_run_id=None,\n parent_run_id=None,\n pipeline_snapshot_id=None,\n execution_plan_snapshot_id=None,\n # backcompat\n environment_dict=None,\n previous_run_id=None,\n selector=None,\n solid_subset=None,\n reexecution_config=None, # pylint: disable=unused-argument\n external_pipeline_origin=None,\n pipeline_code_origin=None,\n **kwargs,\n):\n\n # serdes log\n # * removed reexecution_config - serdes logic expected to strip unknown keys so no need to preserve\n # * added pipeline_snapshot_id\n # * 
renamed previous_run_id -> parent_run_id, added root_run_id\n # * added execution_plan_snapshot_id\n # * removed selector\n # * added solid_subset\n # * renamed solid_subset -> solid_selection, added solids_to_execute\n # * renamed environment_dict -> run_config\n # * added asset_selection\n\n # back compat for environment dict => run_config\n if environment_dict:\n check.invariant(\n not run_config,\n "Cannot set both run_config and environment_dict. Use run_config parameter.",\n )\n run_config = environment_dict\n\n # back compat for previous_run_id => parent_run_id, root_run_id\n if previous_run_id and not (parent_run_id and root_run_id):\n parent_run_id = previous_run_id\n root_run_id = previous_run_id\n\n # back compat for selector => pipeline_name, solids_to_execute\n selector = check.opt_inst_param(selector, "selector", ExecutionSelector)\n if selector:\n check.invariant(\n pipeline_name is None or selector.name == pipeline_name,\n (\n "Conflicting pipeline name {pipeline_name} in arguments to PipelineRun: "\n "selector was passed with pipeline {selector_pipeline}".format(\n pipeline_name=pipeline_name, selector_pipeline=selector.name\n )\n ),\n )\n if pipeline_name is None:\n pipeline_name = selector.name\n\n check.invariant(\n solids_to_execute is None or set(selector.solid_subset) == solids_to_execute,\n (\n "Conflicting solids_to_execute {solids_to_execute} in arguments to PipelineRun: "\n "selector was passed with subset {selector_subset}".format(\n solids_to_execute=solids_to_execute, selector_subset=selector.solid_subset\n )\n ),\n )\n # for old runs that only have selector but no solids_to_execute\n if solids_to_execute is None:\n solids_to_execute = frozenset(selector.solid_subset) if selector.solid_subset else None\n\n # back compat for solid_subset => solids_to_execute\n check.opt_list_param(solid_subset, "solid_subset", of_type=str)\n if solid_subset:\n solids_to_execute = frozenset(solid_subset)\n\n # warn about unused arguments\n if len(kwargs):\n warnings.warn(\n "Found unhandled arguments from stored PipelineRun: {args}".format(args=kwargs.keys())\n )\n\n return DagsterRun( # pylint: disable=redundant-keyword-arg\n pipeline_name=pipeline_name,\n run_id=run_id,\n run_config=run_config,\n mode=mode,\n asset_selection=asset_selection,\n solid_selection=solid_selection,\n solids_to_execute=solids_to_execute,\n step_keys_to_execute=step_keys_to_execute,\n status=status,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n pipeline_snapshot_id=pipeline_snapshot_id,\n execution_plan_snapshot_id=execution_plan_snapshot_id,\n external_pipeline_origin=external_pipeline_origin,\n pipeline_code_origin=pipeline_code_origin,\n )\n\n\n
[docs]class PipelineRun(\n NamedTuple(\n "_PipelineRun",\n [\n ("pipeline_name", str),\n ("run_id", str),\n ("run_config", Mapping[str, object]),\n ("mode", Optional[str]),\n ("asset_selection", Optional[FrozenSet[AssetKey]]),\n ("solid_selection", Optional[List[str]]),\n ("solids_to_execute", Optional[FrozenSet[str]]),\n ("step_keys_to_execute", Optional[List[str]]),\n ("status", PipelineRunStatus),\n ("tags", Dict[str, str]),\n ("root_run_id", Optional[str]),\n ("parent_run_id", Optional[str]),\n ("pipeline_snapshot_id", Optional[str]),\n ("execution_plan_snapshot_id", Optional[str]),\n ("external_pipeline_origin", Optional["ExternalPipelineOrigin"]),\n ("pipeline_code_origin", Optional[PipelinePythonOrigin]),\n ],\n )\n):\n """Serializable internal representation of a pipeline run, as stored in a\n :py:class:`~dagster.core.storage.runs.RunStorage`.\n """\n\n def __new__(\n cls,\n pipeline_name: str,\n run_id: Optional[str] = None,\n run_config: Optional[Mapping[str, object]] = None,\n mode: Optional[str] = None,\n asset_selection: Optional[FrozenSet[AssetKey]] = None,\n solid_selection: Optional[List[str]] = None,\n solids_to_execute: Optional[FrozenSet[str]] = None,\n step_keys_to_execute: Optional[List[str]] = None,\n status: Optional[PipelineRunStatus] = None,\n tags: Optional[Dict[str, str]] = None,\n root_run_id: Optional[str] = None,\n parent_run_id: Optional[str] = None,\n pipeline_snapshot_id: Optional[str] = None,\n execution_plan_snapshot_id: Optional[str] = None,\n external_pipeline_origin: Optional["ExternalPipelineOrigin"] = None,\n pipeline_code_origin: Optional[PipelinePythonOrigin] = None,\n ):\n check.invariant(\n (root_run_id is not None and parent_run_id is not None)\n or (root_run_id is None and parent_run_id is None),\n (\n "Must set both root_run_id and parent_run_id when creating a PipelineRun that "\n "belongs to a run group"\n ),\n )\n # a frozenset which contains the names of the solids to execute\n solids_to_execute = check.opt_nullable_set_param(\n solids_to_execute, "solids_to_execute", of_type=str\n )\n # a list of solid queries provided by the user\n # possible to be None when only solids_to_execute is set by the user directly\n solid_selection = check.opt_nullable_list_param(\n solid_selection, "solid_selection", of_type=str\n )\n check.opt_nullable_list_param(step_keys_to_execute, "step_keys_to_execute", of_type=str)\n\n asset_selection = check.opt_nullable_set_param(\n asset_selection, "asset_selection", of_type=AssetKey\n )\n\n # Placing this with the other imports causes a cyclic import\n # https://github.com/dagster-io/dagster/issues/3181\n from dagster.core.host_representation.origin import ExternalPipelineOrigin\n\n if status == PipelineRunStatus.QUEUED:\n check.inst_param(\n external_pipeline_origin,\n "external_pipeline_origin",\n ExternalPipelineOrigin,\n "external_pipeline_origin is required for queued runs",\n )\n\n if run_id is None:\n run_id = make_new_run_id()\n\n return super(PipelineRun, cls).__new__(\n cls,\n pipeline_name=check.str_param(pipeline_name, "pipeline_name"),\n run_id=check.str_param(run_id, "run_id"),\n run_config=check.opt_mapping_param(run_config, "run_config", key_type=str),\n mode=check.opt_str_param(mode, "mode"),\n solid_selection=solid_selection,\n asset_selection=asset_selection,\n solids_to_execute=solids_to_execute,\n step_keys_to_execute=step_keys_to_execute,\n status=check.opt_inst_param(\n status, "status", PipelineRunStatus, PipelineRunStatus.NOT_STARTED\n ),\n tags=check.opt_dict_param(tags, "tags", 
key_type=str, value_type=str),\n root_run_id=check.opt_str_param(root_run_id, "root_run_id"),\n parent_run_id=check.opt_str_param(parent_run_id, "parent_run_id"),\n pipeline_snapshot_id=check.opt_str_param(pipeline_snapshot_id, "pipeline_snapshot_id"),\n execution_plan_snapshot_id=check.opt_str_param(\n execution_plan_snapshot_id, "execution_plan_snapshot_id"\n ),\n external_pipeline_origin=check.opt_inst_param(\n external_pipeline_origin, "external_pipeline_origin", ExternalPipelineOrigin\n ),\n pipeline_code_origin=check.opt_inst_param(\n pipeline_code_origin, "pipeline_code_origin", PipelinePythonOrigin\n ),\n )\n\n def with_status(self, status):\n if status == PipelineRunStatus.QUEUED:\n # Placing this with the other imports causes a cyclic import\n # https://github.com/dagster-io/dagster/issues/3181\n from dagster.core.host_representation.origin import ExternalPipelineOrigin\n\n check.inst(\n self.external_pipeline_origin,\n ExternalPipelineOrigin,\n "external_pipeline_origin is required for queued runs",\n )\n\n return self._replace(status=status)\n\n def with_mode(self, mode):\n return self._replace(mode=mode)\n\n def with_tags(self, tags):\n return self._replace(tags=tags)\n\n def get_root_run_id(self):\n return self.tags.get(ROOT_RUN_ID_TAG)\n\n def get_parent_run_id(self):\n return self.tags.get(PARENT_RUN_ID_TAG)\n\n def tags_for_storage(self):\n repository_tags = {}\n if self.external_pipeline_origin:\n # tag the run with a label containing the repository name / location name, to allow for\n # per-repository filtering of runs from dagit.\n repository_tags[\n REPOSITORY_LABEL_TAG\n ] = self.external_pipeline_origin.external_repository_origin.get_label()\n\n if not self.tags:\n return repository_tags\n\n return {**repository_tags, **self.tags}\n\n @property\n def is_finished(self):\n return (\n self.status == PipelineRunStatus.SUCCESS\n or self.status == PipelineRunStatus.FAILURE\n or self.status == PipelineRunStatus.CANCELED\n )\n\n @property\n def is_success(self):\n return self.status == PipelineRunStatus.SUCCESS\n\n @property\n def is_failure(self):\n return self.status == PipelineRunStatus.FAILURE\n\n @property\n def is_failure_or_canceled(self):\n return self.status == PipelineRunStatus.FAILURE or self.status == PipelineRunStatus.CANCELED\n\n @property\n def is_resume_retry(self):\n return self.tags.get(RESUME_RETRY_TAG) == "true"\n\n @property\n def previous_run_id(self):\n # Compat\n return self.parent_run_id\n\n @staticmethod\n def tags_for_schedule(schedule):\n return {SCHEDULE_NAME_TAG: schedule.name}\n\n @staticmethod\n def tags_for_sensor(sensor):\n return {SENSOR_NAME_TAG: sensor.name}\n\n @staticmethod\n def tags_for_backfill_id(backfill_id):\n return {BACKFILL_ID_TAG: backfill_id}\n\n @staticmethod\n def tags_for_partition_set(partition_set, partition):\n return {PARTITION_NAME_TAG: partition.name, PARTITION_SET_TAG: partition_set.name}
\n\n\n@whitelist_for_serdes(serializer=DagsterRunSerializer)\nclass DagsterRun(PipelineRun):\n """Serializable internal representation of a dagster run, as stored in a\n :py:class:`~dagster.core.storage.runs.RunStorage`.\n\n Subclasses PipelineRun for backcompat purposes. DagsterRun is the actual initialized class used throughout the system.\n """\n\n\n# DagsterRun is serialized as PipelineRun so that it can be read by older (pre 0.13.x) version of\n# Dagster, but is read back in as a DagsterRun.\nregister_serdes_tuple_fallbacks({"PipelineRun": DagsterRun})\n\n\nclass RunsFilterSerializer(DefaultNamedTupleSerializer):\n @classmethod\n def value_to_storage_dict(\n cls,\n value: NamedTuple,\n whitelist_map: WhitelistMap,\n descent_path: str,\n ) -> Dict[str, Any]:\n storage = super().value_to_storage_dict(\n value,\n whitelist_map,\n descent_path,\n )\n # For backcompat, we store:\n # job_name as pipeline_name\n return replace_storage_keys(storage, {"job_name": "pipeline_name"})\n\n\n@whitelist_for_serdes(serializer=RunsFilterSerializer)\nclass RunsFilter(\n NamedTuple(\n "_RunsFilter",\n [\n ("run_ids", List[str]),\n ("job_name", Optional[str]),\n ("statuses", List[PipelineRunStatus]),\n ("tags", Dict[str, str]),\n ("snapshot_id", Optional[str]),\n ("updated_after", Optional[datetime]),\n ("mode", Optional[str]),\n ("created_before", Optional[datetime]),\n ],\n )\n):\n def __new__(\n cls,\n run_ids: Optional[List[str]] = None,\n job_name: Optional[str] = None,\n statuses: Optional[List[PipelineRunStatus]] = None,\n tags: Optional[Dict[str, str]] = None,\n snapshot_id: Optional[str] = None,\n updated_after: Optional[datetime] = None,\n mode: Optional[str] = None,\n created_before: Optional[datetime] = None,\n pipeline_name: Optional[str] = None, # for backcompat purposes\n ):\n job_name = job_name or pipeline_name\n\n check.invariant(run_ids != [], "When filtering on run ids, a non-empty list must be used.")\n\n return super(RunsFilter, cls).__new__(\n cls,\n run_ids=check.opt_list_param(run_ids, "run_ids", of_type=str),\n job_name=check.opt_str_param(job_name, "job_name"),\n statuses=check.opt_list_param(statuses, "statuses", of_type=PipelineRunStatus),\n tags=check.opt_dict_param(tags, "tags", key_type=str, value_type=str),\n snapshot_id=check.opt_str_param(snapshot_id, "snapshot_id"),\n updated_after=check.opt_inst_param(updated_after, "updated_after", datetime),\n mode=check.opt_str_param(mode, "mode"),\n created_before=check.opt_inst_param(created_before, "created_before", datetime),\n )\n\n @property\n def pipeline_name(self):\n return self.job_name\n\n @staticmethod\n def for_schedule(schedule):\n return RunsFilter(tags=PipelineRun.tags_for_schedule(schedule))\n\n @staticmethod\n def for_partition(partition_set, partition):\n return RunsFilter(tags=PipelineRun.tags_for_partition_set(partition_set, partition))\n\n @staticmethod\n def for_sensor(sensor):\n return RunsFilter(tags=PipelineRun.tags_for_sensor(sensor))\n\n @staticmethod\n def for_backfill(backfill_id):\n return RunsFilter(tags=PipelineRun.tags_for_backfill_id(backfill_id))\n\n\nregister_serdes_tuple_fallbacks({"PipelineRunsFilter": RunsFilter})\n# DEPRECATED - keeping around for backcompat reasons (some folks might have imported directly)\nPipelineRunsFilter = RunsFilter\n\n\nclass JobBucket(NamedTuple):\n job_names: List[str]\n bucket_limit: Optional[int]\n\n\nclass TagBucket(NamedTuple):\n tag_key: str\n tag_values: List[str]\n bucket_limit: Optional[int]\n\n\nclass RunRecord(\n NamedTuple(\n "_RunRecord",\n [\n 
("storage_id", int),\n ("pipeline_run", PipelineRun),\n ("create_timestamp", datetime),\n ("update_timestamp", datetime),\n ("start_time", Optional[float]),\n ("end_time", Optional[float]),\n ],\n )\n):\n """Internal representation of a run record, as stored in a\n :py:class:`~dagster.core.storage.runs.RunStorage`.\n """\n\n def __new__(\n cls,\n storage_id,\n pipeline_run,\n create_timestamp,\n update_timestamp,\n start_time=None,\n end_time=None,\n ):\n return super(RunRecord, cls).__new__(\n cls,\n storage_id=check.int_param(storage_id, "storage_id"),\n pipeline_run=check.inst_param(pipeline_run, "pipeline_run", PipelineRun),\n create_timestamp=check.inst_param(create_timestamp, "create_timestamp", datetime),\n update_timestamp=check.inst_param(update_timestamp, "update_timestamp", datetime),\n # start_time and end_time fields will be populated once the run has started and ended, respectively, but will be None beforehand.\n start_time=check.opt_float_param(start_time, "start_time"),\n end_time=check.opt_float_param(end_time, "end_time"),\n )\n\n\n@whitelist_for_serdes\nclass RunPartitionData(\n NamedTuple(\n "_RunPartitionData",\n [\n ("run_id", str),\n ("partition", str),\n ("status", DagsterRunStatus),\n ("start_time", Optional[float]),\n ("end_time", Optional[float]),\n ],\n )\n):\n def __new__(\n cls,\n run_id: str,\n partition: str,\n status: DagsterRunStatus,\n start_time: Optional[float],\n end_time: Optional[float],\n ):\n return super(RunPartitionData, cls).__new__(\n cls,\n run_id=check.str_param(run_id, "run_id"),\n partition=check.str_param(partition, "partition"),\n status=check.inst_param(status, "status", DagsterRunStatus),\n start_time=check.opt_inst(start_time, float),\n end_time=check.opt_inst(end_time, float),\n )\n\n\n###################################################################################################\n# GRAVEYARD\n#\n# -|-\n# |\n# _-'~~~~~`-_\n# .' '.\n# | R I P |\n# | |\n# | Execution |\n# | Selector |\n# | |\n# | |\n###################################################################################################\n\n\n@whitelist_for_serdes\nclass ExecutionSelector(\n NamedTuple("_ExecutionSelector", [("name", str), ("solid_subset", Optional[List[str]])])\n):\n """\n Kept here to maintain loading of PipelineRuns from when it was still alive.\n """\n\n def __new__(cls, name: str, solid_subset: Optional[List[str]] = None):\n return super(ExecutionSelector, cls).__new__(\n cls,\n name=check.str_param(name, "name"),\n solid_subset=None\n if solid_subset is None\n else check.list_param(solid_subset, "solid_subset", of_type=str),\n )\n
", "current_page_name": "_modules/dagster/core/storage/pipeline_run", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.pipeline_run"}, "root": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.root

\nimport os\n\nfrom dagster import StringSource\nfrom dagster import _check as check\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\n\n\n
[docs]class LocalArtifactStorage(ConfigurableClass):\n def __init__(self, base_dir, inst_data=None):\n self._base_dir = base_dir\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @property\n def base_dir(self):\n return self._base_dir\n\n def file_manager_dir(self, run_id):\n check.str_param(run_id, "run_id")\n return os.path.join(self.base_dir, "storage", run_id, "files")\n\n @property\n def storage_dir(self):\n return os.path.join(self.base_dir, "storage")\n\n @property\n def schedules_dir(self):\n return os.path.join(self.base_dir, "schedules")\n\n
[docs] @staticmethod\n def from_config_value(inst_data, config_value):\n return LocalArtifactStorage(inst_data=inst_data, **config_value)
\n\n
[docs] @classmethod\n def config_type(cls):\n return {"base_dir": StringSource}
\n
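A small sketch of the directory layout ``LocalArtifactStorage`` derives from its base directory. The base directory and run id are illustrative; on a real instance the base directory comes from the ``local_artifact_storage`` entry in ``dagster.yaml``.

.. code-block:: python

    from dagster.core.storage.root import LocalArtifactStorage

    # Illustrative base directory.
    storage = LocalArtifactStorage(base_dir="/tmp/dagster-home-example")

    print(storage.storage_dir)                      # /tmp/dagster-home-example/storage
    print(storage.schedules_dir)                    # /tmp/dagster-home-example/schedules
    print(storage.file_manager_dir("some_run_id"))  # /tmp/dagster-home-example/storage/some_run_id/files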
", "current_page_name": "_modules/dagster/core/storage/root", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.root"}, "root_input_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.root_input_manager

\nfrom abc import abstractmethod\nfrom functools import update_wrapper\n\nimport dagster._check as check\nfrom dagster.core.definitions.config import is_callable_valid_config_arg\nfrom dagster.core.definitions.definition_config_schema import (\n    convert_user_facing_definition_config_schema,\n)\nfrom dagster.core.definitions.resource_definition import ResourceDefinition, is_context_provided\nfrom dagster.core.storage.input_manager import InputManager\nfrom dagster.utils.backcompat import experimental\n\nfrom ..decorator_utils import get_function_params\n\n\nclass IInputManagerDefinition:\n    @property\n    @abstractmethod\n    def input_config_schema(self):\n        """The schema for per-input configuration for inputs that are managed by this\n        input manager"""\n\n\n
[docs]class RootInputManagerDefinition(ResourceDefinition, IInputManagerDefinition):\n """Definition of a root input manager resource.\n\n Root input managers load op inputs that aren't connected to upstream outputs.\n\n A RootInputManagerDefinition is a :py:class:`ResourceDefinition` whose resource_fn returns a\n :py:class:`RootInputManager`.\n\n The easiest way to create a RootInputManagerDefinition is with the\n :py:func:`@root_input_manager <root_input_manager>` decorator.\n """\n\n def __init__(\n self,\n resource_fn=None,\n config_schema=None,\n description=None,\n input_config_schema=None,\n required_resource_keys=None,\n version=None,\n ):\n self._input_config_schema = convert_user_facing_definition_config_schema(\n input_config_schema\n )\n super(RootInputManagerDefinition, self).__init__(\n resource_fn=resource_fn,\n config_schema=config_schema,\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n )\n\n @property\n def input_config_schema(self):\n return self._input_config_schema\n\n def copy_for_configured(self, description, config_schema, _):\n return RootInputManagerDefinition(\n config_schema=config_schema,\n description=description or self.description,\n resource_fn=self.resource_fn,\n required_resource_keys=self.required_resource_keys,\n input_config_schema=self.input_config_schema,\n )
\n\n\n
[docs]class RootInputManager(InputManager):\n """RootInputManagers are used to load inputs to ops at the root of a job.\n\n The easiest way to define a RootInputManager is with the\n :py:func:`@root_input_manager <root_input_manager>` decorator.\n """\n\n
[docs] @abstractmethod\n def load_input(self, context):\n """The user-defined read method that loads data given its metadata.\n\n Args:\n context (InputContext): The context of the step output that produces this asset.\n\n Returns:\n Any: The data object.\n """
\n\n\n
[docs]@experimental\ndef root_input_manager(\n config_schema=None,\n description=None,\n input_config_schema=None,\n required_resource_keys=None,\n version=None,\n):\n """Define a root input manager.\n\n Root input managers load op inputs that aren't connected to upstream outputs.\n\n The decorated function should accept a :py:class:`InputContext` and resource config, and return\n a loaded object that will be passed into one of the inputs of an op.\n\n The decorator produces an :py:class:`RootInputManagerDefinition`.\n\n Args:\n config_schema (Optional[ConfigSchema]): The schema for the resource-level config. If not\n set, Dagster will accept any config provided.\n description (Optional[str]): A human-readable description of the resource.\n input_config_schema (Optional[ConfigSchema]): A schema for the input-level config. Each\n input that uses this input manager can be configured separately using this config.\n If not set, Dagster will accept any config provided.\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by the input\n manager.\n version (Optional[str]): (Experimental) the version of the input manager definition.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import root_input_manager, op, job, In\n\n @root_input_manager\n def csv_loader(_):\n return read_csv("some/path")\n\n @op(ins={"input1": In(root_manager_key="csv_loader_key")})\n def my_op(_, input1):\n do_stuff(input1)\n\n @job(resource_defs={"csv_loader_key": csv_loader})\n def my_job():\n my_op()\n\n @root_input_manager(config_schema={"base_dir": str})\n def csv_loader(context):\n return read_csv(context.resource_config["base_dir"] + "/some/path")\n\n @root_input_manager(input_config_schema={"path": str})\n def csv_loader(context):\n return read_csv(context.config["path"])\n """\n\n if callable(config_schema) and not is_callable_valid_config_arg(config_schema):\n return _InputManagerDecoratorCallable()(config_schema)\n\n def _wrap(load_fn):\n return _InputManagerDecoratorCallable(\n config_schema=config_schema,\n description=description,\n version=version,\n input_config_schema=input_config_schema,\n required_resource_keys=required_resource_keys,\n )(load_fn)\n\n return _wrap
\n\n\nclass RootInputManagerWrapper(RootInputManager):\n def __init__(self, load_fn):\n self._load_fn = load_fn\n\n def load_input(self, context):\n return (\n self._load_fn(context)\n if is_context_provided(get_function_params(self._load_fn))\n else self._load_fn()\n )\n\n\nclass _InputManagerDecoratorCallable:\n def __init__(\n self,\n config_schema=None,\n description=None,\n version=None,\n input_config_schema=None,\n required_resource_keys=None,\n ):\n self.config_schema = config_schema\n self.description = check.opt_str_param(description, "description")\n self.version = check.opt_str_param(version, "version")\n self.input_config_schema = input_config_schema\n self.required_resource_keys = required_resource_keys\n\n def __call__(self, load_fn):\n check.callable_param(load_fn, "load_fn")\n\n def _resource_fn(_):\n return RootInputManagerWrapper(load_fn)\n\n root_input_manager_def = RootInputManagerDefinition(\n resource_fn=_resource_fn,\n config_schema=self.config_schema,\n description=self.description,\n version=self.version,\n input_config_schema=self.input_config_schema,\n required_resource_keys=self.required_resource_keys,\n )\n\n update_wrapper(root_input_manager_def, wrapped=load_fn)\n\n return root_input_manager_def\n
", "current_page_name": "_modules/dagster/core/storage/root_input_manager", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.root_input_manager"}, "runs": {"base": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.runs.base

\nfrom abc import ABC, abstractmethod\nfrom typing import Callable, Dict, Iterable, List, Optional, Set, Tuple, Union\n\nfrom dagster.core.events import DagsterEvent\nfrom dagster.core.execution.backfill import BulkActionStatus, PartitionBackfill\nfrom dagster.core.instance import MayHaveInstanceWeakref\nfrom dagster.core.snap import ExecutionPlanSnapshot, PipelineSnapshot\nfrom dagster.core.storage.pipeline_run import (\n    JobBucket,\n    PipelineRun,\n    RunPartitionData,\n    RunRecord,\n    RunsFilter,\n    TagBucket,\n)\nfrom dagster.daemon.types import DaemonHeartbeat\n\n\n
[docs]class RunStorage(ABC, MayHaveInstanceWeakref):\n """Abstract base class for storing pipeline run history.\n\n Note that run storages using SQL databases as backing stores should implement\n :py:class:`~dagster.core.storage.runs.SqlRunStorage`.\n\n Users should not directly instantiate concrete subclasses of this class; they are instantiated\n by internal machinery when ``dagit`` and ``dagster-graphql`` load, based on the values in the\n ``dagster.yaml`` file in ``$DAGSTER_HOME``. Configuration of concrete subclasses of this class\n should be done by setting values in that file.\n """\n\n @abstractmethod\n def add_run(self, pipeline_run: PipelineRun) -> PipelineRun:\n """Add a run to storage.\n\n If a run already exists with the same ID, raise DagsterRunAlreadyExists\n If the run's snapshot ID does not exist raise DagsterSnapshotDoesNotExist\n\n Args:\n pipeline_run (PipelineRun): The run to add.\n """\n\n @abstractmethod\n def handle_run_event(self, run_id: str, event: DagsterEvent):\n """Update run storage in accordance to a pipeline run related DagsterEvent\n\n Args:\n run_id (str)\n event (DagsterEvent)\n """\n\n @abstractmethod\n def get_runs(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> Iterable[PipelineRun]:\n """Return all the runs present in the storage that match the given filters.\n\n Args:\n filters (Optional[RunsFilter]) -- The\n :py:class:`~dagster.core.storage.pipeline_run.RunsFilter` by which to filter\n runs\n cursor (Optional[str]): Starting cursor (run_id) of range of runs\n limit (Optional[int]): Number of results to get. Defaults to infinite.\n\n Returns:\n List[PipelineRun]\n """\n\n @abstractmethod\n def get_runs_count(self, filters: Optional[RunsFilter] = None) -> int:\n """Return the number of runs present in the storage that match the given filters.\n\n Args:\n filters (Optional[RunsFilter]) -- The\n :py:class:`~dagster.core.storage.pipeline_run.PipelineRunFilter` by which to filter\n runs\n\n Returns:\n int: The number of runs that match the given filters.\n """\n\n @abstractmethod\n def get_run_group(self, run_id: str) -> Optional[Tuple[str, Iterable[PipelineRun]]]:\n """Get the run group to which a given run belongs.\n\n Args:\n run_id (str): If the corresponding run is the descendant of some root run (i.e., there\n is a root_run_id on the :py:class:`PipelineRun`), that root run and all of its\n descendants are returned; otherwise, the group will consist only of the given run\n (a run that does not descend from any root is its own root).\n\n Returns:\n Optional[Tuple[string, List[PipelineRun]]]: If there is a corresponding run group, tuple\n whose first element is the root_run_id and whose second element is a list of all the\n descendent runs. Otherwise `None`.\n """\n\n @abstractmethod\n def get_run_groups(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Dict[str, Dict[str, Union[Iterable[PipelineRun], int]]]:\n """Return all of the run groups present in the storage that include rows matching the\n given filter.\n\n Args:\n filter (Optional[RunsFilter]) -- The\n :py:class:`~dagster.core.storage.pipeline_run.RunsFilter` by which to filter\n runs\n cursor (Optional[str]): Starting cursor (run_id) of range of runs\n limit (Optional[int]): Number of results to get. 
Defaults to infinite.\n\n Returns:\n Dict[str, Dict[str, Union[List[PipelineRun], int]]]: Specifically, a dict of the form\n ``{'pipeline_run_id': {'runs': [PipelineRun, ...], 'count': int}, ...}``. The\n instances of :py:class:`~dagster.core.pipeline_run.PipelineRun` returned in this\n data structure correspond to all of the runs that would have been returned by\n calling :py:meth:`get_run_groups` with the same arguments, plus their corresponding\n root runs, if any. The keys of this structure are the run_ids of all of the root\n runs (a run with no root is its own root). The integer counts are inclusive of all\n of the root runs' children, including those that would not have been returned by\n calling :py:meth:`get_run_groups` with the same arguments, but exclusive of the root\n run itself; i.e., if a run has no children, the count will be 0.\n """\n\n # Note that we could have made the opposite decision here and filtered for root runs\n # matching a given filter, etc., rather than for child runs; so that asking for the last 5\n # run groups would give the last 5 roots and their descendants, rather than the last 5\n # children and their roots. Consider the case where we have just been retrying runs\n # belonging to a group created long ago; it makes sense to bump these to the top of the\n # interface rather than burying them deeply paginated down. Note also that this query can\n # return no more run groups than there are runs in an equivalent call to get_runs, and no\n # more than 2x total instances of PipelineRun.\n\n @abstractmethod\n def get_run_by_id(self, run_id: str) -> Optional[PipelineRun]:\n """Get a run by its id.\n\n Args:\n run_id (str): The id of the run\n\n Returns:\n Optional[PipelineRun]\n """\n\n @abstractmethod\n def get_run_records(\n self,\n filters: Optional[RunsFilter] = None,\n limit: Optional[int] = None,\n order_by: Optional[str] = None,\n ascending: bool = False,\n cursor: Optional[str] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> List[RunRecord]:\n """Return a list of run records stored in the run storage, sorted by the given column in given order.\n\n Args:\n filters (Optional[RunsFilter]): the filter by which to filter runs.\n limit (Optional[int]): Number of results to get. Defaults to infinite.\n order_by (Optional[str]): Name of the column to sort by. Defaults to id.\n ascending (Optional[bool]): Sort the result in ascending order if True, descending\n otherwise. Defaults to descending.\n\n Returns:\n List[RunRecord]: List of run records stored in the run storage.\n """\n\n @abstractmethod\n def get_run_tags(self) -> List[Tuple[str, Set[str]]]:\n """Get a list of tag keys and the values that have been associated with them.\n\n Returns:\n List[Tuple[str, Set[str]]]\n """\n\n @abstractmethod\n def add_run_tags(self, run_id: str, new_tags: Dict[str, str]):\n """Add additional tags for a pipeline run.\n\n Args:\n run_id (str)\n new_tags (Dict[string, string])\n """\n\n @abstractmethod\n def has_run(self, run_id: str) -> bool:\n """Check if the storage contains a run.\n\n Args:\n run_id (str): The id of the run\n\n Returns:\n bool\n """\n\n def add_snapshot(\n self,\n snapshot: Union[PipelineSnapshot, ExecutionPlanSnapshot],\n snapshot_id: Optional[str] = None,\n ):\n """Add a snapshot to the storage.\n\n Args:\n snapshot (Union[PipelineSnapshot, ExecutionPlanSnapshot])\n snapshot_id (Optional[str]): [Internal] The id of the snapshot. If not provided, the\n snapshot id will be generated from a hash of the snapshot. 
This should only be used\n in debugging, where we might want to import a historical run whose snapshots were\n calculated using a different hash function than the current code.\n """\n if isinstance(snapshot, PipelineSnapshot):\n self.add_pipeline_snapshot(snapshot, snapshot_id)\n else:\n self.add_execution_plan_snapshot(snapshot, snapshot_id)\n\n def has_snapshot(self, snapshot_id: str):\n return self.has_pipeline_snapshot(snapshot_id) or self.has_execution_plan_snapshot(\n snapshot_id\n )\n\n @abstractmethod\n def has_pipeline_snapshot(self, pipeline_snapshot_id: str) -> bool:\n """Check to see if storage contains a pipeline snapshot.\n\n Args:\n pipeline_snapshot_id (str): The id of the run.\n\n Returns:\n bool\n """\n\n @abstractmethod\n def add_pipeline_snapshot(\n self, pipeline_snapshot: PipelineSnapshot, snapshot_id: Optional[str] = None\n ) -> str:\n """Add a pipeline snapshot to the run store.\n\n Pipeline snapshots are content-addressable, meaning\n that the ID for a snapshot is a hash based on the\n body of the snapshot. This function returns\n that snapshot ID.\n\n Args:\n pipeline_snapshot (PipelineSnapshot)\n snapshot_id (Optional[str]): [Internal] The id of the snapshot. If not provided, the\n snapshot id will be generated from a hash of the snapshot. This should only be used\n in debugging, where we might want to import a historical run whose snapshots were\n calculated using a different hash function than the current code.\n\n Return:\n str: The pipeline_snapshot_id\n """\n\n @abstractmethod\n def get_pipeline_snapshot(self, pipeline_snapshot_id: str) -> PipelineSnapshot:\n """Fetch a snapshot by ID\n\n Args:\n pipeline_snapshot_id (str)\n\n Returns:\n PipelineSnapshot\n """\n\n @abstractmethod\n def has_execution_plan_snapshot(self, execution_plan_snapshot_id: str) -> bool:\n """Check to see if storage contains an execution plan snapshot.\n\n Args:\n execution_plan_snapshot_id (str): The id of the execution plan.\n\n Returns:\n bool\n """\n\n @abstractmethod\n def add_execution_plan_snapshot(\n self, execution_plan_snapshot: ExecutionPlanSnapshot, snapshot_id: Optional[str] = None\n ) -> str:\n """Add an execution plan snapshot to the run store.\n\n Execution plan snapshots are content-addressable, meaning\n that the ID for a snapshot is a hash based on the\n body of the snapshot. This function returns\n that snapshot ID.\n\n Args:\n execution_plan_snapshot (ExecutionPlanSnapshot)\n snapshot_id (Optional[str]): [Internal] The id of the snapshot. If not provided, the\n snapshot id will be generated from a hash of the snapshot. 
This should only be used\n in debugging, where we might want to import a historical run whose snapshots were\n calculated using a different hash function than the current code.\n\n Return:\n str: The execution_plan_snapshot_id\n """\n\n @abstractmethod\n def get_execution_plan_snapshot(self, execution_plan_snapshot_id: str) -> ExecutionPlanSnapshot:\n """Fetch a snapshot by ID\n\n Args:\n execution_plan_snapshot_id (str)\n\n Returns:\n ExecutionPlanSnapshot\n """\n\n @abstractmethod\n def wipe(self):\n """Clears the run storage."""\n\n @abstractmethod\n def delete_run(self, run_id: str):\n """Remove a run from storage"""\n\n @property\n def supports_bucket_queries(self):\n return True\n\n @abstractmethod\n def get_run_partition_data(\n self,\n partition_set_name: str,\n job_name: str,\n repository_label: str,\n ) -> List[RunPartitionData]:\n """Get run partition data for a given partitioned job."""\n\n def migrate(self, print_fn: Optional[Callable] = None, force_rebuild_all: bool = False):\n """Call this method to run any required data migrations"""\n\n def optimize(self, print_fn: Optional[Callable] = None, force_rebuild_all: bool = False):\n """Call this method to run any optional data migrations for optimized reads"""\n\n def dispose(self):\n """Explicit lifecycle management."""\n\n def optimize_for_dagit(self, statement_timeout: int):\n """Allows for optimizing database connection / use in the context of a long lived dagit process"""\n\n # Daemon Heartbeat Storage\n #\n # Holds heartbeats from the Dagster Daemon so that other system components can alert when it's not\n # alive.\n # This is temporarily placed along with run storage to avoid adding a new instance concept. It\n # should be split out once all metadata storages are configured together.\n\n @abstractmethod\n def add_daemon_heartbeat(self, daemon_heartbeat: DaemonHeartbeat):\n """Called on a regular interval by the daemon"""\n\n @abstractmethod\n def get_daemon_heartbeats(self) -> Dict[str, DaemonHeartbeat]:\n """Latest heartbeats of all daemon types"""\n\n @abstractmethod\n def wipe_daemon_heartbeats(self):\n """Wipe all daemon heartbeats"""\n\n # Backfill storage\n @abstractmethod\n def get_backfills(\n self,\n status: Optional[BulkActionStatus] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> List[PartitionBackfill]:\n """Get a list of partition backfills"""\n\n @abstractmethod\n def get_backfill(self, backfill_id: str) -> Optional[PartitionBackfill]:\n """Get the partition backfill of the given backfill id."""\n\n @abstractmethod\n def add_backfill(self, partition_backfill: PartitionBackfill):\n """Add partition backfill to run storage"""\n\n @abstractmethod\n def update_backfill(self, partition_backfill: PartitionBackfill):\n """Update a partition backfill in run storage"""\n\n def alembic_version(self):\n return None
\n
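# --- Hypothetical usage sketch (not part of this module) ---
# Per the class docstring, concrete run storages are built from dagster.yaml
# and reached through the DagsterInstance rather than instantiated directly.
# The calls below exercise the interface defined above; `DagsterInstance.get()`
# assumes DAGSTER_HOME is set, and the `run_storage` property name is assumed.

from dagster.core.instance import DagsterInstance
from dagster.core.storage.pipeline_run import DagsterRunStatus, RunsFilter

instance = DagsterInstance.get()
storage = instance.run_storage

# Count and fetch recent failed runs.
failed = RunsFilter(statuses=[DagsterRunStatus.FAILURE])
print(storage.get_runs_count(filters=failed), "failed runs")

for run in storage.get_runs(filters=failed, limit=5):
    # get_run_group returns (root_run_id, [root run plus its descendants]).
    group = storage.get_run_group(run.run_id)
    if group:
        root_run_id, runs_in_group = group
        print(run.run_id, "retry group rooted at", root_run_id, "size", len(list(runs_in_group)))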
", "current_page_name": "_modules/dagster/core/storage/runs/base", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.runs.base"}, "sql_run_storage": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.runs.sql_run_storage

\nimport logging\nimport uuid\nimport zlib\nfrom abc import abstractmethod\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom enum import Enum\nfrom typing import Callable, Dict, Iterable, List, Optional, Set, Tuple, Union\n\nimport pendulum\nimport sqlalchemy as db\n\nimport dagster._check as check\nfrom dagster.core.errors import (\n    DagsterInvariantViolationError,\n    DagsterRunAlreadyExists,\n    DagsterRunNotFoundError,\n    DagsterSnapshotDoesNotExist,\n)\nfrom dagster.core.events import EVENT_TYPE_TO_PIPELINE_RUN_STATUS, DagsterEvent, DagsterEventType\nfrom dagster.core.execution.backfill import BulkActionStatus, PartitionBackfill\nfrom dagster.core.snap import (\n    ExecutionPlanSnapshot,\n    PipelineSnapshot,\n    create_execution_plan_snapshot_id,\n    create_pipeline_snapshot_id,\n)\nfrom dagster.core.storage.tags import PARTITION_NAME_TAG, PARTITION_SET_TAG, ROOT_RUN_ID_TAG\nfrom dagster.daemon.types import DaemonHeartbeat\nfrom dagster.serdes import (\n    deserialize_as,\n    deserialize_json_to_dagster_namedtuple,\n    serialize_dagster_namedtuple,\n)\nfrom dagster.seven import JSONDecodeError\nfrom dagster.utils import merge_dicts, utc_datetime_from_timestamp\n\nfrom ..pipeline_run import (\n    DagsterRunStatus,\n    JobBucket,\n    PipelineRun,\n    RunPartitionData,\n    RunRecord,\n    RunsFilter,\n    TagBucket,\n)\nfrom .base import RunStorage\nfrom .migration import OPTIONAL_DATA_MIGRATIONS, REQUIRED_DATA_MIGRATIONS, RUN_PARTITIONS\nfrom .schema import (\n    BulkActionsTable,\n    DaemonHeartbeatsTable,\n    InstanceInfo,\n    RunTagsTable,\n    RunsTable,\n    SecondaryIndexMigrationTable,\n    SnapshotsTable,\n)\n\n\nclass SnapshotType(Enum):\n    PIPELINE = "PIPELINE"\n    EXECUTION_PLAN = "EXECUTION_PLAN"\n\n\n
[docs]class SqlRunStorage(RunStorage): # pylint: disable=no-init\n """Base class for SQL based run storages"""\n\n @abstractmethod\n def connect(self):\n """Context manager yielding a sqlalchemy.engine.Connection."""\n\n @abstractmethod\n def upgrade(self):\n """This method should perform any schema or data migrations necessary to bring an\n out-of-date instance of the storage up to date.\n """\n\n def fetchall(self, query):\n with self.connect() as conn:\n result_proxy = conn.execute(query)\n res = result_proxy.fetchall()\n result_proxy.close()\n\n return res\n\n def fetchone(self, query):\n with self.connect() as conn:\n result_proxy = conn.execute(query)\n row = result_proxy.fetchone()\n result_proxy.close()\n\n return row\n\n def add_run(self, pipeline_run: PipelineRun) -> PipelineRun:\n check.inst_param(pipeline_run, "pipeline_run", PipelineRun)\n\n if pipeline_run.pipeline_snapshot_id and not self.has_pipeline_snapshot(\n pipeline_run.pipeline_snapshot_id\n ):\n raise DagsterSnapshotDoesNotExist(\n "Snapshot {ss_id} does not exist in run storage".format(\n ss_id=pipeline_run.pipeline_snapshot_id\n )\n )\n\n has_tags = pipeline_run.tags and len(pipeline_run.tags) > 0\n partition = pipeline_run.tags.get(PARTITION_NAME_TAG) if has_tags else None\n partition_set = pipeline_run.tags.get(PARTITION_SET_TAG) if has_tags else None\n\n runs_insert = RunsTable.insert().values( # pylint: disable=no-value-for-parameter\n run_id=pipeline_run.run_id,\n pipeline_name=pipeline_run.pipeline_name,\n status=pipeline_run.status.value,\n run_body=serialize_dagster_namedtuple(pipeline_run),\n snapshot_id=pipeline_run.pipeline_snapshot_id,\n partition=partition,\n partition_set=partition_set,\n )\n with self.connect() as conn:\n try:\n conn.execute(runs_insert)\n except db.exc.IntegrityError as exc:\n raise DagsterRunAlreadyExists from exc\n\n tags_to_insert = pipeline_run.tags_for_storage()\n if tags_to_insert:\n conn.execute(\n RunTagsTable.insert(), # pylint: disable=no-value-for-parameter\n [\n dict(run_id=pipeline_run.run_id, key=k, value=v)\n for k, v in tags_to_insert.items()\n ],\n )\n\n return pipeline_run\n\n def handle_run_event(self, run_id: str, event: DagsterEvent):\n check.str_param(run_id, "run_id")\n check.inst_param(event, "event", DagsterEvent)\n\n if event.event_type not in EVENT_TYPE_TO_PIPELINE_RUN_STATUS:\n return\n\n run = self.get_run_by_id(run_id)\n if not run:\n # TODO log?\n return\n\n new_pipeline_status = EVENT_TYPE_TO_PIPELINE_RUN_STATUS[event.event_type]\n\n run_stats_cols_in_index = self.has_run_stats_index_cols()\n\n kwargs = {}\n\n # consider changing the `handle_run_event` signature to get timestamp off of the\n # EventLogEntry instead of the DagsterEvent, for consistency\n now = pendulum.now("UTC")\n\n if run_stats_cols_in_index and event.event_type == DagsterEventType.PIPELINE_START:\n kwargs["start_time"] = now.timestamp()\n\n if run_stats_cols_in_index and event.event_type in {\n DagsterEventType.PIPELINE_CANCELED,\n DagsterEventType.PIPELINE_FAILURE,\n DagsterEventType.PIPELINE_SUCCESS,\n }:\n kwargs["end_time"] = now.timestamp()\n\n with self.connect() as conn:\n\n conn.execute(\n RunsTable.update() # pylint: disable=no-value-for-parameter\n .where(RunsTable.c.run_id == run_id)\n .values(\n status=new_pipeline_status.value,\n run_body=serialize_dagster_namedtuple(run.with_status(new_pipeline_status)),\n update_timestamp=now,\n **kwargs,\n )\n )\n\n def _row_to_run(self, row: Tuple) -> PipelineRun:\n return deserialize_as(row[0], PipelineRun)\n\n def 
_rows_to_runs(self, rows: Iterable[Tuple]) -> List[PipelineRun]:\n return list(map(self._row_to_run, rows))\n\n def _add_cursor_limit_to_query(\n self,\n query,\n cursor: Optional[str],\n limit: Optional[int],\n order_by: Optional[str],\n ascending: Optional[bool],\n ):\n """Helper function to deal with cursor/limit pagination args"""\n\n if cursor:\n cursor_query = db.select([RunsTable.c.id]).where(RunsTable.c.run_id == cursor)\n query = query.where(RunsTable.c.id < cursor_query)\n\n if limit:\n query = query.limit(limit)\n\n sorting_column = getattr(RunsTable.c, order_by) if order_by else RunsTable.c.id\n direction = db.asc if ascending else db.desc\n query = query.order_by(direction(sorting_column))\n\n return query\n\n def _add_filters_to_query(self, query, filters: RunsFilter):\n check.inst_param(filters, "filters", RunsFilter)\n\n if filters.run_ids:\n query = query.where(RunsTable.c.run_id.in_(filters.run_ids))\n\n if filters.job_name:\n query = query.where(RunsTable.c.pipeline_name == filters.job_name)\n\n if filters.mode:\n query = query.where(RunsTable.c.mode == filters.mode)\n\n if filters.statuses:\n query = query.where(\n RunsTable.c.status.in_([status.value for status in filters.statuses])\n )\n\n if filters.tags:\n query = query.where(\n db.or_(\n *(\n db.and_(RunTagsTable.c.key == key, RunTagsTable.c.value == value)\n for key, value in filters.tags.items()\n )\n )\n ).group_by(RunsTable.c.run_body, RunsTable.c.id)\n\n if len(filters.tags) > 0:\n query = query.having(db.func.count(RunsTable.c.run_id) == len(filters.tags))\n\n if filters.snapshot_id:\n query = query.where(RunsTable.c.snapshot_id == filters.snapshot_id)\n\n if filters.updated_after:\n query = query.where(RunsTable.c.update_timestamp > filters.updated_after)\n\n if filters.created_before:\n query = query.where(RunsTable.c.create_timestamp < filters.created_before)\n\n return query\n\n def _runs_query(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n columns: Optional[List[str]] = None,\n order_by: Optional[str] = None,\n ascending: bool = False,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ):\n filters = check.opt_inst_param(filters, "filters", RunsFilter, default=RunsFilter())\n check.opt_str_param(cursor, "cursor")\n check.opt_int_param(limit, "limit")\n check.opt_list_param(columns, "columns")\n check.opt_str_param(order_by, "order_by")\n check.opt_bool_param(ascending, "ascending")\n\n if columns is None:\n columns = ["run_body"]\n\n if bucket_by:\n if limit or cursor:\n check.failed("cannot specify bucket_by and limit/cursor at the same time")\n return self._bucketed_runs_query(bucket_by, filters, columns, order_by, ascending)\n\n query_columns = [getattr(RunsTable.c, column) for column in columns]\n if filters.tags:\n base_query = db.select(query_columns).select_from(\n RunsTable.join(RunTagsTable, RunsTable.c.run_id == RunTagsTable.c.run_id)\n )\n else:\n base_query = db.select(query_columns).select_from(RunsTable)\n\n base_query = self._add_filters_to_query(base_query, filters)\n return self._add_cursor_limit_to_query(base_query, cursor, limit, order_by, ascending)\n\n def _bucket_rank_column(self, bucket_by, order_by, ascending):\n check.inst_param(bucket_by, "bucket_by", (JobBucket, TagBucket))\n check.invariant(\n self.supports_bucket_queries, "Bucket queries are not supported by this storage layer"\n )\n sorting_column = getattr(RunsTable.c, order_by) if order_by else RunsTable.c.id\n direction = db.asc if ascending else 
db.desc\n bucket_column = (\n RunsTable.c.pipeline_name if isinstance(bucket_by, JobBucket) else RunTagsTable.c.value\n )\n return (\n db.func.rank()\n .over(order_by=direction(sorting_column), partition_by=bucket_column)\n .label("rank")\n )\n\n def _bucketed_runs_query(\n self,\n bucket_by: Union[JobBucket, TagBucket],\n filters: RunsFilter,\n columns: List[str],\n order_by: Optional[str] = None,\n ascending: bool = False,\n ):\n bucket_rank = self._bucket_rank_column(bucket_by, order_by, ascending)\n query_columns = [getattr(RunsTable.c, column) for column in columns] + [bucket_rank]\n\n if isinstance(bucket_by, JobBucket):\n # bucketing by job\n base_query = (\n db.select(query_columns)\n .select_from(\n RunsTable.join(RunTagsTable, RunsTable.c.run_id == RunTagsTable.c.run_id)\n if filters.tags\n else RunsTable\n )\n .where(RunsTable.c.pipeline_name.in_(bucket_by.job_names))\n )\n base_query = self._add_filters_to_query(base_query, filters)\n\n elif not filters.tags:\n # bucketing by tag, no tag filters\n base_query = (\n db.select(query_columns)\n .select_from(\n RunsTable.join(RunTagsTable, RunsTable.c.run_id == RunTagsTable.c.run_id)\n )\n .where(RunTagsTable.c.key == bucket_by.tag_key)\n .where(RunTagsTable.c.value.in_(bucket_by.tag_values))\n )\n base_query = self._add_filters_to_query(base_query, filters)\n\n else:\n # there are tag filters as well as tag buckets, so we have to apply the tag filters in\n # a separate join\n filtered_query = db.select([RunsTable.c.run_id]).select_from(\n RunsTable.join(RunTagsTable, RunsTable.c.run_id == RunTagsTable.c.run_id)\n )\n filtered_query = self._add_filters_to_query(filtered_query, filters)\n filtered_query = filtered_query.alias("filtered_query")\n\n base_query = (\n db.select(query_columns)\n .select_from(\n RunsTable.join(RunTagsTable, RunsTable.c.run_id == RunTagsTable.c.run_id).join(\n filtered_query, RunsTable.c.run_id == filtered_query.c.run_id\n )\n )\n .where(RunTagsTable.c.key == bucket_by.tag_key)\n .where(RunTagsTable.c.value.in_(bucket_by.tag_values))\n )\n\n subquery = base_query.alias("subquery")\n\n # select all the columns, but skip the bucket_rank column, which is only used for applying\n # the limit / order\n subquery_columns = [getattr(subquery.c, column) for column in columns]\n query = db.select(subquery_columns).order_by(subquery.c.rank.asc())\n if bucket_by.bucket_limit:\n query = query.where(subquery.c.rank <= bucket_by.bucket_limit)\n\n return query\n\n def get_runs(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> List[PipelineRun]:\n query = self._runs_query(filters, cursor, limit, bucket_by=bucket_by)\n rows = self.fetchall(query)\n return self._rows_to_runs(rows)\n\n def get_runs_count(self, filters: Optional[RunsFilter] = None) -> int:\n subquery = self._runs_query(filters=filters).alias("subquery")\n\n # We use an alias here because Postgres requires subqueries to be\n # aliased.\n subquery = subquery.alias("subquery")\n\n query = db.select([db.func.count()]).select_from(subquery)\n rows = self.fetchall(query)\n count = rows[0][0]\n return count\n\n def get_run_by_id(self, run_id: str) -> Optional[PipelineRun]:\n """Get a run by its id.\n\n Args:\n run_id (str): The id of the run\n\n Returns:\n Optional[PipelineRun]\n """\n check.str_param(run_id, "run_id")\n\n query = db.select([RunsTable.c.run_body]).where(RunsTable.c.run_id == run_id)\n rows = self.fetchall(query)\n return 
deserialize_as(rows[0][0], PipelineRun) if len(rows) else None\n\n def get_run_records(\n self,\n filters: Optional[RunsFilter] = None,\n limit: Optional[int] = None,\n order_by: Optional[str] = None,\n ascending: bool = False,\n cursor: Optional[str] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> List[RunRecord]:\n filters = check.opt_inst_param(filters, "filters", RunsFilter, default=RunsFilter())\n check.opt_int_param(limit, "limit")\n\n columns = ["id", "run_body", "create_timestamp", "update_timestamp"]\n\n if self.has_run_stats_index_cols():\n columns += ["start_time", "end_time"]\n # only fetch columns we use to build RunRecord\n query = self._runs_query(\n filters=filters,\n limit=limit,\n columns=columns,\n order_by=order_by,\n ascending=ascending,\n cursor=cursor,\n bucket_by=bucket_by,\n )\n\n rows = self.fetchall(query)\n return [\n RunRecord(\n storage_id=check.int_param(row["id"], "id"),\n pipeline_run=deserialize_as(\n check.str_param(row["run_body"], "run_body"), PipelineRun\n ),\n create_timestamp=check.inst(row["create_timestamp"], datetime),\n update_timestamp=check.inst(row["update_timestamp"], datetime),\n start_time=check.opt_inst(row["start_time"], float)\n if "start_time" in row\n else None,\n end_time=check.opt_inst(row["end_time"], float) if "end_time" in row else None,\n )\n for row in rows\n ]\n\n def get_run_tags(self) -> List[Tuple[str, Set[str]]]:\n result = defaultdict(set)\n query = db.select([RunTagsTable.c.key, RunTagsTable.c.value]).distinct(\n RunTagsTable.c.key, RunTagsTable.c.value\n )\n rows = self.fetchall(query)\n for r in rows:\n result[r[0]].add(r[1])\n return sorted(list([(k, v) for k, v in result.items()]), key=lambda x: x[0])\n\n def add_run_tags(self, run_id: str, new_tags: Dict[str, str]):\n check.str_param(run_id, "run_id")\n check.dict_param(new_tags, "new_tags", key_type=str, value_type=str)\n\n run = self.get_run_by_id(run_id)\n if not run:\n raise DagsterRunNotFoundError(\n f"Run {run_id} was not found in instance.", invalid_run_id=run_id\n )\n current_tags = run.tags if run.tags else {}\n\n all_tags = merge_dicts(current_tags, new_tags)\n partition = all_tags.get(PARTITION_NAME_TAG)\n partition_set = all_tags.get(PARTITION_SET_TAG)\n\n with self.connect() as conn:\n conn.execute(\n RunsTable.update() # pylint: disable=no-value-for-parameter\n .where(RunsTable.c.run_id == run_id)\n .values(\n run_body=serialize_dagster_namedtuple(\n run.with_tags(merge_dicts(current_tags, new_tags))\n ),\n partition=partition,\n partition_set=partition_set,\n update_timestamp=pendulum.now("UTC"),\n )\n )\n\n current_tags_set = set(current_tags.keys())\n new_tags_set = set(new_tags.keys())\n\n existing_tags = current_tags_set & new_tags_set\n added_tags = new_tags_set.difference(existing_tags)\n\n for tag in existing_tags:\n conn.execute(\n RunTagsTable.update() # pylint: disable=no-value-for-parameter\n .where(db.and_(RunTagsTable.c.run_id == run_id, RunTagsTable.c.key == tag))\n .values(value=new_tags[tag])\n )\n\n if added_tags:\n conn.execute(\n RunTagsTable.insert(), # pylint: disable=no-value-for-parameter\n [dict(run_id=run_id, key=tag, value=new_tags[tag]) for tag in added_tags],\n )\n\n def get_run_group(self, run_id: str) -> Optional[Tuple[str, Iterable[PipelineRun]]]:\n check.str_param(run_id, "run_id")\n pipeline_run = self.get_run_by_id(run_id)\n if not pipeline_run:\n raise DagsterRunNotFoundError(\n f"Run {run_id} was not found in instance.", invalid_run_id=run_id\n )\n\n # find root_run\n root_run_id = 
pipeline_run.root_run_id if pipeline_run.root_run_id else pipeline_run.run_id\n root_run = self.get_run_by_id(root_run_id)\n if not root_run:\n raise DagsterRunNotFoundError(\n f"Run id {root_run} set as root run id for run {run_id} was not found in instance.",\n invalid_run_id=root_run,\n )\n\n # root_run_id to run_id 1:1 mapping\n # https://github.com/dagster-io/dagster/issues/2495\n # Note: we currently use tags to persist the run group info\n root_to_run = (\n db.select(\n [RunTagsTable.c.value.label("root_run_id"), RunTagsTable.c.run_id.label("run_id")]\n )\n .where(\n db.and_(RunTagsTable.c.key == ROOT_RUN_ID_TAG, RunTagsTable.c.value == root_run_id)\n )\n .alias("root_to_run")\n )\n # get run group\n run_group_query = (\n db.select([RunsTable.c.run_body])\n .select_from(\n root_to_run.join(\n RunsTable,\n root_to_run.c.run_id == RunsTable.c.run_id,\n isouter=True,\n )\n )\n .alias("run_group")\n )\n\n with self.connect() as conn:\n res = conn.execute(run_group_query)\n run_group = self._rows_to_runs(res)\n\n return (root_run_id, [root_run] + run_group)\n\n def get_run_groups(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Dict[str, Dict[str, Union[Iterable[PipelineRun], int]]]:\n # The runs that would be returned by calling RunStorage.get_runs with the same arguments\n runs = self._runs_query(\n filters=filters, cursor=cursor, limit=limit, columns=["run_body", "run_id"]\n ).alias("runs")\n\n # Gets us the run_id and associated root_run_id for every run in storage that is a\n # descendant run of some root\n #\n # pseudosql:\n # with all_descendant_runs as (\n # select *\n # from run_tags\n # where key = @ROOT_RUN_ID_TAG\n # )\n\n all_descendant_runs = (\n db.select([RunTagsTable])\n .where(RunTagsTable.c.key == ROOT_RUN_ID_TAG)\n .alias("all_descendant_runs")\n )\n\n # Augment the runs in our query, for those runs that are the descendant of some root run,\n # with the root_run_id\n #\n # pseudosql:\n #\n # with runs_augmented as (\n # select\n # runs.run_id as run_id,\n # all_descendant_runs.value as root_run_id\n # from runs\n # left outer join all_descendant_runs\n # on all_descendant_runs.run_id = runs.run_id\n # )\n\n runs_augmented = (\n db.select(\n [\n runs.c.run_id.label("run_id"),\n all_descendant_runs.c.value.label("root_run_id"),\n ]\n )\n .select_from(\n runs.join(\n all_descendant_runs,\n all_descendant_runs.c.run_id == RunsTable.c.run_id,\n isouter=True,\n )\n )\n .alias("runs_augmented")\n )\n\n # Get all the runs our query will return. This includes runs as well as their root runs.\n #\n # pseudosql:\n #\n # with runs_and_root_runs as (\n # select runs.run_id as run_id\n # from runs, runs_augmented\n # where\n # runs.run_id = runs_augmented.run_id or\n # runs.run_id = runs_augmented.root_run_id\n # )\n\n runs_and_root_runs = (\n db.select([RunsTable.c.run_id.label("run_id")])\n .select_from(runs_augmented)\n .where(\n db.or_(\n RunsTable.c.run_id == runs_augmented.c.run_id,\n RunsTable.c.run_id == runs_augmented.c.root_run_id,\n )\n )\n .distinct(RunsTable.c.run_id)\n ).alias("runs_and_root_runs")\n\n # We count the descendants of all of the runs in our query that are roots so that\n # we can accurately display when a root run has more descendants than are returned by this\n # query and afford a drill-down. 
This might be an unnecessary complication, but the\n # alternative isn't obvious -- we could go and fetch *all* the runs in any group that we're\n # going to return in this query, and then append those.\n #\n # pseudosql:\n #\n # select runs.run_body, count(all_descendant_runs.id) as child_counts\n # from runs\n # join runs_and_root_runs on runs.run_id = runs_and_root_runs.run_id\n # left outer join all_descendant_runs\n # on all_descendant_runs.value = runs_and_root_runs.run_id\n # group by runs.run_body\n # order by child_counts desc\n\n runs_and_root_runs_with_descendant_counts = (\n db.select(\n [\n RunsTable.c.run_body,\n db.func.count(all_descendant_runs.c.id).label("child_counts"),\n ]\n )\n .select_from(\n RunsTable.join(\n runs_and_root_runs, RunsTable.c.run_id == runs_and_root_runs.c.run_id\n ).join(\n all_descendant_runs,\n all_descendant_runs.c.value == runs_and_root_runs.c.run_id,\n isouter=True,\n )\n )\n .group_by(RunsTable.c.run_body)\n .order_by(db.desc(db.column("child_counts")))\n )\n\n with self.connect() as conn:\n res = conn.execute(runs_and_root_runs_with_descendant_counts).fetchall()\n\n # Postprocess: descendant runs get aggregated with their roots\n root_run_id_to_group: Dict[str, List[PipelineRun]] = defaultdict(list)\n root_run_id_to_count: Dict[str, int] = defaultdict(int)\n for (run_body, count) in res:\n row = (run_body,)\n pipeline_run = self._row_to_run(row)\n root_run_id = pipeline_run.get_root_run_id()\n if root_run_id is not None:\n root_run_id_to_group[root_run_id].append(pipeline_run)\n else:\n root_run_id_to_group[pipeline_run.run_id].append(pipeline_run)\n root_run_id_to_count[pipeline_run.run_id] = count + 1\n\n return {\n root_run_id: {\n "runs": list(run_group),\n "count": root_run_id_to_count[root_run_id],\n }\n for root_run_id, run_group in root_run_id_to_group.items()\n }\n\n def has_run(self, run_id: str) -> bool:\n check.str_param(run_id, "run_id")\n return bool(self.get_run_by_id(run_id))\n\n def delete_run(self, run_id: str):\n check.str_param(run_id, "run_id")\n query = db.delete(RunsTable).where(RunsTable.c.run_id == run_id)\n with self.connect() as conn:\n conn.execute(query)\n\n def has_pipeline_snapshot(self, pipeline_snapshot_id: str) -> bool:\n check.str_param(pipeline_snapshot_id, "pipeline_snapshot_id")\n return self._has_snapshot_id(pipeline_snapshot_id)\n\n def add_pipeline_snapshot(\n self, pipeline_snapshot: PipelineSnapshot, snapshot_id: Optional[str] = None\n ) -> str:\n check.inst_param(pipeline_snapshot, "pipeline_snapshot", PipelineSnapshot)\n check.opt_str_param(snapshot_id, "snapshot_id")\n\n if not snapshot_id:\n snapshot_id = create_pipeline_snapshot_id(pipeline_snapshot)\n\n return self._add_snapshot(\n snapshot_id=snapshot_id,\n snapshot_obj=pipeline_snapshot,\n snapshot_type=SnapshotType.PIPELINE,\n )\n\n def get_pipeline_snapshot(self, pipeline_snapshot_id: str) -> PipelineSnapshot:\n check.str_param(pipeline_snapshot_id, "pipeline_snapshot_id")\n return self._get_snapshot(pipeline_snapshot_id)\n\n def has_execution_plan_snapshot(self, execution_plan_snapshot_id: str) -> bool:\n check.str_param(execution_plan_snapshot_id, "execution_plan_snapshot_id")\n return bool(self.get_execution_plan_snapshot(execution_plan_snapshot_id))\n\n def add_execution_plan_snapshot(\n self, execution_plan_snapshot: ExecutionPlanSnapshot, snapshot_id: Optional[str] = None\n ) -> str:\n check.inst_param(execution_plan_snapshot, "execution_plan_snapshot", ExecutionPlanSnapshot)\n check.opt_str_param(snapshot_id, "snapshot_id")\n\n if not 
snapshot_id:\n snapshot_id = create_execution_plan_snapshot_id(execution_plan_snapshot)\n\n return self._add_snapshot(\n snapshot_id=snapshot_id,\n snapshot_obj=execution_plan_snapshot,\n snapshot_type=SnapshotType.EXECUTION_PLAN,\n )\n\n def get_execution_plan_snapshot(self, execution_plan_snapshot_id: str) -> ExecutionPlanSnapshot:\n check.str_param(execution_plan_snapshot_id, "execution_plan_snapshot_id")\n return self._get_snapshot(execution_plan_snapshot_id)\n\n def _add_snapshot(self, snapshot_id: str, snapshot_obj, snapshot_type: SnapshotType) -> str:\n check.str_param(snapshot_id, "snapshot_id")\n check.not_none_param(snapshot_obj, "snapshot_obj")\n check.inst_param(snapshot_type, "snapshot_type", SnapshotType)\n\n with self.connect() as conn:\n snapshot_insert = (\n SnapshotsTable.insert().values( # pylint: disable=no-value-for-parameter\n snapshot_id=snapshot_id,\n snapshot_body=zlib.compress(\n serialize_dagster_namedtuple(snapshot_obj).encode("utf-8")\n ),\n snapshot_type=snapshot_type.value,\n )\n )\n conn.execute(snapshot_insert)\n return snapshot_id\n\n def get_run_storage_id(self) -> str:\n query = db.select([InstanceInfo.c.run_storage_id])\n row = self.fetchone(query)\n if not row:\n run_storage_id = str(uuid.uuid4())\n with self.connect() as conn:\n conn.execute(InstanceInfo.insert().values(run_storage_id=run_storage_id))\n return run_storage_id\n else:\n return row[0]\n\n def _has_snapshot_id(self, snapshot_id: str) -> bool:\n query = db.select([SnapshotsTable.c.snapshot_id]).where(\n SnapshotsTable.c.snapshot_id == snapshot_id\n )\n\n row = self.fetchone(query)\n\n return bool(row)\n\n def _get_snapshot(self, snapshot_id: str):\n query = db.select([SnapshotsTable.c.snapshot_body]).where(\n SnapshotsTable.c.snapshot_id == snapshot_id\n )\n\n row = self.fetchone(query)\n\n return defensively_unpack_pipeline_snapshot_query(logging, row) if row else None\n\n def get_run_partition_data(\n self, partition_set_name: str, job_name: str, repository_label: str\n ) -> List[RunPartitionData]:\n if self.has_built_index(RUN_PARTITIONS) and self.has_run_stats_index_cols():\n query = self._runs_query(\n filters=RunsFilter(\n pipeline_name=job_name,\n tags={\n PARTITION_SET_TAG: partition_set_name,\n },\n ),\n columns=["run_id", "status", "start_time", "end_time", "partition"],\n )\n rows = self.fetchall(query)\n\n # dedup by partition\n _partition_data_by_partition = {}\n for row in rows:\n if not row["partition"] or row["partition"] in _partition_data_by_partition:\n continue\n\n _partition_data_by_partition[row["partition"]] = RunPartitionData(\n run_id=row["run_id"],\n partition=row["partition"],\n status=DagsterRunStatus[row["status"]],\n start_time=row["start_time"],\n end_time=row["end_time"],\n )\n\n return list(_partition_data_by_partition.values())\n else:\n query = self._runs_query(\n filters=RunsFilter(\n pipeline_name=job_name,\n tags={\n PARTITION_SET_TAG: partition_set_name,\n },\n ),\n )\n rows = self.fetchall(query)\n _partition_data_by_partition = {}\n for row in rows:\n run = self._row_to_run(row)\n partition = run.tags.get(PARTITION_NAME_TAG)\n if not partition or partition in _partition_data_by_partition:\n continue\n\n _partition_data_by_partition[partition] = RunPartitionData(\n run_id=run.run_id,\n partition=partition,\n status=run.status,\n start_time=None,\n end_time=None,\n )\n\n return list(_partition_data_by_partition.values())\n\n def _get_partition_runs(\n self, partition_set_name: str, partition_name: str\n ) -> List[PipelineRun]:\n # utility method to 
help test reads off of the partition column\n if not self.has_built_index(RUN_PARTITIONS):\n # query by tags\n return self.get_runs(\n filters=RunsFilter(\n tags={\n PARTITION_SET_TAG: partition_set_name,\n PARTITION_NAME_TAG: partition_name,\n }\n )\n )\n else:\n query = (\n self._runs_query()\n .where(RunsTable.c.partition == partition_name)\n .where(RunsTable.c.partition_set == partition_set_name)\n )\n rows = self.fetchall(query)\n return self._rows_to_runs(rows)\n\n # Tracking data migrations over secondary indexes\n\n def _execute_data_migrations(\n self, migrations, print_fn: Optional[Callable] = None, force_rebuild_all: bool = False\n ):\n for migration_name, migration_fn in migrations.items():\n if self.has_built_index(migration_name):\n if not force_rebuild_all:\n if print_fn:\n print_fn(f"Skipping already applied data migration: {migration_name}")\n continue\n if print_fn:\n print_fn(f"Starting data migration: {migration_name}")\n migration_fn()(self, print_fn)\n self.mark_index_built(migration_name)\n if print_fn:\n print_fn(f"Finished data migration: {migration_name}")\n\n def migrate(self, print_fn: Optional[Callable] = None, force_rebuild_all: bool = False):\n self._execute_data_migrations(REQUIRED_DATA_MIGRATIONS, print_fn, force_rebuild_all)\n\n def optimize(self, print_fn: Optional[Callable] = None, force_rebuild_all: bool = False):\n self._execute_data_migrations(OPTIONAL_DATA_MIGRATIONS, print_fn, force_rebuild_all)\n\n def has_built_index(self, migration_name: str) -> bool:\n query = (\n db.select([1])\n .where(SecondaryIndexMigrationTable.c.name == migration_name)\n .where(SecondaryIndexMigrationTable.c.migration_completed != None)\n .limit(1)\n )\n with self.connect() as conn:\n results = conn.execute(query).fetchall()\n\n return len(results) > 0\n\n def mark_index_built(self, migration_name: str):\n query = (\n SecondaryIndexMigrationTable.insert().values( # pylint: disable=no-value-for-parameter\n name=migration_name,\n migration_completed=datetime.now(),\n )\n )\n with self.connect() as conn:\n try:\n conn.execute(query)\n except db.exc.IntegrityError:\n conn.execute(\n SecondaryIndexMigrationTable.update() # pylint: disable=no-value-for-parameter\n .where(SecondaryIndexMigrationTable.c.name == migration_name)\n .values(migration_completed=datetime.now())\n )\n\n # Checking for migrations\n\n def has_run_stats_index_cols(self):\n with self.connect() as conn:\n column_names = [x.get("name") for x in db.inspect(conn).get_columns(RunsTable.name)]\n return "start_time" in column_names and "end_time" in column_names\n\n # Daemon heartbeats\n\n def add_daemon_heartbeat(self, daemon_heartbeat: DaemonHeartbeat):\n with self.connect() as conn:\n\n # insert, or update if already present\n try:\n conn.execute(\n DaemonHeartbeatsTable.insert().values( # pylint: disable=no-value-for-parameter\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_type=daemon_heartbeat.daemon_type,\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_dagster_namedtuple(daemon_heartbeat),\n )\n )\n except db.exc.IntegrityError:\n conn.execute(\n DaemonHeartbeatsTable.update() # pylint: disable=no-value-for-parameter\n .where(DaemonHeartbeatsTable.c.daemon_type == daemon_heartbeat.daemon_type)\n .values( # pylint: disable=no-value-for-parameter\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_dagster_namedtuple(daemon_heartbeat),\n )\n )\n\n def get_daemon_heartbeats(self) -> Dict[str, 
DaemonHeartbeat]:\n\n with self.connect() as conn:\n rows = conn.execute(db.select(DaemonHeartbeatsTable.columns))\n heartbeats = []\n for row in rows:\n heartbeats.append(deserialize_as(row.body, DaemonHeartbeat))\n return {heartbeat.daemon_type: heartbeat for heartbeat in heartbeats}\n\n def wipe(self):\n """Clears the run storage."""\n with self.connect() as conn:\n # https://stackoverflow.com/a/54386260/324449\n conn.execute(RunsTable.delete()) # pylint: disable=no-value-for-parameter\n conn.execute(RunTagsTable.delete()) # pylint: disable=no-value-for-parameter\n conn.execute(SnapshotsTable.delete()) # pylint: disable=no-value-for-parameter\n conn.execute(DaemonHeartbeatsTable.delete()) # pylint: disable=no-value-for-parameter\n\n def wipe_daemon_heartbeats(self):\n with self.connect() as conn:\n # https://stackoverflow.com/a/54386260/324449\n conn.execute(DaemonHeartbeatsTable.delete()) # pylint: disable=no-value-for-parameter\n\n def get_backfills(\n self,\n status: Optional[BulkActionStatus] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> List[PartitionBackfill]:\n check.opt_inst_param(status, "status", BulkActionStatus)\n query = db.select([BulkActionsTable.c.body])\n if status:\n query = query.where(BulkActionsTable.c.status == status.value)\n if cursor:\n cursor_query = db.select([BulkActionsTable.c.id]).where(\n BulkActionsTable.c.key == cursor\n )\n query = query.where(BulkActionsTable.c.id < cursor_query)\n if limit:\n query = query.limit(limit)\n query = query.order_by(BulkActionsTable.c.id.desc())\n rows = self.fetchall(query)\n return [deserialize_as(row[0], PartitionBackfill) for row in rows]\n\n def get_backfill(self, backfill_id: str) -> Optional[PartitionBackfill]:\n check.str_param(backfill_id, "backfill_id")\n query = db.select([BulkActionsTable.c.body]).where(BulkActionsTable.c.key == backfill_id)\n row = self.fetchone(query)\n return deserialize_as(row[0], PartitionBackfill) if row else None\n\n def add_backfill(self, partition_backfill: PartitionBackfill):\n check.inst_param(partition_backfill, "partition_backfill", PartitionBackfill)\n with self.connect() as conn:\n conn.execute(\n BulkActionsTable.insert().values( # pylint: disable=no-value-for-parameter\n key=partition_backfill.backfill_id,\n status=partition_backfill.status.value,\n timestamp=utc_datetime_from_timestamp(partition_backfill.backfill_timestamp),\n body=serialize_dagster_namedtuple(partition_backfill),\n )\n )\n\n def update_backfill(self, partition_backfill: PartitionBackfill):\n check.inst_param(partition_backfill, "partition_backfill", PartitionBackfill)\n backfill_id = partition_backfill.backfill_id\n if not self.get_backfill(backfill_id):\n raise DagsterInvariantViolationError(\n f"Backfill {backfill_id} is not present in storage"\n )\n with self.connect() as conn:\n conn.execute(\n BulkActionsTable.update() # pylint: disable=no-value-for-parameter\n .where(BulkActionsTable.c.key == backfill_id)\n .values(\n status=partition_backfill.status.value,\n body=serialize_dagster_namedtuple(partition_backfill),\n )\n )
\n\n\nGET_PIPELINE_SNAPSHOT_QUERY_ID = "get-pipeline-snapshot"\n\n\ndef defensively_unpack_pipeline_snapshot_query(logger, row):\n # no checking here because sqlalchemy returns a special\n # row proxy and don't want to instance check on an internal\n # implementation detail\n\n def _warn(msg):\n logger.warning("get-pipeline-snapshot: {msg}".format(msg=msg))\n\n if not isinstance(row[0], bytes):\n _warn("First entry in row is not a binary type.")\n return None\n\n try:\n uncompressed_bytes = zlib.decompress(row[0])\n except zlib.error:\n _warn("Could not decompress bytes stored in snapshot table.")\n return None\n\n try:\n decoded_str = uncompressed_bytes.decode("utf-8")\n except UnicodeDecodeError:\n _warn("Could not unicode decode decompressed bytes stored in snapshot table.")\n return None\n\n try:\n return deserialize_json_to_dagster_namedtuple(decoded_str)\n except JSONDecodeError:\n _warn("Could not parse json in snapshot table.")\n return None\n
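# --- Hypothetical usage sketch (not part of this module) ---
# `_add_cursor_limit_to_query` above pages by the storage id behind the
# `cursor` run_id, ordered descending by default, so the run_id of the last row
# of one page is the cursor for the next. A small generator built on that
# contract (the helper name `iterate_runs` is illustrative):

from typing import Iterator, Optional

from dagster.core.storage.pipeline_run import PipelineRun, RunsFilter
from dagster.core.storage.runs.base import RunStorage


def iterate_runs(
    storage: RunStorage, filters: Optional[RunsFilter] = None, page_size: int = 100
) -> Iterator[PipelineRun]:
    """Yield every run matching `filters`, fetching `page_size` rows per query."""
    cursor = None
    while True:
        page = list(storage.get_runs(filters=filters, cursor=cursor, limit=page_size))
        if not page:
            return
        yield from page
        # Last run_id of this page becomes the cursor for the next query.
        cursor = page[-1].run_id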
", "current_page_name": "_modules/dagster/core/storage/runs/sql_run_storage", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.runs.sql_run_storage"}, "sqlite": {"sqlite_run_storage": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.runs.sqlite.sqlite_run_storage

\nimport os\nfrom contextlib import contextmanager\nfrom urllib.parse import urljoin, urlparse\n\nimport sqlalchemy as db\nfrom sqlalchemy.pool import NullPool\n\nfrom dagster import StringSource\nfrom dagster import _check as check\nfrom dagster.core.storage.sql import (\n    check_alembic_revision,\n    create_engine,\n    get_alembic_config,\n    run_alembic_downgrade,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster.core.storage.sqlite import create_db_conn_string, get_sqlite_version\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster.utils import mkdir_p\n\nfrom ..schema import InstanceInfo, RunStorageSqlMetadata, RunTagsTable, RunsTable\nfrom ..sql_run_storage import SqlRunStorage\n\nMINIMUM_SQLITE_BUCKET_VERSION = [3, 25, 0]\n\n\n
[docs]class SqliteRunStorage(SqlRunStorage, ConfigurableClass):\n """SQLite-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagit`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n This is the default run storage when none is specified in the ``dagster.yaml``.\n\n To explicitly specify SQLite for run storage, you can add a block such as the following to your\n ``dagster.yaml``:\n\n .. code-block:: YAML\n\n run_storage:\n module: dagster.core.storage.runs\n class: SqliteRunStorage\n config:\n base_dir: /path/to/dir\n\n The ``base_dir`` param tells the run storage where on disk to store the database.\n """\n\n def __init__(self, conn_string, inst_data=None):\n check.str_param(conn_string, "conn_string")\n self._conn_string = conn_string\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n super().__init__()\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {"base_dir": StringSource}\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return SqliteRunStorage.from_local(inst_data=inst_data, **config_value)\n\n @classmethod\n def from_local(cls, base_dir, inst_data=None):\n check.str_param(base_dir, "base_dir")\n mkdir_p(base_dir)\n conn_string = create_db_conn_string(base_dir, "runs")\n engine = create_engine(conn_string, poolclass=NullPool)\n alembic_config = get_alembic_config(__file__)\n\n should_mark_indexes = False\n with engine.connect() as connection:\n db_revision, head_revision = check_alembic_revision(alembic_config, connection)\n if not (db_revision and head_revision):\n RunStorageSqlMetadata.create_all(engine)\n engine.execute("PRAGMA journal_mode=WAL;")\n stamp_alembic_rev(alembic_config, connection)\n should_mark_indexes = True\n\n table_names = db.inspect(engine).get_table_names()\n if "instance_info" not in table_names:\n InstanceInfo.create(engine)\n\n run_storage = cls(conn_string, inst_data)\n\n if should_mark_indexes:\n run_storage.migrate()\n run_storage.optimize()\n\n return run_storage\n\n @contextmanager\n def connect(self):\n engine = create_engine(self._conn_string, poolclass=NullPool)\n conn = engine.connect()\n try:\n yield conn\n finally:\n conn.close()\n\n def _alembic_upgrade(self, rev="head"):\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_upgrade(alembic_config, conn, rev=rev)\n\n def _alembic_downgrade(self, rev="head"):\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_downgrade(alembic_config, conn, rev=rev)\n\n @property\n def supports_bucket_queries(self):\n parts = get_sqlite_version().split(".")\n try:\n for i in range(min(len(parts), len(MINIMUM_SQLITE_BUCKET_VERSION))):\n curr = int(parts[i])\n if curr < MINIMUM_SQLITE_BUCKET_VERSION[i]:\n return False\n if curr > MINIMUM_SQLITE_BUCKET_VERSION[i]:\n return True\n except ValueError:\n return False\n\n return False\n\n def upgrade(self):\n self._check_for_version_066_migration_and_perform()\n self._alembic_upgrade()\n\n # In version 0.6.6, we changed the layout of the of the sqllite dbs on disk\n # to move from the root of DAGSTER_HOME/runs.db to DAGSTER_HOME/history/runs.bd\n # This function checks for that condition and does the move\n def _check_for_version_066_migration_and_perform(self):\n 
old_conn_string = "sqlite://" + urljoin(urlparse(self._conn_string).path, "../runs.db")\n path_to_old_db = urlparse(old_conn_string).path\n # sqlite URLs look like `sqlite:///foo/bar/baz on Unix/Mac` but on Windows they look like\n # `sqlite:///D:/foo/bar/baz` (or `sqlite:///D:\\foo\\bar\\baz`)\n if os.name == "nt":\n path_to_old_db = path_to_old_db.lstrip("/")\n if os.path.exists(path_to_old_db):\n old_storage = SqliteRunStorage(old_conn_string)\n old_runs = old_storage.get_runs()\n for run in old_runs:\n self.add_run(run)\n os.unlink(path_to_old_db)\n\n def delete_run(self, run_id):\n """Override the default sql delete run implementation until we can get full\n support on cascading deletes"""\n check.str_param(run_id, "run_id")\n remove_tags = db.delete(RunTagsTable).where(RunTagsTable.c.run_id == run_id)\n remove_run = db.delete(RunsTable).where(RunsTable.c.run_id == run_id)\n with self.connect() as conn:\n conn.execute(remove_tags)\n conn.execute(remove_run)\n\n def alembic_version(self):\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
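# --- Hypothetical usage sketch (not part of this module) ---
# `from_local` above creates the SQLite database under `base_dir`, stamps the
# alembic revision, and runs data migrations, which makes it convenient as a
# throwaway local run storage for tests or inspection. The import path matches
# the `module`/`class` pair shown in the dagster.yaml example in the docstring.

import tempfile

from dagster.core.storage.runs import SqliteRunStorage

base_dir = tempfile.mkdtemp()
storage = SqliteRunStorage.from_local(base_dir)

print(storage.get_runs_count())   # 0 for a freshly created database
print(storage.alembic_version())  # (db_revision, head_revision) per check_alembic_revision
storage.dispose()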
", "current_page_name": "_modules/dagster/core/storage/runs/sqlite/sqlite_run_storage", "customsidebar": null, "parents": [{"link": "../../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.runs.sqlite.sqlite_run_storage"}}}, "schedules": {"base": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.schedules.base

\nimport abc\nfrom typing import Callable, Iterable, List, Mapping, Optional, Sequence\n\nfrom dagster.core.definitions.run_request import InstigatorType\nfrom dagster.core.instance import MayHaveInstanceWeakref\nfrom dagster.core.scheduler.instigation import InstigatorState, InstigatorTick, TickData, TickStatus\n\n\n
[docs]class ScheduleStorage(abc.ABC, MayHaveInstanceWeakref):\n """Abstract class for managing persistance of scheduler artifacts"""\n\n @abc.abstractmethod\n def wipe(self):\n """Delete all schedules from storage"""\n\n @abc.abstractmethod\n def all_instigator_state(\n self,\n repository_origin_id: Optional[str] = None,\n repository_selector_id: Optional[str] = None,\n instigator_type: Optional[InstigatorType] = None,\n ) -> Iterable[InstigatorState]:\n """Return all InstigationStates present in storage\n\n Args:\n repository_origin_id (Optional[str]): The ExternalRepository target id to scope results to\n repository_selector_id (Optional[str]): The repository selector id to scope results to\n instigator_type (Optional[InstigatorType]): The InstigatorType to scope results to\n """\n\n @abc.abstractmethod\n def get_instigator_state(self, origin_id: str, selector_id: str) -> Optional[InstigatorState]:\n """Return the instigator state for the given id\n\n Args:\n origin_id (str): The unique instigator identifier\n selector_id (str): The logical instigator identifier\n """\n\n @abc.abstractmethod\n def add_instigator_state(self, state: InstigatorState):\n """Add an instigator state to storage.\n\n Args:\n state (InstigatorState): The state to add\n """\n\n @abc.abstractmethod\n def update_instigator_state(self, state: InstigatorState):\n """Update an instigator state in storage.\n\n Args:\n state (InstigatorState): The state to update\n """\n\n @abc.abstractmethod\n def delete_instigator_state(self, origin_id: str, selector_id: str):\n """Delete a state in storage.\n\n Args:\n origin_id (str): The id of the instigator target to delete\n selector_id (str): The logical instigator identifier\n """\n\n @property\n def supports_batch_queries(self):\n return False\n\n def get_batch_ticks(\n self,\n selector_ids: Sequence[str],\n limit: Optional[int] = None,\n statuses: Optional[Sequence[TickStatus]] = None,\n ) -> Mapping[str, Iterable[InstigatorTick]]:\n raise NotImplementedError()\n\n @abc.abstractmethod\n def get_ticks(\n self,\n origin_id: str,\n selector_id: str,\n before: Optional[float] = None,\n after: Optional[float] = None,\n limit: Optional[int] = None,\n statuses: Optional[List[TickStatus]] = None,\n ) -> Iterable[InstigatorTick]:\n """Get the ticks for a given instigator.\n\n Args:\n origin_id (str): The id of the instigator target\n selector_id (str): The logical instigator identifier\n """\n\n @abc.abstractmethod\n def create_tick(self, tick_data: TickData):\n """Add a tick to storage.\n\n Args:\n tick_data (TickData): The tick to add\n """\n\n @abc.abstractmethod\n def update_tick(self, tick: InstigatorTick):\n """Update a tick already in storage.\n\n Args:\n tick (InstigatorTick): The tick to update\n """\n\n @abc.abstractmethod\n def purge_ticks(self, origin_id: str, selector_id: str, tick_status: TickStatus, before: float):\n """Wipe ticks for an instigator for a certain status and timestamp.\n\n Args:\n origin_id (str): The id of the instigator target to delete\n selector_id (str): The logical instigator identifier\n tick_status (TickStatus): The tick status to wipe\n before (datetime): All ticks before this datetime will get purged\n """\n\n @abc.abstractmethod\n def upgrade(self):\n """Perform any needed migrations"""\n\n def migrate(self, print_fn: Optional[Callable] = None, force_rebuild_all: bool = False):\n """Call this method to run any required data migrations"""\n\n def optimize(self, print_fn: Optional[Callable] = None, force_rebuild_all: bool = False):\n """Call this 
method to run any optional data migrations for optimized reads"""\n\n    def optimize_for_dagit(self, statement_timeout: int):\n        """Allows for optimizing database connection / use in the context of a long-lived dagit process"""\n\n    def alembic_version(self):\n        return None
\n
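# Illustrative sketch (not part of dagster): a minimal in-memory ScheduleStorage
# subclass showing which abstract methods a concrete storage must provide. It assumes
# the import paths shown on this page; the class name is hypothetical, repository-level
# filtering is omitted, and a real implementation would also need durability and
# migration support.
from typing import Dict, List, Tuple

from dagster.core.scheduler.instigation import InstigatorState, InstigatorTick, TickData, TickStatus
from dagster.core.storage.schedules.base import ScheduleStorage


class InMemoryScheduleStorage(ScheduleStorage):
    def __init__(self):
        super().__init__()
        self._states: Dict[Tuple[str, str], InstigatorState] = {}
        self._ticks: Dict[Tuple[str, str], List[InstigatorTick]] = {}
        self._tick_id = 0

    def wipe(self):
        self._states.clear()
        self._ticks.clear()

    def all_instigator_state(
        self, repository_origin_id=None, repository_selector_id=None, instigator_type=None
    ):
        return [
            state
            for state in self._states.values()
            if instigator_type is None or state.instigator_type == instigator_type
        ]

    def get_instigator_state(self, origin_id, selector_id):
        return self._states.get((origin_id, selector_id))

    def add_instigator_state(self, state: InstigatorState):
        self._states[(state.instigator_origin_id, state.selector_id)] = state
        return state

    def update_instigator_state(self, state: InstigatorState):
        self._states[(state.instigator_origin_id, state.selector_id)] = state

    def delete_instigator_state(self, origin_id, selector_id):
        self._states.pop((origin_id, selector_id), None)

    def get_ticks(self, origin_id, selector_id, before=None, after=None, limit=None, statuses=None):
        ticks = self._ticks.get((origin_id, selector_id), [])
        if statuses:
            ticks = [tick for tick in ticks if tick.status in statuses]
        if before is not None:
            ticks = [tick for tick in ticks if tick.timestamp < before]
        if after is not None:
            ticks = [tick for tick in ticks if tick.timestamp > after]
        return ticks[:limit] if limit else ticks

    def create_tick(self, tick_data: TickData):
        self._tick_id += 1
        tick = InstigatorTick(self._tick_id, tick_data)
        self._ticks.setdefault((tick_data.instigator_origin_id, tick_data.selector_id), []).append(tick)
        return tick

    def update_tick(self, tick: InstigatorTick):
        return tick

    def purge_ticks(self, origin_id, selector_id, tick_status: TickStatus, before: float):
        self._ticks[(origin_id, selector_id)] = [
            tick
            for tick in self._ticks.get((origin_id, selector_id), [])
            if not (tick.status == tick_status and tick.timestamp < before)
        ]

    def upgrade(self):
        pass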
", "current_page_name": "_modules/dagster/core/storage/schedules/base", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.schedules.base"}, "sql_schedule_storage": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.schedules.sql_schedule_storage

\nfrom abc import abstractmethod\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom typing import Callable, Iterable, Mapping, Optional, Sequence, cast\n\nimport pendulum\nimport sqlalchemy as db\n\nimport dagster._check as check\nfrom dagster.core.definitions.run_request import InstigatorType\nfrom dagster.core.errors import DagsterInvariantViolationError\nfrom dagster.core.scheduler.instigation import InstigatorState, InstigatorTick, TickData, TickStatus\nfrom dagster.serdes import deserialize_json_to_dagster_namedtuple, serialize_dagster_namedtuple\nfrom dagster.utils import utc_datetime_from_timestamp\n\nfrom .base import ScheduleStorage\nfrom .migration import (\n    OPTIONAL_SCHEDULE_DATA_MIGRATIONS,\n    REQUIRED_SCHEDULE_DATA_MIGRATIONS,\n    SCHEDULE_JOBS_SELECTOR_ID,\n    SCHEDULE_TICKS_SELECTOR_ID,\n)\nfrom .schema import InstigatorsTable, JobTable, JobTickTable, SecondaryIndexMigrationTable\n\n\n
[docs]class SqlScheduleStorage(ScheduleStorage):\n """Base class for SQL backed schedule storage"""\n\n @abstractmethod\n def connect(self):\n """Context manager yielding a sqlalchemy.engine.Connection."""\n\n def execute(self, query):\n with self.connect() as conn:\n result_proxy = conn.execute(query)\n res = result_proxy.fetchall()\n result_proxy.close()\n\n return res\n\n def _deserialize_rows(self, rows):\n return list(map(lambda r: deserialize_json_to_dagster_namedtuple(r[0]), rows))\n\n def all_instigator_state(\n self, repository_origin_id=None, repository_selector_id=None, instigator_type=None\n ):\n check.opt_inst_param(instigator_type, "instigator_type", InstigatorType)\n\n if self.has_instigators_table() and self.has_built_index(SCHEDULE_JOBS_SELECTOR_ID):\n query = db.select([InstigatorsTable.c.instigator_body]).select_from(InstigatorsTable)\n if repository_selector_id:\n query = query.where(\n InstigatorsTable.c.repository_selector_id == repository_selector_id\n )\n if instigator_type:\n query = query.where(InstigatorsTable.c.instigator_type == instigator_type.value)\n else:\n query = db.select([JobTable.c.job_body]).select_from(JobTable)\n if repository_origin_id:\n query = query.where(JobTable.c.repository_origin_id == repository_origin_id)\n if instigator_type:\n query = query.where(JobTable.c.job_type == instigator_type.value)\n\n rows = self.execute(query)\n return self._deserialize_rows(rows)\n\n def get_instigator_state(self, origin_id, selector_id):\n check.str_param(origin_id, "origin_id")\n check.str_param(selector_id, "selector_id")\n\n if self.has_instigators_table() and self.has_built_index(SCHEDULE_JOBS_SELECTOR_ID):\n query = (\n db.select([InstigatorsTable.c.instigator_body])\n .select_from(InstigatorsTable)\n .where(InstigatorsTable.c.selector_id == selector_id)\n )\n else:\n query = (\n db.select([JobTable.c.job_body])\n .select_from(JobTable)\n .where(JobTable.c.job_origin_id == origin_id)\n )\n\n rows = self.execute(query)\n return self._deserialize_rows(rows[:1])[0] if len(rows) else None\n\n def _has_instigator_state_by_selector(self, selector_id):\n check.str_param(selector_id, "selector_id")\n\n query = (\n db.select([JobTable.c.job_body])\n .select_from(JobTable)\n .where(JobTable.c.selector_id == selector_id)\n )\n\n rows = self.execute(query)\n return self._deserialize_rows(rows[:1])[0] if len(rows) else None\n\n def _add_or_update_instigators_table(self, conn, state):\n selector_id = state.selector_id\n try:\n conn.execute(\n InstigatorsTable.insert().values( # pylint: disable=no-value-for-parameter\n selector_id=selector_id,\n repository_selector_id=state.repository_selector_id,\n status=state.status.value,\n instigator_type=state.instigator_type.value,\n instigator_body=serialize_dagster_namedtuple(state),\n )\n )\n except db.exc.IntegrityError:\n conn.execute(\n InstigatorsTable.update()\n .where(InstigatorsTable.c.selector_id == selector_id)\n .values(\n status=state.status.value,\n instigator_type=state.instigator_type.value,\n instigator_body=serialize_dagster_namedtuple(state),\n update_timestamp=pendulum.now("UTC"),\n )\n )\n\n def add_instigator_state(self, state):\n check.inst_param(state, "state", InstigatorState)\n with self.connect() as conn:\n try:\n conn.execute(\n JobTable.insert().values( # pylint: disable=no-value-for-parameter\n job_origin_id=state.instigator_origin_id,\n repository_origin_id=state.repository_origin_id,\n status=state.status.value,\n job_type=state.instigator_type.value,\n 
job_body=serialize_dagster_namedtuple(state),\n )\n )\n except db.exc.IntegrityError as exc:\n raise DagsterInvariantViolationError(\n f"InstigatorState {state.instigator_origin_id} is already present in storage"\n ) from exc\n\n # try writing to the instigators table\n if self._has_instigators_table(conn):\n self._add_or_update_instigators_table(conn, state)\n\n return state\n\n def update_instigator_state(self, state):\n check.inst_param(state, "state", InstigatorState)\n if not self.get_instigator_state(state.instigator_origin_id, state.selector_id):\n raise DagsterInvariantViolationError(\n "InstigatorState {id} is not present in storage".format(\n id=state.instigator_origin_id\n )\n )\n\n values = {\n "status": state.status.value,\n "job_body": serialize_dagster_namedtuple(state),\n "update_timestamp": pendulum.now("UTC"),\n }\n if self.has_instigators_table():\n values["selector_id"] = state.selector_id\n\n with self.connect() as conn:\n conn.execute(\n JobTable.update() # pylint: disable=no-value-for-parameter\n .where(JobTable.c.job_origin_id == state.instigator_origin_id)\n .values(**values)\n )\n if self._has_instigators_table(conn):\n self._add_or_update_instigators_table(conn, state)\n\n def delete_instigator_state(self, origin_id, selector_id):\n check.str_param(origin_id, "origin_id")\n check.str_param(selector_id, "selector_id")\n\n if not self.get_instigator_state(origin_id, selector_id):\n raise DagsterInvariantViolationError(\n "InstigatorState {id} is not present in storage".format(id=origin_id)\n )\n\n with self.connect() as conn:\n conn.execute(\n JobTable.delete().where( # pylint: disable=no-value-for-parameter\n JobTable.c.job_origin_id == origin_id\n )\n )\n\n if self._has_instigators_table(conn):\n if not self._jobs_has_selector_state(conn, selector_id):\n conn.execute(\n InstigatorsTable.delete().where( # pylint: disable=no-value-for-parameter\n InstigatorsTable.c.selector_id == selector_id\n )\n )\n\n def _jobs_has_selector_state(self, conn, selector_id):\n query = (\n db.select([db.func.count()])\n .select_from(JobTable)\n .where(JobTable.c.selector_id == selector_id)\n )\n result = conn.execute(query)\n row = result.fetchone()\n result.close()\n return row[0] > 0\n\n def _add_filter_limit(self, query, before=None, after=None, limit=None, statuses=None):\n check.opt_float_param(before, "before")\n check.opt_float_param(after, "after")\n check.opt_int_param(limit, "limit")\n check.opt_list_param(statuses, "statuses", of_type=TickStatus)\n\n if before:\n query = query.where(JobTickTable.c.timestamp < utc_datetime_from_timestamp(before))\n if after:\n query = query.where(JobTickTable.c.timestamp > utc_datetime_from_timestamp(after))\n if limit:\n query = query.limit(limit)\n if statuses:\n query = query.where(JobTickTable.c.status.in_([status.value for status in statuses]))\n return query\n\n @property\n def supports_batch_queries(self):\n return self.has_instigators_table() and self.has_built_index(SCHEDULE_TICKS_SELECTOR_ID)\n\n def has_instigators_table(self):\n with self.connect() as conn:\n return self._has_instigators_table(conn)\n\n def _has_instigators_table(self, conn):\n table_names = db.inspect(conn).get_table_names()\n return "instigators" in table_names\n\n def get_batch_ticks(\n self,\n selector_ids: Sequence[str],\n limit: Optional[int] = None,\n statuses: Optional[Sequence[TickStatus]] = None,\n ) -> Mapping[str, Iterable[InstigatorTick]]:\n check.list_param(selector_ids, "selector_ids", of_type=str)\n check.opt_int_param(limit, "limit")\n 
check.opt_list_param(statuses, "statuses", of_type=TickStatus)\n\n bucket_rank_column = (\n db.func.rank()\n .over(\n order_by=db.desc(JobTickTable.c.timestamp),\n partition_by=JobTickTable.c.selector_id,\n )\n .label("rank")\n )\n subquery = (\n db.select(\n [\n JobTickTable.c.id,\n JobTickTable.c.selector_id,\n JobTickTable.c.tick_body,\n bucket_rank_column,\n ]\n )\n .select_from(JobTickTable)\n .where(JobTickTable.c.selector_id.in_(selector_ids))\n .alias("subquery")\n )\n if statuses:\n subquery = subquery.where(\n JobTickTable.c.status.in_([status.value for status in statuses])\n )\n\n query = (\n db.select([subquery.c.id, subquery.c.selector_id, subquery.c.tick_body])\n .order_by(subquery.c.rank.asc())\n .where(subquery.c.rank <= limit)\n )\n\n rows = self.execute(query)\n results = defaultdict(list)\n for row in rows:\n tick_id = row[0]\n selector_id = row[1]\n tick_data = cast(TickData, deserialize_json_to_dagster_namedtuple(row[2]))\n results[selector_id].append(InstigatorTick(tick_id, tick_data))\n return results\n\n def get_ticks(self, origin_id, selector_id, before=None, after=None, limit=None, statuses=None):\n check.str_param(origin_id, "origin_id")\n check.opt_float_param(before, "before")\n check.opt_float_param(after, "after")\n check.opt_int_param(limit, "limit")\n check.opt_list_param(statuses, "statuses", of_type=TickStatus)\n\n base_query = (\n db.select([JobTickTable.c.id, JobTickTable.c.tick_body])\n .select_from(JobTickTable)\n .order_by(JobTickTable.c.timestamp.desc())\n )\n if self.has_instigators_table():\n query = base_query.where(\n db.or_(\n JobTickTable.c.selector_id == selector_id,\n db.and_(\n JobTickTable.c.selector_id == None,\n JobTickTable.c.job_origin_id == origin_id,\n ),\n )\n )\n else:\n query = base_query.where(JobTickTable.c.job_origin_id == origin_id)\n\n query = self._add_filter_limit(\n query, before=before, after=after, limit=limit, statuses=statuses\n )\n\n rows = self.execute(query)\n return list(\n map(lambda r: InstigatorTick(r[0], deserialize_json_to_dagster_namedtuple(r[1])), rows)\n )\n\n def create_tick(self, tick_data):\n check.inst_param(tick_data, "tick_data", TickData)\n\n values = {\n "job_origin_id": tick_data.instigator_origin_id,\n "status": tick_data.status.value,\n "type": tick_data.instigator_type.value,\n "timestamp": utc_datetime_from_timestamp(tick_data.timestamp),\n "tick_body": serialize_dagster_namedtuple(tick_data),\n }\n if self.has_instigators_table() and tick_data.selector_id:\n values["selector_id"] = tick_data.selector_id\n\n with self.connect() as conn:\n try:\n tick_insert = JobTickTable.insert().values(\n **values\n ) # pylint: disable=no-value-for-parameter\n result = conn.execute(tick_insert)\n tick_id = result.inserted_primary_key[0]\n return InstigatorTick(tick_id, tick_data)\n except db.exc.IntegrityError as exc:\n raise DagsterInvariantViolationError(\n f"Unable to insert InstigatorTick for job {tick_data.instigator_name} in storage"\n ) from exc\n\n def update_tick(self, tick):\n check.inst_param(tick, "tick", InstigatorTick)\n\n values = {\n "status": tick.status.value,\n "type": tick.instigator_type.value,\n "timestamp": utc_datetime_from_timestamp(tick.timestamp),\n "tick_body": serialize_dagster_namedtuple(tick.tick_data),\n }\n if self.has_instigators_table() and tick.selector_id:\n values["selector_id"] = tick.selector_id\n\n with self.connect() as conn:\n conn.execute(\n JobTickTable.update() # pylint: disable=no-value-for-parameter\n .where(JobTickTable.c.id == tick.tick_id)\n 
.values(**values)\n )\n\n return tick\n\n def purge_ticks(self, origin_id, selector_id, tick_status, before):\n check.str_param(origin_id, "origin_id")\n check.inst_param(tick_status, "tick_status", TickStatus)\n check.float_param(before, "before")\n\n utc_before = utc_datetime_from_timestamp(before)\n\n base_query = (\n JobTickTable.delete() # pylint: disable=no-value-for-parameter\n .where(JobTickTable.c.status == tick_status.value)\n .where(JobTickTable.c.timestamp < utc_before)\n )\n\n if self.has_instigators_table():\n query = base_query.where(\n db.or_(\n JobTickTable.c.selector_id == selector_id,\n db.and_(\n JobTickTable.c.selector_id == None,\n JobTickTable.c.job_origin_id == origin_id,\n ),\n )\n )\n else:\n query = base_query.where(JobTickTable.c.job_origin_id == origin_id)\n\n with self.connect() as conn:\n conn.execute(query)\n\n def wipe(self):\n """Clears the schedule storage."""\n with self.connect() as conn:\n # https://stackoverflow.com/a/54386260/324449\n conn.execute(JobTable.delete()) # pylint: disable=no-value-for-parameter\n conn.execute(JobTickTable.delete()) # pylint: disable=no-value-for-parameter\n if self._has_instigators_table(conn):\n conn.execute(InstigatorsTable.delete())\n\n # MIGRATIONS\n\n def has_secondary_index_table(self):\n with self.connect() as conn:\n return "secondary_indexes" in db.inspect(conn).get_table_names()\n\n def has_built_index(self, migration_name: str) -> bool:\n if not self.has_secondary_index_table():\n return False\n\n query = (\n db.select([1])\n .where(SecondaryIndexMigrationTable.c.name == migration_name)\n .where(SecondaryIndexMigrationTable.c.migration_completed != None)\n .limit(1)\n )\n with self.connect() as conn:\n results = conn.execute(query).fetchall()\n\n return len(results) > 0\n\n def mark_index_built(self, migration_name: str):\n query = (\n SecondaryIndexMigrationTable.insert().values( # pylint: disable=no-value-for-parameter\n name=migration_name,\n migration_completed=datetime.now(),\n )\n )\n with self.connect() as conn:\n try:\n conn.execute(query)\n except db.exc.IntegrityError:\n conn.execute(\n SecondaryIndexMigrationTable.update() # pylint: disable=no-value-for-parameter\n .where(SecondaryIndexMigrationTable.c.name == migration_name)\n .values(migration_completed=datetime.now())\n )\n\n def _execute_data_migrations(\n self, migrations, print_fn: Optional[Callable] = None, force_rebuild_all: bool = False\n ):\n for migration_name, migration_fn in migrations.items():\n if self.has_built_index(migration_name):\n if not force_rebuild_all:\n if print_fn:\n print_fn("Skipping already applied migration: {}".format(migration_name))\n continue\n if print_fn:\n print_fn(f"Starting data migration: {migration_name}")\n migration_fn()(self, print_fn)\n self.mark_index_built(migration_name)\n if print_fn:\n print_fn(f"Finished data migration: {migration_name}")\n\n def migrate(self, print_fn: Optional[Callable] = None, force_rebuild_all: bool = False):\n self._execute_data_migrations(\n REQUIRED_SCHEDULE_DATA_MIGRATIONS, print_fn, force_rebuild_all\n )\n\n def optimize(self, print_fn: Optional[Callable] = None, force_rebuild_all: bool = False):\n self._execute_data_migrations(\n OPTIONAL_SCHEDULE_DATA_MIGRATIONS, print_fn, force_rebuild_all\n )
\n
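# Illustrative sketch (hypothetical class and connection string): a SqlScheduleStorage
# subclass only has to supply connect() -- a context manager yielding a SQLAlchemy
# Connection -- plus the upgrade() hook inherited from ScheduleStorage; the query logic
# above is reused as-is. Creating the schedule-storage tables / running migrations is
# intentionally omitted here.
from contextlib import contextmanager

import sqlalchemy as db

from dagster.core.storage.schedules.sql_schedule_storage import SqlScheduleStorage


class EngineBackedScheduleStorage(SqlScheduleStorage):
    def __init__(self, conn_string):
        super().__init__()
        self._engine = db.create_engine(conn_string)

    @contextmanager
    def connect(self):
        conn = self._engine.connect()
        try:
            yield conn
        finally:
            conn.close()

    def upgrade(self):
        # Schema / alembic migrations would run here; no-op in this sketch.
        pass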
", "current_page_name": "_modules/dagster/core/storage/schedules/sql_schedule_storage", "customsidebar": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.schedules.sql_schedule_storage"}, "sqlite": {"sqlite_schedule_storage": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.storage.schedules.sqlite.sqlite_schedule_storage

\nfrom contextlib import contextmanager\n\nfrom packaging.version import parse\nfrom sqlalchemy.pool import NullPool\n\nfrom dagster import StringSource\nfrom dagster import _check as check\nfrom dagster.core.storage.sql import (\n    check_alembic_revision,\n    create_engine,\n    get_alembic_config,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster.core.storage.sqlite import create_db_conn_string, get_sqlite_version\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster.utils import mkdir_p\n\nfrom ..schema import ScheduleStorageSqlMetadata\nfrom ..sql_schedule_storage import SqlScheduleStorage\n\nMINIMUM_SQLITE_BATCH_VERSION = "3.25.0"\n\n\n
[docs]class SqliteScheduleStorage(SqlScheduleStorage, ConfigurableClass):\n """Local SQLite backed schedule storage"""\n\n def __init__(self, conn_string, inst_data=None):\n check.str_param(conn_string, "conn_string")\n self._conn_string = conn_string\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n super().__init__()\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {"base_dir": StringSource}\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return SqliteScheduleStorage.from_local(inst_data=inst_data, **config_value)\n\n @classmethod\n def from_local(cls, base_dir, inst_data=None):\n check.str_param(base_dir, "base_dir")\n mkdir_p(base_dir)\n conn_string = create_db_conn_string(base_dir, "schedules")\n engine = create_engine(conn_string, poolclass=NullPool)\n alembic_config = get_alembic_config(__file__)\n\n should_migrate_data = False\n with engine.connect() as connection:\n db_revision, head_revision = check_alembic_revision(alembic_config, connection)\n if not (db_revision and head_revision):\n ScheduleStorageSqlMetadata.create_all(engine)\n engine.execute("PRAGMA journal_mode=WAL;")\n stamp_alembic_rev(alembic_config, connection)\n should_migrate_data = True\n\n schedule_storage = cls(conn_string, inst_data)\n if should_migrate_data:\n schedule_storage.migrate()\n schedule_storage.optimize()\n\n return schedule_storage\n\n @contextmanager\n def connect(self):\n engine = create_engine(self._conn_string, poolclass=NullPool)\n conn = engine.connect()\n try:\n yield conn\n finally:\n conn.close()\n\n @property\n def supports_batch_queries(self):\n if not super().supports_batch_queries:\n return False\n\n return super().supports_batch_queries and parse(get_sqlite_version()) >= parse(\n MINIMUM_SQLITE_BATCH_VERSION\n )\n\n def upgrade(self):\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n def alembic_version(self):\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
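# Small usage sketch (the path is illustrative): from_local creates the SQLite database
# under the given directory on first use, stamps the alembic revision, and runs the
# required and optional data migrations before returning the storage.
from dagster.core.storage.schedules.sqlite.sqlite_schedule_storage import SqliteScheduleStorage

storage = SqliteScheduleStorage.from_local("/tmp/dagster_home/schedules")

# Reported as False unless SQLite >= 3.25.0 and the base-class checks also pass.
print(storage.supports_batch_queries)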
", "current_page_name": "_modules/dagster/core/storage/schedules/sqlite/sqlite_schedule_storage", "customsidebar": null, "parents": [{"link": "../../../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.storage.schedules.sqlite.sqlite_schedule_storage"}}}}, "types": {"config_schema": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.types.config_schema

\nimport hashlib\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING, AbstractSet, Callable, Iterator, Optional, Union\n\nimport dagster._check as check\nfrom dagster.config.config_type import ConfigType\nfrom dagster.core.decorator_utils import get_function_params, validate_expected_params\nfrom dagster.core.definitions.events import AssetMaterialization, Materialization\nfrom dagster.core.errors import DagsterInvalidDefinitionError\nfrom dagster.utils import ensure_gen\nfrom dagster.utils.backcompat import experimental_arg_warning\n\nif TYPE_CHECKING:\n    from dagster.core.execution.context.system import StepExecutionContext\n\n\n
[docs]class DagsterTypeLoader(ABC):\n """\n Dagster type loaders are used to load unconnected inputs of the dagster type they are attached\n to.\n\n The recommended way to define a type loader is with the\n :py:func:`@dagster_type_loader <dagster_type_loader>` decorator.\n """\n\n @property\n @abstractmethod\n def schema_type(self) -> ConfigType:\n pass\n\n @property\n def loader_version(self) -> Optional[str]:\n return None\n\n def compute_loaded_input_version(self, _config_value: object) -> Optional[str]:\n return None\n\n def construct_from_config_value(\n self, _context: "StepExecutionContext", config_value: object\n ) -> object:\n """\n How to create a runtime value from config data.\n """\n return config_value\n\n def required_resource_keys(self) -> AbstractSet[str]:\n return frozenset()
\n\n\n
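# Illustrative sketch of subclassing DagsterTypeLoader directly (the
# @dagster_type_loader decorator defined below is the recommended path). This loader
# maps a list-of-strings config schema to a Python set at runtime; the class name is
# hypothetical.
from dagster.config.config_type import ConfigType
from dagster.config.field import resolve_to_config_type
from dagster.core.types.config_schema import DagsterTypeLoader


class StringSetLoader(DagsterTypeLoader):
    @property
    def schema_type(self) -> ConfigType:
        # A bare [str] resolves to Array[str] under the config schema rules.
        return resolve_to_config_type([str])

    def construct_from_config_value(self, _context, config_value):
        return set(config_value)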
[docs]class DagsterTypeMaterializer(ABC):\n    """\n    Dagster type materializers are used to materialize outputs of the dagster type they are attached\n    to.\n\n    The recommended way to define a type materializer is with the\n    :py:func:`@dagster_type_materializer <dagster_type_materializer>` decorator.\n    """\n\n    @property\n    @abstractmethod\n    def schema_type(self) -> ConfigType:\n        pass\n\n    @abstractmethod\n    def materialize_runtime_values(\n        self, _context: "StepExecutionContext", _config_value: object, _runtime_value: object\n    ) -> Iterator[Union[AssetMaterialization, Materialization]]:\n        """\n        How to materialize a runtime value given configuration.\n        """\n\n    def required_resource_keys(self) -> AbstractSet[str]:\n        return frozenset()
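# Illustrative counterpart sketch for DagsterTypeMaterializer (the
# @dagster_type_materializer decorator defined below is the recommended path): the
# config value is a file path, and the runtime value is written to it. Names are
# hypothetical.
from dagster import AssetMaterialization
from dagster.config.config_type import ConfigType
from dagster.config.field import resolve_to_config_type
from dagster.core.types.config_schema import DagsterTypeMaterializer


class WriteToPathMaterializer(DagsterTypeMaterializer):
    @property
    def schema_type(self) -> ConfigType:
        return resolve_to_config_type(str)

    def materialize_runtime_values(self, _context, config_value, runtime_value):
        with open(config_value, "w") as fd:
            fd.write(str(runtime_value))
        yield AssetMaterialization.file(config_value)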
\n\n\nclass DagsterTypeLoaderFromDecorator(DagsterTypeLoader):\n def __init__(\n self,\n config_type,\n func,\n required_resource_keys,\n loader_version=None,\n external_version_fn=None,\n ):\n self._config_type = check.inst_param(config_type, "config_type", ConfigType)\n self._func = check.callable_param(func, "func")\n self._required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys", of_type=str\n )\n self._loader_version = check.opt_str_param(loader_version, "loader_version")\n if self._loader_version:\n experimental_arg_warning("loader_version", "DagsterTypeLoaderFromDecorator.__init__")\n self._external_version_fn = check.opt_callable_param(\n external_version_fn, "external_version_fn"\n )\n if self._external_version_fn:\n experimental_arg_warning(\n "external_version_fn", "DagsterTypeLoaderFromDecorator.__init__"\n )\n\n @property\n def schema_type(self) -> ConfigType:\n return self._config_type\n\n @property\n def loader_version(self) -> Optional[str]:\n return self._loader_version\n\n def compute_loaded_input_version(self, config_value: object) -> Optional[str]:\n """Compute the type-loaded input from a given config_value.\n\n Args:\n config_value (object): Config value to be ingested by the external version\n loading function.\n Returns:\n Optional[str]: Hash of concatenated loader version and external input version if both\n are provided, else None.\n """\n version = ""\n if self.loader_version:\n version += str(self.loader_version)\n if self._external_version_fn:\n ext_version = self._external_version_fn(config_value)\n version += str(ext_version)\n\n if version == "":\n return None # Sentinel value for no version provided.\n else:\n return hashlib.sha1(version.encode("utf-8")).hexdigest()\n\n def construct_from_config_value(self, context: "StepExecutionContext", config_value: object):\n return self._func(context, config_value)\n\n def required_resource_keys(self):\n return frozenset(self._required_resource_keys)\n\n\ndef _create_type_loader_for_decorator(\n config_type: ConfigType,\n func,\n required_resource_keys: AbstractSet[str],\n loader_version=None,\n external_version_fn=None,\n):\n return DagsterTypeLoaderFromDecorator(\n config_type, func, required_resource_keys, loader_version, external_version_fn\n )\n\n\n
[docs]def dagster_type_loader(\n config_schema: object,\n required_resource_keys=None,\n loader_version=None,\n external_version_fn=None,\n):\n """Create an dagster type loader that maps config data to a runtime value.\n\n The decorated function should take the execution context and parsed config value and return the\n appropriate runtime value.\n\n Args:\n config_schema (ConfigSchema): The schema for the config that's passed to the decorated\n function.\n loader_version (str): (Experimental) The version of the decorated compute function. Two\n loading functions should have the same version if and only if they deterministically\n produce the same outputs when provided the same inputs.\n external_version_fn (Callable): (Experimental) A function that takes in the same parameters as the loader\n function (config_value) and returns a representation of the version of the external\n asset (str). Two external assets with identical versions are treated as identical to one\n another.\n\n Examples:\n\n .. code-block:: python\n\n @dagster_type_loader(Permissive())\n def load_dict(_context, value):\n return value\n """\n from dagster.config.field import resolve_to_config_type\n\n config_type = resolve_to_config_type(config_schema)\n EXPECTED_POSITIONALS = ["context", "*"]\n\n def wrapper(func):\n params = get_function_params(func)\n missing_positional = validate_expected_params(params, EXPECTED_POSITIONALS)\n if missing_positional:\n raise DagsterInvalidDefinitionError(\n "@dagster_type_loader '{solid_name}' decorated function does not have required positional "\n "parameter '{missing_param}'. @dagster_type_loader decorated functions should only have keyword arguments "\n "that match input names and a first positional parameter named 'context'.".format(\n solid_name=func.__name__, missing_param=missing_positional\n )\n )\n\n return _create_type_loader_for_decorator(\n config_type, func, required_resource_keys, loader_version, external_version_fn\n )\n\n return wrapper
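# Usage sketch (hypothetical names): a decorated loader attached to a DagsterType so
# that an unconnected op input of this type can be supplied from run config.
from dagster import DagsterType, dagster_type_loader


@dagster_type_loader(str)
def csv_path_loader(_context, config_value):
    # For illustration, read the file named by the config string into a list of lines.
    with open(config_value, "r") as fd:
        return fd.read().splitlines()


CsvRows = DagsterType(
    name="CsvRows",
    type_check_fn=lambda _context, value: isinstance(value, list),
    loader=csv_path_loader,
)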
\n\n\nclass DagsterTypeMaterializerForDecorator(DagsterTypeMaterializer):\n def __init__(self, config_type, func, required_resource_keys):\n self._config_type = check.inst_param(config_type, "config_type", ConfigType)\n self._func = check.callable_param(func, "func")\n self._required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys", of_type=str\n )\n\n @property\n def schema_type(self) -> ConfigType:\n return self._config_type\n\n def materialize_runtime_values(\n self, context: "StepExecutionContext", config_value: object, runtime_value: object\n ) -> Iterator[Union[Materialization, AssetMaterialization]]:\n return ensure_gen(self._func(context, config_value, runtime_value))\n\n def required_resource_keys(self) -> AbstractSet[str]:\n return frozenset(self._required_resource_keys)\n\n\ndef _create_output_materializer_for_decorator(\n config_type: ConfigType,\n func: Callable[["StepExecutionContext", object, object], AssetMaterialization],\n required_resource_keys: Optional[AbstractSet[str]],\n) -> DagsterTypeMaterializerForDecorator:\n return DagsterTypeMaterializerForDecorator(config_type, func, required_resource_keys)\n\n\n
[docs]def dagster_type_materializer(\n config_schema: object, required_resource_keys: Optional[AbstractSet[str]] = None\n) -> Callable[\n [Callable[["StepExecutionContext", object, object], AssetMaterialization]],\n DagsterTypeMaterializerForDecorator,\n]:\n """Create an output materialization hydration config that configurably materializes a runtime\n value.\n\n The decorated function should take the execution context, the parsed config value, and the\n runtime value. It should materialize the runtime value, and should\n return an appropriate :py:class:`AssetMaterialization`.\n\n Args:\n config_schema (object): The type of the config data expected by the decorated function.\n\n Examples:\n\n .. code-block:: python\n\n # Takes a list of dicts such as might be read in using csv.DictReader, as well as a config\n value, and writes\n @dagster_type_materializer(str)\n def materialize_df(_context, path, value):\n with open(path, 'w') as fd:\n writer = csv.DictWriter(fd, fieldnames=value[0].keys())\n writer.writeheader()\n writer.writerows(rowdicts=value)\n\n return AssetMaterialization.file(path)\n\n """\n from dagster.config.field import resolve_to_config_type\n\n config_type = resolve_to_config_type(config_schema)\n return lambda func: _create_output_materializer_for_decorator(\n config_type, func, required_resource_keys # type: ignore\n )
\n
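# Usage sketch (hypothetical names): the decorated materializer is passed to a
# DagsterType so that outputs of that type can be written out via output config.
from dagster import AssetMaterialization, DagsterType, dagster_type_materializer


@dagster_type_materializer(str)
def materialize_text(_context, path, value):
    with open(path, "w") as fd:
        fd.write(str(value))
    return AssetMaterialization.file(path)


TextLines = DagsterType(
    name="TextLines",
    type_check_fn=lambda _context, value: isinstance(value, str),
    materializer=materialize_text,
)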
", "current_page_name": "_modules/dagster/core/types/config_schema", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.types.config_schema"}, "dagster_type": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.types.dagster_type

\nimport typing as t\nfrom abc import abstractmethod\nfrom enum import Enum as PythonEnum\nfrom functools import partial\nfrom typing import cast\n\nimport dagster._check as check\nfrom dagster.builtins import BuiltinEnum\nfrom dagster.config.config_type import Array, ConfigType\nfrom dagster.config.config_type import Noneable as ConfigNoneable\nfrom dagster.core.definitions.events import DynamicOutput, Output, TypeCheck\nfrom dagster.core.definitions.metadata import MetadataEntry, RawMetadataValue, normalize_metadata\nfrom dagster.core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\nfrom dagster.serdes import whitelist_for_serdes\n\nfrom .builtin_config_schemas import BuiltinSchemas\nfrom .config_schema import DagsterTypeLoader, DagsterTypeMaterializer\n\nif t.TYPE_CHECKING:\n    from dagster.core.execution.context.system import (  # pylint: disable=unused-import\n        StepExecutionContext,\n        TypeCheckContext,\n    )\n\nTypeCheckFn = t.Callable[["TypeCheckContext", object], t.Union[TypeCheck, bool]]\n\n\n@whitelist_for_serdes\nclass DagsterTypeKind(PythonEnum):\n    ANY = "ANY"\n    SCALAR = "SCALAR"\n    LIST = "LIST"\n    NOTHING = "NOTHING"\n    NULLABLE = "NULLABLE"\n    REGULAR = "REGULAR"\n\n\n
[docs]class DagsterType:\n """Define a type in dagster. These can be used in the inputs and outputs of ops.\n\n Args:\n type_check_fn (Callable[[TypeCheckContext, Any], [Union[bool, TypeCheck]]]):\n The function that defines the type check. It takes the value flowing\n through the input or output of the op. If it passes, return either\n ``True`` or a :py:class:`~dagster.TypeCheck` with ``success`` set to ``True``. If it fails,\n return either ``False`` or a :py:class:`~dagster.TypeCheck` with ``success`` set to ``False``.\n The first argument must be named ``context`` (or, if unused, ``_``, ``_context``, or ``context_``).\n Use ``required_resource_keys`` for access to resources.\n key (Optional[str]): The unique key to identify types programmatically.\n The key property always has a value. If you omit key to the argument\n to the init function, it instead receives the value of ``name``. If\n neither ``key`` nor ``name`` is provided, a ``CheckError`` is thrown.\n\n In the case of a generic type such as ``List`` or ``Optional``, this is\n generated programmatically based on the type parameters.\n\n For most use cases, name should be set and the key argument should\n not be specified.\n name (Optional[str]): A unique name given by a user. If ``key`` is ``None``, ``key``\n becomes this value. Name is not given in a case where the user does\n not specify a unique name for this type, such as a generic class.\n description (Optional[str]): A markdown-formatted string, displayed in tooling.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`~dagster.DagsterTypeLoader` and can map config data to a value of\n this type. Specify this argument if you will need to shim values of this type using the\n config machinery. As a rule, you should use the\n :py:func:`@dagster_type_loader <dagster.dagster_type_loader>` decorator to construct\n these arguments.\n materializer (Optional[DagsterTypeMaterializer]): An instance of a class\n that inherits from :py:class:`~dagster.DagsterTypeMaterializer` and can persist values of\n this type. As a rule, you should use the\n :py:func:`@dagster_type_materializer <dagster.dagster_type_materializer>`\n decorator to construct these arguments.\n required_resource_keys (Optional[Set[str]]): Resource keys required by the ``type_check_fn``.\n is_builtin (bool): Defaults to False. This is used by tools to display or\n filter built-in types (such as :py:class:`~dagster.String`, :py:class:`~dagster.Int`) to visually distinguish\n them from user-defined types. Meant for internal use.\n kind (DagsterTypeKind): Defaults to None. This is used to determine the kind of runtime type\n for InputDefinition and OutputDefinition type checking.\n typing_type: Defaults to None. A valid python typing type (e.g. Optional[List[int]]) for the\n value contained within the DagsterType. 
Meant for internal use.\n """\n\n def __init__(\n self,\n type_check_fn: TypeCheckFn,\n key: t.Optional[str] = None,\n name: t.Optional[str] = None,\n is_builtin: bool = False,\n description: t.Optional[str] = None,\n loader: t.Optional[DagsterTypeLoader] = None,\n materializer: t.Optional[DagsterTypeMaterializer] = None,\n required_resource_keys: t.Optional[t.Set[str]] = None,\n kind: DagsterTypeKind = DagsterTypeKind.REGULAR,\n typing_type: t.Any = None,\n metadata_entries: t.Optional[t.List[MetadataEntry]] = None,\n metadata: t.Optional[t.Dict[str, RawMetadataValue]] = None,\n ):\n check.opt_str_param(key, "key")\n check.opt_str_param(name, "name")\n\n check.invariant(not (name is None and key is None), "Must set key or name")\n if name is None:\n key = check.not_none(\n key,\n "If name is not provided, must provide key.",\n )\n self.key, self._name = key, None\n elif key is None:\n name = check.not_none(\n name,\n "If key is not provided, must provide name.",\n )\n self.key, self._name = name, name\n else:\n check.invariant(key and name)\n self.key, self._name = key, name\n\n self.description = check.opt_str_param(description, "description")\n self.loader = check.opt_inst_param(loader, "loader", DagsterTypeLoader)\n self.materializer = check.opt_inst_param(\n materializer, "materializer", DagsterTypeMaterializer\n )\n\n self.required_resource_keys = check.opt_set_param(\n required_resource_keys,\n "required_resource_keys",\n )\n\n self._type_check_fn = check.callable_param(type_check_fn, "type_check_fn")\n _validate_type_check_fn(self._type_check_fn, self._name)\n\n self.is_builtin = check.bool_param(is_builtin, "is_builtin")\n check.invariant(\n self.display_name is not None,\n "All types must have a valid display name, got None for key {}".format(key),\n )\n\n self.kind = check.inst_param(kind, "kind", DagsterTypeKind)\n\n self.typing_type = typing_type\n\n metadata_entries = check.opt_list_param(\n metadata_entries, "metadata_entries", of_type=MetadataEntry\n )\n metadata = check.opt_dict_param(metadata, "metadata", key_type=str)\n self._metadata_entries = normalize_metadata(metadata, metadata_entries)\n\n def type_check(self, context: "TypeCheckContext", value: object) -> TypeCheck:\n retval = self._type_check_fn(context, value)\n\n if not isinstance(retval, (bool, TypeCheck)):\n raise DagsterInvariantViolationError(\n (\n "You have returned {retval} of type {retval_type} from the type "\n 'check function of type "{type_key}". Return value must be instance '\n "of TypeCheck or a bool."\n ).format(retval=repr(retval), retval_type=type(retval), type_key=self.key)\n )\n\n return TypeCheck(success=retval) if isinstance(retval, bool) else retval\n\n def __eq__(self, other):\n return isinstance(other, DagsterType) and self.key == other.key\n\n def __ne__(self, other):\n return not self.__eq__(other)\n\n @staticmethod\n def from_builtin_enum(builtin_enum) -> "DagsterType":\n check.invariant(BuiltinEnum.contains(builtin_enum), "must be member of BuiltinEnum")\n return _RUNTIME_MAP[builtin_enum]\n\n @property\n def metadata_entries(self) -> t.List[MetadataEntry]:\n return self._metadata_entries # type: ignore\n\n @property\n def display_name(self) -> str:\n """Either the name or key (if name is `None`) of the type, overridden in many subclasses"""\n return cast(str, self._name or self.key)\n\n @property\n def unique_name(self) -> t.Optional[str]:\n """The unique name of this type. 
Can be None if the type is not unique, such as container types"""\n # TODO: docstring and body inconsistent-- can this be None or not?\n check.invariant(\n self._name is not None,\n "unique_name requested but is None for type {}".format(self.display_name),\n )\n return self._name\n\n @property\n def has_unique_name(self) -> bool:\n return self._name is not None\n\n @property\n def inner_types(self) -> t.List["DagsterType"]:\n return []\n\n @property\n def loader_schema_key(self) -> t.Optional[str]:\n return self.loader.schema_type.key if self.loader else None\n\n @property\n def materializer_schema_key(self) -> t.Optional[str]:\n return self.materializer.schema_type.key if self.materializer else None\n\n @property\n def type_param_keys(self) -> t.List[str]:\n return []\n\n @property\n def is_nothing(self) -> bool:\n return self.kind == DagsterTypeKind.NOTHING\n\n @property\n def supports_fan_in(self) -> bool:\n return False\n\n def get_inner_type_for_fan_in(self) -> "DagsterType":\n check.failed(\n "DagsterType {name} does not support fan-in, should have checked supports_fan_in before calling getter.".format(\n name=self.display_name\n )\n )
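# Usage sketch (hypothetical names): a DagsterType whose check function returns a
# TypeCheck, so failures carry a human-readable description in tooling.
from dagster import DagsterType, TypeCheck


def _check_non_empty_dict(_context, value):
    if not isinstance(value, dict) or not value:
        return TypeCheck(success=False, description="Expected a non-empty dict")
    return TypeCheck(success=True, description="dict with {n} keys".format(n=len(value)))


NonEmptyDict = DagsterType(
    name="NonEmptyDict",
    type_check_fn=_check_non_empty_dict,
    description="A dict with at least one entry.",
)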
\n\n\ndef _validate_type_check_fn(fn: t.Callable, name: t.Optional[str]) -> bool:\n from dagster.seven import get_arg_names\n\n args = get_arg_names(fn)\n\n # py2 doesn't filter out self\n if len(args) >= 1 and args[0] == "self":\n args = args[1:]\n\n if len(args) == 2:\n possible_names = {\n "_",\n "context",\n "_context",\n "context_",\n }\n if args[0] not in possible_names:\n DagsterInvalidDefinitionError(\n 'type_check function on type "{name}" must have first '\n 'argument named "context" (or _, _context, context_).'.format(\n name=name,\n )\n )\n return True\n\n raise DagsterInvalidDefinitionError(\n 'type_check_fn argument on type "{name}" must take 2 arguments, '\n "received {count}.".format(name=name, count=len(args))\n )\n\n\nclass BuiltinScalarDagsterType(DagsterType):\n def __init__(self, name: str, type_check_fn: TypeCheckFn, typing_type: t.Type, **kwargs):\n super(BuiltinScalarDagsterType, self).__init__(\n key=name,\n name=name,\n kind=DagsterTypeKind.SCALAR,\n type_check_fn=type_check_fn,\n is_builtin=True,\n typing_type=typing_type,\n **kwargs,\n )\n\n # This is passed to the constructor of subclasses as the argument `type_check_fn`-- that's why\n # it exists together with the `type_check_fn` arg.\n def type_check_fn(self, _context, value) -> TypeCheck:\n return self.type_check_scalar_value(value)\n\n @abstractmethod\n def type_check_scalar_value(self, _value) -> TypeCheck:\n raise NotImplementedError()\n\n\ndef _typemismatch_error_str(value: object, expected_type_desc: str) -> str:\n return 'Value "{value}" of python type "{python_type}" must be a {type_desc}.'.format(\n value=value, python_type=type(value).__name__, type_desc=expected_type_desc\n )\n\n\ndef _fail_if_not_of_type(\n value: object, value_type: t.Type[t.Any], value_type_desc: str\n) -> TypeCheck:\n\n if not isinstance(value, value_type):\n return TypeCheck(success=False, description=_typemismatch_error_str(value, value_type_desc))\n\n return TypeCheck(success=True)\n\n\nclass _Int(BuiltinScalarDagsterType):\n def __init__(self):\n super(_Int, self).__init__(\n name="Int",\n loader=BuiltinSchemas.INT_INPUT,\n materializer=BuiltinSchemas.INT_OUTPUT,\n type_check_fn=self.type_check_fn,\n typing_type=int,\n )\n\n def type_check_scalar_value(self, value) -> TypeCheck:\n return _fail_if_not_of_type(value, int, "int")\n\n\nclass _String(BuiltinScalarDagsterType):\n def __init__(self):\n super(_String, self).__init__(\n name="String",\n loader=BuiltinSchemas.STRING_INPUT,\n materializer=BuiltinSchemas.STRING_OUTPUT,\n type_check_fn=self.type_check_fn,\n typing_type=str,\n )\n\n def type_check_scalar_value(self, value: object) -> TypeCheck:\n return _fail_if_not_of_type(value, str, "string")\n\n\nclass _Float(BuiltinScalarDagsterType):\n def __init__(self):\n super(_Float, self).__init__(\n name="Float",\n loader=BuiltinSchemas.FLOAT_INPUT,\n materializer=BuiltinSchemas.FLOAT_OUTPUT,\n type_check_fn=self.type_check_fn,\n typing_type=float,\n )\n\n def type_check_scalar_value(self, value: object) -> TypeCheck:\n return _fail_if_not_of_type(value, float, "float")\n\n\nclass _Bool(BuiltinScalarDagsterType):\n def __init__(self):\n super(_Bool, self).__init__(\n name="Bool",\n loader=BuiltinSchemas.BOOL_INPUT,\n materializer=BuiltinSchemas.BOOL_OUTPUT,\n type_check_fn=self.type_check_fn,\n typing_type=bool,\n )\n\n def type_check_scalar_value(self, value: object) -> TypeCheck:\n return _fail_if_not_of_type(value, bool, "bool")\n\n\nclass Anyish(DagsterType):\n def __init__(\n self,\n key: t.Optional[str],\n name: 
t.Optional[str],\n loader: t.Optional[DagsterTypeLoader] = None,\n materializer: t.Optional[DagsterTypeMaterializer] = None,\n is_builtin: bool = False,\n description: t.Optional[str] = None,\n ):\n super(Anyish, self).__init__(\n key=key,\n name=name,\n kind=DagsterTypeKind.ANY,\n loader=loader,\n materializer=materializer,\n is_builtin=is_builtin,\n type_check_fn=self.type_check_method,\n description=description,\n typing_type=t.Any,\n )\n\n def type_check_method(self, _context: "TypeCheckContext", _value: object) -> TypeCheck:\n return TypeCheck(success=True)\n\n @property\n def supports_fan_in(self) -> bool:\n return True\n\n def get_inner_type_for_fan_in(self) -> DagsterType:\n # Anyish all the way down\n return self\n\n\nclass _Any(Anyish):\n def __init__(self):\n super(_Any, self).__init__(\n key="Any",\n name="Any",\n loader=BuiltinSchemas.ANY_INPUT,\n materializer=BuiltinSchemas.ANY_OUTPUT,\n is_builtin=True,\n )\n\n\ndef create_any_type(\n name: str,\n loader: t.Optional[DagsterTypeLoader] = None,\n materializer: t.Optional[DagsterTypeMaterializer] = None,\n description: t.Optional[str] = None,\n) -> Anyish:\n return Anyish(\n key=name,\n name=name,\n description=description,\n loader=loader,\n materializer=materializer,\n )\n\n\nclass _Nothing(DagsterType):\n def __init__(self):\n super(_Nothing, self).__init__(\n key="Nothing",\n name="Nothing",\n kind=DagsterTypeKind.NOTHING,\n loader=None,\n materializer=None,\n type_check_fn=self.type_check_method,\n is_builtin=True,\n )\n\n def type_check_method(self, _context: "TypeCheckContext", value: object) -> TypeCheck:\n if value is not None:\n return TypeCheck(\n success=False,\n description="Value must be None, got a {value_type}".format(value_type=type(value)),\n )\n\n return TypeCheck(success=True)\n\n @property\n def supports_fan_in(self) -> bool:\n return True\n\n def get_inner_type_for_fan_in(self) -> DagsterType:\n return self\n\n\ndef isinstance_type_check_fn(\n expected_python_type: t.Union[t.Type, t.Tuple[t.Type, ...]],\n dagster_type_name: str,\n expected_python_type_str: str,\n) -> TypeCheckFn:\n def type_check(_context: "TypeCheckContext", value: object) -> TypeCheck:\n if not isinstance(value, expected_python_type):\n return TypeCheck(\n success=False,\n description=(\n f"Value of type {type(value)} failed type check for Dagster type {dagster_type_name}, "\n f"expected value to be of Python type {expected_python_type_str}."\n ),\n )\n\n return TypeCheck(success=True)\n\n return type_check\n\n\n
[docs]class PythonObjectDagsterType(DagsterType):\n """Define a type in dagster whose typecheck is an isinstance check.\n\n Specifically, the type can either be a single python type (e.g. int),\n or a tuple of types (e.g. (int, float)) which is treated as a union.\n\n Examples:\n .. code-block:: python\n\n ntype = PythonObjectDagsterType(python_type=int)\n assert ntype.name == 'int'\n assert_success(ntype, 1)\n assert_failure(ntype, 'a')\n\n .. code-block:: python\n\n ntype = PythonObjectDagsterType(python_type=(int, float))\n assert ntype.name == 'Union[int, float]'\n assert_success(ntype, 1)\n assert_success(ntype, 1.5)\n assert_failure(ntype, 'a')\n\n\n Args:\n python_type (Union[Type, Tuple[Type, ...]): The dagster typecheck function calls instanceof on\n this type.\n name (Optional[str]): Name the type. Defaults to the name of ``python_type``.\n key (Optional[str]): Key of the type. Defaults to name.\n description (Optional[str]): A markdown-formatted string, displayed in tooling.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`~dagster.DagsterTypeLoader` and can map config data to a value of\n this type. Specify this argument if you will need to shim values of this type using the\n config machinery. As a rule, you should use the\n :py:func:`@dagster_type_loader <dagster.dagster_type_loader>` decorator to construct\n these arguments.\n materializer (Optional[DagsterTypeMaterializer]): An instance of a class\n that inherits from :py:class:`~dagster.DagsterTypeMaterializer` and can persist values of\n this type. As a rule, you should use the\n :py:func:`@dagster_type_mate <dagster.dagster_type_mate>`\n decorator to construct these arguments.\n """\n\n def __init__(\n self,\n python_type: t.Union[t.Type, t.Tuple[t.Type, ...]],\n key: t.Optional[str] = None,\n name: t.Optional[str] = None,\n **kwargs,\n ):\n if isinstance(python_type, tuple):\n self.python_type = check.tuple_param(\n python_type, "python_type", of_shape=tuple(type for item in python_type)\n )\n self.type_str = "Union[{}]".format(\n ", ".join(python_type.__name__ for python_type in python_type)\n )\n typing_type = t.Union[python_type] # type: ignore\n\n else:\n self.python_type = check.class_param(python_type, "python_type") # type: ignore\n self.type_str = cast(str, python_type.__name__)\n typing_type = self.python_type # type: ignore\n name = check.opt_str_param(name, "name", self.type_str)\n key = check.opt_str_param(key, "key", name)\n super(PythonObjectDagsterType, self).__init__(\n key=key,\n name=name,\n type_check_fn=isinstance_type_check_fn(python_type, name, self.type_str),\n typing_type=typing_type,\n **kwargs,\n )
\n\n\nclass NoneableInputSchema(DagsterTypeLoader):\n def __init__(self, inner_dagster_type: DagsterType):\n self._inner_dagster_type = check.inst_param(\n inner_dagster_type, "inner_dagster_type", DagsterType\n )\n self._inner_loader = check.not_none_param(inner_dagster_type.loader, "inner_dagster_type")\n self._schema_type = ConfigNoneable(self._inner_loader.schema_type)\n\n @property\n def schema_type(self) -> ConfigType:\n return self._schema_type\n\n def construct_from_config_value(\n self, context: "StepExecutionContext", config_value: object\n ) -> object:\n if config_value is None:\n return None\n return self._inner_loader.construct_from_config_value(context, config_value)\n\n\ndef _create_nullable_input_schema(inner_type: DagsterType) -> t.Optional[DagsterTypeLoader]:\n if not inner_type.loader:\n return None\n\n return NoneableInputSchema(inner_type)\n\n\nclass OptionalType(DagsterType):\n def __init__(self, inner_type: DagsterType):\n inner_type = resolve_dagster_type(inner_type)\n\n if inner_type is Nothing:\n raise DagsterInvalidDefinitionError(\n "Type Nothing can not be wrapped in List or Optional"\n )\n\n key = "Optional." + cast(str, inner_type.key)\n self.inner_type = inner_type\n super(OptionalType, self).__init__(\n key=key,\n name=None,\n kind=DagsterTypeKind.NULLABLE,\n type_check_fn=self.type_check_method,\n loader=_create_nullable_input_schema(inner_type),\n # This throws a type error with Py\n typing_type=t.Optional[inner_type.typing_type], # type: ignore\n )\n\n @property\n def display_name(self) -> str:\n return self.inner_type.display_name + "?"\n\n def type_check_method(self, context, value):\n return (\n TypeCheck(success=True) if value is None else self.inner_type.type_check(context, value)\n )\n\n @property\n def inner_types(self):\n return [self.inner_type] + self.inner_type.inner_types\n\n @property\n def type_param_keys(self):\n return [self.inner_type.key]\n\n @property\n def supports_fan_in(self):\n return self.inner_type.supports_fan_in\n\n def get_inner_type_for_fan_in(self):\n return self.inner_type.get_inner_type_for_fan_in()\n\n\nclass ListInputSchema(DagsterTypeLoader):\n def __init__(self, inner_dagster_type):\n self._inner_dagster_type = check.inst_param(\n inner_dagster_type, "inner_dagster_type", DagsterType\n )\n check.param_invariant(inner_dagster_type.loader, "inner_dagster_type")\n self._schema_type = Array(inner_dagster_type.loader.schema_type)\n\n @property\n def schema_type(self):\n return self._schema_type\n\n def construct_from_config_value(self, context, config_value):\n convert_item = partial(self._inner_dagster_type.loader.construct_from_config_value, context)\n return list(map(convert_item, config_value))\n\n\ndef _create_list_input_schema(inner_type):\n if not inner_type.loader:\n return None\n\n return ListInputSchema(inner_type)\n\n\nclass ListType(DagsterType):\n def __init__(self, inner_type: DagsterType):\n key = "List." 
+ inner_type.key\n self.inner_type = inner_type\n super(ListType, self).__init__(\n key=key,\n name=None,\n kind=DagsterTypeKind.LIST,\n type_check_fn=self.type_check_method,\n loader=_create_list_input_schema(inner_type),\n typing_type=t.List[inner_type.typing_type], # type: ignore\n )\n\n @property\n def display_name(self):\n return "[" + self.inner_type.display_name + "]"\n\n def type_check_method(self, context, value):\n value_check = _fail_if_not_of_type(value, list, "list")\n if not value_check.success:\n return value_check\n\n for item in value:\n item_check = self.inner_type.type_check(context, item)\n if not item_check.success:\n return item_check\n\n return TypeCheck(success=True)\n\n @property\n def inner_types(self):\n return [self.inner_type] + self.inner_type.inner_types\n\n @property\n def type_param_keys(self):\n return [self.inner_type.key]\n\n @property\n def supports_fan_in(self):\n return True\n\n def get_inner_type_for_fan_in(self):\n return self.inner_type\n\n\nclass DagsterListApi:\n def __getitem__(self, inner_type):\n check.not_none_param(inner_type, "inner_type")\n return _List(resolve_dagster_type(inner_type))\n\n def __call__(self, inner_type):\n check.not_none_param(inner_type, "inner_type")\n return _List(inner_type)\n\n\nList = DagsterListApi()\n\n\ndef _List(inner_type):\n check.inst_param(inner_type, "inner_type", DagsterType)\n if inner_type is Nothing:\n raise DagsterInvalidDefinitionError("Type Nothing can not be wrapped in List or Optional")\n return ListType(inner_type)\n\n\nclass Stringish(DagsterType):\n def __init__(self, key: t.Optional[str] = None, name: t.Optional[str] = None, **kwargs):\n name = check.opt_str_param(name, "name", type(self).__name__)\n key = check.opt_str_param(key, "key", name)\n super(Stringish, self).__init__(\n key=key,\n name=name,\n kind=DagsterTypeKind.SCALAR,\n type_check_fn=self.type_check_method,\n loader=BuiltinSchemas.STRING_INPUT,\n materializer=BuiltinSchemas.STRING_OUTPUT,\n typing_type=str,\n **kwargs,\n )\n\n def type_check_method(self, _context: "TypeCheckContext", value: object) -> TypeCheck:\n return _fail_if_not_of_type(value, str, "string")\n\n\ndef create_string_type(name, description=None):\n return Stringish(name=name, key=name, description=description)\n\n\nAny = _Any()\nBool = _Bool()\nFloat = _Float()\nInt = _Int()\nString = _String()\nNothing = _Nothing()\n\n_RUNTIME_MAP = {\n BuiltinEnum.ANY: Any,\n BuiltinEnum.BOOL: Bool,\n BuiltinEnum.FLOAT: Float,\n BuiltinEnum.INT: Int,\n BuiltinEnum.STRING: String,\n BuiltinEnum.NOTHING: Nothing,\n}\n\n_PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY: t.Dict[type, DagsterType] = {}\n"""Python types corresponding to user-defined RunTime types created using @map_to_dagster_type or\nas_dagster_type are registered here so that we can remap the Python types to runtime types."""\n\n\n
[docs]def make_python_type_usable_as_dagster_type(python_type: t.Type, dagster_type: DagsterType) -> None:\n """\n Take any existing python type and map it to a dagster type (generally created with\n :py:class:`DagsterType <dagster.DagsterType>`) This can only be called once\n on a given python type.\n """\n check.inst_param(python_type, "python_type", type)\n check.inst_param(dagster_type, "dagster_type", DagsterType)\n registered_dagster_type = _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY.get(python_type)\n\n if registered_dagster_type is None:\n _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY[python_type] = dagster_type\n elif registered_dagster_type is not dagster_type:\n # This would be just a great place to insert a short URL pointing to the type system\n # documentation into the error message\n # https://github.com/dagster-io/dagster/issues/1831\n if isinstance(registered_dagster_type, TypeHintInferredDagsterType):\n raise DagsterInvalidDefinitionError(\n f"A Dagster type has already been registered for the Python type "\n f'{python_type}. The Dagster type was "auto-registered" - i.e. a solid definition '\n f"used the Python type as an annotation for one of its arguments or for its return "\n f"value before make_python_type_usable_as_dagster_type was called, and we "\n f"generated a Dagster type to correspond to it. To override the auto-generated "\n f"Dagster type, call make_python_type_usable_as_dagster_type before any solid "\n f"definitions refer to the Python type."\n )\n else:\n raise DagsterInvalidDefinitionError(\n f"A Dagster type has already been registered for the Python type "\n f"{python_type}. make_python_type_usable_as_dagster_type can only "\n f"be called once on a python type as it is registering a 1:1 mapping "\n f"between that python type and a dagster type."\n )
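# Usage sketch (hypothetical names): register an existing business-logic class so that
# plain annotations referring to it resolve to the matching DagsterType instead of an
# auto-generated one.
from dagster import DagsterType, make_python_type_usable_as_dagster_type


class Order:
    def __init__(self, order_id: str):
        self.order_id = order_id


OrderDagsterType = DagsterType(
    name="Order",
    type_check_fn=lambda _context, value: isinstance(value, Order),
)

# After this call, annotating an op input or output with Order uses OrderDagsterType.
make_python_type_usable_as_dagster_type(Order, OrderDagsterType)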
\n\n\nDAGSTER_INVALID_TYPE_ERROR_MESSAGE = (\n "Invalid type: dagster_type must be an instance of DagsterType or a Python type: "\n "got {dagster_type}{additional_msg}"\n)\n\n\nclass TypeHintInferredDagsterType(DagsterType):\n def __init__(self, python_type: t.Type):\n qualified_name = f"{python_type.__module__}.{python_type.__name__}"\n self.python_type = python_type\n super(TypeHintInferredDagsterType, self).__init__(\n key=f"_TypeHintInferred[{qualified_name}]",\n description=f"DagsterType created from a type hint for the Python type {qualified_name}",\n type_check_fn=isinstance_type_check_fn(\n python_type, python_type.__name__, qualified_name\n ),\n typing_type=python_type,\n )\n\n @property\n def display_name(self) -> str:\n return self.python_type.__name__\n\n\ndef resolve_dagster_type(dagster_type: object) -> DagsterType:\n # circular dep\n from dagster.primitive_mapping import (\n is_supported_runtime_python_builtin,\n remap_python_builtin_for_runtime,\n )\n from dagster.seven.typing import get_args\n from dagster.utils.typing_api import is_typing_type\n\n from .python_dict import Dict, PythonDict\n from .python_set import DagsterSetApi, PythonSet\n from .python_tuple import DagsterTupleApi, PythonTuple\n from .transform_typing import transform_typing_type\n\n check.invariant(\n not (isinstance(dagster_type, type) and issubclass(dagster_type, ConfigType)),\n "Cannot resolve a config type to a runtime type",\n )\n\n check.invariant(\n not (isinstance(dagster_type, type) and issubclass(dagster_type, DagsterType)),\n "Do not pass runtime type classes. Got {}".format(dagster_type),\n )\n\n # First, check to see if we're using Dagster's generic output type to do the type catching.\n if _is_generic_output_annotation(dagster_type):\n type_args = get_args(dagster_type)\n # If no inner type was provided, forward Any type.\n dagster_type = type_args[0] if len(type_args) == 1 else Any\n elif is_dynamic_output_annotation(dagster_type):\n dynamic_out_annotation = get_args(dagster_type)[0]\n type_args = get_args(dynamic_out_annotation)\n dagster_type = type_args[0] if len(type_args) == 1 else Any\n\n # Then, check to see if it is part of python's typing library\n if is_typing_type(dagster_type):\n dagster_type = transform_typing_type(dagster_type)\n if isinstance(dagster_type, DagsterType):\n return dagster_type\n\n # Test for unhashable objects -- this is if, for instance, someone has passed us an instance of\n # a dict where they meant to pass dict or Dict, etc.\n try:\n hash(dagster_type)\n except TypeError as e:\n raise DagsterInvalidDefinitionError(\n DAGSTER_INVALID_TYPE_ERROR_MESSAGE.format(\n additional_msg=(\n ", which isn't hashable. 
Did you pass an instance of a type instead of "\n "the type?"\n ),\n dagster_type=str(dagster_type),\n )\n ) from e\n\n if BuiltinEnum.contains(dagster_type):\n return DagsterType.from_builtin_enum(dagster_type)\n\n if is_supported_runtime_python_builtin(dagster_type):\n return remap_python_builtin_for_runtime(dagster_type)\n\n if dagster_type is None:\n return Any\n\n if dagster_type is Dict:\n return PythonDict\n if isinstance(dagster_type, DagsterTupleApi):\n return PythonTuple\n if isinstance(dagster_type, DagsterSetApi):\n return PythonSet\n if isinstance(dagster_type, DagsterListApi):\n return List(Any)\n\n if isinstance(dagster_type, type):\n return resolve_python_type_to_dagster_type(dagster_type)\n\n raise DagsterInvalidDefinitionError(\n DAGSTER_INVALID_TYPE_ERROR_MESSAGE.format(\n dagster_type=str(dagster_type), additional_msg="."\n )\n )\n\n\ndef is_dynamic_output_annotation(dagster_type: object) -> bool:\n from dagster.seven.typing import get_args, get_origin\n\n check.invariant(\n not (isinstance(dagster_type, type) and issubclass(dagster_type, ConfigType)),\n "Cannot resolve a config type to a runtime type",\n )\n\n check.invariant(\n not (isinstance(dagster_type, type) and issubclass(dagster_type, DagsterType)),\n "Do not pass runtime type classes. Got {}".format(dagster_type),\n )\n\n if dagster_type == DynamicOutput or get_origin(dagster_type) == DynamicOutput:\n raise DagsterInvariantViolationError(\n "Op annotated with return type DynamicOutput. DynamicOutputs can only be returned in the context of a List. If only one output is needed, use the Output API."\n )\n\n if get_origin(dagster_type) == list and len(get_args(dagster_type)) == 1:\n list_inner_type = get_args(dagster_type)[0]\n return list_inner_type == DynamicOutput or get_origin(list_inner_type) == DynamicOutput\n return False\n\n\ndef _is_generic_output_annotation(dagster_type: object) -> bool:\n from dagster.seven.typing import get_origin\n\n return dagster_type == Output or get_origin(dagster_type) == Output\n\n\ndef resolve_python_type_to_dagster_type(python_type: t.Type) -> DagsterType:\n """\n Resolves a Python type to a Dagster type.\n """\n check.inst_param(python_type, "python_type", type)\n\n if python_type in _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY:\n return _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY[python_type]\n else:\n dagster_type = TypeHintInferredDagsterType(python_type)\n _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY[python_type] = dagster_type\n return dagster_type\n\n\nALL_RUNTIME_BUILTINS = list(_RUNTIME_MAP.values())\n\n\ndef construct_dagster_type_dictionary(solid_defs):\n type_dict_by_name = {t.unique_name: t for t in ALL_RUNTIME_BUILTINS}\n type_dict_by_key = {t.key: t for t in ALL_RUNTIME_BUILTINS}\n for solid_def in solid_defs:\n for dagster_type in solid_def.all_dagster_types():\n # We don't do uniqueness check on key because with classes\n # like Array, Noneable, etc, those are ephemeral objects\n # and it is perfectly fine to have many of them.\n type_dict_by_key[dagster_type.key] = dagster_type\n\n if not dagster_type.has_unique_name:\n continue\n\n if dagster_type.unique_name not in type_dict_by_name:\n type_dict_by_name[dagster_type.unique_name] = dagster_type\n continue\n\n if type_dict_by_name[dagster_type.unique_name] is not dagster_type:\n raise DagsterInvalidDefinitionError(\n (\n 'You have created two dagster types with the same name "{type_name}". 
'\n "Dagster types have must have unique names."\n ).format(type_name=dagster_type.display_name)\n )\n\n return type_dict_by_key\n\n\nclass DagsterOptionalApi:\n def __getitem__(self, inner_type: t.Union[t.Type, DagsterType]) -> OptionalType:\n inner_type = resolve_dagster_type(check.not_none_param(inner_type, "inner_type"))\n return OptionalType(inner_type)\n\n\nOptional = DagsterOptionalApi()\n
", "current_page_name": "_modules/dagster/core/types/dagster_type", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.types.dagster_type"}, "decorator": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.core.types.decorator

\nimport dagster._check as check\n\nfrom .dagster_type import PythonObjectDagsterType, make_python_type_usable_as_dagster_type\n\n\n
[docs]def usable_as_dagster_type(\n name=None,\n description=None,\n loader=None,\n materializer=None,\n):\n """Decorate a Python class to make it usable as a Dagster Type.\n\n This is intended to make it straightforward to annotate existing business logic classes to\n make them dagster types whose typecheck is an isinstance check against that python class.\n\n Args:\n python_type (cls): The python type to make usable as python type.\n name (Optional[str]): Name of the new Dagster type. If ``None``, the name (``__name__``) of\n the ``python_type`` will be used.\n description (Optional[str]): A user-readable description of the type.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`DagsterTypeLoader` and can map config data to a value of\n this type. Specify this argument if you will need to shim values of this type using the\n config machinery. As a rule, you should use the\n :py:func:`@dagster_type_loader <dagster.dagster_type_loader>` decorator to construct\n these arguments.\n materializer (Optional[DagsterTypeMaterializer]): An instance of a class\n that inherits from :py:class:`DagsterTypeMaterializer` and can persist values of\n this type. As a rule, you should use the\n :py:func:`@dagster_type_materializer <dagster.dagster_type_materializer>`\n decorator to construct these arguments.\n\n Examples:\n\n .. code-block:: python\n\n # dagster_aws.s3.file_manager.S3FileHandle\n @usable_as_dagster_type\n class S3FileHandle(FileHandle):\n def __init__(self, s3_bucket, s3_key):\n self._s3_bucket = check.str_param(s3_bucket, 's3_bucket')\n self._s3_key = check.str_param(s3_key, 's3_key')\n\n @property\n def s3_bucket(self):\n return self._s3_bucket\n\n @property\n def s3_key(self):\n return self._s3_key\n\n @property\n def path_desc(self):\n return self.s3_path\n\n @property\n def s3_path(self):\n return 's3://{bucket}/{key}'.format(bucket=self.s3_bucket, key=self.s3_key)\n """\n\n def _with_args(bare_cls):\n check.class_param(bare_cls, "bare_cls")\n new_name = name if name else bare_cls.__name__\n\n make_python_type_usable_as_dagster_type(\n bare_cls,\n PythonObjectDagsterType(\n name=new_name,\n description=description,\n python_type=bare_cls,\n loader=loader,\n materializer=materializer,\n ),\n )\n return bare_cls\n\n # check for no args, no parens case\n if callable(name):\n bare_cls = name # with no parens, name is actually the decorated class\n make_python_type_usable_as_dagster_type(\n bare_cls,\n PythonObjectDagsterType(python_type=bare_cls, name=bare_cls.__name__, description=None),\n )\n return bare_cls\n\n return _with_args
\n
", "current_page_name": "_modules/dagster/core/types/decorator", "customsidebar": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.core.types.decorator"}}}, "serdes": {"config_class": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.serdes.config_class

\nimport importlib\nfrom abc import ABC, abstractmethod\nfrom typing import NamedTuple\n\nimport yaml\n\nimport dagster._check as check\n\nfrom .serdes import whitelist_for_serdes\n\n\n
[docs]@whitelist_for_serdes\nclass ConfigurableClassData(\n NamedTuple(\n "_ConfigurableClassData",\n [\n ("module_name", str),\n ("class_name", str),\n ("config_yaml", str),\n ],\n )\n):\n """Serializable tuple describing where to find a class and the config fragment that should\n be used to instantiate it.\n\n Users should not instantiate this class directly.\n\n Classes intended to be serialized in this way should implement the\n :py:class:`dagster.serdes.ConfigurableClass` mixin.\n """\n\n def __new__(cls, module_name, class_name, config_yaml):\n return super(ConfigurableClassData, cls).__new__(\n cls,\n check.str_param(module_name, "module_name"),\n check.str_param(class_name, "class_name"),\n check.str_param(config_yaml, "config_yaml"),\n )\n\n @property\n def config_dict(self):\n return yaml.safe_load(self.config_yaml)\n\n def info_dict(self):\n return {\n "module": self.module_name,\n "class": self.class_name,\n "config": self.config_dict,\n }\n\n def rehydrate(self):\n from dagster.config.field import resolve_to_config_type\n from dagster.config.validate import process_config\n from dagster.core.errors import DagsterInvalidConfigError\n\n try:\n module = importlib.import_module(self.module_name)\n except ModuleNotFoundError:\n check.failed(\n f"Couldn't import module {self.module_name} when attempting to load the "\n f"configurable class {self.module_name}.{self.class_name}"\n )\n try:\n klass = getattr(module, self.class_name)\n except AttributeError:\n check.failed(\n f"Couldn't find class {self.class_name} in module when attempting to load the "\n f"configurable class {self.module_name}.{self.class_name}"\n )\n\n if not issubclass(klass, ConfigurableClass):\n raise check.CheckError(\n klass,\n f"class {self.class_name} in module {self.module_name}",\n ConfigurableClass,\n )\n\n config_dict = self.config_dict\n result = process_config(resolve_to_config_type(klass.config_type()), config_dict)\n if not result.success:\n raise DagsterInvalidConfigError(\n f"Errors whilst loading configuration for {klass.config_type()}.",\n result.errors,\n config_dict,\n )\n return klass.from_config_value(self, result.value)
\n\n\n
[docs]class ConfigurableClass(ABC):\n """Abstract mixin for classes that can be loaded from config.\n\n This supports a powerful plugin pattern which avoids both a) a lengthy, hard-to-synchronize list\n of conditional imports / optional extras_requires in dagster core and b) a magic directory or\n file in which third parties can place plugin packages. Instead, the intention is to make, e.g.,\n run storage, pluggable with a config chunk like:\n\n .. code-block:: yaml\n\n run_storage:\n module: very_cool_package.run_storage\n class: SplendidRunStorage\n config:\n magic_word: "quux"\n\n This same pattern should eventually be viable for other system components, e.g. engines.\n\n The ``ConfigurableClass`` mixin provides the necessary hooks for classes to be instantiated from\n an instance of ``ConfigurableClassData``.\n\n Pieces of the Dagster system which we wish to make pluggable in this way should consume a config\n type such as:\n\n .. code-block:: python\n\n {'module': str, 'class': str, 'config': Field(Permissive())}\n\n """\n\n @property\n @abstractmethod\n def inst_data(self):\n """\n Subclass must be able to return the inst_data as a property if it has been constructed\n through the from_config_value code path.\n """\n\n
[docs] @classmethod\n @abstractmethod\n def config_type(cls):\n """dagster.ConfigType: The config type against which to validate a config yaml fragment\n serialized in an instance of ``ConfigurableClassData``.\n """\n raise NotImplementedError(f"{cls.__name__} must implement the config_type classmethod")
\n\n
[docs] @staticmethod\n @abstractmethod\n def from_config_value(inst_data, config_value):\n """New up an instance of the ConfigurableClass from a validated config value.\n\n Called by ConfigurableClassData.rehydrate.\n\n Args:\n config_value (dict): The validated config value to use. Typically this should be the\n ``value`` attribute of a\n :py:class:`~dagster.core.types.evaluator.evaluation.EvaluateValueResult`.\n\n\n A common pattern is for the implementation to align the config_value with the signature\n of the ConfigurableClass's constructor:\n\n .. code-block:: python\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return MyConfigurableClass(inst_data=inst_data, **config_value)\n\n """\n raise NotImplementedError(\n "ConfigurableClass subclasses must implement the from_config_value staticmethod"\n )
\n\n\ndef class_from_code_pointer(module_name, class_name):\n try:\n module = importlib.import_module(module_name)\n except ModuleNotFoundError:\n check.failed(\n "Couldn't import module {module_name} when attempting to load the "\n "class {klass}".format(\n module_name=module_name,\n klass=module_name + "." + class_name,\n )\n )\n try:\n return getattr(module, class_name)\n except AttributeError:\n check.failed(\n "Couldn't find class {class_name} in module when attempting to load the "\n "class {klass}".format(\n class_name=class_name,\n klass=module_name + "." + class_name,\n )\n )\n
", "current_page_name": "_modules/dagster/serdes/config_class", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.serdes.config_class"}}, "utils": {"alabaster_version": "0.7.12", "backcompat": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.utils.backcompat

\nimport inspect\nimport warnings\nfrom functools import wraps\nfrom typing import Callable, Optional, Type, TypeVar, cast\n\nimport dagster._check as check\n\nT = TypeVar("T")\n\nEXPERIMENTAL_WARNING_HELP = (\n    "To mute warnings for experimental functionality, invoke"\n    ' warnings.filterwarnings("ignore", category=dagster.ExperimentalWarning) or use'\n    " one of the other methods described at"\n    " https://docs.python.org/3/library/warnings.html#describing-warning-filters."\n)\n\n\ndef canonicalize_backcompat_args(\n    new_val: T, new_arg: str, old_val: T, old_arg: str, breaking_version: str, **kwargs: object\n) -> T:\n    """\n    Utility for managing backwards compatibility of two related arguments.\n\n    For example if you had an existing function\n\n    def is_new(old_flag):\n        return not new_flag\n\n    And you decided you wanted a new function to be:\n\n    def is_new(new_flag):\n        return new_flag\n\n    However you want an in between period where either flag is accepted. Use\n    canonicalize_backcompat_args to manage that:\n\n    def is_new(old_flag=None, new_flag=None):\n        return canonicalize_backcompat_args(\n            new_val=new_flag,\n            new_arg='new_flag',\n            old_val=old_flag,\n            old_arg='old_flag',\n            breaking_version='0.9.0',\n            coerce_old_to_new=lambda val: not val,\n        )\n\n\n    In this example, if the caller sets both new_flag and old_flag, it will fail by throwing\n    a CheckError. If the caller sets old_flag, it will run it through the coercion function\n    , warn, and then execute.\n\n    canonicalize_backcompat_args returns the value as if *only* new_val were specified\n    """\n    coerce_old_to_new = cast(Optional[Callable], kwargs.get("coerce_old_to_new"))\n    additional_warn_txt = kwargs.get("additional_warn_txt")\n    # stacklevel=3 punches up to the caller of canonicalize_backcompat_args\n    stacklevel = kwargs.get("stacklevel", 3)\n\n    check.str_param(new_arg, "new_arg")\n    check.str_param(old_arg, "old_arg")\n    check.opt_callable_param(coerce_old_to_new, "coerce_old_to_new")\n    check.opt_str_param(additional_warn_txt, "additional_warn_txt")\n    check.opt_int_param(stacklevel, "stacklevel")\n    if new_val is not None:\n        if old_val is not None:\n            check.failed(\n                'Do not use deprecated "{old_arg}" now that you are using "{new_arg}".'.format(\n                    old_arg=old_arg, new_arg=new_arg\n                )\n            )\n        return new_val\n    if old_val is not None:\n        _additional_warn_txt = f'Use "{new_arg}" instead.' 
+ (\n            (" " + additional_warn_txt) if additional_warn_txt else ""\n        )\n        deprecation_warning(\n            f'Argument "{old_arg}"', breaking_version, _additional_warn_txt, stacklevel + 1\n        )\n        return coerce_old_to_new(old_val) if coerce_old_to_new else old_val\n\n    return new_val\n\n\ndef deprecation_warning(\n    subject: str,\n    breaking_version: str,\n    additional_warn_txt: Optional[str] = None,\n    stacklevel: int = 3,\n):\n    warnings.warn(\n        f"{subject} is deprecated and will be removed in {breaking_version}."\n        + ((" " + additional_warn_txt) if additional_warn_txt else ""),\n        category=DeprecationWarning,\n        stacklevel=stacklevel,\n    )\n\n\ndef rename_warning(\n    new_name: str,\n    old_name: str,\n    breaking_version: str,\n    additional_warn_txt: Optional[str] = None,\n    stacklevel: int = 3,\n) -> None:\n    """\n    Common utility for managing backwards compatibility of renaming.\n    """\n    warnings.warn(\n        '"{old_name}" is deprecated and will be removed in {breaking_version}, use "{new_name}" instead.'.format(\n            old_name=old_name,\n            new_name=new_name,\n            breaking_version=breaking_version,\n        )\n        + ((" " + additional_warn_txt) if additional_warn_txt else ""),\n        category=DeprecationWarning,\n        stacklevel=stacklevel,\n    )\n\n\n
[docs]class ExperimentalWarning(Warning):\n pass
\n\n\ndef experimental_fn_warning(name: str, stacklevel: int = 3) -> None:\n """Utility for warning that a function is experimental"""\n warnings.warn(\n '"{name}" is an experimental function. It may break in future versions, even between dot'\n " releases. {help}".format(name=name, help=EXPERIMENTAL_WARNING_HELP),\n ExperimentalWarning,\n stacklevel=stacklevel,\n )\n\n\ndef experimental_decorator_warning(name: str, stacklevel: int = 3) -> None:\n """Utility for warning that a decorator is experimental"""\n warnings.warn(\n f'"{name}" is an experimental decorator. It may break in future versions, even between dot'\n f" releases. {EXPERIMENTAL_WARNING_HELP}",\n ExperimentalWarning,\n stacklevel=stacklevel,\n )\n\n\ndef experimental_class_warning(name: str, stacklevel: int = 3) -> None:\n """Utility for warning that a class is experimental. Expected to be called from the class's\n __init__ method.\n\n Usage:\n\n .. code-block:: python\n\n class MyExperimentalClass:\n def __init__(self, some_arg):\n experimental_class_warning('MyExperimentalClass')\n # do other initialization stuff\n """\n warnings.warn(\n '"{name}" is an experimental class. It may break in future versions, even between dot'\n " releases. {help}".format(name=name, help=EXPERIMENTAL_WARNING_HELP),\n ExperimentalWarning,\n stacklevel=stacklevel,\n )\n\n\ndef experimental_arg_warning(arg_name: str, fn_name: str, stacklevel: int = 3) -> None:\n """Utility for warning that an argument to a function is experimental"""\n warnings.warn(\n '"{arg_name}" is an experimental argument to function "{fn_name}". '\n "It may break in future versions, even between dot releases. {help}".format(\n arg_name=arg_name, fn_name=fn_name, help=EXPERIMENTAL_WARNING_HELP\n ),\n ExperimentalWarning,\n stacklevel=stacklevel,\n )\n\n\ndef experimental_functionality_warning(desc: str, stacklevel: int = 3) -> None:\n """Utility for warning that a particular functionality is experimental"""\n warnings.warn(\n f"{desc} is currently experimental functionality. It may break in future versions, even "\n f"between dot releases. {EXPERIMENTAL_WARNING_HELP}",\n ExperimentalWarning,\n stacklevel=stacklevel,\n )\n\n\ndef experimental_class_param_warning(param_name: str, class_name: str, stacklevel=3) -> None:\n """Utility for warning that an argument to a constructor is experimental"""\n warnings.warn(\n (\n f'"{param_name}" is an experimental parameter to the class "{class_name}". It may '\n f"break in future versions, even between dot releases. {EXPERIMENTAL_WARNING_HELP}"\n ),\n ExperimentalWarning,\n stacklevel=stacklevel,\n )\n\n\nF = TypeVar("F", bound=Callable)\n\n\ndef experimental(callable_: F) -> F:\n """\n Spews an "experimental" warning whenever the given callable is called. If the argument is a\n class, this means the warning will be emitted when the class is instantiated.\n\n Usage:\n\n .. 
code-block:: python\n\n @experimental\n def my_experimental_function(my_arg):\n do_stuff()\n\n @experimental\n class MyExperimentalClass:\n pass\n """\n check.callable_param(callable_, "callable_")\n\n if inspect.isfunction(callable_):\n\n @wraps(callable_)\n def _inner(*args, **kwargs):\n experimental_fn_warning(callable_.__name__, stacklevel=3)\n return callable_(*args, **kwargs)\n\n return cast(F, _inner)\n\n elif inspect.isclass(callable_):\n\n undecorated_init = callable_.__init__\n\n def __init__(self, *args, **kwargs):\n experimental_class_warning(callable_.__name__, stacklevel=3)\n # Tuples must be handled differently, because the undecorated_init does not take any\n # arguments-- they're assigned in __new__.\n if issubclass(cast(Type, callable_), tuple):\n undecorated_init(self)\n else:\n undecorated_init(self, *args, **kwargs)\n\n callable_.__init__ = __init__\n\n return cast(F, callable_)\n\n else:\n check.failed("callable_ must be a function or a class")\n\n\ndef experimental_decorator(decorator: F) -> F:\n """\n Spews an "experimental" warning whenever the given decorator is invoked.\n\n Usage:\n\n .. code-block:: python\n\n @experimental_decorator\n def my_experimental_decorator(...):\n ...\n """\n check.callable_param(decorator, "decorator")\n\n @wraps(decorator)\n def _inner(*args, **kwargs):\n experimental_decorator_warning(decorator.__name__, stacklevel=3)\n return decorator(*args, **kwargs)\n\n return cast(F, _inner)\n
", "current_page_name": "_modules/dagster/utils/backcompat", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster.utils"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.utils.backcompat"}, "body": "

Source code for dagster.utils

\nimport _thread as thread\nimport contextlib\nimport contextvars\nimport datetime\nimport errno\nimport functools\nimport inspect\nimport multiprocessing\nimport os\nimport re\nimport signal\nimport socket\nimport subprocess\nimport sys\nimport tempfile\nimport threading\nfrom collections import OrderedDict\nfrom datetime import timezone\nfrom enum import Enum\nfrom typing import TYPE_CHECKING, Any, Callable, ContextManager, Generator, Generic, Iterator\nfrom typing import Mapping as TypingMapping\nfrom typing import Optional, Type, TypeVar, Union, cast, overload\nfrom warnings import warn\n\nimport yaml\n\nimport dagster._check as check\nimport dagster.seven as seven\nfrom dagster.core.errors import DagsterExecutionInterruptedError, DagsterInvariantViolationError\nfrom dagster.seven import IS_WINDOWS\nfrom dagster.seven.abc import Mapping\n\nfrom .merger import merge_dicts\nfrom .yaml_utils import load_yaml_from_glob_list, load_yaml_from_globs, load_yaml_from_path\n\nif sys.version_info > (3,):\n    from pathlib import Path  # pylint: disable=import-error\nelse:\n    from pathlib2 import Path  # pylint: disable=import-error\n\nif TYPE_CHECKING:\n    from dagster.core.events import DagsterEvent\n\nT = TypeVar("T")\n\nEPOCH = datetime.datetime.utcfromtimestamp(0)\n\nPICKLE_PROTOCOL = 4\n\n\nDEFAULT_WORKSPACE_YAML_FILENAME = "workspace.yaml"\n\n\n# Back-compat after make_email_on_pipeline_failure_sensor and make_email_on_run_failure_sensor\n# were moved to avoid circular-dependency issues\ndef make_email_on_pipeline_failure_sensor(*args, **kwargs):\n    from .alert import make_email_on_pipeline_failure_sensor  # pylint: disable=redefined-outer-name\n\n    return make_email_on_pipeline_failure_sensor(*args, **kwargs)\n\n\n
[docs]def make_email_on_run_failure_sensor(*args, **kwargs):\n from .alert import make_email_on_run_failure_sensor # pylint: disable=redefined-outer-name\n\n return make_email_on_run_failure_sensor(*args, **kwargs)
\n\n\n
[docs]def file_relative_path(dunderfile: str, relative_path: str) -> str:\n """Get a path relative to the currently executing Python file.\n\n This function is useful when one needs to load a file that is relative to the position of\n the current file. (Such as when you encode a configuration file path in a source file and want\n it to be runnable from any current working directory.)\n\n Args:\n dunderfile (str): Should always be ``__file__``.\n relative_path (str): Path to get relative to the currently executing file.\n\n **Examples**:\n\n .. code-block:: python\n\n file_relative_path(__file__, 'path/relative/to/file')\n\n """\n\n check.str_param(dunderfile, "dunderfile")\n check.str_param(relative_path, "relative_path")\n\n return os.path.join(os.path.dirname(dunderfile), relative_path)

\n\n\ndef script_relative_path(file_path: str) -> str:\n """\n Useful for testing with local files. Use a path relative to where the\n test resides and this function will return the absolute path\n of that file. Otherwise it will be relative to script that\n ran the test\n\n Note: this is function is very, very expensive (on the order of 1\n millisecond per invocation) so this should only be used in performance\n insensitive contexts. Prefer file_relative_path for anything with\n performance constraints.\n\n """\n # from http://bit.ly/2snyC6s\n\n check.str_param(file_path, "file_path")\n scriptdir = inspect.stack()[1][1]\n return os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(scriptdir)), file_path))\n\n\n# Adapted from https://github.com/okunishinishi/python-stringcase/blob/master/stringcase.py\ndef camelcase(string):\n check.str_param(string, "string")\n\n string = re.sub(r"^[\\-_\\.]", "", str(string))\n if not string:\n return string\n return str(string[0]).upper() + re.sub(\n r"[\\-_\\.\\s]([a-z])", lambda matched: str(matched.group(1)).upper(), string[1:]\n )\n\n\ndef ensure_single_item(ddict):\n check.dict_param(ddict, "ddict")\n check.param_invariant(len(ddict) == 1, "ddict", "Expected dict with single item")\n return list(ddict.items())[0]\n\n\n@contextlib.contextmanager\ndef pushd(path):\n old_cwd = os.getcwd()\n os.chdir(path)\n try:\n yield path\n finally:\n os.chdir(old_cwd)\n\n\ndef safe_isfile(path):\n """ "Backport of Python 3.8 os.path.isfile behavior.\n\n This is intended to backport https://docs.python.org/dev/whatsnew/3.8.html#os-path. I'm not\n sure that there are other ways to provoke this behavior on Unix other than the null byte,\n but there are certainly other ways to do it on Windows. Afaict, we won't mask other\n ValueErrors, and the behavior in the status quo ante is rough because we risk throwing an\n unexpected, uncaught ValueError from very deep in our logic.\n """\n try:\n return os.path.isfile(path)\n except ValueError:\n return False\n\n\ndef mkdir_p(path):\n try:\n os.makedirs(path)\n return path\n except OSError as exc: # Python >2.5\n if exc.errno == errno.EEXIST and os.path.isdir(path):\n pass\n else:\n raise\n\n\n# TODO: Make frozendict generic for type annotations\n# https://github.com/dagster-io/dagster/issues/3641\nclass frozendict(dict):\n def __readonly__(self, *args, **kwargs):\n raise RuntimeError("Cannot modify ReadOnlyDict")\n\n # https://docs.python.org/3/library/pickle.html#object.__reduce__\n #\n # For a dict, the default behavior for pickle is to iteratively call __setitem__ (see 5th item\n # in __reduce__ tuple). Since we want to disable __setitem__ and still inherit dict, we\n # override this behavior by defining __reduce__. 
We return the 3rd item in the tuple, which is\n # passed to __setstate__, allowing us to restore the frozendict.\n\n def __reduce__(self):\n return (frozendict, (), dict(self))\n\n def __setstate__(self, state):\n self.__init__(state)\n\n __setitem__ = __readonly__\n __delitem__ = __readonly__\n pop = __readonly__ # type: ignore[assignment]\n popitem = __readonly__\n clear = __readonly__\n update = __readonly__ # type: ignore[assignment]\n setdefault = __readonly__ # type: ignore[assignment]\n del __readonly__\n\n def __hash__(self):\n return hash(tuple(sorted(self.items())))\n\n\nclass frozenlist(list):\n def __readonly__(self, *args, **kwargs):\n raise RuntimeError("Cannot modify ReadOnlyList")\n\n # https://docs.python.org/3/library/pickle.html#object.__reduce__\n #\n # Like frozendict, implement __reduce__ and __setstate__ to handle pickling.\n # Otherwise, __setstate__ will be called to restore the frozenlist, causing\n # a RuntimeError because frozenlist is not mutable.\n\n def __reduce__(self):\n return (frozenlist, (), list(self))\n\n def __setstate__(self, state):\n self.__init__(state)\n\n __setitem__ = __readonly__ # type: ignore[assignment]\n __delitem__ = __readonly__\n append = __readonly__\n clear = __readonly__\n extend = __readonly__\n insert = __readonly__\n pop = __readonly__\n remove = __readonly__\n reverse = __readonly__\n sort = __readonly__ # type: ignore[assignment]\n\n def __hash__(self):\n return hash(tuple(self))\n\n\ndef make_readonly_value(value):\n if isinstance(value, list):\n return frozenlist(list(map(make_readonly_value, value)))\n elif isinstance(value, dict):\n return frozendict({key: make_readonly_value(value) for key, value in value.items()})\n else:\n return value\n\n\ndef get_prop_or_key(elem, key):\n if isinstance(elem, Mapping):\n return elem.get(key)\n else:\n return getattr(elem, key)\n\n\ndef list_pull(alist, key):\n return list(map(lambda elem: get_prop_or_key(elem, key), alist))\n\n\ndef all_none(kwargs):\n for value in kwargs.values():\n if value is not None:\n return False\n return True\n\n\ndef check_script(path, return_code=0):\n try:\n subprocess.check_output([sys.executable, path])\n except subprocess.CalledProcessError as exc:\n if return_code != 0:\n if exc.returncode == return_code:\n return\n raise\n\n\ndef check_cli_execute_file_pipeline(path, pipeline_fn_name, env_file=None):\n from dagster.core.test_utils import instance_for_test\n\n with instance_for_test():\n cli_cmd = [\n sys.executable,\n "-m",\n "dagster",\n "pipeline",\n "execute",\n "-f",\n path,\n "-a",\n pipeline_fn_name,\n ]\n\n if env_file:\n cli_cmd.append("-c")\n cli_cmd.append(env_file)\n\n try:\n subprocess.check_output(cli_cmd)\n except subprocess.CalledProcessError as cpe:\n print(cpe) # pylint: disable=print-call\n raise cpe\n\n\ndef safe_tempfile_path_unmanaged() -> str:\n # This gets a valid temporary file path in the safest possible way, although there is still no\n # guarantee that another process will not create a file at this path. 
The NamedTemporaryFile is\n # deleted when the context manager exits and the file object is closed.\n #\n # This is preferable to using NamedTemporaryFile as a context manager and passing the name\n # attribute of the file object around because NamedTemporaryFiles cannot be opened a second time\n # if already open on Windows NT or later:\n # https://docs.python.org/3.8/library/tempfile.html#tempfile.NamedTemporaryFile\n # https://github.com/dagster-io/dagster/issues/1582\n with tempfile.NamedTemporaryFile() as fd:\n path = fd.name\n return Path(path).as_posix()\n\n\n@contextlib.contextmanager\ndef safe_tempfile_path() -> Iterator[str]:\n path = None\n try:\n path = safe_tempfile_path_unmanaged()\n yield path\n finally:\n if path is not None and os.path.exists(path):\n os.unlink(path)\n\n\n@overload\ndef ensure_gen(thing_or_gen: Generator[T, Any, Any]) -> Generator[T, Any, Any]:\n pass\n\n\n@overload\ndef ensure_gen(thing_or_gen: T) -> Generator[T, Any, Any]:\n pass\n\n\ndef ensure_gen(\n thing_or_gen: Union[T, Iterator[T], Generator[T, Any, Any]]\n) -> Generator[T, Any, Any]:\n if not inspect.isgenerator(thing_or_gen):\n thing_or_gen = cast(T, thing_or_gen)\n\n def _gen_thing():\n yield thing_or_gen\n\n return _gen_thing()\n\n return thing_or_gen\n\n\ndef ensure_dir(file_path):\n try:\n os.makedirs(file_path)\n except OSError as e:\n if e.errno != errno.EEXIST:\n raise\n\n\ndef ensure_file(path):\n ensure_dir(os.path.dirname(path))\n if not os.path.exists(path):\n touch_file(path)\n\n\ndef touch_file(path):\n ensure_dir(os.path.dirname(path))\n with open(path, "a", encoding="utf8"):\n os.utime(path, None)\n\n\ndef _kill_on_event(termination_event):\n termination_event.wait()\n send_interrupt()\n\n\ndef send_interrupt():\n if IS_WINDOWS:\n # This will raise a KeyboardInterrupt in python land - meaning this wont be able to\n # interrupt things like sleep()\n thread.interrupt_main()\n else:\n # If on unix send an os level signal to interrupt any situation we may be stuck in\n os.kill(os.getpid(), signal.SIGINT)\n\n\n# Function to be invoked by daemon thread in processes which seek to be cancellable.\n# The motivation for this approach is to be able to exit cleanly on Windows. 
An alternative\n# path is to change how the processes are opened and send CTRL_BREAK signals, which at\n# the time of authoring seemed a more costly approach.\n#\n# Reading for the curious:\n# * https://stackoverflow.com/questions/35772001/how-to-handle-the-signal-in-python-on-windows-machine\n# * https://stefan.sofa-rockers.org/2013/08/15/handling-sub-process-hierarchies-python-linux-os-x/\ndef start_termination_thread(termination_event):\n check.inst_param(termination_event, "termination_event", ttype=type(multiprocessing.Event()))\n\n int_thread = threading.Thread(\n target=_kill_on_event, args=(termination_event,), name="kill-on-event"\n )\n int_thread.daemon = True\n int_thread.start()\n\n\n# Executes the next() function within an instance of the supplied context manager class\n# (leaving the context before yielding each result)\ndef iterate_with_context(\n context_fn: Callable[[], ContextManager], iterator: Iterator[T]\n) -> Iterator[T]:\n while True:\n # Allow interrupts during user code so that we can terminate slow/hanging steps\n with context_fn():\n try:\n next_output = next(iterator)\n except StopIteration:\n return\n\n yield next_output\n\n\ndef datetime_as_float(dt):\n check.inst_param(dt, "dt", datetime.datetime)\n return float((dt - EPOCH).total_seconds())\n\n\n# hashable frozen string to string dict\nclass frozentags(frozendict):\n def __init__(self, *args, **kwargs):\n super(frozentags, self).__init__(*args, **kwargs)\n check.dict_param(self, "self", key_type=str, value_type=str)\n\n def __hash__(self):\n return hash(tuple(sorted(self.items())))\n\n def updated_with(self, new_tags):\n check.dict_param(new_tags, "new_tags", key_type=str, value_type=str)\n updated = dict(self)\n for key, value in new_tags.items():\n updated[key] = value\n\n return frozentags(updated)\n\n\nGeneratedContext = TypeVar("GeneratedContext")\n\n\nclass EventGenerationManager(Generic[GeneratedContext]):\n """Utility class that wraps an event generator function, that also yields a single instance of\n a typed object. 
All events yielded before the typed object are yielded through the method\n `generate_setup_events` and all events yielded after the typed object are yielded through the\n method `generate_teardown_events`.\n\n This is used to help replace the context managers used in pipeline initialization with\n generators so that we can begin emitting initialization events AND construct a pipeline context\n object, while managing explicit setup/teardown.\n\n This does require calling `generate_setup_events` AND `generate_teardown_events` in order to\n get the typed object.\n """\n\n def __init__(\n self,\n generator: Generator[Union["DagsterEvent", GeneratedContext], None, None],\n object_cls: Type[GeneratedContext],\n require_object: Optional[bool] = True,\n ):\n self.generator = check.generator(generator)\n self.object_cls: Type[GeneratedContext] = check.class_param(object_cls, "object_cls")\n self.require_object = check.bool_param(require_object, "require_object")\n self.object: Optional[GeneratedContext] = None\n self.did_setup = False\n self.did_teardown = False\n\n def generate_setup_events(self) -> Iterator["DagsterEvent"]:\n self.did_setup = True\n try:\n while self.object is None:\n obj = next(self.generator)\n if isinstance(obj, self.object_cls):\n self.object = obj\n else:\n yield obj\n except StopIteration:\n if self.require_object:\n check.inst_param(\n self.object,\n "self.object",\n self.object_cls,\n "generator never yielded object of type {}".format(self.object_cls.__name__),\n )\n\n def get_object(self) -> GeneratedContext:\n if not self.did_setup:\n check.failed("Called `get_object` before `generate_setup_events`")\n return cast(GeneratedContext, self.object)\n\n def generate_teardown_events(self) -> Iterator["DagsterEvent"]:\n self.did_teardown = True\n if self.object:\n yield from self.generator\n\n\ndef utc_datetime_from_timestamp(timestamp):\n tz = timezone.utc\n return datetime.datetime.fromtimestamp(timestamp, tz=tz)\n\n\ndef utc_datetime_from_naive(dt):\n tz = timezone.utc\n return dt.replace(tzinfo=tz)\n\n\ndef is_enum_value(value):\n return False if value is None else issubclass(value.__class__, Enum)\n\n\ndef git_repository_root():\n return subprocess.check_output(["git", "rev-parse", "--show-toplevel"]).decode("utf-8").strip()\n\n\ndef segfault():\n """Reliable cross-Python version segfault.\n\n https://bugs.python.org/issue1215#msg143236\n """\n import ctypes\n\n ctypes.string_at(0)\n\n\ndef find_free_port():\n with contextlib.closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:\n s.bind(("", 0))\n s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)\n return s.getsockname()[1]\n\n\n@contextlib.contextmanager\ndef alter_sys_path(to_add, to_remove):\n to_restore = [path for path in sys.path]\n\n # remove paths\n for path in to_remove:\n if path in sys.path:\n sys.path.remove(path)\n\n # add paths\n for path in to_add:\n sys.path.insert(0, path)\n\n try:\n yield\n finally:\n sys.path = to_restore\n\n\n@contextlib.contextmanager\ndef restore_sys_modules():\n sys_modules = {k: v for k, v in sys.modules.items()}\n try:\n yield\n finally:\n to_delete = set(sys.modules) - set(sys_modules)\n for key in to_delete:\n del sys.modules[key]\n\n\ndef process_is_alive(pid):\n if IS_WINDOWS:\n import psutil # pylint: disable=import-error\n\n return psutil.pid_exists(pid=pid)\n else:\n try:\n subprocess.check_output(["ps", str(pid)])\n except subprocess.CalledProcessError as exc:\n assert exc.returncode == 1\n return False\n return True\n\n\ndef compose(*args):\n """\n 
Compose python functions args such that compose(f, g)(x) is equivalent to f(g(x)).\n """\n # reduce using functional composition over all the arguments, with the identity function as\n # initializer\n return functools.reduce(lambda f, g: lambda x: f(g(x)), args, lambda x: x)\n\n\ndef dict_without_keys(ddict, *keys):\n return {key: value for key, value in ddict.items() if key not in set(keys)}\n\n\nclass Counter:\n def __init__(self):\n self._lock = threading.Lock()\n self._counts = OrderedDict()\n super(Counter, self).__init__()\n\n def increment(self, key: str):\n with self._lock:\n self._counts[key] = self._counts.get(key, 0) + 1\n\n def counts(self) -> TypingMapping[str, int]:\n with self._lock:\n copy = {k: v for k, v in self._counts.items()}\n return copy\n\n\ntraced_counter = contextvars.ContextVar("traced_counts", default=Counter())\n\n\ndef traced(func=None):\n """\n A decorator that keeps track of how many times a function is called.\n """\n\n def inner(*args, **kwargs):\n counter = traced_counter.get()\n if counter and isinstance(counter, Counter):\n counter.increment(func.__qualname__)\n\n return func(*args, **kwargs)\n\n return inner\n
", "current_page_name": "_modules/dagster/utils", "customsidebar": null, "forked_pdb": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.utils.forked_pdb

\nimport pdb\nimport sys\n\n\n# From https://stackoverflow.com/questions/4716533/how-to-attach-debugger-to-a-python-subproccess\n
[docs]class ForkedPdb(pdb.Pdb):\n """A pdb subclass that may be used from a forked multiprocessing child\n\n **Examples**:\n\n .. code-block:: python\n\n from dagster.utils.forked_pdb import ForkedPdb\n\n @solid\n def complex_solid(_):\n # some complicated stuff\n\n ForkedPdb().set_trace()\n\n # some other complicated stuff\n\n You can initiate pipeline execution via dagit and use the pdb debugger to examine/step through\n execution at the breakpoint.\n """\n\n def interaction(self, frame, traceback):\n _stdin = sys.stdin\n try:\n sys.stdin = open("/dev/stdin", encoding="utf8")\n pdb.Pdb.interaction(self, frame, traceback)\n finally:\n sys.stdin = _stdin
\n
", "current_page_name": "_modules/dagster/utils/forked_pdb", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster.utils"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.utils.forked_pdb"}, "log": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.utils.log

\nimport copy\nimport logging\nimport sys\nimport traceback\nfrom contextlib import contextmanager\nfrom typing import Dict, NamedTuple, Optional\n\nimport coloredlogs\nimport pendulum\n\nimport dagster._check as check\nimport dagster.seven as seven\nfrom dagster.config import Enum, EnumValue\nfrom dagster.core.definitions.logger_definition import logger\nfrom dagster.core.utils import PYTHON_LOGGING_LEVELS_MAPPING, coerce_valid_log_level\n\nLogLevelEnum = Enum("log_level", list(map(EnumValue, PYTHON_LOGGING_LEVELS_MAPPING.keys())))\n\n\nclass JsonFileHandler(logging.Handler):\n    def __init__(self, json_path: str):\n        super(JsonFileHandler, self).__init__()\n        self.json_path = check.str_param(json_path, "json_path")\n\n    def emit(self, record: logging.LogRecord) -> None:\n        try:\n            log_dict = copy.copy(record.__dict__)\n\n            # This horrific monstrosity is to maintain backwards compatability\n            # with the old behavior of the JsonFileHandler, which the clarify\n            # project has a dependency on. It relied on the dagster-defined\n            # properties smashing all the properties of the LogRecord object\n            # and uploads all of those properties to a redshift table for\n            # in order to do analytics on the log\n\n            if "dagster_meta" in log_dict:\n                dagster_meta_dict = log_dict["dagster_meta"]\n                del log_dict["dagster_meta"]\n            else:\n                dagster_meta_dict = {}\n\n            log_dict.update(dagster_meta_dict)\n\n            with open(self.json_path, "a", encoding="utf8") as ff:\n                text_line = seven.json.dumps(log_dict)\n                ff.write(text_line + "\\n")\n        # Need to catch Exception here, so disabling lint\n        except Exception as e:  # pylint: disable=W0703\n            logging.critical("[%s] Error during logging!", self.__class__.__name__)\n            logging.exception(str(e))\n\n\nclass StructuredLoggerMessage(\n    NamedTuple(\n        "_StructuredLoggerMessage",\n        [\n            ("name", str),\n            ("message", str),\n            ("level", int),\n            ("meta", Dict[object, object]),\n            ("record", logging.LogRecord),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        name: str,\n        message: str,\n        level: int,\n        meta: Dict[object, object],\n        record: logging.LogRecord,\n    ):\n        return super(StructuredLoggerMessage, cls).__new__(\n            cls,\n            check.str_param(name, "name"),\n            check.str_param(message, "message"),\n            coerce_valid_log_level(level),\n            check.dict_param(meta, "meta"),\n            check.inst_param(record, "record", logging.LogRecord),\n        )\n\n\nclass JsonEventLoggerHandler(logging.Handler):\n    def __init__(self, json_path: str, construct_event_record):\n        super(JsonEventLoggerHandler, self).__init__()\n        self.json_path = check.str_param(json_path, "json_path")\n        self.construct_event_record = construct_event_record\n\n    def emit(self, record: logging.LogRecord) -> None:\n        try:\n            event_record = self.construct_event_record(record)\n            with open(self.json_path, "a", encoding="utf8") as ff:\n                text_line = seven.json.dumps(event_record.to_dict())\n                ff.write(text_line + "\\n")\n\n        # Need to catch Exception here, so disabling lint\n        except Exception as e:  # pylint: disable=W0703\n            
logging.critical("[%s] Error during logging!", self.__class__.__name__)\n            logging.exception(str(e))\n\n\nclass StructuredLoggerHandler(logging.Handler):\n    def __init__(self, callback):\n        super(StructuredLoggerHandler, self).__init__()\n        self.callback = check.is_callable(callback, "callback")\n\n    def emit(self, record: logging.LogRecord) -> None:\n        try:\n            self.callback(\n                StructuredLoggerMessage(\n                    name=record.name,\n                    message=record.msg,\n                    level=record.levelno,\n                    meta=record.dagster_meta,  # type: ignore\n                    record=record,\n                )\n            )\n        # Need to catch Exception here, so disabling lint\n        except Exception as e:  # pylint: disable=W0703\n            logging.critical("[%s] Error during logging!", self.__class__.__name__)\n            logging.exception(str(e))\n\n\ndef construct_single_handler_logger(name, level, handler):\n    check.str_param(name, "name")\n    check.inst_param(handler, "handler", logging.Handler)\n\n    level = coerce_valid_log_level(level)\n\n    @logger\n    def single_handler_logger(_init_context):\n        klass = logging.getLoggerClass()\n        logger_ = klass(name, level=level)\n        logger_.addHandler(handler)\n        handler.setLevel(level)\n        return logger_\n\n    return single_handler_logger\n\n\n# Base python logger whose messages will be captured as structured Dagster log messages.\nBASE_DAGSTER_LOGGER = logging.getLogger(name="dagster")\n\n\n
[docs]def get_dagster_logger(name: Optional[str] = None) -> logging.Logger:\n """\n Creates a python logger whose output messages will be captured and converted into Dagster log\n messages. This means they will have structured information such as the step_key, run_id, etc.\n embedded into them, and will show up in the Dagster event log.\n\n This can be used as a more convenient alternative to `context.log` in most cases. If log level\n is not set explicitly, defaults to DEBUG.\n\n Args:\n name (Optional[str]): If supplied, will create a logger with the name "dagster.builtin.{name}",\n with properties inherited from the base Dagster logger. If omitted, the returned logger\n will be named "dagster.builtin".\n\n Returns:\n :class:`logging.Logger`: A logger whose output will be captured by Dagster.\n\n Example:\n\n .. code-block:: python\n\n from dagster import get_dagster_logger, op\n\n @op\n def hello_op():\n log = get_dagster_logger()\n for i in range(5):\n # do something\n log.info(f"Did {i+1} things!")\n\n """\n\n # enforce that the parent logger will always have a DEBUG log level\n BASE_DAGSTER_LOGGER.setLevel(logging.DEBUG)\n base_builtin = BASE_DAGSTER_LOGGER.getChild("builtin")\n if name:\n return base_builtin.getChild(name)\n return base_builtin
\n\n\ndef define_structured_logger(name, callback, level):\n check.str_param(name, "name")\n check.callable_param(callback, "callback")\n level = coerce_valid_log_level(level)\n\n return construct_single_handler_logger(name, level, StructuredLoggerHandler(callback))\n\n\ndef define_json_file_logger(name, json_path, level):\n check.str_param(name, "name")\n check.str_param(json_path, "json_path")\n level = coerce_valid_log_level(level)\n\n stream_handler = JsonFileHandler(json_path)\n stream_handler.setFormatter(define_default_formatter())\n return construct_single_handler_logger(name, level, stream_handler)\n\n\ndef get_stack_trace_array(exception):\n check.inst_param(exception, "exception", Exception)\n if hasattr(exception, "__traceback__"):\n tb = exception.__traceback__\n else:\n _exc_type, _exc_value, tb = sys.exc_info()\n return traceback.format_tb(tb)\n\n\ndef _mockable_formatTime(record, datefmt=None): # pylint: disable=unused-argument\n """Uses pendulum.now to determine the logging time, causing pendulum\n mocking to affect the logger timestamp in tests."""\n return pendulum.now().strftime(datefmt if datefmt else default_date_format_string())\n\n\ndef default_format_string():\n return "%(asctime)s - %(name)s - %(levelname)s - %(message)s"\n\n\ndef default_date_format_string():\n return "%Y-%m-%d %H:%M:%S %z"\n\n\ndef define_default_formatter():\n return logging.Formatter(default_format_string(), default_date_format_string())\n\n\n@contextmanager\ndef quieten(quiet=True, level=logging.WARNING):\n if quiet:\n logging.disable(level)\n try:\n yield\n finally:\n if quiet:\n logging.disable(logging.NOTSET)\n\n\ndef configure_loggers(handler="default", log_level="INFO"):\n LOGGING_CONFIG = {\n "version": 1,\n "disable_existing_loggers": False,\n "formatters": {\n "colored": {\n "()": coloredlogs.ColoredFormatter,\n "fmt": default_format_string(),\n "datefmt": default_date_format_string(),\n "field_styles": {"levelname": {"color": "blue"}, "asctime": {"color": "green"}},\n "level_styles": {"debug": {}, "error": {"color": "red"}},\n },\n },\n "handlers": {\n "default": {\n "formatter": "colored",\n "class": "logging.StreamHandler",\n "stream": sys.stdout,\n "level": log_level,\n },\n "null": {\n "class": "logging.NullHandler",\n },\n },\n "loggers": {\n "dagster": {\n "handlers": [handler],\n "level": "INFO",\n },\n "dagit": {\n "handlers": [handler],\n "level": "INFO",\n },\n },\n }\n\n logging.config.dictConfig(LOGGING_CONFIG)\n\n if handler == "default":\n for name in ["dagster", "dagit"]:\n logging.getLogger(name).handlers[0].formatter.formatTime = _mockable_formatTime\n
", "current_page_name": "_modules/dagster/utils/log", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster.utils"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.utils.log"}, "parents": [{"link": "../../", "title": "Module code"}], "partitions": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.utils.partitions

\nimport datetime\nfrom typing import Callable, Union\n\nimport pendulum\n\nimport dagster._check as check\nfrom dagster.core.definitions.partition import Partition, PartitionSetDefinition\nfrom dagster.core.definitions.run_request import SkipReason\nfrom dagster.core.definitions.schedule_definition import ScheduleEvaluationContext\nfrom dagster.core.errors import DagsterInvariantViolationError\nfrom dagster.seven.compat.pendulum import PendulumDateTime, to_timezone\n\nDEFAULT_MONTHLY_FORMAT = "%Y-%m"\nDEFAULT_DATE_FORMAT = "%Y-%m-%d"\nDEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE = "%Y-%m-%d-%H:%M"\nDEFAULT_HOURLY_FORMAT_WITH_TIMEZONE = DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE + "%z"\n\n\n
[docs]def date_partition_range(\n start,\n end=None,\n delta_range="days",\n fmt=None,\n inclusive=False,\n timezone=None,\n):\n """Utility function that returns a partition generating function to be used in creating a\n `PartitionSet` definition.\n\n Args:\n start (datetime): Datetime capturing the start of the time range.\n end (Optional(datetime)): Datetime capturing the end of the partition. By default, the\n current time is used. The range is not inclusive of the end\n value.\n delta_range (Optional(str)): string representing the time duration of each partition.\n Must be a valid argument to pendulum.period.range ("days", "hours", "months", etc.).\n fmt (Optional(str)): Format string to represent each partition by its start time\n inclusive (Optional(bool)): By default, the partition set only contains date interval\n partitions for which the end time of the interval is less than current time. In other\n words, the partition set contains date interval partitions that are completely in the\n past. If inclusive is set to True, then the partition set will include all date\n interval partitions for which the start time of the interval is less than the\n current time.\n timezone (Optional(str)): Timezone in which the partition values should be expressed.\n Returns:\n Callable[[], List[Partition]]\n """\n\n check.inst_param(start, "start", datetime.datetime)\n check.opt_inst_param(end, "end", datetime.datetime)\n check.str_param(delta_range, "delta_range")\n fmt = check.opt_str_param(fmt, "fmt", default=DEFAULT_DATE_FORMAT)\n check.opt_str_param(timezone, "timezone")\n\n delta_amount = 1\n\n if end and start > end:\n raise DagsterInvariantViolationError(\n 'Selected date range start "{start}" is after date range end "{end}'.format(\n start=start.strftime(fmt),\n end=end.strftime(fmt),\n )\n )\n\n def get_date_range_partitions(current_time=None):\n check.opt_inst_param(current_time, "current_time", datetime.datetime)\n tz = timezone if timezone else "UTC"\n _start = (\n to_timezone(start, tz)\n if isinstance(start, PendulumDateTime)\n else pendulum.instance(start, tz=tz)\n )\n\n if end:\n _end = end\n elif current_time:\n _end = current_time\n else:\n _end = pendulum.now(tz)\n\n # coerce to the definition timezone\n if isinstance(_end, PendulumDateTime):\n _end = to_timezone(_end, tz)\n else:\n _end = pendulum.instance(_end, tz=tz)\n\n period = pendulum.period(_start, _end)\n date_names = [\n Partition(value=current, name=current.strftime(fmt))\n for current in period.range(delta_range, delta_amount)\n ]\n\n # We don't include the last element here by default since we only want\n # fully completed intervals, and the _end time is in the middle of the interval\n # represented by the last element of date_names\n if inclusive:\n return date_names\n\n return date_names[:-1]\n\n return get_date_range_partitions
\n\n\n
[docs]def identity_partition_selector(context, partition_set_def):\n """Utility function for supplying a partition selector when creating a schedule from a\n partition set made of ``datetime`` objects that assumes the schedule always executes at the\n partition time.\n\n It's important that the cron string passed into ``create_schedule_definition`` match\n the partition set times. For example, a schedule created from a partition set with partitions for each day at\n midnight would create its partition selector as follows:\n\n .. code-block:: python\n\n partition_set = PartitionSetDefinition(\n name='hello_world_partition_set',\n pipeline_name='hello_world_pipeline',\n partition_fn= date_partition_range(\n start=datetime.datetime(2021, 1, 1),\n delta_range="days",\n timezone="US/Central",\n )\n run_config_fn_for_partition=my_run_config_fn,\n )\n\n schedule_definition = partition_set.create_schedule_definition(\n "hello_world_daily_schedule",\n "0 0 * * *",\n partition_selector=identity_partition_selector,\n execution_timezone="US/Central",\n )\n """\n\n return create_offset_partition_selector(lambda d: d)(context, partition_set_def)
\n\n\n
[docs]def create_offset_partition_selector(\n execution_time_to_partition_fn,\n) -> Callable[[ScheduleEvaluationContext, PartitionSetDefinition], Union[Partition, SkipReason]]:\n """Utility function for supplying a partition selector when creating a schedule from a\n partition set made of ``datetime`` objects that assumes a fixed time offset between the\n partition time and the time at which the schedule executes.\n\n It's important to keep the cron string that's supplied to\n ``PartitionSetDefinition.create_schedule_definition`` in sync with the offset that's\n supplied to this function. For example, a schedule created from a partition set with\n partitions for each day at midnight that fills in the partition for day N at day N+1 at\n 10:00AM would create the partition selector as follows:\n\n .. code-block:: python\n\n partition_set = PartitionSetDefinition(\n name='hello_world_partition_set',\n pipeline_name='hello_world_pipeline',\n partition_fn= date_partition_range(\n start=datetime.datetime(2021, 1, 1),\n delta_range="days",\n timezone="US/Central",\n )\n run_config_fn_for_partition=my_run_config_fn,\n )\n\n schedule_definition = partition_set.create_schedule_definition(\n "daily_10am_schedule",\n "0 10 * * *",\n partition_selector=create_offset_partition_selector(lambda d: d.subtract(hours=10, days=1))\n execution_timezone="US/Central",\n )\n\n Args:\n execution_time_to_partition_fn (Callable[[datetime.datetime], datetime.datetime]): A\n function that maps the execution time of the schedule to the partition time.\n """\n\n check.callable_param(execution_time_to_partition_fn, "execution_time_to_partition_fn")\n\n def offset_partition_selector(\n context: ScheduleEvaluationContext, partition_set_def: PartitionSetDefinition\n ) -> Union[Partition, SkipReason]:\n no_partitions_skip_reason = SkipReason(\n "Partition selector did not return a partition. Make sure that the timezone "\n "on your partition set matches your execution timezone."\n )\n\n earliest_possible_partition = next(iter(partition_set_def.get_partitions(None)), None)\n if not earliest_possible_partition:\n return no_partitions_skip_reason\n\n valid_partitions = partition_set_def.get_partitions(context.scheduled_execution_time)\n\n if not context.scheduled_execution_time:\n if not valid_partitions:\n return no_partitions_skip_reason\n return valid_partitions[-1]\n\n partition_time = execution_time_to_partition_fn(context.scheduled_execution_time)\n\n if partition_time < earliest_possible_partition.value:\n return SkipReason(\n f"Your partition ({partition_time.isoformat()}) is before the beginning of "\n f"the partition set ({earliest_possible_partition.value.isoformat()}). "\n "Verify your schedule's start_date is correct."\n )\n\n if partition_time > valid_partitions[-1].value:\n return SkipReason(\n f"Your partition ({partition_time.isoformat()}) is after the end of "\n f"the partition set ({valid_partitions[-1].value.isoformat()}). "\n "Verify your schedule's end_date is correct."\n )\n\n for partition in valid_partitions:\n if partition.value.isoformat() == partition_time.isoformat():\n return partition\n\n return no_partitions_skip_reason\n\n return offset_partition_selector
\n
", "current_page_name": "_modules/dagster/utils/partitions", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster.utils"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.utils.partitions"}, "sidebars": ["globaltoc.html", "searchbox.html"], "test": {"alabaster_version": "0.7.12", "body": "

Source code for dagster.utils.test

\nimport os\nimport shutil\nimport tempfile\nimport uuid\nfrom collections import defaultdict\nfrom contextlib import contextmanager\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Dict, Generator, Optional, Union\n\n# top-level include is dangerous in terms of incurring circular deps\nfrom dagster import (\n    DagsterInvariantViolationError,\n    DependencyDefinition,\n    Failure,\n    ModeDefinition,\n    NodeInvocation,\n    PipelineDefinition,\n    RepositoryDefinition,\n    TypeCheck,\n)\nfrom dagster import _check as check\nfrom dagster import execute_pipeline, lambda_solid\nfrom dagster.core.definitions.logger_definition import LoggerDefinition\nfrom dagster.core.definitions.pipeline_base import InMemoryPipeline\nfrom dagster.core.definitions.resource_definition import ScopedResourcesBuilder\nfrom dagster.core.definitions.solid_definition import NodeDefinition\nfrom dagster.core.execution.api import create_execution_plan, scoped_pipeline_context\nfrom dagster.core.execution.context.system import PlanExecutionContext\nfrom dagster.core.execution.context_creation_pipeline import (\n    create_context_creation_data,\n    create_execution_data,\n    create_executor,\n    create_log_manager,\n    create_plan_data,\n)\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.core.scheduler import Scheduler\nfrom dagster.core.scheduler.scheduler import DagsterScheduleDoesNotExist, DagsterSchedulerError\nfrom dagster.core.snap import snapshot_from_execution_plan\nfrom dagster.core.storage.file_manager import LocalFileManager\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.core.types.dagster_type import resolve_dagster_type\nfrom dagster.core.utility_solids import define_stub_solid\nfrom dagster.core.utils import make_new_run_id\nfrom dagster.serdes import ConfigurableClass\n\n# pylint: disable=unused-import\nfrom ..temp_file import (\n    get_temp_dir,\n    get_temp_file_handle,\n    get_temp_file_handle_with_data,\n    get_temp_file_name,\n    get_temp_file_name_with_data,\n    get_temp_file_names,\n)\nfrom ..typing_api import is_typing_type\n\nif TYPE_CHECKING:\n    from dagster._core.execution.results import CompositeSolidExecutionResult, SolidExecutionResult\n\n\ndef create_test_pipeline_execution_context(\n    logger_defs: Optional[Dict[str, LoggerDefinition]] = None\n) -> PlanExecutionContext:\n    loggers = check.opt_dict_param(\n        logger_defs, "logger_defs", key_type=str, value_type=LoggerDefinition\n    )\n    mode_def = ModeDefinition(logger_defs=loggers)\n    pipeline_def = PipelineDefinition(\n        name="test_legacy_context", solid_defs=[], mode_defs=[mode_def]\n    )\n    run_config: Dict[str, Dict[str, Dict]] = {"loggers": {key: {} for key in loggers}}\n    pipeline_run = PipelineRun(pipeline_name="test_legacy_context", run_config=run_config)\n    instance = DagsterInstance.ephemeral()\n    execution_plan = create_execution_plan(pipeline=pipeline_def, run_config=run_config)\n    creation_data = create_context_creation_data(\n        InMemoryPipeline(pipeline_def), execution_plan, run_config, pipeline_run, instance\n    )\n    log_manager = create_log_manager(creation_data)\n    scoped_resources_builder = ScopedResourcesBuilder()\n    executor = create_executor(creation_data)\n\n    return PlanExecutionContext(\n        plan_data=create_plan_data(creation_data, True, executor.retries),\n        execution_data=create_execution_data(\n            context_creation_data=creation_data,\n            
scoped_resources_builder=scoped_resources_builder,\n        ),\n        log_manager=log_manager,\n        output_capture=None,\n    )\n\n\ndef _dep_key_of(solid):\n    return NodeInvocation(solid.definition.name, solid.name)\n\n\ndef build_pipeline_with_input_stubs(\n    pipeline_def: PipelineDefinition, inputs: Dict[str, dict]\n) -> PipelineDefinition:\n    check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n    check.dict_param(inputs, "inputs", key_type=str, value_type=dict)\n\n    deps: Dict[str, Dict[str, object]] = defaultdict(dict)\n    for solid_name, dep_dict in pipeline_def.dependencies.items():\n        for input_name, dep in dep_dict.items():\n            deps[solid_name][input_name] = dep  # type: ignore\n\n    stub_solid_defs = []\n\n    for solid_name, input_dict in inputs.items():\n        if not pipeline_def.has_solid_named(solid_name):\n            raise DagsterInvariantViolationError(\n                (\n                    "You are injecting an input value for solid {solid_name} "\n                    "into pipeline {pipeline_name} but that solid was not found"\n                ).format(solid_name=solid_name, pipeline_name=pipeline_def.name)\n            )\n\n        solid = pipeline_def.solid_named(solid_name)\n        for input_name, input_value in input_dict.items():\n            stub_solid_def = define_stub_solid(\n                "__stub_{solid_name}_{input_name}".format(\n                    solid_name=solid_name, input_name=input_name\n                ),\n                input_value,\n            )\n            stub_solid_defs.append(stub_solid_def)\n            deps[_dep_key_of(solid)][input_name] = DependencyDefinition(stub_solid_def.name)  # type: ignore\n\n    return PipelineDefinition(\n        name=pipeline_def.name + "_stubbed",\n        solid_defs=pipeline_def.top_level_solid_defs + stub_solid_defs,\n        mode_defs=pipeline_def.mode_definitions,\n        dependencies=deps,  # type: ignore\n    )\n\n\n
[docs]def execute_solids_within_pipeline(\n pipeline_def: PipelineDefinition,\n solid_names: AbstractSet[str],\n inputs: Optional[Dict[str, dict]] = None,\n run_config: Optional[Dict[str, object]] = None,\n mode: Optional[str] = None,\n preset: Optional[str] = None,\n tags: Optional[Dict[str, str]] = None,\n instance: Optional[DagsterInstance] = None,\n) -> Dict[str, Union["CompositeSolidExecutionResult", "SolidExecutionResult"]]:\n """Execute a set of solids within an existing pipeline.\n\n Intended to support tests. Input values may be passed directly.\n\n Args:\n pipeline_def (PipelineDefinition): The pipeline within which to execute the solid.\n solid_names (FrozenSet[str]): A set of the solid names, or the aliased solids, to execute.\n inputs (Optional[Dict[str, Dict[str, Any]]]): A dict keyed on solid names, whose values are\n dicts of input names to input values, used to pass input values to the solids directly.\n You may also use the ``run_config`` to configure any inputs that are configurable.\n run_config (Optional[dict]): The configuration that parameterized this\n execution, as a dict.\n mode (Optional[str]): The name of the pipeline mode to use. You may not set both ``mode``\n and ``preset``.\n preset (Optional[str]): The name of the pipeline preset to use. You may not set both\n ``mode`` and ``preset``.\n tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to pipeline\n logs.\n instance (Optional[DagsterInstance]): The instance to execute against. If this is ``None``,\n an ephemeral instance will be used, and no artifacts will be persisted from the run.\n\n Returns:\n Dict[str, Union[CompositeSolidExecutionResult, SolidExecutionResult]]: The results of\n executing the solids, keyed by solid name.\n """\n check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n check.set_param(solid_names, "solid_names", of_type=str)\n inputs = check.opt_dict_param(inputs, "inputs", key_type=str, value_type=dict)\n\n sub_pipeline = pipeline_def.get_pipeline_subset_def(solid_names)\n stubbed_pipeline = build_pipeline_with_input_stubs(sub_pipeline, inputs)\n result = execute_pipeline(\n stubbed_pipeline,\n run_config=run_config,\n mode=mode,\n preset=preset,\n tags=tags,\n instance=instance,\n )\n\n return {sr.solid.name: sr for sr in result.solid_result_list}
\n\n\n
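# Example (illustrative addition, not part of the original module): a minimal
# sketch of using execute_solids_within_pipeline in a test. The pipeline and
# solid names below are hypothetical; the injected input stubs out the upstream
# dependency of the selected solid.
from dagster import pipeline, solid


@solid
def add_one(_, num: int) -> int:
    return num + 1


@solid
def multiply_by_two(_, num: int) -> int:
    return num * 2


@pipeline
def arithmetic_pipeline():
    multiply_by_two(add_one())


def test_multiply_by_two_in_isolation():
    # Only "multiply_by_two" executes; its "num" input is stubbed with 3.
    results = execute_solids_within_pipeline(
        arithmetic_pipeline,
        solid_names={"multiply_by_two"},
        inputs={"multiply_by_two": {"num": 3}},
    )
    assert results["multiply_by_two"].success
    assert results["multiply_by_two"].output_value() == 6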
[docs]def execute_solid_within_pipeline(\n pipeline_def,\n solid_name,\n inputs=None,\n run_config=None,\n mode=None,\n preset=None,\n tags=None,\n instance=None,\n):\n """Execute a single solid within an existing pipeline.\n\n Intended to support tests. Input values may be passed directly.\n\n Args:\n pipeline_def (PipelineDefinition): The pipeline within which to execute the solid.\n solid_name (str): The name of the solid, or the aliased solid, to execute.\n inputs (Optional[Dict[str, Any]]): A dict of input names to input values, used to\n pass input values to the solid directly. You may also use the ``run_config`` to\n configure any inputs that are configurable.\n run_config (Optional[dict]): The configuration that parameterized this\n execution, as a dict.\n mode (Optional[str]): The name of the pipeline mode to use. You may not set both ``mode``\n and ``preset``.\n preset (Optional[str]): The name of the pipeline preset to use. You may not set both\n ``mode`` and ``preset``.\n tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to pipeline\n logs.\n instance (Optional[DagsterInstance]): The instance to execute against. If this is ``None``,\n an ephemeral instance will be used, and no artifacts will be persisted from the run.\n\n Returns:\n Union[CompositeSolidExecutionResult, SolidExecutionResult]: The result of executing the\n solid.\n """\n\n return execute_solids_within_pipeline(\n pipeline_def,\n solid_names={solid_name},\n inputs={solid_name: inputs} if inputs else None,\n run_config=run_config,\n mode=mode,\n preset=preset,\n tags=tags,\n instance=instance,\n )[solid_name]
\n\n\n@contextmanager\ndef yield_empty_pipeline_context(\n run_id: Optional[str] = None, instance: Optional[DagsterInstance] = None\n) -> Generator[PlanExecutionContext, None, None]:\n pipeline = InMemoryPipeline(PipelineDefinition([], "empty"))\n pipeline_def = pipeline.get_definition()\n instance = check.opt_inst_param(\n instance, "instance", DagsterInstance, default=DagsterInstance.ephemeral()\n )\n\n execution_plan = create_execution_plan(pipeline)\n\n pipeline_run = instance.create_run(\n pipeline_name="<empty>",\n run_id=run_id,\n run_config=None,\n mode=None,\n solids_to_execute=None,\n step_keys_to_execute=None,\n status=None,\n tags=None,\n root_run_id=None,\n parent_run_id=None,\n pipeline_snapshot=pipeline_def.get_pipeline_snapshot(),\n execution_plan_snapshot=snapshot_from_execution_plan(\n execution_plan, pipeline_def.get_pipeline_snapshot_id()\n ),\n parent_pipeline_snapshot=pipeline_def.get_parent_pipeline_snapshot(),\n )\n with scoped_pipeline_context(execution_plan, pipeline, {}, pipeline_run, instance) as context:\n yield context\n\n\n
[docs]def execute_solid(\n solid_def: NodeDefinition,\n mode_def: Optional[ModeDefinition] = None,\n input_values: Optional[Dict[str, object]] = None,\n tags: Optional[Dict[str, Any]] = None,\n run_config: Optional[Dict[str, object]] = None,\n raise_on_error: bool = True,\n) -> Union["CompositeSolidExecutionResult", "SolidExecutionResult"]:\n """Execute a single solid in an ephemeral pipeline.\n\n Intended to support unit tests. Input values may be passed directly, and no pipeline need be\n specified -- an ephemeral pipeline will be constructed.\n\n Args:\n solid_def (SolidDefinition): The solid to execute.\n mode_def (Optional[ModeDefinition]): The mode within which to execute the solid. Use this\n if, e.g., custom resources, loggers, or executors are desired.\n input_values (Optional[Dict[str, Any]]): A dict of input names to input values, used to\n pass inputs to the solid directly. You may also use the ``run_config`` to\n configure any inputs that are configurable.\n tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to pipeline\n logs.\n run_config (Optional[dict]): The configuration that parameterized this\n execution, as a dict.\n raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n Defaults to ``True``, since this is the most useful behavior in test.\n\n Returns:\n Union[CompositeSolidExecutionResult, SolidExecutionResult]: The result of executing the\n solid.\n """\n check.inst_param(solid_def, "solid_def", NodeDefinition)\n check.opt_inst_param(mode_def, "mode_def", ModeDefinition)\n input_values = check.opt_dict_param(input_values, "input_values", key_type=str)\n solid_defs = [solid_def]\n\n def create_value_solid(input_name, input_value):\n @lambda_solid(name=input_name)\n def input_solid():\n return input_value\n\n return input_solid\n\n dependencies: Dict[str, Dict] = defaultdict(dict)\n\n for input_name, input_value in input_values.items():\n dependencies[solid_def.name][input_name] = DependencyDefinition(input_name)\n solid_defs.append(create_value_solid(input_name, input_value))\n\n result = execute_pipeline(\n PipelineDefinition(\n name="ephemeral_{}_solid_pipeline".format(solid_def.name),\n solid_defs=solid_defs,\n dependencies=dependencies, # type: ignore\n mode_defs=[mode_def] if mode_def else None,\n ),\n run_config=run_config,\n mode=mode_def.name if mode_def else None,\n tags=tags,\n raise_on_error=raise_on_error,\n )\n return result.result_for_handle(solid_def.name)
\n\n\n
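# Example (illustrative addition, not part of the original module): a minimal
# sketch of unit-testing a single solid with execute_solid; input values are
# passed directly rather than through run_config. The solid is hypothetical.
from dagster import solid


@solid
def concat(_, left: str, right: str) -> str:
    return left + right


def test_concat_solid():
    result = execute_solid(concat, input_values={"left": "foo", "right": "bar"})
    assert result.success
    assert result.output_value() == "foobar"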
[docs]def check_dagster_type(dagster_type, value):\n """Test a custom Dagster type.\n\n Args:\n dagster_type (Any): The Dagster type to test. Should be one of the\n :ref:`built-in types <builtin>`, a dagster type explicitly constructed with\n :py:func:`as_dagster_type`, :py:func:`@usable_as_dagster_type <dagster_type>`, or\n :py:func:`PythonObjectDagsterType`, or a Python type.\n value (Any): The runtime value to test.\n\n Returns:\n TypeCheck: The result of the type check.\n\n\n Examples:\n\n .. code-block:: python\n\n assert check_dagster_type(Dict[Any, Any], {'foo': 'bar'}).success\n """\n\n if is_typing_type(dagster_type):\n raise DagsterInvariantViolationError(\n (\n "Must pass in a type from dagster module. You passed {dagster_type} "\n "which is part of python's typing module."\n ).format(dagster_type=dagster_type)\n )\n\n dagster_type = resolve_dagster_type(dagster_type)\n with yield_empty_pipeline_context() as pipeline_context:\n context = pipeline_context.for_type(dagster_type)\n try:\n type_check = dagster_type.type_check(context, value)\n except Failure as failure:\n return TypeCheck(success=False, description=failure.description)\n\n if not isinstance(type_check, TypeCheck):\n raise DagsterInvariantViolationError(\n "Type checks can only return TypeCheck. Type {type_name} returned {value}.".format(\n type_name=dagster_type.display_name, value=repr(type_check)\n )\n )\n return type_check
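# Example (illustrative addition, not part of the original module): a sketch of
# using check_dagster_type against a hypothetical custom dagster type, covering
# both the passing and the failing case.
from dagster import DagsterType

EvenInt = DagsterType(
    name="EvenInt",
    type_check_fn=lambda _, value: isinstance(value, int) and value % 2 == 0,
)


def test_even_int_type():
    assert check_dagster_type(EvenInt, 4).success
    assert not check_dagster_type(EvenInt, 3).success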
\n\n\n@contextmanager\ndef copy_directory(src):\n with tempfile.TemporaryDirectory() as temp_dir:\n dst = os.path.join(temp_dir, os.path.basename(src))\n shutil.copytree(src, dst)\n yield dst\n\n\nclass FilesystemTestScheduler(Scheduler, ConfigurableClass):\n """This class is used in dagster core and dagster_graphql to test the scheduler's interactions\n with schedule storage, which are implemented in the methods defined on the base Scheduler class.\n Therefore, the following methods used to actually schedule jobs (e.g. create and remove cron jobs\n on a cron tab) are left unimplemented.\n """\n\n def __init__(self, artifacts_dir: str, inst_data: object = None):\n check.str_param(artifacts_dir, "artifacts_dir")\n self._artifacts_dir = artifacts_dir\n self._inst_data = inst_data\n\n @property\n def inst_data(self) -> object:\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {"base_dir": str}\n\n @staticmethod\n def from_config_value(inst_data: object, config_value):\n return FilesystemTestScheduler(artifacts_dir=config_value["base_dir"], inst_data=inst_data)\n\n def debug_info(self) -> str:\n return ""\n\n def get_logs_path(self, _instance: DagsterInstance, schedule_origin_id: str) -> str:\n check.str_param(schedule_origin_id, "schedule_origin_id")\n return os.path.join(self._artifacts_dir, "logs", schedule_origin_id, "scheduler.log")\n\n def wipe(self, instance: DagsterInstance) -> None:\n pass\n
", "current_page_name": "_modules/dagster/utils/test", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster.utils"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster.utils.test"}, "title": "dagster.utils"}}, "dagster_airbyte": {"asset_defs": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_airbyte.asset_defs

\nfrom typing import List, Optional\n\nfrom dagster_airbyte.utils import generate_materializations\n\nfrom dagster import AssetKey, Out, Output\nfrom dagster import _check as check\nfrom dagster.core.asset_defs import AssetsDefinition, multi_asset\nfrom dagster.utils.backcompat import experimental\n\n\n
[docs]@experimental\ndef build_airbyte_assets(\n connection_id: str,\n destination_tables: List[str],\n asset_key_prefix: Optional[List[str]] = None,\n) -> List[AssetsDefinition]:\n """\n Builds a set of assets representing the tables created by an Airbyte sync operation.\n\n Args:\n connection_id (str): The Airbyte Connection ID that this op will sync. You can retrieve this\n value from the "Connections" tab of a given connector in the Airbyte UI.\n destination_tables (List[str]): The names of the tables that you want to be represented\n in the Dagster asset graph for this sync. This will generally map to the name of the\n stream in Airbyte, unless a stream prefix has been specified in Airbyte.\n asset_key_prefix (Optional[List[str]]): A prefix for the asset keys inside this asset.\n If left blank, assets will have a key of `AssetKey([table_name])`.\n """\n\n asset_key_prefix = check.opt_list_param(asset_key_prefix, "asset_key_prefix", of_type=str)\n\n @multi_asset(\n name=f"airbyte_sync_{connection_id[:5]}",\n outs={\n table: Out(\n asset_key=AssetKey(\n asset_key_prefix + [table],\n )\n )\n for table in destination_tables\n },\n required_resource_keys={"airbyte"},\n compute_kind="airbyte",\n )\n def _assets(context):\n ab_output = context.resources.airbyte.sync_and_poll(connection_id=connection_id)\n for materialization in generate_materializations(ab_output, asset_key_prefix):\n table_name = materialization.asset_key.path[-1]\n if table_name in destination_tables:\n yield Output(\n value=None,\n output_name=table_name,\n metadata={\n entry.label: entry.entry_data for entry in materialization.metadata_entries\n },\n )\n else:\n yield materialization\n\n return [_assets]
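# Example (illustrative addition, not part of the original module): a minimal
# sketch of building asset definitions for an Airbyte connection. The
# connection id, table names, and key prefix are hypothetical placeholders;
# materializing these assets requires an "airbyte" resource (for example,
# dagster_airbyte.airbyte_resource configured with host and port).
airbyte_assets = build_airbyte_assets(
    connection_id="11111111-2222-3333-4444-555555555555",
    destination_tables=["releases", "tags"],
    asset_key_prefix=["airbyte", "github"],
)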
\n
", "current_page_name": "_modules/dagster_airbyte/asset_defs", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_airbyte.asset_defs"}, "ops": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_airbyte.ops

\nfrom dagster_airbyte.resources import DEFAULT_POLL_INTERVAL_SECONDS\nfrom dagster_airbyte.types import AirbyteOutput\nfrom dagster_airbyte.utils import generate_materializations\n\nfrom dagster import Array, Bool, Field, In, Noneable, Nothing, Out, Output, op\n\n\n
[docs]@op(\n required_resource_keys={"airbyte"},\n ins={"start_after": In(Nothing)},\n out=Out(\n AirbyteOutput,\n description="Parsed json dictionary representing the details of the Airbyte connector after "\n "the sync successfully completes. "\n "See the [Airbyte API Docs](https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#overview) "\n "to see detailed information on this response.",\n ),\n config_schema={\n "connection_id": Field(\n str,\n is_required=True,\n description="The Airbyte Connection ID that this op will sync. You can retrieve this "\n 'value from the "Connections" tab of a given connector in the Airbyte UI.',\n ),\n "poll_interval": Field(\n float,\n default_value=DEFAULT_POLL_INTERVAL_SECONDS,\n description="The time (in seconds) that will be waited between successive polls.",\n ),\n "poll_timeout": Field(\n Noneable(float),\n default_value=None,\n description="The maximum time that will waited before this operation is timed out. By "\n "default, this will never time out.",\n ),\n "yield_materializations": Field(\n config=Bool,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the Airbyte sync will "\n "be yielded when the op executes."\n ),\n ),\n "asset_key_prefix": Field(\n config=Array(str),\n default_value=["airbyte"],\n description=(\n "If provided and yield_materializations is True, these components will be used to "\n "prefix the generated asset keys."\n ),\n ),\n },\n tags={"kind": "airbyte"},\n)\ndef airbyte_sync_op(context):\n """\n Executes a Airbyte job sync for a given ``connection_id``, and polls until that sync\n completes, raising an error if it is unsuccessful. It outputs a AirbyteOutput which contains\n the job details for a given ``connection_id``.\n\n It requires the use of the :py:class:`~dagster_airbyte.airbyte_resource`, which allows it to\n communicate with the Airbyte API.\n\n Examples:\n\n .. code-block:: python\n\n from dagster import job\n from dagster_airbyte import airbyte_resource, airbyte_sync_op\n\n my_airbyte_resource = airbyte_resource.configured(\n {\n "host": {"env": "AIRBYTE_HOST"},\n "port": {"env": "AIRBYTE_PORT"},\n }\n )\n\n sync_foobar = airbyte_sync_op.configured({"connection_id": "foobar"}, name="sync_foobar")\n\n @job(resource_defs={"airbyte": my_airbyte_resource})\n def my_simple_airbyte_job():\n sync_foobar()\n\n @job(resource_defs={"airbyte": my_airbyte_resource})\n def my_composed_airbyte_job():\n final_foobar_state = sync_foobar(start_after=some_op())\n other_op(final_foobar_state)\n """\n\n airbyte_output = context.resources.airbyte.sync_and_poll(\n connection_id=context.op_config["connection_id"],\n poll_interval=context.op_config["poll_interval"],\n poll_timeout=context.op_config["poll_timeout"],\n )\n if context.op_config["yield_materializations"]:\n yield from generate_materializations(\n airbyte_output, asset_key_prefix=context.op_config["asset_key_prefix"]\n )\n yield Output(\n airbyte_output,\n metadata={\n **airbyte_output.job_details.get("attempts", [{}])[-1]\n .get("attempt", {})\n .get("totalStats", {})\n },\n )
\n
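# Example (illustrative addition, not part of the original module): a sketch of
# configuring the op's optional settings in addition to connection_id; all
# values here are hypothetical placeholders.
sync_with_timeout = airbyte_sync_op.configured(
    {
        "connection_id": "11111111-2222-3333-4444-555555555555",
        "poll_interval": 30.0,
        "poll_timeout": 3600.0,
        "yield_materializations": False,
    },
    name="sync_with_timeout",
)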
", "current_page_name": "_modules/dagster_airbyte/ops", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_airbyte.ops"}, "resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_airbyte.resources

\nimport logging\nimport sys\nimport time\nfrom typing import Dict, List, Optional, cast\n\nimport requests\nfrom dagster_airbyte.types import AirbyteOutput\nfrom requests.exceptions import RequestException\n\nfrom dagster import Failure, Field, StringSource, __version__\nfrom dagster import _check as check\nfrom dagster import get_dagster_logger, resource\n\nDEFAULT_POLL_INTERVAL_SECONDS = 10\n\n\nclass AirbyteState:\n    RUNNING = "running"\n    SUCCEEDED = "succeeded"\n    CANCELLED = "cancelled"\n    PENDING = "pending"\n    FAILED = "failed"\n    ERROR = "error"\n    INCOMPLETE = "incomplete"\n\n\n
[docs]class AirbyteResource:\n """\n This class exposes methods on top of the Airbyte REST API.\n """\n\n def __init__(\n self,\n host: str,\n port: str,\n use_https: bool,\n request_max_retries: int = 3,\n request_retry_delay: float = 0.25,\n log: logging.Logger = get_dagster_logger(),\n ):\n self._host = host\n self._port = port\n self._use_https = use_https\n self._request_max_retries = request_max_retries\n self._request_retry_delay = request_retry_delay\n\n self._log = log\n\n @property\n def api_base_url(self) -> str:\n return (\n ("https://" if self._use_https else "http://")\n + (f"{self._host}:{self._port}" if self._port else self._host)\n + "/api/v1"\n )\n\n
[docs] def make_request(\n self, endpoint: str, data: Optional[Dict[str, object]]\n ) -> Optional[Dict[str, object]]:\n """\n Creates and sends a request to the desired Airbyte REST API endpoint.\n\n Args:\n endpoint (str): The Airbyte API endpoint to send this request to.\n data (Optional[str]): JSON-formatted data string to be included in the request.\n\n Returns:\n Optional[Dict[str, Any]]: Parsed json data from the response to this request\n """\n\n headers = {"accept": "application/json"}\n\n num_retries = 0\n while True:\n try:\n response = requests.request(\n method="POST",\n url=self.api_base_url + endpoint,\n headers=headers,\n json=data,\n timeout=15,\n )\n response.raise_for_status()\n if response.status_code == 204:\n return None\n return response.json()\n except RequestException as e:\n self._log.error("Request to Airbyte API failed: %s", e)\n if num_retries == self._request_max_retries:\n break\n num_retries += 1\n time.sleep(self._request_retry_delay)\n\n raise Failure("Exceeded max number of retries.")
\n\n def cancel_job(self, job_id: int):\n self.make_request(endpoint="/jobs/cancel", data={"id": job_id})\n\n def get_job_status(self, job_id: int) -> dict:\n return check.not_none(self.make_request(endpoint="/jobs/get", data={"id": job_id}))\n\n def start_sync(self, connection_id: str) -> Dict[str, object]:\n return check.not_none(\n self.make_request(endpoint="/connections/sync", data={"connectionId": connection_id})\n )\n\n def get_connection_details(self, connection_id: str) -> Dict[str, object]:\n return check.not_none(\n self.make_request(endpoint="/connections/get", data={"connectionId": connection_id})\n )\n\n
[docs] def sync_and_poll(\n self,\n connection_id: str,\n poll_interval: float = DEFAULT_POLL_INTERVAL_SECONDS,\n poll_timeout: Optional[float] = None,\n ) -> AirbyteOutput:\n """\n Initializes a sync operation for the given connector, and polls until it completes.\n\n Args:\n connection_id (str): The Airbyte Connector ID. You can retrieve this value from the\n "Connection" tab of a given connection in the Arbyte UI.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. By default, this will never time out.\n\n Returns:\n :py:class:`~AirbyteOutput`:\n Details of the sync job.\n """\n connection_details = self.get_connection_details(connection_id)\n job_details = self.start_sync(connection_id)\n job_info = cast(Dict[str, object], job_details.get("job", {}))\n job_id = cast(int, job_info.get("id"))\n\n self._log.info(f"Job {job_id} initialized for connection_id={connection_id}.")\n start = time.monotonic()\n logged_attempts = 0\n logged_lines = 0\n state = None\n\n try:\n while True:\n if poll_timeout and start + poll_timeout < time.monotonic():\n raise Failure(\n f"Timeout: Airbyte job {job_id} is not ready after the timeout {poll_timeout} seconds"\n )\n time.sleep(poll_interval)\n job_details = self.get_job_status(job_id)\n attempts = cast(List, job_details.get("attempts", []))\n cur_attempt = len(attempts)\n # spit out the available Airbyte log info\n if cur_attempt:\n log_lines = attempts[logged_attempts].get("logs", {}).get("logLines", [])\n\n for line in log_lines[logged_lines:]:\n sys.stdout.write(line + "\\n")\n sys.stdout.flush()\n logged_lines = len(log_lines)\n\n # if there's a next attempt, this one will have no more log messages\n if logged_attempts < cur_attempt - 1:\n logged_lines = 0\n logged_attempts += 1\n\n job_info = cast(Dict[str, object], job_details.get("job", {}))\n state = job_info.get("status")\n\n if state in (AirbyteState.RUNNING, AirbyteState.PENDING, AirbyteState.INCOMPLETE):\n continue\n elif state == AirbyteState.SUCCEEDED:\n break\n elif state == AirbyteState.ERROR:\n raise Failure(f"Job failed: {job_id}")\n elif state == AirbyteState.CANCELLED:\n raise Failure(f"Job was cancelled: {job_id}")\n else:\n raise Failure(f"Encountered unexpected state `{state}` for job_id {job_id}")\n finally:\n # if Airbyte sync has not completed, make sure to cancel it so that it doesn't outlive\n # the python process\n if state not in (AirbyteState.SUCCEEDED, AirbyteState.ERROR, AirbyteState.CANCELLED):\n self.cancel_job(job_id)\n\n return AirbyteOutput(job_details=job_details, connection_details=connection_details)
\n\n\n
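# Example (illustrative addition, not part of the original module): a sketch of
# using AirbyteResource directly, outside of a Dagster run, to trigger a sync
# and block until it finishes. Host, port, and connection id are hypothetical.
def _run_sync_example():
    airbyte = AirbyteResource(host="localhost", port="8000", use_https=False)
    output = airbyte.sync_and_poll(
        connection_id="11111111-2222-3333-4444-555555555555",
        poll_interval=10,
        poll_timeout=3600,
    )
    # AirbyteOutput bundles the final job details and the connection details.
    return output.job_details, output.connection_details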
[docs]@resource(\n config_schema={\n "host": Field(\n StringSource,\n is_required=True,\n description="The Airbyte Server Address.",\n ),\n "port": Field(\n StringSource,\n is_required=False,\n description="Port for the Airbyte Server.",\n ),\n "use_https": Field(\n bool,\n default_value=False,\n description="Use https to connect in Airbyte Server.",\n ),\n "request_max_retries": Field(\n int,\n default_value=3,\n description="The maximum number of times requests to the Airbyte API should be retried "\n "before failing.",\n ),\n "request_retry_delay": Field(\n float,\n default_value=0.25,\n description="Time (in seconds) to wait between each request retry.",\n ),\n },\n description="This resource helps manage Airbyte connectors",\n)\ndef airbyte_resource(context) -> AirbyteResource:\n """\n This resource allows users to programatically interface with the Airbyte REST API to launch\n syncs and monitor their progress. This currently implements only a subset of the functionality\n exposed by the API.\n\n For a complete set of documentation on the Airbyte REST API, including expected response JSON\n schema, see the `Airbyte API Docs <https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#overview>`_.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/concepts/configuration/configured>`_ method.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import job\n from dagster_airbyte import airbyte_resource\n\n my_airbyte_resource = airbyte_resource.configured(\n {\n "host": {"env": "AIRBYTE_HOST"},\n "port": {"env": "AIRBYTE_PORT"},\n }\n )\n\n @job(resource_defs={"airbyte":my_airbyte_resource})\n def my_airbyte_job():\n ...\n\n """\n return AirbyteResource(\n host=context.resource_config["host"],\n port=context.resource_config["port"],\n use_https=context.resource_config["use_https"],\n request_max_retries=context.resource_config["request_max_retries"],\n request_retry_delay=context.resource_config["request_retry_delay"],\n log=context.log,\n )
\n
", "current_page_name": "_modules/dagster_airbyte/resources", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_airbyte.resources"}}, "dagster_airflow": {"dagster_job_factory": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_airflow.dagster_job_factory

\nfrom dagster_airflow.dagster_pipeline_factory import make_dagster_pipeline_from_airflow_dag\n\n\n
[docs]def make_dagster_job_from_airflow_dag(\n dag, tags=None, use_airflow_template_context=False, unique_id=None\n):\n """Construct a Dagster job corresponding to a given Airflow DAG.\n\n Tasks in the resulting job will execute the ``execute()`` method on the corresponding\n Airflow Operator. Dagster, any dependencies required by Airflow Operators, and the module\n containing your DAG definition must be available in the Python environment within which your\n Dagster solids execute.\n\n To set Airflow's ``execution_date`` for use with Airflow Operator's ``execute()`` methods,\n either:\n\n 1. (Best for ad hoc runs) Execute job directly. This will set execution_date to the\n time (in UTC) of the run.\n\n 2. Add ``{'airflow_execution_date': utc_date_string}`` to the job tags. This will override\n behavior from (1).\n\n .. code-block:: python\n\n my_dagster_job = make_dagster_job_from_airflow_dag(\n dag=dag,\n tags={'airflow_execution_date': utc_execution_date_str}\n )\n my_dagster_job.execute_in_process()\n\n 3. (Recommended) Add ``{'airflow_execution_date': utc_date_string}`` to the run tags,\n such as in the Dagit UI. This will override behavior from (1) and (2)\n\n\n We apply normalized_name() to the dag id and task ids when generating job name and op\n names to ensure that names conform to Dagster's naming conventions.\n\n Args:\n dag (DAG): The Airflow DAG to compile into a Dagster job\n tags (Dict[str, Field]): Job tags. Optionally include\n `tags={'airflow_execution_date': utc_date_string}` to specify execution_date used within\n execution of Airflow Operators.\n use_airflow_template_context (bool): If True, will call get_template_context() on the\n Airflow TaskInstance model which requires and modifies the DagRun table.\n (default: False)\n unique_id (int): If not None, this id will be postpended to generated op names. Used by\n framework authors to enforce unique op names within a repo.\n\n Returns:\n JobDefinition: The generated Dagster job\n\n """\n pipeline_def = make_dagster_pipeline_from_airflow_dag(\n dag, tags, use_airflow_template_context, unique_id\n )\n # pass in tags manually because pipeline_def.graph doesn't have it threaded\n return pipeline_def.graph.to_job(tags={**pipeline_def.tags})
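# Example (illustrative addition, not part of the original module): a minimal
# sketch of converting a hypothetical Airflow DAG into a Dagster job. The DAG
# and its single BashOperator task are placeholders.
from datetime import datetime

from airflow import DAG
from airflow.operators.bash_operator import BashOperator

example_dag = DAG(
    dag_id="example_dag",
    default_args={"owner": "airflow", "start_date": datetime(2022, 1, 1)},
    schedule_interval=None,
)
say_hello = BashOperator(task_id="say_hello", bash_command="echo hello", dag=example_dag)

example_job = make_dagster_job_from_airflow_dag(dag=example_dag)
# Ad hoc execution sets airflow_execution_date to the time (in UTC) of the run:
# example_job.execute_in_process()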
\n
", "current_page_name": "_modules/dagster_airflow/dagster_job_factory", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_airflow.dagster_job_factory"}, "dagster_pipeline_factory": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_airflow.dagster_pipeline_factory

\nimport datetime\nimport logging\nimport sys\nfrom contextlib import contextmanager\n\nimport dateutil\nimport lazy_object_proxy\nimport pendulum\nfrom airflow.models import TaskInstance\nfrom airflow.models.baseoperator import BaseOperator\nfrom airflow.models.dag import DAG\nfrom airflow.models.dagbag import DagBag\nfrom airflow.settings import LOG_FORMAT\nfrom dagster_airflow.patch_airflow_example_dag import patch_airflow_example_dag\n\nfrom dagster import (\n    DagsterInvariantViolationError,\n    DependencyDefinition,\n    InputDefinition,\n    MultiDependencyDefinition,\n    Nothing,\n    OutputDefinition,\n    PipelineDefinition,\n    SolidDefinition,\n)\nfrom dagster import _check as check\nfrom dagster import repository, solid\nfrom dagster.core.definitions.utils import VALID_NAME_REGEX, validate_tags\nfrom dagster.core.instance import AIRFLOW_EXECUTION_DATE_STR, IS_AIRFLOW_INGEST_PIPELINE_STR\n\n\nclass DagsterAirflowError(Exception):\n    pass\n\n\ndef contains_duplicate_task_names(dag_bag, refresh_from_airflow_db):\n    check.inst_param(dag_bag, "dag_bag", DagBag)\n    check.bool_param(refresh_from_airflow_db, "refresh_from_airflow_db")\n    seen_task_names = set()\n\n    # To enforce predictable iteration order\n    sorted_dag_ids = sorted(dag_bag.dag_ids)\n    for dag_id in sorted_dag_ids:\n        dag = dag_bag.dags.get(dag_id) if not refresh_from_airflow_db else dag_bag.get_dag(dag_id)\n        for task in dag.tasks:\n            if task.task_id in seen_task_names:\n                return True\n            else:\n                seen_task_names.add(task.task_id)\n    return False\n\n\n
[docs]def make_dagster_repo_from_airflow_dag_bag(\n dag_bag, repo_name, refresh_from_airflow_db=False, use_airflow_template_context=False\n):\n """Construct a Dagster repository corresponding to Airflow DAGs in DagBag.\n\n Usage:\n Create `make_dagster_repo.py`:\n from dagster_airflow.dagster_pipeline_factory import make_dagster_repo_from_airflow_dag_bag\n from airflow_home import my_dag_bag\n\n def make_repo_from_dag_bag():\n return make_dagster_repo_from_airflow_dag_bag(my_dag_bag, 'my_repo_name')\n\n Use RepositoryDefinition as usual, for example:\n `dagit -f path/to/make_dagster_repo.py -n make_repo_from_dag_bag`\n\n Args:\n dag_path (str): Path to directory or file that contains Airflow Dags\n repo_name (str): Name for generated RepositoryDefinition\n refresh_from_airflow_db (bool): If True, will refresh DAG if expired via DagBag.get_dag(),\n which requires access to initialized Airflow DB. If False (recommended), gets dag from\n DagBag's dags dict without depending on Airflow DB. (default: False)\n use_airflow_template_context (bool): If True, will call get_template_context() on the\n Airflow TaskInstance model which requires and modifies the DagRun table.\n (default: False)\n\n Returns:\n RepositoryDefinition\n """\n check.inst_param(dag_bag, "dag_bag", DagBag)\n check.str_param(repo_name, "repo_name")\n check.bool_param(refresh_from_airflow_db, "refresh_from_airflow_db")\n check.bool_param(use_airflow_template_context, "use_airflow_template_context")\n\n use_unique_id = contains_duplicate_task_names(dag_bag, refresh_from_airflow_db)\n\n pipeline_defs = []\n count = 0\n # To enforce predictable iteration order\n sorted_dag_ids = sorted(dag_bag.dag_ids)\n for dag_id in sorted_dag_ids:\n # Only call Airflow DB via dag_bag.get_dag(dag_id) if refresh_from_airflow_db is True\n dag = dag_bag.dags.get(dag_id) if not refresh_from_airflow_db else dag_bag.get_dag(dag_id)\n if not use_unique_id:\n pipeline_defs.append(\n make_dagster_pipeline_from_airflow_dag(\n dag=dag, tags=None, use_airflow_template_context=use_airflow_template_context\n )\n )\n else:\n pipeline_defs.append(\n make_dagster_pipeline_from_airflow_dag(\n dag=dag,\n tags=None,\n use_airflow_template_context=use_airflow_template_context,\n unique_id=count,\n )\n )\n count += 1\n\n @repository(name=repo_name)\n def _repo():\n return pipeline_defs\n\n return _repo
\n\n\n
[docs]def make_dagster_repo_from_airflow_example_dags(repo_name="airflow_example_dags_repo"):\n """Construct a Dagster repository for Airflow's example DAGs.\n\n Execution of the following Airflow example DAGs is not currently supported:\n 'example_external_task_marker_child',\n 'example_pig_operator',\n 'example_skip_dag',\n 'example_trigger_target_dag',\n 'example_xcom',\n 'test_utils',\n\n Usage:\n\n Create `make_dagster_repo.py`:\n from dagster_airflow.dagster_pipeline_factory import make_dagster_repo_from_airflow_example_dags\n\n def make_airflow_example_dags():\n return make_dagster_repo_from_airflow_example_dags()\n\n Use RepositoryDefinition as usual, for example:\n `dagit -f path/to/make_dagster_repo.py -n make_airflow_example_dags`\n\n Args:\n repo_name (str): Name for generated RepositoryDefinition\n\n Returns:\n RepositoryDefinition\n """\n dag_bag = DagBag(\n dag_folder="some/empty/folder/with/no/dags", # prevent defaulting to settings.DAGS_FOLDER\n include_examples=True,\n )\n\n # There is a bug in Airflow v1.10.8, v1.10.9, v1.10.10 where the python_callable for task\n # 'search_catalog' is missing a required position argument '_'. It is currently fixed in master.\n # v1.10 stable: https://github.com/apache/airflow/blob/v1-10-stable/airflow/example_dags/example_complex.py#L133\n # master (05-05-2020): https://github.com/apache/airflow/blob/master/airflow/example_dags/example_complex.py#L136\n patch_airflow_example_dag(dag_bag)\n\n return make_dagster_repo_from_airflow_dag_bag(dag_bag, repo_name)
\n\n\n
[docs]def make_dagster_repo_from_airflow_dags_path(\n dag_path,\n repo_name,\n safe_mode=True,\n store_serialized_dags=False,\n use_airflow_template_context=False,\n):\n """Construct a Dagster repository corresponding to Airflow DAGs in dag_path.\n\n ``DagBag.get_dag()`` dependency requires Airflow DB to be initialized.\n\n Usage:\n Create ``make_dagster_repo.py``:\n\n .. code-block:: python\n\n from dagster_airflow.dagster_pipeline_factory import make_dagster_repo_from_airflow_dags_path\n\n def make_repo_from_dir():\n return make_dagster_repo_from_airflow_dags_path(\n '/path/to/dags/', 'my_repo_name'\n )\n\n Use RepositoryDefinition as usual, for example:\n ``dagit -f path/to/make_dagster_repo.py -n make_repo_from_dir``\n\n Args:\n dag_path (str): Path to directory or file that contains Airflow Dags\n repo_name (str): Name for generated RepositoryDefinition\n include_examples (bool): True to include Airflow's example DAGs. (default: False)\n safe_mode (bool): True to use Airflow's default heuristic to find files that contain DAGs\n (ie find files that contain both b'DAG' and b'airflow') (default: True)\n store_serialized_dags (bool): True to read Airflow DAGS from Airflow DB. False to read DAGS\n from Python files. (default: False)\n use_airflow_template_context (bool): If True, will call get_template_context() on the\n Airflow TaskInstance model which requires and modifies the DagRun table.\n (default: False)\n\n Returns:\n RepositoryDefinition\n """\n check.str_param(dag_path, "dag_path")\n check.str_param(repo_name, "repo_name")\n check.bool_param(safe_mode, "safe_mode")\n check.bool_param(store_serialized_dags, "store_serialized_dags")\n check.bool_param(use_airflow_template_context, "use_airflow_template_context")\n\n try:\n dag_bag = DagBag(\n dag_folder=dag_path,\n include_examples=False, # Exclude Airflow example dags\n safe_mode=safe_mode,\n store_serialized_dags=store_serialized_dags,\n )\n except Exception:\n raise DagsterAirflowError("Error initializing airflow.models.dagbag object with arguments")\n\n return make_dagster_repo_from_airflow_dag_bag(dag_bag, repo_name, use_airflow_template_context)
\n\n\n
[docs]def make_dagster_pipeline_from_airflow_dag(\n dag, tags=None, use_airflow_template_context=False, unique_id=None\n):\n """Construct a Dagster pipeline corresponding to a given Airflow DAG.\n\n Tasks in the resulting pipeline will execute the ``execute()`` method on the corresponding\n Airflow Operator. Dagster, any dependencies required by Airflow Operators, and the module\n containing your DAG definition must be available in the Python environment within which your\n Dagster solids execute.\n\n To set Airflow's ``execution_date`` for use with Airflow Operator's ``execute()`` methods,\n either:\n\n 1. (Best for ad hoc runs) Run Pipeline with 'default' preset, which sets execution_date to the\n time (in UTC) of pipeline invocation:\n\n .. code-block:: python\n\n execute_pipeline(\n pipeline=make_dagster_pipeline_from_airflow_dag(dag=dag),\n preset='default')\n\n 2. Add ``{'airflow_execution_date': utc_date_string}`` to the PipelineDefinition tags. This will\n override behavior from (1).\n\n .. code-block:: python\n\n execute_pipeline(\n make_dagster_pipeline_from_airflow_dag(\n dag=dag,\n tags={'airflow_execution_date': utc_execution_date_str}\n )\n )\n\n 3. (Recommended) Add ``{'airflow_execution_date': utc_date_string}`` to the PipelineRun tags,\n such as in the Dagit UI. This will override behavior from (1) and (2)\n\n\n We apply normalized_name() to the dag id and task ids when generating pipeline name and solid\n names to ensure that names conform to Dagster's naming conventions.\n\n Args:\n dag (DAG): The Airflow DAG to compile into a Dagster pipeline\n tags (Dict[str, Field]): Pipeline tags. Optionally include\n `tags={'airflow_execution_date': utc_date_string}` to specify execution_date used within\n execution of Airflow Operators.\n use_airflow_template_context (bool): If True, will call get_template_context() on the\n Airflow TaskInstance model which requires and modifies the DagRun table.\n (default: False)\n unique_id (int): If not None, this id will be postpended to generated solid names. Used by\n framework authors to enforce unique solid names within a repo.\n\n Returns:\n pipeline_def (PipelineDefinition): The generated Dagster pipeline\n\n """\n check.inst_param(dag, "dag", DAG)\n tags = check.opt_dict_param(tags, "tags")\n check.bool_param(use_airflow_template_context, "use_airflow_template_context")\n unique_id = check.opt_int_param(unique_id, "unique_id")\n\n if IS_AIRFLOW_INGEST_PIPELINE_STR not in tags:\n tags[IS_AIRFLOW_INGEST_PIPELINE_STR] = "true"\n\n tags = validate_tags(tags)\n\n pipeline_dependencies, solid_defs = _get_pipeline_definition_args(\n dag, use_airflow_template_context, unique_id\n )\n pipeline_def = PipelineDefinition(\n name=normalized_name(dag.dag_id, None),\n solid_defs=solid_defs,\n dependencies=pipeline_dependencies,\n tags=tags,\n )\n return pipeline_def
\n\n\n# Airflow DAG ids and Task ids allow a larger valid character set (alphanumeric characters,\n# dashes, dots and underscores) than Dagster's naming conventions (alphanumeric characters,\n# underscores), so Dagster will strip invalid characters and replace with '_'\ndef normalized_name(name, unique_id):\n base_name = "airflow_" + "".join(c if VALID_NAME_REGEX.match(c) else "_" for c in name)\n if not unique_id:\n return base_name\n else:\n return base_name + "_" + str(unique_id)\n\n\ndef _get_pipeline_definition_args(dag, use_airflow_template_context, unique_id=None):\n check.inst_param(dag, "dag", DAG)\n check.bool_param(use_airflow_template_context, "use_airflow_template_context")\n unique_id = check.opt_int_param(unique_id, "unique_id")\n\n pipeline_dependencies = {}\n solid_defs = []\n seen_tasks = []\n\n # To enforce predictable iteration order\n dag_roots = sorted(dag.roots, key=lambda x: x.task_id)\n for task in dag_roots:\n _traverse_airflow_dag(\n task,\n seen_tasks,\n pipeline_dependencies,\n solid_defs,\n use_airflow_template_context,\n unique_id,\n )\n return (pipeline_dependencies, solid_defs)\n\n\ndef _traverse_airflow_dag(\n task, seen_tasks, pipeline_dependencies, solid_defs, use_airflow_template_context, unique_id\n):\n check.inst_param(task, "task", BaseOperator)\n check.list_param(seen_tasks, "seen_tasks", BaseOperator)\n check.list_param(solid_defs, "solid_defs", SolidDefinition)\n check.bool_param(use_airflow_template_context, "use_airflow_template_context")\n unique_id = check.opt_int_param(unique_id, "unique_id")\n\n seen_tasks.append(task)\n current_solid = make_dagster_solid_from_airflow_task(\n task, use_airflow_template_context, unique_id\n )\n solid_defs.append(current_solid)\n\n if len(task.upstream_list) > 0:\n # To enforce predictable iteration order\n task_upstream_list = sorted(task.upstream_list, key=lambda x: x.task_id)\n\n pipeline_dependencies[current_solid.name] = {\n "airflow_task_ready": MultiDependencyDefinition(\n [\n DependencyDefinition(\n solid=normalized_name(task_upstream.task_id, unique_id),\n output="airflow_task_complete",\n )\n for task_upstream in task_upstream_list\n ]\n )\n }\n\n # To enforce predictable iteration order\n task_downstream_list = sorted(task.downstream_list, key=lambda x: x.task_id)\n for child_task in task_downstream_list:\n if child_task not in seen_tasks:\n _traverse_airflow_dag(\n child_task,\n seen_tasks,\n pipeline_dependencies,\n solid_defs,\n use_airflow_template_context,\n unique_id,\n )\n\n\n@contextmanager\ndef replace_airflow_logger_handlers():\n prev_airflow_handlers = logging.getLogger("airflow.task").handlers\n try:\n # Redirect airflow handlers to stdout / compute logs\n handler = logging.StreamHandler(sys.stdout)\n handler.setFormatter(logging.Formatter(LOG_FORMAT))\n root = logging.getLogger("airflow.task")\n root.handlers = [handler]\n yield\n finally:\n # Restore previous log handlers\n logging.getLogger("airflow.task").handlers = prev_airflow_handlers\n\n\n# If unique_id is not None, this id will be postpended to generated solid names, generally used\n# to enforce unique solid names within a repo.\ndef make_dagster_solid_from_airflow_task(task, use_airflow_template_context, unique_id=None):\n check.inst_param(task, "task", BaseOperator)\n check.bool_param(use_airflow_template_context, "use_airflow_template_context")\n unique_id = check.opt_int_param(unique_id, "unique_id")\n\n @solid(\n name=normalized_name(task.task_id, unique_id),\n input_defs=[InputDefinition("airflow_task_ready", Nothing)],\n 
output_defs=[OutputDefinition(Nothing, "airflow_task_complete")],\n )\n def _solid(context): # pylint: disable=unused-argument\n if AIRFLOW_EXECUTION_DATE_STR not in context.pipeline_run.tags:\n raise DagsterInvariantViolationError(\n 'Could not find "{AIRFLOW_EXECUTION_DATE_STR}" in {target} tags "{tags}". Please '\n 'add "{AIRFLOW_EXECUTION_DATE_STR}" to {target} tags before executing'.format(\n target="job" if context.pipeline_def.is_graph_job_op_target else "pipeline",\n AIRFLOW_EXECUTION_DATE_STR=AIRFLOW_EXECUTION_DATE_STR,\n tags=context.pipeline_run.tags,\n )\n )\n execution_date_str = context.pipeline_run.tags.get(AIRFLOW_EXECUTION_DATE_STR)\n\n check.str_param(execution_date_str, "execution_date_str")\n try:\n execution_date = dateutil.parser.parse(execution_date_str)\n except ValueError:\n raise DagsterInvariantViolationError(\n 'Could not parse execution_date "{execution_date_str}". Please use datetime format '\n "compatible with dateutil.parser.parse.".format(\n execution_date_str=execution_date_str,\n )\n )\n except OverflowError:\n raise DagsterInvariantViolationError(\n 'Date "{execution_date_str}" exceeds the largest valid C integer on the system.'.format(\n execution_date_str=execution_date_str,\n )\n )\n\n check.inst_param(execution_date, "execution_date", datetime.datetime)\n\n with replace_airflow_logger_handlers():\n task_instance = TaskInstance(task=task, execution_date=execution_date)\n\n ti_context = (\n dagster_get_template_context(task_instance, task, execution_date)\n if not use_airflow_template_context\n else task_instance.get_template_context()\n )\n task.render_template_fields(ti_context)\n\n task.execute(ti_context)\n\n return None\n\n return _solid\n\n\ndef dagster_get_template_context(task_instance, task, execution_date):\n """\n Modified from /airflow/models/taskinstance.py to not reference Airflow DB\n (1) Removes the following block, which queries DB, removes dagrun instances, recycles run_id\n if hasattr(task, 'dag'):\n if task.dag.params:\n params.update(task.dag.params)\n from airflow.models.dagrun import DagRun # Avoid circular import\n\n dag_run = (\n session.query(DagRun)\n .filter_by(dag_id=task.dag.dag_id, execution_date=execution_date)\n .first()\n )\n run_id = dag_run.run_id if dag_run else None\n session.expunge_all()\n session.commit()\n (2) Removes returning 'conf': conf which passes along Airflow config\n (3) Removes 'var': {'value': VariableAccessor(), 'json': VariableJsonAccessor()}, which allows\n fetching Variable from Airflow DB\n """\n from airflow import macros\n\n tables = None\n if "tables" in task.params:\n tables = task.params["tables"]\n\n params = {}\n run_id = ""\n dag_run = None\n\n ds = execution_date.strftime("%Y-%m-%d")\n ts = execution_date.isoformat()\n yesterday_ds = (execution_date - datetime.timedelta(1)).strftime("%Y-%m-%d")\n tomorrow_ds = (execution_date + datetime.timedelta(1)).strftime("%Y-%m-%d")\n\n # For manually triggered dagruns that aren't run on a schedule, next/previous\n # schedule dates don't make sense, and should be set to execution date for\n # consistency with how execution_date is set for manually triggered tasks, i.e.\n # triggered_date == execution_date.\n if dag_run and dag_run.external_trigger:\n prev_execution_date = execution_date\n next_execution_date = execution_date\n else:\n prev_execution_date = task.dag.previous_schedule(execution_date)\n next_execution_date = task.dag.following_schedule(execution_date)\n\n next_ds = None\n next_ds_nodash = None\n if next_execution_date:\n next_ds = 
next_execution_date.strftime("%Y-%m-%d")\n next_ds_nodash = next_ds.replace("-", "")\n next_execution_date = pendulum.instance(next_execution_date)\n\n prev_ds = None\n prev_ds_nodash = None\n if prev_execution_date:\n prev_ds = prev_execution_date.strftime("%Y-%m-%d")\n prev_ds_nodash = prev_ds.replace("-", "")\n prev_execution_date = pendulum.instance(prev_execution_date)\n\n ds_nodash = ds.replace("-", "")\n ts_nodash = execution_date.strftime("%Y%m%dT%H%M%S")\n ts_nodash_with_tz = ts.replace("-", "").replace(":", "")\n yesterday_ds_nodash = yesterday_ds.replace("-", "")\n tomorrow_ds_nodash = tomorrow_ds.replace("-", "")\n\n ti_key_str = "{dag_id}__{task_id}__{ds_nodash}".format(\n dag_id=task.dag_id, task_id=task.task_id, ds_nodash=ds_nodash\n )\n\n if task.params:\n params.update(task.params)\n\n return {\n "dag": task.dag,\n "ds": ds,\n "next_ds": next_ds,\n "next_ds_nodash": next_ds_nodash,\n "prev_ds": prev_ds,\n "prev_ds_nodash": prev_ds_nodash,\n "ds_nodash": ds_nodash,\n "ts": ts,\n "ts_nodash": ts_nodash,\n "ts_nodash_with_tz": ts_nodash_with_tz,\n "yesterday_ds": yesterday_ds,\n "yesterday_ds_nodash": yesterday_ds_nodash,\n "tomorrow_ds": tomorrow_ds,\n "tomorrow_ds_nodash": tomorrow_ds_nodash,\n "END_DATE": ds,\n "end_date": ds,\n "dag_run": dag_run,\n "run_id": run_id,\n "execution_date": pendulum.instance(execution_date),\n "prev_execution_date": prev_execution_date,\n "prev_execution_date_success": lazy_object_proxy.Proxy(\n lambda: task_instance.previous_execution_date_success\n ),\n "prev_start_date_success": lazy_object_proxy.Proxy(\n lambda: task_instance.previous_start_date_success\n ),\n "next_execution_date": next_execution_date,\n "latest_date": ds,\n "macros": macros,\n "params": params,\n "tables": tables,\n "task": task,\n "task_instance": task_instance,\n "ti": task_instance,\n "task_instance_key_str": ti_key_str,\n "test_mode": task_instance.test_mode,\n "inlets": task.inlets,\n "outlets": task.outlets,\n }\n
", "current_page_name": "_modules/dagster_airflow/dagster_pipeline_factory", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_airflow.dagster_pipeline_factory"}, "factory": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_airflow.factory

\nimport datetime\nimport os\nimport re\nfrom collections import namedtuple\n\nfrom airflow import DAG\nfrom airflow.models.baseoperator import BaseOperator\nfrom dagster_airflow.operators.util import check_storage_specified\n\nimport dagster._check as check\nimport dagster.seven as seven\nfrom dagster.core.definitions.reconstruct import ReconstructableRepository\nfrom dagster.core.execution.api import create_execution_plan\nfrom dagster.core.instance import DagsterInstance, is_dagster_home_set\nfrom dagster.core.instance.ref import InstanceRef\nfrom dagster.core.snap import ExecutionPlanSnapshot, PipelineSnapshot, snapshot_from_execution_plan\nfrom dagster.utils.backcompat import canonicalize_backcompat_args\n\nfrom .compile import coalesce_execution_steps\nfrom .operators.docker_operator import DagsterDockerOperator\nfrom .operators.python_operator import DagsterPythonOperator\n\nDEFAULT_ARGS = {\n    "depends_on_past": False,\n    "email": ["airflow@example.com"],\n    "email_on_failure": False,\n    "email_on_retry": False,\n    "owner": "airflow",\n    "retries": 1,\n    "retry_delay": datetime.timedelta(0, 300),\n    "start_date": datetime.datetime(1900, 1, 1, 0, 0),\n}\n\n# Airflow DAG names are not allowed to be longer than 250 chars\nAIRFLOW_MAX_DAG_NAME_LEN = 250\n\n\ndef _make_dag_description(pipeline_name):\n    return """Editable scaffolding autogenerated by dagster-airflow from pipeline {pipeline_name}\n    """.format(\n        pipeline_name=pipeline_name\n    )\n\n\ndef _rename_for_airflow(name):\n    """Modify pipeline name for Airflow to meet constraints on DAG names:\n    https://github.com/apache/airflow/blob/1.10.3/airflow/utils/helpers.py#L52-L63\n\n    Here, we just substitute underscores for illegal characters to avoid imposing Airflow's\n    constraints on our naming schemes.\n    """\n    return re.sub(r"[^\\w\\-\\.]", "_", name)[:AIRFLOW_MAX_DAG_NAME_LEN]\n\n\nclass DagsterOperatorInvocationArgs(\n    namedtuple(\n        "DagsterOperatorInvocationArgs",\n        "recon_repo pipeline_name run_config mode step_keys instance_ref pipeline_snapshot "\n        "execution_plan_snapshot parent_pipeline_snapshot",\n    )\n):\n    def __new__(\n        cls,\n        recon_repo,\n        pipeline_name,\n        run_config,\n        mode,\n        step_keys,\n        instance_ref,\n        pipeline_snapshot,\n        execution_plan_snapshot,\n        parent_pipeline_snapshot,\n    ):\n        return super(DagsterOperatorInvocationArgs, cls).__new__(\n            cls,\n            recon_repo=recon_repo,\n            pipeline_name=pipeline_name,\n            run_config=run_config,\n            mode=mode,\n            step_keys=step_keys,\n            instance_ref=instance_ref,\n            pipeline_snapshot=pipeline_snapshot,\n            execution_plan_snapshot=execution_plan_snapshot,\n            parent_pipeline_snapshot=parent_pipeline_snapshot,\n        )\n\n\nclass DagsterOperatorParameters(\n    namedtuple(\n        "_DagsterOperatorParameters",\n        (\n            "recon_repo pipeline_name run_config "\n            "mode task_id step_keys dag instance_ref op_kwargs pipeline_snapshot "\n            "execution_plan_snapshot parent_pipeline_snapshot"\n        ),\n    )\n):\n    def __new__(\n        cls,\n        pipeline_name,\n        task_id,\n        recon_repo=None,\n        run_config=None,\n        mode=None,\n        step_keys=None,\n        dag=None,\n        instance_ref=None,\n        op_kwargs=None,\n        pipeline_snapshot=None,\n        
execution_plan_snapshot=None,\n        parent_pipeline_snapshot=None,\n    ):\n        pipeline_def = recon_repo.get_definition().get_pipeline(pipeline_name)\n\n        if mode is None:\n            mode = pipeline_def.get_default_mode_name()\n\n        mode_def = pipeline_def.get_mode_definition(mode)\n\n        check_storage_specified(pipeline_def, mode_def)\n\n        return super(DagsterOperatorParameters, cls).__new__(\n            cls,\n            recon_repo=check.opt_inst_param(recon_repo, "recon_repo", ReconstructableRepository),\n            pipeline_name=check.str_param(pipeline_name, "pipeline_name"),\n            run_config=check.opt_dict_param(run_config, "run_config", key_type=str),\n            mode=check.opt_str_param(mode, "mode"),\n            task_id=check.str_param(task_id, "task_id"),\n            step_keys=check.opt_list_param(step_keys, "step_keys", of_type=str),\n            dag=check.opt_inst_param(dag, "dag", DAG),\n            instance_ref=check.opt_inst_param(instance_ref, "instance_ref", InstanceRef),\n            op_kwargs=check.opt_dict_param(op_kwargs.copy(), "op_kwargs", key_type=str),\n            pipeline_snapshot=check.inst_param(\n                pipeline_snapshot, "pipeline_snapshot", PipelineSnapshot\n            ),\n            execution_plan_snapshot=check.inst_param(\n                execution_plan_snapshot, "execution_plan_snapshot", ExecutionPlanSnapshot\n            ),\n            parent_pipeline_snapshot=check.opt_inst_param(\n                parent_pipeline_snapshot, "parent_pipeline_snapshot", PipelineSnapshot\n            ),\n        )\n\n    @property\n    def invocation_args(self):\n        return DagsterOperatorInvocationArgs(\n            recon_repo=self.recon_repo,\n            pipeline_name=self.pipeline_name,\n            run_config=self.run_config,\n            mode=self.mode,\n            step_keys=self.step_keys,\n            instance_ref=self.instance_ref,\n            pipeline_snapshot=self.pipeline_snapshot,\n            execution_plan_snapshot=self.execution_plan_snapshot,\n            parent_pipeline_snapshot=self.parent_pipeline_snapshot,\n        )\n\n\ndef _make_airflow_dag(\n    recon_repo,\n    job_name,\n    run_config=None,\n    mode=None,\n    instance=None,\n    dag_id=None,\n    dag_description=None,\n    dag_kwargs=None,\n    op_kwargs=None,\n    operator=DagsterPythonOperator,\n):\n    check.inst_param(recon_repo, "recon_repo", ReconstructableRepository)\n    check.str_param(job_name, "job_name")\n    run_config = check.opt_dict_param(run_config, "run_config", key_type=str)\n    mode = check.opt_str_param(mode, "mode")\n    # Default to use the (persistent) system temp directory rather than a TemporaryDirectory,\n    # which would not be consistent between Airflow task invocations.\n\n    if instance is None:\n        if is_dagster_home_set():\n            instance = DagsterInstance.get()\n        else:\n            instance = DagsterInstance.local_temp(tempdir=seven.get_system_temp_directory())\n\n    check.inst_param(instance, "instance", DagsterInstance)\n\n    # Only used for Airflow; internally we continue to use pipeline.name\n    dag_id = check.opt_str_param(dag_id, "dag_id", _rename_for_airflow(job_name))\n\n    dag_description = check.opt_str_param(\n        dag_description, "dag_description", _make_dag_description(job_name)\n    )\n    check.class_param(operator, "operator", superclass=BaseOperator)\n\n    dag_kwargs = dict(\n        {"default_args": DEFAULT_ARGS},\n        
**check.opt_dict_param(dag_kwargs, "dag_kwargs", key_type=str),\n    )\n\n    op_kwargs = check.opt_dict_param(op_kwargs, "op_kwargs", key_type=str)\n\n    dag = DAG(dag_id=dag_id, description=dag_description, **dag_kwargs)\n    pipeline = recon_repo.get_definition().get_pipeline(job_name)\n\n    if mode is None:\n        mode = pipeline.get_default_mode_name()\n\n    execution_plan = create_execution_plan(pipeline, run_config, mode=mode)\n\n    tasks = {}\n\n    coalesced_plan = coalesce_execution_steps(execution_plan)\n\n    for solid_handle, solid_steps in coalesced_plan.items():\n        step_keys = [step.key for step in solid_steps]\n\n        operator_parameters = DagsterOperatorParameters(\n            recon_repo=recon_repo,\n            pipeline_name=job_name,\n            run_config=run_config,\n            mode=mode,\n            task_id=solid_handle,\n            step_keys=step_keys,\n            dag=dag,\n            instance_ref=instance.get_ref(),\n            op_kwargs=op_kwargs,\n            pipeline_snapshot=pipeline.get_pipeline_snapshot(),\n            execution_plan_snapshot=snapshot_from_execution_plan(\n                execution_plan, pipeline_snapshot_id=pipeline.get_pipeline_snapshot_id()\n            ),\n        )\n        task = operator(operator_parameters)\n\n        tasks[solid_handle] = task\n\n        for solid_step in solid_steps:\n            for step_input in solid_step.step_inputs:\n                for key in step_input.dependency_keys:\n                    prev_solid_handle = execution_plan.get_step_by_key(key).solid_handle.to_string()\n                    if solid_handle != prev_solid_handle:\n                        tasks[prev_solid_handle].set_downstream(task)\n\n    return (dag, [tasks[solid_handle] for solid_handle in coalesced_plan.keys()])\n\n\n
[docs]def make_airflow_dag(\n module_name,\n job_name,\n run_config=None,\n mode=None,\n instance=None,\n dag_id=None,\n dag_description=None,\n dag_kwargs=None,\n op_kwargs=None,\n pipeline_name=None,\n):\n """Construct an Airflow DAG corresponding to a given Dagster job/pipeline.\n\n Tasks in the resulting DAG will execute the Dagster logic they encapsulate as a Python\n callable, run by an underlying :py:class:`PythonOperator <airflow:PythonOperator>`. As a\n consequence, both dagster, any Python dependencies required by your solid logic, and the module\n containing your pipeline definition must be available in the Python environment within which\n your Airflow tasks execute. If you cannot install requirements into this environment, or you\n are looking for a containerized solution to provide better isolation, see instead\n :py:func:`make_airflow_dag_containerized`.\n\n This function should be invoked in an Airflow DAG definition file, such as that created by an\n invocation of the dagster-airflow scaffold CLI tool.\n\n Args:\n module_name (str): The name of the importable module in which the pipeline/job definition can be\n found.\n job_name (str): The name of the job definition.\n run_config (Optional[dict]): The config, if any, with which to compile\n the pipeline/job to an execution plan, as a Python dict.\n mode (Optional[str]): The mode in which to execute the pipeline.\n instance (Optional[DagsterInstance]): The Dagster instance to use to execute the pipeline/job.\n dag_id (Optional[str]): The id to use for the compiled Airflow DAG (passed through to\n :py:class:`DAG <airflow:airflow.models.DAG>`).\n dag_description (Optional[str]): The description to use for the compiled Airflow DAG\n (passed through to :py:class:`DAG <airflow:airflow.models.DAG>`)\n dag_kwargs (Optional[dict]): Any additional kwargs to pass to the Airflow\n :py:class:`DAG <airflow:airflow.models.DAG>` constructor, including ``default_args``.\n op_kwargs (Optional[dict]): Any additional kwargs to pass to the underlying Airflow\n operator (a subclass of\n :py:class:`PythonOperator <airflow:airflow.operators.python_operator.PythonOperator>`).\n pipeline_name (str): (legacy) The name of the pipeline definition.\n\n Returns:\n (airflow.models.DAG, List[airflow.models.BaseOperator]): The generated Airflow DAG, and a\n list of its constituent tasks.\n\n """\n check.str_param(module_name, "module_name")\n job_name = canonicalize_backcompat_args(\n new_val=job_name,\n new_arg="job_name",\n old_val=pipeline_name,\n old_arg="pipeline_name",\n breaking_version="future versions",\n coerce_old_to_new=lambda val: val,\n )\n\n recon_repo = ReconstructableRepository.for_module(module_name, job_name, os.getcwd())\n return _make_airflow_dag(\n recon_repo=recon_repo,\n job_name=job_name,\n run_config=run_config,\n mode=mode,\n instance=instance,\n dag_id=dag_id,\n dag_description=dag_description,\n dag_kwargs=dag_kwargs,\n op_kwargs=op_kwargs,\n )
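A minimal usage sketch for make_airflow_dag, placed in an Airflow DAG definition file. The module path, job name, and schedule below are hypothetical placeholders, not names from this codebase:

# Hypothetical Airflow DAG file: builds an Airflow DAG from a Dagster job
# importable as my_company.my_repository.my_job. Airflow discovers the
# module-level `dag` object.
from dagster_airflow import make_airflow_dag

dag, tasks = make_airflow_dag(
    module_name="my_company.my_repository",      # hypothetical module
    job_name="my_job",                            # hypothetical job name
    run_config={},                                # optional run config dict
    dag_kwargs={"schedule_interval": "@daily"},   # forwarded to airflow DAG()
)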
\n\n\n
[docs]def make_airflow_dag_for_operator(\n recon_repo,\n job_name,\n operator,\n run_config=None,\n mode=None,\n dag_id=None,\n dag_description=None,\n dag_kwargs=None,\n op_kwargs=None,\n pipeline_name=None,\n):\n """Construct an Airflow DAG corresponding to a given Dagster job/pipeline and custom operator.\n\n `Custom operator template <https://github.com/dagster-io/dagster/blob/master/python_modules/dagster-test/dagster_test/dagster_airflow/custom_operator.py>`_\n\n Tasks in the resulting DAG will execute the Dagster logic they encapsulate run by the given\n Operator :py:class:`BaseOperator <airflow.models.BaseOperator>`. If you\n are looking for a containerized solution to provide better isolation, see instead\n :py:func:`make_airflow_dag_containerized`.\n\n This function should be invoked in an Airflow DAG definition file, such as that created by an\n invocation of the dagster-airflow scaffold CLI tool.\n\n Args:\n recon_repo (:class:`dagster.ReconstructableRepository`): reference to a Dagster RepositoryDefinition\n that can be reconstructed in another process\n job_name (str): The name of the job definition.\n operator (type): The operator to use. Must be a class that inherits from\n :py:class:`BaseOperator <airflow.models.BaseOperator>`\n run_config (Optional[dict]): The config, if any, with which to compile\n the pipeline to an execution plan, as a Python dict.\n mode (Optional[str]): The mode in which to execute the pipeline.\n instance (Optional[DagsterInstance]): The Dagster instance to use to execute the pipeline.\n dag_id (Optional[str]): The id to use for the compiled Airflow DAG (passed through to\n :py:class:`DAG <airflow:airflow.models.DAG>`).\n dag_description (Optional[str]): The description to use for the compiled Airflow DAG\n (passed through to :py:class:`DAG <airflow:airflow.models.DAG>`)\n dag_kwargs (Optional[dict]): Any additional kwargs to pass to the Airflow\n :py:class:`DAG <airflow:airflow.models.DAG>` constructor, including ``default_args``.\n op_kwargs (Optional[dict]): Any additional kwargs to pass to the underlying Airflow\n operator.\n pipeline_name (str): (legacy) The name of the pipeline definition.\n\n Returns:\n (airflow.models.DAG, List[airflow.models.BaseOperator]): The generated Airflow DAG, and a\n list of its constituent tasks.\n """\n check.class_param(operator, "operator", superclass=BaseOperator)\n\n job_name = canonicalize_backcompat_args(\n new_val=job_name,\n new_arg="job_name",\n old_val=pipeline_name,\n old_arg="pipeline_name",\n breaking_version="future versions",\n coerce_old_to_new=lambda val: val,\n )\n\n return _make_airflow_dag(\n recon_repo=recon_repo,\n job_name=job_name,\n run_config=run_config,\n mode=mode,\n dag_id=dag_id,\n dag_description=dag_description,\n dag_kwargs=dag_kwargs,\n op_kwargs=op_kwargs,\n operator=operator,\n )
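A rough sketch of invoking make_airflow_dag_for_operator with a custom operator. The operator class, module path, and job name are hypothetical, and the ReconstructableRepository import path may vary across Dagster versions:

import os

from dagster.core.definitions.reconstruct import ReconstructableRepository
from dagster_airflow import make_airflow_dag_for_operator

# Hypothetical BaseOperator subclass, e.g. one modeled on the custom operator
# template linked above.
from my_company.airflow_ops import MyDagsterOperator

recon_repo = ReconstructableRepository.for_module(
    "my_company.my_repository", "my_job", os.getcwd()
)
dag, tasks = make_airflow_dag_for_operator(
    recon_repo=recon_repo,
    job_name="my_job",
    operator=MyDagsterOperator,
)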
\n\n\ndef make_airflow_dag_for_recon_repo(\n recon_repo,\n job_name,\n run_config=None,\n mode=None,\n dag_id=None,\n dag_description=None,\n dag_kwargs=None,\n op_kwargs=None,\n pipeline_name=None,\n):\n job_name = canonicalize_backcompat_args(\n new_val=job_name,\n new_arg="job_name",\n old_val=pipeline_name,\n old_arg="pipeline_name",\n breaking_version="future versions",\n coerce_old_to_new=lambda val: val,\n )\n return _make_airflow_dag(\n recon_repo=recon_repo,\n job_name=job_name,\n run_config=run_config,\n mode=mode,\n dag_id=dag_id,\n dag_description=dag_description,\n dag_kwargs=dag_kwargs,\n op_kwargs=op_kwargs,\n )\n\n\n
[docs]def make_airflow_dag_containerized(\n module_name,\n job_name,\n image,\n run_config=None,\n mode=None,\n dag_id=None,\n dag_description=None,\n dag_kwargs=None,\n op_kwargs=None,\n pipeline_name=None,\n):\n """Construct a containerized Airflow DAG corresponding to a given Dagster job/pipeline.\n\n Tasks in the resulting DAG will execute the Dagster logic they encapsulate using a subclass of\n :py:class:`DockerOperator <airflow:airflow.operators.docker_operator.DockerOperator>`. As a\n consequence, both dagster, any Python dependencies required by your solid logic, and the module\n containing your pipeline definition must be available in the container spun up by this operator.\n Typically you'll want to install these requirements onto the image you're using.\n\n This function should be invoked in an Airflow DAG definition file, such as that created by an\n invocation of the dagster-airflow scaffold CLI tool.\n\n Args:\n module_name (str): The name of the importable module in which the pipeline/job definition can be\n found.\n job_name (str): The name of the job definition.\n image (str): The name of the Docker image to use for execution (passed through to\n :py:class:`DockerOperator <airflow:airflow.operators.docker_operator.DockerOperator>`).\n run_config (Optional[dict]): The config, if any, with which to compile\n the pipeline/job to an execution plan, as a Python dict.\n mode (Optional[str]): The mode in which to execute the pipeline.\n dag_id (Optional[str]): The id to use for the compiled Airflow DAG (passed through to\n :py:class:`DAG <airflow:airflow.models.DAG>`).\n dag_description (Optional[str]): The description to use for the compiled Airflow DAG\n (passed through to :py:class:`DAG <airflow:airflow.models.DAG>`)\n dag_kwargs (Optional[dict]): Any additional kwargs to pass to the Airflow\n :py:class:`DAG <airflow:airflow.models.DAG>` constructor, including ``default_args``.\n op_kwargs (Optional[dict]): Any additional kwargs to pass to the underlying Airflow\n operator (a subclass of\n :py:class:`DockerOperator <airflow:airflow.operators.docker_operator.DockerOperator>`).\n pipeline_name (str): (legacy) The name of the pipeline definition.\n\n Returns:\n (airflow.models.DAG, List[airflow.models.BaseOperator]): The generated Airflow DAG, and a\n list of its constituent tasks.\n """\n check.str_param(module_name, "module_name")\n check.str_param(job_name, "job_name")\n check.str_param(image, "image")\n check.opt_dict_param(run_config, "run_config")\n check.opt_str_param(mode, "mode")\n check.opt_str_param(dag_id, "dag_id")\n check.opt_str_param(dag_description, "dag_description")\n check.opt_dict_param(dag_kwargs, "dag_kwargs")\n check.opt_dict_param(op_kwargs, "op_kwargs")\n\n job_name = canonicalize_backcompat_args(\n new_val=job_name,\n new_arg="job_name",\n old_val=pipeline_name,\n old_arg="pipeline_name",\n breaking_version="future versions",\n coerce_old_to_new=lambda val: val,\n )\n recon_repo = ReconstructableRepository.for_module(module_name, job_name, os.getcwd())\n\n op_kwargs = check.opt_dict_param(op_kwargs, "op_kwargs", key_type=str)\n op_kwargs["image"] = image\n\n return _make_airflow_dag(\n recon_repo=recon_repo,\n job_name=job_name,\n run_config=run_config,\n mode=mode,\n dag_id=dag_id,\n dag_description=dag_description,\n dag_kwargs=dag_kwargs,\n op_kwargs=op_kwargs,\n operator=DagsterDockerOperator,\n )
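A sketch of the containerized variant; the module path, job name, and image tag are hypothetical placeholders:

# Hypothetical Airflow DAG file: each Dagster step runs inside a container
# built from the given image, which must include dagster, the job's Python
# dependencies, and the module containing the job definition.
from dagster_airflow import make_airflow_dag_containerized

dag, tasks = make_airflow_dag_containerized(
    module_name="my_company.my_repository",        # hypothetical module
    job_name="my_job",                              # hypothetical job name
    image="my-registry/my-dagster-image:latest",    # hypothetical image tag
)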
\n\n\ndef make_airflow_dag_containerized_for_recon_repo(\n recon_repo,\n job_name,\n image,\n run_config=None,\n mode=None,\n dag_id=None,\n dag_description=None,\n dag_kwargs=None,\n op_kwargs=None,\n instance=None,\n pipeline_name=None,\n):\n check.inst_param(recon_repo, "recon_repo", ReconstructableRepository)\n check.str_param(job_name, "job_name")\n check.str_param(image, "image")\n check.opt_dict_param(run_config, "run_config")\n check.opt_str_param(mode, "mode")\n check.opt_str_param(dag_id, "dag_id")\n check.opt_str_param(dag_description, "dag_description")\n check.opt_dict_param(dag_kwargs, "dag_kwargs")\n op_kwargs = check.opt_dict_param(op_kwargs, "op_kwargs", key_type=str)\n check.opt_str_param(pipeline_name, "pipeline_name")\n\n op_kwargs["image"] = image\n\n job_name = canonicalize_backcompat_args(\n new_val=job_name,\n new_arg="job_name",\n old_val=pipeline_name,\n old_arg="pipeline_name",\n breaking_version="future versions",\n coerce_old_to_new=lambda val: val,\n )\n return _make_airflow_dag(\n recon_repo=recon_repo,\n job_name=job_name,\n run_config=run_config,\n mode=mode,\n dag_id=dag_id,\n dag_description=dag_description,\n dag_kwargs=dag_kwargs,\n op_kwargs=op_kwargs,\n operator=DagsterDockerOperator,\n instance=instance,\n )\n
", "current_page_name": "_modules/dagster_airflow/factory", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_airflow.factory"}}, "dagster_aws": {"ecs": {"launcher": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_aws.ecs.launcher

\nimport warnings\nfrom collections import namedtuple\nfrom contextlib import suppress\n\nimport boto3\nfrom botocore.exceptions import ClientError\n\nfrom dagster import Array, Field, Noneable, ScalarUnion, StringSource\nfrom dagster import _check as check\nfrom dagster.core.events import EngineEventData, MetadataEntry\nfrom dagster.core.launcher.base import (\n    CheckRunHealthResult,\n    LaunchRunContext,\n    RunLauncher,\n    WorkerStatus,\n)\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.grpc.types import ExecuteRunArgs\nfrom dagster.serdes import ConfigurableClass\n\nfrom ..secretsmanager import get_secrets_from_arns\nfrom .container_context import EcsContainerContext\nfrom .tasks import default_ecs_task_definition, default_ecs_task_metadata\nfrom .utils import sanitize_family\n\nTags = namedtuple("Tags", ["arn", "cluster", "cpu", "memory"])\n\nRUNNING_STATUSES = [\n    "PROVISIONING",\n    "PENDING",\n    "ACTIVATING",\n    "RUNNING",\n    "DEACTIVATING",\n    "STOPPING",\n    "DEPROVISIONING",\n]\nSTOPPED_STATUSES = ["STOPPED"]\n\n\n
[docs]class EcsRunLauncher(RunLauncher, ConfigurableClass):\n """RunLauncher that starts a task in ECS for each Dagster job run."""\n\n def __init__(\n self,\n inst_data=None,\n task_definition=None,\n container_name="run",\n secrets=None,\n secrets_tag="dagster",\n include_sidecars=False,\n ):\n self._inst_data = inst_data\n self.ecs = boto3.client("ecs")\n self.ec2 = boto3.resource("ec2")\n self.secrets_manager = boto3.client("secretsmanager")\n self.logs = boto3.client("logs")\n\n self.task_definition = task_definition\n self.container_name = container_name\n\n self.secrets = check.opt_list_param(secrets, "secrets")\n\n if self.secrets and all(isinstance(secret, str) for secret in self.secrets):\n warnings.warn(\n "Setting secrets as a list of ARNs is deprecated. "\n "Secrets should instead follow the same structure as the ECS API: "\n "https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_Secret.html",\n DeprecationWarning,\n )\n self.secrets = [\n {"name": name, "valueFrom": value_from}\n for name, value_from in get_secrets_from_arns(\n self.secrets_manager, self.secrets\n ).items()\n ]\n\n self.secrets_tags = [secrets_tag] if secrets_tag else []\n self.include_sidecars = include_sidecars\n\n if self.task_definition:\n task_definition = self.ecs.describe_task_definition(taskDefinition=task_definition)\n container_names = [\n container.get("name")\n for container in task_definition["taskDefinition"]["containerDefinitions"]\n ]\n check.invariant(\n container_name in container_names,\n f"Cannot override container '{container_name}' in task definition "\n f"'{self.task_definition}' because the container is not defined.",\n )\n self.task_definition = task_definition["taskDefinition"]["taskDefinitionArn"]\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {\n "task_definition": Field(\n StringSource,\n is_required=False,\n description=(\n "The task definition to use when launching new tasks. "\n "If none is provided, each run will create its own task "\n "definition."\n ),\n ),\n "container_name": Field(\n StringSource,\n is_required=False,\n default_value="run",\n description=(\n "The container name to use when launching new tasks. Defaults to 'run'."\n ),\n ),\n "secrets": Field(\n Array(\n ScalarUnion(\n scalar_type=str,\n non_scalar_schema={"name": StringSource, "valueFrom": StringSource},\n )\n ),\n is_required=False,\n description=(\n "An array of AWS Secrets Manager secrets. These secrets will "\n "be mounted as environment variables in the container. See "\n "https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_Secret.html."\n ),\n ),\n "secrets_tag": Field(\n Noneable(StringSource),\n is_required=False,\n default_value="dagster",\n description=(\n "AWS Secrets Manager secrets with this tag will be mounted as "\n "environment variables in the container. Defaults to 'dagster'."\n ),\n ),\n "include_sidecars": Field(\n bool,\n is_required=False,\n default_value=False,\n description=(\n "Whether each run should use the same sidecars as the task that launches it. 
"\n "Defaults to False."\n ),\n ),\n }\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return EcsRunLauncher(inst_data=inst_data, **config_value)\n\n def _set_ecs_tags(self, run_id, task_arn):\n try:\n tags = [{"key": "dagster/run_id", "value": run_id}]\n self.ecs.tag_resource(resourceArn=task_arn, tags=tags)\n except ClientError:\n pass\n\n def _set_run_tags(self, run_id, task_arn):\n cluster = self._task_metadata().cluster\n tags = {"ecs/task_arn": task_arn, "ecs/cluster": cluster}\n self._instance.add_run_tags(run_id, tags)\n\n def _get_run_tags(self, run_id):\n run = self._instance.get_run_by_id(run_id)\n tags = run.tags if run else {}\n arn = tags.get("ecs/task_arn")\n cluster = tags.get("ecs/cluster")\n cpu = tags.get("ecs/cpu")\n memory = tags.get("ecs/memory")\n\n return Tags(arn, cluster, cpu, memory)\n\n def launch_run(self, context: LaunchRunContext) -> None:\n\n """\n Launch a run in an ECS task.\n\n Currently, Fargate is the only supported launchType and awsvpc is the\n only supported networkMode. These are the defaults that are set up by\n docker-compose when you use the Dagster ECS reference deployment.\n """\n run = context.pipeline_run\n family = sanitize_family(\n run.external_pipeline_origin.external_repository_origin.repository_location_origin.location_name # type: ignore\n )\n\n container_context = EcsContainerContext.create_for_run(run, self)\n\n metadata = self._task_metadata()\n pipeline_origin = check.not_none(context.pipeline_code_origin)\n image = pipeline_origin.repository_origin.container_image\n task_definition = self._task_definition(family, metadata, image, container_context)[\n "family"\n ]\n\n args = ExecuteRunArgs(\n pipeline_origin=pipeline_origin,\n pipeline_run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n )\n command = args.get_command_args()\n\n # Set cpu or memory overrides\n # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html\n cpu_and_memory_overrides = {}\n tags = self._get_run_tags(run.run_id)\n if tags.cpu:\n cpu_and_memory_overrides["cpu"] = tags.cpu\n if tags.memory:\n cpu_and_memory_overrides["memory"] = tags.memory\n\n # Run a task using the same network configuration as this processes's\n # task.\n response = self.ecs.run_task(\n taskDefinition=task_definition,\n cluster=metadata.cluster,\n overrides={\n "containerOverrides": [\n {\n "name": self.container_name,\n "command": command,\n # containerOverrides expects cpu/memory as integers\n **{k: int(v) for k, v in cpu_and_memory_overrides.items()},\n }\n ],\n # taskOverrides expects cpu/memory as strings\n **cpu_and_memory_overrides,\n },\n networkConfiguration={\n "awsvpcConfiguration": {\n "subnets": metadata.subnets,\n "assignPublicIp": metadata.assign_public_ip,\n "securityGroups": metadata.security_groups,\n }\n },\n launchType="FARGATE",\n )\n\n tasks = response["tasks"]\n\n if not tasks:\n failures = response["failures"]\n exceptions = []\n for failure in failures:\n arn = failure.get("arn")\n reason = failure.get("reason")\n detail = failure.get("detail")\n exceptions.append(Exception(f"Task {arn} failed because {reason}: {detail}"))\n raise Exception(exceptions)\n\n arn = tasks[0]["taskArn"]\n self._set_run_tags(run.run_id, task_arn=arn)\n self._set_ecs_tags(run.run_id, task_arn=arn)\n self._instance.report_engine_event(\n message="Launching run in ECS task",\n pipeline_run=run,\n engine_event_data=EngineEventData(\n [\n MetadataEntry("ECS Task ARN", value=arn),\n MetadataEntry("ECS Cluster", 
value=metadata.cluster),\n MetadataEntry("Run ID", value=run.run_id),\n ]\n ),\n cls=self.__class__,\n )\n\n def terminate(self, run_id):\n tags = self._get_run_tags(run_id)\n\n if not (tags.arn and tags.cluster):\n return False\n\n tasks = self.ecs.describe_tasks(tasks=[tags.arn], cluster=tags.cluster).get("tasks")\n if not tasks:\n return False\n\n status = tasks[0].get("lastStatus")\n if status == "STOPPED":\n return False\n\n self.ecs.stop_task(task=tags.arn, cluster=tags.cluster)\n return True\n\n def _task_definition(self, family, metadata, image, container_context):\n """\n Return the launcher's task definition if it's configured.\n\n Otherwise, a new task definition revision is registered for every run.\n First, the process that calls this method finds its own task\n definition. Next, it creates a new task definition based on its own\n but it overrides the image with the pipeline origin's image.\n """\n if self.task_definition:\n task_definition = self.ecs.describe_task_definition(taskDefinition=self.task_definition)\n return task_definition["taskDefinition"]\n\n secrets = container_context.get_secrets_dict(self.secrets_manager)\n secrets_definition = (\n {"secrets": [{"name": key, "valueFrom": value} for key, value in secrets.items()]}\n if secrets\n else {}\n )\n\n task_definition = {}\n with suppress(ClientError):\n task_definition = self.ecs.describe_task_definition(taskDefinition=family)[\n "taskDefinition"\n ]\n secrets = secrets_definition.get("secrets", [])\n if self._reuse_task_definition(task_definition, metadata, image, secrets):\n return task_definition\n\n return default_ecs_task_definition(\n self.ecs,\n family,\n metadata,\n image,\n self.container_name,\n secrets=secrets_definition,\n include_sidecars=self.include_sidecars,\n )\n\n def _reuse_task_definition(self, task_definition, metadata, image, secrets):\n container_definitions_match = False\n task_definitions_match = False\n\n container_definitions = task_definition.get("containerDefinitions", [{}])\n # Only check for diffs to the primary container. 
This ignores changes to sidecars.\n for container_definition in container_definitions:\n if (\n container_definition.get("image") == image\n and container_definition.get("name") == self.container_name\n and container_definition.get("secrets") == secrets\n ):\n container_definitions_match = True\n\n if task_definition.get("executionRoleArn") == metadata.task_definition.get(\n "executionRoleArn"\n ) and task_definition.get("taskRoleArn") == metadata.task_definition.get("taskRoleArn"):\n task_definitions_match = True\n\n return container_definitions_match & task_definitions_match\n\n def _task_metadata(self):\n return default_ecs_task_metadata(self.ec2, self.ecs)\n\n @property\n def supports_check_run_worker_health(self):\n return True\n\n def check_run_worker_health(self, run: PipelineRun):\n\n tags = self._get_run_tags(run.run_id)\n\n if not (tags.arn and tags.cluster):\n return CheckRunHealthResult(WorkerStatus.UNKNOWN, "")\n\n tasks = self.ecs.describe_tasks(tasks=[tags.arn], cluster=tags.cluster).get("tasks")\n if not tasks:\n return CheckRunHealthResult(WorkerStatus.UNKNOWN, "")\n\n t = tasks[0]\n\n if t.get("lastStatus") in RUNNING_STATUSES:\n return CheckRunHealthResult(WorkerStatus.RUNNING)\n elif t.get("lastStatus") in STOPPED_STATUSES:\n\n failed_containers = []\n for c in t.get("containers"):\n if c.get("exitCode") != 0:\n failed_containers.append(c)\n if len(failed_containers) > 0:\n if len(failed_containers) > 1:\n container_str = "Containers"\n else:\n container_str = "Container"\n return CheckRunHealthResult(\n WorkerStatus.FAILED,\n f"ECS task failed. Stop code: {t.get('stopCode')}. Stop reason {t.get('stopReason')}. "\n f"{container_str} {c.get('name') for c in failed_containers} failed."\n f"Check the logs for task {t.get('taskArn')} for details.",\n )\n\n return CheckRunHealthResult(WorkerStatus.SUCCESS)\n\n return CheckRunHealthResult(WorkerStatus.UNKNOWN, "ECS task health status is unknown.")
\n
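Since _get_run_tags reads ecs/cpu and ecs/memory from run tags, per-run Fargate sizing can be requested by tagging a job. A small sketch with hypothetical names and values:

# Sketch: tag a job so the launcher overrides the Fargate task's cpu/memory.
# Job/op names and the specific cpu/memory values are hypothetical; the
# values must be a combination that Fargate supports.
from dagster import job, op


@op
def heavy_op():
    ...


@job(tags={"ecs/cpu": "1024", "ecs/memory": "8192"})
def heavy_job():
    heavy_op()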
", "current_page_name": "_modules/dagster_aws/ecs/launcher", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_aws.ecs.launcher"}}, "emr": {"emr": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_aws.emr.emr

\n# Portions of this file are copied from the Yelp MRJob project:\n#\n#   https://github.com/Yelp/mrjob\n#\n#\n# Copyright 2009-2013 Yelp, David Marin\n# Copyright 2015 Yelp\n# Copyright 2017 Yelp\n# Copyright 2018 Contributors\n# Copyright 2019 Yelp and Contributors\n#\n# Licensed under the Apache License, Version 2.0 (the "License");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an "AS IS" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport gzip\nimport re\nfrom io import BytesIO\nfrom urllib.parse import urlparse\n\nimport boto3\nfrom botocore.exceptions import WaiterError\nfrom dagster_aws.utils.mrjob.utils import _boto3_now, _wrap_aws_client, strip_microseconds\n\nimport dagster\nimport dagster._check as check\n\nfrom .types import EMR_CLUSTER_TERMINATED_STATES, EmrClusterState, EmrStepState\n\n# if we can't create or find our own service role, use the one\n# created by the AWS console and CLI\n_FALLBACK_SERVICE_ROLE = "EMR_DefaultRole"\n\n# if we can't create or find our own instance profile, use the one\n# created by the AWS console and CLI\n_FALLBACK_INSTANCE_PROFILE = "EMR_EC2_DefaultRole"\n\n\n
[docs]class EmrError(Exception):\n pass
\n\n\n
[docs]class EmrJobRunner:\n def __init__(\n self,\n region,\n check_cluster_every=30,\n aws_access_key_id=None,\n aws_secret_access_key=None,\n ):\n """This object encapsulates various utilities for interacting with EMR clusters and invoking\n steps (jobs) on them.\n\n See also :py:class:`~dagster_aws.emr.EmrPySparkResource`, which wraps this job runner in a\n resource for pyspark workloads.\n\n Args:\n region (str): AWS region to use\n check_cluster_every (int, optional): How frequently to poll boto3 APIs for updates.\n Defaults to 30 seconds.\n aws_access_key_id ([type], optional): AWS access key ID. Defaults to None, which will\n use the default boto3 credentials chain.\n aws_secret_access_key ([type], optional): AWS secret access key. Defaults to None, which\n will use the default boto3 credentials chain.\n """\n self.region = check.str_param(region, "region")\n\n # This is in seconds\n self.check_cluster_every = check.int_param(check_cluster_every, "check_cluster_every")\n self.aws_access_key_id = check.opt_str_param(aws_access_key_id, "aws_access_key_id")\n self.aws_secret_access_key = check.opt_str_param(\n aws_secret_access_key, "aws_secret_access_key"\n )\n\n def make_emr_client(self):\n """Creates a boto3 EMR client. Construction is wrapped in retries in case client connection\n fails transiently.\n\n Returns:\n botocore.client.EMR: An EMR client\n """\n raw_emr_client = boto3.client(\n "emr",\n aws_access_key_id=self.aws_access_key_id,\n aws_secret_access_key=self.aws_secret_access_key,\n region_name=self.region,\n )\n return _wrap_aws_client(raw_emr_client, min_backoff=self.check_cluster_every)\n\n def cluster_id_from_name(self, cluster_name):\n """Get a cluster ID in the format "j-123ABC123ABC1" given a cluster name "my cool cluster".\n\n Args:\n cluster_name (str): The name of the cluster for which to find an ID\n\n Returns:\n str: The ID of the cluster\n\n Raises:\n EmrError: No cluster with the specified name exists\n """\n check.str_param(cluster_name, "cluster_name")\n\n response = self.make_emr_client().list_clusters().get("Clusters", [])\n for cluster in response:\n if cluster["Name"] == cluster_name:\n return cluster["Id"]\n\n raise EmrError(\n "cluster {cluster_name} not found in region {region}".format(\n cluster_name=cluster_name, region=self.region\n )\n )\n\n @staticmethod\n def construct_step_dict_for_command(step_name, command, action_on_failure="CONTINUE"):\n """Construct an EMR step definition which uses command-runner.jar to execute a shell command\n on the EMR master.\n\n Args:\n step_name (str): The name of the EMR step (will show up in the EMR UI)\n command (str): The shell command to execute with command-runner.jar\n action_on_failure (str, optional): Configure action on failure (e.g., continue, or\n terminate the cluster). 
Defaults to 'CONTINUE'.\n\n Returns:\n dict: Step definition dict\n """\n check.str_param(step_name, "step_name")\n check.list_param(command, "command", of_type=str)\n check.str_param(action_on_failure, "action_on_failure")\n\n return {\n "Name": step_name,\n "ActionOnFailure": action_on_failure,\n "HadoopJarStep": {"Jar": "command-runner.jar", "Args": command},\n }\n\n def add_tags(self, log, tags, cluster_id):\n """Add tags in the dict tags to cluster cluster_id.\n\n Args:\n log (DagsterLogManager): Log manager, for logging\n tags (dict): Dictionary of {'key': 'value'} tags\n cluster_id (str): The ID of the cluster to tag\n """\n check.dict_param(tags, "tags")\n check.str_param(cluster_id, "cluster_id")\n\n tags_items = sorted(tags.items())\n\n self.make_emr_client().add_tags(\n ResourceId=cluster_id, Tags=[dict(Key=k, Value=v) for k, v in tags_items]\n )\n\n log.info(\n "Added EMR tags to cluster %s: %s"\n % (cluster_id, ", ".join("%s=%s" % (tag, value) for tag, value in tags_items))\n )\n\n def run_job_flow(self, log, cluster_config):\n """Create an empty cluster on EMR, and return the ID of that job flow.\n\n Args:\n log (DagsterLogManager): Log manager, for logging\n cluster_config (dict): Configuration for this EMR job flow. See:\n https://docs.aws.amazon.com/emr/latest/APIReference/API_RunJobFlow.html\n\n Returns:\n str: The cluster ID, e.g. "j-ZKIY4CKQRX72"\n """\n check.dict_param(cluster_config, "cluster_config")\n\n log.debug("Creating Elastic MapReduce cluster")\n emr_client = self.make_emr_client()\n\n log.debug(\n "Calling run_job_flow(%s)"\n % (", ".join("%s=%r" % (k, v) for k, v in sorted(cluster_config.items())))\n )\n cluster_id = emr_client.run_job_flow(**cluster_config)["JobFlowId"]\n\n log.info("Created new cluster %s" % cluster_id)\n\n # set EMR tags for the cluster\n tags_items = cluster_config.get("Tags", [])\n tags = {k: v for k, v in tags_items}\n tags["__dagster_version"] = dagster.__version__\n self.add_tags(log, tags, cluster_id)\n return cluster_id\n\n def describe_cluster(self, cluster_id):\n """Thin wrapper over boto3 describe_cluster.\n\n Args:\n cluster_id (str): Cluster to inspect\n\n Returns:\n dict: The cluster info. See:\n https://docs.aws.amazon.com/emr/latest/APIReference/API_DescribeCluster.html\n """\n check.str_param(cluster_id, "cluster_id")\n\n emr_client = self.make_emr_client()\n return emr_client.describe_cluster(ClusterId=cluster_id)\n\n def describe_step(self, cluster_id, step_id):\n """Thin wrapper over boto3 describe_step.\n\n Args:\n cluster_id (str): Cluster to inspect\n step_id (str): Step ID to describe\n\n Returns:\n dict: The step info. 
See:\n https://docs.aws.amazon.com/emr/latest/APIReference/API_DescribeStep.html\n """\n check.str_param(cluster_id, "cluster_id")\n check.str_param(step_id, "step_id")\n\n emr_client = self.make_emr_client()\n return emr_client.describe_step(ClusterId=cluster_id, StepId=step_id)\n\n def add_job_flow_steps(self, log, cluster_id, step_defs):\n """Submit the constructed job flow steps to EMR for execution.\n\n Args:\n log (DagsterLogManager): Log manager, for logging\n cluster_id (str): The ID of the cluster\n step_defs (List[dict]): List of steps; see also `construct_step_dict_for_command`\n\n Returns:\n List[str]: list of step IDs.\n """\n check.str_param(cluster_id, "cluster_id")\n check.list_param(step_defs, "step_defs", of_type=dict)\n\n emr_client = self.make_emr_client()\n\n steps_kwargs = dict(JobFlowId=cluster_id, Steps=step_defs)\n log.debug(\n "Calling add_job_flow_steps(%s)"\n % ",".join(("%s=%r" % (k, v)) for k, v in steps_kwargs.items())\n )\n return emr_client.add_job_flow_steps(**steps_kwargs)["StepIds"]\n\n def is_emr_step_complete(self, log, cluster_id, emr_step_id):\n step = self.describe_step(cluster_id, emr_step_id)["Step"]\n step_state = EmrStepState(step["Status"]["State"])\n\n if step_state == EmrStepState.Pending:\n cluster = self.describe_cluster(cluster_id)["Cluster"]\n\n reason = _get_reason(cluster)\n reason_desc = (": %s" % reason) if reason else ""\n\n log.info("PENDING (cluster is %s%s)" % (cluster["Status"]["State"], reason_desc))\n return False\n\n elif step_state == EmrStepState.Running:\n time_running_desc = ""\n\n start = step["Status"]["Timeline"].get("StartDateTime")\n if start:\n time_running_desc = " for %s" % strip_microseconds(_boto3_now() - start)\n\n log.info("RUNNING%s" % time_running_desc)\n return False\n\n # we're done, will return at the end of this\n elif step_state == EmrStepState.Completed:\n log.info("COMPLETED")\n return True\n else:\n # step has failed somehow. *reason* seems to only be set\n # when job is cancelled (e.g. 'Job terminated')\n reason = _get_reason(step)\n reason_desc = (" (%s)" % reason) if reason else ""\n\n log.info("%s%s" % (step_state.value, reason_desc))\n\n # print cluster status; this might give more context\n # why step didn't succeed\n cluster = self.describe_cluster(cluster_id)["Cluster"]\n reason = _get_reason(cluster)\n reason_desc = (": %s" % reason) if reason else ""\n log.info(\n "Cluster %s %s %s%s"\n % (\n cluster["Id"],\n "was" if "ED" in cluster["Status"]["State"] else "is",\n cluster["Status"]["State"],\n reason_desc,\n )\n )\n\n if EmrClusterState(cluster["Status"]["State"]) in EMR_CLUSTER_TERMINATED_STATES:\n # was it caused by IAM roles?\n self._check_for_missing_default_iam_roles(log, cluster)\n\n # TODO: extract logs here to surface failure reason\n # See: https://github.com/dagster-io/dagster/issues/1954\n\n if step_state == EmrStepState.Failed:\n log.error("EMR step %s failed" % emr_step_id)\n\n raise EmrError("EMR step %s failed" % emr_step_id)\n\n def _check_for_missing_default_iam_roles(self, log, cluster):\n """If cluster couldn't start due to missing IAM roles, tell user what to do."""\n\n check.dict_param(cluster, "cluster")\n\n reason = _get_reason(cluster)\n if any(\n reason.endswith("/%s is invalid" % role)\n for role in (_FALLBACK_INSTANCE_PROFILE, _FALLBACK_SERVICE_ROLE)\n ):\n log.warning(\n "IAM roles are missing. 
See documentation for IAM roles on EMR here: "\n "https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-iam-roles.html"\n )\n\n def log_location_for_cluster(self, cluster_id):\n """EMR clusters are typically launched with S3 logging configured. This method inspects a\n cluster using boto3 describe_cluster to retrieve the log URI.\n\n Args:\n cluster_id (str): The cluster to inspect.\n\n Raises:\n EmrError: the log URI was missing (S3 log mirroring not enabled for this cluster)\n\n Returns:\n (str, str): log bucket and key\n """\n check.str_param(cluster_id, "cluster_id")\n\n # The S3 log URI is specified per job flow (cluster)\n log_uri = self.describe_cluster(cluster_id)["Cluster"].get("LogUri", None)\n\n # ugh, seriously boto3?! This will come back as string "None"\n if log_uri == "None" or log_uri is None:\n raise EmrError("Log URI not specified, cannot retrieve step execution logs")\n\n # For some reason the API returns an s3n:// protocol log URI instead of s3://\n log_uri = re.sub("^s3n", "s3", log_uri)\n log_uri_parsed = urlparse(log_uri)\n log_bucket = log_uri_parsed.netloc\n log_key_prefix = log_uri_parsed.path.lstrip("/")\n return log_bucket, log_key_prefix\n\n def retrieve_logs_for_step_id(self, log, cluster_id, step_id):\n """Retrieves stdout and stderr logs for the given step ID.\n\n Args:\n log (DagsterLogManager): Log manager, for logging\n cluster_id (str): EMR cluster ID\n step_id (str): EMR step ID for the job that was submitted.\n\n Returns\n (str, str): Tuple of stdout log string contents, and stderr log string contents\n """\n check.str_param(cluster_id, "cluster_id")\n check.str_param(step_id, "step_id")\n\n log_bucket, log_key_prefix = self.log_location_for_cluster(cluster_id)\n\n prefix = "{log_key_prefix}{cluster_id}/steps/{step_id}".format(\n log_key_prefix=log_key_prefix, cluster_id=cluster_id, step_id=step_id\n )\n stdout_log = self.wait_for_log(log, log_bucket, "{prefix}/stdout.gz".format(prefix=prefix))\n stderr_log = self.wait_for_log(log, log_bucket, "{prefix}/stderr.gz".format(prefix=prefix))\n return stdout_log, stderr_log\n\n def wait_for_log(self, log, log_bucket, log_key, waiter_delay=30, waiter_max_attempts=20):\n """Wait for gzipped EMR logs to appear on S3. 
Note that EMR syncs logs to S3 every 5\n minutes, so this may take a long time.\n\n Args:\n log_bucket (str): S3 bucket where log is expected to appear\n log_key (str): S3 key for the log file\n waiter_delay (int): How long to wait between attempts to check S3 for the log file\n waiter_max_attempts (int): Number of attempts before giving up on waiting\n\n Raises:\n EmrError: Raised if we waited the full duration and the logs did not appear\n\n Returns:\n str: contents of the log file\n """\n check.str_param(log_bucket, "log_bucket")\n check.str_param(log_key, "log_key")\n check.int_param(waiter_delay, "waiter_delay")\n check.int_param(waiter_max_attempts, "waiter_max_attempts")\n\n log.info(\n "Attempting to get log: s3://{log_bucket}/{log_key}".format(\n log_bucket=log_bucket, log_key=log_key\n )\n )\n\n s3 = _wrap_aws_client(boto3.client("s3"), min_backoff=self.check_cluster_every)\n waiter = s3.get_waiter("object_exists")\n try:\n waiter.wait(\n Bucket=log_bucket,\n Key=log_key,\n WaiterConfig={"Delay": waiter_delay, "MaxAttempts": waiter_max_attempts},\n )\n except WaiterError as err:\n raise EmrError("EMR log file did not appear on S3 after waiting") from err\n\n obj = BytesIO(s3.get_object(Bucket=log_bucket, Key=log_key)["Body"].read())\n gzip_file = gzip.GzipFile(fileobj=obj)\n return gzip_file.read().decode("utf-8")
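A rough usage sketch for EmrJobRunner; the region, cluster name, and command are placeholders, and a stdlib logger stands in for the DagsterLogManager these methods expect:

import logging

from dagster_aws.emr import EmrJobRunner

log = logging.getLogger(__name__)  # stand-in for a DagsterLogManager

runner = EmrJobRunner(region="us-west-2")                    # hypothetical region
cluster_id = runner.cluster_id_from_name("my cool cluster")  # hypothetical name

# construct_step_dict_for_command takes the command as a list of strings
step_def = EmrJobRunner.construct_step_dict_for_command(
    "say hello", ["echo", "hello"], action_on_failure="CONTINUE"
)
step_ids = runner.add_job_flow_steps(log, cluster_id, [step_def])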
\n\n\ndef _get_reason(cluster_or_step):\n """Get state change reason message."""\n # StateChangeReason is {} before the first state change\n return cluster_or_step["Status"]["StateChangeReason"].get("Message", "")\n
", "current_page_name": "_modules/dagster_aws/emr/emr", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_aws.emr.emr"}, "pyspark_step_launcher": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_aws.emr.pyspark_step_launcher

\nimport os\nimport pickle\nimport sys\nimport tempfile\nimport time\n\nimport boto3\nfrom botocore.exceptions import ClientError\nfrom dagster_aws.emr import EmrError, EmrJobRunner, emr_step_main\nfrom dagster_aws.emr.configs_spark import spark_config as get_spark_config\nfrom dagster_aws.utils.mrjob.log4j import parse_hadoop_log4j_records\n\nfrom dagster import Field, StringSource\nfrom dagster import _check as check\nfrom dagster import resource\nfrom dagster.core.definitions.step_launcher import StepLauncher\nfrom dagster.core.errors import DagsterInvariantViolationError, raise_execution_interrupts\nfrom dagster.core.execution.plan.external_step import (\n    PICKLED_EVENTS_FILE_NAME,\n    PICKLED_STEP_RUN_REF_FILE_NAME,\n    step_context_to_step_run_ref,\n)\nfrom dagster.serdes import deserialize_value\n\n# On EMR, Spark is installed here\nEMR_SPARK_HOME = "/usr/lib/spark/"\n\nCODE_ZIP_NAME = "code.zip"\n\n\n
[docs]@resource(\n {\n "spark_config": get_spark_config(),\n "cluster_id": Field(\n StringSource, description="Name of the job flow (cluster) on which to execute."\n ),\n "region_name": Field(StringSource, description="The AWS region that the cluster is in."),\n "action_on_failure": Field(\n str,\n is_required=False,\n default_value="CANCEL_AND_WAIT",\n description="The EMR action to take when the cluster step fails: "\n "https://docs.aws.amazon.com/emr/latest/APIReference/API_StepConfig.html",\n ),\n "staging_bucket": Field(\n StringSource,\n is_required=True,\n description="S3 bucket to use for passing files between the plan process and EMR "\n "process.",\n ),\n "staging_prefix": Field(\n StringSource,\n is_required=False,\n default_value="emr_staging",\n description="S3 key prefix inside the staging_bucket to use for files passed the plan "\n "process and EMR process",\n ),\n "wait_for_logs": Field(\n bool,\n is_required=False,\n default_value=False,\n description="If set, the system will wait for EMR logs to appear on S3. Note that logs "\n "are copied every 5 minutes, so enabling this will add several minutes to the job "\n "runtime.",\n ),\n "local_job_package_path": Field(\n StringSource,\n is_required=False,\n description="Absolute path to the package that contains the job definition(s) "\n "whose steps will execute remotely on EMR. This is a path on the local fileystem of "\n "the process executing the job. The expectation is that this package will "\n "also be available on the python path of the launched process running the Spark step "\n "on EMR, either deployed on step launch via the deploy_local_job_package option, "\n "referenced on s3 via the s3_job_package_path option, or installed on the cluster "\n "via bootstrap actions.",\n ),\n "local_pipeline_package_path": Field(\n StringSource,\n is_required=False,\n description="(legacy) Absolute path to the package that contains the pipeline definition(s) "\n "whose steps will execute remotely on EMR. This is a path on the local fileystem of "\n "the process executing the pipeline. The expectation is that this package will "\n "also be available on the python path of the launched process running the Spark step "\n "on EMR, either deployed on step launch via the deploy_local_pipeline_package option, "\n "referenced on s3 via the s3_pipeline_package_path option, or installed on the cluster "\n "via bootstrap actions.",\n ),\n "deploy_local_job_package": Field(\n bool,\n default_value=False,\n is_required=False,\n description="If set, before every step run, the launcher will zip up all the code in "\n "local_job_package_path, upload it to s3, and pass it to spark-submit's "\n "--py-files option. This gives the remote process access to up-to-date user code. "\n "If not set, the assumption is that some other mechanism is used for distributing code "\n "to the EMR cluster. If this option is set to True, s3_job_package_path should "\n "not also be set.",\n ),\n "deploy_local_pipeline_package": Field(\n bool,\n default_value=False,\n is_required=False,\n description="(legacy) If set, before every step run, the launcher will zip up all the code in "\n "local_job_package_path, upload it to s3, and pass it to spark-submit's "\n "--py-files option. This gives the remote process access to up-to-date user code. "\n "If not set, the assumption is that some other mechanism is used for distributing code "\n "to the EMR cluster. 
If this option is set to True, s3_job_package_path should "\n "not also be set.",\n ),\n "s3_job_package_path": Field(\n StringSource,\n is_required=False,\n description="If set, this path will be passed to the --py-files option of spark-submit. "\n "This should usually be a path to a zip file. If this option is set, "\n "deploy_local_job_package should not be set to True.",\n ),\n "s3_pipeline_package_path": Field(\n StringSource,\n is_required=False,\n description="If set, this path will be passed to the --py-files option of spark-submit. "\n "This should usually be a path to a zip file. If this option is set, "\n "deploy_local_pipeline_package should not be set to True.",\n ),\n }\n)\ndef emr_pyspark_step_launcher(context):\n\n # Resolve legacy arguments\n if context.resource_config.get("local_job_package_path") and context.resource_config.get(\n "local_pipeline_package_path"\n ):\n raise DagsterInvariantViolationError(\n "Provided both ``local_job_package_path`` and legacy version "\n "``local_pipeline_package_path`` arguments to ``emr_pyspark_step_launcher`` "\n "resource. Please choose one or the other."\n )\n\n if not context.resource_config.get(\n "local_job_package_path"\n ) and not context.resource_config.get("local_pipeline_package_path"):\n raise DagsterInvariantViolationError(\n "For resource ``emr_pyspark_step_launcher``, no config value provided for required "\n "schema entry ``local_job_package_path``."\n )\n\n local_job_package_path = context.resource_config.get(\n "local_job_package_path"\n ) or context.resource_config.get("local_pipeline_package_path")\n\n if context.resource_config.get("deploy_local_job_package") and context.resource_config.get(\n "deploy_local_job_package"\n ):\n raise DagsterInvariantViolationError(\n "Provided both ``deploy_local_job_package`` and legacy version "\n "``deploy_local_pipeline_package`` arguments to ``emr_pyspark_step_launcher`` "\n "resource. Please choose one or the other."\n )\n\n deploy_local_job_package = context.resource_config.get(\n "deploy_local_job_package"\n ) or context.resource_config.get("deploy_local_pipeline_package")\n\n if context.resource_config.get("s3_job_package_path") and context.resource_config.get(\n "s3_pipeline_package_path"\n ):\n raise DagsterInvariantViolationError(\n "Provided both ``s3_job_package_path`` and legacy version "\n "``s3_pipeline_package_path`` arguments to ``emr_pyspark_step_launcher`` "\n "resource. Please choose one or the other."\n )\n\n s3_job_package_path = context.resource_config.get(\n "s3_job_package_path"\n ) or context.resource_config.get("s3_pipeline_package_path")\n\n return EmrPySparkStepLauncher(\n region_name=context.resource_config.get("region_name"),\n staging_bucket=context.resource_config.get("staging_bucket"),\n staging_prefix=context.resource_config.get("staging_prefix"),\n wait_for_logs=context.resource_config.get("wait_for_logs"),\n action_on_failure=context.resource_config.get("action_on_failure"),\n cluster_id=context.resource_config.get("cluster_id"),\n spark_config=context.resource_config.get("spark_config"),\n local_job_package_path=local_job_package_path,\n deploy_local_job_package=deploy_local_job_package,\n s3_job_package_path=s3_job_package_path,\n )
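A sketch of wiring the step launcher resource into a job. The bucket, cluster id, and package path are hypothetical, and a pyspark resource (from dagster_pyspark) would normally be configured alongside it:

import os

from dagster import graph, op
from dagster_aws.emr import emr_pyspark_step_launcher


@op(required_resource_keys={"pyspark_step_launcher"})
def my_pyspark_op(context):
    ...


@graph
def my_graph():
    my_pyspark_op()


# Hypothetical configuration values; a dagster_pyspark "pyspark" resource
# would typically also be supplied for the op body itself.
my_job = my_graph.to_job(
    resource_defs={
        "pyspark_step_launcher": emr_pyspark_step_launcher.configured(
            {
                "cluster_id": {"env": "EMR_CLUSTER_ID"},
                "region_name": "us-west-2",
                "staging_bucket": "my-staging-bucket",
                "deploy_local_job_package": True,
                "local_job_package_path": os.path.dirname(os.path.abspath(__file__)),
            }
        ),
    }
)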
\n\n\nemr_pyspark_step_launcher.__doc__ = "\\n".join(\n "- **" + option + "**: " + (field.description or "")\n for option, field in emr_pyspark_step_launcher.config_schema.config_type.fields.items()\n)\n\n\nclass EmrPySparkStepLauncher(StepLauncher):\n def __init__(\n self,\n region_name,\n staging_bucket,\n staging_prefix,\n wait_for_logs,\n action_on_failure,\n cluster_id,\n spark_config,\n local_job_package_path,\n deploy_local_job_package,\n s3_job_package_path=None,\n ):\n self.region_name = check.str_param(region_name, "region_name")\n self.staging_bucket = check.str_param(staging_bucket, "staging_bucket")\n self.staging_prefix = check.str_param(staging_prefix, "staging_prefix")\n self.wait_for_logs = check.bool_param(wait_for_logs, "wait_for_logs")\n self.action_on_failure = check.str_param(action_on_failure, "action_on_failure")\n self.cluster_id = check.str_param(cluster_id, "cluster_id")\n self.spark_config = spark_config\n\n check.invariant(\n not deploy_local_job_package or not s3_job_package_path,\n "If deploy_local_job_package is set to True, s3_job_package_path should not "\n "also be set.",\n )\n\n self.local_job_package_path = check.str_param(\n local_job_package_path, "local_job_package_path"\n )\n self.deploy_local_job_package = check.bool_param(\n deploy_local_job_package, "deploy_local_job_package"\n )\n self.s3_job_package_path = check.opt_str_param(s3_job_package_path, "s3_job_package_path")\n\n self.emr_job_runner = EmrJobRunner(region=self.region_name)\n\n def _post_artifacts(self, log, step_run_ref, run_id, step_key):\n """\n Synchronize the step run ref and pyspark code to an S3 staging bucket for use on EMR.\n\n For the zip file, consider the following toy example:\n\n # Folder: my_pyspark_project/\n # a.py\n def foo():\n print(1)\n\n # b.py\n def bar():\n print(2)\n\n # main.py\n from a import foo\n from b import bar\n\n foo()\n bar()\n\n This will zip up `my_pyspark_project/` as `my_pyspark_project.zip`. 
Then, when running\n `spark-submit --py-files my_pyspark_project.zip emr_step_main.py` on EMR this will\n print 1, 2.\n """\n from dagster_pyspark.utils import build_pyspark_zip\n\n with tempfile.TemporaryDirectory() as temp_dir:\n s3 = boto3.client("s3", region_name=self.region_name)\n\n # Upload step run ref\n def _upload_file_to_s3(local_path, s3_filename):\n key = self._artifact_s3_key(run_id, step_key, s3_filename)\n s3_uri = self._artifact_s3_uri(run_id, step_key, s3_filename)\n log.debug(\n "Uploading file {local_path} to {s3_uri}".format(\n local_path=local_path, s3_uri=s3_uri\n )\n )\n s3.upload_file(Filename=local_path, Bucket=self.staging_bucket, Key=key)\n\n # Upload main file.\n # The remote Dagster installation should also have the file, but locating it there\n # could be a pain.\n main_local_path = self._main_file_local_path()\n _upload_file_to_s3(main_local_path, self._main_file_name())\n\n if self.deploy_local_job_package:\n # Zip and upload package containing job\n zip_local_path = os.path.join(temp_dir, CODE_ZIP_NAME)\n\n build_pyspark_zip(zip_local_path, self.local_job_package_path)\n _upload_file_to_s3(zip_local_path, CODE_ZIP_NAME)\n\n # Create step run ref pickle file\n step_run_ref_local_path = os.path.join(temp_dir, PICKLED_STEP_RUN_REF_FILE_NAME)\n with open(step_run_ref_local_path, "wb") as step_pickle_file:\n pickle.dump(step_run_ref, step_pickle_file)\n\n _upload_file_to_s3(step_run_ref_local_path, PICKLED_STEP_RUN_REF_FILE_NAME)\n\n def launch_step(self, step_context, prior_attempts_count):\n step_run_ref = step_context_to_step_run_ref(\n step_context, prior_attempts_count, self.local_job_package_path\n )\n\n run_id = step_context.pipeline_run.run_id\n log = step_context.log\n\n step_key = step_run_ref.step_key\n self._post_artifacts(log, step_run_ref, run_id, step_key)\n\n emr_step_def = self._get_emr_step_def(run_id, step_key, step_context.solid.name)\n emr_step_id = self.emr_job_runner.add_job_flow_steps(log, self.cluster_id, [emr_step_def])[\n 0\n ]\n\n yield from self.wait_for_completion_and_log(run_id, step_key, emr_step_id, step_context)\n\n def wait_for_completion_and_log(self, run_id, step_key, emr_step_id, step_context):\n s3 = boto3.resource("s3", region_name=self.region_name)\n try:\n for event in self.wait_for_completion(step_context, s3, run_id, step_key, emr_step_id):\n yield event\n except EmrError as emr_error:\n if self.wait_for_logs:\n self._log_logs_from_s3(step_context.log, emr_step_id)\n raise emr_error\n\n if self.wait_for_logs:\n self._log_logs_from_s3(step_context.log, emr_step_id)\n\n def wait_for_completion(\n self, step_context, s3, run_id, step_key, emr_step_id, check_interval=15\n ):\n """We want to wait for the EMR steps to complete, and while that's happening, we want to\n yield any events that have been written to S3 for us by the remote process.\n After the the EMR steps complete, we want a final chance to fetch events before finishing\n the step.\n """\n done = False\n all_events = []\n # If this is being called within a `capture_interrupts` context, allow interrupts\n # while waiting for the pyspark execution to complete, so that we can terminate slow or\n # hanging steps\n while not done:\n with raise_execution_interrupts():\n time.sleep(check_interval) # AWS rate-limits us if we poll it too often\n done = self.emr_job_runner.is_emr_step_complete(\n step_context.log, self.cluster_id, emr_step_id\n )\n\n all_events_new = self.read_events(s3, run_id, step_key)\n\n if len(all_events_new) > len(all_events):\n for i in 
range(len(all_events), len(all_events_new)):\n event = all_events_new[i]\n # write each event from the EMR instance to the local instance\n step_context.instance.handle_new_event(event)\n if event.is_dagster_event:\n yield event.dagster_event\n all_events = all_events_new\n\n def read_events(self, s3, run_id, step_key):\n events_s3_obj = s3.Object( # pylint: disable=no-member\n self.staging_bucket, self._artifact_s3_key(run_id, step_key, PICKLED_EVENTS_FILE_NAME)\n )\n\n try:\n events_data = events_s3_obj.get()["Body"].read()\n return deserialize_value(pickle.loads(events_data))\n except ClientError as ex:\n # The file might not be there yet, which is fine\n if ex.response["Error"]["Code"] == "NoSuchKey":\n return []\n else:\n raise ex\n\n def _log_logs_from_s3(self, log, emr_step_id):\n """Retrieves the logs from the remote PySpark process that EMR posted to S3 and logs\n them to the given log."""\n stdout_log, stderr_log = self.emr_job_runner.retrieve_logs_for_step_id(\n log, self.cluster_id, emr_step_id\n )\n # Since stderr is YARN / Hadoop Log4J output, parse and reformat those log lines for\n # Dagster's logging system.\n records = parse_hadoop_log4j_records(stderr_log)\n for record in records:\n if record.level:\n log.log(\n level=record.level,\n msg="".join(["Spark Driver stderr: ", record.logger, ": ", record.message]),\n )\n else:\n log.debug(f"Spark Driver stderr: {record.message}")\n\n sys.stdout.write(\n "---------- Spark Driver stdout: ----------\\n"\n + stdout_log\n + "\\n"\n + "---------- End of Spark Driver stdout ----------\\n"\n )\n\n def _get_emr_step_def(self, run_id, step_key, solid_name):\n """From the local Dagster instance, construct EMR steps that will kick off execution on a\n remote EMR cluster.\n """\n from dagster_spark.utils import flatten_dict, format_for_cli\n\n action_on_failure = self.action_on_failure\n\n # Execute Solid via spark-submit\n conf = dict(flatten_dict(self.spark_config))\n conf["spark.app.name"] = conf.get("spark.app.name", solid_name)\n\n check.invariant(\n conf.get("spark.master", "yarn") == "yarn",\n desc="spark.master is configured as %s; cannot set Spark master on EMR to anything "\n 'other than "yarn"' % conf.get("spark.master"),\n )\n\n command = (\n [\n EMR_SPARK_HOME + "bin/spark-submit",\n "--master",\n "yarn",\n "--deploy-mode",\n conf.get("spark.submit.deployMode", "client"),\n ]\n + format_for_cli(list(flatten_dict(conf)))\n + [\n "--py-files",\n self._artifact_s3_uri(run_id, step_key, CODE_ZIP_NAME),\n self._artifact_s3_uri(run_id, step_key, self._main_file_name()),\n self.staging_bucket,\n self._artifact_s3_key(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),\n ]\n )\n\n return EmrJobRunner.construct_step_dict_for_command(\n "Execute Solid/Op %s" % solid_name, command, action_on_failure=action_on_failure\n )\n\n def _main_file_name(self):\n return os.path.basename(self._main_file_local_path())\n\n def _main_file_local_path(self):\n return emr_step_main.__file__\n\n def _sanitize_step_key(self, step_key: str) -> str:\n # step_keys of dynamic steps contain brackets, which are invalid characters\n return step_key.replace("[", "__").replace("]", "__")\n\n def _artifact_s3_uri(self, run_id, step_key, filename):\n key = self._artifact_s3_key(run_id, self._sanitize_step_key(step_key), filename)\n return "s3://{bucket}/{key}".format(bucket=self.staging_bucket, key=key)\n\n def _artifact_s3_key(self, run_id, step_key, filename):\n return "/".join(\n [\n self.staging_prefix,\n run_id,\n self._sanitize_step_key(step_key),\n 
os.path.basename(filename),\n ]\n )\n
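For illustration, the helpers above lay out artifacts under <staging_prefix>/<run_id>/<sanitized step key>/<file name>, with brackets in dynamic step keys rewritten so the key is S3-safe:

# Illustration only (hypothetical values):
#
#   "my_op[partition_a]"  ->  "my_op__partition_a__"
#   key = "emr_staging/<run_id>/my_op__partition_a__/code.zip"
#   uri = "s3://<staging_bucket>/" + key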
", "current_page_name": "_modules/dagster_aws/emr/pyspark_step_launcher", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_aws.emr.pyspark_step_launcher"}, "types": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_aws.emr.types

\nfrom enum import Enum as PyEnum\n\nfrom dagster import Enum, EnumValue\n\nEbsVolumeType = Enum(\n    name="EbsVolumeType", enum_values=[EnumValue("gp2"), EnumValue("io1"), EnumValue("standard")]\n)\n\n\n
[docs]class EmrClusterState(PyEnum):\n Starting = "STARTING"\n Bootstrapping = "BOOTSTRAPPING"\n Running = "RUNNING"\n Waiting = "WAITING"\n Terminating = "TERMINATING"\n Terminated = "TERMINATED"\n TerminatedWithErrors = "TERMINATED_WITH_ERRORS"
\n\n\nEMR_CLUSTER_TERMINATED_STATES = [\n EmrClusterState.Terminating,\n EmrClusterState.Terminated,\n EmrClusterState.TerminatedWithErrors,\n]\n\nEMR_CLUSTER_DONE_STATES = EMR_CLUSTER_TERMINATED_STATES + [EmrClusterState.Waiting]\n\n\n
[docs]class EmrStepState(PyEnum):\n Pending = "PENDING"\n Running = "RUNNING"\n Continue = "CONTINUE"\n Completed = "COMPLETED"\n Cancelled = "CANCELLED"\n Failed = "FAILED"\n Interrupted = "INTERRUPTED"
\n\n\nEmrActionOnFailure = Enum(\n name="EmrActionOnFailure",\n enum_values=[\n EnumValue("TERMINATE_JOB_FLOW"),\n EnumValue("TERMINATE_CLUSTER"),\n EnumValue("CANCEL_AND_WAIT"),\n EnumValue("CONTINUE"),\n ],\n)\n\nEmrAdjustmentType = Enum(\n name="EmrAdjustmentType",\n enum_values=[\n EnumValue("CHANGE_IN_CAPACITY"),\n EnumValue("PERCENT_CHANGE_IN_CAPACITY"),\n EnumValue("EXACT_CAPACITY"),\n ],\n)\n\nEmrComparisonOperator = Enum(\n name="EmrComparisonOperator",\n enum_values=[\n EnumValue("GREATER_THAN_OR_EQUAL"),\n EnumValue("GREATER_THAN"),\n EnumValue("LESS_THAN"),\n EnumValue("LESS_THAN_OR_EQUAL"),\n ],\n)\n\nEmrInstanceRole = Enum(\n name="EmrInstanceRole", enum_values=[EnumValue("MASTER"), EnumValue("CORE"), EnumValue("TASK")]\n)\n\nEmrMarket = Enum(name="EmrMarket", enum_values=[EnumValue("ON_DEMAND"), EnumValue("SPOT")])\n\nEmrRepoUpgradeOnBoot = Enum(\n name="EmrRepoUpgradeOnBoot", enum_values=[EnumValue("SECURITY"), EnumValue("NONE")]\n)\n\nEmrScaleDownBehavior = Enum(\n name="EmrScaleDownBehavior",\n enum_values=[\n EnumValue("TERMINATE_AT_INSTANCE_HOUR"),\n EnumValue("TERMINATE_AT_TASK_COMPLETION"),\n ],\n)\n\nEmrStatistic = Enum(\n name="EmrStatistic",\n enum_values=[\n EnumValue("SAMPLE_COUNT"),\n EnumValue("AVERAGE"),\n EnumValue("SUM"),\n EnumValue("MINIMUM"),\n EnumValue("MAXIMUM"),\n ],\n)\n\nEmrSupportedProducts = Enum(\n name="EmrSupportedProducts", enum_values=[EnumValue("mapr-m3"), EnumValue("mapr-m5")]\n)\n\nEmrTimeoutAction = Enum(\n name="EmrTimeoutAction",\n enum_values=[EnumValue("SWITCH_TO_ON_DEMAND"), EnumValue("TERMINATE_CLUSTER")],\n)\n\nEmrUnit = Enum(\n name="EmrUnit",\n enum_values=[\n EnumValue("NONE"),\n EnumValue("SECONDS"),\n EnumValue("MICRO_SECONDS"),\n EnumValue("MILLI_SECONDS"),\n EnumValue("BYTES"),\n EnumValue("KILO_BYTES"),\n EnumValue("MEGA_BYTES"),\n EnumValue("GIGA_BYTES"),\n EnumValue("TERA_BYTES"),\n EnumValue("BITS"),\n EnumValue("KILO_BITS"),\n EnumValue("MEGA_BITS"),\n EnumValue("GIGA_BITS"),\n EnumValue("TERA_BITS"),\n EnumValue("PERCENT"),\n EnumValue("COUNT"),\n EnumValue("BYTES_PER_SECOND"),\n EnumValue("KILO_BYTES_PER_SECOND"),\n EnumValue("MEGA_BYTES_PER_SECOND"),\n EnumValue("GIGA_BYTES_PER_SECOND"),\n EnumValue("TERA_BYTES_PER_SECOND"),\n EnumValue("BITS_PER_SECOND"),\n EnumValue("KILO_BITS_PER_SECOND"),\n EnumValue("MEGA_BITS_PER_SECOND"),\n EnumValue("GIGA_BITS_PER_SECOND"),\n EnumValue("TERA_BITS_PER_SECOND"),\n EnumValue("COUNT_PER_SECOND"),\n ],\n)\n
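For reference, these state enums are typically consumed the way is_emr_step_complete does; a small sketch with a hypothetical state string:

# Sketch: map a raw state string from describe_cluster into the enum and
# test it against the terminal-state list defined above.
raw_state = "TERMINATED_WITH_ERRORS"  # hypothetical value from the EMR API
if EmrClusterState(raw_state) in EMR_CLUSTER_TERMINATED_STATES:
    print("cluster reached a terminal state")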
", "current_page_name": "_modules/dagster_aws/emr/types", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_aws.emr.types"}}, "redshift": {"resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_aws.redshift.resources

\nimport abc\nfrom contextlib import contextmanager\n\nimport psycopg2\nimport psycopg2.extensions\n\nfrom dagster import Field, IntSource, StringSource\nfrom dagster import _check as check\nfrom dagster import resource\n\n\nclass RedshiftError(Exception):\n    pass\n\n\nclass _BaseRedshiftResource(abc.ABC):\n    def __init__(self, context):  # pylint: disable=too-many-locals\n        # Extract parameters from resource config\n        self.conn_args = {\n            k: context.resource_config.get(k)\n            for k in (\n                "host",\n                "port",\n                "user",\n                "password",\n                "database",\n                "schema",\n                "connect_timeout",\n                "sslmode",\n            )\n            if context.resource_config.get(k) is not None\n        }\n\n        self.autocommit = context.resource_config.get("autocommit")\n        self.log = context.log\n\n    @abc.abstractmethod\n    def execute_query(self, query, fetch_results=False, cursor_factory=None, error_callback=None):\n        pass\n\n    @abc.abstractmethod\n    def execute_queries(\n        self, queries, fetch_results=False, cursor_factory=None, error_callback=None\n    ):\n        pass\n\n\nclass RedshiftResource(_BaseRedshiftResource):\n    def execute_query(self, query, fetch_results=False, cursor_factory=None, error_callback=None):\n        """Synchronously execute a single query against Redshift. Will return a list of rows, where\n        each row is a tuple of values, e.g. SELECT 1 will return [(1,)].\n\n        Args:\n            query (str): The query to execute.\n            fetch_results (Optional[bool]): Whether to return the results of executing the query.\n                Defaults to False, in which case the query will be executed without retrieving the\n                results.\n            cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative\n                cursor_factory; defaults to None. Will be used when constructing the cursor.\n            error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A\n                callback function, invoked when an exception is encountered during query execution;\n                this is intended to support executing additional queries to provide diagnostic\n                information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. If no\n                function is provided, exceptions during query execution will be raised directly.\n\n        Returns:\n            Optional[List[Tuple[Any, ...]]]: Results of the query, as a list of tuples, when\n                fetch_results is set. 
Otherwise return None.\n        """\n        check.str_param(query, "query")\n        check.bool_param(fetch_results, "fetch_results")\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n        check.opt_callable_param(error_callback, "error_callback")\n\n        with self._get_conn() as conn:\n            with self._get_cursor(conn, cursor_factory=cursor_factory) as cursor:\n                try:\n                    self.log.info("Executing query '{query}'".format(query=query))\n                    cursor.execute(query)\n\n                    if fetch_results and cursor.rowcount > 0:\n                        return cursor.fetchall()\n                    else:\n                        self.log.info("Empty result from query")\n\n                except Exception as e:\n                    # If autocommit is disabled or not set (it is disabled by default), Redshift\n                    # will be in the middle of a transaction at exception time, and because of\n                    # the failure the current transaction will not accept any further queries.\n                    #\n                    # This conn.commit() call closes the open transaction before handing off\n                    # control to the error callback, so that the user can issue additional\n                    # queries. Notably, for e.g. pg_last_copy_id() to work, it requires you to\n                    # use the same conn/cursor, so you have to do this conn.commit() to ensure\n                    # things are in a usable state in the error callback.\n                    if not self.autocommit:\n                        conn.commit()\n\n                    if error_callback is not None:\n                        error_callback(e, cursor, self.log)\n                    else:\n                        raise\n\n    def execute_queries(\n        self, queries, fetch_results=False, cursor_factory=None, error_callback=None\n    ):\n        """Synchronously execute a list of queries against Redshift. Will return a list of list of\n        rows, where each row is a tuple of values, e.g. ['SELECT 1', 'SELECT 1'] will return\n        [[(1,)], [(1,)]].\n\n        Args:\n            queries (List[str]): The queries to execute.\n            fetch_results (Optional[bool]): Whether to return the results of executing the query.\n                Defaults to False, in which case the query will be executed without retrieving the\n                results.\n            cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative\n            cursor_factory; defaults to None. Will be used when constructing the cursor.\n            error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A\n                callback function, invoked when an exception is encountered during query execution;\n                this is intended to support executing additional queries to provide diagnostic\n                information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. If no\n                function is provided, exceptions during query execution will be raised directly.\n\n        Returns:\n            Optional[List[List[Tuple[Any, ...]]]]: Results of the query, as a list of list of\n                tuples, when fetch_results is set. 
Otherwise return None.\n        """\n        check.list_param(queries, "queries", of_type=str)\n        check.bool_param(fetch_results, "fetch_results")\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n        check.opt_callable_param(error_callback, "error_callback")\n\n        results = []\n        with self._get_conn() as conn:\n            with self._get_cursor(conn, cursor_factory=cursor_factory) as cursor:\n                for query in queries:\n                    try:\n                        self.log.info("Executing query '{query}'".format(query=query))\n                        cursor.execute(query)\n\n                        if fetch_results and cursor.rowcount > 0:\n                            results.append(cursor.fetchall())\n                        else:\n                            results.append([])\n                            self.log.info("Empty result from query")\n\n                    except Exception as e:\n                        # If autocommit is disabled or not set (it is disabled by default), Redshift\n                        # will be in the middle of a transaction at exception time, and because of\n                        # the failure the current transaction will not accept any further queries.\n                        #\n                        # This conn.commit() call closes the open transaction before handing off\n                        # control to the error callback, so that the user can issue additional\n                        # queries. Notably, for e.g. pg_last_copy_id() to work, it requires you to\n                        # use the same conn/cursor, so you have to do this conn.commit() to ensure\n                        # things are in a usable state in the error callback.\n                        if not self.autocommit:\n                            conn.commit()\n\n                        if error_callback is not None:\n                            error_callback(e, cursor, self.log)\n                        else:\n                            raise\n\n        if fetch_results:\n            return results\n\n    @contextmanager\n    def _get_conn(self):\n        conn = None\n        try:\n            conn = psycopg2.connect(**self.conn_args)\n            yield conn\n        finally:\n            if conn:\n                conn.close()\n\n    @contextmanager\n    def _get_cursor(self, conn, cursor_factory=None):\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n\n        # Could be none, in which case we should respect the connection default. Otherwise\n        # explicitly set to true/false.\n        if self.autocommit is not None:\n            conn.autocommit = self.autocommit\n\n        with conn:\n            with conn.cursor(cursor_factory=cursor_factory) as cursor:\n                yield cursor\n\n            # If autocommit is set, we'll commit after each and every query execution. 
Otherwise, we\n            # want to do a final commit after we're wrapped up executing the full set of one or more\n            # queries.\n            if not self.autocommit:\n                conn.commit()\n\n\nclass FakeRedshiftResource(_BaseRedshiftResource):\n    QUERY_RESULT = [(1,)]\n\n    def execute_query(self, query, fetch_results=False, cursor_factory=None, error_callback=None):\n        """Fake for execute_query; returns [self.QUERY_RESULT]\n\n        Args:\n            query (str): The query to execute.\n            fetch_results (Optional[bool]): Whether to return the results of executing the query.\n                Defaults to False, in which case the query will be executed without retrieving the\n                results.\n            cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative\n                cursor_factory; defaults to None. Will be used when constructing the cursor.\n            error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A\n                callback function, invoked when an exception is encountered during query execution;\n                this is intended to support executing additional queries to provide diagnostic\n                information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. If no\n                function is provided, exceptions during query execution will be raised directly.\n\n        Returns:\n            Optional[List[Tuple[Any, ...]]]: Results of the query, as a list of tuples, when\n                fetch_results is set. Otherwise return None.\n        """\n        check.str_param(query, "query")\n        check.bool_param(fetch_results, "fetch_results")\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n        check.opt_callable_param(error_callback, "error_callback")\n\n        self.log.info("Executing query '{query}'".format(query=query))\n        if fetch_results:\n            return self.QUERY_RESULT\n\n    def execute_queries(\n        self, queries, fetch_results=False, cursor_factory=None, error_callback=None\n    ):\n        """Fake for execute_queries; returns [self.QUERY_RESULT] * 3\n\n        Args:\n            queries (List[str]): The queries to execute.\n            fetch_results (Optional[bool]): Whether to return the results of executing the query.\n                Defaults to False, in which case the query will be executed without retrieving the\n                results.\n            cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative\n                cursor_factory; defaults to None. Will be used when constructing the cursor.\n            error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A\n                callback function, invoked when an exception is encountered during query execution;\n                this is intended to support executing additional queries to provide diagnostic\n                information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. If no\n                function is provided, exceptions during query execution will be raised directly.\n\n        Returns:\n            Optional[List[List[Tuple[Any, ...]]]]: Results of the query, as a list of list of\n                tuples, when fetch_results is set. 
Otherwise return None.\n        """\n        check.list_param(queries, "queries", of_type=str)\n        check.bool_param(fetch_results, "fetch_results")\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n        check.opt_callable_param(error_callback, "error_callback")\n\n        for query in queries:\n            self.log.info("Executing query '{query}'".format(query=query))\n        if fetch_results:\n            return [self.QUERY_RESULT] * 3\n\n\ndef define_redshift_config():\n    """Redshift configuration. See the Redshift documentation for reference:\n\n    https://docs.aws.amazon.com/redshift/latest/mgmt/connecting-to-cluster.html\n    """\n\n    return {\n        "host": Field(StringSource, description="Redshift host", is_required=True),\n        "port": Field(\n            IntSource, description="Redshift port", is_required=False, default_value=5439\n        ),\n        "user": Field(\n            StringSource,\n            description="Username for Redshift connection",\n            is_required=False,\n        ),\n        "password": Field(\n            StringSource,\n            description="Password for Redshift connection",\n            is_required=False,\n        ),\n        "database": Field(\n            StringSource,\n            description="Name of the default database to use. After login, you can use USE DATABASE"\n            " to change the database.",\n            is_required=False,\n        ),\n        "schema": Field(\n            StringSource,\n            description="Name of the default schema to use. After login, you can use USE SCHEMA to "\n            "change the schema.",\n            is_required=False,\n        ),\n        "autocommit": Field(\n            bool,\n            description="None by default, which honors the Redshift parameter AUTOCOMMIT. Set to "\n            "True or False to enable or disable autocommit mode in the session, respectively.",\n            is_required=False,\n        ),\n        "connect_timeout": Field(\n            int,\n            description="Connection timeout in seconds. 5 seconds by default",\n            is_required=False,\n            default_value=5,\n        ),\n        "sslmode": Field(\n            str,\n            description="SSL mode to use. See the Redshift documentation for more information on "\n            "usage: https://docs.aws.amazon.com/redshift/latest/mgmt/connecting-ssl-support.html",\n            is_required=False,\n            default_value="require",\n        ),\n    }\n\n\n
[docs]@resource(\n config_schema=define_redshift_config(),\n description="Resource for connecting to the Redshift data warehouse",\n)\ndef redshift_resource(context):\n """This resource enables connecting to a Redshift cluster and issuing queries against that\n cluster.\n\n Example:\n\n .. code-block:: python\n\n from dagster import build_op_context, op\n from dagster_aws.redshift import redshift_resource\n\n @op(required_resource_keys={'redshift'})\n def example_redshift_op(context):\n return context.resources.redshift.execute_query('SELECT 1', fetch_results=True)\n\n redshift_configured = redshift_resource.configured({\n 'host': 'my-redshift-cluster.us-east-1.redshift.amazonaws.com',\n 'port': 5439,\n 'user': 'dagster',\n 'password': 'dagster',\n 'database': 'dev',\n })\n context = build_op_context(resources={'redshift': redshift_configured})\n assert example_redshift_op(context) == [(1,)]\n\n """\n return RedshiftResource(context)
\n\n\n
[docs]@resource(\n config_schema=define_redshift_config(),\n description="Fake resource for connecting to the Redshift data warehouse. Usage is identical "\n "to the real redshift_resource. Will always return [(1,)] for the single query case and "\n "[[(1,)], [(1,)], [(1,)]] for the multi query case.",\n)\ndef fake_redshift_resource(context):\n return FakeRedshiftResource(context)
\n
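# Hedged usage sketch (not part of the dagster_aws source above): exercising
# fake_redshift_resource in a test, mirroring the example from the redshift_resource docstring.
# The op and test names are illustrative assumptions.
from dagster import build_op_context, op


@op(required_resource_keys={"redshift"})
def example_redshift_op(context):
    return context.resources.redshift.execute_query("SELECT 1", fetch_results=True)


def test_example_redshift_op_with_fake_resource():
    fake = fake_redshift_resource.configured({"host": "localhost", "database": "dev"})
    context = build_op_context(resources={"redshift": fake})
    # FakeRedshiftResource.execute_query always returns [(1,)] when fetch_results is True.
    assert example_redshift_op(context) == [(1,)]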
", "current_page_name": "_modules/dagster_aws/redshift/resources", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_aws.redshift.resources"}}, "s3": {"compute_log_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_aws.s3.compute_log_manager

\nimport os\nfrom contextlib import contextmanager\n\nimport boto3\nfrom botocore.errorfactory import ClientError\n\nimport dagster.seven as seven\nfrom dagster import Field, StringSource\nfrom dagster import _check as check\nfrom dagster.core.storage.compute_log_manager import (\n    MAX_BYTES_FILE_READ,\n    ComputeIOType,\n    ComputeLogFileData,\n    ComputeLogManager,\n)\nfrom dagster.core.storage.local_compute_log_manager import IO_TYPE_EXTENSION, LocalComputeLogManager\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster.utils import ensure_dir, ensure_file\n\n\n
[docs]class S3ComputeLogManager(ComputeLogManager, ConfigurableClass):\n """Logs compute function stdout and stderr to S3.\n\n Users should not instantiate this class directly. Instead, use a YAML block in ``dagster.yaml``\n such as the following:\n\n .. code-block:: YAML\n\n compute_logs:\n module: dagster_aws.s3.compute_log_manager\n class: S3ComputeLogManager\n config:\n bucket: "mycorp-dagster-compute-logs"\n local_dir: "/tmp/cool"\n prefix: "dagster-test-"\n use_ssl: true\n verify: true\n verify_cert_path: "/path/to/cert/bundle.pem"\n endpoint_url: "http://alternate-s3-host.io"\n skip_empty_files: true\n\n Args:\n bucket (str): The name of the s3 bucket to which to log.\n local_dir (Optional[str]): Path to the local directory in which to stage logs. Default:\n ``dagster.seven.get_system_temp_directory()``.\n prefix (Optional[str]): Prefix for the log file keys.\n use_ssl (Optional[bool]): Whether or not to use SSL. Default True.\n verify (Optional[bool]): Whether or not to verify SSL certificates. Default True.\n verify_cert_path (Optional[str]): A filename of the CA cert bundle to use. Only used if\n `verify` set to False.\n endpoint_url (Optional[str]): Override for the S3 endpoint url.\n skip_empty_files: (Optional[bool]): Skip upload of empty log files.\n inst_data (Optional[ConfigurableClassData]): Serializable representation of the compute\n log manager when newed up from config.\n """\n\n def __init__(\n self,\n bucket,\n local_dir=None,\n inst_data=None,\n prefix="dagster",\n use_ssl=True,\n verify=True,\n verify_cert_path=None,\n endpoint_url=None,\n skip_empty_files=False,\n ):\n _verify = False if not verify else verify_cert_path\n self._s3_session = boto3.resource(\n "s3", use_ssl=use_ssl, verify=_verify, endpoint_url=endpoint_url\n ).meta.client\n self._s3_bucket = check.str_param(bucket, "bucket")\n self._s3_prefix = check.str_param(prefix, "prefix")\n\n # proxy calls to local compute log manager (for subscriptions, etc)\n if not local_dir:\n local_dir = seven.get_system_temp_directory()\n\n self.local_manager = LocalComputeLogManager(local_dir)\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self._skip_empty_files = check.bool_param(skip_empty_files, "skip_empty_files")\n\n @contextmanager\n def _watch_logs(self, pipeline_run, step_key=None):\n # proxy watching to the local compute log manager, interacting with the filesystem\n with self.local_manager._watch_logs( # pylint: disable=protected-access\n pipeline_run, step_key\n ):\n yield\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {\n "bucket": StringSource,\n "local_dir": Field(StringSource, is_required=False),\n "prefix": Field(StringSource, is_required=False, default_value="dagster"),\n "use_ssl": Field(bool, is_required=False, default_value=True),\n "verify": Field(bool, is_required=False, default_value=True),\n "verify_cert_path": Field(StringSource, is_required=False),\n "endpoint_url": Field(StringSource, is_required=False),\n "skip_empty_files": Field(bool, is_required=False, default_value=False),\n }\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return S3ComputeLogManager(inst_data=inst_data, **config_value)\n\n def get_local_path(self, run_id, key, io_type):\n return self.local_manager.get_local_path(run_id, key, io_type)\n\n def on_watch_start(self, pipeline_run, step_key):\n self.local_manager.on_watch_start(pipeline_run, step_key)\n\n def on_watch_finish(self, pipeline_run, 
step_key):\n self.local_manager.on_watch_finish(pipeline_run, step_key)\n key = self.local_manager.get_key(pipeline_run, step_key)\n self._upload_from_local(pipeline_run.run_id, key, ComputeIOType.STDOUT)\n self._upload_from_local(pipeline_run.run_id, key, ComputeIOType.STDERR)\n\n def is_watch_completed(self, run_id, key):\n return self.local_manager.is_watch_completed(run_id, key)\n\n def download_url(self, run_id, key, io_type):\n if not self.is_watch_completed(run_id, key):\n return self.local_manager.download_url(run_id, key, io_type)\n key = self._bucket_key(run_id, key, io_type)\n\n url = self._s3_session.generate_presigned_url(\n ClientMethod="get_object", Params={"Bucket": self._s3_bucket, "Key": key}\n )\n\n return url\n\n def read_logs_file(self, run_id, key, io_type, cursor=0, max_bytes=MAX_BYTES_FILE_READ):\n if self._should_download(run_id, key, io_type):\n self._download_to_local(run_id, key, io_type)\n data = self.local_manager.read_logs_file(run_id, key, io_type, cursor, max_bytes)\n return self._from_local_file_data(run_id, key, io_type, data)\n\n def on_subscribe(self, subscription):\n self.local_manager.on_subscribe(subscription)\n\n def on_unsubscribe(self, subscription):\n self.local_manager.on_unsubscribe(subscription)\n\n def _should_download(self, run_id, key, io_type):\n local_path = self.get_local_path(run_id, key, io_type)\n if os.path.exists(local_path):\n return False\n\n try: # https://stackoverflow.com/a/38376288/14656695\n self._s3_session.head_object(\n Bucket=self._s3_bucket, Key=self._bucket_key(run_id, key, io_type)\n )\n except ClientError:\n return False\n\n return True\n\n def _from_local_file_data(self, run_id, key, io_type, local_file_data):\n is_complete = self.is_watch_completed(run_id, key)\n path = (\n "s3://{}/{}".format(self._s3_bucket, self._bucket_key(run_id, key, io_type))\n if is_complete\n else local_file_data.path\n )\n\n return ComputeLogFileData(\n path,\n local_file_data.data,\n local_file_data.cursor,\n local_file_data.size,\n self.download_url(run_id, key, io_type),\n )\n\n def _upload_from_local(self, run_id, key, io_type):\n path = self.get_local_path(run_id, key, io_type)\n ensure_file(path)\n if self._skip_empty_files and os.stat(path).st_size == 0:\n return\n\n key = self._bucket_key(run_id, key, io_type)\n with open(path, "rb") as data:\n self._s3_session.upload_fileobj(data, self._s3_bucket, key)\n\n def _download_to_local(self, run_id, key, io_type):\n path = self.get_local_path(run_id, key, io_type)\n ensure_dir(os.path.dirname(path))\n with open(path, "wb") as fileobj:\n self._s3_session.download_fileobj(\n self._s3_bucket, self._bucket_key(run_id, key, io_type), fileobj\n )\n\n def _bucket_key(self, run_id, key, io_type):\n check.inst_param(io_type, "io_type", ComputeIOType)\n extension = IO_TYPE_EXTENSION[io_type]\n paths = [\n self._s3_prefix,\n "storage",\n run_id,\n "compute_logs",\n "{}.{}".format(key, extension),\n ]\n return "/".join(paths) # s3 path delimiter\n\n def dispose(self):\n self.local_manager.dispose()
\n
", "current_page_name": "_modules/dagster_aws/s3/compute_log_manager", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_aws.s3.compute_log_manager"}, "file_cache": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_aws.s3.file_cache

\nimport boto3\nfrom botocore.exceptions import ClientError\n\nfrom dagster import Field\nfrom dagster import _check as check\nfrom dagster import resource\nfrom dagster.core.storage.file_cache import FileCache\n\nfrom .file_manager import S3FileHandle\n\n\n
[docs]class S3FileCache(FileCache):\n def __init__(self, s3_bucket, s3_key, s3_session, overwrite=False):\n super(S3FileCache, self).__init__(overwrite=overwrite)\n\n self.s3_bucket = s3_bucket\n self.s3_key = s3_key\n self.s3 = s3_session\n\n def has_file_object(self, file_key):\n check.str_param(file_key, "file_key")\n try:\n self.s3.get_object(Bucket=self.s3_bucket, Key=self.get_full_key(file_key))\n except ClientError:\n return False\n return True\n\n def get_full_key(self, file_key):\n return "{base_key}/{file_key}".format(base_key=self.s3_key, file_key=file_key)\n\n def write_file_object(self, file_key, source_file_object):\n check.str_param(file_key, "file_key")\n\n self.s3.put_object(\n Body=source_file_object, Bucket=self.s3_bucket, Key=self.get_full_key(file_key)\n )\n return self.get_file_handle(file_key)\n\n def get_file_handle(self, file_key):\n check.str_param(file_key, "file_key")\n return S3FileHandle(self.s3_bucket, self.get_full_key(file_key))
\n\n\n@resource(\n {\n "bucket": Field(str),\n "key": Field(str),\n "overwrite": Field(bool, is_required=False, default_value=False),\n }\n)\ndef s3_file_cache(init_context):\n return S3FileCache(\n s3_bucket=init_context.resource_config["bucket"],\n s3_key=init_context.resource_config["key"],\n overwrite=init_context.resource_config["overwrite"],\n # TODO: resource dependencies\n s3_session=boto3.resource("s3", use_ssl=True).meta.client,\n )\n
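# Hedged usage sketch (not part of the dagster_aws source above): constructing S3FileCache
# directly with a boto3 client. The bucket, key prefix, and file key are illustrative assumptions.
import io

import boto3


def cache_bytes_example():
    file_cache = S3FileCache(
        s3_bucket="my-bucket",
        s3_key="file-cache",
        s3_session=boto3.client("s3"),
    )
    if not file_cache.has_file_object("hello.txt"):
        handle = file_cache.write_file_object("hello.txt", io.BytesIO(b"hello"))
    else:
        handle = file_cache.get_file_handle("hello.txt")
    return handle.s3_path  # e.g. "s3://my-bucket/file-cache/hello.txt"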
", "current_page_name": "_modules/dagster_aws/s3/file_cache", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_aws.s3.file_cache"}, "file_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_aws.s3.file_manager

\nimport io\nimport uuid\nfrom contextlib import contextmanager\n\nimport dagster._check as check\nfrom dagster.core.storage.file_manager import (\n    FileHandle,\n    FileManager,\n    TempfileManager,\n    check_file_like_obj,\n)\n\n\n
[docs]class S3FileHandle(FileHandle):\n """A reference to a file on S3."""\n\n def __init__(self, s3_bucket: str, s3_key: str):\n self._s3_bucket = check.str_param(s3_bucket, "s3_bucket")\n self._s3_key = check.str_param(s3_key, "s3_key")\n\n @property\n def s3_bucket(self) -> str:\n """str: The name of the S3 bucket."""\n return self._s3_bucket\n\n @property\n def s3_key(self) -> str:\n """str: The S3 key."""\n return self._s3_key\n\n @property\n def path_desc(self) -> str:\n """str: The file's S3 URL."""\n return self.s3_path\n\n @property\n def s3_path(self) -> str:\n """str: The file's S3 URL."""\n return "s3://{bucket}/{key}".format(bucket=self.s3_bucket, key=self.s3_key)
\n\n\nclass S3FileManager(FileManager):\n def __init__(self, s3_session, s3_bucket, s3_base_key):\n self._s3_session = s3_session\n self._s3_bucket = check.str_param(s3_bucket, "s3_bucket")\n self._s3_base_key = check.str_param(s3_base_key, "s3_base_key")\n self._local_handle_cache = {}\n self._temp_file_manager = TempfileManager()\n\n def copy_handle_to_local_temp(self, file_handle):\n self._download_if_not_cached(file_handle)\n return self._get_local_path(file_handle)\n\n def _download_if_not_cached(self, file_handle):\n if not self._file_handle_cached(file_handle):\n # instigate download\n temp_file_obj = self._temp_file_manager.tempfile()\n temp_name = temp_file_obj.name\n self._s3_session.download_file(\n Bucket=file_handle.s3_bucket, Key=file_handle.s3_key, Filename=temp_name\n )\n self._local_handle_cache[file_handle.s3_path] = temp_name\n\n return file_handle\n\n @contextmanager\n def read(self, file_handle, mode="rb"):\n check.inst_param(file_handle, "file_handle", S3FileHandle)\n check.str_param(mode, "mode")\n check.param_invariant(mode in {"r", "rb"}, "mode")\n\n self._download_if_not_cached(file_handle)\n\n encoding = None if mode == "rb" else "utf-8"\n with open(self._get_local_path(file_handle), mode, encoding=encoding) as file_obj:\n yield file_obj\n\n def _file_handle_cached(self, file_handle):\n return file_handle.s3_path in self._local_handle_cache\n\n def _get_local_path(self, file_handle):\n return self._local_handle_cache[file_handle.s3_path]\n\n def read_data(self, file_handle):\n with self.read(file_handle, mode="rb") as file_obj:\n return file_obj.read()\n\n def write_data(self, data, ext=None):\n check.inst_param(data, "data", bytes)\n return self.write(io.BytesIO(data), mode="wb", ext=ext)\n\n def write(self, file_obj, mode="wb", ext=None):\n check_file_like_obj(file_obj)\n s3_key = self.get_full_key(str(uuid.uuid4()) + (("." + ext) if ext is not None else ""))\n self._s3_session.put_object(Body=file_obj, Bucket=self._s3_bucket, Key=s3_key)\n return S3FileHandle(self._s3_bucket, s3_key)\n\n def get_full_key(self, file_key):\n return "{base_key}/{file_key}".format(base_key=self._s3_base_key, file_key=file_key)\n\n def delete_local_temp(self):\n self._temp_file_manager.close()\n
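# Hedged usage sketch (not part of the dagster_aws source above): round-tripping bytes through
# S3FileManager. The bucket and base key are illustrative assumptions.
import boto3


def roundtrip_example():
    manager = S3FileManager(
        s3_session=boto3.client("s3"),
        s3_bucket="my-bucket",
        s3_base_key="file-manager",
    )
    handle = manager.write_data(b"some bytes", ext="bin")
    try:
        assert manager.read_data(handle) == b"some bytes"
        return handle.s3_path
    finally:
        # clean up the locally cached temp copy created by read_data
        manager.delete_local_temp()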
", "current_page_name": "_modules/dagster_aws/s3/file_manager", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_aws.s3.file_manager"}, "io_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_aws.s3.io_manager

\nimport io\nimport pickle\nfrom typing import Union\n\nfrom dagster import Field, InputContext, MemoizableIOManager, OutputContext, StringSource\nfrom dagster import _check as check\nfrom dagster import io_manager\nfrom dagster.utils import PICKLE_PROTOCOL\n\n\nclass PickledObjectS3IOManager(MemoizableIOManager):\n    def __init__(\n        self,\n        s3_bucket,\n        s3_session,\n        s3_prefix=None,\n    ):\n        self.bucket = check.str_param(s3_bucket, "s3_bucket")\n        self.s3_prefix = check.str_param(s3_prefix, "s3_prefix")\n        self.s3 = s3_session\n        self.s3.list_objects(Bucket=self.bucket, Prefix=self.s3_prefix, MaxKeys=1)\n\n    def _get_path(self, context: Union[InputContext, OutputContext]) -> str:\n        return "/".join([self.s3_prefix, "storage", *context.get_identifier()])\n\n    def has_output(self, context):\n        key = self._get_path(context)\n        return self._has_object(key)\n\n    def _rm_object(self, key):\n        check.str_param(key, "key")\n        check.param_invariant(len(key) > 0, "key")\n\n        # delete_object wont fail even if the item has been deleted.\n        self.s3.delete_object(Bucket=self.bucket, Key=key)\n\n    def _has_object(self, key):\n        check.str_param(key, "key")\n        check.param_invariant(len(key) > 0, "key")\n\n        found_object = False\n\n        try:\n            self.s3.get_object(Bucket=self.bucket, Key=key)\n            found_object = True\n        except self.s3.exceptions.NoSuchKey:\n            found_object = False\n\n        return found_object\n\n    def _uri_for_key(self, key):\n        check.str_param(key, "key")\n        return "s3://" + self.bucket + "/" + "{key}".format(key=key)\n\n    def load_input(self, context):\n        key = self._get_path(context)\n        context.log.debug(f"Loading S3 object from: {self._uri_for_key(key)}")\n        obj = pickle.loads(self.s3.get_object(Bucket=self.bucket, Key=key)["Body"].read())\n\n        return obj\n\n    def handle_output(self, context, obj):\n        key = self._get_path(context)\n        context.log.debug(f"Writing S3 object at: {self._uri_for_key(key)}")\n\n        if self._has_object(key):\n            context.log.warning(f"Removing existing S3 key: {key}")\n            self._rm_object(key)\n\n        pickled_obj = pickle.dumps(obj, PICKLE_PROTOCOL)\n        pickled_obj_bytes = io.BytesIO(pickled_obj)\n        self.s3.upload_fileobj(pickled_obj_bytes, self.bucket, key)\n\n\n
[docs]@io_manager(\n config_schema={\n "s3_bucket": Field(StringSource),\n "s3_prefix": Field(StringSource, is_required=False, default_value="dagster"),\n },\n required_resource_keys={"s3"},\n)\ndef s3_pickle_io_manager(init_context):\n """Persistent IO manager using S3 for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for S3 and the backing bucket.\n\n Attach this resource definition to your job to make it available to your ops.\n\n .. code-block:: python\n\n @job(resource_defs={'io_manager': s3_pickle_io_manager, "s3": s3_resource, ...})\n def my_job():\n ...\n\n You may configure this storage as follows:\n\n .. code-block:: YAML\n\n resources:\n io_manager:\n config:\n s3_bucket: my-cool-bucket\n s3_prefix: good/prefix-for-files-\n """\n s3_session = init_context.resources.s3\n s3_bucket = init_context.resource_config["s3_bucket"]\n s3_prefix = init_context.resource_config.get("s3_prefix") # s3_prefix is optional\n pickled_io_manager = PickledObjectS3IOManager(s3_bucket, s3_session, s3_prefix=s3_prefix)\n return pickled_io_manager
\n\n\nclass PickledObjectS3AssetIOManager(PickledObjectS3IOManager):\n def _get_path(self, context: Union[InputContext, OutputContext]) -> str:\n return "/".join([self.s3_prefix, *context.get_asset_identifier()])\n\n\n
[docs]@io_manager(\n config_schema={\n "s3_bucket": Field(StringSource),\n "s3_prefix": Field(StringSource, is_required=False, default_value="dagster"),\n },\n required_resource_keys={"s3"},\n)\ndef s3_pickle_asset_io_manager(init_context):\n """Persistent IO manager using S3 for storage, meant for use with software-defined assets.\n\n Each asset is assigned to a single filesystem path, so subsequent materializations of an asset\n will overwrite previous materializations of that asset.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for S3 and the backing bucket.\n\n Attach this resource definition to your job to make it available to your ops.\n\n .. code-block:: python\n\n asset_group = AssetGroup(\n assets...,\n resource_defs={'io_manager': s3_pickle_asset_io_manager, "s3": s3_resource, ...}),\n )\n\n You may configure this IO manager as follows:\n\n .. code-block:: YAML\n\n resources:\n io_manager:\n config:\n s3_bucket: my-cool-bucket\n s3_prefix: good/prefix-for-files-\n """\n s3_session = init_context.resources.s3\n s3_bucket = init_context.resource_config["s3_bucket"]\n s3_prefix = init_context.resource_config.get("s3_prefix") # s3_prefix is optional\n pickled_io_manager = PickledObjectS3AssetIOManager(s3_bucket, s3_session, s3_prefix=s3_prefix)\n return pickled_io_manager
\n
", "current_page_name": "_modules/dagster_aws/s3/io_manager", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_aws.s3.io_manager"}, "resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_aws.s3.resources

\nfrom dagster import Field, StringSource, resource\nfrom dagster.utils.merger import merge_dicts\n\nfrom .file_manager import S3FileManager\nfrom .utils import construct_s3_client\n\nS3_SESSION_CONFIG = {\n    "use_unsigned_session": Field(\n        bool,\n        description="Specifies whether to use an unsigned S3 session",\n        is_required=False,\n        default_value=False,\n    ),\n    "region_name": Field(\n        str, description="Specifies a custom region for the S3 session", is_required=False\n    ),\n    "endpoint_url": Field(\n        StringSource,\n        description="Specifies a custom endpoint for the S3 session",\n        is_required=False,\n    ),\n    "max_attempts": Field(\n        int,\n        description="This provides Boto3's retry handler with a value of maximum retry attempts, "\n        "where the initial call counts toward the max_attempts value that you provide",\n        is_required=False,\n        default_value=5,\n    ),\n    "profile_name": Field(\n        str,\n        description="Specifies a profile to connect that session",\n        is_required=False,\n    ),\n}\n\n\n
[docs]@resource(S3_SESSION_CONFIG)\ndef s3_resource(context):\n """Resource that gives access to S3.\n\n The underlying S3 session is created by calling\n :py:func:`boto3.session.Session(profile_name) <boto3:boto3.session>`.\n The returned resource object is an S3 client, an instance of `botocore.client.S3`.\n\n Example:\n\n .. code-block:: python\n\n from dagster import build_op_context, job, op\n from dagster_aws.s3 import s3_resource\n\n @op(required_resource_keys={'s3'})\n def example_s3_op(context):\n return context.resources.s3.list_objects_v2(\n Bucket='my-bucket',\n Prefix='some-key'\n )\n\n @job(resource_defs={'s3': s3_resource})\n def example_job(context):\n example_s3_op()\n\n example_job.execute_in_process(\n run_config={\n 'resources': {\n 's3': {\n 'config': {\n 'region_name': 'us-west-1',\n }\n }\n }\n }\n )\n\n Note that your ops must also declare that they require this resource with\n `required_resource_keys`, or it will not be initialized for the execution of their compute\n functions.\n\n You may configure this resource as follows:\n\n .. code-block:: YAML\n\n resources:\n s3:\n config:\n region_name: "us-west-1"\n # Optional[str]: Specifies a custom region for the S3 session. Default is chosen\n # through the ordinary boto credential chain.\n use_unsigned_session: false\n # Optional[bool]: Specifies whether to use an unsigned S3 session. Default: True\n endpoint_url: "http://localhost"\n # Optional[str]: Specifies a custom endpoint for the S3 session. Default is None.\n profile_name: "dev"\n # Optional[str]: Specifies a custom profile for S3 session. Default is default\n # profile as specified in ~/.aws/credentials file\n\n """\n return construct_s3_client(\n max_attempts=context.resource_config["max_attempts"],\n region_name=context.resource_config.get("region_name"),\n endpoint_url=context.resource_config.get("endpoint_url"),\n use_unsigned_session=context.resource_config["use_unsigned_session"],\n profile_name=context.resource_config.get("profile_name"),\n )
\n\n\n
[docs]@resource(\n merge_dicts(\n S3_SESSION_CONFIG,\n {\n "s3_bucket": Field(StringSource),\n "s3_prefix": Field(StringSource, is_required=False, default_value="dagster"),\n },\n )\n)\ndef s3_file_manager(context):\n """FileManager that provides abstract access to S3.\n\n Implements the :py:class:`~dagster.core.storage.file_manager.FileManager` API.\n """\n return S3FileManager(\n s3_session=construct_s3_client(\n max_attempts=context.resource_config["max_attempts"],\n region_name=context.resource_config.get("region_name"),\n endpoint_url=context.resource_config.get("endpoint_url"),\n use_unsigned_session=context.resource_config["use_unsigned_session"],\n profile_name=context.resource_config.get("profile_name"),\n ),\n s3_bucket=context.resource_config["s3_bucket"],\n s3_base_key=context.resource_config["s3_prefix"],\n )
\n
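# Hedged usage sketch (not part of the dagster_aws source above): binding s3_file_manager to a
# job and using it from an op. The resource key, bucket, and prefix are illustrative assumptions.
from dagster import job, op


@op(required_resource_keys={"file_manager"})
def example_file_manager_op(context):
    handle = context.resources.file_manager.write_data(b"some bytes")
    return handle.s3_path


@job(
    resource_defs={
        "file_manager": s3_file_manager.configured(
            {"s3_bucket": "my-bucket", "s3_prefix": "file-manager"}
        )
    }
)
def example_file_manager_job():
    example_file_manager_op()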
", "current_page_name": "_modules/dagster_aws/s3/resources", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_aws.s3.resources"}}, "secretsmanager": {"resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_aws.secretsmanager.resources

\nfrom contextlib import contextmanager\n\nfrom dagster import Array, Field, Noneable\nfrom dagster import _check as check\nfrom dagster import resource\nfrom dagster.core.test_utils import environ\nfrom dagster.utils.merger import merge_dicts\n\nfrom .secrets import construct_secretsmanager_client, get_secrets_from_arns, get_tagged_secrets\n\nSECRETSMANAGER_SESSION_CONFIG = {\n    "region_name": Field(\n        str,\n        description="Specifies a custom region for the SecretsManager session",\n        is_required=False,\n    ),\n    "max_attempts": Field(\n        int,\n        description="This provides Boto3's retry handler with a value of maximum retry attempts, "\n        "where the initial call counts toward the max_attempts value that you provide",\n        is_required=False,\n        default_value=5,\n    ),\n    "profile_name": Field(\n        str,\n        description="Specifies a profile to connect that session",\n        is_required=False,\n    ),\n}\n\n\n
[docs]@resource(SECRETSMANAGER_SESSION_CONFIG)\ndef secretsmanager_resource(context):\n """Resource that gives access to AWS SecretsManager.\n\n The underlying SecretsManager session is created by calling\n :py:func:`boto3.session.Session(profile_name) <boto3:boto3.session>`.\n The returned resource object is a SecretsManager client, an instance of `botocore.client.SecretsManager`.\n\n Example:\n\n .. code-block:: python\n\n from dagster import build_op_context, job, op\n from dagster_aws.secretsmanager import secretsmanager_resource\n\n @op(required_resource_keys={'secretsmanager'})\n def example_secretsmanager_op(context):\n return context.resources.secretsmanager.get_secret_value(\n SecretId='arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf'\n )\n\n @job(resource_defs={'secretsmanager': secretsmanager_resource})\n def example_job():\n example_secretsmanager_op()\n\n example_job.execute_in_process(\n run_config={\n 'resources': {\n 'secretsmanager': {\n 'config': {\n 'region_name': 'us-west-1',\n }\n }\n }\n }\n )\n\n Note that your ops must also declare that they require this resource with\n `required_resource_keys`, or it will not be initialized for the execution of their compute\n functions.\n\n You may configure this resource as follows:\n\n .. code-block:: YAML\n\n resources:\n secretsmanager:\n config:\n region_name: "us-west-1"\n # Optional[str]: Specifies a custom region for the SecretsManager session. Default is chosen\n # through the ordinary boto credential chain.\n profile_name: "dev"\n # Optional[str]: Specifies a custom profile for SecretsManager session. Default is default\n # profile as specified in ~/.aws/credentials file\n\n """\n return construct_secretsmanager_client(\n max_attempts=context.resource_config["max_attempts"],\n region_name=context.resource_config.get("region_name"),\n profile_name=context.resource_config.get("profile_name"),\n )
\n\n\n
[docs]@resource(\n merge_dicts(\n SECRETSMANAGER_SESSION_CONFIG,\n {\n "secrets": Field(\n Array(str),\n is_required=False,\n default_value=[],\n description=("An array of AWS Secrets Manager secrets arns to fetch."),\n ),\n "secrets_tag": Field(\n Noneable(str),\n is_required=False,\n default_value=None,\n description=(\n "AWS Secrets Manager secrets with this tag will be fetched and made available."\n ),\n ),\n "add_to_environment": Field(\n bool,\n is_required=False,\n default_value=False,\n description=("Whether to mount the secrets as environment variables."),\n ),\n },\n )\n)\n@contextmanager\ndef secretsmanager_secrets_resource(context):\n """Resource that provides a dict which maps selected SecretsManager secrets to\n their string values. Also optionally sets chosen secrets as environment variables.\n\n Example:\n\n .. code-block:: python\n\n import os\n from dagster import build_op_context, job, op\n from dagster_aws.secretsmanager import secretsmanager_secrets_resource\n\n @op(required_resource_keys={'secrets'})\n def example_secretsmanager_secrets_op(context):\n return context.resources.secrets.get("my-secret-name")\n\n @op(required_resource_keys={'secrets'})\n def example_secretsmanager_secrets_op_2(context):\n return os.getenv("my-other-secret-name")\n\n @job(resource_defs={'secrets': secretsmanager_secrets_resource})\n def example_job():\n example_secretsmanager_secrets_op()\n example_secretsmanager_secrets_op_2()\n\n example_job.execute_in_process(\n run_config={\n 'resources': {\n 'secrets': {\n 'config': {\n 'region_name': 'us-west-1',\n 'secrets_tag': 'dagster',\n 'add_to_environment': True,\n }\n }\n }\n }\n )\n\n Note that your ops must also declare that they require this resource with\n `required_resource_keys`, or it will not be initialized for the execution of their compute\n functions.\n\n You may configure this resource as follows:\n\n .. code-block:: YAML\n\n resources:\n secretsmanager:\n config:\n region_name: "us-west-1"\n # Optional[str]: Specifies a custom region for the SecretsManager session. Default is chosen\n # through the ordinary boto credential chain.\n profile_name: "dev"\n # Optional[str]: Specifies a custom profile for SecretsManager session. Default is default\n # profile as specified in ~/.aws/credentials file\n secrets: ["arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf"]\n # Optional[List[str]]: Specifies a list of secret ARNs to pull from SecretsManager.\n secrets_tag: "dagster"\n # Optional[str]: Specifies a tag, all secrets which have the tag set will be pulled\n # from SecretsManager.\n add_to_environment: true\n # Optional[bool]: Whether to set the selected secrets as environment variables. 
Defaults\n # to false.\n\n """\n add_to_environment = check.bool_param(\n context.resource_config["add_to_environment"], "add_to_environment"\n )\n secrets_tag = check.opt_str_param(context.resource_config["secrets_tag"], "secrets_tag")\n secrets = check.list_param(context.resource_config["secrets"], "secrets", of_type=str)\n\n secrets_manager = construct_secretsmanager_client(\n max_attempts=context.resource_config["max_attempts"],\n region_name=context.resource_config.get("region_name"),\n profile_name=context.resource_config.get("profile_name"),\n )\n\n secret_arns = merge_dicts(\n (get_tagged_secrets(secrets_manager, [secrets_tag]) if secrets_tag else {}),\n get_secrets_from_arns(secrets_manager, secrets),\n )\n\n secrets_map = {\n name: secrets_manager.get_secret_value(SecretId=arn).get("SecretString")\n for name, arn in secret_arns.items()\n }\n with environ(secrets_map if add_to_environment else {}):\n yield secrets_map
\n
", "current_page_name": "_modules/dagster_aws/secretsmanager/resources", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_aws.secretsmanager.resources"}}}, "dagster_azure": {"adls2": {"fake_adls2_resource": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_azure.adls2.fake_adls2_resource

\nimport io\nimport random\nfrom typing import Dict\nfrom unittest import mock\n\nfrom dagster_azure.blob import FakeBlobServiceClient\n\nfrom dagster import resource\n\nfrom .resources import ADLS2Resource\nfrom .utils import ResourceNotFoundError\n\n\n@resource({"account_name": str})\ndef fake_adls2_resource(context):\n    return FakeADLS2Resource(account_name=context.resource_config["account_name"])\n\n\n
[docs]class FakeADLS2Resource(ADLS2Resource):\n """Stateful mock of an ADLS2Resource for testing.\n\n Wraps a ``mock.MagicMock``. Containers are implemented using an in-memory dict.\n """\n\n def __init__(\n self, account_name, credential="fake-creds"\n ): # pylint: disable=unused-argument,super-init-not-called\n self._adls2_client = FakeADLS2ServiceClient(account_name)\n self._blob_client = FakeBlobServiceClient(account_name)\n self._lease_client_constructor = FakeLeaseClient
\n\n\nclass FakeLeaseClient:\n def __init__(self, client):\n self.client = client\n self.id = None\n\n # client needs a ref to self to check if a given lease is valid\n self.client._lease = self\n\n def acquire(self, lease_duration=-1): # pylint: disable=unused-argument\n if self.id is None:\n self.id = random.randint(0, 2**9)\n else:\n raise Exception("Lease already held")\n\n def release(self):\n self.id = None\n\n def is_valid(self, lease):\n if self.id is None:\n # no lease is held so any operation is valid\n return True\n return lease == self.id\n\n\nclass FakeADLS2ServiceClient:\n """Stateful mock of an ADLS2 service client for testing.\n\n Wraps a ``mock.MagicMock``. Containers are implemented using an in-memory dict.\n """\n\n def __init__(self, account_name, credential="fake-creds"):\n\n self._account_name = account_name\n self._credential = mock.MagicMock()\n self._credential.account_key = credential\n self._file_systems = {}\n\n @property\n def account_name(self):\n return self._account_name\n\n @property\n def credential(self):\n return self._credential\n\n @property\n def file_systems(self):\n return self._file_systems\n\n def get_file_system_client(self, file_system):\n return self._file_systems.setdefault(\n file_system, FakeADLS2FilesystemClient(self.account_name, file_system)\n )\n\n def get_file_client(self, file_system, file_path):\n return self.get_file_system_client(file_system).get_file_client(file_path)\n\n\nclass FakeADLS2FilesystemClient:\n """Stateful mock of an ADLS2 filesystem client for testing."""\n\n def __init__(self, account_name, file_system_name):\n self._file_system: Dict[str, FakeADLS2FileClient] = {}\n self._account_name = account_name\n self._file_system_name = file_system_name\n\n @property\n def account_name(self):\n return self._account_name\n\n @property\n def file_system_name(self):\n return self._file_system_name\n\n def keys(self):\n return self._file_system.keys()\n\n def get_file_system_properties(self):\n return {"account_name": self.account_name, "file_system_name": self.file_system_name}\n\n def has_file(self, path):\n return bool(self._file_system.get(path))\n\n def get_file_client(self, file_path):\n # pass fileclient a ref to self and its name so the file can delete itself\n self._file_system.setdefault(file_path, FakeADLS2FileClient(self, file_path))\n return self._file_system[file_path]\n\n def create_file(self, file):\n # pass fileclient a ref to self and the file's name so the file can delete itself by\n # accessing the self._file_system dict\n self._file_system.setdefault(file, FakeADLS2FileClient(fs_client=self, name=file))\n return self._file_system[file]\n\n def delete_file(self, file):\n for k in list(self._file_system.keys()):\n if k.startswith(file):\n del self._file_system[k]\n\n\nclass FakeADLS2FileClient:\n """Stateful mock of an ADLS2 file client for testing."""\n\n def __init__(self, fs_client, name):\n self.name = name\n self.contents = None\n self._lease = None\n self.fs_client = fs_client\n\n @property\n def lease(self):\n return self._lease if self._lease is None else self._lease.id\n\n def get_file_properties(self):\n if self.contents is None:\n raise ResourceNotFoundError("File does not exist!")\n lease_id = None if self._lease is None else self._lease.id\n return {"lease": lease_id}\n\n def upload_data(self, contents, overwrite=False, lease=None):\n if self._lease is not None:\n if not self._lease.is_valid(lease):\n raise Exception("Invalid lease!")\n if self.contents is None or overwrite is True:\n if 
isinstance(contents, str):\n self.contents = contents.encode("utf8")\n elif isinstance(contents, io.BytesIO):\n self.contents = contents.read()\n elif isinstance(contents, io.StringIO):\n self.contents = contents.read().encode("utf8")\n elif isinstance(contents, bytes):\n self.contents = contents\n else:\n self.contents = contents\n\n def download_file(self):\n if self.contents is None:\n raise ResourceNotFoundError("File does not exist!")\n return FakeADLS2FileDownloader(contents=self.contents)\n\n def delete_file(self, lease=None):\n if self._lease is not None:\n if not self._lease.is_valid(lease):\n raise Exception("Invalid lease!")\n self.fs_client.delete_file(self.name)\n\n\nclass FakeADLS2FileDownloader:\n """Mock of an ADLS2 file downloader for testing."""\n\n def __init__(self, contents):\n self.contents = contents\n\n def readall(self):\n return self.contents\n\n def readinto(self, fileobj):\n fileobj.write(self.contents)\n
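# Hedged usage sketch (not part of the dagster_azure source above): exercising the fakes in a
# unit test without touching real Azure storage. The account, filesystem, and key names are
# illustrative assumptions.
def test_fake_adls2_roundtrip():
    client = FakeADLS2ServiceClient("fake-account")
    fs_client = client.get_file_system_client("fake-filesystem")

    file_client = fs_client.create_file("some/key.txt")
    file_client.upload_data(b"hello", overwrite=True)

    assert fs_client.has_file("some/key.txt")
    assert fs_client.get_file_client("some/key.txt").download_file().readall() == b"hello"

    file_client.delete_file()
    assert not fs_client.has_file("some/key.txt")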
", "current_page_name": "_modules/dagster_azure/adls2/fake_adls2_resource", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_azure.adls2.fake_adls2_resource"}, "file_cache": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_azure.adls2.file_cache

\nfrom dagster import Field, Selector, StringSource\nfrom dagster import _check as check\nfrom dagster import resource\nfrom dagster.core.storage.file_cache import FileCache\n\nfrom .file_manager import ADLS2FileHandle\nfrom .utils import ResourceNotFoundError, create_adls2_client\n\n\nclass ADLS2FileCache(FileCache):\n    def __init__(\n        self, storage_account, file_system, prefix, credential=None, overwrite=False, client=None\n    ):\n        super(ADLS2FileCache, self).__init__(overwrite=overwrite)\n\n        self.storage_account = storage_account\n        self.file_system = file_system\n        self.prefix = prefix\n\n        self.client = client or create_adls2_client(storage_account, credential)\n\n    def has_file_object(self, file_key):\n        check.str_param(file_key, "file_key")\n        try:\n            file = self.client.get_file_client(self.file_system, self.get_full_key(file_key))\n            file.get_file_properties()\n        except ResourceNotFoundError:\n            return False\n        return True\n\n    def get_full_key(self, file_key):\n        return "{base_key}/{file_key}".format(base_key=self.prefix, file_key=file_key)\n\n    def write_file_object(self, file_key, source_file_object):\n        check.str_param(file_key, "file_key")\n\n        adls2_key = self.get_full_key(file_key)\n        adls2_file = self.client.get_file_client(file_system=self.file_system, file_path=adls2_key)\n        adls2_file.upload_data(source_file_object, overwrite=True)\n        return self.get_file_handle(file_key)\n\n    def get_file_handle(self, file_key):\n        check.str_param(file_key, "file_key")\n        return ADLS2FileHandle(\n            self.client.account_name, self.file_system, self.get_full_key(file_key)\n        )\n\n\n
[docs]@resource(\n {\n "storage_account": Field(StringSource, description="The storage account name."),\n "credential": Field(\n Selector(\n {\n "sas": Field(StringSource, description="SAS token for the account."),\n "key": Field(StringSource, description="Shared Access Key for the account"),\n }\n ),\n description="The credentials with which to authenticate.",\n ),\n "prefix": Field(StringSource, description="The base path prefix to use in ADLS2"),\n "file_system": Field(\n StringSource, description="The storage account filesystem (aka container)"\n ),\n "overwrite": Field(bool, is_required=False, default_value=False),\n }\n)\ndef adls2_file_cache(init_context):\n return ADLS2FileCache(\n storage_account=init_context.resource_config["storage_account"],\n file_system=init_context.resource_config["file_system"],\n prefix=init_context.resource_config["prefix"],\n credential=init_context.resource_config["credential"],\n overwrite=init_context.resource_config["overwrite"],\n # TODO: resource dependencies\n )
\n
", "current_page_name": "_modules/dagster_azure/adls2/file_cache", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_azure.adls2.file_cache"}, "file_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_azure.adls2.file_manager

\nimport io\nimport uuid\nfrom contextlib import contextmanager\n\nimport dagster._check as check\nfrom dagster.core.storage.file_manager import (\n    FileHandle,\n    FileManager,\n    TempfileManager,\n    check_file_like_obj,\n)\n\n\n
[docs]class ADLS2FileHandle(FileHandle):\n """A reference to a file on ADLS2."""\n\n def __init__(self, account: str, file_system: str, key: str):\n self._account = check.str_param(account, "account")\n self._file_system = check.str_param(file_system, "file_system")\n self._key = check.str_param(key, "key")\n\n @property\n def account(self):\n """str: The name of the ADLS2 account."""\n return self._account\n\n @property\n def file_system(self):\n """str: The name of the ADLS2 file system."""\n return self._file_system\n\n @property\n def key(self):\n """str: The ADLS2 key."""\n return self._key\n\n @property\n def path_desc(self):\n """str: The file's ADLS2 URL."""\n return self.adls2_path\n\n @property\n def adls2_path(self):\n """str: The file's ADLS2 URL."""\n return "abfss://{file_system}@{account}.dfs.core.windows.net/{key}".format(\n file_system=self.file_system,\n account=self.account,\n key=self.key,\n )
\n\n\nclass ADLS2FileManager(FileManager):\n def __init__(self, adls2_client, file_system, prefix):\n self._client = adls2_client\n self._file_system = check.str_param(file_system, "file_system")\n self._prefix = check.str_param(prefix, "prefix")\n self._local_handle_cache = {}\n self._temp_file_manager = TempfileManager()\n\n def copy_handle_to_local_temp(self, file_handle):\n self._download_if_not_cached(file_handle)\n return self._get_local_path(file_handle)\n\n def _download_if_not_cached(self, file_handle):\n if not self._file_handle_cached(file_handle):\n # instigate download\n temp_file_obj = self._temp_file_manager.tempfile()\n temp_name = temp_file_obj.name\n file = self._client.get_file_client(\n file_system=file_handle.file_system,\n file_path=file_handle.key,\n )\n download = file.download_file()\n with open(temp_name, "wb") as file_obj:\n download.readinto(file_obj)\n self._local_handle_cache[file_handle.adls2_path] = temp_name\n\n return file_handle\n\n @contextmanager\n def read(self, file_handle, mode="rb"):\n check.inst_param(file_handle, "file_handle", ADLS2FileHandle)\n check.str_param(mode, "mode")\n check.param_invariant(mode in {"r", "rb"}, "mode")\n\n self._download_if_not_cached(file_handle)\n\n encoding = None if "b" in mode else "utf-8"\n with open(self._get_local_path(file_handle), mode, encoding=encoding) as file_obj:\n yield file_obj\n\n def _file_handle_cached(self, file_handle):\n return file_handle.adls2_path in self._local_handle_cache\n\n def _get_local_path(self, file_handle):\n return self._local_handle_cache[file_handle.adls2_path]\n\n def read_data(self, file_handle):\n with self.read(file_handle, mode="rb") as file_obj:\n return file_obj.read()\n\n def write_data(self, data, ext=None):\n check.inst_param(data, "data", bytes)\n return self.write(io.BytesIO(data), mode="wb", ext=ext)\n\n def write(self, file_obj, mode="wb", ext=None): # pylint: disable=unused-argument\n check_file_like_obj(file_obj)\n adls2_key = self.get_full_key(str(uuid.uuid4()) + (("." + ext) if ext is not None else ""))\n adls2_file = self._client.get_file_client(\n file_system=self._file_system, file_path=adls2_key\n )\n adls2_file.upload_data(file_obj, overwrite=True)\n return ADLS2FileHandle(self._client.account_name, self._file_system, adls2_key)\n\n def get_full_key(self, file_key):\n return "{base_key}/{file_key}".format(base_key=self._prefix, file_key=file_key)\n\n def delete_local_temp(self):\n self._temp_file_manager.close()\n
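# A minimal round-trip sketch for ADLS2FileManager, assuming an ADLS2 client built
# elsewhere (e.g. with create_adls2_client) and the placeholder names below.
def _file_manager_round_trip(adls2_client):
    manager = ADLS2FileManager(
        adls2_client=adls2_client,
        file_system="my-container",  # placeholder file system (container) name
        prefix="dagster-scratch",  # placeholder key prefix
    )
    handle = manager.write_data(b"hello adls2")  # uploads and returns an ADLS2FileHandle
    assert manager.read_data(handle) == b"hello adls2"
    manager.delete_local_temp()  # clean up locally cached downloads
    return handle.adls2_path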
", "current_page_name": "_modules/dagster_azure/adls2/file_manager", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_azure.adls2.file_manager"}, "io_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_azure.adls2.io_manager

\nimport pickle\nfrom contextlib import contextmanager\nfrom typing import Union\n\nfrom dagster_azure.adls2.utils import ResourceNotFoundError\n\nfrom dagster import Field, IOManager, InputContext, OutputContext, StringSource\nfrom dagster import _check as check\nfrom dagster import io_manager\nfrom dagster.utils import PICKLE_PROTOCOL\n\n_LEASE_DURATION = 60  # One minute\n\n\nclass PickledObjectADLS2IOManager(IOManager):\n    def __init__(\n        self, file_system, adls2_client, blob_client, lease_client_constructor, prefix="dagster"\n    ):\n        self.adls2_client = adls2_client\n        self.file_system_client = self.adls2_client.get_file_system_client(file_system)\n        # We also need a blob client to handle copying as ADLS doesn't have a copy API yet\n        self.blob_client = blob_client\n        self.blob_container_client = self.blob_client.get_container_client(file_system)\n        self.prefix = check.str_param(prefix, "prefix")\n\n        self.lease_client_constructor = lease_client_constructor\n        self.lease_duration = _LEASE_DURATION\n        self.file_system_client.get_file_system_properties()\n\n    def _get_path(self, context: Union[InputContext, OutputContext]) -> str:\n        keys = context.get_identifier()\n        run_id = keys[0]\n        output_identifiers = keys[1:]  # variable length because of mapping key\n        return "/".join(\n            [\n                self.prefix,\n                "storage",\n                run_id,\n                "files",\n                *output_identifiers,\n            ]\n        )\n\n    def _rm_object(self, key):\n        check.str_param(key, "key")\n        check.param_invariant(len(key) > 0, "key")\n\n        file_client = self.file_system_client.get_file_client(key)\n        with self._acquire_lease(file_client, is_rm=True) as lease:\n            file_client.delete_file(lease=lease, recursive=True)\n\n    def _has_object(self, key):\n        check.str_param(key, "key")\n        check.param_invariant(len(key) > 0, "key")\n\n        try:\n            file = self.file_system_client.get_file_client(key)\n            file.get_file_properties()\n            return True\n        except ResourceNotFoundError:\n            return False\n\n    def _uri_for_key(self, key, protocol=None):\n        check.str_param(key, "key")\n        protocol = check.opt_str_param(protocol, "protocol", default="abfss://")\n        return "{protocol}{filesystem}@{account}.dfs.core.windows.net/{key}".format(\n            protocol=protocol,\n            filesystem=self.file_system_client.file_system_name,\n            account=self.file_system_client.account_name,\n            key=key,\n        )\n\n    @contextmanager\n    def _acquire_lease(self, client, is_rm=False):\n        lease_client = self.lease_client_constructor(client=client)\n        try:\n            lease_client.acquire(lease_duration=self.lease_duration)\n            yield lease_client.id\n        finally:\n            # cannot release a lease on a file that no longer exists, so need to check\n            if not is_rm:\n                lease_client.release()\n\n    def load_input(self, context):\n        key = self._get_path(context)\n        context.log.debug(f"Loading ADLS2 object from: {self._uri_for_key(key)}")\n        file = self.file_system_client.get_file_client(key)\n        stream = file.download_file()\n        obj = pickle.loads(stream.readall())\n\n        return obj\n\n    def handle_output(self, context, obj):\n        key = self._get_path(context)\n        
context.log.debug(f"Writing ADLS2 object at: {self._uri_for_key(key)}")\n\n        if self._has_object(key):\n            context.log.warning(f"Removing existing ADLS2 key: {key}")\n            self._rm_object(key)\n\n        pickled_obj = pickle.dumps(obj, PICKLE_PROTOCOL)\n\n        file = self.file_system_client.create_file(key)\n        with self._acquire_lease(file) as lease:\n            file.upload_data(pickled_obj, lease=lease, overwrite=True)\n\n\n
[docs]@io_manager(\n    config_schema={\n        "adls2_file_system": Field(StringSource, description="ADLS Gen2 file system name"),\n        "adls2_prefix": Field(StringSource, is_required=False, default_value="dagster"),\n    },\n    required_resource_keys={"adls2"},\n)\ndef adls2_pickle_io_manager(init_context):\n    """Persistent IO manager using Azure Data Lake Storage Gen2 for storage.\n\n    Serializes objects via pickling. Suitable for object storage for distributed executors, so long\n    as each execution node has network connectivity and credentials for ADLS and the backing\n    container.\n\n    Attach this resource definition to your job in order to make it available to all your ops:\n\n    .. code-block:: python\n\n        @job(resource_defs={\n            'io_manager': adls2_pickle_io_manager,\n            'adls2': adls2_resource,\n            ...,\n        })\n        def my_job():\n            ...\n\n    You may configure this storage as follows:\n\n    .. code-block:: YAML\n\n        resources:\n            io_manager:\n                config:\n                    adls2_file_system: my-cool-file-system\n                    adls2_prefix: good/prefix-for-files-\n    """\n    adls_resource = init_context.resources.adls2\n    adls2_client = adls_resource.adls2_client\n    blob_client = adls_resource.blob_client\n    lease_client = adls_resource.lease_client_constructor\n    pickled_io_manager = PickledObjectADLS2IOManager(\n        init_context.resource_config["adls2_file_system"],\n        adls2_client,\n        blob_client,\n        lease_client,\n        init_context.resource_config.get("adls2_prefix"),\n    )\n    return pickled_io_manager
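# A configuration sketch equivalent to the YAML above, assuming the same placeholder
# file system and prefix values; binds config in Python via the configured() API.
configured_adls2_io_manager = adls2_pickle_io_manager.configured(
    {"adls2_file_system": "my-cool-file-system", "adls2_prefix": "good/prefix-for-files-"}
)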
\n\n\nclass PickledObjectADLS2AssetIOManager(PickledObjectADLS2IOManager):\n def _get_path(self, context: Union[InputContext, OutputContext]) -> str:\n return "/".join([self.prefix, *context.get_asset_identifier()])\n\n\n
[docs]@io_manager(\n    config_schema={\n        "adls2_file_system": Field(StringSource, description="ADLS Gen2 file system name"),\n        "adls2_prefix": Field(StringSource, is_required=False, default_value="dagster"),\n    },\n    required_resource_keys={"adls2"},\n)\ndef adls2_pickle_asset_io_manager(init_context):\n    """Persistent IO manager using Azure Data Lake Storage Gen2 for storage, meant for use with\n    software-defined assets.\n\n    Each asset is assigned to a single filesystem path, so subsequent materializations of an asset\n    will overwrite previous materializations of that asset.\n\n    Serializes objects via pickling. Suitable for object storage for distributed executors, so long\n    as each execution node has network connectivity and credentials for ADLS and the backing\n    container.\n\n    Attach this resource definition to your asset group in order to make it available to your assets:\n\n    .. code-block:: python\n\n        asset_group = AssetGroup(\n            assets...,\n            resource_defs={'io_manager': adls2_pickle_asset_io_manager, "adls2": adls2_resource, ...},\n        )\n\n    You may configure this storage as follows:\n\n    .. code-block:: YAML\n\n        resources:\n            io_manager:\n                config:\n                    adls2_file_system: my-cool-file-system\n                    adls2_prefix: good/prefix-for-files\n    """\n    adls_resource = init_context.resources.adls2\n    adls2_client = adls_resource.adls2_client\n    blob_client = adls_resource.blob_client\n    lease_client = adls_resource.lease_client_constructor\n    pickled_io_manager = PickledObjectADLS2AssetIOManager(\n        init_context.resource_config["adls2_file_system"],\n        adls2_client,\n        blob_client,\n        lease_client,\n        init_context.resource_config.get("adls2_prefix"),\n    )\n    return pickled_io_manager
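# A usage sketch mirroring the docstring example, assuming a single placeholder asset
# and that adls2_resource is importable from dagster_azure.adls2 as shown elsewhere
# in this documentation.
from dagster import AssetGroup, asset
from dagster_azure.adls2 import adls2_resource


@asset
def example_asset():
    return {"rows": 3}  # placeholder payload; pickled to ADLS2 on materialization


example_asset_group = AssetGroup(
    [example_asset],
    resource_defs={"io_manager": adls2_pickle_asset_io_manager, "adls2": adls2_resource},
)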
\n
", "current_page_name": "_modules/dagster_azure/adls2/io_manager", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_azure.adls2.io_manager"}, "resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_azure.adls2.resources

\nfrom azure.storage.filedatalake import DataLakeLeaseClient\nfrom dagster_azure.blob.utils import create_blob_client\n\nfrom dagster import Field, Selector, StringSource, resource\nfrom dagster.utils.merger import merge_dicts\n\nfrom .file_manager import ADLS2FileManager\nfrom .utils import create_adls2_client\n\nADLS2_CLIENT_CONFIG = {\n    "storage_account": Field(StringSource, description="The storage account name."),\n    "credential": Field(\n        Selector(\n            {\n                "sas": Field(StringSource, description="SAS token for the account."),\n                "key": Field(StringSource, description="Shared Access Key for the account"),\n            }\n        ),\n        description="The credentials with which to authenticate.",\n    ),\n}\n\n\n
[docs]@resource(ADLS2_CLIENT_CONFIG)\ndef adls2_resource(context):\n """Resource that gives ops access to Azure Data Lake Storage Gen2.\n\n The underlying client is a :py:class:`~azure.storage.filedatalake.DataLakeServiceClient`.\n\n Attach this resource definition to a :py:class:`~dagster.JobDefinition` in order to make it\n available to your ops.\n\n Example:\n\n .. code-block:: python\n\n from dagster import job, op\n from dagster_azure.adls2 import adls2_resource\n\n @op(required_resource_keys={'adls2'})\n def example_adls2_op(context):\n return list(context.resources.adls2.adls2_client.list_file_systems())\n\n @job(resource_defs={"adls2": adls2_resource})\n def my_job():\n example_adls2_op()\n\n Note that your ops must also declare that they require this resource with\n `required_resource_keys`, or it will not be initialized for the execution of their compute\n functions.\n\n You may pass credentials to this resource using either a SAS token or a key, using\n environment variables if desired:\n\n .. code-block:: YAML\n\n resources:\n adls2:\n config:\n storage_account: my_storage_account\n # str: The storage account name.\n credential:\n sas: my_sas_token\n # str: the SAS token for the account.\n key:\n env: AZURE_DATA_LAKE_STORAGE_KEY\n # str: The shared access key for the account.\n """\n return _adls2_resource_from_config(context.resource_config)
\n\n\n
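# A configuration sketch for adls2_resource, assuming the placeholder account name
# below and that AZURE_DATA_LAKE_STORAGE_KEY is set in the environment.
adls2_resource_with_key = adls2_resource.configured(
    {
        "storage_account": "my_storage_account",
        "credential": {"key": {"env": "AZURE_DATA_LAKE_STORAGE_KEY"}},
    }
)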
[docs]@resource(\n merge_dicts(\n ADLS2_CLIENT_CONFIG,\n {\n "adls2_file_system": Field(StringSource, description="ADLS Gen2 file system name"),\n "adls2_prefix": Field(StringSource, is_required=False, default_value="dagster"),\n },\n )\n)\ndef adls2_file_manager(context):\n """FileManager that provides abstract access to ADLS2.\n\n Implements the :py:class:`~dagster.core.storage.file_manager.FileManager` API.\n """\n adls2_client = _adls2_resource_from_config(context.resource_config).adls2_client\n\n return ADLS2FileManager(\n adls2_client=adls2_client,\n file_system=context.resource_config["adls2_file_system"],\n prefix=context.resource_config["adls2_prefix"],\n )
\n\n\nclass ADLS2Resource:\n """Resource containing clients to access Azure Data Lake Storage Gen2.\n\n Contains a client for both the Data Lake and Blob APIs, to work around the limitations\n of each.\n """\n\n def __init__(self, storage_account, credential):\n self._adls2_client = create_adls2_client(storage_account, credential)\n self._blob_client = create_blob_client(storage_account, credential)\n self._lease_client_constructor = DataLakeLeaseClient\n\n @property\n def adls2_client(self):\n return self._adls2_client\n\n @property\n def blob_client(self):\n return self._blob_client\n\n @property\n def lease_client_constructor(self):\n return self._lease_client_constructor\n\n\ndef _adls2_resource_from_config(config):\n """\n Args:\n config: A configuration containing the fields in ADLS2_CLIENT_CONFIG.\n\n Returns: An adls2 client.\n """\n storage_account = config["storage_account"]\n credential = config["credential"].copy().popitem()[1]\n return ADLS2Resource(storage_account, credential)\n
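# A minimal usage sketch for adls2_file_manager, assuming the resource key
# "file_manager" and placeholder account, credential, and file system values.
from dagster import job, op


@op(required_resource_keys={"file_manager"})
def write_report(context):
    # write_data is part of the FileManager API implemented by ADLS2FileManager
    return context.resources.file_manager.write_data(b"report body")


@job(
    resource_defs={"file_manager": adls2_file_manager},
    config={
        "resources": {
            "file_manager": {
                "config": {
                    "storage_account": "my_storage_account",  # placeholder
                    "credential": {"sas": "my_sas_token"},  # placeholder SAS token
                    "adls2_file_system": "my-file-system",  # placeholder
                }
            }
        }
    },
)
def report_job():
    write_report()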
", "current_page_name": "_modules/dagster_azure/adls2/resources", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_azure.adls2.resources"}}, "blob": {"compute_log_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_azure.blob.compute_log_manager

\nimport itertools\nimport os\nfrom contextlib import contextmanager\n\nimport dagster.seven as seven\nfrom dagster import Field, StringSource\nfrom dagster import _check as check\nfrom dagster.core.storage.compute_log_manager import (\n    MAX_BYTES_FILE_READ,\n    ComputeIOType,\n    ComputeLogFileData,\n    ComputeLogManager,\n)\nfrom dagster.core.storage.local_compute_log_manager import IO_TYPE_EXTENSION, LocalComputeLogManager\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster.utils import ensure_dir, ensure_file\n\nfrom .utils import create_blob_client, generate_blob_sas\n\n\n
[docs]class AzureBlobComputeLogManager(ComputeLogManager, ConfigurableClass):\n """Logs op compute function stdout and stderr to Azure Blob Storage.\n\n This is also compatible with Azure Data Lake Storage.\n\n Users should not instantiate this class directly. Instead, use a YAML block in ``dagster.yaml``\n such as the following:\n\n .. code-block:: YAML\n\n compute_logs:\n module: dagster_azure.blob.compute_log_manager\n class: AzureBlobComputeLogManager\n config:\n storage_account: my-storage-account\n container: my-container\n credential: sas-token-or-secret-key\n prefix: "dagster-test-"\n local_dir: "/tmp/cool"\n\n Args:\n storage_account (str): The storage account name to which to log.\n container (str): The container (or ADLS2 filesystem) to which to log.\n secret_key (str): Secret key for the storage account. SAS tokens are not\n supported because we need a secret key to generate a SAS token for a download URL.\n local_dir (Optional[str]): Path to the local directory in which to stage logs. Default:\n ``dagster.seven.get_system_temp_directory()``.\n prefix (Optional[str]): Prefix for the log file keys.\n inst_data (Optional[ConfigurableClassData]): Serializable representation of the compute\n log manager when newed up from config.\n """\n\n def __init__(\n self,\n storage_account,\n container,\n secret_key,\n local_dir=None,\n inst_data=None,\n prefix="dagster",\n ):\n self._storage_account = check.str_param(storage_account, "storage_account")\n self._container = check.str_param(container, "container")\n self._blob_prefix = check.str_param(prefix, "prefix")\n check.str_param(secret_key, "secret_key")\n\n self._blob_client = create_blob_client(storage_account, secret_key)\n self._container_client = self._blob_client.get_container_client(container)\n self._download_urls = {}\n\n # proxy calls to local compute log manager (for subscriptions, etc)\n if not local_dir:\n local_dir = seven.get_system_temp_directory()\n\n self.local_manager = LocalComputeLogManager(local_dir)\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @contextmanager\n def _watch_logs(self, pipeline_run, step_key=None):\n # proxy watching to the local compute log manager, interacting with the filesystem\n with self.local_manager._watch_logs( # pylint: disable=protected-access\n pipeline_run, step_key\n ):\n yield\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {\n "storage_account": StringSource,\n "container": StringSource,\n "secret_key": StringSource,\n "local_dir": Field(StringSource, is_required=False),\n "prefix": Field(StringSource, is_required=False, default_value="dagster"),\n }\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return AzureBlobComputeLogManager(inst_data=inst_data, **config_value)\n\n def get_local_path(self, run_id, key, io_type):\n return self.local_manager.get_local_path(run_id, key, io_type)\n\n def on_watch_start(self, pipeline_run, step_key):\n self.local_manager.on_watch_start(pipeline_run, step_key)\n\n def on_watch_finish(self, pipeline_run, step_key):\n self.local_manager.on_watch_finish(pipeline_run, step_key)\n key = self.local_manager.get_key(pipeline_run, step_key)\n self._upload_from_local(pipeline_run.run_id, key, ComputeIOType.STDOUT)\n self._upload_from_local(pipeline_run.run_id, key, ComputeIOType.STDERR)\n\n def is_watch_completed(self, run_id, key):\n return self.local_manager.is_watch_completed(run_id, key)\n\n def download_url(self, run_id, key, 
io_type):\n if not self.is_watch_completed(run_id, key):\n return self.local_manager.download_url(run_id, key, io_type)\n key = self._blob_key(run_id, key, io_type)\n if key in self._download_urls:\n return self._download_urls[key]\n blob = self._container_client.get_blob_client(key)\n sas = generate_blob_sas(\n self._storage_account,\n self._container,\n key,\n account_key=self._blob_client.credential.account_key,\n )\n url = blob.url + sas\n self._download_urls[key] = url\n return url\n\n def read_logs_file(self, run_id, key, io_type, cursor=0, max_bytes=MAX_BYTES_FILE_READ):\n if self._should_download(run_id, key, io_type):\n self._download_to_local(run_id, key, io_type)\n data = self.local_manager.read_logs_file(run_id, key, io_type, cursor, max_bytes)\n return self._from_local_file_data(run_id, key, io_type, data)\n\n def on_subscribe(self, subscription):\n self.local_manager.on_subscribe(subscription)\n\n def on_unsubscribe(self, subscription):\n self.local_manager.on_unsubscribe(subscription)\n\n def _should_download(self, run_id, key, io_type):\n local_path = self.get_local_path(run_id, key, io_type)\n if os.path.exists(local_path):\n return False\n blob_objects = self._container_client.list_blobs(self._blob_key(run_id, key, io_type))\n # Limit the generator to avoid paging since we only need one element\n # to return True\n limited_blob_objects = itertools.islice(blob_objects, 1)\n return len(list(limited_blob_objects)) > 0\n\n def _from_local_file_data(self, run_id, key, io_type, local_file_data):\n is_complete = self.is_watch_completed(run_id, key)\n path = (\n "https://{account}.blob.core.windows.net/{container}/{key}".format(\n account=self._storage_account,\n container=self._container,\n key=self._blob_key(run_id, key, io_type),\n )\n if is_complete\n else local_file_data.path\n )\n\n return ComputeLogFileData(\n path,\n local_file_data.data,\n local_file_data.cursor,\n local_file_data.size,\n self.download_url(run_id, key, io_type),\n )\n\n def _upload_from_local(self, run_id, key, io_type):\n path = self.get_local_path(run_id, key, io_type)\n ensure_file(path)\n key = self._blob_key(run_id, key, io_type)\n with open(path, "rb") as data:\n blob = self._container_client.get_blob_client(key)\n blob.upload_blob(data)\n\n def _download_to_local(self, run_id, key, io_type):\n path = self.get_local_path(run_id, key, io_type)\n ensure_dir(os.path.dirname(path))\n key = self._blob_key(run_id, key, io_type)\n with open(path, "wb") as fileobj:\n blob = self._container_client.get_blob_client(key)\n blob.download_blob().readinto(fileobj)\n\n def _blob_key(self, run_id, key, io_type):\n check.inst_param(io_type, "io_type", ComputeIOType)\n extension = IO_TYPE_EXTENSION[io_type]\n paths = [\n self._blob_prefix,\n "storage",\n run_id,\n "compute_logs",\n "{}.{}".format(key, extension),\n ]\n return "/".join(paths) # blob path delimiter\n\n def dispose(self):\n self.local_manager.dispose()
\n
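# A construction sketch for local testing, assuming the placeholder account, container,
# and key values below; in a deployment the manager is built from the dagster.yaml block
# shown in the class docstring.
def _build_test_log_manager(tmp_dir):
    return AzureBlobComputeLogManager(
        storage_account="my-storage-account",  # placeholder
        container="my-container",  # placeholder
        secret_key="not-a-real-key",  # placeholder shared key
        local_dir=tmp_dir,  # stage logs under a temporary directory
        prefix="dagster-test-",
    )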
", "current_page_name": "_modules/dagster_azure/blob/compute_log_manager", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_azure.blob.compute_log_manager"}}}, "dagster_celery": {"executor": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_celery.executor

\nfrom dagster import Executor, Field, Noneable, Permissive, StringSource\nfrom dagster import _check as check\nfrom dagster import executor, multiple_process_executor_requirements\nfrom dagster.core.execution.retries import RetryMode, get_retries_config\nfrom dagster.grpc.types import ExecuteStepArgs\nfrom dagster.serdes import pack_value\n\nfrom .config import DEFAULT_CONFIG, dict_wrapper\nfrom .defaults import broker_url, result_backend\n\nCELERY_CONFIG = {\n    "broker": Field(\n        Noneable(StringSource),\n        is_required=False,\n        description=(\n            "The URL of the Celery broker. Default: "\n            "'pyamqp://guest@{os.getenv('DAGSTER_CELERY_BROKER_HOST',"\n            "'localhost')}//'."\n        ),\n    ),\n    "backend": Field(\n        Noneable(StringSource),\n        is_required=False,\n        default_value="rpc://",\n        description="The URL of the Celery results backend. Default: 'rpc://'.",\n    ),\n    "include": Field(\n        [str], is_required=False, description="List of modules every worker should import"\n    ),\n    "config_source": Field(\n        Noneable(Permissive()),\n        is_required=False,\n        description="Additional settings for the Celery app.",\n    ),\n    "retries": get_retries_config(),\n}\n\n\n
[docs]@executor(\n name="celery",\n config_schema=CELERY_CONFIG,\n requirements=multiple_process_executor_requirements(),\n)\ndef celery_executor(init_context):\n """Celery-based executor.\n\n The Celery executor exposes config settings for the underlying Celery app under\n the ``config_source`` key. This config corresponds to the "new lowercase settings" introduced\n in Celery version 4.0 and the object constructed from config will be passed to the\n :py:class:`celery.Celery` constructor as its ``config_source`` argument.\n (See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)\n\n The executor also exposes the ``broker``, `backend`, and ``include`` arguments to the\n :py:class:`celery.Celery` constructor.\n\n In the most common case, you may want to modify the ``broker`` and ``backend`` (e.g., to use\n Redis instead of RabbitMQ). We expect that ``config_source`` will be less frequently\n modified, but that when solid executions are especially fast or slow, or when there are\n different requirements around idempotence or retry, it may make sense to execute jobs\n with variations on these settings.\n\n To use the `celery_executor`, set it as the `executor_def` when defining a job:\n\n .. code-block:: python\n\n from dagster import job\n from dagster_celery import celery_executor\n\n @job(executor_def=celery_executor)\n def celery_enabled_job():\n pass\n\n Then you can configure the executor as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n broker: 'pyamqp://guest@localhost//' # Optional[str]: The URL of the Celery broker\n backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n config_source: # Dict[str, Any]: Any additional parameters to pass to the\n #... # Celery workers. This dict will be passed as the `config_source`\n #... # argument of celery.Celery().\n\n Note that the YAML you provide here must align with the configuration with which the Celery\n workers on which you hope to run were started. If, for example, you point the executor at a\n different broker than the one your workers are listening to, the workers will never be able to\n pick up tasks for execution.\n """\n\n return CeleryExecutor(\n broker=init_context.executor_config.get("broker"),\n backend=init_context.executor_config.get("backend"),\n config_source=init_context.executor_config.get("config_source"),\n include=init_context.executor_config.get("include"),\n retries=RetryMode.from_config(init_context.executor_config["retries"]),\n )
\n\n\ndef _submit_task(app, plan_context, step, queue, priority, known_state):\n from .tasks import create_task\n\n execute_step_args = ExecuteStepArgs(\n pipeline_origin=plan_context.reconstructable_pipeline.get_python_origin(),\n pipeline_run_id=plan_context.pipeline_run.run_id,\n step_keys_to_execute=[step.key],\n instance_ref=plan_context.instance.get_ref(),\n retry_mode=plan_context.executor.retries.for_inner_plan(),\n known_state=known_state,\n )\n\n task = create_task(app)\n task_signature = task.si(\n execute_step_args_packed=pack_value(execute_step_args),\n executable_dict=plan_context.reconstructable_pipeline.to_dict(),\n )\n return task_signature.apply_async(\n priority=priority,\n queue=queue,\n routing_key="{queue}.execute_plan".format(queue=queue),\n )\n\n\nclass CeleryExecutor(Executor):\n def __init__(\n self,\n retries,\n broker=None,\n backend=None,\n include=None,\n config_source=None,\n ):\n self.broker = check.opt_str_param(broker, "broker", default=broker_url)\n self.backend = check.opt_str_param(backend, "backend", default=result_backend)\n self.include = check.opt_list_param(include, "include", of_type=str)\n self.config_source = dict_wrapper(\n dict(DEFAULT_CONFIG, **check.opt_dict_param(config_source, "config_source"))\n )\n self._retries = check.inst_param(retries, "retries", RetryMode)\n\n @property\n def retries(self):\n return self._retries\n\n def execute(self, plan_context, execution_plan):\n from .core_execution_loop import core_celery_execution_loop\n\n return core_celery_execution_loop(\n plan_context, execution_plan, step_execution_fn=_submit_task\n )\n\n @staticmethod\n def for_cli(broker=None, backend=None, include=None, config_source=None):\n return CeleryExecutor(\n retries=RetryMode(RetryMode.DISABLED),\n broker=broker,\n backend=backend,\n include=include,\n config_source=config_source,\n )\n\n def app_args(self):\n return {\n "broker": self.broker,\n "backend": self.backend,\n "include": self.include,\n "config_source": self.config_source,\n "retries": self.retries,\n }\n
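# A configuration sketch pointing the executor at Redis, assuming the placeholder
# broker/backend URLs below match how your Celery workers were started.
from dagster import job, op


@op
def noop():
    return 1


@job(
    executor_def=celery_executor.configured(
        {"broker": "redis://localhost:6379/0", "backend": "redis://localhost:6379/1"}
    )
)
def redis_backed_celery_job():
    noop()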
", "current_page_name": "_modules/dagster_celery/executor", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_celery.executor"}}, "dagster_celery_docker": {"executor": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_celery_docker.executor

\nimport json\nimport os\n\nimport docker.client\nfrom dagster_celery.config import DEFAULT_CONFIG, dict_wrapper\nfrom dagster_celery.core_execution_loop import DELEGATE_MARKER, core_celery_execution_loop\nfrom dagster_celery.defaults import broker_url, result_backend\nfrom dagster_celery.executor import CELERY_CONFIG\n\nfrom dagster import DagsterInstance, Executor, Field, MetadataEntry, Permissive, StringSource\nfrom dagster import _check as check\nfrom dagster import executor, multiple_process_executor_requirements\nfrom dagster.cli.api import ExecuteStepArgs\nfrom dagster.core.events import EngineEventData\nfrom dagster.core.events.utils import filter_dagster_events_from_cli_logs\nfrom dagster.core.execution.retries import RetryMode\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.serdes import pack_value, serialize_dagster_namedtuple, unpack_value\nfrom dagster.utils import merge_dicts\n\nCELERY_DOCKER_CONFIG_KEY = "celery-docker"\n\n\ndef celery_docker_config():\n    additional_config = {\n        "docker": Field(\n            {\n                "image": Field(\n                    StringSource,\n                    is_required=False,\n                    description="The docker image to be used for step execution.",\n                ),\n                "registry": Field(\n                    {\n                        "url": Field(StringSource),\n                        "username": Field(StringSource),\n                        "password": Field(StringSource),\n                    },\n                    is_required=False,\n                    description="Information for using a non local/public docker registry",\n                ),\n                "env_vars": Field(\n                    [str],\n                    is_required=False,\n                    description="The list of environment variables names to forward from the celery worker in to the docker container",\n                ),\n                "network": Field(\n                    str,\n                    is_required=False,\n                    description="Name of the network this container will be connected to at creation time",\n                ),\n                "container_kwargs": Field(\n                    Permissive(),\n                    is_required=False,\n                    description="Additional keyword args for the docker container",\n                ),\n            },\n            is_required=True,\n            description="The configuration for interacting with docker in the celery worker.",\n        ),\n    }\n\n    cfg = merge_dicts(CELERY_CONFIG, additional_config)\n    return cfg\n\n\n
[docs]@executor(\n name=CELERY_DOCKER_CONFIG_KEY,\n config_schema=celery_docker_config(),\n requirements=multiple_process_executor_requirements(),\n)\ndef celery_docker_executor(init_context):\n """Celery-based executor which launches tasks in docker containers.\n\n The Celery executor exposes config settings for the underlying Celery app under\n the ``config_source`` key. This config corresponds to the "new lowercase settings" introduced\n in Celery version 4.0 and the object constructed from config will be passed to the\n :py:class:`celery.Celery` constructor as its ``config_source`` argument.\n (See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)\n\n The executor also exposes the ``broker``, `backend`, and ``include`` arguments to the\n :py:class:`celery.Celery` constructor.\n\n In the most common case, you may want to modify the ``broker`` and ``backend`` (e.g., to use\n Redis instead of RabbitMQ). We expect that ``config_source`` will be less frequently\n modified, but that when op executions are especially fast or slow, or when there are\n different requirements around idempotence or retry, it may make sense to execute jobs\n with variations on these settings.\n\n To use the `celery_docker_executor`, set it as the `executor_def` when defining a job:\n\n .. code-block:: python\n\n from dagster import job\n from dagster_celery_docker.executor import celery_docker_executor\n\n @job(executor_def=celery_docker_executor)\n def celery_enabled_job():\n pass\n\n Then you can configure the executor as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n docker:\n image: 'my_repo.com/image_name:latest'\n registry:\n url: 'my_repo.com'\n username: 'my_user'\n password: {env: 'DOCKER_PASSWORD'}\n env_vars: ["DAGSTER_HOME"] # environment vars to pass from celery worker to docker\n container_kwargs: # keyword args to be passed to the container. example:\n volumes: ['/home/user1/:/mnt/vol2','/var/www:/mnt/vol1']\n\n broker: 'pyamqp://guest@localhost//' # Optional[str]: The URL of the Celery broker\n backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n config_source: # Dict[str, Any]: Any additional parameters to pass to the\n #... # Celery workers. This dict will be passed as the `config_source`\n #... # argument of celery.Celery().\n\n Note that the YAML you provide here must align with the configuration with which the Celery\n workers on which you hope to run were started. If, for example, you point the executor at a\n different broker than the one your workers are listening to, the workers will never be able to\n pick up tasks for execution.\n\n In deployments where the celery_docker_job_executor is used all appropriate celery and dagster_celery\n commands must be invoked with the `-A dagster_celery_docker.app` argument.\n """\n\n exc_cfg = init_context.executor_config\n\n return CeleryDockerExecutor(\n broker=exc_cfg.get("broker"),\n backend=exc_cfg.get("backend"),\n config_source=exc_cfg.get("config_source"),\n include=exc_cfg.get("include"),\n retries=RetryMode.from_config(exc_cfg.get("retries")),\n docker_config=exc_cfg.get("docker"),\n )
\n\n\nclass CeleryDockerExecutor(Executor):\n def __init__(\n self,\n retries,\n docker_config,\n broker=None,\n backend=None,\n include=None,\n config_source=None,\n ):\n self._retries = check.inst_param(retries, "retries", RetryMode)\n self.broker = check.opt_str_param(broker, "broker", default=broker_url)\n self.backend = check.opt_str_param(backend, "backend", default=result_backend)\n self.include = check.opt_list_param(include, "include", of_type=str)\n self.config_source = dict_wrapper(\n dict(DEFAULT_CONFIG, **check.opt_dict_param(config_source, "config_source"))\n )\n self.docker_config = check.dict_param(docker_config, "docker_config")\n\n @property\n def retries(self):\n return self._retries\n\n def execute(self, plan_context, execution_plan):\n\n return core_celery_execution_loop(\n plan_context, execution_plan, step_execution_fn=_submit_task_docker\n )\n\n def app_args(self):\n return {\n "broker": self.broker,\n "backend": self.backend,\n "include": self.include,\n "config_source": self.config_source,\n "retries": self.retries,\n }\n\n\ndef _submit_task_docker(app, plan_context, step, queue, priority, known_state):\n execute_step_args = ExecuteStepArgs(\n pipeline_origin=plan_context.reconstructable_pipeline.get_python_origin(),\n pipeline_run_id=plan_context.pipeline_run.run_id,\n step_keys_to_execute=[step.key],\n instance_ref=plan_context.instance.get_ref(),\n retry_mode=plan_context.executor.retries.for_inner_plan(),\n known_state=known_state,\n )\n\n task = create_docker_task(app)\n task_signature = task.si(\n execute_step_args_packed=pack_value(execute_step_args),\n docker_config=plan_context.executor.docker_config,\n )\n return task_signature.apply_async(\n priority=priority,\n queue=queue,\n routing_key="{queue}.execute_step_docker".format(queue=queue),\n )\n\n\ndef create_docker_task(celery_app, **task_kwargs):\n @celery_app.task(bind=True, name="execute_step_docker", **task_kwargs)\n def _execute_step_docker(\n self,\n execute_step_args_packed,\n docker_config,\n ):\n """Run step execution in a Docker container."""\n execute_step_args = unpack_value(\n check.dict_param(\n execute_step_args_packed,\n "execute_step_args_packed",\n )\n )\n check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)\n\n check.dict_param(docker_config, "docker_config")\n\n instance = DagsterInstance.from_ref(execute_step_args.instance_ref)\n pipeline_run = instance.get_run_by_id(execute_step_args.pipeline_run_id)\n check.inst(\n pipeline_run,\n PipelineRun,\n "Could not load run {}".format(execute_step_args.pipeline_run_id),\n )\n step_keys_str = ", ".join(execute_step_args.step_keys_to_execute)\n\n input_json = serialize_dagster_namedtuple(execute_step_args)\n\n command = "dagster api execute_step {}".format(json.dumps(input_json))\n\n docker_image = (\n docker_config["image"]\n if docker_config.get("image")\n else execute_step_args.pipeline_origin.repository_origin.container_image\n )\n\n if not docker_image:\n raise Exception("No docker image specified by either the job or the repository")\n\n client = docker.client.from_env()\n\n if docker_config.get("registry"):\n client.login(\n registry=docker_config["registry"]["url"],\n username=docker_config["registry"]["username"],\n password=docker_config["registry"]["password"],\n )\n\n # Post event for starting execution\n engine_event = instance.report_engine_event(\n "Executing steps {} in Docker container {}".format(step_keys_str, docker_image),\n pipeline_run,\n EngineEventData(\n [\n MetadataEntry("Step keys", 
value=step_keys_str),\n MetadataEntry("Image", value=docker_image),\n MetadataEntry("Celery worker", value=self.request.hostname),\n ],\n marker_end=DELEGATE_MARKER,\n ),\n CeleryDockerExecutor,\n step_key=execute_step_args.step_keys_to_execute[0],\n )\n\n serialized_events = [serialize_dagster_namedtuple(engine_event)]\n\n docker_env = {}\n if docker_config.get("env_vars"):\n docker_env = {env_name: os.getenv(env_name) for env_name in docker_config["env_vars"]}\n\n container_kwargs = check.opt_dict_param(\n docker_config.get("container_kwargs"), "container_kwargs", key_type=str\n )\n\n # set defaults for detach and auto_remove\n container_kwargs["detach"] = container_kwargs.get("detach", False)\n container_kwargs["auto_remove"] = container_kwargs.get("auto_remove", True)\n\n # if environment variables are provided via container_kwargs, merge with env_vars\n if container_kwargs.get("environment") is not None:\n e_vars = container_kwargs.get("environment")\n if isinstance(e_vars, dict):\n docker_env.update(e_vars)\n else:\n for v in e_vars:\n key, val = v.split("=")\n docker_env[key] = val\n del container_kwargs["environment"]\n\n try:\n docker_response = client.containers.run(\n docker_image,\n command=command,\n # pass through this worker's environment for things like AWS creds etc.\n environment=docker_env,\n network=docker_config.get("network", None),\n **container_kwargs,\n )\n\n res = docker_response.decode("utf-8")\n except docker.errors.ContainerError as err:\n entries = [MetadataEntry("Job image", value=docker_image)]\n if err.stderr is not None:\n entries.append(MetadataEntry("Docker stderr", value=err.stderr))\n\n instance.report_engine_event(\n "Failed to run steps {} in Docker container {}".format(step_keys_str, docker_image),\n pipeline_run,\n EngineEventData(entries),\n CeleryDockerExecutor,\n step_key=execute_step_args.step_keys_to_execute[0],\n )\n raise\n else:\n if res is None:\n raise Exception("No response from execute_step in CeleryDockerExecutor")\n\n events = filter_dagster_events_from_cli_logs(res.split("\\n"))\n serialized_events += [serialize_dagster_namedtuple(event) for event in events]\n\n return serialized_events\n\n return _execute_step_docker\n
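# A configuration sketch for the docker-aware executor, assuming the placeholder
# image, environment-variable, and broker values below.
configured_celery_docker_executor = celery_docker_executor.configured(
    {
        "docker": {
            "image": "my_repo.com/image_name:latest",
            "env_vars": ["DAGSTER_HOME"],
        },
        "broker": "pyamqp://guest@localhost//",
    }
)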
", "current_page_name": "_modules/dagster_celery_docker/executor", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_celery_docker.executor"}}, "dagster_celery_k8s": {"executor": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_celery_k8s.executor

\nimport logging\nimport os\nimport sys\nimport time\n\nimport kubernetes\nfrom dagster_celery.config import DEFAULT_CONFIG, dict_wrapper\nfrom dagster_celery.core_execution_loop import DELEGATE_MARKER\nfrom dagster_celery.defaults import broker_url, result_backend\nfrom dagster_k8s import DagsterK8sJobConfig, construct_dagster_k8s_job\nfrom dagster_k8s.client import (\n    DagsterK8sAPIRetryLimitExceeded,\n    DagsterK8sError,\n    DagsterK8sPipelineStatusException,\n    DagsterK8sTimeoutError,\n    DagsterK8sUnrecoverableAPIError,\n)\nfrom dagster_k8s.job import (\n    UserDefinedDagsterK8sConfig,\n    get_k8s_job_name,\n    get_user_defined_k8s_config,\n)\nfrom dagster_k8s.utils import (\n    delete_job,\n    get_pod_names_in_job,\n    retrieve_pod_logs,\n    wait_for_job_success,\n)\n\nfrom dagster import DagsterEvent, DagsterEventType, DagsterInstance, Executor, MetadataEntry\nfrom dagster import _check as check\nfrom dagster import executor, multiple_process_executor_requirements\nfrom dagster.cli.api import ExecuteStepArgs\nfrom dagster.core.errors import DagsterUnmetExecutorRequirementsError\nfrom dagster.core.events import EngineEventData\nfrom dagster.core.events.log import EventLogEntry\nfrom dagster.core.events.utils import filter_dagster_events_from_cli_logs\nfrom dagster.core.execution.plan.objects import StepFailureData, UserFailureData\nfrom dagster.core.execution.retries import RetryMode\nfrom dagster.core.storage.pipeline_run import PipelineRun, PipelineRunStatus\nfrom dagster.serdes import pack_value, serialize_dagster_namedtuple, unpack_value\nfrom dagster.utils.error import serializable_error_info_from_exc_info\n\nfrom .config import CELERY_K8S_CONFIG_KEY, celery_k8s_executor_config\nfrom .launcher import CeleryK8sRunLauncher\n\n\n
[docs]@executor(\n name=CELERY_K8S_CONFIG_KEY,\n config_schema=celery_k8s_executor_config(),\n requirements=multiple_process_executor_requirements(),\n)\ndef celery_k8s_job_executor(init_context):\n """Celery-based executor which launches tasks as Kubernetes Jobs.\n\n The Celery executor exposes config settings for the underlying Celery app under\n the ``config_source`` key. This config corresponds to the "new lowercase settings" introduced\n in Celery version 4.0 and the object constructed from config will be passed to the\n :py:class:`celery.Celery` constructor as its ``config_source`` argument.\n (See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)\n\n The executor also exposes the ``broker``, `backend`, and ``include`` arguments to the\n :py:class:`celery.Celery` constructor.\n\n In the most common case, you may want to modify the ``broker`` and ``backend`` (e.g., to use\n Redis instead of RabbitMQ). We expect that ``config_source`` will be less frequently\n modified, but that when op executions are especially fast or slow, or when there are\n different requirements around idempotence or retry, it may make sense to execute dagster jobs\n with variations on these settings.\n\n To use the `celery_k8s_job_executor`, set it as the `executor_def` when defining a job:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-celery-k8s/dagster_celery_k8s_tests/example_celery_mode_def.py\n :language: python\n\n Then you can configure the executor as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n job_image: 'my_repo.com/image_name:latest'\n job_namespace: 'some-namespace'\n broker: 'pyamqp://guest@localhost//' # Optional[str]: The URL of the Celery broker\n backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n config_source: # Dict[str, Any]: Any additional parameters to pass to the\n #... # Celery workers. This dict will be passed as the `config_source`\n #... # argument of celery.Celery().\n\n Note that the YAML you provide here must align with the configuration with which the Celery\n workers on which you hope to run were started. 
If, for example, you point the executor at a\n different broker than the one your workers are listening to, the workers will never be able to\n pick up tasks for execution.\n\n In deployments where the celery_k8s_job_executor is used all appropriate celery and dagster_celery\n commands must be invoked with the `-A dagster_celery_k8s.app` argument.\n """\n\n run_launcher = init_context.instance.run_launcher\n exc_cfg = init_context.executor_config\n\n if not isinstance(run_launcher, CeleryK8sRunLauncher):\n raise DagsterUnmetExecutorRequirementsError(\n "This engine is only compatible with a CeleryK8sRunLauncher; configure the "\n "CeleryK8sRunLauncher on your instance to use it.",\n )\n\n job_config = run_launcher.get_k8s_job_config(\n job_image=exc_cfg.get("job_image") or os.getenv("DAGSTER_CURRENT_IMAGE"), exc_config=exc_cfg\n )\n\n # Set on the instance but overrideable here\n broker = run_launcher.broker or exc_cfg.get("broker")\n backend = run_launcher.backend or exc_cfg.get("backend")\n config_source = run_launcher.config_source or exc_cfg.get("config_source")\n include = run_launcher.include or exc_cfg.get("include")\n retries = run_launcher.retries or RetryMode.from_config(exc_cfg.get("retries"))\n\n return CeleryK8sJobExecutor(\n broker=broker,\n backend=backend,\n config_source=config_source,\n include=include,\n retries=retries,\n job_config=job_config,\n job_namespace=exc_cfg.get("job_namespace"),\n load_incluster_config=exc_cfg.get("load_incluster_config"),\n kubeconfig_file=exc_cfg.get("kubeconfig_file"),\n repo_location_name=exc_cfg.get("repo_location_name"),\n job_wait_timeout=exc_cfg.get("job_wait_timeout"),\n )
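# A configuration sketch for the executor, assuming the placeholder image and namespace
# below and an instance already configured with the CeleryK8sRunLauncher.
configured_celery_k8s_executor = celery_k8s_job_executor.configured(
    {"job_image": "my_repo.com/image_name:latest", "job_namespace": "some-namespace"}
)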
\n\n\nclass CeleryK8sJobExecutor(Executor):\n def __init__(\n self,\n retries,\n broker=None,\n backend=None,\n include=None,\n config_source=None,\n job_config=None,\n job_namespace=None,\n load_incluster_config=False,\n kubeconfig_file=None,\n repo_location_name=None,\n job_wait_timeout=None,\n ):\n\n if load_incluster_config:\n check.invariant(\n kubeconfig_file is None,\n "`kubeconfig_file` is set but `load_incluster_config` is True.",\n )\n else:\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n\n self._retries = check.inst_param(retries, "retries", RetryMode)\n self.broker = check.opt_str_param(broker, "broker", default=broker_url)\n self.backend = check.opt_str_param(backend, "backend", default=result_backend)\n self.include = check.opt_list_param(include, "include", of_type=str)\n self.config_source = dict_wrapper(\n dict(DEFAULT_CONFIG, **check.opt_dict_param(config_source, "config_source"))\n )\n self.job_config = check.inst_param(job_config, "job_config", DagsterK8sJobConfig)\n self.job_namespace = check.opt_str_param(job_namespace, "job_namespace", default="default")\n\n self.load_incluster_config = check.bool_param(\n load_incluster_config, "load_incluster_config"\n )\n\n self.kubeconfig_file = check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n self.repo_location_name = check.opt_str_param(repo_location_name, "repo_location_name")\n self.job_wait_timeout = check.float_param(job_wait_timeout, "job_wait_timeout")\n\n @property\n def retries(self):\n return self._retries\n\n def execute(self, plan_context, execution_plan):\n from dagster_celery.core_execution_loop import core_celery_execution_loop\n\n return core_celery_execution_loop(\n plan_context, execution_plan, step_execution_fn=_submit_task_k8s_job\n )\n\n def app_args(self):\n return {\n "broker": self.broker,\n "backend": self.backend,\n "include": self.include,\n "config_source": self.config_source,\n "retries": self.retries,\n }\n\n\ndef _submit_task_k8s_job(app, plan_context, step, queue, priority, known_state):\n user_defined_k8s_config = get_user_defined_k8s_config(step.tags)\n\n pipeline_origin = plan_context.reconstructable_pipeline.get_python_origin()\n\n execute_step_args = ExecuteStepArgs(\n pipeline_origin=pipeline_origin,\n pipeline_run_id=plan_context.pipeline_run.run_id,\n step_keys_to_execute=[step.key],\n instance_ref=plan_context.instance.get_ref(),\n retry_mode=plan_context.executor.retries.for_inner_plan(),\n known_state=known_state,\n should_verify_step=True,\n )\n\n job_config = plan_context.executor.job_config\n if not job_config.job_image:\n job_config = job_config.with_image(pipeline_origin.repository_origin.container_image)\n\n if not job_config.job_image:\n raise Exception("No image included in either executor config or the dagster job")\n\n task = create_k8s_job_task(app)\n task_signature = task.si(\n execute_step_args_packed=pack_value(execute_step_args),\n job_config_dict=job_config.to_dict(),\n job_namespace=plan_context.executor.job_namespace,\n user_defined_k8s_config_dict=user_defined_k8s_config.to_dict(),\n load_incluster_config=plan_context.executor.load_incluster_config,\n job_wait_timeout=plan_context.executor.job_wait_timeout,\n kubeconfig_file=plan_context.executor.kubeconfig_file,\n )\n\n return task_signature.apply_async(\n priority=priority,\n queue=queue,\n routing_key="{queue}.execute_step_k8s_job".format(queue=queue),\n )\n\n\ndef construct_step_failure_event_and_handle(pipeline_run, step_key, err, instance):\n step_failure_event = DagsterEvent(\n 
event_type_value=DagsterEventType.STEP_FAILURE.value,\n pipeline_name=pipeline_run.pipeline_name,\n step_key=step_key,\n event_specific_data=StepFailureData(\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n user_failure_data=UserFailureData(label="K8sError"),\n ),\n )\n event_record = EventLogEntry(\n user_message=str(err),\n level=logging.ERROR,\n pipeline_name=pipeline_run.pipeline_name,\n run_id=pipeline_run.run_id,\n error_info=None,\n step_key=step_key,\n timestamp=time.time(),\n dagster_event=step_failure_event,\n )\n instance.handle_new_event(event_record)\n return step_failure_event\n\n\ndef create_k8s_job_task(celery_app, **task_kwargs):\n @celery_app.task(bind=True, name="execute_step_k8s_job", **task_kwargs)\n def _execute_step_k8s_job(\n self,\n execute_step_args_packed,\n job_config_dict,\n job_namespace,\n load_incluster_config,\n job_wait_timeout,\n user_defined_k8s_config_dict=None,\n kubeconfig_file=None,\n ):\n """Run step execution in a K8s job pod."""\n execute_step_args = unpack_value(\n check.dict_param(\n execute_step_args_packed,\n "execute_step_args_packed",\n )\n )\n check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)\n check.invariant(\n len(execute_step_args.step_keys_to_execute) == 1,\n "Celery K8s task executor can only execute 1 step at a time",\n )\n\n # Celery will serialize this as a list\n job_config = DagsterK8sJobConfig.from_dict(job_config_dict)\n check.inst_param(job_config, "job_config", DagsterK8sJobConfig)\n check.str_param(job_namespace, "job_namespace")\n\n check.bool_param(load_incluster_config, "load_incluster_config")\n\n user_defined_k8s_config = UserDefinedDagsterK8sConfig.from_dict(\n user_defined_k8s_config_dict\n )\n check.opt_inst_param(\n user_defined_k8s_config,\n "user_defined_k8s_config",\n UserDefinedDagsterK8sConfig,\n )\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n\n # For when launched via DinD or running the cluster\n if load_incluster_config:\n kubernetes.config.load_incluster_config()\n else:\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n instance = DagsterInstance.from_ref(execute_step_args.instance_ref)\n pipeline_run = instance.get_run_by_id(execute_step_args.pipeline_run_id)\n\n check.inst(\n pipeline_run,\n PipelineRun,\n "Could not load run {}".format(execute_step_args.pipeline_run_id),\n )\n step_key = execute_step_args.step_keys_to_execute[0]\n\n celery_worker_name = self.request.hostname\n celery_pod_name = os.environ.get("HOSTNAME")\n instance.report_engine_event(\n "Task for step {step_key} picked up by Celery".format(step_key=step_key),\n pipeline_run,\n EngineEventData(\n [\n MetadataEntry("Celery worker name", value=celery_worker_name),\n MetadataEntry("Celery worker Kubernetes Pod name", value=celery_pod_name),\n ]\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n\n if pipeline_run.status != PipelineRunStatus.STARTED:\n instance.report_engine_event(\n "Not scheduling step because dagster run status is not STARTED",\n pipeline_run,\n EngineEventData(\n [\n MetadataEntry("Step key", value=step_key),\n ]\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n return []\n\n # Ensure we stay below k8s name length limits\n k8s_name_key = get_k8s_job_name(execute_step_args.pipeline_run_id, step_key)\n\n retry_state = execute_step_args.known_state.get_retry_state()\n\n if retry_state.get_attempt_count(step_key):\n attempt_number = retry_state.get_attempt_count(step_key)\n job_name = "dagster-step-%s-%d" % (k8s_name_key, attempt_number)\n pod_name = 
"dagster-step-%s-%d" % (k8s_name_key, attempt_number)\n else:\n job_name = "dagster-step-%s" % (k8s_name_key)\n pod_name = "dagster-step-%s" % (k8s_name_key)\n\n args = execute_step_args.get_command_args()\n\n job = construct_dagster_k8s_job(\n job_config,\n args,\n job_name,\n user_defined_k8s_config,\n pod_name,\n component="step_worker",\n labels={\n "dagster/job": execute_step_args.pipeline_origin.pipeline_name,\n "dagster/op": step_key,\n "dagster/run-id": execute_step_args.pipeline_run_id,\n },\n )\n\n # Running list of events generated from this task execution\n events = []\n\n # Post event for starting execution\n job_name = job.metadata.name\n engine_event = instance.report_engine_event(\n "Executing step {} in Kubernetes job {}".format(step_key, job_name),\n pipeline_run,\n EngineEventData(\n [\n MetadataEntry("Step key", value=step_key),\n MetadataEntry("Kubernetes Job name", value=job_name),\n MetadataEntry("Job image", value=job_config.job_image),\n MetadataEntry("Image pull policy", value=job_config.image_pull_policy),\n MetadataEntry("Image pull secrets", value=str(job_config.image_pull_secrets)),\n MetadataEntry(\n "Service account name", value=str(job_config.service_account_name)\n ),\n ],\n marker_end=DELEGATE_MARKER,\n ),\n CeleryK8sJobExecutor,\n # validated above that step_keys is length 1, and it is not possible to use ETH or\n # execution plan in this function (Celery K8s workers should not access to user code)\n step_key=step_key,\n )\n events.append(engine_event)\n try:\n kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)\n except kubernetes.client.rest.ApiException as e:\n if e.reason == "Conflict":\n # There is an existing job with the same name so proceed and see if the existing job succeeded\n instance.report_engine_event(\n "Did not create Kubernetes job {} for step {} since job name already "\n "exists, proceeding with existing job.".format(job_name, step_key),\n pipeline_run,\n EngineEventData(\n [\n MetadataEntry("Step key", value=step_key),\n MetadataEntry("Kubernetes Job name", value=job_name),\n ],\n marker_end=DELEGATE_MARKER,\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n else:\n instance.report_engine_event(\n "Encountered unexpected error while creating Kubernetes job {} for step {}, "\n "exiting.".format(job_name, step_key),\n pipeline_run,\n EngineEventData(\n [\n MetadataEntry("Step key", value=step_key),\n ],\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n return []\n\n try:\n wait_for_job_success(\n job_name=job_name,\n namespace=job_namespace,\n instance=instance,\n run_id=execute_step_args.pipeline_run_id,\n wait_timeout=job_wait_timeout,\n )\n except (DagsterK8sError, DagsterK8sTimeoutError) as err:\n step_failure_event = construct_step_failure_event_and_handle(\n pipeline_run, step_key, err, instance=instance\n )\n events.append(step_failure_event)\n except DagsterK8sPipelineStatusException:\n instance.report_engine_event(\n "Terminating Kubernetes Job because dagster run status is not STARTED",\n pipeline_run,\n EngineEventData(\n [\n MetadataEntry("Step key", value=step_key),\n MetadataEntry("Kubernetes Job name", value=job_name),\n MetadataEntry("Kubernetes Job namespace", value=job_namespace),\n ]\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n delete_job(job_name=job_name, namespace=job_namespace)\n return []\n except (\n DagsterK8sUnrecoverableAPIError,\n DagsterK8sAPIRetryLimitExceeded,\n # We shouldn't see 
unwrapped APIExceptions anymore, as they should all be wrapped in\n # a retry boundary. We still catch it here just in case we missed one so that we can\n # report it to the event log\n kubernetes.client.rest.ApiException,\n ):\n instance.report_engine_event(\n "Encountered unexpected error while waiting on Kubernetes job {} for step {}, "\n "exiting.".format(job_name, step_key),\n pipeline_run,\n EngineEventData(\n [\n MetadataEntry("Step key", value=step_key),\n ],\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n return []\n\n try:\n pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)\n except kubernetes.client.rest.ApiException:\n instance.report_engine_event(\n "Encountered unexpected error retreiving Pods for Kubernetes job {} for step {}, "\n "exiting.".format(job_name, step_key),\n pipeline_run,\n EngineEventData(\n [\n MetadataEntry("Step key", value=step_key),\n ],\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n return []\n\n # Post engine event for log retrieval\n engine_event = instance.report_engine_event(\n "Retrieving logs from Kubernetes Job pods",\n pipeline_run,\n EngineEventData([MetadataEntry("Pod names", value="\\n".join(pod_names))]),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n events.append(engine_event)\n\n logs = []\n for pod_name in pod_names:\n try:\n raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)\n logs += raw_logs.split("\\n")\n except kubernetes.client.rest.ApiException:\n instance.report_engine_event(\n "Encountered unexpected error while fetching pod logs for Kubernetes job {}, "\n "Pod name {} for step {}. Will attempt to continue with other pods.".format(\n job_name, pod_name, step_key\n ),\n pipeline_run,\n EngineEventData(\n [\n MetadataEntry("Step key", value=step_key),\n ],\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n\n events += filter_dagster_events_from_cli_logs(logs)\n serialized_events = [serialize_dagster_namedtuple(event) for event in events]\n return serialized_events\n\n return _execute_step_k8s_job\n
", "current_page_name": "_modules/dagster_celery_k8s/executor", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_celery_k8s.executor"}, "launcher": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_celery_k8s.launcher

\nimport sys\nfrom typing import cast\n\nimport kubernetes\nfrom dagster_k8s.job import (\n    DagsterK8sJobConfig,\n    construct_dagster_k8s_job,\n    get_job_name_from_run_id,\n    get_user_defined_k8s_config,\n)\nfrom dagster_k8s.utils import delete_job\n\nfrom dagster import DagsterInvariantViolationError, MetadataEntry\nfrom dagster import _check as check\nfrom dagster.config.field import resolve_to_config_type\nfrom dagster.config.validate import process_config\nfrom dagster.core.events import EngineEventData\nfrom dagster.core.execution.retries import RetryMode\nfrom dagster.core.launcher import LaunchRunContext, RunLauncher\nfrom dagster.core.launcher.base import CheckRunHealthResult, WorkerStatus\nfrom dagster.core.origin import PipelinePythonOrigin\nfrom dagster.core.storage.pipeline_run import PipelineRun, PipelineRunStatus\nfrom dagster.core.storage.tags import DOCKER_IMAGE_TAG\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster.utils import frozentags, merge_dicts\nfrom dagster.utils.error import serializable_error_info_from_exc_info\n\nfrom .config import CELERY_K8S_CONFIG_KEY, celery_k8s_executor_config\n\n\n
[docs]class CeleryK8sRunLauncher(RunLauncher, ConfigurableClass):\n """In contrast to the :py:class:`K8sRunLauncher`, which launches dagster runs as single K8s\n Jobs, this run launcher is intended for use in concert with\n :py:func:`dagster_celery_k8s.celery_k8s_job_executor`.\n\n With this run launcher, execution is delegated to:\n\n 1. A run worker Kubernetes Job, which traverses the dagster run execution plan and\n submits steps to Celery queues for execution;\n 2. The step executions which are submitted to Celery queues are picked up by Celery workers,\n and each step execution spawns a step execution Kubernetes Job. See the implementation\n defined in :py:func:`dagster_celery_k8.executor.create_k8s_job_task`.\n\n You can configure a Dagster instance to use this RunLauncher by adding a section to your\n ``dagster.yaml`` like the following:\n\n .. code-block:: yaml\n\n run_launcher:\n module: dagster_k8s.launcher\n class: CeleryK8sRunLauncher\n config:\n instance_config_map: "dagster-k8s-instance-config-map"\n dagster_home: "/some/path"\n postgres_password_secret: "dagster-k8s-pg-password"\n broker: "some_celery_broker_url"\n backend: "some_celery_backend_url"\n\n """\n\n def __init__(\n self,\n instance_config_map,\n dagster_home,\n postgres_password_secret,\n load_incluster_config=True,\n kubeconfig_file=None,\n broker=None,\n backend=None,\n include=None,\n config_source=None,\n retries=None,\n inst_data=None,\n k8s_client_batch_api=None,\n env_config_maps=None,\n env_secrets=None,\n volume_mounts=None,\n volumes=None,\n service_account_name=None,\n image_pull_policy=None,\n image_pull_secrets=None,\n labels=None,\n fail_pod_on_run_failure=None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n if load_incluster_config:\n check.invariant(\n kubeconfig_file is None,\n "`kubeconfig_file` is set but `load_incluster_config` is True.",\n )\n kubernetes.config.load_incluster_config()\n else:\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n self._fixed_batch_api = k8s_client_batch_api\n\n self.instance_config_map = check.str_param(instance_config_map, "instance_config_map")\n self.dagster_home = check.str_param(dagster_home, "dagster_home")\n self.postgres_password_secret = check.str_param(\n postgres_password_secret, "postgres_password_secret"\n )\n self.broker = check.opt_str_param(broker, "broker")\n self.backend = check.opt_str_param(backend, "backend")\n self.include = check.opt_list_param(include, "include")\n self.config_source = check.opt_dict_param(config_source, "config_source")\n\n retries = check.opt_dict_param(retries, "retries") or {"enabled": {}}\n self.retries = RetryMode.from_config(retries)\n\n self._env_config_maps = check.opt_list_param(\n env_config_maps, "env_config_maps", of_type=str\n )\n self._env_secrets = check.opt_list_param(env_secrets, "env_secrets", of_type=str)\n\n self._volume_mounts = check.opt_list_param(volume_mounts, "volume_mounts")\n self._volumes = check.opt_list_param(volumes, "volumes")\n\n self._service_account_name = check.opt_str_param(\n service_account_name, "service_account_name"\n )\n self._image_pull_policy = check.opt_str_param(\n image_pull_policy, "image_pull_policy", "IfNotPresent"\n )\n self._image_pull_secrets = check.opt_list_param(\n image_pull_secrets, "image_pull_secrets", of_type=dict\n )\n self._labels = check.opt_dict_param(labels, "labels", key_type=str, value_type=str)\n self._fail_pod_on_run_failure = 
check.opt_bool_param(\n fail_pod_on_run_failure, "fail_pod_on_run_failure"\n )\n\n super().__init__()\n\n @property\n def _batch_api(self):\n return self._fixed_batch_api if self._fixed_batch_api else kubernetes.client.BatchV1Api()\n\n @classmethod\n def config_type(cls):\n from dagster_celery.executor import CELERY_CONFIG\n\n return merge_dicts(DagsterK8sJobConfig.config_type_run_launcher(), CELERY_CONFIG)\n\n @classmethod\n def from_config_value(cls, inst_data, config_value):\n return cls(inst_data=inst_data, **config_value)\n\n @property\n def inst_data(self):\n return self._inst_data\n\n def launch_run(self, context: LaunchRunContext) -> None:\n run = context.pipeline_run\n\n job_name = get_job_name_from_run_id(run.run_id)\n pod_name = job_name\n exc_config = _get_validated_celery_k8s_executor_config(run.run_config)\n env_vars = None\n\n job_image_from_executor_config = exc_config.get("job_image")\n\n pipeline_origin = cast(PipelinePythonOrigin, context.pipeline_code_origin)\n repository_origin = pipeline_origin.repository_origin\n\n job_image = repository_origin.container_image\n\n if job_image:\n if job_image_from_executor_config:\n job_image = job_image_from_executor_config\n self._instance.report_engine_event(\n f"You have specified a job_image {job_image_from_executor_config} in your executor configuration, "\n f"but also {job_image} in your user-code deployment. Using the job image {job_image_from_executor_config} "\n f"from executor configuration as it takes precedence.",\n run,\n cls=self.__class__,\n )\n else:\n if not job_image_from_executor_config:\n raise DagsterInvariantViolationError(\n "You have not specified a job_image in your executor configuration. "\n "To resolve this error, specify the job_image configuration in the executor "\n "config section in your run config. \\n"\n "Note: You may also be seeing this error because you are using the configured API. 
"\n "Using configured with the celery-k8s executor is not supported at this time, "\n "and the job_image must be configured at the top-level executor config without "\n "using configured."\n )\n\n job_image = job_image_from_executor_config\n\n job_config = self.get_k8s_job_config(job_image, exc_config)\n\n self._instance.add_run_tags(\n run.run_id,\n {DOCKER_IMAGE_TAG: job_config.job_image},\n )\n\n user_defined_k8s_config = get_user_defined_k8s_config(frozentags(run.tags))\n\n from dagster.cli.api import ExecuteRunArgs\n\n run_args = ExecuteRunArgs(\n pipeline_origin=pipeline_origin,\n pipeline_run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n set_exit_code_on_failure=self._fail_pod_on_run_failure,\n ).get_command_args()\n\n job = construct_dagster_k8s_job(\n job_config,\n args=run_args,\n job_name=job_name,\n pod_name=pod_name,\n component="run_worker",\n user_defined_k8s_config=user_defined_k8s_config,\n env_vars=env_vars,\n labels={\n "dagster/job": pipeline_origin.pipeline_name,\n "dagster/run-id": run.run_id,\n },\n )\n\n job_namespace = exc_config.get("job_namespace")\n\n self._instance.report_engine_event(\n "Creating Kubernetes run worker job",\n run,\n EngineEventData(\n [\n MetadataEntry("Kubernetes Job name", value=job_name),\n MetadataEntry("Kubernetes Namespace", value=job_namespace),\n MetadataEntry("Run ID", value=run.run_id),\n ]\n ),\n cls=self.__class__,\n )\n\n self._batch_api.create_namespaced_job(body=job, namespace=job_namespace)\n self._instance.report_engine_event(\n "Kubernetes run worker job created",\n run,\n EngineEventData(\n [\n MetadataEntry("Kubernetes Job name", value=job_name),\n MetadataEntry("Kubernetes Namespace", value=job_namespace),\n MetadataEntry("Run ID", value=run.run_id),\n ]\n ),\n cls=self.__class__,\n )\n\n def get_k8s_job_config(self, job_image, exc_config):\n return DagsterK8sJobConfig(\n dagster_home=self.dagster_home,\n instance_config_map=self.instance_config_map,\n postgres_password_secret=self.postgres_password_secret,\n job_image=check.opt_str_param(job_image, "job_image"),\n image_pull_policy=exc_config.get("image_pull_policy", self._image_pull_policy),\n image_pull_secrets=exc_config.get("image_pull_secrets", []) + self._image_pull_secrets,\n service_account_name=exc_config.get("service_account_name", self._service_account_name),\n env_config_maps=exc_config.get("env_config_maps", []) + self._env_config_maps,\n env_secrets=exc_config.get("env_secrets", []) + self._env_secrets,\n volume_mounts=exc_config.get("volume_mounts", []) + self._volume_mounts,\n volumes=exc_config.get("volumes", []) + self._volumes,\n labels=merge_dicts(self._labels, exc_config.get("labels", {})),\n )\n\n # https://github.com/dagster-io/dagster/issues/2741\n def can_terminate(self, run_id):\n check.str_param(run_id, "run_id")\n\n pipeline_run = self._instance.get_run_by_id(run_id)\n if not pipeline_run:\n return False\n\n if pipeline_run.status != PipelineRunStatus.STARTED:\n return False\n\n return True\n\n def terminate(self, run_id):\n check.str_param(run_id, "run_id")\n\n run = self._instance.get_run_by_id(run_id)\n if not run:\n return False\n\n can_terminate = self.can_terminate(run_id)\n if not can_terminate:\n self._instance.report_engine_event(\n message="Unable to terminate dagster job: can_terminate returned {}.".format(\n can_terminate\n ),\n pipeline_run=run,\n cls=self.__class__,\n )\n return False\n\n job_name = get_job_name_from_run_id(run_id)\n\n job_namespace = self.get_namespace_from_run_config(run_id)\n\n 
self._instance.report_run_canceling(run)\n\n try:\n termination_result = delete_job(job_name=job_name, namespace=job_namespace)\n if termination_result:\n self._instance.report_engine_event(\n message="Dagster Job was terminated successfully.",\n pipeline_run=run,\n cls=self.__class__,\n )\n else:\n self._instance.report_engine_event(\n message="Dagster Job was not terminated successfully; delete_job returned {}".format(\n termination_result\n ),\n pipeline_run=run,\n cls=self.__class__,\n )\n return termination_result\n except Exception:\n self._instance.report_engine_event(\n message="Dagster Job was not terminated successfully; encountered error in delete_job",\n pipeline_run=run,\n engine_event_data=EngineEventData.engine_error(\n serializable_error_info_from_exc_info(sys.exc_info())\n ),\n cls=self.__class__,\n )\n\n def get_namespace_from_run_config(self, run_id):\n check.str_param(run_id, "run_id")\n\n pipeline_run = self._instance.get_run_by_id(run_id)\n run_config = pipeline_run.run_config\n executor_config = _get_validated_celery_k8s_executor_config(run_config)\n return executor_config.get("job_namespace")\n\n @property\n def supports_check_run_worker_health(self):\n return True\n\n def check_run_worker_health(self, run: PipelineRun):\n job_namespace = _get_validated_celery_k8s_executor_config(run.run_config).get(\n "job_namespace"\n )\n job_name = get_job_name_from_run_id(run.run_id)\n try:\n job = self._batch_api.read_namespaced_job(namespace=job_namespace, name=job_name)\n except Exception:\n return CheckRunHealthResult(\n WorkerStatus.UNKNOWN, str(serializable_error_info_from_exc_info(sys.exc_info()))\n )\n if job.status.failed:\n return CheckRunHealthResult(WorkerStatus.FAILED, "K8s job failed")\n return CheckRunHealthResult(WorkerStatus.RUNNING)
\n\n\ndef _get_validated_celery_k8s_executor_config(run_config):\n check.dict_param(run_config, "run_config")\n\n executor_config = run_config.get("execution", {})\n execution_config_schema = resolve_to_config_type(celery_k8s_executor_config())\n\n # In run config on jobs, we don't have an executor key\n if not CELERY_K8S_CONFIG_KEY in executor_config:\n\n execution_run_config = executor_config.get("config", {})\n else:\n execution_run_config = (run_config["execution"][CELERY_K8S_CONFIG_KEY] or {}).get(\n "config", {}\n )\n\n res = process_config(execution_config_schema, execution_run_config)\n\n check.invariant(\n res.success,\n "Incorrect execution schema provided. Note: You may also be seeing this error "\n "because you are using the configured API. "\n "Using configured with the {config_key} executor is not supported at this time, "\n "and all executor config must be directly in the run config without using configured.".format(\n config_key=CELERY_K8S_CONFIG_KEY,\n ),\n )\n\n return res.value\n
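The launcher above resolves executor configuration from the run config via `_get_validated_celery_k8s_executor_config`, which has to handle both job-style and pipeline-style run configs. A minimal standalone sketch of that lookup follows; it is illustrative only, not the library API, and the `"celery-k8s"` key value is assumed from the executor name.

```python
# Illustrative sketch of the config lookup performed by
# _get_validated_celery_k8s_executor_config above (schema validation omitted).
CELERY_K8S_CONFIG_KEY = "celery-k8s"  # assumed value of the imported constant

def extract_celery_k8s_executor_config(run_config: dict) -> dict:
    execution = run_config.get("execution", {})
    if CELERY_K8S_CONFIG_KEY not in execution:
        # Job-style run config: executor config sits directly under "execution".
        return execution.get("config", {})
    # Pipeline-style run config: nested under the executor name.
    return (execution[CELERY_K8S_CONFIG_KEY] or {}).get("config", {})

# Both of these resolve to {"job_namespace": "dagster"}:
extract_celery_k8s_executor_config({"execution": {"config": {"job_namespace": "dagster"}}})
extract_celery_k8s_executor_config(
    {"execution": {"celery-k8s": {"config": {"job_namespace": "dagster"}}}}
)
```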
", "current_page_name": "_modules/dagster_celery_k8s/launcher", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_celery_k8s.launcher"}}, "dagster_dask": {"executor": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dask.executor

\nimport dask\nimport dask.distributed\n\nfrom dagster import Executor, Field, Permissive, Selector, StringSource\nfrom dagster import _check as check\nfrom dagster import multiple_process_executor_requirements, seven\nfrom dagster.core.definitions.executor_definition import executor\nfrom dagster.core.errors import raise_execution_interrupts\nfrom dagster.core.events import DagsterEvent\nfrom dagster.core.execution.api import create_execution_plan, execute_plan\nfrom dagster.core.execution.context.system import PlanOrchestrationContext\nfrom dagster.core.execution.plan.plan import ExecutionPlan\nfrom dagster.core.execution.retries import RetryMode\nfrom dagster.core.instance import DagsterInstance\nfrom dagster.utils import frozentags, iterate_with_context\n\n# Dask resource requirements are specified under this key\nDASK_RESOURCE_REQUIREMENTS_KEY = "dagster-dask/resource_requirements"\n\n\n
[docs]@executor(\n name="dask",\n requirements=multiple_process_executor_requirements(),\n config_schema={\n "cluster": Field(\n Selector(\n {\n "existing": Field(\n {"address": StringSource},\n description="Connect to an existing scheduler.",\n ),\n "local": Field(\n Permissive(), is_required=False, description="Local cluster configuration."\n ),\n "yarn": Field(\n Permissive(), is_required=False, description="YARN cluster configuration."\n ),\n "ssh": Field(\n Permissive(), is_required=False, description="SSH cluster configuration."\n ),\n "pbs": Field(\n Permissive(), is_required=False, description="PBS cluster configuration."\n ),\n "moab": Field(\n Permissive(), is_required=False, description="Moab cluster configuration."\n ),\n "sge": Field(\n Permissive(), is_required=False, description="SGE cluster configuration."\n ),\n "lsf": Field(\n Permissive(), is_required=False, description="LSF cluster configuration."\n ),\n "slurm": Field(\n Permissive(), is_required=False, description="SLURM cluster configuration."\n ),\n "oar": Field(\n Permissive(), is_required=False, description="OAR cluster configuration."\n ),\n "kube": Field(\n Permissive(),\n is_required=False,\n description="Kubernetes cluster configuration.",\n ),\n }\n )\n )\n },\n)\ndef dask_executor(init_context):\n """Dask-based executor.\n\n The 'cluster' can be one of the following:\n ('existing', 'local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube').\n\n If the Dask executor is used without providing executor-specific config, a local Dask cluster\n will be created (as when calling :py:class:`dask.distributed.Client() <dask:distributed.Client>`\n with :py:class:`dask.distributed.LocalCluster() <dask:distributed.LocalCluster>`).\n\n The Dask executor optionally takes the following config:\n\n .. code-block:: none\n\n cluster:\n {\n local?: # takes distributed.LocalCluster parameters\n {\n timeout?: 5, # Timeout duration for initial connection to the scheduler\n n_workers?: 4 # Number of workers to start\n threads_per_worker?: 1 # Number of threads per each worker\n }\n }\n\n To use the `dask_executor`, set it as the `executor_def` when defining a job:\n\n .. code-block:: python\n\n from dagster import job\n from dagster_dask import dask_executor\n\n @job(executor_def=dask_executor)\n def dask_enabled_job():\n pass\n\n """\n ((cluster_type, cluster_configuration),) = init_context.executor_config["cluster"].items()\n return DaskExecutor(cluster_type, cluster_configuration)
\n\n\ndef query_on_dask_worker(\n dependencies,\n recon_pipeline,\n pipeline_run,\n run_config,\n step_keys,\n mode,\n instance_ref,\n known_state,\n): # pylint: disable=unused-argument\n """Note that we need to pass "dependencies" to ensure Dask sequences futures during task\n scheduling, even though we do not use this argument within the function.\n """\n\n with DagsterInstance.from_ref(instance_ref) as instance:\n subset_pipeline = recon_pipeline.subset_for_execution_from_existing_pipeline(\n pipeline_run.solids_to_execute\n )\n\n execution_plan = create_execution_plan(\n subset_pipeline,\n run_config=run_config,\n step_keys_to_execute=step_keys,\n mode=mode,\n known_state=known_state,\n )\n\n return execute_plan(\n execution_plan, subset_pipeline, instance, pipeline_run, run_config=run_config\n )\n\n\ndef get_dask_resource_requirements(tags):\n check.inst_param(tags, "tags", frozentags)\n req_str = tags.get(DASK_RESOURCE_REQUIREMENTS_KEY)\n if req_str is not None:\n return seven.json.loads(req_str)\n\n return {}\n\n\nclass DaskExecutor(Executor):\n def __init__(self, cluster_type, cluster_configuration):\n self.cluster_type = check.opt_str_param(cluster_type, "cluster_type", default="local")\n self.cluster_configuration = check.opt_dict_param(\n cluster_configuration, "cluster_configuration"\n )\n\n @property\n def retries(self):\n return RetryMode.DISABLED\n\n def execute(self, plan_context, execution_plan):\n check.inst_param(plan_context, "plan_context", PlanOrchestrationContext)\n check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n check.param_invariant(\n isinstance(plan_context.executor, DaskExecutor),\n "plan_context",\n "Expected executor to be DaskExecutor got {}".format(plan_context.executor),\n )\n\n check.invariant(\n plan_context.instance.is_persistent,\n "Dask execution requires a persistent DagsterInstance",\n )\n\n step_levels = execution_plan.get_steps_to_execute_by_level()\n\n pipeline_name = plan_context.pipeline_name\n\n instance = plan_context.instance\n\n cluster_type = self.cluster_type\n if cluster_type == "existing":\n # address passed directly to Client() below to connect to existing Scheduler\n cluster = self.cluster_configuration["address"]\n elif cluster_type == "local":\n from dask.distributed import LocalCluster\n\n cluster = LocalCluster(**self.build_dict(pipeline_name))\n elif cluster_type == "yarn":\n from dask_yarn import YarnCluster\n\n cluster = YarnCluster(**self.build_dict(pipeline_name))\n elif cluster_type == "ssh":\n from dask.distributed import SSHCluster\n\n cluster = SSHCluster(**self.build_dict(pipeline_name))\n elif cluster_type == "pbs":\n from dask_jobqueue import PBSCluster\n\n cluster = PBSCluster(**self.build_dict(pipeline_name))\n elif cluster_type == "moab":\n from dask_jobqueue import MoabCluster\n\n cluster = MoabCluster(**self.build_dict(pipeline_name))\n elif cluster_type == "sge":\n from dask_jobqueue import SGECluster\n\n cluster = SGECluster(**self.build_dict(pipeline_name))\n elif cluster_type == "lsf":\n from dask_jobqueue import LSFCluster\n\n cluster = LSFCluster(**self.build_dict(pipeline_name))\n elif cluster_type == "slurm":\n from dask_jobqueue import SLURMCluster\n\n cluster = SLURMCluster(**self.build_dict(pipeline_name))\n elif cluster_type == "oar":\n from dask_jobqueue import OARCluster\n\n cluster = OARCluster(**self.build_dict(pipeline_name))\n elif cluster_type == "kube":\n from dask_kubernetes import KubeCluster\n\n cluster = KubeCluster(**self.build_dict(pipeline_name))\n else:\n raise 
ValueError(\n f"Must be providing one of the following ('existing', 'local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube') not {cluster_type}"\n )\n\n with dask.distributed.Client(cluster) as client:\n execution_futures = []\n execution_futures_dict = {}\n\n for step_level in step_levels:\n for step in step_level:\n # We ensure correctness in sequencing by letting Dask schedule futures and\n # awaiting dependencies within each step.\n dependencies = []\n for step_input in step.step_inputs:\n for key in step_input.dependency_keys:\n dependencies.append(execution_futures_dict[key])\n\n if plan_context.pipeline.get_definition().is_job:\n run_config = plan_context.run_config\n else:\n run_config = dict(plan_context.run_config, execution={"in_process": {}})\n\n dask_task_name = "%s.%s" % (pipeline_name, step.key)\n\n recon_pipeline = plan_context.reconstructable_pipeline\n\n future = client.submit(\n query_on_dask_worker,\n dependencies,\n recon_pipeline,\n plan_context.pipeline_run,\n run_config,\n [step.key],\n plan_context.pipeline_run.mode,\n instance.get_ref(),\n execution_plan.known_state,\n key=dask_task_name,\n resources=get_dask_resource_requirements(step.tags),\n )\n\n execution_futures.append(future)\n execution_futures_dict[step.key] = future\n\n # This tells Dask to awaits the step executions and retrieve their results to the\n # master\n futures = dask.distributed.as_completed(execution_futures, with_results=True)\n\n # Allow interrupts while waiting for the results from Dask\n for future, result in iterate_with_context(raise_execution_interrupts, futures):\n for step_event in result:\n check.inst(step_event, DagsterEvent)\n yield step_event\n\n def build_dict(self, pipeline_name):\n """Returns a dict we can use for kwargs passed to dask client instantiation.\n\n Intended to be used like:\n\n with dask.distributed.Client(**cfg.build_dict()) as client:\n << use client here >>\n\n """\n if self.cluster_type in ["yarn", "pbs", "moab", "sge", "lsf", "slurm", "oar", "kube"]:\n dask_cfg = {"name": pipeline_name}\n else:\n dask_cfg = {}\n\n if self.cluster_configuration:\n for k, v in self.cluster_configuration.items():\n dask_cfg[k] = v\n\n # if address is set, don't add LocalCluster args\n # context: https://github.com/dask/distributed/issues/3313\n if (self.cluster_type == "local") and ("address" not in dask_cfg):\n # We set threads_per_worker because Dagster is not thread-safe. Even though\n # environments=True by default, there is a clever piece of machinery\n # (dask.distributed.deploy.local.nprocesses_nthreads) that automagically makes execution\n # multithreaded by default when the number of available cores is greater than 4.\n # See: https://github.com/dagster-io/dagster/issues/2181\n # We may want to try to figure out a way to enforce this on remote Dask clusters against\n # which users run Dagster workloads.\n dask_cfg["threads_per_worker"] = 1\n\n return dask_cfg\n
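Building on the docstring example above, here is a hedged run-config sketch for a job that sets `executor_def=dask_executor` and wants a local cluster. The cluster keys come from the docstring; the `execution -> config` nesting is assumed from the job-style executor config convention.

```python
# Hedged sketch: run config selecting a local Dask cluster for a job using
# dask_executor. Supply this when launching the job; note that Dask execution
# requires a persistent DagsterInstance (see the invariant in execute() above).
RUN_CONFIG = {
    "execution": {
        "config": {
            "cluster": {
                "local": {
                    "timeout": 5,             # initial connection timeout to the scheduler
                    "n_workers": 2,           # number of workers to start
                    "threads_per_worker": 1,  # Dagster is not thread-safe (see build_dict above)
                }
            }
        }
    }
}
```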
", "current_page_name": "_modules/dagster_dask/executor", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dask.executor"}}, "dagster_databricks": {"databricks": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_databricks.databricks

\nimport base64\nimport time\n\nimport requests.exceptions\nfrom databricks_api import DatabricksAPI\n\nimport dagster\nimport dagster._check as check\n\nfrom .types import (\n    DATABRICKS_RUN_TERMINATED_STATES,\n    DatabricksRunLifeCycleState,\n    DatabricksRunResultState,\n)\n\n# wait at most 24 hours by default for run execution\nDEFAULT_RUN_MAX_WAIT_TIME_SEC = 24 * 60 * 60\n\n\n
[docs]class DatabricksError(Exception):\n pass
\n\n\nclass DatabricksClient:\n """A thin wrapper over the Databricks REST API."""\n\n def __init__(self, host, token, workspace_id=None):\n self.host = host\n self.workspace_id = workspace_id\n self.client = DatabricksAPI(host=host, token=token)\n\n def submit_run(self, *args, **kwargs):\n """Submit a run directly to the 'Runs Submit' API."""\n return self.client.jobs.submit_run(*args, **kwargs)["run_id"] # pylint: disable=no-member\n\n def read_file(self, dbfs_path, block_size=1024**2):\n """Read a file from DBFS to a **byte string**."""\n\n if dbfs_path.startswith("dbfs://"):\n dbfs_path = dbfs_path[7:]\n data = b""\n bytes_read = 0\n jdoc = self.client.dbfs.read(path=dbfs_path, length=block_size) # pylint: disable=no-member\n data += base64.b64decode(jdoc["data"])\n while jdoc["bytes_read"] == block_size:\n bytes_read += jdoc["bytes_read"]\n jdoc = self.client.dbfs.read( # pylint: disable=no-member\n path=dbfs_path, offset=bytes_read, length=block_size\n )\n data += base64.b64decode(jdoc["data"])\n return data\n\n def put_file(self, file_obj, dbfs_path, overwrite=False, block_size=1024**2):\n """Upload an arbitrary large file to DBFS.\n\n This doesn't use the DBFS `Put` API because that endpoint is limited to 1MB.\n """\n if dbfs_path.startswith("dbfs://"):\n dbfs_path = dbfs_path[7:]\n create_response = self.client.dbfs.create( # pylint: disable=no-member\n path=dbfs_path, overwrite=overwrite\n )\n handle = create_response["handle"]\n\n block = file_obj.read(block_size)\n while block:\n data = base64.b64encode(block).decode("utf-8")\n self.client.dbfs.add_block(data=data, handle=handle) # pylint: disable=no-member\n block = file_obj.read(block_size)\n\n self.client.dbfs.close(handle=handle) # pylint: disable=no-member\n\n def get_run_state(self, databricks_run_id):\n """Get the state of a run by Databricks run ID (_not_ dagster run ID).\n\n Return a `DatabricksRunState` object. Note that the `result_state`\n attribute may be `None` if the run hasn't yet terminated.\n """\n run = self.client.jobs.get_run(databricks_run_id) # pylint: disable=no-member\n state = run["state"]\n result_state = state.get("result_state")\n if result_state:\n result_state = DatabricksRunResultState(result_state)\n return DatabricksRunState(\n life_cycle_state=DatabricksRunLifeCycleState(state["life_cycle_state"]),\n result_state=result_state,\n state_message=state["state_message"],\n )\n\n\nclass DatabricksRunState:\n """Represents the state of a Databricks job run."""\n\n def __init__(self, life_cycle_state, result_state, state_message):\n self.life_cycle_state = life_cycle_state\n self.result_state = result_state\n self.state_message = state_message\n\n def has_terminated(self):\n """Has the job terminated?"""\n return self.life_cycle_state in DATABRICKS_RUN_TERMINATED_STATES\n\n def is_successful(self):\n """Was the job successful?"""\n return self.result_state == DatabricksRunResultState.Success\n\n def __repr__(self):\n return str(self.__dict__)\n\n\nclass DatabricksJobRunner:\n """Submits jobs created using Dagster config to Databricks, and monitors their progress."""\n\n def __init__(\n self, host, token, poll_interval_sec=5, max_wait_time_sec=DEFAULT_RUN_MAX_WAIT_TIME_SEC\n ):\n """Args:\n host (str): Databricks host, e.g. 
https://uksouth.azuredatabricks.net\n token (str): Databricks token\n """\n self.host = check.str_param(host, "host")\n self.token = check.str_param(token, "token")\n self.poll_interval_sec = check.numeric_param(poll_interval_sec, "poll_interval_sec")\n self.max_wait_time_sec = check.int_param(max_wait_time_sec, "max_wait_time_sec")\n\n self._client = DatabricksClient(host=self.host, token=self.token)\n\n @property\n def client(self):\n """Return the underlying `DatabricksClient` object."""\n return self._client\n\n def submit_run(self, run_config, task):\n """Submit a new run using the 'Runs submit' API."""\n existing_cluster_id = run_config["cluster"].get("existing")\n\n new_cluster = run_config["cluster"].get("new")\n\n # The Databricks API needs different keys to be present in API calls depending\n # on new/existing cluster, so we need to process the new_cluster\n # config first.\n if new_cluster:\n new_cluster = new_cluster.copy()\n\n nodes = new_cluster.pop("nodes")\n if "instance_pool_id" in nodes:\n new_cluster["instance_pool_id"] = nodes["instance_pool_id"]\n else:\n node_types = nodes["node_types"]\n new_cluster["node_type_id"] = node_types["node_type_id"]\n if "driver_node_type_id" in node_types:\n new_cluster["driver_node_type_id"] = node_types["driver_node_type_id"]\n\n cluster_size = new_cluster.pop("size")\n if "num_workers" in cluster_size:\n new_cluster["num_workers"] = cluster_size["num_workers"]\n else:\n new_cluster["autoscale"] = cluster_size["autoscale"]\n\n tags = new_cluster.get("custom_tags", [])\n tags.append({"key": "__dagster_version", "value": dagster.__version__})\n new_cluster["custom_tags"] = tags\n\n check.invariant(\n existing_cluster_id is not None or new_cluster is not None,\n "Invalid value for run_config.cluster",\n )\n\n # We'll always need some libraries, namely dagster/dagster_databricks/dagster_pyspark,\n # since they're imported by our scripts.\n # Add them if they're not already added by users in config.\n libraries = list(run_config.get("libraries", []))\n python_libraries = {\n x["pypi"]["package"].split("==")[0].replace("_", "-") for x in libraries if "pypi" in x\n }\n for library in ["dagster", "dagster-databricks", "dagster-pyspark"]:\n if library not in python_libraries:\n libraries.append(\n {"pypi": {"package": "{}=={}".format(library, dagster.__version__)}}\n )\n\n # Only one task should be able to be chosen really; make sure of that here.\n check.invariant(\n sum(\n task.get(key) is not None\n for key in [\n "notebook_task",\n "spark_python_task",\n "spark_jar_task",\n "spark_submit_task",\n ]\n )\n == 1,\n "Multiple tasks specified in Databricks run",\n )\n\n config = dict(\n run_name=run_config.get("run_name"),\n new_cluster=new_cluster,\n existing_cluster_id=existing_cluster_id,\n libraries=libraries,\n **task,\n )\n return self.client.submit_run(**config)\n\n def retrieve_logs_for_run_id(self, log, databricks_run_id):\n """Retrieve the stdout and stderr logs for a run."""\n api_client = self.client.client\n run = api_client.jobs.get_run(databricks_run_id) # pylint: disable=no-member\n cluster = api_client.cluster.get_cluster( # pylint: disable=no-member\n run["cluster_instance"]["cluster_id"]\n )\n log_config = cluster.get("cluster_log_conf")\n if log_config is None:\n log.warn(\n "Logs not configured for cluster {cluster} used for run {run}".format(\n cluster=cluster["cluster_id"], run=databricks_run_id\n )\n )\n return None\n if "s3" in log_config:\n logs_prefix = log_config["s3"]["destination"]\n log.warn("Retrieving S3 logs not 
yet implemented")\n return None\n elif "dbfs" in log_config:\n logs_prefix = log_config["dbfs"]["destination"]\n stdout = self.wait_for_dbfs_logs(log, logs_prefix, cluster["cluster_id"], "stdout")\n stderr = self.wait_for_dbfs_logs(log, logs_prefix, cluster["cluster_id"], "stderr")\n return stdout, stderr\n\n def wait_for_dbfs_logs(\n self, log, prefix, cluster_id, filename, waiter_delay=10, waiter_max_attempts=10\n ):\n """Attempt up to `waiter_max_attempts` attempts to get logs from DBFS."""\n path = "/".join([prefix, cluster_id, "driver", filename])\n log.info("Retrieving logs from {}".format(path))\n num_attempts = 0\n while num_attempts <= waiter_max_attempts:\n try:\n logs = self.client.read_file(path)\n return logs.decode("utf-8")\n except requests.exceptions.HTTPError:\n num_attempts += 1\n time.sleep(waiter_delay)\n log.warn("Could not retrieve cluster logs!")\n\n def wait_for_run_to_complete(self, log, databricks_run_id):\n return wait_for_run_to_complete(\n self.client, log, databricks_run_id, self.poll_interval_sec, self.max_wait_time_sec\n )\n\n\ndef poll_run_state(\n client,\n log,\n start_poll_time: float,\n databricks_run_id: int,\n max_wait_time_sec: float,\n):\n run_state = client.get_run_state(databricks_run_id)\n if run_state.has_terminated():\n if run_state.is_successful():\n log.info("Run %s completed successfully" % databricks_run_id)\n return True\n else:\n error_message = "Run %s failed with result state: %s. Message: %s" % (\n databricks_run_id,\n run_state.result_state,\n run_state.state_message,\n )\n log.error(error_message)\n raise DatabricksError(error_message)\n else:\n log.info("Run %s in state %s" % (databricks_run_id, run_state))\n if time.time() - start_poll_time > max_wait_time_sec:\n raise DatabricksError(\n "Job run {} took more than {}s to complete; failing".format(\n databricks_run_id, max_wait_time_sec\n )\n )\n return False\n\n\ndef wait_for_run_to_complete(client, log, databricks_run_id, poll_interval_sec, max_wait_time_sec):\n """Wait for a Databricks run to complete."""\n check.int_param(databricks_run_id, "databricks_run_id")\n log.info("Waiting for Databricks run %s to complete..." % databricks_run_id)\n start = time.time()\n while True:\n if poll_run_state(client, log, start, databricks_run_id, max_wait_time_sec):\n return\n time.sleep(poll_interval_sec)\n
", "current_page_name": "_modules/dagster_databricks/databricks", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_databricks.databricks"}, "databricks_pyspark_step_launcher": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_databricks.databricks_pyspark_step_launcher

\nimport io\nimport os.path\nimport pickle\nimport tempfile\nimport time\n\nfrom dagster_databricks import databricks_step_main\nfrom dagster_databricks.databricks import (\n    DEFAULT_RUN_MAX_WAIT_TIME_SEC,\n    DatabricksJobRunner,\n    poll_run_state,\n)\nfrom dagster_pyspark.utils import build_pyspark_zip\nfrom requests import HTTPError\n\nfrom dagster import Bool, Field, IntSource, StringSource\nfrom dagster import _check as check\nfrom dagster import resource\nfrom dagster.core.definitions.step_launcher import StepLauncher\nfrom dagster.core.errors import raise_execution_interrupts\nfrom dagster.core.execution.plan.external_step import (\n    PICKLED_EVENTS_FILE_NAME,\n    PICKLED_STEP_RUN_REF_FILE_NAME,\n    step_context_to_step_run_ref,\n)\nfrom dagster.serdes import deserialize_value\nfrom dagster.utils.backoff import backoff\n\nfrom .configs import (\n    define_databricks_secrets_config,\n    define_databricks_storage_config,\n    define_databricks_submit_run_config,\n)\n\nCODE_ZIP_NAME = "code.zip"\nPICKLED_CONFIG_FILE_NAME = "config.pkl"\n\n\n
[docs]@resource(\n {\n "run_config": define_databricks_submit_run_config(),\n "databricks_host": Field(\n StringSource,\n is_required=True,\n description="Databricks host, e.g. uksouth.azuredatabricks.com",\n ),\n "databricks_token": Field(\n StringSource,\n is_required=True,\n description="Databricks access token",\n ),\n "secrets_to_env_variables": define_databricks_secrets_config(),\n "storage": define_databricks_storage_config(),\n "local_pipeline_package_path": Field(\n StringSource,\n is_required=False,\n description="Absolute path to the package that contains the pipeline definition(s) "\n "whose steps will execute remotely on Databricks. This is a path on the local "\n "fileystem of the process executing the pipeline. Before every step run, the "\n "launcher will zip up the code in this path, upload it to DBFS, and unzip it "\n "into the Python path of the remote Spark process. This gives the remote process "\n "access to up-to-date user code.",\n ),\n "local_dagster_job_package_path": Field(\n StringSource,\n is_required=False,\n description="Absolute path to the package that contains the dagster job definition(s) "\n "whose steps will execute remotely on Databricks. This is a path on the local "\n "fileystem of the process executing the dagster job. Before every step run, the "\n "launcher will zip up the code in this path, upload it to DBFS, and unzip it "\n "into the Python path of the remote Spark process. This gives the remote process "\n "access to up-to-date user code.",\n ),\n "staging_prefix": Field(\n StringSource,\n is_required=False,\n default_value="/dagster_staging",\n description="Directory in DBFS to use for uploaded job code. Must be absolute.",\n ),\n "wait_for_logs": Field(\n Bool,\n is_required=False,\n default_value=False,\n description="If set, and if the specified cluster is configured to export logs, "\n "the system will wait after job completion for the logs to appear in the configured "\n "location. Note that logs are copied every 5 minutes, so enabling this will add "\n "several minutes to the job runtime. NOTE: this integration will export stdout/stderr"\n "from the remote Databricks process automatically, so this option is not generally "\n "necessary.",\n ),\n "max_completion_wait_time_seconds": Field(\n IntSource,\n is_required=False,\n default_value=DEFAULT_RUN_MAX_WAIT_TIME_SEC,\n description="If the Databricks job run takes more than this many seconds, then "\n "consider it failed and terminate the step.",\n ),\n "poll_interval_sec": Field(\n float,\n is_required=False,\n default_value=5.0,\n description="How frequently Dagster will poll Databricks to determine the state of the job.",\n ),\n }\n)\ndef databricks_pyspark_step_launcher(context):\n """Resource for running ops as a Databricks Job.\n\n When this resource is used, the op will be executed in Databricks using the 'Run Submit'\n API. Pipeline code will be zipped up and copied to a directory in DBFS along with the op's\n execution context.\n\n Use the 'run_config' configuration to specify the details of the Databricks cluster used, and\n the 'storage' key to configure persistent storage on that cluster. Storage is accessed by\n setting the credentials in the Spark context, as documented `here for S3`_ and `here for ADLS`_.\n\n .. _`here for S3`: https://docs.databricks.com/data/data-sources/aws/amazon-s3.html#alternative-1-set-aws-keys-in-the-spark-context\n .. 
_`here for ADLS`: https://docs.microsoft.com/en-gb/azure/databricks/data/data-sources/azure/azure-datalake-gen2#--access-directly-using-the-storage-account-access-key\n """\n return DatabricksPySparkStepLauncher(**context.resource_config)
\n\n\nclass DatabricksPySparkStepLauncher(StepLauncher):\n def __init__(\n self,\n run_config,\n databricks_host,\n databricks_token,\n secrets_to_env_variables,\n storage,\n staging_prefix,\n wait_for_logs,\n max_completion_wait_time_seconds,\n poll_interval_sec=5,\n local_pipeline_package_path=None,\n local_dagster_job_package_path=None,\n ):\n self.run_config = check.dict_param(run_config, "run_config")\n self.databricks_host = check.str_param(databricks_host, "databricks_host")\n self.databricks_token = check.str_param(databricks_token, "databricks_token")\n self.secrets = check.list_param(secrets_to_env_variables, "secrets_to_env_variables", dict)\n self.storage = check.dict_param(storage, "storage")\n check.invariant(\n local_dagster_job_package_path is not None or local_pipeline_package_path is not None,\n "Missing config: need to provide either 'local_dagster_job_package_path' or 'local_pipeline_package_path' config entry",\n )\n check.invariant(\n local_dagster_job_package_path is None or local_pipeline_package_path is None,\n "Error in config: Provided both 'local_dagster_job_package_path' and 'local_pipeline_package_path' entries. Need to specify one or the other.",\n )\n self.local_dagster_job_package_path = check.str_param(\n local_pipeline_package_path or local_dagster_job_package_path,\n "local_dagster_job_package_path",\n )\n self.staging_prefix = check.str_param(staging_prefix, "staging_prefix")\n check.invariant(staging_prefix.startswith("/"), "staging_prefix must be an absolute path")\n self.wait_for_logs = check.bool_param(wait_for_logs, "wait_for_logs")\n\n self.databricks_runner = DatabricksJobRunner(\n host=databricks_host,\n token=databricks_token,\n poll_interval_sec=poll_interval_sec,\n max_wait_time_sec=max_completion_wait_time_seconds,\n )\n\n def launch_step(self, step_context, prior_attempts_count):\n step_run_ref = step_context_to_step_run_ref(\n step_context, prior_attempts_count, self.local_dagster_job_package_path\n )\n run_id = step_context.pipeline_run.run_id\n log = step_context.log\n\n step_key = step_run_ref.step_key\n self._upload_artifacts(log, step_run_ref, run_id, step_key)\n\n task = self._get_databricks_task(run_id, step_key)\n databricks_run_id = self.databricks_runner.submit_run(self.run_config, task)\n\n try:\n # If this is being called within a `capture_interrupts` context, allow interrupts while\n # waiting for the execution to complete, so that we can terminate slow or hanging steps\n with raise_execution_interrupts():\n yield from self.step_events_iterator(step_context, step_key, databricks_run_id)\n finally:\n self.log_compute_logs(log, run_id, step_key)\n # this is somewhat obsolete\n if self.wait_for_logs:\n self._log_logs_from_cluster(log, databricks_run_id)\n\n def log_compute_logs(self, log, run_id, step_key):\n stdout = self.databricks_runner.client.read_file(\n self._dbfs_path(run_id, step_key, "stdout")\n ).decode()\n stderr = self.databricks_runner.client.read_file(\n self._dbfs_path(run_id, step_key, "stderr")\n ).decode()\n log.info(f"Captured stdout for step {step_key}:")\n log.info(stdout)\n log.info(f"Captured stderr for step {step_key}:")\n log.info(stderr)\n\n def step_events_iterator(self, step_context, step_key: str, databricks_run_id: int):\n """The launched Databricks job writes all event records to a specific dbfs file. 
This iterator\n regularly reads the contents of the file, adds any events that have not yet been seen to\n the instance, and yields any DagsterEvents.\n\n By doing this, we simulate having the remote Databricks process able to directly write to\n the local DagsterInstance. Importantly, this means that timestamps (and all other record\n properties) will be sourced from the Databricks process, rather than recording when this\n process happens to log them.\n """\n\n check.int_param(databricks_run_id, "databricks_run_id")\n processed_events = 0\n start = time.time()\n done = False\n step_context.log.info("Waiting for Databricks run %s to complete..." % databricks_run_id)\n while not done:\n with raise_execution_interrupts():\n step_context.log.debug(\n "Waiting %.1f seconds...", self.databricks_runner.poll_interval_sec\n )\n time.sleep(self.databricks_runner.poll_interval_sec)\n try:\n done = poll_run_state(\n self.databricks_runner.client,\n step_context.log,\n start,\n databricks_run_id,\n self.databricks_runner.max_wait_time_sec,\n )\n finally:\n all_events = self.get_step_events(\n step_context.run_id, step_key, step_context.previous_attempt_count\n )\n # we get all available records on each poll, but we only want to process the\n # ones we haven't seen before\n for event in all_events[processed_events:]:\n # write each event from the DataBricks instance to the local instance\n step_context.instance.handle_new_event(event)\n if event.is_dagster_event:\n yield event.dagster_event\n processed_events = len(all_events)\n\n step_context.log.info(f"Databricks run {databricks_run_id} completed.")\n\n def get_step_events(self, run_id: str, step_key: str, retry_number: int):\n path = self._dbfs_path(run_id, step_key, f"{retry_number}_{PICKLED_EVENTS_FILE_NAME}")\n\n def _get_step_records():\n serialized_records = self.databricks_runner.client.read_file(path)\n if not serialized_records:\n return []\n return deserialize_value(pickle.loads(serialized_records))\n\n try:\n # reading from dbfs while it writes can be flaky\n # allow for retry if we get malformed data\n return backoff(\n fn=_get_step_records,\n retry_on=(pickle.UnpicklingError,),\n max_retries=2,\n )\n # if you poll before the Databricks process has had a chance to create the file,\n # we expect to get this error\n except HTTPError as e:\n if e.response.json().get("error_code") == "RESOURCE_DOES_NOT_EXIST":\n return []\n\n return []\n\n def _get_databricks_task(self, run_id, step_key):\n """Construct the 'task' parameter to be submitted to the Databricks API.\n\n This will create a 'spark_python_task' dict where `python_file` is a path on DBFS\n pointing to the 'databricks_step_main.py' file, and `parameters` is an array with a single\n element, a path on DBFS pointing to the picked `step_run_ref` data.\n\n See https://docs.databricks.com/dev-tools/api/latest/jobs.html#jobssparkpythontask.\n """\n python_file = self._dbfs_path(run_id, step_key, self._main_file_name())\n parameters = [\n self._internal_dbfs_path(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),\n self._internal_dbfs_path(run_id, step_key, PICKLED_CONFIG_FILE_NAME),\n self._internal_dbfs_path(run_id, step_key, CODE_ZIP_NAME),\n ]\n return {"spark_python_task": {"python_file": python_file, "parameters": parameters}}\n\n def _upload_artifacts(self, log, step_run_ref, run_id, step_key):\n """Upload the step run ref and pyspark code to DBFS to run as a job."""\n\n log.info("Uploading main file to DBFS")\n main_local_path = self._main_file_local_path()\n with 
open(main_local_path, "rb") as infile:\n self.databricks_runner.client.put_file(\n infile, self._dbfs_path(run_id, step_key, self._main_file_name()), overwrite=True\n )\n\n log.info("Uploading dagster job to DBFS")\n with tempfile.TemporaryDirectory() as temp_dir:\n # Zip and upload package containing dagster job\n zip_local_path = os.path.join(temp_dir, CODE_ZIP_NAME)\n build_pyspark_zip(zip_local_path, self.local_dagster_job_package_path)\n with open(zip_local_path, "rb") as infile:\n self.databricks_runner.client.put_file(\n infile, self._dbfs_path(run_id, step_key, CODE_ZIP_NAME), overwrite=True\n )\n\n log.info("Uploading step run ref file to DBFS")\n step_pickle_file = io.BytesIO()\n\n pickle.dump(step_run_ref, step_pickle_file)\n step_pickle_file.seek(0)\n self.databricks_runner.client.put_file(\n step_pickle_file,\n self._dbfs_path(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),\n overwrite=True,\n )\n\n databricks_config = DatabricksConfig(\n storage=self.storage,\n secrets=self.secrets,\n )\n log.info("Uploading Databricks configuration to DBFS")\n databricks_config_file = io.BytesIO()\n\n pickle.dump(databricks_config, databricks_config_file)\n databricks_config_file.seek(0)\n self.databricks_runner.client.put_file(\n databricks_config_file,\n self._dbfs_path(run_id, step_key, PICKLED_CONFIG_FILE_NAME),\n overwrite=True,\n )\n\n def _log_logs_from_cluster(self, log, run_id):\n logs = self.databricks_runner.retrieve_logs_for_run_id(log, run_id)\n if logs is None:\n return\n stdout, stderr = logs\n if stderr:\n log.info(stderr)\n if stdout:\n log.info(stdout)\n\n def _main_file_name(self):\n return os.path.basename(self._main_file_local_path())\n\n def _main_file_local_path(self):\n return databricks_step_main.__file__\n\n def _sanitize_step_key(self, step_key: str) -> str:\n # step_keys of dynamic steps contain brackets, which are invalid characters\n return step_key.replace("[", "__").replace("]", "__")\n\n def _dbfs_path(self, run_id, step_key, filename):\n path = "/".join(\n [\n self.staging_prefix,\n run_id,\n self._sanitize_step_key(step_key),\n os.path.basename(filename),\n ]\n )\n return "dbfs://{}".format(path)\n\n def _internal_dbfs_path(self, run_id, step_key, filename):\n """Scripts running on Databricks should access DBFS at /dbfs/."""\n path = "/".join(\n [\n self.staging_prefix,\n run_id,\n self._sanitize_step_key(step_key),\n os.path.basename(filename),\n ]\n )\n return "/dbfs/{}".format(path)\n\n\nclass DatabricksConfig:\n """Represents configuration required by Databricks to run jobs.\n\n Instances of this class will be created when a Databricks step is launched and will contain\n all configuration and secrets required to set up storage and environment variables within\n the Databricks environment. The instance will be serialized and uploaded to Databricks\n by the step launcher, then deserialized as part of the 'main' script when the job is running\n in Databricks.\n\n The `setup` method handles the actual setup prior to op execution on the Databricks side.\n\n This config is separated out from the regular Dagster run config system because the setup\n is done by the 'main' script before entering a Dagster context (i.e. 
using `run_step_from_ref`).\n We use a separate class to avoid coupling the setup to the format of the `step_run_ref` object.\n """\n\n def __init__(self, storage, secrets):\n """Create a new DatabricksConfig object.\n\n `storage` and `secrets` should be of the same shape as the `storage` and\n `secrets_to_env_variables` config passed to `databricks_pyspark_step_launcher`.\n """\n self.storage = storage\n self.secrets = secrets\n\n def setup(self, dbutils, sc):\n """Set up storage and environment variables on Databricks.\n\n The `dbutils` and `sc` arguments must be passed in by the 'main' script, as they\n aren't accessible by any other modules.\n """\n self.setup_storage(dbutils, sc)\n self.setup_environment(dbutils)\n\n def setup_storage(self, dbutils, sc):\n """Set up storage using either S3 or ADLS2."""\n if "s3" in self.storage:\n self.setup_s3_storage(self.storage["s3"], dbutils, sc)\n elif "adls2" in self.storage:\n self.setup_adls2_storage(self.storage["adls2"], dbutils, sc)\n\n def setup_s3_storage(self, s3_storage, dbutils, sc):\n """Obtain AWS credentials from Databricks secrets and export so both Spark and boto can use them."""\n\n scope = s3_storage["secret_scope"]\n\n access_key = dbutils.secrets.get(scope=scope, key=s3_storage["access_key_key"])\n secret_key = dbutils.secrets.get(scope=scope, key=s3_storage["secret_key_key"])\n\n # Spark APIs will use this.\n # See https://docs.databricks.com/data/data-sources/aws/amazon-s3.html#alternative-1-set-aws-keys-in-the-spark-context.\n sc._jsc.hadoopConfiguration().set( # pylint: disable=protected-access\n "fs.s3n.awsAccessKeyId", access_key\n )\n sc._jsc.hadoopConfiguration().set( # pylint: disable=protected-access\n "fs.s3n.awsSecretAccessKey", secret_key\n )\n\n # Boto will use these.\n os.environ["AWS_ACCESS_KEY_ID"] = access_key\n os.environ["AWS_SECRET_ACCESS_KEY"] = secret_key\n\n def setup_adls2_storage(self, adls2_storage, dbutils, sc):\n """Obtain an Azure Storage Account key from Databricks secrets and export so Spark can use it."""\n storage_account_key = dbutils.secrets.get(\n scope=adls2_storage["secret_scope"], key=adls2_storage["storage_account_key_key"]\n )\n # Spark APIs will use this.\n # See https://docs.microsoft.com/en-gb/azure/databricks/data/data-sources/azure/azure-datalake-gen2#--access-directly-using-the-storage-account-access-key\n # sc is globally defined in the Databricks runtime and points to the Spark context\n sc._jsc.hadoopConfiguration().set( # pylint: disable=protected-access\n "fs.azure.account.key.{}.dfs.core.windows.net".format(\n adls2_storage["storage_account_name"]\n ),\n storage_account_key,\n )\n\n def setup_environment(self, dbutils):\n """Setup any environment variables required by the run.\n\n Extract any secrets in the run config and export them as environment variables.\n\n This is important for any `StringSource` config since the environment variables\n won't ordinarily be available in the Databricks execution environment.\n """\n for secret in self.secrets:\n name = secret["name"]\n key = secret["key"]\n scope = secret["scope"]\n print( # pylint: disable=print-call\n "Exporting {} from Databricks secret {}, scope {}".format(name, key, scope)\n )\n val = dbutils.secrets.get(scope=scope, key=key)\n os.environ[name] = val\n
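A hedged sketch of wiring the step launcher into a job follows. The resource key name, secret names, and cluster id are hypothetical; the top-level keys follow the `@resource` config schema above, the `secrets_to_env_variables` and `storage` shapes follow `DatabricksConfig.setup_environment` and `setup_s3_storage`, and the nested `run_config` shape is assumed (it is defined in `define_databricks_submit_run_config`, which is not shown here).

```python
from dagster import job, op
from dagster_databricks import databricks_pyspark_step_launcher

# All values are placeholders; only the config shape is meaningful.
step_launcher = databricks_pyspark_step_launcher.configured(
    {
        "databricks_host": {"env": "DATABRICKS_HOST"},
        "databricks_token": {"env": "DATABRICKS_TOKEN"},
        "local_dagster_job_package_path": "/path/to/my_package",
        "run_config": {"cluster": {"existing": "<existing-cluster-id>"}},  # assumed shape
        "secrets_to_env_variables": [
            {"name": "MY_SECRET", "key": "my-secret-key", "scope": "my-scope"}
        ],
        "storage": {
            "s3": {
                "secret_scope": "my-scope",
                "access_key_key": "aws-access-key",
                "secret_key_key": "aws-secret-key",
            }
        },
    }
)


@op(required_resource_keys={"pyspark_step_launcher"})
def heavy_op(context):
    context.log.info("This step would execute remotely on Databricks.")


@job(resource_defs={"pyspark_step_launcher": step_launcher})
def databricks_job():
    heavy_op()
```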
", "current_page_name": "_modules/dagster_databricks/databricks_pyspark_step_launcher", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_databricks.databricks_pyspark_step_launcher"}, "solids": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_databricks.solids

\nfrom dagster import Field, InputDefinition, Nothing, OutputDefinition, Permissive\nfrom dagster import _check as check\nfrom dagster import op, solid\n\nfrom .databricks import wait_for_run_to_complete\n\n_START = "start"\n\n_DEFAULT_POLL_INTERVAL = 10\n# wait at most 24 hours by default for run execution\n_DEFAULT_RUN_MAX_WAIT_TIME_SEC = 24 * 60 * 60\n\n\n
[docs]def create_databricks_job_op(\n name="databricks_job",\n num_inputs=1,\n description=None,\n required_resource_keys=frozenset(["databricks_client"]),\n):\n """\n Creates an op that launches a databricks job (not to be confused with Dagster's job API).\n\n As config, the op accepts a blob of the form described in Databricks' job API:\n https://docs.databricks.com/dev-tools/api/latest/jobs.html.\n\n Returns:\n OpDefinition: An op definition.\n\n Example:\n\n .. code-block:: python\n\n from dagster import graph\n from dagster_databricks import create_databricks_job_op, databricks_client\n\n sparkpi = create_databricks_job_op().configured(\n {\n "job": {\n "name": "SparkPi Python job",\n "new_cluster": {\n "spark_version": "7.3.x-scala2.12",\n "node_type_id": "i3.xlarge",\n "num_workers": 2,\n },\n "spark_python_task": {"python_file": "dbfs:/docs/pi.py", "parameters": ["10"]},\n }\n },\n name="sparkpi",\n )\n\n @graph\n def my_spark():\n sparkpi()\n\n my_spark.to_job(\n resource_defs={\n "databricks_client": databricks_client.configured(\n {"host": "my.workspace.url", "token": "my.access.token"}\n )\n }\n )\n """\n return core_create_databricks_job(\n dagster_decorator=op,\n name=name,\n num_inputs=num_inputs,\n description=description,\n required_resource_keys=required_resource_keys,\n )
\n\n\n
[docs]def create_databricks_job_solid(\n name="databricks_job",\n num_inputs=1,\n description=None,\n required_resource_keys=frozenset(["databricks_client"]),\n):\n """\n Creates a solid that launches a databricks job.\n\n As config, the solid accepts a blob of the form described in Databricks' job API:\n https://docs.databricks.com/dev-tools/api/latest/jobs.html.\n\n Returns:\n SolidDefinition: A solid definition.\n\n Example:\n\n .. code-block:: python\n\n from dagster import ModeDefinition, pipeline\n from dagster_databricks import create_databricks_job_solid, databricks_client\n\n sparkpi = create_databricks_job_solid().configured(\n {\n "job": {\n "name": "SparkPi Python job",\n "new_cluster": {\n "spark_version": "7.3.x-scala2.12",\n "node_type_id": "i3.xlarge",\n "num_workers": 2,\n },\n "spark_python_task": {"python_file": "dbfs:/docs/pi.py", "parameters": ["10"]},\n }\n },\n name="sparkspi",\n )\n\n\n @pipeline(\n mode_defs=[\n ModeDefinition(\n resource_defs={\n "databricks_client": databricks_client.configured(\n {"host": "my.workspace.url", "token": "my.access.token"}\n )\n }\n )\n ]\n )\n def my_pipeline():\n sparkpi()\n """\n return core_create_databricks_job(\n dagster_decorator=solid,\n name=name,\n num_inputs=num_inputs,\n description=description,\n required_resource_keys=required_resource_keys,\n )
\n\n\ndef core_create_databricks_job(\n dagster_decorator,\n name="databricks_job",\n num_inputs=1,\n description=None,\n required_resource_keys=frozenset(["databricks_client"]),\n):\n check.str_param(name, "name")\n check.opt_str_param(description, "description")\n check.int_param(num_inputs, "num_inputs")\n check.set_param(required_resource_keys, "required_resource_keys", of_type=str)\n\n input_defs = [InputDefinition("input_" + str(i), Nothing) for i in range(num_inputs)]\n\n @dagster_decorator(\n name=name,\n description=description,\n config_schema={\n "job": Field(\n Permissive(),\n description="Databricks job run configuration, in the form described in "\n "Databricks' job API: https://docs.databricks.com/dev-tools/api/latest/jobs.html",\n ),\n "poll_interval_sec": Field(\n float,\n description="Check whether the job is done at this interval.",\n default_value=_DEFAULT_POLL_INTERVAL,\n ),\n "max_wait_time_sec": Field(\n float,\n description="If the job is not complete after this length of time, raise an error.",\n default_value=_DEFAULT_RUN_MAX_WAIT_TIME_SEC,\n ),\n },\n input_defs=input_defs,\n output_defs=[OutputDefinition(Nothing)],\n required_resource_keys=required_resource_keys,\n tags={"kind": "databricks"},\n )\n def databricks_fn(context):\n job_config = context.op_config["job"]\n databricks_client = context.resources.databricks_client\n run_id = databricks_client.submit_run(**job_config)\n\n context.log.info(\n "Launched databricks job with run id {run_id}. UI: {url}. Waiting to run to completion...".format(\n run_id=run_id, url=create_ui_url(databricks_client, context.op_config)\n )\n )\n wait_for_run_to_complete(\n databricks_client,\n context.log,\n run_id,\n context.op_config["poll_interval_sec"],\n context.op_config["max_wait_time_sec"],\n )\n\n return databricks_fn\n\n\ndef create_ui_url(databricks_client, op_config):\n host = databricks_client.host\n workspace_id = databricks_client.workspace_id or "<workspace_id>"\n if "existing_cluster_id" in op_config["job"]:\n return "https://{host}/?o={workspace_id}#/setting/clusters/{cluster_id}/sparkUi".format(\n host=host,\n workspace_id=workspace_id,\n cluster_id=op_config["job"]["existing_cluster_id"],\n )\n else:\n return "https://{host}/?o={workspace_id}#joblist".format(\n host=host, workspace_id=workspace_id\n )\n
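Beyond the `job` blob shown in the docstring examples above, the generated op's config schema (see `core_create_databricks_job`) also exposes `poll_interval_sec` and `max_wait_time_sec`, so a configured op can tune polling alongside the job definition. A small hedged sketch, with the cluster id as a placeholder:

```python
from dagster_databricks import create_databricks_job_op

# Hedged sketch: run an existing-cluster SparkPi job with faster polling and a
# one-hour wait ceiling. The "job" blob is Permissive, so any Databricks Jobs API
# keys are accepted; "existing_cluster_id" also drives the UI URL logged above.
quick_poll_sparkpi = create_databricks_job_op().configured(
    {
        "job": {
            "name": "SparkPi Python job",
            "existing_cluster_id": "<cluster-id>",  # placeholder
            "spark_python_task": {"python_file": "dbfs:/docs/pi.py", "parameters": ["10"]},
        },
        "poll_interval_sec": 5.0,
        "max_wait_time_sec": 3600.0,
    },
    name="quick_poll_sparkpi",
)
```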
", "current_page_name": "_modules/dagster_databricks/solids", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_databricks.solids"}}, "dagster_datadog": {"resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_datadog.resources

\nfrom datadog import DogStatsd, initialize, statsd\n\nfrom dagster import Field, StringSource, resource\n\n\nclass DataDogResource:\n    # Mirroring levels from the dogstatsd library\n    OK, WARNING, CRITICAL, UNKNOWN = (\n        DogStatsd.OK,\n        DogStatsd.WARNING,\n        DogStatsd.CRITICAL,\n        DogStatsd.UNKNOWN,\n    )\n\n    def __init__(self, api_key, app_key):\n        initialize(api_key=api_key, app_key=app_key)\n\n        # Pull in methods from the dogstatsd library\n        for method in [\n            "event",\n            "gauge",\n            "increment",\n            "decrement",\n            "histogram",\n            "distribution",\n            "set",\n            "service_check",\n            "timed",\n            "timing",\n        ]:\n            setattr(self, method, getattr(statsd, method))\n\n\n
[docs]@resource(\n {\n "api_key": Field(StringSource, description="Datadog API key"),\n "app_key": Field(StringSource, description="Datadog application key"),\n },\n description="This resource is for publishing to DataDog",\n)\ndef datadog_resource(context):\n """This resource is a thin wrapper over the\n `dogstatsd library <https://datadogpy.readthedocs.io/en/latest/>`_.\n\n As such, we directly mirror the public API methods of DogStatsd here; you can refer to the\n `DataDog documentation <https://docs.datadoghq.com/developers/dogstatsd/>`_ for how to use this\n resource.\n\n Examples:\n\n .. code-block:: python\n\n @op(required_resource_keys={'datadog'})\n def datadog_op(context):\n dd = context.resources.datadog\n\n dd.event('Man down!', 'This server needs assistance.')\n dd.gauge('users.online', 1001, tags=["protocol:http"])\n dd.increment('page.views')\n dd.decrement('page.views')\n dd.histogram('album.photo.count', 26, tags=["gender:female"])\n dd.distribution('album.photo.count', 26, tags=["color:blue"])\n dd.set('visitors.uniques', 999, tags=["browser:ie"])\n dd.service_check('svc.check_name', dd.WARNING)\n dd.timing("query.response.time", 1234)\n\n # Use timed decorator\n @dd.timed('run_fn')\n def run_fn():\n pass\n\n run_fn()\n\n @job(resource_defs={'datadog': datadog_resource})\n def dd_job():\n datadog_op()\n\n result = dd_job.execute_in_process(\n run_config={'resources': {'datadog': {'config': {'api_key': 'YOUR_KEY', 'app_key': 'YOUR_KEY'}}}}\n )\n\n """\n return DataDogResource(\n context.resource_config.get("api_key"), context.resource_config.get("app_key")\n )
\n
", "current_page_name": "_modules/dagster_datadog/resources", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_datadog.resources"}}, "dagster_dbt": {"asset_defs": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dbt.asset_defs

\nimport hashlib\nimport json\nimport os\nimport textwrap\nfrom typing import AbstractSet, Any, Callable, Dict, Mapping, Optional, Sequence, Set, Tuple\n\nfrom dagster_dbt.cli.types import DbtCliOutput\nfrom dagster_dbt.cli.utils import execute_cli\nfrom dagster_dbt.types import DbtOutput\nfrom dagster_dbt.utils import generate_events\n\nfrom dagster import (\n    AssetKey,\n    AssetMaterialization,\n    AssetsDefinition,\n    In,\n    MetadataValue,\n    Nothing,\n    Out,\n    Output,\n    SolidExecutionContext,\n    TableColumn,\n    TableSchema,\n)\nfrom dagster import _check as check\nfrom dagster import get_dagster_logger, op\nfrom dagster.core.definitions.metadata import RawMetadataValue\nfrom dagster.core.errors import DagsterInvalidSubsetError\n\n\ndef _load_manifest_for_project(\n    project_dir: str, profiles_dir: str, target_dir: str, select: str\n) -> Tuple[Mapping[str, Any], DbtCliOutput]:\n    # running "dbt ls" regenerates the manifest.json, which includes a superset of the actual\n    # "dbt ls" output\n    cli_output = execute_cli(\n        executable="dbt",\n        command="ls",\n        log=get_dagster_logger(),\n        flags_dict={\n            "project-dir": project_dir,\n            "profiles-dir": profiles_dir,\n            "select": select,\n            "resource-type": "model",\n            "output": "json",\n        },\n        warn_error=False,\n        ignore_handled_error=False,\n        target_path=target_dir,\n    )\n    manifest_path = os.path.join(target_dir, "manifest.json")\n    with open(manifest_path, "r", encoding="utf8") as f:\n        return json.load(f), cli_output\n\n\ndef _select_unique_ids_from_manifest_json(\n    manifest_json: Mapping[str, Any], select: str\n) -> AbstractSet[str]:\n    """Method to apply a selection string to an existing manifest.json file."""\n    try:\n        import dbt.graph.cli as graph_cli\n        import dbt.graph.selector as graph_selector\n        from dbt.contracts.graph.manifest import Manifest\n        from networkx import DiGraph\n    except ImportError:\n        check.failed(\n            "In order to use the `select` argument on load_assets_from_manifest_json, you must have"\n            "`dbt-core >= 1.0.0` and `networkx` installed."\n        )\n\n    class _DictShim(dict):\n        """Shim to enable hydrating a dictionary into a dot-accessible object"""\n\n        def __getattr__(self, item):\n            ret = super().get(item)\n            # allow recursive access e.g. 
foo.bar.baz\n            return _DictShim(ret) if isinstance(ret, dict) else ret\n\n    # generate a dbt-compatible graph from the existing child map\n    graph = graph_selector.Graph(DiGraph(incoming_graph_data=manifest_json["child_map"]))\n    manifest = Manifest(\n        # dbt expects dataclasses that can be accessed with dot notation, not bare dictionaries\n        nodes={unique_id: _DictShim(info) for unique_id, info in manifest_json["nodes"].items()},\n        sources={\n            unique_id: _DictShim(info) for unique_id, info in manifest_json["sources"].items()\n        },\n    )\n\n    # create a parsed selection from the select string\n    parsed_spec = graph_cli.parse_union([select], True)\n\n    # execute this selection against the graph\n    selector = graph_selector.NodeSelector(graph, manifest)\n    selected, _ = selector.select_nodes(parsed_spec)\n    if len(selected) == 0:\n        raise DagsterInvalidSubsetError(f"No dbt models match the selection string '{select}'.")\n    return selected\n\n\ndef _get_node_name(node_info: Mapping[str, Any]):\n    return "__".join([node_info["resource_type"], node_info["package_name"], node_info["name"]])\n\n\ndef _get_node_asset_key(node_info):\n    if node_info.get("schema") is not None:\n        components = [node_info["schema"], node_info["name"]]\n    else:\n        components = [node_info["name"]]\n\n    return AssetKey(components)\n\n\ndef _get_node_description(node_info):\n    code_block = textwrap.indent(node_info["raw_sql"], "    ")\n    description_sections = [\n        node_info["description"],\n        f"#### Raw SQL:\\n```\\n{code_block}\\n```",\n    ]\n    return "\\n\\n".join(filter(None, description_sections))\n\n\ndef _dbt_nodes_to_assets(\n    dbt_nodes: Mapping[str, Any],\n    select: str,\n    selected_unique_ids: AbstractSet[str],\n    runtime_metadata_fn: Optional[\n        Callable[[SolidExecutionContext, Mapping[str, Any]], Mapping[str, RawMetadataValue]]\n    ] = None,\n    io_manager_key: Optional[str] = None,\n    node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey] = _get_node_asset_key,\n    use_build_command: bool = False,\n) -> AssetsDefinition:\n\n    outs: Dict[str, Out] = {}\n    asset_ins: Dict[AssetKey, Tuple[str, In]] = {}\n\n    asset_deps: Dict[AssetKey, Set[AssetKey]] = {}\n\n    out_name_to_node_info: Dict[str, Mapping[str, Any]] = {}\n\n    package_name = None\n    for unique_id in selected_unique_ids:\n        cur_asset_deps = set()\n        node_info = dbt_nodes[unique_id]\n        if node_info["resource_type"] != "model":\n            continue\n        package_name = node_info.get("package_name", package_name)\n\n        for dep_name in node_info["depends_on"]["nodes"]:\n            dep_type = dbt_nodes[dep_name]["resource_type"]\n            # ignore seeds/snapshots/tests\n            if dep_type not in ["source", "model"]:\n                continue\n            dep_asset_key = node_info_to_asset_key(dbt_nodes[dep_name])\n\n            # if it's a source, it will be used as an input to this multi-asset\n            if dep_type == "source":\n                asset_ins[dep_asset_key] = (dep_name.replace(".", "_"), In(Nothing))\n\n            # regardless of type, list this as a dependency for the current asset\n            cur_asset_deps.add(dep_asset_key)\n\n        # generate the Out that corresponds to this model\n        node_name = node_info["name"]\n        outs[node_name] = Out(\n            description=_get_node_description(node_info),\n            
io_manager_key=io_manager_key,\n            metadata=_columns_to_metadata(node_info["columns"]),\n            is_required=False,\n        )\n        out_name_to_node_info[node_name] = node_info\n\n        # set the asset dependencies for this asset\n        asset_deps[node_info_to_asset_key(node_info)] = cur_asset_deps\n\n    # prevent op name collisions between multiple dbt multi-assets\n    op_name = f"run_dbt_{package_name}"\n    if select != "*":\n        op_name += "_" + hashlib.md5(select.encode()).hexdigest()[-5:]\n\n    @op(\n        name=op_name,\n        tags={"kind": "dbt"},\n        ins=dict(asset_ins.values()),\n        out=outs,\n        required_resource_keys={"dbt"},\n    )\n    def dbt_op(context):\n        dbt_output = None\n        try:\n            # in the case that we're running everything, opt for the cleaner selection string\n            if len(context.selected_output_names) == len(outs):\n                subselect = select\n            else:\n                # for each output that we want to emit, translate to a dbt select string by converting\n                # the out to it's corresponding fqn\n                subselect = [\n                    ".".join(out_name_to_node_info[out_name]["fqn"])\n                    for out_name in context.selected_output_names\n                ]\n\n            if use_build_command:\n                dbt_output = context.resources.dbt.build(select=subselect)\n            else:\n                dbt_output = context.resources.dbt.run(select=subselect)\n\n        finally:\n            # in the case that the project only partially runs successfully, still attempt to generate\n            # events for the parts that were successful\n            if dbt_output is None:\n                dbt_output = DbtOutput(result=context.resources.dbt.get_run_results_json())\n\n            # yield an Output for each materialization generated in the run\n            for event in generate_events(\n                dbt_output,\n                node_info_to_asset_key=node_info_to_asset_key,\n                manifest_json=context.resources.dbt.get_manifest_json(),\n            ):\n                # convert AssetMaterializations to outputs\n                if isinstance(event, AssetMaterialization):\n                    output_name = event.asset_key.path[-1]\n                    if runtime_metadata_fn:\n                        yield Output(\n                            value=None,\n                            output_name=output_name,\n                            metadata=runtime_metadata_fn(\n                                context, out_name_to_node_info[output_name]\n                            ),\n                        )\n                    else:\n                        yield Output(\n                            value=None,\n                            output_name=output_name,\n                            metadata_entries=event.metadata_entries,\n                        )\n                # yield AssetObservations normally\n                else:\n                    yield event\n\n    return AssetsDefinition(\n        asset_keys_by_input_name={\n            input_name: asset_key for asset_key, (input_name, _) in asset_ins.items()\n        },\n        asset_keys_by_output_name={\n            output_name: node_info_to_asset_key(out_name_to_node_info[output_name])\n            for output_name in outs.keys()\n        },\n        node_def=dbt_op,\n        can_subset=True,\n        asset_deps=asset_deps,\n    )\n\n\ndef _columns_to_metadata(columns: Mapping[str, 
Any]) -> Optional[Mapping[str, Any]]:\n    return (\n        {\n            "schema": MetadataValue.table_schema(\n                TableSchema(\n                    columns=[\n                        TableColumn(\n                            name=name,\n                            type=metadata.get("data_type") or "?",\n                            description=metadata.get("description"),\n                        )\n                        for name, metadata in columns.items()\n                    ]\n                )\n            )\n        }\n        if len(columns) > 0\n        else None\n    )\n\n\n
[docs]def load_assets_from_dbt_project(\n project_dir: str,\n profiles_dir: Optional[str] = None,\n target_dir: Optional[str] = None,\n select: Optional[str] = None,\n runtime_metadata_fn: Optional[\n Callable[[SolidExecutionContext, Mapping[str, Any]], Mapping[str, Any]]\n ] = None,\n io_manager_key: Optional[str] = None,\n node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey] = _get_node_asset_key,\n use_build_command: bool = False,\n) -> Sequence[AssetsDefinition]:\n """\n Loads a set of DBT models from a DBT project into Dagster assets.\n\n Creates one Dagster asset for each dbt model. All assets will be re-materialized using a single\n `dbt run` command.\n\n Args:\n project_dir (Optional[str]): The directory containing the DBT project to load.\n profiles_dir (Optional[str]): The profiles directory to use for loading the DBT project.\n Defaults to a directory called "config" inside the project_dir.\n target_dir (Optional[str]): The target directory where DBT will place compiled artifacts.\n Defaults to "target" underneath the project_dir.\n select (str): A DBT selection string for the models in a project that you want to include.\n Defaults to "*".\n runtime_metadata_fn: (Optional[Callable[[SolidExecutionContext, Mapping[str, Any]], Mapping[str, Any]]]):\n A function that will be run after any of the assets are materialized and returns\n metadata entries for the asset, to be displayed in the asset catalog for that run.\n io_manager_key (Optional[str]): The IO manager key that will be set on each of the returned\n assets. When other ops are downstream of the loaded assets, the IOManager specified\n here determines how the inputs to those ops are loaded. Defaults to "io_manager".\n node_info_to_asset_key: (Mapping[str, Any] -> AssetKey): A function that takes a dictionary\n of dbt node info and returns the AssetKey that you want to represent that node. By\n default, the asset key will simply be the name of the dbt model.\n use_build_command: (bool): Flag indicating if you want to use `dbt build` as the core computation\n for this asset, rather than `dbt run`.\n\n """\n check.str_param(project_dir, "project_dir")\n profiles_dir = check.opt_str_param(\n profiles_dir, "profiles_dir", os.path.join(project_dir, "config")\n )\n target_dir = check.opt_str_param(target_dir, "target_dir", os.path.join(project_dir, "target"))\n\n manifest_json, cli_output = _load_manifest_for_project(\n project_dir, profiles_dir, target_dir, select or "*"\n )\n selected_unique_ids: Set[str] = set(\n filter(None, (line.get("unique_id") for line in cli_output.logs))\n )\n\n dbt_nodes = {**manifest_json["nodes"], **manifest_json["sources"]}\n return [\n _dbt_nodes_to_assets(\n dbt_nodes,\n select=select or "*",\n selected_unique_ids=selected_unique_ids,\n runtime_metadata_fn=runtime_metadata_fn,\n io_manager_key=io_manager_key,\n node_info_to_asset_key=node_info_to_asset_key,\n use_build_command=use_build_command,\n ),\n ]
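A minimal sketch of calling this loader together with a ``dbt`` resource (the project paths, selection string, and configuration values are hypothetical):

.. code-block:: python

    from dagster_dbt import dbt_cli_resource, load_assets_from_dbt_project

    # Hypothetical layout: profiles_dir defaults to "<project_dir>/config" and
    # target_dir defaults to "<project_dir>/target" if not supplied.
    dbt_assets = load_assets_from_dbt_project(
        project_dir="path/to/dbt_project",
        select="my_model+",  # optional dbt selection string; defaults to "*"
    )

    # The generated op declares required_resource_keys={"dbt"}, so the assets must be
    # supplied with a dbt resource, e.g. dbt_cli_resource configured for the same project.
    dbt = dbt_cli_resource.configured(
        {"project_dir": "path/to/dbt_project", "profiles_dir": "path/to/dbt_project/config"}
    )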
\n\n\n
[docs]def load_assets_from_dbt_manifest(\n manifest_json: Mapping[str, Any],\n runtime_metadata_fn: Optional[\n Callable[[SolidExecutionContext, Mapping[str, Any]], Mapping[str, Any]]\n ] = None,\n io_manager_key: Optional[str] = None,\n selected_unique_ids: Optional[AbstractSet[str]] = None,\n select: Optional[str] = None,\n node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey] = _get_node_asset_key,\n use_build_command: bool = False,\n) -> Sequence[AssetsDefinition]:\n """\n Loads a set of dbt models, described in a manifest.json, into Dagster assets.\n\n Creates one Dagster asset for each dbt model. All assets will be re-materialized using a single\n `dbt run` command.\n\n Args:\n manifest_json (Optional[Mapping[str, Any]]): The contents of a DBT manifest.json, which contains\n a set of models to load into assets.\n runtime_metadata_fn: (Optional[Callable[[SolidExecutionContext, Mapping[str, Any]], Mapping[str, Any]]]):\n A function that will be run after any of the assets are materialized and returns\n metadata entries for the asset, to be displayed in the asset catalog for that run.\n io_manager_key (Optional[str]): The IO manager key that will be set on each of the returned\n assets. When other ops are downstream of the loaded assets, the IOManager specified\n here determines how the inputs to those ops are loaded. Defaults to "io_manager".\n selected_unique_ids (Optional[Set[str]]): The set of dbt unique_ids that you want to load\n as assets.\n node_info_to_asset_key: (Mapping[str, Any] -> AssetKey): A function that takes a dictionary\n of dbt node info and returns the AssetKey that you want to represent that node. By\n default, the asset key will simply be the name of the dbt model.\n use_build_command: (bool): Flag indicating if you want to use `dbt build` as the core computation\n for this asset, rather than `dbt run`.\n """\n check.dict_param(manifest_json, "manifest_json", key_type=str)\n dbt_nodes = {**manifest_json["nodes"], **manifest_json["sources"]}\n\n if select is None:\n if selected_unique_ids:\n # generate selection string from unique ids\n select = " ".join(".".join(dbt_nodes[uid]["fqn"]) for uid in selected_unique_ids)\n else:\n # if no selection specified, default to "*"\n select = "*"\n selected_unique_ids = manifest_json["nodes"].keys()\n\n if selected_unique_ids is None:\n # must resolve the selection string using the existing manifest.json data (hacky)\n selected_unique_ids = _select_unique_ids_from_manifest_json(manifest_json, select)\n\n return [\n _dbt_nodes_to_assets(\n dbt_nodes,\n runtime_metadata_fn=runtime_metadata_fn,\n io_manager_key=io_manager_key,\n select=select,\n selected_unique_ids=selected_unique_ids,\n node_info_to_asset_key=node_info_to_asset_key,\n use_build_command=use_build_command,\n )\n ]
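A minimal sketch of the manifest-based variant (the manifest path and selection string are hypothetical; as noted above, resolving ``select`` against an existing manifest requires ``dbt-core >= 1.0.0`` and ``networkx``):

.. code-block:: python

    import json

    from dagster_dbt import load_assets_from_dbt_manifest

    # Hypothetical path to a manifest produced by an earlier dbt invocation.
    with open("path/to/dbt_project/target/manifest.json", "r", encoding="utf8") as f:
        manifest_json = json.load(f)

    dbt_assets = load_assets_from_dbt_manifest(
        manifest_json,
        select="tag:nightly",  # resolved against the manifest's nodes and child_map
    )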
\n
", "current_page_name": "_modules/dagster_dbt/asset_defs", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dbt.asset_defs"}, "cli": {"resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dbt.cli.resources

\nfrom typing import Any, Dict, List, Optional, Set\n\nimport dagster._check as check\nfrom dagster import Permissive, resource\nfrom dagster.utils.merger import merge_dicts\n\nfrom ..dbt_resource import DbtResource\nfrom .constants import CLI_COMMON_FLAGS_CONFIG_SCHEMA, CLI_COMMON_OPTIONS_CONFIG_SCHEMA\nfrom .types import DbtCliOutput\nfrom .utils import execute_cli, parse_manifest, parse_run_results\n\n\n
[docs]class DbtCliResource(DbtResource):\n """\n A resource that allows you to execute dbt cli commands. For the most up-to-date documentation on\n the specific parameters available to you for each command, check out the dbt docs:\n\n https://docs.getdbt.com/reference/commands/run\n\n To use this as a dagster resource, we recommend using\n :func:`dbt_cli_resource <dagster_dbt.dbt_cli_resource>`.\n """\n\n def __init__(\n self,\n executable: str,\n default_flags: Dict[str, Any],\n warn_error: bool,\n ignore_handled_error: bool,\n target_path: str,\n logger: Optional[Any] = None,\n docs_url: Optional[str] = None,\n ):\n self._default_flags = default_flags\n self._executable = executable\n self._warn_error = warn_error\n self._ignore_handled_error = ignore_handled_error\n self._target_path = target_path\n self._docs_url = docs_url\n super().__init__(logger)\n\n @property\n def default_flags(self) -> Dict[str, Any]:\n """\n A set of params populated from resource config that are passed as flags to each dbt CLI command.\n """\n return self._format_params(self._default_flags, replace_underscores=True)\n\n @property\n def strict_flags(self) -> Set[str]:\n """\n A set of flags that should not be auto-populated from the default flags unless they are\n arguments to the associated function.\n """\n return {"models", "exclude", "select"}\n\n
[docs] def cli(self, command: str, **kwargs) -> DbtCliOutput:\n """\n Executes a dbt CLI command. Params passed in as keyword arguments will be merged with the\n default flags that were configured on resource initialization (if any) overriding the\n default values if necessary.\n\n Args:\n command (str): The command you wish to run (e.g. 'run', 'test', 'docs generate', etc.)\n\n Returns:\n DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n parsed log output as well as the contents of run_results.json (if applicable).\n """\n command = check.str_param(command, "command")\n extra_flags = {} if kwargs is None else kwargs\n\n # remove default flags that are declared as "strict" and not explicitly passed in\n default_flags = {\n k: v\n for k, v in self.default_flags.items()\n if not (k in self.strict_flags and k not in extra_flags)\n }\n\n flags = merge_dicts(\n default_flags, self._format_params(extra_flags, replace_underscores=True)\n )\n\n return execute_cli(\n executable=self._executable,\n command=command,\n flags_dict=flags,\n log=self.logger,\n warn_error=self._warn_error,\n ignore_handled_error=self._ignore_handled_error,\n target_path=self._target_path,\n docs_url=self._docs_url,\n )
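A sketch of how the flag merging above behaves at a call site, assuming an op or solid that declares ``required_resource_keys={"dbt"}`` and pulls the resource off the context:

.. code-block:: python

    dbt = context.resources.dbt

    # "models" is in strict_flags, so a "models" default from the resource config is
    # dropped unless it is passed explicitly, as here; non-strict defaults such as
    # project-dir are still merged into the command.
    out = dbt.cli("run", models=["my_model+"])
    context.log.info(f"dbt exited with return code {out.return_code}")

    # Multi-word commands are passed as a single string.
    dbt.cli("docs generate")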
\n\n
[docs] def compile(\n self, models: Optional[List[str]] = None, exclude: Optional[List[str]] = None, **kwargs\n ) -> DbtCliOutput:\n """\n Run the ``compile`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n models (List[str], optional): the models to include in compilation.\n exclude (List[str], optional): the models to exclude from compilation.\n\n Returns:\n DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n parsed log output as well as the contents of run_results.json (if applicable).\n """\n return self.cli("compile", models=models, exclude=exclude, **kwargs)
\n\n
[docs] def run(\n self, models: Optional[List[str]] = None, exclude: Optional[List[str]] = None, **kwargs\n ) -> DbtCliOutput:\n """\n Run the ``run`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n models (List[str], optional): the models to include in the run.\n exclude (List[str], optional): the models to exclude from the run.\n\n Returns:\n DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n parsed log output as well as the contents of run_results.json (if applicable).\n """\n return self.cli("run", models=models, exclude=exclude, **kwargs)
\n\n
[docs] def snapshot(\n self, select: Optional[List[str]] = None, exclude: Optional[List[str]] = None, **kwargs\n ) -> DbtCliOutput:\n """\n Run the ``snapshot`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n select (List[str], optional): the snapshots to include in the run.\n exclude (List[str], optional): the snapshots to exclude from the run.\n\n Returns:\n DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n parsed log output as well as the contents of run_results.json (if applicable).\n """\n return self.cli("snapshot", select=select, exclude=exclude, **kwargs)
\n\n
[docs] def test(\n self,\n models: Optional[List[str]] = None,\n exclude: Optional[List[str]] = None,\n data: bool = True,\n schema: bool = True,\n **kwargs,\n ) -> DbtCliOutput:\n """\n Run the ``test`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n models (List[str], optional): the models to include in testing.\n exclude (List[str], optional): the models to exclude from testing.\n data (bool, optional): If ``True`` (default), then run data tests.\n schema (bool, optional): If ``True`` (default), then run schema tests.\n\n Returns:\n DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n parsed log output as well as the contents of run_results.json (if applicable).\n """\n if data and schema:\n # do not include these arguments if both are True, as these are deprecated in later\n # versions of dbt, and for older versions the functionality is the same regardless of\n # if both are set or neither are set.\n return self.cli("test", models=models, exclude=exclude, **kwargs)\n return self.cli("test", models=models, exclude=exclude, data=data, schema=schema, **kwargs)
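As the implementation above notes, passing both ``data=True`` and ``schema=True`` (the defaults) omits the corresponding flags entirely. A sketch of restricting the run to one kind of test (``dbt`` is assumed to be a ``DbtCliResource`` handle, e.g. ``context.resources.dbt``; the model selection is illustrative):

.. code-block:: python

    # Only data tests: the data/schema flags are sent because they are no longer both True.
    dbt.test(models=["my_model"], schema=False)

    # Only schema tests.
    dbt.test(models=["my_model"], data=False)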
\n\n
[docs] def seed(\n self,\n show: bool = False,\n select: Optional[List[str]] = None,\n exclude: Optional[List[str]] = None,\n **kwargs,\n ) -> DbtCliOutput:\n """\n Run the ``seed`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n show (bool, optional): If ``True``, then show a sample of the seeded data in the\n response. Defaults to ``False``.\n select (List[str], optional): the seeds to include in the run.\n exclude (List[str], optional): the seeds to exclude from the run.\n\n Returns:\n DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n parsed log output as well as the contents of run_results.json (if applicable).\n """\n return self.cli("seed", show=show, select=select, exclude=exclude, **kwargs)
\n\n
[docs] def ls(\n self,\n select: Optional[List[str]] = None,\n models: Optional[List[str]] = None,\n exclude: Optional[List[str]] = None,\n **kwargs,\n ) -> DbtCliOutput:\n """\n Run the ``ls`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n select (List[str], optional): the resources to include in the output.\n models (List[str], optional): the models to include in the output.\n exclude (List[str], optional): the resources to exclude from the output.\n\n Returns:\n DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n parsed log output as well as the contents of run_results.json (if applicable).\n """\n return self.cli("ls", select=select, models=models, exclude=exclude, **kwargs)
\n\n
[docs] def build(self, select: Optional[List[str]] = None, **kwargs) -> DbtCliOutput:\n """\n Run the ``build`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n select (List[str], optional): the models/resources to include in the run.\n\n Returns:\n DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n parsed log output as well as the contents of run_results.json (if applicable).\n """\n return self.cli("build", select=select, **kwargs)
\n\n
[docs] def freshness(self, select: Optional[List[str]] = None, **kwargs) -> DbtCliOutput:\n """\n Run the ``source snapshot-freshness`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n select (List[str], optional): the sources to include in the run.\n\n Returns:\n DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n parsed log output as well as the contents of run_results.json (if applicable).\n """\n return self.cli("source snapshot-freshness", select=select, **kwargs)
\n\n
[docs] def generate_docs(self, compile_project: bool = False, **kwargs) -> DbtCliOutput:\n """\n Run the ``docs generate`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n compile_project (bool, optional): If true, compile the project before generating a catalog.\n\n Returns:\n DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n parsed log output as well as the contents of run_results.json (if applicable).\n """\n return self.cli("docs generate", compile=compile_project, **kwargs)
\n\n
[docs] def run_operation(\n self, macro: str, args: Optional[Dict[str, Any]] = None, **kwargs\n ) -> DbtCliOutput:\n """\n Run the ``run-operation`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n macro (str): the dbt macro to invoke.\n args (Dict[str, Any], optional): the keyword arguments to be supplied to the macro.\n\n Returns:\n DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n parsed log output as well as the contents of run_results.json (if applicable).\n """\n\n return self.cli(f"run-operation {macro}", args=args, **kwargs)
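A sketch of invoking a macro through this wrapper (the macro name and arguments are hypothetical, and ``dbt`` is assumed to be a ``DbtCliResource`` handle):

.. code-block:: python

    # Roughly equivalent to `dbt run-operation grant_select --args '{"role": "reporter"}'`
    # for a hypothetical `grant_select` macro defined in the project.
    dbt.run_operation("grant_select", args={"role": "reporter"})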
\n\n
[docs] def get_run_results_json(self, **kwargs) -> Optional[Dict[str, Any]]:\n """\n Get a parsed version of the run_results.json file for the relevant dbt project.\n\n Returns:\n Dict[str, Any]: dictionary containing the parsed contents of the run_results json file\n for this dbt project.\n """\n project_dir = kwargs.get("project_dir", self.default_flags["project-dir"])\n target_path = kwargs.get("target_path", self._target_path)\n return parse_run_results(project_dir, target_path)
\n\n
[docs] def get_manifest_json(self, **kwargs) -> Optional[Dict[str, Any]]:\n """\n Get a parsed version of the manifest.json file for the relevant dbt project.\n\n Returns:\n Dict[str, Any]: dictionary containing the parsed contents of the manifest json file\n for this dbt project.\n """\n project_dir = kwargs.get("project_dir", self.default_flags["project-dir"])\n target_path = kwargs.get("target_path", self._target_path)\n return parse_manifest(project_dir, target_path)
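A sketch of consuming the parsed manifest, e.g. to list the models dbt knows about (``dbt`` is assumed to be a ``DbtCliResource`` handle; the key layout follows dbt's standard ``manifest.json`` schema):

.. code-block:: python

    manifest = dbt.get_manifest_json()
    if manifest is not None:
        # Model unique_ids look like "model.<package_name>.<model_name>".
        model_ids = [uid for uid in manifest["nodes"] if uid.startswith("model.")]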
\n\n\n
[docs]@resource(\n config_schema=Permissive(\n {\n k.replace("-", "_"): v\n for k, v in dict(\n **CLI_COMMON_FLAGS_CONFIG_SCHEMA, **CLI_COMMON_OPTIONS_CONFIG_SCHEMA\n ).items()\n }\n ),\n description="A resource that can run dbt CLI commands.",\n)\ndef dbt_cli_resource(context) -> DbtCliResource:\n """This resource defines a dbt CLI interface.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/concepts/configuration/configured>`_ method.\n\n Examples:\n\n .. code-block:: python\n\n custom_dbt_cli_resource = dbt_cli_resource.configured({"project-dir": "path/to/my/dbt_project"})\n\n @pipeline(mode_defs=[ModeDefinition(resource_defs={"dbt": custom_dbt_cli_resource})])\n def dbt_cli_pipeline():\n # Run solids with `required_resource_keys={"dbt", ...}`.\n\n You may configure this resource as follows:\n\n .. code-block:: YAML\n\n resources:\n dbt_cli_resource:\n config:\n project_dir: "."\n # Optional[str]: Which directory to look in for the dbt_project.yml file. Default is\n # the current working directory and its parents.\n profiles_dir: $DBT_PROFILES_DIR or $HOME/.dbt\n # Optional[str]: Which directory to look in for the profiles.yml file.\n profile: ""\n # Optional[str]: Which profile to load. Overrides setting in dbt_project.yml.\n target: ""\n # Optional[str]: Which target to load for the given profile.\n vars: {}\n # Optional[Permissive]: Supply variables to the project. This argument overrides\n # variables defined in your dbt_project.yml file. This argument should be a\n # dictionary, eg. "{'my_variable': 'my_value'}"\n bypass_cache: False\n # Optional[bool]: If set, bypass the adapter-level cache of database state.\n\n\n """\n # set of options in the config schema that are not flags\n non_flag_options = {k.replace("-", "_") for k in CLI_COMMON_OPTIONS_CONFIG_SCHEMA}\n # all config options that are intended to be used as flags for dbt commands\n default_flags = {k: v for k, v in context.resource_config.items() if k not in non_flag_options}\n return DbtCliResource(\n executable=context.resource_config["dbt_executable"],\n default_flags=default_flags,\n warn_error=context.resource_config["warn_error"],\n ignore_handled_error=context.resource_config["ignore_handled_error"],\n target_path=context.resource_config["target_path"],\n logger=context.log,\n docs_url=context.resource_config.get("docs_url"),\n )
\n
", "current_page_name": "_modules/dagster_dbt/cli/resources", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dbt.cli.resources"}, "solids": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dbt.cli.solids

\nfrom dagster import (\n    Array,\n    Bool,\n    InputDefinition,\n    Noneable,\n    Nothing,\n    Output,\n    OutputDefinition,\n    Permissive,\n    StringSource,\n    solid,\n)\nfrom dagster.config.field import Field\nfrom dagster.utils.backcompat import experimental\n\nfrom ..utils import generate_materializations\nfrom .constants import (\n    CLI_COMMON_FLAGS_CONFIG_SCHEMA,\n    CLI_COMMON_OPTIONS_CONFIG_SCHEMA,\n    DEFAULT_DBT_TARGET_PATH,\n)\nfrom .types import DbtCliOutput\nfrom .utils import execute_cli\n\nCLI_CONFIG_SCHEMA = {**CLI_COMMON_FLAGS_CONFIG_SCHEMA, **CLI_COMMON_OPTIONS_CONFIG_SCHEMA}\nCLI_COMMON_FLAGS = set(CLI_COMMON_FLAGS_CONFIG_SCHEMA.keys())\n\n\ndef passthrough_flags_only(solid_config, additional_flags):\n    return {\n        flag: solid_config[flag]\n        for flag in (CLI_COMMON_FLAGS | set(additional_flags))\n        if solid_config.get(flag) is not None\n    }\n\n\n
[docs]@solid(\n description="A solid to invoke dbt run via CLI.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="dbt_cli_output", dagster_type=DbtCliOutput)],\n config_schema={\n **CLI_CONFIG_SCHEMA,\n "threads": Field(\n config=Noneable(int),\n default_value=None,\n is_required=False,\n description=(\n "Specify number of threads to use while executing models. Overrides settings "\n "in profiles.yml."\n ),\n ),\n "models": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to run.",\n ),\n "exclude": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n "full-refresh": Field(\n config=bool,\n description=(\n "If specified, DBT will drop incremental models and fully-recalculate the "\n "incremental table from the model definition. (--full-refresh)"\n ),\n is_required=False,\n default_value=False,\n ),\n "fail-fast": Field(\n config=bool,\n description="Stop execution upon a first failure. (--fail-fast)",\n is_required=False,\n default_value=False,\n ),\n "yield_materializations": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation will "\n "be yielded when the solid executes. Default: True"\n ),\n ),\n "asset_key_prefix": Field(\n config=Array(str),\n is_required=False,\n default_value=[],\n description=(\n "If provided and yield_materializations is True, these components will be used to "\n "prefix the generated asset keys."\n ),\n ),\n },\n tags={"kind": "dbt"},\n)\n@experimental\ndef dbt_cli_run(context):\n """This solid executes ``dbt run`` via the dbt CLI. See the solid definition for available\n parameters.\n """\n\n cli_output = execute_cli(\n context.solid_config["dbt_executable"],\n command="run",\n flags_dict=passthrough_flags_only(\n context.solid_config, ("threads", "models", "exclude", "full-refresh", "fail-fast")\n ),\n log=context.log,\n warn_error=context.solid_config["warn-error"],\n ignore_handled_error=context.solid_config["ignore_handled_error"],\n target_path=context.solid_config["target-path"],\n )\n\n if context.solid_config["yield_materializations"]:\n for materialization in generate_materializations(\n cli_output,\n asset_key_prefix=context.solid_config["asset_key_prefix"],\n ):\n yield materialization\n\n yield Output(cli_output, "dbt_cli_output")
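For reference, a minimal sketch of wiring this solid into a pipeline via ``configured`` (the project path and model selection are hypothetical; flag names mirror the dbt CLI, e.g. ``project-dir``):

.. code-block:: python

    from dagster import pipeline
    from dagster_dbt import dbt_cli_run

    run_my_models = dbt_cli_run.configured(
        {"project-dir": "path/to/dbt_project", "models": ["my_model+"]},
        name="run_my_models",
    )

    @pipeline
    def my_dbt_pipeline():
        run_my_models()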
\n\n\n
[docs]@solid(\n description="A solid to invoke dbt test via CLI.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="dbt_cli_output", dagster_type=DbtCliOutput)],\n config_schema={\n **CLI_CONFIG_SCHEMA,\n "data": Field(\n config=bool,\n description='Run data tests defined in "tests" directory.',\n is_required=False,\n default_value=False,\n ),\n "schema": Field(\n config=bool,\n description="Run constraint validations from schema.yml files.",\n is_required=False,\n default_value=False,\n ),\n "fail-fast": Field(\n config=bool,\n description="Stop execution upon a first test failure.",\n is_required=False,\n default_value=False,\n ),\n "threads": Field(\n config=Noneable(int),\n default_value=None,\n is_required=False,\n description=(\n "Specify number of threads to use while executing models. Overrides settings "\n "in profiles.yml."\n ),\n ),\n "models": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to run.",\n ),\n "exclude": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n "target-path": Field(\n config=StringSource,\n is_required=False,\n default_value=DEFAULT_DBT_TARGET_PATH,\n description=(\n "The directory path for target if different from the default `target-path` in "\n "your dbt project configuration file."\n ),\n ),\n },\n tags={"kind": "dbt"},\n)\n@experimental\ndef dbt_cli_test(context):\n """This solid executes ``dbt test`` via the dbt CLI. See the solid definition for available\n parameters.\n """\n cli_output = execute_cli(\n context.solid_config["dbt_executable"],\n command="test",\n flags_dict=passthrough_flags_only(\n context.solid_config, ("data", "schema", "fail-fast", "threads", "models", "exclude")\n ),\n log=context.log,\n warn_error=context.solid_config["warn-error"],\n ignore_handled_error=context.solid_config["ignore_handled_error"],\n target_path=context.solid_config["target-path"],\n )\n\n yield Output(cli_output, "dbt_cli_output")
\n\n\n
[docs]@solid(\n description="A solid to invoke dbt snapshot via CLI.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="dbt_cli_output", dagster_type=DbtCliOutput)],\n config_schema={\n **CLI_CONFIG_SCHEMA,\n "threads": Field(\n config=Noneable(int),\n default_value=None,\n is_required=False,\n description=(\n "Specify number of threads to use while executing models. Overrides settings in "\n "profiles.yml."\n ),\n ),\n "select": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to include.",\n ),\n "exclude": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n },\n tags={"kind": "dbt"},\n)\n@experimental\ndef dbt_cli_snapshot(context):\n """This solid executes ``dbt snapshot`` via the dbt CLI."""\n cli_output = execute_cli(\n context.solid_config["dbt_executable"],\n command="snapshot",\n flags_dict=passthrough_flags_only(context.solid_config, ("threads", "select", "exclude")),\n log=context.log,\n warn_error=context.solid_config["warn-error"],\n ignore_handled_error=context.solid_config["ignore_handled_error"],\n target_path=context.solid_config["target-path"],\n )\n\n yield Output(cli_output, "dbt_cli_output")
\n\n\n
[docs]@solid(\n description="A solid to invoke dbt run-operation via CLI.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="dbt_cli_output", dagster_type=DbtCliOutput)],\n config_schema={\n **CLI_CONFIG_SCHEMA,\n "macro": Field(\n config=StringSource,\n description=(\n "Specify the macro to invoke. dbt will call this macro with the supplied "\n "arguments and then exit."\n ),\n ),\n "args": Field(\n config=Permissive({}),\n is_required=False,\n description=(\n "Supply arguments to the macro. This dictionary will be mapped to the keyword "\n "arguments defined in the selected macro. This argument should be a dictionary, "\n "eg. {'my_variable': 'my_value'}"\n ),\n ),\n },\n tags={"kind": "dbt"},\n)\n@experimental\ndef dbt_cli_run_operation(context):\n """This solid executes ``dbt run-operation`` via the dbt CLI."""\n cli_output = execute_cli(\n context.solid_config["dbt_executable"],\n command=f"run-operation {context.solid_config['macro']}",\n flags_dict=passthrough_flags_only(context.solid_config, ("args",)),\n log=context.log,\n warn_error=context.solid_config["warn-error"],\n ignore_handled_error=context.solid_config["ignore_handled_error"],\n target_path=context.solid_config["target-path"],\n )\n\n yield Output(cli_output, "dbt_cli_output")
\n\n\n
[docs]@solid(\n description="A solid to invoke dbt source snapshot-freshness via CLI.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="dbt_cli_output", dagster_type=DbtCliOutput)],\n config_schema={\n **CLI_CONFIG_SCHEMA,\n "select": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="Specify the sources to snapshot freshness.",\n ),\n "output": Field(\n config=StringSource,\n is_required=False,\n description=(\n "Specify the output path for the json report. By default, outputs to "\n "target/sources.json"\n ),\n ),\n "threads": Field(\n config=Noneable(int),\n default_value=None,\n is_required=False,\n description=(\n "Specify number of threads to use while executing models. Overrides "\n "settings in profiles.yml."\n ),\n ),\n },\n tags={"kind": "dbt"},\n)\n@experimental\ndef dbt_cli_snapshot_freshness(context):\n """This solid executes ``dbt source snapshot-freshness`` via the dbt CLI."""\n cli_output = execute_cli(\n context.solid_config["dbt_executable"],\n command="source snapshot-freshness",\n flags_dict=passthrough_flags_only(context.solid_config, ("select", "output", "threads")),\n log=context.log,\n warn_error=context.solid_config["warn-error"],\n ignore_handled_error=context.solid_config["ignore_handled_error"],\n target_path=context.solid_config["target-path"],\n )\n\n yield Output(cli_output, "dbt_cli_output")
\n\n\n
[docs]@solid(\n description="A solid to invoke dbt compile via CLI.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="dbt_cli_output", dagster_type=DbtCliOutput)],\n config_schema={\n **CLI_CONFIG_SCHEMA,\n "parse-only": Field(\n config=bool,\n is_required=False,\n default_value=False,\n ),\n "threads": Field(\n config=Noneable(int),\n default_value=None,\n is_required=False,\n description=(\n "Specify number of threads to use while executing models. Overrides settings "\n "in profiles.yml."\n ),\n ),\n "no-version-check": Field(\n config=bool,\n description=(\n "Skip the check that dbt's version matches the one specified in the "\n "dbt_project.yml file ('require-dbt-version')"\n ),\n is_required=False,\n default_value=False,\n ),\n "models": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to run.",\n ),\n "exclude": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n "selector": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The selector name to use, as defined in your selectors.yml",\n ),\n "state": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description=(\n "If set, use the given directory as the source for json files to compare with "\n "this project."\n ),\n ),\n "full-refresh": Field(\n config=bool,\n description=(\n "If specified, DBT will drop incremental models and fully-recalculate "\n "the incremental table from the model definition. (--full-refresh)"\n ),\n is_required=False,\n default_value=False,\n ),\n },\n tags={"kind": "dbt"},\n)\n@experimental\ndef dbt_cli_compile(context):\n """This solid executes ``dbt compile`` via the dbt CLI."""\n cli_output = execute_cli(\n context.solid_config["dbt_executable"],\n command="compile",\n flags_dict=passthrough_flags_only(\n context.solid_config,\n (\n "parse-only",\n "threads",\n "no-version-check",\n "models",\n "exclude",\n "selector",\n "state",\n "full-refresh",\n ),\n ),\n log=context.log,\n warn_error=context.solid_config["warn-error"],\n ignore_handled_error=context.solid_config["ignore_handled_error"],\n target_path=context.solid_config["target-path"],\n )\n\n yield Output(cli_output, "dbt_cli_output")
\n\n\n@solid(\n description="A solid to invoke dbt docs generate via CLI.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="dbt_cli_output", dagster_type=DbtCliOutput)],\n config_schema={\n **CLI_CONFIG_SCHEMA,\n "threads": Field(\n config=Noneable(int),\n default_value=None,\n is_required=False,\n description=(\n "Specify number of threads to use while executing models. Overrides settings "\n "in profiles.yml."\n ),\n ),\n "no-version-check": Field(\n config=bool,\n description=(\n "Skip the check that dbt's version matches the one specified in the "\n "dbt_project.yml file ('require-dbt-version')"\n ),\n is_required=False,\n default_value=False,\n ),\n "models": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to run.",\n ),\n "exclude": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n "selector": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The selector name to use, as defined in your selectors.yml",\n ),\n "state": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description=(\n "If set, use the given directory as the source for json files to compare with "\n "this project."\n ),\n ),\n },\n tags={"kind": "dbt"},\n)\n@experimental\ndef dbt_cli_docs_generate(context):\n """This solid executes ``dbt docs generate`` via the dbt CLI."""\n cli_output = execute_cli(\n context.solid_config["dbt_executable"],\n command="docs generate",\n flags_dict=passthrough_flags_only(\n context.solid_config,\n (\n "threads",\n "no-version-check",\n "models",\n "exclude",\n "selector",\n "state",\n ),\n ),\n log=context.log,\n warn_error=context.solid_config["warn-error"],\n ignore_handled_error=context.solid_config["ignore_handled_error"],\n target_path=context.solid_config["target-path"],\n )\n\n yield Output(cli_output, "dbt_cli_output")\n\n\n@solid(\n description="A solid to invoke dbt seed via CLI.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="dbt_cli_output", dagster_type=DbtCliOutput)],\n config_schema={\n **CLI_CONFIG_SCHEMA,\n "full-refresh": Field(\n config=bool,\n default_value=False,\n is_required=False,\n description=("Drop existing seed tables and recreate them."),\n ),\n "show": Field(\n config=bool,\n default_value=False,\n is_required=False,\n description=("Show a sample of the loaded data in the terminal."),\n ),\n "threads": Field(\n config=Noneable(int),\n default_value=None,\n is_required=False,\n description=(\n "Specify number of threads to use while executing models. 
Overrides settings "\n "in profiles.yml."\n ),\n ),\n "no-version-check": Field(\n config=bool,\n description=(\n "Skip the check that dbt's version matches the one specified in the "\n "dbt_project.yml file ('require-dbt-version')"\n ),\n is_required=False,\n default_value=False,\n ),\n "select": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="Specify the nodes to include.",\n ),\n "exclude": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n "selector": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description="The selector name to use, as defined in your selectors.yml",\n ),\n "state": Field(\n config=Noneable([str]),\n default_value=None,\n is_required=False,\n description=(\n "If set, use the given directory as the source for json files to compare with "\n "this project."\n ),\n ),\n },\n tags={"kind": "dbt"},\n)\n@experimental\ndef dbt_cli_seed(context):\n """This solid executes ``dbt seed`` via the dbt CLI."""\n cli_output = execute_cli(\n context.solid_config["dbt_executable"],\n command="seed",\n flags_dict=passthrough_flags_only(\n context.solid_config,\n (\n "full-refresh",\n "show",\n "threads",\n "no-version-check",\n "select",\n "exclude",\n "selector",\n "state",\n ),\n ),\n log=context.log,\n warn_error=context.solid_config["warn-error"],\n ignore_handled_error=context.solid_config["ignore_handled_error"],\n target_path=context.solid_config["target-path"],\n )\n\n yield Output(cli_output, "dbt_cli_output")\n
", "current_page_name": "_modules/dagster_dbt/cli/solids", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dbt.cli.solids"}, "types": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dbt.cli.types

\nfrom typing import Any, Dict, List, Optional\n\nimport dagster._check as check\n\nfrom ..types import DbtOutput\n\n\n
[docs]class DbtCliOutput(DbtOutput):\n """The results of executing a dbt command, along with additional metadata about the dbt CLI\n process that was run.\n\n Note that users should not construct instances of this class directly. This class is intended\n to be constructed from the JSON output of dbt commands.\n\n Attributes:\n command (str): The full shell command that was executed.\n return_code (int): The return code of the dbt CLI process.\n raw_output (str): The raw output (``stdout``) of the dbt CLI process.\n logs (List[Dict[str, Any]]): List of parsed JSON logs produced by the dbt command.\n result (Optional[Dict[str, Any]]): Dictionary containing dbt-reported result information\n contained in run_results.json. Some dbt commands do not produce results, and will\n therefore have result = None.\n docs_url (Optional[str]): Hostname where dbt docs are being served for this project.\n """\n\n def __init__(\n self,\n command: str,\n return_code: int,\n raw_output: str,\n logs: List[Dict[str, Any]],\n result: Dict[str, Any],\n docs_url: Optional[str] = None,\n ):\n self._command = check.str_param(command, "command")\n self._return_code = check.int_param(return_code, "return_code")\n self._raw_output = check.str_param(raw_output, "raw_output")\n self._logs = check.list_param(logs, "logs", of_type=dict)\n self._docs_url = check.opt_str_param(docs_url, "docs_url")\n super().__init__(result)\n\n @property\n def command(self) -> str:\n return self._command\n\n @property\n def return_code(self) -> int:\n return self._return_code\n\n @property\n def raw_output(self) -> str:\n return self._raw_output\n\n @property\n def logs(self) -> List[Dict[str, Any]]:\n return self._logs\n\n @property\n def docs_url(self) -> Optional[str]:\n return self._docs_url
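A sketch of reading these attributes after a CLI invocation, assuming it runs inside an op with a logger and that ``dbt`` is a ``DbtCliResource`` handle such as ``context.resources.dbt``:

.. code-block:: python

    out = dbt.run(models=["my_model"])

    context.log.info(f"ran `{out.command}` (return code {out.return_code})")
    if out.result is not None:
        # run_results.json reports one entry per executed node.
        statuses = [r.get("status") for r in out.result.get("results", [])]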
\n
", "current_page_name": "_modules/dagster_dbt/cli/types", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dbt.cli.types"}}, "cloud": {"ops": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dbt.cloud.ops

\nfrom dagster import Array, Bool, Field, In, Noneable, Nothing, Out, Output, op\n\nfrom ..utils import generate_materializations\nfrom .resources import DEFAULT_POLL_INTERVAL\nfrom .types import DbtCloudOutput\n\n\n
[docs]@op(\n required_resource_keys={"dbt_cloud"},\n ins={"start_after": In(Nothing)},\n out=Out(DbtCloudOutput, description="Parsed output from running the dbt Cloud job."),\n config_schema={\n "job_id": Field(\n config=int,\n is_required=True,\n description=(\n "The integer ID of the relevant dbt Cloud job. You can find this value by going to "\n "the details page of your job in the dbt Cloud UI. It will be the final number in the "\n "url, e.g.: "\n " https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/"\n ),\n ),\n "poll_interval": Field(\n float,\n default_value=DEFAULT_POLL_INTERVAL,\n description="The time (in seconds) that will be waited between successive polls.",\n ),\n "poll_timeout": Field(\n Noneable(float),\n default_value=None,\n description="The maximum time that will waited before this operation is timed out. By "\n "default, this will never time out.",\n ),\n "yield_materializations": Field(\n config=Bool,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation will "\n "be yielded when the op executes."\n ),\n ),\n "asset_key_prefix": Field(\n config=Array(str),\n default_value=["dbt"],\n description=(\n "If provided and yield_materializations is True, these components will be used to "\n "prefix the generated asset keys."\n ),\n ),\n },\n tags={"kind": "dbt_cloud"},\n)\ndef dbt_cloud_run_op(context):\n """\n Initiates a run for a dbt Cloud job, then polls until the run completes. If the job\n fails or is otherwised stopped before succeeding, a `dagster.Failure` exception will be raised,\n and this op will fail.\n\n It requires the use of a 'dbt_cloud' resource, which is used to connect to the dbt Cloud API.\n\n **Config Options:**\n\n job_id (int)\n The integer ID of the relevant dbt Cloud job. You can find this value by going to the details\n page of your job in the dbt Cloud UI. It will be the final number in the url, e.g.:\n ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n poll_interval (float)\n The time (in seconds) that will be waited between successive polls. Defaults to ``10``.\n poll_timeout (float)\n The maximum time (in seconds) that will waited before this operation is timed out. By\n default, this will never time out.\n yield_materializations (bool)\n If True, materializations corresponding to the results of the dbt operation will be\n yielded when the solid executes. Defaults to ``True``.\n rasset_key_prefix (float)\n If provided and yield_materializations is True, these components will be used to "\n prefix the generated asset keys. Defaults to ["dbt"].\n\n **Examples:**\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_dbt import dbt_cloud_resource, dbt_cloud_run_op\n\n my_dbt_cloud_resource = dbt_cloud_resource.configured(\n {"auth_token": {"env": "DBT_CLOUD_AUTH_TOKEN"}, "account_id": 77777}\n )\n run_dbt_nightly_sync = dbt_cloud_run_op.configured(\n {"job_id": 54321}, name="run_dbt_nightly_sync"\n )\n\n @job(resource_defs={"dbt_cloud": my_dbt_cloud_resource})\n def dbt_cloud():\n run_dbt_nightly_sync()\n\n\n """\n dbt_output = context.resources.dbt_cloud.run_job_and_poll(\n context.op_config["job_id"],\n poll_interval=context.op_config["poll_interval"],\n poll_timeout=context.op_config["poll_timeout"],\n )\n if context.op_config["yield_materializations"] and "results" in dbt_output.result:\n yield from generate_materializations(\n dbt_output, asset_key_prefix=context.op_config["asset_key_prefix"]\n )\n yield Output(\n dbt_output,\n metadata={\n "created_at": dbt_output.run_details["created_at"],\n "started_at": dbt_output.run_details["started_at"],\n "finished_at": dbt_output.run_details["finished_at"],\n "total_duration": dbt_output.run_details["duration"],\n "run_duration": dbt_output.run_details["run_duration"],\n },\n )
\n
", "current_page_name": "_modules/dagster_dbt/cloud/ops", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dbt.cloud.ops"}, "resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dbt.cloud.resources

\nimport datetime\nimport json\nimport logging\nimport time\nfrom typing import Any, Dict, List, Optional, cast\nfrom urllib.parse import urlencode, urljoin\n\nimport requests\nfrom requests.exceptions import RequestException\n\nfrom dagster import Failure, Field, MetadataValue, StringSource, __version__\nfrom dagster import _check as check\nfrom dagster import get_dagster_logger, resource\nfrom dagster.utils.merger import deep_merge_dicts\n\nfrom .types import DbtCloudOutput\n\nDBT_DEFAULT_HOST = "https://cloud.getdbt.com/"\nDBT_ACCOUNTS_PATH = "api/v2/accounts/"\n\n# default polling interval (in seconds)\nDEFAULT_POLL_INTERVAL = 10\n\n\n
[docs]class DbtCloudResourceV2:\n """This class exposes methods on top of the dbt Cloud REST API v2.\n\n For a complete set of documentation on the dbt Cloud Administrative REST API, including expected\n response JSON schemae, see the `dbt Cloud API Docs <https://docs.getdbt.com/dbt-cloud/api-v2>`_.\n """\n\n def __init__(\n self,\n auth_token: str,\n account_id: int,\n disable_schedule_on_trigger: bool = True,\n request_max_retries: int = 3,\n request_retry_delay: float = 0.25,\n dbt_cloud_host: str = DBT_DEFAULT_HOST,\n log: logging.Logger = get_dagster_logger(),\n log_requests: bool = False,\n ):\n self._auth_token = auth_token\n self._account_id = account_id\n self._disable_schedule_on_trigger = disable_schedule_on_trigger\n\n self._request_max_retries = request_max_retries\n self._request_retry_delay = request_retry_delay\n\n self._dbt_cloud_host = dbt_cloud_host\n self._log = log\n self._log_requests = log_requests\n\n @property\n def api_base_url(self) -> str:\n return urljoin(self._dbt_cloud_host, DBT_ACCOUNTS_PATH)\n\n
[docs] def make_request(\n self,\n method: str,\n endpoint: str,\n data: Optional[Dict[str, Any]] = None,\n return_text: bool = False,\n ) -> Any:\n """\n Creates and sends a request to the desired dbt Cloud API endpoint.\n\n Args:\n method (str): The http method to use for this request (e.g. "POST", "GET", "PATCH").\n endpoint (str): The dbt Cloud API endpoint to send this request to.\n data (Optional[str]): JSON-formatted data string to be included in the request.\n return_text (bool): Override default behavior and return unparsed {"text": response.text}\n blob instead of json.\n Returns:\n Dict[str, Any]: Parsed json data from the response to this request\n """\n\n headers = {\n "User-Agent": f"dagster-dbt/{__version__}",\n "Content-Type": "application/json",\n "Authorization": f"Bearer {self._auth_token}",\n }\n url = urljoin(self.api_base_url, endpoint)\n\n if self._log_requests:\n self._log.debug(f"Making Request: method={method} url={url} data={data}")\n\n num_retries = 0\n while True:\n try:\n response = requests.request(\n method=method,\n url=url,\n headers=headers,\n data=json.dumps(data),\n allow_redirects=False,\n )\n response.raise_for_status()\n return {"text": response.text} if return_text else response.json()["data"]\n except RequestException as e:\n self._log.error("Request to dbt Cloud API failed: %s", e)\n if num_retries == self._request_max_retries:\n break\n num_retries += 1\n time.sleep(self._request_retry_delay)\n\n raise Failure("Exceeded max number of retries.")
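As an illustration of the request helper above, the following sketch (not part of the original module) calls ``make_request`` directly for an endpoint; the auth token, account ID, and job ID are placeholders.

.. code-block:: python

    from dagster_dbt.cloud.resources import DbtCloudResourceV2

    # Placeholder credentials for illustration only.
    client = DbtCloudResourceV2(auth_token="my-token", account_id=11111)

    # Equivalent to client.get_job(22222): GET the details of a single job.
    job_data = client.make_request("GET", "11111/jobs/22222/")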
\n\n
[docs] def get_job(self, job_id: int) -> Dict[str, Any]:\n """\n Gets details about a given dbt job from the dbt Cloud API.\n\n Args:\n job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n the details page of your job in the dbt Cloud UI. It will be the final number in the\n url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n Returns:\n Dict[str, Any]: Parsed json data from the response to this request\n """\n return self.make_request("GET", f"{self._account_id}/jobs/{job_id}/")
\n\n
[docs] def update_job(self, job_id: int, **kwargs) -> Dict[str, Any]:\n """\n Updates specific properties of a dbt job. Documentation on the full set of potential\n parameters can be found here: https://docs.getdbt.com/dbt-cloud/api-v2#operation/updateJobById\n\n Args:\n job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n the details page of your job in the dbt Cloud UI. It will be the final number in the\n url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n kwargs: Passed in as the properties to be changed.\n Returns:\n Dict[str, Any]: Parsed json data from the response to this request\n\n Examples:\n\n .. code-block:: python\n\n # disable schedule for job with id=12345\n my_dbt_cloud_resource.update_job(12345, triggers={"schedule": False})\n """\n # API requires you to supply a bunch of values, so we can just use the current state\n # as the defaults\n job_data = self.get_job(job_id)\n return self.make_request(\n "POST", f"{self._account_id}/jobs/{job_id}/", data=deep_merge_dicts(job_data, kwargs)\n )
\n\n
[docs] def run_job(self, job_id: int, **kwargs) -> Dict[str, Any]:\n """\n Initializes a run for a job. Overrides for specific properties can be set by passing in\n values to the kwargs. A full list of overridable properties can be found here:\n https://docs.getdbt.com/dbt-cloud/api-v2#operation/triggerRun\n\n Args:\n job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n the details page of your job in the dbt Cloud UI. It will be the final number in the\n url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n kwargs: Passed in as the properties to be overridden.\n\n Returns:\n Dict[str, Any]: Parsed json data from the response to this request\n """\n if self._disable_schedule_on_trigger:\n self._log.info("Disabling dbt Cloud job schedule.")\n self.update_job(job_id, triggers={"schedule": False})\n self._log.info(f"Initializing run for job with job_id={job_id}")\n if "cause" not in kwargs:\n kwargs["cause"] = "Triggered via Dagster"\n resp = self.make_request("POST", f"{self._account_id}/jobs/{job_id}/run/", data=kwargs)\n self._log.info(\n f"Run initialized with run_id={resp['id']}. View this run in "\n f"the dbt Cloud UI: {resp['href']}"\n )\n return resp
\n\n
[docs] def get_runs(\n self,\n include_related: Optional[List[str]] = None,\n job_id: Optional[int] = None,\n order_by: Optional[str] = "-id",\n offset: int = 0,\n limit: int = 100,\n ) -> List[Dict[str, object]]:\n """\n Returns a list of runs from dbt Cloud. This can be optionally filtered to a specific job\n using the job_id parameter. It supports pagination using offset and limit as well and\n can be configured to load a variety of related information about the runs.\n\n Args:\n include_related (Optional[List[str]]): A list of resources to include in the response\n from dbt Cloud. This is technically a required field according to the API, but it\n can be passed with an empty list where it will only load the default run\n information. Valid values are "trigger", "job", "repository", and "environment".\n job_id (Optional[int]): This method can be optionally filtered to only\n load runs for a specific job id if it is included here. If omitted it will pull\n runs for every job.\n order_by (Optional[str]): An identifier designated by dbt Cloud in which to sort the\n results before returning them. Useful when combined with offset and limit to load\n runs for a job. Defaults to "-id" where "-" designates reverse order and "id" is\n the key to sort on.\n offset (int): An offset to apply when listing runs. Can be used to paginate results\n when combined with order_by and limit. Defaults to 0.\n limit (int): Limits the number of rows returned by the API. Defaults to 100.\n\n Returns:\n List[Dict[str, Any]]: A list of dictionaries containing the runs and any included\n related information.\n """\n query_dict = {\n "include_related": include_related or [],\n "order_by": order_by,\n "offset": offset,\n "limit": limit,\n }\n if job_id:\n query_dict["job_definition_id"] = job_id\n return self.make_request("GET", f"{self._account_id}/runs/?{urlencode(query_dict)}")
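A minimal sketch of the pagination pattern this method supports, assuming a ``DbtCloudResourceV2`` client constructed with placeholder credentials and a hypothetical job ID:

.. code-block:: python

    from dagster_dbt.cloud.resources import DbtCloudResourceV2

    client = DbtCloudResourceV2(auth_token="my-token", account_id=11111)  # placeholder values

    # Page through every run for a single (hypothetical) job, 100 at a time.
    offset = 0
    all_runs = []
    while True:
        page = client.get_runs(job_id=33333, order_by="-id", offset=offset, limit=100)
        all_runs.extend(page)
        if len(page) < 100:
            break
        offset += 100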
\n\n
[docs] def get_run(self, run_id: int, include_related: Optional[List[str]] = None) -> Dict[str, Any]:\n """\n Gets details about a specific job run.\n\n Args:\n run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n the details page of your run in the dbt Cloud UI. It will be the final number in the\n url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n include_related (List[str]): List of related fields to pull with the run. Valid values\n are "trigger", "job", and "debug_logs".\n\n Returns:\n Dict[str, Any]: A dictionary containing the parsed contents of the dbt Cloud run details.\n See: https://docs.getdbt.com/dbt-cloud/api-v2#operation/getRunById for schema.\n """\n query_params = f"?include_related={','.join(include_related)}" if include_related else ""\n return self.make_request(\n "GET",\n f"{self._account_id}/runs/{run_id}/{query_params}",\n )
\n\n
[docs] def get_run_steps(self, run_id: int) -> List[str]:\n """\n Gets the steps of an initialized dbt Cloud run.\n\n Args:\n run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n the details page of your run in the dbt Cloud UI. It will be the final number in the\n url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n\n Returns:\n List[str]: List of commands for each step of the run.\n """\n run_details = self.get_run(run_id, include_related=["trigger", "job"])\n steps = run_details["job"]["execute_steps"]\n steps_override = run_details["trigger"]["steps_override"]\n return steps_override or steps
\n\n
[docs] def cancel_run(self, run_id: int) -> Dict[str, Any]:\n """\n Cancels a dbt Cloud run.\n\n Args:\n run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n the details page of your run in the dbt Cloud UI. It will be the final number in the\n url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n\n Returns:\n Dict[str, Any]: A dictionary containing the parsed contents of the dbt Cloud run details.\n See: https://docs.getdbt.com/dbt-cloud/api-v2#operation/getRunById for schema.\n """\n self._log.info(f"Cancelling run with id '{run_id}'")\n return self.make_request("POST", f"{self._account_id}/runs/{run_id}/cancel/")
\n\n
[docs] def list_run_artifacts(self, run_id: int, step: Optional[int] = None) -> List[str]:\n """\n Lists the paths of the available run artifacts from a completed dbt Cloud run.\n\n Args:\n run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n the details page of your run in the dbt Cloud UI. It will be the final number in the\n url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n step (int): The index of the step in the run to query for artifacts. The first step in\n the run has the index 1. If the step parameter is omitted, then this endpoint will\n return the artifacts compiled for the last step in the run\n\n Returns:\n List[str]: List of the paths of the available run artifacts\n """\n query_params = f"?step={step}" if step else ""\n return cast(\n list,\n self.make_request(\n "GET",\n f"{self._account_id}/runs/{run_id}/artifacts/{query_params}",\n data={"step": step} if step else None,\n ),\n )
\n\n
[docs] def get_run_artifact(self, run_id: int, path: str, step: Optional[int] = None) -> str:\n """\n Gets the string contents of a run artifact from a dbt Cloud run.\n\n Args:\n run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n the details page of your run in the dbt Cloud UI. It will be the final number in the\n url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n path (str): The path to this run artifact (e.g. 'run/my_new_project/models/example/my_first_dbt_model.sql')\n step (int): The index of the step in the run to query for artifacts. The first step in\n the run has the index 1. If the step parameter is omitted, then this endpoint will\n return the artifacts compiled for the last step in the run.\n\n Returns:\n str: The string contents of the specified run artifact.\n """\n query_params = f"?step={step}" if step else ""\n return self.make_request(\n "GET",\n f"{self._account_id}/runs/{run_id}/artifacts/{path}{query_params}",\n data={"step": step} if step else None,\n return_text=True,\n )["text"]
\n\n
[docs] def get_manifest(self, run_id: int, step: Optional[int] = None) -> Dict[str, Any]:\n """\n The parsed contents of a manifest.json file created by a completed run.\n\n Args:\n run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n the details page of your run in the dbt Cloud UI. It will be the final number in the\n url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n step (int): The index of the step in the run to query for artifacts. The first step in\n the run has the index 1. If the step parameter is omitted, then this endpoint will\n return the artifacts compiled for the last step in the run.\n\n Returns:\n Dict[str, Any]: Parsed contents of the manifest.json file\n """\n return json.loads(self.get_run_artifact(run_id, "manifest.json", step=step))
\n\n
[docs] def get_run_results(self, run_id: int, step: Optional[int] = None) -> Dict[str, Any]:\n """\n The parsed contents of a run_results.json file created by a completed run.\n\n Args:\n run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n the details page of your run in the dbt Cloud UI. It will be the final number in the\n url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n step (int): The index of the step in the run to query for artifacts. The first step in\n the run has the index 1. If the step parameter is omitted, then this endpoint will\n return the artifacts compiled for the last step in the run.\n\n Returns:\n Dict[str, Any]: Parsed contents of the run_results.json file\n """\n return json.loads(self.get_run_artifact(run_id, "run_results.json", step=step))
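The artifact helpers above compose naturally; a short illustrative sketch (placeholder credentials and run ID) might look like:

.. code-block:: python

    from dagster_dbt.cloud.resources import DbtCloudResourceV2

    client = DbtCloudResourceV2(auth_token="my-token", account_id=11111)  # placeholder values
    run_id = 44444  # placeholder run ID

    # List the artifact paths produced by the final step of the run...
    artifact_paths = client.list_run_artifacts(run_id)

    # ...then fetch the parsed run results and manifest for inspection.
    run_results = client.get_run_results(run_id)
    manifest = client.get_manifest(run_id)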
\n\n
[docs] def poll_run(\n self,\n run_id: int,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n href: Optional[str] = None,\n ) -> Dict[str, Any]:\n """\n Polls a dbt Cloud job run until it completes. Will raise a `dagster.Failure` exception if the\n run does not complete successfully.\n\n Args:\n run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n the details page of your run in the dbt Cloud UI. It will be the final number in the\n url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n poll_interval (float): The time (in seconds) that should be waited between successive\n polls of the dbt Cloud API.\n poll_timeout (float): The maximum time (in seconds) that should be waited for this run\n to complete. If this threshold is exceeded, the run will be cancelled and an\n exception will be thrown. By default, this will poll forever.\n href (str): For internal use, generally should not be set manually.\n\n Returns:\n Dict[str, Any]: A dictionary containing the parsed contents of the dbt Cloud run details.\n See: https://docs.getdbt.com/dbt-cloud/api-v2#operation/getRunById for schema.\n """\n\n if href is None:\n href = self.get_run(run_id).get("href")\n assert isinstance(href, str), "Run must have an href"\n\n poll_start = datetime.datetime.now()\n while True:\n run_details = self.get_run(run_id)\n status = run_details["status_humanized"]\n self._log.info(f"Polled run {run_id}. Status: [{status}]")\n\n # completed successfully\n if status == "Success":\n return self.get_run(run_id, include_related=["job", "trigger"])\n elif status in ["Error", "Cancelled"]:\n break\n elif status not in ["Queued", "Starting", "Running"]:\n check.failed(f"Received unexpected status '{status}'. This should never happen")\n\n if poll_timeout and datetime.datetime.now() > poll_start + datetime.timedelta(\n seconds=poll_timeout\n ):\n self.cancel_run(run_id)\n raise Failure(\n f"Run {run_id} timed out after "\n f"{datetime.datetime.now() - poll_start}. Attempted to cancel.",\n metadata={"run_page_url": MetadataValue.url(href)},\n )\n\n # Sleep for the configured time interval before polling again.\n time.sleep(poll_interval)\n\n run_details = self.get_run(run_id, include_related=["trigger"])\n raise Failure(\n f"Run {run_id} failed. Status Message: {run_details['status_message']}",\n metadata={\n "run_details": MetadataValue.json(run_details),\n "run_page_url": MetadataValue.url(href),\n },\n )
\n\n
[docs] def run_job_and_poll(\n self,\n job_id: int,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n ) -> DbtCloudOutput:\n """\n Runs a dbt Cloud job and polls until it completes. Will raise a `dagster.Failure` exception\n if the run does not complete successfully.\n\n Args:\n job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n the details page of your job in the dbt Cloud UI. It will be the final number in the\n url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n poll_interval (float): The time (in seconds) that should be waited between successive\n polls of the dbt Cloud API.\n poll_timeout (float): The maximum time (in seconds) that should be waited for this run\n to complete. If this threshold is exceeded, the run will be cancelled and an\n exception will be thrown. By default, this will poll forever.\n\n Returns:\n :py:class:`~DbtCloudOutput`: Class containing details about the specific job run and the\n parsed run results.\n """\n run_details = self.run_job(job_id)\n run_id = run_details["id"]\n href = run_details["href"]\n final_run_details = self.poll_run(\n run_id, poll_interval=poll_interval, poll_timeout=poll_timeout, href=href\n )\n output = DbtCloudOutput(run_details=final_run_details, result=self.get_run_results(run_id))\n if output.docs_url:\n self._log.info(f"Docs for this run can be viewed here: {output.docs_url}")\n return output
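A hedged example of driving a job end to end with the client directly, outside of an op; the job ID, credentials, and timeouts below are placeholders:

.. code-block:: python

    from dagster_dbt.cloud.resources import DbtCloudResourceV2

    client = DbtCloudResourceV2(auth_token="my-token", account_id=11111)  # placeholder values

    # Trigger (hypothetical) job 54321, poll every 30 seconds, and give up after one hour.
    dbt_output = client.run_job_and_poll(54321, poll_interval=30, poll_timeout=3600)
    print(dbt_output.run_details["duration"])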
\n\n\n
[docs]@resource(\n config_schema={\n "auth_token": Field(\n StringSource,\n is_required=True,\n description="dbt Cloud API Token. User tokens can be found in the "\n "[dbt Cloud UI](https://cloud.getdbt.com/#/profile/api/), or see the "\n "[dbt Cloud Docs](https://docs.getdbt.com/docs/dbt-cloud/dbt-cloud-api/service-tokens) "\n "for instructions on creating a Service Account token.",\n ),\n "account_id": Field(\n int,\n is_required=True,\n description="dbt Cloud Account ID. This value can be found in the url of a variety of "\n "views in the dbt Cloud UI, e.g. https://cloud.getdbt.com/#/accounts/{account_id}/settings/.",\n ),\n "disable_schedule_on_trigger": Field(\n bool,\n default_value=True,\n description="Specifies if you would like any job that is triggered using this "\n "resource to automatically disable its schedule.",\n ),\n "request_max_retries": Field(\n int,\n default_value=3,\n description="The maximum number of times requests to the dbt Cloud API should be retried "\n "before failing.",\n ),\n "request_retry_delay": Field(\n float,\n default_value=0.25,\n description="Time (in seconds) to wait between each request retry.",\n ),\n "dbt_cloud_host": Field(\n config=StringSource,\n default_value=DBT_DEFAULT_HOST,\n description="The hostname where dbt cloud is being hosted (e.g. https://my_org.cloud.getdbt.com/).",\n ),\n },\n description="This resource helps interact with dbt Cloud connectors",\n)\ndef dbt_cloud_resource(context) -> DbtCloudResourceV2:\n """\n This resource allows users to programatically interface with the dbt Cloud Administrative REST\n API (v2) to launch jobs and monitor their progress. This currently implements only a subset of\n the functionality exposed by the API.\n\n For a complete set of documentation on the dbt Cloud Administrative REST API, including expected\n response JSON schemae, see the `dbt Cloud API Docs <https://docs.getdbt.com/dbt-cloud/api-v2>`_.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/concepts/configuration/configured>`_ method.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import job\n from dagster_dbt import dbt_cloud_resource\n\n my_dbt_cloud_resource = dbt_cloud_resource.configured(\n {\n "auth_token": {"env": "DBT_CLOUD_AUTH_TOKEN"},\n "account_id": 30000,\n }\n )\n\n @job(resource_defs={"dbt_cloud":my_dbt_cloud_resource})\n def my_dbt_cloud_job():\n ...\n """\n return DbtCloudResourceV2(\n auth_token=context.resource_config["auth_token"],\n account_id=context.resource_config["account_id"],\n disable_schedule_on_trigger=context.resource_config["disable_schedule_on_trigger"],\n request_max_retries=context.resource_config["request_max_retries"],\n request_retry_delay=context.resource_config["request_retry_delay"],\n log=context.log,\n dbt_cloud_host=context.resource_config["dbt_cloud_host"],\n )
\n
", "current_page_name": "_modules/dagster_dbt/cloud/resources", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dbt.cloud.resources"}}, "dbt_resource": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dbt.dbt_resource

\nimport logging\nfrom abc import abstractmethod\nfrom typing import Any, Dict, List, Optional\n\nfrom dagster import get_dagster_logger\n\nfrom .types import DbtOutput\n\n\n
[docs]class DbtResource:\n """Base class for a resource allowing users to interface with dbt"""\n\n def __init__(\n self,\n logger: Optional[logging.Logger] = None,\n ):\n """Constructor\n\n Args:\n logger (Optional[logging.Logger]): A property for injecting a logger dependency.\n Default is ``None``.\n """\n self._logger = logger or get_dagster_logger()\n\n def _format_params(\n self, flags: Dict[str, Any], replace_underscores: bool = False\n ) -> Dict[str, Any]:\n """\n Reformats arguments that are easier to express as a list into the format that dbt expects,\n and deletes any keys with no value.\n """\n\n # remove any keys with a value of None\n if replace_underscores:\n flags = {k.replace("_", "-"): v for k, v in flags.items() if v is not None}\n else:\n flags = {k: v for k, v in flags.items() if v is not None}\n\n for param in ["select", "exclude", "models"]:\n if param in flags:\n if isinstance(flags[param], list):\n # if it's a list, format as space-separated\n flags[param] = " ".join(set(flags[param]))\n\n return flags\n\n @property\n def logger(self) -> logging.Logger:\n """logging.Logger: A property for injecting a logger dependency."""\n return self._logger\n\n
[docs] @abstractmethod\n def compile(\n self, models: Optional[List[str]] = None, exclude: Optional[List[str]] = None, **kwargs\n ) -> DbtOutput:\n """\n Run the ``compile`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n models (List[str], optional): the models to include in compilation.\n exclude (List[str], optional): the models to exclude from compilation.\n\n Returns:\n DbtOutput: object containing parsed output from dbt\n """
\n\n
[docs] @abstractmethod\n def run(\n self, models: Optional[List[str]] = None, exclude: Optional[List[str]] = None, **kwargs\n ) -> DbtOutput:\n """\n Run the ``run`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n models (List[str], optional): the models to include in the run.\n exclude (List[str], optional): the models to exclude from the run.\n\n Returns:\n DbtOutput: object containing parsed output from dbt\n """
\n\n
[docs] @abstractmethod\n def snapshot(\n self, select: Optional[List[str]] = None, exclude: Optional[List[str]] = None, **kwargs\n ) -> DbtOutput:\n """\n Run the ``snapshot`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n select (List[str], optional): the snapshots to include in the run.\n exclude (List[str], optional): the snapshots to exclude from the run.\n\n Returns:\n DbtOutput: object containing parsed output from dbt\n """
\n\n
[docs] @abstractmethod\n def test(\n self,\n models: Optional[List[str]] = None,\n exclude: Optional[List[str]] = None,\n data: bool = True,\n schema: bool = True,\n **kwargs,\n ) -> DbtOutput:\n """\n Run the ``test`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n models (List[str], optional): the models to include in testing.\n exclude (List[str], optional): the models to exclude from testing.\n data (bool, optional): If ``True`` (default), then run data tests.\n schema (bool, optional): If ``True`` (default), then run schema tests.\n\n Returns:\n DbtOutput: object containing parsed output from dbt\n """
\n\n
[docs] @abstractmethod\n def seed(\n self,\n show: bool = False,\n select: Optional[List[str]] = None,\n exclude: Optional[List[str]] = None,\n **kwargs,\n ) -> DbtOutput:\n """\n Run the ``seed`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n show (bool, optional): If ``True``, then show a sample of the seeded data in the\n response. Defaults to ``False``.\n select (List[str], optional): the seeds to include in the run.\n exclude (List[str], optional): the seeds to exclude from the run.\n\n\n Returns:\n DbtOutput: object containing parsed output from dbt\n """
\n\n
[docs] @abstractmethod\n def ls(\n self,\n select: Optional[List[str]] = None,\n models: Optional[List[str]] = None,\n exclude: Optional[List[str]] = None,\n **kwargs,\n ) -> DbtOutput:\n """\n Run the ``ls`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n select (List[str], optional): the resources to include in the output.\n models (List[str], optional): the models to include in the output.\n exclude (List[str], optional): the resources to exclude from the output.\n\n\n Returns:\n DbtOutput: object containing parsed output from dbt\n """
\n\n
[docs] @abstractmethod\n def build(self, select: Optional[List[str]] = None, **kwargs) -> DbtOutput:\n """\n Run the ``build`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n select (List[str], optional): the models/resources to include in the run.\n\n Returns:\n DbtOutput: object containing parsed output from dbt\n """\n raise NotImplementedError()
\n\n
[docs] @abstractmethod\n def generate_docs(self, compile_project: bool = False, **kwargs) -> DbtOutput:\n """\n Run the ``docs generate`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n compile_project (bool, optional): If true, compile the project before generating a catalog.\n\n Returns:\n DbtOutput: object containing parsed output from dbt\n """
\n\n
[docs] @abstractmethod\n def run_operation(\n self, macro: str, args: Optional[Dict[str, Any]] = None, **kwargs\n ) -> DbtOutput:\n """\n Run the ``run-operation`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n macro (str): the dbt macro to invoke.\n args (Dict[str, Any], optional): the keyword arguments to be supplied to the macro.\n\n Returns:\n DbtOutput: object containing parsed output from dbt\n """
\n\n
[docs] @abstractmethod\n def get_run_results_json(self, **kwargs) -> Optional[Dict[str, Any]]:\n """\n Get a parsed version of the run_results.json file for the relevant dbt project.\n\n Returns:\n Dict[str, Any]: dictionary containing the parsed contents of the run_results json file\n for this dbt project.\n """
\n\n
[docs] @abstractmethod\n def get_manifest_json(self, **kwargs) -> Optional[Dict[str, Any]]:\n """\n Get a parsed version of the manifest.json file for the relevant dbt project.\n\n Returns:\n Dict[str, Any]: dictionary containing the parsed contents of the manifest json file\n for this dbt project.\n """
\n
", "current_page_name": "_modules/dagster_dbt/dbt_resource", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dbt.dbt_resource"}, "errors": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dbt.errors

\nfrom abc import ABC\nfrom typing import Any, Dict, List\n\nfrom dagster import Failure, MetadataEntry\nfrom dagster import _check as check\n\n\n
[docs]class DagsterDbtError(Failure, ABC):\n """The base exception of the ``dagster-dbt`` library."""
\n\n\n
[docs]class DagsterDbtCliUnexpectedOutputError(DagsterDbtError):\n """Represents an error when parsing the output of a dbt CLI command."""\n\n invalid_line_nos: List[int]\n\n def __init__(self, invalid_line_nos: List[int]):\n check.list_param(invalid_line_nos, "invalid_line_nos", int)\n line_nos_str = ", ".join(map(str, invalid_line_nos))\n description = f"dbt CLI emitted unexpected output on lines {line_nos_str}"\n metadata_entries = [\n MetadataEntry("Invalid CLI Output Line Numbers", value={"line_nos": invalid_line_nos})\n ]\n super().__init__(description, metadata_entries)\n self.invalid_line_nos = invalid_line_nos
\n\n\n
[docs]class DagsterDbtCliRuntimeError(DagsterDbtError, ABC):\n """Represents an error while executing a dbt CLI command."""\n\n def __init__(self, description: str, logs: List[Dict[str, Any]], raw_output: str):\n metadata_entries = [\n MetadataEntry(\n "Parsed CLI Output (JSON)",\n value={"logs": logs},\n ),\n MetadataEntry(\n "Parsed CLI Output (JSON) Message Attributes",\n value=DagsterDbtCliRuntimeError.stitch_messages(logs),\n ),\n MetadataEntry(\n "Raw CLI Output",\n value=raw_output,\n ),\n ]\n super().__init__(description, metadata_entries)\n\n @staticmethod\n def stitch_messages(logs: List[dict]) -> str:\n return "\\n".join(\n log["message"].strip("\\n")\n for log in logs\n if isinstance(log.get("message"), str) # defensive\n )
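For illustration, ``stitch_messages`` can be exercised on its own; the log records below are hypothetical examples of parsed dbt CLI output:

.. code-block:: python

    from dagster_dbt.errors import DagsterDbtCliRuntimeError

    # Hypothetical parsed dbt CLI log records.
    logs = [
        {"message": "Running with dbt\n", "levelname": "INFO"},
        {"message": "Compilation Error in model my_model\n", "levelname": "ERROR"},
    ]

    # Joins the "message" values into a single newline-separated string.
    print(DagsterDbtCliRuntimeError.stitch_messages(logs))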
\n\n\n
[docs]class DagsterDbtCliHandledRuntimeError(DagsterDbtCliRuntimeError):\n """Represents a model error reported by the dbt CLI at runtime (return code 1)."""\n\n def __init__(self, logs: List[Dict[str, Any]], raw_output: str):\n super().__init__("Handled error in the dbt CLI (return code 1)", logs, raw_output)
\n\n\n
[docs]class DagsterDbtCliFatalRuntimeError(DagsterDbtCliRuntimeError):\n """Represents a fatal error in the dbt CLI (return code 2)."""\n\n def __init__(self, logs: List[Dict[str, Any]], raw_output: str):\n super().__init__("Fatal error in the dbt CLI (return code 2)", logs, raw_output)
\n\n\n
[docs]class DagsterDbtRpcUnexpectedPollOutputError(DagsterDbtError):\n """Represents an unexpected response when polling the dbt RPC server."""
\n\n\n
[docs]class DagsterDbtCliOutputsNotFoundError(DagsterDbtError):\n """Represents a problem in finding the ``target/run_results.json`` artifact when executing a dbt\n CLI command.\n\n For more details on ``target/run_results.json``, see\n https://docs.getdbt.com/reference/dbt-artifacts#run_resultsjson.\n """\n\n def __init__(self, path: str):\n super().__init__("Expected to find file at path {}".format(path))
\n
", "current_page_name": "_modules/dagster_dbt/errors", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dbt.errors"}, "ops": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dbt.ops

\nfrom dagster import Array, Bool, Field, In, Nothing, Out, Output, op\n\nfrom .types import DbtOutput\nfrom .utils import generate_events, generate_materializations\n\n_DEFAULT_OP_PROPS = dict(\n    required_resource_keys={"dbt"},\n    ins={"start_after": In(Nothing)},\n    out=Out(DbtOutput, description="Parsed output from running the dbt command."),\n    tags={"kind": "dbt"},\n)\n\n\ndef _get_doc(op_name: str, dbt_command: str) -> str:\n    return f"""\nThis op executes a ``dbt {dbt_command}`` command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the :py:class:`~dagster_dbt.dbt_cli_resource`) or\nover RPC (using the :py:class:`~dbt_rpc_sync_resource`).\n\nExamples:\n\n.. code-block:: python\n\n    from dagster import job\n    from dagster_dbt import {op_name}, dbt_cli_resource, dbt_rpc_sync_resource\n\n    @job(resource_defs={{"dbt":dbt_cli_resource}})\n    def my_dbt_cli_job():\n        {op_name}()\n\n    @job(resource_defs={{"dbt":dbt_rpc_sync_resource}})\n    def my_dbt_rpc_job():\n        {op_name}()\n    """\n\n\n# NOTE: mypy fails to properly track the type of `_DEFAULT_OP_PROPS` items when they are\n# double-splatted, so we type-ignore the below op declarations.\n\n\n@op(  # type: ignore\n    **_DEFAULT_OP_PROPS,\n    config_schema={\n        "yield_asset_events": Field(\n            config=Bool,\n            default_value=True,\n            description=(\n                "If True, materializations and asset observations corresponding to the results of "\n                "the dbt operation will be yielded when the op executes. Default: True"\n            ),\n        ),\n        "asset_key_prefix": Field(\n            config=Array(str),\n            default_value=["dbt"],\n            description=(\n                "If provided and yield_materializations is True, these components will be used to "\n                "prefix the generated asset keys."\n            ),\n        ),\n    },\n)\ndef dbt_build_op(context):\n    dbt_output = context.resources.dbt.build()\n    if context.op_config["yield_asset_events"] and "results" in dbt_output.result:\n        yield from generate_events(\n            dbt_output,\n            node_info_to_asset_key=lambda info: context.op_config["asset_key_prefix"]\n            + info["unique_id"].split("."),\n            manifest_json=context.resources.dbt.get_manifest_json(),\n        )\n    yield Output(dbt_output)\n\n\n
[docs]@op( # type: ignore\n **_DEFAULT_OP_PROPS,\n config_schema={\n "yield_materializations": Field(\n config=Bool,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation will "\n "be yielded when the op executes. Default: True"\n ),\n ),\n "asset_key_prefix": Field(\n config=Array(str),\n default_value=["dbt"],\n description=(\n "If provided and yield_materializations is True, these components will be used to "\n "prefix the generated asset keys."\n ),\n ),\n },\n)\ndef dbt_run_op(context):\n dbt_output = context.resources.dbt.run()\n if context.op_config["yield_materializations"] and "results" in dbt_output.result:\n yield from generate_materializations(\n dbt_output, asset_key_prefix=context.op_config["asset_key_prefix"]\n )\n yield Output(dbt_output)
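As with ``dbt_cloud_run_op`` above, this op's config schema is typically supplied via ``configured``; a short sketch, where the ``jaffle_shop`` asset key prefix is purely illustrative:

.. code-block:: python

    from dagster import job
    from dagster_dbt import dbt_cli_resource, dbt_run_op

    # Prefix generated asset keys with ["jaffle_shop"] instead of the default ["dbt"].
    my_dbt_run = dbt_run_op.configured(
        {"yield_materializations": True, "asset_key_prefix": ["jaffle_shop"]},
        name="my_dbt_run",
    )

    @job(resource_defs={"dbt": dbt_cli_resource})
    def my_dbt_run_job():
        my_dbt_run()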
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS) # type: ignore\ndef dbt_compile_op(context):\n return context.resources.dbt.compile()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS) # type: ignore\ndef dbt_ls_op(context):\n return context.resources.dbt.ls()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS) # type: ignore\ndef dbt_test_op(context):\n return context.resources.dbt.test()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS) # type: ignore\ndef dbt_snapshot_op(context):\n return context.resources.dbt.snapshot()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS) # type: ignore\ndef dbt_seed_op(context):\n return context.resources.dbt.seed()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS) # type: ignore\ndef dbt_docs_generate_op(context):\n return context.resources.dbt.generate_docs()
\n\n\nfor dbt_op, cmd in [\n (dbt_build_op, "build"),\n (dbt_run_op, "run"),\n (dbt_compile_op, "compile"),\n (dbt_ls_op, "ls"),\n (dbt_test_op, "test"),\n (dbt_snapshot_op, "snapshot"),\n (dbt_seed_op, "seed"),\n (dbt_docs_generate_op, "docs generate"),\n]:\n dbt_op.__doc__ = _get_doc(dbt_op.name, cmd)\n
", "current_page_name": "_modules/dagster_dbt/ops", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dbt.ops"}, "rpc": {"resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dbt.rpc.resources

\nimport json\nimport logging\nimport platform\nimport sys\nimport time\nimport uuid\nfrom base64 import standard_b64encode as b64\nfrom typing import Any, Dict, List, Optional\n\nimport requests\n\nfrom dagster import Failure, Field, IntSource, RetryRequested, StringSource\nfrom dagster import _check as check\nfrom dagster import resource\nfrom dagster.core.utils import coerce_valid_log_level\n\nfrom ..dbt_resource import DbtResource\nfrom .types import DbtRpcOutput\nfrom .utils import is_fatal_code\n\n\n
[docs]class DbtRpcResource(DbtResource):\n """A client for a dbt RPC server.\n\n To use this as a dagster resource, we recommend using\n :func:`dbt_rpc_resource <dagster_dbt.dbt_rpc_resource>`.\n """\n\n def __init__(\n self,\n host: str = "0.0.0.0",\n port: int = 8580,\n jsonrpc_version: str = "2.0",\n logger: Optional[Any] = None,\n **_,\n ):\n """Constructor\n\n Args:\n host (str): The IP address of the host of the dbt RPC server. Default is ``"0.0.0.0"``.\n port (int): The port of the dbt RPC server. Default is ``8580``.\n jsonrpc_version (str): The JSON-RPC version to send in RPC requests.\n Default is ``"2.0"``.\n logger (Optional[Any]): A property for injecting a logger dependency.\n Default is ``None``.\n """\n check.str_param(host, "host")\n check.int_param(port, "port")\n check.str_param(jsonrpc_version, "jsonrpc_version")\n\n self._host = host\n self._port = port\n self._jsonrpc_version = jsonrpc_version\n super().__init__(logger)\n\n @staticmethod\n def _construct_user_agent() -> str:\n """A helper method to construct a standard User-Agent string to be used in HTTP request\n headers.\n\n Returns:\n str: The constructed User-Agent value.\n """\n client = "dagster/dbt-rpc-client"\n python_version = (\n f"Python/{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"\n )\n system_info = f"{platform.system()}/{platform.release()}"\n user_agent = " ".join([python_version, client, system_info])\n return user_agent\n\n def _construct_headers(self) -> Dict[str, str]:\n """Constructs a standard set of headers for HTTP requests.\n\n Returns:\n Dict[str, str]: The HTTP request headers.\n """\n headers = requests.utils.default_headers()\n headers["User-Agent"] = self._construct_user_agent()\n headers["Content-Type"] = "application/json"\n headers["Accept"] = "application/json"\n return headers\n\n def _post(self, data: Optional[str] = None) -> DbtRpcOutput:\n """Constructs and sends a POST request to the dbt RPC server.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n headers = self._construct_headers()\n try:\n response = requests.post(self.url, headers=headers, data=data)\n response.raise_for_status()\n except requests.exceptions.HTTPError as e:\n if is_fatal_code(e):\n raise e\n else:\n raise RetryRequested(max_retries=5, seconds_to_wait=30)\n return DbtRpcOutput(response)\n\n def _get_result(self, data: Optional[str] = None) -> DbtRpcOutput:\n """Constructs and sends a POST request to the dbt RPC server.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n return self._post(data)\n\n def _default_request(\n self, method: str, params: Optional[Dict[str, Any]] = None\n ) -> Dict[str, Any]:\n """Constructs a standard HTTP request body, to be sent to a dbt RPC server.\n\n Args:\n method (str): a dbt RPC method.\n\n Returns:\n Dict: the constructed HTTP request body.\n """\n data = {\n "jsonrpc": self.jsonrpc_version,\n "method": method,\n "id": str(uuid.uuid1()),\n "params": params or {},\n }\n return data\n\n @property\n def host(self) -> str:\n """str: The IP address of the host of the dbt RPC server."""\n return self._host\n\n @property\n def port(self) -> int:\n """int: The port of the dbt RPC server."""\n return self._port\n\n @property\n def jsonrpc_version(self) -> str:\n """str: The JSON-RPC version to send in RPC requests."""\n return self._jsonrpc_version\n\n @property\n def logger(self) -> logging.Logger:\n """logging.Logger: A property for injecting a logger dependency."""\n return self._logger\n\n 
@property\n def url(self) -> str:\n """str: The URL for sending dbt RPC requests."""\n return f"http://{self.host}:{self.port}/jsonrpc"\n\n
[docs] def status(self):\n """Sends a request with the method ``status`` to the dbt RPC server, and returns the\n response. For more details, see the dbt docs for the RPC method `status\n <https://docs.getdbt.com/reference/commands/rpc/#status>`_.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n data = self._default_request(method="status")\n return self._post(data=json.dumps(data))
\n\n
[docs] def ls(\n self,\n select: Optional[List[str]] = None,\n models: Optional[List[str]] = None,\n exclude: Optional[List[str]] = None,\n **kwargs,\n ) -> DbtRpcOutput:\n """Sends a request with the method ``list`` to the dbt RPC server, and returns the\n response. For more details, see the dbt docs for `list\n <https://docs.getdbt.com/reference/commands/rpc/#list>`_.\n\n Args:\n select (List[str], optional): the resources to include in the output.\n models (List[str], optional): the models to include in the output.\n exclude (List[str], optional): the resources to exclude from the output.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n\n explicit_params = dict(models=models, exclude=exclude)\n params = self._format_params({**explicit_params, **kwargs})\n data = self._default_request(method="list", params=params)\n\n return self._get_result(data=json.dumps(data))
\n\n
[docs] def poll(self, request_token: str, logs: bool = False, logs_start: int = 0) -> DbtRpcOutput:\n """Sends a request with the method ``poll`` to the dbt RPC server, and returns the response.\n For more details, see the dbt docs for the RPC method `poll\n <https://docs.getdbt.com/reference/commands/rpc/#poll>`_.\n\n Args:\n request_token (str): the token to poll responses for.\n logs (bool): Whether logs should be returned in the response. Defaults to ``False``.\n logs_start (int): The zero-indexed log line to fetch logs from. Defaults to ``0``.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n data = self._default_request(method="poll")\n data["params"] = {"request_token": request_token, "logs": logs, "logs_start": logs_start}\n return self._post(data=json.dumps(data))
\n\n
[docs] def ps(self, completed: bool = False) -> DbtRpcOutput:\n """Sends a request with the method ``ps`` to the dbt RPC server, and returns the response.\n For more details, see the dbt docs for the RPC method `ps\n <https://docs.getdbt.com/reference/commands/rpc/#ps>`_.\n\n Args:\n completed (bool): If ``True``, then also return completed tasks. Defaults to ``False``.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n data = self._default_request(method="ps")\n data["params"] = {"completed": completed}\n return self._post(data=json.dumps(data))
\n\n
[docs] def kill(self, task_id: str) -> DbtRpcOutput:\n """Sends a request with the method ``kill`` to the dbt RPC server, and returns the response.\n For more details, see the dbt docs for the RPC method `kill\n <https://docs.getdbt.com/reference/commands/rpc/#kill>`_.\n\n Args:\n task_id (str): the ID of the task to terminate.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n data = self._default_request(method="kill")\n data["params"] = {"task_id": task_id}\n return self._post(data=json.dumps(data))
\n\n
[docs] def cli(self, command: str, **kwargs) -> DbtRpcOutput:\n """Sends a request with CLI syntax to the dbt RPC server, and returns the response.\n For more details, see the dbt docs for `running CLI commands via RPC\n <https://docs.getdbt.com/reference/commands/rpc/#running-a-task-with-cli-syntax>`_.\n\n Args:\n command (str): a dbt command in CLI syntax.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n params = self._format_params({"cli": command, **kwargs})\n data = self._default_request(method="cli_args", params=params)\n\n return self._get_result(data=json.dumps(data))
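``cli`` and ``poll`` together support an asynchronous request/poll workflow; a minimal sketch with a placeholder server address and model selection:

.. code-block:: python

    import time

    from dagster_dbt.rpc.resources import DbtRpcResource

    client = DbtRpcResource(host="127.0.0.1", port=8580)  # placeholder server address

    # Kick off a command using CLI syntax; the server replies with a request token.
    out = client.cli("run --models my_model")
    request_token = out.result["request_token"]

    # Poll until the request is no longer in the "running" state.
    while client.poll(request_token=request_token, logs=False).result.get("state") == "running":
        time.sleep(5)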
\n\n
[docs] def compile(\n self, models: Optional[List[str]] = None, exclude: Optional[List[str]] = None, **kwargs\n ) -> DbtRpcOutput:\n """Sends a request with the method ``compile`` to the dbt RPC server, and returns the\n response. For more details, see the dbt docs for `compiling projects via RPC\n <https://docs.getdbt.com/reference/commands/rpc/#compile-a-project>`_.\n\n Args:\n models (List[str], optional): the models to include in compilation.\n exclude (List[str], optional): the models to exclude from compilation.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n\n explicit_params = dict(models=models, exclude=exclude)\n params = self._format_params({**explicit_params, **kwargs})\n data = self._default_request(method="compile", params=params)\n\n return self._get_result(data=json.dumps(data))
\n\n
[docs] def run(\n self, models: Optional[List[str]] = None, exclude: Optional[List[str]] = None, **kwargs\n ) -> DbtRpcOutput:\n """Sends a request with the method ``run`` to the dbt RPC server, and returns the response.\n For more details, see the dbt docs for the RPC method `run\n <https://docs.getdbt.com/reference/commands/rpc/#run-models>`_.\n\n Args:\n models (List[str], optional): the models to include in the run.\n exclude (List[str], optional): the models to exclude from the run.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n explicit_params = dict(models=models, exclude=exclude)\n params = self._format_params({**explicit_params, **kwargs})\n data = self._default_request(method="run", params=params)\n\n return self._get_result(data=json.dumps(data))
\n\n
[docs] def snapshot(\n self, select: Optional[List[str]] = None, exclude: Optional[List[str]] = None, **kwargs\n ) -> DbtRpcOutput:\n """Sends a request with the method ``snapshot`` to the dbt RPC server, and returns the\n response. For more details, see the dbt docs for the command `snapshot\n <https://docs.getdbt.com/reference/commands/snapshot>`_.\n\n Args:\n select (List[str], optional): the snapshots to include in the run.\n exclude (List[str], optional): the snapshots to exclude from the run.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n explicit_params = dict(select=select, exclude=exclude)\n params = self._format_params({**explicit_params, **kwargs})\n data = self._default_request(method="snapshot", params=params)\n\n return self._get_result(data=json.dumps(data))
\n\n
[docs] def test(\n self,\n models: Optional[List[str]] = None,\n exclude: Optional[List[str]] = None,\n data: bool = True,\n schema: bool = True,\n **kwargs,\n ) -> DbtRpcOutput:\n """Sends a request with the method ``test`` to the dbt RPC server, and returns the response.\n For more details, see the dbt docs for the RPC method `test\n <https://docs.getdbt.com/reference/commands/rpc/#run-test>`_.\n\n Args:\n models (List[str], optional): the models to include in testing.\n exclude (List[str], optional): the models to exclude from testing.\n data (bool, optional): If ``True`` (default), then run data tests.\n schema (bool, optional): If ``True`` (default), then run schema tests.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n explicit_params = dict(models=models, exclude=exclude, data=data, schema=schema)\n params = self._format_params({**explicit_params, **kwargs})\n data = self._default_request(method="test", params=params)\n\n return self._get_result(data=json.dumps(data))
\n\n
[docs] def seed(\n self,\n show: bool = False,\n select: Optional[List[str]] = None,\n exclude: Optional[List[str]] = None,\n **kwargs,\n ) -> DbtRpcOutput:\n """Sends a request with the method ``seed`` to the dbt RPC server, and returns the response.\n For more details, see the dbt docs for the RPC method `seed\n <https://docs.getdbt.com/reference/commands/rpc/#run-seed>`_.\n\n Args:\n show (bool, optional): If ``True``, then show a sample of the seeded data in the\n response. Defaults to ``False``.\n select (List[str], optional): the seeds to include in the run.\n exclude (List[str], optional): the seeds to exclude from the run.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n data = self._default_request(method="seed")\n data["params"] = {"show": show}\n\n if kwargs is not None:\n data["params"]["task_tags"] = kwargs\n\n return self._get_result(data=json.dumps(data))
\n\n
[docs] def generate_docs(\n self,\n compile_project: bool = False,\n **kwargs,\n ) -> DbtRpcOutput:\n """Sends a request with the method ``docs.generate`` to the dbt RPC server, and returns the\n response. For more details, see the dbt docs for the RPC method `docs.generate\n <https://docs.getdbt.com/reference/commands/rpc/#generate-docs>`_.\n\n Args:\n compile_project (bool, optional): If true, compile the project before generating a catalog.\n\n """\n explicit_params = dict(compile=compile_project)\n params = self._format_params({**explicit_params, **kwargs})\n data = self._default_request(method="docs.generate", params=params)\n\n return self._get_result(data=json.dumps(data))
\n\n
[docs] def run_operation(\n self, macro: str, args: Optional[Dict[str, Any]] = None, **kwargs\n ) -> DbtRpcOutput:\n """Sends a request with the method ``run-operation`` to the dbt RPC server, and returns the\n response. For more details, see the dbt docs for the command `run-operation\n <https://docs.getdbt.com/reference/commands/run-operation>`_.\n\n Args:\n macro (str): the dbt macro to invoke.\n args (Dict[str, Any], optional): the keyword arguments to be supplied to the macro.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n explicit_params = dict(macro=macro, args=args)\n params = self._format_params({**explicit_params, **kwargs})\n data = self._default_request(method="run-operation", params=params)\n\n return self._get_result(data=json.dumps(data))
\n\n
[docs] def snapshot_freshness(self, select: Optional[List[str]] = None, **kwargs) -> DbtRpcOutput:\n """Sends a request with the method ``snapshot-freshness`` to the dbt RPC server, and returns\n the response. For more details, see the dbt docs for the command `source snapshot-freshness\n <https://docs.getdbt.com/reference/commands/source#dbt-source-snapshot-freshness>`_.\n\n Args:\n select (List[str], optional): the models to include in calculating snapshot freshness.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n explicit_params = dict(select=select)\n params = self._format_params({**explicit_params, **kwargs})\n data = self._default_request(method="snapshot-freshness", params=params)\n\n return self._get_result(data=json.dumps(data))
\n\n
[docs] def compile_sql(self, sql: str, name: str) -> DbtRpcOutput:\n """Sends a request with the method ``compile_sql`` to the dbt RPC server, and returns the\n response. For more details, see the dbt docs for `compiling SQL via RPC\n <https://docs.getdbt.com/reference/commands/rpc#compiling-a-query>`_.\n\n Args:\n sql (str): the SQL to compile in base-64 encoding.\n name (str): a name for the compiled SQL.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n explicit_params = dict(sql=b64(sql.encode("utf-8")).decode("utf-8"), name=name)\n params = self._format_params(explicit_params)\n data = self._default_request(method="compile_sql", params=params)\n\n return self._get_result(data=json.dumps(data))
\n\n
[docs] def run_sql(self, sql: str, name: str) -> DbtRpcOutput:\n """Sends a request with the method ``run_sql`` to the dbt RPC server, and returns the\n response. For more details, see the dbt docs for `running SQL via RPC\n <https://docs.getdbt.com/reference/commands/rpc#executing-a-query>`_.\n\n Args:\n sql (str): the SQL to run in base-64 encoding.\n name (str): a name for the compiled SQL.\n\n Returns:\n Response: the HTTP response from the dbt RPC server.\n """\n explicit_params = dict(sql=b64(sql.encode("utf-8")).decode("utf-8"), name=name)\n params = self._format_params(explicit_params)\n data = self._default_request(method="run_sql", params=params)\n\n return self._get_result(data=json.dumps(data))
\n\n
[docs] def build(self, select: Optional[List[str]] = None, **kwargs) -> DbtRpcOutput:\n """\n Run the ``build`` command on a dbt project. kwargs are passed in as additional parameters.\n\n Args:\n select (List[str], optional): the models/resources to include in the run.\n\n Returns:\n DbtOutput: object containing parsed output from dbt\n """\n ... # pylint: disable=unnecessary-ellipsis\n raise NotImplementedError()
\n\n
[docs] def get_run_results_json(self, **kwargs) -> Optional[Dict[str, Any]]:\n """\n Get a parsed version of the run_results.json file for the relevant dbt project.\n\n Returns:\n Dict[str, Any]: dictionary containing the parsed contents of the run_results json file\n for this dbt project.\n """\n ... # pylint: disable=unnecessary-ellipsis\n raise NotImplementedError()
\n\n
[docs] def get_manifest_json(self, **kwargs) -> Optional[Dict[str, Any]]:\n """\n Get a parsed version of the manifest.json file for the relevant dbt project.\n\n Returns:\n Dict[str, Any]: dictionary containing the parsed contents of the manifest json file\n for this dbt project.\n """\n ... # pylint: disable=unnecessary-ellipsis\n raise NotImplementedError()
\n\n\n
[docs]class DbtRpcSyncResource(DbtRpcResource):\n def __init__(\n self,\n host: str = "0.0.0.0",\n port: int = 8580,\n jsonrpc_version: str = "2.0",\n logger: Optional[Any] = None,\n poll_interval: int = 1,\n **_,\n ):\n """Constructor\n\n Args:\n host (str): The IP address of the host of the dbt RPC server. Default is ``"0.0.0.0"``.\n port (int): The port of the dbt RPC server. Default is ``8580``.\n jsonrpc_version (str): The JSON-RPC version to send in RPC requests.\n Default is ``"2.0"``.\n logger (Optional[Any]): A property for injecting a logger dependency.\n Default is ``None``.\n poll_interval (int): The polling interval in seconds.\n """\n super().__init__(host, port, jsonrpc_version, logger)\n self.poll_interval = poll_interval\n\n def _get_result(self, data: Optional[str] = None) -> DbtRpcOutput:\n """Sends a request to the dbt RPC server and continuously polls for the status of a request\n until the state is ``success``."""\n\n out = super()._get_result(data)\n request_token: str = check.not_none(out.result.get("request_token"))\n\n logs_start = 0\n\n elapsed_time = -1\n current_state = None\n\n while True:\n out = self.poll(\n request_token=request_token,\n logs=True,\n logs_start=logs_start,\n )\n logs = out.result.get("logs", [])\n for log in logs:\n self.logger.log(\n msg=log["message"],\n level=coerce_valid_log_level(log.get("levelname", "INFO")),\n extra=log.get("extra"),\n )\n logs_start += len(logs)\n\n current_state = out.result.get("state")\n # Stop polling if request's state is no longer "running".\n if current_state != "running":\n break\n\n elapsed_time = out.result.get("elapsed", 0)\n # Sleep for the configured time interval before polling again.\n time.sleep(self.poll_interval)\n\n if current_state != "success":\n raise Failure(\n description=(\n f"Request {request_token} finished with state '{current_state}' in "\n f"{elapsed_time} seconds"\n ),\n )\n\n return out
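The synchronous client can also be used directly outside of a Dagster resource; a short sketch with placeholder connection details:

.. code-block:: python

    from dagster_dbt.rpc.resources import DbtRpcSyncResource

    # Placeholder host/port; poll the server every 2 seconds.
    sync_client = DbtRpcSyncResource(host="127.0.0.1", port=8580, poll_interval=2)

    # Blocks until the dbt RPC server reports the run has finished; raises Failure on error.
    output = sync_client.run(models=["my_model"])
    print(output.result.get("state"))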
\n\n\n
[docs]@resource(\n description="A resource representing a dbt RPC client.",\n config_schema={\n "host": Field(StringSource),\n "port": Field(IntSource, is_required=False, default_value=8580),\n },\n)\ndef dbt_rpc_resource(context) -> DbtRpcResource:\n """This resource defines a dbt RPC client.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/concepts/configuration/configured>`_ method.\n\n Examples:\n\n Examples:\n\n .. code-block:: python\n\n from dagster_dbt import dbt_rpc_resource\n\n custom_dbt_rpc_resource = dbt_rpc_resource.configured({"host": "80.80.80.80","port": 8080,})\n\n @job(resource_defs={"dbt_rpc": custom_dbt_rpc_sync_resource})\n def dbt_rpc_job():\n # Run ops with `required_resource_keys={"dbt_rpc", ...}`.\n\n """\n return DbtRpcResource(\n host=context.resource_config["host"], port=context.resource_config["port"]\n )
\n\n\n
[docs]@resource(\n description="A resource representing a synchronous dbt RPC client.",\n config_schema={\n "host": Field(StringSource),\n "port": Field(IntSource, is_required=False, default_value=8580),\n "poll_interval": Field(IntSource, is_required=False, default_value=1),\n },\n)\ndef dbt_rpc_sync_resource(\n context,\n) -> DbtRpcSyncResource:\n """This resource defines a synchronous dbt RPC client, which sends requests to a dbt RPC server,\n and waits for the request to complete before returning.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/concepts/configuration/configured>`_ method.\n\n Examples:\n\n .. code-block:: python\n\n from dagster_dbt import dbt_rpc_sync_resource\n\n custom_sync_dbt_rpc_resource = dbt_rpc_sync_resource.configured({"host": "80.80.80.80","port": 8080,})\n\n @job(resource_defs={"dbt_rpc": custom_dbt_rpc_sync_resource})\n def dbt_rpc_sync_job():\n # Run ops with `required_resource_keys={"dbt_rpc", ...}`.\n\n """\n return DbtRpcSyncResource(\n host=context.resource_config["host"],\n port=context.resource_config["port"],\n poll_interval=context.resource_config["poll_interval"],\n )
\n\n\nlocal_dbt_rpc_resource = dbt_rpc_resource.configured({"host": "0.0.0.0", "port": 8580})\nlocal_dbt_rpc_resource.__doc__ = """This resource defines a dbt RPC client for an RPC server running\non 0.0.0.0:8580."""\n
", "current_page_name": "_modules/dagster_dbt/rpc/resources", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dbt.rpc.resources"}, "solids": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dbt.rpc.solids

\nimport json\nimport time\nfrom typing import Callable, Optional\n\nimport pandas as pd\nfrom dagster_pandas import DataFrame\n\nfrom dagster import (\n    Array,\n    Bool,\n    DagsterInvalidDefinitionError,\n    Failure,\n    Field,\n    InputDefinition,\n    Int,\n    Noneable,\n    Nothing,\n    Output,\n    OutputDefinition,\n    Permissive,\n    RetryRequested,\n    String,\n)\nfrom dagster import _check as check\nfrom dagster import solid\nfrom dagster.core.execution.context.compute import SolidExecutionContext\n\nfrom ..errors import DagsterDbtRpcUnexpectedPollOutputError\nfrom .types import DbtRpcOutput\nfrom .utils import log_rpc, raise_for_rpc_error\n\n\ndef _poll_rpc(\n    context: SolidExecutionContext, request_token: str, should_yield_materializations: bool = True\n):\n    """Polls the dbt RPC server for the status of a request until the state is ``success``."""\n    from ..utils import generate_materializations\n\n    logs_start = 0\n    interval = context.solid_config.get("interval")\n\n    elapsed_time = -1\n    current_state = None\n\n    while True:\n        # Poll for the dbt RPC request.\n        context.log.debug(f"RequestToken: {request_token}")\n        out = context.resources.dbt_rpc.poll(\n            request_token=request_token, logs=context.solid_config["logs"], logs_start=logs_start\n        )\n        raise_for_rpc_error(context, out.response)\n\n        # Pass dbt RPC logs into the Dagster/Dagit logger.\n        if context.solid_config["logs"]:\n            logs = out.result.get("logs", [])\n            if len(logs) > 0:\n                log_rpc(context, logs)\n            logs_start += len(logs)\n\n        current_state = out.result.get("state")\n        # Stop polling if request's state is no longer "running".\n        if current_state != "running":\n            break\n\n        elapsed_time = out.result.get("elapsed", 0)\n        # Sleep for the configured time interval before polling again.\n        context.log.debug(\n            f"Request {request_token} currently in state '{current_state}' (elapsed time "\n            f"{elapsed_time} seconds). Sleeping for {interval}s..."\n        )\n        time.sleep(interval)\n\n    if current_state != "success":\n        raise Failure(\n            description=(\n                f"Request {request_token} finished with state '{current_state}' in "\n                f"{elapsed_time} seconds"\n            ),\n        )\n\n    context.log.info(\n        f"Request {request_token} finished with state '{current_state}' in {elapsed_time} seconds"\n    )\n    context.log.debug(json.dumps(out.result, indent=2))\n\n    if should_yield_materializations:\n        for materialization in generate_materializations(out):\n            yield materialization\n\n    yield Output(out)\n\n\ndef unwrap_result(poll_rpc_generator) -> DbtRpcOutput:\n    """A helper function that extracts the `DbtRpcOutput` value from a generator.\n\n    The parameter `poll_rpc_generator` is expected to be an invocation of `_poll_rpc`.\n    """\n    output = None\n    for x in poll_rpc_generator:\n        output = x\n\n    if output is None:\n        raise DagsterDbtRpcUnexpectedPollOutputError(\n            description="poll_rpc yielded None as its last value. Expected value of type Output containing DbtRpcOutput.",\n        )\n\n    if not isinstance(output, Output):\n        raise DagsterDbtRpcUnexpectedPollOutputError(\n            description=f"poll_rpc yielded value of type {type(output)} as its last value. 
Expected value of type Output containing DbtRpcOutput.",\n        )\n\n    if not isinstance(output.value, DbtRpcOutput):\n        raise DagsterDbtRpcUnexpectedPollOutputError(\n            description=f"poll_rpc yielded Output containing {type(output.value)}. Expected DbtRpcOutput.",\n        )\n\n    return output.value\n\n\n
[docs]@solid(\n description="A solid to invoke dbt run over RPC.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[\n OutputDefinition(\n name="request_token",\n dagster_type=String,\n description="The request token of the invoked dbt run.",\n )\n ],\n config_schema={\n "models": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to run.",\n ),\n "exclude": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_run(context: SolidExecutionContext) -> String:\n """This solid sends the ``dbt run`` command to a dbt RPC server and returns the request token.\n\n This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\n poll the progress of the running dbt process.\n """\n out = context.resources.dbt_rpc.run(\n models=context.solid_config["models"], exclude=context.solid_config["exclude"]\n )\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n return out.result.get("request_token")
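A sketch of wiring this solid into a job. The resource host and port are placeholders, and it is assumed that the solid is importable from the package root and that solid definitions in this version of the library can be composed inside a ``@job`` body in the same way as ops.

.. code-block:: python

    from dagster import job
    from dagster_dbt import dbt_rpc_resource, dbt_rpc_run

    my_dbt_rpc = dbt_rpc_resource.configured({"host": "127.0.0.1", "port": 8580})

    @job(resource_defs={"dbt_rpc": my_dbt_rpc})
    def fire_and_forget_dbt_run():
        # Emits the request token; the dbt process keeps running on the RPC server.
        dbt_rpc_run()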
\n\n\n
[docs]@solid(\n description="A solid to invoke dbt run over RPC and poll the resulting RPC process until it's complete.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="result", dagster_type=DbtRpcOutput)],\n config_schema={\n "models": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to run.",\n ),\n "exclude": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n "full_refresh": Field(\n config=Bool,\n description="Whether or not to perform a --full-refresh.",\n is_required=False,\n default_value=False,\n ),\n "fail_fast": Field(\n config=Bool,\n description="Whether or not to --fail-fast.",\n is_required=False,\n default_value=False,\n ),\n "warn_error": Field(\n config=Bool,\n description="Whether or not to --warn-error.",\n is_required=False,\n default_value=False,\n ),\n "interval": Field(\n config=Int,\n is_required=False,\n default_value=10,\n description="The interval (in seconds) at which to poll the dbt rpc process.",\n ),\n "logs": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description="Whether or not to return logs from the process.",\n ),\n "task_tags": Permissive(),\n "max_retries": Field(config=Int, is_required=False, default_value=5),\n "retry_interval": Field(config=Int, is_required=False, default_value=120),\n "yield_materializations": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation will "\n "be yielded when the solid executes. Default: True"\n ),\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_run_and_wait(context: SolidExecutionContext) -> DbtRpcOutput:\n """This solid sends the ``dbt run`` command to a dbt RPC server and returns the result of the\n executed dbt process.\n\n This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\n process is completed.\n """\n if context.solid_config["task_tags"]:\n results = context.resources.dbt_rpc.ps().json()\n for task in results["result"]["rows"]:\n if task["tags"] == context.solid_config["task_tags"]:\n context.log.warning(\n f"RPC task with tags {json.dumps(task['tags'])} currently running."\n )\n raise RetryRequested(\n max_retries=context.solid_config["max_retries"],\n seconds_to_wait=context.solid_config["retry_interval"],\n )\n\n command = ""\n\n if context.solid_config["warn_error"]:\n command += " --warn-error"\n\n command += " run"\n\n if context.solid_config["models"]:\n models = " ".join(set(context.solid_config["models"]))\n command += f" --models {models}"\n\n if context.solid_config["exclude"]:\n exclude = " ".join(set(context.solid_config["exclude"]))\n command += f" --exclude {exclude}"\n\n if context.solid_config["full_refresh"]:\n command += " --full-refresh"\n\n if context.solid_config["fail_fast"]:\n command += " --fail-fast"\n\n context.log.debug(f"Running dbt command: dbt {command}")\n out = context.resources.dbt_rpc.cli(\n command=command, task_tags=context.solid_config["task_tags"]\n )\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n request_token = out.result.get("request_token")\n return _poll_rpc(\n context,\n request_token,\n should_yield_materializations=context.solid_config["yield_materializations"],\n )
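To pre-bake config for this solid, ``configured`` can be used; a new name is supplied so the configured copy does not collide with the original definition. The model selector and polling interval below are placeholders, and the imports assume the solid and resource are exposed at the package root.

.. code-block:: python

    from dagster import job
    from dagster_dbt import dbt_rpc_resource, dbt_rpc_run_and_wait

    my_dbt_rpc = dbt_rpc_resource.configured({"host": "127.0.0.1", "port": 8580})

    # Runs only the placeholder selection, polling the RPC server every 5 seconds.
    run_staging_models = dbt_rpc_run_and_wait.configured(
        {"models": ["staging_orders"], "interval": 5},
        name="run_staging_models",
    )

    @job(resource_defs={"dbt_rpc": my_dbt_rpc})
    def dbt_staging_job():
        run_staging_models()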
\n\n\n
[docs]@solid(\n description="A solid to invoke dbt test over RPC.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[\n OutputDefinition(\n name="request_token",\n dagster_type=String,\n description="The request token of the invoked dbt test.",\n )\n ],\n config_schema={\n "models": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to test.",\n ),\n "exclude": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n "data": Field(\n config=Bool,\n default_value=True,\n is_required=False,\n description="Whether or not to run custom data tests.",\n ),\n "schema": Field(\n config=Bool,\n default_value=True,\n is_required=False,\n description="Whether or not to run schema tests.",\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_test(context: SolidExecutionContext) -> String:\n """This solid sends the ``dbt test`` command to a dbt RPC server and returns the request token.\n\n This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\n poll the progress of the running dbt process.\n """\n out = context.resources.dbt_rpc.test(\n models=context.solid_config["models"],\n exclude=context.solid_config["exclude"],\n data=context.solid_config["data"],\n schema=context.solid_config["schema"],\n )\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n return out.result.get("request_token")
\n\n\n
[docs]@solid(\n description=(\n "A solid to invoke dbt test over RPC and poll the resulting RPC process until it's "\n "complete."\n ),\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="result", dagster_type=DbtRpcOutput)],\n config_schema={\n "models": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to test.",\n ),\n "exclude": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n "data": Field(\n config=Bool,\n default_value=True,\n is_required=False,\n description="Whether or not to run custom data tests.",\n ),\n "schema": Field(\n config=Bool,\n default_value=True,\n is_required=False,\n description="Whether or not to run schema tests.",\n ),\n "interval": Field(\n config=Int,\n is_required=False,\n default_value=10,\n description="The interval (in seconds) at which to poll the dbt rpc process.",\n ),\n "logs": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description="Whether or not to return logs from the process.",\n ),\n "yield_materializations": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation will "\n "be yielded when the solid executes. Default: True"\n ),\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_test_and_wait(context: SolidExecutionContext) -> DbtRpcOutput:\n """This solid sends the ``dbt test`` command to a dbt RPC server and returns the result of the\n executed dbt process.\n\n This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\n process is completed.\n """\n out = context.resources.dbt_rpc.test(\n models=context.solid_config["models"],\n exclude=context.solid_config["exclude"],\n data=context.solid_config["data"],\n schema=context.solid_config["schema"],\n )\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n request_token = out.result.get("request_token")\n return _poll_rpc(\n context,\n request_token,\n should_yield_materializations=context.solid_config["yield_materializations"],\n )
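The ``start_after`` input of type ``Nothing`` exists purely for ordering, so a run-then-test sequence can be sketched by wiring one solid's output into the other's ``start_after``; the host and port are placeholders and the imports assume package-root exposure.

.. code-block:: python

    from dagster import job
    from dagster_dbt import dbt_rpc_resource, dbt_rpc_run_and_wait, dbt_rpc_test_and_wait

    my_dbt_rpc = dbt_rpc_resource.configured({"host": "127.0.0.1", "port": 8580})

    @job(resource_defs={"dbt_rpc": my_dbt_rpc})
    def dbt_run_then_test():
        # Tests start only after the run has finished; no value is passed at runtime.
        dbt_rpc_test_and_wait(start_after=dbt_rpc_run_and_wait())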
\n\n\n
[docs]@solid(\n description="A solid to invoke a dbt run operation over RPC.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[\n OutputDefinition(\n name="request_token",\n dagster_type=String,\n description="The request token of the invoked dbt run operation.",\n )\n ],\n config_schema={\n "macro": Field(\n config=String,\n is_required=True,\n description="The dbt macro to invoke as a run operation",\n ),\n "args": Field(\n config=Noneable(Permissive()),\n is_required=False,\n default_value=None,\n description="Arguments to supply to the invoked macro.",\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_run_operation(context: SolidExecutionContext) -> String:\n """This solid sends the ``dbt run-operation`` command to a dbt RPC server and returns the\n request token.\n\n This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\n poll the progress of the running dbt process.\n """\n out = context.resources.dbt_rpc.run_operation(\n macro=context.solid_config["macro"], args=context.solid_config["args"]\n )\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n return out.result.get("request_token")
\n\n\n
[docs]@solid(\n description=(\n "A solid to invoke a dbt run operation over RPC and poll the resulting RPC process until "\n "it's complete."\n ),\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="result", dagster_type=DbtRpcOutput)],\n config_schema={\n "macro": Field(\n config=String,\n is_required=True,\n description="The dbt macro to invoke as a run operation",\n ),\n "args": Field(\n config=Noneable(Permissive()),\n is_required=False,\n default_value=None,\n description="Arguments to supply to the invoked macro.",\n ),\n "interval": Field(\n config=Int,\n is_required=False,\n default_value=10,\n description="The interval (in seconds) at which to poll the dbt rpc process.",\n ),\n "logs": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description="Whether or not to return logs from the process.",\n ),\n "yield_materializations": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation will "\n "be yielded when the solid executes. Default: True"\n ),\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_run_operation_and_wait(context: SolidExecutionContext) -> DbtRpcOutput:\n """This solid sends the ``dbt run-operation`` command to a dbt RPC server and returns the\n result of the executed dbt process.\n\n This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\n process is completed.\n """\n out = context.resources.dbt_rpc.run_operation(\n macro=context.solid_config["macro"], args=context.solid_config["args"]\n )\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n request_token = out.result.get("request_token")\n return _poll_rpc(\n context,\n request_token,\n should_yield_materializations=context.solid_config["yield_materializations"],\n )
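A short sketch of configuring this solid; ``grant_select`` and its arguments are hypothetical names for a macro in your dbt project, and the import assumes the solid is exposed at the package root.

.. code-block:: python

    from dagster_dbt import dbt_rpc_run_operation_and_wait

    # "grant_select" is a hypothetical macro defined in the dbt project.
    grant_select = dbt_rpc_run_operation_and_wait.configured(
        {"macro": "grant_select", "args": {"role": "reporting"}},
        name="grant_select",
    )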
\n\n\n
[docs]@solid(\n description="A solid to invoke a dbt snapshot over RPC.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[\n OutputDefinition(\n name="request_token",\n dagster_type=String,\n description="The request token of the invoked dbt snapshot.",\n )\n ],\n config_schema={\n "select": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt snapshot files to snapshot.",\n ),\n "exclude": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt snapshot files to exclude from the snapshot.",\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_snapshot(context: SolidExecutionContext) -> String:\n """This solid sends the ``dbt snapshot`` command to a dbt RPC server and returns the\n request token.\n\n This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\n poll the progress of the running dbt process.\n """\n out = context.resources.dbt_rpc.snapshot(\n select=context.solid_config["select"], exclude=context.solid_config["exclude"]\n )\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n return out.result.get("request_token")
\n\n\n
[docs]@solid(\n description=(\n "A solid to invoke a dbt snapshot over RPC and poll the resulting RPC process until "\n "it's complete."\n ),\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="result", dagster_type=DbtRpcOutput)],\n config_schema={\n "select": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt snapshot files to snapshot.",\n ),\n "exclude": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt snapshot files to exclude from the snapshot.",\n ),\n "interval": Field(\n config=Int,\n is_required=False,\n default_value=10,\n description="The interval (in seconds) at which to poll the dbt rpc process.",\n ),\n "logs": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description="Whether or not to return logs from the process.",\n ),\n "task_tags": Permissive(),\n "max_retries": Field(config=Int, is_required=False, default_value=5),\n "retry_interval": Field(config=Int, is_required=False, default_value=120),\n "yield_materializations": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation will "\n "be yielded when the solid executes. Default: True"\n ),\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_snapshot_and_wait(context: SolidExecutionContext) -> DbtRpcOutput:\n """This solid sends the ``dbt snapshot`` command to a dbt RPC server and returns the result of\n the executed dbt process.\n\n This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\n process is completed.\n """\n if context.solid_config["task_tags"]:\n results = context.resources.dbt_rpc.ps().json()\n for task in results["result"]["rows"]:\n if task["tags"] == context.solid_config["task_tags"]:\n context.log.warning(\n f"RPC task with tags {json.dumps(task['tags'])} currently running."\n )\n raise RetryRequested(\n max_retries=context.solid_config["max_retries"],\n seconds_to_wait=context.solid_config["retry_interval"],\n )\n\n out = context.resources.dbt_rpc.snapshot(\n select=context.solid_config["select"], exclude=context.solid_config["exclude"]\n )\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n request_token = out.result.get("request_token")\n return _poll_rpc(\n context,\n request_token,\n should_yield_materializations=context.solid_config["yield_materializations"],\n )
\n\n\n
[docs]@solid(\n description="A solid to invoke dbt source snapshot-freshness over RPC.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[\n OutputDefinition(\n name="request_token",\n dagster_type=String,\n description="The request token of the invoked dbt snapshot.",\n )\n ],\n config_schema={\n "select": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt sources to snapshot-freshness for.",\n ),\n "warn_error": Field(\n config=Bool,\n description="Whether or not to --warn-error.",\n is_required=False,\n default_value=False,\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_snapshot_freshness(context: SolidExecutionContext) -> String:\n """This solid sends the ``dbt source snapshot-freshness`` command to a dbt RPC server and\n returns the request token.\n\n This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\n poll the progress of the running dbt process.\n """\n command = ""\n\n if context.solid_config["warn_error"]:\n command += " --warn-error"\n\n command += " source snapshot-freshness"\n\n if context.solid_config["select"]:\n select = " ".join(set(context.solid_config["select"]))\n command += f" --select {select}"\n\n context.log.debug(f"Running dbt command: dbt {command}")\n out = context.resources.dbt_rpc.cli(command=command)\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n return out.result.get("request_token")
\n\n\n
[docs]@solid(\n description=(\n "A solid to invoke dbt source snapshot-freshness over RPC and poll the resulting "\n "RPC process until it's complete."\n ),\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="result", dagster_type=DbtRpcOutput)],\n config_schema={\n "select": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt sources to snapshot-freshness for.",\n ),\n "warn_error": Field(\n config=Bool,\n description="Whether or not to --warn-error.",\n is_required=False,\n default_value=False,\n ),\n "interval": Field(\n config=Int,\n is_required=False,\n default_value=10,\n description="The interval (in seconds) at which to poll the dbt rpc process.",\n ),\n "logs": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description="Whether or not to return logs from the process.",\n ),\n "yield_materializations": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation will "\n "be yielded when the solid executes. Default: True"\n ),\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_snapshot_freshness_and_wait(context: SolidExecutionContext) -> DbtRpcOutput:\n """This solid sends the ``dbt source snapshot`` command to a dbt RPC server and returns the\n result of the executed dbt process.\n\n This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\n process is completed.\n """\n command = ""\n\n if context.solid_config["warn_error"]:\n command += " --warn-error"\n\n command += " source snapshot-freshness"\n\n if context.solid_config["select"]:\n select = " ".join(set(context.solid_config["select"]))\n command += f" --select {select}"\n\n context.log.debug(f"Running dbt command: dbt {command}")\n out = context.resources.dbt_rpc.cli(command=command)\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n request_token = out.result.get("request_token")\n return _poll_rpc(\n context,\n request_token,\n should_yield_materializations=context.solid_config["yield_materializations"],\n )
\n\n\n
[docs]@solid(\n    description="A solid to compile a SQL query in context of a dbt project over RPC.",\n    input_defs=[\n        InputDefinition(name="start_after", dagster_type=Nothing),\n        InputDefinition(\n            name="sql", description="The SQL query to be compiled.", dagster_type=String\n        ),\n    ],\n    output_defs=[\n        OutputDefinition(name="sql", description="The compiled SQL query.", dagster_type=String)\n    ],\n    config_schema={\n        "name": Field(config=String),\n        "interval": Field(\n            config=Int,\n            is_required=False,\n            default_value=10,\n            description="The interval (in seconds) at which to poll the dbt rpc process.",\n        ),\n        "logs": Field(\n            config=Bool,\n            is_required=False,\n            default_value=True,\n            description="Whether or not to return logs from the process.",\n        ),\n        "yield_materializations": Field(\n            config=Bool,\n            is_required=False,\n            default_value=True,\n            description=(\n                "If True, materializations corresponding to the results of the dbt operation will "\n                "be yielded when the solid executes. Default: True"\n            ),\n        ),\n    },\n    required_resource_keys={"dbt_rpc"},\n    tags={"kind": "dbt"},\n)\ndef dbt_rpc_compile_sql(context: SolidExecutionContext, sql: String) -> DbtRpcOutput:\n    """This solid sends the ``dbt compile`` command to a dbt RPC server and returns the result of\n    the executed dbt process.\n\n    This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\n    process is completed.\n    """\n    out = context.resources.dbt_rpc.compile_sql(sql=sql, name=context.solid_config["name"])\n    context.log.debug(out.response.text)\n    raise_for_rpc_error(context, out.response)\n    request_token = out.result.get("request_token")\n    return unwrap_result(\n        _poll_rpc(\n            context,\n            request_token,\n            should_yield_materializations=context.solid_config["yield_materializations"],\n        )\n    )
\n\n\n
[docs]def create_dbt_rpc_run_sql_solid(\n name: str, output_def: Optional[OutputDefinition] = None, **kwargs\n) -> Callable:\n """This function is a factory which constructs a solid that will copy the results of a SQL query\n run within the context of a dbt project to a pandas ``DataFrame``.\n\n Any kwargs passed to this function will be passed along to the underlying :func:`@solid\n <dagster.solid>` decorator. However, note that overriding ``config_schema``, ``input_defs``, and\n ``required_resource_keys`` is not allowed and will throw a :class:`DagsterInvalidDefinitionError\n <dagster.DagsterInvalidDefinitionError>`.\n\n If you would like to configure this solid with different config fields, you could consider using\n :func:`@composite_solid <dagster.composite_solid>` to wrap this solid.\n\n Args:\n name (str): The name of this solid.\n output_def (OutputDefinition, optional): The :class:`OutputDefinition\n <dagster.OutputDefinition>` for the solid. This value should always be a representation\n of a pandas ``DataFrame``. If not specified, the solid will default to an\n :class:`OutputDefinition <dagster.OutputDefinition>` named "df" with a ``DataFrame``\n dagster type.\n\n Returns:\n SolidDefinition: Returns the constructed solid definition.\n """\n check.str_param(obj=name, param_name="name")\n check.opt_inst_param(obj=output_def, param_name="output_def", ttype=OutputDefinition)\n\n if "config_schema" in kwargs:\n raise DagsterInvalidDefinitionError("Overriding config_schema is not supported.")\n\n if "input_defs" in kwargs:\n raise DagsterInvalidDefinitionError("Overriding input_defs is not supported.")\n\n if "required_resource_keys" in kwargs:\n raise DagsterInvalidDefinitionError("Overriding required_resource_keys is not supported.")\n\n @solid(\n name=name,\n description=kwargs.pop(\n "description",\n "A solid to run a SQL query in context of a dbt project over RPC and return the "\n "results in a pandas DataFrame.",\n ),\n input_defs=[\n InputDefinition(name="start_after", dagster_type=Nothing),\n InputDefinition(\n name="sql", description="The SQL query to be run.", dagster_type=String\n ),\n ],\n output_defs=[\n output_def\n or OutputDefinition(\n name="df", description="The results of the SQL query.", dagster_type=DataFrame\n )\n ],\n config_schema={\n "name": Field(config=String),\n "interval": Field(\n config=Int,\n is_required=False,\n default_value=10,\n description="The interval (in seconds) at which to poll the dbt rpc process.",\n ),\n "logs": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description="Whether or not to return logs from the process.",\n ),\n "yield_materializations": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation "\n "will be yielded when the solid executes. 
Default: True"\n ),\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n **kwargs,\n )\n def _dbt_rpc_run_sql(context: SolidExecutionContext, sql: String) -> pd.DataFrame:\n out = context.resources.dbt_rpc.run_sql(sql=sql, name=context.solid_config["name"])\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n request_token = out.result.get("request_token")\n out = unwrap_result(\n _poll_rpc(\n context,\n request_token,\n should_yield_materializations=context.solid_config["yield_materializations"],\n )\n )\n table = out.result["results"][0]["table"]\n return pd.DataFrame.from_records(data=table["rows"], columns=table["column_names"])\n\n return _dbt_rpc_run_sql
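A sketch of using the factory, assuming it and the synchronous resource are importable from the package root; the query text, names, and run config keys are placeholders. The generated solid still expects its required ``name`` config field (the dbt request name) to be supplied at launch time.

.. code-block:: python

    from dagster import job, op
    from dagster_dbt import create_dbt_rpc_run_sql_solid, dbt_rpc_sync_resource

    run_query = create_dbt_rpc_run_sql_solid(name="run_query")

    @op
    def sample_query() -> str:
        return "select 1 as id"

    sync_dbt_rpc = dbt_rpc_sync_resource.configured({"host": "127.0.0.1", "port": 8580})

    @job(resource_defs={"dbt_rpc": sync_dbt_rpc})
    def query_to_dataframe():
        run_query(sql=sample_query())

    # At launch time, supply the required "name" config field, e.g.
    # run_config={"ops": {"run_query": {"config": {"name": "ad_hoc_query"}}}}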
\n\n\n@solid(\n description="A solid to invoke dbt seed over RPC.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[\n OutputDefinition(\n name="request_token",\n description="The request token of the invoked dbt seed.",\n dagster_type=String,\n ),\n ],\n config_schema={\n "show": Field(\n config=Bool,\n is_required=False,\n default_value=False,\n description="If True, show a sample of the seeded data in the response.",\n ),\n "task_tags": Permissive(),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_seed(context: SolidExecutionContext) -> String:\n """This solid sends the ``dbt seed`` command to a dbt RPC server and returns the request\n token.\n\n This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\n poll the progress of the running dbt process.\n """\n out = context.resources.dbt_rpc.seed(\n show=context.solid_config["show"],\n **context.solid_config["task_tags"],\n )\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n return out.result.get("request_token")\n\n\n@solid(\n description=(\n "A solid to invoke dbt seed over RPC and poll the resulting RPC process until it's "\n "complete."\n ),\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="result", dagster_type=DbtRpcOutput)],\n config_schema={\n "show": Field(\n config=Bool,\n is_required=False,\n default_value=False,\n description="If True, show a sample of the seeded data in the response.",\n ),\n "task_tags": Permissive(),\n "interval": Field(\n config=Int,\n is_required=False,\n default_value=10,\n description="The interval (in seconds) at which to poll the dbt rpc process.",\n ),\n "logs": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description="Whether or not to return logs from the process.",\n ),\n "yield_materializations": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation will "\n "be yielded when the solid executes. 
Default: True"\n ),\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_seed_and_wait(context: SolidExecutionContext) -> DbtRpcOutput:\n """This solid sends the ``dbt seed`` command to a dbt RPC server and returns the\n result of the executed dbt process.\n\n This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\n process is completed.\n """\n out = context.resources.dbt_rpc.seed(\n show=context.solid_config["show"],\n task_tags=context.solid_config["task_tags"],\n )\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n request_token = out.result.get("request_token")\n return _poll_rpc(\n context,\n request_token,\n should_yield_materializations=context.solid_config["yield_materializations"],\n )\n\n\n@solid(\n description="A solid to invoke dbt docs generate over RPC.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[\n OutputDefinition(\n name="request_token",\n dagster_type=String,\n description="The request token of the invoked dbt run.",\n )\n ],\n config_schema={\n "models": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to compile and generate docs for.",\n ),\n "exclude": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n "compile": Field(\n config=Bool,\n is_required=False,\n default_value=False,\n description="If True, compile the project before generating a catalog.",\n ),\n "task_tags": Permissive(),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_docs_generate(context: SolidExecutionContext) -> String:\n """This solid sends the ``dbt docs generate`` command to a dbt RPC server and returns the\n request token.\n\n This dbt RPC solid is asynchronous. 
The request token can be used in subsequent RPC requests to\n poll the progress of the running dbt process.\n """\n out = context.resources.dbt_rpc.run(\n models=context.solid_config["models"],\n exclude=context.solid_config["exclude"],\n compile=context.solid_config["compile"],\n **context.solid_config["task_tags"],\n )\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n return out.result.get("request_token")\n\n\n@solid(\n description="A solid to invoke dbt docs generate over RPC.",\n input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],\n output_defs=[OutputDefinition(name="result", dagster_type=DbtRpcOutput)],\n config_schema={\n "models": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to compile and generate docs for.",\n ),\n "exclude": Field(\n config=Noneable(Array(String)),\n default_value=None,\n is_required=False,\n description="The dbt models to exclude.",\n ),\n "compile": Field(\n config=Bool,\n is_required=False,\n default_value=False,\n description="If True, compile the project before generating a catalog.",\n ),\n "task_tags": Permissive(),\n "interval": Field(\n config=Int,\n is_required=False,\n default_value=10,\n description="The interval (in seconds) at which to poll the dbt rpc process.",\n ),\n "logs": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description="Whether or not to return logs from the process.",\n ),\n "yield_materializations": Field(\n config=Bool,\n is_required=False,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the dbt operation will "\n "be yielded when the solid executes. Default: True"\n ),\n ),\n },\n required_resource_keys={"dbt_rpc"},\n tags={"kind": "dbt"},\n)\ndef dbt_rpc_docs_generate_and_wait(context: SolidExecutionContext) -> DbtRpcOutput:\n """This solid sends the ``dbt docs generate`` command to a dbt RPC server and returns the\n result of the executed dbt process.\n\n This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\n process is completed.\n """\n out = context.resources.dbt_rpc.run(\n models=context.solid_config["models"],\n exclude=context.solid_config["exclude"],\n compile=context.solid_config["compile"],\n task_tags=context.solid_config["task_tags"],\n )\n context.log.debug(out.response.text)\n raise_for_rpc_error(context, out.response)\n request_token = out.result.get("request_token")\n return _poll_rpc(\n context,\n request_token,\n should_yield_materializations=context.solid_config["yield_materializations"],\n )\n
", "current_page_name": "_modules/dagster_dbt/rpc/solids", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dbt.rpc.solids"}, "types": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dbt.rpc.types

\nfrom typing import Any, Dict\n\nimport requests\n\nfrom ..types import DbtOutput\n\n\n
[docs]class DbtRpcOutput(DbtOutput):\n """The output from executing a dbt command via the dbt RPC server.\n\n Attributes:\n result (Dict[str, Any]): The parsed contents of the "result" field of the JSON response from\n the rpc server (if any).\n response_dict (Dict[str, Any]): The entire contents of the JSON response from the rpc server.\n response (requests.Response): The original Response from which this output was generated.\n """\n\n def __init__(self, response: requests.Response):\n\n self._response = response\n self._response_dict = response.json()\n\n super().__init__(result=self._response_dict.get("result", {}))\n\n @property\n def response(self) -> requests.Response:\n return self._response\n\n @property\n def response_dict(self) -> Dict[str, Any]:\n return self._response_dict
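As an illustration of the fields exposed here, a downstream op (for example, one fed by ``dbt_rpc_run_and_wait``) can inspect the parsed result alongside the raw HTTP response; the op below is a sketch, not part of the library.

.. code-block:: python

    from dagster import op

    @op
    def summarize_dbt_output(context, dbt_output) -> int:
        # dbt_output is expected to be a DbtRpcOutput produced upstream.
        context.log.info(f"HTTP status: {dbt_output.response.status_code}")
        context.log.info(f"dbt state: {dbt_output.result.get('state')}")
        return len(dbt_output.result.get("results", []))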
\n
", "current_page_name": "_modules/dagster_dbt/rpc/types", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dbt.rpc.types"}}, "types": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dbt.types

\nfrom typing import Any, Dict, Optional\n\nimport dagster._check as check\n\n\n
[docs]class DbtOutput:\n """\n Base class for both DbtCliOutput and DbtRPCOutput. Contains a single field, `result`, which\n represents the dbt-formatted result of the command that was run (if any).\n\n Used internally, should not be instantiated directly by the user.\n """\n\n def __init__(self, result: Dict[str, Any]):\n self._result = check.dict_param(result, "result", key_type=str)\n\n @property\n def result(self) -> Dict[str, Any]:\n return self._result\n\n @property\n def docs_url(self) -> Optional[str]:\n return None
\n
", "current_page_name": "_modules/dagster_dbt/types", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dbt.types"}, "utils": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_dbt.utils

\nfrom typing import Any, Callable, Dict, Iterator, List, Mapping, Optional, Union, cast\n\nimport dateutil\n\nfrom dagster import AssetKey, AssetMaterialization, AssetObservation, MetadataValue\nfrom dagster import _check as check\nfrom dagster.core.definitions.metadata import RawMetadataValue\n\nfrom .types import DbtOutput\n\n\ndef default_node_info_to_asset_key(node_info: Dict[str, Any]) -> AssetKey:\n    return AssetKey(node_info["unique_id"].split("."))\n\n\ndef _node_type(unique_id: str) -> str:\n    # returns the type of the node (e.g. model, test, snapshot)\n    return unique_id.split(".")[0]\n\n\ndef _node_result_to_metadata(node_result: Dict[str, Any]) -> Mapping[str, RawMetadataValue]:\n    return {\n        "Materialization Strategy": node_result["config"]["materialized"],\n        "Database": node_result["database"],\n        "Schema": node_result["schema"],\n        "Alias": node_result["alias"],\n        "Description": node_result["description"],\n    }\n\n\ndef _timing_to_metadata(timings: List[Dict[str, Any]]) -> Mapping[str, RawMetadataValue]:\n    metadata: Dict[str, RawMetadataValue] = {}\n    for timing in timings:\n        if timing["name"] == "execute":\n            desc = "Execution"\n        elif timing["name"] == "compile":\n            desc = "Compilation"\n        else:\n            continue\n\n        # dateutil does not properly expose its modules to static checkers\n        started_at = dateutil.parser.isoparse(timing["started_at"])  # type: ignore\n        completed_at = dateutil.parser.isoparse(timing["completed_at"])  # type: ignore\n        duration = completed_at - started_at\n        metadata.update(\n            {\n                f"{desc} Started At": started_at.isoformat(timespec="seconds"),\n                f"{desc} Completed At": started_at.isoformat(timespec="seconds"),\n                f"{desc} Duration": duration.total_seconds(),\n            }\n        )\n    return metadata\n\n\ndef result_to_events(\n    result: Dict[str, Any],\n    docs_url: Optional[str] = None,\n    node_info_to_asset_key: Optional[Callable[[Dict[str, Any]], AssetKey]] = None,\n    manifest_json: Optional[Dict[str, Any]] = None,\n) -> Optional[Iterator[Union[AssetMaterialization, AssetObservation]]]:\n    """\n    This is a hacky solution that attempts to consolidate parsing many of the potential formats\n    that dbt can provide its results in. 
This is known to work for CLI Outputs for dbt versions 0.18+,\n    as well as RPC responses for a similar time period, but as the RPC response schema is not documented\n    nor enforced, this can become out of date easily.\n    """\n    node_info_to_asset_key = check.opt_callable_param(\n        node_info_to_asset_key, "node_info_to_asset_key", default=default_node_info_to_asset_key\n    )\n\n    # status comes from set of fields rather than "status"\n    if "fail" in result:\n        status = (\n            "fail"\n            if result.get("fail")\n            else "skip"\n            if result.get("skip")\n            else "error"\n            if result.get("error")\n            else "success"\n        )\n    else:\n        status = result["status"]\n\n    # all versions represent timing the same way\n    metadata = {"Status": status, "Execution Time (seconds)": result["execution_time"]}\n    metadata.update(_timing_to_metadata(result["timing"]))\n\n    # working with a response that contains the node block (RPC and CLI 0.18.x)\n    if "node" in result:\n\n        unique_id = result["node"]["unique_id"]\n        metadata.update(_node_result_to_metadata(result["node"]))\n    else:\n        unique_id = result["unique_id"]\n\n    if docs_url:\n        metadata["docs_url"] = MetadataValue.url(f"{docs_url}#!/model/{unique_id}")\n\n    node_type = _node_type(unique_id)\n\n    # if you have a manifest available, get the full node info, otherwise just populate unique_id\n    node_info = manifest_json["nodes"][unique_id] if manifest_json else {"unique_id": unique_id}\n\n    if node_type == "model" and status == "success":\n        yield AssetMaterialization(\n            asset_key=node_info_to_asset_key(node_info),\n            description=f"dbt node: {unique_id}",\n            metadata=metadata,\n        )\n    # can only associate tests with assets if we have manifest_json available\n    elif node_type == "test" and manifest_json:\n        upstream_unique_ids = manifest_json["nodes"][unique_id]["depends_on"]["nodes"]\n        # tests can apply to multiple asset keys\n        for upstream_id in upstream_unique_ids:\n            node_info = manifest_json["nodes"][upstream_id]\n            upstream_asset_key = node_info_to_asset_key(node_info)\n            yield AssetObservation(asset_key=upstream_asset_key, metadata=metadata)\n\n\ndef generate_events(\n    dbt_output: DbtOutput,\n    node_info_to_asset_key: Optional[Callable[[Dict[str, Any]], AssetKey]] = None,\n    manifest_json: Optional[Dict[str, Any]] = None,\n) -> Iterator[Union[AssetMaterialization, AssetObservation]]:\n\n    """\n    This function yields :py:class:`dagster.AssetMaterialization` events for each model updated by\n    a dbt command, and :py:class:`dagster.AssetObservation` events for each test run.\n\n    Information parsed from a :py:class:`~DbtOutput` object.\n    """\n\n    for result in dbt_output.result["results"]:\n        yield from check.not_none(\n            result_to_events(\n                result,\n                docs_url=dbt_output.docs_url,\n                node_info_to_asset_key=node_info_to_asset_key,\n                manifest_json=manifest_json,\n            )\n        )\n\n\n
[docs]def generate_materializations(\n    dbt_output: DbtOutput,\n    asset_key_prefix: Optional[List[str]] = None,\n) -> Iterator[AssetMaterialization]:\n    """\n    This function yields :py:class:`dagster.AssetMaterialization` events for each model updated by\n    a dbt command.\n\n    Information is parsed from a :py:class:`~DbtOutput` object.\n\n    Note that this will not work with output from the `dbt_rpc_resource`, because this resource does\n    not wait for a response from the RPC server before returning. Instead, use the\n    `dbt_rpc_sync_resource`, which will wait for execution to complete.\n\n    Examples:\n\n    .. code-block:: python\n\n        from dagster import job, op, Output\n        from dagster_dbt.utils import generate_materializations\n        from dagster_dbt import dbt_cli_resource, dbt_rpc_sync_resource\n\n        @op(required_resource_keys={"dbt"})\n        def my_custom_dbt_run(context):\n            dbt_output = context.resources.dbt.run()\n            for materialization in generate_materializations(dbt_output):\n                # you can modify the materialization object to add extra metadata, if desired\n                yield materialization\n            yield Output(dbt_output)\n\n        @job(resource_defs={"dbt": dbt_cli_resource})\n        def my_dbt_cli_job():\n            my_custom_dbt_run()\n\n        @job(resource_defs={"dbt": dbt_rpc_sync_resource})\n        def my_dbt_rpc_job():\n            my_custom_dbt_run()\n    """\n    asset_key_prefix = check.opt_list_param(asset_key_prefix, "asset_key_prefix", of_type=str)\n\n    for event in generate_events(\n        dbt_output,\n        node_info_to_asset_key=lambda info: AssetKey(\n            asset_key_prefix + info["unique_id"].split(".")\n        ),\n    ):\n        yield check.inst(cast(AssetMaterialization, event), AssetMaterialization)
\n
", "current_page_name": "_modules/dagster_dbt/utils", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_dbt.utils"}}, "dagster_docker": {"docker_executor": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_docker.docker_executor

\nfrom typing import List, Optional\n\nimport docker\nfrom dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_config, validate_docker_image\n\nimport dagster._check as check\nfrom dagster import executor\nfrom dagster.core.definitions.executor_definition import multiple_process_executor_requirements\nfrom dagster.core.events import DagsterEvent, DagsterEventType, EngineEventData, MetadataEntry\nfrom dagster.core.execution.plan.objects import StepFailureData\nfrom dagster.core.execution.retries import RetryMode, get_retries_config\nfrom dagster.core.executor.base import Executor\nfrom dagster.core.executor.init import InitExecutorContext\nfrom dagster.core.executor.step_delegating import StepDelegatingExecutor\nfrom dagster.core.executor.step_delegating.step_handler.base import StepHandler, StepHandlerContext\nfrom dagster.serdes.utils import hash_str\nfrom dagster.utils import merge_dicts\nfrom dagster.utils.backcompat import experimental\n\nfrom .container_context import DockerContainerContext\nfrom .utils import parse_env_var\n\n\n
[docs]@executor(\n name="docker",\n config_schema=merge_dicts(\n DOCKER_CONFIG_SCHEMA,\n {\n "retries": get_retries_config(),\n },\n ),\n requirements=multiple_process_executor_requirements(),\n)\n@experimental\ndef docker_executor(init_context: InitExecutorContext) -> Executor:\n """\n Executor which launches steps as Docker containers.\n\n To use the `docker_executor`, set it as the `executor_def` when defining a job:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-docker/dagster_docker_tests/test_example_executor.py\n :start-after: start_marker\n :end-before: end_marker\n :language: python\n\n Then you can configure the executor with run config as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n registry: ...\n network: ...\n networks: ...\n container_kwargs: ...\n\n If you're using the DockerRunLauncher, configuration set on the containers created by the run\n launcher will also be set on the containers that are created for each step.\n """\n\n config = init_context.executor_config\n image = check.opt_str_elem(config, "image")\n registry = check.opt_dict_elem(config, "registry", key_type=str)\n env_vars = check.opt_list_elem(config, "env_vars", of_type=str)\n network = check.opt_str_elem(config, "network")\n networks = check.opt_list_elem(config, "networks", of_type=str)\n container_kwargs = check.opt_dict_elem(config, "container_kwargs", key_type=str)\n retries = check.dict_elem(config, "retries", key_type=str)\n\n validate_docker_config(network, networks, container_kwargs)\n\n if network and not networks:\n networks = [network]\n\n container_context = DockerContainerContext(\n registry=registry,\n env_vars=env_vars or [],\n networks=networks or [],\n container_kwargs=container_kwargs,\n )\n\n return StepDelegatingExecutor(\n DockerStepHandler(image, container_context),\n retries=check.not_none(RetryMode.from_config(retries)),\n )
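A minimal sketch of attaching this executor to a job; the op is a placeholder, and the image, registry, and network values would be supplied through the execution config shown in the YAML snippet above.

.. code-block:: python

    from dagster import job, op
    from dagster_docker import docker_executor

    @op
    def ping():
        return "pong"

    # Every step of this job is launched in its own Docker container by the
    # step handler; executor settings (registry, network, env_vars, ...) come
    # from the "execution" section of the run config.
    @job(executor_def=docker_executor)
    def dockerized_job():
        ping()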
\n\n\nclass DockerStepHandler(StepHandler):\n def __init__(\n self,\n image: Optional[str],\n container_context: DockerContainerContext,\n ):\n super().__init__()\n\n self._image = check.opt_str_param(image, "image")\n self._container_context = check.inst_param(\n container_context, "container_context", DockerContainerContext\n )\n\n def _get_image(self, step_handler_context: StepHandlerContext):\n from . import DockerRunLauncher\n\n image = (\n step_handler_context.execute_step_args.pipeline_origin.repository_origin.container_image\n )\n if not image:\n image = self._image\n\n run_launcher = step_handler_context.instance.run_launcher\n\n if not image and isinstance(run_launcher, DockerRunLauncher):\n image = run_launcher.image\n\n if not image:\n raise Exception("No docker image specified by the executor config or repository")\n\n return image\n\n def _get_docker_container_context(self, step_handler_context: StepHandlerContext):\n # This doesn't vary per step: would be good to have a hook where it can be set once\n # for the whole StepHandler but we need access to the PipelineRun for that\n\n from .docker_run_launcher import DockerRunLauncher\n\n run_launcher = step_handler_context.instance.run_launcher\n run_target = DockerContainerContext.create_for_run(\n step_handler_context.pipeline_run,\n run_launcher if isinstance(run_launcher, DockerRunLauncher) else None,\n )\n\n merged_container_context = run_target.merge(self._container_context)\n\n validate_docker_config(\n network=None,\n networks=merged_container_context.networks,\n container_kwargs=merged_container_context.container_kwargs,\n )\n\n return merged_container_context\n\n @property\n def name(self) -> str:\n return "DockerStepHandler"\n\n def _get_client(self, docker_container_context: DockerContainerContext):\n client = docker.client.from_env()\n if docker_container_context.registry:\n client.login(\n registry=docker_container_context.registry["url"],\n username=docker_container_context.registry["username"],\n password=docker_container_context.registry["password"],\n )\n return client\n\n def _get_container_name(self, run_id, step_key):\n return f"dagster-step-{hash_str(run_id + step_key)}"\n\n def _create_step_container(self, client, container_context, step_image, execute_step_args):\n return client.containers.create(\n step_image,\n name=self._get_container_name(\n execute_step_args.pipeline_run_id, execute_step_args.step_keys_to_execute[0]\n ),\n detach=True,\n network=container_context.networks[0] if len(container_context.networks) else None,\n command=execute_step_args.get_command_args(),\n environment=(dict([parse_env_var(env_var) for env_var in container_context.env_vars])),\n **container_context.container_kwargs,\n )\n\n def launch_step(self, step_handler_context: StepHandlerContext) -> List[DagsterEvent]:\n container_context = self._get_docker_container_context(step_handler_context)\n\n client = self._get_client(container_context)\n\n step_image = self._get_image(step_handler_context)\n validate_docker_image(step_image)\n\n try:\n step_container = self._create_step_container(\n client, container_context, step_image, step_handler_context.execute_step_args\n )\n except docker.errors.ImageNotFound:\n client.images.pull(step_image)\n step_container = self._create_step_container(\n client, container_context, step_image, step_handler_context.execute_step_args\n )\n\n if len(container_context.networks) > 1:\n for network_name in container_context.networks[1:]:\n network = client.networks.get(network_name)\n 
network.connect(step_container)\n\n step_keys_to_execute = check.not_none(\n step_handler_context.execute_step_args.step_keys_to_execute\n )\n assert len(step_keys_to_execute) == 1, "Launching multiple steps is not currently supported"\n step_key = step_keys_to_execute[0]\n\n events = [\n DagsterEvent(\n event_type_value=DagsterEventType.ENGINE_EVENT.value,\n pipeline_name=step_handler_context.execute_step_args.pipeline_origin.pipeline_name,\n step_key=step_key,\n message="Launching step in Docker container",\n event_specific_data=EngineEventData(\n [\n MetadataEntry("Step key", value=step_key),\n MetadataEntry("Docker container id", value=step_container.id),\n ],\n ),\n )\n ]\n\n step_container.start()\n\n return events\n\n def check_step_health(self, step_handler_context: StepHandlerContext) -> List[DagsterEvent]:\n step_keys_to_execute = check.not_none(\n step_handler_context.execute_step_args.step_keys_to_execute\n )\n step_key = step_keys_to_execute[0]\n container_context = self._get_docker_container_context(step_handler_context)\n\n client = self._get_client(container_context)\n\n container_name = self._get_container_name(\n step_handler_context.execute_step_args.pipeline_run_id,\n step_key,\n )\n\n try:\n container = client.containers.get(container_name)\n\n except Exception as e:\n return [\n DagsterEvent(\n event_type_value=DagsterEventType.STEP_FAILURE.value,\n pipeline_name=step_handler_context.execute_step_args.pipeline_origin.pipeline_name,\n step_key=step_key,\n message=f"Error when checking on step container health: {e}",\n event_specific_data=StepFailureData(\n error=None,\n user_failure_data=None,\n ),\n )\n ]\n\n if container.status == "running":\n return []\n\n try:\n container_info = container.wait(timeout=0.1)\n except Exception as e:\n return [\n DagsterEvent(\n event_type_value=DagsterEventType.STEP_FAILURE.value,\n pipeline_name=step_handler_context.execute_step_args.pipeline_origin.pipeline_name,\n step_key=step_key,\n message=f"Container status is {container.status}. Hit exception attempting to get its return code: {e}",\n event_specific_data=StepFailureData(\n error=None,\n user_failure_data=None,\n ),\n )\n ]\n\n ret_code = container_info.get("StatusCode")\n if ret_code == 0:\n return []\n\n return [\n DagsterEvent(\n event_type_value=DagsterEventType.STEP_FAILURE.value,\n pipeline_name=step_handler_context.execute_step_args.pipeline_origin.pipeline_name,\n step_key=step_key,\n message=f"Container status is {container.status}. 
Return code is {str(ret_code)}.",\n event_specific_data=StepFailureData(\n error=None,\n user_failure_data=None,\n ),\n )\n ]\n\n def terminate_step(self, step_handler_context: StepHandlerContext) -> List[DagsterEvent]:\n container_context = self._get_docker_container_context(step_handler_context)\n\n step_keys_to_execute = check.not_none(\n step_handler_context.execute_step_args.step_keys_to_execute\n )\n assert len(step_keys_to_execute) == 1, "Launching multiple steps is not currently supported"\n step_key = step_keys_to_execute[0]\n\n events = [\n DagsterEvent(\n event_type_value=DagsterEventType.ENGINE_EVENT.value,\n pipeline_name=step_handler_context.execute_step_args.pipeline_origin.pipeline_name,\n step_key=step_key,\n message="Stopping Docker container for step",\n event_specific_data=EngineEventData(),\n )\n ]\n\n client = self._get_client(container_context)\n\n try:\n container = client.containers.get(\n self._get_container_name(\n step_handler_context.execute_step_args.pipeline_run_id,\n step_keys_to_execute[0],\n )\n )\n container.stop()\n except Exception as e:\n events.append(\n DagsterEvent(\n event_type_value=DagsterEventType.ENGINE_EVENT.value,\n pipeline_name=step_handler_context.execute_step_args.pipeline_origin.pipeline_name,\n step_key=step_key,\n message=f"Hit error while terminating Docker container:\\n{e}",\n event_specific_data=EngineEventData(),\n )\n )\n\n return events\n
", "current_page_name": "_modules/dagster_docker/docker_executor", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_docker.docker_executor"}, "docker_run_launcher": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_docker.docker_run_launcher

\nimport docker\nfrom dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_config, validate_docker_image\n\nimport dagster._check as check\nfrom dagster.core.launcher.base import (\n    CheckRunHealthResult,\n    LaunchRunContext,\n    ResumeRunContext,\n    RunLauncher,\n    WorkerStatus,\n)\nfrom dagster.core.storage.pipeline_run import PipelineRun\nfrom dagster.core.storage.tags import DOCKER_IMAGE_TAG\nfrom dagster.grpc.types import ExecuteRunArgs, ResumeRunArgs\nfrom dagster.serdes import ConfigurableClass\n\nfrom .container_context import DockerContainerContext\nfrom .utils import parse_env_var\n\nDOCKER_CONTAINER_ID_TAG = "docker/container_id"\n\n\n
[docs]class DockerRunLauncher(RunLauncher, ConfigurableClass):\n """Launches runs in a Docker container."""\n\n def __init__(\n self,\n inst_data=None,\n image=None,\n registry=None,\n env_vars=None,\n network=None,\n networks=None,\n container_kwargs=None,\n ):\n self._inst_data = inst_data\n self.image = image\n self.registry = registry\n self.env_vars = env_vars\n\n validate_docker_config(network, networks, container_kwargs)\n\n if network:\n self.networks = [network]\n elif networks:\n self.networks = networks\n else:\n self.networks = []\n\n self.container_kwargs = check.opt_dict_param(\n container_kwargs, "container_kwargs", key_type=str\n )\n\n super().__init__()\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return DOCKER_CONFIG_SCHEMA\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return DockerRunLauncher(inst_data=inst_data, **config_value)\n\n def get_container_context(self, pipeline_run: PipelineRun) -> DockerContainerContext:\n return DockerContainerContext.create_for_run(pipeline_run, self)\n\n def _get_client(self, container_context: DockerContainerContext):\n client = docker.client.from_env()\n if container_context.registry:\n client.login(\n registry=container_context.registry["url"],\n username=container_context.registry["username"],\n password=container_context.registry["password"],\n )\n return client\n\n def _get_docker_image(self, pipeline_code_origin):\n docker_image = pipeline_code_origin.repository_origin.container_image\n\n if not docker_image:\n docker_image = self.image\n\n if not docker_image:\n raise Exception("No docker image specified by the instance config or repository")\n\n validate_docker_image(docker_image)\n return docker_image\n\n def _launch_container_with_command(self, run, docker_image, command):\n container_context = self.get_container_context(run)\n docker_env = dict([parse_env_var(env_var) for env_var in container_context.env_vars])\n\n client = self._get_client(container_context)\n\n try:\n container = client.containers.create(\n image=docker_image,\n command=command,\n detach=True,\n environment=docker_env,\n network=container_context.networks[0] if len(container_context.networks) else None,\n **container_context.container_kwargs,\n )\n\n except docker.errors.ImageNotFound:\n client.images.pull(docker_image)\n container = client.containers.create(\n image=docker_image,\n command=command,\n detach=True,\n environment=docker_env,\n network=container_context.networks[0] if len(container_context.networks) else None,\n **container_context.container_kwargs,\n )\n\n if len(container_context.networks) > 1:\n for network_name in container_context.networks[1:]:\n network = client.networks.get(network_name)\n network.connect(container)\n\n self._instance.report_engine_event(\n message="Launching run in a new container {container_id} with image {docker_image}".format(\n container_id=container.id,\n docker_image=docker_image,\n ),\n pipeline_run=run,\n cls=self.__class__,\n )\n\n self._instance.add_run_tags(\n run.run_id,\n {DOCKER_CONTAINER_ID_TAG: container.id, DOCKER_IMAGE_TAG: docker_image},\n )\n\n container.start()\n\n def launch_run(self, context: LaunchRunContext) -> None:\n run = context.pipeline_run\n pipeline_code_origin = check.not_none(context.pipeline_code_origin)\n docker_image = self._get_docker_image(pipeline_code_origin)\n\n command = ExecuteRunArgs(\n pipeline_origin=pipeline_code_origin,\n pipeline_run_id=run.run_id,\n 
instance_ref=self._instance.get_ref(),\n ).get_command_args()\n\n self._launch_container_with_command(run, docker_image, command)\n\n @property\n def supports_resume_run(self):\n return True\n\n def resume_run(self, context: ResumeRunContext) -> None:\n run = context.pipeline_run\n pipeline_code_origin = check.not_none(context.pipeline_code_origin)\n docker_image = self._get_docker_image(pipeline_code_origin)\n\n command = ResumeRunArgs(\n pipeline_origin=pipeline_code_origin,\n pipeline_run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n ).get_command_args()\n\n self._launch_container_with_command(run, docker_image, command)\n\n def _get_container(self, run):\n if not run or run.is_finished:\n return None\n\n container_id = run.tags.get(DOCKER_CONTAINER_ID_TAG)\n\n if not container_id:\n return None\n\n container_context = self.get_container_context(run)\n\n try:\n return self._get_client(container_context).containers.get(container_id)\n except Exception:\n return None\n\n def terminate(self, run_id):\n run = self._instance.get_run_by_id(run_id)\n container = self._get_container(run)\n\n if not container:\n self._instance.report_engine_event(\n message="Unable to get docker container to send termination request to.",\n pipeline_run=run,\n cls=self.__class__,\n )\n return False\n\n self._instance.report_run_canceling(run)\n\n container.stop()\n\n return True\n\n @property\n def supports_check_run_worker_health(self):\n return True\n\n def check_run_worker_health(self, run: PipelineRun):\n container = self._get_container(run)\n if container == None:\n return CheckRunHealthResult(WorkerStatus.NOT_FOUND)\n if container.status == "running":\n return CheckRunHealthResult(WorkerStatus.RUNNING)\n return CheckRunHealthResult(\n WorkerStatus.FAILED, msg=f"Container status is {container.status}"\n )
\n
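For reference, the constructor arguments above mirror the launcher's config schema: ``network`` and ``networks`` are alternatives checked by ``validate_docker_config``, and ``container_kwargs`` is passed straight through to ``containers.create``. A hedged sketch of constructing the launcher directly with hypothetical values (in a real deployment these normally come from the instance's ``dagster.yaml``):

.. code-block:: python

    from dagster_docker import DockerRunLauncher

    launcher = DockerRunLauncher(
        image="my-registry.example.com/my-dagster-image:latest",  # fallback image if the code origin has none
        env_vars=["MY_SECRET=some-value"],                        # parsed by parse_env_var
        network="dagster_network",                                # or networks=[...], but not both
        container_kwargs={"volumes": ["/host/data:/container/data"]},
    )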
", "current_page_name": "_modules/dagster_docker/docker_run_launcher", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_docker.docker_run_launcher"}}, "dagster_fivetran": {"asset_defs": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_fivetran.asset_defs

\nfrom typing import List, Optional\n\nfrom dagster_fivetran.resources import DEFAULT_POLL_INTERVAL\nfrom dagster_fivetran.utils import generate_materializations\n\nfrom dagster import AssetKey, AssetsDefinition, Out, Output\nfrom dagster import _check as check\nfrom dagster import multi_asset\nfrom dagster.utils.backcompat import experimental\n\n\n
[docs]@experimental\ndef build_fivetran_assets(\n connector_id: str,\n destination_tables: List[str],\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n io_manager_key: Optional[str] = None,\n asset_key_prefix: Optional[List[str]] = None,\n) -> List[AssetsDefinition]:\n\n """\n Build a set of assets for a given Fivetran connector.\n\n Returns an AssetsDefintion which connects the specified ``asset_keys`` to the computation that\n will update them. Internally, executes a Fivetran sync for a given ``connector_id``, and\n polls until that sync completes, raising an error if it is unsuccessful. Requires the use of the\n :py:class:`~dagster_fivetran.fivetran_resource`, which allows it to communicate with the\n Fivetran API.\n\n Args:\n connector_id (str): The Fivetran Connector ID that this op will sync. You can retrieve this\n value from the "Setup" tab of a given connector in the Fivetran UI.\n destination_tables (List[str]): `schema_name.table_name` for each table that you want to be\n represented in the Dagster asset graph for this connection.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (Optional[float]): The maximum time that will waited before this operation is\n timed out. By default, this will never time out.\n io_manager_key (Optional[str]): The io_manager to be used to handle each of these assets.\n asset_key_prefix (Optional[List[str]]): A prefix for the asset keys inside this asset.\n If left blank, assets will have a key of `AssetKey([schema_name, table_name])`.\n\n Examples:\n\n .. code-block:: python\n\n from dagster import AssetKey, build_assets_job\n\n from dagster_fivetran import fivetran_resource\n from dagster_fivetran.assets import build_fivetran_assets\n\n my_fivetran_resource = fivetran_resource.configured(\n {\n "api_key": {"env": "FIVETRAN_API_KEY"},\n "api_secret": {"env": "FIVETRAN_API_SECRET"},\n }\n )\n\n fivetran_assets = build_fivetran_assets(\n connector_id="foobar",\n table_names=["schema1.table1", "schema2.table2"],\n ])\n\n my_fivetran_job = build_assets_job(\n "my_fivetran_job",\n assets=[fivetran_assets],\n resource_defs={"fivetran": my_fivetran_resource}\n )\n\n """\n\n asset_key_prefix = check.opt_list_param(asset_key_prefix, "asset_key_prefix", of_type=str)\n\n tracked_asset_keys = {\n AssetKey(asset_key_prefix + table.split(".")) for table in destination_tables\n }\n\n @multi_asset(\n name=f"fivetran_sync_{connector_id}",\n outs={\n "_".join(key.path): Out(io_manager_key=io_manager_key, asset_key=key)\n for key in tracked_asset_keys\n },\n required_resource_keys={"fivetran"},\n compute_kind="fivetran",\n )\n def _assets(context):\n fivetran_output = context.resources.fivetran.sync_and_poll(\n connector_id=connector_id,\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n )\n for materialization in generate_materializations(\n fivetran_output, asset_key_prefix=asset_key_prefix\n ):\n # scan through all tables actually created, if it was expected then emit an Output.\n # otherwise, emit a runtime AssetMaterialization\n if materialization.asset_key in tracked_asset_keys:\n yield Output(\n value=None,\n output_name="_".join(materialization.asset_key.path),\n metadata={\n entry.label: entry.entry_data for entry in materialization.metadata_entries\n },\n )\n else:\n yield materialization\n\n return [_assets]
\n
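Note that the docstring example above passes ``table_names=`` and wraps the result in an extra list, while the signature takes ``destination_tables`` and already returns a list of ``AssetsDefinition``. A corrected minimal sketch under that signature:

.. code-block:: python

    from dagster import build_assets_job
    from dagster_fivetran import fivetran_resource
    from dagster_fivetran.asset_defs import build_fivetran_assets

    my_fivetran_resource = fivetran_resource.configured(
        {"api_key": {"env": "FIVETRAN_API_KEY"}, "api_secret": {"env": "FIVETRAN_API_SECRET"}}
    )

    fivetran_assets = build_fivetran_assets(
        connector_id="foobar",
        destination_tables=["schema1.table1", "schema2.table2"],
    )

    my_fivetran_job = build_assets_job(
        "my_fivetran_job",
        assets=fivetran_assets,  # already a list of AssetsDefinition
        resource_defs={"fivetran": my_fivetran_resource},
    )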
", "current_page_name": "_modules/dagster_fivetran/asset_defs", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_fivetran.asset_defs"}, "ops": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_fivetran.ops

\nfrom dagster_fivetran.resources import DEFAULT_POLL_INTERVAL\nfrom dagster_fivetran.types import FivetranOutput\nfrom dagster_fivetran.utils import generate_materializations\n\nfrom dagster import Array, AssetKey, Bool, Field, In, Noneable, Nothing, Out, Output, Permissive, op\n\n\n
[docs]@op(\n required_resource_keys={"fivetran"},\n ins={"start_after": In(Nothing)},\n out=Out(\n FivetranOutput,\n description="Parsed json dictionary representing the details of the Fivetran connector after "\n "the sync successfully completes. "\n "See the [Fivetran API Docs](https://fivetran.com/docs/rest-api/connectors#retrieveconnectordetails) "\n "to see detailed information on this response.",\n ),\n config_schema={\n "connector_id": Field(\n str,\n is_required=True,\n description="The Fivetran Connector ID that this op will sync. You can retrieve this "\n 'value from the "Setup" tab of a given connector in the Fivetran UI.',\n ),\n "poll_interval": Field(\n float,\n default_value=DEFAULT_POLL_INTERVAL,\n description="The time (in seconds) that will be waited between successive polls.",\n ),\n "poll_timeout": Field(\n Noneable(float),\n default_value=None,\n description="The maximum time that will waited before this operation is timed out. By "\n "default, this will never time out.",\n ),\n "yield_materializations": Field(\n config=Bool,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the Fivetran sync will "\n "be yielded when the op executes."\n ),\n ),\n "asset_key_prefix": Field(\n config=Array(str),\n default_value=["fivetran"],\n description=(\n "If provided and yield_materializations is True, these components will be used to "\n "prefix the generated asset keys."\n ),\n ),\n },\n tags={"kind": "fivetran"},\n)\ndef fivetran_sync_op(context):\n """\n Executes a Fivetran sync for a given ``connector_id``, and polls until that sync\n completes, raising an error if it is unsuccessful. It outputs a FivetranOutput which contains\n the details of the Fivetran connector after the sync successfully completes, as well as details\n about which tables the sync updates.\n\n It requires the use of the :py:class:`~dagster_fivetran.fivetran_resource`, which allows it to\n communicate with the Fivetran API.\n\n Examples:\n\n .. code-block:: python\n\n from dagster import job\n from dagster_fivetran import fivetran_resource, fivetran_sync_op\n\n my_fivetran_resource = fivetran_resource.configured(\n {\n "api_key": {"env": "FIVETRAN_API_KEY"},\n "api_secret": {"env": "FIVETRAN_API_SECRET"},\n }\n )\n\n sync_foobar = fivetran_sync_op.configured({"connector_id": "foobar"}, name="sync_foobar")\n\n @job(resource_defs={"fivetran": my_fivetran_resource})\n def my_simple_fivetran_job():\n sync_foobar()\n\n @job(resource_defs={"fivetran": my_fivetran_resource})\n def my_composed_fivetran_job():\n final_foobar_state = sync_foobar(start_after=some_op())\n other_op(final_foobar_state)\n """\n\n fivetran_output = context.resources.fivetran.sync_and_poll(\n connector_id=context.op_config["connector_id"],\n poll_interval=context.op_config["poll_interval"],\n poll_timeout=context.op_config["poll_timeout"],\n )\n if context.op_config["yield_materializations"]:\n yield from generate_materializations(\n fivetran_output, asset_key_prefix=context.op_config["asset_key_prefix"]\n )\n yield Output(fivetran_output)
\n\n\n@op(\n required_resource_keys={"fivetran"},\n ins={"start_after": In(Nothing)},\n out=Out(\n FivetranOutput,\n description="Parsed json dictionary representing the details of the Fivetran connector after "\n "the resync successfully completes. "\n "See the [Fivetran API Docs](https://fivetran.com/docs/rest-api/connectors#retrieveconnectordetails) "\n "to see detailed information on this response.",\n ),\n config_schema={\n "connector_id": Field(\n str,\n is_required=True,\n description="The Fivetran Connector ID that this op will sync. You can retrieve this "\n 'value from the "Setup" tab of a given connector in the Fivetran UI.',\n ),\n "resync_parameters": Field(\n Permissive(),\n is_required=True,\n description="The resync parameters to send in the payload to the Fivetran API. You "\n "can find an example resync payload here: https://fivetran.com/docs/rest-api/connectors#request_6",\n ),\n "poll_interval": Field(\n float,\n default_value=DEFAULT_POLL_INTERVAL,\n description="The time (in seconds) that will be waited between successive polls.",\n ),\n "poll_timeout": Field(\n Noneable(float),\n default_value=None,\n description="The maximum time that will waited before this operation is timed out. By "\n "default, this will never time out.",\n ),\n "yield_materializations": Field(\n config=Bool,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the Fivetran sync will "\n "be yielded when the op executes."\n ),\n ),\n "asset_key_prefix": Field(\n config=Array(str),\n default_value=["fivetran"],\n description=(\n "If provided and yield_materializations is True, these components will be used to "\n "prefix the generated asset keys."\n ),\n ),\n },\n tags={"kind": "fivetran"},\n)\ndef fivetran_resync_op(context):\n """\n Executes a Fivetran historical resync for a given ``connector_id``, and polls until that resync\n completes, raising an error if it is unsuccessful. It outputs a FivetranOutput which contains\n the details of the Fivetran connector after the resync successfully completes, as well as details\n about which tables the resync updates.\n\n It requires the use of the :py:class:`~dagster_fivetran.fivetran_resource`, which allows it to\n communicate with the Fivetran API.\n\n Examples:\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_fivetran import fivetran_resource, fivetran_resync_op\n\n my_fivetran_resource = fivetran_resource.configured(\n {\n "api_key": {"env": "FIVETRAN_API_KEY"},\n "api_secret": {"env": "FIVETRAN_API_SECRET"},\n }\n )\n\n sync_foobar = fivetran_resync_op.configured(\n {\n "connector_id": "foobar",\n "resync_parameters": {\n "schema_a": ["table_a", "table_b"],\n "schema_b": ["table_c"]\n }\n },\n name="sync_foobar"\n )\n\n @job(resource_defs={"fivetran": my_fivetran_resource})\n def my_simple_fivetran_job():\n sync_foobar()\n\n @job(resource_defs={"fivetran": my_fivetran_resource})\n def my_composed_fivetran_job():\n final_foobar_state = sync_foobar(start_after=some_op())\n other_op(final_foobar_state)\n """\n\n fivetran_output = context.resources.fivetran.resync_and_poll(\n connector_id=context.op_config["connector_id"],\n resync_parameters=context.op_config["resync_parameters"],\n poll_interval=context.op_config["poll_interval"],\n poll_timeout=context.op_config["poll_timeout"],\n )\n if context.op_config["yield_materializations"]:\n asset_key_filter = [\n AssetKey(context.op_config["asset_key_prefix"] + [schema, table])\n for schema, tables in context.op_config["resync_parameters"].items()\n for table in tables\n ]\n for mat in generate_materializations(\n fivetran_output, asset_key_prefix=context.op_config["asset_key_prefix"]\n ):\n if mat.asset_key in asset_key_filter:\n yield mat\n\n yield Output(fivetran_output)\n
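For reference, the ``asset_key_filter`` built above maps every schema/table pair in ``resync_parameters`` to an asset key under ``asset_key_prefix``, and only materializations whose keys fall in that set are re-yielded. A small illustration of the keys it produces:

.. code-block:: python

    from dagster import AssetKey

    resync_parameters = {"schema_a": ["table_a", "table_b"], "schema_b": ["table_c"]}
    asset_key_prefix = ["fivetran"]

    asset_key_filter = [
        AssetKey(asset_key_prefix + [schema, table])
        for schema, tables in resync_parameters.items()
        for table in tables
    ]
    # [AssetKey(["fivetran", "schema_a", "table_a"]),
    #  AssetKey(["fivetran", "schema_a", "table_b"]),
    #  AssetKey(["fivetran", "schema_b", "table_c"])]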
", "current_page_name": "_modules/dagster_fivetran/ops", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_fivetran.ops"}, "resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_fivetran.resources

\nimport datetime\nimport json\nimport logging\nimport time\nfrom typing import Any, Dict, List, Optional, Tuple\nfrom urllib.parse import urljoin\n\nimport requests\nfrom dagster_fivetran.types import FivetranOutput\nfrom dagster_fivetran.utils import get_fivetran_connector_url, get_fivetran_logs_url\nfrom dateutil import parser\nfrom requests.auth import HTTPBasicAuth\nfrom requests.exceptions import RequestException\n\nfrom dagster import Failure, Field, MetadataValue, StringSource, __version__\nfrom dagster import _check as check\nfrom dagster import get_dagster_logger, resource\n\nFIVETRAN_API_BASE = "https://api.fivetran.com"\nFIVETRAN_CONNECTOR_PATH = "v1/connectors/"\n\n# default polling interval (in seconds)\nDEFAULT_POLL_INTERVAL = 10\n\n\n
[docs]class FivetranResource:\n """\n This class exposes methods on top of the Fivetran REST API.\n """\n\n def __init__(\n self,\n api_key: str,\n api_secret: str,\n disable_schedule_on_trigger: bool = True,\n request_max_retries: int = 3,\n request_retry_delay: float = 0.25,\n log: logging.Logger = get_dagster_logger(),\n ):\n self._auth = HTTPBasicAuth(api_key, api_secret)\n self._disable_schedule_on_trigger = disable_schedule_on_trigger\n\n self._request_max_retries = request_max_retries\n self._request_retry_delay = request_retry_delay\n\n self._log = log\n\n @property\n def api_base_url(self) -> str:\n return urljoin(FIVETRAN_API_BASE, FIVETRAN_CONNECTOR_PATH)\n\n
[docs] def make_request(\n self, method: str, endpoint: str, data: Optional[str] = None\n ) -> Dict[str, Any]:\n """\n Creates and sends a request to the desired Fivetran Connector API endpoint.\n\n Args:\n method (str): The http method to use for this request (e.g. "POST", "GET", "PATCH").\n endpoint (str): The Fivetran API endpoint to send this request to.\n data (Optional[str]): JSON-formatted data string to be included in the request.\n\n Returns:\n Dict[str, Any]: Parsed json data from the response to this request\n """\n\n headers = {\n "User-Agent": f"dagster-fivetran/{__version__}",\n "Content-Type": "application/json;version=2",\n }\n\n num_retries = 0\n while True:\n try:\n response = requests.request(\n method=method,\n url=urljoin(self.api_base_url, endpoint),\n headers=headers,\n auth=self._auth,\n data=data,\n )\n response.raise_for_status()\n resp_dict = response.json()\n return resp_dict["data"] if "data" in resp_dict else resp_dict\n except RequestException as e:\n self._log.error("Request to Fivetran API failed: %s", e)\n if num_retries == self._request_max_retries:\n break\n num_retries += 1\n time.sleep(self._request_retry_delay)\n\n raise Failure("Exceeded max number of retries.")
\n\n
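Because ``make_request`` joins the endpoint onto ``v1/connectors/`` and unwraps the ``data`` key of the response, the same helper reaches other connector endpoints. A hedged sketch with placeholder credentials and connector id:

.. code-block:: python

    import json

    from dagster_fivetran.resources import FivetranResource

    fivetran = FivetranResource(api_key="my_api_key", api_secret="my_api_secret")

    # GET v1/connectors/<connector_id>, the same call get_connector_details() makes.
    details = fivetran.make_request(method="GET", endpoint="some_connector_id")

    # PATCH with a JSON body, mirroring update_connector().
    fivetran.make_request(
        method="PATCH",
        endpoint="some_connector_id",
        data=json.dumps({"paused": True}),
    )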
[docs] def get_connector_details(self, connector_id: str) -> Dict[str, Any]:\n """\n Gets details about a given connector from the Fivetran Connector API.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n\n Returns:\n Dict[str, Any]: Parsed json data from the response to this request\n """\n return self.make_request(method="GET", endpoint=connector_id)
\n\n def _assert_syncable_connector(self, connector_id: str):\n """\n Confirms that a given connector is eligible to sync. Will raise a Failure in the event that\n the connector is either paused or not fully setup.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n """\n connector_details = self.get_connector_details(connector_id)\n if connector_details["paused"]:\n raise Failure(f"Connector '{connector_id}' cannot be synced as it is currently paused.")\n if connector_details["status"]["setup_state"] != "connected":\n raise Failure(f"Connector '{connector_id}' cannot be synced as it has not been setup")\n\n
[docs] def get_connector_sync_status(self, connector_id: str) -> Tuple[datetime.datetime, bool, str]:\n """\n Gets details about the status of the most recent Fivetran sync operation for a given\n connector.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n\n Returns:\n Tuple[datetime.datetime, bool, str]:\n Tuple representing the timestamp of the last completeded sync, if it succeeded, and\n the currently reported sync status.\n """\n connector_details = self.get_connector_details(connector_id)\n\n min_time_str = "0001-01-01 00:00:00+00"\n succeeded_at = parser.parse(connector_details["succeeded_at"] or min_time_str)\n failed_at = parser.parse(connector_details["failed_at"] or min_time_str)\n\n return (\n max(succeeded_at, failed_at),\n succeeded_at > failed_at,\n connector_details["status"]["sync_state"],\n )
\n\n
[docs] def update_connector(\n self, connector_id: str, properties: Optional[Dict[str, Any]] = None\n ) -> Dict[str, Any]:\n """\n Updates properties of a Fivetran Connector.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n properties (Dict[str, Any]): The properties to be updated. For a comprehensive list of\n properties, see the [Fivetran docs](https://fivetran.com/docs/rest-api/connectors#modifyaconnector).\n\n Returns:\n Dict[str, Any]: Parsed json data representing the API response.\n """\n return self.make_request(method="PATCH", endpoint=connector_id, data=json.dumps(properties))
\n\n
[docs] def update_schedule_type(\n self, connector_id: str, schedule_type: Optional[str] = None\n ) -> Dict[str, Any]:\n """\n Updates the schedule type property of the connector to either "auto" or "manual".\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n schedule_type (Optional[str]): Either "auto" (to turn the schedule on) or "manual" (to\n turn it off).\n\n Returns:\n Dict[str, Any]: Parsed json data representing the API response.\n """\n if schedule_type not in ["auto", "manual"]:\n check.failed(f"schedule_type must be either 'auto' or 'manual': got '{schedule_type}'")\n return self.update_connector(connector_id, properties={"schedule_type": schedule_type})
\n\n def get_connector_schema_config(self, connector_id: str) -> Dict[str, Any]:\n return self.make_request("GET", endpoint=f"{connector_id}/schemas")\n\n
[docs] def start_sync(self, connector_id: str) -> Dict[str, Any]:\n """\n Initiates a sync of a Fivetran connector.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n\n Returns:\n Dict[str, Any]: Parsed json data representing the connector details API response after\n the sync is started.\n """\n if self._disable_schedule_on_trigger:\n self._log.info("Disabling Fivetran sync schedule.")\n self.update_schedule_type(connector_id, "manual")\n self._assert_syncable_connector(connector_id)\n self.make_request(method="POST", endpoint=f"{connector_id}/force")\n connector_details = self.get_connector_details(connector_id)\n self._log.info(\n f"Sync initialized for connector_id={connector_id}. View this sync in the Fivetran UI: "\n + get_fivetran_connector_url(connector_details)\n )\n return connector_details
\n\n
[docs] def start_resync(\n self, connector_id: str, resync_parameters: Dict[str, List[str]]\n ) -> Dict[str, Any]:\n """\n Initiates a historical sync of all data for multiple schema tables within a Fivetran connector.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n resync_parameters (Dict[str, List[str]]): The resync parameters to send to the Fivetran API.\n An example payload can be found here: https://fivetran.com/docs/rest-api/connectors#request_6\n\n Returns:\n Dict[str, Any]: Parsed json data representing the connector details API response after\n the resync is started.\n """\n if self._disable_schedule_on_trigger:\n self._log.info("Disabling Fivetran sync schedule.")\n self.update_schedule_type(connector_id, "manual")\n self._assert_syncable_connector(connector_id)\n self.make_request(\n method="POST",\n endpoint=f"{connector_id}/schemas/tables/resync",\n data=json.dumps(resync_parameters),\n )\n connector_details = self.get_connector_details(connector_id)\n self._log.info(\n f"Sync initialized for connector_id={connector_id}. View this resync in the Fivetran UI: "\n + get_fivetran_connector_url(connector_details)\n )\n return connector_details
\n\n
[docs] def poll_sync(\n self,\n connector_id: str,\n initial_last_sync_completion: datetime.datetime,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n ) -> Dict[str, Any]:\n """\n Given a Fivetran connector and the timestamp at which the previous sync completed, poll\n until the next sync completes.\n\n The previous sync completion time is necessary because the only way to tell when a sync\n completes is when this value changes.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n initial_last_sync_completion (datetime.datetime): The timestamp of the last completed sync\n (successful or otherwise) for this connector, prior to running this method.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. By default, this will never time out.\n\n Returns:\n Dict[str, Any]: Parsed json data representing the API response.\n """\n poll_start = datetime.datetime.now()\n while True:\n (\n curr_last_sync_completion,\n curr_last_sync_succeeded,\n curr_sync_state,\n ) = self.get_connector_sync_status(connector_id)\n self._log.info(f"Polled '{connector_id}'. Status: [{curr_sync_state}]")\n\n if curr_last_sync_completion > initial_last_sync_completion:\n break\n\n if poll_timeout and datetime.datetime.now() > poll_start + datetime.timedelta(\n seconds=poll_timeout\n ):\n raise Failure(\n f"Sync for connector '{connector_id}' timed out after "\n f"{datetime.datetime.now() - poll_start}."\n )\n\n # Sleep for the configured time interval before polling again.\n time.sleep(poll_interval)\n\n connector_details = self.get_connector_details(connector_id)\n if not curr_last_sync_succeeded:\n raise Failure(\n f"Sync for connector '{connector_id}' failed!",\n metadata={\n "connector_details": MetadataValue.json(connector_details),\n "log_url": MetadataValue.url(get_fivetran_logs_url(connector_details)),\n },\n )\n return connector_details
\n\n
[docs] def sync_and_poll(\n self,\n connector_id: str,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n ) -> FivetranOutput:\n """\n Initializes a sync operation for the given connector, and polls until it completes.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. By default, this will never time out.\n\n Returns:\n :py:class:`~FivetranOutput`:\n Object containing details about the connector and the tables it updates\n """\n schema_config = self.get_connector_schema_config(connector_id)\n init_last_sync_timestamp, _, _ = self.get_connector_sync_status(connector_id)\n self.start_sync(connector_id)\n final_details = self.poll_sync(\n connector_id,\n init_last_sync_timestamp,\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n )\n return FivetranOutput(connector_details=final_details, schema_config=schema_config)
\n\n
[docs] def resync_and_poll(\n self,\n connector_id: str,\n resync_parameters: Dict[str, List[str]],\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n ) -> FivetranOutput:\n """\n Initializes a historical resync operation for the given connector, and polls until it completes.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n resync_parameters (Dict[str, List[str]]): The payload to send to the Fivetran API.\n This should be a dictionary with schema names as the keys and a list of tables\n to resync as the values.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. By default, this will never time out.\n\n Returns:\n :py:class:`~FivetranOutput`:\n Object containing details about the connector and the tables it updates\n """\n schema_config = self.get_connector_schema_config(connector_id)\n init_last_sync_timestamp, _, _ = self.get_connector_sync_status(connector_id)\n self.start_resync(connector_id, resync_parameters)\n final_details = self.poll_sync(\n connector_id,\n init_last_sync_timestamp,\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n )\n return FivetranOutput(connector_details=final_details, schema_config=schema_config)
\n\n\n
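Outside of an op, the same flow can be driven by instantiating the resource class directly; ``sync_and_poll`` snapshots the schema config, records the last sync timestamp, triggers the sync, and polls until the timestamp advances. A hedged sketch with placeholder values:

.. code-block:: python

    from dagster_fivetran.resources import FivetranResource

    fivetran = FivetranResource(
        api_key="my_api_key",
        api_secret="my_api_secret",
        disable_schedule_on_trigger=True,
    )

    output = fivetran.sync_and_poll(
        connector_id="some_connector_id",
        poll_interval=10,
        poll_timeout=600,
    )
    # output.connector_details and output.schema_config mirror the Fivetran API responses.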
[docs]@resource(\n config_schema={\n "api_key": Field(\n StringSource,\n is_required=True,\n description="Fivetran API Key. You can find this value on the Fivetran settings page: "\n "https://fivetran.com/account/settings",\n ),\n "api_secret": Field(\n StringSource,\n is_required=True,\n description="Fivetran API Secret. You can find this value on the Fivetran settings page: "\n "https://fivetran.com/account/settings",\n ),\n "disable_schedule_on_trigger": Field(\n bool,\n default_value=True,\n description="Specifies if you would like any connector that is sync'd using this "\n "resource to be automatically taken off its Fivetran schedule.",\n ),\n "request_max_retries": Field(\n int,\n default_value=3,\n description="The maximum number of times requests to the Fivetran API should be retried "\n "before failing.",\n ),\n "request_retry_delay": Field(\n float,\n default_value=0.25,\n description="Time (in seconds) to wait between each request retry.",\n ),\n },\n description="This resource helps manage Fivetran connectors",\n)\ndef fivetran_resource(context) -> FivetranResource:\n """\n This resource allows users to programatically interface with the Fivetran REST API to launch\n syncs and monitor their progress. This currently implements only a subset of the functionality\n exposed by the API.\n\n For a complete set of documentation on the Fivetran REST API, including expected response JSON\n schemae, see the `Fivetran API Docs <https://fivetran.com/docs/rest-api/connectors>`_.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/concepts/configuration/configured>`_ method.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import job\n from dagster_fivetran import fivetran_resource\n\n my_fivetran_resource = fivetran_resource.configured(\n {\n "api_key": {"env": "FIVETRAN_API_KEY"},\n "api_secret": {"env": "FIVETRAN_API_SECRET"},\n }\n )\n\n @job(resource_defs={"fivetran":my_fivetran_resource})\n def my_fivetran_job():\n ...\n\n """\n return FivetranResource(\n api_key=context.resource_config["api_key"],\n api_secret=context.resource_config["api_secret"],\n disable_schedule_on_trigger=context.resource_config["disable_schedule_on_trigger"],\n request_max_retries=context.resource_config["request_max_retries"],\n request_retry_delay=context.resource_config["request_retry_delay"],\n log=context.log,\n )
\n
", "current_page_name": "_modules/dagster_fivetran/resources", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_fivetran.resources"}}, "dagster_gcp": {"bigquery": {"ops": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_gcp.bigquery.ops

\nimport hashlib\n\nfrom dagster_pandas import DataFrame\nfrom google.cloud.bigquery.job import LoadJobConfig, QueryJobConfig\nfrom google.cloud.bigquery.table import EncryptionConfiguration, TimePartitioning\n\nfrom dagster import InputDefinition, List, Nothing, OutputDefinition\nfrom dagster import _check as check\nfrom dagster import op, solid\n\nfrom .configs import (\n    define_bigquery_create_dataset_config,\n    define_bigquery_delete_dataset_config,\n    define_bigquery_load_config,\n    define_bigquery_query_config,\n)\nfrom .types import BigQueryLoadSource\n\n_START = "start"\n\n\ndef _preprocess_config(cfg):\n    destination_encryption_configuration = cfg.get("destination_encryption_configuration")\n    time_partitioning = cfg.get("time_partitioning")\n\n    if destination_encryption_configuration is not None:\n        cfg["destination_encryption_configuration"] = EncryptionConfiguration(\n            kms_key_name=destination_encryption_configuration\n        )\n\n    if time_partitioning is not None:\n        cfg["time_partitioning"] = TimePartitioning(**time_partitioning)\n\n    return cfg\n\n\ndef _bq_core_command(dagster_decorator, decorator_name, sql_queries):\n    sql_queries = check.list_param(sql_queries, "sql queries", of_type=str)\n    m = hashlib.sha1()\n    for query in sql_queries:\n        m.update(query.encode("utf-8"))\n    hash_str = m.hexdigest()[:10]\n    name = f"bq_{decorator_name}_{hash_str}"\n\n    @dagster_decorator(\n        name=name,\n        input_defs=[InputDefinition(_START, Nothing)],\n        output_defs=[OutputDefinition(List[DataFrame])],\n        config_schema=define_bigquery_query_config(),\n        required_resource_keys={"bigquery"},\n        tags={"kind": "sql", "sql": "\\n".join(sql_queries)},\n    )\n    def _bq_fn(context):  # pylint: disable=unused-argument\n        query_job_config = _preprocess_config(context.op_config.get("query_job_config", {}))\n\n        # Retrieve results as pandas DataFrames\n        results = []\n        for sql_query in sql_queries:\n            # We need to construct a new QueryJobConfig for each query.\n            # See: https://bit.ly/2VjD6sl\n            cfg = QueryJobConfig(**query_job_config) if query_job_config else None\n            context.log.info(\n                "executing query %s with config: %s"\n                % (sql_query, cfg.to_api_repr() if cfg else "(no config provided)")\n            )\n            results.append(\n                context.resources.bigquery.query(sql_query, job_config=cfg).to_dataframe()\n            )\n\n        return results\n\n    return _bq_fn\n\n\n
[docs]def bq_solid_for_queries(sql_queries):\n """\n Executes BigQuery SQL queries.\n\n Expects a BQ client to be provisioned in resources as context.resources.bigquery.\n """\n\n return _bq_core_command(solid, "solid", sql_queries)
\n\n\n
[docs]def bq_op_for_queries(sql_queries):\n """\n Executes BigQuery SQL queries.\n\n Expects a BQ client to be provisioned in resources as context.resources.bigquery.\n """\n\n return _bq_core_command(op, "op", sql_queries)
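``bq_op_for_queries`` returns an op whose name is derived from a hash of the queries and which expects a ``bigquery`` resource. A minimal sketch wiring it into a job with the ``bigquery_resource`` documented in ``dagster_gcp.bigquery.resources``:

.. code-block:: python

    from dagster import job
    from dagster_gcp.bigquery.ops import bq_op_for_queries
    from dagster_gcp.bigquery.resources import bigquery_resource

    select_op = bq_op_for_queries(["SELECT 1", "SELECT 2"])

    @job(resource_defs={"bigquery": bigquery_resource})
    def my_bq_job():
        select_op()  # returns a list of pandas DataFrames, one per query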
\n\n\nBIGQUERY_LOAD_CONFIG = define_bigquery_load_config()\n\n\n
[docs]@op(\n input_defs=[InputDefinition("paths", List[str])],\n output_defs=[OutputDefinition(Nothing)],\n config_schema=BIGQUERY_LOAD_CONFIG,\n required_resource_keys={"bigquery"},\n)\ndef import_gcs_paths_to_bq(context, paths):\n return _execute_load_in_source(context, paths, BigQueryLoadSource.GCS)
\n\n\n
[docs]@op(\n input_defs=[InputDefinition("df", DataFrame)],\n output_defs=[OutputDefinition(Nothing)],\n config_schema=BIGQUERY_LOAD_CONFIG,\n required_resource_keys={"bigquery"},\n)\ndef import_df_to_bq(context, df):\n return _execute_load_in_source(context, df, BigQueryLoadSource.DataFrame)
\n\n\n
[docs]@op(\n input_defs=[InputDefinition("path", str)],\n output_defs=[OutputDefinition(Nothing)],\n config_schema=BIGQUERY_LOAD_CONFIG,\n required_resource_keys={"bigquery"},\n)\ndef import_file_to_bq(context, path):\n return _execute_load_in_source(context, path, BigQueryLoadSource.File)
\n\n\ndef _execute_load_in_source(context, source, source_name):\n destination = context.op_config.get("destination")\n load_job_config = _preprocess_config(context.op_config.get("load_job_config", {}))\n cfg = LoadJobConfig(**load_job_config) if load_job_config else None\n\n context.log.info(\n "executing BQ load with config: %s for source %s"\n % (cfg.to_api_repr() if cfg else "(no config provided)", source)\n )\n\n if source_name == BigQueryLoadSource.DataFrame:\n context.resources.bigquery.load_table_from_dataframe(\n source, destination, job_config=cfg\n ).result()\n\n # Load from file. See: https://cloud.google.com/bigquery/docs/loading-data-local\n elif source_name == BigQueryLoadSource.File:\n with open(source, "rb") as file_obj:\n context.resources.bigquery.load_table_from_file(\n file_obj, destination, job_config=cfg\n ).result()\n\n # Load from GCS. See: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage\n elif source_name == BigQueryLoadSource.GCS:\n context.resources.bigquery.load_table_from_uri(source, destination, job_config=cfg).result()\n\n\n
[docs]@op(\n input_defs=[InputDefinition(_START, Nothing)],\n config_schema=define_bigquery_create_dataset_config(),\n required_resource_keys={"bigquery"},\n)\ndef bq_create_dataset(context):\n """BigQuery Create Dataset.\n\n This op encapsulates creating a BigQuery dataset.\n\n Expects a BQ client to be provisioned in resources as context.resources.bigquery.\n """\n (dataset, exists_ok) = [context.op_config.get(k) for k in ("dataset", "exists_ok")]\n context.log.info("executing BQ create_dataset for dataset %s" % (dataset))\n context.resources.bigquery.create_dataset(dataset, exists_ok)
\n\n\n
[docs]@op(\n input_defs=[InputDefinition(_START, Nothing)],\n config_schema=define_bigquery_delete_dataset_config(),\n required_resource_keys={"bigquery"},\n)\ndef bq_delete_dataset(context):\n """BigQuery Delete Dataset.\n\n This op encapsulates deleting a BigQuery dataset.\n\n Expects a BQ client to be provisioned in resources as context.resources.bigquery.\n """\n\n (dataset, delete_contents, not_found_ok) = [\n context.op_config.get(k) for k in ("dataset", "delete_contents", "not_found_ok")\n ]\n\n context.log.info("executing BQ delete_dataset for dataset %s" % dataset)\n\n context.resources.bigquery.delete_dataset(\n dataset, delete_contents=delete_contents, not_found_ok=not_found_ok\n )
\n
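The dataset ops read ``dataset``/``exists_ok`` and ``dataset``/``delete_contents``/``not_found_ok`` from their op config, so run config along the following lines is what they expect. The exact schemas come from ``define_bigquery_create_dataset_config`` and ``define_bigquery_delete_dataset_config``, which are not shown on this page, so treat this as a sketch:

.. code-block:: python

    from dagster import job
    from dagster_gcp.bigquery.ops import bq_create_dataset, bq_delete_dataset
    from dagster_gcp.bigquery.resources import bigquery_resource

    @job(resource_defs={"bigquery": bigquery_resource})
    def dataset_lifecycle_job():
        bq_delete_dataset(start=bq_create_dataset())

    run_config = {
        "ops": {
            "bq_create_dataset": {"config": {"dataset": "my_project.my_dataset", "exists_ok": True}},
            "bq_delete_dataset": {
                "config": {
                    "dataset": "my_project.my_dataset",
                    "delete_contents": True,
                    "not_found_ok": True,
                }
            },
        }
    }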
", "current_page_name": "_modules/dagster_gcp/bigquery/ops", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_gcp.bigquery.ops"}, "resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_gcp.bigquery.resources

\nfrom google.cloud import bigquery  # type: ignore\n\nfrom dagster import resource\n\nfrom .configs import bq_resource_config\n\n\n
[docs]@resource(\n config_schema=bq_resource_config(), description="Dagster resource for connecting to BigQuery"\n)\ndef bigquery_resource(context):\n return bigquery.Client(**context.resource_config)
\n
", "current_page_name": "_modules/dagster_gcp/bigquery/resources", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_gcp.bigquery.resources"}, "types": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_gcp.bigquery.types

\nimport re\nfrom enum import Enum as PyEnum\n\nfrom google.cloud.bigquery.job import (\n    CreateDisposition,\n    Encoding,\n    QueryPriority,\n    SchemaUpdateOption,\n    SourceFormat,\n    WriteDisposition,\n)\n\nfrom dagster import Enum, EnumValue\nfrom dagster.config import ConfigScalar, ConfigScalarKind, PostProcessingError\n\n\nclass BigQueryLoadSource(PyEnum):\n    DataFrame = "DATA_FRAME"\n    GCS = "GCS"\n    File = "FILE"\n\n\nBQCreateDisposition = Enum(\n    name="BQCreateDisposition",\n    enum_values=[\n        EnumValue(CreateDisposition.CREATE_IF_NEEDED),\n        EnumValue(CreateDisposition.CREATE_NEVER),\n    ],\n)\n\nBQPriority = Enum(\n    name="BQPriority",\n    enum_values=[EnumValue(QueryPriority.BATCH), EnumValue(QueryPriority.INTERACTIVE)],\n)\n\nBQSchemaUpdateOption = Enum(\n    name="BQSchemaUpdateOption",\n    enum_values=[\n        EnumValue(\n            SchemaUpdateOption.ALLOW_FIELD_ADDITION,\n            description="Allow adding a nullable field to the schema.",\n        ),\n        EnumValue(\n            SchemaUpdateOption.ALLOW_FIELD_RELAXATION,\n            description="Allow relaxing a required field in the original schema to nullable.",\n        ),\n    ],\n)\n\nBQWriteDisposition = Enum(\n    name="BQWriteDisposition",\n    enum_values=[\n        EnumValue(WriteDisposition.WRITE_APPEND),\n        EnumValue(WriteDisposition.WRITE_EMPTY),\n        EnumValue(WriteDisposition.WRITE_TRUNCATE),\n    ],\n)\n\nBQEncoding = Enum(\n    name="BQEncoding", enum_values=[EnumValue(Encoding.ISO_8859_1), EnumValue(Encoding.UTF_8)]\n)\n\nBQSourceFormat = Enum(\n    name="BQSourceFormat",\n    enum_values=[\n        EnumValue(SourceFormat.AVRO),\n        EnumValue(SourceFormat.CSV),\n        EnumValue(SourceFormat.DATASTORE_BACKUP),\n        EnumValue(SourceFormat.NEWLINE_DELIMITED_JSON),\n        EnumValue(SourceFormat.ORC),\n        EnumValue(SourceFormat.PARQUET),\n    ],\n)\n\n\n# Project names are permitted to have alphanumeric, dashes and underscores, up to 1024 characters.\nRE_PROJECT = r"[\\w\\d\\-\\_]{1,1024}"\n\n# Datasets and tables are permitted to have alphanumeric or underscores, no dashes allowed, up to\n# 1024 characters\nRE_DS_TABLE = r"[\\w\\d\\_]{1,1024}"\n\n# BigQuery supports writes directly to date partitions with the syntax foo.bar$20190101\nRE_PARTITION_SUFFIX = r"(\\$\\d{8})?"\n\n\ndef _is_valid_dataset(config_value):\n    """Datasets must be of form "project.dataset" or "dataset" """\n    return re.match(\n        # regex matches: project.dataset -- OR -- dataset\n        r"^" + RE_PROJECT + r"\\." + RE_DS_TABLE + r"$|^" + RE_DS_TABLE + r"$",\n        config_value,\n    )\n\n\ndef _is_valid_table(config_value):\n    """Tables must be of form "project.dataset.table" or "dataset.table" with optional\n    date-partition suffix\n    """\n    return re.match(\n        r"^"\n        + RE_PROJECT  #          project\n        + r"\\."  #               .\n        + RE_DS_TABLE  #         dataset\n        + r"\\."  #               .\n        + RE_DS_TABLE  #         table\n        + RE_PARTITION_SUFFIX  # date partition suffix\n        + r"$|^"  #              -- OR --\n        + RE_DS_TABLE  #         dataset\n        + r"\\."  
#               .\n        + RE_DS_TABLE  #         table\n        + RE_PARTITION_SUFFIX  # date partition suffix\n        + r"$",\n        config_value,\n    )\n\n\nclass _Dataset(ConfigScalar):\n    def __init__(self):\n        super(_Dataset, self).__init__(\n            key=type(self).__name__,\n            given_name=type(self).__name__,\n            scalar_kind=ConfigScalarKind.STRING,\n        )\n\n    def post_process(self, value):\n        if not _is_valid_dataset(value):\n            raise PostProcessingError('Datasets must be of the form "project.dataset" or "dataset"')\n        return value\n\n\nclass _Table(ConfigScalar):\n    def __init__(self):\n        super(_Table, self).__init__(\n            key=type(self).__name__,\n            given_name=type(self).__name__,\n            scalar_kind=ConfigScalarKind.STRING,\n        )\n\n    def post_process(self, value):\n        if not _is_valid_table(value):\n            raise PostProcessingError(\n                (\n                    'Tables must be of the form "project.dataset.table" or "dataset.table" '\n                    "with optional date-partition suffix"\n                )\n            )\n\n        return value\n\n\n# https://github.com/dagster-io/dagster/issues/1971\nTable = _Table()\nDataset = _Dataset()\n\n\n
[docs]class BigQueryError(Exception):\n pass
\n
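For illustration, the validation helpers above accept ``project.dataset`` or ``dataset`` (and ``project.dataset.table`` or ``dataset.table``, optionally with a ``$YYYYMMDD`` partition suffix); dashes are only permitted in the project segment:

.. code-block:: python

    _is_valid_dataset("my_project.my_dataset")    # matches
    _is_valid_dataset("my_dataset")               # matches
    _is_valid_dataset("my-project.my-dataset")    # None: dashes are not allowed in dataset names

    _is_valid_table("my_project.my_dataset.my_table$20190101")  # matches, with partition suffix
    _is_valid_table("my_table")                                  # None: at least dataset.table is required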
", "current_page_name": "_modules/dagster_gcp/bigquery/types", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_gcp.bigquery.types"}}, "dataproc": {"ops": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_gcp.dataproc.ops

\nfrom dagster import Bool, Field, Int, op, solid\nfrom dagster.seven import json\n\nfrom .configs import define_dataproc_submit_job_config\nfrom .resources import TWENTY_MINUTES\n\nDATAPROC_CONFIG_SCHEMA = {\n    "job_timeout_in_seconds": Field(\n        Int,\n        description="""Optional. Maximum time in seconds to wait for the job being\n                    completed. Default is set to 1200 seconds (20 minutes).\n                    """,\n        is_required=False,\n        default_value=TWENTY_MINUTES,\n    ),\n    "job_config": define_dataproc_submit_job_config(),\n    "job_scoped_cluster": Field(\n        Bool,\n        description="whether to create a cluster or use an existing cluster",\n        is_required=False,\n        default_value=True,\n    ),\n}\n\n\ndef _dataproc_compute(context):\n    job_config = context.solid_config["job_config"]\n    job_timeout = context.solid_config["job_timeout_in_seconds"]\n\n    context.log.info(\n        "submitting job with config: %s and timeout of: %d seconds"\n        % (str(json.dumps(job_config)), job_timeout)\n    )\n\n    if context.solid_config["job_scoped_cluster"]:\n        # Cluster context manager, creates and then deletes cluster\n        with context.resources.dataproc.cluster_context_manager() as cluster:\n            # Submit the job specified by this solid to the cluster defined by the associated resource\n            result = cluster.submit_job(job_config)\n\n            job_id = result["reference"]["jobId"]\n            context.log.info("Submitted job ID {}".format(job_id))\n            cluster.wait_for_job(job_id, wait_timeout=job_timeout)\n\n    else:\n        # Submit to an existing cluster\n        # Submit the job specified by this solid to the cluster defined by the associated resource\n        result = context.resources.dataproc.submit_job(job_config)\n\n        job_id = result["reference"]["jobId"]\n        context.log.info("Submitted job ID {}".format(job_id))\n        context.resources.dataproc.wait_for_job(job_id, wait_timeout=job_timeout)\n\n\n
[docs]@solid(required_resource_keys={"dataproc"}, config_schema=DATAPROC_CONFIG_SCHEMA)\ndef dataproc_solid(context):\n return _dataproc_compute(context)
\n\n\n
[docs]@op(required_resource_keys={"dataproc"}, config_schema=DATAPROC_CONFIG_SCHEMA)\ndef dataproc_op(context):\n return _dataproc_compute(context)
\n
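Both ``dataproc_solid`` and ``dataproc_op`` read ``job_scoped_cluster``, ``job_timeout_in_seconds`` and ``job_config`` from their config. A hedged sketch of run config for the op form; the nested ``job_config`` shape comes from ``define_dataproc_submit_job_config``, which is not shown on this page, so it is left as an empty placeholder here:

.. code-block:: python

    from dagster import job
    from dagster_gcp.dataproc.ops import dataproc_op
    from dagster_gcp.dataproc.resources import dataproc_resource

    @job(resource_defs={"dataproc": dataproc_resource})
    def my_dataproc_job():
        dataproc_op()

    run_config = {
        "ops": {
            "dataproc_op": {
                "config": {
                    "job_scoped_cluster": True,      # create and tear down a cluster for this job
                    "job_timeout_in_seconds": 1200,  # the TWENTY_MINUTES default
                    "job_config": {},                # shape defined by define_dataproc_submit_job_config()
                }
            }
        }
    }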
", "current_page_name": "_modules/dagster_gcp/dataproc/ops", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_gcp.dataproc.ops"}, "resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_gcp.dataproc.resources

\nimport time\nfrom contextlib import contextmanager\n\nfrom googleapiclient.discovery import build\nfrom oauth2client.client import GoogleCredentials\n\nfrom dagster import resource\n\nfrom .configs import define_dataproc_create_cluster_config\nfrom .types import DataprocError\n\nTWENTY_MINUTES = 20 * 60\nDEFAULT_ITER_TIME_SEC = 5\n\n\nclass DataprocResource:\n    """Builds a client to the dataproc API."""\n\n    def __init__(self, config):\n        # Use Application Default Credentials to check the\n        # GOOGLE_APPLICATION_CREDENTIALS environment variable\n        # for the location of the service account key file.\n        credentials = GoogleCredentials.get_application_default()\n\n        # See https://github.com/googleapis/google-api-python-client/issues/299 for the\n        # cache_discovery=False configuration below\n        self.dataproc = build("dataproc", "v1", credentials=credentials, cache_discovery=False)\n\n        self.config = config\n\n        (self.project_id, self.region, self.cluster_name, self.cluster_config) = (\n            self.config.get(k) for k in ("projectId", "region", "clusterName", "cluster_config")\n        )\n\n    @property\n    def dataproc_clusters(self):\n        return (\n            # Google APIs dynamically genned, so pylint pukes\n            # pylint: disable=no-member\n            self.dataproc.projects()\n            .regions()\n            .clusters()\n        )\n\n    @property\n    def dataproc_jobs(self):\n        return (\n            # Google APIs dynamically genned, so pylint pukes\n            # pylint: disable=no-member\n            self.dataproc.projects()\n            .regions()\n            .jobs()\n        )\n\n    def create_cluster(self):\n        (\n            self.dataproc_clusters.create(\n                projectId=self.project_id,\n                region=self.region,\n                body={\n                    "projectId": self.project_id,\n                    "clusterName": self.cluster_name,\n                    "config": self.cluster_config,\n                },\n            ).execute()\n        )\n\n        def iter_fn():\n            # TODO: Add logging\n            # See: https://bit.ly/2UW5JaN\n            cluster = self.get_cluster()\n            return cluster["status"]["state"] in {"RUNNING", "UPDATING"}\n\n        done = DataprocResource._iter_and_sleep_until_ready(iter_fn)\n        if not done:\n            cluster = self.get_cluster()\n            raise DataprocError(\n                "Could not provision cluster -- status: %s" % str(cluster["status"])\n            )\n\n    def get_cluster(self):\n        return self.dataproc_clusters.get(\n            projectId=self.project_id, region=self.region, clusterName=self.cluster_name\n        ).execute()\n\n    def delete_cluster(self):\n        return self.dataproc_clusters.delete(\n            projectId=self.project_id, region=self.region, clusterName=self.cluster_name\n        ).execute()\n\n    def submit_job(self, job_details):\n        return self.dataproc_jobs.submit(\n            projectId=self.project_id, region=self.region, body=job_details\n        ).execute()\n\n    def get_job(self, job_id):\n        return self.dataproc_jobs.get(\n            projectId=self.project_id, region=self.region, jobId=job_id\n        ).execute()\n\n    def wait_for_job(self, job_id, wait_timeout=TWENTY_MINUTES):\n        """This method polls job status every 5 seconds"""\n        # TODO: Add logging here print('Waiting for job ID {} to finish...'.format(job_id))\n        
def iter_fn():\n            # See: https://bit.ly/2Lg2tHr\n            result = self.get_job(job_id)\n\n            # Handle exceptions\n            if result["status"]["state"] in {"CANCELLED", "ERROR"}:\n                raise DataprocError("Job error: %s" % str(result["status"]))\n\n            if result["status"]["state"] == "DONE":\n                return True\n\n            return False\n\n        done = DataprocResource._iter_and_sleep_until_ready(iter_fn, max_wait_time_sec=wait_timeout)\n        if not done:\n            job = self.get_job(job_id)\n            raise DataprocError("Job run timed out: %s" % str(job["status"]))\n\n    @staticmethod\n    def _iter_and_sleep_until_ready(\n        callable_fn, max_wait_time_sec=TWENTY_MINUTES, iter_time=DEFAULT_ITER_TIME_SEC\n    ):\n        """Iterates and sleeps until callable_fn returns true"""\n        # Wait for cluster ready state\n        ready, curr_iter = False, 0\n        max_iter = max_wait_time_sec / iter_time\n        while not ready and curr_iter < max_iter:\n            ready = callable_fn()\n            time.sleep(iter_time)\n            curr_iter += 1\n\n        # Will return false if ran up to max_iter without success\n        return ready\n\n    @contextmanager\n    def cluster_context_manager(self):\n        """This context manager gives syntactic sugar so you can run:\n\n        with context.resources.dataproc.cluster as cluster:\n            # do stuff...\n        """\n        self.create_cluster()\n        try:\n            yield self\n        finally:\n            self.delete_cluster()\n\n\n
[docs]@resource(\n config_schema=define_dataproc_create_cluster_config(),\n description="Manage a Dataproc cluster resource",\n)\ndef dataproc_resource(context):\n return DataprocResource(context.resource_config)
\n
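Within an op, the resource is typically used the way ``_dataproc_compute`` in ``dagster_gcp.dataproc.ops`` uses it: open a job-scoped cluster, submit the job, and wait for it to finish. A condensed sketch of that pattern (the job body is abbreviated):

.. code-block:: python

    from dagster import op


    @op(required_resource_keys={"dataproc"})
    def run_my_dataproc_job(context):
        job_config = {}  # Dataproc submit-job body, abbreviated here
        with context.resources.dataproc.cluster_context_manager() as cluster:
            result = cluster.submit_job(job_config)
            job_id = result["reference"]["jobId"]
            cluster.wait_for_job(job_id, wait_timeout=20 * 60)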
", "current_page_name": "_modules/dagster_gcp/dataproc/resources", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_gcp.dataproc.resources"}}, "gcs": {"file_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_gcp.gcs.file_manager

\nimport io\nimport uuid\nfrom contextlib import contextmanager\n\nfrom google.cloud import storage  # type: ignore\n\nimport dagster._check as check\nfrom dagster.core.storage.file_manager import (\n    FileHandle,\n    FileManager,\n    TempfileManager,\n    check_file_like_obj,\n)\n\n\n
[docs]class GCSFileHandle(FileHandle):\n """A reference to a file on GCS."""\n\n def __init__(self, gcs_bucket: str, gcs_key: str):\n self._gcs_bucket = check.str_param(gcs_bucket, "gcs_bucket")\n self._gcs_key = check.str_param(gcs_key, "gcs_key")\n\n @property\n def gcs_bucket(self) -> str:\n """str: The name of the GCS bucket."""\n return self._gcs_bucket\n\n @property\n def gcs_key(self) -> str:\n """str: The GCS key."""\n return self._gcs_key\n\n @property\n def path_desc(self) -> str:\n """str: The file's GCS URL."""\n return self.gcs_path\n\n @property\n def gcs_path(self) -> str:\n """str: The file's GCS URL."""\n return "gs://{bucket}/{key}".format(bucket=self.gcs_bucket, key=self.gcs_key)
\n\n\nclass GCSFileManager(FileManager):\n def __init__(self, client, gcs_bucket, gcs_base_key):\n self._client = check.inst_param(client, "client", storage.client.Client)\n self._gcs_bucket = check.str_param(gcs_bucket, "gcs_bucket")\n self._gcs_base_key = check.str_param(gcs_base_key, "gcs_base_key")\n self._local_handle_cache = {}\n self._temp_file_manager = TempfileManager()\n\n def copy_handle_to_local_temp(self, file_handle):\n self._download_if_not_cached(file_handle)\n return self._get_local_path(file_handle)\n\n def _download_if_not_cached(self, file_handle):\n if not self._file_handle_cached(file_handle):\n # instigate download\n temp_file_obj = self._temp_file_manager.tempfile()\n temp_name = temp_file_obj.name\n bucket_obj = self._client.bucket(file_handle.gcs_bucket)\n bucket_obj.blob(file_handle.gcs_key).download_to_file(temp_file_obj)\n self._local_handle_cache[file_handle.gcs_path] = temp_name\n\n return file_handle\n\n @contextmanager\n def read(self, file_handle, mode="rb"):\n check.inst_param(file_handle, "file_handle", GCSFileHandle)\n check.str_param(mode, "mode")\n check.param_invariant(mode in {"r", "rb"}, "mode")\n\n self._download_if_not_cached(file_handle)\n\n encoding = None if mode == "rb" else "utf-8"\n with open(self._get_local_path(file_handle), mode, encoding=encoding) as file_obj:\n yield file_obj\n\n def _file_handle_cached(self, file_handle):\n return file_handle.gcs_path in self._local_handle_cache\n\n def _get_local_path(self, file_handle):\n return self._local_handle_cache[file_handle.gcs_path]\n\n def read_data(self, file_handle):\n with self.read(file_handle, mode="rb") as file_obj:\n return file_obj.read()\n\n def write_data(self, data, ext=None):\n check.inst_param(data, "data", bytes)\n return self.write(io.BytesIO(data), mode="wb", ext=ext)\n\n def write(self, file_obj, mode="wb", ext=None):\n check_file_like_obj(file_obj)\n gcs_key = self.get_full_key(str(uuid.uuid4()) + (("." + ext) if ext is not None else ""))\n bucket_obj = self._client.bucket(self._gcs_bucket)\n bucket_obj.blob(gcs_key).upload_from_file(file_obj)\n return GCSFileHandle(self._gcs_bucket, gcs_key)\n\n def get_full_key(self, file_key):\n return "{base_key}/{file_key}".format(base_key=self._gcs_base_key, file_key=file_key)\n\n def delete_local_temp(self):\n self._temp_file_manager.close()\n
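``GCSFileManager`` itself has a small surface: ``write``/``write_data`` upload under ``<gcs_base_key>/<uuid>[.ext]`` and return a ``GCSFileHandle``, while ``read``/``read_data`` download to a cached local temp file. A hedged sketch of direct use with a hypothetical bucket:

.. code-block:: python

    from google.cloud import storage

    from dagster_gcp.gcs.file_manager import GCSFileManager

    file_manager = GCSFileManager(
        client=storage.Client(),
        gcs_bucket="my-bucket",
        gcs_base_key="dagster/files",
    )

    handle = file_manager.write_data(b"hello, GCS")  # returns a GCSFileHandle
    print(handle.gcs_path)                           # gs://my-bucket/dagster/files/<uuid>
    assert file_manager.read_data(handle) == b"hello, GCS"
    file_manager.delete_local_temp()                 # clean up the cached temp files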
", "current_page_name": "_modules/dagster_gcp/gcs/file_manager", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_gcp.gcs.file_manager"}, "io_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_gcp.gcs.io_manager

\nimport pickle\nfrom typing import Union\n\nfrom google.api_core.exceptions import Forbidden, TooManyRequests\nfrom google.cloud import storage  # type: ignore\n\nfrom dagster import Field, IOManager, InputContext, OutputContext, StringSource\nfrom dagster import _check as check\nfrom dagster import io_manager\nfrom dagster.utils import PICKLE_PROTOCOL\nfrom dagster.utils.backoff import backoff\n\nDEFAULT_LEASE_DURATION = 60  # One minute\n\n\nclass PickledObjectGCSIOManager(IOManager):\n    def __init__(self, bucket, client=None, prefix="dagster"):\n        self.bucket = check.str_param(bucket, "bucket")\n        self.client = client or storage.Client()\n        self.bucket_obj = self.client.bucket(bucket)\n        check.invariant(self.bucket_obj.exists())\n        self.prefix = check.str_param(prefix, "prefix")\n\n    def _get_path(self, context: Union[InputContext, OutputContext]) -> str:\n        parts = context.get_identifier()\n        run_id = parts[0]\n        output_parts = parts[1:]\n        return "/".join([self.prefix, "storage", run_id, "files", *output_parts])\n\n    def _rm_object(self, key):\n        check.str_param(key, "key")\n        check.param_invariant(len(key) > 0, "key")\n\n        if self.bucket_obj.blob(key).exists():\n            self.bucket_obj.blob(key).delete()\n\n    def _has_object(self, key):\n        check.str_param(key, "key")\n        check.param_invariant(len(key) > 0, "key")\n        blobs = self.client.list_blobs(self.bucket, prefix=key)\n        return len(list(blobs)) > 0\n\n    def _uri_for_key(self, key):\n        check.str_param(key, "key")\n        return "gs://" + self.bucket + "/" + "{key}".format(key=key)\n\n    def load_input(self, context):\n        key = self._get_path(context)\n        context.log.debug(f"Loading GCS object from: {self._uri_for_key(key)}")\n\n        bytes_obj = self.bucket_obj.blob(key).download_as_bytes()\n        obj = pickle.loads(bytes_obj)\n\n        return obj\n\n    def handle_output(self, context, obj):\n        key = self._get_path(context)\n        context.log.debug(f"Writing GCS object at: {self._uri_for_key(key)}")\n\n        if self._has_object(key):\n            context.log.warning(f"Removing existing GCS key: {key}")\n            self._rm_object(key)\n\n        pickled_obj = pickle.dumps(obj, PICKLE_PROTOCOL)\n\n        backoff(\n            self.bucket_obj.blob(key).upload_from_string,\n            args=[pickled_obj],\n            retry_on=(TooManyRequests, Forbidden),\n        )\n\n\n
[docs]@io_manager(\n config_schema={\n "gcs_bucket": Field(StringSource),\n "gcs_prefix": Field(StringSource, is_required=False, default_value="dagster"),\n },\n required_resource_keys={"gcs"},\n)\ndef gcs_pickle_io_manager(init_context):\n """Persistent IO manager using GCS for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for GCS and the backing bucket.\n\n Attach this resource definition to your job to make it available to your ops.\n\n .. code-block:: python\n\n @job(resource_defs={'io_manager': gcs_pickle_io_manager, 'gcs': gcs_resource, ...})\n def my_job():\n my_op()\n\n You may configure this storage as follows:\n\n .. code-block:: YAML\n\n resources:\n io_manager:\n config:\n gcs_bucket: my-cool-bucket\n gcs_prefix: good/prefix-for-files-\n """\n client = init_context.resources.gcs\n pickled_io_manager = PickledObjectGCSIOManager(\n init_context.resource_config["gcs_bucket"],\n client,\n init_context.resource_config["gcs_prefix"],\n )\n return pickled_io_manager
\n\n\nclass PickledObjectGCSAssetIOManager(PickledObjectGCSIOManager):\n def _get_path(self, context: Union[InputContext, OutputContext]) -> str:\n return "/".join([self.prefix, *context.get_asset_identifier()])\n\n\n
[docs]@io_manager(\n config_schema={\n "gcs_bucket": Field(StringSource),\n "gcs_prefix": Field(StringSource, is_required=False, default_value="dagster"),\n },\n required_resource_keys={"gcs"},\n)\ndef gcs_pickle_asset_io_manager(init_context):\n """Persistent IO manager using GCS for storage, meant for use with software-defined assets.\n\n Each asset is assigned to a single filesystem path, so subsequent materializations of an asset\n will overwrite previous materializations of that asset.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for GCS and the backing bucket.\n\n Attach this resource definition to your job to make it available to your ops.\n\n .. code-block:: python\n\n asset_group = AssetGroup(\n assets...,\n resource_defs={'io_manager': gcs_pickle_asset_io_manager, "gcs": gcs_resource, ...}),\n )\n\n You may configure this IO manager as follows:\n\n .. code-block:: YAML\n\n resources:\n io_manager:\n config:\n gcs_bucket: my-cool-bucket\n gcs_prefix: good/prefix-for-files-\n """\n client = init_context.resources.gcs\n pickled_io_manager = PickledObjectGCSAssetIOManager(\n init_context.resource_config["gcs_bucket"],\n client,\n init_context.resource_config["gcs_prefix"],\n )\n return pickled_io_manager
\n
", "current_page_name": "_modules/dagster_gcp/gcs/io_manager", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_gcp.gcs.io_manager"}, "resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_gcp.gcs.resources

\nfrom google.cloud import storage  # type: ignore\n\nfrom dagster import Field, Noneable, StringSource, resource\nfrom dagster.utils.merger import merge_dicts\n\nfrom .file_manager import GCSFileManager\n\nGCS_CLIENT_CONFIG = {\n    "project": Field(Noneable(StringSource), is_required=False, description="Project name")\n}\n\n\n
[docs]@resource(\n GCS_CLIENT_CONFIG,\n description="This resource provides a GCS client",\n)\ndef gcs_resource(init_context):\n return _gcs_client_from_config(init_context.resource_config)
\n\n\n
[docs]@resource(\n merge_dicts(\n GCS_CLIENT_CONFIG,\n {\n "gcs_bucket": Field(StringSource),\n "gcs_prefix": Field(StringSource, is_required=False, default_value="dagster"),\n },\n )\n)\ndef gcs_file_manager(context):\n """FileManager that provides abstract access to GCS.\n\n Implements the :py:class:`~dagster.core.storage.file_manager.FileManager` API.\n """\n gcs_client = _gcs_client_from_config(context.resource_config)\n return GCSFileManager(\n client=gcs_client,\n gcs_bucket=context.resource_config["gcs_bucket"],\n gcs_base_key=context.resource_config["gcs_prefix"],\n )
\n\n\ndef _gcs_client_from_config(config):\n """\n Args:\n config: A configuration containing the fields in GCS_CLIENT_CONFIG.\n\n Returns: A GCS client.\n """\n project = config.get("project", None)\n return storage.client.Client(project=project)\n
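A hedged sketch of wiring ``gcs_file_manager`` into an op; the resource key ``file_manager``, the bucket, and the prefix are illustrative placeholders, and application-default GCS credentials are assumed.

.. code-block:: python

    from dagster import job, op
    from dagster_gcp import gcs_file_manager


    @op(required_resource_keys={"file_manager"})
    def archive_report(context):
        # write_data uploads the bytes and returns a GCSFileHandle pointing at the blob.
        handle = context.resources.file_manager.write_data(b"report body", ext="csv")
        context.log.info(f"Uploaded report to {handle.gcs_path}")


    @job(
        resource_defs={
            "file_manager": gcs_file_manager.configured(
                {"gcs_bucket": "my-bucket", "gcs_prefix": "reports"}
            )
        }
    )
    def archive_job():
        archive_report()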
", "current_page_name": "_modules/dagster_gcp/gcs/resources", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_gcp.gcs.resources"}}}, "dagster_ge": {"factory": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_ge.factory

\nimport datetime\n\nimport great_expectations as ge\nfrom dagster_pandas import DataFrame\nfrom great_expectations.render.renderer import ValidationResultsPageRenderer\nfrom great_expectations.render.view import DefaultMarkdownPageView\n\nfrom dagster import (\n    ExpectationResult,\n    InputDefinition,\n    MetadataEntry,\n    MetadataValue,\n    Noneable,\n    Output,\n    OutputDefinition,\n    StringSource,\n)\nfrom dagster import _check as check\nfrom dagster import op, resource, solid\n\ntry:\n    # ge < v0.13.0\n    from great_expectations.core import convert_to_json_serializable\nexcept ImportError:\n    # ge >= v0.13.0\n    from great_expectations.core.util import convert_to_json_serializable\n\n\n@resource(config_schema={"ge_root_dir": Noneable(StringSource)})\ndef ge_data_context(context):\n    if context.resource_config["ge_root_dir"] is None:\n        yield ge.data_context.DataContext()\n    else:\n        yield ge.data_context.DataContext(context_root_dir=context.resource_config["ge_root_dir"])\n\n\ndef core_ge_validation_factory(\n    dagster_decorator,\n    decorator_name,\n    name,\n    datasource_name,\n    suite_name,\n    validation_operator_name=None,\n    input_dagster_type=DataFrame,\n    batch_kwargs=None,\n):\n    check.str_param(datasource_name, "datasource_name")\n    check.str_param(suite_name, "suite_name")\n    check.opt_str_param(validation_operator_name, "validation_operator_name")\n    batch_kwargs = check.opt_dict_param(batch_kwargs, "batch_kwargs")\n\n    @dagster_decorator(\n        name=name,\n        input_defs=[InputDefinition("dataset", input_dagster_type)],\n        output_defs=[\n            OutputDefinition(\n                dagster_type=dict,\n                description=f"""\n        This {decorator_name} yields an expectationResult with a structured dict of metadata from\n        the GE suite, as well as the full result in case a user wants to process it differently.\n        The structured dict contains both summary stats from the suite as well as expectation by\n        expectation results/details.\n        """,\n            )\n        ],\n        required_resource_keys={"ge_data_context"},\n        tags={"kind": "ge"},\n    )\n    def _ge_validation_fn(context, dataset):\n        data_context = context.resources.ge_data_context\n        if validation_operator_name is not None:\n            validation_operator = validation_operator_name\n        else:\n            data_context.add_validation_operator(\n                "ephemeral_validation",\n                {"class_name": "ActionListValidationOperator", "action_list": []},\n            )\n            validation_operator = "ephemeral_validation"\n        suite = data_context.get_expectation_suite(suite_name)\n        final_batch_kwargs = batch_kwargs or {"dataset": dataset}\n        if "datasource" in batch_kwargs:\n            context.log.warning(\n                "`datasource` field of `batch_kwargs` will be ignored; use the `datasource_name` "\n                f"parameter of the {decorator_name} factory instead."\n            )\n        final_batch_kwargs["datasource"] = datasource_name\n        batch = data_context.get_batch(final_batch_kwargs, suite)\n        run_id = {\n            "run_name": datasource_name + " run",\n            "run_time": datetime.datetime.utcnow(),\n        }\n        results = data_context.run_validation_operator(\n            validation_operator, assets_to_validate=[batch], run_id=run_id\n        )\n        res = 
convert_to_json_serializable(results.list_validation_results())[0]\n        validation_results_page_renderer = ValidationResultsPageRenderer(run_info_at_end=True)\n        rendered_document_content_list = (\n            validation_results_page_renderer.render_validation_operator_result(results)\n        )\n        md_str = " ".join(DefaultMarkdownPageView().render(rendered_document_content_list))\n\n        meta_stats = MetadataEntry("Expectation Results", value=MetadataValue.md(md_str))\n        yield ExpectationResult(\n            success=res["success"],\n            metadata_entries=[\n                meta_stats,\n            ],\n        )\n        yield Output(res)\n\n    return _ge_validation_fn\n\n\n
[docs]def ge_validation_solid_factory(\n name,\n datasource_name,\n suite_name,\n validation_operator_name=None,\n input_dagster_type=DataFrame,\n batch_kwargs=None,\n):\n """Generates solids for interacting with GE.\n\n Args:\n name (str): the name of the solid\n datasource_name (str): the name of your DataSource, see your great_expectations.yml\n suite_name (str): the name of your expectation suite, see your great_expectations.yml\n validation_operator_name (Optional[str]): what validation operator to run -- defaults to None,\n which generates an ephemeral validator.\n If you want to save data docs, use 'action_list_operator'.\n See https://docs.greatexpectations.io/en/latest/reference/core_concepts/validation_operators_and_actions.html\n input_dagster_type (DagsterType): the Dagster type used to type check the input to the\n solid. Defaults to `dagster_pandas.DataFrame`.\n batch_kwargs (Optional[dict]): overrides the `batch_kwargs` parameter when calling the\n `ge_data_context`'s `get_batch` method. Defaults to `{"dataset": dataset}`,\n where `dataset` is the input to the generated solid.\n Returns:\n A solid that takes in a set of data and yields both an expectation with relevant metadata\n and an output with all the metadata (for user processing)\n """\n\n return core_ge_validation_factory(\n solid,\n "solid",\n name,\n datasource_name,\n suite_name,\n validation_operator_name,\n input_dagster_type,\n batch_kwargs,\n )
\n\n\n
[docs]def ge_validation_op_factory(\n    name,\n    datasource_name,\n    suite_name,\n    validation_operator_name=None,\n    input_dagster_type=DataFrame,\n    batch_kwargs=None,\n):\n    &quot;&quot;&quot;Generates ops for interacting with GE.\n\n    Args:\n        name (str): the name of the op\n        datasource_name (str): the name of your DataSource, see your great_expectations.yml\n        suite_name (str): the name of your expectation suite, see your great_expectations.yml\n        validation_operator_name (Optional[str]): what validation operator to run -- defaults to\n            None, which generates an ephemeral validator. If you want to save data docs, use\n            &#39;action_list_operator&#39;.\n            See https://docs.greatexpectations.io/en/latest/reference/core_concepts/validation_operators_and_actions.html\n        input_dagster_type (DagsterType): the Dagster type used to type check the input to the op.\n            Defaults to `dagster_pandas.DataFrame`.\n        batch_kwargs (Optional[dict]): overrides the `batch_kwargs` parameter when calling the\n            `ge_data_context`&#39;s `get_batch` method. Defaults to `{&quot;dataset&quot;: dataset}`, where\n            `dataset` is the input to the generated op.\n    Returns:\n        An op that takes in a set of data and yields both an expectation with relevant metadata\n        and an output with all the metadata (for user processing)\n    &quot;&quot;&quot;\n\n    return core_ge_validation_factory(\n        op,\n        &quot;op&quot;,\n        name,\n        datasource_name,\n        suite_name,\n        validation_operator_name,\n        input_dagster_type,\n        batch_kwargs,\n    )
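A sketch of using the factory, assuming an existing Great Expectations project: the datasource name, suite name, and ``ge_root_dir`` below are placeholders that must match your ``great_expectations.yml``.

.. code-block:: python

    import pandas as pd

    from dagster import job, op
    from dagster_ge.factory import ge_data_context, ge_validation_op_factory

    # Must match entries in your great_expectations.yml (placeholder names here).
    validate_payroll = ge_validation_op_factory(
        name="validate_payroll",
        datasource_name="my_pandas_datasource",
        suite_name="basic.warning",
    )


    @op
    def load_payroll() -> pd.DataFrame:
        return pd.DataFrame({"employee": ["a", "b"], "amount": [100, 200]})


    @job(
        resource_defs={
            "ge_data_context": ge_data_context.configured(
                {"ge_root_dir": "great_expectations"}  # placeholder project root
            )
        }
    )
    def payroll_validation_job():
        validate_payroll(load_payroll())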
\n\n\ndef core_ge_validation_factory_v3(\n dagster_decorator,\n decorator_name,\n name,\n datasource_name,\n data_connector_name,\n data_asset_name,\n suite_name,\n batch_identifiers: dict,\n input_dagster_type=DataFrame,\n runtime_method_type="batch_data",\n extra_kwargs=None,\n):\n check.str_param(datasource_name, "datasource_name")\n check.str_param(data_connector_name, "data_connector_name")\n check.str_param(suite_name, "suite_name")\n\n extra_kwargs = check.opt_dict_param(extra_kwargs, "extra_kwargs")\n\n @dagster_decorator(\n name=name,\n input_defs=[InputDefinition("dataset", input_dagster_type)],\n output_defs=[\n OutputDefinition(\n dagster_type=dict,\n description=f"""\n This {decorator_name} yields an ExpectationResult with a structured dict of metadata from\n the GE suite, as well as the full result in case a user wants to process it differently.\n The structured dict contains both summary stats from the suite as well as expectation by\n expectation results/details.\n """,\n )\n ],\n required_resource_keys={"ge_data_context"},\n tags={"kind": "ge"},\n )\n def _ge_validation_fn(context, dataset):\n data_context = context.resources.ge_data_context\n validator_kwargs = {\n "datasource_name": datasource_name,\n "data_connector_name": data_connector_name,\n "data_asset_name": datasource_name or data_asset_name,\n "runtime_parameters": {runtime_method_type: dataset},\n "batch_identifiers": batch_identifiers,\n "expectation_suite_name": suite_name,\n **extra_kwargs,\n }\n validator = data_context.get_validator(**validator_kwargs)\n\n run_id = {\n "run_name": datasource_name + " run",\n "run_time": datetime.datetime.utcnow(),\n }\n results = validator.validate(run_id=run_id)\n\n validation_results_page_renderer = ValidationResultsPageRenderer(run_info_at_end=True)\n rendered_document_content_list = validation_results_page_renderer.render(\n validation_results=results\n )\n md_str = "".join(DefaultMarkdownPageView().render(rendered_document_content_list))\n\n meta_stats = MetadataEntry("Expectation Results", value=MetadataValue.md(md_str))\n yield ExpectationResult(\n success=bool(results["success"]),\n metadata_entries=[meta_stats],\n )\n yield Output(results.to_json_dict())\n\n return _ge_validation_fn\n\n\ndef ge_validation_solid_factory_v3(\n name,\n datasource_name,\n data_connector_name,\n data_asset_name,\n suite_name,\n batch_identifiers: dict,\n input_dagster_type=DataFrame,\n runtime_method_type="batch_data",\n extra_kwargs=None,\n):\n """Generates solids for interacting with GE (v3 API)\n\n Args:\n name (str): the name of the solid\n datasource_name (str): the name of your DataSource, see your great_expectations.yml\n data_connector_name (str): the name of the data connector for this datasource. This should\n point to a RuntimeDataConnector. For information on how to set this up, see:\n https://docs.greatexpectations.io/docs/guides/connecting_to_your_data/how_to_create_a_batch_of_data_from_an_in_memory_spark_or_pandas_dataframe\n data_asset_name (str): the name of the data asset that this solid will be validating.\n suite_name (str): the name of your expectation suite, see your great_expectations.yml\n batch_identifier_fn (dict): A dicitonary of batch identifiers to uniquely identify this\n batch of data. To learn more about batch identifiers, see:\n https://docs.greatexpectations.io/docs/reference/datasources#batches.\n input_dagster_type (DagsterType): the Dagster type used to type check the input to the\n solid. 
Defaults to `dagster_pandas.DataFrame`.\n runtime_method_type (str): how GE should interperet the solid input. One of ("batch_data",\n "path", "query"). Defaults to "batch_data", which will interperet the input as an in-memory\n object.\n extra_kwargs (Optional[dict]): adds extra kwargs to the invocation of `ge_data_context`'s\n `get_validator` method. If not set, input will be:\n {\n "datasource_name": datasource_name,\n "data_connector_name": data_connector_name,\n "data_asset_name": data_asset_name,\n "runtime_parameters": {\n "<runtime_method_type>": <solid input>\n },\n "batch_identifiers": batch_identifiers,\n "expectation_suite_name": suite_name,\n }\n\n Returns:\n A solid that takes in a set of data and yields both an expectation with relevant metadata\n and an output with all the metadata (for user processing)\n\n """\n return core_ge_validation_factory_v3(\n solid,\n "solid",\n name,\n datasource_name,\n data_connector_name,\n data_asset_name,\n suite_name,\n batch_identifiers,\n input_dagster_type,\n runtime_method_type,\n extra_kwargs,\n )\n\n\ndef ge_validation_op_factory_v3(\n name,\n datasource_name,\n data_connector_name,\n data_asset_name,\n suite_name,\n batch_identifiers: dict,\n input_dagster_type=DataFrame,\n runtime_method_type="batch_data",\n extra_kwargs=None,\n):\n """Generates ops for interacting with GE (v3 API)\n\n Args:\n name (str): the name of the op\n datasource_name (str): the name of your DataSource, see your great_expectations.yml\n data_connector_name (str): the name of the data connector for this datasource. This should\n point to a RuntimeDataConnector. For information on how to set this up, see:\n https://docs.greatexpectations.io/docs/guides/connecting_to_your_data/how_to_create_a_batch_of_data_from_an_in_memory_spark_or_pandas_dataframe\n data_asset_name (str): the name of the data asset that this op will be validating.\n suite_name (str): the name of your expectation suite, see your great_expectations.yml\n batch_identifier_fn (dict): A dicitonary of batch identifiers to uniquely identify this\n batch of data. To learn more about batch identifiers, see:\n https://docs.greatexpectations.io/docs/reference/datasources#batches.\n input_dagster_type (DagsterType): the Dagster type used to type check the input to the op.\n Defaults to `dagster_pandas.DataFrame`.\n runtime_method_type (str): how GE should interperet the solid input. One of ("batch_data",\n "path", "query"). Defaults to "batch_data", which will interperet the input as an\n in-memory object.\n extra_kwargs (Optional[dict]): adds extra kwargs to the invocation of `ge_data_context`'s\n `get_validator` method. If not set, input will be:\n {\n "datasource_name": datasource_name,\n "data_connector_name": data_connector_name,\n "data_asset_name": data_asset_name,\n "runtime_parameters": {\n "<runtime_method_type>": <op input>\n },\n "batch_identifiers": batch_identifiers,\n "expectation_suite_name": suite_name,\n }\n\n Returns:\n An op that takes in a set of data and yields both an expectation with relevant metadata and\n an output with all the metadata (for user processing)\n\n """\n return core_ge_validation_factory_v3(\n op,\n "op",\n name,\n datasource_name,\n data_connector_name,\n data_asset_name,\n suite_name,\n batch_identifiers,\n input_dagster_type,\n runtime_method_type,\n extra_kwargs,\n )\n
", "current_page_name": "_modules/dagster_ge/factory", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_ge.factory"}}, "dagster_github": {"resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_github.resources

\nimport time\nfrom datetime import datetime\n\nimport jwt\nimport requests\n\nfrom dagster import Field, IntSource, StringSource, resource\n\n\ndef to_seconds(dt):\n    return (dt - datetime(1970, 1, 1)).total_seconds()\n\n\nclass GithubResource:\n    def __init__(self, client, app_id, app_private_rsa_key, default_installation_id, hostname=None):\n        self.client = client\n        self.app_private_rsa_key = app_private_rsa_key\n        self.app_id = app_id\n        self.default_installation_id = default_installation_id\n        self.installation_tokens = {}\n        self.app_token = {}\n        self.hostname = hostname\n\n    def __set_app_token(self):\n        # from https://developer.github.com/apps/building-github-apps/authenticating-with-github-apps/\n        # needing to self-sign a JWT\n        now = int(time.time())\n        # JWT expiration time (10 minute maximum)\n        expires = now + (10 * 60)\n        encoded_token = jwt.encode(\n            {\n                # issued at time\n                "iat": now,\n                # JWT expiration time\n                "exp": expires,\n                # GitHub App's identifier\n                "iss": self.app_id,\n            },\n            self.app_private_rsa_key,\n            algorithm="RS256",\n        )\n        self.app_token = {\n            "value": encoded_token,\n            "expires": expires,\n        }\n\n    def __check_app_token(self):\n        if ("expires" not in self.app_token) or (\n            self.app_token["expires"] < (int(time.time()) + 60)\n        ):\n            self.__set_app_token()\n\n    def get_installations(self, headers=None):\n        if headers is None:\n            headers = {}\n        self.__check_app_token()\n        headers["Authorization"] = "Bearer {}".format(self.app_token["value"])\n        headers["Accept"] = "application/vnd.github.machine-man-preview+json"\n        request = self.client.get(\n            "https://api.github.com/app/installations"\n            if self.hostname is None\n            else "https://{}/api/v3/app/installations".format(self.hostname),\n            headers=headers,\n        )\n        request.raise_for_status()\n        return request.json()\n\n    def __set_installation_token(self, installation_id, headers=None):\n        if headers is None:\n            headers = {}\n        self.__check_app_token()\n        headers["Authorization"] = "Bearer {}".format(self.app_token["value"])\n        headers["Accept"] = "application/vnd.github.machine-man-preview+json"\n        request = requests.post(\n            "https://api.github.com/app/installations/{}/access_tokens".format(installation_id)\n            if self.hostname is None\n            else "https://{}/api/v3/app/installations/{}/access_tokens".format(\n                self.hostname, installation_id\n            ),\n            headers=headers,\n        )\n        request.raise_for_status()\n        auth = request.json()\n        self.installation_tokens[installation_id] = {\n            "value": auth["token"],\n            "expires": to_seconds(datetime.strptime(auth["expires_at"], "%Y-%m-%dT%H:%M:%SZ")),\n        }\n\n    def __check_installation_tokens(self, installation_id):\n        if (installation_id not in self.installation_tokens) or (\n            self.installation_tokens[installation_id]["expires"] < (int(time.time()) + 60)\n        ):\n            self.__set_installation_token(installation_id)\n\n    def execute(self, query, variables, headers=None, installation_id=None):\n        if headers 
is None:\n            headers = {}\n        if installation_id is None:\n            installation_id = self.default_installation_id\n        self.__check_installation_tokens(installation_id)\n        headers["Authorization"] = "token {}".format(\n            self.installation_tokens[installation_id]["value"]\n        )\n        request = requests.post(\n            "https://api.github.com/graphql"\n            if self.hostname is None\n            else "https://{}/api/graphql".format(self.hostname),\n            json={"query": query, "variables": variables},\n            headers=headers,\n        )\n        request.raise_for_status()\n        return request.json()\n\n    def create_issue(self, repo_name, repo_owner, title, body, installation_id=None):\n        if installation_id is None:\n            installation_id = self.default_installation_id\n        res = self.execute(\n            query="""\n            query get_repo_id($repo_name: String!, $repo_owner: String!) {\n                repository(name: $repo_name, owner: $repo_owner) {\n                    id\n                }\n            }\n            """,\n            variables={"repo_name": repo_name, "repo_owner": repo_owner},\n            installation_id=installation_id,\n        )\n\n        return self.execute(\n            query="""\n                mutation CreateIssue($id: ID!, $title: String!, $body: String!) {\n                createIssue(input: {\n                    repositoryId: $id,\n                    title: $title,\n                    body: $body\n                }) {\n                    clientMutationId,\n                    issue {\n                        body\n                        title\n                        url\n                    }\n                }\n                }\n            """,\n            variables={\n                "id": res["data"]["repository"]["id"],\n                "title": title,\n                "body": body,\n            },\n            installation_id=installation_id,\n        )\n\n\n
[docs]@resource(\n config_schema={\n "github_app_id": Field(\n IntSource,\n description="Github Application ID, for more info see https://developer.github.com/apps/",\n ),\n "github_app_private_rsa_key": Field(\n StringSource,\n description="Github Application Private RSA key text, for more info see https://developer.github.com/apps/",\n ),\n "github_installation_id": Field(\n IntSource,\n is_required=False,\n description="Github Application Installation ID, for more info see https://developer.github.com/apps/",\n ),\n "github_hostname": Field(\n StringSource,\n is_required=False,\n description="Github hostname. Defaults to `api.github.com`, for more info see https://developer.github.com/apps/",\n ),\n },\n description="This resource is for connecting to Github",\n)\ndef github_resource(context):\n return GithubResource(\n client=requests.Session(),\n app_id=context.resource_config["github_app_id"],\n app_private_rsa_key=context.resource_config["github_app_private_rsa_key"],\n default_installation_id=context.resource_config["github_installation_id"],\n hostname=context.resource_config.get("github_hostname", None),\n )
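A sketch of using the resource from an op, assuming a GitHub App whose private key is available in the ``GITHUB_PRIVATE_KEY`` environment variable; the app id, installation id, and repository coordinates are placeholders.

.. code-block:: python

    from dagster import job, op
    from dagster_github import github_resource


    @op(required_resource_keys={"github"})
    def file_tracking_issue(context):
        # create_issue first resolves the repository id via GraphQL, then opens the issue.
        context.resources.github.create_issue(
            repo_name="my-repo",
            repo_owner="my-org",
            title="Nightly pipeline failed",
            body="See the Dagster run logs for details.",
        )


    @job(
        resource_defs={
            "github": github_resource.configured(
                {
                    "github_app_id": 12345,  # placeholder
                    "github_app_private_rsa_key": {"env": "GITHUB_PRIVATE_KEY"},
                    "github_installation_id": 67890,  # placeholder
                }
            )
        }
    )
    def notify_github_job():
        file_tracking_issue()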
\n
", "current_page_name": "_modules/dagster_github/resources", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_github.resources"}}, "dagster_graphql": {"client": {"client": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_graphql.client.client

\nfrom itertools import chain\nfrom typing import Any, Dict, Iterable, List, Optional\n\nimport requests.exceptions\nfrom gql import Client, gql\nfrom gql.transport import Transport\nfrom gql.transport.requests import RequestsHTTPTransport\n\nimport dagster._check as check\nfrom dagster.core.definitions.utils import validate_tags\nfrom dagster.core.storage.pipeline_run import PipelineRunStatus\nfrom dagster.utils.backcompat import experimental_class_warning\n\nfrom .client_queries import (\n    CLIENT_GET_REPO_LOCATIONS_NAMES_AND_PIPELINES_QUERY,\n    CLIENT_SUBMIT_PIPELINE_RUN_MUTATION,\n    GET_PIPELINE_RUN_STATUS_QUERY,\n    RELOAD_REPOSITORY_LOCATION_MUTATION,\n    SHUTDOWN_REPOSITORY_LOCATION_MUTATION,\n    TERMINATE_RUN_JOB_MUTATION,\n)\nfrom .utils import (\n    DagsterGraphQLClientError,\n    InvalidOutputErrorInfo,\n    PipelineInfo,\n    ReloadRepositoryLocationInfo,\n    ReloadRepositoryLocationStatus,\n    ShutdownRepositoryLocationInfo,\n    ShutdownRepositoryLocationStatus,\n)\n\n\n
[docs]class DagsterGraphQLClient:\n """Official Dagster Python Client for GraphQL\n\n Utilizes the gql library to dispatch queries over HTTP to a remote Dagster GraphQL Server\n\n As of now, all operations on this client are synchronous.\n\n Intended usage:\n\n .. code-block:: python\n\n client = DagsterGraphQLClient("localhost", port_number=3000)\n status = client.get_run_status(**SOME_RUN_ID**)\n\n Args:\n hostname (str): Hostname for the Dagster GraphQL API, like `localhost` or\n `dagit.dagster.YOUR_ORG_HERE`.\n port_number (Optional[int], optional): Optional port number to connect to on the host.\n Defaults to None.\n transport (Optional[Transport], optional): A custom transport to use to connect to the\n GraphQL API with (e.g. for custom auth). Defaults to None.\n use_https (bool, optional): Whether to use https in the URL connection string for the\n GraphQL API. Defaults to False.\n\n Raises:\n :py:class:`~requests.exceptions.ConnectionError`: if the client cannot connect to the host.\n """\n\n def __init__(\n self,\n hostname: str,\n port_number: Optional[int] = None,\n transport: Optional[Transport] = None,\n use_https: bool = False,\n ):\n experimental_class_warning(self.__class__.__name__)\n\n self._hostname = check.str_param(hostname, "hostname")\n self._port_number = check.opt_int_param(port_number, "port_number")\n self._use_https = check.bool_param(use_https, "use_https")\n\n self._url = (\n ("https://" if self._use_https else "http://")\n + (f"{self._hostname}:{self._port_number}" if self._port_number else self._hostname)\n + "/graphql"\n )\n\n self._transport = check.opt_inst_param(\n transport,\n "transport",\n Transport,\n default=RequestsHTTPTransport(url=self._url, use_json=True),\n )\n try:\n self._client = Client(transport=self._transport, fetch_schema_from_transport=True)\n except requests.exceptions.ConnectionError as exc:\n raise DagsterGraphQLClientError(\n f"Error when connecting to url {self._url}. 
"\n + f"Did you specify hostname: {self._hostname} "\n + (f"and port_number: {self._port_number} " if self._port_number else "")\n + "correctly?"\n ) from exc\n\n def _execute(self, query: str, variables: Optional[Dict[str, Any]] = None):\n try:\n return self._client.execute(gql(query), variable_values=variables)\n except Exception as exc: # catch generic Exception from the gql client\n raise DagsterGraphQLClientError(\n f"Exception occured during execution of query \\n{query}\\n with variables \\n{variables}\\n"\n ) from exc\n\n def _get_repo_locations_and_names_with_pipeline(self, pipeline_name: str) -> List[PipelineInfo]:\n res_data = self._execute(CLIENT_GET_REPO_LOCATIONS_NAMES_AND_PIPELINES_QUERY)\n query_res = res_data["repositoriesOrError"]\n repo_connection_status = query_res["__typename"]\n if repo_connection_status == "RepositoryConnection":\n valid_nodes: Iterable[PipelineInfo] = chain(\n *map(PipelineInfo.from_node, query_res["nodes"])\n )\n return [info for info in valid_nodes if info.pipeline_name == pipeline_name]\n else:\n raise DagsterGraphQLClientError(repo_connection_status, query_res["message"])\n\n def _core_submit_execution(\n self,\n pipeline_name: str,\n repository_location_name: Optional[str] = None,\n repository_name: Optional[str] = None,\n run_config: Optional[Any] = None,\n mode: Optional[str] = None,\n preset: Optional[str] = None,\n tags: Optional[Dict[str, Any]] = None,\n solid_selection: Optional[List[str]] = None,\n is_using_job_op_graph_apis: Optional[bool] = False,\n ):\n check.opt_str_param(repository_location_name, "repository_location_name")\n check.opt_str_param(repository_name, "repository_name")\n check.str_param(pipeline_name, "pipeline_name")\n check.opt_str_param(mode, "mode")\n check.opt_str_param(preset, "preset")\n run_config = check.opt_dict_param(run_config, "run_config")\n\n # The following invariant will never fail when a job is executed\n check.invariant(\n (mode is not None and run_config is not None) or preset is not None,\n "Either a mode and run_config or a preset must be specified in order to "\n f"submit the pipeline {pipeline_name} for execution",\n )\n tags = validate_tags(tags)\n\n pipeline_or_job = "Job" if is_using_job_op_graph_apis else "Pipeline"\n\n if not repository_location_name or not repository_name:\n pipeline_info_lst = self._get_repo_locations_and_names_with_pipeline(pipeline_name)\n if len(pipeline_info_lst) == 0:\n raise DagsterGraphQLClientError(\n f"{pipeline_or_job}NotFoundError",\n f"No {'jobs' if is_using_job_op_graph_apis else 'pipelines'} with the name `{pipeline_name}` exist",\n )\n elif len(pipeline_info_lst) == 1:\n pipeline_info = pipeline_info_lst[0]\n repository_location_name = pipeline_info.repository_location_name\n repository_name = pipeline_info.repository_name\n else:\n raise DagsterGraphQLClientError(\n "Must specify repository_location_name and repository_name"\n f" since there are multiple {'jobs' if is_using_job_op_graph_apis else 'pipelines'} with the name {pipeline_name}."\n f"\\n\\tchoose one of: {pipeline_info_lst}"\n )\n\n variables: Dict[str, Any] = {\n "executionParams": {\n "selector": {\n "repositoryLocationName": repository_location_name,\n "repositoryName": repository_name,\n "pipelineName": pipeline_name,\n "solidSelection": solid_selection,\n }\n }\n }\n if preset is not None:\n variables["executionParams"]["preset"] = preset\n if mode is not None and run_config is not None:\n variables["executionParams"] = {\n **variables["executionParams"],\n "runConfigData": run_config,\n 
"mode": mode,\n "executionMetadata": {"tags": [{"key": k, "value": v} for k, v in tags.items()]}\n if tags\n else {},\n }\n\n res_data: Dict[str, Any] = self._execute(CLIENT_SUBMIT_PIPELINE_RUN_MUTATION, variables)\n query_result = res_data["launchPipelineExecution"]\n query_result_type = query_result["__typename"]\n if (\n query_result_type == "LaunchRunSuccess"\n or query_result_type == "LaunchPipelineRunSuccess"\n ):\n return query_result["run"]["runId"]\n elif query_result_type == "InvalidStepError":\n raise DagsterGraphQLClientError(query_result_type, query_result["invalidStepKey"])\n elif query_result_type == "InvalidOutputError":\n error_info = InvalidOutputErrorInfo(\n step_key=query_result["stepKey"],\n invalid_output_name=query_result["invalidOutputName"],\n )\n raise DagsterGraphQLClientError(query_result_type, body=error_info)\n elif (\n query_result_type == "RunConfigValidationInvalid"\n or query_result_type == "PipelineConfigValidationInvalid"\n ):\n raise DagsterGraphQLClientError(query_result_type, query_result["errors"])\n else:\n # query_result_type is a ConflictingExecutionParamsError, a PresetNotFoundError\n # a PipelineNotFoundError, a RunConflict, or a PythonError\n raise DagsterGraphQLClientError(query_result_type, query_result["message"])\n\n
[docs] def submit_pipeline_execution(\n self,\n pipeline_name: str,\n repository_location_name: Optional[str] = None,\n repository_name: Optional[str] = None,\n run_config: Optional[Any] = None,\n mode: Optional[str] = None,\n preset: Optional[str] = None,\n tags: Optional[Dict[str, Any]] = None,\n solid_selection: Optional[List[str]] = None,\n ) -> str:\n """Submits a Pipeline with attached configuration for execution.\n\n Args:\n pipeline_name (str): The pipeline's name\n repository_location_name (Optional[str], optional): The name of the repository location where\n the pipeline is located. If omitted, the client will try to infer the repository location\n from the available options on the Dagster deployment. Defaults to None.\n repository_name (Optional[str], optional): The name of the repository where the pipeline is located.\n If omitted, the client will try to infer the repository from the available options\n on the Dagster deployment. Defaults to None.\n run_config (Optional[Any], optional): This is the run config to execute the pipeline with.\n Note that runConfigData is any-typed in the GraphQL type system. This type is used when passing in\n an arbitrary object for run config. However, it must conform to the constraints of the config\n schema for this pipeline. If it does not, the client will throw a DagsterGraphQLClientError with a message of\n RunConfigValidationInvalid. Defaults to None.\n mode (Optional[str], optional): The mode to run the pipeline with. If you have not\n defined any custom modes for your pipeline, the default mode is "default". Defaults to None.\n preset (Optional[str], optional): The name of a pre-defined preset to use instead of a\n run config. Defaults to None.\n tags (Optional[Dict[str, Any]], optional): A set of tags to add to the pipeline execution.\n\n Raises:\n DagsterGraphQLClientError("InvalidStepError", invalid_step_key): the pipeline has an invalid step\n DagsterGraphQLClientError("InvalidOutputError", body=error_object): some solid has an invalid output within the pipeline.\n The error_object is of type dagster_graphql.InvalidOutputErrorInfo.\n DagsterGraphQLClientError("ConflictingExecutionParamsError", invalid_step_key): a preset and a run_config & mode are present\n that conflict with one another\n DagsterGraphQLClientError("PresetNotFoundError", message): if the provided preset name is not found\n DagsterGraphQLClientError("RunConflict", message): a `DagsterRunConflict` occured during execution.\n This indicates that a conflicting pipeline run already exists in run storage.\n DagsterGraphQLClientError("PipelineConfigurationInvalid", invalid_step_key): the run_config is not in the expected format\n for the pipeline\n DagsterGraphQLClientError("PipelineNotFoundError", message): the requested pipeline does not exist\n DagsterGraphQLClientError("PythonError", message): an internal framework error occurred\n\n Returns:\n str: run id of the submitted pipeline run\n """\n return self._core_submit_execution(\n pipeline_name,\n repository_location_name,\n repository_name,\n run_config,\n mode,\n preset,\n tags,\n solid_selection,\n is_using_job_op_graph_apis=False,\n )
\n\n
[docs] def submit_job_execution(\n self,\n job_name: str,\n repository_location_name: Optional[str] = None,\n repository_name: Optional[str] = None,\n run_config: Optional[Dict[str, Any]] = None,\n tags: Optional[Dict[str, Any]] = None,\n op_selection: Optional[List[str]] = None,\n ) -> str:\n """Submits a job with attached configuration for execution.\n\n Args:\n job_name (str): The job's name\n repository_location_name (Optional[str]): The name of the repository location where\n the job is located. If omitted, the client will try to infer the repository location\n from the available options on the Dagster deployment. Defaults to None.\n repository_name (Optional[str]): The name of the repository where the job is located.\n If omitted, the client will try to infer the repository from the available options\n on the Dagster deployment. Defaults to None.\n run_config (Optional[Dict[str, Any]]): This is the run config to execute the job with.\n Note that runConfigData is any-typed in the GraphQL type system. This type is used when passing in\n an arbitrary object for run config. However, it must conform to the constraints of the config\n schema for this job. If it does not, the client will throw a DagsterGraphQLClientError with a message of\n JobConfigValidationInvalid. Defaults to None.\n tags (Optional[Dict[str, Any]]): A set of tags to add to the job execution.\n\n Raises:\n DagsterGraphQLClientError("InvalidStepError", invalid_step_key): the job has an invalid step\n DagsterGraphQLClientError("InvalidOutputError", body=error_object): some solid has an invalid output within the job.\n The error_object is of type dagster_graphql.InvalidOutputErrorInfo.\n DagsterGraphQLClientError("RunConflict", message): a `DagsterRunConflict` occured during execution.\n This indicates that a conflicting job run already exists in run storage.\n DagsterGraphQLClientError("PipelineConfigurationInvalid", invalid_step_key): the run_config is not in the expected format\n for the job\n DagsterGraphQLClientError("JobNotFoundError", message): the requested job does not exist\n DagsterGraphQLClientError("PythonError", message): an internal framework error occurred\n\n Returns:\n str: run id of the submitted pipeline run\n """\n return self._core_submit_execution(\n pipeline_name=job_name,\n repository_location_name=repository_location_name,\n repository_name=repository_name,\n run_config=run_config,\n mode="default",\n preset=None,\n tags=tags,\n solid_selection=op_selection,\n is_using_job_op_graph_apis=True,\n )
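A sketch of submitting a job through the client, assuming a Dagit GraphQL server is reachable at ``localhost:3000``; the job, repository, and run config values are placeholders.

.. code-block:: python

    from dagster_graphql import DagsterGraphQLClient, DagsterGraphQLClientError

    client = DagsterGraphQLClient("localhost", port_number=3000)

    try:
        run_id = client.submit_job_execution(
            "my_job",
            repository_location_name="my_location",  # optional when the job name is unambiguous
            repository_name="my_repository",
            run_config={"ops": {"my_op": {"config": {"param": "value"}}}},
        )
        print(f"Launched run {run_id}")
    except DagsterGraphQLClientError as exc:
        # exc.args holds the GraphQL error type and message; exc.body carries extra
        # detail for errors such as InvalidOutputError.
        print(f"Submission failed: {exc}")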
\n\n
[docs] def get_run_status(self, run_id: str) -> PipelineRunStatus:\n """Get the status of a given Pipeline Run\n\n Args:\n run_id (str): run id of the requested pipeline run.\n\n Raises:\n DagsterGraphQLClientError("PipelineNotFoundError", message): if the requested run id is not found\n DagsterGraphQLClientError("PythonError", message): on internal framework errors\n\n Returns:\n PipelineRunStatus: returns a status Enum describing the state of the requested pipeline run\n """\n check.str_param(run_id, "run_id")\n\n res_data: Dict[str, Dict[str, Any]] = self._execute(\n GET_PIPELINE_RUN_STATUS_QUERY, {"runId": run_id}\n )\n query_result: Dict[str, Any] = res_data["pipelineRunOrError"]\n query_result_type: str = query_result["__typename"]\n if query_result_type == "PipelineRun" or query_result_type == "Run":\n return PipelineRunStatus(query_result["status"])\n else:\n raise DagsterGraphQLClientError(query_result_type, query_result["message"])
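Building on the submission sketch above, a run can be polled until it reaches a terminal state; the host, port, and run id shown here are placeholders.

.. code-block:: python

    import time

    from dagster.core.storage.pipeline_run import PipelineRunStatus
    from dagster_graphql import DagsterGraphQLClient

    client = DagsterGraphQLClient("localhost", port_number=3000)
    run_id = "replace-with-a-real-run-id"

    TERMINAL_STATES = {
        PipelineRunStatus.SUCCESS,
        PipelineRunStatus.FAILURE,
        PipelineRunStatus.CANCELED,
    }

    status = client.get_run_status(run_id)
    while status not in TERMINAL_STATES:
        time.sleep(5)
        status = client.get_run_status(run_id)

    print(f"Run {run_id} finished with status {status}")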
\n\n
[docs] def reload_repository_location(\n self, repository_location_name: str\n ) -> ReloadRepositoryLocationInfo:\n """Reloads a Dagster Repository Location, which reloads all repositories in that repository location.\n\n This is useful in a variety of contexts, including refreshing Dagit without restarting\n the server.\n\n Args:\n repository_location_name (str): The name of the repository location\n\n Returns:\n ReloadRepositoryLocationInfo: Object with information about the result of the reload request\n """\n check.str_param(repository_location_name, "repository_location_name")\n\n res_data: Dict[str, Dict[str, Any]] = self._execute(\n RELOAD_REPOSITORY_LOCATION_MUTATION,\n {"repositoryLocationName": repository_location_name},\n )\n\n query_result: Dict[str, Any] = res_data["reloadRepositoryLocation"]\n query_result_type: str = query_result["__typename"]\n if query_result_type == "WorkspaceLocationEntry":\n location_or_error_type = query_result["locationOrLoadError"]["__typename"]\n if location_or_error_type == "RepositoryLocation":\n return ReloadRepositoryLocationInfo(status=ReloadRepositoryLocationStatus.SUCCESS)\n else:\n return ReloadRepositoryLocationInfo(\n status=ReloadRepositoryLocationStatus.FAILURE,\n failure_type="PythonError",\n message=query_result["locationOrLoadError"]["message"],\n )\n else:\n # query_result_type is either ReloadNotSupported or RepositoryLocationNotFound\n return ReloadRepositoryLocationInfo(\n status=ReloadRepositoryLocationStatus.FAILURE,\n failure_type=query_result_type,\n message=query_result["message"],\n )
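A sketch of triggering a reload and inspecting the result, with a placeholder repository location name.

.. code-block:: python

    from dagster_graphql import DagsterGraphQLClient, ReloadRepositoryLocationStatus

    client = DagsterGraphQLClient("localhost", port_number=3000)

    info = client.reload_repository_location("my_location")
    if info.status == ReloadRepositoryLocationStatus.SUCCESS:
        print("Repository location reloaded")
    else:
        # failure_type is e.g. ReloadNotSupported, RepositoryLocationNotFound, or PythonError.
        print(f"Reload failed ({info.failure_type}): {info.message}")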
\n\n
[docs] def shutdown_repository_location(\n self, repository_location_name: str\n ) -> ShutdownRepositoryLocationInfo:\n """Shuts down the server that is serving metadata for the provided repository location.\n\n This is primarily useful when you want the server to be restarted by the compute environment\n in which it is running (for example, in Kubernetes, the pod in which the server is running\n will automatically restart when the server is shut down, and the repository metadata will\n be reloaded)\n\n Args:\n repository_location_name (str): The name of the repository location\n\n Returns:\n ShutdownRepositoryLocationInfo: Object with information about the result of the reload request\n """\n check.str_param(repository_location_name, "repository_location_name")\n\n res_data: Dict[str, Dict[str, Any]] = self._execute(\n SHUTDOWN_REPOSITORY_LOCATION_MUTATION,\n {"repositoryLocationName": repository_location_name},\n )\n\n query_result: Dict[str, Any] = res_data["shutdownRepositoryLocation"]\n query_result_type: str = query_result["__typename"]\n if query_result_type == "ShutdownRepositoryLocationSuccess":\n return ShutdownRepositoryLocationInfo(status=ShutdownRepositoryLocationStatus.SUCCESS)\n elif (\n query_result_type == "RepositoryLocationNotFound" or query_result_type == "PythonError"\n ):\n return ShutdownRepositoryLocationInfo(\n status=ShutdownRepositoryLocationStatus.FAILURE,\n message=query_result["message"],\n )\n else:\n raise Exception(f"Unexpected query result type {query_result_type}")
\n\n
[docs]    def terminate_run(self, run_id: str):\n        &quot;&quot;&quot;\n        Terminates a pipeline run. This method is useful when you would like to stop a pipeline run\n        based on an external event.\n\n        Args:\n            run_id (str): The run id of the pipeline run to terminate\n        &quot;&quot;&quot;\n        check.str_param(run_id, &quot;run_id&quot;)\n\n        res_data: Dict[str, Dict[str, Any]] = self._execute(\n            TERMINATE_RUN_JOB_MUTATION, {&quot;runId&quot;: run_id}\n        )\n\n        query_result: Dict[str, Any] = res_data[&quot;terminateRun&quot;]\n        query_result_type: str = query_result[&quot;__typename&quot;]\n        if query_result_type == &quot;TerminateRunSuccess&quot;:\n            return\n\n        elif query_result_type == &quot;RunNotFoundError&quot;:\n            raise DagsterGraphQLClientError(&quot;RunNotFoundError&quot;, f&quot;Run Id {run_id} not found&quot;)\n        else:\n            raise DagsterGraphQLClientError(query_result_type, query_result[&quot;message&quot;])
\n
", "current_page_name": "_modules/dagster_graphql/client/client", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_graphql.client.client"}, "utils": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_graphql.client.utils

\nfrom enum import Enum\nfrom typing import Any, Dict, List, NamedTuple, Optional\n\n\n
[docs]class DagsterGraphQLClientError(Exception):\n def __init__(self, *args, body=None):\n super().__init__(*args)\n self.body = body
\n\n\n
[docs]class ReloadRepositoryLocationStatus(Enum):\n """This enum describes the status of a GraphQL mutation to reload a Dagster repository location\n\n Args:\n Enum (str): can be either `ReloadRepositoryLocationStatus.SUCCESS`\n or `ReloadRepositoryLocationStatus.FAILURE`.\n """\n\n SUCCESS = "SUCCESS"\n FAILURE = "FAILURE"
\n\n\nclass ShutdownRepositoryLocationStatus(Enum):\n SUCCESS = "SUCCESS"\n FAILURE = "FAILURE"\n\n\n
[docs]class ReloadRepositoryLocationInfo(NamedTuple):\n """This class gives information about the result of reloading\n a Dagster repository location with a GraphQL mutation.\n\n Args:\n status (ReloadRepositoryLocationStatus): The status of the reload repository location mutation\n failure_type: (Optional[str], optional): the failure type if `status == ReloadRepositoryLocationStatus.FAILURE`.\n Can be one of `ReloadNotSupported`, `RepositoryLocationNotFound`, or `RepositoryLocationLoadFailure`. Defaults to None.\n message (Optional[str], optional): the failure message/reason if\n `status == ReloadRepositoryLocationStatus.FAILURE`. Defaults to None.\n """\n\n status: ReloadRepositoryLocationStatus\n failure_type: Optional[str] = None\n message: Optional[str] = None
\n\n\nclass ShutdownRepositoryLocationInfo(NamedTuple):\n """This class gives information about the result of shutting down the server for\n a Dagster repository location using a GraphQL mutation.\n\n Args:\n status (ShutdownRepositoryLocationStatus) Whether the shutdown succeeded or failed.\n message (Optional[str], optional): the failure message/reason if\n `status == ShutdownRepositoryLocationStatus.FAILURE`. Defaults to None.\n """\n\n status: ShutdownRepositoryLocationStatus\n message: Optional[str] = None\n\n\nclass PipelineInfo(NamedTuple):\n repository_location_name: str\n repository_name: str\n pipeline_name: str\n\n @staticmethod\n def from_node(node: Dict[str, Any]) -> List["PipelineInfo"]:\n repo_name = node["name"]\n repo_location_name = node["location"]["name"]\n return [\n PipelineInfo(\n repository_location_name=repo_location_name,\n repository_name=repo_name,\n pipeline_name=pipeline["name"],\n )\n for pipeline in node["pipelines"]\n ]\n\n\n
[docs]class InvalidOutputErrorInfo(NamedTuple):\n """This class gives information about an InvalidOutputError from submitting a pipeline for execution\n from GraphQL.\n\n Args:\n step_key (str): key of the step that failed\n invalid_output_name (str): the name of the invalid output from the given step\n """\n\n step_key: str\n invalid_output_name: str
\n
", "current_page_name": "_modules/dagster_graphql/client/utils", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_graphql.client.utils"}}}, "dagster_k8s": {"executor": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_k8s.executor

\nfrom typing import List, Optional, cast\n\nimport kubernetes\nfrom dagster_k8s.launcher import K8sRunLauncher\n\nfrom dagster import Field, IntSource, StringSource\nfrom dagster import _check as check\nfrom dagster import executor\nfrom dagster.core.definitions.executor_definition import multiple_process_executor_requirements\nfrom dagster.core.errors import DagsterUnmetExecutorRequirementsError\nfrom dagster.core.events import DagsterEvent, DagsterEventType, EngineEventData, MetadataEntry\nfrom dagster.core.execution.plan.objects import StepFailureData\nfrom dagster.core.execution.retries import RetryMode, get_retries_config\nfrom dagster.core.executor.base import Executor\nfrom dagster.core.executor.init import InitExecutorContext\nfrom dagster.core.executor.step_delegating import StepDelegatingExecutor\nfrom dagster.core.executor.step_delegating.step_handler import StepHandler\nfrom dagster.core.executor.step_delegating.step_handler.base import StepHandlerContext\nfrom dagster.utils import frozentags, merge_dicts\n\nfrom .container_context import K8sContainerContext\nfrom .job import (\n    DagsterK8sJobConfig,\n    construct_dagster_k8s_job,\n    get_k8s_job_name,\n    get_user_defined_k8s_config,\n)\nfrom .utils import delete_job\n\n\n
[docs]@executor(\n name="k8s",\n config_schema=merge_dicts(\n DagsterK8sJobConfig.config_type_job(),\n {\n "job_namespace": Field(StringSource, is_required=False),\n "retries": get_retries_config(),\n "max_concurrency": Field(\n IntSource,\n is_required=False,\n description="Limit on the number of pods that will run concurrently within the scope "\n "of a Dagster run. Note that this limit is per run, not global.",\n ),\n },\n ),\n requirements=multiple_process_executor_requirements(),\n)\ndef k8s_job_executor(init_context: InitExecutorContext) -> Executor:\n """\n Executor which launches steps as Kubernetes Jobs.\n\n To use the `k8s_job_executor`, set it as the `executor_def` when defining a job:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-k8s/dagster_k8s_tests/unit_tests/test_example_executor_mode_def.py\n :start-after: start_marker\n :end-before: end_marker\n :language: python\n\n Then you can configure the executor with run config as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n job_namespace: 'some-namespace'\n image_pull_policy: ...\n image_pull_secrets: ...\n service_account_name: ...\n env_config_maps: ...\n env_secrets: ...\n env_vars: ...\n job_image: ... # leave out if using userDeployments\n max_concurrent: ...\n\n `max_concurrent` limits the number of pods that will execute concurrently for one run. By default\n there is no limit- it will maximally parallel as allowed by the DAG. Note that this is not a\n global limit.\n\n Configuration set on the Kubernetes Jobs and Pods created by the `K8sRunLauncher` will also be\n set on Kubernetes Jobs and Pods created by the `k8s_job_executor`.\n """\n\n run_launcher = init_context.instance.run_launcher\n if not isinstance(run_launcher, K8sRunLauncher):\n raise DagsterUnmetExecutorRequirementsError(\n "This engine is only compatible with a K8sRunLauncher; configure the "\n "K8sRunLauncher on your instance to use it.",\n )\n\n exc_cfg = init_context.executor_config\n\n k8s_container_context = K8sContainerContext(\n image_pull_policy=exc_cfg.get("image_pull_policy"), # type: ignore\n image_pull_secrets=exc_cfg.get("image_pull_secrets"), # type: ignore\n service_account_name=exc_cfg.get("service_account_name"), # type: ignore\n env_config_maps=exc_cfg.get("env_config_maps"), # type: ignore\n env_secrets=exc_cfg.get("env_secrets"), # type: ignore\n env_vars=exc_cfg.get("env_vars"), # type: ignore\n volume_mounts=exc_cfg.get("volume_mounts"), # type: ignore\n volumes=exc_cfg.get("volumes"), # type: ignore\n labels=exc_cfg.get("labels"), # type: ignore\n namespace=exc_cfg.get("job_namespace"), # type: ignore\n resources=exc_cfg.get("resources"), # type: ignore\n )\n\n return StepDelegatingExecutor(\n K8sStepHandler(\n image=exc_cfg.get("job_image"), # type: ignore\n container_context=k8s_container_context,\n load_incluster_config=run_launcher.load_incluster_config,\n kubeconfig_file=run_launcher.kubeconfig_file,\n ),\n retries=RetryMode.from_config(init_context.executor_config["retries"]), # type: ignore\n max_concurrent=check.opt_int_elem(exc_cfg, "max_concurrent"),\n should_verify_step=True,\n )
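A sketch of attaching the executor to a job; this assumes the Dagster instance is already configured with the ``K8sRunLauncher``, and the op and job names are placeholders. Run config such as ``job_namespace`` can be supplied as in the YAML example above.

.. code-block:: python

    from dagster import job, op
    from dagster_k8s import k8s_job_executor


    @op
    def crunch_numbers():
        return sum(range(1000))


    # Each step of this job runs in its own Kubernetes Job. The instance's run launcher
    # must be the K8sRunLauncher, or initializing the executor raises
    # DagsterUnmetExecutorRequirementsError.
    @job(executor_def=k8s_job_executor)
    def k8s_example_job():
        crunch_numbers()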
\n\n\nclass K8sStepHandler(StepHandler):\n @property\n def name(self):\n return "K8sStepHandler"\n\n def __init__(\n self,\n image: Optional[str],\n container_context: K8sContainerContext,\n load_incluster_config: bool,\n kubeconfig_file: Optional[str],\n k8s_client_batch_api=None,\n ):\n super().__init__()\n\n self._executor_image = check.opt_str_param(image, "image")\n self._executor_container_context = check.inst_param(\n container_context, "container_context", K8sContainerContext\n )\n\n self._fixed_k8s_client_batch_api = k8s_client_batch_api\n\n if load_incluster_config:\n check.invariant(\n kubeconfig_file is None,\n "`kubeconfig_file` is set but `load_incluster_config` is True.",\n )\n kubernetes.config.load_incluster_config()\n else:\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n def _get_container_context(self, step_handler_context: StepHandlerContext):\n run_target = K8sContainerContext.create_for_run(\n step_handler_context.pipeline_run,\n cast(K8sRunLauncher, step_handler_context.instance.run_launcher),\n )\n return run_target.merge(self._executor_container_context)\n\n @property\n def _batch_api(self):\n return self._fixed_k8s_client_batch_api or kubernetes.client.BatchV1Api()\n\n def _get_k8s_step_job_name(self, step_handler_context):\n step_key = step_handler_context.execute_step_args.step_keys_to_execute[0]\n\n name_key = get_k8s_job_name(\n step_handler_context.execute_step_args.pipeline_run_id,\n step_key,\n )\n\n if step_handler_context.execute_step_args.known_state:\n retry_state = step_handler_context.execute_step_args.known_state.get_retry_state()\n if retry_state.get_attempt_count(step_key):\n return "dagster-step-%s-%d" % (name_key, retry_state.get_attempt_count(step_key))\n\n return "dagster-step-%s" % (name_key)\n\n def launch_step(self, step_handler_context: StepHandlerContext):\n events = []\n\n step_keys_to_execute = cast(\n List[str], step_handler_context.execute_step_args.step_keys_to_execute\n )\n assert len(step_keys_to_execute) == 1, "Launching multiple steps is not currently supported"\n step_key = step_keys_to_execute[0]\n\n job_name = self._get_k8s_step_job_name(step_handler_context)\n pod_name = job_name\n\n args = step_handler_context.execute_step_args.get_command_args()\n\n container_context = self._get_container_context(step_handler_context)\n\n job_config = container_context.get_k8s_job_config(\n self._executor_image, step_handler_context.instance.run_launcher\n )\n\n if not job_config.job_image:\n job_config = job_config.with_image(\n step_handler_context.execute_step_args.pipeline_origin.repository_origin.container_image\n )\n\n if not job_config.job_image:\n raise Exception("No image included in either executor config or the job")\n\n user_defined_k8s_config = get_user_defined_k8s_config(\n frozentags(step_handler_context.step_tags[step_key])\n )\n\n job = construct_dagster_k8s_job(\n job_config=job_config,\n args=args,\n job_name=job_name,\n pod_name=pod_name,\n component="step_worker",\n user_defined_k8s_config=user_defined_k8s_config,\n labels={\n "dagster/job": step_handler_context.execute_step_args.pipeline_origin.pipeline_name,\n "dagster/op": step_key,\n "dagster/run-id": step_handler_context.execute_step_args.pipeline_run_id,\n },\n )\n\n events.append(\n DagsterEvent(\n event_type_value=DagsterEventType.ENGINE_EVENT.value,\n pipeline_name=step_handler_context.execute_step_args.pipeline_origin.pipeline_name,\n step_key=step_key,\n message=f"Executing step {step_key} in 
Kubernetes job {job_name}",\n event_specific_data=EngineEventData(\n [\n MetadataEntry("Step key", value=step_key),\n MetadataEntry("Kubernetes Job name", value=job_name),\n ],\n ),\n )\n )\n\n self._batch_api.create_namespaced_job(body=job, namespace=container_context.namespace)\n\n return events\n\n def check_step_health(self, step_handler_context: StepHandlerContext):\n step_keys_to_execute = cast(\n List[str], step_handler_context.execute_step_args.step_keys_to_execute\n )\n assert len(step_keys_to_execute) == 1, "Launching multiple steps is not currently supported"\n step_key = step_keys_to_execute[0]\n\n job_name = self._get_k8s_step_job_name(step_handler_context)\n\n container_context = self._get_container_context(step_handler_context)\n\n job = self._batch_api.read_namespaced_job(\n namespace=container_context.namespace, name=job_name\n )\n if job.status.failed:\n return [\n DagsterEvent(\n event_type_value=DagsterEventType.STEP_FAILURE.value,\n pipeline_name=step_handler_context.execute_step_args.pipeline_origin.pipeline_name,\n step_key=step_key,\n message=f"Discovered failed Kubernetes job {job_name} for step {step_key}",\n event_specific_data=StepFailureData(\n error=None,\n user_failure_data=None,\n ),\n )\n ]\n return []\n\n def terminate_step(self, step_handler_context: StepHandlerContext):\n step_keys_to_execute = cast(\n List[str], step_handler_context.execute_step_args.step_keys_to_execute\n )\n assert len(step_keys_to_execute) == 1, "Launching multiple steps is not currently supported"\n\n job_name = self._get_k8s_step_job_name(step_handler_context)\n container_context = self._get_container_context(step_handler_context)\n\n delete_job(job_name=job_name, namespace=container_context.namespace)\n return []\n
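The step handler above is what the ``k8s_job_executor`` drives; a minimal sketch of wiring it up follows, assuming ``k8s_job_executor`` is importable from ``dagster_k8s`` and that the ``dagster-k8s/config`` op tag (read via ``get_user_defined_k8s_config`` in ``launch_step``) follows the usual ``container_config`` layout.

.. code-block:: python

    # Hedged sketch, not the canonical reference: run each step of a job as its
    # own Kubernetes Job via the executor backed by K8sStepHandler. The tag key
    # and its nesting are assumptions based on the conventions referenced above.
    from dagster import job, op
    from dagster_k8s import k8s_job_executor

    @op(
        tags={
            "dagster-k8s/config": {
                "container_config": {
                    "resources": {"limits": {"cpu": "500m", "memory": "1Gi"}}
                }
            }
        }
    )
    def transform():
        ...

    @job(executor_def=k8s_job_executor)
    def k8s_step_job():
        transform()

Each invocation of ``transform`` would then run in its own ``dagster-step-*`` Job, named by ``_get_k8s_step_job_name`` above.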
", "current_page_name": "_modules/dagster_k8s/executor", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_k8s.executor"}, "launcher": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_k8s.launcher

\nimport sys\nfrom typing import Dict, List, Optional\n\nimport kubernetes\n\nfrom dagster import Field, MetadataEntry, StringSource\nfrom dagster import _check as check\nfrom dagster.cli.api import ExecuteRunArgs\nfrom dagster.core.events import EngineEventData\nfrom dagster.core.launcher import LaunchRunContext, ResumeRunContext, RunLauncher\nfrom dagster.core.launcher.base import CheckRunHealthResult, WorkerStatus\nfrom dagster.core.storage.pipeline_run import PipelineRun, PipelineRunStatus\nfrom dagster.core.storage.tags import DOCKER_IMAGE_TAG\nfrom dagster.grpc.types import ResumeRunArgs\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster.utils import frozentags, merge_dicts\nfrom dagster.utils.error import serializable_error_info_from_exc_info\n\nfrom .container_context import K8sContainerContext\nfrom .job import (\n    DagsterK8sJobConfig,\n    construct_dagster_k8s_job,\n    get_job_name_from_run_id,\n    get_user_defined_k8s_config,\n)\nfrom .utils import delete_job\n\n\n
[docs]class K8sRunLauncher(RunLauncher, ConfigurableClass):\n """RunLauncher that starts a Kubernetes Job for each Dagster job run.\n\n Encapsulates each run in a separate, isolated invocation of ``dagster-graphql``.\n\n You can configure a Dagster instance to use this RunLauncher by adding a section to your\n ``dagster.yaml`` like the following:\n\n .. code-block:: yaml\n\n run_launcher:\n module: dagster_k8s.launcher\n class: K8sRunLauncher\n config:\n service_account_name: your_service_account\n job_image: my_project/dagster_image:latest\n instance_config_map: dagster-instance\n postgres_password_secret: dagster-postgresql-secret\n\n """\n\n def __init__(\n self,\n service_account_name,\n instance_config_map,\n postgres_password_secret=None,\n dagster_home=None,\n job_image=None,\n image_pull_policy=None,\n image_pull_secrets=None,\n load_incluster_config=True,\n kubeconfig_file=None,\n inst_data=None,\n job_namespace="default",\n env_config_maps=None,\n env_secrets=None,\n env_vars=None,\n k8s_client_batch_api=None,\n volume_mounts=None,\n volumes=None,\n labels=None,\n fail_pod_on_run_failure=None,\n resources=None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.job_namespace = check.str_param(job_namespace, "job_namespace")\n\n self.load_incluster_config = load_incluster_config\n self.kubeconfig_file = kubeconfig_file\n if load_incluster_config:\n check.invariant(\n kubeconfig_file is None,\n "`kubeconfig_file` is set but `load_incluster_config` is True.",\n )\n kubernetes.config.load_incluster_config()\n else:\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n self._fixed_batch_api = k8s_client_batch_api\n\n self._job_config = None\n self._job_image = check.opt_str_param(job_image, "job_image")\n self.dagster_home = check.str_param(dagster_home, "dagster_home")\n self._image_pull_policy = check.opt_str_param(\n image_pull_policy, "image_pull_policy", "IfNotPresent"\n )\n self._image_pull_secrets = check.opt_list_param(\n image_pull_secrets, "image_pull_secrets", of_type=dict\n )\n self._service_account_name = check.str_param(service_account_name, "service_account_name")\n self.instance_config_map = check.str_param(instance_config_map, "instance_config_map")\n self.postgres_password_secret = check.opt_str_param(\n postgres_password_secret, "postgres_password_secret"\n )\n self._env_config_maps = check.opt_list_param(\n env_config_maps, "env_config_maps", of_type=str\n )\n self._env_secrets = check.opt_list_param(env_secrets, "env_secrets", of_type=str)\n self._env_vars = check.opt_list_param(env_vars, "env_vars", of_type=str)\n self._volume_mounts = check.opt_list_param(volume_mounts, "volume_mounts")\n self._volumes = check.opt_list_param(volumes, "volumes")\n self._labels = check.opt_dict_param(labels, "labels", key_type=str, value_type=str)\n self._fail_pod_on_run_failure = check.opt_bool_param(\n fail_pod_on_run_failure, "fail_pod_on_run_failure"\n )\n self._resources = check.opt_dict_param(resources, "resources")\n\n super().__init__()\n\n @property\n def job_image(self):\n return self._job_image\n\n @property\n def image_pull_policy(self) -> str:\n return self._image_pull_policy\n\n @property\n def image_pull_secrets(self) -> List[Dict]:\n return self._image_pull_secrets\n\n @property\n def service_account_name(self) -> str:\n return self._service_account_name\n\n @property\n def env_config_maps(self) -> List[str]:\n return self._env_config_maps\n\n @property\n 
def env_secrets(self) -> List[str]:\n return self._env_secrets\n\n @property\n def volume_mounts(self) -> List:\n return self._volume_mounts\n\n @property\n def volumes(self) -> List:\n return self._volumes\n\n @property\n def resources(self) -> Dict:\n return self._resources\n\n @property\n def env_vars(self) -> List[str]:\n return self._env_vars\n\n @property\n def labels(self) -> Dict[str, str]:\n return self._labels\n\n @property\n def fail_pod_on_run_failure(self) -> Optional[bool]:\n return self._fail_pod_on_run_failure\n\n @property\n def _batch_api(self):\n return self._fixed_batch_api if self._fixed_batch_api else kubernetes.client.BatchV1Api()\n\n @classmethod\n def config_type(cls):\n """Include all arguments required for DagsterK8sJobConfig along with additional arguments\n needed for the RunLauncher itself.\n """\n job_cfg = DagsterK8sJobConfig.config_type_run_launcher()\n\n run_launcher_extra_cfg = {\n "job_namespace": Field(StringSource, is_required=False, default_value="default"),\n }\n return merge_dicts(job_cfg, run_launcher_extra_cfg)\n\n @classmethod\n def from_config_value(cls, inst_data, config_value):\n return cls(inst_data=inst_data, **config_value)\n\n @property\n def inst_data(self):\n return self._inst_data\n\n def get_container_context_for_run(self, pipeline_run: PipelineRun) -> K8sContainerContext:\n return K8sContainerContext.create_for_run(pipeline_run, self)\n\n def _launch_k8s_job_with_args(self, job_name, args, run):\n container_context = self.get_container_context_for_run(run)\n\n pod_name = job_name\n\n pipeline_origin = run.pipeline_code_origin\n user_defined_k8s_config = get_user_defined_k8s_config(frozentags(run.tags))\n repository_origin = pipeline_origin.repository_origin\n\n job_config = container_context.get_k8s_job_config(\n job_image=repository_origin.container_image, run_launcher=self\n )\n\n self._instance.add_run_tags(\n run.run_id,\n {DOCKER_IMAGE_TAG: job_config.job_image},\n )\n\n job = construct_dagster_k8s_job(\n job_config=job_config,\n args=args,\n job_name=job_name,\n pod_name=pod_name,\n component="run_worker",\n user_defined_k8s_config=user_defined_k8s_config,\n labels={\n "dagster/job": pipeline_origin.pipeline_name,\n "dagster/run-id": run.run_id,\n },\n )\n\n self._instance.report_engine_event(\n "Creating Kubernetes run worker job",\n run,\n EngineEventData(\n [\n MetadataEntry("Kubernetes Job name", value=job_name),\n MetadataEntry("Kubernetes Namespace", value=container_context.namespace),\n MetadataEntry("Run ID", value=run.run_id),\n ]\n ),\n cls=self.__class__,\n )\n\n self._batch_api.create_namespaced_job(body=job, namespace=container_context.namespace)\n self._instance.report_engine_event(\n "Kubernetes run worker job created",\n run,\n cls=self.__class__,\n )\n\n def launch_run(self, context: LaunchRunContext) -> None:\n run = context.pipeline_run\n job_name = get_job_name_from_run_id(run.run_id)\n pipeline_origin = check.not_none(run.pipeline_code_origin)\n\n args = ExecuteRunArgs(\n pipeline_origin=pipeline_origin,\n pipeline_run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n set_exit_code_on_failure=self._fail_pod_on_run_failure,\n ).get_command_args()\n\n self._launch_k8s_job_with_args(job_name, args, run)\n\n @property\n def supports_resume_run(self):\n return True\n\n def resume_run(self, context: ResumeRunContext) -> None:\n run = context.pipeline_run\n job_name = get_job_name_from_run_id(\n run.run_id, resume_attempt_number=context.resume_attempt_number\n )\n pipeline_origin = 
check.not_none(run.pipeline_code_origin)\n\n args = ResumeRunArgs(\n pipeline_origin=pipeline_origin,\n pipeline_run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n set_exit_code_on_failure=self._fail_pod_on_run_failure,\n ).get_command_args()\n\n self._launch_k8s_job_with_args(job_name, args, run)\n\n # https://github.com/dagster-io/dagster/issues/2741\n def can_terminate(self, run_id):\n check.str_param(run_id, "run_id")\n\n pipeline_run = self._instance.get_run_by_id(run_id)\n if not pipeline_run:\n return False\n if pipeline_run.status != PipelineRunStatus.STARTED:\n return False\n return True\n\n def terminate(self, run_id):\n check.str_param(run_id, "run_id")\n run = self._instance.get_run_by_id(run_id)\n\n if not run:\n return False\n\n container_context = self.get_container_context_for_run(run)\n\n can_terminate = self.can_terminate(run_id)\n if not can_terminate:\n self._instance.report_engine_event(\n message="Unable to terminate run; can_terminate returned {}".format(can_terminate),\n pipeline_run=run,\n cls=self.__class__,\n )\n return False\n\n self._instance.report_run_canceling(run)\n\n job_name = get_job_name_from_run_id(\n run_id, resume_attempt_number=self._instance.count_resume_run_attempts(run.run_id)\n )\n\n try:\n termination_result = delete_job(\n job_name=job_name, namespace=container_context.namespace\n )\n if termination_result:\n self._instance.report_engine_event(\n message="Run was terminated successfully.",\n pipeline_run=run,\n cls=self.__class__,\n )\n else:\n self._instance.report_engine_event(\n message="Run was not terminated successfully; delete_job returned {}".format(\n termination_result\n ),\n pipeline_run=run,\n cls=self.__class__,\n )\n return termination_result\n except Exception:\n self._instance.report_engine_event(\n message="Run was not terminated successfully; encountered error in delete_job",\n pipeline_run=run,\n engine_event_data=EngineEventData.engine_error(\n serializable_error_info_from_exc_info(sys.exc_info())\n ),\n cls=self.__class__,\n )\n\n @property\n def supports_check_run_worker_health(self):\n return True\n\n def check_run_worker_health(self, run: PipelineRun):\n container_context = self.get_container_context_for_run(run)\n\n job_name = get_job_name_from_run_id(\n run.run_id, resume_attempt_number=self._instance.count_resume_run_attempts(run.run_id)\n )\n try:\n job = self._batch_api.read_namespaced_job(\n namespace=container_context.namespace, name=job_name\n )\n except Exception:\n return CheckRunHealthResult(\n WorkerStatus.UNKNOWN, str(serializable_error_info_from_exc_info(sys.exc_info()))\n )\n if job.status.failed:\n return CheckRunHealthResult(WorkerStatus.FAILED, "K8s job failed")\n if job.status.succeeded:\n return CheckRunHealthResult(WorkerStatus.SUCCESS)\n return CheckRunHealthResult(WorkerStatus.RUNNING)
\n
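Beyond the ``dagster.yaml`` snippet in the docstring, the launcher's termination path can be exercised from Python; a brief sketch, with a placeholder run id:

.. code-block:: python

    # Hedged sketch: terminate a Kubernetes-launched run through the instance's
    # configured run launcher. Per can_terminate above, only runs in the
    # STARTED state can be terminated.
    from dagster import DagsterInstance

    instance = DagsterInstance.get()  # loads the K8sRunLauncher configured in dagster.yaml
    run_id = "11111111-2222-3333-4444-555555555555"  # placeholder

    launcher = instance.run_launcher
    if launcher.can_terminate(run_id):
        launcher.terminate(run_id)  # deletes the run worker Job and reports engine events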
", "current_page_name": "_modules/dagster_k8s/launcher", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_k8s.launcher"}}, "dagster_mlflow": {"resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_mlflow.resources

\n"""\nThis module contains the mlflow resource provided by the MlFlow\nclass. This resource provides an easy way to configure mlflow for logging various\nthings from dagster runs.\n"""\nimport atexit\nimport sys\nfrom itertools import islice\nfrom os import environ\nfrom typing import Any, Optional\n\nimport mlflow\nfrom mlflow.entities.run_status import RunStatus\n\nfrom dagster import Field, Noneable, Permissive, resource\n\nCONFIG_SCHEMA = {\n    "experiment_name": Field(str, is_required=True, description="MlFlow experiment name."),\n    "mlflow_tracking_uri": Field(\n        Noneable(str),\n        default_value=None,\n        is_required=False,\n        description="MlFlow tracking server uri.",\n    ),\n    "parent_run_id": Field(\n        Noneable(str),\n        default_value=None,\n        is_required=False,\n        description="Mlflow run ID of parent run if this is a nested run.",\n    ),\n    "env": Field(Permissive(), description="Environment variables for mlflow setup."),\n    "env_to_tag": Field(\n        Noneable(list),\n        default_value=None,\n        is_required=False,\n        description="List of environment variables to log as tags in mlflow.",\n    ),\n    "extra_tags": Field(Permissive(), description="Any extra key-value tags to log to mlflow."),\n}\n\n\nclass MlflowMeta(type):\n    """Mlflow Metaclass to create methods that "inherit" all of Mlflow's\n    methods. If the class has a method defined it is excluded from the\n    attribute setting from mlflow.\n    """\n\n    def __new__(cls, name, bases, attrs):\n        class_cls = super(MlflowMeta, cls).__new__(cls, name, bases, attrs)\n        for attr in (attr for attr in dir(mlflow) if attr not in dir(class_cls)):\n            mlflow_attribute = getattr(mlflow, attr)\n            if callable(mlflow_attribute):\n                setattr(class_cls, attr, staticmethod(mlflow_attribute))\n            else:\n                setattr(class_cls, attr, mlflow_attribute)\n        return class_cls\n\n\nclass MlFlow(metaclass=MlflowMeta):\n    """Class for setting up an mlflow resource for dagster runs.\n    This takes care of all the configuration required to use mlflow tracking and the complexities of\n    mlflow tracking dagster parallel runs.\n    """\n\n    def __init__(self, context):\n\n        # Context associated attributes\n        self.log = context.log\n        self.run_name = context.pipeline_run.pipeline_name\n        self.dagster_run_id = context.run_id\n\n        # resource config attributes\n        resource_config = context.resource_config\n        self.tracking_uri = resource_config.get("mlflow_tracking_uri")\n        if self.tracking_uri:\n            mlflow.set_tracking_uri(self.tracking_uri)\n        self.parent_run_id = resource_config.get("parent_run_id")\n        self.experiment_name = resource_config["experiment_name"]\n        self.env_tags_to_log = resource_config.get("env_to_tag") or []\n        self.extra_tags = resource_config.get("extra_tags")\n\n        # Update env variables if any are given\n        self.env_vars = resource_config.get("env", {})\n        if self.env_vars:\n            environ.update(self.env_vars)\n\n        # If the experiment exists then the set won't do anything\n        mlflow.set_experiment(self.experiment_name)\n        self.experiment = mlflow.get_experiment_by_name(self.experiment_name)\n\n        # Get the client object\n        self.tracking_client = mlflow.tracking.MlflowClient()\n\n        # Set up the active run and tags\n        self._setup()\n\n    def 
_setup(self):\n        """\n        Sets the active run and tags. If an Mlflow run_id exists then the\n        active run is set to it. This way a single Dagster run outputs data\n        to the same Mlflow run, even when multiprocess executors are used.\n        """\n        # Get the run id\n        run_id = self._get_current_run_id()  # pylint: disable=no-member\n        self._set_active_run(run_id=run_id)\n        self._set_all_tags()\n\n        # hack needed to stop mlflow from marking run as finished when\n        # a process exits in parallel runs\n        atexit.unregister(mlflow.end_run)\n\n    def _get_current_run_id(\n        self, experiment: Optional[Any] = None, dagster_run_id: Optional[str] = None\n    ):\n        """Gets the run id of a specific dagster run and experiment id.\n        If it doesn't exist then it returns a None.\n\n        Args:\n            experiment (optional): Mlflow experiment.\n            When none is passed it fetches the experiment object set in\n            the constructor.  Defaults to None.\n            dagster_run_id (optional): The Dagster run id.\n            When none is passed it fetches the dagster_run_id object set in\n            the constructor.  Defaults to None.\n        Returns:\n            run_id (str or None): run_id if it is found else None\n        """\n        experiment = experiment or self.experiment\n        dagster_run_id = dagster_run_id or self.dagster_run_id\n        if experiment:\n            # Check if a run with this dagster run id has already been started\n            # in mlflow, will get an empty dataframe if not\n            current_run_df = mlflow.search_runs(\n                experiment_ids=[experiment.experiment_id],\n                filter_string=f"tags.dagster_run_id='{dagster_run_id}'",\n            )\n            if not current_run_df.empty:\n                return current_run_df.run_id.values[0]  # pylint: disable=no-member\n\n    def _set_active_run(self, run_id=None):\n        """\n        This method sets the active run to be that of the specified\n        run_id. If None is passed then a new run is started. The new run also\n        takes care of nested runs.\n\n        Args:\n            run_id (str, optional): Mlflow run_id. 
Defaults to None.\n        """\n        nested_run = False\n        if self.parent_run_id is not None:\n            self._start_run(run_id=self.parent_run_id, run_name=self.run_name)\n            nested_run = True\n        self._start_run(run_id=run_id, run_name=self.run_name, nested=nested_run)\n\n    def _start_run(self, **kwargs):\n        """\n        Catches the Mlflow exception if a run is already active.\n        """\n\n        try:\n            run = mlflow.start_run(**kwargs)\n            self.log.info(\n                f"Starting a new mlflow run with id {run.info.run_id} "\n                f"in experiment {self.experiment_name}"\n            )\n        except Exception as ex:\n            run = mlflow.active_run()\n            if "is already active" not in str(ex):\n                raise (ex)\n            self.log.info(f"Run with id {run.info.run_id} is already active.")\n\n    def _set_all_tags(self):\n        """Method collects dagster_run_id plus all env variables/tags that have been\n            specified by the user in the config_schema and logs them as tags in mlflow.\n\n        Returns:\n            tags [dict]: Dictionary of all the tags\n        """\n        tags = {tag: environ.get(tag) for tag in self.env_tags_to_log}\n        tags["dagster_run_id"] = self.dagster_run_id\n        if self.extra_tags:\n            tags.update(self.extra_tags)\n\n        mlflow.set_tags(tags)\n\n    def cleanup_on_error(self):\n        """Method ends mlflow run with correct exit status for failed runs. Note that\n        this method does not work when a job running in dagit fails, it seems\n        that in this case a different process runs the job and when it fails\n        the stack trace is therefore not available. For this case we can use the\n        cleanup_on_failure hook defined below.\n        """\n        any_error = sys.exc_info()\n\n        if any_error[1]:\n            if isinstance(any_error[1], KeyboardInterrupt):\n                mlflow.end_run(status=RunStatus.to_string(RunStatus.KILLED))\n            else:\n                mlflow.end_run(status=RunStatus.to_string(RunStatus.FAILED))\n\n    @staticmethod\n    def log_params(params: dict):\n        """Overload of the mlflow.log_params. If len(params) >100 then\n        params is sent to mlflow in chunks.\n\n        Args:\n            params (dict): Parameters to be logged\n        """\n        for param_chunk in MlFlow.chunks(params, 100):\n            mlflow.log_params(param_chunk)\n\n    @staticmethod\n    def chunks(params: dict, size: int = 100):\n        """Method that chunks a dictionary into batches of size.\n\n        Args:\n            params (dict): Dictionary set to be batched\n            size (int, optional): Number of batches. Defaults to 100.\n\n        Yields:\n            (dict): Batch of dictionary\n        """\n        it = iter(params)\n        for _ in range(0, len(params), size):\n            yield {k: params[k] for k in islice(it, size)}\n\n\n
[docs]@resource(config_schema=CONFIG_SCHEMA)\ndef mlflow_tracking(context):\n """\n This resource initializes an MLflow run that's used for all steps within a Dagster run.\n\n This resource provides access to all of mlflow's methods as well as the mlflow tracking client's\n methods.\n\n Usage:\n\n 1. Add the mlflow resource to any solids in which you want to invoke mlflow tracking APIs.\n 2. Add the `end_mlflow_on_run_finished` hook to your pipeline to end the MLflow run\n when the Dagster run is finished.\n\n Examples:\n\n .. code-block:: python\n\n from dagster_mlflow import end_mlflow_on_run_finished, mlflow_tracking\n\n @op(required_resource_keys={"mlflow"})\n def mlflow_solid(context):\n mlflow.log_params(some_params)\n mlflow.tracking.MlflowClient().create_registered_model(some_model_name)\n\n @end_mlflow_on_run_finished\n @job(resource_defs={"mlflow": mlflow_tracking})\n def mlf_example():\n mlflow_op()\n\n # example using an mlflow instance with s3 storage\n mlf_example.execute_in_process(run_config={\n "resources": {\n "mlflow": {\n "config": {\n "experiment_name": my_experiment,\n "mlflow_tracking_uri": "http://localhost:5000",\n\n # if want to run a nested run, provide parent_run_id\n "parent_run_id": an_existing_mlflow_run_id,\n\n # env variables to pass to mlflow\n "env": {\n "MLFLOW_S3_ENDPOINT_URL": my_s3_endpoint,\n "AWS_ACCESS_KEY_ID": my_aws_key_id,\n "AWS_SECRET_ACCESS_KEY": my_secret,\n },\n\n # env variables you want to log as mlflow tags\n "env_to_tag": ["DOCKER_IMAGE_TAG"],\n\n # key-value tags to add to your experiment\n "extra_tags": {"super": "experiment"},\n }\n }\n }\n })\n """\n mlf = MlFlow(context)\n yield mlf\n mlf.cleanup_on_error()
\n
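One behaviour of the resource worth noting is the parameter chunking in ``MlFlow.log_params``; the sketch below exercises only the pure ``chunks`` helper shown above, so it needs no tracking server.

.. code-block:: python

    # Hedged sketch of the batching behaviour: dicts with more than 100 entries
    # are yielded in chunks of at most 100, which log_params then forwards to
    # mlflow.log_params one batch at a time.
    from dagster_mlflow.resources import MlFlow

    params = {f"param_{i}": i for i in range(250)}
    batches = list(MlFlow.chunks(params, 100))
    assert [len(batch) for batch in batches] == [100, 100, 50]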
", "current_page_name": "_modules/dagster_mlflow/resources", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_mlflow.resources"}}, "dagster_msteams": {"hooks": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_msteams.hooks

\nfrom typing import Callable, Optional\n\nfrom dagster_msteams.card import Card\n\nfrom dagster.core.definitions import failure_hook, success_hook\nfrom dagster.core.execution.context.hook import HookContext\n\n\ndef _default_status_message(context: HookContext, status: str) -> str:\n    return "Solid {solid_name} on pipeline {pipeline_name} {status}!\\nRun ID: {run_id}".format(\n        solid_name=context.solid.name,\n        pipeline_name=context.pipeline_name,\n        run_id=context.run_id,\n        status=status,\n    )\n\n\ndef _default_failure_message(context: HookContext) -> str:\n    return _default_status_message(context, status="failed")\n\n\ndef _default_success_message(context: HookContext) -> str:\n    return _default_status_message(context, status="succeeded")\n\n\n
[docs]def teams_on_failure(\n message_fn: Callable[[HookContext], str] = _default_failure_message,\n dagit_base_url: Optional[str] = None,\n):\n """Create a hook on step failure events that will message the given MS Teams webhook URL.\n\n Args:\n message_fn (Optional(Callable[[HookContext], str])): Function which takes in the\n HookContext outputs the message you want to send.\n dagit_base_url: (Optional[str]): The base url of your Dagit instance. Specify this\n to allow messages to include deeplinks to the specific pipeline run that triggered\n the hook.\n\n Examples:\n .. code-block:: python\n\n @teams_on_failure(dagit_base_url="http://localhost:3000")\n @pipeline(...)\n def my_pipeline():\n pass\n\n .. code-block:: python\n\n def my_message_fn(context: HookContext) -> str:\n return "Solid {solid_name} failed!".format(\n solid_name=context.solid\n )\n\n @solid\n def a_solid(context):\n pass\n\n @pipeline(...)\n def my_pipeline():\n a_solid.with_hooks(hook_defs={teams_on_failure("#foo", my_message_fn)})\n\n """\n\n @failure_hook(required_resource_keys={"msteams"})\n def _hook(context: HookContext):\n text = message_fn(context)\n if dagit_base_url:\n text += "<a href='{base_url}/instance/runs/{run_id}'>View in Dagit</a>".format(\n base_url=dagit_base_url,\n run_id=context.run_id,\n )\n card = Card()\n card.add_attachment(text_message=text)\n context.resources.msteams.post_message(payload=card.payload) # type: ignore\n\n return _hook
\n\n\n
[docs]def teams_on_success(\n message_fn: Callable[[HookContext], str] = _default_success_message,\n dagit_base_url: Optional[str] = None,\n):\n """Create a hook on step success events that will message the given MS Teams webhook URL.\n\n Args:\n message_fn (Optional(Callable[[HookContext], str])): Function which takes in the\n HookContext outputs the message you want to send.\n dagit_base_url: (Optional[str]): The base url of your Dagit instance. Specify this\n to allow messages to include deeplinks to the specific pipeline run that triggered\n the hook.\n\n Examples:\n .. code-block:: python\n\n @teams_on_success(dagit_base_url="http://localhost:3000")\n @pipeline(...)\n def my_pipeline():\n pass\n\n .. code-block:: python\n\n def my_message_fn(context: HookContext) -> str:\n return "Solid {solid_name} failed!".format(\n solid_name=context.solid\n )\n\n @solid\n def a_solid(context):\n pass\n\n @pipeline(...)\n def my_pipeline():\n a_solid.with_hooks(hook_defs={teams_on_success("#foo", my_message_fn)})\n\n """\n\n @success_hook(required_resource_keys={"msteams"})\n def _hook(context: HookContext):\n text = message_fn(context)\n if dagit_base_url:\n text += "<a href='{base_url}/instance/runs/{run_id}'>View in Dagit</a>".format(\n base_url=dagit_base_url,\n run_id=context.run_id,\n )\n card = Card()\n card.add_attachment(text_message=text)\n context.resources.msteams.post_message(payload=card.payload) # type: ignore\n\n return _hook
\n
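The docstring examples above use the pipeline/solid APIs; a hypothetical op/job-flavoured equivalent is sketched below, assuming ``teams_on_failure`` and ``msteams_resource`` are exported from ``dagster_msteams`` and that ``TEAMS_WEBHOOK_URL`` is set in the environment.

.. code-block:: python

    # Hedged sketch: attach teams_on_failure to a job built from ops.
    import os

    from dagster import job, op
    from dagster_msteams import msteams_resource, teams_on_failure

    @op
    def may_fail():
        raise Exception("boom")

    @teams_on_failure(dagit_base_url="http://localhost:3000")
    @job(
        resource_defs={
            "msteams": msteams_resource.configured(
                {"hook_url": os.getenv("TEAMS_WEBHOOK_URL")}
            )
        }
    )
    def my_job():
        may_fail()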
", "current_page_name": "_modules/dagster_msteams/hooks", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_msteams.hooks"}, "resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_msteams.resources

\nfrom dagster_msteams.client import TeamsClient\n\nfrom dagster import Bool, Field, Float, StringSource, resource\n\n\n
[docs]@resource(\n {\n "hook_url": Field(\n StringSource,\n description="""To send messages to MS Teams channel, an incoming webhook has to\n be created. The incoming webhook url must be given as a part of the\n resource config to the msteams_resource in dagster.\n """,\n ),\n "http_proxy": Field(StringSource, is_required=False),\n "https_proxy": Field(StringSource, is_required=False),\n "timeout": Field(Float, default_value=60, is_required=False),\n "Verify": Field(Bool, is_required=False),\n },\n description="This resource is for connecting to MS Teams",\n)\ndef msteams_resource(context):\n """This resource is for connecting to Microsoft Teams.\n\n The resource object is a `dagster_msteams.TeamsClient`.\n\n By configuring this resource, you can post messages to MS Teams from any Dagster solid:\n\n Examples:\n\n .. code-block:: python\n\n import os\n\n from dagster import ModeDefinition, execute_pipeline, pipeline, solid\n from dagster_msteams import Card, msteams_resource\n\n\n @solid(required_resource_keys={"msteams"})\n def teams_solid(context):\n card = Card()\n card.add_attachment(text_message="Hello There !!")\n context.resources.msteams.post_message(payload=card.payload)\n\n\n @pipeline(\n mode_defs=[ModeDefinition(resource_defs={"msteams": msteams_resource})],\n )\n def teams_pipeline():\n teams_solid()\n\n\n execute_pipeline(\n teams_pipeline,\n {"resources": {"msteams": {"config": {"hook_url": os.getenv("TEAMS_WEBHOOK_URL")}}}},\n )\n\n """\n return TeamsClient(\n hook_url=context.resource_config.get("hook_url"),\n http_proxy=context.resource_config.get("http_proxy"),\n https_proxy=context.resource_config.get("https_proxy"),\n timeout=context.resource_config.get("timeout"),\n verify=context.resource_config.get("verify"),\n )
\n
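Because ``hook_url`` is declared as a ``StringSource`` field, the webhook URL can also be pulled from an environment variable in run config rather than hard-coded; a minimal sketch:

.. code-block:: python

    # Hedged sketch: StringSource fields accept the {"env": ...} form, so the
    # webhook URL never has to appear in code or in checked-in run config.
    run_config = {
        "resources": {
            "msteams": {"config": {"hook_url": {"env": "TEAMS_WEBHOOK_URL"}}}
        }
    }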
", "current_page_name": "_modules/dagster_msteams/resources", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_msteams.resources"}, "sensors": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_msteams.sensors

\nfrom typing import Callable, Optional\n\nfrom dagster_msteams.card import Card\nfrom dagster_msteams.client import TeamsClient\n\nfrom dagster import DefaultSensorStatus\nfrom dagster.core.definitions.run_status_sensor_definition import (\n    PipelineFailureSensorContext,\n    pipeline_failure_sensor,\n)\n\n\ndef _default_failure_message(context: PipelineFailureSensorContext) -> str:\n    return "\\n".join(\n        [\n            f"Pipeline {context.pipeline_run.pipeline_name} failed!",\n            f"Run ID: {context.pipeline_run.run_id}",\n            f"Mode: {context.pipeline_run.mode}",\n            f"Error: {context.failure_event.message}",\n        ]\n    )\n\n\n
[docs]def make_teams_on_pipeline_failure_sensor(\n hook_url: str,\n message_fn: Callable[[PipelineFailureSensorContext], str] = _default_failure_message,\n http_proxy: Optional[str] = None,\n https_proxy: Optional[str] = None,\n timeout: Optional[float] = 60,\n verify: Optional[bool] = None,\n name: Optional[str] = None,\n dagit_base_url: Optional[str] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n):\n """Create a sensor on pipeline failures that will message the given MS Teams webhook URL.\n\n Args:\n hook_url (str): MS Teams incoming webhook URL.\n message_fn (Optional(Callable[[PipelineFailureSensorContext], str])): Function which\n takes in the ``PipelineFailureSensorContext`` and outputs the message you want to send.\n Defaults to a text message that contains error message, pipeline name, and run ID.\n http_proxy : (Optional[str]): Proxy for requests using http protocol.\n https_proxy : (Optional[str]): Proxy for requests using https protocol.\n timeout: (Optional[float]): Connection timeout in seconds. Defaults to 60.\n verify: (Optional[bool]): Whether to verify the servers TLS certificate.\n name: (Optional[str]): The name of the sensor. Defaults to "teams_on_pipeline_failure".\n dagit_base_url: (Optional[str]): The base url of your Dagit instance. Specify this to allow\n messages to include deeplinks to the failed pipeline run.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n\n Examples:\n\n .. code-block:: python\n\n teams_on_pipeline_failure = make_teams_on_pipeline_failure_sensor(\n hook_url=os.getenv("TEAMS_WEBHOOK_URL")\n )\n\n @repository\n def my_repo():\n return [my_pipeline + teams_on_pipeline_failure]\n\n .. code-block:: python\n\n def my_message_fn(context: PipelineFailureSensorContext) -> str:\n return "Pipeline {pipeline_name} failed! Error: {error}".format(\n pipeline_name=context.pipeline_run.pipeline_name,\n error=context.failure_event.message,\n )\n\n teams_on_pipeline_failure = make_teams_on_pipeline_failure_sensor(\n hook_url=os.getenv("TEAMS_WEBHOOK_URL"),\n message_fn=my_message_fn,\n dagit_base_url="http://localhost:3000",\n )\n\n\n """\n\n teams_client = TeamsClient(\n hook_url=hook_url,\n http_proxy=http_proxy,\n https_proxy=https_proxy,\n timeout=timeout,\n verify=verify,\n )\n\n @pipeline_failure_sensor(name=name, default_status=default_status)\n def teams_on_pipeline_failure(context: PipelineFailureSensorContext):\n\n text = message_fn(context)\n if dagit_base_url:\n text += "<a href='{base_url}/instance/runs/{run_id}'>View in Dagit</a>".format(\n base_url=dagit_base_url,\n run_id=context.pipeline_run.run_id,\n )\n card = Card()\n card.add_attachment(text_message=text)\n teams_client.post_message(payload=card.payload)\n\n return teams_on_pipeline_failure
\n
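The ``default_status`` argument shown above lets the sensor start enabled rather than needing to be toggled on in Dagit; a brief sketch:

.. code-block:: python

    # Hedged sketch: a Teams failure sensor that defaults to RUNNING as soon as
    # the repository is loaded.
    import os

    from dagster import DefaultSensorStatus
    from dagster_msteams import make_teams_on_pipeline_failure_sensor

    teams_on_pipeline_failure = make_teams_on_pipeline_failure_sensor(
        hook_url=os.getenv("TEAMS_WEBHOOK_URL"),
        default_status=DefaultSensorStatus.RUNNING,
    )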
", "current_page_name": "_modules/dagster_msteams/sensors", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_msteams.sensors"}}, "dagster_mysql": {"event_log": {"event_log": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_mysql.event_log.event_log

\nimport sqlalchemy as db\n\nimport dagster._check as check\nfrom dagster.core.storage.event_log import (\n    AssetKeyTable,\n    SqlEventLogStorage,\n    SqlEventLogStorageMetadata,\n    SqlPollingEventWatcher,\n)\nfrom dagster.core.storage.event_log.base import EventLogCursor\nfrom dagster.core.storage.event_log.migration import ASSET_KEY_INDEX_COLS\nfrom dagster.core.storage.sql import (\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData\n\nfrom ..utils import (\n    MYSQL_POOL_RECYCLE,\n    create_mysql_connection,\n    mysql_alembic_config,\n    mysql_config,\n    mysql_url_from_config,\n    retry_mysql_connection_fn,\n    retry_mysql_creation_fn,\n)\n\n\n
[docs]class MySQLEventLogStorage(SqlEventLogStorage, ConfigurableClass):\n """MySQL-backed event log storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagit`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-mysql.yaml\n :caption: dagster.yaml\n :start-after: start_marker_event_log\n :end-before: end_marker_event_log\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n\n """\n\n def __init__(self, mysql_url, inst_data=None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.mysql_url = check.str_param(mysql_url, "mysql_url")\n self._disposed = False\n\n self._event_watcher = SqlPollingEventWatcher(self)\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.mysql_url,\n isolation_level="AUTOCOMMIT",\n poolclass=db.pool.NullPool,\n )\n self._secondary_index_cache = {}\n\n table_names = retry_mysql_connection_fn(db.inspect(self._engine).get_table_names)\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared with other storage classes)\n if "event_logs" not in table_names:\n retry_mysql_creation_fn(self._init_db)\n # mark all secondary indexes to be used\n self.reindex_events()\n self.reindex_assets()\n\n super().__init__()\n\n def _init_db(self):\n with self._connect() as conn:\n with conn.begin():\n SqlEventLogStorageMetadata.create_all(conn)\n stamp_alembic_rev(mysql_alembic_config(__file__), conn)\n\n def optimize_for_dagit(self, statement_timeout):\n # When running in dagit, hold an open connection\n # https://github.com/dagster-io/dagster/issues/3719\n self._engine = create_engine(\n self.mysql_url,\n isolation_level="AUTOCOMMIT",\n pool_size=1,\n pool_recycle=MYSQL_POOL_RECYCLE,\n )\n\n def upgrade(self):\n alembic_config = mysql_alembic_config(__file__)\n with self._connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return mysql_config()\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return MySQLEventLogStorage(\n inst_data=inst_data, mysql_url=mysql_url_from_config(config_value)\n )\n\n @staticmethod\n def wipe_storage(mysql_url):\n engine = create_engine(mysql_url, isolation_level="AUTOCOMMIT", poolclass=db.pool.NullPool)\n try:\n SqlEventLogStorageMetadata.drop_all(engine)\n finally:\n engine.dispose()\n\n @staticmethod\n def create_clean_storage(conn_string):\n MySQLEventLogStorage.wipe_storage(conn_string)\n return MySQLEventLogStorage(conn_string)\n\n def store_asset_event(self, event):\n # last_materialization_timestamp is updated upon observation, materialization, materialization_planned\n # See SqlEventLogStorage.store_asset_event method for more details\n\n values = self._get_asset_entry_values(event, self.has_secondary_index(ASSET_KEY_INDEX_COLS))\n with self.index_connection() as conn:\n if values:\n conn.execute(\n db.dialects.mysql.insert(AssetKeyTable)\n .values(\n 
asset_key=event.dagster_event.asset_key.to_string(),\n **values,\n )\n .on_duplicate_key_update(\n **values,\n )\n )\n else:\n try:\n conn.execute(\n db.dialects.mysql.insert(AssetKeyTable).values(\n asset_key=event.dagster_event.asset_key.to_string(),\n )\n )\n except db.exc.IntegrityError:\n pass\n\n def _connect(self):\n return create_mysql_connection(self._engine, __file__, "event log")\n\n def run_connection(self, run_id=None):\n return self._connect()\n\n def index_connection(self):\n return self._connect()\n\n def has_secondary_index(self, name):\n if name not in self._secondary_index_cache:\n self._secondary_index_cache[name] = super(\n MySQLEventLogStorage, self\n ).has_secondary_index(name)\n return self._secondary_index_cache[name]\n\n def enable_secondary_index(self, name):\n super(MySQLEventLogStorage, self).enable_secondary_index(name)\n if name in self._secondary_index_cache:\n del self._secondary_index_cache[name]\n\n def watch(self, run_id, cursor, callback):\n if cursor and EventLogCursor.parse(cursor).is_offset_cursor():\n check.failed("Cannot call `watch` with an offset cursor")\n self._event_watcher.watch_run(run_id, cursor, callback)\n\n def end_watch(self, run_id, handler):\n self._event_watcher.unwatch_run(run_id, handler)\n\n @property\n def event_watcher(self):\n return self._event_watcher\n\n def __del__(self):\n self.dispose()\n\n def dispose(self):\n if not self._disposed:\n self._disposed = True\n self._event_watcher.close()\n\n def alembic_version(self):\n alembic_config = mysql_alembic_config(__file__)\n with self._connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
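Although this storage is normally constructed from ``dagster.yaml``, it can be instantiated directly (for example in tests); the connection-URL dialect below is an assumption and requires a reachable MySQL database.

.. code-block:: python

    # Hedged sketch: point the event log storage at an existing MySQL database
    # and apply any pending alembic migrations.
    from dagster_mysql import MySQLEventLogStorage

    storage = MySQLEventLogStorage(
        mysql_url="mysql+mysqlconnector://dagster:dagster@localhost:3306/dagster"
    )
    storage.upgrade()  # runs alembic upgrade using the config bundled with dagster_mysql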
", "current_page_name": "_modules/dagster_mysql/event_log/event_log", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_mysql.event_log.event_log"}}, "run_storage": {"run_storage": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_mysql.run_storage.run_storage

\nimport sqlalchemy as db\n\nimport dagster._check as check\nfrom dagster.core.storage.runs import (\n    DaemonHeartbeatsTable,\n    InstanceInfo,\n    RunStorageSqlMetadata,\n    SqlRunStorage,\n)\nfrom dagster.core.storage.sql import (\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData, serialize_dagster_namedtuple\nfrom dagster.utils import utc_datetime_from_timestamp\n\nfrom ..utils import (\n    MYSQL_POOL_RECYCLE,\n    create_mysql_connection,\n    mysql_alembic_config,\n    mysql_config,\n    mysql_url_from_config,\n    retry_mysql_connection_fn,\n    retry_mysql_creation_fn,\n)\n\n\n
[docs]class MySQLRunStorage(SqlRunStorage, ConfigurableClass):\n """MySQL-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagit`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-mysql.yaml\n :caption: dagster.yaml\n :start-after: start_marker_runs\n :end-before: end_marker_runs\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n """\n\n def __init__(self, mysql_url, inst_data=None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.mysql_url = mysql_url\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.mysql_url,\n isolation_level="AUTOCOMMIT",\n poolclass=db.pool.NullPool,\n )\n\n self._index_migration_cache = {}\n table_names = retry_mysql_connection_fn(db.inspect(self._engine).get_table_names)\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared with other storage classes)\n if "runs" not in table_names:\n retry_mysql_creation_fn(self._init_db)\n self.migrate()\n self.optimize()\n\n elif "instance_info" not in table_names:\n InstanceInfo.create(self._engine)\n\n super().__init__()\n\n def _init_db(self):\n with self.connect() as conn:\n with conn.begin():\n RunStorageSqlMetadata.create_all(conn)\n stamp_alembic_rev(mysql_alembic_config(__file__), conn)\n\n def optimize_for_dagit(self, statement_timeout):\n # When running in dagit, hold 1 open connection\n # https://github.com/dagster-io/dagster/issues/3719\n self._engine = create_engine(\n self.mysql_url,\n isolation_level="AUTOCOMMIT",\n pool_size=1,\n pool_recycle=MYSQL_POOL_RECYCLE,\n )\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return mysql_config()\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return MySQLRunStorage(inst_data=inst_data, mysql_url=mysql_url_from_config(config_value))\n\n @staticmethod\n def wipe_storage(mysql_url):\n engine = create_engine(mysql_url, isolation_level="AUTOCOMMIT", poolclass=db.pool.NullPool)\n try:\n RunStorageSqlMetadata.drop_all(engine)\n finally:\n engine.dispose()\n\n @staticmethod\n def create_clean_storage(mysql_url):\n MySQLRunStorage.wipe_storage(mysql_url)\n return MySQLRunStorage(mysql_url)\n\n def connect(self, run_id=None): # pylint: disable=arguments-differ, unused-argument\n return create_mysql_connection(self._engine, __file__, "run")\n\n def upgrade(self):\n alembic_config = mysql_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n def has_built_index(self, migration_name):\n if migration_name not in self._index_migration_cache:\n self._index_migration_cache[migration_name] = super(\n MySQLRunStorage, self\n ).has_built_index(migration_name)\n return self._index_migration_cache[migration_name]\n\n def mark_index_built(self, migration_name):\n super(MySQLRunStorage, self).mark_index_built(migration_name)\n if migration_name in self._index_migration_cache:\n del 
self._index_migration_cache[migration_name]\n\n def add_daemon_heartbeat(self, daemon_heartbeat):\n with self.connect() as conn:\n conn.execute(\n db.dialects.mysql.insert(DaemonHeartbeatsTable)\n .values(\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_type=daemon_heartbeat.daemon_type,\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_dagster_namedtuple(daemon_heartbeat),\n )\n .on_duplicate_key_update(\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_dagster_namedtuple(daemon_heartbeat),\n )\n )\n\n def alembic_version(self):\n alembic_config = mysql_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster_mysql/run_storage/run_storage", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_mysql.run_storage.run_storage"}}, "schedule_storage": {"schedule_storage": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_mysql.schedule_storage.schedule_storage

\nimport pendulum\nimport sqlalchemy as db\n\nimport dagster._check as check\nfrom dagster.core.storage.schedules import ScheduleStorageSqlMetadata, SqlScheduleStorage\nfrom dagster.core.storage.schedules.schema import InstigatorsTable\nfrom dagster.core.storage.sql import (\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData, serialize_dagster_namedtuple\n\nfrom ..utils import (\n    MYSQL_POOL_RECYCLE,\n    create_mysql_connection,\n    mysql_alembic_config,\n    mysql_config,\n    mysql_url_from_config,\n    retry_mysql_connection_fn,\n    retry_mysql_creation_fn,\n)\n\n\n
[docs]class MySQLScheduleStorage(SqlScheduleStorage, ConfigurableClass):\n """MySQL-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagit`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-mysql.yaml\n :caption: dagster.yaml\n :start-after: start_marker_schedules\n :end-before: end_marker_schedules\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n """\n\n def __init__(self, mysql_url, inst_data=None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.mysql_url = mysql_url\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.mysql_url,\n isolation_level="AUTOCOMMIT",\n poolclass=db.pool.NullPool,\n )\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared with other storage classes)\n table_names = retry_mysql_connection_fn(db.inspect(self._engine).get_table_names)\n if "jobs" not in table_names:\n retry_mysql_creation_fn(self._init_db)\n\n super().__init__()\n\n def _init_db(self):\n with self.connect() as conn:\n with conn.begin():\n ScheduleStorageSqlMetadata.create_all(conn)\n stamp_alembic_rev(mysql_alembic_config(__file__), conn)\n\n # mark all the data migrations as applied\n self.migrate()\n self.optimize()\n\n def optimize_for_dagit(self, statement_timeout):\n # When running in dagit, hold an open connection\n # https://github.com/dagster-io/dagster/issues/3719\n self._engine = create_engine(\n self.mysql_url,\n isolation_level="AUTOCOMMIT",\n pool_size=1,\n pool_recycle=MYSQL_POOL_RECYCLE,\n )\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return mysql_config()\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return MySQLScheduleStorage(\n inst_data=inst_data, mysql_url=mysql_url_from_config(config_value)\n )\n\n @staticmethod\n def wipe_storage(mysql_url):\n engine = create_engine(mysql_url, isolation_level="AUTOCOMMIT", poolclass=db.pool.NullPool)\n try:\n ScheduleStorageSqlMetadata.drop_all(engine)\n finally:\n engine.dispose()\n\n @staticmethod\n def create_clean_storage(mysql_url):\n MySQLScheduleStorage.wipe_storage(mysql_url)\n return MySQLScheduleStorage(mysql_url)\n\n def connect(self, run_id=None): # pylint: disable=arguments-differ, unused-argument\n return create_mysql_connection(self._engine, __file__, "schedule")\n\n def upgrade(self):\n alembic_config = mysql_alembic_config(__file__)\n run_alembic_upgrade(alembic_config, self._engine)\n\n def _add_or_update_instigators_table(self, conn, state):\n selector_id = state.selector_id\n conn.execute(\n db.dialects.mysql.insert(InstigatorsTable)\n .values(\n selector_id=selector_id,\n repository_selector_id=state.repository_selector_id,\n status=state.status.value,\n instigator_type=state.instigator_type.value,\n instigator_body=serialize_dagster_namedtuple(state),\n )\n .on_duplicate_key_update(\n status=state.status.value,\n instigator_type=state.instigator_type.value,\n 
instigator_body=serialize_dagster_namedtuple(state),\n update_timestamp=pendulum.now("UTC"),\n )\n )\n\n def alembic_version(self):\n alembic_config = mysql_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster_mysql/schedule_storage/schedule_storage", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_mysql.schedule_storage.schedule_storage"}}}, "dagster_pagerduty": {"resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_pagerduty.resources

\nfrom typing import Dict, Optional, cast\n\nimport pypd\n\nfrom dagster import Field, resource\n\n\nclass PagerDutyService:\n    """Integrates with PagerDuty via the pypd library.\n\n    See:\n        https://v2.developer.pagerduty.com/docs/events-api-v2\n        https://v2.developer.pagerduty.com/docs/send-an-event-events-api-v2\n        https://support.pagerduty.com/docs/services-and-integrations#section-events-api-v2\n        https://github.com/PagerDuty/pagerduty-api-python-client\n\n    for documentation and more information.\n    """\n\n    def __init__(self, routing_key: str):\n        self.routing_key = routing_key\n\n    def EventV2_create(\n        self,\n        summary: str,\n        source: str,\n        severity: str,\n        event_action: str = "trigger",\n        dedup_key: Optional[str] = None,\n        timestamp: Optional[str] = None,\n        component: Optional[str] = None,\n        group: Optional[str] = None,\n        event_class: Optional[str] = None,\n        custom_details: Optional[object] = None,\n    ) -> object:\n        """Events API v2 enables you to add PagerDuty's advanced event and incident management\n        functionality to any system that can make an outbound HTTP connection.\n\n        Arguments:\n            summary {string} -- A high-level, text summary message of the event. Will be used to\n                                construct an alert's description.\n\n                                Example: "PING OK - Packet loss = 0%, RTA = 1.41 ms" "Host\n                                         'acme-andromeda-sv1-c40 :: 179.21.24.50' is DOWN"\n\n            source {string} -- Specific human-readable unique identifier, such as a hostname, for\n                               the system having the problem.\n\n                               Examples:\n                               "prod05.theseus.acme-widgets.com"\n                               "171.26.23.22"\n                               "aws:elasticache:us-east-1:852511987:cluster/api-stats-prod-003"\n                               "9c09acd49a25"\n\n            severity {string} -- How impacted the affected system is. Displayed to users in lists\n                                 and influences the priority of any created incidents. Must be one\n                                 of {info, warning, error, critical}\n\n        Keyword Arguments:\n            event_action {str} -- There are three types of events that PagerDuty recognizes, and\n                                  are used to represent different types of activity in your\n                                  monitored systems. (default: 'trigger')\n                * trigger: When PagerDuty receives a trigger event, it will either open a new alert,\n                           or add a new trigger log entry to an existing alert, depending on the\n                           provided dedup_key. Your monitoring tools should send PagerDuty a trigger\n                           when a new problem has been detected. You may send additional triggers\n                           when a previously detected problem has occurred again.\n\n                * acknowledge: acknowledge events cause the referenced incident to enter the\n                               acknowledged state. While an incident is acknowledged, it won't\n                               generate any additional notifications, even if it receives new\n                               trigger events. 
Your monitoring tools should send PagerDuty an\n                               acknowledge event when they know someone is presently working on the\n                               problem.\n\n                * resolve: resolve events cause the referenced incident to enter the resolved state.\n                           Once an incident is resolved, it won't generate any additional\n                           notifications. New trigger events with the same dedup_key as a resolved\n                           incident won't re-open the incident. Instead, a new incident will be\n                           created. Your monitoring tools should send PagerDuty a resolve event when\n                           the problem that caused the initial trigger event has been fixed.\n\n            dedup_key {string} -- Deduplication key for correlating triggers and resolves. The\n                                  maximum permitted length of this property is 255 characters.\n\n            timestamp {string} -- Timestamp (ISO 8601). When the upstream system detected / created\n                                  the event. This is useful if a system batches or holds events\n                                  before sending them to PagerDuty.\n\n                                  Optional - Will be auto-generated by PagerDuty if not provided.\n\n                                  Example:\n                                  2015-07-17T08:42:58.315+0000\n\n            component {string} -- The part or component of the affected system that is broken.\n\n                                  Examples:\n                                  "keepalive"\n                                  "webping"\n                                  "mysql"\n                                  "wqueue"\n\n            group {string} -- A cluster or grouping of sources. 
For example, sources\n                              "prod-datapipe-02" and "prod-datapipe-03" might both be part of\n                              "prod-datapipe"\n\n                              Examples:\n                              "prod-datapipe"\n                              "www"\n                              "web_stack"\n\n            event_class {string} -- The class/type of the event.\n\n                                    Examples:\n                                    "High CPU"\n                                    "Latency"\n                                    "500 Error"\n\n            custom_details {Dict[str, str]} -- Additional details about the event and affected\n                                               system.\n\n                                               Example:\n                                               {"ping time": "1500ms", "load avg": 0.75 }\n        """\n\n        data = {\n            "routing_key": self.routing_key,\n            "event_action": event_action,\n            "payload": {"summary": summary, "source": source, "severity": severity},\n        }\n\n        if dedup_key is not None:\n            data["dedup_key"] = dedup_key\n\n        payload: Dict[str, object] = cast(Dict[str, object], data["payload"])\n\n        if timestamp is not None:\n            payload["timestamp"] = timestamp\n\n        if component is not None:\n            payload["component"] = component\n\n        if group is not None:\n            payload["group"] = group\n\n        if event_class is not None:\n            payload["class"] = event_class\n\n        if custom_details is not None:\n            payload["custom_details"] = custom_details\n\n        return pypd.EventV2.create(data=data)\n\n\n
[docs]@resource(\n {\n "routing_key": Field(\n str,\n description="""The routing key provisions access to your PagerDuty service. You\n will need to include the integration key for your new integration, as a\n routing_key in the event payload.""",\n )\n },\n description="""This resource is for posting events to PagerDuty.""",\n)\ndef pagerduty_resource(context):\n """A resource for posting events (alerts) to PagerDuty.\n\n Example:\n\n .. code-block:: python\n\n @op(required_resource_keys={'pagerduty'})\n def pagerduty_op(context):\n context.resources.pagerduty.EventV2_create(\n summary='alert from dagster',\n source='localhost',\n severity='error',\n event_action='trigger',\n )\n\n @job(resource_defs={ 'pagerduty': pagerduty_resource })\n def pagerduty_test():\n pagerduty_op()\n\n pagerduty_test.execute_in_process(\n run_config={\n "resources": {\n 'pagerduty': {'config': {'routing_key': '0123456789abcdef0123456789abcdef'}}\n }\n }\n )\n """\n return PagerDutyService(context.resource_config.get("routing_key"))
\n
", "current_page_name": "_modules/dagster_pagerduty/resources", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_pagerduty.resources"}}, "dagster_pandas": {"constraints": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_pandas.constraints

\nimport sys\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom functools import wraps\n\nimport pandas as pd\nfrom pandas import DataFrame\n\nfrom dagster import DagsterType, MetadataEntry, TypeCheck\nfrom dagster import _check as check\nfrom dagster.utils.backcompat import experimental_class_warning\n\n\nclass ConstraintViolationException(Exception):\n    """Indicates that a constraint has been violated."""\n\n\nclass ConstraintWithMetadataException(Exception):\n    """\n    This class defines the response generated when a pandas DF fails validation -- it can be used to generate either a\n    failed typecheck or an exception.\n\n    Args:\n        constraint_name (str):  the name of the violated constraint\n        constraint_description (Optional[str]): the description of the violated constraint\n        expectation (Optional[Union[dict,list, str, set]]): what result was expected -- typically a jsonlike, though it can be a string\n        offending (Optional[Union[dict,list, str, set]]):  which pieces of the dataframe violated the expectation, typically list or string\n        actual (Optional[Union[dict,list, str, set]]): what those pieces of the dataframe actually were -- typically a jsonlike\n    """\n\n    def __init__(\n        self,\n        constraint_name,\n        constraint_description="",\n        expectation=None,\n        offending=None,\n        actual=None,\n    ):\n        self.constraint_name = constraint_name\n        self.constraint_description = constraint_description\n        self.expectation = check.opt_inst_param(expectation, "expectation", (dict, list, str, set))\n        self.offending = check.opt_inst_param(offending, "offending", (dict, list, str, set))\n        self.actual = check.opt_inst_param(actual, "actual", (dict, list, str, set))\n        super(ConstraintWithMetadataException, self).__init__(\n            "Violated {} - {}, {} was/were expected, but we received {} which was/were {}".format(\n                constraint_name,\n                constraint_description,\n                expectation,\n                offending,\n                actual,\n            )\n        )\n\n    def normalize_metadata_json_value(self, val):\n        if isinstance(val, set):\n            return list(val)\n        else:\n            return val\n\n    def convert_to_metadata(self):\n        return MetadataEntry(\n            "constraint-metadata",\n            value={\n                "constraint_name": self.constraint_name,\n                "constraint_description": self.constraint_description,\n                "expected": self.normalize_metadata_json_value(self.expectation),\n                "offending": self.normalize_metadata_json_value(self.offending),\n                "actual": self.normalize_metadata_json_value(self.actual),\n            },\n        )\n\n    def return_as_typecheck(self):\n        return TypeCheck(\n            success=False, description=self.args[0], metadata_entries=[self.convert_to_metadata()]\n        )\n\n\nclass DataFrameConstraintViolationException(ConstraintViolationException):\n    """Indicates a dataframe level constraint has been violated."""\n\n    def __init__(self, constraint_name, constraint_description):\n        super(DataFrameConstraintViolationException, self).__init__(\n            "Violated {constraint_name} - {constraint_description}".format(\n                constraint_name=constraint_name, constraint_description=constraint_description\n            )\n        )\n\n\nclass 
DataFrameWithMetadataException(ConstraintWithMetadataException):\n    def __init__(self, constraint_name, constraint_description, expectation, actual):\n        super(DataFrameWithMetadataException, self).__init__(\n            constraint_name, constraint_description, expectation, "a malformed dataframe", actual\n        )\n\n\nclass ColumnConstraintViolationException(ConstraintViolationException):\n    """Indicates that a column constraint has been violated."""\n\n    def __init__(self, constraint_name, constraint_description, column_name, offending_rows=None):\n        self.constraint_name = constraint_name\n        self.constraint_description = constraint_description\n        self.column_name = column_name\n        self.offending_rows = offending_rows\n        super(ColumnConstraintViolationException, self).__init__(self.construct_message())\n\n    def construct_message(self):\n        base_message = 'Violated "{constraint_name}" for column "{column_name}" - {constraint_description}'.format(\n            constraint_name=self.constraint_name,\n            constraint_description=self.constraint_description,\n            column_name=self.column_name,\n        )\n        if self.offending_rows is not None:\n            base_message += "The offending (index, row values) are the following: {}".format(\n                self.offending_rows\n            )\n        return base_message\n\n\nclass ColumnWithMetadataException(ConstraintWithMetadataException):\n    def __init__(self, constraint_name, constraint_description, expectation, offending, actual):\n        super(ColumnWithMetadataException, self).__init__(\n            "the column constraint " + constraint_name,\n            constraint_description,\n            expectation,\n            offending,\n            actual,\n        )\n\n\nclass Constraint:\n    """\n    Base constraint object that all constraints inherit from.\n\n    Args:\n        error_description (Optional[str]): The plain string description that is output in the terminal if the constraint fails.\n        markdown_description (Optional[str]): A markdown supported description that is emitted by dagit if the constraint fails.\n    """\n\n    def __init__(self, error_description=None, markdown_description=None):\n        self.name = self.__class__.__name__\n        self.markdown_description = check.str_param(markdown_description, "markdown_description")\n        self.error_description = check.str_param(error_description, "error_description")\n\n\nclass ConstraintWithMetadata:\n    """\n    This class defines a base constraint over pandas DFs with organized metadata\n\n    args:\n        description (str): description of the constraint\n        validation_fn (Callable[[DataFrame], Tuple[bool, dict[str, Union[dict,list, str, set]]]]:\n                    the validation function to run over inputted data\n                    This function should return a tuple of a boolean for success or failure, and a dict containing\n                    metadata about the test -- this metadata will be passed to the resulting exception if validation\n                    fails.\n        resulting_exception (ConstraintWithMetadataException):  what response a failed typecheck should induce\n        raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event\n                    (if set to False) when validation fails\n        name (Optional[str]): what to call the constraint, defaults to the class name.\n    """\n\n    # TODO:  validation_fn 
returning metadata is sorta broken.  maybe have it yield typecheck events and grab metadata?\n\n    def __init__(\n        self, description, validation_fn, resulting_exception, raise_or_typecheck=True, name=None\n    ):\n        experimental_class_warning(self.__class__.__name__)\n        if name is None:\n            self.name = self.__class__.__name__\n        else:\n            self.name = name\n        self.description = description\n        # should return a tuple of (bool, and either an empty dict or a dict of extra params)\n        self.validation_fn = validation_fn\n        self.resulting_exception = resulting_exception\n        self.raise_or_typecheck = raise_or_typecheck\n\n    def validate(self, data, *args, **kwargs):\n        res = self.validation_fn(data, *args, **kwargs)\n        if not res[0]:\n            exc = self.resulting_exception(\n                constraint_name=self.name, constraint_description=self.description, **res[1]\n            )\n\n            if self.raise_or_typecheck:\n                raise exc\n            else:\n                return exc.return_as_typecheck()\n\n        else:\n            if res[0]:\n                return TypeCheck(success=True)\n\n    # TODO:  composition of validations\n    def as_dagster_type(self, *args, **kwargs):\n        if self.raise_or_typecheck:\n            raise Exception(\n                "Dagster types can only be constructed from constraints that return typechecks"\n            )\n        return DagsterType(\n            name=self.name,\n            description="A Pandas DataFrame with the following validation: {}".format(\n                self.description\n            ),\n            type_check_fn=lambda x: self.validate(x, *args),\n            **kwargs,\n        )\n\n\nclass MultiConstraintWithMetadata(ConstraintWithMetadata):\n    """\n    Use this class if you have multiple constraints to check over the entire dataframe\n\n    args:\n        description (str): description of the constraint\n        validation_fn_arr(List[Callable[[DataFrame], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):\n                    a list of the validation functions to run over inputted data\n                    Each function should return a tuple of a boolean for success or failure, and a dict containing\n                    metadata about the test -- this metadata will be passed to the resulting exception if validation\n                    fails.\n        resulting_exception (ConstraintWithMetadataException):  what response a failed typecheck should induce\n        raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event\n                    (if set to False) when validation fails\n        name (Optional[str]): what to call the constraint, defaults to the class name.\n    """\n\n    def __init__(\n        self,\n        description,\n        validation_fn_arr,\n        resulting_exception,\n        raise_or_typecheck=True,\n        name=None,\n    ):\n        validation_fn_arr = check.list_param(validation_fn_arr, "validation_fn_arr")\n\n        def validation_fn(data, *args, **kwargs):\n\n            results = [f(data, *args, **kwargs) for f in validation_fn_arr]\n            truthparam = all(item[0] for item in results)\n            metadict = defaultdict(dict)\n            for i, dicta in enumerate(item[1] for item in results):\n                if len(dicta.keys()) > 0:\n                    for key in dicta:\n                        
metadict[key][validation_fn_arr[i].__name__] = dicta[key]\n            return (truthparam, metadict)\n\n        super(MultiConstraintWithMetadata, self).__init__(\n            description,\n            validation_fn,\n            resulting_exception,\n            raise_or_typecheck=raise_or_typecheck,\n            name=name,\n        )\n\n\nclass StrictColumnsWithMetadata(ConstraintWithMetadata):\n    def __init__(self, column_list, enforce_ordering=False, raise_or_typecheck=True, name=None):\n        self.enforce_ordering = check.bool_param(enforce_ordering, "enforce_ordering")\n        self.column_list = check.list_param(column_list, "strict_column_list", of_type=str)\n\n        def validation_fcn(inframe):\n            if list(inframe.columns) == column_list:\n                return (True, {})\n            else:\n                if self.enforce_ordering:\n                    resdict = {"expectation": self.column_list, "actual": list(inframe.columns)}\n                    return (False, resdict)\n                else:\n                    if set(inframe.columns) == set(column_list):\n                        return (True, {})\n                    else:\n                        extra = [x for x in inframe.columns if x not in set(column_list)]\n                        missing = [x for x in set(column_list) if x not in inframe.columns]\n                        resdict = {\n                            "expectation": self.column_list,\n                            "actual": {"extra_columns": extra, "missing_columns": missing},\n                        }\n                        return (False, resdict)\n\n        basestr = "ensuring that the right columns, {} were present".format(self.column_list)\n        if enforce_ordering:\n            basestr += " in the right order"\n        super(StrictColumnsWithMetadata, self).__init__(\n            basestr,\n            validation_fcn,\n            DataFrameWithMetadataException,\n            raise_or_typecheck=raise_or_typecheck,\n            name=name,\n        )\n\n\nclass DataFrameConstraint(Constraint):\n    """\n    Base constraint object that represent Dataframe shape constraints.\n\n    Args:\n        error_description (Optional[str]): The plain string description that is output in the terminal if the constraint fails.\n        markdown_description (Optional[str]): A markdown supported description that is emitted by dagit if the constraint fails.\n    """\n\n    def __init__(self, error_description=None, markdown_description=None):\n        super(DataFrameConstraint, self).__init__(\n            error_description=error_description, markdown_description=markdown_description\n        )\n\n    def validate(self, dataframe):\n        raise NotImplementedError()\n\n\n
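As a quick illustration of the ``raise_or_typecheck=False`` mode described above, the sketch below (sample data invented for this example) receives a failed ``TypeCheck`` back instead of an exception:

.. code-block:: python

    # Illustrative only: validate a dataframe whose columns do not match the expectation.
    from pandas import DataFrame
    from dagster_pandas.constraints import StrictColumnsWithMetadata

    strict_columns = StrictColumnsWithMetadata(["foo", "bar"], raise_or_typecheck=False)
    result = strict_columns.validate(DataFrame({"foo": [1], "baz": [2]}))
    assert not result.success   # metadata records the extra and missing columns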
[docs]class StrictColumnsConstraint(DataFrameConstraint):\n """\n A dataframe constraint that validates column existence and ordering.\n\n Args:\n strict_column_list (List[str]): The exact list of columns that your dataframe must have.\n enforce_ordering (Optional[bool]): If true, will enforce that the ordering of column names must match.\n Default is False.\n """\n\n def __init__(self, strict_column_list, enforce_ordering=False):\n self.enforce_ordering = check.bool_param(enforce_ordering, "enforce_ordering")\n self.strict_column_list = check.list_param(\n strict_column_list, "strict_column_list", of_type=str\n )\n description = "No columns outside of {cols} allowed. ".format(cols=self.strict_column_list)\n if enforce_ordering:\n description += "Columns must be in that order."\n super(StrictColumnsConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe):\n check.inst_param(dataframe, "dataframe", DataFrame)\n columns_received = list(dataframe.columns)\n if self.enforce_ordering:\n if self.strict_column_list != columns_received:\n raise DataFrameConstraintViolationException(\n constraint_name=self.name,\n constraint_description="Expected the following ordering of columns {expected}. Received: {received}".format(\n expected=self.strict_column_list, received=columns_received\n ),\n )\n for column in columns_received:\n if column not in self.strict_column_list:\n raise DataFrameConstraintViolationException(\n constraint_name=self.name,\n constraint_description="Expected {}. Received {}.".format(\n self.strict_column_list, columns_received\n ),\n )
\n\n\n
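A minimal sketch of using ``StrictColumnsConstraint`` on its own; the dataframes here are invented for illustration:

.. code-block:: python

    from pandas import DataFrame
    from dagster_pandas.constraints import StrictColumnsConstraint

    ordered = StrictColumnsConstraint(["foo", "bar"], enforce_ordering=True)
    ordered.validate(DataFrame({"foo": [1], "bar": [2]}))    # passes silently
    # ordered.validate(DataFrame({"bar": [2], "foo": [1]}))  # would raise
    #     DataFrameConstraintViolationException: wrong column ordering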
[docs]class RowCountConstraint(DataFrameConstraint):\n """\n A dataframe constraint that validates the expected count of rows.\n\n Args:\n num_allowed_rows (int): The number of allowed rows in your dataframe.\n error_tolerance (Optional[int]): The acceptable threshold if you are not completely certain. Defaults to 0.\n """\n\n def __init__(self, num_allowed_rows, error_tolerance=0):\n self.num_allowed_rows = check.int_param(num_allowed_rows, "num_allowed_rows")\n self.error_tolerance = abs(check.int_param(error_tolerance, "error_tolerance"))\n if self.error_tolerance > self.num_allowed_rows:\n raise ValueError("Tolerance can't be greater than the number of rows you expect.")\n description = "Dataframe must have {} +- {} rows.".format(\n self.num_allowed_rows, self.error_tolerance\n )\n super(RowCountConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe):\n check.inst_param(dataframe, "dataframe", DataFrame)\n\n if not (\n self.num_allowed_rows - self.error_tolerance\n <= len(dataframe)\n <= self.num_allowed_rows + self.error_tolerance\n ):\n raise DataFrameConstraintViolationException(\n constraint_name=self.name,\n constraint_description="Expected {expected} +- {tolerance} rows. Got {received}".format(\n expected=self.num_allowed_rows,\n tolerance=self.error_tolerance,\n received=len(dataframe),\n ),\n )
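A minimal sketch of ``RowCountConstraint`` with a tolerance; the data is invented for illustration:

.. code-block:: python

    from pandas import DataFrame
    from dagster_pandas.constraints import RowCountConstraint

    row_count = RowCountConstraint(num_allowed_rows=3, error_tolerance=1)
    row_count.validate(DataFrame({"foo": [1, 2, 3, 4]}))   # 4 rows: within 3 +/- 1
    # row_count.validate(DataFrame({"foo": [1]}))          # would raise: only 1 row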
\n\n\ndef apply_ignore_missing_data_to_mask(mask, column):\n return mask & ~column.isnull()\n\n\nclass ColumnAggregateConstraintWithMetadata(ConstraintWithMetadata):\n """\n Similar to the base class, but now your validation functions should take in columns (pd.Series) not Dataframes.\n args:\n description (str): description of the constraint\n validation_fn (Callable[[pd.Series], Tuple[bool, dict[str, Union[dict,list, str, set]]]]:\n the validation function to run over inputted data\n This function should return a tuple of a boolean for success or failure, and a dict containing\n metadata about the test -- this metadata will be passed to the resulting exception if validation\n fails.\n resulting_exception (ConstraintWithMetadataException): what response a failed typecheck should induce\n raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event\n (if set to False) when validation fails\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def validate(self, data, *columns, **kwargs):\n if len(columns) == 0:\n columns = data.columns\n columns = [column for column in columns if column in data.columns]\n relevant_data = data[list(columns)]\n\n offending_columns = set()\n offending_values = {}\n for column in columns:\n # TODO: grab extra metadata\n res = self.validation_fn(relevant_data[column])\n if not res[0]:\n offending_columns.add(column)\n if not res[1].get("actual") is None:\n offending_values[column] = [x.item() for x in res[1].get("actual").to_numpy()]\n else:\n offending_values[column] = [x.item() for x in relevant_data[column].to_numpy()]\n if len(offending_columns) == 0 and not self.raise_or_typecheck:\n return TypeCheck(success=True)\n elif len(offending_columns) > 0:\n metadict = {\n "expectation": self.description.replace("Confirms", ""),\n "actual": offending_values,\n "offending": offending_columns,\n }\n exc = self.resulting_exception(\n constraint_name=self.name, constraint_description=self.description, **metadict\n )\n\n if self.raise_or_typecheck:\n raise exc\n else:\n return exc.return_as_typecheck()\n\n\nclass ColumnConstraintWithMetadata(ConstraintWithMetadata):\n """\n This class is useful for constructing single constraints that\n you want to apply to multiple columns of your dataframe\n The main difference from the base class in terms of construction is that now, your validation_fns should operate on\n individual values.\n args:\n description (str): description of the constraint\n validation_fn (Callable[[Any], Tuple[bool, dict[str, Union[dict,list, str, set]]]]:\n the validation function to run over inputted data\n This function should return a tuple of a boolean for success or failure, and a dict containing\n metadata about the test -- this metadata will be passed to the resulting exception if validation\n fails.\n resulting_exception (ConstraintWithMetadataException): what response a failed typecheck should induce\n raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event\n (if set to False) when validation fails\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def validate(self, data, *columns, **kwargs):\n if len(columns) == 0:\n columns = data.columns\n\n columns = [column for column in columns if column in data.columns]\n relevant_data = data[list(columns)]\n offending = {}\n offending_values = {}\n # TODO: grab metadata from here\n inverse_validation = lambda x: 
not self.validation_fn(x)[0]\n for column in columns:\n results = relevant_data[relevant_data[column].apply(inverse_validation)]\n if len(results.index.tolist()) > 0:\n offending[column] = ["row " + str(i) for i in (results.index.tolist())]\n offending_values[column] = results[column].tolist()\n if len(offending) == 0:\n if not self.raise_or_typecheck:\n return TypeCheck(success=True)\n else:\n metadict = {\n "expectation": self.validation_fn.__doc__,\n "actual": offending_values,\n "offending": offending,\n }\n exc = self.resulting_exception(\n constraint_name=self.name, constraint_description=self.description, **metadict\n )\n\n if self.raise_or_typecheck:\n raise exc\n else:\n return exc.return_as_typecheck()\n\n\nclass MultiColumnConstraintWithMetadata(ColumnConstraintWithMetadata):\n """\n This class is useful for constructing more complicated relationships between columns\n and expectations -- i.e. you want some validations on column A, others on column B, etc.\n This lets you package up the metadata neatly,\n and also allows for cases like 'fail if any one of these constraints fails but still run all of them'\n\n Args:\n description (str): description of the overall set of validations\n fn_and_columns_dict (Dict[str, List[Callable[[Any], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):\n while this is a relatively complex type,\n what it amounts to is 'a dict mapping columns to the functions to\n run on them'\n resulting_exception (type): the response to generate if validation fails. Subclass of\n ConstraintWithMetadataException\n raise_or_typecheck (Optional[bool]): whether to raise an exception (true) or a failed typecheck (false)\n type_for_internal (Optional[type]): what type to use for internal validators. Subclass of\n ConstraintWithMetadata\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def __init__(\n self,\n description,\n fn_and_columns_dict,\n resulting_exception,\n raise_or_typecheck=True,\n type_for_internal=ColumnConstraintWithMetadata,\n name=None,\n ):\n # TODO: support multiple descriptions\n self.column_to_fn_dict = check.dict_param(\n fn_and_columns_dict, "fn_and_columns_dict", key_type=str\n )\n\n def validation_fn(data, *args, **kwargs):\n metadict = defaultdict(dict)\n truthparam = True\n for column, fn_arr in self.column_to_fn_dict.items():\n if column not in data.columns:\n continue\n for fn in fn_arr:\n # TODO: do this more effectively\n new_validator = type_for_internal(\n fn.__doc__, fn, ColumnWithMetadataException, raise_or_typecheck=False\n )\n result = new_validator.validate(\n DataFrame(data[column]), column, *args, **kwargs\n )\n result_val = result.success\n if result_val:\n continue\n result_dict = result.metadata_entries[0].entry_data.data\n truthparam = truthparam and result_val\n for key in result_dict.keys():\n if "constraint" not in key:\n if key == "expected":\n new_key = "expectation"\n result_dict[key] = result_dict[key].replace("returns", "").strip()\n if column not in metadict[new_key] or new_key not in metadict:\n metadict[new_key][column] = dict()\n metadict[new_key][column][fn.__name__] = result_dict[key]\n else:\n if column not in metadict[key] or key not in metadict:\n metadict[key][column] = dict()\n if isinstance(result_dict[key], dict):\n metadict[key][column][fn.__name__] = result_dict[key][column]\n else:\n metadict[key][column][fn.__name__] = "a violation"\n return truthparam, metadict\n\n super(MultiColumnConstraintWithMetadata, self).__init__(\n description,\n validation_fn,\n 
resulting_exception,\n raise_or_typecheck=raise_or_typecheck,\n name=name,\n )\n\n def validate(self, data, *args, **kwargs):\n return ConstraintWithMetadata.validate(self, data, *args, **kwargs)\n\n\nclass MultiAggregateConstraintWithMetadata(MultiColumnConstraintWithMetadata):\n """\n This class is similar to multicolumn, but takes in functions that operate on the whole column at once\n rather than ones that operate on each value --\n consider this similar to the difference between apply-map and apply aggregate.\n\n Args:\n description (str): description of the overall set of validations (TODO: support multiple descriptions)\n fn_and_columns_dict (Dict[str, List[Callable[[pd.Series], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):\n while this is a relatively complex type,\n what it amounts to is a dict mapping columns to the functions to\n run on them'\n resulting_exception (type): the response to generate if validation fails. Subclass of\n ConstraintWithMetadataException\n raise_or_typecheck (Optional[bool]): whether to raise an exception (true) or a failed typecheck (false)\n type_for_internal (Optional[type]): what type to use for internal validators. Subclass of\n ConstraintWithMetadata\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def __init__(\n self,\n description,\n fn_and_columns_dict,\n resulting_exception,\n raise_or_typecheck=True,\n name=None,\n ):\n super(MultiAggregateConstraintWithMetadata, self).__init__(\n description,\n fn_and_columns_dict,\n resulting_exception,\n raise_or_typecheck=raise_or_typecheck,\n type_for_internal=ColumnAggregateConstraintWithMetadata,\n name=name,\n )\n\n\ndef non_null_validation(x):\n """\n validates that a particular value in a column is not null\n Usage:\n pass this as a column validator to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n Generally, you should prefer to use nonnull as a decorator/wrapper rather than using this\n directly.\n """\n return not pd.isnull(x), {}\n\n\ndef all_unique_validator(column, ignore_missing_vals=False):\n """\n validates that all values in an iterable are unique\n Returns duplicated values as metadata\n\n Usage:\n As a validation function for a\n :py:class:'~dagster_pandas.constraints.ColumnAggregateConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiAggregateConstraintWithMetadata'\n Example:\n .. 
code-block:: python\n aggregate_validator = MultiAggregateConstraintWithMetadata(\n "confirms all values are unique",\n {'bar': [all_unique_validator]},\n ConstraintWithMetadataException,\n raise_or_typecheck=False,\n )\n ntype = create_structured_dataframe_type(\n "NumericType",\n columns_aggregate_validator=aggregate_validator\n )\n @op(out={'basic_dataframe': Out(dagster_type=ntype)})\n def create_dataframe(_):\n yield Output(\n DataFrame({'foo': [1, 2, 3], 'bar': [9, 10, 10]}),\n output_name='basic_dataframe',\n )\n #will fail with\n metadata['offending'] == {'bar': {'all_unique_validator': 'a violation'}}\n metadata['actual'] == {'bar': {'all_unique_validator': [10.0]}}\n """\n column = pd.Series(column)\n duplicated = column.duplicated()\n if ignore_missing_vals:\n duplicated = apply_ignore_missing_data_to_mask(duplicated, column)\n return not duplicated.any(), {"actual": column[duplicated]}\n\n\ndef nonnull(func):\n """\n decorator for column validation functions to make them error on nulls\n Usage:\n pass decorated functions as column validators to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n Args:\n func (Callable[[Any], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):\n the column validator you want to error on nulls\n """\n\n @wraps(func)\n def nvalidator(val):\n origval = func(val)\n nval = non_null_validation(val)\n return origval[0] and nval[0], {}\n\n nvalidator.__doc__ += " and ensures no values are null"\n\n return nvalidator\n\n\ndef column_range_validation_factory(minim=None, maxim=None, ignore_missing_vals=False):\n """\n factory for validators testing if column values are within a range\n Args:\n minim(Optional[Comparable]): the low end of the range\n maxim(Optional[Comparable]): the high end of the range\n ignore_missing_vals(Optional[bool]): whether to ignore nulls\n\n Returns: a validation function for this constraint\n Usage:\n pass returned functions as column validators to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n Examples:\n .. 
code-block:: python\n in_range_validator = column_range_validation_factory(1, 3, ignore_missing_vals=True)\n column_validator = MultiColumnConstraintWithMetadata(\n "confirms values are numbers in a range",\n {'foo': [in_range_validator]},\n ColumnWithMetadataException,\n raise_or_typecheck=False,\n )\n ntype = create_structured_dataframe_type(\n "NumericType",\n columns_validator=column_validator\n )\n @op(out={'basic_dataframe': Out(dagster_type=ntype)})\n def create_dataframe(_):\n yield Output(\n DataFrame({'foo': [1, 2, 7], 'bar': [9, 10, 10]}),\n output_name='basic_dataframe',\n )\n #will fail with\n metadata['offending'] == {'foo': {'in_range_validation_fn': ['row 2']}}\n metadata['actual'] == {'foo': {'in_range_validation_fn': [7]}}\n\n """\n if minim is None:\n if isinstance(maxim, datetime):\n minim = datetime.min\n else:\n minim = -1 * (sys.maxsize - 1)\n if maxim is None:\n if isinstance(minim, datetime):\n maxim = datetime.max\n else:\n maxim = sys.maxsize\n\n def in_range_validation_fn(x):\n if ignore_missing_vals and pd.isnull(x):\n return True, {}\n return (isinstance(x, (type(minim), type(maxim)))) and (x <= maxim) and (x >= minim), {}\n\n in_range_validation_fn.__doc__ = "checks whether values are between {} and {}".format(\n minim, maxim\n )\n if ignore_missing_vals:\n in_range_validation_fn.__doc__ += ", ignoring nulls"\n\n return in_range_validation_fn\n\n\ndef categorical_column_validator_factory(categories, ignore_missing_vals=False):\n """\n factory for validators testing if all values are in some set\n Args:\n categories(Union[Sequence, set]): the set of allowed values\n ignore_missing_vals(Optional[bool]): whether to ignore nulls\n\n Returns: a validation function for this constraint\n\n Usage:\n pass returned functions as column validators to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n\n Example:\n .. 
code-block:: python\n categorical_validation_fn = categorical_column_validator_factory([1, 2])\n column_validator = MultiColumnConstraintWithMetadata(\n "confirms values are numbers in a range",\n {'foo': [categorical_validation_fn]},\n ColumnWithMetadataException,\n raise_or_typecheck=False,\n )\n ntype = create_structured_dataframe_type(\n "NumericType",\n columns_validator=column_validator\n )\n @op(out={'basic_dataframe': Out(dagster_type=ntype)})\n def create_dataframe(_):\n yield Output(\n DataFrame({'foo': [1, 2, 7], 'bar': [9, 10, 10]}),\n output_name='basic_dataframe',\n )\n #will fail with\n metadata['offending'] == {'foo': {'categorical_validation_fn': ['row 2']}}\n metadata['actual'] == {'foo': {'categorical_validation_fn': [7]}}\n\n """\n\n categories = set(categories)\n\n def categorical_validation_fn(x):\n if ignore_missing_vals and pd.isnull(x):\n return True, {}\n return (x in categories), {}\n\n categorical_validation_fn.__doc__ = (\n "checks whether values are within this set of values: {}".format(categories)\n )\n if ignore_missing_vals:\n categorical_validation_fn.__doc__ += ", ignoring nulls"\n\n return categorical_validation_fn\n\n\ndef dtype_in_set_validation_factory(datatypes, ignore_missing_vals=False):\n """\n factory for testing if the dtype of a val falls within some allowed set\n Args:\n datatypes(Union[set[type], type]): which datatype/datatypes are allowed\n ignore_missing_vals(Optional[bool]): whether to ignore nulls\n\n Returns: a validation function for this constraint\n\n Usage:\n pass returned functions as column validators to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n\n Examples:\n .. code-block:: python\n dtype_is_num_validator = dtype_in_set_validation_factory((int, float, int64, float64))\n column_validator = MultiColumnConstraintWithMetadata(\n "confirms values are numbers in a range",\n {'foo': [dtype_is_num_validator]},\n ColumnWithMetadataException,\n raise_or_typecheck=False,\n )\n ntype = create_structured_dataframe_type(\n "NumericType",\n columns_validator=column_validator\n )\n @op(out={'basic_dataframe': Out(dagster_type=ntype)})\n def create_dataframe(_):\n yield Output(\n DataFrame({'foo': [1, 'a', 7], 'bar': [9, 10, 10]}),\n output_name='basic_dataframe',\n )\n #will fail with\n metadata['offending'] == {'foo': {'categorical_validation_fn': ['row 1']}}\n metadata['actual'] == {'foo': {'categorical_validation_fn': ['a']}}\n\n """\n\n def dtype_in_set_validation_fn(x):\n if ignore_missing_vals and pd.isnull(x):\n return True, {}\n return isinstance(x, datatypes), {}\n\n dtype_in_set_validation_fn.__doc__ = "checks whether values are this type/types: {}".format(\n datatypes\n )\n if ignore_missing_vals:\n dtype_in_set_validation_fn.__doc__ += ", ignoring nulls"\n\n return dtype_in_set_validation_fn\n\n\nclass ColumnRangeConstraintWithMetadata(ColumnConstraintWithMetadata):\n def __init__(self, minim=None, maxim=None, columns=None, raise_or_typecheck=True):\n self.name = self.__class__.__name__\n\n description = "Confirms values are between {} and {}".format(minim, maxim)\n super(ColumnRangeConstraintWithMetadata, self).__init__(\n description=description,\n validation_fn=column_range_validation_factory(minim=minim, maxim=maxim),\n resulting_exception=ColumnWithMetadataException,\n raise_or_typecheck=raise_or_typecheck,\n )\n self.columns = columns\n\n def validate(self, data, *args, **kwargs):\n if self.columns is None:\n 
self.columns = list(data.columns)\n self.columns.extend(args)\n return super(ColumnRangeConstraintWithMetadata, self).validate(\n data, *self.columns, **kwargs\n )\n\n\nclass ColumnConstraint(Constraint):\n """\n Base constraint object that represent dataframe column shape constraints.\n\n Args:\n error_description (Optional[str]): The plain string description that is output in the terminal if the constraint fails.\n markdown_description (Optional[str]): A markdown supported description that is emitted by dagit if the constraint fails.\n """\n\n def __init__(self, error_description=None, markdown_description=None):\n super(ColumnConstraint, self).__init__(\n error_description=error_description, markdown_description=markdown_description\n )\n\n def validate(self, dataframe, column_name):\n pass\n\n @staticmethod\n def get_offending_row_pairs(dataframe, column_name):\n return zip(dataframe.index.tolist(), dataframe[column_name].tolist())\n\n\nclass ColumnDTypeFnConstraint(ColumnConstraint):\n """\n A column constraint that applies a pandas dtype validation function to a columns dtype.\n\n Args:\n type_fn (Callable[[Set[str]], bool]): This is a function that takes the pandas columns dtypes and\n returns if those dtypes match the types it expects. See pandas.core.dtypes.common for examples.\n """\n\n def __init__(self, type_fn):\n self.type_fn = check.callable_param(type_fn, "type_fn")\n description = f'Dtype must satisfy "{self.type_fn.__name__}"'\n super(ColumnDTypeFnConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe, column_name):\n column_dtype = dataframe[column_name].dtype\n if not self.type_fn(column_dtype):\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=f'{self.error_description}, but was "{column_dtype}"',\n column_name=column_name,\n )\n\n\nclass ColumnDTypeInSetConstraint(ColumnConstraint):\n """\n A column constraint that validates the pandas column dtypes based on the expected set of dtypes.\n\n Args:\n expected_dtype_set (Set[str]): The set of pandas dtypes that the pandas column dtypes must match.\n """\n\n def __init__(self, expected_dtype_set):\n self.expected_dtype_set = check.set_param(expected_dtype_set, "expected_dtype_set")\n description = "Column dtype must be in the following set {}.".format(\n self.expected_dtype_set\n )\n super(ColumnDTypeInSetConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe, column_name):\n received_dtypes = dataframe[column_name].dtype\n if str(received_dtypes) not in self.expected_dtype_set:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description="{base_error_message}. 
DTypes received: {received_dtypes}".format(\n base_error_message=self.error_description, received_dtypes=received_dtypes\n ),\n column_name=column_name,\n )\n\n\nclass NonNullableColumnConstraint(ColumnConstraint):\n """\n A column constraint that ensures all values in a pandas column are not null.\n """\n\n def __init__(self):\n description = "No Null values allowed."\n super(NonNullableColumnConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe, column_name):\n rows_with_null_columns = dataframe[dataframe[column_name].isna()]\n if not rows_with_null_columns.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=self.get_offending_row_pairs(rows_with_null_columns, column_name),\n )\n\n\nclass UniqueColumnConstraint(ColumnConstraint):\n """\n A column constraint that ensures all values in a pandas column are unique.\n\n Args:\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.\n """\n\n def __init__(self, ignore_missing_vals):\n description = "Column must be unique."\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(UniqueColumnConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe, column_name):\n invalid = dataframe[column_name].duplicated()\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n rows_with_duplicated_values = dataframe[invalid]\n if not rows_with_duplicated_values.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=rows_with_duplicated_values,\n )\n\n\nclass CategoricalColumnConstraint(ColumnConstraint):\n """\n A column constraint that ensures all values in a pandas column are a valid category.\n\n Args:\n categories (Set[str]): Set of categories that values in your pandas column must match.\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.\n """\n\n def __init__(self, categories, ignore_missing_vals):\n self.categories = list(check.set_param(categories, "categories", of_type=str))\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(CategoricalColumnConstraint, self).__init__(\n error_description="Expected Categories are {}".format(self.categories),\n markdown_description="Category examples are {}...".format(self.categories[:5]),\n )\n\n def validate(self, dataframe, column_name):\n invalid = ~dataframe[column_name].isin(self.categories)\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n rows_with_unexpected_buckets = dataframe[invalid]\n if not rows_with_unexpected_buckets.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=rows_with_unexpected_buckets,\n )\n\n\nclass MinValueColumnConstraint(ColumnConstraint):\n """\n A column constraint that ensures all values in a pandas column are greater than the provided\n lower bound [inclusive].\n\n Args:\n min_value (Union[int, float, datetime.datetime]): The lower bound.\n ignore_missing_vals (bool): If 
true, this constraint will enforce the constraint on non missing values.\n """\n\n def __init__(self, min_value, ignore_missing_vals):\n self.min_value = check.inst_param(min_value, "min_value", (int, float, datetime))\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(MinValueColumnConstraint, self).__init__(\n markdown_description="values > {}".format(self.min_value),\n error_description="Column must have values > {}".format(self.min_value),\n )\n\n def validate(self, dataframe, column_name):\n invalid = dataframe[column_name] < self.min_value\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n out_of_bounds_rows = dataframe[invalid]\n if not out_of_bounds_rows.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=out_of_bounds_rows,\n )\n\n\nclass MaxValueColumnConstraint(ColumnConstraint):\n """\n A column constraint that ensures all values in a pandas column are less than the provided\n upper bound [inclusive].\n\n Args:\n max_value (Union[int, float, datetime.datetime]): The upper bound.\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.\n """\n\n def __init__(self, max_value, ignore_missing_vals):\n self.max_value = check.inst_param(max_value, "max_value", (int, float, datetime))\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(MaxValueColumnConstraint, self).__init__(\n markdown_description="values < {}".format(self.max_value),\n error_description="Column must have values < {}".format(self.max_value),\n )\n\n def validate(self, dataframe, column_name):\n invalid = dataframe[column_name] > self.max_value\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n out_of_bounds_rows = dataframe[invalid]\n if not out_of_bounds_rows.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=out_of_bounds_rows,\n )\n\n\nclass InRangeColumnConstraint(ColumnConstraint):\n """\n A column constraint that ensures all values in a pandas column are between the lower and upper\n bound [inclusive].\n\n Args:\n min_value (Union[int, float, datetime.datetime]): The lower bound.\n max_value (Union[int, float, datetime.datetime]): The upper bound.\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non\n missing values.\n """\n\n def __init__(self, min_value, max_value, ignore_missing_vals):\n self.min_value = check.inst_param(min_value, "min_value", (int, float, datetime))\n self.max_value = check.inst_param(max_value, "max_value", (int, float, datetime))\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(InRangeColumnConstraint, self).__init__(\n markdown_description="{} < values < {}".format(self.min_value, self.max_value),\n error_description="Column must have values between {} and {} inclusive.".format(\n self.min_value, self.max_value\n ),\n )\n\n def validate(self, dataframe, column_name):\n invalid = ~dataframe[column_name].between(self.min_value, self.max_value)\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n out_of_bounds_rows = dataframe[invalid]\n if not 
out_of_bounds_rows.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=out_of_bounds_rows,\n )\n
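Rounding out the module, a minimal sketch of applying one of the column-level constraints directly to a named column (data invented for illustration):

.. code-block:: python

    from datetime import datetime
    from pandas import DataFrame
    from dagster_pandas.constraints import InRangeColumnConstraint

    df = DataFrame({"ts": [datetime(2021, 1, 1), datetime(2021, 6, 1)]})
    in_2021 = InRangeColumnConstraint(
        datetime(2021, 1, 1), datetime(2021, 12, 31), ignore_missing_vals=False
    )
    in_2021.validate(df, "ts")   # passes; out-of-range values would raise
                                 # ColumnConstraintViolationException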
", "current_page_name": "_modules/dagster_pandas/constraints", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_pandas.constraints"}, "data_frame": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_pandas.data_frame

\nimport pandas as pd\nfrom dagster_pandas.constraints import (\n    ColumnDTypeFnConstraint,\n    ColumnDTypeInSetConstraint,\n    ConstraintViolationException,\n)\nfrom dagster_pandas.validation import PandasColumn, validate_constraints\n\nfrom dagster import (\n    AssetMaterialization,\n    DagsterInvariantViolationError,\n    DagsterType,\n    Field,\n    MetadataEntry,\n    StringSource,\n    TypeCheck,\n)\nfrom dagster import _check as check\nfrom dagster import dagster_type_loader, dagster_type_materializer\nfrom dagster._check import CheckError\nfrom dagster.config.field_utils import Selector\nfrom dagster.core.definitions.metadata import normalize_metadata\nfrom dagster.core.errors import DagsterInvalidMetadata\nfrom dagster.utils import dict_without_keys\nfrom dagster.utils.backcompat import experimental\n\nCONSTRAINT_BLACKLIST = {ColumnDTypeFnConstraint, ColumnDTypeInSetConstraint}\n\n\n@dagster_type_materializer(\n    Selector(\n        {\n            "csv": {\n                "path": StringSource,\n                "sep": Field(StringSource, is_required=False, default_value=","),\n            },\n            "parquet": {"path": StringSource},\n            "table": {"path": StringSource},\n            "pickle": {"path": StringSource},\n        },\n    )\n)\ndef dataframe_materializer(_context, config, pandas_df):\n    check.inst_param(pandas_df, "pandas_df", pd.DataFrame)\n    file_type, file_options = list(config.items())[0]\n\n    if file_type == "csv":\n        path = file_options["path"]\n        pandas_df.to_csv(path, index=False, **dict_without_keys(file_options, "path"))\n    elif file_type == "parquet":\n        pandas_df.to_parquet(file_options["path"])\n    elif file_type == "table":\n        pandas_df.to_csv(file_options["path"], sep="\\t", index=False)\n    elif file_type == "pickle":\n        pandas_df.to_pickle(file_options["path"])\n    else:\n        check.failed("Unsupported file_type {file_type}".format(file_type=file_type))\n\n    return AssetMaterialization.file(file_options["path"])\n\n\n@dagster_type_loader(\n    Selector(\n        {\n            "csv": {\n                "path": StringSource,\n                "sep": Field(StringSource, is_required=False, default_value=","),\n            },\n            "parquet": {"path": StringSource},\n            "table": {"path": StringSource},\n            "pickle": {"path": StringSource},\n        },\n    )\n)\ndef dataframe_loader(_context, config):\n    file_type, file_options = list(config.items())[0]\n\n    if file_type == "csv":\n        path = file_options["path"]\n        return pd.read_csv(path, **dict_without_keys(file_options, "path"))\n    elif file_type == "parquet":\n        return pd.read_parquet(file_options["path"])\n    elif file_type == "table":\n        return pd.read_csv(file_options["path"], sep="\\t")\n    elif file_type == "pickle":\n        return pd.read_pickle(file_options["path"])\n    else:\n        raise DagsterInvariantViolationError(\n            "Unsupported file_type {file_type}".format(file_type=file_type)\n        )\n\n\ndef df_type_check(_, value):\n    if not isinstance(value, pd.DataFrame):\n        return TypeCheck(success=False)\n    return TypeCheck(\n        success=True,\n        metadata_entries=[\n            MetadataEntry("row_count", value=str(len(value))),\n            # string cast columns since they may be things like datetime\n            MetadataEntry("metadata", value={"columns": list(map(str, value.columns))}),\n        ],\n    )\n\n\nDataFrame = DagsterType(\n   
 name="PandasDataFrame",\n    description="""Two-dimensional size-mutable, potentially heterogeneous\n    tabular data structure with labeled axes (rows and columns).\n    See http://pandas.pydata.org/""",\n    loader=dataframe_loader,\n    materializer=dataframe_materializer,\n    type_check_fn=df_type_check,\n)\n\n\ndef _construct_constraint_list(constraints):\n    def add_bullet(constraint_list, constraint_description):\n        return constraint_list + "+ {constraint_description}\\n".format(\n            constraint_description=constraint_description\n        )\n\n    constraint_list = ""\n    for constraint in constraints:\n        if constraint.__class__ not in CONSTRAINT_BLACKLIST:\n            constraint_list = add_bullet(constraint_list, constraint.markdown_description)\n    return constraint_list\n\n\ndef _build_column_header(column_name, constraints):\n    header = "**{column_name}**".format(column_name=column_name)\n    for constraint in constraints:\n        if isinstance(constraint, ColumnDTypeInSetConstraint):\n            dtypes_tuple = tuple(constraint.expected_dtype_set)\n            return header + ": `{expected_dtypes}`".format(\n                expected_dtypes=dtypes_tuple if len(dtypes_tuple) > 1 else dtypes_tuple[0]\n            )\n        elif isinstance(constraint, ColumnDTypeFnConstraint):\n            return header + ": Validator `{expected_dtype_fn}`".format(\n                expected_dtype_fn=constraint.type_fn.__name__\n            )\n    return header\n\n\ndef create_dagster_pandas_dataframe_description(description, columns):\n    title = "\\n".join([description, "### Columns", ""])\n    buildme = title\n    for column in columns:\n        buildme += "{}\\n{}\\n".format(\n            _build_column_header(column.name, column.constraints),\n            _construct_constraint_list(column.constraints),\n        )\n    return buildme\n\n\n
[docs]def create_dagster_pandas_dataframe_type(\n name,\n description=None,\n columns=None,\n event_metadata_fn=None,\n dataframe_constraints=None,\n loader=None,\n materializer=None,\n):\n """\n Constructs a custom pandas dataframe dagster type.\n\n Args:\n name (str): Name of the dagster pandas type.\n description (Optional[str]): A markdown-formatted string, displayed in tooling.\n columns (Optional[List[PandasColumn]]): A list of :py:class:`~dagster.PandasColumn` objects\n which express dataframe column schemas and constraints.\n event_metadata_fn (Optional[Callable[[], Union[Dict[str, Union[str, float, int, Dict, MetadataValue]], List[MetadataEntry]]]]):\n A callable which takes your dataframe and returns a dict with string label keys and\n MetadataValue values. Can optionally return a List[MetadataEntry].\n dataframe_constraints (Optional[List[DataFrameConstraint]]): A list of objects that inherit from\n :py:class:`~dagster.DataFrameConstraint`. This allows you to express dataframe-level constraints.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`~dagster.DagsterTypeLoader`. If None, we will default\n to using `dataframe_loader`.\n materializer (Optional[DagsterTypeMaterializer]): An instance of a class\n that inherits from :py:class:`~dagster.DagsterTypeMaterializer`. If None, we will\n default to using `dataframe_materializer`.\n """\n # We allow for the plugging in of dagster_type_loaders/materializers so that\n # Users can load and materialize their custom dataframes via configuration their own way if the default\n # configs don't suffice. This is purely optional.\n check.str_param(name, "name")\n event_metadata_fn = check.opt_callable_param(event_metadata_fn, "event_metadata_fn")\n description = create_dagster_pandas_dataframe_description(\n check.opt_str_param(description, "description", default=""),\n check.opt_list_param(columns, "columns", of_type=PandasColumn),\n )\n\n def _dagster_type_check(_, value):\n if not isinstance(value, pd.DataFrame):\n return TypeCheck(\n success=False,\n description="Must be a pandas.DataFrame. Got value of type. {type_name}".format(\n type_name=type(value).__name__\n ),\n )\n\n try:\n validate_constraints(\n value, pandas_columns=columns, dataframe_constraints=dataframe_constraints\n )\n except ConstraintViolationException as e:\n return TypeCheck(success=False, description=str(e))\n\n return TypeCheck(\n success=True,\n metadata_entries=_execute_summary_stats(name, value, event_metadata_fn)\n if event_metadata_fn\n else None,\n )\n\n return DagsterType(\n name=name,\n type_check_fn=_dagster_type_check,\n loader=loader if loader else dataframe_loader,\n materializer=materializer if materializer else dataframe_materializer,\n description=description,\n )
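A minimal sketch of building a custom dataframe type from column schemas; the type name and columns below are invented for illustration, and the resulting type can then be attached to op inputs or outputs like any ``DagsterType``:

.. code-block:: python

    from dagster_pandas.constraints import ColumnDTypeInSetConstraint
    from dagster_pandas.data_frame import create_dagster_pandas_dataframe_type
    from dagster_pandas.validation import PandasColumn

    TripDataFrame = create_dagster_pandas_dataframe_type(
        name="TripDataFrame",
        columns=[
            PandasColumn("bike_id", constraints=[ColumnDTypeInSetConstraint({"int64"})]),
            PandasColumn("color"),   # presence check only
        ],
    )
    # e.g. use as Out(dagster_type=TripDataFrame) on an op that emits the dataframe.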
\n\n\n@experimental\ndef create_structured_dataframe_type(\n name,\n description=None,\n columns_validator=None,\n columns_aggregate_validator=None,\n dataframe_validator=None,\n loader=None,\n materializer=None,\n):\n """\n\n Args:\n name (str): the name of the new type\n description (Optional[str]): the description of the new type\n columns_validator (Optional[Union[ColumnConstraintWithMetadata, MultiColumnConstraintWithMetadata]]):\n what column-level row by row validation you want to have applied.\n Leave empty for no column-level row by row validation.\n columns_aggregate_validator (Optional[Union[ColumnAggregateConstraintWithMetadata,\n MultiAggregateConstraintWithMetadata]]):\n what column-level aggregate validation you want to have applied,\n Leave empty for no column-level aggregate validation.\n dataframe_validator (Optional[Union[ConstraintWithMetadata, MultiConstraintWithMetadata]]):\n what dataframe-wide validation you want to have applied.\n Leave empty for no dataframe-wide validation.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`~dagster.DagsterTypeLoader`. If None, we will default\n to using `dataframe_loader`.\n materializer (Optional[DagsterTypeMaterializer]): An instance of a class\n that inherits from :py:class:`~dagster.DagsterTypeMaterializer`. If None, we will\n default to using `dataframe_materializer`.\n\n Returns:\n a DagsterType with the corresponding name and packaged validation.\n\n """\n\n def _dagster_type_check(_, value):\n if not isinstance(value, pd.DataFrame):\n return TypeCheck(\n success=False,\n description="Must be a pandas.DataFrame. Got value of type. {type_name}".format(\n type_name=type(value).__name__\n ),\n )\n individual_result_dict = {}\n\n if dataframe_validator is not None:\n individual_result_dict["dataframe"] = dataframe_validator.validate(value)\n if columns_validator is not None:\n individual_result_dict["columns"] = columns_validator.validate(value)\n\n if columns_aggregate_validator is not None:\n individual_result_dict["column-aggregates"] = columns_aggregate_validator.validate(\n value\n )\n\n typechecks_succeeded = True\n metadata = []\n overall_description = "Failed Constraints: {}"\n constraint_clauses = []\n for key, result in individual_result_dict.items():\n result_val = result.success\n if result_val:\n continue\n typechecks_succeeded = typechecks_succeeded and result_val\n result_dict = result.metadata_entries[0].entry_data.data\n metadata.append(\n MetadataEntry(\n "{}-constraint-metadata".format(key),\n value=result_dict,\n )\n )\n constraint_clauses.append("{} failing constraints, {}".format(key, result.description))\n # returns aggregates, then column, then dataframe\n return TypeCheck(\n success=typechecks_succeeded,\n description=overall_description.format(constraint_clauses),\n metadata_entries=sorted(metadata, key=lambda x: x.label),\n )\n\n description = check.opt_str_param(description, "description", default="")\n return DagsterType(\n name=name,\n type_check_fn=_dagster_type_check,\n loader=loader if loader else dataframe_loader,\n materializer=materializer if loader else dataframe_materializer,\n description=description,\n )\n\n\ndef _execute_summary_stats(type_name, value, event_metadata_fn):\n if not event_metadata_fn:\n return []\n\n metadata_or_metadata_entries = event_metadata_fn(value)\n\n invalid_message = (\n "The return value of the user-defined summary_statistics function for pandas "\n f"data frame type {type_name} returned {value}. 
This function must return "\n "Union[Dict[str, Union[str, float, int, Dict, MetadataValue]], List[MetadataEntry]]"\n )\n\n metadata = None\n metadata_entries = None\n\n if isinstance(metadata_or_metadata_entries, list):\n metadata_entries = metadata_or_metadata_entries\n elif isinstance(metadata_or_metadata_entries, dict):\n metadata = metadata_or_metadata_entries\n else:\n raise DagsterInvariantViolationError(invalid_message)\n\n try:\n return normalize_metadata(metadata, metadata_entries)\n except (DagsterInvalidMetadata, CheckError):\n raise DagsterInvariantViolationError(invalid_message)\n
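A hedged sketch of an ``event_metadata_fn`` that satisfies the contract enforced above: it must return either a dict of string labels to metadata values or a list of ``MetadataEntry`` objects. The function and label names below are illustrative.

.. code-block:: python

    from dagster_pandas import create_dagster_pandas_dataframe_type

    def summary_stats(dataframe):
        # Plain ints keyed by string labels are accepted by normalize_metadata,
        # per the invariant checked above.
        return {
            "row_count": len(dataframe),
            "null_count": int(dataframe.isna().sum().sum()),
        }

    # Illustrative type name; only event_metadata_fn matters for this sketch.
    SummarizedDataFrame = create_dagster_pandas_dataframe_type(
        name="SummarizedDataFrame",
        event_metadata_fn=summary_stats,
    )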
", "current_page_name": "_modules/dagster_pandas/data_frame", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_pandas.data_frame"}, "validation": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_pandas.validation

\nfrom dagster_pandas.constraints import (\n    CategoricalColumnConstraint,\n    ColumnDTypeFnConstraint,\n    ColumnDTypeInSetConstraint,\n    Constraint,\n    ConstraintViolationException,\n    DataFrameConstraint,\n    InRangeColumnConstraint,\n    NonNullableColumnConstraint,\n    UniqueColumnConstraint,\n)\nfrom pandas import DataFrame, Timestamp\nfrom pandas.core.dtypes.common import (\n    is_bool_dtype,\n    is_float_dtype,\n    is_integer_dtype,\n    is_numeric_dtype,\n    is_string_dtype,\n)\n\nfrom dagster import DagsterInvariantViolationError\nfrom dagster import _check as check\n\nPANDAS_NUMERIC_TYPES = {"int64", "float"}\n\n\ndef _construct_keyword_constraints(non_nullable, unique, ignore_missing_vals):\n    non_nullable = check.bool_param(non_nullable, "exists")\n    unique = check.bool_param(unique, "unique")\n    ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n    if non_nullable and ignore_missing_vals:\n        raise DagsterInvariantViolationError(\n            "PandasColumn cannot have a non-null constraint while also ignore missing values"\n        )\n    constraints = []\n    if non_nullable:\n        constraints.append(NonNullableColumnConstraint())\n    if unique:\n        constraints.append(UniqueColumnConstraint(ignore_missing_vals=ignore_missing_vals))\n    return constraints\n\n\n
[docs]class PandasColumn:\n """\n The main API for expressing column level schemas and constraints for your custom dataframe\n types.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If th column exists, the validate function will validate the column. Defaults to True.\n constraints (Optional[List[Constraint]]): List of constraint objects that indicate the\n validation rules for the pandas column.\n """\n\n def __init__(self, name, constraints=None, is_required=None):\n self.name = check.str_param(name, "name")\n self.is_required = check.opt_bool_param(is_required, "is_required", default=True)\n self.constraints = check.opt_list_param(constraints, "constraints", of_type=Constraint)\n\n def validate(self, dataframe):\n if self.name not in dataframe.columns:\n # Ignore validation if column is missing from dataframe and is not required\n if self.is_required:\n raise ConstraintViolationException(\n "Required column {column_name} not in dataframe with columns {dataframe_columns}".format(\n column_name=self.name, dataframe_columns=dataframe.columns\n )\n )\n else:\n for constraint in self.constraints:\n constraint.validate(dataframe, self.name)\n\n
[docs] @staticmethod\n def exists(name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None):\n """\n Simple constructor for PandasColumns that expresses existence constraints.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=_construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )
\n\n
[docs] @staticmethod\n def boolean_column(\n name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None\n ):\n """\n Simple constructor for PandasColumns that expresses boolean constraints on boolean dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[ColumnDTypeFnConstraint(is_bool_dtype)]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )
\n\n
[docs] @staticmethod\n def numeric_column(\n name,\n min_value=-float("inf"),\n max_value=float("inf"),\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """\n Simple constructor for PandasColumns that expresses numeric constraints numeric dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n min_value (Optional[Union[int,float]]): The lower bound for values you expect in this column. Defaults to -float('inf')\n max_value (Optional[Union[int,float]]): The upper bound for values you expect in this column. Defaults to float('inf')\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeFnConstraint(is_numeric_dtype),\n InRangeColumnConstraint(\n check.numeric_param(min_value, "min_value"),\n check.numeric_param(max_value, "max_value"),\n ignore_missing_vals=ignore_missing_vals,\n ),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )
\n\n
[docs] @staticmethod\n def integer_column(\n name,\n min_value=-float("inf"),\n max_value=float("inf"),\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """\n Simple constructor for PandasColumns that expresses numeric constraints on integer dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n min_value (Optional[Union[int,float]]): The lower bound for values you expect in this column. Defaults to -float('inf')\n max_value (Optional[Union[int,float]]): The upper bound for values you expect in this column. Defaults to float('inf')\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeFnConstraint(is_integer_dtype),\n InRangeColumnConstraint(\n check.numeric_param(min_value, "min_value"),\n check.numeric_param(max_value, "max_value"),\n ignore_missing_vals=ignore_missing_vals,\n ),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )
\n\n
[docs] @staticmethod\n def float_column(\n name,\n min_value=-float("inf"),\n max_value=float("inf"),\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """\n Simple constructor for PandasColumns that expresses numeric constraints on float dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n min_value (Optional[Union[int,float]]): The lower bound for values you expect in this column. Defaults to -float('inf')\n max_value (Optional[Union[int,float]]): The upper bound for values you expect in this column. Defaults to float('inf')\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeFnConstraint(is_float_dtype),\n InRangeColumnConstraint(\n check.numeric_param(min_value, "min_value"),\n check.numeric_param(max_value, "max_value"),\n ignore_missing_vals=ignore_missing_vals,\n ),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )
\n\n
[docs] @staticmethod\n def datetime_column(\n name,\n min_datetime=Timestamp.min,\n max_datetime=Timestamp.max,\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n tz=None,\n ):\n """\n Simple constructor for PandasColumns that expresses datetime constraints on 'datetime64[ns]' dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n min_datetime (Optional[Union[int,float]]): The lower bound for values you expect in this column.\n Defaults to pandas.Timestamp.min.\n max_datetime (Optional[Union[int,float]]): The upper bound for values you expect in this column.\n Defaults to pandas.Timestamp.max.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. Default to True.\n tz (Optional[str]): Required timezone for values eg: tz='UTC', tz='Europe/Dublin', tz='US/Eastern'.\n Defaults to None, meaning naive datetime values.\n """\n if tz is None:\n datetime_constraint = ColumnDTypeInSetConstraint({"datetime64[ns]"})\n else:\n datetime_constraint = ColumnDTypeInSetConstraint({f"datetime64[ns, {tz}]"})\n # One day more/less than absolute min/max to prevent OutOfBoundsDatetime errors when converting min/max to be tz aware\n if min_datetime.tz_localize(None) == Timestamp.min:\n min_datetime = Timestamp("1677-09-22 00:12:43.145225Z")\n if max_datetime.tz_localize(None) == Timestamp.max:\n max_datetime = Timestamp("2262-04-10 23:47:16.854775807Z")\n # Convert bounds to same tz\n if Timestamp(min_datetime).tz is None:\n min_datetime = Timestamp(min_datetime).tz_localize(tz)\n if Timestamp(max_datetime).tz is None:\n max_datetime = Timestamp(max_datetime).tz_localize(tz)\n\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n datetime_constraint,\n InRangeColumnConstraint(\n min_datetime, max_datetime, ignore_missing_vals=ignore_missing_vals\n ),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )
\n\n
[docs] @staticmethod\n def string_column(\n name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None\n ):\n """\n Simple constructor for PandasColumns that expresses constraints on string dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[ColumnDTypeFnConstraint(is_string_dtype)]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )
\n\n
[docs] @staticmethod\n def categorical_column(\n name,\n categories,\n of_types=frozenset({"category", "object"}),\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """\n Simple constructor for PandasColumns that expresses categorical constraints on specified dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n categories (List[Any]): The valid set of buckets that all values in the column must match.\n of_types (Optional[Union[str, Set[str]]]): The expected dtype[s] that your categories and values must\n abide by.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in\n the column ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the\n constraint will only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. Default to True.\n """\n of_types = {of_types} if isinstance(of_types, str) else of_types\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeInSetConstraint(of_types),\n CategoricalColumnConstraint(categories, ignore_missing_vals=ignore_missing_vals),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )
\n\n\ndef validate_constraints(dataframe, pandas_columns=None, dataframe_constraints=None):\n dataframe = check.inst_param(dataframe, "dataframe", DataFrame)\n pandas_columns = check.opt_list_param(\n pandas_columns, "column_constraints", of_type=PandasColumn\n )\n dataframe_constraints = check.opt_list_param(\n dataframe_constraints, "dataframe_constraints", of_type=DataFrameConstraint\n )\n\n if pandas_columns:\n for column in pandas_columns:\n column.validate(dataframe)\n\n if dataframe_constraints:\n for dataframe_constraint in dataframe_constraints:\n dataframe_constraint.validate(dataframe)\n
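For reference, a minimal sketch exercising several of the ``PandasColumn`` constructors above together with ``validate_constraints``; the column names and data are illustrative.

.. code-block:: python

    import pandas as pd

    from dagster_pandas.validation import PandasColumn, validate_constraints

    # Illustrative schema mixing dtype, range, uniqueness, and tz-aware datetime checks.
    columns = [
        PandasColumn.string_column("name", non_nullable=True),
        PandasColumn.integer_column("num_rides", min_value=0),
        PandasColumn.datetime_column("created_at", tz="UTC"),
        PandasColumn.exists("id", unique=True),
    ]

    df = pd.DataFrame(
        {
            "name": ["a", "b"],
            "num_rides": [3, 5],
            "created_at": pd.to_datetime(["2022-01-01", "2022-01-02"], utc=True),
            "id": [1, 2],
        }
    )

    # Raises ConstraintViolationException on the first failing constraint;
    # returns None when every column and dataframe constraint passes.
    validate_constraints(df, pandas_columns=columns)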
", "current_page_name": "_modules/dagster_pandas/validation", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_pandas.validation"}}, "dagster_postgres": {"event_log": {"event_log": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_postgres.event_log.event_log

\nfrom typing import Optional\n\nimport sqlalchemy as db\n\nimport dagster._check as check\nfrom dagster.core.events.log import EventLogEntry\nfrom dagster.core.storage.event_log import (\n    AssetKeyTable,\n    SqlEventLogStorage,\n    SqlEventLogStorageMetadata,\n    SqlEventLogStorageTable,\n)\nfrom dagster.core.storage.event_log.base import EventLogCursor\nfrom dagster.core.storage.event_log.migration import ASSET_KEY_INDEX_COLS\nfrom dagster.core.storage.sql import (\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData, deserialize_as\n\nfrom ..utils import (\n    create_pg_connection,\n    pg_alembic_config,\n    pg_config,\n    pg_statement_timeout,\n    pg_url_from_config,\n    retry_pg_connection_fn,\n    retry_pg_creation_fn,\n)\nfrom .event_watcher import PostgresEventWatcher\n\nCHANNEL_NAME = "run_events"\n\n\n
[docs]class PostgresEventLogStorage(SqlEventLogStorage, ConfigurableClass):\n """Postgres-backed event log storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagit`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n To use Postgres for event log storage, you can add a block such as the following to your\n ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg.yaml\n :caption: dagster.yaml\n :lines: 12-21\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n\n """\n\n def __init__(self, postgres_url, should_autocreate_tables=True, inst_data=None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.postgres_url = check.str_param(postgres_url, "postgres_url")\n self.should_autocreate_tables = check.bool_param(\n should_autocreate_tables, "should_autocreate_tables"\n )\n\n self._disposed = False\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.postgres_url, isolation_level="AUTOCOMMIT", poolclass=db.pool.NullPool\n )\n\n # lazy init\n self._event_watcher: Optional[PostgresEventWatcher] = None\n\n self._secondary_index_cache = {}\n\n table_names = retry_pg_connection_fn(lambda: db.inspect(self._engine).get_table_names())\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared with other storage classes)\n if self.should_autocreate_tables and "event_logs" not in table_names:\n retry_pg_creation_fn(self._init_db)\n self.reindex_events()\n self.reindex_assets()\n\n super().__init__()\n\n def _init_db(self):\n with self._connect() as conn:\n with conn.begin():\n SqlEventLogStorageMetadata.create_all(conn)\n stamp_alembic_rev(pg_alembic_config(__file__), conn)\n\n def optimize_for_dagit(self, statement_timeout):\n # When running in dagit, hold an open connection and set statement_timeout\n self._engine = create_engine(\n self.postgres_url,\n isolation_level="AUTOCOMMIT",\n pool_size=1,\n connect_args={"options": pg_statement_timeout(statement_timeout)},\n )\n\n def upgrade(self):\n alembic_config = pg_alembic_config(__file__)\n with self._connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return pg_config()\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return PostgresEventLogStorage(\n inst_data=inst_data,\n postgres_url=pg_url_from_config(config_value),\n should_autocreate_tables=config_value.get("should_autocreate_tables", True),\n )\n\n @staticmethod\n def create_clean_storage(conn_string, should_autocreate_tables=True):\n engine = create_engine(\n conn_string, isolation_level="AUTOCOMMIT", poolclass=db.pool.NullPool\n )\n try:\n SqlEventLogStorageMetadata.drop_all(engine)\n finally:\n engine.dispose()\n\n return PostgresEventLogStorage(conn_string, should_autocreate_tables)\n\n def store_event(self, event):\n """Store an event corresponding to a pipeline run.\n Args:\n event (EventLogEntry): The event to store.\n """\n check.inst_param(event, "event", 
EventLogEntry)\n insert_event_statement = self.prepare_insert_event(event) # from SqlEventLogStorage.py\n with self._connect() as conn:\n result = conn.execute(\n insert_event_statement.returning(\n SqlEventLogStorageTable.c.run_id, SqlEventLogStorageTable.c.id\n )\n )\n res = result.fetchone()\n result.close()\n conn.execute(\n """NOTIFY {channel}, %s; """.format(channel=CHANNEL_NAME),\n (res[0] + "_" + str(res[1]),),\n )\n\n if (\n event.is_dagster_event\n and (\n event.dagster_event.is_step_materialization\n or event.dagster_event.is_asset_observation\n or event.dagster_event.is_asset_materialization_planned\n )\n and event.dagster_event.asset_key\n ):\n self.store_asset_event(event)\n\n def store_asset_event(self, event):\n check.inst_param(event, "event", EventLogEntry)\n if not event.is_dagster_event or not event.dagster_event.asset_key:\n return\n\n # We switched to storing the entire event record of the last materialization instead of just\n # the AssetMaterialization object, so that we have access to metadata like timestamp,\n # pipeline, run_id, etc.\n #\n # This should make certain asset queries way more performant, without having to do extra\n # queries against the event log.\n #\n # This should be accompanied by a schema change in 0.12.0, renaming `last_materialization`\n # to `last_materialization_event`, for clarity. For now, we should do some back-compat.\n #\n # https://github.com/dagster-io/dagster/issues/3945\n\n # The AssetKeyTable contains a `last_materialization_timestamp` column that is exclusively\n # used to determine if an asset exists (last materialization timestamp > wipe timestamp).\n # This column is used nowhere else, and as of AssetObservation/AssetMaterializationPlanned\n # event creation, we want to extend this functionality to ensure that assets with any event\n # (observation, materialization, or materialization planned) yielded with timestamp\n # > wipe timestamp display in Dagit.\n\n # As of the following PRs, we update last_materialization_timestamp to store the timestamp\n # of the latest asset observation, materialization, or materialization_planned that has occurred.\n # https://github.com/dagster-io/dagster/pull/6885\n # https://github.com/dagster-io/dagster/pull/7319\n\n # The AssetKeyTable also contains a `last_run_id` column that is updated upon asset\n # materialization. This column was not being used until the below PR. 
This new change\n # writes to the column upon `ASSET_MATERIALIZATION_PLANNED` events to fetch the last\n # run id for a set of assets in one roundtrip call to event log storage.\n # https://github.com/dagster-io/dagster/pull/7319\n\n values = self._get_asset_entry_values(event, self.has_secondary_index(ASSET_KEY_INDEX_COLS))\n with self.index_connection() as conn:\n query = db.dialects.postgresql.insert(AssetKeyTable).values(\n asset_key=event.dagster_event.asset_key.to_string(),\n **values,\n )\n if values:\n query = query.on_conflict_do_update(\n index_elements=[AssetKeyTable.c.asset_key],\n set_=dict(**values),\n )\n else:\n query = query.on_conflict_do_nothing()\n conn.execute(query)\n\n def _connect(self):\n return create_pg_connection(self._engine, pg_alembic_config(__file__), "event log")\n\n def run_connection(self, run_id=None):\n return self._connect()\n\n def index_connection(self):\n return self._connect()\n\n def has_secondary_index(self, name):\n if name not in self._secondary_index_cache:\n self._secondary_index_cache[name] = super(\n PostgresEventLogStorage, self\n ).has_secondary_index(name)\n return self._secondary_index_cache[name]\n\n def enable_secondary_index(self, name):\n super(PostgresEventLogStorage, self).enable_secondary_index(name)\n if name in self._secondary_index_cache:\n del self._secondary_index_cache[name]\n\n def watch(self, run_id, cursor, callback):\n if cursor and EventLogCursor.parse(cursor).is_offset_cursor():\n check.failed("Cannot call `watch` with an offset cursor")\n\n if self._event_watcher is None:\n self._event_watcher = PostgresEventWatcher(\n self.postgres_url,\n [CHANNEL_NAME],\n self._gen_event_log_entry_from_cursor,\n )\n\n self._event_watcher.watch_run(run_id, cursor, callback)\n\n def _gen_event_log_entry_from_cursor(self, cursor) -> EventLogEntry:\n with self._engine.connect() as conn:\n cursor_res = conn.execute(\n db.select([SqlEventLogStorageTable.c.event]).where(\n SqlEventLogStorageTable.c.id == cursor\n ),\n )\n return deserialize_as(cursor_res.scalar(), EventLogEntry)\n\n def end_watch(self, run_id, handler):\n if self._event_watcher is None:\n return\n\n self._event_watcher.unwatch_run(run_id, handler)\n\n def __del__(self):\n # Keep the inherent limitations of __del__ in Python in mind!\n self.dispose()\n\n def dispose(self):\n if not self._disposed:\n self._disposed = True\n if self._event_watcher:\n self._event_watcher.close()\n\n def alembic_version(self):\n alembic_config = pg_alembic_config(__file__)\n with self._connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
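For completeness, a hedged sketch of the ``create_clean_storage`` helper defined above, which is mainly useful in tests because it drops and recreates the event log tables before returning fresh storage; the connection string is a placeholder.

.. code-block:: python

    from dagster_postgres import PostgresEventLogStorage

    # Illustrative URL; existing event-log tables at this database are dropped.
    storage = PostgresEventLogStorage.create_clean_storage(
        "postgresql://test:test@localhost:5432/test"
    )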
", "current_page_name": "_modules/dagster_postgres/event_log/event_log", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_postgres.event_log.event_log"}}, "run_storage": {"run_storage": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_postgres.run_storage.run_storage

\nimport sqlalchemy as db\n\nimport dagster._check as check\nfrom dagster.core.storage.runs import (\n    DaemonHeartbeatsTable,\n    InstanceInfo,\n    RunStorageSqlMetadata,\n    SqlRunStorage,\n)\nfrom dagster.core.storage.sql import (\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData, serialize_dagster_namedtuple\nfrom dagster.utils import utc_datetime_from_timestamp\n\nfrom ..utils import (\n    create_pg_connection,\n    pg_alembic_config,\n    pg_config,\n    pg_statement_timeout,\n    pg_url_from_config,\n    retry_pg_connection_fn,\n    retry_pg_creation_fn,\n)\n\n\n
[docs]class PostgresRunStorage(SqlRunStorage, ConfigurableClass):\n """Postgres-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagit`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n To use Postgres for run storage, you can add a block such as the following to your\n ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg.yaml\n :caption: dagster.yaml\n :lines: 1-10\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n """\n\n def __init__(self, postgres_url, should_autocreate_tables=True, inst_data=None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.postgres_url = postgres_url\n self.should_autocreate_tables = check.bool_param(\n should_autocreate_tables, "should_autocreate_tables"\n )\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.postgres_url,\n isolation_level="AUTOCOMMIT",\n poolclass=db.pool.NullPool,\n )\n\n self._index_migration_cache = {}\n table_names = retry_pg_connection_fn(lambda: db.inspect(self._engine).get_table_names())\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared with other storage classes)\n if self.should_autocreate_tables and "runs" not in table_names:\n retry_pg_creation_fn(self._init_db)\n self.migrate()\n self.optimize()\n\n elif "instance_info" not in table_names:\n InstanceInfo.create(self._engine)\n\n super().__init__()\n\n def _init_db(self):\n with self.connect() as conn:\n with conn.begin():\n RunStorageSqlMetadata.create_all(conn)\n # This revision may be shared by any other dagster storage classes using the same DB\n stamp_alembic_rev(pg_alembic_config(__file__), conn)\n\n def optimize_for_dagit(self, statement_timeout):\n # When running in dagit, hold 1 open connection and set statement_timeout\n self._engine = create_engine(\n self.postgres_url,\n isolation_level="AUTOCOMMIT",\n pool_size=1,\n connect_args={"options": pg_statement_timeout(statement_timeout)},\n )\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return pg_config()\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return PostgresRunStorage(\n inst_data=inst_data,\n postgres_url=pg_url_from_config(config_value),\n should_autocreate_tables=config_value.get("should_autocreate_tables", True),\n )\n\n @staticmethod\n def create_clean_storage(postgres_url, should_autocreate_tables=True):\n engine = create_engine(\n postgres_url, isolation_level="AUTOCOMMIT", poolclass=db.pool.NullPool\n )\n try:\n RunStorageSqlMetadata.drop_all(engine)\n finally:\n engine.dispose()\n return PostgresRunStorage(postgres_url, should_autocreate_tables)\n\n def connect(self):\n return create_pg_connection(\n self._engine,\n pg_alembic_config(__file__),\n "run",\n )\n\n def upgrade(self):\n with self.connect() as conn:\n run_alembic_upgrade(pg_alembic_config(__file__), conn)\n\n def has_built_index(self, migration_name):\n if migration_name not in self._index_migration_cache:\n 
self._index_migration_cache[migration_name] = super(\n PostgresRunStorage, self\n ).has_built_index(migration_name)\n return self._index_migration_cache[migration_name]\n\n def mark_index_built(self, migration_name):\n super(PostgresRunStorage, self).mark_index_built(migration_name)\n if migration_name in self._index_migration_cache:\n del self._index_migration_cache[migration_name]\n\n def add_daemon_heartbeat(self, daemon_heartbeat):\n with self.connect() as conn:\n\n # insert or update if already present, using postgres specific on_conflict\n conn.execute(\n db.dialects.postgresql.insert(DaemonHeartbeatsTable)\n .values( # pylint: disable=no-value-for-parameter\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_type=daemon_heartbeat.daemon_type,\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_dagster_namedtuple(daemon_heartbeat),\n )\n .on_conflict_do_update(\n index_elements=[DaemonHeartbeatsTable.c.daemon_type],\n set_={\n "timestamp": utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n "daemon_id": daemon_heartbeat.daemon_id,\n "body": serialize_dagster_namedtuple(daemon_heartbeat),\n },\n )\n )\n\n def alembic_version(self):\n alembic_config = pg_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster_postgres/run_storage/run_storage", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_postgres.run_storage.run_storage"}}, "schedule_storage": {"schedule_storage": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_postgres.schedule_storage.schedule_storage

\nimport pendulum\nimport sqlalchemy as db\n\nimport dagster._check as check\nfrom dagster.core.storage.schedules import ScheduleStorageSqlMetadata, SqlScheduleStorage\nfrom dagster.core.storage.schedules.schema import InstigatorsTable\nfrom dagster.core.storage.sql import (\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster.serdes import ConfigurableClass, ConfigurableClassData, serialize_dagster_namedtuple\n\nfrom ..utils import (\n    create_pg_connection,\n    pg_alembic_config,\n    pg_config,\n    pg_statement_timeout,\n    pg_url_from_config,\n    retry_pg_connection_fn,\n    retry_pg_creation_fn,\n)\n\n\n
[docs]class PostgresScheduleStorage(SqlScheduleStorage, ConfigurableClass):\n """Postgres-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagit`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n To use Postgres for schedule storage, you can add a block such as the following to your\n ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg.yaml\n :caption: dagster.yaml\n :lines: 23-32\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n """\n\n def __init__(self, postgres_url, should_autocreate_tables=True, inst_data=None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.postgres_url = postgres_url\n self.should_autocreate_tables = check.bool_param(\n should_autocreate_tables, "should_autocreate_tables"\n )\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.postgres_url, isolation_level="AUTOCOMMIT", poolclass=db.pool.NullPool\n )\n\n table_names = retry_pg_connection_fn(lambda: db.inspect(self._engine).get_table_names())\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared with other storage classes)\n missing_main_table = "schedules" not in table_names and "jobs" not in table_names\n if self.should_autocreate_tables and missing_main_table:\n retry_pg_creation_fn(self._init_db)\n\n super().__init__()\n\n def _init_db(self):\n with self.connect() as conn:\n with conn.begin():\n ScheduleStorageSqlMetadata.create_all(conn)\n stamp_alembic_rev(pg_alembic_config(__file__), conn)\n\n # mark all the data migrations as applied\n self.migrate()\n self.optimize()\n\n def optimize_for_dagit(self, statement_timeout):\n # When running in dagit, hold an open connection and set statement_timeout\n self._engine = create_engine(\n self.postgres_url,\n isolation_level="AUTOCOMMIT",\n pool_size=1,\n connect_args={"options": pg_statement_timeout(statement_timeout)},\n )\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return pg_config()\n\n @staticmethod\n def from_config_value(inst_data, config_value):\n return PostgresScheduleStorage(\n inst_data=inst_data,\n postgres_url=pg_url_from_config(config_value),\n should_autocreate_tables=config_value.get("should_autocreate_tables", True),\n )\n\n @staticmethod\n def create_clean_storage(postgres_url, should_autocreate_tables=True):\n engine = create_engine(\n postgres_url, isolation_level="AUTOCOMMIT", poolclass=db.pool.NullPool\n )\n try:\n ScheduleStorageSqlMetadata.drop_all(engine)\n finally:\n engine.dispose()\n return PostgresScheduleStorage(postgres_url, should_autocreate_tables)\n\n def connect(self, run_id=None): # pylint: disable=arguments-differ, unused-argument\n return create_pg_connection(self._engine, pg_alembic_config(__file__), "schedule")\n\n def upgrade(self):\n alembic_config = pg_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n def _add_or_update_instigators_table(self, conn, state):\n selector_id = state.selector_id\n 
conn.execute(\n db.dialects.postgresql.insert(InstigatorsTable)\n .values( # pylint: disable=no-value-for-parameter\n selector_id=selector_id,\n repository_selector_id=state.repository_selector_id,\n status=state.status.value,\n instigator_type=state.instigator_type.value,\n instigator_body=serialize_dagster_namedtuple(state),\n )\n .on_conflict_do_update(\n index_elements=[InstigatorsTable.c.selector_id],\n set_={\n "status": state.status.value,\n "instigator_type": state.instigator_type.value,\n "instigator_body": serialize_dagster_namedtuple(state),\n "update_timestamp": pendulum.now("UTC"),\n },\n )\n )\n\n def alembic_version(self):\n alembic_config = pg_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
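Since the three Postgres storage classes in this package are configured the same way, here is a hedged ``dagster.yaml`` sketch tying them together; the exact accepted fields come from ``pg_config()``, and the credentials below are placeholders.

.. code-block:: yaml

    # Illustrative dagster.yaml; values are placeholders.
    run_storage:
      module: dagster_postgres.run_storage
      class: PostgresRunStorage
      config:
        postgres_db:
          username: my_user
          password: my_password
          hostname: localhost
          db_name: dagster
          port: 5432

    event_log_storage:
      module: dagster_postgres.event_log
      class: PostgresEventLogStorage
      config:
        postgres_db:
          username: my_user
          password: my_password
          hostname: localhost
          db_name: dagster
          port: 5432

    schedule_storage:
      module: dagster_postgres.schedule_storage
      class: PostgresScheduleStorage
      config:
        postgres_db:
          username: my_user
          password: my_password
          hostname: localhost
          db_name: dagster
          port: 5432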
", "current_page_name": "_modules/dagster_postgres/schedule_storage/schedule_storage", "customsidebar": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_postgres.schedule_storage.schedule_storage"}}}, "dagster_prometheus": {"resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_prometheus.resources

\nimport prometheus_client\nfrom prometheus_client.exposition import default_handler\n\nfrom dagster import Field\nfrom dagster import _check as check\nfrom dagster import resource\n\n\n
[docs]class PrometheusResource:\n """Integrates with Prometheus via the prometheus_client library."""\n\n def __init__(self, gateway, timeout):\n self.gateway = check.str_param(gateway, "gateway")\n self.timeout = check.opt_int_param(timeout, "timeout")\n self.registry = prometheus_client.CollectorRegistry()\n\n def push_to_gateway(self, job, grouping_key=None, handler=default_handler):\n """Push metrics to the given pushgateway.\n `job` is the job label to be attached to all pushed metrics\n `grouping_key` please see the pushgateway documentation for details.\n Defaults to None\n `handler` is an optional function which can be provided to perform\n requests to the 'gateway'.\n Defaults to None, in which case an http or https request\n will be carried out by a default handler.\n If not None, the argument must be a function which accepts\n the following arguments:\n url, method, timeout, headers, and content\n May be used to implement additional functionality not\n supported by the built-in default handler (such as SSL\n client certicates, and HTTP authentication mechanisms).\n 'url' is the URL for the request, the 'gateway' argument\n described earlier will form the basis of this URL.\n 'method' is the HTTP method which should be used when\n carrying out the request.\n 'timeout' requests not successfully completed after this\n many seconds should be aborted. If timeout is None, then\n the handler should not set a timeout.\n 'headers' is a list of ("header-name","header-value") tuples\n which must be passed to the pushgateway in the form of HTTP\n request headers.\n The function should raise an exception (e.g. IOError) on\n failure.\n 'content' is the data which should be used to form the HTTP\n Message Body.\n This overwrites all metrics with the same job and grouping_key.\n This uses the PUT HTTP method."""\n prometheus_client.push_to_gateway(\n gateway=self.gateway,\n job=job,\n registry=self.registry,\n grouping_key=grouping_key,\n timeout=self.timeout,\n handler=handler,\n )\n\n def pushadd_to_gateway(self, job, grouping_key=None, handler=default_handler):\n """PushAdd metrics to the given pushgateway.\n `job` is the job label to be attached to all pushed metrics\n `registry` is an instance of CollectorRegistry\n `grouping_key` please see the pushgateway documentation for details.\n Defaults to None\n `handler` is an optional function which can be provided to perform\n requests to the 'gateway'.\n Defaults to None, in which case an http or https request\n will be carried out by a default handler.\n See the 'prometheus_client.push_to_gateway' documentation\n for implementation requirements.\n This replaces metrics with the same name, job and grouping_key.\n This uses the POST HTTP method."""\n prometheus_client.pushadd_to_gateway(\n gateway=self.gateway,\n job=job,\n registry=self.registry,\n grouping_key=grouping_key,\n timeout=self.timeout,\n handler=handler,\n )\n\n def delete_from_gateway(self, job, grouping_key=None, handler=default_handler):\n """Delete metrics from the given pushgateway.\n `job` is the job label to be attached to all pushed metrics\n `grouping_key` please see the pushgateway documentation for details.\n Defaults to None\n `handler` is an optional function which can be provided to perform\n requests to the 'gateway'.\n Defaults to None, in which case an http or https request\n will be carried out by a default handler.\n See the 'prometheus_client.push_to_gateway' documentation\n for implementation requirements.\n This deletes metrics with the given job and 
grouping_key.\n This uses the DELETE HTTP method."""\n prometheus_client.delete_from_gateway(\n gateway=self.gateway,\n job=job,\n grouping_key=grouping_key,\n timeout=self.timeout,\n handler=handler,\n )
\n\n\n
[docs]@resource(\n {\n "gateway": Field(\n str,\n description="the url for your push gateway. Either of the form "\n "'http://pushgateway.local', or 'pushgateway.local'. "\n "Scheme defaults to 'http' if none is provided",\n ),\n "timeout": Field(\n int,\n default_value=30,\n is_required=False,\n description="is how long delete will attempt to connect before giving up. "\n "Defaults to 30s.",\n ),\n },\n description="""This resource is for sending metrics to a Prometheus Pushgateway.""",\n)\ndef prometheus_resource(context):\n return PrometheusResource(\n gateway=context.resource_config["gateway"], timeout=context.resource_config["timeout"]\n )
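A hedged sketch of wiring ``prometheus_resource`` into a job and pushing a metric from an op; the gateway URL, metric name, and Prometheus job label are illustrative.

.. code-block:: python

    import prometheus_client

    from dagster import job, op
    from dagster_prometheus import prometheus_resource

    @op(required_resource_keys={"prometheus"})
    def report_run(context):
        # Register a counter on the resource's CollectorRegistry, then push it.
        counter = prometheus_client.Counter(
            "dagster_example_runs_total",
            "Illustrative counter.",
            registry=context.resources.prometheus.registry,
        )
        counter.inc()
        context.resources.prometheus.push_to_gateway(job="dagster_example")

    @job(
        resource_defs={
            "prometheus": prometheus_resource.configured(
                {"gateway": "http://pushgateway.local:9091"}
            )
        }
    )
    def metrics_job():
        report_run()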
\n
", "current_page_name": "_modules/dagster_prometheus/resources", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_prometheus.resources"}}, "dagster_pyspark": {"resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_pyspark.resources

\nfrom dagster_spark.configs_spark import spark_config\nfrom dagster_spark.utils import flatten_dict\nfrom pyspark.sql import SparkSession\n\nimport dagster._check as check\nfrom dagster import resource\n\n\ndef spark_session_from_config(spark_conf=None):\n    spark_conf = check.opt_dict_param(spark_conf, "spark_conf")\n    builder = SparkSession.builder\n    flat = flatten_dict(spark_conf)\n    for key, value in flat:\n        builder = builder.config(key, value)\n\n    return builder.getOrCreate()\n\n\nclass PySparkResource:\n    def __init__(self, spark_conf):\n        self._spark_session = spark_session_from_config(spark_conf)\n\n    @property\n    def spark_session(self):\n        return self._spark_session\n\n    @property\n    def spark_context(self):\n        return self.spark_session.sparkContext\n\n\n
[docs]@resource({"spark_conf": spark_config()})\ndef pyspark_resource(init_context):\n """This resource provides access to a PySpark SparkSession for executing PySpark code within Dagster.\n\n Example:\n\n .. code-block:: python\n\n @op(required_resource_keys={"pyspark"})\n def my_op(context):\n spark_session = context.resources.pyspark.spark_session\n dataframe = spark_session.read.json("examples/src/main/resources/people.json")\n\n my_pyspark_resource = pyspark_resource.configured(\n {"spark_conf": {"spark.executor.memory": "2g"}}\n )\n\n @job(resource_defs={"pyspark": my_pyspark_resource})\n def my_spark_job():\n my_op()\n\n """\n return PySparkResource(init_context.resource_config["spark_conf"])
\n
", "current_page_name": "_modules/dagster_pyspark/resources", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_pyspark.resources"}}, "dagster_shell": {"solids": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_shell.solids

\nimport os\n\nfrom dagster import (\n    Enum,\n    EnumValue,\n    Failure,\n    Field,\n    InputDefinition,\n    Noneable,\n    Nothing,\n    OutputDefinition,\n    Permissive,\n)\nfrom dagster import _check as check\nfrom dagster import op, solid\n\nfrom .utils import execute, execute_script_file\n\n\ndef shell_op_config():\n    return {\n        "env": Field(\n            Noneable(Permissive()),\n            is_required=False,\n            description="An optional dict of environment variables to pass to the subprocess.",\n        ),\n        "output_logging": Field(\n            Enum(\n                name="OutputType",\n                enum_values=[\n                    EnumValue("STREAM", description="Stream script stdout/stderr."),\n                    EnumValue(\n                        "BUFFER",\n                        description="Buffer shell script stdout/stderr, then log upon completion.",\n                    ),\n                    EnumValue("NONE", description="No logging"),\n                ],\n            ),\n            is_required=False,\n            default_value="BUFFER",\n        ),\n        "cwd": Field(\n            Noneable(str),\n            default_value=None,\n            is_required=False,\n            description="Working directory in which to execute shell script",\n        ),\n    }\n\n\ndef core_shell(dagster_decorator, decorator_name):\n    @dagster_decorator(\n        name=f"shell_{decorator_name}",\n        description=(\n            f"This {decorator_name} executes a shell command it receives as input.\\n\\n"\n            f"This {decorator_name} is suitable for uses where the command to execute is generated dynamically by "\n            f"upstream {decorator_name}. If you know the command to execute at pipeline construction time, "\n            f"consider `shell_command_{decorator_name}` instead."\n        ),\n        input_defs=[InputDefinition("shell_command", str)],\n        output_defs=[OutputDefinition(str, "result")],\n        config_schema=shell_op_config(),\n    )\n    def shell_fn(context, shell_command):\n        op_config = context.op_config.copy()\n        op_config["env"] = {**os.environ, **op_config.get("env", {})}\n        output, return_code = execute(shell_command=shell_command, log=context.log, **op_config)\n\n        if return_code:\n            raise Failure(\n                description="Shell command execution failed with output: {output}".format(\n                    output=output\n                )\n            )\n\n        return output\n\n    return shell_fn\n\n\nshell_solid = core_shell(solid, "solid")\nshell_op = core_shell(op, "op")\n\n\n
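As a usage reference for the dynamically constructed ``shell_op`` above, here is a hedged sketch in which an upstream op produces the command string; the command, op names, and config values are illustrative, and the run config is optional.

.. code-block:: python

    from dagster import job, op
    from dagster_shell import shell_op

    @op
    def generate_command():
        return "echo 'hello from dagster'"

    @job
    def echo_job():
        shell_op(generate_command())

    # Optional run config, using the schema defined by shell_op_config above.
    run_config = {
        "ops": {"shell_op": {"config": {"output_logging": "STREAM", "cwd": "/tmp"}}}
    }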
[docs]def create_shell_command_op(\n shell_command,\n name,\n description=None,\n required_resource_keys=None,\n tags=None,\n):\n """This function is a factory that constructs ops to execute a shell command.\n\n Note that you can only use ``shell_command_op`` if you know the command you'd like to execute\n at pipeline construction time. If you'd like to construct shell commands dynamically during\n pipeline execution and pass them between ops, you should use ``shell_op`` instead.\n\n Examples:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-shell/dagster_shell_tests/example_shell_command_op.py\n :language: python\n\n\n Args:\n shell_command (str): The shell command that the constructed op will execute.\n name (str): The name of the constructed op.\n description (Optional[str]): Human-readable description of this op.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by this op.\n Setting this ensures that resource spin up for the required resources will occur before\n the shell command is executed.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the op. Frameworks may\n expect and require certain metadata to be attached to a op. Users should generally\n not set metadata directly. Values that are not strings will be json encoded and must meet\n the criteria that `json.loads(json.dumps(value)) == value`.\n\n Raises:\n Failure: Raised when the shell command returns a non-zero exit code.\n\n Returns:\n OpDefinition: Returns the constructed op definition.\n """\n return core_create_shell_command(\n op,\n shell_command=shell_command,\n name=name,\n description=description,\n required_resource_keys=required_resource_keys,\n tags=tags,\n )
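A hedged sketch of the factory above for the case where the command is known at construction time; the command and names are illustrative.

.. code-block:: python

    from dagster import job
    from dagster_shell import create_shell_command_op

    echo_op = create_shell_command_op("echo 'hello'", name="echo_op")

    @job
    def hello_job():
        echo_op()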
\n\n\n
[docs]def create_shell_command_solid(\n shell_command,\n name,\n description=None,\n required_resource_keys=None,\n tags=None,\n):\n """This function is a factory that constructs solids to execute a shell command.\n\n Note that you can only use ``shell_command_solid`` if you know the command you'd like to execute\n at pipeline construction time. If you'd like to construct shell commands dynamically during\n pipeline execution and pass them between solids, you should use ``shell_solid`` instead.\n\n Examples:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-shell/dagster_shell_tests/example_shell_command_solid.py\n :language: python\n\n\n Args:\n shell_command (str): The shell command that the constructed solid will execute.\n name (str): The name of the constructed solid.\n description (Optional[str]): Human-readable description of this solid.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by this solid.\n Setting this ensures that resource spin up for the required resources will occur before\n the shell command is executed.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the solid. Frameworks may\n expect and require certain metadata to be attached to a solid. Users should generally\n not set metadata directly. Values that are not strings will be json encoded and must meet\n the criteria that `json.loads(json.dumps(value)) == value`.\n\n Raises:\n Failure: Raised when the shell command returns a non-zero exit code.\n\n Returns:\n SolidDefinition: Returns the constructed solid definition.\n """\n return core_create_shell_command(\n solid,\n shell_command=shell_command,\n name=name,\n description=description,\n required_resource_keys=required_resource_keys,\n tags=tags,\n )
\n\n\ndef core_create_shell_command(\n dagster_decorator,\n shell_command,\n name,\n description=None,\n required_resource_keys=None,\n tags=None,\n):\n check.str_param(shell_command, "shell_command")\n name = check.str_param(name, "name")\n\n @dagster_decorator(\n name=name,\n description=description,\n input_defs=[InputDefinition("start", Nothing)],\n output_defs=[OutputDefinition(str, "result")],\n config_schema=shell_op_config(),\n required_resource_keys=required_resource_keys,\n tags=tags,\n )\n def _shell_fn(context):\n op_config = context.op_config.copy()\n op_config["env"] = {**os.environ, **op_config.get("env", {})}\n output, return_code = execute(shell_command=shell_command, log=context.log, **op_config)\n\n if return_code:\n raise Failure(\n description="Shell command execution failed with output: {output}".format(\n output=output\n )\n )\n\n return output\n\n return _shell_fn\n\n\n
[docs]def create_shell_script_op(\n shell_script_path, name="create_shell_script_op", input_defs=None, **kwargs\n):\n """This function is a factory which constructs an op that will execute a shell command read\n from a script file.\n\n Any kwargs passed to this function will be passed along to the underlying :func:`@op\n <dagster.op>` decorator. However, note that overriding ``config`` or ``output_defs`` is not\n supported.\n\n You might consider using :func:`@graph <dagster.graph>` to wrap this op\n in the cases where you'd like to configure the shell op with different config fields.\n\n\n Examples:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-shell/dagster_shell_tests/example_shell_script_op.py\n :language: python\n\n\n Args:\n shell_script_path (str): The script file to execute.\n name (str, optional): The name of this op. Defaults to "create_shell_script_op".\n input_defs (List[InputDefinition], optional): input definitions for the op. Defaults to\n a single Nothing input.\n\n Raises:\n Failure: Raised when the shell command returns a non-zero exit code.\n\n Returns:\n OpDefinition: Returns the constructed op definition.\n """\n return core_create_shell_script(\n dagster_decorator=solid,\n decorator_name="solid",\n shell_script_path=shell_script_path,\n name=name,\n input_defs=input_defs,\n **kwargs,\n )
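A hedged sketch of the script-file variant above; the script path is illustrative and is resolved relative to the calling file.

.. code-block:: python

    from dagster import file_relative_path, graph
    from dagster_shell import create_shell_script_op

    run_script = create_shell_script_op(
        file_relative_path(__file__, "my_script.sh"), name="run_my_script"
    )

    @graph
    def script_graph():
        run_script()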
\n\n\n
[docs]def create_shell_script_solid(\n shell_script_path, name="create_shell_script_solid", input_defs=None, **kwargs\n):\n """This function is a factory which constructs a solid that will execute a shell command read\n from a script file.\n\n Any kwargs passed to this function will be passed along to the underlying :func:`@solid\n <dagster.solid>` decorator. However, note that overriding ``config`` or ``output_defs`` is not\n supported.\n\n You might consider using :func:`@composite_solid <dagster.composite_solid>` to wrap this solid\n in the cases where you'd like to configure the shell solid with different config fields.\n\n\n Examples:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-shell/dagster_shell_tests/example_shell_script_solid.py\n :language: python\n\n\n Args:\n shell_script_path (str): The script file to execute.\n name (str, optional): The name of this solid. Defaults to "create_shell_script_solid".\n input_defs (List[InputDefinition], optional): input definitions for the solid. Defaults to\n a single Nothing input.\n\n Raises:\n Failure: Raised when the shell command returns a non-zero exit code.\n\n Returns:\n SolidDefinition: Returns the constructed solid definition.\n """\n return core_create_shell_script(\n dagster_decorator=solid,\n decorator_name="solid",\n shell_script_path=shell_script_path,\n name=name,\n input_defs=input_defs,\n **kwargs,\n )
\n\n\ndef core_create_shell_script(\n dagster_decorator,\n decorator_name,\n shell_script_path,\n name="create_shell_script_solid",\n input_defs=None,\n **kwargs,\n):\n check.str_param(shell_script_path, "shell_script_path")\n name = check.str_param(name, "name")\n check.opt_list_param(input_defs, "input_defs", of_type=InputDefinition)\n\n if "output_defs" in kwargs:\n raise TypeError(f"Overriding output_defs for shell {decorator_name} is not supported.")\n\n if "config" in kwargs:\n raise TypeError(f"Overriding config for shell {decorator_name} is not supported.")\n\n @dagster_decorator(\n name=name,\n description=kwargs.pop("description", f"A {decorator_name} to invoke a shell command."),\n input_defs=input_defs or [InputDefinition("start", Nothing)],\n output_defs=[OutputDefinition(str, "result")],\n config_schema=shell_op_config(),\n **kwargs,\n )\n def _shell_script_fn(context):\n op_config = context.op_config.copy()\n op_config["env"] = {**os.environ, **op_config.get("env", {})}\n output, return_code = execute_script_file(\n shell_script_path=shell_script_path, log=context.log, **op_config\n )\n\n if return_code:\n raise Failure(\n description="Shell command execution failed with output: {output}".format(\n output=output\n )\n )\n\n return output\n\n return _shell_script_fn\n
", "current_page_name": "_modules/dagster_shell/solids", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_shell.solids"}}, "dagster_slack": {"hooks": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_slack.hooks

\nfrom typing import Callable, Optional\n\nfrom dagster.core.definitions import failure_hook, success_hook\nfrom dagster.core.execution.context.hook import HookContext\n\n\ndef _default_status_message(context: HookContext, status: str) -> str:\n    return "Op {op_name} on job {pipeline_name} {status}!\\nRun ID: {run_id}".format(\n        op_name=context.op.name,\n        pipeline_name=context.pipeline_name,\n        run_id=context.run_id,\n        status=status,\n    )\n\n\ndef _default_failure_message(context: HookContext) -> str:\n    return _default_status_message(context, status="failed")\n\n\ndef _default_success_message(context: HookContext) -> str:\n    return _default_status_message(context, status="succeeded")\n\n\n
[docs]def slack_on_failure(\n channel: str,\n message_fn: Callable[[HookContext], str] = _default_failure_message,\n dagit_base_url: Optional[str] = None,\n):\n """Create a hook on step failure events that will message the given Slack channel.\n\n Args:\n channel (str): The channel to send the message to (e.g. "#my_channel")\n message_fn (Optional(Callable[[HookContext], str])): Function which takes in the HookContext\n and outputs the message you want to send.\n dagit_base_url (Optional[str]): The base url of your Dagit instance. Specify this to allow\n messages to include deeplinks to the specific pipeline run that triggered the hook.\n\n Examples:\n .. code-block:: python\n\n @slack_on_failure("#foo", dagit_base_url="http://localhost:3000")\n @job(...)\n def my_job():\n pass\n\n .. code-block:: python\n\n def my_message_fn(context: HookContext) -> str:\n return f"Op {context.op} failed!"\n\n @op\n def an_op(context):\n pass\n\n @job(...)\n def my_job():\n an_op.with_hooks(hook_defs={slack_on_failure("#foo", my_message_fn)})\n\n """\n\n @failure_hook(required_resource_keys={"slack"})\n def _hook(context: HookContext):\n text = message_fn(context)\n if dagit_base_url:\n text += "\\n<{base_url}/instance/runs/{run_id}|View in Dagit>".format(\n base_url=dagit_base_url, run_id=context.run_id\n )\n\n context.resources.slack.chat_postMessage(channel=channel, text=text) # type: ignore\n\n return _hook
\n\n\n
[docs]def slack_on_success(\n channel: str,\n message_fn: Callable[[HookContext], str] = _default_success_message,\n dagit_base_url: Optional[str] = None,\n):\n """Create a hook on step success events that will message the given Slack channel.\n\n Args:\n channel (str): The channel to send the message to (e.g. "#my_channel")\n message_fn (Optional(Callable[[HookContext], str])): Function which takes in the HookContext\n and outputs the message you want to send.\n dagit_base_url (Optional[str]): The base url of your Dagit instance. Specify this to allow\n messages to include deeplinks to the specific pipeline run that triggered the hook.\n\n Examples:\n .. code-block:: python\n\n @slack_on_success("#foo", dagit_base_url="http://localhost:3000")\n @job(...)\n def my_job():\n pass\n\n .. code-block:: python\n\n def my_message_fn(context: HookContext) -> str:\n return f"Op {context.op} worked!"\n\n @op\n def an_op(context):\n pass\n\n @job(...)\n def my_job():\n an_op.with_hooks(hook_defs={slack_on_success("#foo", my_message_fn)})\n\n """\n\n @success_hook(required_resource_keys={"slack"})\n def _hook(context: HookContext):\n text = message_fn(context)\n if dagit_base_url:\n text += "\\n<{base_url}/instance/runs/{run_id}|View in Dagit>".format(\n base_url=dagit_base_url, run_id=context.run_id\n )\n\n context.resources.slack.chat_postMessage(channel=channel, text=text) # type: ignore\n\n return _hook
\n
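A sketch of wiring the hooks above to a job together with the ``slack`` resource they require; the channel name and ``SLACK_TOKEN`` environment variable are assumptions for illustration:

.. code-block:: python

    import os

    from dagster import job, op
    from dagster_slack import slack_on_failure, slack_resource

    @op
    def an_op():
        raise Exception("oops")

    # slack_on_failure posts via the resource bound to the "slack" key.
    @job(
        resource_defs={
            "slack": slack_resource.configured({"token": os.getenv("SLACK_TOKEN")})
        },
        hooks={slack_on_failure("#alerts", dagit_base_url="http://localhost:3000")},
    )
    def my_job():
        an_op()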
", "current_page_name": "_modules/dagster_slack/hooks", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_slack.hooks"}, "resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_slack.resources

\nfrom slack_sdk.web.client import WebClient\n\nfrom dagster import Field, StringSource, resource\n\n\n
[docs]@resource(\n {\n "token": Field(\n StringSource,\n description="""To configure access to the Slack API, you'll need an access\n token provisioned with access to your Slack workspace.\n\n Tokens are typically either user tokens or bot tokens. For programmatic posting\n to Slack from this resource, you probably want to provision and use a bot token.\n\n More in the Slack API documentation here: https://api.slack.com/docs/token-types\n """,\n )\n },\n description="This resource is for connecting to Slack",\n)\ndef slack_resource(context):\n """This resource is for connecting to Slack.\n\n The resource object is a `slack_sdk.WebClient`.\n\n By configuring this Slack resource, you can post messages to Slack from any Dagster op:\n\n Examples:\n\n .. code-block:: python\n\n import os\n\n from dagster import job, op\n from dagster_slack import slack_resource\n\n\n @op(required_resource_keys={'slack'})\n def slack_op(context):\n context.resources.slack.chat_postMessage(channel='#noise', text=':wave: hey there!')\n\n @job(resource_defs={'slack': slack_resource})\n def slack_job():\n slack_op()\n\n slack_job.execute_in_process(\n run_config={'resources': {'slack': {'config': {'token': os.getenv('SLACK_TOKEN')}}}}\n )\n\n """\n return WebClient(context.resource_config.get("token"))
\n
", "current_page_name": "_modules/dagster_slack/resources", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_slack.resources"}, "sensors": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_slack.sensors

\nfrom typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar, Union\n\nfrom slack_sdk.web.client import WebClient\n\nfrom dagster import DefaultSensorStatus\nfrom dagster.core.definitions import GraphDefinition, PipelineDefinition\nfrom dagster.core.definitions.run_status_sensor_definition import (\n    PipelineFailureSensorContext,\n    RunFailureSensorContext,\n    RunStatusSensorContext,\n    pipeline_failure_sensor,\n    run_failure_sensor,\n)\n\nT = TypeVar("T", bound=RunStatusSensorContext)\n\n\ndef _build_slack_blocks_and_text(\n    context: T,\n    text_fn: Callable[[T], str],\n    blocks_fn: Optional[Callable[[T], List[Dict]]],\n    dagit_base_url: Optional[str],\n) -> Tuple[List[Dict[str, Any]], str]:\n    blocks: List[Dict[str, Any]] = [\n        {\n            "type": "section",\n            "text": {\n                "type": "mrkdwn",\n                "text": f'*Job "{context.pipeline_run.pipeline_name}" failed. `{context.pipeline_run.run_id.split("-")[0]}`*',\n            },\n        },\n    ]\n    main_body_text = text_fn(context)\n\n    if blocks_fn:\n        blocks.extend(blocks_fn(context))\n    else:\n        blocks.append(\n            {\n                "type": "section",\n                "text": {"type": "mrkdwn", "text": main_body_text},\n            },\n        )\n\n    if dagit_base_url:\n        blocks.append(\n            {\n                "type": "actions",\n                "elements": [\n                    {\n                        "type": "button",\n                        "text": {"type": "plain_text", "text": "View in Dagit"},\n                        "url": f"{dagit_base_url}/instance/runs/{context.pipeline_run.run_id}",\n                    }\n                ],\n            }\n        )\n    return blocks, main_body_text\n\n\ndef _default_failure_message_text_fn(\n    context: Union[PipelineFailureSensorContext, RunFailureSensorContext]\n) -> str:\n    return f"Error: ```{context.failure_event.message}```"\n\n\n
[docs]def make_slack_on_pipeline_failure_sensor(\n channel: str,\n slack_token: str,\n text_fn: Callable[[PipelineFailureSensorContext], str] = _default_failure_message_text_fn,\n blocks_fn: Optional[Callable[[PipelineFailureSensorContext], List[Dict]]] = None,\n pipeline_selection: Optional[List[str]] = None,\n name: Optional[str] = None,\n dagit_base_url: Optional[str] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n):\n """Create a sensor on pipeline failures that will message the given Slack channel.\n\n Args:\n channel (str): The channel to send the message to (e.g. "#my_channel")\n slack_token (str): The slack token.\n Tokens are typically either user tokens or bot tokens. More in the Slack API\n documentation here: https://api.slack.com/docs/token-types\n text_fn (Optional(Callable[[PipelineFailureSensorContext], str])): Function which\n takes in the ``PipelineFailureSensorContext`` and outputs the message you want to send.\n Defaults to a text message that contains error message, pipeline name, and run ID.\n The usage of the `text_fn` changes depending on whether you're using `blocks_fn`. If you\n are using `blocks_fn`, this is used as a fallback string to display in notifications. If\n you aren't, this is the main body text of the message. It can be formatted as plain text,\n or with mrkdwn.\n See more details in https://api.slack.com/methods/chat.postMessage#text_usage\n blocks_fn (Callable[[PipelineFailureSensorContext], List[Dict]]): Function which takes in\n the ``PipelineFailureSensorContext`` and outputs the message blocks you want to send.\n See information about Blocks in https://api.slack.com/reference/block-kit/blocks\n pipeline_selection (Optional[List[str]]): Names of the pipelines that will be monitored by\n this failure sensor. Defaults to None, which means the alert will be sent when any\n pipeline in the repository fails.\n name (Optional[str]): The name of the sensor. Defaults to "slack_on_pipeline_failure".\n dagit_base_url (Optional[str]): The base url of your Dagit instance. Specify this to allow\n messages to include deeplinks to the failed pipeline run.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n\n Examples:\n\n .. code-block:: python\n\n slack_on_pipeline_failure = make_slack_on_pipeline_failure_sensor(\n "#my_channel",\n os.getenv("MY_SLACK_TOKEN")\n )\n\n @repository\n def my_repo():\n return [my_pipeline, slack_on_pipeline_failure]\n\n .. code-block:: python\n\n def my_message_fn(context: PipelineFailureSensorContext) -> str:\n return "Pipeline {pipeline_name} failed! Error: {error}".format(\n pipeline_name=context.pipeline_run.pipeline_name,\n error=context.failure_event.message,\n )\n\n slack_on_pipeline_failure = make_slack_on_pipeline_failure_sensor(\n channel="#my_channel",\n slack_token=os.getenv("MY_SLACK_TOKEN"),\n text_fn=my_message_fn,\n dagit_base_url="http://mycoolsite.com",\n )\n\n\n """\n\n slack_client = WebClient(token=slack_token)\n\n @pipeline_failure_sensor(\n name=name, pipeline_selection=pipeline_selection, default_status=default_status\n )\n def slack_on_pipeline_failure(context: PipelineFailureSensorContext):\n\n blocks, main_body_text = _build_slack_blocks_and_text(\n context=context, text_fn=text_fn, blocks_fn=blocks_fn, dagit_base_url=dagit_base_url\n )\n\n slack_client.chat_postMessage(channel=channel, blocks=blocks, text=main_body_text)\n\n return slack_on_pipeline_failure
\n\n\n
[docs]def make_slack_on_run_failure_sensor(\n channel: str,\n slack_token: str,\n text_fn: Callable[[RunFailureSensorContext], str] = _default_failure_message_text_fn,\n blocks_fn: Optional[Callable[[RunFailureSensorContext], List[Dict]]] = None,\n name: Optional[str] = None,\n dagit_base_url: Optional[str] = None,\n job_selection: Optional[List[Union[PipelineDefinition, GraphDefinition]]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n):\n """Create a sensor on job failures that will message the given Slack channel.\n\n Args:\n channel (str): The channel to send the message to (e.g. "#my_channel")\n slack_token (str): The slack token.\n Tokens are typically either user tokens or bot tokens. More in the Slack API\n documentation here: https://api.slack.com/docs/token-types\n text_fn (Optional(Callable[[RunFailureSensorContext], str])): Function which\n takes in the ``RunFailureSensorContext`` and outputs the message you want to send.\n Defaults to a text message that contains error message, job name, and run ID.\n The usage of the `text_fn` changes depending on whether you're using `blocks_fn`. If you\n are using `blocks_fn`, this is used as a fallback string to display in notifications. If\n you aren't, this is the main body text of the message. It can be formatted as plain text,\n or with mrkdwn.\n See more details in https://api.slack.com/methods/chat.postMessage#text_usage\n blocks_fn (Callable[[RunFailureSensorContext], List[Dict]]): Function which takes in\n the ``RunFailureSensorContext`` and outputs the message blocks you want to send.\n See information about Blocks in https://api.slack.com/reference/block-kit/blocks\n name (Optional[str]): The name of the sensor. Defaults to "slack_on_run_failure".\n dagit_base_url (Optional[str]): The base url of your Dagit instance. Specify this to allow\n messages to include deeplinks to the failed job run.\n job_selection (Optional[List[Union[PipelineDefinition, GraphDefinition]]]): The jobs that\n will be monitored by this failure sensor. Defaults to None, which means the alert will\n be sent when any job in the repository fails.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n\n Examples:\n\n .. code-block:: python\n\n slack_on_run_failure = make_slack_on_run_failure_sensor(\n "#my_channel",\n os.getenv("MY_SLACK_TOKEN")\n )\n\n @repository\n def my_repo():\n return [my_job, slack_on_run_failure]\n\n .. code-block:: python\n\n def my_message_fn(context: RunFailureSensorContext) -> str:\n return (\n f"Job {context.pipeline_run.pipeline_name} failed!"\n f"Error: {context.failure_event.message}"\n )\n\n slack_on_run_failure = make_slack_on_run_failure_sensor(\n channel="#my_channel",\n slack_token=os.getenv("MY_SLACK_TOKEN"),\n text_fn=my_message_fn,\n dagit_base_url="http://mycoolsite.com",\n )\n\n\n """\n\n slack_client = WebClient(token=slack_token)\n\n @run_failure_sensor(name=name, job_selection=job_selection, default_status=default_status)\n def slack_on_run_failure(context: RunFailureSensorContext):\n blocks, main_body_text = _build_slack_blocks_and_text(\n context=context, text_fn=text_fn, blocks_fn=blocks_fn, dagit_base_url=dagit_base_url\n )\n\n slack_client.chat_postMessage(channel=channel, blocks=blocks, text=main_body_text)\n\n return slack_on_run_failure
\n
", "current_page_name": "_modules/dagster_slack/sensors", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_slack.sensors"}}, "dagster_snowflake": {"resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_snowflake.resources

\nimport sys\nimport warnings\nfrom contextlib import closing, contextmanager\nfrom typing import Mapping\n\nimport dagster._check as check\nfrom dagster import resource\n\nfrom .configs import define_snowflake_config\n\ntry:\n    import snowflake.connector\nexcept ImportError:\n    msg = (\n        "Could not import snowflake.connector. This could mean you have an incompatible version "\n        "of azure-storage-blob installed. dagster-snowflake requires azure-storage-blob<12.0.0; "\n        "this conflicts with dagster-azure which requires azure-storage-blob~=12.0.0 and is "\n        "incompatible with dagster-snowflake. Please uninstall dagster-azure and reinstall "\n        "dagster-snowflake to fix this error."\n    )\n    warnings.warn(msg)\n    raise\n\n\n
[docs]class SnowflakeConnection:\n def __init__(self, config: Mapping[str, str], log): # pylint: disable=too-many-locals\n # Extract parameters from resource config. Note that we can't pass None values to\n # snowflake.connector.connect() because they will override the default values set within the\n # connector; remove them from the conn_args dict.\n self.connector = config.get("connector", None)\n\n if self.connector == "sqlalchemy":\n self.conn_args = {\n k: config.get(k)\n for k in (\n "account",\n "user",\n "password",\n "database",\n "schema",\n "role",\n "warehouse",\n "cache_column_metadata",\n "numpy",\n )\n if config.get(k) is not None\n }\n\n else:\n self.conn_args = {\n k: config.get(k)\n for k in (\n "account",\n "user",\n "password",\n "database",\n "schema",\n "role",\n "warehouse",\n "autocommit",\n "client_prefetch_threads",\n "client_session_keep_alive",\n "login_timeout",\n "network_timeout",\n "ocsp_response_cache_filename",\n "validate_default_parameters",\n "paramstyle",\n "timezone",\n "authenticator",\n )\n if config.get(k) is not None\n }\n\n self.autocommit = self.conn_args.get("autocommit", False)\n self.log = log\n\n
[docs] @contextmanager\n def get_connection(self, raw_conn=True):\n if self.connector == "sqlalchemy":\n from snowflake.sqlalchemy import URL # pylint: disable=no-name-in-module,import-error\n from sqlalchemy import create_engine\n\n engine = create_engine(URL(**self.conn_args))\n conn = engine.raw_connection() if raw_conn else engine.connect()\n\n yield conn\n conn.close()\n engine.dispose()\n else:\n conn = snowflake.connector.connect(**self.conn_args)\n\n yield conn\n if not self.autocommit:\n conn.commit()\n conn.close()
\n\n
[docs] def execute_query(self, sql, parameters=None, fetch_results=False):\n check.str_param(sql, "sql")\n check.opt_dict_param(parameters, "parameters")\n check.bool_param(fetch_results, "fetch_results")\n\n with self.get_connection() as conn:\n with closing(conn.cursor()) as cursor:\n if sys.version_info[0] < 3:\n sql = sql.encode("utf-8")\n\n self.log.info("Executing query: " + sql)\n cursor.execute(sql, parameters) # pylint: disable=E1101\n if fetch_results:\n return cursor.fetchall() # pylint: disable=E1101
\n\n
[docs] def execute_queries(self, sql_queries, parameters=None, fetch_results=False):\n check.list_param(sql_queries, "sql_queries", of_type=str)\n check.opt_dict_param(parameters, "parameters")\n check.bool_param(fetch_results, "fetch_results")\n\n results = []\n with self.get_connection() as conn:\n with closing(conn.cursor()) as cursor:\n for sql in sql_queries:\n if sys.version_info[0] < 3:\n sql = sql.encode("utf-8")\n self.log.info("Executing query: " + sql)\n cursor.execute(sql, parameters) # pylint: disable=E1101\n if fetch_results:\n results.append(cursor.fetchall()) # pylint: disable=E1101\n\n return results if fetch_results else None
\n\n
[docs] def load_table_from_local_parquet(self, src, table):\n check.str_param(src, "src")\n check.str_param(table, "table")\n\n sql_queries = [\n "CREATE OR REPLACE TABLE {table} ( data VARIANT DEFAULT NULL);".format(table=table),\n "CREATE OR REPLACE FILE FORMAT parquet_format TYPE = 'parquet';",\n "PUT {src} @%{table};".format(src=src, table=table),\n "COPY INTO {table} FROM @%{table} FILE_FORMAT = (FORMAT_NAME = 'parquet_format');".format(\n table=table\n ),\n ]\n\n self.execute_queries(sql_queries)
\n\n\n
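A brief sketch of calling the connection methods above from an op through the ``snowflake`` resource defined below; the queries are illustrative only:

.. code-block:: python

    from dagster import op

    @op(required_resource_keys={"snowflake"})
    def run_queries(context):
        # execute_queries runs each statement on one connection and, with
        # fetch_results=True, returns a list of result sets.
        results = context.resources.snowflake.execute_queries(
            ["SELECT 1", "SELECT CURRENT_TIMESTAMP"],
            fetch_results=True,
        )
        context.log.info(f"Fetched {len(results)} result sets")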
[docs]@resource(\n config_schema=define_snowflake_config(),\n description="This resource is for connecting to the Snowflake data warehouse",\n)\ndef snowflake_resource(context):\n """A resource for connecting to the Snowflake data warehouse.\n\n A simple example of loading data into Snowflake and subsequently querying that data is shown below:\n\n Examples:\n\n .. code-block:: python\n\n from dagster import job, op\n from dagster_snowflake import snowflake_resource\n\n @op(required_resource_keys={'snowflake'})\n def get_one(context):\n context.resources.snowflake.execute_query('SELECT 1')\n\n @job(resource_defs={'snowflake': snowflake_resource})\n def my_snowflake_job():\n get_one()\n\n my_snowflake_job.execute_in_process(\n run_config={\n 'resources': {\n 'snowflake': {\n 'config': {\n 'account': {'env': 'SNOWFLAKE_ACCOUNT'},\n 'user': {'env': 'SNOWFLAKE_USER'},\n 'password': {'env': 'SNOWFLAKE_PASSWORD'},\n 'database': {'env': 'SNOWFLAKE_DATABASE'},\n 'schema': {'env': 'SNOWFLAKE_SCHEMA'},\n 'warehouse': {'env': 'SNOWFLAKE_WAREHOUSE'},\n }\n }\n }\n }\n )\n\n """\n return SnowflakeConnection(context.resource_config, context.log)
\n\n\ndef _filter_password(args):\n """Remove password from connection args for logging"""\n return {k: v for k, v in args.items() if k != "password"}\n
", "current_page_name": "_modules/dagster_snowflake/resources", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_snowflake.resources"}, "snowflake_io_manager": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_snowflake.snowflake_io_manager

\nfrom typing import Sequence\n\nfrom dagster import Field, IOManagerDefinition, OutputContext, StringSource, io_manager\n\nfrom .db_io_manager import DbClient, DbIOManager, DbTypeHandler, TablePartition, TableSlice\nfrom .resources import SnowflakeConnection\n\nSNOWFLAKE_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"\n\n\n
[docs]def build_snowflake_io_manager(type_handlers: Sequence[DbTypeHandler]) -> IOManagerDefinition:\n """\n Builds an IO manager definition that reads inputs from and writes outputs to Snowflake.\n\n Args:\n type_handlers (Sequence[DbTypeHandler]): Each handler defines how to translate between\n slices of Snowflake tables and an in-memory type - e.g. a Pandas DataFrame.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n\n .. code-block:: python\n\n from dagster_snowflake import build_snowflake_io_manager\n from dagster_snowflake_pandas import SnowflakePandasTypeHandler\n\n snowflake_io_manager = build_snowflake_io_manager([SnowflakePandasTypeHandler()])\n\n @job(resource_defs={'io_manager': snowflake_io_manager})\n def my_job():\n ...\n """\n\n @io_manager(\n config_schema={\n "database": StringSource,\n "account": StringSource,\n "user": StringSource,\n "password": StringSource,\n "warehouse": Field(StringSource, is_required=False),\n }\n )\n def snowflake_io_manager():\n return DbIOManager(type_handlers=type_handlers, db_client=SnowflakeDbClient())\n\n return snowflake_io_manager
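Based on the ``config_schema`` above, the IO manager might be configured roughly as follows; the job name and environment variables are placeholders:

.. code-block:: python

    my_job.execute_in_process(
        run_config={
            "resources": {
                "io_manager": {
                    "config": {
                        "database": "MY_DATABASE",
                        "account": {"env": "SNOWFLAKE_ACCOUNT"},
                        "user": {"env": "SNOWFLAKE_USER"},
                        "password": {"env": "SNOWFLAKE_PASSWORD"},
                        # "warehouse" is optional per the schema above
                    }
                }
            }
        }
    )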
\n\n\nclass SnowflakeDbClient(DbClient):\n @staticmethod\n def delete_table_slice(context: OutputContext, table_slice: TableSlice) -> None:\n with SnowflakeConnection(\n dict(**(context.resource_config or {}), schema=table_slice.schema), context.log\n ).get_connection() as con:\n con.execute_string(_get_cleanup_statement(table_slice))\n\n @staticmethod\n def get_select_statement(table_slice: TableSlice) -> str:\n col_str = ", ".join(table_slice.columns) if table_slice.columns else "*"\n if table_slice.partition:\n return (\n f"SELECT {col_str} FROM {table_slice.database}.{table_slice.schema}.{table_slice.table}\\n"\n + _time_window_where_clause(table_slice.partition)\n )\n else:\n return f"""SELECT {col_str} FROM {table_slice.database}.{table_slice.schema}.{table_slice.table}"""\n\n\ndef _get_cleanup_statement(table_slice: TableSlice) -> str:\n """\n Returns a SQL statement that deletes data in the given table to make way for the output data\n being written.\n """\n if table_slice.partition:\n return (\n f"DELETE FROM {table_slice.database}.{table_slice.schema}.{table_slice.table}\\n"\n + _time_window_where_clause(table_slice.partition)\n )\n else:\n return f"DELETE FROM {table_slice.database}.{table_slice.schema}.{table_slice.table}"\n\n\ndef _time_window_where_clause(table_partition: TablePartition) -> str:\n start_dt, end_dt = table_partition.time_window\n start_dt_str = start_dt.strftime(SNOWFLAKE_DATETIME_FORMAT)\n end_dt_str = end_dt.strftime(SNOWFLAKE_DATETIME_FORMAT)\n return f"""WHERE {table_partition.partition_expr} BETWEEN '{start_dt_str}' AND '{end_dt_str}'"""\n
", "current_page_name": "_modules/dagster_snowflake/snowflake_io_manager", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_snowflake.snowflake_io_manager"}, "solids": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_snowflake.solids

\nfrom dagster import InputDefinition, Nothing\nfrom dagster import _check as check\nfrom dagster import op, solid\n\n\ndef _core_create_snowflake_command(dagster_decorator, decorator_name, sql, parameters=None):\n    check.str_param(sql, "sql")\n    check.opt_dict_param(parameters, "parameters")\n\n    @dagster_decorator(\n        name=f"snowflake_{decorator_name}",\n        input_defs=[InputDefinition("start", Nothing)],\n        required_resource_keys={"snowflake"},\n        tags={"kind": "sql", "sql": sql},\n    )\n    def snowflake_fn(context):\n        context.resources.snowflake.execute_query(sql=sql, parameters=parameters)\n\n    return snowflake_fn\n\n\ndef snowflake_solid_for_query(sql, parameters=None):\n    """This function is a solid factory that constructs solids to execute a snowflake query.\n\n    Note that you can only use `snowflake_solid_for_query` if you know the query you'd like to\n    execute at pipeline construction time. If you'd like to execute queries dynamically during\n    pipeline execution, you should manually execute those queries in your custom solid using the\n    snowflake resource.\n\n    Args:\n        sql (str): The sql query that will execute against the provided snowflake resource.\n        parameters (dict): The parameters for the sql query.\n\n    Returns:\n        SolidDefinition: Returns the constructed solid definition.\n    """\n    return _core_create_snowflake_command(solid, "solid", sql, parameters)\n\n\n
[docs]def snowflake_op_for_query(sql, parameters=None):\n """This function is an op factory that constructs an op to execute a snowflake query.\n\n Note that you can only use `snowflake_op_for_query` if you know the query you'd like to\n execute at graph construction time. If you'd like to execute queries dynamically during\n job execution, you should manually execute those queries in your custom op using the\n snowflake resource.\n\n Args:\n sql (str): The sql query that will execute against the provided snowflake resource.\n parameters (dict): The parameters for the sql query.\n\n Returns:\n OpDefinition: Returns the constructed op definition.\n """\n return _core_create_snowflake_command(op, "op", sql, parameters)
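A sketch of wiring the op factory above into a job with the ``snowflake_resource``; the query text is illustrative, and connection config would still be supplied via run_config as shown in the resource docs:

.. code-block:: python

    from dagster import job
    from dagster_snowflake import snowflake_op_for_query, snowflake_resource

    count_rows = snowflake_op_for_query("SELECT COUNT(*) FROM my_table")

    @job(resource_defs={"snowflake": snowflake_resource})
    def my_snowflake_job():
        count_rows()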
\n
", "current_page_name": "_modules/dagster_snowflake/solids", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_snowflake.solids"}}, "dagster_snowflake_pandas": {"snowflake_pandas_type_handler": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_snowflake_pandas.snowflake_pandas_type_handler

\nfrom typing import Mapping, Union, cast\n\nfrom dagster_snowflake import DbTypeHandler\nfrom dagster_snowflake.resources import SnowflakeConnection\nfrom dagster_snowflake.snowflake_io_manager import SnowflakeDbClient, TableSlice\nfrom pandas import DataFrame, read_sql\nfrom snowflake.connector.pandas_tools import pd_writer\n\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster.core.definitions.metadata import RawMetadataValue\n\n\ndef _connect_snowflake(context: Union[InputContext, OutputContext], table_slice: TableSlice):\n    return SnowflakeConnection(\n        dict(\n            schema=table_slice.schema,\n            connector="sqlalchemy",\n            **cast(Mapping[str, str], context.resource_config),\n        ),\n        context.log,\n    ).get_connection(raw_conn=False)\n\n\n
[docs]class SnowflakePandasTypeHandler(DbTypeHandler[DataFrame]):\n """\n Defines how to translate between slices of Snowflake tables and Pandas DataFrames.\n\n Examples:\n\n .. code-block:: python\n\n from dagster_snowflake import build_snowflake_io_manager\n from dagster_snowflake_pandas import SnowflakePandasTypeHandler\n\n snowflake_io_manager = build_snowflake_io_manager([SnowflakePandasTypeHandler()])\n\n @job(resource_defs={'io_manager': snowflake_io_manager})\n def my_job():\n ...\n """\n\n def handle_output(\n self, context: OutputContext, table_slice: TableSlice, obj: DataFrame\n ) -> Mapping[str, RawMetadataValue]:\n from snowflake import connector # pylint: disable=no-name-in-module\n\n connector.paramstyle = "pyformat"\n with _connect_snowflake(context, table_slice) as con:\n with_uppercase_cols = obj.rename(str.upper, copy=False, axis="columns")\n with_uppercase_cols.to_sql(\n table_slice.table,\n con=con.engine,\n if_exists="append",\n index=False,\n method=pd_writer,\n )\n\n return {\n "row_count": obj.shape[0],\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(name=name, type=str(dtype))\n for name, dtype in obj.dtypes.iteritems()\n ]\n )\n ),\n }\n\n def load_input(self, context: InputContext, table_slice: TableSlice) -> DataFrame:\n with _connect_snowflake(context, table_slice) as con:\n result = read_sql(sql=SnowflakeDbClient.get_select_statement(table_slice), con=con)\n result.columns = map(str.lower, result.columns)\n return result\n\n @property\n def supported_types(self):\n return [DataFrame]
\n
", "current_page_name": "_modules/dagster_snowflake_pandas/snowflake_pandas_type_handler", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_snowflake_pandas.snowflake_pandas_type_handler"}}, "dagster_spark": {"configs": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_spark.configs

\n"""Spark Configuration\n\nIn this file we define the key configuration parameters for submitting Spark jobs. Spark can be run\nin a variety of deployment contexts. See the Spark documentation at\nhttps://spark.apache.org/docs/latest/submitting-applications.html for a more in-depth summary of\nSpark deployment contexts and configuration.\n"""\nfrom dagster import Field, StringSource\n\nfrom .configs_spark import spark_config\nfrom .types import SparkDeployMode\n\n\n
[docs]def define_spark_config():\n """Spark configuration.\n\n See the Spark documentation for reference:\n https://spark.apache.org/docs/latest/submitting-applications.html\n """\n\n master_url = Field(\n StringSource,\n description="The master URL for the cluster (e.g. spark://23.195.26.187:7077)",\n is_required=True,\n )\n\n deploy_mode = Field(\n SparkDeployMode,\n description="""Whether to deploy your driver on the worker nodes (cluster) or locally as an\n external client (client) (default: client). A common deployment strategy is to submit your\n application from a gateway machine that is physically co-located with your worker machines\n (e.g. Master node in a standalone EC2 cluster). In this setup, client mode is appropriate.\n In client mode, the driver is launched directly within the spark-submit process which acts\n as a client to the cluster. The input and output of the application is attached to the\n console. Thus, this mode is especially suitable for applications that involve the REPL (e.g.\n Spark shell).""",\n is_required=False,\n )\n\n application_jar = Field(\n StringSource,\n description="""Path to a bundled jar including your application and all\n dependencies. The URL must be globally visible inside of your cluster, for\n instance, an hdfs:// path or a file:// path that is present on all nodes.\n """,\n is_required=True,\n )\n\n application_arguments = Field(\n StringSource,\n description="Arguments passed to the main method of your main class, if any",\n is_required=False,\n )\n\n spark_home = Field(\n StringSource,\n description="The path to your spark installation. Defaults to $SPARK_HOME at runtime if not provided.",\n is_required=False,\n )\n\n return {\n "master_url": master_url,\n "deploy_mode": deploy_mode,\n "application_jar": application_jar,\n "spark_conf": spark_config(),\n "spark_home": spark_home,\n "application_arguments": application_arguments,\n }
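Given the fields above, run config for an op built with this schema might look roughly like the following (the op name and all values are placeholders):

.. code-block:: python

    run_config = {
        "ops": {
            "my_spark_op": {  # hypothetical op name
                "config": {
                    "master_url": "spark://23.195.26.187:7077",
                    "deploy_mode": "client",
                    "application_jar": "file:///path/to/app.jar",
                }
            }
        }
    }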
\n
", "current_page_name": "_modules/dagster_spark/configs", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_spark.configs"}, "ops": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_spark.ops

\nfrom dagster import InputDefinition, Nothing, OutputDefinition\nfrom dagster import _check as check\nfrom dagster import op, solid\n\nfrom .configs import define_spark_config\n\n\ndef create_spark_solid(\n    name, main_class, description=None, required_resource_keys=frozenset(["spark"])\n):\n    return core_create_spark(\n        dagster_decorator=solid,\n        name=name,\n        main_class=main_class,\n        description=description,\n        required_resource_keys=required_resource_keys,\n    )\n\n\n
[docs]def create_spark_op(\n name, main_class, description=None, required_resource_keys=frozenset(["spark"])\n):\n return core_create_spark(\n dagster_decorator=op,\n name=name,\n main_class=main_class,\n description=description,\n required_resource_keys=required_resource_keys,\n )
\n\n\ndef core_create_spark(\n dagster_decorator,\n name,\n main_class,\n description=None,\n required_resource_keys=frozenset(["spark"]),\n):\n check.str_param(name, "name")\n check.str_param(main_class, "main_class")\n check.opt_str_param(description, "description", "A parameterized Spark job.")\n check.set_param(required_resource_keys, "required_resource_keys")\n\n @dagster_decorator(\n name=name,\n description=description,\n config_schema=define_spark_config(),\n input_defs=[InputDefinition("start", Nothing)],\n output_defs=[OutputDefinition(Nothing)],\n tags={"kind": "spark", "main_class": main_class},\n required_resource_keys=required_resource_keys,\n )\n def spark_solid(context): # pylint: disable=unused-argument\n context.resources.spark.run_spark_job(context.solid_config, main_class)\n\n return spark_solid\n
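A sketch of using the op factory above together with ``spark_resource``; the op name and main class are illustrative:

.. code-block:: python

    from dagster import job
    from dagster_spark import create_spark_op, spark_resource

    run_etl = create_spark_op("run_etl", main_class="com.example.SparkETL")

    @job(resource_defs={"spark": spark_resource})
    def spark_job():
        run_etl()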
", "current_page_name": "_modules/dagster_spark/ops", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_spark.ops"}, "resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_spark.resources

\nimport os\nimport subprocess\n\nimport dagster._check as check\nfrom dagster import resource\nfrom dagster.core.log_manager import DagsterLogManager\n\nfrom .types import SparkOpError\nfrom .utils import construct_spark_shell_command\n\n\nclass SparkResource:\n    def __init__(self, logger):\n        self.logger = check.inst_param(logger, "logger", DagsterLogManager)\n\n    def run_spark_job(self, config, main_class):\n        check.dict_param(config, "config")\n        check.str_param(main_class, "main_class")\n\n        # Extract parameters from config\n        (\n            master_url,\n            deploy_mode,\n            application_jar,\n            spark_conf,\n            application_arguments,\n            spark_home,\n        ) = [\n            config.get(k)\n            for k in (\n                "master_url",\n                "deploy_mode",\n                "application_jar",\n                "spark_conf",\n                "application_arguments",\n                "spark_home",\n            )\n        ]\n\n        if not os.path.exists(application_jar):\n            raise SparkOpError(\n                (\n                    "Application jar {} does not exist. A valid jar must be "\n                    "built before running this op.".format(application_jar)\n                )\n            )\n\n        spark_shell_cmd = construct_spark_shell_command(\n            application_jar=application_jar,\n            main_class=main_class,\n            master_url=master_url,\n            spark_conf=spark_conf,\n            deploy_mode=deploy_mode,\n            application_arguments=application_arguments,\n            spark_home=spark_home,\n        )\n        self.logger.info("Running spark-submit: " + " ".join(spark_shell_cmd))\n\n        retcode = subprocess.call(" ".join(spark_shell_cmd), shell=True)\n\n        if retcode != 0:\n            raise SparkOpError("Spark job failed. Please consult your logs.")\n\n\n
[docs]@resource\ndef spark_resource(context):\n return SparkResource(context.log)
\n
", "current_page_name": "_modules/dagster_spark/resources", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_spark.resources"}, "types": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_spark.types

\nfrom dagster import Enum, EnumValue\n\nSparkDeployModeCluster = EnumValue("cluster")\nSparkDeployModeClient = EnumValue("client")\nSparkDeployMode = Enum(\n    name="SparkDeployMode", enum_values=[SparkDeployModeCluster, SparkDeployModeClient]\n)\n\n\nclass SparkSolidError(Exception):\n    pass\n\n\n
[docs]class SparkOpError(SparkSolidError):\n pass
\n
", "current_page_name": "_modules/dagster_spark/types", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_spark.types"}, "utils": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_spark.utils

\nimport itertools\nimport os\n\nimport dagster._check as check\n\nfrom .types import SparkOpError\n\n\ndef flatten_dict(d):\n    def _flatten_dict(d, result, key_path=None):\n        """Iterates an arbitrarily nested dictionary and yields dot-notation key:value tuples.\n\n        {'foo': {'bar': 3, 'baz': 1}, 'other': {'key': 1}} =>\n            [('foo.bar', 3), ('foo.baz', 1), ('other.key', 1)]\n\n        """\n        for k, v in d.items():\n            new_key_path = (key_path or []) + [k]\n            if isinstance(v, dict):\n                _flatten_dict(v, result, new_key_path)\n            else:\n                result.append((".".join(new_key_path), v))\n\n    result = []\n    if d is not None:\n        _flatten_dict(d, result)\n    return result\n\n\ndef parse_spark_config(spark_conf):\n    """For each key-value pair in spark conf, we need to pass to CLI in format:\n\n    --conf "key=value"\n    """\n\n    spark_conf_list = flatten_dict(spark_conf)\n    return format_for_cli(spark_conf_list)\n\n\ndef format_for_cli(spark_conf_list):\n    return list(\n        itertools.chain.from_iterable([("--conf", "{}={}".format(*c)) for c in spark_conf_list])\n    )\n\n\n
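For illustration, flattening and formatting a nested Spark config with the helpers above behaves roughly like this (values are made up):

.. code-block:: python

    conf = {"spark": {"executor": {"memory": "4g", "cores": 2}}}

    # flatten_dict(conf) ->
    #     [("spark.executor.memory", "4g"), ("spark.executor.cores", 2)]
    # parse_spark_config(conf) ->
    #     ["--conf", "spark.executor.memory=4g", "--conf", "spark.executor.cores=2"]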
[docs]def construct_spark_shell_command(\n application_jar,\n main_class,\n master_url=None,\n spark_conf=None,\n deploy_mode=None,\n application_arguments=None,\n spark_home=None,\n):\n """Constructs the spark-submit command for a Spark job."""\n check.opt_str_param(master_url, "master_url")\n check.str_param(application_jar, "application_jar")\n spark_conf = check.opt_dict_param(spark_conf, "spark_conf")\n check.opt_str_param(deploy_mode, "deploy_mode")\n check.opt_str_param(application_arguments, "application_arguments")\n check.opt_str_param(spark_home, "spark_home")\n\n spark_home = spark_home if spark_home else os.environ.get("SPARK_HOME")\n if spark_home is None:\n raise SparkOpError(\n (\n "No spark home set. You must either pass spark_home in config or "\n "set $SPARK_HOME in your environment (got None)."\n )\n )\n\n master_url = ["--master", master_url] if master_url else []\n deploy_mode = ["--deploy-mode", deploy_mode] if deploy_mode else []\n\n spark_shell_cmd = (\n ["{}/bin/spark-submit".format(spark_home), "--class", main_class]\n + master_url\n + deploy_mode\n + parse_spark_config(spark_conf)\n + [application_jar]\n # Omit application_arguments when not provided so the command can be joined safely\n + ([application_arguments] if application_arguments else [])\n )\n return spark_shell_cmd
\n
", "current_page_name": "_modules/dagster_spark/utils", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_spark.utils"}}, "dagster_ssh": {"resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_ssh.resources

\nimport getpass\nimport os\nfrom io import StringIO\n\nimport paramiko\nfrom paramiko.config import SSH_PORT\nfrom sshtunnel import SSHTunnelForwarder\n\nfrom dagster import Field, StringSource\nfrom dagster import _check as check\nfrom dagster import resource\nfrom dagster.utils import merge_dicts, mkdir_p\n\n\ndef key_from_str(key_str):\n    """Creates a paramiko SSH key from a string."""\n    check.str_param(key_str, "key_str")\n\n    # py2 StringIO doesn't support with\n    key_file = StringIO(key_str)\n    result = paramiko.RSAKey.from_private_key(key_file)\n    key_file.close()\n    return result\n\n\n
[docs]class SSHResource:\n """\n Resource for ssh remote execution using Paramiko.\n ref: https://github.com/paramiko/paramiko\n """\n\n def __init__(\n self,\n remote_host,\n remote_port,\n username=None,\n password=None,\n key_file=None,\n key_string=None,\n timeout=10,\n keepalive_interval=30,\n compress=True,\n no_host_key_check=True,\n allow_host_key_change=False,\n logger=None,\n ):\n self.remote_host = check.str_param(remote_host, "remote_host")\n self.remote_port = check.opt_int_param(remote_port, "remote_port")\n self.username = check.opt_str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.key_file = check.opt_str_param(key_file, "key_file")\n self.timeout = check.opt_int_param(timeout, "timeout")\n self.keepalive_interval = check.opt_int_param(keepalive_interval, "keepalive_interval")\n self.compress = check.opt_bool_param(compress, "compress")\n self.no_host_key_check = check.opt_bool_param(no_host_key_check, "no_host_key_check")\n self.allow_host_key_change = check.opt_bool_param(\n allow_host_key_change, "allow_host_key_change"\n )\n self.log = logger\n\n self.host_proxy = None\n\n # Create RSAKey object from private key string\n self.key_obj = key_from_str(key_string) if key_string is not None else None\n\n # Auto detecting username values from system\n if not self.username:\n logger.debug(\n "username to ssh to host: %s is not specified. Using system's default provided by"\n " getpass.getuser()" % self.remote_host\n )\n self.username = getpass.getuser()\n\n user_ssh_config_filename = os.path.expanduser("~/.ssh/config")\n if os.path.isfile(user_ssh_config_filename):\n ssh_conf = paramiko.SSHConfig()\n ssh_conf.parse(open(user_ssh_config_filename, encoding="utf8"))\n host_info = ssh_conf.lookup(self.remote_host)\n if host_info and host_info.get("proxycommand"):\n self.host_proxy = paramiko.ProxyCommand(host_info.get("proxycommand"))\n\n if not (self.password or self.key_file):\n if host_info and host_info.get("identityfile"):\n self.key_file = host_info.get("identityfile")[0]\n\n def get_connection(self):\n """\n Opens a SSH connection to the remote host.\n\n :rtype: paramiko.client.SSHClient\n """\n client = paramiko.SSHClient()\n if not self.allow_host_key_change:\n self.log.warning(\n "Remote Identification Change is not verified. This won't protect against "\n "Man-In-The-Middle attacks"\n )\n client.load_system_host_keys()\n if self.no_host_key_check:\n self.log.warning(\n "No Host Key Verification. 
This won't protect against Man-In-The-Middle attacks"\n )\n # Default is RejectPolicy\n client.set_missing_host_key_policy(paramiko.AutoAddPolicy())\n\n if self.password and self.password.strip():\n client.connect(\n hostname=self.remote_host,\n username=self.username,\n password=self.password,\n key_filename=self.key_file,\n pkey=self.key_obj,\n timeout=self.timeout,\n compress=self.compress,\n port=self.remote_port,\n sock=self.host_proxy,\n look_for_keys=False,\n )\n else:\n client.connect(\n hostname=self.remote_host,\n username=self.username,\n key_filename=self.key_file,\n pkey=self.key_obj,\n timeout=self.timeout,\n compress=self.compress,\n port=self.remote_port,\n sock=self.host_proxy,\n )\n\n if self.keepalive_interval:\n client.get_transport().set_keepalive(self.keepalive_interval)\n\n return client\n\n def get_tunnel(self, remote_port, remote_host="localhost", local_port=None):\n check.int_param(remote_port, "remote_port")\n check.str_param(remote_host, "remote_host")\n check.opt_int_param(local_port, "local_port")\n\n if local_port is not None:\n local_bind_address = ("localhost", local_port)\n else:\n local_bind_address = ("localhost",)\n\n # Will prefer key string if specified, otherwise use the key file\n pkey = self.key_obj if self.key_obj else self.key_file\n\n if self.password and self.password.strip():\n client = SSHTunnelForwarder(\n self.remote_host,\n ssh_port=self.remote_port,\n ssh_username=self.username,\n ssh_password=self.password,\n ssh_pkey=pkey,\n ssh_proxy=self.host_proxy,\n local_bind_address=local_bind_address,\n remote_bind_address=(remote_host, remote_port),\n logger=self.log,\n )\n else:\n client = SSHTunnelForwarder(\n self.remote_host,\n ssh_port=self.remote_port,\n ssh_username=self.username,\n ssh_pkey=pkey,\n ssh_proxy=self.host_proxy,\n local_bind_address=local_bind_address,\n remote_bind_address=(remote_host, remote_port),\n host_pkey_directories=[],\n logger=self.log,\n )\n\n return client\n\n def sftp_get(self, remote_filepath, local_filepath):\n check.str_param(remote_filepath, "remote_filepath")\n check.str_param(local_filepath, "local_filepath")\n conn = self.get_connection()\n with conn.open_sftp() as sftp_client:\n local_folder = os.path.dirname(local_filepath)\n\n # Create intermediate directories if they don't exist\n mkdir_p(local_folder)\n\n self.log.info(\n "Starting to transfer from {0} to {1}".format(remote_filepath, local_filepath)\n )\n\n sftp_client.get(remote_filepath, local_filepath)\n\n conn.close()\n return local_filepath\n\n def sftp_put(self, remote_filepath, local_filepath, confirm=True):\n check.str_param(remote_filepath, "remote_filepath")\n check.str_param(local_filepath, "local_filepath")\n conn = self.get_connection()\n with conn.open_sftp() as sftp_client:\n self.log.info(\n "Starting to transfer file from {0} to {1}".format(local_filepath, remote_filepath)\n )\n\n sftp_client.put(local_filepath, remote_filepath, confirm=confirm)\n\n conn.close()\n return local_filepath
\n\n\n
[docs]@resource(\n {\n "remote_host": Field(\n StringSource, description="remote host to connect to", is_required=True\n ),\n "remote_port": Field(\n int,\n description="port of remote host to connect (Default is paramiko SSH_PORT)",\n is_required=False,\n default_value=SSH_PORT,\n ),\n "username": Field(\n StringSource, description="username to connect to the remote_host", is_required=False\n ),\n "password": Field(\n StringSource,\n description="password of the username to connect to the remote_host",\n is_required=False,\n ),\n "key_file": Field(\n StringSource,\n description="key file to use to connect to the remote_host.",\n is_required=False,\n ),\n "key_string": Field(\n StringSource,\n description="key string to use to connect to remote_host",\n is_required=False,\n ),\n "timeout": Field(\n int,\n description="timeout for the attempt to connect to the remote_host.",\n is_required=False,\n default_value=10,\n ),\n "keepalive_interval": Field(\n int,\n description="send a keepalive packet to remote host every keepalive_interval seconds",\n is_required=False,\n default_value=30,\n ),\n "compress": Field(bool, is_required=False, default_value=True),\n "no_host_key_check": Field(bool, is_required=False, default_value=True),\n "allow_host_key_change": Field(bool, is_required=False, default_value=False),\n }\n)\ndef ssh_resource(init_context):\n args = merge_dicts(init_context.resource_config, {"logger": init_context.log})\n return SSHResource(**args)
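A sketch of using the resource above from an op; the host, key path, and file paths are placeholders:

.. code-block:: python

    from dagster import job, op
    from dagster_ssh import ssh_resource

    @op(required_resource_keys={"ssh"})
    def fetch_file(context):
        # sftp_get copies a remote file to a local path, creating
        # intermediate local directories as needed.
        return context.resources.ssh.sftp_get("/remote/data.csv", "/tmp/data.csv")

    @job(
        resource_defs={
            "ssh": ssh_resource.configured(
                {"remote_host": "example.com", "key_file": "/path/to/id_rsa"}
            )
        }
    )
    def ssh_job():
        fetch_file()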
\n
", "current_page_name": "_modules/dagster_ssh/resources", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_ssh.resources"}}, "dagster_twilio": {"resources": {"alabaster_version": "0.7.12", "body": "

Source code for dagster_twilio.resources

\nfrom twilio.rest import Client\n\nfrom dagster import Field, StringSource, resource\n\n\n
[docs]@resource(\n {\n "account_sid": Field(StringSource, description="Twilio Account SID"),\n "auth_token": Field(StringSource, description="Twilio Auth Token"),\n },\n description="This resource is for connecting to Twilio",\n)\ndef twilio_resource(context):\n return Client(context.resource_config["account_sid"], context.resource_config["auth_token"])
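A sketch of sending an SMS through the resource above; the phone numbers and environment variable names are placeholders:

.. code-block:: python

    from dagster import job, op
    from dagster_twilio import twilio_resource

    @op(required_resource_keys={"twilio"})
    def send_sms(context):
        # The resource object is a twilio.rest.Client.
        context.resources.twilio.messages.create(
            body="Hello from Dagster!", from_="+15551234567", to="+15557654321"
        )

    @job(
        resource_defs={
            "twilio": twilio_resource.configured(
                {
                    "account_sid": {"env": "TWILIO_ACCOUNT_SID"},
                    "auth_token": {"env": "TWILIO_AUTH_TOKEN"},
                }
            )
        }
    )
    def twilio_job():
        send_sms()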
\n
", "current_page_name": "_modules/dagster_twilio/resources", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagster_twilio.resources"}}, "dagstermill": {"context": {"alabaster_version": "0.7.12", "body": "

Source code for dagstermill.context

\nfrom typing import Any, Dict, Mapping, Optional, Set, cast\n\nfrom dagster import PipelineDefinition, PipelineRun, SolidDefinition\nfrom dagster import _check as check\nfrom dagster.core.definitions.dependency import Node, NodeHandle\nfrom dagster.core.execution.context.compute import AbstractComputeExecutionContext\nfrom dagster.core.execution.context.system import PlanExecutionContext, StepExecutionContext\nfrom dagster.core.log_manager import DagsterLogManager\nfrom dagster.core.system_config.objects import ResolvedRunConfig\n\n\n
[docs]class DagstermillExecutionContext(AbstractComputeExecutionContext):\n """Dagstermill-specific execution context.\n\n Do not initialize directly: use :func:`dagstermill.get_context`.\n """\n\n def __init__(\n self,\n pipeline_context: PlanExecutionContext,\n pipeline_def: PipelineDefinition,\n resource_keys_to_init: Set[str],\n solid_name: str,\n solid_handle: NodeHandle,\n solid_config: Any = None,\n ):\n self._pipeline_context = check.inst_param(\n pipeline_context, "pipeline_context", PlanExecutionContext\n )\n self._pipeline_def = check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)\n self._resource_keys_to_init = check.set_param(\n resource_keys_to_init, "resource_keys_to_init", of_type=str\n )\n self.solid_name = check.str_param(solid_name, "solid_name")\n self.solid_handle = check.inst_param(solid_handle, "solid_handle", NodeHandle)\n self._solid_config = solid_config\n\n
[docs] def has_tag(self, key: str) -> bool:\n """Check if a logging tag is defined on the context.\n\n Args:\n key (str): The key to check.\n\n Returns:\n bool\n """\n check.str_param(key, "key")\n return self._pipeline_context.has_tag(key)
\n\n
[docs] def get_tag(self, key: str) -> Optional[str]:\n """Get a logging tag defined on the context.\n\n Args:\n key (str): The key to get.\n\n Returns:\n str\n """\n check.str_param(key, "key")\n return self._pipeline_context.get_tag(key)
\n\n @property\n def run_id(self) -> str:\n """str: The run_id for the context."""\n return self._pipeline_context.run_id\n\n @property\n def run_config(self) -> Mapping[str, Any]:\n """dict: The run_config for the context."""\n return self._pipeline_context.run_config\n\n @property\n def resolved_run_config(self) -> ResolvedRunConfig:\n """:class:`dagster.ResolvedRunConfig`: The resolved_run_config for the context"""\n return self._pipeline_context.resolved_run_config\n\n @property\n def logging_tags(self) -> Dict[str, str]:\n """dict: The logging tags for the context."""\n return self._pipeline_context.logging_tags\n\n @property\n def pipeline_name(self) -> str:\n return self._pipeline_context.pipeline_name\n\n @property\n def pipeline_def(self) -> PipelineDefinition:\n """:class:`dagster.PipelineDefinition`: The pipeline definition for the context.\n\n This will be a dagstermill-specific shim.\n """\n return self._pipeline_def\n\n @property\n def resources(self) -> Any:\n """collections.namedtuple: A dynamically-created type whose properties allow access to\n resources."""\n return self._pipeline_context.scoped_resources_builder.build(\n required_resource_keys=self._resource_keys_to_init,\n )\n\n @property\n def pipeline_run(self) -> PipelineRun:\n """:class:`dagster.PipelineRun`: The pipeline run for the context."""\n return self._pipeline_context.pipeline_run\n\n @property\n def log(self) -> DagsterLogManager:\n """:class:`dagster.DagsterLogManager`: The log manager for the context.\n\n Call, e.g., ``log.info()`` to log messages through the Dagster machinery.\n """\n return self._pipeline_context.log\n\n @property\n def solid_def(self) -> SolidDefinition:\n """:class:`dagster.SolidDefinition`: The solid definition for the context.\n\n In interactive contexts, this may be a dagstermill-specific shim, depending whether a\n solid definition was passed to ``dagstermill.get_context``.\n """\n return cast(SolidDefinition, self.pipeline_def.solid_def_named(self.solid_name))\n\n @property\n def solid(self) -> Node:\n """:class:`dagster.Node`: The solid for the context.\n\n In interactive contexts, this may be a dagstermill-specific shim, depending whether a\n solid definition was passed to ``dagstermill.get_context``.\n """\n return self.pipeline_def.get_solid(self.solid_handle)\n\n @property\n def solid_config(self) -> Any:\n """collections.namedtuple: A dynamically-created type whose properties allow access to\n solid-specific config."""\n if self._solid_config:\n return self._solid_config\n\n solid_config = self.resolved_run_config.solids.get(self.solid_name)\n return solid_config.config if solid_config else None
\n\n\nclass DagstermillRuntimeExecutionContext(DagstermillExecutionContext):\n def __init__(\n self,\n pipeline_context: PlanExecutionContext,\n pipeline_def: PipelineDefinition,\n resource_keys_to_init: Set[str],\n solid_name: str,\n step_context: StepExecutionContext,\n solid_handle: NodeHandle,\n solid_config: Any = None,\n ):\n self._step_context = check.inst_param(step_context, "step_context", StepExecutionContext)\n super().__init__(\n pipeline_context,\n pipeline_def,\n resource_keys_to_init,\n solid_name,\n solid_handle,\n solid_config,\n )\n
", "current_page_name": "_modules/dagstermill/context", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagstermill.context"}, "errors": {"alabaster_version": "0.7.12", "body": "

Source code for dagstermill.errors

\nfrom dagster.core.errors import DagsterError\n\n\n
[docs]class DagstermillError(DagsterError):\n """Base class for errors raised by dagstermill."""
\n
", "current_page_name": "_modules/dagstermill/errors", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagstermill.errors"}, "factory": {"alabaster_version": "0.7.12", "body": "

Source code for dagstermill.factory

\nimport copy\nimport os\nimport pickle\nimport sys\nimport tempfile\nimport uuid\nfrom typing import Any, Dict, List, Optional, Sequence, Set, Union\n\nimport nbformat\nimport papermill\nfrom papermill.engines import papermill_engines\nfrom papermill.iorw import load_notebook_node, write_ipynb\n\nfrom dagster import InputDefinition, OpDefinition, Output, OutputDefinition, SolidDefinition\nfrom dagster import _check as check\nfrom dagster import seven\nfrom dagster.core.definitions.events import AssetMaterialization, Failure, RetryRequested\nfrom dagster.core.definitions.metadata import MetadataValue\nfrom dagster.core.definitions.reconstruct import ReconstructablePipeline\nfrom dagster.core.definitions.utils import validate_tags\nfrom dagster.core.execution.context.compute import SolidExecutionContext\nfrom dagster.core.execution.context.input import build_input_context\nfrom dagster.core.execution.context.system import StepExecutionContext\nfrom dagster.core.execution.plan.outputs import StepOutputHandle\nfrom dagster.core.storage.file_manager import FileHandle\nfrom dagster.serdes import pack_value\nfrom dagster.seven import get_system_temp_directory\nfrom dagster.utils import mkdir_p, safe_tempfile_path\nfrom dagster.utils.backcompat import rename_warning\nfrom dagster.utils.error import serializable_error_info_from_exc_info\n\nfrom .compat import ExecutionError\nfrom .engine import DagstermillEngine\nfrom .errors import DagstermillError\nfrom .translator import DagsterTranslator\n\n\n# https://github.com/nteract/papermill/blob/17d4bbb3960c30c263bca835e48baf34322a3530/papermill/parameterize.py\ndef _find_first_tagged_cell_index(nb, tag):\n    parameters_indices = []\n    for idx, cell in enumerate(nb.cells):\n        if tag in cell.metadata.tags:\n            parameters_indices.append(idx)\n    if not parameters_indices:\n        return -1\n    return parameters_indices[0]\n\n\n# This is based on papermill.parameterize.parameterize_notebook\n# Typically, papermill injects the injected-parameters cell *below* the parameters cell\n# but we want to *replace* the parameters cell, which is what this function does.\ndef replace_parameters(context, nb, parameters):\n    """Assigned parameters into the appropiate place in the input notebook\n\n    Args:\n        nb (NotebookNode): Executable notebook object\n        parameters (dict): Arbitrary keyword arguments to pass to the notebook parameters.\n    """\n    check.dict_param(parameters, "parameters")\n\n    # Copy the nb object to avoid polluting the input\n    nb = copy.deepcopy(nb)\n\n    # papermill method chooses translator based on kernel_name and language, but we just call the\n    # DagsterTranslator to generate parameter content based on the kernel_name\n    param_content = DagsterTranslator.codify(parameters)\n\n    newcell = nbformat.v4.new_code_cell(source=param_content)\n    newcell.metadata["tags"] = ["injected-parameters"]\n\n    param_cell_index = _find_first_tagged_cell_index(nb, "parameters")\n    injected_cell_index = _find_first_tagged_cell_index(nb, "injected-parameters")\n    if injected_cell_index >= 0:\n        # Replace the injected cell with a new version\n        before = nb.cells[:injected_cell_index]\n        after = nb.cells[injected_cell_index + 1 :]\n        check.int_value_param(param_cell_index, -1, "param_cell_index")\n        # We should have blown away the parameters cell if there is an injected-parameters cell\n    elif param_cell_index >= 0:\n        # Replace the parameter cell with the 
injected-parameters cell\n        before = nb.cells[:param_cell_index]\n        after = nb.cells[param_cell_index + 1 :]\n    else:\n        # Inject to the top of the notebook, presumably first cell includes dagstermill import\n        context.log.debug(\n            (\n                "Executing notebook with no tagged parameters cell: injecting boilerplate in first "\n                "cell."\n            )\n        )\n        before = []\n        after = nb.cells\n\n    nb.cells = before + [newcell] + after\n    nb.metadata.papermill["parameters"] = seven.json.dumps(parameters)\n\n    return nb\n\n\ndef get_papermill_parameters(step_context, inputs, output_log_path, compute_descriptor):\n    check.inst_param(step_context, "step_context", StepExecutionContext)\n    check.param_invariant(\n        isinstance(step_context.run_config, dict),\n        "step_context",\n        "StepExecutionContext must have valid run_config",\n    )\n    check.dict_param(inputs, "inputs", key_type=str)\n\n    run_id = step_context.run_id\n    temp_dir = get_system_temp_directory()\n    marshal_dir = os.path.normpath(os.path.join(temp_dir, "dagstermill", str(run_id), "marshal"))\n    mkdir_p(marshal_dir)\n\n    if not isinstance(step_context.pipeline, ReconstructablePipeline):\n        if compute_descriptor == "solid":\n            raise DagstermillError(\n                "Can't execute a dagstermill solid from a pipeline that is not reconstructable. "\n                "Use the reconstructable() function if executing from python"\n            )\n        else:\n            raise DagstermillError(\n                "Can't execute a dagstermill op from a job that is not reconstructable. "\n                "Use the reconstructable() function if executing from python"\n            )\n\n    dm_executable_dict = step_context.pipeline.to_dict()\n\n    dm_context_dict = {\n        "output_log_path": output_log_path,\n        "marshal_dir": marshal_dir,\n        "run_config": step_context.run_config,\n    }\n\n    dm_solid_handle_kwargs = step_context.solid_handle._asdict()\n    dm_step_key = step_context.step.key\n\n    parameters = {}\n\n    parameters["__dm_context"] = dm_context_dict\n    parameters["__dm_executable_dict"] = dm_executable_dict\n    parameters["__dm_pipeline_run_dict"] = pack_value(step_context.pipeline_run)\n    parameters["__dm_solid_handle_kwargs"] = dm_solid_handle_kwargs\n    parameters["__dm_instance_ref_dict"] = pack_value(step_context.instance.get_ref())\n    parameters["__dm_step_key"] = dm_step_key\n    parameters["__dm_input_names"] = list(inputs.keys())\n\n    return parameters\n\n\ndef _dm_compute(\n    dagster_factory_name,\n    name,\n    notebook_path,\n    output_notebook_name=None,\n    asset_key_prefix=None,\n    output_notebook=None,\n):\n    check.str_param(name, "name")\n    check.str_param(notebook_path, "notebook_path")\n    check.opt_str_param(output_notebook_name, "output_notebook_name")\n    check.opt_list_param(asset_key_prefix, "asset_key_prefix")\n    check.opt_str_param(output_notebook, "output_notebook")\n\n    def _t_fn(step_context, inputs):\n        check.inst_param(step_context, "step_context", SolidExecutionContext)\n        check.param_invariant(\n            isinstance(step_context.run_config, dict),\n            "context",\n            "StepExecutionContext must have valid run_config",\n        )\n\n        step_execution_context = step_context.get_step_execution_context()\n\n        with tempfile.TemporaryDirectory() as output_notebook_dir:\n            with 
safe_tempfile_path() as output_log_path:\n\n                prefix = str(uuid.uuid4())\n                parameterized_notebook_path = os.path.join(\n                    output_notebook_dir, f"{prefix}-inter.ipynb"\n                )\n\n                executed_notebook_path = os.path.join(output_notebook_dir, f"{prefix}-out.ipynb")\n\n                # Scaffold the registration here\n                nb = load_notebook_node(notebook_path)\n                compute_descriptor = (\n                    "solid" if dagster_factory_name == "define_dagstermill_solid" else "op"\n                )\n                nb_no_parameters = replace_parameters(\n                    step_execution_context,\n                    nb,\n                    get_papermill_parameters(\n                        step_execution_context, inputs, output_log_path, compute_descriptor\n                    ),\n                )\n                write_ipynb(nb_no_parameters, parameterized_notebook_path)\n\n                try:\n                    papermill_engines.register("dagstermill", DagstermillEngine)\n                    papermill.execute_notebook(\n                        input_path=parameterized_notebook_path,\n                        output_path=executed_notebook_path,\n                        engine_name="dagstermill",\n                        log_output=True,\n                    )\n\n                except Exception as ex:\n                    step_execution_context.log.warn(\n                        "Error when attempting to materialize executed notebook: {exc}".format(\n                            exc=str(serializable_error_info_from_exc_info(sys.exc_info()))\n                        )\n                    )\n                    # pylint: disable=no-member\n                    # compat:\n                    if isinstance(ex, ExecutionError) and (\n                        ex.ename == "RetryRequested" or ex.ename == "Failure"\n                    ):\n                        step_execution_context.log.warn(\n                            f"Encountered raised {ex.ename} in notebook. 
Use dagstermill.yield_event "\n                            "with RetryRequested or Failure to trigger their behavior."\n                        )\n\n                    raise\n\n            step_execution_context.log.debug(\n                "Notebook execution complete for {name} at {executed_notebook_path}.".format(\n                    name=name,\n                    executed_notebook_path=executed_notebook_path,\n                )\n            )\n            if output_notebook_name is not None:\n                # yield output notebook binary stream as a solid output\n                with open(executed_notebook_path, "rb") as fd:\n                    yield Output(fd.read(), output_notebook_name)\n\n            else:\n                # backcompat\n                executed_notebook_file_handle = None\n                try:\n                    # use binary mode when moving the file since certain file_managers such as S3\n                    # may try to hash the contents\n                    with open(executed_notebook_path, "rb") as fd:\n                        executed_notebook_file_handle = step_context.resources.file_manager.write(\n                            fd, mode="wb", ext="ipynb"\n                        )\n                        executed_notebook_materialization_path = (\n                            executed_notebook_file_handle.path_desc\n                        )\n\n                    yield AssetMaterialization(\n                        asset_key=(asset_key_prefix + [f"{name}_output_notebook"]),\n                        description="Location of output notebook in file manager",\n                        metadata={\n                            "path": MetadataValue.path(executed_notebook_materialization_path),\n                        },\n                    )\n\n                except Exception:\n                    # if file manager writing errors, e.g. 
file manager is not provided, we throw a warning\n                    # and fall back to the previously stored temp executed notebook.\n                    step_context.log.warning(\n                        "Error when attempting to materialize executed notebook using file manager: "\n                        f"{str(serializable_error_info_from_exc_info(sys.exc_info()))}"\n                        f"\\nNow falling back to local: notebook execution was temporarily materialized at {executed_notebook_path}"\n                        "\\nIf you have supplied a file manager and expect to use it for materializing the "\n                        'notebook, please include "file_manager" in the `required_resource_keys` argument '\n                        f"to `{dagster_factory_name}`"\n                    )\n\n                if output_notebook is not None:\n                    yield Output(executed_notebook_file_handle, output_notebook)\n\n            # deferred import for perf\n            import scrapbook\n\n            output_nb = scrapbook.read_notebook(executed_notebook_path)\n\n            for (output_name, _) in step_execution_context.solid_def.output_dict.items():\n                data_dict = output_nb.scraps.data_dict\n                if output_name in data_dict:\n                    # read outputs that were passed out of process via io manager from `yield_result`\n                    step_output_handle = StepOutputHandle(\n                        step_key=step_execution_context.step.key, output_name=output_name\n                    )\n                    output_context = step_execution_context.get_output_context(step_output_handle)\n                    io_manager = step_execution_context.get_io_manager(step_output_handle)\n                    value = io_manager.load_input(\n                        build_input_context(upstream_output=output_context)\n                    )\n\n                    yield Output(value, output_name)\n\n            for key, value in output_nb.scraps.items():\n                if key.startswith("event-"):\n                    with open(value.data, "rb") as fd:\n                        event = pickle.loads(fd.read())\n                        if isinstance(event, (Failure, RetryRequested)):\n                            raise event\n                        else:\n                            yield event\n\n    return _t_fn\n\n\n
[docs]def define_dagstermill_solid(\n    name: str,\n    notebook_path: str,\n    input_defs: Optional[Sequence[InputDefinition]] = None,\n    output_defs: Optional[Sequence[OutputDefinition]] = None,\n    config_schema: Optional[Union[Any, Dict[str, Any]]] = None,\n    required_resource_keys: Optional[Set[str]] = None,\n    output_notebook: Optional[str] = None,\n    output_notebook_name: Optional[str] = None,\n    asset_key_prefix: Optional[Union[List[str], str]] = None,\n    description: Optional[str] = None,\n    tags: Optional[Dict[str, Any]] = None,\n):\n    """Wrap a Jupyter notebook in a solid.\n\n    Arguments:\n        name (str): The name of the solid.\n        notebook_path (str): Path to the backing notebook.\n        input_defs (Optional[List[InputDefinition]]): The solid's inputs.\n        output_defs (Optional[List[OutputDefinition]]): The solid's outputs. Your notebook should\n            call :py:func:`~dagstermill.yield_result` to yield each of these outputs.\n        required_resource_keys (Optional[Set[str]]): The string names of any required resources.\n        output_notebook (Optional[str]): If set, will be used as the name of an injected output of\n            type :py:class:`~dagster.FileHandle` that will point to the executed notebook (in\n            addition to the :py:class:`~dagster.AssetMaterialization` that is always created). This\n            respects the :py:class:`~dagster.core.storage.file_manager.FileManager` configured on\n            the pipeline resources via the "file_manager" resource key, so, e.g.,\n            if :py:class:`~dagster_aws.s3.s3_file_manager` is configured, the output will be a\n            :py:class:`~dagster_aws.s3.S3FileHandle`.\n        output_notebook_name (Optional[str]): If set, will be used as the name of an injected output\n            of type :py:class:`~dagster.BufferedIOBase` that is the file object of the executed\n            notebook (in addition to the :py:class:`~dagster.AssetMaterialization` that is always\n            created). 
It allows the downstream solids to access the executed notebook via a file\n            object.\n        asset_key_prefix (Optional[Union[List[str], str]]): If set, will be used to prefix the\n            asset keys for materialized notebooks.\n        description (Optional[str]): If set, the description used for the solid.\n        tags (Optional[Dict[str, str]]): If set, additional tags used to annotate the solid.\n            Dagster uses the tag keys `notebook_path` and `kind`, which cannot be\n            overwritten by the user.\n\n    Returns:\n        :py:class:`~dagster.SolidDefinition`\n    """\n    check.str_param(name, "name")\n    check.str_param(notebook_path, "notebook_path")\n    input_defs = check.opt_list_param(input_defs, "input_defs", of_type=InputDefinition)\n    output_defs = check.opt_list_param(output_defs, "output_defs", of_type=OutputDefinition)\n    required_resource_keys = set(\n        check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n    )\n\n    extra_output_defs = []\n    if output_notebook_name is not None:\n        required_resource_keys.add("output_notebook_io_manager")\n        extra_output_defs.append(\n            OutputDefinition(name=output_notebook_name, io_manager_key="output_notebook_io_manager")\n        )\n    # backcompat\n    if output_notebook is not None:\n        rename_warning(\n            new_name="output_notebook_name", old_name="output_notebook", breaking_version="0.14.0"\n        )\n        required_resource_keys.add("file_manager")\n        extra_output_defs.append(OutputDefinition(dagster_type=FileHandle, name=output_notebook))\n\n    if isinstance(asset_key_prefix, str):\n        asset_key_prefix = [asset_key_prefix]\n\n    asset_key_prefix = check.opt_list_param(asset_key_prefix, "asset_key_prefix", of_type=str)\n\n    default_description = f"This solid is backed by the notebook at {notebook_path}"\n    description = check.opt_str_param(description, "description", default=default_description)\n\n    user_tags = validate_tags(tags)\n    if tags is not None:\n        check.invariant(\n            "notebook_path" not in tags,\n            "user-defined solid tags contain the `notebook_path` key, but the `notebook_path` key is reserved for use by Dagster",\n        )\n        check.invariant(\n            "kind" not in tags,\n            "user-defined solid tags contain the `kind` key, but the `kind` key is reserved for use by Dagster",\n        )\n    default_tags = {"notebook_path": notebook_path, "kind": "ipynb"}\n\n    return SolidDefinition(\n        name=name,\n        input_defs=input_defs,\n        compute_fn=_dm_compute(\n            "define_dagstermill_solid",\n            name,\n            notebook_path,\n            output_notebook_name,\n            asset_key_prefix=asset_key_prefix,\n            output_notebook=output_notebook, # backcompat\n        ),\n        output_defs=output_defs + extra_output_defs,\n        config_schema=config_schema,\n        required_resource_keys=required_resource_keys,\n        description=description,\n        tags={**user_tags, **default_tags},\n    )
\n\n\n
[docs]def define_dagstermill_op(\n    name: str,\n    notebook_path: str,\n    input_defs: Optional[Sequence[InputDefinition]] = None,\n    output_defs: Optional[Sequence[OutputDefinition]] = None,\n    config_schema: Optional[Union[Any, Dict[str, Any]]] = None,\n    required_resource_keys: Optional[Set[str]] = None,\n    output_notebook_name: Optional[str] = None,\n    asset_key_prefix: Optional[Union[List[str], str]] = None,\n    description: Optional[str] = None,\n    tags: Optional[Dict[str, Any]] = None,\n):\n    """Wrap a Jupyter notebook in an op.\n\n    Arguments:\n        name (str): The name of the op.\n        notebook_path (str): Path to the backing notebook.\n        input_defs (Optional[List[InputDefinition]]): The op's inputs.\n        output_defs (Optional[List[OutputDefinition]]): The op's outputs. Your notebook should\n            call :py:func:`~dagstermill.yield_result` to yield each of these outputs.\n        required_resource_keys (Optional[Set[str]]): The string names of any required resources.\n        output_notebook_name (Optional[str]): If set, will be used as the name of an injected output\n            of type :py:class:`~dagster.BufferedIOBase` that is the file object of the executed\n            notebook (in addition to the :py:class:`~dagster.AssetMaterialization` that is always\n            created). It allows downstream ops to access the executed notebook via a file\n            object.\n        asset_key_prefix (Optional[Union[List[str], str]]): If set, will be used to prefix the\n            asset keys for materialized notebooks.\n        description (Optional[str]): If set, the description used for the op.\n        tags (Optional[Dict[str, str]]): If set, additional tags used to annotate the op.\n            Dagster uses the tag keys `notebook_path` and `kind`, which cannot be\n            overwritten by the user.\n\n    Returns:\n        :py:class:`~dagster.OpDefinition`\n    """\n    check.str_param(name, "name")\n    check.str_param(notebook_path, "notebook_path")\n    input_defs = check.opt_list_param(input_defs, "input_defs", of_type=InputDefinition)\n    output_defs = check.opt_list_param(output_defs, "output_defs", of_type=OutputDefinition)\n    required_resource_keys = set(\n        check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n    )\n\n    extra_output_defs = []\n    if output_notebook_name is not None:\n        required_resource_keys.add("output_notebook_io_manager")\n        extra_output_defs.append(\n            OutputDefinition(name=output_notebook_name, io_manager_key="output_notebook_io_manager")\n        )\n\n    if isinstance(asset_key_prefix, str):\n        asset_key_prefix = [asset_key_prefix]\n\n    asset_key_prefix = check.opt_list_param(asset_key_prefix, "asset_key_prefix", of_type=str)\n\n    default_description = f"This op is backed by the notebook at {notebook_path}"\n    description = check.opt_str_param(description, "description", default=default_description)\n\n    user_tags = validate_tags(tags)\n    if tags is not None:\n        check.invariant(\n            "notebook_path" not in tags,\n            "user-defined op tags contain the `notebook_path` key, but the `notebook_path` key is reserved for use by Dagster",\n        )\n        check.invariant(\n            "kind" not in tags,\n            "user-defined op tags contain the `kind` key, but the `kind` key is reserved for use by Dagster",\n        )\n    default_tags = {"notebook_path": notebook_path, "kind": "ipynb"}\n\n    return OpDefinition(\n        name=name,\n        input_defs=input_defs,\n        compute_fn=_dm_compute(\n            "define_dagstermill_op",\n            name,\n            notebook_path,\n            output_notebook_name,\n            asset_key_prefix=asset_key_prefix,\n        ),\n        output_defs=output_defs + extra_output_defs,\n        config_schema=config_schema,\n        required_resource_keys=required_resource_keys,\n        
description=description,\n tags={**user_tags, **default_tags},\n )
\n
", "current_page_name": "_modules/dagstermill/factory", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagstermill.factory"}, "io_managers": {"alabaster_version": "0.7.12", "body": "

Source code for dagstermill.io_managers

\nimport os\nfrom pathlib import Path\nfrom typing import Any, List, Optional\n\nimport dagster._check as check\nfrom dagster.config.field import Field\nfrom dagster.core.definitions.events import AssetKey\nfrom dagster.core.definitions.metadata import MetadataEntry, MetadataValue\nfrom dagster.core.execution.context.input import InputContext\nfrom dagster.core.execution.context.output import OutputContext\nfrom dagster.core.storage.io_manager import IOManager, io_manager\nfrom dagster.utils import mkdir_p\n\n\nclass OutputNotebookIOManager(IOManager):\n    def __init__(self, asset_key_prefix: Optional[List[str]] = None):\n        self.asset_key_prefix = asset_key_prefix if asset_key_prefix else []\n\n    def get_output_asset_key(self, context: OutputContext):\n        return AssetKey([*self.asset_key_prefix, f"{context.step_key}_output_notebook"])\n\n    def handle_output(self, context: OutputContext, obj: bytes):\n        raise NotImplementedError\n\n    def load_input(self, context: InputContext) -> Any:\n        raise NotImplementedError\n\n\nclass LocalOutputNotebookIOManager(OutputNotebookIOManager):\n    """Built-in IO Manager for handling output notebook."""\n\n    def __init__(self, base_dir: str, asset_key_prefix: Optional[List[str]] = None):\n        super(LocalOutputNotebookIOManager, self).__init__(asset_key_prefix=asset_key_prefix)\n        self.base_dir = base_dir\n        self.write_mode = "wb"\n        self.read_mode = "rb"\n\n    def _get_path(self, context: OutputContext) -> str:\n        """Automatically construct filepath."""\n        keys = context.get_run_scoped_output_identifier()\n        return str(Path(self.base_dir, *keys).with_suffix(".ipynb"))\n\n    def handle_output(self, context: OutputContext, obj: bytes):\n        """obj: bytes"""\n        check.inst_param(context, "context", OutputContext)\n\n        # the output notebook itself is stored at output_file_path\n        output_notebook_path = self._get_path(context)\n        mkdir_p(os.path.dirname(output_notebook_path))\n        with open(output_notebook_path, self.write_mode) as dest_file_obj:\n            dest_file_obj.write(obj)\n        yield MetadataEntry("path", value=MetadataValue.path(output_notebook_path))\n\n    def load_input(self, context) -> bytes:\n        check.inst_param(context, "context", InputContext)\n        # pass output notebook to downstream solids as File Object\n        with open(self._get_path(context.upstream_output), self.read_mode) as file_obj:\n            return file_obj.read()\n\n\n
[docs]@io_manager(\n config_schema={\n "asset_key_prefix": Field(str, is_required=False),\n "base_dir": Field(str, is_required=False),\n },\n)\ndef local_output_notebook_io_manager(init_context):\n """Built-in IO Manager that handles output notebooks."""\n return LocalOutputNotebookIOManager(\n base_dir=init_context.resource_config.get(\n "base_dir", init_context.instance.storage_directory()\n ),\n asset_key_prefix=init_context.resource_config.get("asset_key_prefix", []),\n )
\n
", "current_page_name": "_modules/dagstermill/io_managers", "customsidebar": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["globaltoc.html", "searchbox.html"], "title": "dagstermill.io_managers"}}} \ No newline at end of file diff --git a/docs/content/api/searchindex.json b/docs/content/api/searchindex.json index 7b6111702e9d9..214c5fbf2e660 100644 --- a/docs/content/api/searchindex.json +++ b/docs/content/api/searchindex.json @@ -1 +1 @@ -{"docnames": ["index", "sections/api/apidocs/assets", "sections/api/apidocs/cli", "sections/api/apidocs/config", "sections/api/apidocs/dynamic", "sections/api/apidocs/errors", "sections/api/apidocs/execution", "sections/api/apidocs/graphs", "sections/api/apidocs/hooks", "sections/api/apidocs/internals", "sections/api/apidocs/io-managers", "sections/api/apidocs/jobs", "sections/api/apidocs/libraries/dagster-airbyte", "sections/api/apidocs/libraries/dagster-airflow", "sections/api/apidocs/libraries/dagster-aws", "sections/api/apidocs/libraries/dagster-azure", "sections/api/apidocs/libraries/dagster-celery", "sections/api/apidocs/libraries/dagster-celery-docker", "sections/api/apidocs/libraries/dagster-celery-k8s", "sections/api/apidocs/libraries/dagster-dask", "sections/api/apidocs/libraries/dagster-databricks", "sections/api/apidocs/libraries/dagster-datadog", "sections/api/apidocs/libraries/dagster-dbt", "sections/api/apidocs/libraries/dagster-docker", "sections/api/apidocs/libraries/dagster-fivetran", "sections/api/apidocs/libraries/dagster-gcp", "sections/api/apidocs/libraries/dagster-ge", "sections/api/apidocs/libraries/dagster-github", "sections/api/apidocs/libraries/dagster-graphql", "sections/api/apidocs/libraries/dagster-k8s", "sections/api/apidocs/libraries/dagster-mlflow", "sections/api/apidocs/libraries/dagster-msteams", "sections/api/apidocs/libraries/dagster-mysql", "sections/api/apidocs/libraries/dagster-pagerduty", "sections/api/apidocs/libraries/dagster-pandas", "sections/api/apidocs/libraries/dagster-papertrail", "sections/api/apidocs/libraries/dagster-postgres", "sections/api/apidocs/libraries/dagster-prometheus", "sections/api/apidocs/libraries/dagster-pyspark", "sections/api/apidocs/libraries/dagster-shell", "sections/api/apidocs/libraries/dagster-slack", "sections/api/apidocs/libraries/dagster-snowflake", "sections/api/apidocs/libraries/dagster-spark", "sections/api/apidocs/libraries/dagster-ssh", "sections/api/apidocs/libraries/dagster-twilio", "sections/api/apidocs/libraries/dagstermill", "sections/api/apidocs/loggers", "sections/api/apidocs/memoization", "sections/api/apidocs/modes", "sections/api/apidocs/ops", "sections/api/apidocs/partitions", "sections/api/apidocs/pipeline", "sections/api/apidocs/presets", "sections/api/apidocs/repositories", "sections/api/apidocs/resources", "sections/api/apidocs/schedules-sensors", "sections/api/apidocs/solids", "sections/api/apidocs/types", "sections/api/apidocs/utilities"], "envversion": {"nbsphinx": 3, "sphinx": 56, "sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 3, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 2, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["index.rst", "sections/api/apidocs/assets.rst", "sections/api/apidocs/cli.rst", "sections/api/apidocs/config.rst", "sections/api/apidocs/dynamic.rst", "sections/api/apidocs/errors.rst", 
"sections/api/apidocs/execution.rst", "sections/api/apidocs/graphs.rst", "sections/api/apidocs/hooks.rst", "sections/api/apidocs/internals.rst", "sections/api/apidocs/io-managers.rst", "sections/api/apidocs/jobs.rst", "sections/api/apidocs/libraries/dagster-airbyte.rst", "sections/api/apidocs/libraries/dagster-airflow.rst", "sections/api/apidocs/libraries/dagster-aws.rst", "sections/api/apidocs/libraries/dagster-azure.rst", "sections/api/apidocs/libraries/dagster-celery.rst", "sections/api/apidocs/libraries/dagster-celery-docker.rst", "sections/api/apidocs/libraries/dagster-celery-k8s.rst", "sections/api/apidocs/libraries/dagster-dask.rst", "sections/api/apidocs/libraries/dagster-databricks.rst", "sections/api/apidocs/libraries/dagster-datadog.rst", "sections/api/apidocs/libraries/dagster-dbt.rst", "sections/api/apidocs/libraries/dagster-docker.rst", "sections/api/apidocs/libraries/dagster-fivetran.rst", "sections/api/apidocs/libraries/dagster-gcp.rst", "sections/api/apidocs/libraries/dagster-ge.rst", "sections/api/apidocs/libraries/dagster-github.rst", "sections/api/apidocs/libraries/dagster-graphql.rst", "sections/api/apidocs/libraries/dagster-k8s.rst", "sections/api/apidocs/libraries/dagster-mlflow.rst", "sections/api/apidocs/libraries/dagster-msteams.rst", "sections/api/apidocs/libraries/dagster-mysql.rst", "sections/api/apidocs/libraries/dagster-pagerduty.rst", "sections/api/apidocs/libraries/dagster-pandas.rst", "sections/api/apidocs/libraries/dagster-papertrail.rst", "sections/api/apidocs/libraries/dagster-postgres.rst", "sections/api/apidocs/libraries/dagster-prometheus.rst", "sections/api/apidocs/libraries/dagster-pyspark.rst", "sections/api/apidocs/libraries/dagster-shell.rst", "sections/api/apidocs/libraries/dagster-slack.rst", "sections/api/apidocs/libraries/dagster-snowflake.rst", "sections/api/apidocs/libraries/dagster-spark.rst", "sections/api/apidocs/libraries/dagster-ssh.rst", "sections/api/apidocs/libraries/dagster-twilio.rst", "sections/api/apidocs/libraries/dagstermill.rst", "sections/api/apidocs/loggers.rst", "sections/api/apidocs/memoization.rst", "sections/api/apidocs/modes.rst", "sections/api/apidocs/ops.rst", "sections/api/apidocs/partitions.rst", "sections/api/apidocs/pipeline.rst", "sections/api/apidocs/presets.rst", "sections/api/apidocs/repositories.rst", "sections/api/apidocs/resources.rst", "sections/api/apidocs/schedules-sensors.rst", "sections/api/apidocs/solids.rst", "sections/api/apidocs/types.rst", "sections/api/apidocs/utilities.rst"], "objects": {"dagit": {"--attribute": [2, 7, 1, "cmdoption-dagit-a"], "--db-statement-timeout": [2, 7, 1, "cmdoption-dagit-db-statement-timeout"], "--empty-workspace": [2, 7, 1, "cmdoption-dagit-empty-workspace"], "--grpc-host": [2, 7, 1, "cmdoption-dagit-grpc-host"], "--grpc-port": [2, 7, 1, "cmdoption-dagit-grpc-port"], "--grpc-socket": [2, 7, 1, "cmdoption-dagit-grpc-socket"], "--host": [2, 7, 1, "cmdoption-dagit-h"], "--module-name": [2, 7, 1, "cmdoption-dagit-m"], "--package-name": [2, 7, 1, "cmdoption-dagit-package-name"], "--path-prefix": [2, 7, 1, "cmdoption-dagit-l"], "--port": [2, 7, 1, "cmdoption-dagit-p"], "--python-file": [2, 7, 1, "cmdoption-dagit-f"], "--read-only": [2, 7, 1, "cmdoption-dagit-read-only"], "--suppress-warnings": [2, 7, 1, "cmdoption-dagit-suppress-warnings"], "--use-ssl": [2, 7, 1, "cmdoption-dagit-use-ssl"], "--version": [2, 7, 1, "cmdoption-dagit-version"], "--working-directory": [2, 7, 1, "cmdoption-dagit-d"], "--workspace": [2, 7, 1, "cmdoption-dagit-w"], "-a": [2, 7, 1, 
"cmdoption-dagit-a"], "-d": [2, 7, 1, "cmdoption-dagit-d"], "-f": [2, 7, 1, "cmdoption-dagit-f"], "-h": [2, 7, 1, "cmdoption-dagit-h"], "-l": [2, 7, 1, "cmdoption-dagit-l"], "-m": [2, 7, 1, "cmdoption-dagit-m"], "-p": [2, 7, 1, "cmdoption-dagit-p"], "-w": [2, 7, 1, "cmdoption-dagit-w"]}, "dagster": {"Any": [57, 0, 1, ""], "Array": [3, 1, 1, ""], "AssetGroup": [1, 1, 1, ""], "AssetIn": [1, 1, 1, ""], "AssetKey": [49, 1, 1, ""], "AssetMaterialization": [49, 1, 1, ""], "AssetSensorDefinition": [55, 1, 1, ""], "Backoff": [49, 1, 1, ""], "Bool": [57, 0, 1, ""], "BoolSource": [3, 0, 1, ""], "CompositeSolidDefinition": [56, 1, 1, ""], "CompositeSolidExecutionResult": [56, 1, 1, ""], "ConfigMapping": [3, 1, 1, ""], "ConfigSchema": [3, 1, 1, ""], "DagsterAssetMetadataValue": [49, 1, 1, ""], "DagsterConfigMappingFunctionError": [5, 3, 1, ""], "DagsterError": [5, 3, 1, ""], "DagsterEvent": [6, 1, 1, ""], "DagsterEventLogInvalidForRun": [5, 3, 1, ""], "DagsterEventType": [6, 1, 1, ""], "DagsterExecutionStepExecutionError": [5, 3, 1, ""], "DagsterExecutionStepNotFoundError": [5, 3, 1, ""], "DagsterInstance": [9, 1, 1, ""], "DagsterInvalidConfigDefinitionError": [5, 3, 1, ""], "DagsterInvalidConfigError": [5, 3, 1, ""], "DagsterInvalidDefinitionError": [5, 3, 1, ""], "DagsterInvariantViolationError": [5, 3, 1, ""], "DagsterLogManager": [46, 1, 1, ""], "DagsterPipelineRunMetadataValue": [49, 1, 1, ""], "DagsterResourceFunctionError": [5, 3, 1, ""], "DagsterRunNotFoundError": [5, 3, 1, ""], "DagsterRunStatus": [9, 1, 1, ""], "DagsterStepOutputNotFoundError": [5, 3, 1, ""], "DagsterSubprocessError": [5, 3, 1, ""], "DagsterType": [57, 1, 1, ""], "DagsterTypeCheckDidNotPass": [5, 3, 1, ""], "DagsterTypeCheckError": [5, 3, 1, ""], "DagsterTypeLoader": [57, 1, 1, ""], "DagsterTypeMaterializer": [57, 1, 1, ""], "DagsterUnknownResourceError": [5, 3, 1, ""], "DagsterUnmetExecutorRequirementsError": [5, 3, 1, ""], "DagsterUserCodeExecutionError": [5, 3, 1, ""], "DependencyDefinition": [51, 1, 1, ""], "Dict": [57, 0, 1, ""], "DynamicOut": [4, 1, 1, ""], "DynamicOutput": [4, 1, 1, ""], "Enum": [3, 1, 1, ""], "EnumValue": [3, 1, 1, ""], "ExecuteInProcessResult": [6, 1, 1, ""], "Executor": [9, 1, 1, ""], "ExecutorDefinition": [9, 1, 1, ""], "ExpectationResult": [49, 1, 1, ""], "ExperimentalWarning": [58, 1, 1, ""], "Failure": [49, 1, 1, ""], "Field": [3, 1, 1, ""], "FileHandle": [57, 1, 1, ""], "Float": [57, 0, 1, ""], "FloatMetadataValue": [49, 1, 1, ""], "GraphDefinition": [7, 1, 1, ""], "GraphIn": [7, 1, 1, ""], "GraphOut": [7, 1, 1, ""], "HookContext": [8, 1, 1, ""], "HookDefinition": [8, 1, 1, ""], "IOManager": [10, 1, 1, ""], "IOManagerDefinition": [10, 1, 1, ""], "In": [49, 1, 1, ""], "InitExecutorContext": [9, 1, 1, ""], "InitLoggerContext": [46, 1, 1, ""], "InitResourceContext": [54, 1, 1, ""], "InputContext": [10, 1, 1, ""], "InputDefinition": [56, 1, 1, ""], "InputMapping": [56, 1, 1, ""], "Int": [57, 0, 1, ""], "IntMetadataValue": [49, 1, 1, ""], "IntSource": [3, 0, 1, ""], "Jitter": [49, 1, 1, ""], "JobDefinition": [11, 1, 1, ""], "JsonMetadataValue": [49, 1, 1, ""], "List": [57, 0, 1, ""], "LocalFileHandle": [57, 1, 1, ""], "LoggerDefinition": [46, 1, 1, ""], "MEMOIZED_RUN_TAG": [47, 0, 1, ""], "Map": [3, 1, 1, ""], "MarkdownMetadataValue": [49, 1, 1, ""], "MemoizableIOManager": [47, 1, 1, ""], "MetadataEntry": [49, 1, 1, ""], "MetadataValue": [49, 1, 1, ""], "ModeDefinition": [48, 1, 1, ""], "MultiDependencyDefinition": [51, 1, 1, ""], "Noneable": [3, 1, 1, ""], "Nothing": [57, 0, 1, ""], 
"OpDefinition": [49, 1, 1, ""], "OpExecutionContext": [6, 1, 1, ""], "Optional": [57, 0, 1, ""], "Out": [49, 1, 1, ""], "Output": [49, 1, 1, ""], "OutputContext": [10, 1, 1, ""], "OutputDefinition": [56, 1, 1, ""], "OutputMapping": [56, 1, 1, ""], "Partition": [50, 1, 1, ""], "PartitionScheduleDefinition": [55, 1, 1, ""], "PartitionSetDefinition": [50, 1, 1, ""], "PartitionedConfig": [50, 1, 1, ""], "PathMetadataValue": [49, 1, 1, ""], "Permissive": [3, 1, 1, ""], "PipelineDefinition": [51, 1, 1, ""], "PipelineExecutionResult": [51, 1, 1, ""], "PipelineFailureSensorContext": [55, 1, 1, ""], "PipelineRun": [9, 1, 1, ""], "PipelineRunStatus": [9, 0, 1, ""], "PresetDefinition": [52, 1, 1, ""], "PythonArtifactMetadataValue": [49, 1, 1, ""], "PythonObjectDagsterType": [57, 4, 1, ""], "RepositoryData": [53, 1, 1, ""], "RepositoryDefinition": [53, 1, 1, ""], "ResourceDefinition": [54, 1, 1, ""], "RetryPolicy": [49, 1, 1, ""], "RetryRequested": [49, 1, 1, ""], "RootInputManager": [10, 1, 1, ""], "RootInputManagerDefinition": [10, 1, 1, ""], "RunFailureSensorContext": [55, 1, 1, ""], "RunRequest": [55, 1, 1, ""], "RunStatusSensorContext": [55, 1, 1, ""], "RunStatusSensorDefinition": [55, 1, 1, ""], "ScalarUnion": [3, 1, 1, ""], "ScheduleDefinition": [55, 1, 1, ""], "ScheduleEvaluationContext": [55, 1, 1, ""], "Selector": [3, 1, 1, ""], "SensorDefinition": [55, 1, 1, ""], "SensorEvaluationContext": [55, 1, 1, ""], "Set": [57, 0, 1, ""], "Shape": [3, 1, 1, ""], "SkipReason": [55, 1, 1, ""], "SolidDefinition": [56, 1, 1, ""], "SolidExecutionContext": [56, 1, 1, ""], "SolidExecutionResult": [56, 1, 1, ""], "SolidInvocation": [51, 0, 1, ""], "SourceAsset": [1, 1, 1, ""], "SourceHashVersionStrategy": [47, 1, 1, ""], "String": [57, 0, 1, ""], "StringSource": [3, 0, 1, ""], "TableColumn": [49, 1, 1, ""], "TableColumnConstraints": [49, 1, 1, ""], "TableConstraints": [49, 1, 1, ""], "TableMetadataValue": [49, 1, 1, ""], "TableRecord": [49, 1, 1, ""], "TableSchema": [49, 1, 1, ""], "TableSchemaMetadataValue": [49, 1, 1, ""], "TextMetadataValue": [49, 1, 1, ""], "Tuple": [57, 0, 1, ""], "TypeCheck": [49, 1, 1, ""], "TypeCheckContext": [6, 1, 1, ""], "UrlMetadataValue": [49, 1, 1, ""], "VersionStrategy": [47, 1, 1, ""], "asset": [1, 4, 1, ""], "asset_sensor": [55, 4, 1, ""], "build_assets_job": [1, 4, 1, ""], "build_hook_context": [8, 4, 1, ""], "build_init_logger_context": [46, 4, 1, ""], "build_init_resource_context": [54, 4, 1, ""], "build_input_context": [10, 4, 1, ""], "build_op_context": [6, 4, 1, ""], "build_output_context": [10, 4, 1, ""], "build_reconstructable_job": [11, 4, 1, ""], "build_resources": [54, 4, 1, ""], "build_run_status_sensor_context": [55, 4, 1, ""], "build_schedule_context": [55, 4, 1, ""], "build_schedule_from_partitioned_job": [55, 4, 1, ""], "build_sensor_context": [55, 4, 1, ""], "build_solid_context": [56, 4, 1, ""], "check_dagster_type": [57, 4, 1, ""], "composite_solid": [56, 4, 1, ""], "config_from_files": [58, 4, 1, ""], "config_from_pkg_resources": [58, 4, 1, ""], "config_from_yaml_strings": [58, 4, 1, ""], "configured": [3, 4, 1, ""], "create_offset_partition_selector": [50, 4, 1, ""], "custom_path_fs_io_manager": [10, 6, 1, ""], "dagster_type_loader": [57, 4, 1, ""], "dagster_type_materializer": [57, 4, 1, ""], "daily_partitioned_config": [50, 4, 1, ""], "date_partition_range": [50, 4, 1, ""], "default_executors": [51, 6, 1, ""], "dynamic_partitioned_config": [50, 4, 1, ""], "execute_pipeline": [51, 4, 1, ""], "execute_pipeline_iterator": [51, 4, 1, ""], "execute_solid": 
[56, 4, 1, ""], "execute_solid_within_pipeline": [56, 4, 1, ""], "execute_solids_within_pipeline": [56, 4, 1, ""], "executor": [9, 4, 1, ""], "failure_hook": [8, 4, 1, ""], "file_relative_path": [58, 4, 1, ""], "fs_asset_io_manager": [1, 6, 1, ""], "fs_io_manager": [10, 6, 1, ""], "get_dagster_logger": [58, 4, 1, ""], "graph": [7, 4, 1, ""], "hourly_partitioned_config": [50, 4, 1, ""], "identity_partition_selector": [50, 4, 1, ""], "in_process_executor": [6, 6, 1, ""], "io_manager": [10, 4, 1, ""], "job": [11, 4, 1, ""], "local_file_manager": [9, 6, 1, ""], "logger": [46, 4, 1, ""], "make_python_type_usable_as_dagster_type": [57, 4, 1, ""], "make_values_resource": [54, 4, 1, ""], "mem_io_manager": [10, 6, 1, ""], "monthly_partitioned_config": [50, 4, 1, ""], "multi_asset": [1, 4, 1, ""], "multiprocess_executor": [6, 6, 1, ""], "op": [49, 4, 1, ""], "pipeline": [51, 4, 1, ""], "pipeline_failure_sensor": [55, 4, 1, ""], "reconstructable": [6, 1, 1, ""], "reexecute_pipeline": [51, 4, 1, ""], "reexecute_pipeline_iterator": [51, 4, 1, ""], "repository": [53, 6, 1, ""], "resource": [54, 4, 1, ""], "root_input_manager": [10, 4, 1, ""], "run_failure_sensor": [55, 4, 1, ""], "run_status_sensor": [55, 4, 1, ""], "schedule": [55, 4, 1, ""], "sensor": [55, 4, 1, ""], "solid": [56, 4, 1, ""], "static_partitioned_config": [50, 4, 1, ""], "success_hook": [8, 4, 1, ""], "usable_as_dagster_type": [57, 4, 1, ""], "validate_run_config": [6, 4, 1, ""], "weekly_partitioned_config": [50, 4, 1, ""]}, "dagster-api-grpc": {"--attribute": [2, 7, 1, "cmdoption-dagster-api-grpc-a"], "--empty-working-directory": [2, 7, 1, "cmdoption-dagster-api-grpc-empty-working-directory"], "--fixed-server-id": [2, 7, 1, "cmdoption-dagster-api-grpc-fixed-server-id"], "--heartbeat": [2, 7, 1, "cmdoption-dagster-api-grpc-heartbeat"], "--heartbeat-timeout": [2, 7, 1, "cmdoption-dagster-api-grpc-heartbeat-timeout"], "--host": [2, 7, 1, "cmdoption-dagster-api-grpc-h"], "--ipc-output-file": [2, 7, 1, "cmdoption-dagster-api-grpc-ipc-output-file"], "--lazy-load-user-code": [2, 7, 1, "cmdoption-dagster-api-grpc-lazy-load-user-code"], "--log-level": [2, 7, 1, "cmdoption-dagster-api-grpc-log-level"], "--max_workers": [2, 7, 1, "cmdoption-dagster-api-grpc-n"], "--module-name": [2, 7, 1, "cmdoption-dagster-api-grpc-m"], "--override-system-timezone": [2, 7, 1, "cmdoption-dagster-api-grpc-override-system-timezone"], "--package-name": [2, 7, 1, "cmdoption-dagster-api-grpc-package-name"], "--port": [2, 7, 1, "cmdoption-dagster-api-grpc-p"], "--python-file": [2, 7, 1, "cmdoption-dagster-api-grpc-f"], "--socket": [2, 7, 1, "cmdoption-dagster-api-grpc-s"], "--use-python-environment-entry-point": [2, 7, 1, "cmdoption-dagster-api-grpc-use-python-environment-entry-point"], "--working-directory": [2, 7, 1, "cmdoption-dagster-api-grpc-d"], "-a": [2, 7, 1, "cmdoption-dagster-api-grpc-a"], "-d": [2, 7, 1, "cmdoption-dagster-api-grpc-d"], "-f": [2, 7, 1, "cmdoption-dagster-api-grpc-f"], "-h": [2, 7, 1, "cmdoption-dagster-api-grpc-h"], "-m": [2, 7, 1, "cmdoption-dagster-api-grpc-m"], "-n": [2, 7, 1, "cmdoption-dagster-api-grpc-n"], "-p": [2, 7, 1, "cmdoption-dagster-api-grpc-p"], "-s": [2, 7, 1, "cmdoption-dagster-api-grpc-s"]}, "dagster-celery-worker-list": {"--config-yaml": [16, 7, 1, "cmdoption-dagster-celery-worker-list-y"], "-y": [16, 7, 1, "cmdoption-dagster-celery-worker-list-y"]}, "dagster-celery-worker-start": {"--app": [16, 7, 1, "cmdoption-dagster-celery-worker-start-A"], "--background": [16, 7, 1, "cmdoption-dagster-celery-worker-start-d"], 
"--config-yaml": [16, 7, 1, "cmdoption-dagster-celery-worker-start-y"], "--includes": [16, 7, 1, "cmdoption-dagster-celery-worker-start-i"], "--loglevel": [16, 7, 1, "cmdoption-dagster-celery-worker-start-l"], "--name": [16, 7, 1, "cmdoption-dagster-celery-worker-start-n"], "--queue": [16, 7, 1, "cmdoption-dagster-celery-worker-start-q"], "-A": [16, 7, 1, "cmdoption-dagster-celery-worker-start-A"], "-d": [16, 7, 1, "cmdoption-dagster-celery-worker-start-d"], "-i": [16, 7, 1, "cmdoption-dagster-celery-worker-start-i"], "-l": [16, 7, 1, "cmdoption-dagster-celery-worker-start-l"], "-n": [16, 7, 1, "cmdoption-dagster-celery-worker-start-n"], "-q": [16, 7, 1, "cmdoption-dagster-celery-worker-start-q"], "-y": [16, 7, 1, "cmdoption-dagster-celery-worker-start-y"], "ADDITIONAL_ARGS": [16, 7, 1, "cmdoption-dagster-celery-worker-start-arg-ADDITIONAL_ARGS"]}, "dagster-celery-worker-terminate": {"--all": [16, 7, 1, "cmdoption-dagster-celery-worker-terminate-a"], "--config-yaml": [16, 7, 1, "cmdoption-dagster-celery-worker-terminate-y"], "-a": [16, 7, 1, "cmdoption-dagster-celery-worker-terminate-a"], "-y": [16, 7, 1, "cmdoption-dagster-celery-worker-terminate-y"], "NAME": [16, 7, 1, "cmdoption-dagster-celery-worker-terminate-arg-NAME"]}, "dagster-daemon-run": {"--attribute": [2, 7, 1, "cmdoption-dagster-daemon-run-a"], "--empty-workspace": [2, 7, 1, "cmdoption-dagster-daemon-run-empty-workspace"], "--grpc-host": [2, 7, 1, "cmdoption-dagster-daemon-run-grpc-host"], "--grpc-port": [2, 7, 1, "cmdoption-dagster-daemon-run-grpc-port"], "--grpc-socket": [2, 7, 1, "cmdoption-dagster-daemon-run-grpc-socket"], "--module-name": [2, 7, 1, "cmdoption-dagster-daemon-run-m"], "--package-name": [2, 7, 1, "cmdoption-dagster-daemon-run-package-name"], "--python-file": [2, 7, 1, "cmdoption-dagster-daemon-run-f"], "--use-ssl": [2, 7, 1, "cmdoption-dagster-daemon-run-use-ssl"], "--working-directory": [2, 7, 1, "cmdoption-dagster-daemon-run-d"], "--workspace": [2, 7, 1, "cmdoption-dagster-daemon-run-w"], "-a": [2, 7, 1, "cmdoption-dagster-daemon-run-a"], "-d": [2, 7, 1, "cmdoption-dagster-daemon-run-d"], "-f": [2, 7, 1, "cmdoption-dagster-daemon-run-f"], "-m": [2, 7, 1, "cmdoption-dagster-daemon-run-m"], "-w": [2, 7, 1, "cmdoption-dagster-daemon-run-w"]}, "dagster-graphql": {"--attribute": [2, 7, 1, "cmdoption-dagster-graphql-a"], "--empty-workspace": [2, 7, 1, "cmdoption-dagster-graphql-empty-workspace"], "--ephemeral-instance": [2, 7, 1, "cmdoption-dagster-graphql-ephemeral-instance"], "--file": [2, 7, 1, "cmdoption-dagster-graphql-f"], "--grpc-host": [2, 7, 1, "cmdoption-dagster-graphql-grpc-host"], "--grpc-port": [2, 7, 1, "cmdoption-dagster-graphql-grpc-port"], "--grpc-socket": [2, 7, 1, "cmdoption-dagster-graphql-grpc-socket"], "--module-name": [2, 7, 1, "cmdoption-dagster-graphql-m"], "--output": [2, 7, 1, "cmdoption-dagster-graphql-o"], "--package-name": [2, 7, 1, "cmdoption-dagster-graphql-package-name"], "--predefined": [2, 7, 1, "cmdoption-dagster-graphql-p"], "--python-file": [2, 7, 1, "cmdoption-dagster-graphql-0"], "--remote": [2, 7, 1, "cmdoption-dagster-graphql-r"], "--text": [2, 7, 1, "cmdoption-dagster-graphql-t"], "--use-ssl": [2, 7, 1, "cmdoption-dagster-graphql-use-ssl"], "--variables": [2, 7, 1, "cmdoption-dagster-graphql-v"], "--version": [2, 7, 1, "cmdoption-dagster-graphql-version"], "--working-directory": [2, 7, 1, "cmdoption-dagster-graphql-d"], "--workspace": [2, 7, 1, "cmdoption-dagster-graphql-w"], "-a": [2, 7, 1, "cmdoption-dagster-graphql-a"], "-d": [2, 7, 1, "cmdoption-dagster-graphql-d"], 
"-f": [2, 7, 1, "cmdoption-dagster-graphql-0"], "-m": [2, 7, 1, "cmdoption-dagster-graphql-m"], "-o": [2, 7, 1, "cmdoption-dagster-graphql-o"], "-p": [2, 7, 1, "cmdoption-dagster-graphql-p"], "-r": [2, 7, 1, "cmdoption-dagster-graphql-r"], "-t": [2, 7, 1, "cmdoption-dagster-graphql-t"], "-v": [2, 7, 1, "cmdoption-dagster-graphql-v"], "-w": [2, 7, 1, "cmdoption-dagster-graphql-w"]}, "dagster.AssetGroup": {"all_assets_job_name": [1, 2, 1, ""], "build_job": [1, 2, 1, ""], "from_current_module": [1, 2, 1, ""], "from_modules": [1, 2, 1, ""], "from_package_module": [1, 2, 1, ""], "from_package_name": [1, 2, 1, ""]}, "dagster.AssetMaterialization": {"file": [49, 2, 1, ""]}, "dagster.CompositeSolidDefinition": {"configured": [56, 2, 1, ""]}, "dagster.CompositeSolidExecutionResult": {"output_for_solid": [56, 2, 1, ""], "result_for_handle": [56, 2, 1, ""], "result_for_solid": [56, 2, 1, ""], "solid_result_list": [56, 2, 1, ""], "step_event_list": [56, 2, 1, ""], "success": [56, 2, 1, ""]}, "dagster.DagsterEvent": {"event_specific_data": [6, 0, 1, ""], "event_type": [6, 2, 1, ""], "event_type_value": [6, 0, 1, ""], "logging_tags": [6, 0, 1, ""], "message": [6, 0, 1, ""], "pid": [6, 0, 1, ""], "pipeline_name": [6, 0, 1, ""], "solid_handle": [6, 0, 1, ""], "step_key": [6, 0, 1, ""], "step_kind_value": [6, 0, 1, ""]}, "dagster.DagsterEventType": {"ALERT_FAILURE": [6, 0, 1, ""], "ALERT_START": [6, 0, 1, ""], "ALERT_SUCCESS": [6, 0, 1, ""], "ASSET_MATERIALIZATION": [6, 0, 1, ""], "ASSET_MATERIALIZATION_PLANNED": [6, 0, 1, ""], "ASSET_OBSERVATION": [6, 0, 1, ""], "ASSET_STORE_OPERATION": [6, 0, 1, ""], "ENGINE_EVENT": [6, 0, 1, ""], "HANDLED_OUTPUT": [6, 0, 1, ""], "HOOK_COMPLETED": [6, 0, 1, ""], "HOOK_ERRORED": [6, 0, 1, ""], "HOOK_SKIPPED": [6, 0, 1, ""], "LOADED_INPUT": [6, 0, 1, ""], "LOGS_CAPTURED": [6, 0, 1, ""], "OBJECT_STORE_OPERATION": [6, 0, 1, ""], "PIPELINE_CANCELED": [6, 0, 1, ""], "PIPELINE_CANCELING": [6, 0, 1, ""], "PIPELINE_DEQUEUED": [6, 0, 1, ""], "PIPELINE_ENQUEUED": [6, 0, 1, ""], "PIPELINE_FAILURE": [6, 0, 1, ""], "PIPELINE_START": [6, 0, 1, ""], "PIPELINE_STARTING": [6, 0, 1, ""], "PIPELINE_SUCCESS": [6, 0, 1, ""], "RUN_CANCELED": [6, 0, 1, ""], "RUN_CANCELING": [6, 0, 1, ""], "RUN_DEQUEUED": [6, 0, 1, ""], "RUN_ENQUEUED": [6, 0, 1, ""], "RUN_FAILURE": [6, 0, 1, ""], "RUN_START": [6, 0, 1, ""], "RUN_STARTING": [6, 0, 1, ""], "RUN_SUCCESS": [6, 0, 1, ""], "STEP_EXPECTATION_RESULT": [6, 0, 1, ""], "STEP_FAILURE": [6, 0, 1, ""], "STEP_INPUT": [6, 0, 1, ""], "STEP_OUTPUT": [6, 0, 1, ""], "STEP_RESTARTED": [6, 0, 1, ""], "STEP_SKIPPED": [6, 0, 1, ""], "STEP_START": [6, 0, 1, ""], "STEP_SUCCESS": [6, 0, 1, ""], "STEP_UP_FOR_RETRY": [6, 0, 1, ""]}, "dagster.DagsterInstance": {"add_daemon_heartbeat": [9, 2, 1, ""], "get_addresses_for_step_output_versions": [9, 2, 1, ""], "get_daemon_heartbeats": [9, 2, 1, ""], "launch_run": [9, 2, 1, ""], "report_engine_event": [9, 2, 1, ""], "resume_run": [9, 2, 1, ""], "should_start_background_run_thread": [9, 2, 1, ""], "submit_run": [9, 2, 1, ""]}, "dagster.DagsterRunStatus": {"CANCELED": [9, 0, 1, ""], "CANCELING": [9, 0, 1, ""], "FAILURE": [9, 0, 1, ""], "MANAGED": [9, 0, 1, ""], "NOT_STARTED": [9, 0, 1, ""], "QUEUED": [9, 0, 1, ""], "STARTED": [9, 0, 1, ""], "STARTING": [9, 0, 1, ""], "SUCCESS": [9, 0, 1, ""]}, "dagster.Enum": {"from_python_enum": [3, 2, 1, ""]}, "dagster.ExecuteInProcessResult": {"all_events": [6, 2, 1, ""], "all_node_events": [6, 2, 1, ""], "dagster_run": [6, 2, 1, ""], "events_for_node": [6, 2, 1, ""], "get_job_failure_event": [6, 
2, 1, ""], "get_job_success_event": [6, 2, 1, ""], "output_for_node": [6, 2, 1, ""], "output_value": [6, 2, 1, ""], "run_id": [6, 2, 1, ""], "success": [6, 2, 1, ""]}, "dagster.Executor": {"execute": [9, 2, 1, ""], "retries": [9, 2, 1, ""]}, "dagster.ExecutorDefinition": {"configured": [9, 2, 1, ""]}, "dagster.FileHandle": {"path_desc": [57, 2, 1, ""]}, "dagster.GraphDefinition": {"execute_in_process": [7, 2, 1, ""], "to_job": [7, 2, 1, ""]}, "dagster.HookContext": {"hook_def": [8, 0, 1, ""], "job_name": [8, 0, 1, ""], "log": [8, 0, 1, ""], "mode_def": [8, 0, 1, ""], "op": [8, 0, 1, ""], "op_config": [8, 0, 1, ""], "op_exception": [8, 0, 1, ""], "op_output_values": [8, 0, 1, ""], "pipeline_name": [8, 0, 1, ""], "required_resource_keys": [8, 0, 1, ""], "resources": [8, 0, 1, ""], "run_id": [8, 0, 1, ""], "solid": [8, 0, 1, ""], "solid_config": [8, 0, 1, ""], "solid_exception": [8, 2, 1, ""], "solid_output_values": [8, 2, 1, ""], "step_key": [8, 0, 1, ""]}, "dagster.IOManager": {"get_input_asset_key": [10, 2, 1, ""], "get_input_asset_partitions": [10, 2, 1, ""], "get_output_asset_key": [10, 2, 1, ""], "get_output_asset_partitions": [10, 2, 1, ""], "handle_output": [10, 2, 1, ""], "load_input": [10, 2, 1, ""]}, "dagster.IOManagerDefinition": {"hardcoded_io_manager": [10, 2, 1, ""], "input_config_schema": [10, 2, 1, ""], "output_config_schema": [10, 2, 1, ""]}, "dagster.InitExecutorContext": {"executor_config": [9, 0, 1, ""], "executor_def": [9, 0, 1, ""], "instance": [9, 0, 1, ""], "job": [9, 0, 1, ""]}, "dagster.InitLoggerContext": {"logger_config": [46, 0, 1, ""], "logger_def": [46, 0, 1, ""], "pipeline_def": [46, 0, 1, ""], "run_id": [46, 0, 1, ""]}, "dagster.InitResourceContext": {"dagster_run": [54, 0, 1, ""], "log_manager": [54, 0, 1, ""], "pipeline_run": [54, 0, 1, ""], "resource_config": [54, 0, 1, ""], "resource_def": [54, 0, 1, ""], "resources": [54, 0, 1, ""], "run_id": [54, 0, 1, ""]}, "dagster.InputContext": {"add_input_metadata": [10, 2, 1, ""], "asset_partition_key": [10, 2, 1, ""], "asset_partition_key_range": [10, 2, 1, ""], "asset_partitions_time_window": [10, 2, 1, ""], "config": [10, 0, 1, ""], "consume_events": [10, 2, 1, ""], "dagster_type": [10, 0, 1, ""], "get_observations": [10, 2, 1, ""], "has_input_name": [10, 2, 1, ""], "has_partition_key": [10, 2, 1, ""], "log": [10, 0, 1, ""], "metadata": [10, 0, 1, ""], "name": [10, 0, 1, ""], "op_def": [10, 0, 1, ""], "partition_key": [10, 2, 1, ""], "pipeline_name": [10, 0, 1, ""], "resource_config": [10, 0, 1, ""], "resources": [10, 0, 1, ""], "solid_def": [10, 0, 1, ""], "upstream_output": [10, 0, 1, ""]}, "dagster.JobDefinition": {"execute_in_process": [11, 2, 1, ""], "with_hooks": [11, 2, 1, ""]}, "dagster.LoggerDefinition": {"configured": [46, 2, 1, ""]}, "dagster.MemoizableIOManager": {"has_output": [47, 2, 1, ""]}, "dagster.MetadataEntry": {"asset": [49, 2, 1, ""], "float": [49, 2, 1, ""], "fspath": [49, 2, 1, ""], "int": [49, 2, 1, ""], "json": [49, 2, 1, ""], "md": [49, 2, 1, ""], "path": [49, 2, 1, ""], "table": [49, 2, 1, ""], "table_schema": [49, 2, 1, ""], "text": [49, 2, 1, ""], "url": [49, 2, 1, ""], "value": [49, 2, 1, ""]}, "dagster.MetadataValue": {"asset": [49, 2, 1, ""], "dagster_run": [49, 2, 1, ""], "float": [49, 2, 1, ""], "int": [49, 2, 1, ""], "json": [49, 2, 1, ""], "md": [49, 2, 1, ""], "path": [49, 2, 1, ""], "python_artifact": [49, 2, 1, ""], "table": [49, 2, 1, ""], "table_schema": [49, 2, 1, ""], "text": [49, 2, 1, ""], "url": [49, 2, 1, ""]}, "dagster.OpDefinition": {"configured": [49, 2, 1, 
""]}, "dagster.OpExecutionContext": {"add_output_metadata": [6, 2, 1, ""], "consume_events": [6, 2, 1, ""], "get_mapping_key": [6, 2, 1, ""], "get_tag": [6, 2, 1, ""], "has_partition_key": [6, 2, 1, ""], "has_tag": [6, 2, 1, ""], "instance": [6, 2, 1, ""], "log": [6, 2, 1, ""], "log_event": [6, 2, 1, ""], "mode_def": [6, 2, 1, ""], "output_asset_partition_key": [6, 2, 1, ""], "output_asset_partitions_time_window": [6, 2, 1, ""], "partition_key": [6, 2, 1, ""], "pdb": [6, 2, 1, ""], "pipeline_def": [6, 2, 1, ""], "pipeline_name": [6, 2, 1, ""], "pipeline_run": [6, 2, 1, ""], "resources": [6, 2, 1, ""], "retry_number": [6, 2, 1, ""], "run_config": [6, 2, 1, ""], "run_id": [6, 2, 1, ""], "solid_config": [6, 2, 1, ""], "solid_def": [6, 2, 1, ""], "step_launcher": [6, 2, 1, ""]}, "dagster.OutputContext": {"add_output_metadata": [10, 2, 1, ""], "asset_partition_key": [10, 2, 1, ""], "asset_partition_key_range": [10, 2, 1, ""], "asset_partitions_time_window": [10, 2, 1, ""], "config": [10, 0, 1, ""], "consume_events": [10, 2, 1, ""], "consume_logged_metadata_entries": [10, 2, 1, ""], "dagster_type": [10, 0, 1, ""], "get_logged_events": [10, 2, 1, ""], "get_logged_metadata_entries": [10, 2, 1, ""], "get_output_identifier": [10, 2, 1, ""], "get_run_scoped_output_identifier": [10, 2, 1, ""], "has_partition_key": [10, 2, 1, ""], "log": [10, 0, 1, ""], "log_event": [10, 2, 1, ""], "mapping_key": [10, 0, 1, ""], "metadata": [10, 0, 1, ""], "name": [10, 0, 1, ""], "op_def": [10, 0, 1, ""], "partition_key": [10, 2, 1, ""], "pipeline_name": [10, 0, 1, ""], "resource_config": [10, 0, 1, ""], "resources": [10, 0, 1, ""], "run_id": [10, 0, 1, ""], "solid_def": [10, 0, 1, ""], "step_key": [10, 0, 1, ""], "version": [10, 0, 1, ""]}, "dagster.PartitionSetDefinition": {"create_schedule_definition": [50, 2, 1, ""], "get_partitions": [50, 2, 1, ""]}, "dagster.PartitionedConfig": {"get_run_config_for_partition_key": [50, 2, 1, ""]}, "dagster.PipelineExecutionResult": {"output_for_solid": [51, 2, 1, ""], "result_for_handle": [51, 2, 1, ""], "result_for_solid": [51, 2, 1, ""], "solid_result_list": [51, 2, 1, ""], "step_event_list": [51, 2, 1, ""], "success": [51, 2, 1, ""]}, "dagster.PipelineFailureSensorContext": {"failure_event": [55, 0, 1, ""], "pipeline_run": [55, 0, 1, ""], "sensor_name": [55, 0, 1, ""]}, "dagster.PresetDefinition": {"from_files": [52, 2, 1, ""], "from_pkg_resources": [52, 2, 1, ""], "from_yaml_strings": [52, 2, 1, ""], "get_environment_yaml": [52, 2, 1, ""], "with_additional_config": [52, 2, 1, ""]}, "dagster.RepositoryDefinition": {"get_all_jobs": [53, 2, 1, ""], "get_job": [53, 2, 1, ""], "has_job": [53, 2, 1, ""], "job_names": [53, 2, 1, ""]}, "dagster.ResourceDefinition": {"configured": [54, 2, 1, ""], "hardcoded_resource": [54, 2, 1, ""], "mock_resource": [54, 2, 1, ""], "none_resource": [54, 2, 1, ""]}, "dagster.RootInputManager": {"load_input": [10, 2, 1, ""]}, "dagster.RootInputManagerDefinition": {"input_config_schema": [10, 2, 1, ""]}, "dagster.RunFailureSensorContext": {"failure_event": [55, 0, 1, ""], "pipeline_run": [55, 0, 1, ""], "sensor_name": [55, 0, 1, ""]}, "dagster.RunRequest": {"job_name": [55, 0, 1, ""], "run_config": [55, 0, 1, ""], "run_key": [55, 0, 1, ""], "tags": [55, 0, 1, ""]}, "dagster.RunStatusSensorContext": {"dagster_event": [55, 0, 1, ""], "dagster_run": [55, 0, 1, ""], "for_run_failure": [55, 2, 1, ""], "instance": [55, 0, 1, ""], "sensor_name": [55, 0, 1, ""]}, "dagster.ScheduleEvaluationContext": {"instance_ref": [55, 0, 1, ""], "scheduled_execution_time": 
[55, 0, 1, ""]}, "dagster.SensorEvaluationContext": {"cursor": [55, 0, 1, ""], "instance": [55, 0, 1, ""], "instance_ref": [55, 0, 1, ""], "last_completion_time": [55, 0, 1, ""], "last_run_key": [55, 0, 1, ""], "repository_name": [55, 0, 1, ""]}, "dagster.SkipReason": {"skip_message": [55, 0, 1, ""]}, "dagster.SolidDefinition": {"configured": [56, 2, 1, ""]}, "dagster.SolidExecutionContext": {"add_output_metadata": [56, 2, 1, ""], "consume_events": [56, 2, 1, ""], "get_mapping_key": [56, 2, 1, ""], "get_tag": [56, 2, 1, ""], "has_partition_key": [56, 2, 1, ""], "has_tag": [56, 2, 1, ""], "instance": [56, 2, 1, ""], "log": [56, 2, 1, ""], "log_event": [56, 2, 1, ""], "mode_def": [56, 2, 1, ""], "output_asset_partition_key": [56, 2, 1, ""], "output_asset_partitions_time_window": [56, 2, 1, ""], "partition_key": [56, 2, 1, ""], "pdb": [56, 2, 1, ""], "pipeline_def": [56, 2, 1, ""], "pipeline_name": [56, 2, 1, ""], "pipeline_run": [56, 2, 1, ""], "resources": [56, 2, 1, ""], "retry_number": [56, 2, 1, ""], "run_config": [56, 2, 1, ""], "run_id": [56, 2, 1, ""], "solid_config": [56, 2, 1, ""], "solid_def": [56, 2, 1, ""], "step_launcher": [56, 2, 1, ""]}, "dagster.SolidExecutionResult": {"compute_input_event_dict": [56, 2, 1, ""], "compute_output_events_dict": [56, 2, 1, ""], "compute_step_events": [56, 2, 1, ""], "compute_step_failure_event": [56, 2, 1, ""], "expectation_events_during_compute": [56, 2, 1, ""], "expectation_results_during_compute": [56, 2, 1, ""], "failure_data": [56, 2, 1, ""], "get_output_event_for_compute": [56, 2, 1, ""], "get_output_events_for_compute": [56, 2, 1, ""], "get_step_success_event": [56, 2, 1, ""], "input_events_during_compute": [56, 2, 1, ""], "materialization_events_during_compute": [56, 2, 1, ""], "materializations_during_compute": [56, 2, 1, ""], "output_events_during_compute": [56, 2, 1, ""], "output_value": [56, 2, 1, ""], "output_values": [56, 2, 1, ""], "retry_attempts": [56, 2, 1, ""], "skipped": [56, 2, 1, ""], "success": [56, 2, 1, ""]}, "dagster.SourceAsset": {"description": [1, 0, 1, ""], "io_manager_key": [1, 0, 1, ""], "key": [1, 0, 1, ""], "metadata_entries": [1, 0, 1, ""], "partitions_def": [1, 0, 1, ""]}, "dagster.TypeCheckContext": {"log": [6, 0, 1, ""], "resources": [6, 0, 1, ""], "run_id": [6, 0, 1, ""]}, "dagster.core": {"errors": [5, 5, 0, "-"]}, "dagster.core.definitions.reconstruct": {"ReconstructablePipeline": [51, 1, 1, ""]}, "dagster.core.definitions.reconstruct.ReconstructablePipeline": {"get_module": [51, 2, 1, ""]}, "dagster.core.errors": {"user_code_error_boundary": [9, 4, 1, ""]}, "dagster.core.instance": {"InstanceRef": [9, 1, 1, ""]}, "dagster.core.launcher": {"DefaultRunLauncher": [9, 1, 1, ""], "RunLauncher": [9, 1, 1, ""]}, "dagster.core.run_coordinator": {"DefaultRunCoordinator": [9, 1, 1, ""], "QueuedRunCoordinator": [9, 6, 1, ""]}, "dagster.core.scheduler": {"DagsterDaemonScheduler": [55, 6, 1, ""], "Scheduler": [9, 1, 1, ""]}, "dagster.core.storage.compute_log_manager": {"ComputeLogManager": [9, 1, 1, ""]}, "dagster.core.storage.event_log": {"ConsolidatedSqliteEventLogStorage": [9, 1, 1, ""], "EventLogEntry": [9, 1, 1, ""], "EventLogRecord": [9, 1, 1, ""], "EventLogStorage": [9, 1, 1, ""], "EventRecordsFilter": [9, 1, 1, ""], "RunShardedEventsCursor": [9, 1, 1, ""], "SqlEventLogStorage": [9, 1, 1, ""], "SqliteEventLogStorage": [9, 1, 1, ""]}, "dagster.core.storage.file_manager": {"FileManager": [9, 1, 1, ""]}, "dagster.core.storage.file_manager.FileManager": {"copy_handle_to_local_temp": [9, 2, 1, ""], 
"delete_local_temp": [9, 2, 1, ""], "read": [9, 2, 1, ""], "read_data": [9, 2, 1, ""], "write": [9, 2, 1, ""], "write_data": [9, 2, 1, ""]}, "dagster.core.storage.local_compute_log_manager": {"LocalComputeLogManager": [9, 1, 1, ""]}, "dagster.core.storage.root": {"LocalArtifactStorage": [9, 1, 1, ""]}, "dagster.core.storage.root.LocalArtifactStorage": {"config_type": [9, 2, 1, ""], "from_config_value": [9, 2, 1, ""], "inst_data": [9, 2, 1, ""]}, "dagster.core.storage.runs": {"RunStorage": [9, 1, 1, ""], "SqlRunStorage": [9, 1, 1, ""], "SqliteRunStorage": [9, 1, 1, ""]}, "dagster.core.storage.schedules": {"ScheduleStorage": [9, 1, 1, ""], "SqlScheduleStorage": [9, 1, 1, ""], "SqliteScheduleStorage": [9, 1, 1, ""]}, "dagster.loggers": {"colored_console_logger": [46, 4, 1, ""], "json_console_logger": [46, 4, 1, ""]}, "dagster.serdes": {"ConfigurableClass": [9, 1, 1, ""], "ConfigurableClassData": [9, 1, 1, ""]}, "dagster.serdes.ConfigurableClass": {"config_type": [9, 2, 1, ""], "from_config_value": [9, 2, 1, ""], "inst_data": [9, 2, 1, ""]}, "dagster.utils": {"make_email_on_run_failure_sensor": [58, 4, 1, ""]}, "dagster.utils.forked_pdb": {"ForkedPdb": [58, 1, 1, ""]}, "dagster_airbyte": {"AirbyteResource": [12, 1, 1, ""], "airbyte_resource": [12, 6, 1, ""], "airbyte_sync_op": [12, 6, 1, ""], "build_airbyte_assets": [12, 4, 1, ""]}, "dagster_airbyte.AirbyteResource": {"make_request": [12, 2, 1, ""], "sync_and_poll": [12, 2, 1, ""]}, "dagster_airflow": {"make_airflow_dag": [13, 4, 1, ""], "make_airflow_dag_containerized": [13, 4, 1, ""], "make_airflow_dag_for_operator": [13, 4, 1, ""], "make_dagster_job_from_airflow_dag": [13, 4, 1, ""], "make_dagster_pipeline_from_airflow_dag": [13, 4, 1, ""], "make_dagster_repo_from_airflow_dag_bag": [13, 4, 1, ""], "make_dagster_repo_from_airflow_dags_path": [13, 4, 1, ""], "make_dagster_repo_from_airflow_example_dags": [13, 4, 1, ""]}, "dagster_aws.cloudwatch": {"cloudwatch_logger": [14, 6, 1, ""]}, "dagster_aws.ecs": {"EcsRunLauncher": [14, 6, 1, ""]}, "dagster_aws.emr": {"EmrClusterState": [14, 6, 1, ""], "EmrError": [14, 1, 1, ""], "EmrJobRunner": [14, 1, 1, ""], "EmrStepState": [14, 6, 1, ""], "emr_pyspark_step_launcher": [14, 6, 1, ""]}, "dagster_aws.redshift": {"fake_redshift_resource": [14, 6, 1, ""], "redshift_resource": [14, 6, 1, ""]}, "dagster_aws.s3": {"S3ComputeLogManager": [14, 1, 1, ""], "S3Coordinate": [14, 6, 1, ""], "S3FileCache": [14, 1, 1, ""], "S3FileHandle": [14, 1, 1, ""], "s3_file_manager": [14, 6, 1, ""], "s3_pickle_asset_io_manager": [14, 6, 1, ""], "s3_pickle_io_manager": [14, 6, 1, ""], "s3_resource": [14, 6, 1, ""]}, "dagster_aws.s3.S3FileHandle": {"path_desc": [14, 2, 1, ""], "s3_bucket": [14, 2, 1, ""], "s3_key": [14, 2, 1, ""], "s3_path": [14, 2, 1, ""]}, "dagster_aws.secretsmanager": {"secretsmanager_resource": [14, 6, 1, ""], "secretsmanager_secrets_resource": [14, 6, 1, ""]}, "dagster_azure.adls2": {"ADLS2FileHandle": [15, 1, 1, ""], "FakeADLS2Resource": [15, 1, 1, ""], "adls2_file_cache": [15, 6, 1, ""], "adls2_file_manager": [15, 6, 1, ""], "adls2_pickle_asset_io_manager": [15, 6, 1, ""], "adls2_pickle_io_manager": [15, 6, 1, ""], "adls2_resource": [15, 6, 1, ""]}, "dagster_azure.adls2.ADLS2FileHandle": {"account": [15, 2, 1, ""], "adls2_path": [15, 2, 1, ""], "file_system": [15, 2, 1, ""], "key": [15, 2, 1, ""], "path_desc": [15, 2, 1, ""]}, "dagster_azure.blob": {"AzureBlobComputeLogManager": [15, 1, 1, ""]}, "dagster_celery": {"celery_executor": [16, 6, 1, ""]}, "dagster_celery_docker": {"celery_docker_executor": [17, 
6, 1, ""]}, "dagster_celery_k8s": {"CeleryK8sRunLauncher": [18, 6, 1, ""], "celery_k8s_job_executor": [18, 6, 1, ""]}, "dagster_dask": {"dask_executor": [19, 6, 1, ""]}, "dagster_databricks": {"DatabricksError": [20, 1, 1, ""], "create_databricks_job_op": [20, 4, 1, ""], "create_databricks_job_solid": [20, 4, 1, ""], "databricks_pyspark_step_launcher": [20, 6, 1, ""]}, "dagster_datadog": {"datadog_resource": [21, 6, 1, ""]}, "dagster_dbt": {"DagsterDbtCliFatalRuntimeError": [22, 3, 1, ""], "DagsterDbtCliHandledRuntimeError": [22, 3, 1, ""], "DagsterDbtCliOutputsNotFoundError": [22, 3, 1, ""], "DagsterDbtCliRuntimeError": [22, 3, 1, ""], "DagsterDbtCliUnexpectedOutputError": [22, 3, 1, ""], "DagsterDbtError": [22, 3, 1, ""], "DagsterDbtRpcUnexpectedPollOutputError": [22, 3, 1, ""], "DbtCliOutput": [22, 1, 1, ""], "DbtCliResource": [22, 1, 1, ""], "DbtCloudResourceV2": [22, 1, 1, ""], "DbtOutput": [22, 1, 1, ""], "DbtResource": [22, 1, 1, ""], "DbtRpcOutput": [22, 1, 1, ""], "DbtRpcResource": [22, 1, 1, ""], "DbtRpcSyncResource": [22, 1, 1, ""], "create_dbt_rpc_run_sql_solid": [22, 4, 1, ""], "dbt_cli_compile": [22, 6, 1, ""], "dbt_cli_resource": [22, 6, 1, ""], "dbt_cli_run": [22, 6, 1, ""], "dbt_cli_run_operation": [22, 6, 1, ""], "dbt_cli_snapshot": [22, 6, 1, ""], "dbt_cli_snapshot_freshness": [22, 6, 1, ""], "dbt_cli_test": [22, 6, 1, ""], "dbt_cloud_resource": [22, 6, 1, ""], "dbt_cloud_run_op": [22, 6, 1, ""], "dbt_compile_op": [22, 4, 1, ""], "dbt_docs_generate_op": [22, 4, 1, ""], "dbt_ls_op": [22, 4, 1, ""], "dbt_rpc_compile_sql": [22, 6, 1, ""], "dbt_rpc_resource": [22, 6, 1, ""], "dbt_rpc_run": [22, 6, 1, ""], "dbt_rpc_run_and_wait": [22, 6, 1, ""], "dbt_rpc_run_operation": [22, 6, 1, ""], "dbt_rpc_run_operation_and_wait": [22, 6, 1, ""], "dbt_rpc_snapshot": [22, 6, 1, ""], "dbt_rpc_snapshot_and_wait": [22, 6, 1, ""], "dbt_rpc_snapshot_freshness": [22, 6, 1, ""], "dbt_rpc_snapshot_freshness_and_wait": [22, 6, 1, ""], "dbt_rpc_sync_resource": [22, 6, 1, ""], "dbt_rpc_test": [22, 6, 1, ""], "dbt_rpc_test_and_wait": [22, 6, 1, ""], "dbt_run_op": [22, 6, 1, ""], "dbt_seed_op": [22, 4, 1, ""], "dbt_snapshot_op": [22, 4, 1, ""], "dbt_test_op": [22, 4, 1, ""], "load_assets_from_dbt_manifest": [22, 4, 1, ""], "load_assets_from_dbt_project": [22, 4, 1, ""], "local_dbt_rpc_resource": [22, 6, 1, ""]}, "dagster_dbt.DbtCliOutput": {"command": [22, 0, 1, ""], "logs": [22, 0, 1, ""], "raw_output": [22, 0, 1, ""], "result": [22, 0, 1, ""], "return_code": [22, 0, 1, ""]}, "dagster_dbt.DbtCliResource": {"cli": [22, 2, 1, ""], "compile": [22, 2, 1, ""], "default_flags": [22, 2, 1, ""], "freshness": [22, 2, 1, ""], "generate_docs": [22, 2, 1, ""], "ls": [22, 2, 1, ""], "run": [22, 2, 1, ""], "run_operation": [22, 2, 1, ""], "seed": [22, 2, 1, ""], "snapshot": [22, 2, 1, ""], "strict_flags": [22, 2, 1, ""], "test": [22, 2, 1, ""]}, "dagster_dbt.DbtCloudResourceV2": {"cancel_run": [22, 2, 1, ""], "get_job": [22, 2, 1, ""], "get_manifest": [22, 2, 1, ""], "get_run": [22, 2, 1, ""], "get_run_artifact": [22, 2, 1, ""], "get_run_results": [22, 2, 1, ""], "get_run_steps": [22, 2, 1, ""], "list_run_artifacts": [22, 2, 1, ""], "make_request": [22, 2, 1, ""], "poll_run": [22, 2, 1, ""], "run_job": [22, 2, 1, ""], "run_job_and_poll": [22, 2, 1, ""], "update_job": [22, 2, 1, ""]}, "dagster_dbt.DbtResource": {"compile": [22, 2, 1, ""], "generate_docs": [22, 2, 1, ""], "logger": [22, 2, 1, ""], "ls": [22, 2, 1, ""], "run": [22, 2, 1, ""], "run_operation": [22, 2, 1, ""], "seed": [22, 2, 1, ""], "snapshot": [22, 
2, 1, ""], "test": [22, 2, 1, ""]}, "dagster_dbt.DbtRpcOutput": {"response": [22, 0, 1, ""], "response_dict": [22, 0, 1, ""], "result": [22, 0, 1, ""]}, "dagster_dbt.DbtRpcResource": {"cli": [22, 2, 1, ""], "compile": [22, 2, 1, ""], "compile_sql": [22, 2, 1, ""], "generate_docs": [22, 2, 1, ""], "host": [22, 2, 1, ""], "jsonrpc_version": [22, 2, 1, ""], "kill": [22, 2, 1, ""], "logger": [22, 2, 1, ""], "ls": [22, 2, 1, ""], "poll": [22, 2, 1, ""], "port": [22, 2, 1, ""], "ps": [22, 2, 1, ""], "run": [22, 2, 1, ""], "run_operation": [22, 2, 1, ""], "run_sql": [22, 2, 1, ""], "seed": [22, 2, 1, ""], "snapshot": [22, 2, 1, ""], "snapshot_freshness": [22, 2, 1, ""], "status": [22, 2, 1, ""], "test": [22, 2, 1, ""], "url": [22, 2, 1, ""]}, "dagster_dbt.utils": {"generate_materializations": [22, 4, 1, ""]}, "dagster_docker": {"DockerRunLauncher": [23, 6, 1, ""], "docker_executor": [23, 6, 1, ""]}, "dagster_fivetran": {"FivetranResource": [24, 1, 1, ""], "build_fivetran_assets": [24, 4, 1, ""], "fivetran_resource": [24, 6, 1, ""], "fivetran_sync_op": [24, 6, 1, ""]}, "dagster_fivetran.FivetranResource": {"get_connector_details": [24, 2, 1, ""], "get_connector_sync_status": [24, 2, 1, ""], "make_request": [24, 2, 1, ""], "poll_sync": [24, 2, 1, ""], "resync_and_poll": [24, 2, 1, ""], "start_resync": [24, 2, 1, ""], "start_sync": [24, 2, 1, ""], "sync_and_poll": [24, 2, 1, ""], "update_connector": [24, 2, 1, ""], "update_schedule_type": [24, 2, 1, ""]}, "dagster_gcp": {"BigQueryError": [25, 1, 1, ""], "GCSFileHandle": [25, 1, 1, ""], "bigquery_resource": [25, 6, 1, ""], "bq_create_dataset": [25, 4, 1, ""], "bq_delete_dataset": [25, 4, 1, ""], "bq_op_for_queries": [25, 4, 1, ""], "bq_solid_for_queries": [25, 4, 1, ""], "dataproc_op": [25, 6, 1, ""], "dataproc_resource": [25, 6, 1, ""], "dataproc_solid": [25, 4, 1, ""], "gcs_file_manager": [25, 6, 1, ""], "gcs_resource": [25, 6, 1, ""], "import_df_to_bq": [25, 4, 1, ""], "import_file_to_bq": [25, 4, 1, ""], "import_gcs_paths_to_bq": [25, 4, 1, ""]}, "dagster_gcp.GCSFileHandle": {"gcs_bucket": [25, 2, 1, ""], "gcs_key": [25, 2, 1, ""], "gcs_path": [25, 2, 1, ""], "path_desc": [25, 2, 1, ""]}, "dagster_gcp.gcs": {"gcs_pickle_asset_io_manager": [25, 6, 1, ""], "gcs_pickle_io_manager": [25, 6, 1, ""]}, "dagster_ge": {"ge_validation_op_factory": [26, 4, 1, ""], "ge_validation_solid_factory": [26, 4, 1, ""]}, "dagster_github": {"github_resource": [27, 6, 1, ""]}, "dagster_graphql": {"DagsterGraphQLClient": [28, 1, 1, ""], "DagsterGraphQLClientError": [28, 3, 1, ""], "InvalidOutputErrorInfo": [28, 1, 1, ""], "ReloadRepositoryLocationInfo": [28, 1, 1, ""], "ReloadRepositoryLocationStatus": [28, 1, 1, ""]}, "dagster_graphql.DagsterGraphQLClient": {"get_run_status": [28, 2, 1, ""], "reload_repository_location": [28, 2, 1, ""], "shutdown_repository_location": [28, 2, 1, ""], "submit_job_execution": [28, 2, 1, ""], "submit_pipeline_execution": [28, 2, 1, ""]}, "dagster_k8s": {"K8sRunLauncher": [29, 6, 1, ""], "k8s_job_executor": [29, 6, 1, ""]}, "dagster_mlflow": {"end_mlflow_on_run_finished": [30, 6, 1, ""], "end_mlflow_run_on_pipeline_finished": [30, 6, 1, ""], "mlflow_tracking": [30, 6, 1, ""]}, "dagster_msteams": {"make_teams_on_pipeline_failure_sensor": [31, 4, 1, ""], "msteams_resource": [31, 6, 1, ""], "teams_on_failure": [31, 6, 1, ""], "teams_on_success": [31, 6, 1, ""]}, "dagster_mysql": {"MySQLEventLogStorage": [32, 1, 1, ""], "MySQLRunStorage": [32, 1, 1, ""], "MySQLScheduleStorage": [32, 1, 1, ""]}, "dagster_pagerduty": {"pagerduty_resource": [33, 
6, 1, ""]}, "dagster_pandas": {"DataFrame": [34, 6, 1, ""], "PandasColumn": [34, 1, 1, ""], "RowCountConstraint": [34, 1, 1, ""], "StrictColumnsConstraint": [34, 1, 1, ""], "create_dagster_pandas_dataframe_type": [34, 4, 1, ""]}, "dagster_pandas.PandasColumn": {"boolean_column": [34, 2, 1, ""], "categorical_column": [34, 2, 1, ""], "datetime_column": [34, 2, 1, ""], "exists": [34, 2, 1, ""], "float_column": [34, 2, 1, ""], "integer_column": [34, 2, 1, ""], "numeric_column": [34, 2, 1, ""], "string_column": [34, 2, 1, ""]}, "dagster_papertrail": {"papertrail_logger": [35, 6, 1, ""]}, "dagster_postgres": {"PostgresEventLogStorage": [36, 6, 1, ""], "PostgresRunStorage": [36, 6, 1, ""], "PostgresScheduleStorage": [36, 6, 1, ""]}, "dagster_prometheus": {"prometheus_resource": [37, 6, 1, ""]}, "dagster_prometheus.resources": {"PrometheusResource": [37, 1, 1, ""]}, "dagster_pyspark": {"pyspark_resource": [38, 6, 1, ""]}, "dagster_shell": {"create_shell_command_op": [39, 4, 1, ""], "create_shell_command_solid": [39, 4, 1, ""], "create_shell_script_op": [39, 4, 1, ""], "create_shell_script_solid": [39, 4, 1, ""], "shell_op": [39, 4, 1, ""], "shell_solid": [39, 4, 1, ""]}, "dagster_slack": {"make_slack_on_pipeline_failure_sensor": [40, 4, 1, ""], "make_slack_on_run_failure_sensor": [40, 4, 1, ""], "slack_on_failure": [40, 6, 1, ""], "slack_on_success": [40, 6, 1, ""], "slack_resource": [40, 6, 1, ""]}, "dagster_snowflake": {"SnowflakeConnection": [41, 1, 1, ""], "snowflake_op_for_query": [41, 4, 1, ""], "snowflake_resource": [41, 6, 1, ""]}, "dagster_snowflake.SnowflakeConnection": {"execute_queries": [41, 2, 1, ""], "execute_query": [41, 2, 1, ""], "get_connection": [41, 2, 1, ""], "load_table_from_local_parquet": [41, 2, 1, ""]}, "dagster_spark": {"SparkOpError": [42, 1, 1, ""], "construct_spark_shell_command": [42, 4, 1, ""], "create_spark_op": [42, 4, 1, ""], "define_spark_config": [42, 4, 1, ""], "spark_resource": [42, 6, 1, ""]}, "dagster_ssh": {"SSHResource": [43, 1, 1, ""], "ssh_resource": [43, 6, 1, ""]}, "dagster_twilio": {"twilio_resource": [44, 6, 1, ""]}, "dagstermill": {"DagstermillError": [45, 1, 1, ""], "DagstermillExecutionContext": [45, 1, 1, ""], "define_dagstermill_op": [45, 4, 1, ""], "define_dagstermill_solid": [45, 4, 1, ""], "get_context": [45, 4, 1, ""], "local_output_notebook_io_manager": [45, 4, 1, ""], "yield_event": [45, 4, 1, ""], "yield_result": [45, 4, 1, ""]}, "dagstermill.DagstermillExecutionContext": {"get_tag": [45, 2, 1, ""], "has_tag": [45, 2, 1, ""], "log": [45, 2, 1, ""], "logging_tags": [45, 2, 1, ""], "pipeline_def": [45, 2, 1, ""], "pipeline_run": [45, 2, 1, ""], "resolved_run_config": [45, 2, 1, ""], "resources": [45, 2, 1, ""], "run_config": [45, 2, 1, ""], "run_id": [45, 2, 1, ""], "solid": [45, 2, 1, ""], "solid_config": [45, 2, 1, ""], "solid_def": [45, 2, 1, ""]}}, "objnames": {"0": ["py", "attribute", "Python attribute"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "exception", "Python exception"], "4": ["py", "function", "Python function"], "5": ["py", "module", "Python module"], "6": ["py", "data", "Python data"], "7": ["std", "cmdoption", "program option"]}, "objtypes": {"0": "py:attribute", "1": "py:class", "2": "py:method", "3": "py:exception", "4": "py:function", "5": "py:module", "6": "py:data", "7": "std:cmdoption"}, "terms": {"00am": 50, "0123456789abcdef0123456789abcdef": 33, "100": [14, 25, 38], "10000": 53, "1001": 21, "1035": 25, "11000": 53, "120": 22, "1234": 21, "12345": 22, "127": [2, 
29], "145224193": 34, "15000": 2, "1677": 34, "20000": [14, 38], "200m": [14, 38], "2017": 16, "2020": 29, "2021": 50, "2022": [50, 55], "2048m": [14, 38], "21t21": 29, "2200": 20, "2262": 34, "2546": [14, 38], "28000m": 20, "2auto": 25, "2fbl320": 41, "2gb": [14, 38], "3000": [2, 28, 31, 40], "30000": 22, "300mb": [14, 38], "3333": 2, "4815": [14, 38], "5000": 30, "500gb": 25, "500m": [14, 38], "512m": [14, 38], "5432": 36, "54321": 22, "5439": 14, "5672": 16, "6313": [14, 38], "6379": 29, "77777": 22, "8080": 22, "854775807": 34, "8580": 22, "86400": [18, 20], "95590a": 29, "999": 21, "AWS": [20, 29], "But": [14, 38], "EBS": 20, "EKS": 29, "For": [1, 2, 3, 6, 7, 9, 10, 11, 12, 14, 18, 20, 22, 24, 25, 29, 34, 38, 41, 49, 50, 51, 54, 55, 56, 57], "IDs": 27, "Ins": 57, "Its": [14, 38], "K8s": 18, "KMS": [20, 25], "NFS": [14, 38], "Not": [2, 14, 38, 55], "ONE": 2, "Ops": 56, "PBS": 19, "SAS": 15, "Such": 58, "TLS": 31, "That": 1, "The": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 28, 29, 31, 33, 34, 35, 38, 39, 40, 41, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57], "Then": [16, 17, 18, 23, 29, 40], "There": [1, 5, 31, 33], "These": [1, 4, 6, 7, 8, 9, 11, 14, 20, 29, 34, 38, 49, 51, 54, 56, 57], "UDS": 2, "UIs": [14, 38], "USE": [14, 41], "Use": [2, 3, 9, 12, 13, 14, 20, 21, 29, 34, 38, 53, 55, 56, 57], "Used": [9, 13, 22, 49, 51, 56], "Useful": [2, 14, 25, 38], "Uses": [4, 7, 56], "Using": [6, 7, 9, 11, 25, 51, 56], "Will": [18, 22, 25, 29], "With": 18, "__executor_name__": [6, 51], "__fieldvaluesentinel": 3, "__file__": [4, 39, 58], "__init__": [53, 57], "__input_name__": [6, 51], "__logger_name__": [6, 51], "__main__": 16, "__name__": [3, 16, 57], "__op_name__": 6, "__resource_name__": [6, 51], "__solid_name__": 51, "_add_on": [49, 56], "_clean": 4, "_config_map": 48, "_construct_job_def_from_yaml_fil": 53, "_context": [10, 34, 49, 56, 57], "_default_failure_messag": 31, "_default_failure_message_text_fn": 40, "_get_node_asset_kei": 22, "_job": 11, "_kei": 3, "_op_selection_data": [6, 11], "_parent_pipeline_def": 51, "_partit": 48, "_partitioned_config": 48, "_s3_bucket": 57, "_s3_kei": 57, "_schedul": 55, "_yaml_directori": 53, "_yaml_file_for_job_nam": 53, "a_solid": 31, "abcdef": 14, "abid": 34, "abil": [9, 49, 56], "abl": [3, 6, 9, 11, 14, 16, 17, 18, 20, 38, 51], "abort": [14, 38], "about": [1, 2, 4, 7, 9, 10, 14, 22, 24, 28, 38, 40, 49, 51, 56], "abov": [3, 14, 20, 27, 29, 38, 49, 56], "absolut": [14, 20, 38], "abstract": [9, 10, 14, 15, 22, 25, 47, 56, 57], "abstractset": [8, 51], "acceler": 25, "accept": [3, 5, 9, 10, 14, 20, 34, 35, 38, 46, 49, 50, 54, 55, 56], "access": [5, 6, 9, 13, 14, 15, 20, 25, 27, 30, 33, 34, 38, 41, 45, 53, 54, 56, 57], "access_key_kei": 20, "accord": [3, 5, 9, 14, 25, 38], "accordingli": [14, 38], "account": [14, 15, 18, 20, 21, 22, 24, 25, 27, 29, 35, 38, 41, 44], "account_id": [22, 25], "account_nam": 15, "account_sid": 44, "accur": [14, 38], "accurateblockthreshold": [14, 38], "achiev": [14, 38], "ack": [14, 38], "acl": 20, "acquir": 20, "across": [4, 9, 14, 25, 38, 49, 50, 55, 56], "act": [14, 25, 38], "action": [14, 25, 38], "action_list_oper": 26, "action_on_failur": 14, "activ": [20, 25, 35, 41], "actual": [2, 3, 6, 10, 14, 38, 49, 55, 56], "acycl": [6, 7, 51], "adapt": 22, "add": [3, 6, 9, 10, 13, 14, 20, 22, 25, 28, 29, 30, 36, 41, 51, 56], "add_3": 57, "add_attach": 31, "add_daemon_heartbeat": 9, "add_input_metadata": 10, "add_metadata": [6, 56], "add_metadata_two_output": [6, 56], "add_mod": 
51, "add_on": [6, 7, 49, 51, 56], "add_output_metadata": [6, 10, 56], "add_three_preset": 51, "add_to_environ": 14, "add_two": 56, "added": [10, 14, 20, 25, 38, 50, 51, 56], "adder_1": 56, "adder_2": 56, "adder_resourc": 51, "addfil": [14, 38], "adding": [18, 29], "addit": [5, 9, 13, 14, 16, 17, 18, 20, 22, 29, 38, 45, 49, 52, 56], "addition": [49, 56], "additional_arg": 16, "address": [9, 12, 14, 19, 22, 25, 38], "adl": [15, 20], "adls2": [15, 20], "adls2_client": 15, "adls2_file_cach": 15, "adls2_file_manag": 15, "adls2_file_system": 15, "adls2_path": 15, "adls2_pickle_asset_io_manag": 15, "adls2_pickle_io_manag": 15, "adls2_prefix": 15, "adls2_resourc": 15, "adls2filehandl": 15, "adls2resourc": 15, "admin": 25, "administr": [22, 27], "advanc": [14, 18, 22, 29, 38], "advantag": 50, "advertis": [14, 38], "advis": 16, "affect": [14, 38], "after": [1, 2, 14, 20, 22, 24, 25, 29, 38, 40, 41, 49, 50, 54, 55, 56], "after_cursor": 9, "after_timestamp": 9, "against": [2, 5, 6, 7, 9, 11, 14, 29, 41, 51, 56, 57], "agent": 21, "aggreg": [14, 38], "ahead": [14, 38], "aim": 15, "airbyte_host": 12, "airbyte_port": 12, "airbyte_resourc": 12, "airbyte_sync_op": 12, "airbyteoutput": 12, "airbyteresourc": 12, "airflow_example_dags_repo": 13, "airflow_execution_d": 13, "airflow_hom": 13, "airline_demo": [52, 58], "aka": 15, "album": 21, "alert": [33, 40, 55], "alert_failur": 6, "alert_start": 6, "alert_success": 6, "algorithm": [14, 38], "alia": [3, 6, 9, 49, 51, 53, 56, 57], "alias": [6, 7, 56], "align": [6, 9, 16, 17, 18, 51], "aliv": [9, 14, 38], "all": [1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 14, 15, 16, 17, 18, 20, 22, 24, 25, 26, 28, 29, 30, 34, 38, 41, 49, 50, 51, 53, 54, 55, 56, 57], "all_assets_job_nam": 1, "all_ev": 6, "all_node_ev": 6, "all_user_ev": 10, "alloc": [14, 38], "allow": [1, 2, 3, 6, 7, 9, 10, 11, 12, 14, 20, 22, 24, 27, 29, 31, 34, 38, 40, 45, 47, 49, 50, 51, 53, 54, 56], "allow_host_key_chang": 43, "aloha": 3, "along": [1, 14, 20, 22, 38, 39], "alreadi": [9, 14, 20, 28, 29, 38], "also": [1, 2, 6, 7, 9, 10, 11, 14, 15, 16, 17, 18, 19, 20, 22, 23, 25, 27, 29, 33, 38, 46, 47, 49, 52, 54, 55, 56, 57], "alter": 4, "altern": [3, 9, 14, 41, 51, 56, 58], "alwai": [22, 25, 34, 45, 50, 55, 57, 58], "amazon": [14, 20], "amazonaw": [14, 20, 29], "amazonec": 14, "amazons3": 20, "america": [41, 50, 55], "amount": [14, 38], "amqp": 16, "an_existing_mlflow_run_id": 30, "an_op": 40, "analyt": 25, "ancestor": [1, 6, 7, 11, 51], "ani": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 20, 22, 24, 27, 28, 30, 31, 34, 35, 38, 39, 40, 45, 46, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58], "annot": [45, 57], "anonym": [49, 56], "anoth": [5, 6, 7, 13, 14, 28, 38, 51, 57], "ant": [14, 38], "any_config": 57, "apach": [14, 25, 38, 42], "api": [1, 4, 5, 6, 9, 12, 14, 15, 21, 22, 24, 27, 28, 30, 31, 33, 34, 38, 40, 46, 47, 49, 50, 51, 52, 54, 56, 57], "api_kei": [21, 24], "api_secret": [14, 24], "api_stepconfig": 14, "apirefer": 14, "app": [14, 17, 18, 27, 38, 40], "app_id": [14, 38], "app_kei": 21, "appauthexampl": 14, "appear": [3, 6, 10, 14, 20, 27, 38, 56], "append": [20, 49, 56], "appli": [9, 11, 13, 14, 20, 38, 49, 50, 51, 52, 55, 56], "applic": [14, 18, 21, 22, 25, 27, 29, 38, 42], "application_argu": 42, "application_jar": 42, "apply_op": 51, "applylimitperuniquevalu": 9, "appropri": [3, 7, 17, 18, 29, 41, 51, 56, 57], "arbitrari": [1, 3, 4, 5, 6, 7, 11, 28, 39, 49, 51, 54, 56, 57], "arbitrarili": 56, "arbyt": 12, "archiv": 25, "archiveuri": 25, "aren": [10, 40], "arg": [2, 5, 6, 11, 22, 25, 28, 46, 56, 58], 
"argument": [1, 2, 3, 5, 6, 7, 8, 9, 11, 14, 16, 17, 18, 22, 25, 34, 38, 46, 49, 50, 51, 52, 53, 54, 55, 56, 57], "arn": 14, "around": [9, 14, 16, 17, 18, 38], "arrai": [3, 5, 14], "arrang": [6, 7, 51], "articl": 16, "artifact": [2, 9, 14, 22, 38, 49, 51, 56], "artifactid": [14, 38], "artifactori": [14, 38], "as_dagster_typ": [49, 56, 57], "asia": 25, "asid": [14, 38], "ask": [14, 38], "asktimeout": [14, 38], "assembl": 51, "assert": [9, 14, 54, 57], "assert_failur": 57, "assert_success": 57, "asset": [6, 9, 10, 14, 15, 25, 45, 55, 57], "asset1": 1, "asset2": 1, "asset_group": [1, 14, 15, 25], "asset_kei": [1, 4, 9, 10, 24, 49, 55, 56], "asset_key_prefix": [12, 22, 24, 45], "asset_materi": [6, 10, 56], "asset_materialization_fn": 55, "asset_materialization_plan": 6, "asset_observ": 6, "asset_partit": [4, 9, 49, 56], "asset_partition_kei": 10, "asset_partition_key_rang": 10, "asset_partitions_def": [4, 49, 56], "asset_partitions_time_window": 10, "asset_sensor": 55, "asset_store_oper": 6, "assetgroup": [1, 14, 15, 25], "assetin": 1, "assetkei": [1, 9, 10, 12, 22, 24, 49, 55, 56], "assetmateri": [6, 10, 22, 45, 49, 55, 56, 57], "assetobserv": [6, 10, 56], "assetsdefinit": 1, "assetsdefint": 24, "assetsensordefinit": 55, "assign": [1, 9, 10, 14, 15, 25], "assist": 21, "associ": [1, 2, 6, 8, 9, 10, 22, 25, 49, 55, 56], "assum": [1, 18, 29, 49, 50], "assumpt": 14, "async": [49, 56], "asynchron": [22, 51], "attach": [4, 6, 10, 14, 15, 25, 28, 39, 48, 49, 51, 55, 56, 57], "attempt": [5, 6, 9, 14, 38, 43, 49, 56], "attempt_num": [49, 56], "attempt_numb": 9, "attit": 20, "attribut": [2, 6, 9, 25, 49, 54, 55], "audit": [14, 38], "auth": [25, 28, 29, 44], "auth_token": [22, 44], "authent": [14, 15, 38], "author": [6, 7, 9, 13, 33, 49, 56], "auto": [3, 22, 24, 25], "autocommit": [14, 41], "autom": [27, 29], "automat": [2, 3, 5, 6, 10, 14, 20, 22, 24, 28, 35, 38, 46, 49, 54, 56], "autosc": 20, "autoscal": 20, "avail": [3, 5, 6, 8, 9, 10, 13, 14, 15, 16, 20, 22, 23, 25, 28, 29, 35, 38, 45, 46, 48, 51, 54, 55, 56, 57], "avoid": [9, 14, 38, 49, 56], "aws": [20, 29], "aws_access_key_id": [14, 30], "aws_account_id": [14, 29], "aws_region": 14, "aws_secret_access_kei": [14, 30], "axi": 50, "azur": 20, "azure_data_lake_storage_kei": 15, "azureblobcomputelogmanag": 15, "azuredatabrick": 20, "back": [9, 14, 15, 16, 18, 25, 29, 32, 36, 38, 45, 55], "backend": [14, 17, 18, 38], "backendconnectiontimeout": [14, 38], "backfil": [2, 50, 55], "background": 16, "backlog": [14, 38], "backoff": [49, 56], "backoff_delai": [49, 56], "backpressur": [14, 38], "backward": [14, 38], "bad": 3, "badg": 1, "balthazar": 16, "bar": [6, 9, 10, 14, 25, 38, 49, 53, 54, 56, 57], "bare": [3, 5], "base": [1, 5, 7, 9, 10, 11, 14, 15, 16, 17, 18, 19, 22, 25, 31, 32, 34, 36, 38, 40, 45, 47, 49, 50, 55, 56, 57, 58], "base_dir": [1, 9, 10], "base_path": [1, 10], "basedir": 10, "baseexcept": 8, "baseoper": 13, "basi": [29, 51], "basic": [6, 7, 25, 51], "basicprofil": [14, 38], "bat": [6, 56], "batch": [14, 38], "batch_kwarg": 26, "baz": [6, 49, 56], "becaus": [9, 14, 15, 22, 24, 38], "becom": [3, 5, 34, 46, 54, 57], "been": [5, 6, 9, 10, 12, 14, 38, 53, 56], "befor": [3, 9, 12, 14, 18, 20, 22, 24, 25, 29, 38, 39, 49, 50, 55, 56], "before_cursor": 9, "before_timestamp": 9, "begin": [6, 9, 14, 38, 56], "behalf": 25, "behavior": [6, 11, 13, 14, 20, 22, 38, 49, 51, 56], "behind": [14, 38, 55], "being": [3, 8, 9, 10, 14, 38, 46, 49, 54, 55, 56], "belong": [6, 7, 8, 14, 20, 25, 38, 51, 55], "below": [3, 14, 20, 25, 27, 29, 38, 41], "bertovi\u0107": 
16, "bespok": 53, "best": 13, "beta": 25, "better": [13, 14, 38], "between": [1, 7, 9, 12, 14, 15, 22, 24, 25, 38, 39, 49, 50, 51, 54, 55, 56], "beyond": [14, 38, 49], "bigger": [14, 38], "bigquery_resourc": 25, "bigqueryerror": 25, "bigtabl": 25, "binari": [14, 25, 38], "binaryio": 9, "bind": [11, 14, 38, 41], "bindaddress": [14, 38], "bit": 41, "bitnami": 29, "blacklist": [14, 38], "blank": [12, 24], "blob": [6, 15, 20, 22], "block": [9, 14, 15, 16, 36, 38, 40, 56], "blockinterv": [14, 38], "blockmanag": [14, 38], "blockmanagerslavetimeoutm": [14, 38], "blocks_fn": 40, "blocksiz": [14, 38], "blog": 16, "blue": [3, 21], "bodi": [5, 6, 7, 10, 27, 28, 40, 46, 49, 54, 56], "bool": [3, 5, 6, 7, 11, 12, 13, 14, 15, 18, 20, 22, 24, 25, 28, 29, 31, 34, 36, 38, 41, 43, 45, 47, 49, 50, 51, 53, 55, 56, 57], "bool_config": 57, "boolean": [3, 34, 47, 49, 55, 57], "boolean_column": 34, "boolsourc": 3, "boot": 25, "bootdisksizegb": 25, "bootdisktyp": 25, "bootstrap": [14, 29], "bot": 40, "both": [3, 6, 9, 13, 14, 20, 21, 22, 26, 29, 34, 38, 51, 55, 56], "boto": 14, "boto3": 14, "botocor": 14, "bound": [14, 34, 38, 50, 55], "boundari": [5, 6, 9, 11, 49, 51, 56], "bq_create_dataset": 25, "bq_delete_dataset": 25, "bq_op_for_queri": 25, "bq_solid_for_queri": 25, "bracket": 57, "breakpoint": 58, "brew": 29, "bridg": [14, 38], "broadcast": [14, 38], "broker": [17, 18], "broker_url": 16, "browser": 21, "bucket": [3, 14, 20, 25, 34, 57], "bucket_prefix": 3, "buffer": [14, 38], "bufferediobas": 45, "buffers": [14, 38], "build": [1, 6, 7, 8, 10, 11, 12, 22, 24, 29, 46, 49, 51, 54, 55, 56], "build_airbyte_asset": 12, "build_assets_job": [1, 24], "build_fivetran_asset": 24, "build_hook_context": 8, "build_init_logger_context": 46, "build_init_resource_context": 54, "build_input_context": 10, "build_job": 1, "build_op_context": [6, 14], "build_output_context": 10, "build_reconstructable_job": [6, 11, 51], "build_resourc": 54, "build_run_status_sensor_context": 55, "build_schedule_context": 55, "build_schedule_from_partitioned_job": [50, 55], "build_sensor_context": 55, "build_solid_context": 56, "buildkit": 29, "buildup": [14, 38], "built": [3, 6, 9, 14, 22, 34, 38, 45, 51], "builtin": [3, 12, 22, 24, 58], "bulk": 25, "bundl": 14, "bus": [14, 38], "busi": 57, "bypass": [22, 29], "bypass_cach": 22, "bypassmergethreshold": [14, 38], "byte": [9, 14, 38, 49, 56], "cach": [14, 22, 38, 41, 53], "cache_column_metadata": 41, "cachedexecutoridletimeout": [14, 38], "cadenc": [50, 55], "calcul": [14, 22, 38, 49, 56], "calculate_byt": [49, 56], "call": [1, 2, 3, 5, 6, 9, 10, 11, 13, 14, 19, 20, 22, 26, 29, 35, 38, 41, 45, 46, 47, 51, 53, 56, 57], "call_user_provided_funct": 9, "callabl": [3, 8, 9, 13, 14, 22, 31, 34, 35, 40, 46, 49, 50, 53, 54, 55, 56, 57], "callback": 8, "caller": [14, 25, 38, 51], "callercontext": [14, 38], "can": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58], "cancel": [9, 22], "cancel_and_wait": 14, "cancel_run": 22, "canned_acl": 20, "cannot": [5, 13, 14, 20, 25, 28, 34, 38, 45, 49, 55, 56], "capac": [14, 25, 38], "captur": [50, 58], "card": 31, "care": [7, 11], "case": [6, 7, 8, 9, 14, 16, 17, 18, 22, 34, 38, 39, 46, 49, 51, 53, 54, 56, 57, 58], "catalog": [22, 49, 56], "catch": 5, "categor": 34, "categori": 34, "categorical_column": 34, "caus": [2, 8, 14, 25, 38], "caution": [14, 38], "celeri": 29, "celery_docker_executor": 17, "celery_enabled_job": [16, 17, 
18], "celery_executor": [16, 17], "celery_k8s_job_executor": [17, 18], "celeryk8srunlaunch": 18, "celeryq": [16, 17, 18], "central": [6, 8, 14, 38, 46, 50], "central1": 25, "cert": 14, "certain": [6, 9, 11, 14, 34, 38, 39, 41, 49, 51, 56], "certif": [14, 25, 31], "chain": [14, 38], "chang": [2, 14, 16, 22, 24, 25, 29, 40, 41, 50], "channel": [2, 31, 40], "charact": [14, 20, 25, 38], "chat": 40, "chat_postmessag": 40, "check": [3, 5, 6, 9, 14, 22, 26, 34, 38, 45, 47, 49, 53, 54, 55, 56, 57], "check_cluster_everi": 14, "check_dagster_typ": 57, "check_nam": 21, "checker": 3, "checkerror": [34, 57], "checkpoint": [14, 38], "checkpointinterv": [14, 38], "checksum": [14, 38], "child": [1, 3, 5, 6, 7, 11, 49, 51, 56, 58], "children": [51, 56], "choic": 11, "chosen": 14, "chunk": [9, 14, 38], "circumst": [14, 38], "claim": 29, "class": [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 20, 22, 24, 25, 28, 29, 32, 34, 35, 36, 37, 38, 41, 42, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58], "class_nam": 9, "classestoregist": [14, 38], "classmethod": [3, 9], "classpath": [14, 25, 38], "clean": [14, 38, 54], "cleancheckpoint": [14, 38], "cleaner": [14, 38], "cleanup": [14, 29, 38, 54], "clear": [14, 38], "cli": [13, 29], "click": 40, "client": [2, 14, 15, 19, 22, 25, 30, 38, 40, 41], "client_prefetch_thread": 41, "client_session_keep_al": 41, "clone": [4, 14, 38], "cloneconf": [14, 38], "close": [9, 14, 38, 41], "closefileafterwrit": [14, 38], "cloud": [9, 25], "cloudwatch_logg": 14, "cls": [9, 57], "cluster": [9, 14, 18, 19, 20, 25, 38, 57], "cluster_config": 25, "cluster_id": 14, "cluster_log_conf": 20, "clusternam": 25, "coars": [14, 38], "code": [2, 5, 6, 8, 9, 14, 20, 22, 25, 27, 33, 38, 39, 45, 46, 47, 49, 51, 56, 57], "codec": [14, 38], "coerc": [1, 49, 56], "cogroup": [14, 38], "col": [49, 56], "col_a": 49, "col_b": 49, "collect": [1, 3, 5, 6, 7, 10, 14, 38, 45, 49, 51, 56], "collis": 25, "color": [3, 21], "colored_console_logg": 46, "column": [34, 41, 49, 56], "com": [14, 16, 17, 18, 20, 22, 24, 25, 27, 29, 38, 40, 43, 49, 56], "combin": [1, 7, 49, 51, 56], "come": [14, 38, 55], "comma": [14, 38, 51], "command": [2, 14, 16, 17, 18, 22, 25, 29, 38, 39, 42, 52], "committ": [14, 38], "common": [9, 16, 17, 18, 25, 51, 57], "commun": [2, 9, 12, 14, 24, 25, 38, 49, 56, 57], "compani": [29, 40], "compar": 22, "compat": [14, 15, 25, 38], "compelt": 22, "compil": [13, 22], "compile_project": 22, "compile_sql": 22, "complet": [6, 12, 14, 18, 20, 22, 24, 25, 29, 34, 38, 50, 51], "completed": 24, "completekei": 58, "complex": [53, 56], "complex_repositori": 53, "complex_solid": 58, "complexrepositorydata": 53, "complic": 58, "compon": [1, 3, 7, 9, 12, 22, 24, 25], "compos": [1, 16, 49], "composit": [3, 4, 7, 51, 56, 57], "composite_solid": [7, 22, 39, 56], "compositesoliddefinit": [3, 7, 56], "compositesolidexecutionresult": [51, 56], "comprehens": 24, "compress": [14, 38, 43], "comput": [1, 4, 5, 6, 7, 8, 10, 14, 15, 20, 24, 25, 28, 35, 38, 46, 49, 51, 55, 56, 57], "compute_fn": [49, 56], "compute_input_event_dict": 56, "compute_kind": 1, "compute_log": [14, 15], "compute_log_manag": [9, 14, 15], "compute_logs_data": 9, "compute_output_events_dict": 56, "compute_step_ev": 56, "compute_step_failure_ev": 56, "computelogmanag": 9, "computemetadata": 25, "concat": 57, "concat_list": 57, "concept": [18, 29], "conceptu": 1, "concert": [5, 18], "concis": [14, 38], "concret": [9, 51], "concurr": [6, 9, 14, 38], "condit": 9, "conf": [14, 20, 25, 38], "config": [1, 2, 5, 7, 8, 9, 10, 11, 13, 14, 15, 
16, 17, 18, 19, 20, 21, 22, 23, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 45, 46, 49, 50, 52, 53, 54, 55, 56, 57, 58], "config_field": [6, 51, 54, 57], "config_fil": [52, 58], "config_fn": [3, 51, 56], "config_from_fil": 58, "config_from_pkg_resourc": 58, "config_from_yaml_str": 58, "config_map": 56, "config_or_config_fn": [9, 46, 49, 54, 56], "config_schema": [3, 4, 9, 10, 14, 22, 35, 45, 46, 49, 51, 53, 54, 56, 57], "config_sourc": [16, 17, 18], "config_typ": 9, "config_valu": [3, 5, 9, 57], "config_yaml": [9, 16], "configbucket": 25, "configmap": [3, 6, 7, 11, 18, 29, 48, 56], "configmapenvsourc": [18, 29], "configschema": [3, 9, 10, 14, 35, 46, 49, 54, 56, 57], "configtyp": 9, "configu": 3, "configur": [1, 2, 3, 5, 7, 9, 10, 11, 12, 14, 15, 17, 18, 19, 20, 22, 23, 24, 25, 27, 28, 29, 31, 32, 35, 36, 38, 39, 40, 41, 42, 45, 46, 49, 50, 52, 54, 55, 56, 57, 58], "configurableclass": 9, "configurableclassdata": [9, 14, 15], "configurabledefinit": [3, 9, 46, 49, 54, 56], "conflict": [14, 25, 28, 38], "conflictingexecutionparamserror": 28, "conform": [3, 7, 11, 13, 25, 28, 49, 56], "confus": [9, 20, 49, 56], "conjunct": [3, 55], "conn_str": 9, "connect": [2, 9, 10, 12, 14, 15, 17, 19, 22, 23, 24, 25, 28, 29, 31, 38, 40, 41, 43, 49, 54, 56], "connect_timeout": 14, "connection_id": 12, "connectionerror": 28, "connectiontimeout": [14, 38], "connector": [12, 15, 24, 41], "connector_id": 24, "consecut": [14, 38], "consequ": [6, 11, 13, 51], "conserv": [6, 11, 51], "consid": [14, 20, 22, 38, 39, 55, 57], "consider": [14, 38], "consist": [10, 49, 50, 51], "consol": [14, 25, 38, 45], "consolid": 9, "consolidatedsqliteeventlogstorag": 9, "constant": 3, "constitu": [1, 6, 7, 13, 56], "constraint": [22, 28, 34, 49, 56], "construct": [1, 5, 6, 9, 10, 11, 13, 16, 17, 18, 22, 34, 39, 41, 42, 45, 46, 50, 51, 52, 53, 54, 55, 56, 57, 58], "construct_spark_shell_command": 42, "constructor": [4, 5, 9, 13, 14, 16, 17, 18, 34, 38, 49, 51, 52, 53, 56, 58], "consult": 9, "consum": [1, 9, 14, 38, 49, 56], "consume_ev": [6, 10, 56], "consume_logged_metadata_entri": 10, "consumpt": [14, 38], "contact": 16, "contain": [2, 3, 6, 7, 9, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 23, 24, 25, 29, 31, 34, 38, 40, 49, 50, 51, 53, 54, 55, 56, 57], "container": 13, "container_kwarg": 23, "container_nam": 14, "content": [1, 14, 20, 22, 38], "context": [1, 3, 4, 5, 8, 9, 14, 15, 20, 21, 22, 25, 27, 28, 30, 31, 33, 34, 35, 38, 39, 40, 41, 45, 46, 47, 49, 50, 51, 53, 54, 55, 57, 58], "context_": [34, 57], "context_manager_resourc": [6, 8, 10, 54, 56], "contextlib": 54, "contextmanag": 54, "continu": 25, "continueonfailur": 25, "contrain": 3, "contrast": 18, "contribut": 9, "control": [9, 14, 20, 38, 49, 53, 55, 56], "conveni": [46, 58], "convent": 13, "convert": [3, 55, 56, 58], "cool": [14, 15, 25, 49], "coordin": [14, 38], "copi": [9, 14, 20, 22, 25, 38, 40], "copy_handle_to_local_temp": 9, "core": [4, 5, 9, 11, 12, 14, 18, 24, 25, 26, 29, 34, 35, 38, 46, 49, 51, 54, 55, 56], "core_concept": 26, "correct": [1, 5, 49], "correctli": [14, 20, 29, 38], "correpond": 1, "correspond": [2, 3, 4, 6, 8, 9, 10, 12, 13, 16, 17, 18, 20, 22, 24, 25, 49, 50, 53, 55, 56], "corrupt": [14, 38], "cost": [14, 38], "costli": 53, "could": [3, 14, 20, 22, 38], "count": [14, 21, 34], "cover": 10, "cowboytyp": 3, "cpu": [14, 38], "cpu_count": 6, "crash": [14, 38], "creat": [1, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 19, 20, 21, 22, 23, 24, 25, 27, 29, 31, 33, 38, 40, 41, 45, 49, 50, 51, 53, 54, 55, 56, 57, 58], 
"create_dagster_pandas_dataframe_typ": 34, "create_databricks_job_op": 20, "create_databricks_job_solid": 20, "create_dbt_rpc_run_sql_solid": 22, "create_issu": 27, "create_k8s_job_task": 18, "create_offset_partition_selector": 50, "create_registered_model": 30, "create_run": 9, "create_schedule_definit": 50, "create_shell_command_op": 39, "create_shell_command_solid": 39, "create_shell_script_op": 39, "create_shell_script_solid": 39, "create_spark_op": 42, "create_task": 16, "creation": [14, 17, 20, 23, 29, 38, 48, 53], "cred": 15, "credenti": [14, 15, 18, 20, 25, 29, 40], "criteria": [1, 6, 7, 11, 39, 49, 51, 56], "critic": 46, "cron": [9, 50, 55], "cron_schedul": [50, 53, 55], "cross": [6, 9, 11, 20, 25, 51], "crossrealmtrustadminserv": 25, "crossrealmtrustkdc": 25, "crossrealmtrustrealm": 25, "crossrealmtrustsharedpassworduri": 25, "csv": [3, 57], "csv_loader": 10, "csv_loader_kei": 10, "curiou": 9, "curl": 25, "current": [2, 6, 9, 10, 12, 13, 14, 22, 24, 25, 38, 46, 49, 50, 51, 54, 55, 56, 58], "current_tim": 50, "current_valu": 5, "curri": 3, "cursor": [2, 9, 55], "custom": [3, 6, 9, 10, 13, 14, 18, 20, 22, 25, 28, 29, 34, 38, 41, 49, 51, 56, 57], "custom_dbt_cli_resourc": 22, "custom_dbt_rpc_resourc": 22, "custom_dbt_rpc_sync_resourc": 22, "custom_instance_class_data": 9, "custom_path_fs_io_manag": 10, "custom_service_account": 25, "custom_sync_dbt_rpc_resourc": 22, "custom_tag": 20, "custom_typ": 49, "cyclic": 54, "d9971c84d44d47f382a2928c8c161faa": 29, "daemon": [9, 25, 55], "daemon_heartbeat": 9, "dag": [6, 7, 13, 14, 38, 51], "dag_bag": 13, "dag_descript": 13, "dag_id": 13, "dag_kwarg": 13, "dag_path": 13, "dagbag": 13, "daggraph": [14, 38], "dagit": [1, 7, 9, 11, 13, 28, 29, 31, 32, 36, 40, 49, 50, 51, 52, 55, 56, 58], "dagit_base_url": [31, 40], "dagit_port": 2, "dagredi": 29, "dagrun": 13, "dagster": [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 17, 18, 23, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58], "dagster_airbyt": 12, "dagster_airflow": 13, "dagster_aw": [9, 14, 45, 57], "dagster_azur": 15, "dagster_celeri": [16, 17, 18], "dagster_celery_broker_host": [16, 17, 18], "dagster_celery_dock": 17, "dagster_celery_k8": [16, 18], "dagster_dask": 19, "dagster_databrick": 20, "dagster_datadog": 21, "dagster_dbt": 22, "dagster_dock": 23, "dagster_docker_imag": 29, "dagster_docker_image_tag": 29, "dagster_docker_repositori": 29, "dagster_ev": [9, 45, 55], "dagster_exampl": [52, 58], "dagster_fivetran": 24, "dagster_g": 26, "dagster_gcp": 25, "dagster_github": 27, "dagster_graphql": 28, "dagster_handl": 46, "dagster_hom": [2, 9, 17, 18, 29, 32, 36], "dagster_imag": 29, "dagster_inst": 55, "dagster_k8": [18, 29], "dagster_mlflow": 30, "dagster_msteam": 31, "dagster_mysql": [9, 32], "dagster_pagerduti": 33, "dagster_panda": [26, 34], "dagster_papertrail": 35, "dagster_pg_password": [18, 29], "dagster_pipeline_factori": 13, "dagster_postgr": [9, 36], "dagster_prometheu": 37, "dagster_pyspark": 38, "dagster_run": [6, 49, 54, 55, 56], "dagster_shel": 39, "dagster_slack": 40, "dagster_snowflak": 41, "dagster_spark": 42, "dagster_ssh": 43, "dagster_stag": 20, "dagster_test": 29, "dagster_twilio": 44, "dagster_typ": [1, 3, 4, 5, 6, 10, 26, 34, 49, 56, 57], "dagster_type_load": [6, 34, 51, 57], "dagster_type_m": 57, "dagster_type_materi": [34, 57], "dagsterassetmetadatavalu": [49, 56], "dagsterconfigmappingfunctionerror": 5, "dagsterdaemonschedul": 55, "dagsterdbtclifatalruntimeerror": 22, "dagsterdbtclihandledruntimeerror": 22, "dagsterdbtclioutputsnotfounderror": 22, 
"dagsterdbtcliruntimeerror": 22, "dagsterdbtcliunexpectedoutputerror": 22, "dagsterdbterror": 22, "dagsterdbtrpcunexpectedpolloutputerror": 22, "dagstererror": 5, "dagsterev": [6, 8, 9, 51, 55, 56], "dagstereventloginvalidforrun": 5, "dagstereventtyp": [6, 9], "dagsterexecutionstepexecutionerror": [5, 9], "dagsterexecutionstepnotfounderror": 5, "dagstergraphqlcli": 28, "dagstergraphqlclienterror": 28, "dagsterinst": [2, 5, 6, 7, 9, 11, 13, 51, 54, 55, 56], "dagsterinvalidconfigdefinitionerror": 5, "dagsterinvalidconfigerror": [3, 5], "dagsterinvaliddefinitionerror": [5, 22], "dagsterinvariantviolationerror": [5, 6, 52, 54, 58], "dagsterlogmanag": [6, 8, 9, 10, 45, 46, 54, 56], "dagstermil": [6, 11, 51], "dagstermillerror": 45, "dagstermillexecutioncontext": 45, "dagsterpipelinerunmetadatavalu": [49, 56], "dagsterresourcefunctionerror": 5, "dagsterrun": [6, 55], "dagsterrunconflict": 28, "dagsterrunnotfounderror": 5, "dagsterrunstatu": 9, "dagsterstepoutputnotfounderror": 5, "dagstersubprocesserror": 5, "dagstertyp": [1, 5, 6, 10, 14, 26, 34, 49, 56, 57], "dagstertypecheckdidnotpass": 5, "dagstertypecheckerror": 5, "dagstertypekind": [34, 57], "dagstertypeload": [34, 57], "dagstertypemateri": [34, 57], "dagsterunknownresourceerror": 5, "dagsterunmetexecutorrequirementserror": 5, "dagsterusercodeexecutionerror": [5, 9], "dai": [50, 55], "daili": [14, 38, 50, 55], "daily_10am_schedul": 50, "daily_partitioned_config": [50, 55], "daily_schedul": 55, "dashboard": [14, 38, 49, 56], "dashboard_url": [49, 56], "dask_enabled_job": 19, "dask_executor": 19, "data": [3, 4, 6, 9, 10, 12, 14, 15, 18, 20, 22, 24, 25, 26, 29, 34, 35, 38, 41, 46, 47, 49, 51, 54, 56, 57], "databas": [2, 9, 14, 22, 25, 29, 32, 36, 41, 49, 50, 54, 55, 56], "databricks_cli": 20, "databricks_host": 20, "databricks_job": 20, "databricks_pyspark_step_launch": 20, "databricks_token": 20, "databrickserror": 20, "datadog_op": 21, "datadog_resourc": 21, "datadogpi": 21, "datafram": [22, 26, 34, 38], "dataframe_constraint": 34, "dataframe_load": 34, "dataframe_materi": 34, "dataframeconstraint": 34, "datalakeservicecli": 15, "dataproc_op": 25, "dataproc_resourc": 25, "dataproc_solid": 25, "dataset": [25, 26, 49, 56], "datasourc": 26, "datasource_nam": 26, "date": [2, 14, 20, 22, 47, 50, 55], "date_partition_rang": 50, "datetim": [24, 34, 50, 55], "datetime64": 34, "datetime_column": 34, "day_of_month": [50, 55], "day_of_week": [50, 55], "day_offset": [50, 55], "db_name": [9, 32, 36], "db_statement_timeout": 2, "dbf": 20, "dbt": 1, "dbt_cli_compil": 22, "dbt_cli_pipelin": 22, "dbt_cli_resourc": 22, "dbt_cli_run": 22, "dbt_cli_run_oper": 22, "dbt_cli_snapshot": 22, "dbt_cli_snapshot_fresh": 22, "dbt_cli_test": 22, "dbt_cloud": 22, "dbt_cloud_auth_token": 22, "dbt_cloud_host": 22, "dbt_cloud_resourc": 22, "dbt_cloud_run_op": 22, "dbt_compile_op": 22, "dbt_docs_generate_op": 22, "dbt_execut": 22, "dbt_ls_op": 22, "dbt_output": 22, "dbt_profiles_dir": 22, "dbt_project": 22, "dbt_rpc": 22, "dbt_rpc_compile_sql": 22, "dbt_rpc_job": 22, "dbt_rpc_resourc": 22, "dbt_rpc_run": 22, "dbt_rpc_run_and_wait": 22, "dbt_rpc_run_oper": 22, "dbt_rpc_run_operation_and_wait": 22, "dbt_rpc_snapshot": 22, "dbt_rpc_snapshot_and_wait": 22, "dbt_rpc_snapshot_fresh": 22, "dbt_rpc_snapshot_freshness_and_wait": 22, "dbt_rpc_sync_job": 22, "dbt_rpc_sync_resourc": 22, "dbt_rpc_test": 22, "dbt_rpc_test_and_wait": 22, "dbt_run_op": 22, "dbt_seed_op": 22, "dbt_snapshot_op": 22, "dbt_test_op": 22, "dbtclioutput": 22, "dbtcliresourc": 22, "dbtcloudoutput": 22, 
"dbtcloudresourcev2": 22, "dbtoutput": 22, "dbtresourc": 22, "dbtrpcoutput": 22, "dbtrpcresourc": 22, "dbtrpcsyncresourc": 22, "dd_job": 21, "dead": [14, 38], "debian": 25, "debug": [6, 12, 22, 24, 25, 29, 46, 56, 58], "debug_log": 22, "debugg": 58, "decid": [14, 38], "declar": [6, 7, 14, 15, 49, 51, 56], "decor": [1, 3, 6, 7, 8, 9, 10, 11, 21, 22, 34, 39, 46, 49, 50, 51, 53, 54, 55, 56, 57], "decorated_fn": [8, 50, 55], "decreas": [14, 38], "decrement": 21, "dedupl": 56, "deeplink": [31, 40], "def": [1, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 29, 30, 31, 33, 38, 39, 40, 41, 49, 51, 53, 54, 56, 57, 58], "default": [1, 2, 3, 4, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 24, 25, 26, 27, 28, 29, 30, 31, 34, 36, 38, 39, 40, 41, 43, 45, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58], "default_arg": 13, "default_executor": [48, 51], "default_flag": 22, "default_output": [51, 56], "default_statu": [31, 40, 50, 55], "default_tag": 20, "default_valu": [3, 4, 49, 56], "defaultcor": [14, 38], "defaultruncoordin": 9, "defaultrunlaunch": 9, "defaultschedulestatu": [50, 55], "defaultsensorstatu": [31, 40, 55], "defin": [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18, 19, 20, 22, 23, 25, 28, 29, 34, 35, 45, 47, 48, 50, 51, 52, 53, 54, 55, 57], "define_dagstermill_op": 45, "define_dagstermill_solid": 45, "define_my_job": [6, 11, 51], "define_pipelin": 2, "define_repo": 2, "define_spark_config": 42, "definit": [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 22, 24, 25, 38, 39, 41, 45, 46, 49, 50, 53, 54, 55, 56], "delai": [14, 38, 49, 56], "deleg": [9, 16, 18, 46], "delet": [2, 9, 14, 25, 29, 38], "delete_local_temp": 9, "delin": [50, 55], "deliv": 20, "delta_rang": 50, "deni": 16, "denibertov": 16, "depend": [1, 4, 6, 7, 11, 13, 14, 15, 22, 25, 38, 40, 45, 54, 56, 57], "depende": 51, "dependency_structur": 51, "dependencydefinit": [6, 7, 51, 56], "deploi": [14, 25, 29, 38], "deploy": [14, 17, 18, 19, 28, 29, 38, 55], "deploy_local_job_packag": 14, "deploy_local_pipeline_packag": 14, "deploy_mod": 42, "deploymod": [14, 38], "deprec": [6, 14, 22, 38, 48, 55], "deqeueu": 9, "dequeue_interval_second": 9, "deriv": 9, "descend": [1, 6, 7, 11, 51], "describ": [3, 7, 9, 10, 11, 20, 22, 28, 29, 47, 49, 56], "descript": [1, 3, 4, 5, 6, 7, 9, 10, 11, 13, 14, 20, 22, 34, 35, 38, 39, 42, 45, 46, 48, 49, 50, 51, 53, 54, 55, 56, 57], "descriptor": 49, "deseri": [10, 55], "design": [1, 6, 10, 56], "desir": [12, 15, 22, 24, 29, 56], "destin": 20, "destination_t": [12, 24], "destruct": [14, 38], "detail": [12, 14, 16, 17, 18, 20, 22, 24, 38, 40, 41, 49, 56], "detect": [14, 38], "determin": [1, 2, 4, 6, 7, 11, 20, 22, 25, 34, 49, 50, 51, 55, 56, 57], "determinist": [49, 56, 57], "dev": [3, 14, 16, 17, 18, 20, 29], "dev_s3": 3, "develop": [25, 27, 45, 57], "devstorag": 25, "dfoo": 25, "dict": [1, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 28, 29, 30, 34, 36, 38, 39, 40, 41, 45, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58], "dictionari": [1, 3, 5, 6, 7, 8, 10, 11, 22, 24, 48, 49, 50, 51, 53, 54, 55, 56, 58], "dictread": 57, "dictwrit": 57, "did": [56, 58], "died": 9, "differ": [1, 3, 6, 10, 11, 14, 16, 17, 18, 22, 38, 39, 49, 50, 51, 55, 56], "dir": [1, 9, 14, 22, 25, 38], "direct": [1, 6, 7, 11, 14, 38, 51], "directli": [2, 3, 6, 7, 9, 13, 14, 15, 21, 22, 32, 36, 38, 39, 45, 46, 49, 51, 53, 55, 56], "directori": [1, 2, 9, 10, 13, 14, 15, 16, 20, 22, 25, 38, 41, 53, 58], "dirnam": 4, "disabl": [9, 14, 16, 17, 18, 22, 23, 29, 38, 
39, 41], "disable_schedule_on_trigg": [22, 24], "discret": [7, 11, 50], "disk": [9, 14, 20, 25, 38, 49, 56], "diskconfig": 25, "dispatch": [6, 8, 28, 46], "displai": [1, 2, 4, 14, 22, 34, 38, 40, 49, 55, 56, 57], "distcp": 25, "distinct": 7, "distinguish": [4, 9, 34, 57], "distribut": [14, 15, 18, 19, 21, 25, 29], "div_2": 57, "div_i": 57, "divid": [14, 38], "dkr": 29, "do_someth": [8, 10], "do_something_on_failur": 8, "do_something_on_success": 8, "do_stuff": 10, "doc": [9, 12, 14, 16, 17, 18, 20, 22, 24, 25, 26, 29, 40, 42], "docker": [13, 16, 18, 29], "docker_executor": 23, "docker_image_tag": 30, "docker_job": 23, "docker_password": 17, "dockeroper": 13, "dockerrunlaunch": 23, "docstr": [49, 56], "document": [2, 3, 9, 12, 14, 20, 21, 22, 24, 33, 38, 40, 42, 52, 58], "doe": [5, 6, 7, 9, 14, 22, 23, 28, 34, 38, 49, 50, 51, 54, 55, 56, 57], "doesn": 41, "dog": [6, 56], "dogstatsd": 21, "don": 55, "done": [9, 32, 36, 57], "doubl": [14, 38], "down": [1, 2, 6, 7, 11, 14, 16, 20, 21, 28, 38, 51, 54], "download": [9, 15, 41], "downstream": [1, 4, 6, 7, 10, 11, 22, 45, 49, 51, 56], "downtim": 55, "draw": [18, 29], "drive": 25, "driver": [14, 20, 25, 38], "driver_node_type_id": 20, "driverloglevel": 25, "drop": [14, 22, 38], "dry": 29, "dspark": 20, "dtype": 34, "dublin": 34, "due": [14, 15, 38], "dump": [1, 6, 7, 11, 14, 38, 39, 49, 51, 56], "dump_profil": [14, 38], "dunderfil": 58, "durat": [14, 19, 38, 50], "dure": [5, 6, 8, 9, 14, 28, 38, 39, 41, 46, 49, 54, 56], "dynam": [6, 11, 14, 20, 38, 39, 41, 45, 50, 51, 53], "dynamic_partitioned_config": 50, "dynamicalloc": [14, 38], "dynamicout": 4, "dynamicoutput": [4, 6, 56], "each": [1, 3, 4, 6, 7, 9, 10, 11, 12, 14, 15, 16, 18, 19, 20, 22, 23, 24, 25, 29, 38, 45, 46, 47, 49, 50, 51, 54, 55, 56], "eager": 9, "earlier": 55, "eas": 49, "easi": [3, 6, 9, 11, 51], "easier": 14, "easiest": [9, 10, 46, 49, 54, 56], "easili": 35, "east": 14, "east1": 25, "eastern": 34, "echo": [39, 49], "echo_2": 49, "ecr": 29, "ecs": 14, "ecsrunlaunch": 14, "edg": 51, "edit": [7, 11], "effect": [14, 29, 38, 49, 56], "effici": [14, 38], "egg": [14, 25, 38], "either": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13, 14, 15, 20, 22, 24, 28, 29, 34, 38, 40, 41, 46, 49, 50, 51, 54, 55, 56, 57], "elaps": 55, "element": [3, 5, 14, 29, 38], "elimin": 2, "els": [25, 57], "email": 29, "embed": 58, "emit": [6, 9], "emit_1": 57, "emit_2": 57, "emit_3": 57, "emit_metadata": [49, 56], "emit_two_four": 51, "empti": [2, 14, 18, 25, 29, 51], "empty_str": 57, "emr_pyspark_step_launch": 14, "emr_stag": 14, "emrclusterst": 14, "emrerror": 14, "emrjobrunn": 14, "emrstepst": 14, "enabl": [1, 7, 9, 11, 14, 16, 17, 18, 20, 23, 25, 38, 41, 47, 49, 51, 56], "enable_elastic_disk": 20, "enable_encrypt": 20, "enablecompress": [14, 38], "enablekerbero": 25, "encapsul": [13, 25, 29, 49, 56], "encod": [1, 2, 6, 7, 11, 20, 22, 39, 49, 51, 56, 58], "encrypt": [20, 25], "encryption_typ": 20, "encryptionconfig": 25, "end": [1, 6, 7, 16, 27, 30, 49, 50, 51, 55, 56], "end_mlflow_on_run_finish": 30, "end_mlflow_run_on_pipeline_finish": 30, "end_offset": [50, 55], "endpoint": [12, 14, 20, 22, 24, 38], "endpoint_url": 14, "enforc": [5, 13, 14, 34, 38], "enforce_ord": 34, "engin": [6, 9, 25, 41], "engine_ev": 6, "engine_event_data": 9, "engineev": 9, "enough": [2, 14, 38], "enqueu": 9, "ensur": [9, 13, 20, 21, 29, 39, 41, 55], "entail": 16, "enterpris": 27, "entir": [14, 22, 38, 50, 51], "entireti": [14, 38], "entiti": [14, 38], "entri": [1, 2, 3, 4, 9, 10, 14, 22, 25, 38, 49, 56], "entry_data": [49, 56], "enum": [3, 5, 
14, 28, 56], "enum_valu": 3, "enumer": 14, "enummeta": 3, "enumvalu": 3, "env": [3, 12, 15, 17, 22, 24, 29, 30, 41], "env_config_map": [18, 29], "env_secret": [18, 29], "env_to_tag": 30, "env_var": [17, 18, 23, 29], "envfrom": [18, 29], "environ": [1, 2, 3, 6, 11, 13, 14, 15, 17, 18, 20, 23, 25, 27, 28, 29, 30, 32, 36, 38, 50, 51, 52, 55, 58], "environment": 20, "environment_var": [50, 55], "ephemer": [2, 6, 7, 9, 11, 16, 25, 26, 45, 51, 54, 55, 56], "equal": [14, 38, 50, 55], "equival": [3, 25, 52, 57], "error": [1, 2, 3, 6, 9, 10, 12, 14, 24, 28, 31, 33, 38, 40, 45, 46, 49, 52, 55, 56, 58], "error_cl": 9, "error_info": 9, "error_object": 28, "error_toler": 34, "especi": [14, 16, 17, 18, 38], "essenti": [14, 38], "establish": 57, "estim": [14, 38], "etc": [6, 14, 18, 22, 25, 29, 38, 50, 56, 58], "europ": 34, "evalu": [34, 50, 53, 55], "evaluate_tick": 55, "evaluatevalueresult": 9, "evaluation_fn": 55, "even": [14, 38, 41, 55], "event": [2, 4, 5, 6, 8, 10, 14, 18, 21, 22, 29, 31, 32, 33, 36, 38, 40, 45, 51, 55, 58], "event_act": 33, "event_list": [51, 56], "event_log": [9, 32, 36], "event_log_entri": 9, "event_log_storag": [9, 32, 36], "event_metadata_fn": 34, "event_specific_data": 6, "event_storag": 9, "event_storage_data": 9, "event_typ": [6, 9], "event_type_valu": 6, "eventlog": [14, 38], "eventlogentri": [9, 55], "eventlogrecord": 9, "eventlogstorag": 9, "eventqueu": [14, 38], "eventrecordsfilt": 9, "events_for_nod": 6, "eventu": [9, 49, 56], "eventv2_cr": 33, "everi": [14, 16, 17, 18, 20, 38, 43, 55], "evict": [14, 38], "exact": [14, 34, 38], "exactli": [3, 20], "examin": [49, 56, 58], "exampl": [1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 24, 25, 27, 28, 29, 30, 31, 33, 38, 39, 40, 41, 46, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58], "example_adls2_op": 15, "example_external_task_marker_child": 13, "example_job": 14, "example_mapping_kei": 4, "example_pig_oper": 13, "example_pipelin": 52, "example_preset": 52, "example_redshift_op": 14, "example_s3_op": 14, "example_secretsmanager_op": 14, "example_secretsmanager_secrets_op": 14, "example_secretsmanager_secrets_op_2": 14, "example_skip_dag": 13, "example_trigger_target_dag": 13, "example_xcom": 13, "exampleenum": 3, "exc_info": 5, "exceed": [14, 22, 38], "except": [3, 5, 6, 7, 8, 11, 14, 22, 25, 28, 38, 41, 46, 49, 51, 56], "excess": [14, 38], "excit": 9, "exclud": [14, 22, 38, 51, 56], "execut": [1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 23, 24, 25, 28, 29, 38, 39, 41, 43, 45, 46, 47, 48, 50, 52, 54, 55, 57, 58], "execute_in_process": [3, 5, 6, 7, 11, 13, 14, 21, 27, 30, 33, 40, 41, 55], "execute_pipelin": [13, 31, 51, 52], "execute_pipeline_iter": 51, "execute_plan": 16, "execute_queri": [14, 41], "execute_solid": 56, "execute_solid_within_pipelin": 56, "execute_solids_within_pipelin": 56, "executeinprocessresult": [6, 7, 11], "execution_d": 13, "execution_fn": 55, "execution_plan": 9, "execution_plan_snapshot_id": 9, "execution_time_to_partition_fn": 50, "execution_timezon": [50, 55], "executionplan": 9, "executor": [1, 3, 5, 11, 14, 15, 16, 17, 18, 19, 20, 23, 25, 29, 38, 48, 49, 51, 54, 56], "executor_config": 9, "executor_creation_fn": 9, "executor_def": [1, 6, 7, 9, 11, 16, 17, 18, 19, 23, 29, 48], "executor_id": [14, 38], "executorallocationratio": [14, 38], "executordefinit": [1, 3, 6, 7, 9, 11, 16, 17, 18, 19, 23, 29, 48, 51], "executoridletimeout": [14, 38], "executorrequir": 9, "exist": [2, 3, 5, 6, 7, 9, 14, 18, 19, 20, 25, 28, 34, 38, 41, 47, 50, 51, 52, 54, 55, 56, 57], 
"exit": [2, 14, 22, 25, 38, 39, 54], "expect": [9, 12, 14, 16, 17, 18, 22, 24, 25, 28, 34, 38, 39, 49, 56, 57], "expectation_events_during_comput": 56, "expectation_results_during_comput": 56, "expectationresult": [6, 45, 49, 56], "expens": [9, 14, 38], "expensive_job": 53, "expensive_schedul": 53, "experi": [29, 30], "experienc": 55, "experiment": [4, 9, 14, 38, 49, 54, 55, 56, 57], "experiment_nam": 30, "experimentalwarn": 58, "expir": 13, "explicit": [7, 56, 57], "explicitli": [1, 6, 7, 9, 10, 49, 56, 57, 58], "explod": 56, "explor": 45, "exponenti": [49, 56], "export": [2, 9, 20, 29], "expos": [3, 12, 16, 17, 18, 22, 24, 29, 46, 56], "express": [14, 34, 38, 49, 50], "ext": 9, "extend": [10, 14, 38, 50, 55], "extens": 9, "extern": [9, 14, 20, 25, 29, 38, 54, 55, 57], "external_pipeline_origin": 9, "external_version_fn": 57, "externalpipelineorigin": 9, "extra": [14, 20, 22, 30, 38], "extra_tag": 30, "extraclasspath": [14, 38], "extract": 25, "extrajavaopt": [14, 20, 38], "extralibrarypath": [14, 38], "extralisten": [14, 38], "extras_requir": 9, "face": [9, 14, 38], "facil": 57, "factori": [11, 22, 39, 41], "fail": [2, 5, 8, 12, 14, 18, 20, 22, 24, 25, 28, 29, 31, 34, 38, 40, 49, 54, 55, 56, 57], "fail_fast": 22, "fail_pod_on_run_failur": [18, 29], "failur": [2, 4, 8, 9, 14, 20, 22, 28, 31, 38, 39, 40, 45, 49, 55, 56], "failure_data": 56, "failure_ev": [31, 40, 55], "failure_hook": 8, "failure_typ": 28, "fair": [14, 38], "fake": 15, "fake_redshift_resourc": 14, "fakeadls2resourc": 15, "fall": [18, 25, 29, 55], "fallback": 40, "fals": [3, 5, 9, 12, 13, 14, 15, 20, 22, 25, 28, 29, 34, 38, 41, 43, 47, 49, 50, 56, 57, 58], "fan": [51, 57], "fast": [14, 16, 17, 18, 22, 38], "faster": [14, 38], "fatal": [22, 25, 46], "favor": 48, "featur": [9, 14, 16, 20, 25, 38], "feedback": 29, "femal": 21, "fetch": [9, 14, 22, 38, 41], "fetch_result": [14, 41], "fetchfailur": [14, 38], "fetchtimeout": [14, 38], "few": [14, 38], "fewer": [14, 38], "fh_1": 9, "fh_2": 9, "field": [3, 4, 5, 9, 13, 14, 18, 20, 22, 29, 32, 36, 38, 39, 49, 53, 56, 57], "field_alias": 3, "field_util": 3, "fieldnam": 57, "file": [1, 2, 3, 4, 10, 13, 14, 15, 16, 18, 22, 25, 29, 32, 36, 38, 39, 41, 43, 45, 49, 52, 53, 56, 57, 58], "file_handl": 9, "file_manag": [9, 45, 57], "file_nam": 53, "file_obj": 9, "file_relative_path": [4, 39, 58], "file_result": 4, "file_system": 15, "filehandl": [9, 45, 57], "filemanag": [9, 14, 15, 25, 45, 57], "filenam": [4, 14], "filenotfounderror": 58, "fileoutputcommitt": [14, 38], "filepath": [10, 49, 56], "files_in_directori": 4, "files_pipelin": 9, "filesystem": [1, 6, 7, 9, 10, 11, 14, 15, 25, 38, 49, 56, 57], "fileuri": 25, "fileystem": [14, 20], "fill": 50, "filter": [2, 9, 14, 34, 38, 57], "filter1": [14, 38], "final": [1, 14, 22, 25, 38, 40], "final_foobar_st": [12, 24], "find": [9, 13, 14, 16, 22, 24, 27, 29, 33, 38], "fine": [27, 53], "finish": [14, 30, 38, 40], "fire": 55, "firewal": [14, 38], "first": [2, 5, 6, 14, 21, 22, 27, 29, 33, 34, 38, 40, 41, 46, 49, 50, 53, 55, 56, 57], "fit": [14, 38], "fivetran_api_kei": 24, "fivetran_api_secret": 24, "fivetran_asset": 24, "fivetran_resourc": 24, "fivetran_sync_op": 24, "fivetranoutput": 24, "fivetranresourc": 24, "fix": [2, 14, 38, 50], "fixed_server_id": 2, "fixtur": 29, "flag": [2, 9, 16, 22, 25, 29, 34, 41], "flake": [49, 56], "flakey_oper": [49, 56], "flat_asset_kei": [49, 56], "flavor": 25, "flexibl": [49, 56], "float": [3, 4, 5, 9, 12, 14, 18, 20, 22, 24, 31, 34, 38, 49, 55, 56, 57], "float_column": 34, "floatmetadatavalu": [49, 56], "flow": 
[6, 7, 14, 34, 49, 51, 56, 57], "flower": [16, 29], "flush": [14, 38], "flux": 9, "fmt": [50, 55], "follow": [3, 4, 6, 9, 10, 13, 14, 15, 16, 17, 18, 19, 22, 23, 25, 27, 29, 36, 38, 49, 50, 51, 52, 55, 56], "foo": [1, 6, 8, 9, 10, 11, 14, 25, 31, 38, 40, 49, 51, 53, 54, 56, 57], "foo_job": [6, 11, 51], "foo_job_arg": 11, "foo_job_kwarg": 11, "foo_resourc": 1, "foobar": [3, 12, 24], "footprint": [14, 38], "for_run_failur": 55, "forc": [14, 38], "forefront": 1, "fork": [14, 38, 58], "forked_pdb": [6, 56, 58], "forkedpdb": [6, 56, 58], "form": [1, 14, 20, 38, 51, 53], "format": [3, 7, 8, 11, 12, 14, 22, 24, 25, 28, 31, 34, 38, 40, 41, 50, 53, 55, 57], "forver": 22, "forward": [5, 14, 17, 23, 29, 38], "found": [5, 13, 14, 16, 20, 22, 24, 27, 28, 29, 38, 47, 49, 51, 56], "foundat": 49, "four": [14, 16, 38, 51], "fraction": [14, 38], "fragment": [6, 9], "framework": [5, 6, 7, 9, 13, 25, 28, 39, 49, 56], "free": [14, 38], "freeli": 5, "frequenc": [14, 38], "frequent": [14, 16, 17, 18, 20, 38], "fresh": [2, 22], "friend": [9, 57], "from": [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 45, 47, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58], "from_config_valu": 9, "from_current_modul": 1, "from_def": 54, "from_fil": 52, "from_modul": 1, "from_package_modul": 1, "from_package_nam": 1, "from_pkg_resourc": 52, "from_python_enum": 3, "from_val": 54, "from_yaml_str": 52, "front": [14, 38], "frozenset": [20, 34, 42, 51, 56], "fs_asset_io_manag": 1, "fs_io_manag": [1, 10], "fspath": [49, 56], "full": [14, 16, 20, 22, 23, 25, 38, 49, 51, 56], "full_control": 25, "full_refresh": 22, "fulli": [9, 14, 22, 38, 46, 49, 54, 56], "fulltrac": 29, "function": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 22, 24, 31, 34, 35, 38, 39, 40, 41, 46, 49, 51, 53, 54, 55, 56, 57, 58], "further": [14, 20, 38], "futur": 53, "gain": 9, "garbag": [14, 38], "gate": 9, "gatewai": 37, "gather": [4, 6, 11], "gaug": 21, "gceclusterconfig": 25, "gcepdkmskeynam": 25, "gcloud": 29, "gcp": 29, "gcs": 25, "gcs_bucket": 25, "gcs_file_manag": 25, "gcs_kei": 25, "gcs_path": 25, "gcs_pickle_asset_io_manag": 25, "gcs_pickle_io_manag": 25, "gcs_prefix": 25, "gcs_resourc": 25, "gcsfilehandl": 25, "ge_data_context": 26, "ge_validation_op_factori": 26, "ge_validation_solid_factori": 26, "gen": 15, "gen2": 15, "gender": 21, "gener": [1, 2, 5, 6, 7, 9, 10, 12, 13, 14, 15, 18, 20, 22, 24, 25, 26, 29, 34, 38, 39, 41, 49, 50, 51, 55, 56, 57], "generate_doc": 22, "generate_materi": 22, "get": [2, 6, 9, 10, 13, 14, 16, 18, 20, 21, 22, 24, 25, 28, 29, 38, 45, 51, 52, 53, 56, 58], "get_addresses_for_step_output_vers": 9, "get_all_job": 53, "get_all_pipelin": 53, "get_batch": 26, "get_connect": 41, "get_connector_detail": 24, "get_connector_sync_statu": 24, "get_context": 45, "get_daemon_heartbeat": 9, "get_dag": 13, "get_dagster_logg": 58, "get_environment_yaml": 52, "get_input_asset_kei": 10, "get_input_asset_partit": 10, "get_job": [22, 53], "get_job_failure_ev": [6, 55], "get_job_success_ev": [6, 55], "get_logged_ev": 10, "get_logged_metadata_entri": 10, "get_manifest": 22, "get_mapping_kei": [6, 56], "get_modul": 51, "get_observ": 10, "get_on": 41, "get_output_asset_kei": 10, "get_output_asset_partit": 10, "get_output_event_for_comput": 56, "get_output_events_for_comput": 56, "get_output_identifi": 10, "get_partit": 50, "get_repo_id": 27, "get_resource_vers": 47, "get_run": 22, "get_run_artifact": 22, "get_run_config_for_partition_kei": 50, 
"get_run_result": 22, "get_run_scoped_output_identifi": 10, "get_run_statu": 28, "get_run_step": 22, "get_secret_valu": 14, "get_solid_vers": 47, "get_step_success_ev": 56, "get_system_temp_directori": [14, 15], "get_tag": [6, 45, 56], "get_template_context": 13, "getdbt": 22, "getenv": [14, 16, 17, 18, 27, 31, 40], "getrunbyid": 22, "giant": [14, 38], "github": 43, "github_app_id": 27, "github_app_private_rsa_kei": 27, "github_hostnam": 27, "github_installation_id": 27, "github_job": 27, "github_op": 27, "github_private_kei": 27, "github_resourc": 27, "give": [1, 6, 14, 15, 20, 28, 38, 41, 56], "given": [1, 6, 9, 10, 12, 13, 14, 20, 22, 24, 28, 31, 34, 38, 40, 47, 49, 50, 51, 52, 53, 54, 55, 56, 57], "gke": 29, "glob": [14, 38, 52, 58], "global": [25, 54, 57], "goe": [14, 38], "going": [14, 22, 38, 49, 56], "good": [3, 14, 15, 25, 27, 34, 38], "googl": 25, "googleapi": 25, "gql": 28, "grab": 9, "gracefulli": [14, 38], "grain": [14, 38, 53], "grandchild": [49, 56], "grant": [25, 27], "graph": [1, 3, 4, 5, 11, 12, 14, 20, 24, 38, 39, 41, 51, 56], "graph_a": 51, "graph_def": [6, 7, 11, 51], "graphdefinit": [5, 6, 7, 11, 40, 46, 49, 51, 55], "graphin": 7, "graphout": 7, "graphql": [9, 29, 31, 32, 36, 40, 50, 55], "graphx": [14, 38], "great_expect": 26, "greater": [9, 14, 20, 38], "greatexpect": 26, "green": 3, "group": [1, 14, 25, 53], "groupid": [14, 38], "grow": [14, 38], "grpc": 9, "grpc_host": 2, "grpc_port": 2, "grpc_socket": 2, "gserviceaccount": 25, "guarante": [9, 14, 20, 38], "guest": [16, 17, 18], "guid": [14, 19, 25, 27, 29, 34, 38], "had": [14, 38], "hadoop": [14, 25, 38], "hadoopjob": 25, "hand": [14, 38, 40], "handi": 35, "handl": [1, 10, 14, 24, 38, 39, 45, 46, 49, 51, 56, 57], "handle_input": 10, "handle_output": [10, 47], "handle_str": [51, 56], "handled_output": [6, 10], "handler": [14, 35, 46], "hang": 41, "happen": [5, 14, 38, 55], "happi": 9, "hard": [9, 14, 25, 38, 46], "hardcod": [10, 54], "hardcoded_io_manag": 10, "hardcoded_resourc": 54, "has": [1, 3, 5, 6, 7, 9, 10, 12, 14, 15, 20, 22, 25, 28, 31, 34, 38, 49, 50, 51, 52, 53, 56, 57, 58], "has_error": [49, 56], "has_input_nam": 10, "has_job": 53, "has_output": 47, "has_partition_kei": [6, 10, 56], "has_tag": [6, 45, 56], "have": [4, 5, 6, 9, 10, 12, 14, 20, 22, 24, 25, 28, 29, 34, 35, 38, 41, 47, 49, 53, 54, 55, 56, 57, 58], "haw": 3, "hcf": 25, "hdf": [14, 25, 38], "hdfs_user_guid": 25, "heap": [14, 38], "heartbeat": [9, 14, 38], "heartbeat_timeout": 2, "heartbeatinterv": [14, 38], "hei": 40, "hello": [3, 31, 39, 49, 56, 57], "hello_op": 58, "hello_world": [39, 49, 56], "hello_world_daily_schedul": 50, "hello_world_partition_set": 50, "hello_world_pipelin": 50, "hello_world_with_default": 3, "help": [9, 10, 14, 16, 38, 53, 54], "helper": [10, 54], "here": [6, 7, 14, 16, 17, 18, 20, 21, 22, 24, 25, 27, 29, 33, 38, 40, 47, 49, 51, 56], "heurist": 13, "hierarch": [49, 56], "high": [14, 38], "higher": [6, 14, 38], "highlight": [49, 56], "highlycompressedmapstatu": [14, 38], "hint": 57, "histogram": 21, "histor": [2, 5, 24], "histori": [2, 9, 14, 38, 50], "hit": [14, 38], "hive": 25, "hivejob": 25, "hoc": 13, "hold": [3, 49], "home": [22, 25], "honor": [14, 41], "honua": 3, "hook": [7, 9, 11, 30, 31, 40, 51], "hook_complet": 6, "hook_def": [6, 8, 11, 31, 40, 51], "hook_error": 6, "hook_fn": 8, "hook_skip": 6, "hook_to_invok": 8, "hook_url": 31, "hookcontext": [8, 31, 40], "hookdefinit": [8, 30, 31, 40, 51], "hope": [16, 17, 18], "host": [2, 12, 14, 20, 21, 22, 28, 38, 43], "hostnam": [2, 9, 14, 16, 25, 27, 28, 32, 36, 
38], "hour": [25, 50, 55], "hour_of_dai": [50, 55], "hour_offset": [50, 55], "hourli": [14, 38, 50, 55], "hourly_partitioned_config": [50, 55], "hous": [14, 38], "how": [1, 3, 6, 7, 10, 11, 14, 16, 20, 21, 22, 38, 51, 55, 56], "howev": [3, 14, 22, 28, 38, 39, 53], "href": 22, "html": [14, 16, 17, 18, 20, 23, 25, 26, 38, 42], "http": [12, 14, 16, 17, 18, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 38, 40, 41, 42, 43, 49, 50, 55, 56], "http_proxi": 31, "https_proxi": 31, "human": [3, 6, 7, 10, 14, 35, 39, 46, 48, 49, 50, 51, 54, 55, 56], "hydrat": [14, 57], "hyphen": 25, "iam": 25, "iana": [50, 55], "idea": [14, 27, 38], "idempot": [16, 17, 18, 20], "idempotency_token": 20, "ident": [54, 57], "identifi": [4, 7, 10, 11, 34, 48, 49, 50, 55, 56, 57], "identity_imp": 57, "identity_partition_selector": 50, "idl": [14, 38], "ids": 13, "ietf": 25, "ifnotpres": [18, 29], "ignor": [3, 6, 9, 11, 14, 38], "ignore_handled_error": 22, "ignore_missing_v": 34, "illeg": [14, 38], "imag": [13, 17, 18, 23, 25, 29], "image_nam": [17, 18], "image_pull_polici": [18, 29], "image_pull_secret": [18, 29], "image_vers": 25, "imagepullpolici": 29, "imageuri": 25, "imagevers": 25, "immedi": [9, 14, 38], "immun": [14, 38], "impact": [14, 38], "implement": [6, 7, 9, 10, 11, 12, 14, 15, 16, 18, 22, 24, 25, 34, 38, 47, 48, 49, 51, 55, 56], "implementor": 9, "implicitli": 57, "import": [1, 2, 3, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 27, 29, 30, 31, 38, 39, 40, 41, 49, 50, 51, 54, 56, 58], "import_df_to_bq": 25, "import_file_to_bq": 25, "import_gcs_paths_to_bq": 25, "imprecis": [14, 38], "improv": [9, 14, 38, 41], "in_process": [6, 18, 48, 51], "in_process_executor": [6, 51], "inbound": [14, 38], "includ": [1, 5, 6, 7, 9, 11, 12, 13, 14, 16, 17, 18, 20, 22, 24, 25, 28, 29, 31, 33, 38, 40, 49, 50, 51, 52, 55, 56, 57], "include_exampl": 13, "include_rel": 22, "include_sidecar": 14, "inclus": 50, "incom": [14, 31, 38], "incompat": [5, 15, 49, 56], "incorrect": 25, "increas": [14, 38, 41], "increment": [14, 21, 22, 38], "incur": 9, "indefinit": 41, "independ": [25, 54], "index": [2, 9, 14, 22, 38, 49, 50, 51, 53, 55, 56], "indic": [2, 3, 5, 9, 20, 25, 28, 34, 41, 49, 55, 56], "individu": [4, 6, 48], "inf": 34, "infer": [1, 7, 25, 28, 49, 56], "infinit": [14, 38, 41], "info": [2, 9, 10, 14, 16, 22, 25, 27, 38, 45, 46, 49, 56, 58], "inform": [1, 2, 7, 14, 17, 20, 22, 23, 25, 28, 38, 40, 49, 51, 55, 56, 58], "ingest": 47, "inherit": [5, 9, 13, 34, 46, 49, 54, 56, 57, 58], "init": [20, 34, 54, 57], "init_context": [9, 10, 14, 35, 45, 46, 51, 54], "init_script": 20, "initexecutorcontext": 9, "initi": [3, 5, 6, 9, 10, 12, 13, 14, 15, 19, 20, 22, 24, 30, 38, 45, 46, 54, 55, 56, 58], "initial_last_sync_complet": 24, "initialexecutor": [14, 38], "initializationact": 25, "initialr": [14, 38], "initloggercontext": [14, 35, 46], "initresourcecontext": [10, 54], "inject": [5, 18, 22, 29, 45, 49, 56], "inlin": 39, "inner": [9, 46, 49, 54, 56, 57], "inner_nod": 6, "inner_typ": 3, "input": [1, 3, 5, 6, 7, 9, 14, 22, 26, 34, 38, 39, 45, 46, 49, 51, 54, 57], "input1": 10, "input_config_schema": 10, "input_dagster_typ": 26, "input_def": [7, 22, 39, 45, 49, 51, 56], "input_events_during_comput": 56, "input_map": [6, 7, 56], "input_nam": 56, "input_valu": 56, "inputcontext": [10, 49, 56], "inputdefinit": [7, 10, 34, 39, 45, 49, 51, 56, 57], "inputmap": [6, 7, 56], "ins": [1, 7, 9, 10, 49, 57], "insensit": 9, "insid": [1, 12, 14, 22, 24, 25, 38], "inst_data": [9, 14, 15, 32], "instal": [13, 14, 20, 25, 27, 29, 33, 40], 
"installation_id": 27, "instanc": [3, 5, 6, 7, 8, 10, 11, 13, 14, 18, 20, 22, 25, 29, 30, 31, 34, 38, 40, 46, 47, 51, 54, 55, 56, 57], "instance_config_map": [18, 29], "instance_pool_id": 20, "instance_ref": 55, "instance_typ": 9, "instanceof": 57, "instanceref": [9, 55], "instancetyp": 9, "instanti": [6, 9, 14, 15, 22, 32, 35, 36, 46, 51, 54, 56], "instead": [1, 2, 3, 5, 6, 9, 11, 13, 14, 15, 16, 17, 18, 20, 22, 28, 34, 38, 39, 51, 53, 55, 57], "instruct": [22, 27, 29, 33], "insuffici": [14, 38], "int": [1, 3, 4, 5, 6, 9, 12, 13, 14, 20, 22, 24, 25, 28, 34, 43, 49, 50, 51, 53, 54, 55, 56, 57], "integ": [3, 9, 22, 34, 46, 49, 57], "integer_column": 34, "integr": [12, 14, 20, 21, 22, 24, 27, 29, 33, 35, 37, 38, 40, 41, 43, 44], "intend": [6, 7, 9, 14, 18, 22, 28, 49, 51, 53, 56, 57], "intens": 20, "intent": 9, "inter": 2, "interact": [6, 11, 14, 17, 22, 26, 38, 45, 51], "interchang": 56, "interfac": [1, 2, 9, 12, 14, 22, 24], "intermedi": [9, 14, 38], "intern": [2, 6, 10, 14, 22, 24, 25, 28, 32, 34, 36, 38, 48, 51, 56, 57], "internal_asset_dep": 1, "internal_ip_onli": 25, "internaliponli": 25, "interpol": [14, 38], "interrupt": [14, 38], "interv": [9, 14, 22, 38, 50, 55], "intmetadatavalu": [49, 56], "introduc": [16, 17, 18], "introduct": 16, "intsourc": [3, 9, 14, 20, 22, 27, 32, 36, 38, 41, 55], "intuit": [7, 14], "invalid": [5, 22, 28, 49, 52, 56, 58], "invalid_line_no": 22, "invalid_output_nam": 28, "invalid_step_kei": 28, "invalidoutputerror": 28, "invalidoutputerrorinfo": 28, "invalidsteperror": 28, "invari": 5, "invert": 1, "invoc": [1, 6, 7, 8, 11, 13, 29, 46, 51, 54, 55], "invok": [3, 4, 6, 7, 8, 9, 10, 11, 13, 14, 17, 18, 22, 30, 35, 46, 51, 55, 56], "io_manag": [1, 6, 7, 10, 11, 14, 15, 22, 24, 25, 49, 56], "io_manager_kei": [1, 4, 10, 22, 24, 49, 56], "iomanag": [1, 10, 22, 47, 56], "iomanagerdefinit": [1, 10, 14, 15, 25], "iomanagerdefnit": 10, "ipc": 2, "ipc_output_fil": 2, "ipipelin": [9, 51], "is_builtin": [34, 57], "is_pres": [49, 56], "is_requir": [3, 4, 34, 49, 56], "is_valid": [49, 56], "isinst": [10, 57], "isol": [13, 29], "ispreempt": 25, "issu": [14, 33, 38, 41], "item": [3, 4], "iter": [1, 14, 38, 51], "its": [1, 2, 4, 6, 7, 9, 10, 11, 13, 14, 16, 17, 18, 20, 22, 24, 38, 45, 46, 49, 50, 51, 53, 54, 55, 56], "itself": [1, 2, 3, 5, 6, 7, 11, 14, 38, 51], "ivi": [14, 38], "ivy2": [14, 38], "ivyset": [14, 38], "jar": [14, 25, 38], "jar_file_uri": 25, "jarfileuri": 25, "java": [14, 38], "javaseri": [14, 38], "javax": [14, 38], "jitter": [49, 56], "jni": [14, 38], "job": [1, 3, 4, 5, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 33, 35, 38, 40, 41, 42, 46, 47, 49, 50, 51, 53, 54, 55, 56, 57], "job_config": 25, "job_def": [6, 46], "job_id": 22, "job_imag": [18, 29], "job_nam": [8, 9, 13, 28, 50, 53, 55], "job_namespac": [18, 29], "job_runn": 29, "job_scoped_clust": 25, "job_select": [40, 55], "job_wait_timeout": 18, "job_with_all_asset": 1, "job_with_multiple_select": 1, "job_with_one_select": 1, "jobconfigvalidationinvalid": 28, "jobdefinit": [1, 5, 6, 7, 11, 13, 15, 46, 48, 49, 51, 53, 55], "jobfactori": 11, "jobid": 25, "jobnotfounderror": 28, "join": [4, 14, 38, 57], "json": [1, 2, 3, 4, 6, 7, 11, 12, 22, 24, 38, 39, 49, 51, 56], "json_console_logg": 46, "jsonmetadatavalu": [49, 56], "jsonrpc_vers": 22, "jupyt": [6, 11, 45, 51], "just": 3, "jvm": [14, 20, 38], "k8s": 18, "k8s_job": 29, "k8s_job_executor": 29, "k8srunlaunch": [18, 29], "kafka": [14, 38], "kdc": 25, "kdcdbkeyuri": 25, "keep": [14, 29, 38, 41, 50], "keepal": 43, 
"keepalive_interv": 43, "kei": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 33, 34, 38, 43, 45, 48, 50, 51, 54, 55, 57], "kerber": 25, "kerbero": 25, "kerberosconfig": 25, "key_fil": 43, "key_label_nam": 3, "key_str": 43, "key_typ": 3, "keypassworduri": 25, "keystor": 25, "keystorepassworduri": 25, "keystoreuri": 25, "keyword": [3, 22, 49, 54], "kib": [14, 38], "kill": [14, 22, 38], "killblacklistedexecutor": [14, 38], "killen": [14, 38], "killtimeout": [14, 38], "kind": [1, 4, 6, 11, 34, 45, 51, 57], "kit": 40, "kms": 20, "kms_kei": 20, "kmskeyuri": 25, "know": [1, 6, 11, 14, 16, 38, 39, 41, 51], "known": 50, "kryo": [14, 38], "kryoregistr": [14, 38], "kryoseri": [14, 38], "kube": 19, "kubeconfig": [18, 29], "kubeconfig_fil": [18, 29], "kubectl": 29, "kubernet": [9, 14, 16, 19, 28, 38], "kwarg": [3, 5, 6, 7, 9, 11, 13, 22, 39, 46, 54, 56, 57, 58], "lab": 22, "label": [4, 18, 25, 29, 34, 49, 50, 56], "lack": [14, 38], "lake": [15, 20], "lambda": [6, 11, 50, 51], "lambda_solid": [51, 56], "larg": [14, 38], "larger": [14, 38], "last": [6, 10, 14, 22, 24, 38, 50, 55, 56], "last_completion_tim": 55, "last_run_kei": 55, "latenc": [14, 38], "later": [14, 38], "latest": [9, 14, 16, 17, 18, 20, 25, 26, 29, 38, 42], "latter": [9, 46, 49, 54, 56], "launch": [2, 9, 12, 14, 17, 18, 20, 22, 23, 24, 29, 38, 50, 55], "launch_run": 9, "launcher": [2, 6, 14, 18, 20, 23, 29, 56], "launchpipelineexecut": 2, "lazi": [2, 53], "lazili": 53, "lazy_loaded_repositori": 53, "lead": [14, 38], "leader": [14, 38], "leaf": 56, "least": [14, 38], "leav": [3, 14, 29, 38, 51], "left": [12, 24, 29, 40], "legaci": [6, 8, 9, 13, 14, 38, 49, 54], "len": [49, 56, 57], "length": [3, 5, 14, 25, 38, 41], "lengthi": 9, "less": [9, 14, 16, 17, 18, 38, 50], "let": [14, 16, 38], "letter": [25, 49], "level": [1, 2, 3, 6, 7, 9, 10, 11, 14, 16, 22, 25, 34, 38, 46, 49, 51, 56, 58], "lib": 25, "libjar": 25, "librari": [9, 12, 14, 16, 20, 21, 22, 24, 27, 28, 29, 33, 34, 35, 37, 38, 39, 40, 41, 43, 44, 46], "lifecycl": [51, 56], "lifetim": 25, "like": [3, 6, 9, 11, 14, 18, 22, 24, 27, 28, 29, 38, 39, 41, 45, 46, 49, 50, 51, 54, 56], "limit": [9, 14, 38, 56], "line": [14, 22, 38, 52], "lineag": [14, 38, 49, 56], "linear": [49, 56], "lint": 29, "list": [1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 20, 22, 23, 24, 25, 29, 30, 34, 38, 39, 40, 45, 47, 48, 49, 50, 51, 52, 53, 55, 56, 57, 58], "list_file_system": 15, "list_objects_v2": 14, "list_run_artifact": 22, "list_vers": 2, "listdir": 53, "listen": [14, 16, 17, 18, 38], "listenerbu": [14, 38], "liter": 3, "littl": [14, 38], "live": [1, 2, 14, 38, 55], "liveupd": [14, 38], "load": [1, 2, 3, 6, 7, 9, 10, 11, 14, 18, 22, 29, 32, 36, 38, 39, 41, 49, 50, 51, 52, 53, 56, 57, 58], "load_assets_from_dbt_manifest": 22, "load_assets_from_dbt_project": 22, "load_dict": 57, "load_incluster_config": [18, 29], "load_input": [10, 47], "load_kube_config": [18, 29], "load_table_from_local_parquet": 41, "loaded_input": [6, 10], "loader": [3, 34, 57], "loader_vers": 57, "loadrepositori": 2, "local": [1, 9, 14, 15, 16, 17, 19, 20, 23, 38, 51, 52, 57], "local_artifact_storag": [1, 9, 10], "local_artifact_storage_data": 9, "local_bas": [52, 58], "local_compute_log_manag": 9, "local_dagster_job_package_path": 20, "local_dbt_rpc_resourc": 22, "local_dir": [14, 15, 38], "local_disk0": 20, "local_file_manag": 9, "local_job_package_path": 14, "local_output_notebook_io_manag": 45, "local_pipeline_package_path": [14, 20], "local_warehous": [52, 58], 
"localartifactstorag": 9, "localclust": 19, "localcomputelogmanag": 9, "localfilehandl": 57, "localhost": [2, 14, 16, 17, 18, 28, 30, 31, 33, 40], "locat": [9, 14, 18, 20, 25, 28, 29, 38], "log": [2, 5, 6, 8, 10, 12, 14, 15, 16, 18, 20, 22, 24, 25, 29, 30, 32, 35, 36, 38, 45, 51, 54, 56, 58], "log_ev": [6, 10, 56], "log_group_nam": 14, "log_level": [2, 14], "log_manag": [6, 9, 10, 54], "log_materi": [6, 56], "log_param": 30, "log_request": 22, "log_stream_nam": 14, "logblockupd": [14, 38], "logconf": [14, 38], "logger": [3, 6, 7, 11, 12, 14, 22, 24, 35, 43, 45, 48, 51, 56, 58], "logger_config": [14, 35, 46], "logger_def": [7, 11, 45, 46, 48], "logger_fn": [14, 35, 46], "logger_to_init": 46, "loggerdefinit": [3, 7, 11, 14, 35, 46, 48], "logging_tag": [6, 45], "loggingconfig": 25, "logic": [10, 13, 14, 35, 38, 46, 49, 54, 56, 57], "login": [14, 20, 29, 41], "login_timeout": 41, "loglevel": 16, "logs_captur": 6, "logs_start": 22, "logwrit": 25, "long": [14, 15, 20, 25, 38, 55], "longer": [7, 14, 38, 49, 56], "longform": [14, 38], "look": [1, 3, 9, 13, 22, 51, 55], "lookup": [14, 38], "lookuptimeout": [14, 38], "loop": [14, 29, 38], "los_angel": [41, 50, 55], "lost": [14, 38], "lot": [14, 38], "low": 20, "lower": [14, 34, 38], "lowercas": [16, 17, 18], "lsf": 19, "lz4": [14, 38], "lz4compressioncodec": [14, 38], "lzf": [14, 38], "lzfcompressioncodec": [14, 38], "machin": [2, 14, 25, 38], "machineri": [9, 32, 34, 36, 45, 49, 56, 57], "machinetyp": 25, "machinetypeuri": 25, "maco": 29, "macro": 22, "made": [6, 7, 14, 38, 45, 46, 50, 51, 54, 55, 56], "magic": [9, 54], "magic_word": 9, "magicmock": [15, 54], "mai": [1, 3, 5, 6, 7, 8, 9, 11, 14, 15, 16, 17, 18, 20, 22, 25, 29, 38, 39, 41, 45, 46, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58], "main": [20, 25, 34, 38, 40], "main_class": 42, "mainclass": 25, "mainjarfileuri": 25, "mainpythonfileuri": 25, "maintain": 9, "majmin": 29, "make": [2, 3, 7, 9, 10, 14, 15, 16, 17, 18, 20, 25, 27, 38, 41, 50, 53, 54], "make_airflow_dag": 13, "make_airflow_dag_container": 13, "make_airflow_dag_for_oper": 13, "make_airflow_example_dag": 13, "make_bar_job": [6, 11, 51], "make_dagster_job_from_airflow_dag": 13, "make_dagster_pipeline_from_airflow_dag": 13, "make_dagster_repo": 13, "make_dagster_repo_from_airflow_dag_bag": 13, "make_dagster_repo_from_airflow_dags_path": 13, "make_dagster_repo_from_airflow_example_dag": 13, "make_email_on_run_failure_sensor": 58, "make_expensive_job": 53, "make_expensive_schedul": 53, "make_job": 11, "make_python_type_usable_as_dagster_typ": 57, "make_repo_from_dag_bag": 13, "make_repo_from_dir": 13, "make_request": [12, 22, 24], "make_slack_on_pipeline_failure_sensor": 40, "make_slack_on_run_failure_sensor": 40, "make_teams_on_pipeline_failure_sensor": 31, "make_values_resourc": 54, "malform": 5, "man": 21, "manag": [1, 2, 5, 6, 8, 14, 15, 16, 20, 22, 25, 38, 45, 46, 47, 49, 54, 56], "managed_logg": 46, "managedgroupconfig": 25, "mani": [6, 14, 18, 20, 29, 38, 49, 53, 55, 56], "manifest": 22, "manifest_json": 22, "manipul": 57, "manner": 54, "manual": [14, 20, 22, 24, 38, 41], "map": [1, 3, 5, 6, 7, 8, 10, 11, 12, 14, 18, 22, 25, 34, 38, 50, 51, 56, 57], "map_config_op": 3, "mapped_op": 4, "mappedinputplacehold": 51, "mapping_from": 56, "mapping_kei": [4, 6, 10, 56], "mapping_to": 56, "mapr": 25, "mapreduc": [14, 25, 38], "mapreducetutori": 25, "maps_from": 56, "maps_to": 56, "mark": [3, 14, 18, 38], "markdown": [34, 49, 56, 57], "markdownmetadatavalu": [49, 56], "master": [14, 25, 29, 38], "master_url": 42, "masterconfig": 25, 
"match": [5, 9, 14, 22, 34, 38, 47, 49, 50, 54, 56], "materi": [1, 3, 6, 9, 10, 12, 14, 15, 22, 24, 25, 34, 45, 49, 55, 56, 57], "materialization_events_during_comput": 56, "materializations_during_comput": 56, "materialize_df": 57, "math_pipelin": 51, "matter": [14, 38, 55], "maven": [14, 38], "max": [14, 34, 38, 49, 56], "max_attempt": 14, "max_catchup_run": 55, "max_completion_wait_time_second": 20, "max_concurr": 6, "max_concurrent_run": 9, "max_datetim": 34, "max_retri": [22, 49, 56], "max_tick_retri": 55, "max_valu": 34, "max_work": [2, 20], "maxattempt": [14, 38], "maxblocksinflightperaddress": [14, 38], "maxchunksbeingtransf": [14, 38], "maxconsecutiveattempt": [14, 38], "maxexecutor": [14, 38], "maxfailedexecutorspernod": [14, 38], "maxfailedtasksperexecutor": [14, 38], "maxfailur": [14, 38], "maxfailuresperhour": 25, "maxim": [14, 38], "maximum": [2, 9, 12, 14, 20, 22, 24, 25, 38, 49, 55, 56], "maxpartitionbyt": [14, 38], "maxrat": [14, 38], "maxrateperpartit": [14, 38], "maxregisteredresourceswaitingtim": [14, 38], "maxremoteblocksizefetchtomem": [14, 38], "maxreqsinflight": [14, 38], "maxresults": [14, 38], "maxretainedfil": [14, 38], "maxretri": [14, 38], "maxsiz": [14, 38], "maxsizeinflight": [14, 38], "maxtaskattemptsperexecutor": [14, 38], "maxtaskattemptspernod": [14, 38], "md_str": [49, 56], "mean": [3, 4, 14, 34, 38, 40, 55, 58], "meant": [5, 14, 15, 25, 34, 57], "measur": [14, 38], "mechan": [14, 16, 38], "median": [14, 38], "meet": [1, 6, 7, 11, 39, 49, 51, 56], "mega": 1, "mem_io_manag": 10, "member": [5, 9, 53], "memoiz": [2, 10, 51], "memoizableiomanag": 47, "memoizaton": [7, 11], "memoized_run_tag": 47, "memori": [6, 7, 9, 10, 11, 14, 15, 20, 38, 41], "memory_onli": [14, 38], "memory_only_s": [14, 38], "memoryfract": [14, 38], "memorymapthreshold": [14, 38], "memoryoverhead": [14, 38], "merg": [14, 22, 38, 52], "meso": [14, 38], "mesos_sandbox": [14, 38], "messag": [5, 6, 8, 9, 14, 28, 31, 35, 38, 40, 45, 46, 55, 58], "message_fn": [31, 40], "met": 9, "metadata": [1, 4, 6, 7, 9, 10, 11, 14, 22, 25, 26, 28, 38, 39, 41, 47, 51, 57], "metadata_entri": [1, 4, 5, 22, 49, 56, 57], "metadataentri": [1, 4, 34, 49, 56], "metadatavalu": [4, 34, 49, 56], "method": [4, 6, 9, 10, 11, 12, 13, 14, 21, 22, 24, 26, 30, 35, 40, 41, 46, 47, 49, 51, 54, 56], "metric": [14, 21, 25, 38], "mgmt": 14, "mgr": 10, "mib": [14, 38], "midnight": [50, 55], "might": [3, 14, 38, 39, 57], "migrat": [2, 49], "mileston": [14, 38], "millisecond": [2, 14, 38], "min": [20, 34], "min_datetim": 34, "min_valu": 34, "min_work": 20, "minexecutor": [14, 38], "minim": [14, 38], "minimum": [14, 20, 38, 55], "minimum_interval_second": 55, "minrateperpartit": [14, 38], "minregisteredresourcesratio": [14, 38], "minut": [14, 20, 25, 38, 50, 55], "minute_of_hour": [50, 55], "minute_offset": [50, 55], "mirror": 21, "mismatch": 15, "miss": [22, 55], "missing_column": [49, 56], "missing_th": [49, 56], "mitig": [14, 38], "mixin": 9, "mlf_exampl": 30, "mlflow_op": 30, "mlflow_s3_endpoint_url": 30, "mlflow_solid": 30, "mlflow_track": 30, "mlflow_tracking_uri": 30, "mlflowclient": 30, "moab": 19, "mock": [8, 10, 15, 54], "mock_resourc": 54, "mode": [2, 6, 7, 8, 9, 13, 14, 28, 38, 41, 45, 50, 51, 52, 55, 56], "mode_def": [6, 8, 9, 11, 20, 22, 31, 45, 51, 56], "modedefinit": [6, 8, 9, 20, 22, 31, 45, 48, 51, 56], "model": [13, 22], "modifi": [13, 16, 17, 18, 22, 49, 56], "modifyaconnector": 24, "modul": [1, 2, 6, 9, 11, 13, 14, 15, 16, 17, 18, 29, 32, 36, 46, 49, 51, 52, 56, 58], "module_nam": [2, 9, 13], "moduletyp": 1, 
"monitor": [9, 12, 14, 22, 24, 38, 40, 55], "month": [50, 55], "monthli": [50, 55], "monthly_partitioned_config": [50, 55], "more": [5, 6, 7, 14, 16, 20, 22, 25, 27, 38, 40, 41, 49, 54, 55, 56, 58], "most": [14, 16, 17, 18, 20, 22, 24, 34, 38, 51, 55, 56, 57, 58], "mostli": 15, "mount": [14, 18, 29], "mrkdwn": 40, "msg": [46, 49], "msg_fn": 9, "msteams_resourc": 31, "much": [9, 14, 29, 38], "mult_two": 51, "multi": [1, 7, 14, 16, 38], "multi_asset": 1, "multi_or_in_process_executor": [1, 7], "multi_out": 49, "multidependencydefinit": [51, 57], "multipl": [1, 2, 6, 7, 11, 14, 16, 24, 25, 38, 49, 51, 55, 56, 57], "multipli": [14, 38], "multiprocess": [6, 48, 51, 58], "multiprocess_executor": [6, 11, 51], "must": [1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 20, 25, 28, 29, 31, 34, 38, 39, 46, 47, 48, 49, 50, 51, 52, 54, 55, 56, 57], "mutabl": 51, "mutat": [2, 28], "my_airbyte_job": 12, "my_airbyte_resourc": 12, "my_asset": 1, "my_assets_job": 1, "my_aws_key_id": 30, "my_channel": 40, "my_composed_airbyte_job": 12, "my_composed_fivetran_job": 24, "my_custom_dbt_run": 22, "my_custom_path_fs_io_manag": 10, "my_dag_bag": 13, "my_dagster_job": 13, "my_dashboard": [49, 56], "my_dataset": [49, 56], "my_dbt_cli_job": 22, "my_dbt_cloud_job": 22, "my_dbt_cloud_resourc": 22, "my_dbt_output": 22, "my_dbt_rpc_job": 22, "my_downstream_op": 51, "my_experi": 30, "my_first_dbt_model": 22, "my_fivetran_job": 24, "my_fivetran_resourc": 24, "my_funct": [49, 56], "my_graph": [6, 11, 39, 51], "my_int_var": 54, "my_io_manag": [1, 10], "my_io_manager_kei": 10, "my_job": [10, 14, 15, 16, 25, 40, 53, 54, 55], "my_message_fn": [31, 40], "my_modul": [16, 17, 18], "my_new_project": 22, "my_op": [10, 25, 38, 54], "my_other_t": [49, 56], "my_pipelin": [20, 31, 40], "my_project": 29, "my_pyspark_resourc": 38, "my_repo": [17, 18, 31, 40], "my_repo_nam": 13, "my_return_n_": 53, "my_run_config_fn": 50, "my_s3_endpoint": 30, "my_sas_token": 15, "my_schedul": 53, "my_secret": 30, "my_sensor": 55, "my_simple_airbyte_job": 12, "my_simple_fivetran_job": 24, "my_slack_token": 40, "my_snowflake_job": 41, "my_spark": 20, "my_spark_job": 38, "my_storage_account": 15, "my_str_var": 54, "my_tabl": [49, 56], "my_table_schema": [49, 56], "my_text_label": [49, 56], "my_upstream_asset": 1, "my_upstream_graph": 51, "my_upstream_op": 51, "my_us": 17, "my_valu": 22, "my_vari": 22, "myclass": [49, 56], "mycompani": [18, 29], "myconfigurableclass": 9, "mycoolsit": [40, 49, 56], "mycorp": 14, "myiomanag": 10, "mymodul": 11, "mysql_db": 32, "mysql_url": 32, "mysqleventlogstorag": [9, 32], "mysqlrunstorag": [9, 32], "mysqlschedulestorag": [9, 32], "mytabl": [49, 56], "n_worker": 19, "naiv": [25, 34], "name": [1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 34, 38, 39, 40, 41, 42, 45, 48, 49, 50, 51, 52, 53, 55, 56, 57, 58], "name1": [14, 38], "name2": [14, 38], "namedtemporaryfil": 9, "namedtupl": 45, "namespac": [1, 18, 29], "nativ": [14, 38], "necessari": [9, 14, 20, 22, 24, 38, 57], "need": [6, 7, 10, 11, 14, 15, 16, 20, 21, 25, 27, 33, 34, 35, 38, 40, 49, 51, 53, 56, 57, 58], "neg": [6, 14, 38, 49, 56], "neither": [25, 34, 57], "nest": [6, 7, 11, 30, 51, 56], "net": [14, 38], "netti": [14, 38], "network": [9, 14, 15, 17, 23, 25, 38], "network_timeout": 41, "network_uri": 25, "networkuri": 25, "never": [6, 10, 12, 14, 16, 17, 18, 22, 24, 38, 55, 56], "new": [9, 14, 15, 16, 17, 18, 20, 29, 33, 38, 40, 46, 49, 51, 52, 53, 54, 56], "new_clust": 20, "newer": [14, 38], "newli": 9, 
"next": [4, 24, 49, 56], "next_asset": 1, "no_host_key_check": 43, "node": [1, 3, 6, 7, 11, 14, 15, 20, 22, 25, 38, 45, 51, 56], "node_a": 51, "node_b": 51, "node_def": [6, 7], "node_info_to_asset_kei": 22, "node_nam": 6, "node_str": 6, "node_typ": 20, "node_type_id": 20, "nodedefinit": [6, 7], "nodehandl": [6, 51, 56], "nodeinvoc": [6, 7, 51, 56], "nois": 40, "non": [3, 9, 14, 17, 23, 25, 34, 38, 39], "non_argument_dep": 1, "non_nul": 34, "non_scalar_schema": 3, "noncancel": [14, 38], "none": [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 24, 25, 26, 28, 29, 30, 31, 32, 34, 39, 40, 41, 42, 43, 45, 46, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58], "none_resourc": 54, "noneabl": [3, 5], "nonetyp": 3, "nor": [25, 34, 57], "normal": [8, 22, 56], "normalized_nam": 13, "nosigint": 58, "not_start": 9, "note": [1, 3, 9, 10, 14, 15, 16, 17, 18, 20, 22, 27, 28, 32, 33, 36, 38, 39, 41, 46, 49, 53, 55, 56], "notebook": [6, 11, 45, 51], "notebook_path": 45, "noth": [22, 39, 54, 55, 57], "nothing_int_job": 57, "nothing_job": 57, "notic": 46, "notif": 40, "novaluesentinel": [4, 49, 56], "novel": 1, "now": [20, 27, 28, 48], "ntype": 57, "null": [9, 34, 49], "nullabl": 49, "nullable_concat": 57, "num": [6, 7, 14, 38, 49, 51, 56], "num_allowed_row": 34, "num_input": 20, "num_row": [49, 56], "num_work": 20, "number": [2, 6, 9, 12, 14, 19, 20, 22, 24, 25, 28, 34, 38, 41, 49, 50, 55, 56], "numconnectionsperp": [14, 38], "numer": [34, 41], "numeric_column": 34, "numinst": 25, "numlocalssd": 25, "numpi": 41, "numrbackendthread": [14, 38], "numretri": [14, 38], "oar": 19, "oauth": [14, 38], "oauth2accesstoken": 29, "obj": 10, "object": [3, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 20, 22, 24, 25, 26, 28, 29, 31, 34, 38, 40, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56, 57], "object_store_oper": 6, "objectadmin": 25, "objectstreamreset": [14, 38], "observ": [4, 10], "occasion": [14, 38], "occur": [5, 6, 7, 9, 11, 14, 25, 28, 38, 39, 49, 51, 56], "ocsp": 41, "ocsp_response_cache_filenam": 41, "ocur": 6, "of_typ": 34, "off": [2, 14, 24, 25, 29, 38, 47], "offer": [14, 38], "offheap": [14, 38], "offici": [20, 28], "offset": [14, 38, 50], "often": [5, 14, 38], "old": [14, 15, 38], "older": [14, 38], "omit": [14, 22, 25, 28, 34, 38, 56, 57, 58], "onc": [3, 6, 9, 27, 33, 40, 54, 56, 57], "one": [1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 14, 16, 17, 18, 19, 20, 22, 23, 25, 28, 38, 41, 46, 49, 50, 51, 52, 53, 55, 56, 57, 58], "ones": [50, 55], "ongo": 9, "onli": [1, 2, 3, 5, 6, 7, 9, 10, 11, 12, 14, 20, 22, 24, 25, 34, 38, 39, 41, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56, 57], "onlin": 21, "onto": [13, 29, 56], "oom": [14, 38], "op_a": [10, 51], "op_b": [10, 51], "op_c": 51, "op_config": [3, 4, 6, 8, 53, 57], "op_def": 10, "op_definit": [12, 22, 24, 25], "op_except": 8, "op_kwarg": 13, "op_output_valu": 8, "op_retry_polici": [6, 7, 11], "op_select": [6, 7, 11, 28], "op_to_invok": 6, "opdefinit": [8, 10, 12, 20, 22, 24, 25, 39, 41, 49, 56], "open": [9, 14, 27, 38, 57], "opencostinbyt": [14, 38], "oper": [12, 13, 14, 22, 24, 26, 27, 28, 38, 41, 48, 51], "opexecutioncontext": 6, "ops": [1, 3, 4, 6, 7, 10, 11, 14, 15, 20, 21, 22, 25, 26, 33, 34, 39, 41, 46, 51, 53, 54, 56, 57], "opt": [18, 29], "optim": [14, 20, 38], "option": [1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 34, 35, 36, 38, 39, 40, 41, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58], "optionalcompon": 25, "orchestr": [4, 9], "order": [3, 9, 14, 15, 18, 20, 29, 34, 38, 49, 56, 
57], "ordinari": [14, 57], "org": [14, 25, 38, 42, 50, 55], "organ": [49, 56], "origin": [3, 5, 9, 18, 22, 29, 51], "original_exc_info": 5, "original_root": 5, "other": [1, 3, 5, 6, 7, 9, 11, 14, 18, 22, 38, 41, 49, 50, 51, 53, 56, 58], "other_asset": 1, "other_asset_key_a": 1, "other_asset_key_b": 1, "other_expensive_job": 53, "other_op": [12, 24], "other_op_a": [6, 7, 11], "other_op_b": [6, 7, 11], "other_result": 51, "other_solid": [50, 52, 55], "other_solid_a": 51, "other_solid_b": 51, "otherwis": [6, 9, 14, 18, 22, 24, 29, 38, 41, 47, 49, 56], "ought": 34, "our": 9, "out": [1, 2, 4, 6, 7, 10, 12, 14, 22, 24, 29, 38, 56, 57], "outer": 56, "outer_graph": 6, "outliv": 29, "output": [1, 2, 4, 5, 6, 7, 8, 9, 12, 14, 22, 24, 25, 26, 28, 31, 34, 38, 40, 45, 47, 49, 51, 57, 58], "output_asset_partition_kei": [6, 56], "output_asset_partitions_time_window": [6, 56], "output_captur": [6, 51, 56], "output_config_schema": 10, "output_def": [7, 22, 39, 45, 49, 51, 56], "output_events_during_comput": 56, "output_for_nod": 6, "output_for_solid": [51, 56], "output_map": [6, 7, 56], "output_nam": [4, 6, 10, 45, 49, 51, 56], "output_notebook": 45, "output_notebook_nam": 45, "output_valu": [6, 56], "outputcontext": [10, 47, 49, 56], "outputdefinit": [7, 10, 22, 34, 45, 49, 51, 56, 57], "outputmap": [6, 7, 56], "outsid": [9, 10, 14, 29, 38, 54], "over": [2, 14, 21, 22, 28, 38, 50, 53, 55], "overestim": [14, 38], "overhead": [14, 38], "overload": 20, "overrid": [2, 3, 13, 14, 18, 22, 26, 29, 38, 39, 56], "overridden": [14, 22, 31, 38, 40, 50, 55], "override_system_timezon": 2, "overview": [18, 20, 29, 49, 56], "overwrit": [1, 14, 15, 25, 38], "overwritten": [1, 6, 7, 11, 25, 45, 51], "own": [5, 6, 11, 14, 22, 38, 49, 51, 54, 56], "owner": [20, 27], "pack": [14, 38], "packag": [1, 2, 9, 14, 15, 20, 25, 38, 51, 52, 58], "package_modul": 1, "package_nam": [1, 2], "packet": 43, "page": [14, 21, 22, 24, 27, 38], "pagerduty_op": 33, "pagerduty_resourc": 33, "pagerduty_test": 33, "pair": [9, 18, 20, 23, 29, 49, 50, 51, 55, 56], "panda": 22, "pandascolumn": 34, "papertrail_logg": 35, "parallel": [14, 25, 38], "param": [2, 3, 9, 14, 22, 36, 38], "paramet": [1, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 24, 26, 28, 31, 34, 35, 38, 39, 40, 41, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58], "parameter": [7, 11, 50, 55, 56], "parametr": 51, "paramiko": 43, "paramstyl": 41, "parent": [1, 10, 22, 30, 49, 56], "parent_run_id": [9, 30, 51], "parquet": 3, "pars": [3, 5, 6, 8, 9, 12, 22, 24, 52, 56, 57, 58], "part": [4, 27, 31, 49], "parti": 9, "partial": [3, 25], "partially_specified_config": 3, "particular": [1, 6, 9, 10, 14, 38, 50], "particularli": 9, "partit": [1, 2, 6, 7, 9, 10, 11, 14, 38, 49, 52, 56], "partition_fn": 50, "partition_kei": [6, 10, 11, 50, 56], "partition_map": 1, "partition_selector": 50, "partition_set": [50, 53, 55], "partition_set_def": 50, "partitionedconfig": [7, 11, 48, 50], "partitionmap": 1, "partitionmetadataentri": [4, 49, 56], "partitions_def": [1, 7, 11, 50], "partitionscheduledefinit": [50, 55], "partitionsdefinit": [1, 7, 11, 50], "partitionset": 50, "partitionsetdefinit": [50, 53], "pase": 57, "pass": [1, 2, 3, 6, 7, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 28, 29, 30, 31, 34, 38, 39, 40, 45, 46, 49, 50, 51, 53, 54, 55, 56, 57], "password": [9, 14, 17, 18, 23, 25, 29, 32, 36, 41, 43], "past": [9, 50, 55], "patch": [22, 24], "path": [1, 2, 4, 9, 10, 13, 14, 15, 16, 18, 20, 22, 25, 38, 45, 49, 52, 56, 57, 58], "path_desc": [14, 15, 25, 
57], "path_prefix": 2, "pathlik": [49, 56], "pathmetadatavalu": [49, 56], "pattern": [9, 52, 58], "paus": [14, 38], "pawel": 16, "pawelzni": 16, "payload": [24, 31, 33], "pbs": 19, "pdb": [6, 56, 58], "peer": 4, "pem": 14, "pend": [14, 16, 38], "pendingnodeinvoc": 8, "pendulum": 50, "peopl": 38, "per": [1, 6, 9, 10, 14, 19, 25, 29, 38, 55, 56], "perform": [2, 5, 9, 14, 22, 27, 38, 41, 47, 49, 50, 53, 56, 57], "period": [9, 14, 22, 38, 50, 55], "periodicgc": [14, 38], "permiss": [3, 5, 9, 14, 16, 17, 18, 19, 20, 22, 23, 25, 27, 29, 30, 36, 38, 41, 57], "permit": [6, 51], "persist": [9, 14, 15, 20, 25, 29, 34, 38, 49, 51, 55, 56, 57], "person": 27, "photo": 21, "pick": [16, 17, 18, 25], "pickl": [1, 3, 10, 14, 15, 25], "pid": 6, "piec": [9, 14, 20, 38], "pig": 25, "pigjob": 25, "pip": 33, "pipe": 39, "pipelin": [4, 6, 7, 8, 9, 10, 11, 13, 14, 20, 22, 28, 30, 31, 35, 39, 40, 45, 46, 47, 48, 49, 50, 52, 53, 54, 55, 56, 58], "pipeline_cancel": 6, "pipeline_code_origin": 9, "pipeline_context": 45, "pipeline_def": [6, 13, 45, 46, 51, 52, 56], "pipeline_def_for_backwards_compat": 54, "pipeline_dequeu": 6, "pipeline_enqueu": 6, "pipeline_failur": 6, "pipeline_failure_sensor": 55, "pipeline_nam": [6, 8, 9, 10, 13, 28, 31, 40, 50, 51, 55, 56], "pipeline_run": [6, 9, 31, 40, 45, 54, 55, 56], "pipeline_run_statu": 55, "pipeline_select": [40, 55], "pipeline_snapshot_id": 9, "pipeline_start": [6, 51, 56], "pipeline_success": 6, "pipelineconfigurationinvalid": 28, "pipelinedefinit": [6, 11, 13, 40, 45, 46, 48, 51, 53, 55, 56], "pipelineexecutionresult": 51, "pipelinefailuresensorcontext": [31, 40, 55], "pipelinenotfounderror": 28, "pipelinerun": [6, 9, 13, 45, 54, 55, 56], "pipelinerunreact": 55, "pipelinerunstatu": [9, 28, 55], "pkg_resourc": [52, 58], "pkg_resource_def": [52, 58], "place": [9, 10, 14, 22, 29, 34, 38, 49, 57], "placehold": 3, "placement": 25, "plai": 55, "plain": 40, "plan": [9, 13, 14, 18], "plan_context": 9, "plane": [14, 38], "planorchestrationcontext": 9, "platform": 25, "playground": [7, 11], "pleas": [9, 14, 38], "plu": 55, "plug": 9, "pluggabl": [3, 9], "plugin": 9, "plus_minu": [49, 56], "pod": [16, 18, 28, 29], "point": [2, 14, 16, 17, 18, 38, 45], "pointer": 51, "polici": [7, 9, 11, 18, 29, 49, 51, 56], "poll": [12, 14, 20, 22, 24, 38], "poll_interv": [12, 22, 24], "poll_interval_sec": 20, "poll_run": 22, "poll_sync": 24, "poll_timeout": [12, 22, 24], "polling_timeout": 9, "pollinginterv": [14, 38], "pool": [14, 20, 38], "poor": [14, 38], "pop": [6, 10, 56], "popul": 22, "port": [2, 9, 12, 14, 20, 22, 28, 29, 32, 35, 36, 38, 43], "port_numb": 28, "posit": [6, 14, 38, 49, 50, 55, 56, 58], "positional_input": 56, "possibl": [3, 9, 14, 18, 20, 29, 38, 50], "post": [3, 16, 22, 24, 31, 33, 40], "post_messag": 31, "postgr": [9, 18, 29], "postgres_db": [9, 36], "postgres_password_secret": [18, 29], "postgres_url": 36, "postgreseventlogstorag": [9, 36], "postgresql": [18, 29], "postgresrunstorag": [9, 36], "postgresschedulestorag": [9, 36], "postmessag": 40, "postpend": 13, "potenti": [14, 22, 38, 51], "power": 9, "pre": [3, 9, 14, 22, 28, 38], "preambl": 5, "preced": [1, 7, 14, 38, 56], "predefin": [2, 49, 52], "predict": [6, 11, 51], "preemptibl": 25, "prefer": [6, 7, 49, 51, 55, 56], "preferdirectbuf": [14, 38], "prefix": [2, 12, 14, 15, 16, 20, 22, 24, 25, 45], "pregel": [14, 38], "prepend": [14, 38], "presenc": [3, 34, 49, 56], "present": [3, 6, 14, 18, 25, 27, 28, 29, 33, 38, 40, 47, 48, 49, 53, 56], "preserv": [9, 14, 38, 49], "preset": [13, 28, 51, 56], "preset_def": [6, 11, 
51], "presetdefinit": [51, 52], "presetnotfounderror": 28, "pressur": [14, 38], "pretti": 27, "prevent": [14, 38], "preview": [2, 25], "previou": [1, 5, 9, 14, 15, 24, 25, 38, 47, 49, 51, 56], "primarili": [14, 28, 38, 55], "primit": [3, 5], "princip": 25, "print": [2, 29, 49, 54, 56], "printgcdetail": 20, "prior": 24, "prioriti": 6, "priv": 29, "privat": [20, 27, 29], "proactiv": [14, 38], "problem": 22, "proce": 5, "process": [2, 3, 4, 5, 6, 7, 9, 11, 13, 14, 20, 22, 26, 29, 38, 41, 51, 55], "process_directori": 4, "process_fil": 4, "produc": [1, 4, 5, 6, 7, 9, 10, 22, 49, 51, 54, 56, 57, 58], "product": [16, 51], "profil": [14, 20, 22, 38], "profile_nam": 14, "profiles_dir": 22, "program": [14, 25, 38], "programat": [6, 7, 12, 22, 24, 27, 49], "programmat": [34, 57], "progress": [9, 12, 14, 22, 24, 29, 38], "project": [22, 25], "project_and_instance_metadata": 25, "project_dir": 22, "project_id": [22, 25], "projectid": 25, "prometheus_cli": 37, "prometheus_resourc": 37, "prometheusresourc": 37, "proper": [14, 38], "properli": [14, 38, 41], "properti": [6, 8, 9, 10, 14, 15, 22, 24, 25, 34, 38, 45, 49, 51, 53, 56, 57, 58], "protect": [14, 38], "protocol": [21, 31], "provid": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 27, 28, 29, 30, 33, 34, 35, 38, 39, 40, 41, 43, 44, 46, 47, 48, 49, 50, 51, 52, 54, 55, 56, 57, 58], "provis": [20, 25, 27, 33], "proxi": [14, 31, 38], "ptat": [14, 38], "public": [14, 17, 20, 21, 23, 38], "publish": 21, "pull": [14, 18, 22, 27, 29], "purpos": [8, 14, 38, 57], "push": 29, "put": [1, 14, 38, 40, 49, 56], "putobjectacl": 20, "py37": 29, "pyamqp": [16, 17, 18], "pyfil": [14, 38], "pyformat": 41, "pylint": 39, "pyspark": [14, 20, 25], "pyspark_resourc": 38, "pysparkjob": 25, "pytest": 29, "python": [1, 2, 3, 5, 6, 9, 11, 13, 14, 15, 16, 20, 21, 25, 34, 38, 41, 46, 49, 51, 52, 56, 57, 58], "python_artifact": [49, 56], "python_fil": [2, 20], "python_logging_levels_nam": 9, "python_modul": [16, 29], "python_typ": 57, "python_valu": 3, "pythonartifactmetadatavalu": [49, 56], "pythonerror": 28, "pythonfileuri": 25, "pythonobjectdagstertyp": [49, 56, 57], "pythonoper": 13, "pythonpath": [14, 38], "qmark": 41, "qualifi": [14, 38], "qualiti": [49, 56], "quantil": [14, 38], "queri": [1, 2, 6, 7, 9, 11, 14, 21, 22, 25, 28, 41, 51], "query1": 25, "query2": 25, "query3": 25, "query4": 25, "queryfileuri": 25, "querylist": 25, "queu": 9, "queue": [9, 14, 16, 18, 38], "queuedruncoordin": 9, "quickstart": 27, "quit": [14, 29, 38], "quux": 9, "rabbitmq": [16, 17, 18, 29], "rack": [14, 38], "rais": [5, 6, 7, 10, 11, 12, 22, 24, 28, 39, 41, 45, 49, 51, 52, 54, 55, 56, 58], "raise_on_error": [5, 6, 7, 11, 51, 56], "random": [49, 56], "randomli": [29, 49, 56], "rang": [10, 14, 38, 50, 53, 58], "rapidli": [14, 38], "rasset_key_prefix": 22, "rate": [14, 38], "rather": [9, 14, 29, 38, 49, 51, 54, 56, 57], "ratio": [14, 38], "raw": [9, 14, 22, 38], "raw_conn": 41, "raw_output": 22, "rawmetadatavalu": [49, 56], "rbackend": [14, 38], "rdd": [14, 38], "reach": [14, 16, 38], "react": 55, "read": [2, 3, 9, 10, 13, 14, 20, 27, 38, 39, 57], "read_csv": 10, "read_data": 9, "read_fil": 9, "read_writ": 25, "readabl": [3, 6, 7, 10, 14, 35, 39, 46, 48, 49, 50, 51, 54, 55, 56, 57], "readi": 57, "readonli": 25, "readrc": 58, "readthedoc": [16, 23], "real": 16, "realm": 25, "reaper": [14, 38], "reason": [5, 28, 55], "rebuild": 2, "recalcul": 22, "receiv": [2, 5, 14, 34, 38, 57], "receive_processed_config_valu": [3, 56], "recent": [24, 55], "reclaim": [14, 
38], "recommend": [12, 13, 14, 20, 22, 24, 38, 51, 56, 57], "recon_repo": 13, "reconstruct": [13, 14, 38, 56], "reconstruct_context": [51, 56], "reconstruct_job": 11, "reconstructable_arg": 11, "reconstructable_bar_job": [6, 11, 51], "reconstructable_foo_job": [6, 11, 51], "reconstructable_kwarg": 11, "reconstructablepipelin": [6, 11, 51], "reconstructablerepositori": [13, 51], "reconstructor_function_nam": 11, "reconstructor_module_nam": 11, "reconstructor_working_directori": 11, "record": [1, 6, 9, 10, 14, 38, 49, 56], "recov": [14, 38], "recoveri": [14, 38, 55], "recoverymod": [14, 38], "recurs": [3, 5], "red": 3, "redact": [14, 38], "redi": [16, 17, 18], "redshift_configur": 14, "redshift_resourc": 14, "reduc": [14, 38], "reducebykei": [14, 38], "redund": [14, 38], "reexecut": 51, "reexecute_pipelin": 51, "reexecute_pipeline_iter": 51, "ref": [9, 22, 43], "refactor": 56, "refer": [1, 13, 14, 15, 18, 20, 21, 22, 25, 26, 29, 33, 38, 40, 42, 49, 56, 57], "referenc": [1, 14, 49, 56], "referencetrack": [14, 38], "refresh": [13, 22, 28], "refresh_from_airflow_db": 13, "regardless": [14, 38], "regex": [14, 38], "region": [14, 20, 25, 29, 38], "region_nam": 14, "regist": [14, 38], "registr": [14, 38], "registrationrequir": [14, 38], "registri": [17, 23, 29], "regress": [14, 38], "regular": [9, 10, 57], "rehydr": 9, "reindex": 2, "rel": [4, 58], "relat": [4, 14, 22, 25, 38, 47, 49, 56], "relationship": [1, 25], "relative_path": 58, "relaunch": [14, 38], "releas": 25, "relev": [2, 5, 14, 22, 26, 27, 38], "reli": 53, "reliabl": 20, "reload": 28, "reload_repository_loc": 28, "reloadnotsupport": 28, "reloadrepositorylocationinfo": 28, "reloadrepositorylocationstatu": 28, "remain": 7, "remap": 56, "rememb": [14, 38], "remot": [2, 9, 14, 20, 25, 28, 29, 38, 43], "remote_host": 43, "remote_port": 43, "remov": [14, 38], "render": [14, 38], "repeat": [3, 9, 57], "repeat_word": 3, "repl": [6, 11, 51], "replac": [6, 7, 11, 14, 38], "replai": [14, 38], "replenish": [14, 38], "replic": [14, 38], "replica": [14, 38], "repo": [13, 14, 29, 38], "repo_location_nam": 18, "repo_nam": [13, 27], "repo_own": 27, "report": [9, 22, 24, 25], "report_engine_ev": 9, "repositori": [1, 2, 13, 14, 18, 23, 27, 28, 29, 31, 38, 40, 51, 55], "repository_data": 53, "repository_location_nam": 28, "repository_nam": [28, 55], "repositorydata": 53, "repositorydefinit": [1, 7, 13, 51, 53], "repositorylocationloadfailur": 28, "repositorylocationnotfound": 28, "repres": [1, 3, 4, 7, 9, 10, 12, 14, 22, 24, 38, 47, 49, 50, 51, 52, 54, 55, 56], "represent": [3, 6, 9, 14, 15, 22, 49, 51, 56, 57], "request": [2, 10, 12, 14, 20, 22, 24, 25, 27, 28, 31, 38, 41, 49, 56], "request_6": 24, "request_max_retri": [12, 22, 24], "request_retry_delai": [12, 22, 24], "request_token": 22, "requir": [1, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 24, 25, 29, 34, 38, 39, 41, 45, 48, 49, 51, 54, 55, 56, 57], "required_resource_kei": [1, 5, 8, 9, 10, 14, 15, 20, 21, 22, 30, 31, 33, 34, 38, 39, 40, 41, 42, 45, 49, 51, 54, 56, 57], "rerais": 5, "reserv": [1, 10], "reset": [14, 38], "resid": [1, 14, 38, 57], "resolut": [14, 38], "resolv": [2, 3, 5, 14, 38, 54], "resolve_standoff": 3, "resolved_run_config": 45, "resolvedrunconfig": 45, "resourc": [1, 3, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 18, 20, 21, 25, 27, 30, 31, 33, 34, 37, 38, 39, 40, 41, 43, 45, 47, 48, 49, 51, 52, 56, 57, 58], "resource_config": [10, 51, 54], "resource_def": [1, 7, 9, 10, 11, 12, 14, 15, 20, 21, 22, 24, 25, 27, 30, 31, 33, 38, 40, 41, 45, 48, 51, 54], "resource_definit": 
38, "resource_fn": [5, 10, 54], "resource_keys_to_init": 45, "resource_nam": [5, 54], "resource_str": [52, 58], "resource_to_init": 54, "resourcedefinit": [1, 3, 5, 7, 9, 10, 11, 12, 14, 15, 20, 21, 22, 24, 25, 27, 30, 31, 33, 37, 38, 40, 41, 42, 43, 44, 48, 54], "resources_config": [6, 56], "resourceversioncontext": 47, "respect": [2, 14, 16, 20, 29, 41, 45], "respond": 20, "respons": [2, 9, 12, 21, 22, 24, 41], "response_dict": 22, "rest": [12, 22, 24], "restart": [2, 14, 20, 25, 28, 38, 49, 56], "restrict": [20, 25], "result": [1, 2, 3, 4, 7, 9, 10, 11, 12, 13, 14, 16, 17, 18, 21, 22, 24, 25, 28, 38, 41, 45, 46, 49, 50, 51, 54, 55, 56, 57, 58], "result_for_handl": [51, 56], "result_for_solid": [51, 56], "resum": 9, "resume_run": 9, "resync": 24, "resync_and_pol": 24, "resync_paramet": 24, "retain": [14, 38], "retainedbatch": [14, 20, 38], "retaineddeadexecutor": [14, 38], "retaineddriv": [14, 38], "retainedexecut": [14, 38], "retainedexecutor": [14, 38], "retainedjob": [14, 38], "retainedrootrdd": [14, 38], "retainedstag": [14, 38], "retainedtask": [14, 38], "rethrown": 5, "retri": [6, 7, 9, 11, 12, 14, 16, 17, 18, 20, 22, 23, 24, 29, 38, 49, 51, 55], "retriev": [6, 10, 12, 18, 20, 24, 25, 27, 29, 51, 53, 56], "retry_attempt": 56, "retry_interv": 22, "retry_numb": [6, 56], "retry_polici": [49, 56], "retrymod": 9, "retrypolici": [7, 11, 49, 51, 56], "retryrequest": [45, 49, 56], "retrywait": [14, 38], "return": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 22, 24, 26, 28, 31, 34, 38, 39, 40, 41, 45, 46, 47, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58], "return_cod": 22, "return_n": 53, "return_n_": 53, "return_on": [6, 7, 51], "return_text": 22, "reus": [14, 25, 38], "reusabl": 9, "revers": [14, 38], "reverseproxi": [14, 38], "reverseproxyurl": [14, 38], "reviv": [14, 38], "rewritten": [14, 38], "rfc": 25, "rfc1035": 25, "rich": 16, "rigidli": [14, 38], "role": [3, 20, 25, 41], "roll": [14, 38], "root": [9, 14, 25, 29, 38], "root_input_manag": 10, "root_manag": 1, "root_manager_kei": [10, 49, 56], "root_run_id": 9, "rootinputmanag": [10, 49, 56], "rootinputmanagerdefinit": 10, "rootlogg": 25, "rootprincipalpassworduri": 25, "rouberol": 16, "rout": 33, "routing_kei": 33, "row": [34, 49, 56], "rowcountconstraint": 34, "rowdict": 57, "rpc": [14, 16, 17, 18, 38], "rsa": 27, "rule": [3, 5, 34, 57], "run": [1, 5, 7, 8, 10, 11, 13, 14, 16, 17, 18, 20, 21, 22, 23, 24, 25, 26, 27, 28, 30, 31, 32, 36, 38, 40, 45, 46, 47, 49, 50, 52, 54, 56, 58], "run_cancel": 6, "run_config": [3, 6, 7, 9, 11, 13, 14, 20, 21, 27, 28, 30, 33, 40, 41, 45, 51, 52, 53, 55, 56], "run_config_fn": 55, "run_config_fn_for_partit": 50, "run_config_for_partition_fn": 50, "run_coordin": 9, "run_coordinator_data": 9, "run_dbt_cloud_op": 22, "run_dbt_nightly_sync": 22, "run_dequeu": 6, "run_enqueu": 6, "run_failur": 6, "run_failure_sensor": 55, "run_fn": 21, "run_id": [5, 6, 7, 8, 9, 10, 11, 22, 28, 45, 46, 49, 51, 54, 56, 58], "run_job": 22, "run_job_and_pol": 22, "run_kei": [53, 55], "run_launch": [9, 18, 29], "run_launcher_data": 9, "run_nam": 20, "run_oper": 22, "run_result": 22, "run_resultsjson": 22, "run_sql": 22, "run_start": 6, "run_status_sensor": 55, "run_status_sensor_fn": 55, "run_status_sensor_to_invok": 55, "run_storag": [9, 32, 36], "run_storage_data": 9, "run_success": 6, "run_updated_aft": 9, "runawai": [14, 38], "runconfigdata": 28, "runconfigvalidationinvalid": 28, "runconflict": 28, "runcoordin": 9, "runfailuresensorcontext": [40, 55], "runlaunch": [9, 14, 18, 23, 29], "runnabl": 58, "runner": 29, 
"runrequest": [53, 55], "runshardedeventscursor": 9, "runstatussensorcontext": 55, "runstatussensordefinit": 55, "runstorag": 9, "runtim": [3, 4, 5, 7, 11, 14, 20, 22, 25, 34, 38, 46, 56, 57], "runtime_metadata_fn": 22, "s3_bucket": [14, 57], "s3_file": 14, "s3_file_manag": [14, 45], "s3_job_package_path": 14, "s3_kei": [14, 57], "s3_path": [14, 57], "s3_pickle_asset_io_manag": 14, "s3_pickle_io_manag": 14, "s3_pipeline_package_path": 14, "s3_prefix": 14, "s3_resourc": [3, 14], "s3_session": 14, "s3computelogmanag": [9, 14], "s3coordin": 14, "s3filecach": 14, "s3filehandl": [14, 45, 57], "safe": [14, 38, 40, 54], "safe_mod": 13, "safeti": [14, 38], "same": [1, 2, 3, 4, 5, 6, 9, 10, 14, 16, 20, 38, 46, 49, 54, 55, 56, 57], "sampl": [4, 22, 25], "sample_data": 10, "sample_output": 10, "sanit": [49, 56], "sas": 15, "satisfi": [3, 6, 9, 46, 49, 51, 54, 56], "satur": [14, 38], "saturdai": 55, "save": [14, 26, 38], "saveashadoopfil": [14, 38], "scaffold": [2, 13], "scaffold_config": 2, "scala": [14, 38], "scala2": 20, "scalar": [3, 6, 51], "scalar_typ": 3, "scalarunion": 3, "scale": [14, 20, 38], "scan": [14, 38], "scenario": [14, 38], "schedul": [14, 19, 22, 24, 25, 36, 38, 50, 53], "schedule_definit": 50, "schedule_nam": 50, "schedule_storag": [9, 32, 36], "schedule_storage_data": 9, "schedule_typ": 24, "scheduled_execution_tim": 55, "scheduledefinit": [1, 50, 53, 55], "scheduleevaluationcontext": [50, 55], "scheduler_data": 9, "schedulerbacklogtimeout": [14, 38], "schedulestorag": 9, "schema": [3, 5, 7, 9, 10, 11, 12, 14, 22, 24, 28, 34, 35, 41, 46, 49, 54, 56, 57], "schema1": 24, "schema2": 24, "schema_nam": 24, "scheme": [14, 38], "scope": [6, 11, 14, 20, 25, 27, 35, 38, 46, 51, 54], "scoped_resources_build": 6, "scopedresourc": 54, "scratch": [14, 38], "script": [14, 20, 25, 38, 39, 51, 52], "scriptvari": 25, "search": [14, 38, 49, 56], "second": [9, 12, 14, 18, 20, 22, 24, 31, 38, 41, 43, 49, 50, 55, 56], "secondaryworkerconfig": 25, "seconds_to_wait": [49, 56], "secret": [7, 11, 14, 15, 18, 20, 24, 29], "secret_bool_op": 3, "secret_int_op": 3, "secret_job": 3, "secret_kei": 15, "secret_key_kei": 20, "secret_op": 3, "secret_scop": 20, "secretid": 14, "secrets_tag": 14, "secrets_to_env_vari": 20, "secretsmanager_resourc": 14, "secretsmanager_secrets_resourc": 14, "section": [14, 18, 29, 38], "secur": [2, 18, 25, 29], "securili": 27, "securityconfig": 25, "see": [9, 12, 13, 14, 16, 17, 18, 19, 22, 23, 24, 25, 26, 27, 29, 38, 40, 41, 42, 47, 51], "seed": 22, "seek": [9, 14, 38], "seem": 27, "select": [1, 3, 6, 7, 11, 14, 22, 25, 41, 51], "select_color": 3, "selected_unique_id": 22, "selector": [3, 5, 15, 16, 17, 18, 19, 20, 22, 23, 29, 50], "self": [10, 14, 25, 38, 53, 57], "semicolon": 25, "send": [2, 9, 12, 14, 16, 21, 22, 24, 31, 38, 40, 43], "send_messag": 8, "sens": [16, 17, 18], "sensit": [14, 25, 38], "sensor": [31, 40, 53], "sensor_nam": 55, "sensordefinit": [1, 53, 55], "sensorevaluationcontext": 55, "sent": [2, 14, 20, 38, 40, 55], "separ": [4, 9, 10, 14, 25, 29, 38, 51], "sequenc": [1, 49, 50, 51, 55, 56], "sequenti": 20, "serd": [9, 25], "seri": [9, 16], "serial": [1, 9, 10, 14, 15, 25, 38, 55], "serializ": [4, 9, 11, 14, 15, 38, 49, 56], "serializable_error_info_from_exc_info": 9, "serializableerrorinfo": 9, "serv": [2, 14, 28, 38, 50], "server": [2, 9, 12, 14, 20, 21, 22, 25, 28, 29, 30, 31, 38, 41], "servic": [14, 18, 20, 22, 25, 29, 33, 38], "service_account_nam": [18, 29], "service_check": 21, "serviceaccount": 25, "serviceaccountscop": 25, "servlet": [14, 38], "session": 
[14, 41], "set": [1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 32, 34, 35, 36, 38, 39, 41, 45, 46, 48, 49, 50, 51, 52, 54, 55, 56, 57, 58], "set_input": 57, "set_op": 57, "set_trac": [6, 56, 58], "setup": [14, 24, 30, 38], "seven": [14, 15], "sever": [3, 14, 20, 33], "sge": 19, "shape": [3, 5], "shard": 9, "share": [14, 15, 25, 27, 38, 54], "shell": [14, 22, 38], "shell_command": 39, "shell_command_op": 39, "shell_command_solid": 39, "shell_op": 39, "shell_script_path": 39, "shell_solid": 39, "shim": [34, 45, 57], "ship": 51, "short": [14, 25, 38, 49, 56], "shortcut": 56, "should": [2, 3, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 24, 25, 29, 32, 34, 36, 38, 39, 41, 45, 46, 47, 49, 50, 51, 53, 54, 55, 56, 57, 58], "should_autocreate_t": 36, "should_execut": [50, 55], "should_start_background_run_thread": 9, "show": [2, 10, 14, 22, 38, 58], "show_profil": [14, 38], "showconsoleprogress": [14, 38], "shown": [20, 25, 41], "shrink": [14, 38], "shuffl": [14, 20, 38], "shut": [2, 14, 16, 28, 38], "shutdown": [2, 14, 38], "shutdown_repository_loc": 28, "shutdownrepositorylocationinfo": 28, "sid": 44, "side": [6, 11, 14, 20, 38, 40, 41, 49, 51, 56], "sidecar": 14, "sign": 25, "signal": [2, 16], "signatur": [7, 9, 49, 56], "signific": [9, 14, 38], "significantli": [14, 38], "silenc": [14, 38], "similar": 56, "simpl": [4, 34, 41, 53], "simple_job": 53, "simple_repositori": 53, "simpler": [14, 38], "simpli": [14, 22, 38], "simplifi": 56, "simultan": [14, 38], "sinc": [2, 6, 9, 10, 14, 16, 20, 25, 38, 51, 56], "singl": [1, 2, 3, 6, 7, 11, 14, 15, 18, 20, 22, 25, 38, 39, 49, 50, 51, 52, 55, 56, 57], "sit": 1, "site": [5, 14, 25, 38], "situat": [14, 38], "size": [14, 20, 25, 38, 49, 56], "skip": [10, 14, 22, 38, 55, 56, 58], "skip_empty_fil": 14, "skip_messag": 55, "skipreason": 55, "slack": [8, 9], "slack_job": 40, "slack_message_on_failur": 8, "slack_message_on_success": 8, "slack_on_failur": 40, "slack_on_pipeline_failur": 40, "slack_on_run_failur": 40, "slack_on_success": 40, "slack_op": 40, "slack_resourc": 40, "slack_sdk": 40, "slack_token": 40, "sleep": 57, "slice": 50, "slightli": 55, "slow": [14, 16, 17, 18, 29, 38], "slower": [14, 38], "slowli": [14, 38], "slurm": 19, "small": [14, 38], "smaller": [14, 38], "snappi": [14, 38], "snappycompressioncodec": [14, 38], "snapshot": 22, "snapshot_fresh": 22, "snippet": 25, "snowflak": 15, "snowflake_account": 41, "snowflake_databas": 41, "snowflake_op_for_queri": 41, "snowflake_password": 41, "snowflake_resourc": 41, "snowflake_schema": 41, "snowflake_us": 41, "snowflake_warehous": 41, "snowflakeconnect": 41, "socket": [2, 14, 38], "softwar": [14, 15, 25], "softwareconfig": 25, "solid": [3, 6, 7, 8, 9, 10, 13, 16, 20, 25, 26, 28, 30, 31, 39, 40, 45, 47, 48, 49, 50, 51, 52, 55, 58], "solid_config": [6, 8, 45, 56], "solid_def": [6, 10, 45, 51, 56], "solid_definit": 22, "solid_except": 8, "solid_handl": [6, 45], "solid_nam": [31, 45, 56], "solid_output_valu": 8, "solid_result_list": [51, 56], "solid_retry_polici": 51, "solid_select": [9, 28, 50, 51, 52, 55], "solid_selection_str": 51, "solid_to_invok": 56, "soliddefinit": [3, 6, 8, 10, 20, 22, 39, 45, 51, 56], "solidexecutioncontext": [6, 22, 56], "solidexecutionresult": [51, 56], "solidinvoc": 51, "solids_to_execut": [9, 51], "solidversioncontext": 47, "solut": 13, "some": [3, 9, 10, 14, 15, 16, 18, 20, 22, 28, 29, 38, 55, 58], "some_asset": 1, "some_asset_kei": 1, "some_celery_backend_url": 18, "some_celery_broker_url": 18, 
"some_config": 3, "some_config1": 3, "some_config2": 3, "some_directori": 53, "some_job": 53, "some_model_nam": 30, "some_modul": 2, "some_op": [6, 7, 11, 12, 24], "some_param": 30, "some_run_id": 28, "some_sensor": 53, "some_solid": [50, 51, 52, 55], "some_validation_fn": [49, 56], "someon": 3, "someth": [56, 58], "sometim": 29, "somewher": 40, "sonnest": [50, 55], "soonest": [50, 55], "sort": [3, 14, 38, 57], "sourc": [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58], "source_asset": 1, "sourceasset": 1, "sourcehashversionstrategi": 47, "space": [9, 14, 20, 38], "spark": [1, 14, 20, 25, 38], "spark_conf": [20, 38, 42], "spark_config": 14, "spark_daemon_java_opt": 20, "spark_env_var": 20, "spark_hom": 42, "spark_local_dir": [14, 20, 38], "spark_local_ip": [14, 38], "spark_python_task": 20, "spark_resourc": 42, "spark_sess": 38, "spark_vers": 20, "spark_worker_memori": 20, "sparkconf": [14, 25, 38], "sparkcontext": [14, 38], "sparkjob": 25, "sparklisten": [14, 38], "sparkoperror": 42, "sparkpi": 20, "sparkr": [14, 38], "sparkr_driver_r": [14, 38], "sparksess": 38, "sparkspi": 20, "sparksqljob": 25, "spars": [14, 38], "spawn": [2, 18], "spec": 57, "special": [14, 38], "specif": [3, 6, 8, 9, 14, 16, 19, 20, 22, 25, 31, 38, 40, 45, 46, 51, 54, 55, 56, 57], "specifi": [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 18, 20, 22, 23, 24, 25, 29, 31, 34, 38, 40, 41, 45, 46, 49, 50, 51, 54, 55, 56, 57], "specul": [14, 20, 38], "speed": [14, 38], "spill": [14, 38], "spin": 39, "splendidrunstorag": 9, "split": [50, 55], "spread": [14, 25, 38], "spun": 13, "sql": [9, 14, 22, 25, 38, 41], "sql_queri": [25, 41], "sqlalchemi": 41, "sqleventlogstorag": 9, "sqlite": 9, "sqliteeventlogstorag": 9, "sqliterunstorag": 9, "sqliteschedulestorag": 9, "sqlrunstorag": 9, "sqlschedulestorag": 9, "squar": 57, "src": [38, 41], "ssd": 25, "sse": 20, "ssh": [19, 20], "ssh_port": 43, "ssh_public_kei": 20, "ssh_resourc": 43, "sshresourc": 43, "ssl": [2, 14, 25], "sslmode": 14, "stabil": [14, 38], "stabl": [16, 17, 18, 23, 29], "stack": [5, 9], "stackoverflowerror": [14, 38], "stage": [14, 15, 25, 38], "staging_bucket": 14, "staging_prefix": [14, 20], "standalon": [14, 38, 51], "standard": [7, 9, 11, 14, 25, 38, 46, 49, 56], "start": [2, 9, 14, 17, 18, 19, 24, 29, 31, 34, 38, 40, 49, 50, 55, 56], "start_aft": [12, 24], "start_asset": 1, "start_dat": [50, 55], "start_resync": 24, "start_sync": 24, "stat": [14, 38], "state": [5, 9, 14, 15, 20, 22, 25, 28, 38, 49, 55, 56], "statement": 2, "static": [1, 4, 9, 10, 34, 49, 50, 52, 53, 54, 56, 58], "static_partitioned_config": 50, "staticmethod": 9, "statu": [9, 14, 22, 24, 25, 28, 31, 38, 40, 49, 50, 55, 56], "status": 2, "stderr": [2, 9, 14, 15], "stderrfrom": 20, "stdin": 58, "stdout": [2, 9, 14, 15, 20, 22, 58], "step": [5, 6, 8, 9, 10, 14, 16, 17, 18, 20, 22, 23, 27, 28, 29, 30, 31, 38, 40, 47, 49, 51, 56, 58], "step_context": 10, "step_event_list": [51, 56], "step_events_by_kind": 56, "step_execution_context": [6, 8, 56], "step_expectation_result": [6, 56], "step_failur": [6, 56], "step_handl": 6, "step_input": [6, 56], "step_kei": [6, 8, 9, 10, 28, 58], "step_keys_to_execut": 9, "step_kind_valu": 6, "step_launch": [6, 56], "step_output": [6, 56], "step_output_vers": 9, "step_restart": 6, "step_select": 51, "step_skip": 6, "step_start": 6, "step_success": [6, 56], "step_up_for_retri": 6, 
"stepexecutioncontext": 10, "stepfailuredata": 56, "stepkind": 6, "steplaunch": [6, 56], "stepoutputhandl": 9, "still": [3, 14, 25, 38], "stop": [2, 14, 20, 22, 31, 38, 40, 50, 55], "stopgracefullyonshutdown": [14, 38], "storag": [2, 5, 14, 15, 18, 20, 25, 28, 29, 30, 32, 36, 38, 48, 51], "storage_account": 15, "storage_account_key_kei": 20, "storage_account_nam": 20, "storage_id": 9, "storagefract": [14, 38], "storagelevel": [14, 38], "store": [1, 2, 9, 10, 14, 20, 25, 27, 29, 38, 40, 49, 56, 57], "store_serialized_dag": 13, "str": [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 22, 24, 25, 26, 28, 31, 34, 35, 39, 40, 41, 45, 46, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58], "str_param": 57, "str_valu": 56, "straightforward": 57, "strategi": [3, 6, 11, 14, 38, 47, 51], "stream": [9, 12, 14, 20, 38, 51], "streamingcontext": [14, 38], "strict": [9, 14, 16, 17, 18, 19, 20, 23, 25, 29, 36], "strict_column_list": 34, "strict_flag": 22, "strictcolumnsconstraint": 34, "strictli": [14, 20, 38], "string": [1, 2, 3, 4, 5, 6, 7, 9, 11, 12, 14, 16, 17, 18, 20, 22, 23, 24, 25, 27, 28, 29, 30, 33, 34, 38, 39, 40, 43, 45, 47, 48, 49, 50, 51, 52, 53, 55, 56, 57, 58], "string_column": 34, "stringio": 9, "stringsourc": [1, 3, 5, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 29, 31, 32, 36, 38, 41, 43, 44], "structur": [6, 7, 9, 14, 38, 46, 49, 51, 56, 58], "structured_asset_kei": [49, 56], "structured_asset_key_2": [49, 56], "stub": [49, 56], "stuff": 58, "sub": [1, 9, 14, 38], "sub0": 25, "subclass": [5, 9, 13, 14, 38, 47, 53, 57, 58], "subminor": 25, "submiss": 25, "submit": [9, 14, 18, 20, 25, 28, 38, 42, 55], "submit_job_execut": 28, "submit_pipeline_execut": 28, "submit_run": 9, "subnet": 25, "subnetwork": 25, "subnetwork_uri": 25, "subnetworkuri": 25, "subselect": [1, 50, 52, 55], "subsequ": [1, 7, 14, 15, 22, 25, 29, 38, 41], "subset": [12, 22, 24, 51], "substanti": [14, 38], "subtract": 50, "succe": [8, 20, 56], "succeed": [8, 22, 24, 49, 56], "success": [2, 5, 6, 8, 9, 12, 14, 22, 24, 28, 31, 34, 38, 40, 41, 49, 51, 56, 57], "success_hook": 8, "successfulli": [22, 24], "suffix": [14, 38], "suggest": [20, 22], "suit": 26, "suitabl": [14, 15, 25], "suite_nam": 26, "sum": 57, "sum_job": 57, "sum_op": 57, "summari": 33, "summarize_directori": 4, "sundai": [50, 55], "super": 30, "supervis": [14, 38], "suppli": [3, 5, 7, 11, 18, 22, 29, 50, 58], "support": [4, 9, 11, 13, 14, 15, 20, 21, 25, 27, 33, 38, 39, 40, 46, 49, 50, 55, 56], "suppress": 2, "sure": [14, 27, 38], "surfac": [2, 14], "sustainedschedulerbacklogtimeout": [14, 38], "svc": [21, 29], "svv": 29, "switch": [6, 7, 11, 51], "symbol": [14, 38], "sync": [12, 24, 50], "sync_and_pol": [12, 24], "sync_foobar": [12, 24], "synchron": [5, 9, 22, 28, 51], "syntax": [6, 22, 57], "synthes": 47, "sys": 5, "system": [2, 3, 5, 6, 9, 11, 14, 15, 16, 20, 28, 38, 48, 49, 51, 54, 56, 57], "tab": [12, 24, 33, 58], "tabl": [1, 12, 13, 22, 24, 25, 41, 56], "table1": 24, "table2": 24, "table_nam": [12, 24], "table_schema": [49, 56], "tablecolumn": [49, 56], "tablecolumnconstraint": 49, "tableconstraint": 49, "tablemetadatavalu": [49, 56], "tablerecord": [49, 56], "tableschema": [49, 56], "tableschemametadatavalu": [49, 56], "tabluar": [49, 56], "tabular": [49, 56], "tag": [1, 6, 7, 9, 11, 13, 14, 20, 21, 25, 28, 29, 30, 39, 45, 47, 49, 51, 52, 55, 56], "tag_concurrency_limit": 9, "tags_fn": 55, "tags_fn_for_partit": 50, "tags_for_partition_fn": [50, 55], "tagsmor": 20, "take": [7, 14, 19, 20, 22, 26, 31, 34, 38, 40, 41, 49, 50, 53, 54, 55, 56, 
57], "taken": 24, "tar": 25, "target": [6, 11, 14, 18, 22, 29, 38, 50, 51, 55], "target_dir": 22, "target_path": 22, "task": [13, 14, 17, 18, 22, 25, 29, 38], "task_definit": 14, "task_id": 22, "task_tag": 22, "taskinst": 13, "team": 9, "teams_on_failur": 31, "teams_on_pipeline_failur": 31, "teams_on_success": 31, "teams_pipelin": 31, "teams_solid": 31, "teams_webhook_url": 31, "teamsclient": 31, "teardown": 54, "tell": [6, 7, 9, 24, 51], "temp": 9, "tempfil": 9, "templat": 13, "temporari": [1, 9, 10], "tend": [14, 38], "term": [20, 49, 50], "termin": [20, 22, 25, 41], "test": [2, 5, 9, 15, 22, 25, 38, 49, 51, 52, 55, 56], "test_handle_output": 10, "test_load_input": 10, "test_project": 29, "test_util": 13, "test_valu": 3, "text": [2, 22, 27, 31, 40, 49, 56], "text_fn": 40, "text_messag": 31, "text_metadata": [49, 56], "text_usag": 40, "textio": 9, "textmetadatavalu": [49, 56], "tgtlifetimehour": 25, "tgz": 25, "than": [3, 6, 9, 14, 16, 17, 18, 20, 25, 27, 29, 38, 49, 50, 51, 54, 56, 57], "thank": 29, "the_asset_group": 1, "the_job": 51, "the_resourc": 54, "thei": [1, 2, 3, 6, 7, 9, 10, 11, 13, 14, 15, 20, 22, 38, 49, 51, 54, 56, 57, 58], "them": [1, 2, 5, 9, 10, 14, 20, 24, 25, 29, 34, 38, 39, 46, 49, 51, 55, 56, 57, 58], "themselv": [6, 7, 51], "therefor": 22, "thi": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58], "thin": [21, 27, 33, 40], "thing": [14, 27, 38, 51, 58], "third": 9, "those": [1, 6, 7, 10, 11, 14, 20, 22, 38, 41, 46, 51, 55], "though": [14, 38], "thousand": [14, 38], "thrash": 25, "thread": [2, 9, 14, 19, 22, 38, 41], "threaddump": [14, 38], "threads_per_work": 19, "three": 1, "threshold": [14, 22, 34, 38], "through": [3, 9, 13, 14, 20, 22, 34, 38, 45, 46, 49, 50, 56, 57, 58], "throw": [3, 5, 14, 22, 28, 38, 56], "thrown": [5, 8, 22, 34, 57], "thu": 6, "tick": 55, "ticket": 25, "tie": 55, "tighter": 27, "tightli": [14, 38], "time": [1, 2, 3, 4, 6, 7, 10, 11, 12, 13, 14, 16, 17, 21, 22, 23, 24, 25, 38, 39, 41, 49, 50, 51, 55, 56, 57], "timeout": [2, 14, 19, 20, 31, 37, 38, 41, 43], "timeout_second": 20, "timestamp": [9, 24, 34], "timewindowpartitionsdefinit": [6, 10, 56], "timezon": [2, 34, 41, 50, 55], "titl": 27, "tmp": [14, 15, 25, 29, 38], "to_job": [6, 7, 11, 20, 46, 51], "togeth": [49, 56], "toggl": 47, "token": [15, 20, 22, 27, 29, 40, 44], "too": [14, 29, 38], "tool": [9, 13, 20, 22, 34, 49, 51, 52, 56, 57], "top": [1, 3, 6, 7, 9, 12, 22, 24, 51, 56], "topic": 40, "torn": 54, "torrentbroadcastfactori": [14, 38], "total": [14, 20, 38], "touch": 9, "toward": 14, "trace": [5, 9, 46], "track": [14, 30, 38, 49, 51, 56], "transfer": [14, 38], "transform": [14, 38, 57], "transform_word": 3, "transient": [9, 14, 38], "transit": 15, "transport": 28, "travers": 18, "treat": [49, 56, 57], "tri": [14, 38], "trigger": [8, 14, 22, 31, 33, 38, 40, 55], "triggerrun": 22, "true": [3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 18, 20, 22, 24, 25, 29, 34, 36, 38, 41, 43, 47, 49, 50, 51, 55, 56, 57, 58], "trust": 25, "truststor": 25, "truststorepassworduri": 25, "truststoreuri": 25, "try": [14, 16, 28, 38, 49, 56], "tune": [14, 38], "tupl": [5, 6, 9, 11, 24, 49, 56, 57], "tuple_input": 57, "tuple_op": 57, "turn": [2, 14, 24, 29, 38, 53, 55], "twilio_resourc": 44, "two": [1, 10, 20, 49, 51, 54, 56, 57], "txt": [9, 25], "type": [1, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 18, 20, 24, 25, 26, 28, 29, 34, 39, 40, 41, 45, 46, 47, 50, 51, 52, 53, 54, 
55, 58], "type_check_fn": [34, 49, 56, 57], "typecheck": [5, 34, 45, 49, 56, 57], "typecheckcontext": [6, 34, 57], "typehint": 7, "typic": [1, 5, 6, 9, 13, 14, 38, 40, 51, 53], "typing_typ": [34, 57], "ubuntu": 20, "udf": 25, "ugli": 3, "uksouth": 20, "uncondition": [14, 38], "unconnect": 57, "unconstrain": 57, "under": [1, 6, 7, 9, 14, 16, 17, 18, 25, 29, 38, 49, 56], "underestim": [14, 38], "underli": [6, 7, 13, 14, 15, 16, 17, 18, 22, 39, 46, 49, 54, 56], "underneath": 22, "underscor": 25, "underutil": 20, "unexpect": [5, 14, 22, 38], "unifi": [14, 38], "uniform": [9, 46], "uninstal": 29, "union": [1, 3, 4, 6, 7, 9, 10, 12, 14, 16, 17, 18, 22, 24, 25, 29, 30, 34, 40, 45, 46, 49, 50, 51, 53, 54, 55, 56, 57], "uniqu": [2, 3, 4, 6, 7, 9, 10, 13, 16, 21, 25, 34, 47, 48, 49, 51, 52, 56, 57], "unique_id": [13, 22], "unit": [6, 7, 14, 29, 38, 49, 51, 55, 56], "unix": 9, "unknown": 5, "unless": [12, 14, 22, 38, 49, 50, 55, 56], "unlik": [1, 3, 49, 56], "unlimit": [14, 38], "unpars": 22, "unpersist": [14, 38], "unrecover": [49, 56], "unregist": [14, 38], "unrol": [14, 38], "unrollfract": [14, 38], "unsaf": [14, 38], "unsatisfi": 1, "unset": 20, "unsign": 14, "unspecifi": [3, 25, 49], "unstructur": 9, "unsuccess": [12, 24], "until": [2, 12, 14, 20, 22, 24, 38], "untitl": 20, "untyp": 57, "unus": [14, 34, 38, 57], "unusu": [14, 38], "unwil": [14, 38], "unzip": 20, "up_for_retri": [49, 56], "updat": [14, 22, 24, 38], "update_connector": 24, "update_job": 22, "update_schedule_typ": 24, "updatejobbyid": 22, "upload": [14, 20], "upon": [9, 20, 22, 29, 47, 54], "upper": [14, 34, 38], "upstream": [1, 6, 7, 10, 11, 49, 51, 56], "upstream_output": 10, "uri": [25, 30, 41], "url": [2, 14, 15, 16, 17, 18, 20, 22, 23, 25, 28, 31, 35, 38, 40, 49, 56], "urlmetadatavalu": [49, 56], "usabl": [9, 57], "usable_as_dagster_typ": [49, 56, 57], "usag": [1, 9, 10, 13, 14, 28, 30, 38, 40, 49, 56], "use": [1, 2, 3, 4, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 33, 34, 36, 38, 39, 40, 41, 43, 45, 46, 48, 49, 50, 51, 53, 54, 55, 56, 57, 58], "use_airflow_template_context": 13, "use_http": [12, 28], "use_ssl": 14, "use_unsigned_sess": 14, "used": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 33, 34, 38, 40, 41, 45, 46, 48, 49, 50, 51, 52, 54, 55, 56, 57, 58], "usefetchcach": [14, 38], "useful": [2, 14, 25, 28, 29, 38, 50, 51, 54, 55, 56, 58], "uselegacymod": [14, 38], "usepassword": 29, "user": [1, 2, 3, 5, 6, 7, 8, 9, 10, 12, 14, 15, 20, 21, 22, 24, 25, 26, 27, 32, 34, 35, 36, 38, 39, 40, 41, 45, 46, 47, 49, 50, 51, 53, 54, 56, 57], "user_code_error_boundari": [5, 9], "user_messag": 9, "useraccount": 25, "userclasspathfirst": [14, 38], "userdeploy": 29, "userguid": [16, 17, 18], "usernam": [9, 14, 17, 23, 29, 32, 36, 43], "uses": [5, 10, 20, 21, 25, 45, 49, 56], "using": [1, 2, 3, 4, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 23, 24, 25, 27, 29, 30, 31, 34, 38, 39, 40, 41, 43, 46, 49, 51, 52, 53, 54, 55, 56, 57, 58], "usr": 25, "usual": [1, 10, 13, 14, 38, 53], "utc": [13, 34, 50, 55], "utc_date_str": 13, "utc_execution_date_str": 13, "util": [4, 6, 10, 14, 15, 28, 29, 34, 49, 50, 56], "valid": [1, 3, 6, 9, 10, 14, 22, 25, 26, 34, 38, 41, 50, 55, 57], "validate_default_paramet": 41, "validate_run_config": 6, "validate_t": [49, 56], "validateoutputspec": [14, 38], "validation_operator_nam": 26, "validation_operators_and_act": 26, "valu": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 20, 22, 23, 24, 25, 
28, 29, 30, 31, 32, 34, 36, 38, 39, 41, 43, 45, 49, 50, 51, 53, 54, 55, 56, 57], "var": [17, 22], "vari": 51, "variabl": [2, 3, 14, 15, 17, 18, 20, 22, 23, 25, 27, 29, 30, 32, 36, 38, 41, 50, 55], "variant": [4, 14, 38], "variat": [16, 17, 18], "varieti": [22, 28], "variou": [14, 25, 27, 38, 55], "verbos": [14, 20, 38], "veri": [6, 11, 14, 38, 51, 57], "verifi": [14, 31], "verify_cert_path": 14, "version": [2, 3, 7, 9, 10, 11, 14, 15, 16, 17, 18, 20, 22, 25, 29, 38, 46, 49, 51, 54, 56, 57], "version_strategi": [6, 7, 11, 51], "versionstrategi": [7, 11, 47, 51], "very_cool_packag": 9, "very_secret_env_vari": 3, "very_secret_env_variable_bool": 3, "very_secret_env_variable_int": 3, "via": [1, 2, 3, 6, 9, 10, 13, 14, 15, 16, 20, 22, 25, 29, 31, 37, 38, 40, 45, 50, 51, 54, 55, 58], "viabl": 9, "view": [21, 22, 29, 50], "viewabl": [7, 11], "violat": 5, "visibl": 29, "visitor": 21, "visual": [34, 57], "void": 50, "volum": [18, 20, 29], "volume_mount": [18, 29], "volumemount": [18, 29], "vvv": 29, "wai": [1, 6, 9, 10, 11, 14, 24, 29, 38, 50, 51, 54, 56, 57], "wait": [2, 12, 14, 18, 20, 22, 24, 38, 49, 56, 57], "wait_for_log": [14, 20], "wait_for_process": 9, "wait_int": 57, "wal": [14, 38], "walk": 4, "want": [3, 12, 13, 14, 16, 17, 18, 20, 22, 24, 26, 27, 28, 29, 30, 31, 38, 40, 55, 56, 57, 58], "warehous": 41, "warm": 29, "warn": [2, 14, 21, 22, 38, 46], "warn_error": 22, "wast": [14, 38], "wave": 40, "weak": [14, 38], "web": [14, 16, 38], "webclient": 40, "webhook": 31, "week": [50, 55], "weekli": [50, 55], "weekly_partitioned_config": [50, 55], "well": [3, 5, 6, 7, 11, 14, 20, 22, 24, 29, 30, 38, 41, 49, 56, 57], "were": [10, 16, 17, 18, 22, 51, 55, 56], "west": [14, 20, 29], "wget": 25, "what": [7, 9, 14, 26, 38, 49, 55, 56], "whatev": 51, "when": [1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32, 36, 38, 39, 40, 41, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58], "whenev": [7, 11, 14, 35, 46, 49, 56], "where": [1, 2, 4, 6, 8, 9, 10, 14, 17, 18, 22, 25, 26, 28, 29, 34, 38, 39, 49, 50, 55, 56, 57], "whether": [3, 6, 7, 9, 10, 11, 14, 18, 22, 25, 28, 29, 31, 38, 40, 45, 47, 49, 50, 51, 55, 56], "which": [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 34, 35, 38, 39, 40, 41, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57], "while": [3, 5, 9, 14, 20, 22, 38, 47, 50, 56], "whitelist": 9, "who": [9, 14, 38], "whole": [9, 10, 14, 38], "whom": 3, "whose": [1, 3, 5, 6, 9, 10, 14, 20, 45, 47, 51, 56, 57, 58], "why": 55, "window": [6, 10, 25, 50, 55, 56], "wish": [9, 14, 22, 38, 49, 56, 57], "with_additional_config": 52, "with_hook": [11, 31, 40], "within": [1, 3, 5, 6, 7, 9, 10, 11, 13, 14, 18, 20, 21, 22, 24, 25, 27, 28, 29, 30, 33, 34, 35, 38, 46, 48, 49, 51, 53, 54, 56, 57], "without": [3, 5, 9, 13, 14, 16, 19, 25, 28, 38, 55, 57], "won": [10, 14, 16, 38], "word": [3, 49, 50, 57], "wordcount": 25, "work": [2, 4, 9, 14, 18, 22, 25, 29, 38, 40, 47, 50, 57, 58], "worker": [2, 9, 14, 17, 18, 19, 20, 25, 29, 38], "worker_main": 16, "workerconfig": 25, "working_directori": 2, "workload": [14, 20, 38], "workspac": [2, 9, 20, 40], "world": [3, 39, 56], "would": [1, 9, 16, 22, 24, 45, 49, 50, 56], "wrap": [3, 5, 6, 9, 10, 11, 15, 22, 39, 45, 46, 49, 51, 54, 56], "wrapper": [21, 27, 33, 40], "write": [7, 9, 11, 14, 16, 20, 27, 38, 51, 54, 55, 56, 57], "write_csv": 10, "write_data": 9, "write_fil": 9, "writeaheadlog": [14, 38], "writehead": 57, "writeif": 25, "writer": 57, 
"writerow": 57, "written": [14, 38], "www": [25, 50, 55], "xlarg": 20, "xloggc": [14, 38], "xml": 25, "xmlfor": 25, "xmx": [14, 38], "yaml": [1, 2, 9, 10, 14, 15, 16, 17, 18, 29, 32, 36, 52, 53, 57, 58], "yaml_directori": 53, "yaml_str": [52, 58], "yarn": [14, 19, 25, 38], "yes": [14, 38], "yet": [6, 10, 53, 55, 56], "yield": [3, 4, 6, 7, 9, 10, 12, 22, 24, 26, 45, 49, 51, 53, 54, 55, 56], "yield_ev": 45, "yield_materi": [12, 22, 24], "yield_result": 45, "yml": [22, 26], "you": [1, 2, 3, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 38, 39, 40, 41, 45, 46, 49, 50, 51, 53, 54, 55, 56, 57, 58], "your": [1, 6, 9, 10, 11, 13, 14, 15, 17, 18, 20, 22, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 38, 40, 41, 45, 51, 54], "your_kei": 21, "your_org_her": 28, "your_service_account": 29, "yourself": 9, "zadrozni": 16, "zero": [14, 22, 25, 38, 39], "zip": [14, 20, 25, 38], "zone": [25, 41, 50, 55], "zoneuri": 25, "zookeep": [14, 38], "zstd": [14, 38], "zstdcompressioncodec": [14, 38], "\u4e16\u754c": 3, "\u4f60\u597d": 3}, "titles": ["Home", "Software-Defined Assets (Experimental)", "Dagster CLI", "Config", "Dynamic Mapping & Collect", "Errors", "Execution", "Graphs", "Hooks", "Internals", "IO Managers", "Jobs", "Airbyte (dagster-airbyte)", "Airflow (dagster-airflow)", "AWS (dagster-aws)", "Azure (dagster-azure)", "Celery (dagster-celery)", "Orchestration on Celery + Docker", "Orchestration on Celery + Kubernetes", "Dask (dagster-dask)", "Databricks (dagster-databricks)", "Datadog (dagster-datadog)", "dbt (dagster-dbt)", "Orchestration on Docker", "Fivetran (dagster-fivetran)", "GCP (dagster-gcp)", "Great Expectations (dagster-ge)", "GitHub (dagster-github)", "GraphQL (dagster-graphql)", "Kubernetes (dagster-k8s)", "MLflow (dagster-mlflow)", "Microsoft Teams (dagster-msteams)", "MySQL (dagster-mysql)", "PagerDuty (dagster-pagerduty)", "Pandas (dagster-pandas)", "Papertrail (dagster-papertrail)", "PostgreSQL (dagster-postgres)", "Prometheus (dagster-prometheus)", "Pyspark (dagster-pyspark)", "Shell (dagster-shell)", "Slack (dagster-slack)", "Snowflake (dagster-snowflake)", "Spark (dagster-spark)", "SSH / SFTP (dagster-ssh)", "Twilio (dagster-twilio)", "Dagstermill", "Loggers", "Versioning and Memoization", "[Legacy] Modes", "Ops", "Partitions", "[Legacy] Pipelines", "[Legacy] Presets", "Repositories", "Resources", "Run Requests", "[Legacy] Solids", "Types", "Utilities"], "titleterms": {"AWS": 14, "ECS": 14, "GCS": 25, "Ins": 49, "K8s": 29, "Ops": [12, 22, 24, 49], "about": 29, "access": 29, "airbyt": 12, "airflow": 13, "alias": 51, "api": [2, 16, 17, 18, 20, 23, 25, 29, 39, 55], "app": 16, "asset": [1, 2, 12, 22, 24, 49, 56], "aws": 14, "azur": 15, "backend": 16, "best": 16, "bigqueri": 25, "broker": 16, "built": [10, 46, 57], "celeri": [16, 17, 18], "chart": 29, "cli": [2, 16, 22], "client": 28, "cloud": 22, "cloudwatch": 14, "cluster": 29, "collect": 4, "compos": 56, "comput": 9, "config": [3, 6, 51], "configur": [6, 16, 51], "context": [6, 10, 56], "coordin": 9, "core": 22, "custom": [16, 46], "daemon": 2, "dagit": 2, "dagster": [2, 12, 13, 14, 15, 16, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44], "dagstermil": 45, "dask": 19, "databrick": 20, "datadog": 21, "dataproc": 25, "dbt": 22, "debug": 2, "defin": [1, 46, 49, 56], "definit": 51, "depend": 51, "develop": 29, "docker": [17, 23], "dump": 2, "dynam": 4, "emr": 14, "enabl": 29, "error": [5, 22], "event": [9, 49, 56], "except": 
9, "execut": [6, 27, 49, 51, 56], "executor": [6, 9], "exist": 29, "expect": 26, "experiment": [1, 10], "faster": 29, "file": 9, "fivetran": 24, "from": [29, 46], "function": 50, "gcp": 25, "gcr": 29, "get": 33, "github": 27, "graph": [6, 7], "graphql": [2, 27, 28], "great": 26, "grpc": 2, "handl": 9, "heartbeat": 2, "helm": 29, "hook": 8, "input": [10, 56], "instanc": [2, 9], "intern": 9, "issu": 27, "job": [2, 6, 11], "k8s": 29, "kei": [49, 56], "kind": 29, "kubernet": [18, 29], "launcher": 9, "legaci": [2, 20, 22, 25, 30, 39, 48, 50, 51, 52, 55, 56], "list": 16, "local": 29, "log": [9, 46], "logger": 46, "make": 57, "manag": [9, 10], "manual": 29, "map": 4, "memoiz": 47, "metadata": [49, 56], "microsoft": 31, "minikub": 29, "mlflow": 30, "mode": 48, "monitor": 16, "msteam": 31, "mysql": 32, "new": 57, "note": 29, "ops": 49, "orchestr": [17, 18, 23], "other": 16, "out": 49, "output": [10, 56], "pagerduti": 33, "panda": 34, "papertrail": 35, "partit": [50, 55], "pipelin": [2, 51], "post": 27, "postgr": 36, "postgresql": 36, "practic": 16, "preset": 52, "primit": 57, "prometheu": 37, "pvc": 29, "pyspark": 38, "python": [28, 29], "queri": 27, "quickstart": 16, "reconstruct": [6, 11, 51], "redi": 29, "redshift": 14, "repositori": 53, "request": 55, "resourc": [12, 22, 24, 54], "result": 6, "retri": 56, "root": 10, "rpc": 22, "run": [2, 6, 9, 29, 51, 55], "schedul": [2, 9, 55], "schema": [6, 51], "secretsmanag": 14, "sensor": [2, 55], "setup": 29, "sftp": 43, "shell": 39, "slack": 40, "snowflak": 41, "softwar": 1, "solid": [22, 56], "spark": 42, "ssh": 43, "start": [16, 33], "storag": 9, "tabl": 49, "task": 16, "team": 31, "termin": 16, "test": [14, 29, 57], "twilio": 44, "type": [3, 22, 49, 56, 57], "util": [3, 22, 58], "valid": 29, "version": 47, "wipe": 2, "worker": 16, "your": 16}} \ No newline at end of file +{"docnames": ["index", "sections/api/apidocs/assets", "sections/api/apidocs/cli", "sections/api/apidocs/config", "sections/api/apidocs/dynamic", "sections/api/apidocs/errors", "sections/api/apidocs/execution", "sections/api/apidocs/graphs", "sections/api/apidocs/hooks", "sections/api/apidocs/internals", "sections/api/apidocs/io-managers", "sections/api/apidocs/jobs", "sections/api/apidocs/libraries/dagster-airbyte", "sections/api/apidocs/libraries/dagster-airflow", "sections/api/apidocs/libraries/dagster-aws", "sections/api/apidocs/libraries/dagster-azure", "sections/api/apidocs/libraries/dagster-celery", "sections/api/apidocs/libraries/dagster-celery-docker", "sections/api/apidocs/libraries/dagster-celery-k8s", "sections/api/apidocs/libraries/dagster-dask", "sections/api/apidocs/libraries/dagster-databricks", "sections/api/apidocs/libraries/dagster-datadog", "sections/api/apidocs/libraries/dagster-dbt", "sections/api/apidocs/libraries/dagster-docker", "sections/api/apidocs/libraries/dagster-fivetran", "sections/api/apidocs/libraries/dagster-gcp", "sections/api/apidocs/libraries/dagster-ge", "sections/api/apidocs/libraries/dagster-github", "sections/api/apidocs/libraries/dagster-graphql", "sections/api/apidocs/libraries/dagster-k8s", "sections/api/apidocs/libraries/dagster-mlflow", "sections/api/apidocs/libraries/dagster-msteams", "sections/api/apidocs/libraries/dagster-mysql", "sections/api/apidocs/libraries/dagster-pagerduty", "sections/api/apidocs/libraries/dagster-pandas", "sections/api/apidocs/libraries/dagster-papertrail", "sections/api/apidocs/libraries/dagster-postgres", "sections/api/apidocs/libraries/dagster-prometheus", "sections/api/apidocs/libraries/dagster-pyspark", 
"sections/api/apidocs/libraries/dagster-shell", "sections/api/apidocs/libraries/dagster-slack", "sections/api/apidocs/libraries/dagster-snowflake", "sections/api/apidocs/libraries/dagster-snowflake-pandas", "sections/api/apidocs/libraries/dagster-spark", "sections/api/apidocs/libraries/dagster-ssh", "sections/api/apidocs/libraries/dagster-twilio", "sections/api/apidocs/libraries/dagstermill", "sections/api/apidocs/loggers", "sections/api/apidocs/memoization", "sections/api/apidocs/modes", "sections/api/apidocs/ops", "sections/api/apidocs/partitions", "sections/api/apidocs/pipeline", "sections/api/apidocs/presets", "sections/api/apidocs/repositories", "sections/api/apidocs/resources", "sections/api/apidocs/schedules-sensors", "sections/api/apidocs/solids", "sections/api/apidocs/types", "sections/api/apidocs/utilities"], "envversion": {"nbsphinx": 3, "sphinx": 56, "sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 3, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 2, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["index.rst", "sections/api/apidocs/assets.rst", "sections/api/apidocs/cli.rst", "sections/api/apidocs/config.rst", "sections/api/apidocs/dynamic.rst", "sections/api/apidocs/errors.rst", "sections/api/apidocs/execution.rst", "sections/api/apidocs/graphs.rst", "sections/api/apidocs/hooks.rst", "sections/api/apidocs/internals.rst", "sections/api/apidocs/io-managers.rst", "sections/api/apidocs/jobs.rst", "sections/api/apidocs/libraries/dagster-airbyte.rst", "sections/api/apidocs/libraries/dagster-airflow.rst", "sections/api/apidocs/libraries/dagster-aws.rst", "sections/api/apidocs/libraries/dagster-azure.rst", "sections/api/apidocs/libraries/dagster-celery.rst", "sections/api/apidocs/libraries/dagster-celery-docker.rst", "sections/api/apidocs/libraries/dagster-celery-k8s.rst", "sections/api/apidocs/libraries/dagster-dask.rst", "sections/api/apidocs/libraries/dagster-databricks.rst", "sections/api/apidocs/libraries/dagster-datadog.rst", "sections/api/apidocs/libraries/dagster-dbt.rst", "sections/api/apidocs/libraries/dagster-docker.rst", "sections/api/apidocs/libraries/dagster-fivetran.rst", "sections/api/apidocs/libraries/dagster-gcp.rst", "sections/api/apidocs/libraries/dagster-ge.rst", "sections/api/apidocs/libraries/dagster-github.rst", "sections/api/apidocs/libraries/dagster-graphql.rst", "sections/api/apidocs/libraries/dagster-k8s.rst", "sections/api/apidocs/libraries/dagster-mlflow.rst", "sections/api/apidocs/libraries/dagster-msteams.rst", "sections/api/apidocs/libraries/dagster-mysql.rst", "sections/api/apidocs/libraries/dagster-pagerduty.rst", "sections/api/apidocs/libraries/dagster-pandas.rst", "sections/api/apidocs/libraries/dagster-papertrail.rst", "sections/api/apidocs/libraries/dagster-postgres.rst", "sections/api/apidocs/libraries/dagster-prometheus.rst", "sections/api/apidocs/libraries/dagster-pyspark.rst", "sections/api/apidocs/libraries/dagster-shell.rst", "sections/api/apidocs/libraries/dagster-slack.rst", "sections/api/apidocs/libraries/dagster-snowflake.rst", "sections/api/apidocs/libraries/dagster-snowflake-pandas.rst", "sections/api/apidocs/libraries/dagster-spark.rst", "sections/api/apidocs/libraries/dagster-ssh.rst", "sections/api/apidocs/libraries/dagster-twilio.rst", "sections/api/apidocs/libraries/dagstermill.rst", "sections/api/apidocs/loggers.rst", 
"sections/api/apidocs/memoization.rst", "sections/api/apidocs/modes.rst", "sections/api/apidocs/ops.rst", "sections/api/apidocs/partitions.rst", "sections/api/apidocs/pipeline.rst", "sections/api/apidocs/presets.rst", "sections/api/apidocs/repositories.rst", "sections/api/apidocs/resources.rst", "sections/api/apidocs/schedules-sensors.rst", "sections/api/apidocs/solids.rst", "sections/api/apidocs/types.rst", "sections/api/apidocs/utilities.rst"], "objects": {"dagit": {"--attribute": [2, 7, 1, "cmdoption-dagit-a"], "--db-statement-timeout": [2, 7, 1, "cmdoption-dagit-db-statement-timeout"], "--empty-workspace": [2, 7, 1, "cmdoption-dagit-empty-workspace"], "--grpc-host": [2, 7, 1, "cmdoption-dagit-grpc-host"], "--grpc-port": [2, 7, 1, "cmdoption-dagit-grpc-port"], "--grpc-socket": [2, 7, 1, "cmdoption-dagit-grpc-socket"], "--host": [2, 7, 1, "cmdoption-dagit-h"], "--log-level": [2, 7, 1, "cmdoption-dagit-log-level"], "--module-name": [2, 7, 1, "cmdoption-dagit-m"], "--package-name": [2, 7, 1, "cmdoption-dagit-package-name"], "--path-prefix": [2, 7, 1, "cmdoption-dagit-l"], "--port": [2, 7, 1, "cmdoption-dagit-p"], "--python-file": [2, 7, 1, "cmdoption-dagit-f"], "--read-only": [2, 7, 1, "cmdoption-dagit-read-only"], "--suppress-warnings": [2, 7, 1, "cmdoption-dagit-suppress-warnings"], "--use-ssl": [2, 7, 1, "cmdoption-dagit-use-ssl"], "--version": [2, 7, 1, "cmdoption-dagit-version"], "--working-directory": [2, 7, 1, "cmdoption-dagit-d"], "--workspace": [2, 7, 1, "cmdoption-dagit-w"], "-a": [2, 7, 1, "cmdoption-dagit-a"], "-d": [2, 7, 1, "cmdoption-dagit-d"], "-f": [2, 7, 1, "cmdoption-dagit-f"], "-h": [2, 7, 1, "cmdoption-dagit-h"], "-l": [2, 7, 1, "cmdoption-dagit-l"], "-m": [2, 7, 1, "cmdoption-dagit-m"], "-p": [2, 7, 1, "cmdoption-dagit-p"], "-w": [2, 7, 1, "cmdoption-dagit-w"]}, "dagster": {"Array": [3, 0, 1, ""], "AssetGroup": [1, 0, 1, ""], "AssetIn": [1, 0, 1, ""], "AssetKey": [50, 0, 1, ""], "AssetMaterialization": [50, 0, 1, ""], "AssetSensorDefinition": [56, 0, 1, ""], "Backoff": [50, 0, 1, ""], "BoolSource": [3, 2, 1, ""], "CompositeSolidDefinition": [57, 0, 1, ""], "CompositeSolidExecutionResult": [57, 0, 1, ""], "ConfigMapping": [3, 0, 1, ""], "ConfigSchema": [3, 0, 1, ""], "DagsterAssetMetadataValue": [50, 0, 1, ""], "DagsterConfigMappingFunctionError": [5, 3, 1, ""], "DagsterError": [5, 3, 1, ""], "DagsterEvent": [6, 0, 1, ""], "DagsterEventLogInvalidForRun": [5, 3, 1, ""], "DagsterEventType": [6, 0, 1, ""], "DagsterExecutionStepExecutionError": [5, 3, 1, ""], "DagsterExecutionStepNotFoundError": [5, 3, 1, ""], "DagsterInstance": [9, 0, 1, ""], "DagsterInvalidConfigDefinitionError": [5, 3, 1, ""], "DagsterInvalidConfigError": [5, 3, 1, ""], "DagsterInvalidDefinitionError": [5, 3, 1, ""], "DagsterInvariantViolationError": [5, 3, 1, ""], "DagsterLogManager": [47, 0, 1, ""], "DagsterPipelineRunMetadataValue": [50, 0, 1, ""], "DagsterResourceFunctionError": [5, 3, 1, ""], "DagsterRunNotFoundError": [5, 3, 1, ""], "DagsterRunStatus": [9, 0, 1, ""], "DagsterStepOutputNotFoundError": [5, 3, 1, ""], "DagsterSubprocessError": [5, 3, 1, ""], "DagsterType": [58, 0, 1, ""], "DagsterTypeCheckDidNotPass": [5, 3, 1, ""], "DagsterTypeCheckError": [5, 3, 1, ""], "DagsterTypeLoader": [58, 0, 1, ""], "DagsterTypeMaterializer": [58, 0, 1, ""], "DagsterUnknownResourceError": [5, 3, 1, ""], "DagsterUnmetExecutorRequirementsError": [5, 3, 1, ""], "DagsterUserCodeExecutionError": [5, 3, 1, ""], "DailyPartitionsDefinition": [51, 0, 1, ""], "DependencyDefinition": [7, 0, 1, ""], "DynamicOut": [4, 0, 
1, ""], "DynamicOutput": [4, 0, 1, ""], "Enum": [3, 0, 1, ""], "EnumValue": [3, 0, 1, ""], "ExecuteInProcessResult": [6, 0, 1, ""], "Executor": [9, 0, 1, ""], "ExecutorDefinition": [9, 0, 1, ""], "ExpectationResult": [50, 0, 1, ""], "ExperimentalWarning": [59, 0, 1, ""], "Failure": [50, 0, 1, ""], "Field": [3, 0, 1, ""], "FileHandle": [58, 0, 1, ""], "FloatMetadataValue": [50, 0, 1, ""], "GraphDefinition": [7, 0, 1, ""], "GraphIn": [7, 0, 1, ""], "GraphOut": [7, 0, 1, ""], "HookContext": [8, 0, 1, ""], "HookDefinition": [8, 0, 1, ""], "HourlyPartitionsDefinition": [51, 0, 1, ""], "IOManager": [10, 0, 1, ""], "IOManagerDefinition": [10, 0, 1, ""], "In": [50, 0, 1, ""], "InitExecutorContext": [9, 0, 1, ""], "InitLoggerContext": [47, 0, 1, ""], "InitResourceContext": [55, 0, 1, ""], "InputContext": [10, 0, 1, ""], "InputDefinition": [57, 0, 1, ""], "InputMapping": [57, 0, 1, ""], "IntMetadataValue": [50, 0, 1, ""], "IntSource": [3, 2, 1, ""], "Jitter": [50, 0, 1, ""], "JobDefinition": [11, 0, 1, ""], "JsonMetadataValue": [50, 0, 1, ""], "LocalFileHandle": [58, 0, 1, ""], "LoggerDefinition": [47, 0, 1, ""], "MEMOIZED_RUN_TAG": [48, 2, 1, ""], "Map": [3, 0, 1, ""], "MarkdownMetadataValue": [50, 0, 1, ""], "MemoizableIOManager": [48, 0, 1, ""], "MetadataEntry": [50, 0, 1, ""], "MetadataValue": [50, 0, 1, ""], "ModeDefinition": [49, 0, 1, ""], "MonthlyPartitionsDefinition": [51, 0, 1, ""], "MultiDependencyDefinition": [7, 0, 1, ""], "NodeInvocation": [7, 0, 1, ""], "Noneable": [3, 0, 1, ""], "Nothing": [58, 2, 1, ""], "OpDefinition": [50, 0, 1, ""], "OpExecutionContext": [6, 0, 1, ""], "Out": [50, 0, 1, ""], "Output": [50, 0, 1, ""], "OutputContext": [10, 0, 1, ""], "OutputDefinition": [57, 0, 1, ""], "OutputMapping": [57, 0, 1, ""], "Partition": [51, 0, 1, ""], "PartitionScheduleDefinition": [56, 0, 1, ""], "PartitionSetDefinition": [51, 0, 1, ""], "PartitionedConfig": [51, 0, 1, ""], "PartitionsDefinition": [51, 0, 1, ""], "PathMetadataValue": [50, 0, 1, ""], "Permissive": [3, 0, 1, ""], "PipelineDefinition": [52, 0, 1, ""], "PipelineExecutionResult": [52, 0, 1, ""], "PipelineFailureSensorContext": [56, 0, 1, ""], "PipelineRun": [9, 0, 1, ""], "PipelineRunStatus": [9, 2, 1, ""], "PresetDefinition": [53, 0, 1, ""], "PythonArtifactMetadataValue": [50, 0, 1, ""], "PythonObjectDagsterType": [58, 4, 1, ""], "RepositoryData": [54, 0, 1, ""], "RepositoryDefinition": [54, 0, 1, ""], "ResourceDefinition": [55, 0, 1, ""], "RetryPolicy": [50, 0, 1, ""], "RetryRequested": [50, 0, 1, ""], "RootInputManager": [10, 0, 1, ""], "RootInputManagerDefinition": [10, 0, 1, ""], "RunFailureSensorContext": [56, 0, 1, ""], "RunRequest": [56, 0, 1, ""], "RunStatusSensorContext": [56, 0, 1, ""], "RunStatusSensorDefinition": [56, 0, 1, ""], "ScalarUnion": [3, 0, 1, ""], "ScheduleDefinition": [56, 0, 1, ""], "ScheduleEvaluationContext": [56, 0, 1, ""], "Selector": [3, 0, 1, ""], "SensorDefinition": [56, 0, 1, ""], "SensorEvaluationContext": [56, 0, 1, ""], "Shape": [3, 0, 1, ""], "SkipReason": [56, 0, 1, ""], "SolidDefinition": [57, 0, 1, ""], "SolidExecutionContext": [57, 0, 1, ""], "SolidExecutionResult": [57, 0, 1, ""], "SolidInvocation": [52, 2, 1, ""], "SourceAsset": [1, 0, 1, ""], "SourceHashVersionStrategy": [48, 0, 1, ""], "StaticPartitionsDefinition": [51, 0, 1, ""], "StringSource": [3, 2, 1, ""], "TableColumn": [50, 0, 1, ""], "TableColumnConstraints": [50, 0, 1, ""], "TableConstraints": [50, 0, 1, ""], "TableMetadataValue": [50, 0, 1, ""], "TableRecord": [50, 0, 1, ""], "TableSchema": [50, 0, 1, ""], 
"TableSchemaMetadataValue": [50, 0, 1, ""], "TextMetadataValue": [50, 0, 1, ""], "TimeWindowPartitionsDefinition": [51, 0, 1, ""], "TypeCheck": [50, 0, 1, ""], "TypeCheckContext": [6, 0, 1, ""], "UrlMetadataValue": [50, 0, 1, ""], "VersionStrategy": [48, 0, 1, ""], "WeeklyPartitionsDefinition": [51, 0, 1, ""], "asset": [1, 4, 1, ""], "asset_sensor": [56, 4, 1, ""], "build_assets_job": [1, 4, 1, ""], "build_hook_context": [8, 4, 1, ""], "build_init_logger_context": [47, 4, 1, ""], "build_init_resource_context": [55, 4, 1, ""], "build_input_context": [10, 4, 1, ""], "build_op_context": [6, 4, 1, ""], "build_output_context": [10, 4, 1, ""], "build_reconstructable_job": [11, 4, 1, ""], "build_resources": [55, 4, 1, ""], "build_run_status_sensor_context": [56, 4, 1, ""], "build_schedule_context": [56, 4, 1, ""], "build_schedule_from_partitioned_job": [56, 4, 1, ""], "build_sensor_context": [56, 4, 1, ""], "build_solid_context": [57, 4, 1, ""], "check_dagster_type": [58, 4, 1, ""], "composite_solid": [57, 4, 1, ""], "config_from_files": [59, 4, 1, ""], "config_from_pkg_resources": [59, 4, 1, ""], "config_from_yaml_strings": [59, 4, 1, ""], "configured": [3, 4, 1, ""], "create_offset_partition_selector": [51, 4, 1, ""], "custom_path_fs_io_manager": [10, 6, 1, ""], "dagster_type_loader": [58, 4, 1, ""], "dagster_type_materializer": [58, 4, 1, ""], "daily_partitioned_config": [51, 4, 1, ""], "date_partition_range": [51, 4, 1, ""], "default_executors": [52, 6, 1, ""], "dynamic_partitioned_config": [51, 4, 1, ""], "execute_pipeline": [52, 4, 1, ""], "execute_pipeline_iterator": [52, 4, 1, ""], "execute_solid": [57, 4, 1, ""], "execute_solid_within_pipeline": [57, 4, 1, ""], "execute_solids_within_pipeline": [57, 4, 1, ""], "executor": [9, 4, 1, ""], "failure_hook": [8, 4, 1, ""], "file_relative_path": [59, 4, 1, ""], "fs_io_manager": [10, 6, 1, ""], "get_dagster_logger": [59, 4, 1, ""], "graph": [7, 4, 1, ""], "hourly_partitioned_config": [51, 4, 1, ""], "identity_partition_selector": [51, 4, 1, ""], "in_process_executor": [6, 6, 1, ""], "io_manager": [10, 4, 1, ""], "job": [11, 4, 1, ""], "local_file_manager": [9, 6, 1, ""], "logger": [47, 4, 1, ""], "make_python_type_usable_as_dagster_type": [58, 4, 1, ""], "make_values_resource": [55, 4, 1, ""], "mem_io_manager": [10, 6, 1, ""], "monthly_partitioned_config": [51, 4, 1, ""], "multi_asset": [1, 4, 1, ""], "multiprocess_executor": [6, 6, 1, ""], "op": [50, 4, 1, ""], "pipeline": [52, 4, 1, ""], "pipeline_failure_sensor": [56, 4, 1, ""], "reconstructable": [6, 0, 1, ""], "reexecute_pipeline": [52, 4, 1, ""], "reexecute_pipeline_iterator": [52, 4, 1, ""], "repository": [54, 6, 1, ""], "resource": [55, 4, 1, ""], "root_input_manager": [10, 4, 1, ""], "run_failure_sensor": [56, 4, 1, ""], "run_status_sensor": [56, 4, 1, ""], "schedule": [56, 4, 1, ""], "sensor": [56, 4, 1, ""], "solid": [57, 4, 1, ""], "static_partitioned_config": [51, 4, 1, ""], "success_hook": [8, 4, 1, ""], "usable_as_dagster_type": [58, 4, 1, ""], "validate_run_config": [6, 4, 1, ""], "weekly_partitioned_config": [51, 4, 1, ""]}, "dagster-api-grpc": {"--attribute": [2, 7, 1, "cmdoption-dagster-api-grpc-a"], "--container-context": [2, 7, 1, "cmdoption-dagster-api-grpc-container-context"], "--container-image": [2, 7, 1, "cmdoption-dagster-api-grpc-container-image"], "--empty-working-directory": [2, 7, 1, "cmdoption-dagster-api-grpc-empty-working-directory"], "--fixed-server-id": [2, 7, 1, "cmdoption-dagster-api-grpc-fixed-server-id"], "--heartbeat": [2, 7, 1, 
"cmdoption-dagster-api-grpc-heartbeat"], "--heartbeat-timeout": [2, 7, 1, "cmdoption-dagster-api-grpc-heartbeat-timeout"], "--host": [2, 7, 1, "cmdoption-dagster-api-grpc-h"], "--ipc-output-file": [2, 7, 1, "cmdoption-dagster-api-grpc-ipc-output-file"], "--lazy-load-user-code": [2, 7, 1, "cmdoption-dagster-api-grpc-lazy-load-user-code"], "--log-level": [2, 7, 1, "cmdoption-dagster-api-grpc-log-level"], "--max_workers": [2, 7, 1, "cmdoption-dagster-api-grpc-n"], "--module-name": [2, 7, 1, "cmdoption-dagster-api-grpc-m"], "--override-system-timezone": [2, 7, 1, "cmdoption-dagster-api-grpc-override-system-timezone"], "--package-name": [2, 7, 1, "cmdoption-dagster-api-grpc-package-name"], "--port": [2, 7, 1, "cmdoption-dagster-api-grpc-p"], "--python-file": [2, 7, 1, "cmdoption-dagster-api-grpc-f"], "--socket": [2, 7, 1, "cmdoption-dagster-api-grpc-s"], "--use-python-environment-entry-point": [2, 7, 1, "cmdoption-dagster-api-grpc-use-python-environment-entry-point"], "--working-directory": [2, 7, 1, "cmdoption-dagster-api-grpc-d"], "-a": [2, 7, 1, "cmdoption-dagster-api-grpc-a"], "-d": [2, 7, 1, "cmdoption-dagster-api-grpc-d"], "-f": [2, 7, 1, "cmdoption-dagster-api-grpc-f"], "-h": [2, 7, 1, "cmdoption-dagster-api-grpc-h"], "-m": [2, 7, 1, "cmdoption-dagster-api-grpc-m"], "-n": [2, 7, 1, "cmdoption-dagster-api-grpc-n"], "-p": [2, 7, 1, "cmdoption-dagster-api-grpc-p"], "-s": [2, 7, 1, "cmdoption-dagster-api-grpc-s"]}, "dagster-celery-worker-list": {"--config-yaml": [16, 7, 1, "cmdoption-dagster-celery-worker-list-y"], "-y": [16, 7, 1, "cmdoption-dagster-celery-worker-list-y"]}, "dagster-celery-worker-start": {"--app": [16, 7, 1, "cmdoption-dagster-celery-worker-start-A"], "--background": [16, 7, 1, "cmdoption-dagster-celery-worker-start-d"], "--config-yaml": [16, 7, 1, "cmdoption-dagster-celery-worker-start-y"], "--includes": [16, 7, 1, "cmdoption-dagster-celery-worker-start-i"], "--loglevel": [16, 7, 1, "cmdoption-dagster-celery-worker-start-l"], "--name": [16, 7, 1, "cmdoption-dagster-celery-worker-start-n"], "--queue": [16, 7, 1, "cmdoption-dagster-celery-worker-start-q"], "-A": [16, 7, 1, "cmdoption-dagster-celery-worker-start-A"], "-d": [16, 7, 1, "cmdoption-dagster-celery-worker-start-d"], "-i": [16, 7, 1, "cmdoption-dagster-celery-worker-start-i"], "-l": [16, 7, 1, "cmdoption-dagster-celery-worker-start-l"], "-n": [16, 7, 1, "cmdoption-dagster-celery-worker-start-n"], "-q": [16, 7, 1, "cmdoption-dagster-celery-worker-start-q"], "-y": [16, 7, 1, "cmdoption-dagster-celery-worker-start-y"], "ADDITIONAL_ARGS": [16, 7, 1, "cmdoption-dagster-celery-worker-start-arg-ADDITIONAL_ARGS"]}, "dagster-celery-worker-terminate": {"--all": [16, 7, 1, "cmdoption-dagster-celery-worker-terminate-a"], "--config-yaml": [16, 7, 1, "cmdoption-dagster-celery-worker-terminate-y"], "-a": [16, 7, 1, "cmdoption-dagster-celery-worker-terminate-a"], "-y": [16, 7, 1, "cmdoption-dagster-celery-worker-terminate-y"], "NAME": [16, 7, 1, "cmdoption-dagster-celery-worker-terminate-arg-NAME"]}, "dagster-daemon-run": {"--attribute": [2, 7, 1, "cmdoption-dagster-daemon-run-a"], "--empty-workspace": [2, 7, 1, "cmdoption-dagster-daemon-run-empty-workspace"], "--grpc-host": [2, 7, 1, "cmdoption-dagster-daemon-run-grpc-host"], "--grpc-port": [2, 7, 1, "cmdoption-dagster-daemon-run-grpc-port"], "--grpc-socket": [2, 7, 1, "cmdoption-dagster-daemon-run-grpc-socket"], "--module-name": [2, 7, 1, "cmdoption-dagster-daemon-run-m"], "--package-name": [2, 7, 1, "cmdoption-dagster-daemon-run-package-name"], "--python-file": [2, 7, 1, 
"cmdoption-dagster-daemon-run-f"], "--use-ssl": [2, 7, 1, "cmdoption-dagster-daemon-run-use-ssl"], "--working-directory": [2, 7, 1, "cmdoption-dagster-daemon-run-d"], "--workspace": [2, 7, 1, "cmdoption-dagster-daemon-run-w"], "-a": [2, 7, 1, "cmdoption-dagster-daemon-run-a"], "-d": [2, 7, 1, "cmdoption-dagster-daemon-run-d"], "-f": [2, 7, 1, "cmdoption-dagster-daemon-run-f"], "-m": [2, 7, 1, "cmdoption-dagster-daemon-run-m"], "-w": [2, 7, 1, "cmdoption-dagster-daemon-run-w"]}, "dagster-graphql": {"--attribute": [2, 7, 1, "cmdoption-dagster-graphql-a"], "--empty-workspace": [2, 7, 1, "cmdoption-dagster-graphql-empty-workspace"], "--ephemeral-instance": [2, 7, 1, "cmdoption-dagster-graphql-ephemeral-instance"], "--file": [2, 7, 1, "cmdoption-dagster-graphql-f"], "--grpc-host": [2, 7, 1, "cmdoption-dagster-graphql-grpc-host"], "--grpc-port": [2, 7, 1, "cmdoption-dagster-graphql-grpc-port"], "--grpc-socket": [2, 7, 1, "cmdoption-dagster-graphql-grpc-socket"], "--module-name": [2, 7, 1, "cmdoption-dagster-graphql-m"], "--output": [2, 7, 1, "cmdoption-dagster-graphql-o"], "--package-name": [2, 7, 1, "cmdoption-dagster-graphql-package-name"], "--predefined": [2, 7, 1, "cmdoption-dagster-graphql-p"], "--python-file": [2, 7, 1, "cmdoption-dagster-graphql-0"], "--remote": [2, 7, 1, "cmdoption-dagster-graphql-r"], "--text": [2, 7, 1, "cmdoption-dagster-graphql-t"], "--use-ssl": [2, 7, 1, "cmdoption-dagster-graphql-use-ssl"], "--variables": [2, 7, 1, "cmdoption-dagster-graphql-v"], "--version": [2, 7, 1, "cmdoption-dagster-graphql-version"], "--working-directory": [2, 7, 1, "cmdoption-dagster-graphql-d"], "--workspace": [2, 7, 1, "cmdoption-dagster-graphql-w"], "-a": [2, 7, 1, "cmdoption-dagster-graphql-a"], "-d": [2, 7, 1, "cmdoption-dagster-graphql-d"], "-f": [2, 7, 1, "cmdoption-dagster-graphql-0"], "-m": [2, 7, 1, "cmdoption-dagster-graphql-m"], "-o": [2, 7, 1, "cmdoption-dagster-graphql-o"], "-p": [2, 7, 1, "cmdoption-dagster-graphql-p"], "-r": [2, 7, 1, "cmdoption-dagster-graphql-r"], "-t": [2, 7, 1, "cmdoption-dagster-graphql-t"], "-v": [2, 7, 1, "cmdoption-dagster-graphql-v"], "-w": [2, 7, 1, "cmdoption-dagster-graphql-w"]}, "dagster.AssetGroup": {"build_job": [1, 1, 1, ""], "from_current_module": [1, 1, 1, ""], "from_modules": [1, 1, 1, ""], "from_package_module": [1, 1, 1, ""], "from_package_name": [1, 1, 1, ""], "get_base_jobs": [1, 1, 1, ""], "materialize": [1, 1, 1, ""], "prefixed": [1, 1, 1, ""], "to_source_assets": [1, 1, 1, ""]}, "dagster.AssetKey": {"to_string": [50, 1, 1, ""], "to_user_string": [50, 1, 1, ""]}, "dagster.AssetMaterialization": {"file": [50, 1, 1, ""]}, "dagster.CompositeSolidDefinition": {"configured": [57, 1, 1, ""]}, "dagster.CompositeSolidExecutionResult": {"output_for_solid": [57, 1, 1, ""], "result_for_handle": [57, 1, 1, ""], "result_for_solid": [57, 1, 1, ""], "solid_result_list": [57, 1, 1, ""], "step_event_list": [57, 1, 1, ""], "success": [57, 1, 1, ""]}, "dagster.DagsterEvent": {"event_specific_data": [6, 2, 1, ""], "event_type": [6, 1, 1, ""], "event_type_value": [6, 2, 1, ""], "logging_tags": [6, 2, 1, ""], "message": [6, 2, 1, ""], "pid": [6, 2, 1, ""], "pipeline_name": [6, 2, 1, ""], "solid_handle": [6, 2, 1, ""], "step_key": [6, 2, 1, ""], "step_kind_value": [6, 2, 1, ""]}, "dagster.DagsterEventType": {"ALERT_FAILURE": [6, 2, 1, ""], "ALERT_START": [6, 2, 1, ""], "ALERT_SUCCESS": [6, 2, 1, ""], "ASSET_MATERIALIZATION": [6, 2, 1, ""], "ASSET_MATERIALIZATION_PLANNED": [6, 2, 1, ""], "ASSET_OBSERVATION": [6, 2, 1, ""], "ASSET_STORE_OPERATION": [6, 2, 1, 
""], "ENGINE_EVENT": [6, 2, 1, ""], "HANDLED_OUTPUT": [6, 2, 1, ""], "HOOK_COMPLETED": [6, 2, 1, ""], "HOOK_ERRORED": [6, 2, 1, ""], "HOOK_SKIPPED": [6, 2, 1, ""], "LOADED_INPUT": [6, 2, 1, ""], "LOGS_CAPTURED": [6, 2, 1, ""], "OBJECT_STORE_OPERATION": [6, 2, 1, ""], "PIPELINE_CANCELED": [6, 2, 1, ""], "PIPELINE_CANCELING": [6, 2, 1, ""], "PIPELINE_DEQUEUED": [6, 2, 1, ""], "PIPELINE_ENQUEUED": [6, 2, 1, ""], "PIPELINE_FAILURE": [6, 2, 1, ""], "PIPELINE_START": [6, 2, 1, ""], "PIPELINE_STARTING": [6, 2, 1, ""], "PIPELINE_SUCCESS": [6, 2, 1, ""], "RUN_CANCELED": [6, 2, 1, ""], "RUN_CANCELING": [6, 2, 1, ""], "RUN_DEQUEUED": [6, 2, 1, ""], "RUN_ENQUEUED": [6, 2, 1, ""], "RUN_FAILURE": [6, 2, 1, ""], "RUN_START": [6, 2, 1, ""], "RUN_STARTING": [6, 2, 1, ""], "RUN_SUCCESS": [6, 2, 1, ""], "STEP_EXPECTATION_RESULT": [6, 2, 1, ""], "STEP_FAILURE": [6, 2, 1, ""], "STEP_INPUT": [6, 2, 1, ""], "STEP_OUTPUT": [6, 2, 1, ""], "STEP_RESTARTED": [6, 2, 1, ""], "STEP_SKIPPED": [6, 2, 1, ""], "STEP_START": [6, 2, 1, ""], "STEP_SUCCESS": [6, 2, 1, ""], "STEP_UP_FOR_RETRY": [6, 2, 1, ""]}, "dagster.DagsterInstance": {"add_daemon_heartbeat": [9, 1, 1, ""], "get_daemon_heartbeats": [9, 1, 1, ""], "launch_run": [9, 1, 1, ""], "report_engine_event": [9, 1, 1, ""], "resume_run": [9, 1, 1, ""], "should_start_background_run_thread": [9, 1, 1, ""], "submit_run": [9, 1, 1, ""]}, "dagster.DagsterRunStatus": {"CANCELED": [9, 2, 1, ""], "CANCELING": [9, 2, 1, ""], "FAILURE": [9, 2, 1, ""], "MANAGED": [9, 2, 1, ""], "NOT_STARTED": [9, 2, 1, ""], "QUEUED": [9, 2, 1, ""], "STARTED": [9, 2, 1, ""], "STARTING": [9, 2, 1, ""], "SUCCESS": [9, 2, 1, ""]}, "dagster.Enum": {"from_python_enum": [3, 1, 1, ""]}, "dagster.ExecuteInProcessResult": {"all_events": [6, 1, 1, ""], "all_node_events": [6, 1, 1, ""], "dagster_run": [6, 1, 1, ""], "events_for_node": [6, 1, 1, ""], "get_job_failure_event": [6, 1, 1, ""], "get_job_success_event": [6, 1, 1, ""], "output_for_node": [6, 1, 1, ""], "output_value": [6, 1, 1, ""], "run_id": [6, 1, 1, ""], "success": [6, 1, 1, ""]}, "dagster.Executor": {"execute": [9, 1, 1, ""], "retries": [9, 1, 1, ""]}, "dagster.ExecutorDefinition": {"configured": [9, 1, 1, ""]}, "dagster.FileHandle": {"path_desc": [58, 1, 1, ""]}, "dagster.GraphDefinition": {"execute_in_process": [7, 1, 1, ""], "to_job": [7, 1, 1, ""]}, "dagster.HookContext": {"hook_def": [8, 2, 1, ""], "job_name": [8, 2, 1, ""], "log": [8, 2, 1, ""], "mode_def": [8, 2, 1, ""], "op": [8, 2, 1, ""], "op_config": [8, 2, 1, ""], "op_exception": [8, 2, 1, ""], "op_output_values": [8, 2, 1, ""], "pipeline_name": [8, 2, 1, ""], "required_resource_keys": [8, 2, 1, ""], "resources": [8, 2, 1, ""], "run_id": [8, 2, 1, ""], "solid": [8, 2, 1, ""], "solid_config": [8, 2, 1, ""], "solid_exception": [8, 1, 1, ""], "solid_output_values": [8, 1, 1, ""], "step_key": [8, 2, 1, ""]}, "dagster.IOManager": {"get_input_asset_key": [10, 1, 1, ""], "get_input_asset_partitions": [10, 1, 1, ""], "get_output_asset_key": [10, 1, 1, ""], "get_output_asset_partitions": [10, 1, 1, ""], "handle_output": [10, 1, 1, ""], "load_input": [10, 1, 1, ""]}, "dagster.IOManagerDefinition": {"hardcoded_io_manager": [10, 1, 1, ""], "input_config_schema": [10, 1, 1, ""], "output_config_schema": [10, 1, 1, ""]}, "dagster.InitExecutorContext": {"executor_config": [9, 2, 1, ""], "executor_def": [9, 2, 1, ""], "instance": [9, 2, 1, ""], "job": [9, 2, 1, ""]}, "dagster.InitLoggerContext": {"logger_config": [47, 2, 1, ""], "logger_def": [47, 2, 1, ""], "pipeline_def": [47, 2, 1, ""], "run_id": 
[47, 2, 1, ""]}, "dagster.InitResourceContext": {"dagster_run": [55, 2, 1, ""], "log_manager": [55, 2, 1, ""], "pipeline_run": [55, 2, 1, ""], "resource_config": [55, 2, 1, ""], "resource_def": [55, 2, 1, ""], "resources": [55, 2, 1, ""], "run_id": [55, 2, 1, ""]}, "dagster.InputContext": {"add_input_metadata": [10, 1, 1, ""], "asset_partition_key": [10, 1, 1, ""], "asset_partition_key_range": [10, 1, 1, ""], "asset_partitions_time_window": [10, 1, 1, ""], "config": [10, 2, 1, ""], "consume_events": [10, 1, 1, ""], "dagster_type": [10, 2, 1, ""], "get_identifier": [10, 1, 1, ""], "get_observations": [10, 1, 1, ""], "has_input_name": [10, 1, 1, ""], "has_partition_key": [10, 1, 1, ""], "log": [10, 2, 1, ""], "metadata": [10, 2, 1, ""], "name": [10, 2, 1, ""], "op_def": [10, 2, 1, ""], "partition_key": [10, 1, 1, ""], "pipeline_name": [10, 2, 1, ""], "resource_config": [10, 2, 1, ""], "resources": [10, 2, 1, ""], "solid_def": [10, 2, 1, ""], "upstream_output": [10, 2, 1, ""]}, "dagster.JobDefinition": {"execute_in_process": [11, 1, 1, ""], "with_hooks": [11, 1, 1, ""]}, "dagster.LoggerDefinition": {"configured": [47, 1, 1, ""]}, "dagster.MemoizableIOManager": {"has_output": [48, 1, 1, ""]}, "dagster.MetadataEntry": {"asset": [50, 1, 1, ""], "float": [50, 1, 1, ""], "fspath": [50, 1, 1, ""], "int": [50, 1, 1, ""], "json": [50, 1, 1, ""], "md": [50, 1, 1, ""], "path": [50, 1, 1, ""], "table": [50, 1, 1, ""], "table_schema": [50, 1, 1, ""], "text": [50, 1, 1, ""], "url": [50, 1, 1, ""], "value": [50, 1, 1, ""]}, "dagster.MetadataValue": {"asset": [50, 1, 1, ""], "bool": [50, 1, 1, ""], "dagster_run": [50, 1, 1, ""], "float": [50, 1, 1, ""], "int": [50, 1, 1, ""], "json": [50, 1, 1, ""], "md": [50, 1, 1, ""], "path": [50, 1, 1, ""], "python_artifact": [50, 1, 1, ""], "table": [50, 1, 1, ""], "table_schema": [50, 1, 1, ""], "text": [50, 1, 1, ""], "url": [50, 1, 1, ""]}, "dagster.OpDefinition": {"configured": [50, 1, 1, ""]}, "dagster.OpExecutionContext": {"add_output_metadata": [6, 1, 1, ""], "consume_events": [6, 1, 1, ""], "get_mapping_key": [6, 1, 1, ""], "get_tag": [6, 1, 1, ""], "has_partition_key": [6, 1, 1, ""], "has_tag": [6, 1, 1, ""], "instance": [6, 1, 1, ""], "job_def": [6, 1, 1, ""], "job_name": [6, 1, 1, ""], "log": [6, 1, 1, ""], "log_event": [6, 1, 1, ""], "mode_def": [6, 1, 1, ""], "op_def": [6, 1, 1, ""], "output_asset_partition_key": [6, 1, 1, ""], "output_asset_partitions_time_window": [6, 1, 1, ""], "partition_key": [6, 1, 1, ""], "partition_time_window": [6, 1, 1, ""], "pdb": [6, 1, 1, ""], "pipeline_def": [6, 1, 1, ""], "pipeline_name": [6, 1, 1, ""], "pipeline_run": [6, 1, 1, ""], "resources": [6, 1, 1, ""], "retry_number": [6, 1, 1, ""], "run": [6, 1, 1, ""], "run_config": [6, 1, 1, ""], "run_id": [6, 1, 1, ""], "solid_config": [6, 1, 1, ""], "solid_def": [6, 1, 1, ""], "step_launcher": [6, 1, 1, ""]}, "dagster.OutputContext": {"add_output_metadata": [10, 1, 1, ""], "asset_info": [10, 2, 1, ""], "asset_partition_key": [10, 1, 1, ""], "asset_partition_key_range": [10, 1, 1, ""], "asset_partitions_time_window": [10, 1, 1, ""], "config": [10, 2, 1, ""], "consume_events": [10, 1, 1, ""], "consume_logged_metadata_entries": [10, 1, 1, ""], "dagster_type": [10, 2, 1, ""], "get_identifier": [10, 1, 1, ""], "get_logged_events": [10, 1, 1, ""], "get_logged_metadata_entries": [10, 1, 1, ""], "get_run_scoped_output_identifier": [10, 1, 1, ""], "has_partition_key": [10, 1, 1, ""], "log": [10, 2, 1, ""], "log_event": [10, 1, 1, ""], "mapping_key": [10, 2, 1, ""], "metadata": [10, 2, 
1, ""], "name": [10, 2, 1, ""], "op_def": [10, 2, 1, ""], "partition_key": [10, 1, 1, ""], "pipeline_name": [10, 2, 1, ""], "resource_config": [10, 2, 1, ""], "resources": [10, 2, 1, ""], "run_id": [10, 2, 1, ""], "solid_def": [10, 2, 1, ""], "step_key": [10, 2, 1, ""], "version": [10, 2, 1, ""]}, "dagster.PartitionSetDefinition": {"create_schedule_definition": [51, 1, 1, ""], "get_partitions": [51, 1, 1, ""]}, "dagster.PartitionedConfig": {"get_run_config_for_partition_key": [51, 1, 1, ""]}, "dagster.PipelineExecutionResult": {"output_for_solid": [52, 1, 1, ""], "result_for_handle": [52, 1, 1, ""], "result_for_solid": [52, 1, 1, ""], "solid_result_list": [52, 1, 1, ""], "step_event_list": [52, 1, 1, ""], "success": [52, 1, 1, ""]}, "dagster.PipelineFailureSensorContext": {"failure_event": [56, 2, 1, ""], "pipeline_run": [56, 2, 1, ""], "sensor_name": [56, 2, 1, ""]}, "dagster.PresetDefinition": {"from_files": [53, 1, 1, ""], "from_pkg_resources": [53, 1, 1, ""], "from_yaml_strings": [53, 1, 1, ""], "get_environment_yaml": [53, 1, 1, ""], "with_additional_config": [53, 1, 1, ""]}, "dagster.RepositoryDefinition": {"get_all_jobs": [54, 1, 1, ""], "get_job": [54, 1, 1, ""], "has_job": [54, 1, 1, ""], "job_names": [54, 1, 1, ""]}, "dagster.ResourceDefinition": {"configured": [55, 1, 1, ""], "hardcoded_resource": [55, 1, 1, ""], "mock_resource": [55, 1, 1, ""], "none_resource": [55, 1, 1, ""]}, "dagster.RootInputManager": {"load_input": [10, 1, 1, ""]}, "dagster.RootInputManagerDefinition": {"input_config_schema": [10, 1, 1, ""]}, "dagster.RunFailureSensorContext": {"failure_event": [56, 2, 1, ""], "pipeline_run": [56, 2, 1, ""], "sensor_name": [56, 2, 1, ""]}, "dagster.RunRequest": {"job_name": [56, 2, 1, ""], "run_config": [56, 2, 1, ""], "run_key": [56, 2, 1, ""], "tags": [56, 2, 1, ""]}, "dagster.RunStatusSensorContext": {"dagster_event": [56, 2, 1, ""], "dagster_run": [56, 2, 1, ""], "for_run_failure": [56, 1, 1, ""], "instance": [56, 2, 1, ""], "sensor_name": [56, 2, 1, ""]}, "dagster.ScheduleEvaluationContext": {"instance_ref": [56, 2, 1, ""], "scheduled_execution_time": [56, 2, 1, ""]}, "dagster.SensorEvaluationContext": {"cursor": [56, 2, 1, ""], "instance": [56, 2, 1, ""], "instance_ref": [56, 2, 1, ""], "last_completion_time": [56, 2, 1, ""], "last_run_key": [56, 2, 1, ""], "repository_name": [56, 2, 1, ""]}, "dagster.SkipReason": {"skip_message": [56, 2, 1, ""]}, "dagster.SolidDefinition": {"configured": [57, 1, 1, ""]}, "dagster.SolidExecutionContext": {"add_output_metadata": [57, 1, 1, ""], "consume_events": [57, 1, 1, ""], "get_mapping_key": [57, 1, 1, ""], "get_tag": [57, 1, 1, ""], "has_partition_key": [57, 1, 1, ""], "has_tag": [57, 1, 1, ""], "instance": [57, 1, 1, ""], "job_def": [57, 1, 1, ""], "job_name": [57, 1, 1, ""], "log": [57, 1, 1, ""], "log_event": [57, 1, 1, ""], "mode_def": [57, 1, 1, ""], "op_def": [57, 1, 1, ""], "output_asset_partition_key": [57, 1, 1, ""], "output_asset_partitions_time_window": [57, 1, 1, ""], "partition_key": [57, 1, 1, ""], "partition_time_window": [57, 1, 1, ""], "pdb": [57, 1, 1, ""], "pipeline_def": [57, 1, 1, ""], "pipeline_name": [57, 1, 1, ""], "pipeline_run": [57, 1, 1, ""], "resources": [57, 1, 1, ""], "retry_number": [57, 1, 1, ""], "run": [57, 1, 1, ""], "run_config": [57, 1, 1, ""], "run_id": [57, 1, 1, ""], "solid_config": [57, 1, 1, ""], "solid_def": [57, 1, 1, ""], "step_launcher": [57, 1, 1, ""]}, "dagster.SolidExecutionResult": {"compute_input_event_dict": [57, 1, 1, ""], "compute_output_events_dict": [57, 1, 1, ""], 
"compute_step_events": [57, 1, 1, ""], "compute_step_failure_event": [57, 1, 1, ""], "expectation_events_during_compute": [57, 1, 1, ""], "expectation_results_during_compute": [57, 1, 1, ""], "failure_data": [57, 1, 1, ""], "get_output_event_for_compute": [57, 1, 1, ""], "get_output_events_for_compute": [57, 1, 1, ""], "get_step_success_event": [57, 1, 1, ""], "input_events_during_compute": [57, 1, 1, ""], "materialization_events_during_compute": [57, 1, 1, ""], "materializations_during_compute": [57, 1, 1, ""], "output_events_during_compute": [57, 1, 1, ""], "output_value": [57, 1, 1, ""], "output_values": [57, 1, 1, ""], "retry_attempts": [57, 1, 1, ""], "skipped": [57, 1, 1, ""], "success": [57, 1, 1, ""]}, "dagster.SourceAsset": {"description": [1, 2, 1, ""], "io_manager_key": [1, 2, 1, ""], "key": [1, 2, 1, ""], "metadata_entries": [1, 2, 1, ""], "partitions_def": [1, 2, 1, ""]}, "dagster.TypeCheckContext": {"log": [6, 2, 1, ""], "resources": [6, 2, 1, ""], "run_id": [6, 2, 1, ""]}, "dagster.core": {"errors": [5, 5, 0, "-"]}, "dagster.core.definitions.reconstruct": {"ReconstructablePipeline": [52, 0, 1, ""]}, "dagster.core.definitions.reconstruct.ReconstructablePipeline": {"get_module": [52, 1, 1, ""]}, "dagster.core.errors": {"user_code_error_boundary": [9, 4, 1, ""]}, "dagster.core.instance": {"InstanceRef": [9, 0, 1, ""]}, "dagster.core.launcher": {"DefaultRunLauncher": [9, 0, 1, ""], "RunLauncher": [9, 0, 1, ""]}, "dagster.core.run_coordinator": {"DefaultRunCoordinator": [9, 0, 1, ""], "QueuedRunCoordinator": [9, 6, 1, ""]}, "dagster.core.scheduler": {"DagsterDaemonScheduler": [56, 6, 1, ""], "Scheduler": [9, 0, 1, ""]}, "dagster.core.storage.compute_log_manager": {"ComputeLogManager": [9, 0, 1, ""]}, "dagster.core.storage.event_log": {"ConsolidatedSqliteEventLogStorage": [9, 0, 1, ""], "EventLogEntry": [9, 0, 1, ""], "EventLogRecord": [9, 0, 1, ""], "EventLogStorage": [9, 0, 1, ""], "EventRecordsFilter": [9, 0, 1, ""], "RunShardedEventsCursor": [9, 0, 1, ""], "SqlEventLogStorage": [9, 0, 1, ""], "SqliteEventLogStorage": [9, 0, 1, ""]}, "dagster.core.storage.file_manager": {"FileManager": [9, 0, 1, ""]}, "dagster.core.storage.file_manager.FileManager": {"copy_handle_to_local_temp": [9, 1, 1, ""], "delete_local_temp": [9, 1, 1, ""], "read": [9, 1, 1, ""], "read_data": [9, 1, 1, ""], "write": [9, 1, 1, ""], "write_data": [9, 1, 1, ""]}, "dagster.core.storage.local_compute_log_manager": {"LocalComputeLogManager": [9, 0, 1, ""]}, "dagster.core.storage.root": {"LocalArtifactStorage": [9, 0, 1, ""]}, "dagster.core.storage.root.LocalArtifactStorage": {"config_type": [9, 1, 1, ""], "from_config_value": [9, 1, 1, ""], "inst_data": [9, 1, 1, ""]}, "dagster.core.storage.runs": {"RunStorage": [9, 0, 1, ""], "SqlRunStorage": [9, 0, 1, ""], "SqliteRunStorage": [9, 0, 1, ""]}, "dagster.core.storage.schedules": {"ScheduleStorage": [9, 0, 1, ""], "SqlScheduleStorage": [9, 0, 1, ""], "SqliteScheduleStorage": [9, 0, 1, ""]}, "dagster.loggers": {"colored_console_logger": [47, 4, 1, ""], "json_console_logger": [47, 4, 1, ""]}, "dagster.serdes": {"ConfigurableClass": [9, 0, 1, ""], "ConfigurableClassData": [9, 0, 1, ""]}, "dagster.serdes.ConfigurableClass": {"config_type": [9, 1, 1, ""], "from_config_value": [9, 1, 1, ""], "inst_data": [9, 1, 1, ""]}, "dagster.utils": {"make_email_on_run_failure_sensor": [59, 4, 1, ""]}, "dagster.utils.forked_pdb": {"ForkedPdb": [59, 0, 1, ""]}, "dagster_airbyte": {"AirbyteResource": [12, 0, 1, ""], "airbyte_resource": [12, 6, 1, ""], "airbyte_sync_op": [12, 6, 1, ""], 
"build_airbyte_assets": [12, 4, 1, ""]}, "dagster_airbyte.AirbyteResource": {"make_request": [12, 1, 1, ""], "sync_and_poll": [12, 1, 1, ""]}, "dagster_airflow": {"make_airflow_dag": [13, 4, 1, ""], "make_airflow_dag_containerized": [13, 4, 1, ""], "make_airflow_dag_for_operator": [13, 4, 1, ""], "make_dagster_job_from_airflow_dag": [13, 4, 1, ""], "make_dagster_pipeline_from_airflow_dag": [13, 4, 1, ""], "make_dagster_repo_from_airflow_dag_bag": [13, 4, 1, ""], "make_dagster_repo_from_airflow_dags_path": [13, 4, 1, ""], "make_dagster_repo_from_airflow_example_dags": [13, 4, 1, ""]}, "dagster_aws.cloudwatch": {"cloudwatch_logger": [14, 6, 1, ""]}, "dagster_aws.ecs": {"EcsRunLauncher": [14, 6, 1, ""]}, "dagster_aws.emr": {"EmrClusterState": [14, 6, 1, ""], "EmrError": [14, 0, 1, ""], "EmrJobRunner": [14, 0, 1, ""], "EmrStepState": [14, 6, 1, ""], "emr_pyspark_step_launcher": [14, 6, 1, ""]}, "dagster_aws.redshift": {"fake_redshift_resource": [14, 6, 1, ""], "redshift_resource": [14, 6, 1, ""]}, "dagster_aws.s3": {"S3ComputeLogManager": [14, 0, 1, ""], "S3Coordinate": [14, 6, 1, ""], "S3FileCache": [14, 0, 1, ""], "S3FileHandle": [14, 0, 1, ""], "s3_file_manager": [14, 6, 1, ""], "s3_pickle_asset_io_manager": [14, 6, 1, ""], "s3_pickle_io_manager": [14, 6, 1, ""], "s3_resource": [14, 6, 1, ""]}, "dagster_aws.s3.S3FileHandle": {"path_desc": [14, 1, 1, ""], "s3_bucket": [14, 1, 1, ""], "s3_key": [14, 1, 1, ""], "s3_path": [14, 1, 1, ""]}, "dagster_aws.secretsmanager": {"secretsmanager_resource": [14, 6, 1, ""], "secretsmanager_secrets_resource": [14, 6, 1, ""]}, "dagster_azure.adls2": {"ADLS2FileHandle": [15, 0, 1, ""], "FakeADLS2Resource": [15, 0, 1, ""], "adls2_file_cache": [15, 6, 1, ""], "adls2_file_manager": [15, 6, 1, ""], "adls2_pickle_asset_io_manager": [15, 6, 1, ""], "adls2_pickle_io_manager": [15, 6, 1, ""], "adls2_resource": [15, 6, 1, ""]}, "dagster_azure.adls2.ADLS2FileHandle": {"account": [15, 1, 1, ""], "adls2_path": [15, 1, 1, ""], "file_system": [15, 1, 1, ""], "key": [15, 1, 1, ""], "path_desc": [15, 1, 1, ""]}, "dagster_azure.blob": {"AzureBlobComputeLogManager": [15, 0, 1, ""]}, "dagster_celery": {"celery_executor": [16, 6, 1, ""]}, "dagster_celery_docker": {"celery_docker_executor": [17, 6, 1, ""]}, "dagster_celery_k8s": {"CeleryK8sRunLauncher": [18, 6, 1, ""], "celery_k8s_job_executor": [18, 6, 1, ""]}, "dagster_dask": {"dask_executor": [19, 6, 1, ""]}, "dagster_databricks": {"DatabricksError": [20, 0, 1, ""], "create_databricks_job_op": [20, 4, 1, ""], "create_databricks_job_solid": [20, 4, 1, ""], "databricks_pyspark_step_launcher": [20, 6, 1, ""]}, "dagster_datadog": {"datadog_resource": [21, 6, 1, ""]}, "dagster_dbt": {"DagsterDbtCliFatalRuntimeError": [22, 3, 1, ""], "DagsterDbtCliHandledRuntimeError": [22, 3, 1, ""], "DagsterDbtCliOutputsNotFoundError": [22, 3, 1, ""], "DagsterDbtCliRuntimeError": [22, 3, 1, ""], "DagsterDbtCliUnexpectedOutputError": [22, 3, 1, ""], "DagsterDbtError": [22, 3, 1, ""], "DagsterDbtRpcUnexpectedPollOutputError": [22, 3, 1, ""], "DbtCliOutput": [22, 0, 1, ""], "DbtCliResource": [22, 0, 1, ""], "DbtCloudResourceV2": [22, 0, 1, ""], "DbtOutput": [22, 0, 1, ""], "DbtResource": [22, 0, 1, ""], "DbtRpcOutput": [22, 0, 1, ""], "DbtRpcResource": [22, 0, 1, ""], "DbtRpcSyncResource": [22, 0, 1, ""], "create_dbt_rpc_run_sql_solid": [22, 4, 1, ""], "dbt_cli_compile": [22, 6, 1, ""], "dbt_cli_resource": [22, 6, 1, ""], "dbt_cli_run": [22, 6, 1, ""], "dbt_cli_run_operation": [22, 6, 1, ""], "dbt_cli_snapshot": [22, 6, 1, ""], 
"dbt_cli_snapshot_freshness": [22, 6, 1, ""], "dbt_cli_test": [22, 6, 1, ""], "dbt_cloud_resource": [22, 6, 1, ""], "dbt_cloud_run_op": [22, 6, 1, ""], "dbt_compile_op": [22, 4, 1, ""], "dbt_docs_generate_op": [22, 4, 1, ""], "dbt_ls_op": [22, 4, 1, ""], "dbt_rpc_compile_sql": [22, 6, 1, ""], "dbt_rpc_resource": [22, 6, 1, ""], "dbt_rpc_run": [22, 6, 1, ""], "dbt_rpc_run_and_wait": [22, 6, 1, ""], "dbt_rpc_run_operation": [22, 6, 1, ""], "dbt_rpc_run_operation_and_wait": [22, 6, 1, ""], "dbt_rpc_snapshot": [22, 6, 1, ""], "dbt_rpc_snapshot_and_wait": [22, 6, 1, ""], "dbt_rpc_snapshot_freshness": [22, 6, 1, ""], "dbt_rpc_snapshot_freshness_and_wait": [22, 6, 1, ""], "dbt_rpc_sync_resource": [22, 6, 1, ""], "dbt_rpc_test": [22, 6, 1, ""], "dbt_rpc_test_and_wait": [22, 6, 1, ""], "dbt_run_op": [22, 6, 1, ""], "dbt_seed_op": [22, 4, 1, ""], "dbt_snapshot_op": [22, 4, 1, ""], "dbt_test_op": [22, 4, 1, ""], "load_assets_from_dbt_manifest": [22, 4, 1, ""], "load_assets_from_dbt_project": [22, 4, 1, ""], "local_dbt_rpc_resource": [22, 6, 1, ""]}, "dagster_dbt.DbtCliOutput": {"command": [22, 2, 1, ""], "docs_url": [22, 2, 1, ""], "logs": [22, 2, 1, ""], "raw_output": [22, 2, 1, ""], "result": [22, 2, 1, ""], "return_code": [22, 2, 1, ""]}, "dagster_dbt.DbtCliResource": {"build": [22, 1, 1, ""], "cli": [22, 1, 1, ""], "compile": [22, 1, 1, ""], "default_flags": [22, 1, 1, ""], "freshness": [22, 1, 1, ""], "generate_docs": [22, 1, 1, ""], "get_manifest_json": [22, 1, 1, ""], "get_run_results_json": [22, 1, 1, ""], "ls": [22, 1, 1, ""], "run": [22, 1, 1, ""], "run_operation": [22, 1, 1, ""], "seed": [22, 1, 1, ""], "snapshot": [22, 1, 1, ""], "strict_flags": [22, 1, 1, ""], "test": [22, 1, 1, ""]}, "dagster_dbt.DbtCloudResourceV2": {"cancel_run": [22, 1, 1, ""], "get_job": [22, 1, 1, ""], "get_manifest": [22, 1, 1, ""], "get_run": [22, 1, 1, ""], "get_run_artifact": [22, 1, 1, ""], "get_run_results": [22, 1, 1, ""], "get_run_steps": [22, 1, 1, ""], "get_runs": [22, 1, 1, ""], "list_run_artifacts": [22, 1, 1, ""], "make_request": [22, 1, 1, ""], "poll_run": [22, 1, 1, ""], "run_job": [22, 1, 1, ""], "run_job_and_poll": [22, 1, 1, ""], "update_job": [22, 1, 1, ""]}, "dagster_dbt.DbtResource": {"build": [22, 1, 1, ""], "compile": [22, 1, 1, ""], "generate_docs": [22, 1, 1, ""], "get_manifest_json": [22, 1, 1, ""], "get_run_results_json": [22, 1, 1, ""], "logger": [22, 1, 1, ""], "ls": [22, 1, 1, ""], "run": [22, 1, 1, ""], "run_operation": [22, 1, 1, ""], "seed": [22, 1, 1, ""], "snapshot": [22, 1, 1, ""], "test": [22, 1, 1, ""]}, "dagster_dbt.DbtRpcOutput": {"response": [22, 2, 1, ""], "response_dict": [22, 2, 1, ""], "result": [22, 2, 1, ""]}, "dagster_dbt.DbtRpcResource": {"build": [22, 1, 1, ""], "cli": [22, 1, 1, ""], "compile": [22, 1, 1, ""], "compile_sql": [22, 1, 1, ""], "generate_docs": [22, 1, 1, ""], "get_manifest_json": [22, 1, 1, ""], "get_run_results_json": [22, 1, 1, ""], "host": [22, 1, 1, ""], "jsonrpc_version": [22, 1, 1, ""], "kill": [22, 1, 1, ""], "logger": [22, 1, 1, ""], "ls": [22, 1, 1, ""], "poll": [22, 1, 1, ""], "port": [22, 1, 1, ""], "ps": [22, 1, 1, ""], "run": [22, 1, 1, ""], "run_operation": [22, 1, 1, ""], "run_sql": [22, 1, 1, ""], "seed": [22, 1, 1, ""], "snapshot": [22, 1, 1, ""], "snapshot_freshness": [22, 1, 1, ""], "status": [22, 1, 1, ""], "test": [22, 1, 1, ""], "url": [22, 1, 1, ""]}, "dagster_dbt.utils": {"generate_materializations": [22, 4, 1, ""]}, "dagster_docker": {"DockerRunLauncher": [23, 6, 1, ""], "docker_executor": [23, 6, 1, ""]}, "dagster_fivetran": 
{"FivetranResource": [24, 0, 1, ""], "build_fivetran_assets": [24, 4, 1, ""], "fivetran_resource": [24, 6, 1, ""], "fivetran_sync_op": [24, 6, 1, ""]}, "dagster_fivetran.FivetranResource": {"get_connector_details": [24, 1, 1, ""], "get_connector_sync_status": [24, 1, 1, ""], "make_request": [24, 1, 1, ""], "poll_sync": [24, 1, 1, ""], "resync_and_poll": [24, 1, 1, ""], "start_resync": [24, 1, 1, ""], "start_sync": [24, 1, 1, ""], "sync_and_poll": [24, 1, 1, ""], "update_connector": [24, 1, 1, ""], "update_schedule_type": [24, 1, 1, ""]}, "dagster_gcp": {"BigQueryError": [25, 0, 1, ""], "GCSFileHandle": [25, 0, 1, ""], "bigquery_resource": [25, 6, 1, ""], "bq_create_dataset": [25, 4, 1, ""], "bq_delete_dataset": [25, 4, 1, ""], "bq_op_for_queries": [25, 4, 1, ""], "bq_solid_for_queries": [25, 4, 1, ""], "dataproc_op": [25, 6, 1, ""], "dataproc_resource": [25, 6, 1, ""], "dataproc_solid": [25, 4, 1, ""], "gcs_file_manager": [25, 6, 1, ""], "gcs_resource": [25, 6, 1, ""], "import_df_to_bq": [25, 4, 1, ""], "import_file_to_bq": [25, 4, 1, ""], "import_gcs_paths_to_bq": [25, 4, 1, ""]}, "dagster_gcp.GCSFileHandle": {"gcs_bucket": [25, 1, 1, ""], "gcs_key": [25, 1, 1, ""], "gcs_path": [25, 1, 1, ""], "path_desc": [25, 1, 1, ""]}, "dagster_gcp.gcs": {"gcs_pickle_asset_io_manager": [25, 6, 1, ""], "gcs_pickle_io_manager": [25, 6, 1, ""]}, "dagster_ge": {"ge_validation_op_factory": [26, 4, 1, ""], "ge_validation_solid_factory": [26, 4, 1, ""]}, "dagster_github": {"github_resource": [27, 6, 1, ""]}, "dagster_graphql": {"DagsterGraphQLClient": [28, 0, 1, ""], "DagsterGraphQLClientError": [28, 3, 1, ""], "InvalidOutputErrorInfo": [28, 0, 1, ""], "ReloadRepositoryLocationInfo": [28, 0, 1, ""], "ReloadRepositoryLocationStatus": [28, 0, 1, ""]}, "dagster_graphql.DagsterGraphQLClient": {"get_run_status": [28, 1, 1, ""], "reload_repository_location": [28, 1, 1, ""], "shutdown_repository_location": [28, 1, 1, ""], "submit_job_execution": [28, 1, 1, ""], "submit_pipeline_execution": [28, 1, 1, ""], "terminate_run": [28, 1, 1, ""]}, "dagster_k8s": {"K8sRunLauncher": [29, 6, 1, ""], "k8s_job_executor": [29, 6, 1, ""]}, "dagster_mlflow": {"end_mlflow_on_run_finished": [30, 6, 1, ""], "end_mlflow_run_on_pipeline_finished": [30, 6, 1, ""], "mlflow_tracking": [30, 6, 1, ""]}, "dagster_msteams": {"make_teams_on_pipeline_failure_sensor": [31, 4, 1, ""], "msteams_resource": [31, 6, 1, ""], "teams_on_failure": [31, 6, 1, ""], "teams_on_success": [31, 6, 1, ""]}, "dagster_mysql": {"MySQLEventLogStorage": [32, 0, 1, ""], "MySQLRunStorage": [32, 0, 1, ""], "MySQLScheduleStorage": [32, 0, 1, ""]}, "dagster_pagerduty": {"pagerduty_resource": [33, 6, 1, ""]}, "dagster_pandas": {"DataFrame": [34, 6, 1, ""], "PandasColumn": [34, 0, 1, ""], "RowCountConstraint": [34, 0, 1, ""], "StrictColumnsConstraint": [34, 0, 1, ""], "create_dagster_pandas_dataframe_type": [34, 4, 1, ""]}, "dagster_pandas.PandasColumn": {"boolean_column": [34, 1, 1, ""], "categorical_column": [34, 1, 1, ""], "datetime_column": [34, 1, 1, ""], "exists": [34, 1, 1, ""], "float_column": [34, 1, 1, ""], "integer_column": [34, 1, 1, ""], "numeric_column": [34, 1, 1, ""], "string_column": [34, 1, 1, ""]}, "dagster_papertrail": {"papertrail_logger": [35, 6, 1, ""]}, "dagster_postgres": {"PostgresEventLogStorage": [36, 6, 1, ""], "PostgresRunStorage": [36, 6, 1, ""], "PostgresScheduleStorage": [36, 6, 1, ""]}, "dagster_prometheus": {"prometheus_resource": [37, 6, 1, ""]}, "dagster_prometheus.resources": {"PrometheusResource": [37, 0, 1, ""]}, "dagster_pyspark": 
{"pyspark_resource": [38, 6, 1, ""]}, "dagster_shell": {"create_shell_command_op": [39, 4, 1, ""], "create_shell_command_solid": [39, 4, 1, ""], "create_shell_script_op": [39, 4, 1, ""], "create_shell_script_solid": [39, 4, 1, ""], "shell_op": [39, 4, 1, ""], "shell_solid": [39, 4, 1, ""]}, "dagster_slack": {"make_slack_on_pipeline_failure_sensor": [40, 4, 1, ""], "make_slack_on_run_failure_sensor": [40, 4, 1, ""], "slack_on_failure": [40, 6, 1, ""], "slack_on_success": [40, 6, 1, ""], "slack_resource": [40, 6, 1, ""]}, "dagster_snowflake": {"SnowflakeConnection": [41, 0, 1, ""], "build_snowflake_io_manager": [41, 4, 1, ""], "snowflake_op_for_query": [41, 4, 1, ""], "snowflake_resource": [41, 6, 1, ""]}, "dagster_snowflake.SnowflakeConnection": {"execute_queries": [41, 1, 1, ""], "execute_query": [41, 1, 1, ""], "get_connection": [41, 1, 1, ""], "load_table_from_local_parquet": [41, 1, 1, ""]}, "dagster_snowflake_pandas": {"SnowflakePandasTypeHandler": [42, 0, 1, ""]}, "dagster_spark": {"SparkOpError": [43, 0, 1, ""], "construct_spark_shell_command": [43, 4, 1, ""], "create_spark_op": [43, 4, 1, ""], "define_spark_config": [43, 4, 1, ""], "spark_resource": [43, 6, 1, ""]}, "dagster_ssh": {"SSHResource": [44, 0, 1, ""], "ssh_resource": [44, 6, 1, ""]}, "dagster_twilio": {"twilio_resource": [45, 6, 1, ""]}, "dagstermill": {"DagstermillError": [46, 0, 1, ""], "DagstermillExecutionContext": [46, 0, 1, ""], "define_dagstermill_op": [46, 4, 1, ""], "define_dagstermill_solid": [46, 4, 1, ""], "get_context": [46, 4, 1, ""], "local_output_notebook_io_manager": [46, 4, 1, ""], "yield_event": [46, 4, 1, ""], "yield_result": [46, 4, 1, ""]}, "dagstermill.DagstermillExecutionContext": {"get_tag": [46, 1, 1, ""], "has_tag": [46, 1, 1, ""], "log": [46, 1, 1, ""], "logging_tags": [46, 1, 1, ""], "pipeline_def": [46, 1, 1, ""], "pipeline_run": [46, 1, 1, ""], "resolved_run_config": [46, 1, 1, ""], "resources": [46, 1, 1, ""], "run_config": [46, 1, 1, ""], "run_id": [46, 1, 1, ""], "solid": [46, 1, 1, ""], "solid_config": [46, 1, 1, ""], "solid_def": [46, 1, 1, ""]}}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "method", "Python method"], "2": ["py", "attribute", "Python attribute"], "3": ["py", "exception", "Python exception"], "4": ["py", "function", "Python function"], "5": ["py", "module", "Python module"], "6": ["py", "data", "Python data"], "7": ["std", "cmdoption", "program option"]}, "objtypes": {"0": "py:class", "1": "py:method", "2": "py:attribute", "3": "py:exception", "4": "py:function", "5": "py:module", "6": "py:data", "7": "std:cmdoption"}, "terms": {"00am": 51, "0123456789abcdef0123456789abcdef": 33, "100": [14, 22, 25, 38], "1000": [50, 57], "10000": 54, "1001": 21, "1035": 25, "11000": 54, "120": 22, "1200": 25, "1234": [3, 21], "12345": 22, "127": [2, 29], "145224193": 34, "15000": 2, "1677": 34, "20000": [14, 38], "200m": [14, 38], "2017": 16, "2020": 29, "2021": 51, "2022": [51, 56], "2048m": [14, 38], "21t21": 29, "2200": 20, "2262": 34, "2546": [14, 38], "28000m": 20, "2auto": 25, "2fbl320": 41, "2gb": [14, 38], "3000": [2, 28, 31, 40], "30000": 22, "300mb": [14, 38], "3333": 2, "4815": [14, 38], "5000": 30, "500gb": 25, "500m": [14, 38], "512m": [14, 38], "5432": 36, "54321": 22, "5439": 14, "5672": 16, "6313": [14, 38], "6379": 29, "77777": 22, "8080": 22, "854775807": 34, "8580": 22, "86400": [18, 20], "95590a": 29, "999": 21, "AWS": [20, 29], "But": [14, 38], "EBS": 20, "EKS": 29, "For": [1, 2, 3, 6, 7, 9, 10, 11, 12, 14, 18, 20, 22, 24, 25, 29, 34, 38, 41, 50, 
51, 52, 55, 56, 57, 58], "IDs": 27, "Its": [14, 38], "K8s": 18, "KMS": [20, 25], "NFS": [14, 38], "Not": [2, 14, 38, 56], "ONE": 2, "Ops": 57, "PBS": 19, "SAS": 15, "Such": 59, "TLS": 31, "The": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 28, 29, 31, 33, 34, 35, 38, 39, 40, 41, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58], "Then": [16, 17, 18, 23, 29, 40], "There": [1, 5, 31, 33], "These": [1, 4, 6, 7, 8, 9, 11, 14, 20, 29, 34, 38, 50, 52, 55, 57, 58], "UDS": 2, "UIs": [14, 38], "USE": [14, 41], "Use": [2, 3, 9, 12, 13, 14, 20, 21, 29, 34, 38, 54, 56, 57, 58], "Used": [9, 13, 22, 50, 52, 57], "Useful": [2, 14, 22, 25, 38], "Uses": [4, 7, 57], "Using": [6, 7, 9, 11, 25, 52, 57], "Will": [18, 22, 25, 29], "With": 18, "__executor_name__": [6, 52], "__fieldvaluesentinel": 3, "__file__": [4, 39, 59], "__init__": [54, 58], "__input_name__": [6, 52], "__logger_name__": [6, 52], "__main__": 16, "__name__": [3, 16, 58], "__op_name__": 6, "__resource_name__": [6, 52], "__solid_name__": 52, "_add_on": [50, 57], "_asset_selection_data": [1, 7], "_clean": 4, "_config_map": 49, "_construct_job_def_from_yaml_fil": 54, "_context": [10, 34, 50, 57, 58], "_default_failure_messag": 31, "_default_failure_message_text_fn": 40, "_get_node_asset_kei": 22, "_graph_": 7, "_input_valu": [6, 11], "_job": 11, "_kei": 3, "_parent_pipeline_def": 52, "_partit": 49, "_partitioned_config": 49, "_s3_bucket": 58, "_s3_kei": 58, "_schedul": 56, "_subset_selection_data": [6, 11], "_yaml_directori": 54, "_yaml_file_for_job_nam": 54, "a_solid": 31, "abcdef": 14, "abid": 34, "abil": [9, 50, 57], "abl": [3, 6, 9, 11, 14, 16, 17, 18, 20, 38, 52], "abort": [14, 38], "about": [1, 2, 4, 7, 9, 10, 14, 22, 24, 28, 38, 40, 50, 57], "abov": [3, 14, 20, 27, 29, 38, 50, 57], "absolut": [14, 20, 38], "abstract": [9, 10, 14, 15, 22, 25, 48, 57, 58], "abstractset": [7, 8, 52], "acceler": 25, "accept": [3, 5, 9, 10, 14, 20, 34, 35, 38, 47, 50, 51, 55, 56, 57], "access": [5, 6, 9, 13, 14, 15, 20, 25, 27, 30, 33, 34, 38, 41, 42, 46, 54, 55, 57, 58], "access_key_kei": 20, "accord": [3, 5, 9, 14, 22, 25, 38], "accordingli": [14, 38], "account": [14, 15, 18, 20, 21, 22, 24, 25, 27, 29, 35, 38, 41, 45], "account_id": [22, 25], "account_nam": 15, "account_sid": 45, "accur": [14, 38], "accurateblockthreshold": [14, 38], "achiev": [14, 38], "ack": [14, 38], "acl": 20, "acquir": 20, "across": [4, 9, 14, 25, 38, 50, 51, 56, 57], "act": [14, 25, 38], "action": [14, 25, 38], "action_list_oper": 26, "action_on_failur": 14, "activ": [20, 25, 35, 41], "actual": [2, 3, 6, 10, 14, 38, 50, 56, 57], "acycl": [6, 7, 52], "adapt": 22, "add": [3, 6, 9, 10, 13, 14, 20, 22, 25, 28, 29, 30, 36, 41, 52, 57], "add_attach": 31, "add_daemon_heartbeat": 9, "add_input_metadata": 10, "add_metadata": [6, 57], "add_metadata_two_output": [6, 57], "add_mod": 52, "add_on": [6, 7, 50, 52, 57], "add_output_metadata": [6, 10, 57], "add_three_preset": 52, "add_to_environ": 14, "add_two": 57, "added": [1, 10, 14, 20, 25, 38, 51, 52, 57], "adder_1": 57, "adder_2": 57, "adder_resourc": 52, "addfil": [14, 38], "adding": [18, 29], "addit": [1, 5, 9, 13, 14, 16, 17, 18, 20, 22, 38, 46, 50, 53, 57], "addition": [50, 57], "additional_arg": 16, "address": [12, 14, 19, 22, 25, 38], "adl": [15, 20], "adls2": [15, 20], "adls2_client": 15, "adls2_file_cach": 15, "adls2_file_manag": 15, "adls2_file_system": 15, "adls2_path": 15, "adls2_pickle_asset_io_manag": 15, "adls2_pickle_io_manag": 15, "adls2_prefix": 15, "adls2_resourc": 15, 
"adls2filehandl": 15, "adls2resourc": 15, "admin": 25, "administr": [22, 27], "advanc": [14, 22, 38], "advantag": 51, "advertis": [14, 38], "advis": 16, "affect": [14, 38], "after": [1, 2, 14, 20, 22, 24, 25, 29, 38, 40, 41, 50, 51, 55, 56, 57], "after_cursor": 9, "after_timestamp": 9, "against": [2, 5, 6, 7, 9, 11, 14, 29, 41, 52, 57, 58], "agent": 21, "aggreg": [14, 38], "ahead": [14, 38], "aim": 15, "airbyte_host": 12, "airbyte_port": 12, "airbyte_resourc": 12, "airbyte_sync_op": 12, "airbyteoutput": 12, "airbyteresourc": 12, "airflow_example_dags_repo": 13, "airflow_execution_d": 13, "airflow_hom": 13, "airline_demo": [53, 59], "aka": 15, "album": 21, "alert": [33, 40, 56], "alert_failur": 6, "alert_start": 6, "alert_success": 6, "algorithm": [14, 38], "alia": [3, 6, 7, 9, 50, 52, 54, 57], "alias": [6, 7, 57], "align": [6, 9, 16, 17, 18, 52], "aliv": [9, 14, 38], "all": [1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 14, 15, 16, 17, 18, 20, 22, 24, 25, 26, 28, 29, 30, 34, 38, 41, 50, 51, 52, 54, 55, 56, 57, 58], "all_ev": 6, "all_node_ev": 6, "all_user_ev": 10, "alloc": [14, 38], "allow": [1, 2, 3, 6, 7, 9, 10, 11, 12, 14, 20, 22, 24, 27, 29, 31, 34, 38, 40, 46, 48, 50, 51, 52, 54, 55, 57], "allow_host_key_chang": 44, "aloha": 3, "along": [1, 14, 20, 22, 38, 39], "alreadi": [9, 14, 20, 28, 29, 38], "also": [1, 2, 6, 7, 9, 10, 11, 14, 15, 16, 17, 18, 19, 20, 22, 23, 25, 27, 29, 33, 38, 47, 48, 50, 53, 55, 56, 57, 58], "alter": 4, "altern": [3, 9, 14, 41, 52, 57, 59], "alwai": [22, 25, 34, 46, 51, 56, 58, 59], "amazon": [14, 20], "amazonaw": [14, 20, 29], "amazonec": 14, "amazons3": 20, "america": [41, 51, 56], "amount": [14, 22, 38], "amqp": 16, "an_existing_mlflow_run_id": 30, "an_op": 40, "analyt": 25, "ancestor": [1, 6, 7, 11, 52], "ani": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 20, 22, 24, 27, 28, 30, 31, 34, 35, 38, 39, 40, 46, 47, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59], "annot": [46, 58], "anonym": [50, 57], "anoth": [5, 6, 7, 13, 14, 28, 38, 52, 58], "ant": [14, 38], "apach": [14, 25, 38, 43], "api": [1, 4, 5, 6, 7, 9, 12, 14, 15, 21, 22, 24, 27, 28, 30, 31, 33, 34, 38, 40, 47, 48, 50, 51, 52, 53, 55, 57, 58], "api_kei": [21, 24], "api_secret": [14, 24], "api_stepconfig": 14, "apirefer": 14, "app": [14, 17, 18, 27, 38, 40], "app_id": [14, 38], "app_kei": 21, "appauthexampl": 14, "appear": [3, 6, 10, 14, 20, 27, 38, 57], "append": [20, 50, 57], "appli": [7, 9, 11, 13, 14, 18, 20, 22, 29, 38, 50, 51, 52, 53, 56, 57], "applic": [14, 18, 21, 22, 25, 27, 29, 38, 43], "application_argu": 43, "application_jar": 43, "apply_op": 52, "applylimitperuniquevalu": 9, "appropri": [3, 7, 17, 18, 29, 41, 42, 52, 57, 58], "arbitrari": [1, 3, 4, 5, 6, 7, 11, 28, 39, 50, 52, 55, 57, 58], "arbitrarili": 57, "arbyt": 12, "archiv": 25, "archiveuri": 25, "aren": [10, 40], "arg": [2, 5, 6, 11, 17, 22, 25, 28, 42, 47, 51, 57, 59], "argument": [1, 2, 3, 5, 6, 7, 8, 9, 11, 14, 16, 17, 18, 22, 25, 34, 38, 47, 50, 51, 52, 53, 54, 55, 56, 57, 58], "arn": 14, "around": [9, 14, 16, 17, 18, 38], "arrai": [3, 5, 14], "arrang": [6, 7, 52], "articl": 16, "artifact": [2, 9, 14, 22, 38, 50, 52, 57], "artifactid": [14, 38], "artifactori": [14, 38], "as_dagster_typ": [50, 57, 58], "asia": 25, "asid": [14, 38], "ask": [14, 38], "asktimeout": [14, 38], "assembl": 7, "assert": [1, 9, 14, 55, 58], "assert_failur": 58, "assert_success": 58, "asset": [6, 7, 9, 10, 14, 15, 25, 46, 52, 56, 58], "asset1": 1, "asset2": 1, "asset_group": [1, 14, 15, 25], "asset_info": 10, "asset_kei": [1, 4, 9, 10, 24, 50, 56, 57], 
"asset_key_prefix": [12, 22, 24, 46], "asset_lay": [6, 7, 11, 52], "asset_materi": [6, 10, 57], "asset_materialization_fn": 56, "asset_materialization_plan": 6, "asset_observ": 6, "asset_partit": [4, 9, 50, 57], "asset_partition_kei": 10, "asset_partition_key_rang": 10, "asset_partitions_def": [4, 50, 57], "asset_partitions_time_window": 10, "asset_select": [6, 9, 11, 52], "asset_sensor": 56, "asset_store_oper": 6, "assetgroup": [1, 14, 15, 25], "assetin": 1, "assetkei": [1, 9, 10, 12, 22, 24, 50, 52, 56, 57], "assetlay": [7, 52], "assetmateri": [6, 10, 22, 46, 50, 56, 57, 58], "assetobserv": [6, 10, 57], "assetoutputinfo": 10, "assetsdefinit": 1, "assetsdefint": 24, "assetsensordefinit": 56, "assign": [9, 10, 14, 15, 25], "assist": 21, "associ": [1, 2, 6, 8, 9, 10, 22, 25, 50, 56, 57], "assum": [1, 18, 29, 50, 51], "assumpt": 14, "async": [50, 57], "asynchron": [22, 52], "attach": [1, 4, 6, 10, 14, 15, 25, 28, 39, 49, 50, 52, 56, 57, 58], "attempt": [5, 6, 9, 14, 38, 44, 50, 57], "attempt_num": [50, 57], "attempt_numb": 9, "attit": 20, "attribut": [2, 6, 9, 25, 50, 55, 56], "audit": [14, 38], "auth": [25, 28, 29, 45], "auth_token": [22, 45], "authent": [14, 15, 38, 41], "author": [6, 7, 9, 13, 33, 50, 57], "auto": [3, 22, 24, 25], "autocommit": [14, 41], "autom": [27, 29], "automat": [2, 3, 5, 6, 10, 14, 20, 22, 24, 28, 35, 38, 47, 50, 55, 57], "autosc": 20, "autoscal": 20, "avail": [3, 5, 6, 8, 9, 10, 13, 14, 15, 16, 20, 22, 23, 25, 28, 29, 35, 38, 46, 47, 49, 52, 55, 56, 57, 58], "avoid": [9, 14, 38, 50, 57], "aws": [20, 29], "aws_access_key_id": [14, 30], "aws_account_id": [14, 29], "aws_region": 14, "aws_secret_access_kei": [14, 30], "axi": 51, "azur": 20, "azure_data_lake_storage_kei": 15, "azureblobcomputelogmanag": 15, "azuredatabrick": 20, "back": [9, 14, 15, 16, 18, 25, 29, 32, 36, 38, 46, 56], "backend": [14, 17, 18, 38], "backendconnectiontimeout": [14, 38], "backfil": [2, 51, 56], "background": 16, "backlog": [14, 38], "backoff": [50, 57], "backoff_delai": [50, 57], "backpressur": [14, 38], "backward": [14, 38], "bad": 3, "badg": 1, "balthazar": 16, "bar": [6, 9, 10, 14, 25, 38, 50, 54, 55, 57, 58], "bare": [3, 5], "base": [1, 5, 7, 9, 10, 11, 14, 15, 16, 17, 18, 19, 22, 25, 28, 31, 32, 34, 36, 38, 40, 46, 48, 50, 51, 56, 57, 58, 59], "base_dir": [9, 10], "basedir": 10, "baseexcept": 8, "baseoper": 13, "basi": [29, 52], "basic": [6, 7, 25, 52], "basicprofil": [14, 38], "bat": [6, 57], "batch": [14, 38], "batch_kwarg": 26, "baz": [6, 50, 57], "becaus": [9, 14, 15, 22, 24, 38], "becom": [3, 5, 34, 47, 55, 58], "been": [5, 6, 9, 10, 11, 12, 14, 38, 54, 57], "befor": [3, 9, 12, 14, 18, 20, 22, 24, 25, 29, 38, 39, 50, 51, 56, 57], "before_cursor": 9, "before_timestamp": 9, "begin": [6, 9, 14, 38, 57], "behalf": 25, "behavior": [6, 11, 13, 14, 20, 22, 38, 50, 52, 57], "behind": [14, 38, 56], "being": [3, 8, 9, 10, 14, 22, 25, 38, 47, 50, 55, 56, 57], "belong": [6, 7, 8, 14, 20, 25, 38, 52, 56], "below": [3, 14, 20, 25, 27, 29, 38, 41], "bertovi\u0107": 16, "bespok": 54, "best": 13, "beta": 25, "better": [13, 14, 38], "between": [1, 7, 9, 12, 14, 15, 22, 24, 25, 38, 39, 41, 42, 50, 51, 52, 55, 56, 57], "beyond": [14, 38, 50], "bigger": [14, 38], "bigquery_resourc": 25, "bigqueryerror": 25, "bigtabl": 25, "binari": [14, 25, 38], "binaryio": 9, "bind": [11, 14, 38, 41], "bindaddress": [14, 38], "bit": 41, "bitnami": 29, "blacklist": [14, 38], "blank": [12, 24], "blob": [6, 15, 20, 22], "block": [9, 14, 15, 16, 36, 38, 40, 57], "blockinterv": [14, 38], "blockmanag": [14, 38], 
"blockmanagerslavetimeoutm": [14, 38], "blocks_fn": 40, "blocksiz": [14, 38], "blog": 16, "blue": [3, 21], "bodi": [5, 6, 7, 10, 27, 28, 40, 47, 50, 55, 57], "bool": [1, 3, 5, 6, 7, 11, 12, 13, 14, 15, 18, 20, 22, 24, 25, 28, 29, 31, 34, 36, 38, 41, 44, 46, 48, 50, 51, 52, 54, 56, 57, 58], "boolean": [3, 34, 48, 50, 56], "boolean_column": 34, "boolmetadatavaluy": [50, 57], "boolsourc": 3, "boot": 25, "bootdisksizegb": 25, "bootdisktyp": 25, "bootstrap": [14, 29], "bot": 40, "both": [3, 6, 9, 13, 14, 20, 21, 22, 26, 29, 34, 38, 52, 56, 57], "boto": 14, "boto3": 14, "botocor": 14, "bound": [14, 34, 38, 51, 56], "boundari": [5, 6, 9, 11, 50, 52, 57], "bq_create_dataset": 25, "bq_delete_dataset": 25, "bq_op_for_queri": 25, "bq_solid_for_queri": 25, "breakpoint": 59, "brew": 29, "bridg": [14, 38], "broadcast": [14, 38], "broker": [17, 18], "broker_url": 16, "brought": 1, "browser": 21, "bucket": [3, 14, 20, 25, 34, 58], "bucket_prefix": 3, "buffer": [14, 38], "bufferediobas": 46, "buffers": [14, 38], "build": [1, 6, 7, 8, 10, 11, 12, 22, 24, 29, 41, 47, 50, 52, 55, 56, 57], "build_airbyte_asset": 12, "build_assets_job": [1, 24], "build_fivetran_asset": 24, "build_hook_context": 8, "build_init_logger_context": 47, "build_init_resource_context": 55, "build_input_context": 10, "build_job": 1, "build_op_context": [6, 14], "build_output_context": 10, "build_reconstructable_job": [6, 11, 52], "build_resourc": 55, "build_run_status_sensor_context": 56, "build_schedule_context": 56, "build_schedule_from_partitioned_job": [51, 56], "build_sensor_context": 56, "build_snowflake_io_manag": [41, 42], "build_solid_context": 57, "buildkit": 29, "buildup": [14, 38], "built": [3, 6, 9, 14, 22, 34, 38, 46, 52], "builtin": [3, 12, 22, 24, 59], "bulk": 25, "bundl": 14, "bus": [14, 38], "busi": 58, "bypass": [22, 29], "bypass_cach": 22, "bypassmergethreshold": [14, 38], "byte": [9, 14, 38, 50, 57], "cach": [14, 22, 38, 41, 54], "cache_column_metadata": 41, "cachedexecutoridletimeout": [14, 38], "cadenc": [51, 56], "calcul": [14, 22, 38, 50, 57], "calculate_byt": [50, 57], "call": [1, 2, 3, 5, 6, 9, 10, 11, 13, 14, 19, 20, 22, 26, 29, 35, 38, 41, 46, 47, 48, 52, 54, 57, 58], "call_user_provided_funct": 9, "callabl": [3, 8, 9, 13, 14, 22, 31, 34, 35, 40, 47, 50, 51, 54, 55, 56, 57, 58], "callback": 8, "caller": [14, 25, 38, 52], "callercontext": [14, 38], "can": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59], "can_subset": 1, "cancel": [9, 22], "cancel_and_wait": 14, "cancel_run": 22, "canned_acl": 20, "cannot": [5, 13, 14, 20, 25, 28, 34, 38, 46, 50, 56, 57], "capac": [14, 25, 38], "captur": [51, 59], "card": 31, "care": [7, 11], "case": [6, 7, 8, 9, 14, 16, 17, 18, 22, 23, 34, 38, 39, 47, 50, 52, 54, 55, 57, 58, 59], "catalog": [22, 50, 57], "catch": 5, "categor": 34, "categori": 34, "categorical_column": 34, "caus": [2, 8, 14, 25, 38], "caution": [14, 38], "celeri": 29, "celery_docker_executor": 17, "celery_docker_job_executor": 17, "celery_enabled_job": [16, 17, 18], "celery_executor": 16, "celery_k8s_job_executor": 18, "celeryk8srunlaunch": 18, "celeryq": [16, 17, 18], "central": [6, 8, 14, 38, 47, 51], "central1": 25, "cert": 14, "certain": [1, 6, 9, 11, 14, 34, 38, 39, 41, 50, 52, 57], "certif": [14, 25, 31], "chain": [14, 38], "chang": [2, 14, 16, 22, 24, 25, 29, 40, 41, 51], "channel": [2, 31, 40], "charact": [14, 20, 25, 38], "chat": 40, "chat_postmessag": 40, 
"check": [3, 5, 6, 9, 14, 22, 26, 34, 38, 46, 48, 50, 54, 55, 56, 57, 58], "check_cluster_everi": 14, "check_dagster_typ": 58, "check_nam": 21, "checker": 3, "checkerror": [34, 58], "checkpoint": [14, 38], "checkpointinterv": [14, 38], "checksum": [14, 38], "child": [1, 3, 5, 6, 7, 11, 50, 52, 57, 59], "children": [52, 57], "choic": 11, "chosen": 14, "chunk": [9, 14, 38], "circumst": [14, 38], "claim": 29, "class": [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 20, 22, 24, 25, 28, 29, 32, 34, 35, 36, 37, 38, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59], "class_nam": 9, "classestoregist": [14, 38], "classmethod": [3, 9], "classpath": [14, 25, 38], "clean": [14, 38, 55], "cleancheckpoint": [14, 38], "cleaner": [14, 38], "cleanup": [14, 29, 38, 55], "clear": [14, 38], "cli": [13, 29], "click": 40, "client": [2, 14, 15, 19, 22, 25, 30, 38, 40, 41], "client_prefetch_thread": 41, "client_session_keep_al": 41, "clone": [4, 14, 38], "cloneconf": [14, 38], "close": [9, 14, 38, 41], "closefileafterwrit": [14, 38], "cloud": [9, 25], "cloudwatch_logg": 14, "cls": [9, 58], "cluster": [9, 14, 18, 19, 20, 25, 38, 58], "cluster_config": 25, "cluster_id": 14, "cluster_log_conf": 20, "clusternam": 25, "coars": [14, 38], "code": [2, 5, 6, 8, 9, 14, 20, 22, 25, 27, 33, 38, 39, 46, 47, 48, 50, 52, 57, 58], "codec": [14, 38], "coerc": [50, 57], "cogroup": [14, 38], "col": [50, 57], "col_a": 50, "col_b": 50, "collect": [3, 5, 6, 7, 10, 14, 38, 46, 50, 52, 57], "collis": 25, "color": [3, 21], "colored_console_logg": 47, "column": [34, 41, 50, 57], "com": [14, 16, 17, 18, 20, 22, 24, 25, 27, 29, 38, 40, 44, 50, 57], "combin": [1, 7, 22, 50, 52, 57], "come": [14, 38, 56], "comma": [14, 38, 52], "command": [2, 14, 16, 17, 18, 22, 25, 29, 38, 39, 43, 53], "committ": [14, 38], "common": [9, 16, 17, 18, 25, 52, 58], "commun": [2, 9, 12, 14, 24, 25, 38, 50, 57, 58], "compani": [29, 40], "compar": 22, "compat": [14, 15, 25, 38], "compelt": 22, "compil": [13, 22], "compile_project": 22, "compile_sql": 22, "complet": [6, 7, 12, 14, 18, 20, 22, 24, 25, 29, 34, 38, 51], "completed": 24, "completekei": 59, "complex": [54, 57], "complex_repositori": 54, "complex_solid": 59, "complexrepositorydata": 54, "complic": 59, "compon": [3, 7, 9, 10, 12, 22, 24, 25], "compos": [1, 16, 50], "composit": [3, 4, 7, 52, 57], "composite_solid": [22, 39, 57], "compositesoliddefinit": [3, 57], "compositesolidexecutionresult": [52, 57], "comprehens": 24, "compress": [14, 38, 44], "comput": [1, 4, 5, 6, 7, 8, 10, 14, 15, 18, 20, 22, 24, 25, 28, 29, 35, 38, 47, 50, 52, 56, 57, 58], "compute_fn": [50, 57], "compute_input_event_dict": 57, "compute_kind": 1, "compute_log": [14, 15], "compute_log_manag": [9, 14, 15], "compute_logs_data": 9, "compute_output_events_dict": 57, "compute_step_ev": 57, "compute_step_failure_ev": 57, "computelogmanag": 9, "computemetadata": 25, "concept": [18, 29], "conceptu": 1, "concert": [5, 18], "concis": [14, 38], "concret": [7, 9], "concurr": [6, 9, 14, 29, 38], "condit": 9, "conf": [14, 20, 25, 38], "config": [1, 2, 5, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 46, 47, 50, 53, 54, 55, 56, 57, 58, 59], "config_field": [6, 52, 55], "config_fil": [53, 59], "config_fn": [3, 52, 57], "config_from_fil": 59, "config_from_pkg_resourc": 59, "config_from_yaml_str": 59, "config_map": [6, 11, 57], "config_or_config_fn": [9, 47, 50, 55, 57], "config_schema": [3, 4, 9, 10, 14, 22, 35, 46, 47, 50, 52, 54, 55, 57, 58], 
"config_sourc": [16, 17, 18], "config_typ": 9, "config_valu": [3, 5, 9, 58], "config_yaml": [9, 16], "configbucket": 25, "configmap": [3, 6, 7, 11, 18, 29, 49, 57], "configmapenvsourc": [18, 29], "configschema": [3, 9, 10, 14, 35, 47, 50, 55, 57, 58], "configtyp": 9, "configu": 3, "configur": [2, 3, 5, 7, 9, 10, 11, 12, 14, 15, 17, 18, 19, 20, 22, 23, 24, 25, 27, 28, 29, 31, 32, 35, 36, 38, 39, 40, 41, 42, 43, 46, 47, 50, 51, 53, 55, 56, 57, 58, 59], "configurableclass": 9, "configurableclassdata": [9, 14, 15], "configurabledefinit": [3, 9, 47, 50, 55, 57], "conflict": [14, 25, 28, 38], "conflictingexecutionparamserror": 28, "conform": [3, 7, 11, 13, 25, 28, 50, 57], "confus": [9, 20, 50, 57], "conjunct": [3, 56], "conn_str": 9, "connect": [2, 7, 9, 10, 12, 14, 15, 17, 19, 22, 23, 24, 25, 28, 29, 31, 38, 40, 41, 44, 50, 55, 57], "connect_timeout": 14, "connection_id": 12, "connectionerror": 28, "connectiontimeout": [14, 38], "connector": [12, 15, 24, 41], "connector_id": 24, "consecut": [14, 38], "consequ": [6, 11, 13, 52], "conserv": [6, 11, 52], "consid": [14, 20, 22, 38, 39, 56, 58], "consider": [14, 38], "consist": [10, 50, 51, 52], "consol": [14, 25, 38, 46], "consolid": 9, "consolidatedsqliteeventlogstorag": 9, "constant": 3, "constitu": [1, 6, 7, 13, 57], "constraint": [22, 28, 34, 50, 57], "construct": [1, 5, 6, 7, 9, 10, 11, 13, 16, 17, 18, 22, 34, 39, 41, 43, 46, 47, 51, 52, 53, 54, 55, 56, 57, 58, 59], "construct_spark_shell_command": 43, "constructor": [4, 5, 9, 13, 14, 16, 17, 18, 34, 38, 50, 52, 53, 54, 57, 59], "consult": 9, "consum": [1, 9, 14, 38, 50, 57], "consume_ev": [6, 10, 57], "consume_logged_metadata_entri": 10, "consumpt": [14, 38], "contact": 16, "contain": [2, 3, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 23, 24, 25, 29, 31, 34, 38, 40, 50, 51, 52, 54, 55, 56, 57, 58], "container": 13, "container_context": 2, "container_imag": 2, "container_kwarg": [17, 23], "container_nam": 14, "content": [1, 14, 20, 22, 38], "context": [1, 2, 3, 4, 5, 8, 9, 14, 15, 20, 21, 22, 25, 27, 28, 30, 31, 33, 34, 35, 38, 39, 40, 41, 46, 47, 48, 50, 51, 52, 54, 55, 56, 58, 59], "context_": [34, 58], "context_manager_resourc": [6, 8, 10, 55, 57], "contextlib": 55, "contextmanag": 55, "continu": 25, "continueonfailur": 25, "contrain": 3, "contrast": 18, "contribut": 9, "control": [9, 14, 20, 38, 50, 54, 56, 57], "conveni": [47, 59], "convent": 13, "convert": [3, 56, 57, 59], "cool": [14, 15, 25, 50], "coordin": [14, 38], "copi": [9, 14, 20, 22, 25, 38, 40], "copy_handle_to_local_temp": 9, "core": [4, 5, 7, 9, 11, 12, 14, 18, 24, 25, 26, 29, 34, 35, 38, 47, 50, 52, 55, 56, 57], "core_concept": 26, "correct": [1, 5, 50], "correctli": [14, 20, 29, 38], "correpond": 1, "correspond": [1, 2, 3, 4, 6, 8, 9, 10, 12, 13, 16, 17, 18, 20, 22, 24, 25, 50, 51, 54, 56, 57], "corrupt": [14, 38], "cost": [14, 38], "costli": 54, "could": [3, 14, 20, 22, 38], "count": [14, 21, 34], "cover": 10, "cowboytyp": 3, "cpu": [14, 38], "cpu_count": 6, "crash": [14, 38], "creat": [1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 18, 19, 20, 21, 22, 23, 24, 25, 27, 29, 31, 33, 38, 40, 41, 46, 50, 51, 52, 54, 55, 56, 57, 58, 59], "create_dagster_pandas_dataframe_typ": 34, "create_databricks_job_op": 20, "create_databricks_job_solid": 20, "create_dbt_rpc_run_sql_solid": 22, "create_issu": 27, "create_k8s_job_task": 18, "create_offset_partition_selector": 51, "create_registered_model": 30, "create_run": 9, "create_schedule_definit": 51, "create_shell_command_op": 39, "create_shell_command_solid": 39, 
"create_shell_script_op": 39, "create_shell_script_solid": 39, "create_spark_op": 43, "create_task": 16, "creation": [14, 17, 20, 23, 29, 38, 49, 54], "cred": 15, "credenti": [14, 15, 18, 20, 25, 29, 40], "criteria": [1, 6, 7, 11, 39, 50, 52, 57], "critic": [2, 47], "cron": [9, 51, 56], "cron_schedul": [51, 54, 56], "cross": [6, 9, 11, 20, 25, 52], "crossrealmtrustadminserv": 25, "crossrealmtrustkdc": 25, "crossrealmtrustrealm": 25, "crossrealmtrustsharedpassworduri": 25, "csv": [3, 58], "csv_loader": 10, "csv_loader_kei": 10, "curiou": 9, "curl": 25, "current": [2, 6, 9, 10, 12, 13, 14, 22, 24, 25, 38, 47, 50, 51, 52, 55, 56, 57, 59], "current_tim": 51, "current_valu": 5, "curri": 3, "cursor": [2, 9, 56], "custom": [3, 6, 9, 10, 13, 14, 18, 20, 22, 25, 28, 29, 34, 38, 41, 50, 52, 57, 58], "custom_dbt_cli_resourc": 22, "custom_dbt_rpc_resourc": 22, "custom_dbt_rpc_sync_resourc": 22, "custom_instance_class_data": 9, "custom_path_fs_io_manag": 10, "custom_service_account": 25, "custom_sync_dbt_rpc_resourc": 22, "custom_tag": 20, "custom_typ": 50, "cyclic": 55, "d9971c84d44d47f382a2928c8c161faa": 29, "daemon": [9, 25, 56], "daemon_heartbeat": 9, "dag": [6, 7, 13, 14, 29, 38, 52], "dag_bag": 13, "dag_descript": 13, "dag_id": 13, "dag_kwarg": 13, "dag_path": 13, "dagbag": 13, "daggraph": [14, 38], "dagit": [1, 7, 9, 11, 13, 28, 29, 31, 32, 36, 40, 50, 51, 52, 53, 56, 57, 59], "dagit_base_url": [31, 40], "dagit_port": 2, "dagredi": 29, "dagrun": 13, "dagster": [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 17, 18, 23, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59], "dagster_airbyt": 12, "dagster_airflow": 13, "dagster_attribut": 2, "dagster_aw": [9, 14, 46, 58], "dagster_azur": 15, "dagster_celeri": [16, 17, 18], "dagster_celery_broker_host": [16, 17, 18], "dagster_celery_dock": 17, "dagster_celery_k8": [16, 18], "dagster_container_context": 2, "dagster_container_imag": 2, "dagster_dask": 19, "dagster_databrick": 20, "dagster_datadog": 21, "dagster_dbt": 22, "dagster_dock": 23, "dagster_docker_imag": 29, "dagster_docker_image_tag": 29, "dagster_docker_repositori": 29, "dagster_empty_working_directori": 2, "dagster_ev": [9, 46, 56], "dagster_exampl": [53, 59], "dagster_fivetran": 24, "dagster_g": 26, "dagster_gcp": 25, "dagster_github": 27, "dagster_graphql": 28, "dagster_grpc_host": 2, "dagster_grpc_port": 2, "dagster_grpc_socket": 2, "dagster_handl": 47, "dagster_hom": [2, 9, 17, 18, 29, 32, 36], "dagster_imag": 29, "dagster_inst": 56, "dagster_k8": [18, 29], "dagster_lazy_load_user_cod": 2, "dagster_mlflow": 30, "dagster_module_nam": 2, "dagster_msteam": 31, "dagster_mysql": [9, 32], "dagster_package_nam": 2, "dagster_pagerduti": 33, "dagster_panda": [26, 34], "dagster_papertrail": 35, "dagster_pg_password": [18, 29], "dagster_pipeline_factori": 13, "dagster_postgr": [9, 36], "dagster_prometheu": 37, "dagster_pyspark": 38, "dagster_python_fil": 2, "dagster_run": [6, 50, 55, 56, 57], "dagster_shel": 39, "dagster_slack": 40, "dagster_snowflak": [41, 42], "dagster_snowflake_panda": [41, 42], "dagster_spark": 43, "dagster_ssh": 44, "dagster_stag": 20, "dagster_test": 29, "dagster_twilio": 45, "dagster_typ": [1, 3, 4, 5, 6, 10, 26, 34, 50, 57, 58], "dagster_type_load": [6, 34, 52, 58], "dagster_type_m": 58, "dagster_type_materi": [34, 58], "dagster_use_python_environment_entry_point": 2, "dagster_working_directori": 2, "dagsterassetmetadatavalu": [50, 57], "dagsterconfigmappingfunctionerror": 5, "dagsterdaemonschedul": 56, "dagsterdbtclifatalruntimeerror": 22, "dagsterdbtclihandledruntimeerror": 22, 
"dagsterdbtclioutputsnotfounderror": 22, "dagsterdbtcliruntimeerror": 22, "dagsterdbtcliunexpectedoutputerror": 22, "dagsterdbterror": 22, "dagsterdbtrpcunexpectedpolloutputerror": 22, "dagstererror": 5, "dagsterev": [6, 8, 9, 52, 56, 57], "dagstereventloginvalidforrun": 5, "dagstereventtyp": [6, 9], "dagsterexecutionstepexecutionerror": [5, 9], "dagsterexecutionstepnotfounderror": 5, "dagstergraphqlcli": 28, "dagstergraphqlclienterror": 28, "dagsterinst": [2, 5, 6, 7, 9, 11, 13, 52, 55, 56, 57], "dagsterinvalidconfigdefinitionerror": 5, "dagsterinvalidconfigerror": [3, 5], "dagsterinvaliddefinitionerror": [5, 22], "dagsterinvariantviolationerror": [5, 6, 53, 55, 59], "dagsterlogmanag": [6, 8, 9, 10, 46, 47, 55, 57], "dagstermil": [6, 11, 52], "dagstermillerror": 46, "dagstermillexecutioncontext": 46, "dagsterpipelinerunmetadatavalu": [50, 57], "dagsterresourcefunctionerror": 5, "dagsterrun": [6, 56, 57], "dagsterrunconflict": 28, "dagsterrunnotfounderror": 5, "dagsterrunstatu": 9, "dagsterstepoutputnotfounderror": 5, "dagstersubprocesserror": 5, "dagstertyp": [1, 5, 6, 10, 14, 26, 34, 50, 57, 58], "dagstertypecheckdidnotpass": 5, "dagstertypecheckerror": 5, "dagstertypekind": [34, 58], "dagstertypeload": [34, 58], "dagstertypemateri": [34, 58], "dagsterunknownresourceerror": 5, "dagsterunmetexecutorrequirementserror": 5, "dagsterusercodeexecutionerror": [5, 9], "dai": [51, 56], "daili": [14, 38, 51, 56], "daily_10am_schedul": 51, "daily_partitioned_config": [51, 56], "daily_schedul": 56, "dailypartitionsdefinit": 51, "dashboard": [14, 38, 50, 57], "dashboard_url": [50, 57], "dask_enabled_job": 19, "dask_executor": 19, "data": [3, 4, 6, 7, 9, 10, 12, 14, 15, 18, 20, 22, 24, 25, 26, 29, 34, 35, 38, 41, 42, 47, 48, 50, 52, 55, 57, 58], "databas": [2, 9, 14, 22, 25, 29, 32, 36, 41, 50, 51, 55, 56, 57], "databricks_cli": 20, "databricks_host": 20, "databricks_job": 20, "databricks_pyspark_step_launch": 20, "databricks_token": 20, "databrickserror": 20, "datadog_op": 21, "datadog_resourc": 21, "datadogpi": 21, "datafram": [22, 26, 34, 38, 41, 42], "dataframe_constraint": 34, "dataframe_load": 34, "dataframe_materi": 34, "dataframeconstraint": 34, "datalakeservicecli": 15, "dataproc_op": 25, "dataproc_resourc": 25, "dataproc_solid": 25, "dataset": [25, 26, 50, 57], "datasourc": 26, "datasource_nam": 26, "date": [2, 14, 20, 22, 48, 51, 56], "date_partition_rang": 51, "datetim": [24, 34, 51, 56], "datetime64": 34, "datetime_column": 34, "day_of_month": [51, 56], "day_of_week": [51, 56], "day_offset": [51, 56], "db_name": [9, 32, 36], "db_statement_timeout": 2, "dbf": 20, "dbt": 1, "dbt_cli_compil": 22, "dbt_cli_pipelin": 22, "dbt_cli_resourc": 22, "dbt_cli_run": 22, "dbt_cli_run_oper": 22, "dbt_cli_snapshot": 22, "dbt_cli_snapshot_fresh": 22, "dbt_cli_test": 22, "dbt_cloud": 22, "dbt_cloud_auth_token": 22, "dbt_cloud_host": 22, "dbt_cloud_resourc": 22, "dbt_cloud_run_op": 22, "dbt_compile_op": 22, "dbt_docs_generate_op": 22, "dbt_execut": 22, "dbt_ls_op": 22, "dbt_output": 22, "dbt_profiles_dir": 22, "dbt_project": 22, "dbt_rpc": 22, "dbt_rpc_compile_sql": 22, "dbt_rpc_job": 22, "dbt_rpc_resourc": 22, "dbt_rpc_run": 22, "dbt_rpc_run_and_wait": 22, "dbt_rpc_run_oper": 22, "dbt_rpc_run_operation_and_wait": 22, "dbt_rpc_snapshot": 22, "dbt_rpc_snapshot_and_wait": 22, "dbt_rpc_snapshot_fresh": 22, "dbt_rpc_snapshot_freshness_and_wait": 22, "dbt_rpc_sync_job": 22, "dbt_rpc_sync_resourc": 22, "dbt_rpc_test": 22, "dbt_rpc_test_and_wait": 22, "dbt_run_op": 22, "dbt_seed_op": 22, "dbt_snapshot_op": 22, 
"dbt_test_op": 22, "dbtclioutput": 22, "dbtcliresourc": 22, "dbtcloudoutput": 22, "dbtcloudresourcev2": 22, "dbtoutput": 22, "dbtresourc": 22, "dbtrpcoutput": 22, "dbtrpcresourc": 22, "dbtrpcsyncresourc": 22, "dbtypehandl": 41, "dd_job": 21, "dead": [14, 38], "debian": 25, "debug": [6, 12, 22, 24, 25, 29, 47, 57, 59], "debug_log": 22, "debugg": 59, "decid": [14, 38], "declar": [6, 7, 14, 15, 50, 52, 57], "decor": [1, 3, 6, 7, 8, 9, 10, 11, 21, 22, 34, 39, 47, 50, 51, 52, 54, 55, 56, 57, 58], "decorated_fn": [8, 51, 56], "decreas": [14, 38], "decrement": 21, "dedupl": 57, "deeplink": [31, 40], "def": [1, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 29, 30, 31, 33, 38, 39, 40, 41, 42, 50, 52, 54, 55, 57, 58, 59], "default": [1, 2, 3, 4, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 24, 25, 26, 27, 28, 29, 30, 31, 34, 36, 38, 39, 40, 41, 44, 46, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59], "default_arg": 13, "default_executor": [49, 52], "default_flag": 22, "default_output": [52, 57], "default_statu": [31, 40, 51, 56], "default_tag": 20, "default_valu": [3, 4, 50, 57], "defaultcor": [14, 38], "defaultruncoordin": 9, "defaultrunlaunch": 9, "defaultschedulestatu": [51, 56], "defaultsensorstatu": [31, 40, 56], "defin": [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18, 19, 20, 22, 23, 25, 28, 29, 34, 35, 41, 42, 46, 48, 49, 51, 52, 53, 54, 55, 56, 58], "define_dagstermill_op": 46, "define_dagstermill_solid": 46, "define_my_job": [6, 11, 52], "define_pipelin": 2, "define_repo": 2, "define_spark_config": 43, "definit": [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 22, 24, 25, 38, 39, 41, 46, 47, 50, 54, 55, 56, 57], "delai": [14, 38, 50, 57], "deleg": [9, 16, 18, 47], "delet": [2, 9, 14, 25, 29, 38], "delete_local_temp": 9, "delin": [51, 56], "deliv": 20, "delta_rang": 51, "deni": 16, "denibertov": 16, "depend": [1, 4, 6, 11, 13, 14, 15, 22, 25, 38, 40, 46, 52, 55, 57, 58], "depende": 7, "dependency_asset_kei": 1, "dependency_structur": 7, "dependencydefinit": [6, 7, 52, 57], "deploi": [14, 25, 29, 38], "deploy": [14, 17, 18, 19, 28, 29, 38, 56], "deploy_local_job_packag": 14, "deploy_local_pipeline_packag": 14, "deploy_mod": 43, "deploymod": [14, 38], "deprec": [6, 14, 22, 38, 49, 56], "deqeueu": 9, "dequeue_interval_second": 9, "deriv": 9, "descend": [1, 6, 7, 11, 52], "describ": [3, 7, 9, 10, 11, 20, 22, 28, 29, 48, 50, 57], "descript": [1, 3, 4, 5, 6, 7, 9, 10, 11, 13, 14, 20, 22, 34, 35, 38, 39, 43, 46, 47, 49, 50, 51, 52, 54, 55, 56, 57, 58], "descriptor": 50, "deseri": [10, 56], "design": [1, 6, 10, 22, 57], "desir": [12, 15, 22, 24, 29, 57], "destin": 20, "destination_t": [12, 24], "destruct": [14, 38], "detail": [12, 14, 16, 17, 18, 20, 22, 24, 38, 40, 41, 50, 57], "detect": [14, 38], "determin": [1, 2, 4, 6, 7, 11, 20, 22, 25, 34, 50, 51, 52, 56, 57, 58], "determinist": [50, 57, 58], "dev": [3, 14, 16, 17, 18, 20, 29], "dev_s3": 3, "develop": [25, 27, 46, 58], "devstorag": 25, "dfoo": 25, "dict": [1, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 28, 29, 30, 34, 36, 38, 39, 40, 41, 46, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59], "dictionari": [1, 3, 5, 6, 7, 8, 10, 11, 22, 24, 49, 50, 51, 54, 55, 56, 57, 59], "dictread": 58, "dictwrit": 58, "did": [57, 59], "died": 9, "differ": [3, 6, 10, 11, 14, 16, 17, 18, 22, 38, 39, 50, 51, 52, 56, 57], "dir": [9, 14, 22, 25, 38], "direct": [1, 6, 7, 11, 14, 38, 52], "directli": [2, 3, 6, 7, 9, 11, 13, 14, 15, 21, 22, 32, 36, 38, 39, 46, 47, 50, 52, 54, 56, 57], 
"directori": [2, 9, 10, 13, 14, 15, 16, 20, 22, 25, 38, 41, 54, 59], "dirnam": 4, "disabl": [9, 14, 16, 17, 18, 22, 23, 29, 38, 39, 41], "disable_schedule_on_trigg": [22, 24], "disallow": 9, "discret": [7, 11, 51], "disk": [9, 14, 20, 25, 38, 50, 57], "diskconfig": 25, "dispatch": [6, 8, 28, 47], "displai": [1, 2, 4, 14, 22, 34, 38, 40, 50, 56, 57, 58], "distcp": 25, "distinguish": [4, 9, 34, 58], "distribut": [14, 15, 18, 19, 21, 25, 29], "divid": [14, 38], "dkr": 29, "do_someth": [8, 10], "do_something_on_failur": 8, "do_something_on_success": 8, "do_stuff": 10, "doc": [9, 12, 14, 16, 17, 18, 20, 22, 24, 25, 26, 29, 40, 43], "docker": [13, 16, 18, 29], "docker_executor": 23, "docker_image_tag": 30, "docker_job": 23, "docker_password": 17, "dockeroper": 13, "dockerrunlaunch": 23, "docs_url": 22, "docstr": [50, 57], "document": [2, 3, 9, 12, 14, 20, 21, 22, 24, 33, 38, 40, 43, 53, 59], "doe": [5, 6, 7, 14, 22, 23, 28, 34, 38, 50, 51, 52, 55, 56, 57, 58], "doesn": 41, "dog": [6, 57], "dogstatsd": 21, "don": 56, "done": [9, 32, 36, 58], "doubl": [14, 38], "down": [1, 2, 6, 7, 11, 14, 16, 20, 21, 28, 38, 52, 55], "download": [9, 15, 41], "downstream": [1, 4, 6, 7, 10, 11, 22, 46, 50, 52, 57], "downtim": 56, "draw": [18, 29], "drive": 25, "driver": [14, 20, 25, 38], "driver_node_type_id": 20, "driverloglevel": 25, "drop": [14, 22, 38], "dry": 29, "dspark": 20, "dtype": 34, "dublin": 34, "due": [14, 15, 38], "dump": [1, 6, 7, 11, 14, 38, 39, 50, 52, 57], "dump_profil": [14, 38], "dunderfil": 59, "durat": [14, 19, 38, 51], "dure": [5, 6, 8, 9, 14, 28, 38, 39, 41, 47, 50, 55, 57], "dynam": [6, 11, 14, 20, 38, 39, 41, 46, 51, 52, 54], "dynamic_partitioned_config": 51, "dynamicalloc": [14, 38], "dynamicout": 4, "dynamicoutput": [4, 6, 57], "each": [1, 3, 4, 6, 7, 9, 10, 11, 12, 14, 15, 16, 18, 19, 20, 22, 23, 24, 25, 29, 38, 41, 46, 47, 48, 50, 51, 52, 55, 56, 57], "eager": 9, "earlier": 56, "eas": 50, "easi": [3, 6, 9, 11, 52], "easier": 14, "easiest": [9, 10, 47, 50, 55, 57], "easili": 35, "east": 14, "east1": 25, "eastern": 34, "echo": [39, 50], "echo_2": 50, "ecr": 29, "ecs": 14, "ecsrunlaunch": 14, "edg": 7, "edit": [7, 11], "effect": [14, 29, 38, 50, 57], "effici": [14, 38], "egg": [14, 25, 38], "either": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13, 14, 15, 20, 22, 24, 28, 29, 34, 38, 40, 41, 47, 50, 51, 52, 55, 56, 57, 58], "elaps": 56, "element": [3, 5, 14, 29, 38], "elimin": 2, "els": 25, "email": 29, "embed": 59, "emit": [1, 6, 9], "emit_metadata": [50, 57], "emit_two_four": 52, "empti": [2, 14, 18, 22, 25, 29, 52], "emr_pyspark_step_launch": 14, "emr_stag": 14, "emrclusterst": 14, "emrerror": 14, "emrjobrunn": 14, "emrstepst": 14, "enabl": [1, 7, 9, 11, 14, 16, 17, 18, 20, 23, 25, 38, 41, 48, 50, 52, 57], "enable_elastic_disk": 20, "enable_encrypt": 20, "enablecompress": [14, 38], "enablekerbero": 25, "encapsul": [13, 25, 29, 50, 57], "encod": [1, 2, 6, 7, 11, 20, 22, 39, 50, 52, 57, 59], "encrypt": [20, 25], "encryption_typ": 20, "encryptionconfig": 25, "end": [1, 6, 7, 16, 27, 30, 50, 51, 52, 56, 57], "end_mlflow_on_run_finish": 30, "end_mlflow_run_on_pipeline_finish": 30, "end_offset": [51, 56], "endpoint": [12, 14, 20, 22, 24, 38], "endpoint_url": 14, "enforc": [5, 13, 14, 34, 38], "enforce_ord": 34, "engin": [6, 9, 25, 41], "engine_ev": 6, "engine_event_data": 9, "engineev": 9, "enough": [2, 14, 38], "enqueu": 9, "ensur": [9, 13, 20, 21, 29, 39, 41, 42, 56], "entail": 16, "enterpris": 27, "entir": [14, 22, 38, 51, 52], "entireti": [14, 38], "entiti": [14, 38], "entri": [1, 2, 3, 4, 9, 10, 
14, 22, 25, 38, 50, 57], "entry_data": [50, 57], "enum": [3, 5, 14, 28, 57], "enum_valu": 3, "enumer": 14, "enummeta": 3, "enumvalu": 3, "env": [3, 12, 15, 17, 22, 24, 29, 30, 41], "env_config_map": [18, 29], "env_secret": [18, 29], "env_to_tag": 30, "env_var": [17, 18, 23, 29], "envfrom": [18, 29], "environ": [1, 2, 3, 6, 11, 13, 14, 15, 17, 18, 20, 22, 23, 25, 27, 28, 29, 30, 32, 36, 38, 51, 52, 53, 56, 59], "environment": 20, "environment_var": [51, 56], "ephemer": [2, 6, 7, 9, 11, 16, 25, 26, 46, 52, 55, 56, 57], "equal": [14, 38, 51, 56], "equival": [3, 25, 53], "error": [1, 2, 3, 6, 9, 10, 12, 14, 24, 28, 31, 33, 38, 40, 46, 47, 50, 53, 56, 57, 59], "error_cl": 9, "error_info": 9, "error_object": 28, "error_toler": 34, "especi": [14, 16, 17, 18, 38], "essenti": [14, 38], "establish": 58, "estim": [14, 38], "etc": [6, 14, 18, 22, 25, 29, 38, 51, 57, 59], "europ": 34, "evalu": [34, 51, 54, 56], "evaluate_tick": 56, "evaluatevalueresult": 9, "evaluation_fn": 56, "even": [14, 38, 41, 56], "event": [2, 4, 5, 6, 8, 10, 14, 18, 21, 22, 28, 29, 31, 32, 33, 36, 38, 40, 46, 52, 56, 59], "event_act": 33, "event_list": [52, 57], "event_log": [9, 32, 36], "event_log_entri": 9, "event_log_storag": [9, 32, 36], "event_metadata_fn": 34, "event_specific_data": 6, "event_storag": 9, "event_storage_data": 9, "event_typ": [6, 9], "event_type_valu": 6, "eventlog": [14, 38], "eventlogentri": [9, 56], "eventlogrecord": 9, "eventlogstorag": 9, "eventqueu": [14, 38], "eventrecordsfilt": 9, "events_for_nod": 6, "eventu": [9, 50, 57], "eventv2_cr": 33, "everi": [14, 16, 17, 18, 20, 22, 38, 44, 56], "evict": [14, 38], "exact": [14, 34, 38], "exactli": [3, 20], "examin": [50, 57, 59], "exampl": [1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 24, 25, 27, 28, 29, 30, 31, 33, 38, 39, 40, 41, 42, 47, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59], "example_adls2_op": 15, "example_external_task_marker_child": 13, "example_job": 14, "example_mapping_kei": 4, "example_pig_oper": 13, "example_pipelin": 53, "example_preset": 53, "example_redshift_op": 14, "example_s3_op": 14, "example_secretsmanager_op": 14, "example_secretsmanager_secrets_op": 14, "example_secretsmanager_secrets_op_2": 14, "example_skip_dag": 13, "example_trigger_target_dag": 13, "example_xcom": 13, "exampleenum": 3, "exc_info": 5, "exceed": [14, 22, 38], "except": [3, 5, 6, 7, 8, 11, 14, 22, 25, 28, 38, 41, 47, 50, 52, 57], "excess": [14, 38], "excit": 9, "exclud": [14, 22, 38, 52, 57], "execut": [1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 23, 24, 25, 28, 29, 38, 39, 41, 44, 46, 47, 48, 49, 51, 53, 55, 56, 58, 59], "execute_in_process": [3, 5, 6, 7, 11, 13, 14, 21, 27, 30, 33, 40, 41, 56], "execute_pipelin": [13, 31, 52, 53], "execute_pipeline_iter": 52, "execute_plan": 16, "execute_queri": [14, 41], "execute_solid": 57, "execute_solid_within_pipelin": 57, "execute_solids_within_pipelin": 57, "executeinprocessresult": [1, 6, 7, 11], "execution_d": 13, "execution_fn": 56, "execution_plan": 9, "execution_plan_snapshot_id": 9, "execution_time_to_partition_fn": 51, "execution_timezon": [51, 56], "executionplan": 9, "executor": [1, 3, 5, 11, 14, 15, 16, 17, 18, 19, 20, 23, 25, 29, 38, 49, 50, 52, 55, 57], "executor_config": 9, "executor_creation_fn": 9, "executor_def": [1, 6, 7, 9, 11, 16, 17, 18, 19, 23, 29, 49], "executor_id": [14, 38], "executorallocationratio": [14, 38], "executordefinit": [1, 3, 6, 7, 9, 11, 16, 17, 18, 19, 23, 29, 49, 52], "executoridletimeout": [14, 38], "executorrequir": 9, "exist": [2, 3, 5, 
6, 7, 14, 18, 19, 20, 25, 28, 34, 38, 41, 48, 51, 52, 53, 55, 56, 57, 58], "exit": [2, 14, 22, 25, 38, 39, 55], "expect": [1, 9, 12, 14, 16, 17, 18, 22, 24, 25, 28, 34, 38, 39, 50, 57, 58], "expectation_events_during_comput": 57, "expectation_results_during_comput": 57, "expectationresult": [6, 46, 50, 57], "expens": [9, 14, 38], "expensive_job": 54, "expensive_schedul": 54, "experi": [29, 30], "experienc": 56, "experiment": [4, 9, 14, 38, 50, 55, 56, 57, 58], "experiment_nam": 30, "experimentalwarn": 59, "expir": 13, "explicit": [57, 58], "explicitli": [6, 7, 9, 10, 50, 57, 58, 59], "explod": 57, "explor": 46, "exponenti": [50, 57], "export": [2, 9, 20, 29], "expos": [3, 12, 16, 17, 18, 22, 24, 29, 47, 57], "express": [14, 34, 38, 50, 51], "ext": 9, "extend": [7, 10, 14, 38, 51, 56], "extens": 9, "extern": [9, 14, 20, 25, 28, 29, 38, 55, 56, 58], "external_pipeline_origin": 9, "external_version_fn": 58, "externalpipelineorigin": 9, "extra": [14, 20, 22, 30, 38], "extra_source_asset": 1, "extra_tag": 30, "extraclasspath": [14, 38], "extract": 25, "extrajavaopt": [14, 20, 38], "extralibrarypath": [14, 38], "extralisten": [14, 38], "extras_requir": 9, "face": [9, 14, 38], "facil": 58, "factori": [11, 22, 39, 41], "fail": [2, 5, 8, 12, 14, 18, 20, 22, 24, 25, 28, 29, 31, 34, 38, 40, 50, 55, 56, 57, 58], "fail_fast": 22, "fail_pod_on_run_failur": [18, 29], "failur": [2, 4, 8, 9, 14, 20, 22, 28, 31, 38, 39, 40, 46, 50, 56, 57], "failure_data": 57, "failure_ev": [31, 40, 56], "failure_hook": 8, "failure_typ": 28, "fair": [14, 38], "fake": 15, "fake_redshift_resourc": 14, "fakeadls2resourc": 15, "fall": [18, 25, 29, 56], "fallback": 40, "fals": [1, 3, 5, 9, 10, 12, 13, 14, 15, 20, 22, 25, 28, 29, 34, 38, 41, 44, 48, 50, 51, 57, 58, 59], "fan": 7, "fast": [14, 16, 17, 18, 22, 38], "faster": [14, 38], "fatal": [22, 25, 47], "favor": 49, "featur": [9, 14, 16, 20, 25, 38], "feedback": 29, "femal": 21, "fetch": [9, 14, 22, 38, 41], "fetch_result": [14, 41], "fetchfailur": [14, 38], "fetchtimeout": [14, 38], "few": [14, 38], "fewer": [14, 38], "fh_1": 9, "fh_2": 9, "field": [3, 4, 5, 9, 13, 14, 18, 20, 22, 29, 32, 36, 38, 39, 50, 54, 57], "field_alias": 3, "field_util": 3, "fieldnam": 58, "file": [2, 3, 4, 10, 13, 14, 15, 16, 18, 22, 25, 29, 32, 36, 38, 39, 41, 44, 46, 50, 53, 54, 57, 58, 59], "file_handl": 9, "file_manag": [9, 46, 58], "file_nam": 54, "file_obj": 9, "file_relative_path": [4, 39, 59], "file_result": 4, "file_system": 15, "filehandl": [9, 46, 58], "filemanag": [9, 14, 15, 25, 46, 58], "filenam": [4, 14], "filenotfounderror": 59, "fileoutputcommitt": [14, 38], "filepath": [10, 50, 57], "files_in_directori": 4, "files_pipelin": 9, "filesystem": [6, 7, 9, 10, 11, 14, 15, 25, 38, 50, 57, 58], "fileuri": 25, "fileystem": [14, 20], "fill": 51, "filter": [2, 9, 14, 22, 34, 38, 58], "filter1": [14, 38], "final": [10, 14, 22, 25, 38, 40], "final_foobar_st": [12, 24], "find": [9, 13, 14, 16, 22, 24, 27, 29, 33, 38], "fine": [27, 54], "finish": [14, 30, 38, 40], "fire": 56, "firewal": [14, 38], "first": [2, 5, 6, 14, 21, 22, 27, 29, 33, 34, 38, 40, 41, 42, 47, 50, 51, 54, 56, 57, 58], "first_compon": [50, 57], "fit": [14, 38], "fivetran_api_kei": 24, "fivetran_api_secret": 24, "fivetran_asset": 24, "fivetran_resourc": 24, "fivetran_sync_op": 24, "fivetranoutput": 24, "fivetranresourc": 24, "fix": [2, 14, 38, 51], "fixed_server_id": 2, "fixtur": 29, "flag": [2, 9, 16, 22, 25, 29, 34, 41], "flake": [50, 57], "flakey_oper": [50, 57], "flat_asset_kei": [50, 57], "flavor": 25, "flexibl": [50, 57], 
"float": [3, 4, 5, 9, 12, 14, 18, 20, 22, 24, 31, 34, 38, 50, 56, 57, 58], "float_column": 34, "floatmetadatavalu": [50, 57], "flow": [6, 7, 14, 34, 50, 52, 57, 58], "flower": [16, 29], "flush": [14, 38], "flux": 9, "fmt": [51, 56], "follow": [3, 4, 6, 7, 9, 10, 13, 14, 15, 16, 17, 18, 19, 22, 23, 25, 27, 29, 36, 38, 50, 51, 52, 53, 56, 57], "foo": [1, 6, 8, 9, 10, 11, 14, 25, 31, 38, 40, 50, 52, 54, 55, 57, 58], "foo_job": [6, 11, 52], "foo_job_arg": 11, "foo_job_kwarg": 11, "foo_resourc": 1, "foobar": [3, 12, 24], "footprint": [14, 38], "for_run_failur": 56, "forc": [14, 38], "forefront": 1, "fork": [14, 38, 59], "forked_pdb": [6, 57, 59], "forkedpdb": [6, 57, 59], "form": [1, 7, 14, 20, 23, 38, 54], "format": [3, 7, 8, 11, 12, 14, 22, 24, 25, 28, 31, 34, 38, 40, 41, 51, 54, 56, 58], "forver": 22, "forward": [5, 14, 17, 29, 38], "found": [1, 5, 13, 14, 16, 20, 22, 24, 27, 28, 29, 38, 48, 50, 52, 57], "foundat": 50, "four": [14, 16, 38, 52], "fraction": [14, 38], "fragment": [6, 9], "framework": [1, 5, 6, 7, 9, 13, 25, 28, 39, 50, 57], "free": [14, 38], "freeli": 5, "frequenc": [14, 38], "frequent": [14, 16, 17, 18, 20, 38], "fresh": [2, 22], "friend": 9, "from": [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 42, 46, 48, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59], "from_config_valu": 9, "from_current_modul": 1, "from_def": 55, "from_fil": 53, "from_modul": 1, "from_package_modul": 1, "from_package_nam": 1, "from_pkg_resourc": 53, "from_python_enum": 3, "from_val": 55, "from_yaml_str": 53, "front": [14, 38], "frozenset": [20, 34, 43, 52, 57], "fs_io_manag": 10, "fspath": [50, 57], "full": [14, 16, 20, 22, 23, 25, 38, 50, 52, 57], "full_control": 25, "full_refresh": 22, "fulli": [9, 14, 22, 38, 47, 50, 55, 57], "fulltrac": 29, "function": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 22, 24, 31, 34, 35, 38, 39, 40, 41, 47, 50, 52, 54, 55, 56, 57, 58, 59], "further": [14, 20, 38], "futur": 54, "gain": 9, "garbag": [14, 38], "gate": 9, "gatewai": 37, "gather": [4, 6, 11], "gaug": 21, "gceclusterconfig": 25, "gcepdkmskeynam": 25, "gcloud": 29, "gcp": 29, "gcs": 25, "gcs_bucket": 25, "gcs_file_manag": 25, "gcs_kei": 25, "gcs_path": 25, "gcs_pickle_asset_io_manag": 25, "gcs_pickle_io_manag": 25, "gcs_prefix": 25, "gcs_resourc": 25, "gcsfilehandl": 25, "ge_data_context": 26, "ge_validation_op_factori": 26, "ge_validation_solid_factori": 26, "gen": 15, "gen2": 15, "gender": 21, "gener": [2, 5, 6, 7, 9, 10, 12, 13, 14, 15, 18, 20, 22, 24, 25, 26, 29, 34, 38, 39, 41, 50, 51, 52, 56, 57, 58], "generate_doc": 22, "generate_materi": 22, "get": [2, 6, 9, 10, 13, 14, 16, 18, 20, 21, 22, 24, 25, 28, 29, 38, 46, 52, 53, 54, 57, 59], "get_all_job": 54, "get_all_pipelin": 54, "get_base_job": 1, "get_batch": 26, "get_connect": 41, "get_connector_detail": 24, "get_connector_sync_statu": 24, "get_context": 46, "get_daemon_heartbeat": 9, "get_dag": 13, "get_dagster_logg": 59, "get_environment_yaml": 53, "get_identifi": 10, "get_input_asset_kei": 10, "get_input_asset_partit": 10, "get_job": [22, 54], "get_job_failure_ev": [6, 56], "get_job_success_ev": [6, 56], "get_logged_ev": 10, "get_logged_metadata_entri": 10, "get_manifest": 22, "get_manifest_json": 22, "get_mapping_kei": [6, 57], "get_modul": 52, "get_observ": 10, "get_on": 41, "get_output_asset_kei": 10, "get_output_asset_partit": 10, "get_output_event_for_comput": 57, "get_output_events_for_comput": 57, "get_partit": 51, "get_repo_id": 27, 
"get_resource_vers": 48, "get_run": 22, "get_run_artifact": 22, "get_run_config_for_partition_kei": 51, "get_run_result": 22, "get_run_results_json": 22, "get_run_scoped_output_identifi": 10, "get_run_statu": 28, "get_run_step": 22, "get_secret_valu": 14, "get_solid_vers": 48, "get_step_success_ev": 57, "get_system_temp_directori": [14, 15], "get_tag": [6, 46, 57], "get_template_context": 13, "getdbt": 22, "getenv": [14, 16, 17, 18, 27, 31, 40], "getrunbyid": 22, "giant": [14, 38], "github": 44, "github_app_id": 27, "github_app_private_rsa_kei": 27, "github_hostnam": 27, "github_installation_id": 27, "github_job": 27, "github_op": 27, "github_private_kei": 27, "github_resourc": 27, "give": [1, 6, 14, 15, 20, 28, 38, 41, 57], "given": [1, 6, 9, 10, 12, 13, 14, 20, 22, 24, 28, 31, 34, 38, 40, 48, 50, 51, 52, 53, 54, 55, 56, 57, 58], "gke": 29, "glob": [14, 38, 53, 59], "global": [25, 29, 55, 58], "goe": [14, 38], "going": [14, 22, 38, 50, 57], "good": [3, 14, 15, 25, 27, 34, 38], "googl": 25, "googleapi": 25, "gql": 28, "grab": 9, "gracefulli": [14, 38], "grain": [14, 38, 54], "grandchild": [50, 57], "grant": [25, 27], "graph": [1, 3, 4, 5, 11, 12, 14, 20, 24, 38, 39, 41, 52, 57], "graph_a": 7, "graph_def": [6, 7, 11, 52], "graphdefinit": [5, 6, 7, 11, 40, 47, 50, 52, 56], "graphin": 7, "graphout": 7, "graphql": [9, 29, 31, 32, 36, 40, 51, 56], "graphx": [14, 38], "great_expect": 26, "greater": [9, 14, 20, 38], "greatexpect": 26, "green": 3, "group": [1, 14, 25, 54], "groupid": [14, 38], "grow": [14, 38], "grpc": 9, "grpc_host": 2, "grpc_port": 2, "grpc_socket": 2, "gserviceaccount": 25, "guarante": [9, 14, 20, 38], "guest": [16, 17, 18], "guid": [14, 19, 25, 27, 29, 34, 38], "had": [14, 38], "hadoop": [14, 25, 38], "hadoopjob": 25, "hand": [14, 38, 40], "handi": 35, "handl": [1, 10, 14, 24, 38, 39, 46, 47, 50, 52, 57, 58], "handle_input": 10, "handle_output": [10, 48], "handle_str": [52, 57], "handled_output": [6, 10], "handler": [14, 35, 41, 47], "hang": 41, "happen": [5, 14, 38, 56], "happi": 9, "hard": [9, 14, 25, 38, 47], "hardcod": [10, 55], "hardcoded_io_manag": 10, "hardcoded_resourc": 55, "has": [3, 5, 6, 9, 10, 12, 14, 15, 20, 22, 25, 28, 31, 34, 38, 50, 51, 52, 53, 54, 57, 58, 59], "has_error": [50, 57], "has_input_nam": 10, "has_job": 54, "has_output": 48, "has_partition_kei": [6, 10, 57], "has_tag": [6, 46, 57], "have": [1, 4, 5, 6, 9, 10, 11, 12, 14, 20, 22, 24, 25, 28, 29, 34, 35, 38, 41, 42, 48, 50, 54, 55, 56, 57, 58, 59], "haw": 3, "hcf": 25, "hdf": [14, 25, 38], "hdfs_user_guid": 25, "heap": [14, 38], "heartbeat": [9, 14, 38], "heartbeat_timeout": 2, "heartbeatinterv": [14, 38], "hei": 40, "hello": [3, 31, 39, 50, 57], "hello_op": 59, "hello_world": [39, 50, 57], "hello_world_daily_schedul": 51, "hello_world_partition_set": 51, "hello_world_pipelin": 51, "hello_world_with_default": 3, "help": [9, 10, 14, 16, 38, 54, 55], "helper": [10, 55], "here": [6, 7, 11, 14, 16, 17, 18, 20, 21, 22, 24, 25, 27, 29, 33, 38, 40, 48, 50, 52, 57], "heurist": 13, "hierarch": [50, 57], "high": [14, 38], "higher": [6, 14, 38], "highlight": [50, 57], "highlycompressedmapstatu": [14, 38], "hint": 58, "histogram": 21, "histor": [2, 5, 24], "histori": [2, 9, 14, 38, 51], "hit": [14, 38], "hive": 25, "hivejob": 25, "hoc": 13, "hold": [3, 50], "home": [17, 22, 25], "honor": [14, 41], "honua": 3, "hook": [7, 9, 11, 30, 31, 40, 52], "hook_complet": 6, "hook_def": [6, 7, 8, 11, 31, 40, 52], "hook_error": 6, "hook_fn": 8, "hook_skip": 6, "hook_to_invok": 8, "hook_url": 31, "hookcontext": [8, 31, 40], 
"hookdefinit": [7, 8, 30, 31, 40, 52], "hope": [16, 17, 18], "host": [2, 12, 14, 20, 21, 22, 28, 38, 44], "hostnam": [2, 9, 14, 16, 22, 25, 27, 28, 32, 36, 38], "hour": [25, 51, 56], "hour_of_dai": [51, 56], "hour_offset": [51, 56], "hourli": [14, 38, 51, 56], "hourly_partitioned_config": [51, 56], "hourlypartitionsdefinit": 51, "hous": [14, 38], "how": [1, 3, 6, 7, 10, 11, 14, 16, 20, 21, 22, 38, 41, 42, 52, 56, 57], "howev": [3, 14, 22, 28, 38, 39, 54], "href": 22, "html": [14, 16, 17, 18, 20, 23, 25, 26, 38, 43], "http": [12, 14, 16, 17, 18, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 38, 40, 41, 43, 44, 50, 51, 56, 57], "http_proxi": 31, "https_proxi": 31, "human": [3, 6, 7, 10, 14, 35, 39, 47, 49, 50, 51, 52, 55, 56, 57], "hydrat": [14, 58], "hyphen": 25, "iam": 25, "iana": [51, 56], "idea": [14, 27, 38], "idempot": [16, 17, 18, 20], "idempotency_token": 20, "ident": [1, 55, 58], "identifi": [4, 7, 10, 11, 22, 34, 49, 50, 51, 56, 57, 58], "identity_partition_selector": 51, "idl": [14, 38], "ids": 13, "ietf": 25, "ifnotpres": 29, "ignor": [3, 6, 9, 11, 14, 38], "ignore_handled_error": 22, "ignore_missing_v": 34, "illeg": [14, 38], "imag": [2, 13, 17, 18, 23, 25, 29], "image_nam": [17, 18], "image_pull_polici": [18, 29], "image_pull_secret": [18, 29], "image_vers": 25, "imagepullpolici": 29, "imageuri": 25, "imagevers": 25, "immedi": [9, 14, 38], "immun": [14, 38], "impact": [14, 38], "implement": [6, 7, 9, 10, 11, 12, 14, 15, 16, 18, 22, 24, 25, 34, 38, 48, 49, 50, 52, 56, 57], "implementor": 9, "import": [1, 2, 3, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 27, 29, 30, 31, 38, 39, 40, 41, 42, 50, 51, 52, 55, 57, 59], "import_df_to_bq": 25, "import_file_to_bq": 25, "import_gcs_paths_to_bq": 25, "imprecis": [14, 38], "improv": [9, 14, 38, 41], "in_process": [6, 18, 49, 52], "in_process_executor": [6, 52], "inbound": [14, 38], "includ": [1, 5, 6, 7, 9, 11, 12, 13, 14, 16, 17, 18, 20, 22, 23, 24, 25, 28, 29, 31, 33, 38, 40, 50, 51, 52, 53, 56, 57, 58], "include_exampl": 13, "include_rel": 22, "include_sidecar": 14, "inclus": 51, "incom": [14, 31, 38], "incompat": [5, 15, 50, 57], "incorrect": 25, "increas": [14, 38, 41], "increment": [14, 21, 22, 38], "incur": 9, "indefinit": 41, "independ": [25, 55], "index": [2, 9, 14, 22, 38, 50, 51, 52, 54, 56, 57], "indic": [2, 3, 5, 9, 20, 22, 25, 28, 34, 41, 50, 56, 57], "individu": [4, 6, 49], "inf": 34, "infer": [1, 7, 25, 28, 50, 57], "infinit": [14, 38, 41], "info": [2, 9, 10, 14, 16, 22, 25, 27, 38, 46, 47, 50, 57, 59], "inform": [1, 2, 7, 14, 17, 20, 22, 23, 25, 28, 38, 40, 50, 52, 56, 57, 59], "ingest": 48, "inherit": [5, 9, 13, 34, 47, 50, 55, 57, 58, 59], "init": [20, 34, 55, 58], "init_context": [9, 10, 14, 35, 46, 47, 52, 55], "init_script": 20, "initexecutorcontext": 9, "initi": [3, 5, 6, 9, 10, 12, 13, 14, 15, 19, 20, 22, 24, 30, 38, 46, 47, 55, 56, 57, 59], "initial_last_sync_complet": 24, "initialexecutor": [14, 38], "initializationact": 25, "initialr": [14, 38], "initloggercontext": [14, 35, 47], "initresourcecontext": [10, 55], "inject": [5, 18, 22, 29, 46, 50, 57], "inlin": 39, "inner": [9, 47, 50, 55, 57], "inner_nod": 6, "inner_typ": 3, "input": [1, 3, 5, 6, 7, 9, 11, 14, 22, 26, 34, 38, 39, 41, 46, 47, 50, 52, 55, 58], "input1": 10, "input_config_schema": 10, "input_dagster_typ": 26, "input_def": [7, 22, 39, 46, 50, 52, 57], "input_events_during_comput": 57, "input_map": [6, 7, 57], "input_nam": 57, "input_valu": [6, 7, 11, 57], "inputcontext": [10, 50, 57], "inputdefinit": [7, 10, 34, 39, 46, 50, 52, 
57, 58], "inputmap": [6, 7, 57], "ins": [1, 7, 9, 10, 50, 58], "insensit": 9, "insid": [1, 12, 14, 22, 24, 25, 38], "inst_data": [9, 14, 15, 32], "instal": [13, 14, 20, 25, 27, 29, 33, 40], "installation_id": 27, "instanc": [3, 5, 6, 7, 8, 10, 11, 13, 14, 18, 20, 22, 25, 29, 30, 31, 34, 38, 40, 47, 48, 52, 55, 56, 57, 58], "instance_config_map": [18, 29], "instance_pool_id": 20, "instance_ref": 56, "instance_typ": 9, "instanceof": 58, "instanceref": [9, 56], "instancetyp": 9, "instanti": [6, 9, 14, 15, 22, 32, 35, 36, 47, 52, 55, 57], "instead": [1, 2, 3, 5, 6, 7, 9, 11, 13, 14, 15, 16, 17, 18, 20, 22, 28, 34, 38, 39, 52, 54, 56, 58], "instruct": [22, 27, 29, 33], "insuffici": [14, 38], "int": [1, 3, 4, 5, 6, 9, 12, 13, 14, 20, 22, 24, 25, 28, 34, 44, 50, 51, 52, 54, 55, 56, 57, 58], "integ": [3, 9, 22, 34, 47, 50], "integer_column": 34, "integr": [12, 14, 20, 21, 22, 24, 27, 29, 33, 35, 37, 38, 40, 41, 42, 44, 45], "intend": [6, 7, 9, 14, 18, 22, 28, 50, 52, 54, 57, 58], "intens": 20, "intent": 9, "inter": 2, "interact": [6, 11, 14, 17, 22, 26, 38, 46, 52], "interchang": 57, "interfac": [1, 2, 9, 12, 14, 22, 24], "intermedi": [9, 14, 38], "intern": [1, 2, 6, 10, 14, 22, 24, 25, 28, 32, 34, 36, 38, 49, 52, 57, 58], "internal_asset_dep": 1, "internal_ip_onli": 25, "internaliponli": 25, "interpol": [14, 38], "interrupt": [14, 38], "interv": [9, 14, 22, 38, 51, 56], "intmetadatavalu": [50, 57], "introduc": [16, 17, 18], "introduct": 16, "intsourc": [3, 9, 14, 20, 22, 27, 29, 32, 36, 38, 41, 56], "intuit": 14, "invalid": [5, 22, 28, 50, 53, 57, 59], "invalid_line_no": 22, "invalid_output_nam": 28, "invalid_step_kei": 28, "invalidoutputerror": 28, "invalidoutputerrorinfo": 28, "invalidsteperror": 28, "invari": 5, "invert": 1, "invoc": [1, 6, 7, 8, 11, 13, 29, 47, 52, 55, 56], "invok": [3, 4, 6, 7, 8, 9, 10, 11, 13, 14, 17, 18, 22, 30, 35, 47, 52, 56, 57], "io_manag": [1, 6, 7, 10, 11, 14, 15, 22, 24, 25, 41, 42, 50, 57], "io_manager_def": 1, "io_manager_kei": [1, 4, 10, 22, 24, 50, 57], "iomanag": [1, 10, 22, 48, 57], "iomanagerdefinit": [1, 10, 14, 15, 25, 41], "iomanagerdefnit": 10, "ipc": 2, "ipc_output_fil": 2, "ipipelin": [9, 52], "is_builtin": [34, 58], "is_pres": [50, 57], "is_requir": [3, 4, 34, 50, 57], "is_valid": [50, 57], "isinst": [10, 58], "isol": [13, 29], "ispreempt": 25, "issu": [14, 33, 38], "item": [3, 4], "iter": [1, 14, 38, 52, 56], "its": [1, 2, 4, 6, 7, 9, 10, 11, 13, 14, 16, 17, 18, 20, 22, 24, 38, 46, 47, 50, 51, 52, 54, 55, 56, 57], "itself": [1, 2, 3, 5, 6, 7, 11, 14, 38, 52], "ivi": [14, 38], "ivy2": [14, 38], "ivyset": [14, 38], "jar": [14, 25, 38], "jar_file_uri": 25, "jarfileuri": 25, "java": [14, 38], "javaseri": [14, 38], "javax": [14, 38], "jitter": [50, 57], "jni": [14, 38], "job": [1, 3, 4, 5, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 33, 35, 38, 40, 41, 42, 43, 47, 48, 50, 51, 52, 54, 55, 56, 57, 58], "job_config": 25, "job_def": [6, 47, 57], "job_definition_id": 22, "job_id": 22, "job_imag": [18, 29], "job_nam": [6, 8, 9, 13, 28, 51, 54, 56, 57], "job_namespac": [18, 29], "job_runn": 29, "job_scoped_clust": 25, "job_select": [40, 56], "job_timeout_in_second": 25, "job_wait_timeout": 18, "job_with_all_asset": 1, "job_with_multiple_select": 1, "job_with_one_select": 1, "jobconfigvalidationinvalid": 28, "jobdefinit": [1, 5, 6, 7, 11, 13, 15, 47, 49, 50, 52, 54, 56, 57], "jobfactori": 11, "jobid": 25, "jobnotfounderror": 28, "join": [4, 14, 38], "json": [1, 2, 3, 4, 6, 7, 11, 12, 22, 24, 38, 39, 50, 52, 57], 
"json_console_logg": 47, "jsonmetadatavalu": [50, 57], "jsonrpc_vers": 22, "jupyt": [6, 11, 46, 52], "just": [3, 23], "jvm": [14, 20, 38], "k8s": 18, "k8s_job": 29, "k8s_job_executor": 29, "k8srunlaunch": [18, 29], "kafka": [14, 38], "kdc": 25, "kdcdbkeyuri": 25, "keep": [14, 29, 38, 41, 51], "keepal": 44, "keepalive_interv": 44, "kei": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 33, 34, 38, 44, 46, 49, 51, 52, 55, 56, 58], "kerber": 25, "kerbero": 25, "kerberosconfig": 25, "key_fil": 44, "key_label_nam": 3, "key_prefix": 1, "key_str": 44, "key_typ": 3, "keypassworduri": 25, "keystor": 25, "keystorepassworduri": 25, "keystoreuri": 25, "keyword": [3, 17, 22, 50, 55], "kib": [14, 38], "kill": [14, 22, 38], "killblacklistedexecutor": [14, 38], "killen": [14, 38], "killtimeout": [14, 38], "kind": [1, 4, 6, 11, 34, 46, 52, 58], "kit": 40, "kms": 20, "kms_kei": 20, "kmskeyuri": 25, "know": [1, 6, 11, 14, 16, 38, 39, 41, 52], "known": 51, "kryo": [14, 38], "kryoregistr": [14, 38], "kryoseri": [14, 38], "kube": 19, "kubeconfig": [18, 29], "kubeconfig_fil": [18, 29], "kubectl": 29, "kubernet": [9, 14, 16, 19, 28, 38], "kwarg": [3, 5, 6, 7, 9, 11, 13, 22, 39, 47, 55, 57, 58, 59], "kwd": [42, 51], "lab": 22, "label": [4, 18, 25, 29, 34, 50, 51, 57], "lack": [14, 38], "lake": [15, 20], "lambda": [6, 11, 51, 52], "lambda_solid": [52, 57], "larg": [14, 38], "larger": [14, 38], "last": [6, 10, 14, 22, 24, 38, 51, 56, 57], "last_completion_tim": 56, "last_run_kei": 56, "latenc": [14, 38], "later": [14, 38], "latest": [9, 14, 16, 17, 18, 20, 25, 26, 29, 38, 43], "latter": [9, 47, 50, 55, 57], "launch": [2, 9, 12, 14, 17, 18, 20, 22, 23, 24, 29, 38, 51, 56], "launch_run": 9, "launcher": [2, 6, 14, 18, 20, 23, 29, 57], "launchpipelineexecut": 2, "lazi": [2, 54], "lazili": 54, "lazy_loaded_repositori": 54, "lead": [14, 38], "leader": [14, 38], "leaf": 57, "least": [14, 38], "leav": [3, 7, 14, 29, 38], "left": [12, 24, 29, 40], "legaci": [6, 7, 8, 9, 13, 14, 38, 50, 55], "len": [1, 50, 57], "length": [3, 5, 14, 25, 38, 41], "lengthi": 9, "less": [9, 14, 16, 17, 18, 38, 51], "let": [14, 16, 38], "letter": [25, 50], "level": [1, 2, 3, 6, 7, 9, 10, 11, 14, 16, 22, 25, 34, 38, 47, 50, 52, 57, 59], "lib": 25, "libjar": 25, "librari": [9, 12, 14, 16, 20, 21, 22, 24, 27, 28, 29, 33, 34, 35, 37, 38, 39, 40, 41, 42, 44, 45, 47], "lifecycl": [52, 57], "lifetim": 25, "like": [3, 6, 9, 11, 14, 18, 22, 24, 27, 28, 29, 38, 39, 41, 46, 47, 50, 51, 52, 55, 57], "limit": [9, 14, 22, 29, 38, 57], "line": [14, 22, 38, 53], "lineag": [14, 38, 50, 57], "linear": [50, 57], "lint": 29, "list": [1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 20, 22, 23, 24, 25, 29, 30, 34, 38, 39, 40, 46, 48, 49, 50, 51, 52, 53, 54, 56, 57, 58, 59], "list_file_system": 15, "list_objects_v2": 14, "list_run_artifact": 22, "list_vers": 2, "listdir": 54, "listen": [14, 16, 17, 18, 38], "listenerbu": [14, 38], "liter": 3, "littl": [14, 38], "live": [1, 2, 14, 38, 56], "liveupd": [14, 38], "load": [1, 2, 3, 6, 7, 9, 10, 11, 14, 18, 22, 29, 32, 36, 38, 39, 41, 50, 51, 52, 53, 54, 57, 58, 59], "load_assets_from_dbt_manifest": 22, "load_assets_from_dbt_project": 22, "load_dict": 58, "load_incluster_config": [18, 29], "load_input": [10, 48], "load_kube_config": [18, 29], "load_table_from_local_parquet": 41, "loaded_input": [6, 10], "loader": [3, 34, 58], "loader_vers": 58, "loadrepositori": 2, "local": [9, 14, 15, 16, 17, 19, 20, 23, 38, 52, 53, 58], "local_artifact_storag": [9, 10], 
"local_artifact_storage_data": 9, "local_bas": [53, 59], "local_compute_log_manag": 9, "local_dagster_job_package_path": 20, "local_dbt_rpc_resourc": 22, "local_dir": [14, 15, 38], "local_disk0": 20, "local_file_manag": 9, "local_job_package_path": 14, "local_output_notebook_io_manag": 46, "local_pipeline_package_path": [14, 20], "local_warehous": [53, 59], "localartifactstorag": 9, "localclust": 19, "localcomputelogmanag": 9, "localfilehandl": 58, "localhost": [2, 14, 16, 17, 18, 28, 30, 31, 33, 40], "locat": [9, 14, 18, 20, 25, 28, 29, 38], "log": [2, 5, 6, 8, 10, 12, 14, 15, 16, 18, 20, 22, 24, 25, 29, 30, 32, 35, 36, 38, 41, 46, 52, 55, 57, 59], "log_ev": [6, 10, 57], "log_group_nam": 14, "log_level": [2, 14], "log_manag": [6, 9, 10, 55], "log_materi": [6, 57], "log_param": 30, "log_request": 22, "log_stream_nam": 14, "logblockupd": [14, 38], "logconf": [14, 38], "logger": [3, 6, 7, 11, 12, 14, 22, 24, 35, 44, 46, 49, 52, 57, 59], "logger_config": [14, 35, 47], "logger_def": [6, 7, 11, 46, 47, 49], "logger_fn": [14, 35, 47], "logger_to_init": 47, "loggerdefinit": [3, 7, 11, 14, 35, 47, 49], "logging_tag": [6, 46], "loggingconfig": 25, "logic": [10, 13, 14, 35, 38, 47, 50, 55, 57, 58], "login": [14, 20, 29, 41], "login_timeout": 41, "loglevel": 16, "logs_captur": 6, "logs_start": 22, "logwrit": 25, "long": [14, 15, 20, 25, 38, 56], "longer": [14, 38, 50, 57], "longform": [14, 38], "look": [1, 3, 7, 9, 13, 22, 56], "lookup": [14, 38], "lookuptimeout": [14, 38], "loop": [14, 29, 38], "los_angel": [41, 51, 56], "lost": [14, 38], "lot": [14, 38], "low": 20, "lower": [14, 34, 38], "lowercas": [16, 17, 18], "lsf": 19, "lz4": [14, 38], "lz4compressioncodec": [14, 38], "lzf": [14, 38], "lzfcompressioncodec": [14, 38], "machin": [2, 14, 25, 38], "machineri": [9, 32, 34, 36, 46, 50, 57, 58], "machinetyp": 25, "machinetypeuri": 25, "maco": 29, "macro": 22, "made": [6, 7, 14, 38, 46, 47, 51, 52, 55, 56, 57], "magic": [9, 55], "magic_word": 9, "magicmock": [15, 55], "mai": [1, 3, 5, 6, 7, 8, 9, 11, 14, 15, 16, 17, 18, 20, 22, 25, 29, 38, 39, 41, 46, 47, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59], "main": [20, 25, 34, 38, 40], "main_class": 43, "mainclass": 25, "mainjarfileuri": 25, "mainpythonfileuri": 25, "maintain": 9, "majmin": 29, "make": [2, 3, 7, 9, 10, 14, 15, 16, 17, 18, 20, 25, 27, 38, 41, 51, 54, 55], "make_airflow_dag": 13, "make_airflow_dag_container": 13, "make_airflow_dag_for_oper": 13, "make_airflow_example_dag": 13, "make_bar_job": [6, 11, 52], "make_dagster_job_from_airflow_dag": 13, "make_dagster_pipeline_from_airflow_dag": 13, "make_dagster_repo": 13, "make_dagster_repo_from_airflow_dag_bag": 13, "make_dagster_repo_from_airflow_dags_path": 13, "make_dagster_repo_from_airflow_example_dag": 13, "make_email_on_run_failure_sensor": 59, "make_expensive_job": 54, "make_expensive_schedul": 54, "make_job": 11, "make_python_type_usable_as_dagster_typ": 58, "make_repo_from_dag_bag": 13, "make_repo_from_dir": 13, "make_request": [12, 22, 24], "make_slack_on_pipeline_failure_sensor": 40, "make_slack_on_run_failure_sensor": 40, "make_teams_on_pipeline_failure_sensor": 31, "make_values_resourc": 55, "malform": 5, "man": 21, "manag": [2, 5, 6, 8, 14, 15, 16, 18, 20, 22, 25, 29, 38, 41, 46, 47, 48, 50, 55, 57], "managed_logg": 47, "managedgroupconfig": 25, "mani": [6, 14, 18, 20, 29, 38, 50, 54, 56, 57], "manifest": 22, "manifest_json": 22, "manipul": 58, "manner": 55, "manual": [7, 14, 20, 22, 24, 38, 41], "map": [1, 3, 5, 6, 7, 8, 10, 11, 12, 14, 18, 22, 25, 34, 38, 51, 52, 57, 58], 
"map_config_op": 3, "mapped_op": 4, "mappedinputplacehold": 7, "mapping_from": 57, "mapping_kei": [4, 6, 10, 57], "mapping_to": 57, "mapr": 25, "mapreduc": [14, 25, 38], "mapreducetutori": 25, "maps_from": 57, "maps_to": 57, "mark": [3, 14, 18, 38], "markdown": [34, 50, 57, 58], "markdownmetadatavalu": [50, 57], "master": [14, 25, 29, 38], "master_url": 43, "masterconfig": 25, "match": [5, 9, 14, 22, 34, 38, 48, 50, 51, 55, 57], "materi": [1, 3, 6, 9, 10, 12, 14, 15, 22, 24, 25, 34, 46, 50, 56, 57, 58], "materialization_events_during_comput": 57, "materializations_during_comput": 57, "materialize_df": 58, "math_pipelin": 52, "matter": [14, 38, 56], "maven": [14, 38], "max": [14, 34, 38, 50, 57], "max_attempt": 14, "max_catchup_run": 56, "max_completion_wait_time_second": 20, "max_concurr": [6, 29], "max_concurrent_run": 9, "max_datetim": 34, "max_retri": [22, 50, 57], "max_tick_retri": 56, "max_valu": 34, "max_work": [2, 20], "maxattempt": [14, 38], "maxblocksinflightperaddress": [14, 38], "maxchunksbeingtransf": [14, 38], "maxconsecutiveattempt": [14, 38], "maxexecutor": [14, 38], "maxfailedexecutorspernod": [14, 38], "maxfailedtasksperexecutor": [14, 38], "maxfailur": [14, 38], "maxfailuresperhour": 25, "maxim": [14, 29, 38], "maximum": [2, 9, 12, 14, 20, 22, 24, 25, 38, 50, 56, 57], "maxpartitionbyt": [14, 38], "maxrat": [14, 38], "maxrateperpartit": [14, 38], "maxregisteredresourceswaitingtim": [14, 38], "maxremoteblocksizefetchtomem": [14, 38], "maxreqsinflight": [14, 38], "maxresults": [14, 38], "maxretainedfil": [14, 38], "maxretri": [14, 38], "maxsiz": [14, 38], "maxsizeinflight": [14, 38], "maxtaskattemptsperexecutor": [14, 38], "maxtaskattemptspernod": [14, 38], "md_str": [50, 57], "mean": [3, 4, 14, 34, 38, 40, 56, 59], "meant": [5, 14, 15, 25, 34, 58], "measur": [14, 38], "mechan": [14, 16, 38, 41], "median": [14, 38], "meet": [1, 6, 7, 11, 39, 50, 52, 57], "mem_io_manag": 10, "member": [5, 9, 54], "memoiz": [2, 10, 52], "memoizableiomanag": 48, "memoizaton": [7, 11], "memoized_run_tag": 48, "memori": [6, 7, 9, 10, 11, 14, 15, 20, 38, 41], "memory_onli": [14, 38], "memory_only_s": [14, 38], "memoryfract": [14, 38], "memorymapthreshold": [14, 38], "memoryoverhead": [14, 38], "merg": [14, 22, 38, 53], "meso": [14, 38], "mesos_sandbox": [14, 38], "messag": [5, 6, 8, 9, 14, 28, 31, 35, 38, 40, 46, 47, 56, 59], "message_fn": [31, 40], "met": 9, "metadata": [1, 4, 6, 7, 9, 10, 11, 14, 22, 25, 26, 28, 38, 39, 41, 48, 52, 58], "metadata_entri": [1, 4, 5, 22, 50, 57, 58], "metadataentri": [1, 4, 34, 50, 57], "metadatavalu": [4, 34, 50, 57], "method": [4, 6, 9, 10, 11, 12, 13, 14, 21, 22, 24, 26, 28, 30, 35, 40, 41, 47, 48, 50, 52, 55, 57], "metric": [14, 21, 25, 38], "mgmt": 14, "mgr": 10, "mib": [14, 38], "midnight": [51, 56], "might": [3, 14, 38, 39, 58], "migrat": [2, 50], "mileston": [14, 38], "millisecond": [2, 14, 38], "min": [20, 34], "min_datetim": 34, "min_valu": 34, "min_work": 20, "minexecutor": [14, 38], "minim": [14, 38], "minimum": [14, 20, 38, 56], "minimum_interval_second": 56, "minrateperpartit": [14, 38], "minregisteredresourcesratio": [14, 38], "minut": [14, 20, 25, 38, 51, 56], "minute_of_hour": [51, 56], "minute_offset": [51, 56], "mirror": 21, "mismatch": 15, "miss": [22, 56], "missing_column": [50, 57], "missing_th": [50, 57], "mitig": [14, 38], "mixin": 9, "mlf_exampl": 30, "mlflow_op": 30, "mlflow_s3_endpoint_url": 30, "mlflow_solid": 30, "mlflow_track": 30, "mlflow_tracking_uri": 30, "mlflowclient": 30, "mnt": 17, "moab": 19, "mock": [8, 10, 15, 55], 
"mock_resourc": 55, "mode": [2, 6, 7, 8, 9, 13, 14, 28, 38, 41, 46, 51, 52, 53, 56, 57], "mode_def": [6, 8, 9, 20, 22, 31, 46, 52, 57], "modedefinit": [6, 8, 9, 20, 22, 31, 46, 49, 52, 57], "model": [13, 22], "modifi": [13, 16, 17, 18, 22, 50, 57], "modifyaconnector": 24, "modul": [1, 2, 6, 9, 11, 13, 14, 15, 16, 17, 18, 29, 32, 36, 47, 50, 52, 53, 57, 59], "module_nam": [2, 9, 13], "moduletyp": 1, "monitor": [9, 12, 14, 22, 24, 38, 40, 56], "month": [51, 56], "monthli": [51, 56], "monthly_partitioned_config": [51, 56], "monthlypartitionsdefinit": 51, "more": [5, 6, 14, 16, 20, 22, 25, 27, 38, 40, 41, 50, 55, 56, 57, 59], "most": [14, 16, 17, 18, 20, 22, 24, 34, 38, 52, 56, 57, 58, 59], "mostli": 15, "mount": [14, 18, 29], "mrkdwn": 40, "msg": [47, 50], "msg_fn": 9, "msteams_resourc": 31, "much": [9, 14, 29, 38], "mult_two": 52, "multi": [1, 7, 14, 16, 38], "multi_asset": 1, "multi_or_in_process_executor": [1, 7], "multi_out": 50, "multidependencydefinit": 7, "multipl": [1, 2, 6, 7, 10, 11, 14, 16, 24, 25, 38, 50, 52, 56, 57], "multipli": [14, 38], "multiprocess": [6, 49, 52, 59], "multiprocess_executor": [6, 11, 52], "must": [1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 20, 25, 28, 29, 31, 34, 38, 39, 47, 48, 49, 50, 51, 52, 53, 55, 56, 57, 58], "mutabl": 52, "mutat": [2, 28], "my_airbyte_job": 12, "my_airbyte_resourc": 12, "my_asset": 1, "my_assets_job": 1, "my_aws_key_id": 30, "my_channel": 40, "my_composed_airbyte_job": 12, "my_composed_fivetran_job": 24, "my_custom_dbt_run": 22, "my_custom_path_fs_io_manag": 10, "my_dag_bag": 13, "my_dagster_job": 13, "my_dashboard": [50, 57], "my_dataset": [50, 57], "my_dbt_cli_job": 22, "my_dbt_cloud_job": 22, "my_dbt_cloud_resourc": 22, "my_dbt_output": 22, "my_dbt_rpc_job": 22, "my_downstream_op": 7, "my_experi": 30, "my_first_dbt_model": 22, "my_fivetran_job": 24, "my_fivetran_resourc": 24, "my_funct": [50, 57], "my_graph": [6, 11, 39, 52], "my_int_var": 55, "my_io_manag": 10, "my_io_manager_kei": 10, "my_job": [7, 10, 14, 15, 16, 25, 40, 41, 42, 54, 55, 56], "my_message_fn": [31, 40], "my_modul": [16, 17, 18], "my_new_project": 22, "my_op": [10, 25, 38, 55], "my_org": 22, "my_other_t": [50, 57], "my_pipelin": [20, 31, 40], "my_prefix": 1, "my_project": 29, "my_pyspark_resourc": 38, "my_repo": [17, 18, 31, 40], "my_repo_nam": 13, "my_return_n_": 54, "my_run_config_fn": 51, "my_s3_endpoint": 30, "my_sas_token": 15, "my_schedul": 54, "my_secret": 30, "my_sensor": 56, "my_simple_airbyte_job": 12, "my_simple_fivetran_job": 24, "my_slack_token": 40, "my_snowflake_job": 41, "my_spark": 20, "my_spark_job": 38, "my_storage_account": 15, "my_str_var": 55, "my_tabl": [50, 57], "my_table_schema": [50, 57], "my_text_label": [50, 57], "my_upstream_asset": 1, "my_upstream_graph": 7, "my_upstream_op": 7, "my_us": 17, "my_valu": 22, "my_vari": 22, "myclass": [50, 57], "mycompani": [18, 29], "myconfigurableclass": 9, "mycoolsit": [40, 50, 57], "mycorp": 14, "myiomanag": 10, "mymodul": 11, "mysql_db": 32, "mysql_url": 32, "mysqleventlogstorag": [9, 32], "mysqlrunstorag": [9, 32], "mysqlschedulestorag": [9, 32], "mytabl": [50, 57], "n_worker": 19, "naiv": [25, 34], "name": [1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 34, 38, 39, 40, 41, 43, 46, 49, 50, 51, 52, 53, 54, 56, 57, 58, 59], "name1": [14, 38], "name2": [14, 38], "namedtemporaryfil": 9, "namedtupl": 46, "namespac": [1, 18, 29], "nativ": [14, 38], "necessari": [7, 9, 14, 20, 22, 24, 38, 58], "need": [6, 7, 10, 11, 14, 15, 16, 20, 21, 25, 
27, 33, 34, 35, 38, 40, 50, 52, 54, 57, 58, 59], "neg": [6, 9, 14, 38, 50, 57], "neither": [25, 34, 58], "nest": [6, 7, 11, 30, 52, 57], "net": [14, 38], "netti": [14, 38], "network": [9, 14, 15, 17, 23, 25, 38], "network_timeout": 41, "network_uri": 25, "networkuri": 25, "never": [6, 10, 12, 14, 16, 17, 18, 22, 24, 38, 56, 57], "new": [9, 14, 15, 16, 17, 18, 20, 29, 33, 38, 40, 47, 50, 52, 53, 54, 55, 57], "new_clust": 20, "newer": [14, 38], "newli": 9, "next": [4, 24, 50, 57], "next_asset": 1, "no_host_key_check": 44, "node": [1, 3, 6, 7, 11, 14, 15, 20, 22, 25, 38, 46, 52, 57], "node_a": 7, "node_b": 7, "node_def": [6, 7], "node_info_to_asset_kei": 22, "node_nam": 6, "node_str": 6, "node_typ": 20, "node_type_id": 20, "nodedefinit": [6, 7], "nodehandl": [6, 52, 57], "nodeinvoc": [6, 7, 52, 57], "nois": 40, "non": [1, 3, 9, 14, 17, 23, 25, 34, 38, 39], "non_argument_dep": 1, "non_nul": 34, "non_scalar_schema": 3, "noncancel": [14, 38], "none": [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 24, 25, 26, 28, 29, 30, 31, 32, 34, 39, 40, 41, 43, 44, 46, 47, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59], "none_resourc": 55, "noneabl": [3, 5], "nonetyp": 3, "nor": [25, 34, 58], "normal": [8, 22, 57], "normalized_nam": 13, "nosigint": 59, "not_start": 9, "note": [1, 3, 9, 10, 14, 15, 16, 17, 18, 20, 22, 27, 28, 32, 33, 36, 38, 39, 41, 47, 50, 54, 56, 57], "notebook": [6, 11, 46, 52], "notebook_path": 46, "noth": [22, 39, 55, 56, 58], "nothing_int_job": 58, "nothing_job": 58, "notic": 47, "notif": 40, "novaluesentinel": [4, 50, 57], "novel": 1, "now": [20, 27, 28, 49], "ntype": 58, "null": [9, 34, 50], "nullabl": 50, "num": [6, 7, 14, 38, 50, 52, 57], "num_allowed_row": 34, "num_input": 20, "num_row": [50, 57], "num_work": 20, "number": [2, 6, 9, 12, 14, 19, 20, 22, 24, 25, 28, 29, 34, 38, 41, 50, 51, 56, 57], "numconnectionsperp": [14, 38], "numer": [34, 41], "numeric_column": 34, "numinst": 25, "numlocalssd": 25, "numpi": 41, "numrbackendthread": [14, 38], "numretri": [14, 38], "oar": 19, "oauth": [14, 38], "oauth2accesstoken": 29, "obj": 10, "object": [3, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 20, 22, 24, 25, 26, 28, 29, 31, 34, 38, 40, 46, 47, 48, 49, 50, 51, 52, 54, 55, 56, 57, 58], "object_store_oper": 6, "objectadmin": 25, "objectstreamreset": [14, 38], "observ": [4, 10], "occasion": [14, 38], "occur": [5, 6, 7, 9, 11, 14, 25, 28, 38, 39, 50, 52, 57], "ocsp": 41, "ocsp_response_cache_filenam": 41, "ocur": 6, "of_typ": 34, "off": [2, 14, 24, 25, 29, 38, 48], "offer": [14, 38], "offheap": [14, 38], "offici": [20, 28], "offset": [14, 22, 38, 51], "often": [5, 14, 38], "old": [14, 15, 38], "older": [14, 38], "omit": [14, 22, 25, 28, 34, 38, 57, 58, 59], "onc": [3, 6, 9, 27, 33, 40, 55, 57, 58], "one": [1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 14, 16, 17, 18, 19, 20, 22, 23, 25, 28, 29, 38, 41, 47, 50, 51, 52, 53, 54, 56, 57, 58, 59], "ones": [51, 56], "ongo": 9, "onli": [1, 2, 3, 5, 6, 7, 9, 10, 11, 12, 14, 20, 22, 24, 25, 34, 38, 39, 41, 47, 48, 49, 50, 51, 52, 54, 55, 56, 57, 58], "onlin": 21, "onto": [13, 29, 57], "oom": [14, 38], "op_a": [7, 10], "op_b": [7, 10], "op_c": 7, "op_config": [3, 4, 6, 8, 54], "op_def": [6, 10, 57], "op_definit": [12, 22, 24, 25], "op_except": 8, "op_kwarg": 13, "op_output_valu": 8, "op_retry_polici": [6, 7, 11], "op_select": [6, 7, 11, 28], "op_tag": 1, "op_to_invok": 6, "opdefinit": [6, 8, 10, 12, 20, 22, 24, 25, 39, 41, 50, 57], "open": [9, 14, 27, 38, 58], "opencostinbyt": [14, 38], "oper": [12, 13, 14, 22, 24, 26, 27, 28, 38, 41, 49, 52], 
"opexecutioncontext": 6, "ops": [1, 3, 4, 6, 7, 10, 11, 14, 15, 20, 21, 22, 25, 26, 33, 34, 39, 47, 54, 55, 57, 58], "opt": [18, 29], "optim": [14, 20, 38], "option": [1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 34, 35, 36, 38, 39, 40, 41, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59], "optionalcompon": 25, "orchestr": [4, 9], "order": [3, 9, 14, 15, 18, 20, 22, 29, 34, 38, 50, 57, 58], "order_bi": 22, "ordinari": 14, "ore": 56, "org": [14, 25, 38, 43, 51, 56], "organ": [50, 57], "origin": [3, 5, 7, 9, 18, 22, 29, 52], "original_exc_info": 5, "original_root": 5, "other": [1, 3, 5, 6, 7, 9, 11, 14, 18, 22, 38, 41, 50, 51, 52, 54, 57, 59], "other_asset": 1, "other_asset_key_a": 1, "other_asset_key_b": 1, "other_expensive_job": 54, "other_nam": 7, "other_op": [12, 24], "other_op_a": [6, 7, 11], "other_op_b": [6, 7, 11], "other_result": 7, "other_solid": [51, 53, 56], "other_solid_a": 52, "other_solid_b": 52, "otherwis": [6, 14, 18, 22, 24, 29, 38, 41, 48, 50, 57], "ought": 34, "our": 9, "out": [1, 2, 4, 6, 7, 10, 12, 14, 22, 24, 29, 38, 57], "outer": 57, "outer_graph": 6, "outliv": 29, "output": [1, 2, 4, 5, 6, 7, 8, 9, 12, 14, 22, 24, 25, 26, 28, 31, 34, 38, 40, 41, 46, 48, 50, 52, 58, 59], "output_asset_partition_kei": [6, 57], "output_asset_partitions_time_window": [6, 57], "output_captur": [6, 52, 57], "output_config_schema": 10, "output_def": [7, 22, 39, 46, 50, 52, 57], "output_events_during_comput": 57, "output_for_nod": 6, "output_for_solid": [52, 57], "output_map": [6, 7, 57], "output_nam": [4, 6, 10, 46, 50, 52, 57], "output_notebook": 46, "output_notebook_nam": 46, "output_valu": [6, 57], "outputcontext": [10, 48, 50, 57], "outputdefinit": [7, 10, 22, 34, 46, 50, 52, 57, 58], "outputmap": [6, 7, 57], "outsid": [9, 10, 14, 29, 38, 55], "over": [2, 14, 21, 22, 28, 38, 51, 54, 56], "overestim": [14, 38], "overhead": [14, 38], "overload": 20, "overrid": [2, 3, 6, 7, 11, 13, 14, 22, 26, 29, 38, 39, 57], "overridden": [14, 22, 31, 38, 40, 51, 56], "override_system_timezon": 2, "overview": [18, 20, 29, 50, 57], "overwrit": [10, 14, 15, 25, 38], "overwritten": [1, 6, 7, 11, 25, 46, 52], "own": [5, 6, 11, 14, 22, 38, 50, 52, 55, 57], "owner": [20, 27], "pack": [14, 38], "packag": [1, 2, 9, 14, 15, 20, 25, 38, 52, 53, 59], "package_modul": 1, "package_nam": [1, 2], "packet": 44, "page": [14, 21, 22, 24, 27, 38], "pagerduty_op": 33, "pagerduty_resourc": 33, "pagerduty_test": 33, "pagin": 22, "pair": [9, 18, 20, 23, 29, 50, 51, 52, 56, 57], "panda": [22, 41], "pandascolumn": 34, "papertrail_logg": 35, "parallel": [14, 25, 29, 38], "param": [2, 3, 9, 14, 22, 36, 38], "paramet": [1, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 24, 26, 28, 31, 34, 35, 38, 39, 40, 41, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59], "parameter": [7, 11, 51, 56, 57], "parametr": 52, "paramiko": 44, "paramstyl": 41, "parent": [10, 22, 30, 50, 57], "parent_run_id": [9, 30, 52], "parquet": 3, "pars": [3, 5, 6, 8, 9, 12, 22, 24, 53, 57, 58, 59], "part": [4, 27, 31, 50], "parti": 9, "partial": [3, 25], "partially_specified_config": 3, "particular": [1, 6, 9, 10, 14, 38, 51], "particularli": 9, "partit": [1, 2, 6, 7, 9, 10, 11, 14, 38, 50, 53, 57], "partition_fn": 51, "partition_kei": [6, 10, 11, 51, 57], "partition_map": 1, "partition_selector": 51, "partition_set": [51, 54, 56], "partition_set_def": 51, "partition_time_window": [6, 57], "partitioned_config": [6, 11], "partitionedconfig": [7, 11, 49, 51], 
"partitionmap": 1, "partitionmetadataentri": [4, 50, 57], "partitions_def": [1, 7, 11, 51], "partitionscheduledefinit": [51, 56], "partitionsdefinit": [1, 7, 11, 51], "partitionset": 51, "partitionsetdefinit": [51, 54], "pass": [1, 2, 3, 6, 7, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 28, 29, 30, 31, 34, 38, 39, 40, 46, 47, 50, 51, 52, 54, 55, 56, 57, 58], "password": [9, 14, 17, 18, 23, 25, 29, 32, 36, 41, 44], "past": [9, 51, 56], "patch": [22, 24], "path": [2, 4, 9, 10, 13, 14, 15, 16, 18, 20, 22, 25, 38, 46, 50, 53, 57, 58, 59], "path_desc": [14, 15, 25, 58], "path_prefix": 2, "pathlik": [50, 57], "pathmetadatavalu": [50, 57], "pattern": [9, 53, 59], "paus": [14, 38], "pawel": 16, "pawelzni": 16, "payload": [24, 31, 33], "pbs": 19, "pdb": [6, 57, 59], "peer": 4, "pem": 14, "pend": [14, 16, 38], "pendingnodeinvoc": 8, "pendulum": 51, "peopl": 38, "per": [1, 6, 9, 10, 14, 19, 25, 29, 38, 56, 57], "perform": [2, 5, 9, 14, 22, 27, 38, 41, 48, 50, 51, 54, 57], "period": [9, 14, 22, 38, 51, 56], "periodicgc": [14, 38], "permiss": [3, 5, 9, 14, 16, 17, 18, 19, 20, 22, 23, 25, 27, 29, 30, 36, 38, 41, 58], "permit": [6, 52], "persist": [9, 14, 15, 20, 25, 29, 34, 38, 50, 52, 56, 57, 58], "person": 27, "photo": 21, "pick": [16, 17, 18, 25], "pickl": [3, 10, 14, 15, 25], "pid": 6, "piec": [9, 14, 20, 38], "pig": 25, "pigjob": 25, "pip": 33, "pipe": 39, "pipelin": [4, 6, 7, 8, 9, 10, 11, 13, 14, 20, 22, 28, 30, 31, 35, 39, 40, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56, 57, 59], "pipeline_cancel": 6, "pipeline_code_origin": 9, "pipeline_context": 46, "pipeline_def": [6, 13, 46, 47, 52, 53, 57], "pipeline_def_for_backwards_compat": 55, "pipeline_dequeu": 6, "pipeline_enqueu": 6, "pipeline_failur": 6, "pipeline_failure_sensor": 56, "pipeline_nam": [6, 8, 9, 10, 13, 28, 31, 40, 51, 52, 56, 57], "pipeline_run": [6, 9, 31, 40, 46, 55, 56, 57], "pipeline_run_statu": 56, "pipeline_select": [40, 56], "pipeline_snapshot_id": 9, "pipeline_start": [6, 52, 57], "pipeline_success": 6, "pipelineconfigurationinvalid": 28, "pipelinedefinit": [6, 11, 13, 40, 46, 47, 49, 52, 54, 56, 57], "pipelineexecutionresult": 52, "pipelinefailuresensorcontext": [31, 40, 56], "pipelinenotfounderror": 28, "pipelinerun": [6, 9, 13, 46, 55, 56, 57], "pipelinerunreact": 56, "pipelinerunstatu": [9, 28, 56], "pkg_resourc": [53, 59], "pkg_resource_def": [53, 59], "place": [9, 10, 14, 22, 29, 34, 38, 50, 58], "placehold": 3, "placement": 25, "plai": 56, "plain": 40, "plan": [9, 13, 14, 18], "plan_context": 9, "plane": [14, 38], "planorchestrationcontext": 9, "platform": 25, "playground": [7, 11], "pleas": [9, 14, 38], "plu": 56, "plug": 9, "pluggabl": [3, 9], "plugin": 9, "plus_minu": [50, 57], "pod": [16, 18, 28, 29], "point": [2, 14, 16, 17, 18, 38, 46], "pointer": 52, "polici": [7, 9, 11, 18, 29, 50, 52, 57], "poll": [12, 14, 20, 22, 24, 38], "poll_interv": [12, 22, 24], "poll_interval_sec": 20, "poll_run": 22, "poll_sync": 24, "poll_timeout": [12, 22, 24], "polling_timeout": 9, "pollinginterv": [14, 38], "pool": [14, 20, 38], "poor": [14, 38], "pop": [6, 10, 57], "popul": 22, "port": [2, 9, 12, 14, 20, 22, 28, 29, 32, 35, 36, 38, 44], "port_numb": 28, "posit": [6, 14, 38, 50, 51, 56, 57, 59], "positional_input": 57, "possibl": [3, 9, 14, 18, 20, 29, 38, 51], "post": [3, 16, 22, 24, 31, 33, 40], "post_messag": 31, "postgr": [9, 18, 29], "postgres_db": [9, 36], "postgres_password_secret": [18, 29], "postgres_url": 36, "postgreseventlogstorag": [9, 36], "postgresql": [18, 29], "postgresrunstorag": [9, 36], 
"postgresschedulestorag": [9, 36], "postmessag": 40, "postpend": 13, "potenti": [14, 22, 38, 52], "power": 9, "pre": [3, 9, 14, 22, 28, 38], "preambl": 5, "preced": [7, 10, 14, 38, 57], "predefin": [2, 50, 53], "predict": [6, 11, 52], "preemptibl": 25, "prefer": [6, 7, 50, 56, 57], "preferdirectbuf": [14, 38], "prefix": [1, 2, 12, 14, 15, 16, 20, 22, 24, 25, 46], "pregel": [14, 38], "prepend": [14, 38], "presenc": [3, 34, 50, 57], "present": [3, 6, 14, 18, 25, 27, 28, 29, 33, 38, 40, 48, 49, 50, 54, 57], "preserv": [9, 14, 38, 50], "preset": [13, 28, 52, 57], "preset_def": [6, 11, 52], "presetdefinit": [52, 53], "presetnotfounderror": 28, "pressur": [14, 38], "pretti": 27, "prevent": [14, 38], "preview": [2, 25], "previou": [5, 9, 10, 14, 15, 24, 25, 38, 48, 50, 52, 57], "primarili": [14, 28, 38, 56], "primit": [3, 5], "princip": 25, "print": [2, 29, 50, 55, 57], "printgcdetail": 20, "prior": 24, "prioriti": 6, "priv": 29, "privat": [20, 27, 29], "proactiv": [14, 38], "problem": 22, "proce": [1, 5], "process": [1, 2, 3, 4, 5, 6, 7, 9, 11, 13, 14, 20, 22, 26, 29, 38, 41, 42, 52, 56], "process_directori": 4, "process_fil": 4, "produc": [1, 4, 5, 6, 7, 9, 10, 22, 50, 52, 55, 57, 58, 59], "product": [16, 52], "profil": [14, 20, 22, 38], "profile_nam": 14, "profiles_dir": 22, "program": [14, 25, 38], "programat": [6, 7, 12, 22, 24, 27, 50], "programmat": [34, 58], "progress": [9, 12, 14, 22, 24, 29, 38], "project": [22, 25], "project_and_instance_metadata": 25, "project_dir": 22, "project_id": [22, 25], "projectid": 25, "prometheus_cli": 37, "prometheus_resourc": 37, "prometheusresourc": 37, "proper": [14, 38], "properli": [14, 38, 41], "properti": [6, 8, 9, 10, 14, 15, 22, 24, 25, 34, 38, 46, 50, 52, 54, 57, 58, 59], "protect": [14, 38], "protocol": [21, 31], "provid": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 27, 28, 29, 30, 33, 34, 35, 38, 39, 40, 41, 42, 44, 45, 47, 48, 49, 50, 51, 52, 53, 55, 56, 57, 58, 59], "provis": [20, 25, 27, 33], "proxi": [14, 31, 38], "ptat": [14, 38], "public": [14, 17, 20, 21, 23, 38], "publish": 21, "pull": [14, 18, 22, 23, 27, 29], "purpos": [8, 14, 38, 58], "push": 29, "put": [1, 14, 38, 40, 50, 57], "putobjectacl": 20, "py37": 29, "pyamqp": [16, 17, 18], "pyfil": [14, 38], "pyformat": 41, "pylint": 39, "pyspark": [14, 20, 25], "pyspark_resourc": 38, "pysparkjob": 25, "pytest": 29, "python": [1, 2, 3, 5, 6, 7, 9, 11, 13, 14, 15, 16, 20, 21, 25, 34, 38, 41, 47, 50, 52, 53, 57, 58, 59], "python_artifact": [50, 57], "python_fil": [2, 20], "python_logging_levels_nam": 9, "python_modul": [16, 29], "python_typ": 58, "python_valu": 3, "pythonartifactmetadatavalu": [50, 57], "pythonerror": 28, "pythonfileuri": 25, "pythonobjectdagstertyp": [50, 57, 58], "pythonoper": 13, "pythonpath": [14, 38], "qmark": 41, "qualifi": [14, 38], "qualiti": [50, 57], "quantil": [14, 38], "queri": [1, 2, 6, 7, 9, 11, 14, 21, 22, 25, 28, 41, 52], "query1": 25, "query2": 25, "query3": 25, "query4": 25, "queryfileuri": 25, "querylist": 25, "queu": 9, "queue": [9, 14, 16, 18, 38], "queuedruncoordin": 9, "quickstart": 27, "quit": [14, 29, 38], "quux": 9, "rabbitmq": [16, 17, 18, 29], "rack": [14, 38], "rais": [5, 6, 7, 10, 11, 12, 22, 24, 28, 39, 41, 46, 50, 52, 53, 55, 56, 57, 59], "raise_on_error": [5, 6, 7, 11, 52, 57], "random": [50, 57], "randomli": [29, 50, 57], "rang": [10, 14, 38, 51, 54, 59], "rapidli": [14, 38], "rasset_key_prefix": 22, "rate": [14, 38], "rather": [9, 14, 22, 29, 38, 50, 52, 55, 57, 58], "ratio": [14, 38], "raw": [9, 14, 
22, 38], "raw_conn": 41, "raw_output": 22, "rawmetadatavalu": [50, 57], "rbackend": [14, 38], "rdd": [14, 38], "reach": [14, 16, 38], "react": 56, "read": [2, 3, 9, 10, 13, 14, 20, 27, 38, 39, 41, 58], "read_csv": 10, "read_data": 9, "read_fil": 9, "read_writ": 25, "readabl": [3, 6, 7, 10, 14, 35, 39, 47, 49, 50, 51, 52, 55, 56, 57, 58], "readi": 58, "readonli": 25, "readrc": 59, "readthedoc": [16, 23], "real": 16, "realm": 25, "reaper": [14, 38], "reason": [5, 28, 56], "rebuild": 2, "recalcul": 22, "receiv": [2, 5, 14, 34, 38, 58], "receive_processed_config_valu": [3, 57], "recent": [24, 56], "reclaim": [14, 38], "recommend": [12, 13, 14, 20, 22, 24, 38, 52, 57, 58], "recon_repo": 13, "reconstruct": [13, 14, 38, 57], "reconstruct_context": [52, 57], "reconstruct_job": 11, "reconstructable_arg": 11, "reconstructable_bar_job": [6, 11, 52], "reconstructable_foo_job": [6, 11, 52], "reconstructable_kwarg": 11, "reconstructablepipelin": [6, 11, 52], "reconstructablerepositori": [13, 52], "reconstructor_function_nam": 11, "reconstructor_module_nam": 11, "reconstructor_working_directori": 11, "record": [1, 6, 9, 10, 14, 38, 50, 57], "recov": [14, 38], "recoveri": [14, 38, 56], "recoverymod": [14, 38], "recurs": [3, 5], "red": 3, "redact": [14, 38], "redi": [16, 17, 18], "redshift_configur": 14, "redshift_resourc": 14, "reduc": [14, 38], "reducebykei": [14, 38], "redund": [14, 38], "reexecut": 52, "reexecute_pipelin": 52, "reexecute_pipeline_iter": 52, "ref": [9, 22, 44], "refactor": 57, "refer": [1, 13, 14, 15, 18, 20, 21, 22, 25, 26, 29, 33, 38, 40, 43, 50, 57, 58], "referenc": [14, 50, 57], "referencetrack": [14, 38], "refresh": [13, 22, 28], "refresh_from_airflow_db": 13, "regardless": [14, 38], "regex": [14, 38], "region": [14, 20, 25, 29, 38], "region_nam": 14, "regist": [14, 38], "registr": [14, 38], "registrationrequir": [14, 38], "registri": [17, 23, 29], "regress": [14, 38], "regular": [9, 10, 58], "rehydr": 9, "reindex": 2, "rel": [4, 59], "relat": [4, 14, 22, 25, 38, 48, 50, 57], "relationship": [1, 25], "relative_path": 59, "relaunch": [14, 38], "releas": 25, "relev": [2, 5, 14, 22, 26, 27, 38], "reli": 54, "reliabl": 20, "reload": 28, "reload_repository_loc": 28, "reloadnotsupport": 28, "reloadrepositorylocationinfo": 28, "reloadrepositorylocationstatu": 28, "remain": 7, "remap": 57, "rememb": [14, 38], "remot": [2, 9, 14, 20, 25, 28, 29, 38, 44], "remote_host": 44, "remote_port": 44, "remov": [14, 38], "render": [14, 38], "repeat": [3, 9], "repeat_word": 3, "repl": [6, 11, 52], "replac": [6, 11, 14, 38], "replai": [14, 38], "replenish": [14, 38], "replic": [14, 38], "replica": [14, 38], "repo": [13, 14, 29, 38], "repo_location_nam": 18, "repo_nam": [13, 27], "repo_own": 27, "report": [9, 22, 24, 25], "report_engine_ev": 9, "repositori": [1, 2, 13, 14, 18, 22, 23, 27, 28, 29, 31, 38, 40, 52, 56], "repository_data": 54, "repository_location_nam": 28, "repository_nam": [28, 56], "repositorydata": 54, "repositorydefinit": [1, 7, 13, 52, 54], "repositorylocationloadfailur": 28, "repositorylocationnotfound": 28, "repres": [1, 3, 4, 6, 7, 9, 10, 12, 14, 22, 24, 38, 48, 50, 51, 53, 55, 56, 57], "represent": [3, 6, 9, 14, 15, 22, 50, 52, 57, 58], "request": [2, 10, 12, 14, 20, 22, 24, 25, 27, 28, 31, 38, 41, 50, 57], "request_6": 24, "request_max_retri": [12, 22, 24], "request_retry_delai": [12, 22, 24], "request_token": 22, "requir": [1, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 24, 25, 29, 34, 38, 39, 41, 46, 49, 50, 52, 55, 56, 57, 58], "required_resource_kei": [1, 5, 
8, 9, 10, 14, 15, 20, 21, 22, 30, 31, 33, 34, 38, 39, 40, 41, 43, 46, 50, 52, 55, 57, 58], "rerais": 5, "reserv": [1, 10], "reset": [14, 38], "resid": [1, 14, 38, 58], "resolut": [14, 38], "resolv": [2, 3, 5, 14, 38, 55], "resolve_standoff": 3, "resolved_run_config": 46, "resolvedrunconfig": 46, "resourc": [1, 3, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 18, 20, 21, 25, 27, 29, 30, 31, 33, 34, 37, 38, 39, 40, 41, 44, 46, 48, 49, 50, 52, 53, 57, 58, 59], "resource_config": [10, 52, 55], "resource_def": [1, 6, 7, 9, 10, 11, 12, 14, 15, 20, 21, 22, 24, 25, 27, 30, 31, 33, 38, 40, 41, 42, 46, 49, 52, 55], "resource_definit": 38, "resource_fn": [5, 10, 55], "resource_keys_to_init": 46, "resource_nam": [5, 55], "resource_str": [53, 59], "resource_to_init": 55, "resourcedefinit": [1, 3, 5, 7, 9, 10, 11, 12, 14, 15, 20, 21, 22, 24, 25, 27, 30, 31, 33, 37, 38, 40, 41, 43, 44, 45, 49, 55], "resources_config": [6, 57], "resourceversioncontext": 48, "respect": [2, 14, 16, 20, 29, 41, 46], "respond": 20, "respons": [2, 9, 12, 21, 22, 24, 41], "response_dict": 22, "rest": [12, 22, 24], "restart": [2, 14, 20, 25, 28, 38, 50, 57], "restrict": [20, 25], "result": [1, 2, 3, 4, 7, 9, 10, 11, 12, 13, 14, 16, 17, 18, 21, 22, 24, 25, 28, 38, 41, 46, 47, 50, 51, 52, 55, 56, 57, 58, 59], "result_for_handl": [52, 57], "result_for_solid": [52, 57], "resum": 9, "resume_run": 9, "resync": 24, "resync_and_pol": 24, "resync_paramet": 24, "retain": [14, 38], "retainedbatch": [14, 20, 38], "retaineddeadexecutor": [14, 38], "retaineddriv": [14, 38], "retainedexecut": [14, 38], "retainedexecutor": [14, 38], "retainedjob": [14, 38], "retainedrootrdd": [14, 38], "retainedstag": [14, 38], "retainedtask": [14, 38], "rethrown": 5, "retri": [6, 7, 9, 11, 12, 14, 16, 17, 18, 20, 22, 23, 24, 29, 38, 50, 52, 56], "retriev": [6, 10, 12, 18, 20, 24, 25, 27, 29, 52, 54, 57], "retry_attempt": 57, "retry_interv": 22, "retry_numb": [6, 57], "retry_polici": [7, 50, 57], "retrymod": 9, "retrypolici": [7, 11, 50, 52, 57], "retryrequest": [46, 50, 57], "retrywait": [14, 38], "return": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 22, 24, 26, 28, 31, 34, 38, 39, 40, 41, 46, 47, 48, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59], "return_cod": 22, "return_n": 54, "return_n_": 54, "return_on": [6, 7, 52], "return_text": 22, "reus": [14, 25, 38], "reusabl": 9, "revers": [14, 22, 38], "reverseproxi": [14, 38], "reverseproxyurl": [14, 38], "reviv": [14, 38], "rewritten": [14, 38], "rfc": 25, "rfc1035": 25, "rich": 16, "rigidli": [14, 38], "role": [3, 20, 25, 41], "roll": [14, 38], "root": [9, 14, 25, 29, 38], "root_input_manag": 10, "root_manag": 1, "root_manager_kei": [10, 50, 57], "root_run_id": 9, "rootinputmanag": [10, 50, 57], "rootinputmanagerdefinit": 10, "rootlogg": 25, "rootprincipalpassworduri": 25, "rouberol": 16, "rout": 33, "routing_kei": 33, "row": [22, 34, 50, 57], "rowcountconstraint": 34, "rowdict": 58, "rpc": [14, 16, 17, 18, 38], "rsa": 27, "rule": [3, 5, 34, 58], "run": [1, 5, 7, 8, 10, 11, 13, 14, 16, 17, 18, 20, 21, 22, 23, 24, 25, 26, 27, 28, 30, 31, 32, 36, 38, 40, 46, 47, 48, 50, 51, 53, 55, 57, 59], "run_cancel": 6, "run_config": [3, 6, 7, 9, 11, 13, 14, 20, 21, 27, 28, 30, 33, 40, 41, 46, 52, 53, 54, 56, 57], "run_config_fn": 56, "run_config_fn_for_partit": 51, "run_config_for_partition_fn": 51, "run_coordin": 9, "run_coordinator_data": 9, "run_dbt_nightly_sync": 22, "run_dequeu": 6, "run_enqueu": 6, "run_failur": 6, "run_failure_sensor": 56, "run_fn": 21, "run_id": [5, 6, 7, 8, 9, 10, 11, 22, 28, 46, 47, 50, 52, 55, 57, 
59], "run_job": 22, "run_job_and_pol": 22, "run_kei": [54, 56], "run_launch": [9, 18, 29], "run_launcher_data": 9, "run_nam": 20, "run_oper": 22, "run_result": 22, "run_resultsjson": 22, "run_sql": 22, "run_start": 6, "run_status_sensor": 56, "run_status_sensor_fn": 56, "run_status_sensor_to_invok": 56, "run_storag": [9, 32, 36], "run_storage_data": 9, "run_success": 6, "run_updated_aft": 9, "runawai": [14, 38], "runconfigdata": 28, "runconfigvalidationinvalid": 28, "runconflict": 28, "runcoordin": 9, "runfailuresensorcontext": [40, 56], "runlaunch": [9, 14, 18, 23, 29], "runnabl": 59, "runner": 29, "runrequest": [54, 56], "runshardedeventscursor": 9, "runstatussensorcontext": 56, "runstatussensordefinit": 56, "runstorag": 9, "runtim": [3, 4, 5, 7, 11, 14, 20, 22, 25, 34, 38, 47, 57, 58], "runtime_metadata_fn": 22, "s3_bucket": [14, 58], "s3_file": 14, "s3_file_manag": [14, 46], "s3_job_package_path": 14, "s3_kei": [14, 58], "s3_path": [14, 58], "s3_pickle_asset_io_manag": 14, "s3_pickle_io_manag": 14, "s3_pipeline_package_path": 14, "s3_prefix": 14, "s3_resourc": [3, 14], "s3_session": 14, "s3computelogmanag": [9, 14], "s3coordin": 14, "s3filecach": 14, "s3filehandl": [14, 46, 58], "safe": [14, 38, 40, 55], "safe_mod": 13, "safeti": [14, 38], "same": [1, 2, 3, 4, 5, 6, 7, 9, 10, 14, 16, 20, 38, 47, 50, 55, 56, 57, 58], "sampl": [4, 22, 25], "sample_data": 10, "sample_output": 10, "sanit": [50, 57], "sas": 15, "satisfi": [3, 6, 9, 47, 50, 52, 55, 57], "satur": [14, 38], "saturdai": 56, "save": [14, 26, 38], "saveashadoopfil": [14, 38], "scaffold": [2, 13], "scaffold_config": 2, "scala": [14, 38], "scala2": 20, "scalar": [3, 6, 52], "scalar_typ": 3, "scalarunion": 3, "scale": [14, 20, 38], "scan": [14, 38], "scenario": [14, 38], "schedul": [14, 19, 22, 24, 25, 36, 38, 54], "schedule_definit": 51, "schedule_nam": 51, "schedule_storag": [9, 32, 36], "schedule_storage_data": 9, "schedule_typ": [24, 51], "scheduled_execution_tim": 56, "scheduledefinit": [1, 51, 54, 56], "scheduleevaluationcontext": [51, 56], "scheduler_data": 9, "schedulerbacklogtimeout": [14, 38], "schedulestorag": 9, "schema": [3, 5, 7, 9, 10, 11, 12, 14, 22, 24, 28, 34, 35, 41, 47, 50, 55, 57, 58], "schema1": 24, "schema2": 24, "schema_nam": 24, "scheme": [14, 38], "scope": [6, 11, 14, 20, 25, 27, 29, 35, 38, 47, 52, 55], "scoped_resources_build": 6, "scopedresourc": 55, "scratch": [14, 38], "script": [14, 20, 25, 38, 39, 52, 53], "scriptvari": 25, "search": [14, 38, 50, 57], "second": [9, 12, 14, 18, 20, 22, 24, 25, 31, 38, 41, 44, 50, 51, 56, 57], "second_compon": [50, 57], "secondaryworkerconfig": 25, "seconds_to_wait": [50, 57], "secret": [7, 11, 14, 15, 18, 20, 24, 29], "secret_bool_op": 3, "secret_int_op": 3, "secret_job": 3, "secret_kei": 15, "secret_key_kei": 20, "secret_op": 3, "secret_scop": 20, "secretid": 14, "secrets_tag": 14, "secrets_to_env_vari": 20, "secretsmanager_resourc": 14, "secretsmanager_secrets_resourc": 14, "section": [14, 18, 29, 38], "secur": [2, 18, 25, 29], "securili": 27, "securityconfig": 25, "see": [9, 12, 13, 14, 16, 17, 18, 19, 22, 23, 24, 25, 26, 27, 29, 38, 40, 41, 43, 48, 52], "seed": 22, "seek": [9, 14, 38], "seem": 27, "select": [1, 3, 6, 7, 11, 14, 22, 25, 41, 52], "select_color": 3, "selected_asset": 1, "selected_unique_id": 22, "selector": [3, 5, 15, 16, 17, 18, 19, 20, 22, 23, 29, 51], "self": [10, 14, 25, 38, 54, 58], "semicolon": 25, "send": [2, 9, 12, 14, 16, 21, 22, 24, 31, 38, 40, 44], "send_messag": 8, "sens": [16, 17, 18], "sensit": [14, 25, 38], "sensor": [31, 40, 54], 
"sensor_nam": 56, "sensordefinit": [1, 54, 56], "sensorevaluationcontext": 56, "sent": [2, 14, 20, 38, 40, 56], "separ": [4, 9, 10, 14, 25, 29, 38, 52], "sequenc": [1, 10, 41, 50, 51, 52, 56, 57], "sequenti": 20, "serd": [9, 25], "seri": [9, 16], "serial": [1, 2, 9, 10, 14, 15, 25, 38, 56], "serializ": [4, 9, 11, 14, 15, 38, 50, 57], "serializable_error_info_from_exc_info": 9, "serializableerrorinfo": 9, "serv": [2, 14, 22, 28, 38, 51], "server": [2, 9, 12, 14, 20, 21, 22, 25, 28, 29, 30, 31, 38, 41], "servic": [14, 18, 20, 22, 25, 29, 33, 38], "service_account_nam": [18, 29], "service_check": 21, "serviceaccount": 25, "serviceaccountscop": 25, "servlet": [14, 38], "session": [14, 41], "set": [1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 32, 34, 35, 36, 38, 39, 41, 46, 47, 49, 50, 51, 52, 53, 55, 56, 57, 58, 59], "set_trac": [6, 57, 59], "setup": [14, 24, 30, 38], "seven": [14, 15], "sever": [3, 14, 20, 33], "sge": 19, "shape": [3, 5], "shard": 9, "share": [14, 15, 25, 27, 38, 55], "shell": [14, 22, 38], "shell_command": 39, "shell_command_op": 39, "shell_command_solid": 39, "shell_op": 39, "shell_script_path": 39, "shell_solid": 39, "shim": [34, 46, 58], "ship": 52, "short": [14, 25, 38, 50, 57], "shortcut": 57, "should": [2, 3, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 24, 25, 29, 32, 34, 36, 38, 39, 41, 42, 46, 47, 48, 50, 51, 52, 54, 55, 56, 57, 58, 59], "should_autocreate_t": 36, "should_execut": [51, 56], "should_start_background_run_thread": 9, "show": [2, 10, 14, 22, 38, 59], "show_profil": [14, 38], "showconsoleprogress": [14, 38], "shown": [20, 25, 41], "shrink": [14, 38], "shuffl": [14, 20, 38], "shut": [2, 14, 16, 28, 38], "shutdown": [2, 14, 38], "shutdown_repository_loc": 28, "shutdownrepositorylocationinfo": 28, "sid": 45, "side": [6, 11, 14, 20, 38, 40, 41, 50, 52, 57], "sidecar": 14, "sign": 25, "signal": [2, 16], "signatur": [7, 9, 50, 57], "signific": [9, 14, 38], "significantli": [14, 38], "silenc": [14, 38], "similar": 57, "simpl": [4, 34, 41, 54], "simple_job": 54, "simple_repositori": 54, "simpler": [14, 38], "simpli": [14, 22, 38], "simplifi": 57, "simultan": [14, 38], "sinc": [2, 6, 9, 10, 14, 16, 20, 25, 38, 52, 57], "singl": [1, 2, 3, 6, 7, 10, 11, 14, 15, 18, 20, 22, 25, 38, 39, 50, 51, 52, 53, 56, 57, 58], "sit": 1, "site": [5, 14, 25, 38], "situat": [14, 38], "size": [14, 20, 25, 38, 50, 57], "skip": [10, 14, 22, 38, 56, 57, 59], "skip_empty_fil": 14, "skip_messag": 56, "skipreason": 56, "slack": [8, 9], "slack_job": 40, "slack_message_on_failur": 8, "slack_message_on_success": 8, "slack_on_failur": 40, "slack_on_pipeline_failur": 40, "slack_on_run_failur": 40, "slack_on_success": 40, "slack_op": 40, "slack_resourc": 40, "slack_sdk": 40, "slack_token": 40, "sleep": 58, "slice": [41, 42, 51], "slightli": 56, "slow": [14, 16, 17, 18, 29, 38], "slower": [14, 38], "slowli": [14, 38], "slurm": 19, "small": [14, 38], "smaller": [14, 38], "snappi": [14, 38], "snappycompressioncodec": [14, 38], "snapshot": 22, "snapshot_fresh": 22, "snippet": 25, "snowflak": 15, "snowflake_account": 41, "snowflake_databas": 41, "snowflake_io_manag": [41, 42], "snowflake_op_for_queri": 41, "snowflake_password": 41, "snowflake_resourc": 41, "snowflake_schema": 41, "snowflake_us": 41, "snowflake_warehous": 41, "snowflakeconnect": 41, "snowflakepandastypehandl": [41, 42], "socket": [2, 14, 38], "softwar": [14, 15, 25], "softwareconfig": 25, "solid": [3, 6, 7, 8, 9, 10, 13, 16, 20, 25, 26, 28, 30, 31, 39, 46, 48, 49, 50, 
51, 52, 53, 56, 59], "solid_config": [6, 8, 46, 57], "solid_def": [6, 10, 46, 52, 57], "solid_definit": 22, "solid_except": 8, "solid_handl": [6, 46], "solid_nam": [31, 46, 57], "solid_output_valu": 8, "solid_result_list": [52, 57], "solid_retry_polici": 52, "solid_select": [9, 28, 51, 52, 53, 56], "solid_selection_str": 52, "solid_to_invok": 57, "soliddefinit": [3, 6, 8, 10, 20, 22, 39, 46, 52, 57], "solidexecutioncontext": [6, 22, 57], "solidexecutionresult": [52, 57], "solidinvoc": 52, "solids_to_execut": [9, 52], "solidversioncontext": 48, "solut": 13, "some": [3, 9, 10, 14, 15, 16, 18, 20, 22, 28, 29, 38, 56, 59], "some_asset": 1, "some_asset_kei": 1, "some_celery_backend_url": 18, "some_celery_broker_url": 18, "some_config": 3, "some_config1": 3, "some_config2": 3, "some_directori": 54, "some_graph": 7, "some_job": 54, "some_model_nam": 30, "some_modul": 2, "some_op": [6, 7, 11, 12, 24], "some_param": 30, "some_run_id": 28, "some_sensor": 54, "some_solid": [51, 52, 53, 56], "some_validation_fn": [50, 57], "someon": 3, "someth": [57, 59], "sometim": 29, "somewher": 40, "sonnest": [51, 56], "soonest": [51, 56], "sort": [3, 14, 22, 38], "sourc": [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59], "source_asset": 1, "sourceasset": 1, "sourcehashversionstrategi": 48, "space": [9, 14, 20, 38], "spark": [1, 14, 20, 25, 38], "spark_conf": [20, 38, 43], "spark_config": 14, "spark_daemon_java_opt": 20, "spark_env_var": 20, "spark_hom": 43, "spark_local_dir": [14, 20, 38], "spark_local_ip": [14, 38], "spark_python_task": 20, "spark_resourc": 43, "spark_sess": 38, "spark_vers": 20, "spark_worker_memori": 20, "sparkconf": [14, 25, 38], "sparkcontext": [14, 38], "sparkjob": 25, "sparklisten": [14, 38], "sparkoperror": 43, "sparkpi": 20, "sparkr": [14, 38], "sparkr_driver_r": [14, 38], "sparksess": 38, "sparkspi": 20, "sparksqljob": 25, "spars": [14, 38], "spawn": [2, 18], "special": [14, 38], "specif": [3, 6, 7, 8, 9, 14, 16, 19, 20, 22, 25, 31, 38, 40, 46, 47, 52, 55, 56, 57, 58], "specifi": [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 18, 20, 22, 23, 24, 25, 29, 31, 34, 38, 40, 41, 46, 47, 50, 51, 52, 55, 56, 57, 58], "specul": [14, 20, 38], "speed": [14, 38], "spill": [14, 38], "spin": 39, "splendidrunstorag": 9, "split": [51, 56], "spread": [14, 25, 38], "spun": 13, "sql": [9, 14, 22, 25, 38, 41], "sql_queri": [25, 41], "sqlalchemi": 41, "sqleventlogstorag": 9, "sqlite": 9, "sqliteeventlogstorag": 9, "sqliterunstorag": 9, "sqliteschedulestorag": 9, "sqlrunstorag": 9, "sqlschedulestorag": 9, "src": [38, 41], "ssd": 25, "sse": 20, "ssh": [19, 20], "ssh_port": 44, "ssh_public_kei": 20, "ssh_resourc": 44, "sshresourc": 44, "ssl": [2, 14, 25], "sslmode": 14, "stabil": [14, 38], "stabl": [16, 17, 18, 23, 29], "stack": [5, 9], "stackoverflowerror": [14, 38], "stage": [14, 15, 25, 38], "staging_bucket": 14, "staging_prefix": [14, 20], "standalon": [14, 38, 52], "standard": [7, 9, 11, 14, 25, 38, 47, 50, 57], "start": [2, 9, 14, 17, 18, 19, 24, 29, 31, 34, 38, 40, 50, 51, 56, 57], "start_aft": [12, 24], "start_asset": 1, "start_dat": [51, 56], "start_resync": 24, "start_sync": 24, "stat": [14, 38], "state": [5, 9, 14, 15, 20, 22, 25, 28, 38, 50, 56, 57], "statement": 2, "static": [1, 4, 9, 10, 34, 50, 51, 53, 54, 55, 57, 59], "static_partitioned_config": 51, "staticmethod": 9, "staticpartitionsdefinit": 51, "statu": [9, 14, 22, 24, 
25, 28, 31, 38, 40, 50, 51, 56, 57], "status": 2, "stderr": [2, 9, 14, 15], "stderrfrom": 20, "stdin": 59, "stdout": [2, 9, 14, 15, 20, 22, 59], "step": [5, 6, 8, 9, 10, 14, 16, 17, 18, 20, 22, 23, 27, 28, 29, 30, 31, 38, 40, 48, 50, 52, 57, 59], "step_context": 10, "step_event_list": [52, 57], "step_events_by_kind": 57, "step_execution_context": [6, 8, 57], "step_expectation_result": [6, 57], "step_failur": [6, 57], "step_handl": 6, "step_input": [6, 57], "step_kei": [6, 8, 9, 10, 28, 59], "step_keys_to_execut": 9, "step_kind_valu": 6, "step_launch": [6, 57], "step_output": [6, 57], "step_restart": 6, "step_select": 52, "step_skip": 6, "step_start": 6, "step_success": [6, 57], "step_up_for_retri": 6, "stepexecutioncontext": 10, "stepfailuredata": 57, "stepkind": 6, "steplaunch": [6, 57], "still": [3, 14, 25, 38], "stop": [2, 9, 14, 20, 22, 28, 31, 38, 40, 51, 56], "stopgracefullyonshutdown": [14, 38], "storag": [2, 5, 14, 15, 18, 20, 25, 28, 29, 30, 32, 36, 38, 49, 52], "storage_account": 15, "storage_account_key_kei": 20, "storage_account_nam": 20, "storage_id": 9, "storagefract": [14, 38], "storagelevel": [14, 38], "store": [1, 2, 9, 10, 14, 20, 25, 27, 29, 38, 40, 50, 57, 58], "store_serialized_dag": 13, "str": [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 22, 24, 25, 26, 28, 31, 34, 35, 39, 40, 41, 46, 47, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59], "str_param": 58, "str_valu": 57, "straightforward": 58, "strategi": [3, 6, 11, 14, 38, 48, 52], "stream": [9, 12, 14, 20, 38, 52], "streamingcontext": [14, 38], "strict": [9, 14, 16, 17, 18, 19, 20, 23, 25, 29, 36], "strict_column_list": 34, "strict_flag": 22, "strictcolumnsconstraint": 34, "strictli": [14, 20, 38], "string": [1, 2, 3, 4, 5, 6, 7, 9, 11, 12, 14, 16, 17, 18, 20, 22, 23, 24, 25, 27, 28, 29, 30, 33, 34, 38, 39, 40, 44, 46, 48, 49, 50, 51, 52, 53, 54, 56, 57, 58, 59], "string_column": 34, "stringio": 9, "stringsourc": [3, 5, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 29, 31, 32, 36, 38, 41, 44, 45], "structur": [6, 7, 9, 14, 38, 47, 50, 52, 57, 59], "structured_asset_kei": [50, 57], "structured_asset_key_2": [50, 57], "stub": [50, 57], "stuff": 59, "sub": [1, 9, 14, 38], "sub0": 25, "subclass": [5, 9, 13, 14, 38, 48, 54, 58, 59], "subminor": 25, "submiss": 25, "submit": [9, 14, 18, 20, 25, 28, 38, 43, 56], "submit_job_execut": 28, "submit_pipeline_execut": 28, "submit_run": 9, "subnet": 25, "subnetwork": 25, "subnetwork_uri": 25, "subnetworkuri": 25, "subselect": [1, 51, 53, 56], "subsequ": [7, 10, 14, 15, 22, 25, 29, 38, 41], "subset": [1, 12, 22, 24, 52], "substanti": [14, 38], "subtract": 51, "succe": [8, 20, 57], "succeed": [8, 22, 24, 50, 57], "success": [2, 5, 6, 8, 9, 12, 14, 22, 24, 28, 31, 34, 38, 40, 41, 50, 52, 57, 58], "success_hook": 8, "successfulli": [22, 24], "suffix": [14, 38], "suggest": [20, 22], "suit": 26, "suitabl": [14, 15, 25], "suite_nam": 26, "summari": 33, "summarize_directori": 4, "sundai": [51, 56], "super": 30, "supervis": [14, 38], "suppli": [3, 5, 7, 11, 18, 22, 29, 51, 59], "support": [1, 4, 9, 11, 13, 14, 15, 20, 21, 22, 25, 27, 33, 38, 39, 40, 47, 50, 51, 56, 57], "suppress": 2, "sure": [14, 27, 38], "surfac": [2, 14], "sustainedschedulerbacklogtimeout": [14, 38], "svc": [21, 29], "svv": 29, "switch": [6, 7, 11, 52], "symbol": [14, 38], "sync": [12, 24, 51], "sync_and_pol": [12, 24], "sync_foobar": [12, 24], "synchron": [5, 9, 22, 28, 52], "syntax": [6, 22, 58], "synthes": 48, "sys": 5, "system": [2, 3, 5, 6, 9, 11, 14, 15, 16, 20, 28, 38, 49, 50, 52, 55, 57, 58], 
"tab": [12, 24, 33, 59], "tabl": [1, 12, 13, 22, 24, 25, 41, 42, 57], "table1": 24, "table2": 24, "table_nam": [12, 24], "table_schema": [50, 57], "tablecolumn": [50, 57], "tablecolumnconstraint": 50, "tableconstraint": 50, "tablemetadatavalu": [50, 57], "tablerecord": [50, 57], "tableschema": [50, 57], "tableschemametadatavalu": [50, 57], "tabluar": [50, 57], "tabular": [50, 57], "tag": [1, 6, 7, 9, 11, 13, 14, 20, 21, 25, 28, 29, 30, 39, 46, 48, 50, 52, 53, 56, 57], "tag_concurrency_limit": 9, "tags_fn": 56, "tags_fn_for_partit": 51, "tags_for_partition_fn": [51, 56], "tagsmor": 20, "take": [7, 14, 19, 20, 22, 26, 31, 34, 38, 40, 41, 50, 51, 54, 55, 56, 57, 58], "taken": 24, "tar": 25, "target": [6, 11, 14, 18, 22, 29, 38, 51, 52, 56], "target_dir": 22, "target_path": 22, "task": [13, 14, 17, 18, 22, 25, 29, 38], "task_definit": 14, "task_id": 22, "task_tag": 22, "taskinst": 13, "team": 9, "teams_on_failur": 31, "teams_on_pipeline_failur": 31, "teams_on_success": 31, "teams_pipelin": 31, "teams_solid": 31, "teams_webhook_url": 31, "teamsclient": 31, "teardown": 55, "technic": 22, "tell": [6, 7, 9, 24, 52], "temp": 9, "tempfil": 9, "templat": 13, "temporari": [9, 10], "tend": [14, 38], "term": [20, 50, 51], "termin": [20, 22, 25, 28, 41], "terminate_run": 28, "test": [2, 5, 9, 15, 22, 25, 38, 50, 52, 53, 56, 57], "test_handle_output": 10, "test_load_input": 10, "test_project": 29, "test_util": 13, "test_valu": 3, "text": [2, 22, 27, 31, 40, 50, 57], "text_fn": 40, "text_messag": 31, "text_metadata": [50, 57], "text_usag": 40, "textio": 9, "textmetadatavalu": [50, 57], "tgtlifetimehour": 25, "tgz": 25, "than": [3, 6, 9, 14, 16, 17, 18, 20, 22, 25, 27, 29, 38, 50, 51, 52, 55, 57, 58], "thank": 29, "the_asset_group": 1, "the_job": 7, "the_resourc": 55, "thei": [1, 2, 3, 6, 7, 9, 10, 11, 13, 14, 15, 20, 22, 38, 50, 52, 55, 57, 58, 59], "them": [2, 5, 9, 10, 14, 20, 22, 24, 25, 29, 34, 38, 39, 47, 50, 52, 56, 57, 58, 59], "themselv": [6, 7, 52], "therefor": 22, "thi": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59], "thin": [21, 27, 33, 40], "thing": [14, 27, 38, 52, 59], "third": 9, "those": [1, 6, 7, 10, 11, 14, 20, 22, 38, 41, 47, 52, 56], "though": [14, 38], "thousand": [14, 38], "thrash": 25, "thread": [1, 2, 9, 14, 19, 22, 38, 41], "threaddump": [14, 38], "threads_per_work": 19, "three": 10, "threshold": [14, 22, 34, 38], "through": [3, 9, 13, 14, 20, 22, 34, 38, 46, 47, 50, 51, 57, 58, 59], "throw": [3, 5, 14, 22, 28, 38, 57], "thrown": [5, 8, 22, 34, 58], "thu": 6, "tick": 56, "ticket": 25, "tighter": 27, "tightli": [14, 38], "time": [1, 2, 3, 4, 6, 7, 10, 11, 12, 13, 14, 16, 17, 21, 22, 23, 24, 25, 38, 39, 41, 50, 51, 52, 56, 57, 58], "timeout": [2, 14, 19, 20, 31, 37, 38, 41, 44], "timeout_second": 20, "timestamp": [9, 24, 34], "timewindowpartitionsdefinit": [6, 10, 51, 57], "timezon": [2, 34, 41, 51, 56], "titl": 27, "tmp": [14, 15, 25, 29, 38], "to_job": [6, 7, 11, 20, 47, 52], "to_source_asset": 1, "to_str": [50, 57], "to_user_str": [50, 57], "togeth": [50, 57], "toggl": 48, "token": [15, 20, 22, 27, 29, 40, 45], "too": [14, 29, 38], "tool": [9, 13, 20, 22, 34, 50, 52, 53, 57, 58], "top": [1, 3, 6, 7, 9, 11, 12, 22, 24, 52, 57], "topic": 40, "torn": 55, "torrentbroadcastfactori": [14, 38], "total": [14, 20, 38], "touch": 9, "toward": 14, "trace": [2, 5, 9, 47], "track": [14, 30, 38, 50, 52, 57], "transfer": [14, 38], 
"transform": [14, 38, 58], "transform_word": 3, "transient": [9, 14, 38], "transit": 15, "translat": [41, 42], "transport": 28, "travers": 18, "treat": [50, 57, 58], "tri": [14, 38], "trigger": [8, 14, 22, 31, 33, 38, 40, 56], "triggerrun": 22, "true": [3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 18, 20, 22, 24, 25, 29, 34, 36, 38, 41, 44, 48, 50, 51, 52, 56, 57, 58, 59], "trust": 25, "truststor": 25, "truststorepassworduri": 25, "truststoreuri": 25, "try": [14, 16, 28, 38, 50, 57], "tune": [14, 38], "tupl": [5, 6, 9, 11, 24, 50, 57, 58], "turn": [2, 14, 24, 29, 38, 54, 56], "twilio_resourc": 45, "two": [7, 10, 20, 50, 52, 55, 57, 58], "txt": [9, 25], "type": [1, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 18, 20, 24, 25, 26, 28, 29, 34, 39, 40, 41, 46, 47, 48, 51, 52, 53, 54, 55, 56, 59], "type_check_fn": [34, 50, 57, 58], "type_handl": 41, "typecheck": [5, 34, 46, 50, 57, 58], "typecheckcontext": [6, 34, 58], "typehint": 7, "typic": [1, 5, 6, 9, 13, 14, 38, 40, 52, 54], "typing_typ": [34, 58], "ubuntu": 20, "udf": 25, "ugli": 3, "uksouth": 20, "uncondition": [14, 38], "unconnect": 58, "under": [6, 7, 9, 10, 14, 16, 17, 18, 25, 29, 38, 50, 57], "underestim": [14, 38], "underli": [6, 7, 13, 14, 15, 16, 17, 18, 22, 39, 47, 50, 55, 57], "underneath": 22, "underscor": 25, "underutil": 20, "unexpect": [5, 14, 22, 38], "unifi": [14, 38], "uniform": [9, 47], "uninstal": 29, "union": [1, 3, 4, 6, 7, 9, 10, 12, 14, 16, 17, 18, 22, 24, 25, 29, 30, 34, 40, 46, 47, 50, 51, 52, 54, 55, 56, 57, 58], "uniqu": [2, 3, 4, 6, 7, 9, 10, 13, 16, 21, 25, 34, 48, 49, 50, 52, 53, 57, 58], "unique_id": [13, 22], "unit": [6, 7, 14, 29, 38, 50, 52, 56, 57], "unix": 9, "unknown": 5, "unless": [12, 14, 22, 38, 50, 51, 56, 57], "unlik": [1, 3, 50, 57], "unlimit": [14, 38], "unpars": 22, "unpersist": [14, 38], "unrecover": [50, 57], "unregist": [14, 38], "unrol": [14, 38], "unrollfract": [14, 38], "unsaf": [14, 38], "unsatisfi": 1, "unset": 20, "unsign": 14, "unspecifi": [3, 25, 50], "unstructur": 9, "unsuccess": [12, 24], "until": [2, 12, 14, 20, 22, 24, 38], "untitl": 20, "unus": [14, 34, 38, 58], "unusu": [14, 38], "unwil": [14, 38], "unzip": 20, "up_for_retri": [50, 57], "updat": [1, 14, 22, 24, 38], "update_connector": 24, "update_job": 22, "update_schedule_typ": 24, "updatejobbyid": 22, "upload": [14, 20], "upon": [9, 20, 22, 29, 48, 55], "upper": [14, 34, 38], "upstream": [1, 6, 7, 10, 11, 50, 52, 57], "upstream_output": 10, "upstream_prefix": 1, "uri": [25, 30, 41], "url": [2, 14, 15, 16, 17, 18, 20, 22, 23, 25, 28, 31, 35, 38, 40, 50, 57], "urlmetadatavalu": [50, 57], "usabl": [9, 58], "usable_as_dagster_typ": [50, 57, 58], "usag": [9, 10, 13, 14, 28, 30, 38, 40, 50, 57], "use": [1, 2, 3, 4, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 33, 34, 36, 38, 39, 40, 41, 42, 44, 46, 47, 49, 50, 51, 52, 54, 55, 56, 57, 58, 59], "use_airflow_template_context": 13, "use_build_command": 22, "use_http": [12, 28], "use_ssl": 14, "use_unsigned_sess": 14, "used": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 33, 34, 38, 40, 41, 46, 47, 49, 50, 51, 52, 53, 55, 56, 57, 58, 59], "usefetchcach": [14, 38], "useful": [2, 14, 25, 28, 29, 38, 51, 52, 55, 56, 57, 59], "uselegacymod": [14, 38], "usepassword": 29, "user": [1, 2, 3, 5, 6, 7, 8, 9, 10, 12, 14, 15, 20, 21, 22, 24, 25, 26, 27, 32, 34, 35, 36, 38, 39, 40, 41, 42, 46, 47, 48, 50, 51, 52, 54, 55, 57, 58], "user1": 17, "user_code_error_boundari": [5, 9], "user_messag": 9, "useraccount": 25, 
"userclasspathfirst": [14, 38], "userdeploy": 29, "userguid": [16, 17, 18], "usernam": [9, 14, 17, 23, 29, 32, 36, 44], "uses": [5, 10, 20, 21, 25, 46, 50, 57], "using": [1, 2, 3, 4, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 23, 24, 25, 27, 29, 30, 31, 34, 38, 39, 40, 41, 44, 47, 50, 52, 53, 54, 55, 56, 57, 58, 59], "usr": 25, "usual": [1, 10, 13, 14, 38, 54], "utc": [13, 34, 51, 56], "utc_date_str": 13, "utc_execution_date_str": 13, "util": [4, 6, 10, 14, 15, 28, 29, 34, 50, 51, 57], "uvicorn": 2, "valid": [1, 3, 6, 9, 10, 14, 22, 25, 26, 34, 38, 41, 51, 56, 58], "validate_default_paramet": 41, "validate_run_config": 6, "validate_t": [50, 57], "validateoutputspec": [14, 38], "validation_operator_nam": 26, "validation_operators_and_act": 26, "valu": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 20, 22, 23, 24, 25, 28, 29, 30, 31, 32, 34, 36, 38, 39, 41, 44, 46, 50, 51, 52, 54, 55, 56, 57, 58], "var": [17, 22], "vari": 52, "variabl": [2, 3, 14, 15, 17, 18, 20, 22, 23, 25, 27, 29, 30, 32, 36, 38, 41, 51, 56], "variant": [4, 14, 38], "variat": [16, 17, 18], "varieti": [22, 28], "variou": [14, 25, 27, 38, 56], "verbos": [14, 20, 38], "veri": [6, 11, 14, 38, 52, 58], "verifi": [14, 31], "verify_cert_path": 14, "version": [2, 3, 7, 9, 10, 11, 14, 15, 16, 17, 18, 20, 22, 25, 29, 38, 47, 50, 52, 55, 57, 58], "version_strategi": [6, 7, 11, 52], "versionstrategi": [7, 11, 48, 52], "very_cool_packag": 9, "very_secret_env_vari": 3, "very_secret_env_variable_bool": 3, "very_secret_env_variable_int": 3, "via": [2, 3, 6, 7, 9, 10, 13, 14, 15, 16, 20, 22, 25, 29, 31, 37, 38, 40, 46, 51, 52, 55, 56, 59], "viabl": 9, "view": [21, 22, 29, 51], "viewabl": [7, 11], "violat": 5, "visibl": 29, "visitor": 21, "visual": [34, 58], "void": 51, "vol1": 17, "vol2": 17, "volum": [17, 18, 20, 29], "volume_mount": [18, 29], "volumemount": [18, 29], "vvv": 29, "wai": [1, 6, 9, 10, 11, 14, 24, 29, 38, 51, 52, 55, 57, 58], "wait": [2, 12, 14, 18, 20, 22, 24, 25, 38, 50, 57, 58], "wait_for_log": [14, 20], "wait_for_process": 9, "wait_int": 58, "wal": [14, 38], "walk": 4, "want": [3, 12, 13, 14, 16, 17, 18, 20, 22, 24, 26, 27, 28, 29, 30, 31, 38, 40, 56, 57, 58, 59], "warehous": [41, 42], "warm": 29, "warn": [2, 14, 21, 22, 38, 47], "warn_error": 22, "warn_on_step_context_us": 10, "wast": [14, 38], "wave": 40, "weak": [14, 38], "web": [2, 14, 16, 38], "webclient": 40, "webhook": 31, "week": [51, 56], "weekli": [51, 56], "weekly_partitioned_config": [51, 56], "weeklypartitionsdefinit": 51, "well": [1, 3, 5, 6, 7, 11, 14, 20, 22, 24, 29, 30, 38, 50, 57, 58], "were": [10, 16, 17, 18, 22, 52, 56, 57], "west": [14, 20, 29], "wget": 25, "what": [7, 9, 14, 26, 38, 50, 56, 57], "whatev": 52, "when": [1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32, 36, 38, 39, 40, 41, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59], "whenev": [7, 11, 14, 35, 47, 50, 57], "where": [1, 2, 4, 6, 8, 9, 10, 14, 17, 18, 22, 25, 26, 28, 29, 34, 38, 39, 50, 51, 56, 57, 58], "whether": [3, 6, 7, 9, 10, 11, 14, 18, 22, 25, 28, 29, 31, 38, 40, 46, 48, 50, 51, 52, 56, 57], "which": [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 34, 35, 38, 39, 40, 41, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58], "while": [3, 5, 9, 14, 20, 22, 38, 48, 51, 57], "whitelist": 9, "who": [9, 14, 38], "whole": [9, 10, 14, 38], "whom": 3, "whose": [1, 3, 5, 6, 7, 9, 10, 14, 20, 46, 48, 57, 58, 59], "why": 56, "window": [6, 10, 25, 51, 
56, 57], "wish": [9, 14, 22, 38, 50, 57, 58], "with_additional_config": 53, "with_hook": [11, 31, 40], "within": [1, 3, 5, 6, 7, 9, 10, 11, 13, 14, 18, 20, 21, 22, 24, 25, 27, 28, 29, 30, 33, 34, 35, 38, 47, 49, 50, 52, 54, 55, 57, 58], "without": [3, 5, 9, 13, 14, 16, 19, 25, 28, 38, 56, 58], "won": [10, 14, 16, 38], "word": [3, 50, 51], "wordcount": 25, "work": [2, 4, 9, 14, 18, 22, 25, 29, 38, 40, 48, 51, 58, 59], "worker": [2, 9, 14, 17, 18, 19, 20, 25, 29, 38], "worker_main": 16, "workerconfig": 25, "working_directori": 2, "workload": [14, 20, 38], "workspac": [2, 9, 20, 40], "world": [3, 39, 57], "would": [9, 10, 16, 22, 24, 28, 46, 50, 51, 57], "wrap": [3, 5, 6, 9, 10, 11, 15, 22, 39, 46, 47, 50, 52, 55, 57], "wrapper": [21, 27, 33, 40], "write": [7, 9, 11, 14, 16, 20, 27, 38, 41, 52, 55, 56, 57, 58], "write_csv": 10, "write_data": 9, "write_fil": 9, "writeaheadlog": [14, 38], "writehead": 58, "writeif": 25, "writer": 58, "writerow": 58, "written": [14, 38], "www": [17, 25, 51, 56], "xlarg": 20, "xloggc": [14, 38], "xml": 25, "xmlfor": 25, "xmx": [14, 38], "yaml": [2, 9, 10, 14, 15, 16, 17, 18, 29, 32, 36, 53, 54, 59], "yaml_directori": 54, "yaml_str": [53, 59], "yarn": [14, 19, 25, 38], "yes": [14, 38], "yet": [6, 10, 54, 56, 57], "yield": [3, 4, 6, 7, 9, 10, 12, 22, 24, 26, 46, 50, 52, 54, 55, 56, 57], "yield_ev": 46, "yield_materi": [12, 22, 24], "yield_result": 46, "yml": [22, 26], "you": [1, 2, 3, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 38, 39, 40, 41, 42, 46, 47, 50, 51, 52, 54, 55, 56, 57, 58, 59], "your": [6, 9, 10, 11, 13, 14, 15, 17, 18, 20, 22, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 38, 40, 41, 42, 46, 52, 55], "your_kei": 21, "your_org_her": 28, "your_service_account": 29, "yourself": 9, "zadrozni": 16, "zero": [14, 22, 25, 38, 39], "zip": [14, 20, 25, 38], "zone": [25, 41, 51, 56], "zoneuri": 25, "zookeep": [14, 38], "zstd": [14, 38], "zstdcompressioncodec": [14, 38], "\u4e16\u754c": 3, "\u4f60\u597d": 3}, "titles": ["Home", "Software-Defined Assets (Experimental)", "Dagster CLI", "Config", "Dynamic Mapping & Collect", "Errors", "Execution", "Graphs", "Hooks", "Internals", "IO Managers", "Jobs", "Airbyte (dagster-airbyte)", "Airflow (dagster-airflow)", "AWS (dagster-aws)", "Azure (dagster-azure)", "Celery (dagster-celery)", "Orchestration on Celery + Docker", "Orchestration on Celery + Kubernetes", "Dask (dagster-dask)", "Databricks (dagster-databricks)", "Datadog (dagster-datadog)", "dbt (dagster-dbt)", "Orchestration on Docker", "Fivetran (dagster-fivetran)", "GCP (dagster-gcp)", "Great Expectations (dagster-ge)", "GitHub (dagster-github)", "GraphQL (dagster-graphql)", "Kubernetes (dagster-k8s)", "MLflow (dagster-mlflow)", "Microsoft Teams (dagster-msteams)", "MySQL (dagster-mysql)", "PagerDuty (dagster-pagerduty)", "Pandas (dagster-pandas)", "Papertrail (dagster-papertrail)", "PostgreSQL (dagster-postgres)", "Prometheus (dagster-prometheus)", "Pyspark (dagster-pyspark)", "Shell (dagster-shell)", "Slack (dagster-slack)", "Snowflake (dagster-snowflake)", "Snowflake with Pandas (dagster-snowflake-pandas)", "Spark (dagster-spark)", "SSH / SFTP (dagster-ssh)", "Twilio (dagster-twilio)", "Dagstermill", "Loggers", "Versioning and Memoization", "[Legacy] Modes", "Ops", "Partitioned Config", "[Legacy] Pipelines", "[Legacy] Presets", "Repositories", "Resources", "Run Requests", "[Legacy] Solids", "Types", "Utilities"], "titleterms": {"AWS": 14, "ECS": 14, "GCS": 25, "Ins": 50, "K8s": 29, "Ops": [12, 
22, 24, 50], "about": 29, "access": 29, "airbyt": 12, "airflow": 13, "alias": 52, "api": [2, 16, 17, 18, 20, 23, 25, 29, 39, 56], "app": 16, "asset": [1, 2, 12, 22, 24, 50, 57], "aws": 14, "azur": 15, "backend": 16, "best": 16, "bigqueri": 25, "broker": 16, "built": [10, 47, 58], "celeri": [16, 17, 18], "chart": 29, "cli": [2, 16, 22], "client": 28, "cloud": 22, "cloudwatch": 14, "cluster": 29, "collect": 4, "compos": 57, "comput": 9, "config": [3, 6, 51, 52], "configur": [6, 16, 52], "context": [6, 10, 57], "coordin": 9, "core": 22, "custom": [16, 47], "daemon": 2, "dagit": 2, "dagster": [2, 12, 13, 14, 15, 16, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45], "dagstermil": 46, "dask": 19, "databrick": 20, "datadog": 21, "dataproc": 25, "dbt": 22, "debug": 2, "defin": [1, 47, 50, 57], "definit": [51, 52], "depend": 7, "develop": 29, "docker": [17, 23], "dump": 2, "dynam": 4, "emr": 14, "enabl": 29, "error": [5, 22], "event": [9, 50, 57], "except": 9, "execut": [6, 27, 50, 52, 57], "executor": [6, 9], "exist": 29, "expect": 26, "experiment": [1, 10], "explicit": 7, "faster": 29, "file": 9, "fivetran": 24, "from": [29, 47], "function": 51, "gcp": 25, "gcr": 29, "get": 33, "github": 27, "graph": [6, 7], "graphql": [2, 27, 28], "great": 26, "grpc": 2, "handl": 9, "heartbeat": 2, "helm": 29, "hook": 8, "input": [10, 57], "instanc": [2, 9], "intern": 9, "issu": 27, "job": [2, 6, 11], "k8s": 29, "kei": [50, 57], "kind": 29, "kubernet": [18, 29], "launcher": 9, "legaci": [2, 20, 22, 25, 30, 39, 49, 51, 52, 53, 56, 57], "list": 16, "local": 29, "log": [9, 47], "logger": 47, "make": 58, "manag": [9, 10], "manual": 29, "map": 4, "memoiz": 48, "metadata": [50, 57], "microsoft": 31, "minikub": 29, "mlflow": 30, "mode": 49, "monitor": 16, "msteam": 31, "mysql": 32, "new": 58, "note": 29, "ops": 50, "orchestr": [17, 18, 23], "other": 16, "out": 50, "output": [10, 57], "pagerduti": 33, "panda": [34, 42], "papertrail": 35, "partit": [51, 56], "pipelin": [2, 52], "post": 27, "postgr": 36, "postgresql": 36, "practic": 16, "preset": 53, "prometheu": 37, "pvc": 29, "pyspark": 38, "python": [28, 29], "queri": 27, "quickstart": 16, "reconstruct": [6, 11, 52], "redi": 29, "redshift": 14, "repositori": 54, "request": 56, "resourc": [12, 22, 24, 55], "result": 6, "retri": 57, "root": 10, "rpc": 22, "run": [2, 6, 9, 29, 52, 56], "schedul": [2, 9, 51, 56], "schema": [6, 52], "secretsmanag": 14, "sensor": [2, 56], "setup": 29, "sftp": 44, "shell": 39, "slack": 40, "snowflak": [41, 42], "softwar": 1, "solid": [22, 57], "spark": 43, "ssh": 44, "start": [16, 33], "storag": 9, "tabl": 50, "task": 16, "team": 31, "termin": 16, "test": [14, 29, 58], "twilio": 45, "type": [3, 22, 50, 57, 58], "util": [3, 22, 59], "valid": 29, "version": 48, "wipe": 2, "worker": 16, "your": 16}} \ No newline at end of file diff --git a/docs/content/api/sections.json b/docs/content/api/sections.json index 03146ae1869e0..a4ef18b26148c 100644 --- a/docs/content/api/sections.json +++ b/docs/content/api/sections.json @@ -1 +1 @@ -{"api": {"apidocs": {"assets": {"alabaster_version": "0.7.12", "body": "
\n

Software-Defined Assets (Experimental)\u00b6

\n

Software-defined assets sit on top of the graph/job/op APIs and enable a novel way of constructing Dagster jobs that puts assets at the forefront.

\n

Conceptually, software-defined assets invert the typical relationship between assets and computation. Instead of defining a graph of ops and recording which assets those ops end up materializing, you define a set of assets, each of which knows how to compute its contents from upstream assets.

\n

A software-defined asset combines:\n- An asset key, e.g. the name of a table.\n- A function, which can be run to compute the contents of the asset.\n- A set of upstream assets that are provided as inputs to the function when computing the asset.

\n
\n
\n@dagster.asset(name=None, namespace=None, ins=None, non_argument_deps=None, metadata=None, description=None, required_resource_keys=None, io_manager_key=None, compute_kind=None, dagster_type=None, partitions_def=None, partition_mappings=None)[source]\u00b6
\n

Create a definition for how to compute an asset.

\n

A software-defined asset is the combination of:\n1. An asset key, e.g. the name of a table.\n2. A function, which can be run to compute the contents of the asset.\n3. A set of upstream assets that are provided as inputs to the function when computing the asset.

\n

Unlike an op, whose dependencies are determined by the graph it lives inside, an asset knows\nabout the upstream assets it depends on. The upstream assets are inferred from the arguments\nto the decorated function. The name of the argument designates the name of the upstream asset.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name of the asset. If not provided, defaults to the name of the\ndecorated function.

  • \n
  • namespace (Optional[Sequence[str]]) \u2013 The namespace that the asset resides in. The namespace + the\nname forms the asset key.

  • \n
  • ins (Optional[Mapping[str, AssetIn]]) \u2013 A dictionary that maps input names to their metadata\nand namespaces.

  • \n
  • non_argument_deps (Optional[Set[AssetKey]]) \u2013 Set of asset keys that are upstream dependencies,\nbut do not pass an input to the asset.

  • \n
  • metadata (Optional[Dict[str, Any]]) \u2013 A dict of metadata entries for the asset.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by the op.

  • \n
  • io_manager_key (Optional[str]) \u2013 The resource key of the IOManager used for storing the\noutput of the op as an asset, and for loading it in downstream ops\n(default: \u201cio_manager\u201d).

  • \n
  • compute_kind (Optional[str]) \u2013 A string to represent the kind of computation that produces\nthe asset, e.g. \u201cdbt\u201d or \u201cspark\u201d. It will be displayed in Dagit as a badge on the asset.

  • \n
  • dagster_type (Optional[DagsterType]) \u2013 Allows specifying type validation functions that\nwill be executed on the output of the decorated function after it runs.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines the set of partition keys that\ncompose the asset.

  • \n
  • partition_mappings (Optional[Mapping[str, PartitionMapping]]) \u2013 Defines how to map partition\nkeys for this asset to partition keys of upstream assets. Each key in the dictionary\ncorresponds to one of the input assets, and each value is a PartitionMapping.\nIf no entry is provided for a particular asset dependency, the partition mapping defaults\nto the default partition mapping for the partitions definition, which typically maps\npartition keys to the same partition keys in upstream assets.

  • \n
\n
\n
\n

Examples

\n
@asset\ndef my_asset(my_upstream_asset: int) -> int:\n    return my_upstream_asset + 1\n
\n
\n
\n\n
\n
\nclass dagster.AssetGroup(assets, source_assets=None, resource_defs=None, executor_def=None)[source]\u00b6
\n

Defines a group of assets, along with environment information in the\nform of resources and an executor.

\n

An AssetGroup can be provided to a RepositoryDefinition. When\nprovided to a repository, the constituent assets can be materialized from\nDagit. The AssetGroup also provides an interface for creating jobs from\nsubselections of assets, which can then be provided to a\nScheduleDefinition or SensorDefinition.

\n

There can only be one AssetGroup per repository.

\n
\n
Parameters
\n
    \n
  • assets (Sequence[AssetsDefinition]) \u2013 The set of software-defined assets\nto group.

  • \n
  • source_assets (Optional[Sequence[SourceAsset]]) \u2013 The set of source\nassets that the software-defined assets may depend on.

  • \n
  • resource_defs (Optional[Mapping[str, ResourceDefinition]]) \u2013 A\ndictionary of resource definitions. When the AssetGroup is\nconstructed, if there are any unsatisfied resource requirements\nfrom the assets, it will result in an error. Note that the\nroot_manager key is a reserved resource key, and will result in\nan error if provided by the user.

  • \n
  • executor_def (Optional[ExecutorDefinition]) \u2013 The executor definition to\nuse when re-materializing assets in this group.

  • \n
\n
\n
\n

Examples

\n
from dagster import AssetGroup, asset, AssetIn, AssetKey, SourceAsset, resource\n\nsource_asset = SourceAsset("source")\n\n@asset(required_resource_keys={"foo"})\ndef start_asset(context, source):\n    ...\n\n@asset\ndef next_asset(start_asset):\n    ...\n\n@resource\ndef foo_resource():\n    ...\n\nasset_group = AssetGroup(\n    assets=[start_asset, next_asset],\n    source_assets=[source_asset],\n    resource_defs={"foo": foo_resource},\n)\n...\n
\n
\n
\n
\nstatic all_assets_job_name()[source]\u00b6
\n

The name of the mega-job that the provided list of assets is coerced into.

\n
\n\n
\n
\nbuild_job(name, selection=None, executor_def=None, tags=None, description=None)[source]\u00b6
\n

Defines an executable job from the provided assets, resources, and executor.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name to give the job.

  • \n
  • selection (Union[str, List[str]]) \u2013

    A single selection query or list of selection queries\nto execute. For example:

    \n
    \n
      \n
    • ['some_asset_key'] select some_asset_key itself.

    • \n
    • ['*some_asset_key'] select some_asset_key and all its ancestors (upstream dependencies).

    • \n
    • ['*some_asset_key+++'] select some_asset_key, all its ancestors, and its descendants (downstream dependencies) within 3 levels down.

    • \n
    • ['*some_asset_key', 'other_asset_key_a', 'other_asset_key_b+'] select some_asset_key and all its ancestors, other_asset_key_a itself, and other_asset_key_b and its direct child asset keys. When subselecting into a multi-asset, all of the asset keys in that multi-asset must be selected.

    • \n
    \n
    \n

  • \n
  • executor_def (Optional[ExecutorDefinition]) \u2013 The executor\ndefinition to use when executing the job. Defaults to the\nexecutor on the AssetGroup. If no executor was provided on the\nAssetGroup, then it defaults to multi_or_in_process_executor.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for any execution of the job.\nValues that are not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value. These tag values may be overwritten by\ntag values provided at invocation time.

  • \n
  • description (Optional[str]) \u2013 A description of the job.

  • \n
\n
\n
\n

Examples

\n
from dagster import AssetGroup\n\nthe_asset_group = AssetGroup(...)\n\njob_with_all_assets = the_asset_group.build_job()\n\njob_with_one_selection = the_asset_group.build_job(selection="some_asset")\n\njob_with_multiple_selections = the_asset_group.build_job(selection=["*some_asset", "other_asset++"])\n
\n
\n
\n\n
\n
\nstatic from_current_module(resource_defs=None, executor_def=None)[source]\u00b6
\n

Constructs an AssetGroup that includes all asset definitions and source assets in the module\nwhere this is called from.

\n
\n
Parameters
\n
    \n
  • resource_defs (Optional[Mapping[str, ResourceDefinition]]) \u2013 A dictionary of resource\ndefinitions to include on the returned asset group.

  • \n
  • executor_def (Optional[ExecutorDefinition]) \u2013 An executor to include on the returned\nasset group.

  • \n
\n
\n
Returns
\n

An asset group with all the assets defined in the module.

\n
\n
Return type
\n

AssetGroup

\n
\n
\n
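
As a hedged illustration (the asset and repository names below are hypothetical), this constructor is typically called from a module that defines assets:

\n
from dagster import AssetGroup, asset, repository\n\n@asset\ndef my_asset():\n    return 1\n\n# Picks up my_asset and any other assets or source assets defined in this module.\nasset_group = AssetGroup.from_current_module()\n\n@repository\ndef my_repo():\n    return [asset_group]\n
\n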
\n\n
\n
\nstatic from_modules(modules, resource_defs=None, executor_def=None)[source]\u00b6
\n

Constructs an AssetGroup that includes all asset definitions and source assets in the given\nmodules.

\n
\n
Parameters
\n
    \n
  • modules (Iterable[ModuleType]) \u2013 The Python modules to look for assets inside.

  • \n
  • resource_defs (Optional[Mapping[str, ResourceDefinition]]) \u2013 A dictionary of resource\ndefinitions to include on the returned asset group.

  • \n
  • executor_def (Optional[ExecutorDefinition]) \u2013 An executor to include on the returned\nasset group.

  • \n
\n
\n
Returns
\n

An asset group with all the assets defined in the given modules.

\n
\n
Return type
\n

AssetGroup

\n
\n
\n
\n\n
\n
\nstatic from_package_module(package_module, resource_defs=None, executor_def=None)[source]\u00b6
\n

Constructs an AssetGroup that includes all asset definitions and source assets in all\nsub-modules of the given package module.

\n

A package module is the result of importing a package.

\n
\n
Parameters
\n
    \n
  • package_module (ModuleType) \u2013 The package module to look for assets inside.

  • \n
  • resource_defs (Optional[Mapping[str, ResourceDefinition]]) \u2013 A dictionary of resource\ndefinitions to include on the returned asset group.

  • \n
  • executor_def (Optional[ExecutorDefinition]) \u2013 An executor to include on the returned\nasset group.

  • \n
\n
\n
Returns
\n

An asset group with all the assets in the package.

\n
\n
Return type
\n

AssetGroup

\n
\n
\n
\n\n
\n
\nstatic from_package_name(package_name, resource_defs=None, executor_def=None)[source]\u00b6
\n

Constructs an AssetGroup that includes all asset definitions and source assets in all\nsub-modules of the given package.

\n
\n
Parameters
\n
    \n
  • package_name (str) \u2013 The name of a Python package to look for assets inside.

  • \n
  • resource_defs (Optional[Mapping[str, ResourceDefinition]]) \u2013 A dictionary of resource\ndefinitions to include on the returned asset group.

  • \n
  • executor_def (Optional[ExecutorDefinition]) \u2013 An executor to include on the returned\nasset group.

  • \n
\n
\n
Returns
\n

An asset group with all the assets in the package.

\n
\n
Return type
\n

AssetGroup

\n
\n
\n
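
A minimal sketch, assuming a hypothetical package named my_company.assets whose sub-modules define assets:

\n
from dagster import AssetGroup\n\n# "my_company.assets" is a hypothetical package; assets defined in its\n# sub-modules are collected into the returned group.\nasset_group = AssetGroup.from_package_name("my_company.assets")\n
\n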
\n\n
\n\n
\n
\n@dagster.multi_asset(outs, name=None, ins=None, non_argument_deps=None, description=None, required_resource_keys=None, compute_kind=None, internal_asset_deps=None)[source]\u00b6
\n

Create a combined definition of multiple assets that are computed using the same op and same\nupstream assets.

\n

Each argument to the decorated function references an upstream asset that this asset depends on.\nThe name of the argument designates the name of the upstream asset.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name of the op.

  • \n
  • outs (Optional[Dict[str, Out]]) \u2013 The Outs representing the produced assets.

  • \n
  • ins (Optional[Mapping[str, AssetIn]]) \u2013 A dictionary that maps input names to their metadata\nand namespaces.

  • \n
  • non_argument_deps (Optional[Set[AssetKey]]) \u2013 Set of asset keys that are upstream dependencies,\nbut do not pass an input to the multi_asset.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by the op.

  • \n
  • io_manager_key (Optional[str]) \u2013 The resource key of the IOManager used for storing the\noutput of the op as an asset, and for loading it in downstream ops\n(default: \u201cio_manager\u201d).

  • \n
  • compute_kind (Optional[str]) \u2013 A string to represent the kind of computation that produces\nthe asset, e.g. \u201cdbt\u201d or \u201cspark\u201d. It will be displayed in Dagit as a badge on the asset.

  • \n
  • internal_asset_deps (Optional[Mapping[str, Set[AssetKey]]]) \u2013 By default, it is assumed\nthat all assets produced by a multi_asset depend on all assets that are consumed by that\nmulti asset. If this default is not correct, you can pass in a map of output names to a\ncorrected set of AssetKeys that they depend on. Any AssetKeys in this list must be either\nused as input to the asset or produced within the op.

  • \n
\n
\n
\n
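
For illustration, a minimal sketch of a multi_asset that yields one Output per declared out (the asset names are hypothetical):

\n
from dagster import Out, Output, multi_asset\n\n@multi_asset(\n    outs={\n        "orders": Out(),\n        "customers": Out(),\n    }\n)\ndef orders_and_customers():\n    # Each yielded Output is keyed by an output name and becomes one asset.\n    yield Output([1, 2, 3], "orders")\n    yield Output(["alice", "bob"], "customers")\n
\n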
\n\n
\n
\ndagster.build_assets_job(name, assets, source_assets=None, resource_defs=None, description=None, config=None, tags=None, executor_def=None)[source]\u00b6
\n

Builds a job that materializes the given assets.

\n

The dependencies between the ops in the job are determined by the asset dependencies defined\nin the metadata on the provided asset nodes.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of the job.

  • \n
  • assets (List[AssetsDefinition]) \u2013 A list of assets or\nmulti-assets - usually constructed using the @asset() or @multi_asset()\ndecorator.

  • \n
  • source_assets (Optional[Sequence[Union[SourceAsset, AssetsDefinition]]]) \u2013 A list of\nassets that are not materialized by this job, but that assets in this job depend on.

  • \n
  • resource_defs (Optional[Dict[str, ResourceDefinition]]) \u2013 Resource defs to be included in\nthis job.

  • \n
  • description (Optional[str]) \u2013 A description of the job.

  • \n
\n
\n
\n

Examples

\n
@asset\ndef asset1():\n    return 5\n\n@asset\ndef asset2(asset1):\n    return asset1 + 1\n\nmy_assets_job = build_assets_job("my_assets_job", assets=[asset1, asset2])\n
\n
\n
\n
Returns
\n

A job that materializes the given assets.

\n
\n
Return type
\n

JobDefinition

\n
\n
\n
\n\n
\n
\nclass dagster.AssetIn(asset_key=None, metadata=None, namespace=None)[source]\u00b6
\n
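
AssetIn carries per-input metadata and namespaces for the ins argument of @asset. A minimal sketch (the namespace and asset names are hypothetical):

\n
from dagster import AssetIn, asset\n\n@asset(ins={"upstream": AssetIn(namespace=["public"])})\ndef downstream(upstream):\n    # The input named "upstream" is loaded from the asset keyed ["public", "upstream"].\n    return upstream + 1\n
\n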
\n\n
\n
\nclass dagster.SourceAsset(key, metadata=None, io_manager_key='io_manager', description=None, partitions_def=None)[source]\u00b6
\n

A SourceAsset represents an asset that is not generated by any Dagster op in the repository\nthat it\u2019s referenced from.

\n
\n
\nkey\u00b6
\n

The key of the asset.

\n
\n
Type
\n

AssetKey

\n
\n
\n
\n\n
\n
\nmetadata_entries\u00b6
\n

Metadata associated with the asset.

\n
\n
Type
\n

List[MetadataEntry]

\n
\n
\n
\n\n
\n
\nio_manager_key\u00b6
\n

The key for the IOManager that will be used to load the contents of\nthe asset when it\u2019s used as an input to other assets inside a job.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\ndescription\u00b6
\n

The description of the asset.

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\npartitions_def\u00b6
\n

Defines the set of partition keys that\ncompose the asset.

\n
\n
Type
\n

Optional[PartitionsDefinition]

\n
\n
\n
\n\n
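
A minimal sketch of depending on a SourceAsset from a downstream asset (the table name is illustrative):

\n
from dagster import AssetGroup, AssetKey, SourceAsset, asset\n\n# A table produced outside of this repository.\nraw_events = SourceAsset(key=AssetKey("raw_events"))\n\n@asset\ndef cleaned_events(raw_events):\n    return raw_events\n\nasset_group = AssetGroup(assets=[cleaned_events], source_assets=[raw_events])\n
\n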
\n\n
\n
\ndagster.fs_asset_io_manager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
base_dir (dagster.StringSource, optional)
\n

\n
\n

IO manager that stores values on the local filesystem, serializing them with pickle.

\n

Each asset is assigned to a single filesystem path, at \u201c<base_dir>/<asset_key>\u201d. If the asset\nkey has multiple components, the final component is used as the name of the file, and the\npreceding components as parent directories under the base_dir.

\n

Subsequent materializations of an asset will overwrite previous materializations of that asset.

\n

If not provided via configuration, the base dir is the local_artifact_storage in your\ndagster.yaml file. That will be a temporary directory if not explicitly set.

\n

So, with a base directory of \u201c/my/base/path\u201d, an asset with key\nAssetKey([\u201cone\u201d, \u201ctwo\u201d, \u201cthree\u201d]) would be stored in a file called \u201cthree\u201d in a directory\nwith path \u201c/my/base/path/one/two/\u201d.

\n

Example usage:

\n

1. Specify a group-level IO manager using the reserved resource key "io_manager",\nwhich will set the given IO manager on all assets in the group.

\n
from dagster import AssetGroup, asset, fs_asset_io_manager\n\n@asset\ndef asset1():\n    # create df ...\n    return df\n\n@asset\ndef asset2(asset1):\n    return asset1[:5]\n\nasset_group = AssetGroup(\n    [asset1, asset2],\n    resource_defs={\n        "io_manager": fs_asset_io_manager.configured({"base_dir": "/my/base/path"})\n    },\n)\n
\n
\n

2. Specify an IO manager on the asset, which allows the user to set different IO managers on\ndifferent assets.

\n
from dagster import AssetGroup, asset, fs_asset_io_manager\n\n@asset(io_manager_key="my_io_manager")\ndef asset1():\n    # create df ...\n    return df\n\n@asset\ndef asset2(asset1):\n    return asset1[:5]\n\nasset_group = AssetGroup(\n    [asset1, asset2],\n    resource_defs={\n        "my_io_manager": fs_asset_io_manager.configured({"base_dir": "/my/base/path"})\n    },\n)\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/assets", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "", "next": {"link": "../cli/", "title": "Dagster CLI"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../../../../", "title": "<no title>"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/cli", "Dagster CLI", "N", "next"], ["index", "<no title>", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/assets.rst.txt", "title": "Software-Defined Assets (Experimental)", "toc": "\n"}, "cli": {"alabaster_version": "0.7.12", "body": "
\n

Dagster CLI\u00b6

\n
\n

dagster asset\u00b6

\n

Commands for working with Dagster assets.

\n
dagster asset [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\nwipe
\n

Eliminate asset key indexes from event logs.

\n
\n\n
\n
\n

dagster debug\u00b6

\n

Commands for debugging Dagster pipeline/job runs.

\n
dagster debug [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\nexport
\n

Export the relevant artifacts for a\u2026

\n
\n\n
\n
\nimport
\n

Import the relevant artifacts for a\u2026

\n
\n\n
\n
\n

dagster instance\u00b6

\n

Commands for working with the current Dagster instance.

\n
dagster instance [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\ninfo
\n

List the information about the current\u2026

\n
\n\n
\n
\nmigrate
\n

Automatically migrate an out of date\u2026

\n
\n\n
\n
\nreindex
\n

Rebuild index over historical runs for\u2026

\n
\n\n
\n
\n

dagster job\u00b6

\n

Commands for working with Dagster jobs.

\n
dagster job [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\nbackfill
\n

Backfill a partitioned job.

\n
\n\n
\n
\nexecute
\n

Execute a job.

\n
\n\n
\n
\nlaunch
\n

Launch a job using the run launcher\u2026

\n
\n\n
\n
\nlist
\n

List the jobs in a repository.

\n
\n\n
\n
\nlist_versions
\n

Display the freshness of memoized results\u2026

\n
\n\n
\n
\nprint
\n

Print a job.

\n
\n\n
\n
\nscaffold_config
\n

Scaffold the config for a job.

\n
\n\n
\n
\n

dagster run\u00b6

\n

Commands for working with Dagster pipeline/job runs.

\n
dagster run [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\ndelete
\n

Delete a run by id and its associated\u2026

\n
\n\n
\n
\nlist
\n

List the runs in the current Dagster\u2026

\n
\n\n
\n
\nwipe
\n

Eliminate all run history and event logs.

\n
\n\n
\n
\n

dagster schedule\u00b6

\n

Commands for working with Dagster schedules.

\n
dagster schedule [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\ndebug
\n

Debug information about the scheduler.

\n
\n\n
\n
\nlist
\n

List all schedules that correspond to a\u2026

\n
\n\n
\n
\nlogs
\n

Get logs for a schedule.

\n
\n\n
\n
\npreview
\n

Preview changes that will be performed by\u2026

\n
\n\n
\n
\nrestart
\n

Restart a running schedule.

\n
\n\n
\n
\nstart
\n

Start an existing schedule.

\n
\n\n
\n
\nstop
\n

Stop an existing schedule.

\n
\n\n
\n
\nwipe
\n

Delete the schedule history and turn off\u2026

\n
\n\n
\n
\n

dagster sensor\u00b6

\n

Commands for working with Dagster sensors.

\n
dagster sensor [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\ncursor
\n

Set the cursor value for an existing sensor.

\n
\n\n
\n
\nlist
\n

List all sensors that correspond to a\u2026

\n
\n\n
\n
\npreview
\n

Preview an existing sensor execution.

\n
\n\n
\n
\nstart
\n

Start an existing sensor.

\n
\n\n
\n
\nstop
\n

Stop an existing sensor.

\n
\n\n
\n
\n

dagster-graphql\u00b6

\n

Run a GraphQL query against the dagster interface to a specified repository or pipeline/job.

\n

Can only use ONE of \u2013workspace/-w, \u2013python-file/-f, \u2013module-name/-m, \u2013grpc-port, \u2013grpc-socket.

\n

Examples:

\n
    \n
  1. dagster-graphql

  2. \n
  3. dagster-graphql -y path/to/workspace.yaml

  4. \n
  5. dagster-graphql -f path/to/file.py -a define_repo

  6. \n
  7. dagster-graphql -m some_module -a define_repo

  8. \n
  9. dagster-graphql -f path/to/file.py -a define_pipeline

  10. \n
  11. dagster-graphql -m some_module -a define_pipeline

  12. \n
\n
dagster-graphql [OPTIONS]\n
\n
\n

Options

\n
\n
\n--version\u00b6
\n

Show the version and exit.

\n
\n\n
\n
\n-t, --text <text>\u00b6
\n

GraphQL document to execute passed as a string

\n
\n\n
\n
\n-f, --file <file>\u00b6
\n

GraphQL document to execute passed as a file

\n
\n\n
\n
\n-p, --predefined <predefined>\u00b6
\n

GraphQL document to execute, from a predefined set provided by dagster-graphql.

\n
\n
Options
\n

launchPipelineExecution

\n
\n
\n
\n\n
\n
\n-v, --variables <variables>\u00b6
\n

A JSON encoded string containing the variables for GraphQL execution.

\n
\n\n
\n
\n-r, --remote <remote>\u00b6
\n

A URL for a remote instance running dagit server to send the GraphQL request to.

\n
\n\n
\n
\n-o, --output <output>\u00b6
\n

A file path to store the GraphQL response to. This flag is useful when making pipeline/job execution queries, since pipeline/job execution causes logs to print to stdout and stderr.

\n
\n\n
\n
\n--ephemeral-instance\u00b6
\n

Use an ephemeral DagsterInstance instead of resolving via DAGSTER_HOME

\n
\n\n
\n
\n--empty-workspace\u00b6
\n

Allow an empty workspace

\n
\n\n
\n
\n-w, --workspace <workspace>\u00b6
\n

Path to workspace file. Argument can be provided multiple times.

\n
\n\n
\n
\n-d, --working-directory <working_directory>\u00b6
\n

Specify working directory to use when loading the repository or pipeline/job.

\n
\n\n
\n
\n-f, --python-file <python_file>\u00b6
\n

Specify python file where repository or pipeline/job function lives

\n
\n\n
\n
\n--package-name <package_name>\u00b6
\n

Specify Python package where repository or pipeline/job function lives

\n
\n\n
\n
\n-m, --module-name <module_name>\u00b6
\n

Specify module where repository or pipeline/job function lives

\n
\n\n
\n
\n-a, --attribute <attribute>\u00b6
\n

Attribute that is either a 1) repository or pipeline/job or 2) a function that returns a repository or pipeline/job

\n
\n\n
\n
\n--grpc-port <grpc_port>\u00b6
\n

Port to use to connect to gRPC server

\n
\n\n
\n
\n--grpc-socket <grpc_socket>\u00b6
\n

Named socket to use to connect to gRPC server

\n
\n\n
\n
\n--grpc-host <grpc_host>\u00b6
\n

Host to use to connect to gRPC server, defaults to localhost

\n
\n\n
\n
\n--use-ssl\u00b6
\n

Use a secure channel when connecting to the gRPC server

\n
\n\n
\n
\n

dagit\u00b6

\n

Run dagit. Loads a repository or pipeline/job.

\n

Can only use ONE of \u2013workspace/-w, \u2013python-file/-f, \u2013module-name/-m, \u2013grpc-port, \u2013grpc-socket.

\n

Examples:

\n
    \n
  1. dagit (works if .workspace.yaml exists)

  2. \n
  3. dagit -w path/to/workspace.yaml

  4. \n
  5. dagit -f path/to/file.py

  6. \n
  7. dagit -f path/to/file.py -d path/to/working_directory

  8. \n
  9. dagit -m some_module

  10. \n
  11. dagit -f path/to/file.py -a define_repo

  12. \n
  13. dagit -m some_module -a define_repo

  14. \n
  15. dagit -p 3333

  16. \n
\n

Options can also be provided via environment variables prefixed with DAGIT

\n

For example, DAGIT_PORT=3333 dagit

\n
dagit [OPTIONS]\n
\n
\n

Options

\n
\n
\n--use-ssl\u00b6
\n

Use a secure channel when connecting to the gRPC server

\n
\n\n
\n
\n--grpc-host <grpc_host>\u00b6
\n

Host to use to connect to gRPC server, defaults to localhost

\n
\n\n
\n
\n--grpc-socket <grpc_socket>\u00b6
\n

Named socket to use to connect to gRPC server

\n
\n\n
\n
\n--grpc-port <grpc_port>\u00b6
\n

Port to use to connect to gRPC server

\n
\n\n
\n
\n-a, --attribute <attribute>\u00b6
\n

Attribute that is either a 1) repository or pipeline/job or 2) a function that returns a repository or pipeline/job

\n
\n\n
\n
\n-m, --module-name <module_name>\u00b6
\n

Specify module where repository or pipeline/job function lives

\n
\n\n
\n
\n--package-name <package_name>\u00b6
\n

Specify Python package where repository or pipeline/job function lives

\n
\n\n
\n
\n-f, --python-file <python_file>\u00b6
\n

Specify python file where repository or pipeline/job function lives

\n
\n\n
\n
\n-d, --working-directory <working_directory>\u00b6
\n

Specify working directory to use when loading the repository or pipeline/job.

\n
\n\n
\n
\n-w, --workspace <workspace>\u00b6
\n

Path to workspace file. Argument can be provided multiple times.

\n
\n\n
\n
\n--empty-workspace\u00b6
\n

Allow an empty workspace

\n
\n\n
\n
\n-h, --host <host>\u00b6
\n

Host to run server on

\n
\n
Default
\n

127.0.0.1

\n
\n
\n
\n\n
\n
\n-p, --port <port>\u00b6
\n

Port to run server on.

\n
\n
Default
\n

3000

\n
\n
\n
\n\n
\n
\n-l, --path-prefix <path_prefix>\u00b6
\n

The path prefix where Dagit will be hosted (eg: /dagit)

\n
\n
Default
\n

\n
\n
\n\n
\n
\n--db-statement-timeout <db_statement_timeout>\u00b6
\n

The timeout in milliseconds to set on database statements sent to the DagsterInstance. Not respected in all configurations.

\n
\n
Default
\n

15000

\n
\n
\n
\n\n
\n
\n--read-only\u00b6
\n

Start Dagit in read-only mode, where all mutations such as launching runs and turning schedules on/off are turned off.

\n
\n\n
\n
\n--suppress-warnings\u00b6
\n

Filter all warnings when hosting Dagit.

\n
\n\n
\n
\n--version\u00b6
\n

Show the version and exit.

\n
\n\n
\n
\n

dagster-daemon run\u00b6

\n

Run any daemons configured on the DagsterInstance.

\n
dagster-daemon run [OPTIONS]\n
\n
\n

Options

\n
\n
\n--use-ssl\u00b6
\n

Use a secure channel when connecting to the gRPC server

\n
\n\n
\n
\n--grpc-host <grpc_host>\u00b6
\n

Host to use to connect to gRPC server, defaults to localhost

\n
\n\n
\n
\n--grpc-socket <grpc_socket>\u00b6
\n

Named socket to use to connect to gRPC server

\n
\n\n
\n
\n--grpc-port <grpc_port>\u00b6
\n

Port to use to connect to gRPC server

\n
\n\n
\n
\n-a, --attribute <attribute>\u00b6
\n

Attribute that is either a 1) repository or pipeline/job or 2) a function that returns a repository or pipeline/job

\n
\n\n
\n
\n-m, --module-name <module_name>\u00b6
\n

Specify module where repository or pipeline/job function lives

\n
\n\n
\n
\n--package-name <package_name>\u00b6
\n

Specify Python package where repository or pipeline/job function lives

\n
\n\n
\n
\n-f, --python-file <python_file>\u00b6
\n

Specify python file where repository or pipeline/job function lives

\n
\n\n
\n
\n-d, --working-directory <working_directory>\u00b6
\n

Specify working directory to use when loading the repository or pipeline/job.

\n
\n\n
\n
\n-w, --workspace <workspace>\u00b6
\n

Path to workspace file. Argument can be provided multiple times.

\n
\n\n
\n
\n--empty-workspace\u00b6
\n

Allow an empty workspace

\n
\n\n
\n
\n

dagster-daemon wipe\u00b6

\n

Wipe all heartbeats from storage.

\n
dagster-daemon wipe [OPTIONS]\n
\n
\n
\n
\n

dagster-daemon debug heartbeat-dump\u00b6

\n

Log all heartbeat statuses

\n
dagster-daemon debug heartbeat-dump [OPTIONS]\n
\n
\n
\n
\n

dagster api grpc\u00b6

\n

Serve the Dagster inter-process API over GRPC

\n
dagster api grpc [OPTIONS]\n
\n
\n

Options

\n
\n
\n-p, --port <port>\u00b6
\n

Port over which to serve. You must pass one and only one of \u2013port/-p or \u2013socket/-s.

\n
\n\n
\n
\n-s, --socket <socket>\u00b6
\n

Serve over a UDS socket. You must pass one and only one of \u2013port/-p or \u2013socket/-s.

\n
\n\n
\n
\n-h, --host <host>\u00b6
\n

Hostname at which to serve. Default is localhost.

\n
\n\n
\n
\n-n, --max_workers <max_workers>\u00b6
\n

Maximum number of (threaded) workers to use in the GRPC server

\n
\n\n
\n
\n--heartbeat\u00b6
\n

If set, the GRPC server will shut itself down when it fails to receive a heartbeat after a timeout configurable with \u2013heartbeat-timeout.

\n
\n\n
\n
\n--heartbeat-timeout <heartbeat_timeout>\u00b6
\n

Timeout after which to shutdown if \u2013heartbeat is set and a heartbeat is not received

\n
\n\n
\n
\n--lazy-load-user-code\u00b6
\n

Wait until the first LoadRepositories call to actually load the repositories, instead of loading them when the server is launched. Useful for surfacing errors when the server is managed directly from Dagit

\n
\n\n
\n
\n-a, --attribute <attribute>\u00b6
\n

Attribute that is either a 1) repository or pipeline/job or 2) a function that returns a repository or pipeline/job

\n
\n\n
\n
\n-m, --module-name <module_name>\u00b6
\n

Specify module where repository or pipeline/job function lives

\n
\n\n
\n
\n--package-name <package_name>\u00b6
\n

Specify Python package where repository or pipeline/job function lives

\n
\n\n
\n
\n-f, --python-file <python_file>\u00b6
\n

Specify python file where repository or pipeline/job function lives

\n
\n\n
\n
\n-d, --working-directory <working_directory>\u00b6
\n

Specify working directory to use when loading the repository or pipeline/job.

\n
\n\n
\n
\n--use-python-environment-entry-point\u00b6
\n

If this flag is set, the server will signal to clients that they should launch dagster commands using <this server\u2019s python executable> -m dagster, instead of the default dagster entry point. This is useful when there are multiple Python environments running in the same machine, so a single dagster entry point is not enough to uniquely determine the environment.

\n
\n\n
\n
\n--empty-working-directory\u00b6
\n

Indicates that the working directory should be empty and should not be set to the current directory as a default

\n
\n\n
\n
\n--ipc-output-file <ipc_output_file>\u00b6
\n

[INTERNAL] This option should generally not be used by users. Internal param used by dagster when it automatically spawns gRPC servers to communicate the success or failure of the server launching.

\n
\n\n
\n
\n--fixed-server-id <fixed_server_id>\u00b6
\n

[INTERNAL] This option should generally not be used by users. Internal param used by dagster to spawn a gRPC server with the specified server id.

\n
\n\n
\n
\n--override-system-timezone <override_system_timezone>\u00b6
\n

[INTERNAL] This option should generally not be used by users. Override the system timezone for tests.

\n
\n\n
\n
\n--log-level <log_level>\u00b6
\n

Level at which to log output from the gRPC server process

\n
\n\n
\n
\n

Legacy APIs\u00b6

\n
\n

dagster pipeline\u00b6

\n

Commands for working with Dagster pipelines/jobs.

\n
dagster pipeline [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\nbackfill
\n

Backfill a partitioned pipeline/job.

\n
\n\n
\n
\nexecute
\n

Execute a pipeline.

\n
\n\n
\n
\nlaunch
\n

Launch a pipeline using the run launcher\u2026

\n
\n\n
\n
\nlist
\n

List the pipelines/jobs in a repository.

\n
\n\n
\n
\nlist_versions
\n

Display the freshness of memoized results\u2026

\n
\n\n
\n
\nprint
\n

Print a pipeline/job.

\n
\n\n
\n
\nscaffold_config
\n

Scaffold the config for a pipeline.

\n
\n\n
\n
\n
\n", "current_page_name": "sections/api/apidocs/cli", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../config/", "title": "Config"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../assets/", "title": "Software-Defined Assets (Experimental)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/config", "Config", "N", "next"], ["sections/api/apidocs/assets", "Software-Defined Assets (Experimental)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/cli.rst.txt", "title": "Dagster CLI", "toc": "\n"}, "config": {"alabaster_version": "0.7.12", "body": "
\n

Config\u00b6

\n
\n

Config Types\u00b6

\n

The following types are used to describe the schema of configuration\ndata via config. They are used in conjunction with the\nbuiltin types above.

\n
\n
\nclass dagster.ConfigSchema[source]\u00b6
\n

This is a placeholder type. Any time that it appears in documentation, it means that any of\nthe following types are acceptable:

\n
    \n
  1. A Python scalar type that resolves to a Dagster config type\n(int, float, bool,\nor str). For example:

    \n
      \n
    • @op(config_schema=int)

    • \n
    • @op(config_schema=str)

    • \n
    \n
  2. \n
  3. A built-in python collection (list, or dict).\nlist is exactly equivalent to Array [\nAny ] and dict is equivalent to\nPermissive. For example:

    \n
      \n
    • @op(config_schema=list)

    • \n
    • @op(config_schema=dict)

    • \n
    \n
  4. \n
  5. A Dagster config type:

    \n\n
  6. \n
  7. A bare python dictionary, which will be automatically wrapped in\nShape. Values of the dictionary are resolved recursively\naccording to the same rules. For example:

    \n
      \n
    • {'some_config': str} is equivalent to Shape({'some_config': str}).

    • \n
    • \n
      {'some_config1': {'some_config2': str}} is equivalent to

      Shape({'some_config1': Shape({'some_config2': str})}).

      \n
      \n
      \n
    • \n
    \n
  8. \n
  9. A bare python list of length one, whose single element will be wrapped in an\nArray and resolved recursively according to the same\nrules. For example:

    \n
      \n
    • [str] is equivalent to Array[str].

    • \n
    • [[str]] is equivalent to Array[Array[str]].

    • \n
    • [{'some_config': str}] is equivalent to Array(Shape({'some_config': str})).

    • \n
    \n
  10. \n
  11. An instance of Field.

  12. \n
\n
\n\n
\n
\nclass dagster.Field(config, default_value=<class 'dagster.config.field_utils.__FieldValueSentinel'>, is_required=None, description=None)[source]\u00b6
\n

Defines the schema for a configuration field.

\n

Fields are used in config schema instead of bare types when one wants to add a description,\na default value, or to mark it as not required.

\n

Config fields are parsed according to their schemas in order to yield values available at\njob execution time through the config system. Config fields can be set on ops, on\nloaders and materializers for custom types, and on other pluggable components of the system, such as\nresources, loggers, and executors.

\n
\n
Parameters
\n
    \n
  • config (Any) \u2013

    The schema for the config. This value can be any of:

    \n
      \n
    1. A Python primitive type that resolves to a Dagster config type\n(int, float, bool,\nstr, or list).

    2. \n
    3. A Dagster config type:

      \n\n
    4. \n
    5. A bare python dictionary, which will be automatically wrapped in\nShape. Values of the dictionary are resolved recursively\naccording to the same rules.

    6. \n
    7. A bare python list of length one which itself is config type.\nBecomes Array with list element as an argument.

    8. \n
    \n

  • \n
  • default_value (Any) \u2013

    A default value for this field, conformant to the schema set by the config\nargument. If a default value is provided, is_required should be False.

    \n

    Note: for config types that do post processing, such as Enum, this value must be\nthe pre-processed version, i.e. use ExampleEnum.VALUE.name instead of\nExampleEnum.VALUE

    \n

  • \n
  • is_required (bool) \u2013 Whether the presence of this field is required. Defaults to true. If is_required\nis True, no default value should be provided.

  • \n
  • description (str) \u2013 A human-readable description of this config field.

  • \n
\n
\n
\n

Examples:

\n
@op(\n    config_schema={\n        'word': Field(str, description='I am a word.'),\n        'repeats': Field(Int, default_value=1, is_required=False),\n    }\n)\ndef repeat_word(context):\n    return context.op_config['word'] * context.op_config['repeats']\n
\n
\n
\n\n
\n
\nclass dagster.Selector(fields, description=None)[source]\u00b6
\n

Define a config field requiring the user to select one option.

\n

Selectors are used when you want to be able to present several different options in config but\nallow only one to be selected. For example, a single input might be read in from either a csv\nfile or a parquet file, but not both at once.

\n

Note that in some other type systems this might be called an \u2018input union\u2019.

\n

Functionally, a selector is like a Dict, except that only one key from the dict can\nbe specified in valid config.

\n
\n
Parameters
\n

fields (Dict[str, Field]) \u2013 The fields from which the user must select.

\n
\n
\n

Examples:

\n
@op(\n    config_schema=Field(\n        Selector(\n            {\n                'haw': {'whom': Field(String, default_value='honua', is_required=False)},\n                'cn': {'whom': Field(String, default_value='\u4e16\u754c', is_required=False)},\n                'en': {'whom': Field(String, default_value='world', is_required=False)},\n            }\n        ),\n        is_required=False,\n        default_value={'en': {'whom': 'world'}},\n    )\n)\ndef hello_world_with_default(context):\n    if 'haw' in context.op_config:\n        return 'Aloha {whom}!'.format(whom=context.op_config['haw']['whom'])\n    if 'cn' in context.op_config:\n        return '\u4f60\u597d\uff0c{whom}!'.format(whom=context.op_config['cn']['whom'])\n    if 'en' in context.op_config:\n        return 'Hello, {whom}!'.format(whom=context.op_config['en']['whom'])\n
\n
\n
\n\n
\n
\nclass dagster.Permissive(fields=None, description=None)[source]\u00b6
\n

Defines a config dict with a partially specified schema.

\n

A permissive dict allows partial specification of the config schema. Any fields with a\nspecified schema will be type checked. Other fields will be allowed, but will be ignored by\nthe type checker.

\n
\n
Parameters
\n

fields (Dict[str, Field]) \u2013 The partial specification of the config dict.

\n
\n
\n

Examples:

\n
@op(config_schema=Field(Permissive({'required': Field(String)})))\ndef map_config_op(context) -> List:\n    return sorted(list(context.op_config.items()))\n
\n
\n
\n\n
\n
\nclass dagster.Shape(fields, description=None, field_aliases=None)[source]\u00b6
\n

Schema for configuration data with string keys and typed values via Field.

\n

Unlike Permissive, unspecified fields are not allowed and will throw a\nDagsterInvalidConfigError.

\n
\n
Parameters
\n
    \n
  • fields (Dict[str, Field]) \u2013 The specification of the config dict.

  • \n
  • field_aliases (Dict[str, str]) \u2013 Maps a string key to an alias that can be used instead of the original key. For example,\nan entry {\u201csolids\u201d: \u201cops\u201d} means that someone could use \u201cops\u201d instead of \u201csolids\u201d as a\ntop level string key.

  • \n
\n
\n
\n
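
For illustration, a minimal sketch of an op whose config schema is an explicit Shape (the field names are hypothetical):

\n
from dagster import Field, Shape, op\n\n@op(\n    config_schema=Shape(\n        {\n            "host": Field(str),\n            "port": Field(int, default_value=5432, is_required=False),\n        }\n    )\n)\ndef connect(context):\n    return (context.op_config["host"], context.op_config["port"])\n
\n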
\n\n
\n
\nclass dagster.Map(key_type, inner_type, key_label_name=None)[source]\u00b6
\n

Defines a config dict with arbitrary scalar keys and typed values.

\n

A map can contain arbitrary keys of the specified scalar type, each of which has\ntype checked values. Unlike Shape and Permissive, scalar\nkeys other than strings can be used, and unlike Permissive, all\nvalues are type checked.\n\nParameters\n\n• key_type (type) \u2013 The type of keys this map can contain. Must be a scalar type.\n\n• inner_type (type) \u2013 The type of the values that this map type can contain.\n\n• key_label_name (string) \u2013 Optional name which describes the role of keys in the map.

\n

Examples:

\n
@op(config_schema=Field(Map({str: int})))\ndef partially_specified_config(context) -> List:\n    return sorted(list(context.op_config.items()))\n
\n
\n
\n\n
\n
\nclass dagster.Array(inner_type)[source]\u00b6
\n

Defines an array (list) configuration type that contains values of type inner_type.

\n
\n
Parameters
\n

inner_type (type) \u2013 The type of the values that this configuration type can contain.

\n
\n
\n
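
A minimal sketch of an op configured with an Array of strings (the op name is hypothetical):

\n
from dagster import Array, Field, op\n\n@op(config_schema=Field(Array(str)))\ndef greet_all(context):\n    # context.op_config is a list of strings here.\n    return ["hello " + name for name in context.op_config]\n
\n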
\n\n
\n
\nclass dagster.Noneable(inner_type)[source]\u00b6
\n

Defines a configuration type that is the union of NoneType and the type inner_type.

\n
\n
Parameters
\n

inner_type (type) \u2013 The type of the values that this configuration type can contain.

\n
\n
\n

Examples:

\n
config_schema={"name": Noneable(str)}\n\nconfig={"name": "Hello"}  # Ok\nconfig={"name": None}     # Ok\nconfig={}                 # Error\n
\n
\n
\n\n
\n
\nclass dagster.Enum(name, enum_values)[source]\u00b6
\n

Defines an enum configuration type that allows one of a defined set of possible values.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of the enum configuration type.

  • \n
  • enum_values (List[EnumValue]) \u2013 The set of possible values for the enum configuration type.

  • \n
\n
\n
\n

Examples:

\n
@op(\n    config_schema=Field(\n        Enum(\n            'CowboyType',\n            [\n                EnumValue('good'),\n                EnumValue('bad'),\n                EnumValue('ugly'),\n            ]\n        )\n    )\n)\ndef resolve_standoff(context):\n    # ...\n
\n
\n
\n
\nclassmethod from_python_enum(enum, name=None)[source]\u00b6
\n

Create a Dagster enum corresponding to an existing Python enum.

\n
\n
Parameters
\n
    \n
  • enum (enum.EnumMeta) \u2013 The class representing the enum.

  • \n
  • name (Optional[str]) \u2013 The name for the enum. If not present, enum.__name__ will be used.

  • \n
\n
\n
\n

Example:

\n
class Color(enum.Enum):\n    RED = enum.auto()\n    GREEN = enum.auto()\n    BLUE = enum.auto()\n\n@op(\n    config_schema={"color": Field(Enum.from_python_enum(Color))}\n)\ndef select_color(context):\n    # ...\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.EnumValue(config_value, python_value=None, description=None)[source]\u00b6
\n

Define an entry in a Enum.

\n
\n
Parameters
\n
    \n
  • config_value (str) \u2013 The string representation of the config to accept when passed.

  • \n
  • python_value (Optional[Any]) \u2013 The python value to convert the enum entry in to. Defaults to the config_value.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the enum entry.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.ScalarUnion(scalar_type, non_scalar_schema, _key=None)[source]\u00b6
\n

Defines a configuration type that accepts a scalar value OR a non-scalar value like a\nList, Dict, or Selector.

\n

This allows runtime scalars to be configured without a dictionary with the key value, using\nthe scalar value directly instead. However, this still leaves the option to\nload scalars from a json or pickle file.

\n
\n
Parameters
\n
    \n
  • scalar_type (type) \u2013 The scalar type of values that this configuration type can hold. For example,\nint, float, bool,\nor str.

  • \n
  • non_scalar_schema (ConfigSchema) \u2013 The schema of a non-scalar Dagster configuration type. For example, List,\nDict, or Selector.

  • \n
  • key (Optional[str]) \u2013 The configuration type\u2019s unique key. If not set, then the key will be set to\nScalarUnion.{scalar_type}-{non_scalar_schema}.

  • \n
\n
\n
\n

Examples:

\n
graph:\n  transform_word:\n    inputs:\n      word:\n        value: foobar\n
\n
\n

becomes, optionally,

\n
graph:\n  transform_word:\n    inputs:\n      word: foobar\n
\n
\n
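
For illustration, a hedged sketch of declaring a ScalarUnion directly on an op's config schema, so the "word" field accepts either a bare string or a {value: ...} dict (the op and field names are hypothetical):

\n
from dagster import Field, ScalarUnion, op\n\n@op(\n    config_schema={\n        "word": Field(\n            ScalarUnion(\n                scalar_type=str,\n                non_scalar_schema={"value": str},\n            )\n        )\n    }\n)\ndef transform_word(context):\n    # Accepts either word: foobar or word: {value: foobar} in config.\n    return context.op_config["word"]\n
\n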
\n\n
\n
\ndagster.StringSource\u00b6
\n

Use this type when you want to read a string config value from an environment variable. The value\npassed to a config field of this type may either be a string literal, or a selector describing\nhow to look up the value from the executing process\u2019s environment variables.

\n

Examples:

\n
from dagster import job, op, StringSource\n\n@op(config_schema=StringSource)\ndef secret_op(context) -> str:\n    return context.op_config\n\n@job\ndef secret_job():\n    secret_op()\n\nsecret_job.execute_in_process(\n    run_config={\n        'ops': {'secret_op': {'config': 'test_value'}}\n    }\n)\n\nsecret_job.execute_in_process(\n    run_config={\n        'ops': {'secret_op': {'config': {'env': 'VERY_SECRET_ENV_VARIABLE'}}}\n    }\n)\n
\n
\n
\n\n
\n
\ndagster.IntSource\u00b6
\n

Use this type when you want to read an integer config value from an environment variable. The\nvalue passed to a config field of this type may either be an integer literal, or a selector\ndescribing how to look up the value from the executing process\u2019s environment variables.

\n

Examples:

\n
from dagster import job, op, IntSource\n\n@op(config_schema=IntSource)\ndef secret_int_op(context) -> int:\n    return context.op_config\n\n@job\ndef secret_job():\n    secret_int_op()\n\nsecret_job.execute_in_process(\n    run_config={\n        'ops': {'secret_int_op': {'config': 1234}}\n    }\n)\n\nsecret_job.execute_in_process(\n    run_config={\n        'ops': {'secret_int_op': {'config': {'env': 'VERY_SECRET_ENV_VARIABLE_INT'}}}\n    }\n)\n
\n
\n
\n\n
\n
\ndagster.BoolSource\u00b6
\n

Use this type when you want to read a boolean config value from an environment variable. The\nvalue passed to a config field of this type may either be a boolean literal, or a selector\ndescribing how to look up the value from the executing process\u2019s environment variables. Set the\nvalue of the corresponding environment variable to "" to indicate False.

\n

Examples:

\n
from dagster import job, op, BoolSource\n\n@op(config_schema=BoolSource)\ndef secret_bool_op(context) -> bool:\n    return context.op_config\n\n@job\ndef secret_job():\n    secret_bool_op()\n\nsecret_job.execute_in_process(\n    run_config={\n        'ops': {'secret_bool_op': {'config': True}}\n    }\n)\n\nsecret_job.execute_in_process(\n    run_config={\n        'ops': {'secret_bool_op': {'config': {'env': 'VERY_SECRET_ENV_VARIABLE_BOOL'}}}\n    }\n)\n
\n
\n
\n\n
\n
\n

Config Utilities\u00b6

\n
\n
\nclass dagster.ConfigMapping(config_fn, config_schema=None, receive_processed_config_values=None)[source]\u00b6
\n

Defines a config mapping for a graph (or job).

\n

By specifying a config mapping function, you can override the configuration for the child\nops and graphs contained within a graph.

\n

Config mappings require the configuration schema to be specified as config_schema, which will\nbe exposed as the configuration schema for the graph, as well as a configuration mapping\nfunction, config_fn, which maps the config provided to the graph to the config\nthat will be provided to the child nodes.

\n
\n
Parameters
\n
    \n
  • config_fn (Callable[[dict], dict]) \u2013 The function that will be called\nto map the graph config to a config appropriate for the child nodes.

  • \n
  • config_schema (ConfigSchema) \u2013 The schema of the graph config.

  • \n
  • receive_processed_config_values (Optional[bool]) \u2013 If true, config values provided to the config_fn\nwill be converted to their dagster types before being passed in. For example, if this\nvalue is true, enum config passed to config_fn will be actual enums, while if false,\nthen enum config passed to config_fn will be strings.

  • \n
\n
\n
\n
\n\n
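For example, a config mapping can expose a simplified schema on a job and expand it into config for the graph's child ops. The sketch below is illustrative; the op and field names are not part of the API.

\n
from dagster import ConfigMapping, graph, op\n\n@op(config_schema={"full_param": str})\ndef do_it_all(context):\n    return context.op_config["full_param"]\n\n@graph\ndef do_it_all_graph():\n    do_it_all()\n\n# Expose a simplified schema on the job and expand it into the child op's config.\nsimplified_job = do_it_all_graph.to_job(\n    config=ConfigMapping(\n        config_schema={"simplified_param": str},\n        config_fn=lambda cfg: {\n            "ops": {"do_it_all": {"config": {"full_param": cfg["simplified_param"]}}}\n        },\n    )\n)\n
\n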
\n
\n@dagster.configured(configurable, config_schema=None, **kwargs)[source]\u00b6
\n

A decorator that makes it easy to create a function-configured version of an object.\nThe following definition types can be configured using this function:

\n\n

If the config that will be supplied to the object is constant, you may alternatively invoke this\nand call the result with a dict of config values to be curried. Examples of both strategies\nbelow.

\n
\n
Parameters
\n
    \n
  • configurable (ConfigurableDefinition) \u2013 An object that can be configured.

  • \n
  • config_schema (ConfigSchema) \u2013 The config schema that the inputs to the decorated function\nmust satisfy.

  • \n
  • **kwargs \u2013 Arbitrary keyword arguments that will be passed to the initializer of the returned\nobject.

  • \n
\n
\n
Returns
\n

(Callable[[Union[Any, Callable[[Any], Any]]], ConfigurableDefinition])

\n
\n
\n

Examples:

\n
dev_s3 = configured(s3_resource, name="dev_s3")({'bucket': 'dev'})\n\n@configured(s3_resource)\ndef dev_s3(_):\n    return {'bucket': 'dev'}\n\n@configured(s3_resource, {'bucket_prefix': str})\ndef dev_s3(config):\n    return {'bucket': config['bucket_prefix'] + 'dev'}\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/config", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../errors/", "title": "Errors"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../cli/", "title": "Dagster CLI"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/errors", "Errors", "N", "next"], ["sections/api/apidocs/cli", "Dagster CLI", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/config.rst.txt", "title": "Config", "toc": "\n"}, "dynamic": {"alabaster_version": "0.7.12", "body": "
\n

Dynamic Mapping & Collect\u00b6

\n

These APIs provide the means for a simple kind of dynamic orchestration \u2014 where the work to be orchestrated is determined not at pipeline definition time but at runtime, dependent on data that\u2019s observed as part of pipeline execution.

\n
\n
\nclass dagster.DynamicOut(dagster_type=<class 'dagster.core.definitions.utils.NoValueSentinel'>, description=None, is_required=True, io_manager_key=None, metadata=None, asset_key=None, asset_partitions=None, asset_partitions_def=None)[source]\u00b6
\n

Variant of Out for an output that will dynamically alter the graph at\nruntime.

\n

When used in a composition function such as @graph,\ndynamic outputs must be used with either

\n
    \n
  • map - clone downstream ops for each separate DynamicOutput

  • \n
  • collect - gather across all DynamicOutputs into a list

  • \n
\n

Uses the same constructor as Out

\n
\n
@op(\n    config_schema={\n        "path": Field(str, default_value=file_relative_path(__file__, "sample"))\n    },\n    out=DynamicOut(str),\n)\ndef files_in_directory(context):\n    path = context.op_config["path"]\n    dirname, _, filenames = next(os.walk(path))\n    for file in filenames:\n        yield DynamicOutput(os.path.join(dirname, file), mapping_key=_clean(file))\n\n@job\ndef process_directory():\n    files = files_in_directory()\n\n    # use map to invoke an op on each dynamic output\n    file_results = files.map(process_file)\n\n    # use collect to gather the results in to a list\n    summarize_directory(file_results.collect())\n
\n
\n
\n
\n\n
\n
\nclass dagster.DynamicOutput(value, mapping_key, output_name='result', metadata_entries=None, metadata=None)[source]\u00b6
\n

Variant of Output used to support\ndynamic mapping & collect. Each DynamicOutput produced by an op represents\none item in a set that can be processed individually with map or gathered\nwith collect.

\n

Each DynamicOutput must have a unique mapping_key to distinguish it within its set.

\n
\n
Parameters
\n
    \n
  • value (Any) \u2013 The value returned by the compute function.

  • \n
  • mapping_key (str) \u2013 The key that uniquely identifies this dynamic value relative to its peers.\nThis key will be used to identify the downstream ops when mapped, i.e.\nmapped_op[example_mapping_key]

  • \n
  • output_name (Optional[str]) \u2013 Name of the corresponding DynamicOut defined on the op.\n(default: \u201cresult\u201d)

  • \n
  • metadata_entries (Optional[Union[MetadataEntry, PartitionMetadataEntry]]) \u2013 (Experimental) A set of metadata entries to attach to events related to this output.

  • \n
  • metadata (Optional[Dict[str, Union[str, float, int, Dict, MetadataValue]]]) \u2013 Arbitrary metadata about the output. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
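For example, an op with a DynamicOut can yield one DynamicOutput per item it discovers. A minimal sketch (the op name and items are illustrative):

\n
from dagster import DynamicOut, DynamicOutput, op\n\n@op(out=DynamicOut(int))\ndef fan_out(context):\n    # Yield one DynamicOutput per item; each mapping_key must be unique within the set.\n    for i in range(3):\n        yield DynamicOutput(i, mapping_key=str(i))\n
\n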
\n\n
\n", "current_page_name": "sections/api/apidocs/dynamic", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "", "next": {"link": "../types/", "title": "Types"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../solids/", "title": "[Legacy] Solids"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/types", "Types", "N", "next"], ["sections/api/apidocs/solids", "[Legacy] Solids", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/dynamic.rst.txt", "title": "Dynamic Mapping & Collect", "toc": "\n"}, "errors": {"alabaster_version": "0.7.12", "body": "
\n

Errors\u00b6

\n

Core Dagster error classes.

\n

All errors thrown by the Dagster framework inherit from DagsterError. Users\nshould not subclass this base class for their own exceptions.

\n

There is another exception base class, DagsterUserCodeExecutionError, which is\nused by the framework in concert with the user_code_error_boundary().

\n

Dagster uses this construct to wrap the user code that it calls into. User code can perform arbitrary\ncomputations and may itself throw exceptions. The error boundary catches these user-code-generated\nexceptions, and then reraises them wrapped in a subclass of\nDagsterUserCodeExecutionError.

\n

The wrapped exceptions include additional context for the original exceptions, injected by the\nDagster runtime.

\n
\n
\nexception dagster.DagsterError[source]\u00b6
\n

Base class for all errors thrown by the Dagster framework.

\n

Users should not subclass this base class for their own exceptions.

\n
\n\n
\n
\nexception dagster.DagsterConfigMappingFunctionError(*args, **kwargs)[source]\u00b6
\n

Indicates that an unexpected error occurred while executing the body of a config mapping\nfunction defined in a JobDefinition or GraphDefinition during\nconfig parsing.

\n
\n\n
\n
\nexception dagster.DagsterEventLogInvalidForRun(run_id)[source]\u00b6
\n

Raised when the event logs for a historical run are malformed or invalid.

\n
\n\n
\n
\nexception dagster.DagsterExecutionStepExecutionError(*args, **kwargs)[source]\u00b6
\n

Indicates an error occurred while executing the body of an execution step.

\n
\n\n
\n
\nexception dagster.DagsterExecutionStepNotFoundError(*args, **kwargs)[source]\u00b6
\n

Thrown when the user specifies execution step keys that do not exist.

\n
\n\n
\n
\nexception dagster.DagsterInvalidConfigError(preamble, errors, config_value, *args, **kwargs)[source]\u00b6
\n

Thrown when provided config is invalid (does not type check against the relevant config\nschema).

\n
\n\n
\n
\nexception dagster.DagsterInvalidConfigDefinitionError(original_root, current_value, stack, reason=None, **kwargs)[source]\u00b6
\n

Indicates that you have attempted to construct a config with an invalid value.

\n
\n
Acceptable values for config types are any of:
    \n
  1. A Python primitive type that resolves to a Dagster config type\n(int, float, bool,\nstr, or list).

  2. A Dagster config type: Int, Float,\nBool, String,\nStringSource, Any,\nArray, Noneable, Enum,\nSelector, Shape, or\nPermissive.

  3. A bare python dictionary, which will be automatically wrapped in\nShape. Values of the dictionary are resolved recursively\naccording to the same rules.

  4. A bare python list of length one which is itself a config type.\nBecomes Array with the list element as an argument.

  5. An instance of Field.

\n
\n
\n
\n\n
\n
\nexception dagster.DagsterInvalidDefinitionError[source]\u00b6
\n

Indicates that the rules for a definition have been violated by the user.

\n
\n\n
\n
\nexception dagster.DagsterInvariantViolationError[source]\u00b6
\n

Indicates the user has violated a well-defined invariant that can only be enforced\nat runtime.

\n
\n\n
\n
\nexception dagster.DagsterResourceFunctionError(*args, **kwargs)[source]\u00b6
\n

Indicates an error occurred while executing the body of the resource_fn in a\nResourceDefinition during resource initialization.

\n
\n\n
\n
\nexception dagster.DagsterRunNotFoundError(*args, **kwargs)[source]\u00b6
\n

Thrown when a run cannot be found in run storage.

\n
\n\n
\n
\nexception dagster.DagsterStepOutputNotFoundError(*args, **kwargs)[source]\u00b6
\n

Indicates that previous step outputs required for an execution step to proceed are not\navailable.

\n
\n\n
\n
\nexception dagster.DagsterSubprocessError(*args, **kwargs)[source]\u00b6
\n

An exception has occurred in one or more of the child processes dagster manages.\nThis error forwards the message and stack trace for all of the collected errors.

\n
\n\n
\n
\nexception dagster.DagsterTypeCheckDidNotPass(description=None, metadata_entries=None, dagster_type=None)[source]\u00b6
\n

Indicates that a type check failed.

\n

This is raised when raise_on_error is True in calls to the synchronous job and\ngraph execution APIs (e.g. graph.execute_in_process(), job.execute_in_process() \u2013 typically\nwithin a test), and a DagsterType\u2019s type check fails by returning either\nFalse or an instance of TypeCheck whose success member is False.

\n
\n\n
\n
\nexception dagster.DagsterTypeCheckError(*args, **kwargs)[source]\u00b6
\n

Indicates an error in the op type system at runtime, e.g. an op receives an\nunexpected input or produces an output that does not match the type of the output definition.

\n
\n\n
\n
\nexception dagster.DagsterUnknownResourceError(resource_name, *args, **kwargs)[source]\u00b6
\n

Indicates that an unknown resource was accessed in the body of an execution step. May often\nhappen by accessing a resource in the compute function of an op without first supplying the\nop with the correct required_resource_keys argument.

\n
\n\n
\n
\nexception dagster.DagsterUnmetExecutorRequirementsError[source]\u00b6
\n

Indicates the resolved executor is incompatible with the state of other systems\nsuch as the DagsterInstance or system storage configuration.

\n
\n\n
\n
\nexception dagster.DagsterUserCodeExecutionError(*args, **kwargs)[source]\u00b6
\n

This is the base class for any exception that is meant to wrap an\nException thrown by user code. It wraps that existing user code.\nThe original_exc_info argument to the constructor is meant to be a tuple of the type\nreturned by sys.exc_info at the call site of the constructor.

\n

Users should not subclass this base class for their own exceptions and should instead throw\nfreely from user code. User exceptions will be automatically wrapped and rethrown.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/errors", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "", "next": {"link": "../execution/", "title": "Execution"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../config/", "title": "Config"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/execution", "Execution", "N", "next"], ["sections/api/apidocs/config", "Config", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/errors.rst.txt", "title": "Errors", "toc": "\n"}, "execution": {"alabaster_version": "0.7.12", "body": "
\n

Execution\u00b6

\n
\n

Executing Jobs\u00b6

\n
\n
\nclass dagster.JobDefinition(mode_def, graph_def, name=None, description=None, preset_defs=None, tags=None, hook_defs=None, op_retry_policy=None, version_strategy=None, _op_selection_data=None)[source]
\n
\n
\nexecute_in_process(run_config=None, instance=None, partition_key=None, raise_on_error=True, op_selection=None, run_id=None)[source]
\n

Execute the Job in-process, gathering results in-memory.

\n

The executor_def on the Job will be ignored, and replaced with the in-process executor.\nIf using the default io_manager, it will switch from filesystem to in-memory.

\n
\n
Parameters
\n
    \n
  • run_config (Optional[Dict[str, Any]]) \u2013 The configuration for the run.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The instance to execute against, an ephemeral one will be used if none provided.

  • \n
  • partition_key \u2013 (Optional[str])\nThe string partition key that specifies the run config to execute. Can only be used\nto select run config for jobs with partitioned config.

  • \n
  • raise_on_error (Optional[bool]) \u2013 Whether or not to raise exceptions when they occur.\nDefaults to True.

  • \n
  • op_selection (Optional[List[str]]) \u2013 A list of op selection queries (including single op\nnames) to execute. For example:\n* ['some_op']: selects some_op itself.\n* ['*some_op']: select some_op and all its ancestors (upstream dependencies).\n* ['*some_op+++']: select some_op, all its ancestors, and its descendants\n(downstream dependencies) within 3 levels down.\n* ['*some_op', 'other_op_a', 'other_op_b+']: select some_op and all its\nancestors, other_op_a itself, and other_op_b and its direct child ops.

  • \n
\n
\n
Returns
\n

ExecuteInProcessResult

\n
\n
\n
\n\n
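For instance, an in-process execution of a job and inspection of its result might look like the following sketch (the op and job names are illustrative):

\n
from dagster import job, op\n\n@op\ndef emit_number():\n    return 5\n\n@job\ndef my_job():\n    emit_number()\n\n# Runs with the in-process executor and an in-memory io_manager.\nresult = my_job.execute_in_process()\nassert result.success\nassert result.output_for_node("emit_number") == 5\n
\n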
\n\n
\n
\n

Executing Graphs\u00b6

\n
\n
\nclass dagster.GraphDefinition(name, description=None, node_defs=None, dependencies=None, input_mappings=None, output_mappings=None, config=None, tags=None, **kwargs)[source]
\n

Defines a Dagster graph.

\n

A graph is made up of

\n
    \n
  • Nodes, which can either be an op (the functional unit of computation), or another graph.

  • \n
  • Dependencies, which determine how the values produced by nodes as outputs flow from\none node to another. This tells Dagster how to arrange nodes into a directed, acyclic graph\n(DAG) of compute.

  • \n
\n

End users should prefer the @graph decorator. GraphDefinition is generally\nintended to be used by framework authors or for programmatically generated graphs.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of the graph. Must be unique within any GraphDefinition\nor JobDefinition containing the graph.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the graph.

  • \n
  • node_defs (Optional[List[NodeDefinition]]) \u2013 The set of ops / graphs used in this graph.

  • \n
  • dependencies (Optional[Dict[Union[str, NodeInvocation], Dict[str, DependencyDefinition]]]) \u2013 A structure that declares the dependencies of each op\u2019s inputs on the outputs of other\nops in the graph. Keys of the top level dict are either the string names of ops in the\ngraph or, in the case of aliased ops, NodeInvocations.\nValues of the top level dict are themselves dicts, which map input names belonging to\nthe op or aliased op to DependencyDefinitions.

  • \n
  • input_mappings (Optional[List[InputMapping]]) \u2013 Defines the inputs to the nested graph, and\nhow they map to the inputs of its constituent ops.

  • \n
  • output_mappings (Optional[List[OutputMapping]]) \u2013 Defines the outputs of the nested graph,\nand how they map from the outputs of its constituent ops.

  • \n
  • config (Optional[ConfigMapping]) \u2013 Defines the config of the graph, and how its schema maps\nto the config of its constituent ops.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for any execution of the graph.\nValues that are not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value. These tag values may be overwritten by tag\nvalues provided at invocation time.

  • \n
\n
\n
\n

Examples

\n
@op\ndef return_one():\n    return 1\n\n@op\ndef add_one(num):\n    return num + 1\n\ngraph_def = GraphDefinition(\n    name='basic',\n    node_defs=[return_one, add_one],\n    dependencies={'add_one': {'num': DependencyDefinition('return_one')}},\n)\n
\n
\n
\n
\nexecute_in_process(run_config=None, instance=None, resources=None, raise_on_error=True, op_selection=None, run_id=None)[source]
\n

Execute this graph in-process, collecting results in-memory.

\n
\n
Parameters
\n
    \n
  • run_config (Optional[Dict[str, Any]]) \u2013 Run config to provide to execution. The configuration for the underlying graph\nshould exist under the \u201cops\u201d key.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The instance to execute against, an ephemeral one will be used if none provided.

  • \n
  • resources (Optional[Dict[str, Any]]) \u2013 The resources needed if any are required. Can provide resource instances directly,\nor resource definitions.

  • \n
  • raise_on_error (Optional[bool]) \u2013 Whether or not to raise exceptions when they occur.\nDefaults to True.

  • \n
  • op_selection (Optional[List[str]]) \u2013 A list of op selection queries (including single op\nnames) to execute. For example:\n* ['some_op']: selects some_op itself.\n* ['*some_op']: select some_op and all its ancestors (upstream dependencies).\n* ['*some_op+++']: select some_op, all its ancestors, and its descendants\n(downstream dependencies) within 3 levels down.\n* ['*some_op', 'other_op_a', 'other_op_b+']: select some_op and all its\nancestors, other_op_a itself, and other_op_b and its direct child ops.

  • \n
\n
\n
Returns
\n

ExecuteInProcessResult

\n
\n
\n
\n\n
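For instance, a graph can be executed directly, with op config supplied under the "ops" key and resources supplied as plain values. A minimal sketch (the resource key and op names are illustrative):

\n
from dagster import graph, op\n\n@op(required_resource_keys={"greeting"})\ndef greet(context):\n    return f"{context.resources.greeting}, world"\n\n@graph\ndef greeting_graph():\n    greet()\n\n# Resources can be provided directly as values when executing in-process.\nresult = greeting_graph.execute_in_process(resources={"greeting": "hello"})\nassert result.output_for_node("greet") == "hello, world"\n
\n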
\n\n
\n
\n

Execution results\u00b6

\n
\n
\nclass dagster.ExecuteInProcessResult(node_def, all_events, dagster_run, output_capture)[source]\u00b6
\n
\n
\nproperty all_events\u00b6
\n

All dagster events emitted during in-process execution.

\n
\n
Type
\n

List[DagsterEvent]

\n
\n
\n
\n\n
\n
\nproperty all_node_events\u00b6
\n

All dagster events from the in-process execution.

\n
\n
Type
\n

List[DagsterEvent]

\n
\n
\n
\n\n
\n
\nproperty dagster_run\u00b6
\n

The DagsterRun object for the completed execution.

\n
\n
Type
\n

DagsterRun

\n
\n
\n
\n\n
\n
\nevents_for_node(node_name)[source]\u00b6
\n

Retrieves all dagster events for a specific node.

\n
\n
Parameters
\n

node_name (str) \u2013 The name of the node for which events should be retrieved.

\n
\n
Returns
\n

A list of all dagster events associated with provided node name.

\n
\n
Return type
\n

List[DagsterEvent]

\n
\n
\n
\n\n
\n
\nget_job_failure_event()[source]\u00b6
\n

Returns a DagsterEvent with type DagsterEventType.PIPELINE_FAILURE if it occurred during\nexecution.

\n
\n\n
\n
\nget_job_success_event()[source]\u00b6
\n

Returns a DagsterEvent with type DagsterEventType.PIPELINE_SUCCESS if it occurred during\nexecution.

\n
\n\n
\n
\noutput_for_node(node_str, output_name='result')[source]\u00b6
\n

Retrieves output value with a particular name from the in-process run of the job.

\n
\n
Parameters
\n
    \n
  • node_str (str) \u2013 Name of the op/graph whose output should be retrieved. If the intended\ngraph/op is nested within another graph, the syntax is outer_graph.inner_node.

  • \n
  • output_name (Optional[str]) \u2013 Name of the output on the op/graph to retrieve. Defaults to\nresult, the default output name in dagster.

  • \n
\n
\n
Returns
\n

The value of the retrieved output.

\n
\n
Return type
\n

Any

\n
\n
\n
\n\n
\n
\noutput_value(output_name='result')[source]\u00b6
\n

Retrieves output of top-level job, if an output is returned.

\n

If the top-level job has no output, calling this method will result in a\nDagsterInvariantViolationError.

\n
\n
Parameters
\n

output_name (Optional[str]) \u2013 The name of the output to retrieve. Defaults to result,\nthe default output name in dagster.

\n
\n
Returns
\n

The value of the retrieved output.

\n
\n
Return type
\n

Any

\n
\n
\n
\n\n
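For instance, a graph that returns an op's result exposes that value both through output_value and through output_for_node. A minimal sketch (the op and graph names are illustrative):

\n
from dagster import graph, op\n\n@op\ndef add_one():\n    return 1 + 1\n\n@graph\ndef math_graph():\n    # Returning the op result creates a top-level output named "result".\n    return add_one()\n\nresult = math_graph.execute_in_process()\nassert result.output_value() == 2  # top-level output, default name "result"\nassert result.output_for_node("add_one") == 2  # output of a specific node\n
\n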
\n
\nproperty run_id\u00b6
\n

The run id for the executed run

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty success\u00b6
\n

Whether execution was successful.

\n
\n
Type
\n

bool

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.DagsterEvent(event_type_value, pipeline_name, step_handle=None, solid_handle=None, step_kind_value=None, logging_tags=None, event_specific_data=None, message=None, pid=None, step_key=None)[source]\u00b6
\n

Events yielded by solid and pipeline execution.

\n

Users should not instantiate this class.

\n
\n
\nevent_type_value\u00b6
\n

Value for a DagsterEventType.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\npipeline_name\u00b6
\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nsolid_handle\u00b6
\n
\n
Type
\n

NodeHandle

\n
\n
\n
\n\n
\n
\nstep_kind_value\u00b6
\n

Value for a StepKind.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nlogging_tags\u00b6
\n
\n
Type
\n

Dict[str, str]

\n
\n
\n
\n\n
\n
\nevent_specific_data\u00b6
\n

Type must correspond to event_type_value.

\n
\n
Type
\n

Any

\n
\n
\n
\n\n
\n
\nmessage\u00b6
\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\npid\u00b6
\n
\n
Type
\n

int

\n
\n
\n
\n\n
\n
\nstep_key\u00b6
\n

DEPRECATED

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nproperty event_type\u00b6
\n

The type of this event.

\n
\n
Type
\n

DagsterEventType

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.DagsterEventType(value)[source]\u00b6
\n

The types of events that may be yielded by solid and pipeline execution.

\n
\n
\nALERT_FAILURE = 'ALERT_FAILURE'\u00b6
\n
\n\n
\n
\nALERT_START = 'ALERT_START'\u00b6
\n
\n\n
\n
\nALERT_SUCCESS = 'ALERT_SUCCESS'\u00b6
\n
\n\n
\n
\nASSET_MATERIALIZATION = 'ASSET_MATERIALIZATION'\u00b6
\n
\n\n
\n
\nASSET_MATERIALIZATION_PLANNED = 'ASSET_MATERIALIZATION_PLANNED'\u00b6
\n
\n\n
\n
\nASSET_OBSERVATION = 'ASSET_OBSERVATION'\u00b6
\n
\n\n
\n
\nASSET_STORE_OPERATION = 'ASSET_STORE_OPERATION'\u00b6
\n
\n\n
\n
\nENGINE_EVENT = 'ENGINE_EVENT'\u00b6
\n
\n\n
\n
\nHANDLED_OUTPUT = 'HANDLED_OUTPUT'\u00b6
\n
\n\n
\n
\nHOOK_COMPLETED = 'HOOK_COMPLETED'\u00b6
\n
\n\n
\n
\nHOOK_ERRORED = 'HOOK_ERRORED'\u00b6
\n
\n\n
\n
\nHOOK_SKIPPED = 'HOOK_SKIPPED'\u00b6
\n
\n\n
\n
\nLOADED_INPUT = 'LOADED_INPUT'\u00b6
\n
\n\n
\n
\nLOGS_CAPTURED = 'LOGS_CAPTURED'\u00b6
\n
\n\n
\n
\nOBJECT_STORE_OPERATION = 'OBJECT_STORE_OPERATION'\u00b6
\n
\n\n
\n
\nPIPELINE_CANCELED = 'PIPELINE_CANCELED'\u00b6
\n
\n\n
\n
\nPIPELINE_CANCELING = 'PIPELINE_CANCELING'\u00b6
\n
\n\n
\n
\nPIPELINE_DEQUEUED = 'PIPELINE_DEQUEUED'\u00b6
\n
\n\n
\n
\nPIPELINE_ENQUEUED = 'PIPELINE_ENQUEUED'\u00b6
\n
\n\n
\n
\nPIPELINE_FAILURE = 'PIPELINE_FAILURE'\u00b6
\n
\n\n
\n
\nPIPELINE_START = 'PIPELINE_START'\u00b6
\n
\n\n
\n
\nPIPELINE_STARTING = 'PIPELINE_STARTING'\u00b6
\n
\n\n
\n
\nPIPELINE_SUCCESS = 'PIPELINE_SUCCESS'\u00b6
\n
\n\n
\n
\nRUN_CANCELED = 'PIPELINE_CANCELED'\u00b6
\n
\n\n
\n
\nRUN_CANCELING = 'PIPELINE_CANCELING'\u00b6
\n
\n\n
\n
\nRUN_DEQUEUED = 'PIPELINE_DEQUEUED'\u00b6
\n
\n\n
\n
\nRUN_ENQUEUED = 'PIPELINE_ENQUEUED'\u00b6
\n
\n\n
\n
\nRUN_FAILURE = 'PIPELINE_FAILURE'\u00b6
\n
\n\n
\n
\nRUN_START = 'PIPELINE_START'\u00b6
\n
\n\n
\n
\nRUN_STARTING = 'PIPELINE_STARTING'\u00b6
\n
\n\n
\n
\nRUN_SUCCESS = 'PIPELINE_SUCCESS'\u00b6
\n
\n\n
\n
\nSTEP_EXPECTATION_RESULT = 'STEP_EXPECTATION_RESULT'\u00b6
\n
\n\n
\n
\nSTEP_FAILURE = 'STEP_FAILURE'\u00b6
\n
\n\n
\n
\nSTEP_INPUT = 'STEP_INPUT'\u00b6
\n
\n\n
\n
\nSTEP_OUTPUT = 'STEP_OUTPUT'\u00b6
\n
\n\n
\n
\nSTEP_RESTARTED = 'STEP_RESTARTED'\u00b6
\n
\n\n
\n
\nSTEP_SKIPPED = 'STEP_SKIPPED'\u00b6
\n
\n\n
\n
\nSTEP_START = 'STEP_START'\u00b6
\n
\n\n
\n
\nSTEP_SUCCESS = 'STEP_SUCCESS'\u00b6
\n
\n\n
\n
\nSTEP_UP_FOR_RETRY = 'STEP_UP_FOR_RETRY'\u00b6
\n
\n\n
\n\n
\n
\n

Reconstructable jobs\u00b6

\n
\n
\nclass dagster.reconstructable(target)[source]\u00b6
\n

Create a ReconstructablePipeline from a\nfunction that returns a PipelineDefinition/JobDefinition,\nor a function decorated with @pipeline/@job.

\n

When your pipeline/job must cross process boundaries, e.g., for execution on multiple nodes or\nin different systems (like dagstermill), Dagster must know how to reconstruct the pipeline/job\non the other side of the process boundary.

\n

Passing a job created with GraphDefinition.to_job to reconstructable()\nrequires you to wrap that job\u2019s definition in a module-scoped function, and pass that function\ninstead:

\n
from dagster import graph, reconstructable\n\n@graph\ndef my_graph():\n    ...\n\ndef define_my_job():\n    return my_graph.to_job()\n\nreconstructable(define_my_job)\n
\n
\n

This function implements a very conservative strategy for reconstruction, so that its behavior\nis easy to predict, but as a consequence it is not able to reconstruct certain kinds of pipelines\nor jobs, such as those defined by lambdas, in nested scopes (e.g., dynamically within a method\ncall), or in interactive environments such as the Python REPL or Jupyter notebooks.

\n

If you need to reconstruct objects constructed in these ways, you should use\nbuild_reconstructable_job() instead, which allows you to\nspecify your own reconstruction strategy.

\n

Examples:

\n
from dagster import job, reconstructable\n\n@job\ndef foo_job():\n    ...\n\nreconstructable_foo_job = reconstructable(foo_job)\n\n\n@graph\ndef foo():\n    ...\n\ndef make_bar_job():\n    return foo.to_job()\n\nreconstructable_bar_job = reconstructable(make_bar_job)\n
\n
\n
\n\n
\n
\n

Executors\u00b6

\n
\n
\ndagster.in_process_executor ExecutorDefinition[source]\u00b6
\n

The in-process executor executes all steps in a single process.

\n

For legacy pipelines, this will be the default executor. To select it explicitly,\ninclude the following top-level fragment in config:

\n
execution:\n  in_process:\n
\n
\n

Execution priority can be configured using the dagster/priority tag via solid/op metadata,\nwhere the higher the number the higher the priority. 0 is the default and both positive\nand negative numbers can be used.

\n
\n\n
\n
\ndagster.multiprocess_executor ExecutorDefinition[source]\u00b6
\n

The multiprocess executor executes each step in an individual process.

\n

Any job that does not specify custom executors will use the multiprocess_executor by default.\nFor jobs or legacy pipelines, to configure the multiprocess executor, include a fragment such\nas the following in your config:

\n
execution:\n  multiprocess:\n    config:\n      max_concurrent: 4\n
\n
\n

The max_concurrent arg is optional and tells the execution engine how many processes may run\nconcurrently. By default, or if you set max_concurrent to be 0, this is the return value of\nmultiprocessing.cpu_count().

\n

Execution priority can be configured using the dagster/priority tag via solid/op metadata,\nwhere the higher the number the higher the priority. 0 is the default and both positive\nand negative numbers can be used.

\n
\n\n
\n
\n

Contexts\u00b6

\n
\n
\nclass dagster.OpExecutionContext(step_execution_context)[source]\u00b6
\n
\n
\nadd_output_metadata(metadata, output_name=None, mapping_key=None)\u00b6
\n

Add metadata to one of the outputs of an op.

\n

This can only be used once per output in the body of an op. Using this method with the same output_name more than once within an op will result in an error.

\n
\n
Parameters
\n
    \n
  • metadata (Mapping[str, Any]) \u2013 The metadata to attach to the output

  • \n
  • output_name (Optional[str]) \u2013 The name of the output to attach metadata to. If there is only one output on the op, then this argument does not need to be provided. The metadata will automatically be attached to the only output.

  • \n
\n
\n
\n

Examples:

\n
from dagster import Out, op\nfrom typing import Tuple\n\n@op\ndef add_metadata(context):\n    context.add_output_metadata({"foo": "bar"})\n    return 5 # Since the default output is called "result", metadata will be attached to the output "result".\n\n@op(out={"a": Out(), "b": Out()})\ndef add_metadata_two_outputs(context) -> Tuple[str, int]:\n    context.add_output_metadata({"foo": "bar"}, output_name="b")\n    context.add_output_metadata({"baz": "bat"}, output_name="a")\n\n    return ("dog", 5)\n
\n
\n
\n\n
\n
\nconsume_events()\u00b6
\n

Pops and yields all user-generated events that have been recorded from this context.

\n

If consume_events has not yet been called, this will yield all logged events since the beginning of the op\u2019s computation. If consume_events has been called, it will yield all events since the last time consume_events was called. Designed for internal use. Users should never need to invoke this method.

\n
\n\n
\n
\nget_mapping_key()\u00b6
\n

Which mapping_key this execution is for if downstream of a DynamicOutput, otherwise None.

\n
\n\n
\n
\nget_tag(key)\u00b6
\n

Get a logging tag.

\n
\n
Parameters
\n

key (str) \u2013 The tag to get.

\n
\n
Returns
\n

The value of the tag, if present.

\n
\n
Return type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nproperty has_partition_key\u00b6
\n

Whether the current run is a partitioned run

\n
\n\n
\n
\nhas_tag(key)\u00b6
\n

Check if a logging tag is set.

\n
\n
Parameters
\n

key (str) \u2013 The tag to check.

\n
\n
Returns
\n

Whether the tag is set.

\n
\n
Return type
\n

bool

\n
\n
\n
\n\n
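For example, an op can guard a tag lookup with has_tag before reading it with get_tag. A sketch (the tag key is illustrative; tags typically come from the run or job definition):

\n
from dagster import op\n\n@op\ndef tag_aware_op(context):\n    # Tags are set on the run (e.g. via job tags); guard the lookup before using it.\n    if context.has_tag("owner"):\n        context.log.info(f"owner tag: {context.get_tag('owner')}")\n    return 1\n
\n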
\n
\nproperty instance\u00b6
\n

The current Dagster instance

\n
\n
Type
\n

DagsterInstance

\n
\n
\n
\n\n
\n
\nproperty log\u00b6
\n

The log manager available in the execution context.

\n
\n
Type
\n

DagsterLogManager

\n
\n
\n
\n\n
\n
\nlog_event(event)\u00b6
\n

Log an AssetMaterialization, AssetObservation, or ExpectationResult from within the body of an op.

\n

Events logged with this method will appear in the list of DagsterEvents, as well as the event log.

\n
\n
Parameters
\n

event (Union[AssetMaterialization, Materialization, AssetObservation, ExpectationResult]) \u2013 The event to log.

\n
\n
\n

Examples:

\n
from dagster import op, AssetMaterialization\n\n@op\ndef log_materialization(context):\n    context.log_event(AssetMaterialization("foo"))\n
\n
\n
\n\n
\n
\nproperty mode_def\u00b6
\n

The mode of the current execution.

\n
\n
Type
\n

ModeDefinition

\n
\n
\n
\n\n
\n
\noutput_asset_partition_key(output_name='result')\u00b6
\n

Returns the asset partition key for the given output. Defaults to \u201cresult\u201d, which is the\nname of the default output.

\n
\n\n
\n
\noutput_asset_partitions_time_window(output_name='result')\u00b6
\n

The time window for the partitions of the output asset.

\n

Raises an error if either of the following is true:\n- The output asset has no partitioning.\n- The output asset is not partitioned with a TimeWindowPartitionsDefinition.

\n
\n\n
\n
\nproperty partition_key\u00b6
\n

The partition key for the current run.

\n

Raises an error if the current run is not a partitioned run.

\n
\n\n
\n
\nproperty pdb\u00b6
\n

Gives access to pdb debugging from within the op.

\n

Example:

\n
@op\ndef debug(context):\n    context.pdb.set_trace()\n
\n
\n
\n
Type
\n

dagster.utils.forked_pdb.ForkedPdb

\n
\n
\n
\n\n
\n
\nproperty pipeline_def\u00b6
\n

The currently executing pipeline.

\n
\n
Type
\n

PipelineDefinition

\n
\n
\n
\n\n
\n
\nproperty pipeline_name\u00b6
\n

The name of the currently executing pipeline.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty pipeline_run\u00b6
\n

The current pipeline run

\n
\n
Type
\n

PipelineRun

\n
\n
\n
\n\n
\n
\nproperty resources\u00b6
\n

The currently available resources.

\n
\n
Type
\n

Resources

\n
\n
\n
\n\n
\n
\nproperty retry_number\u00b6
\n

Which retry attempt is currently executing, i.e. 0 for the initial attempt, 1 for the first retry, etc.

\n
\n\n
\n
\nproperty run_config\u00b6
\n

The run config for the current execution.

\n
\n
Type
\n

dict

\n
\n
\n
\n\n
\n
\nproperty run_id\u00b6
\n

The id of the current execution\u2019s run.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty solid_config\u00b6
\n

The parsed config specific to this solid.

\n
\n\n
\n
\nproperty solid_def\u00b6
\n

The current solid definition.

\n
\n
Type
\n

SolidDefinition

\n
\n
\n
\n\n
\n
\nproperty step_launcher\u00b6
\n

The current step launcher, if any.

\n
\n
Type
\n

Optional[StepLauncher]

\n
\n
\n
\n\n
\n\n
\n
\ndagster.build_op_context(resources=None, op_config=None, resources_config=None, instance=None, config=None, partition_key=None)[source]\u00b6
\n

Builds op execution context from provided parameters.

\n

op is currently built on top of solid, and thus this function creates a SolidExecutionContext.\nbuild_op_context can be used as either a function or context manager. If there is a\nprovided resource that is a context manager, then build_op_context must be used as a\ncontext manager. This function can be used to provide the context argument when directly\ninvoking an op.

\n
\n
Parameters
\n
    \n
  • resources (Optional[Dict[str, Any]]) \u2013 The resources to provide to the context. These can be\neither values or resource definitions.

  • \n
  • config (Optional[Any]) \u2013 The op config to provide to the context.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The dagster instance configured for the context.\nDefaults to DagsterInstance.ephemeral().

  • \n
\n
\n
\n

Examples

\n
context = build_op_context()\nop_to_invoke(context)\n\nwith build_op_context(resources={"foo": context_manager_resource}) as context:\n    op_to_invoke(context)\n
\n
\n
\n\n
\n
\nclass dagster.TypeCheckContext(run_id, log_manager, scoped_resources_builder, dagster_type)[source]\u00b6
\n

The context object available to a type check function on a DagsterType.

\n
\n
\nlog\u00b6
\n

Centralized log dispatch from user code.

\n
\n
Type
\n

DagsterLogManager

\n
\n
\n
\n\n
\n
\nresources\u00b6
\n

An object whose attributes contain the resources available to this op.

\n
\n
Type
\n

Any

\n
\n
\n
\n\n
\n
\nrun_id\u00b6
\n

The id of this job run.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n\n
\n
\n

Job configuration\u00b6

\n
\n
\ndagster.validate_run_config(job_def=None, run_config=None, mode=None, pipeline_def=None)[source]\u00b6
\n

Function to validate a provided run config blob against a given job. For legacy APIs, a\npipeline/mode can also be passed in.

\n

If validation is successful, this function will return a dictionary representation of the\nvalidated config actually used during execution.

\n
\n
Parameters
\n
    \n
  • job_def (Union[PipelineDefinition, JobDefinition]) \u2013 The job definition to validate run\nconfig against

  • \n
  • run_config (Optional[Dict[str, Any]]) \u2013 The run config to validate

  • \n
  • mode (str) \u2013 The mode of the pipeline to validate against (different modes may require\ndifferent config)

  • \n
  • pipeline_def (PipelineDefinition) \u2013 The pipeline definition to validate run config against.

  • \n
\n
\n
Returns
\n

A dictionary representation of the validated config.

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
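For example, run config can be validated against a job before launching it; invalid config raises DagsterInvalidConfigError. A minimal sketch (the op, job, and config names are illustrative):

\n
from dagster import job, op, validate_run_config\n\n@op(config_schema={"date": str})\ndef process_data_for_date(context):\n    return context.op_config["date"]\n\n@job\ndef do_it_all():\n    process_data_for_date()\n\n# Returns the validated (and defaulted) config dict, or raises DagsterInvalidConfigError.\nvalidated = validate_run_config(\n    do_it_all,\n    run_config={"ops": {"process_data_for_date": {"config": {"date": "2020-01-01"}}}},\n)\n
\n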
\n

Run Config Schema\u00b6

\n
\n

The run_config used for jobs has the following schema:

\n
{\n  # configuration for execution, required if executors require config\n  execution: {\n    # the name of one, and only one available executor, typically 'in_process' or 'multiprocess'\n    __executor_name__: {\n      # executor-specific config, if required or permitted\n      config: {\n        ...\n      }\n    }\n  },\n\n  # configuration for loggers, required if loggers require config\n  loggers: {\n    # the name of an available logger\n    __logger_name__: {\n      # logger-specific config, if required or permitted\n      config: {\n        ...\n      }\n    },\n    ...\n  },\n\n  # configuration for resources, required if resources require config\n  resources: {\n    # the name of a resource\n    __resource_name__: {\n      # resource-specific config, if required or permitted\n      config: {\n        ...\n      }\n    },\n    ...\n  },\n\n  # configuration for underlying ops, required if ops require config\n  ops: {\n\n    # these keys align with the names of the ops, or their alias in this job\n    __op_name__: {\n\n      # pass any data that was defined via config_field\n      config: ...,\n\n      # configurably specify input values, keyed by input name\n      inputs: {\n        __input_name__: {\n          # if an dagster_type_loader is specified, that schema must be satisfied here;\n          # scalar, built-in types will generally allow their values to be specified directly:\n          value: ...\n        }\n      },\n\n    }\n  },\n\n}\n
\n
\n
\n
\n
\n
\n", "current_page_name": "sections/api/apidocs/execution", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../graphs/", "title": "Graphs"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../errors/", "title": "Errors"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/graphs", "Graphs", "N", "next"], ["sections/api/apidocs/errors", "Errors", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/execution.rst.txt", "title": "Execution", "toc": "\n"}, "graphs": {"alabaster_version": "0.7.12", "body": "
\n

Graphs\u00b6

\n

The replacement for composite_solid / CompositeSolidDefinition. It has a more intuitive name and there is no longer a distinction between a graph for execution (pipeline) and a graph for composition (composite solid).

\n
\n
\n@dagster.graph(name=None, description=None, input_defs=None, output_defs=None, ins=None, out=None, tags=None, config=None)[source]\u00b6
\n

Create a graph with the specified parameters from the decorated composition function.

\n

Using this decorator allows you to build up a dependency graph by writing a\nfunction that invokes ops (or other graphs) and passes the output to subsequent invocations.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name of the graph. Must be unique within any RepositoryDefinition containing the graph.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the graph.

  • \n
  • input_defs (Optional[List[InputDefinition]]) \u2013

    Information about the inputs that this graph maps. Information provided here\nwill be combined with what can be inferred from the function signature, with these\nexplicit InputDefinitions taking precedence.

    \n

    Uses of inputs in the body of the decorated composition function will determine\nthe InputMappings passed to the underlying\nGraphDefinition.

    \n

  • \n
  • output_defs (Optional[List[OutputDefinition]]) \u2013

    Output definitions for the graph. If not provided explicitly, these will be inferred from typehints.

    \n

    Uses of these outputs in the body of the decorated composition function, as well as the\nreturn value of the decorated function, will be used to infer the appropriate set of\nOutputMappings for the underlying\nGraphDefinition.

    \n

    To map multiple outputs, return a dictionary from the composition function.

    \n

  • \n
  • ins (Optional[Dict[str, GraphIn]]) \u2013 Information about the inputs that this graph maps. Information provided here\nwill be combined with what can be inferred from the function signature, with these\nexplicit GraphIn taking precedence.

  • \n
  • out \u2013

    Information about the outputs that this graph maps. Information provided here will be\ncombined with what can be inferred from the return type signature if the function does\nnot use yield.

    \n

    To map multiple outputs, return a dictionary from the composition function.

    \n

  • \n
\n
\n
\n
\n\n
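For example, a graph is composed by invoking ops inside the decorated function and passing their outputs to downstream ops. A minimal sketch (the op names are illustrative):

\n
from dagster import graph, op\n\n@op\ndef return_five():\n    return 5\n\n@op\ndef add_one(number):\n    return number + 1\n\n@graph\ndef simple_graph():\n    # The output of return_five flows into add_one.\n    add_one(return_five())\n
\n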
\n
\nclass dagster.GraphDefinition(name, description=None, node_defs=None, dependencies=None, input_mappings=None, output_mappings=None, config=None, tags=None, **kwargs)[source]\u00b6
\n

Defines a Dagster graph.

\n

A graph is made up of

\n
    \n
  • Nodes, which can either be an op (the functional unit of computation), or another graph.

  • \n
  • Dependencies, which determine how the values produced by nodes as outputs flow from\none node to another. This tells Dagster how to arrange nodes into a directed, acyclic graph\n(DAG) of compute.

  • \n
\n

End users should prefer the @graph decorator. GraphDefinition is generally\nintended to be used by framework authors or for programmatically generated graphs.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of the graph. Must be unique within any GraphDefinition\nor JobDefinition containing the graph.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the graph.

  • \n
  • node_defs (Optional[List[NodeDefinition]]) \u2013 The set of ops / graphs used in this graph.

  • \n
  • dependencies (Optional[Dict[Union[str, NodeInvocation], Dict[str, DependencyDefinition]]]) \u2013 A structure that declares the dependencies of each op\u2019s inputs on the outputs of other\nops in the graph. Keys of the top level dict are either the string names of ops in the\ngraph or, in the case of aliased ops, NodeInvocations.\nValues of the top level dict are themselves dicts, which map input names belonging to\nthe op or aliased op to DependencyDefinitions.

  • \n
  • input_mappings (Optional[List[InputMapping]]) \u2013 Defines the inputs to the nested graph, and\nhow they map to the inputs of its constituent ops.

  • \n
  • output_mappings (Optional[List[OutputMapping]]) \u2013 Defines the outputs of the nested graph,\nand how they map from the outputs of its constituent ops.

  • \n
  • config (Optional[ConfigMapping]) \u2013 Defines the config of the graph, and how its schema maps\nto the config of its constituent ops.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for any execution of the graph.\nValues that are not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value. These tag values may be overwritten by tag\nvalues provided at invocation time.

  • \n
\n
\n
\n

Examples

\n
@op\ndef return_one():\n    return 1\n\n@op\ndef add_one(num):\n    return num + 1\n\ngraph_def = GraphDefinition(\n    name='basic',\n    node_defs=[return_one, add_one],\n    dependencies={'add_one': {'num': DependencyDefinition('return_one')}},\n)\n
\n
\n
\n
\nexecute_in_process(run_config=None, instance=None, resources=None, raise_on_error=True, op_selection=None, run_id=None)[source]\u00b6
\n

Execute this graph in-process, collecting results in-memory.

\n
\n
Parameters
\n
    \n
  • run_config (Optional[Dict[str, Any]]) \u2013 Run config to provide to execution. The configuration for the underlying graph\nshould exist under the \u201cops\u201d key.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The instance to execute against, an ephemeral one will be used if none provided.

  • \n
  • resources (Optional[Dict[str, Any]]) \u2013 The resources needed if any are required. Can provide resource instances directly,\nor resource definitions.

  • \n
  • raise_on_error (Optional[bool]) \u2013 Whether or not to raise exceptions when they occur.\nDefaults to True.

  • \n
  • op_selection (Optional[List[str]]) \u2013 A list of op selection queries (including single op\nnames) to execute. For example:\n* ['some_op']: selects some_op itself.\n* ['*some_op']: select some_op and all its ancestors (upstream dependencies).\n* ['*some_op+++']: select some_op, all its ancestors, and its descendants\n(downstream dependencies) within 3 levels down.\n* ['*some_op', 'other_op_a', 'other_op_b+']: select some_op and all its\nancestors, other_op_a itself, and other_op_b and its direct child ops.

  • \n
\n
\n
Returns
\n

ExecuteInProcessResult

\n
\n
\n
\n\n
\n
\nto_job(name=None, description=None, resource_defs=None, config=None, tags=None, logger_defs=None, executor_def=None, hooks=None, op_retry_policy=None, version_strategy=None, op_selection=None, partitions_def=None)[source]\u00b6
\n

Make this graph into an executable Job by providing the remaining components required for execution.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name for the Job. Defaults to the name of this graph.

  • \n
  • resource_defs (Optional[Dict[str, ResourceDefinition]]) \u2013 Resources that are required by this graph for execution.\nIf not defined, io_manager will default to filesystem.

  • \n
  • config \u2013

    Describes how the job is parameterized at runtime.

    \n

    If no value is provided, then the schema for the job\u2019s run config is a standard\nformat based on its solids and resources.

    \n

    If a dictionary is provided, then it must conform to the standard config schema, and\nit will be used as the job\u2019s run config for the job whenever the job is executed.\nThe values provided will be viewable and editable in the Dagit playground, so be\ncareful with secrets.

    \n

    If a ConfigMapping object is provided, then the schema for the job\u2019s run config is\ndetermined by the config mapping, and the ConfigMapping, which should return\nconfiguration in the standard format to configure the job.

    \n

    If a PartitionedConfig object is provided, then it defines a discrete set of config\nvalues that can parameterize the job, as well as a function for mapping those\nvalues to the base config. The values provided will be viewable and editable in the\nDagit playground, so be careful with secrets.

    \n

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for any execution of the Job.\nValues that are not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value. These tag values may be overwritten by tag\nvalues provided at invocation time.

  • \n
  • logger_defs (Optional[Dict[str, LoggerDefinition]]) \u2013 A dictionary of string logger identifiers to their implementations.

  • \n
  • executor_def (Optional[ExecutorDefinition]) \u2013 How this Job will be executed. Defaults to multi_or_in_process_executor,\nwhich can be switched between multi-process and in-process modes of execution. The\ndefault mode of execution is multi-process.

  • \n
  • op_retry_policy (Optional[RetryPolicy]) \u2013 The default retry policy for all ops in this job.\nOnly used if retry policy is not defined on the op definition or op invocation.

  • \n
  • version_strategy (Optional[VersionStrategy]) \u2013 Defines how each solid (and optionally, resource) in the job can be versioned. If\nprovided, memoization will be enabled for this job.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines a discrete set of partition\nkeys that can parameterize the job. If this argument is supplied, the config\nargument can\u2019t also be supplied.

  • \n
\n
\n
Returns
\n

JobDefinition

\n
\n
\n
\n\n
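For example, a graph is turned into a job by supplying resources and other run-time components. A minimal sketch (the resource value and tag are illustrative):

\n
from dagster import ResourceDefinition, graph, op\n\n@op(required_resource_keys={"server"})\ndef ping(context):\n    return context.resources.server\n\n@graph\ndef ping_graph():\n    ping()\n\nping_job = ping_graph.to_job(\n    name="ping_job",\n    resource_defs={"server": ResourceDefinition.hardcoded_resource("prod-server")},\n    tags={"team": "data"},\n)\n
\n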
\n\n
\n
\nclass dagster.GraphIn(description=None)[source]\u00b6
\n

Represents information about an input that a graph maps.

\n
\n
Parameters
\n

description (Optional[str]) \u2013 Human-readable description of the input.

\n
\n
\n
\n\n
\n
\nclass dagster.GraphOut(description=None)[source]\u00b6
\n

Represents information about the outputs that a graph maps.

\n
\n
Parameters
\n

description (Optional[str]) \u2013 Human-readable description of the output.

\n
\n
\n
\n\n
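For example, GraphIn and GraphOut can annotate the inputs and outputs that a graph maps to and from its inner ops. A sketch (the names and descriptions are illustrative):

\n
from dagster import GraphIn, GraphOut, graph, op\n\n@op\ndef double(number):\n    return number * 2\n\n@graph(\n    ins={"number": GraphIn(description="Value passed through to the inner op.")},\n    out={"doubled": GraphOut(description="Twice the input value.")},\n)\ndef doubling_graph(number):\n    # The returned value is mapped to the single declared graph output.\n    return double(number)\n
\n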
\n", "current_page_name": "sections/api/apidocs/graphs", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "", "next": {"link": "../hooks/", "title": "Hooks"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../execution/", "title": "Execution"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/hooks", "Hooks", "N", "next"], ["sections/api/apidocs/execution", "Execution", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/graphs.rst.txt", "title": "Graphs", "toc": "\n"}, "hooks": {"alabaster_version": "0.7.12", "body": "
\n

Hooks\u00b6

\n
\n
\n@dagster.success_hook(name=None, required_resource_keys=None)[source]\u00b6
\n

Create a hook on step success events with the specified parameters from the decorated function.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name of this hook.

  • \n
  • required_resource_keys (Optional[AbstractSet[str]]) \u2013 Keys for the resources required by the\nhook.

  • \n
\n
\n
\n

Examples

\n
@success_hook(required_resource_keys={'slack'})\ndef slack_message_on_success(context):\n    message = 'op {} succeeded'.format(context.op.name)\n    context.resources.slack.send_message(message)\n\n@success_hook\ndef do_something_on_success(context):\n    do_something()\n
\n
\n
\n\n
\n
\n@dagster.failure_hook(name=None, required_resource_keys=None)[source]\u00b6
\n

Create a hook on step failure events with the specified parameters from the decorated function.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name of this hook.

  • \n
  • required_resource_keys (Optional[AbstractSet[str]]) \u2013 Keys for the resources required by the\nhook.

  • \n
\n
\n
\n

Examples

\n
@failure_hook(required_resource_keys={'slack'})\ndef slack_message_on_failure(context):\n    message = 'op {} failed'.format(context.op.name)\n    context.resources.slack.send_message(message)\n\n@failure_hook\ndef do_something_on_failure(context):\n    do_something()\n
\n
\n
\n\n
\n
\nclass dagster.HookDefinition(name, hook_fn, required_resource_keys=None, decorated_fn=None)[source]\u00b6
\n

Define a hook which can be triggered during an op execution (e.g. a callback on the step\nexecution failure event during an op execution).

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of this hook.

  • \n
  • hook_fn (Callable) \u2013 The callback function that will be triggered.

  • \n
  • required_resource_keys (Optional[AbstractSet[str]]) \u2013 Keys for the resources required by the\nhook.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.HookContext(step_execution_context, hook_def)[source]\u00b6
\n

The context object available to a hook function on a DagsterEvent.

\n
\n
\nlog\u00b6
\n

Centralized log dispatch from user code.

\n
\n
Type
\n

DagsterLogManager

\n
\n
\n
\n\n
\n
\nhook_def\u00b6
\n

The hook that the context object belongs to.

\n
\n
Type
\n

HookDefinition

\n
\n
\n
\n\n
\n
\nsolid\u00b6
\n

The solid instance associated with the hook.

\n
\n
Type
\n

Solid

\n
\n
\n
\n\n
\n
\nop\u00b6
\n

The op instance associated with the hook.

\n
\n
Type
\n

Op

\n
\n
\n
\n\n
\n
\nstep_key\u00b6
\n

The key for the step where this hook is being triggered.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nrequired_resource_keys\u00b6
\n

Resources required by this hook.

\n
\n
Type
\n

Set[str]

\n
\n
\n
\n\n
\n
\nresources\u00b6
\n

Resources available in the hook context.

\n
\n
Type
\n

Resources

\n
\n
\n
\n\n
\n
\nsolid_config\u00b6
\n

The parsed config specific to this solid.

\n
\n
Type
\n

Any

\n
\n
\n
\n\n
\n
\nop_config\u00b6
\n

The parsed config specific to this op.

\n
\n
Type
\n

Any

\n
\n
\n
\n\n
\n
\npipeline_name\u00b6
\n

The name of the pipeline where this hook is being triggered.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\njob_name\u00b6
\n

The name of the job where this hook is being triggered.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nrun_id\u00b6
\n

The id of the run where this hook is being triggered.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nmode_def\u00b6
\n

The mode with which the pipeline is being run.

\n
\n
Type
\n

ModeDefinition

\n
\n
\n
\n\n
\n
\nop_exception\u00b6
\n

The thrown exception in a failed op.

\n
\n
Type
\n

Optional[BaseException]

\n
\n
\n
\n\n
\n
\nop_output_values\u00b6
\n

Computed output values in an op.

\n
\n
Type
\n

Dict

\n
\n
\n
\n\n
\n
\nproperty solid_exception\u00b6
\n

The thrown exception in a failed solid.

\n
\n
Returns
\n

the exception object, None if the solid execution succeeds.

\n
\n
Return type
\n

Optional[BaseException]

\n
\n
\n
\n\n
\n
\nproperty solid_output_values\u00b6
\n

The computed output values.

\n
\n
Returns a dictionary where keys are output names and the values are:
    \n
  • the output values in the normal case

  • \n
  • a dictionary from mapping key to corresponding value in the mapped case

  • \n
\n
\n
\n
\n\n
\n\n
\n
\ndagster.build_hook_context(resources=None, mode_def=None, solid=None, op=None, run_id=None, job_name=None, op_exception=None)[source]\u00b6
\n

Builds hook context from provided parameters.

\n

build_hook_context can be used as either a function or a context manager. If there is a\nprovided resource to build_hook_context that is a context manager, then it must be used as a\ncontext manager. This function can be used to provide the context argument to the invocation of\na hook definition.

\n
\n
Parameters
\n
    \n
  • resources (Optional[Dict[str, Any]]) \u2013 The resources to provide to the context. These can\neither be values or resource definitions.

  • \n
  • mode_def (Optional[ModeDefinition]) \u2013 The mode definition used with the context.

  • \n
  • op (Optional[OpDefinition, PendingNodeInvocation]) \u2013 The op definition which the\nhook may be associated with.

  • \n
  • solid (Optional[SolidDefinition, PendingNodeInvocation]) \u2013 (legacy) The solid definition which the\nhook may be associated with.

  • \n
  • run_id (Optional[str]) \u2013 The id of the run in which the hook is invoked (provided for mocking purposes).

  • \n
  • job_name (Optional[str]) \u2013 The name of the job in which the hook is used (provided for mocking purposes).

  • \n
  • op_exception (Optional[Exception]) \u2013 The exception that caused the hook to be triggered.

  • \n
\n
\n
\n

Examples

\n
context = build_hook_context()\nhook_to_invoke(context)\n\nwith build_hook_context(resources={"foo": context_manager_resource}) as context:\n    hook_to_invoke(context)\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/hooks", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "", "next": {"link": "../internals/", "title": "Internals"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../graphs/", "title": "Graphs"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/internals", "Internals", "N", "next"], ["sections/api/apidocs/graphs", "Graphs", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/hooks.rst.txt", "title": "Hooks", "toc": "\n"}, "internals": {"alabaster_version": "0.7.12", "body": "
\n

Internals\u00b6

\n

Please note that internal APIs are likely to be in much greater flux pre-1.0 than user-facing APIs,\nparticularly if not exported in the top level dagster module.

\n

If you find yourself consulting these docs because you are writing custom components and plug-ins,\nplease get in touch with the core team on our Slack.\nWe\u2019re curious what you\u2019re up to, happy to help, excited for new community contributions, and eager\nto make the system as easy to work with as possible \u2013 including for teams who are looking to\ncustomize it.

\n
\n

Executors\u00b6

\n
\n
\n@dagster.executor(name=None, config_schema=None, requirements=None)[source]\u00b6
\n

Define an executor.

\n

The decorated function should accept an InitExecutorContext and return an instance\nof Executor.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name of the executor.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.executor_config. If not set, Dagster will accept any config provided.

  • \n
  • requirements (Optional[List[ExecutorRequirement]]) \u2013 Any requirements that must\nbe met in order for the executor to be usable for a particular pipeline execution.

  • \n
\n
\n
\n
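
As an illustrative sketch only (the executor class, its behavior, and the RetryMode import path below are assumptions for the example, not part of this API reference), an executor definition typically pairs the decorator with an Executor subclass:

\n
from dagster import Executor, executor\nfrom dagster.core.execution.retries import RetryMode  # assumed import path\n\n\nclass MyExecutor(Executor):  # hypothetical executor\n    @property\n    def retries(self):\n        # retries are disabled in this sketch; real executors should expose this via config\n        return RetryMode.DISABLED\n\n    def execute(self, plan_context, execution_plan):\n        # orchestrate the steps in execution_plan and yield DagsterEvents\n        ...\n\n\n@executor(name="my_custom_executor")\ndef my_custom_executor(init_context):\n    return MyExecutor()\n
\n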
\n\n
\n
\nclass dagster.ExecutorDefinition(name, config_schema=None, requirements=None, executor_creation_fn=None, description=None)[source]\u00b6
\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of the executor.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data\navailable in init_context.executor_config. If not set, Dagster will accept any config\nprovided.

  • \n
  • requirements (Optional[List[ExecutorRequirement]]) \u2013 Any requirements that must\nbe met in order for the executor to be usable for a particular pipeline execution.

  • \n
  • executor_creation_fn (Optional[Callable]) \u2013 Should accept an InitExecutorContext\nand return an instance of Executor

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Keys for the resources required by the\nexecutor.

  • \n
\n
\n
\n
\n
\nconfigured(config_or_config_fn, name=None, config_schema=None, description=None)[source]\u00b6
\n

Wraps this object in an object of the same type that provides configuration to the inner\nobject.

\n
\n
Parameters
\n
    \n
  • config_or_config_fn (Union[Any, Callable[[Any], Any]]) \u2013 Either (1) Run configuration\nthat fully satisfies this object\u2019s config schema or (2) A function that accepts run\nconfiguration and returns run configuration that fully satisfies this object\u2019s\nconfig schema. In the latter case, config_schema must be specified. When\npassing a function, it\u2019s easiest to use configured().

  • \n
  • name (Optional[str]) \u2013 Name of the new definition. If not provided, the emitted\ndefinition will inherit the name of the ExecutorDefinition upon which this\nfunction is called.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 If config_or_config_fn is a function, the config\nschema that its input must satisfy. If not set, Dagster will accept any config\nprovided.

  • \n
  • description (Optional[str]) \u2013 Description of the new definition. If not specified,\ninherits the description of the definition being configured.

  • \n
\n
\n
\n

Returns (ConfigurableDefinition): A configured version of this object.

\n
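
For example, the built-in multiprocess executor can be pre-configured this way (a sketch; max_concurrent is assumed here to be that executor's config key):

\n
from dagster import multiprocess_executor\n\n# a new ExecutorDefinition that runs at most four steps concurrently\nlimited_executor = multiprocess_executor.configured(\n    {"max_concurrent": 4}, name="limited_executor"\n)\n
\n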
\n\n
\n\n
\n
\nclass dagster.InitExecutorContext(job, executor_def, executor_config, instance)[source]\u00b6
\n

Executor-specific initialization context.

\n
\n
\njob\u00b6
\n

The job to be executed.

\n
\n
Type
\n

IPipeline

\n
\n
\n
\n\n
\n
\nexecutor_def\u00b6
\n

The definition of the executor currently being\nconstructed.

\n
\n
Type
\n

ExecutorDefinition

\n
\n
\n
\n\n
\n
\nexecutor_config\u00b6
\n

The parsed config passed to the executor.

\n
\n
Type
\n

dict

\n
\n
\n
\n\n
\n
\ninstance\u00b6
\n

The current instance.

\n
\n
Type
\n

DagsterInstance

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.Executor[source]\u00b6
\n
\n
\nabstract execute(plan_context, execution_plan)[source]\u00b6
\n

For the given context and execution plan, orchestrate a series of sub plan executions in a way that satisfies the whole plan being executed.

\n
\n
Parameters
\n
    \n
  • plan_context (PlanOrchestrationContext) \u2013 The plan\u2019s orchestration context.

  • \n
  • execution_plan (ExecutionPlan) \u2013 The plan to execute.

  • \n
\n
\n
Returns
\n

A stream of dagster events.

\n
\n
\n
\n\n
\n
\nabstract property retries\u00b6
\n

Whether retries are enabled or disabled for this instance of the executor.

\n

Executors should allow this to be controlled via configuration if possible.

\n

Returns: RetryMode

\n
\n\n
\n\n
\n
\n
\n

File Manager\u00b6

\n
\n
\nclass dagster.core.storage.file_manager.FileManager[source]\u00b6
\n

Base class for all file managers in dagster.

\n

The file manager is an interface that can be implemented by resources to provide abstract\naccess to a file system such as local disk, S3, or other cloud storage.

\n

For examples of usage, see the documentation of the concrete file manager implementations.

\n
\n
\nabstract copy_handle_to_local_temp(file_handle)[source]\u00b6
\n

Copy a file represented by a file handle to a temp file.

\n

In an implementation built around an object store such as S3, this method would be expected\nto download the file from S3 to local filesystem in a location assigned by the standard\nlibrary\u2019s tempfile module.

\n

Temp files returned by this method are not guaranteed to be reusable across solid\nboundaries. For files that must be available across solid boundaries, use the\nread(),\nread_data(),\nwrite(), and\nwrite_data() methods.

\n
\n
Parameters
\n

file_handle (FileHandle) \u2013 The handle to the file to make available as a local temp file.

\n
\n
Returns
\n

Path to the local temp file.

\n
\n
Return type
\n

str

\n
\n
\n
\n\n
\n
\nabstract delete_local_temp()[source]\u00b6
\n

Delete all local temporary files created by previous calls to\ncopy_handle_to_local_temp().

\n

Should typically only be called by framework implementors.

\n
\n\n
\n
\nabstract read(file_handle, mode='rb')[source]\u00b6
\n

Return a file-like stream for the file handle.

\n

This may incur an expensive network call for file managers backed by object stores\nsuch as S3.

\n
\n
Parameters
\n
    \n
  • file_handle (FileHandle) \u2013 The file handle to make available as a stream.

  • \n
  • mode (str) \u2013 The mode in which to open the file. Default: "rb".

  • \n
\n
\n
Returns
\n

A file-like stream.

\n
\n
Return type
\n

Union[TextIO, BinaryIO]

\n
\n
\n
\n\n
\n
\nabstract read_data(file_handle)[source]\u00b6
\n

Return the bytes for a given file handle. This may incur an expensive network\ncall for file managers backed by object stores such as S3.

\n
\n
Parameters
\n

file_handle (FileHandle) \u2013 The file handle for which to return bytes.

\n
\n
Returns
\n

Bytes for a given file handle.

\n
\n
Return type
\n

bytes

\n
\n
\n
\n\n
\n
\nabstract write(file_obj, mode='wb', ext=None)[source]\u00b6
\n

Write the bytes contained within the given file object into the file manager.

\n
\n
Parameters
\n
    \n
  • file_obj (Union[TextIO, StringIO]) \u2013 A file-like object.

  • \n
  • mode (Optional[str]) \u2013 The mode in which to write the file into the file manager.\nDefault: "wb".

  • \n
  • ext (Optional[str]) \u2013 For file managers that support file extensions, the extension with\nwhich to write the file. Default: None.

  • \n
\n
\n
Returns
\n

A handle to the newly created file.

\n
\n
Return type
\n

FileHandle

\n
\n
\n
\n\n
\n
\nabstract write_data(data, ext=None)[source]\u00b6
\n

Write raw bytes into the file manager.

\n
\n
Parameters
\n
    \n
  • data (bytes) \u2013 The bytes to write into the file manager.

  • \n
  • ext (Optional[str]) \u2013 For file managers that support file extensions, the extension with\nwhich to write the file. Default: None.

  • \n
\n
\n
Returns
\n

A handle to the newly created file.

\n
\n
Return type
\n

FileHandle

\n
\n
\n
\n\n
\n\n
\n
\ndagster.local_file_manager ResourceDefinition[source]\u00b6
\n

FileManager that provides abstract access to a local filesystem.

\n

By default, files will be stored in <local_artifact_storage>/storage/file_manager where\n<local_artifact_storage> can be configured in the dagster.yaml file in $DAGSTER_HOME.

\n

Implements the FileManager API.

\n

Examples:

\n
import tempfile\n\nfrom dagster import ModeDefinition, local_file_manager, pipeline, solid\n\n\n@solid(required_resource_keys={"file_manager"})\ndef write_files(context):\n    fh_1 = context.resources.file_manager.write_data(b"foo")\n\n    with tempfile.NamedTemporaryFile("w+") as fd:\n        fd.write("bar")\n        fd.seek(0)\n        fh_2 = context.resources.file_manager.write(fd, mode="w", ext=".txt")\n\n    return (fh_1, fh_2)\n\n\n@solid(required_resource_keys={"file_manager"})\ndef read_files(context, file_handles):\n    fh_1, fh_2 = file_handles\n    assert context.resources.file_manager.read_data(fh_2) == b"bar"\n    fd = context.resources.file_manager.read(fh_1, mode="r")\n    assert fd.read() == "foo"\n    fd.close()\n\n\n@pipeline(mode_defs=[ModeDefinition(resource_defs={"file_manager": local_file_manager})])\ndef files_pipeline():\n    read_files(write_files())\n
\n
\n

Or to specify the file directory:

\n
@pipeline(\n    mode_defs=[\n        ModeDefinition(\n            resource_defs={\n                "file_manager": local_file_manager.configured({"base_dir": "/my/base/dir"})\n            }\n        )\n    ]\n)\ndef files_pipeline():\n    read_files(write_files())\n
\n
\n
\n\n
\n
\n
\n

Instance\u00b6

\n
\n
\nclass dagster.DagsterInstance(instance_type, local_artifact_storage, run_storage, event_storage, compute_log_manager, run_coordinator, run_launcher, scheduler=None, schedule_storage=None, settings=None, ref=None)[source]\u00b6
\n

Core abstraction for managing Dagster\u2019s access to storage and other resources.

\n

Use DagsterInstance.get() to grab the current DagsterInstance which will load based on\nthe values in the dagster.yaml file in $DAGSTER_HOME.

\n

Alternatively, DagsterInstance.ephemeral() can be used, which provides a set of\ntransient in-memory components.

\n

Configuration of this class should be done by setting values in $DAGSTER_HOME/dagster.yaml.\nFor example, to use Postgres for run and event log storage, you can write a dagster.yaml\nsuch as the following:

\n
\n
dagster.yaml\u00b6
\n
run_storage:\n  module: dagster_postgres.run_storage\n  class: PostgresRunStorage\n  config:\n    postgres_db:\n      username: { username }\n      password: { password }\n      hostname: { hostname }\n      db_name: { database }\n      port: { port }\n\nevent_log_storage:\n  module: dagster_postgres.event_log\n  class: PostgresEventLogStorage\n  config:\n    postgres_db:\n      username: { username }\n      password: { password }\n      hostname: { hostname }\n      db_name: { database }\n      port: { port }\n\nschedule_storage:\n  module: dagster_postgres.schedule_storage\n  class: PostgresScheduleStorage\n  config:\n    postgres_db:\n      username: { username }\n      password: { password }\n      hostname: { hostname }\n      db_name: { database }\n      port: { port }\n
\n
\n
\n
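
A brief usage sketch (DagsterInstance.get() assumes $DAGSTER_HOME is set; names are illustrative):

\n
from dagster import DagsterInstance\n\n# load the persistent instance configured by $DAGSTER_HOME/dagster.yaml\ninstance = DagsterInstance.get()\n\n# or create a throwaway, in-memory instance, e.g. for tests\nephemeral_instance = DagsterInstance.ephemeral()\n
\n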
\n
Parameters
\n
    \n
  • instance_type (InstanceType) \u2013 Indicates whether the instance is ephemeral or persistent.\nUsers should not attempt to set this value directly or in their dagster.yaml files.

  • \n
  • local_artifact_storage (LocalArtifactStorage) \u2013 The local artifact storage is used to\nconfigure storage for any artifacts that require a local disk, such as schedules, or\nwhen using the filesystem storage to manage files and intermediates. By default,\nthis will be a dagster.core.storage.root.LocalArtifactStorage. Configurable\nin dagster.yaml using the ConfigurableClass\nmachinery.

  • \n
  • run_storage (RunStorage) \u2013 The run storage is used to store metadata about ongoing and past\npipeline runs. By default, this will be a\ndagster.core.storage.runs.SqliteRunStorage. Configurable in dagster.yaml\nusing the ConfigurableClass machinery.

  • \n
  • event_storage (EventLogStorage) \u2013 Used to store the structured event logs generated by\npipeline runs. By default, this will be a\ndagster.core.storage.event_log.SqliteEventLogStorage. Configurable in\ndagster.yaml using the ConfigurableClass machinery.

  • \n
  • compute_log_manager (ComputeLogManager) \u2013 The compute log manager handles stdout and stderr\nlogging for solid compute functions. By default, this will be a\ndagster.core.storage.local_compute_log_manager.LocalComputeLogManager.\nConfigurable in dagster.yaml using the\nConfigurableClass machinery.

  • \n
  • run_coordinator (RunCoordinator) \u2013 A runs coordinator may be used to manage the execution\nof pipeline runs.

  • \n
  • run_launcher (Optional[RunLauncher]) \u2013 Optionally, a run launcher may be used to enable\na Dagster instance to launch pipeline runs, e.g. on a remote Kubernetes cluster, in\naddition to running them locally.

  • \n
  • settings (Optional[Dict]) \u2013 Specifies certain per-instance settings,\nsuch as feature flags. These are set in the dagster.yaml under a set of whitelisted\nkeys.

  • \n
  • ref (Optional[InstanceRef]) \u2013 Used by internal machinery to pass instances across process\nboundaries.

  • \n
\n
\n
\n
\n
\nadd_daemon_heartbeat(daemon_heartbeat)[source]\u00b6
\n

Called on a regular interval by the daemon

\n
\n\n
\n
\nget_addresses_for_step_output_versions(step_output_versions)[source]\u00b6
\n

For each given step output, finds whether an output exists with the given\nversion, and returns its address if it does.

\n
\n
Parameters
\n

step_output_versions (Dict[(str, StepOutputHandle), str]) \u2013 (pipeline name, step output handle) -> version.

\n
\n
Returns
\n

\n
(pipeline name, step output handle) -> address.

For each step output, an address if there is one and None otherwise.

\n
\n
\n

\n
\n
Return type
\n

Dict[(str, StepOutputHandle), str]

\n
\n
\n
\n\n
\n
\nget_daemon_heartbeats()[source]\u00b6
\n

Latest heartbeats of all daemon types

\n
\n\n
\n
\nlaunch_run(run_id, workspace)[source]\u00b6
\n

Launch a pipeline run.

\n

This method is typically called using instance.submit_run rather than being invoked\ndirectly. This method delegates to the RunLauncher, if any, configured on the instance,\nand will call its implementation of RunLauncher.launch_run() to begin the execution of\nthe specified run. Runs should be created in the instance (e.g., by calling\nDagsterInstance.create_run()) before this method is called, and should be in the\nPipelineRunStatus.NOT_STARTED state.

\n
\n
Parameters
\n

run_id (str) \u2013 The id of the run to launch.

\n
\n
\n
\n\n
\n
\nreport_engine_event(message, pipeline_run=None, engine_event_data=None, cls=None, step_key=None, pipeline_name=None, run_id=None)[source]\u00b6
\n

Report an EngineEvent that occurred outside of a pipeline execution context.

\n
\n\n
\n
\nresume_run(run_id, workspace, attempt_number)[source]\u00b6
\n

Resume a pipeline run.

\n

This method should be called on runs which have already been launched, but whose run workers\nhave died.

\n
\n
Parameters
\n

run_id (str) \u2013 The id of the run to resume.

\n
\n
\n
\n\n
\n
\nproperty should_start_background_run_thread\u00b6
\n

Gate on an experimental feature to start a thread that monitors whether the run should be canceled.

\n
\n\n
\n
\nsubmit_run(run_id, workspace)[source]\u00b6
\n

Submit a pipeline run to the coordinator.

\n

This method delegates to the RunCoordinator, configured on the instance, and will\ncall its implementation of RunCoordinator.submit_run() to send the run to the\ncoordinator for execution. Runs should be created in the instance (e.g., by calling\nDagsterInstance.create_run()) before this method is called, and\nshould be in the PipelineRunStatus.NOT_STARTED state. They also must have a non-null\nExternalPipelineOrigin.

\n
\n
Parameters
\n

run_id (str) \u2013 The id of the run.

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.core.instance.InstanceRef(local_artifact_storage_data, run_storage_data, event_storage_data, compute_logs_data, schedule_storage_data, scheduler_data, run_coordinator_data, run_launcher_data, settings, custom_instance_class_data=None)[source]\u00b6
\n

Serializable representation of a DagsterInstance.

\n

Users should not instantiate this class directly.

\n
\n\n
\n
\nclass dagster.serdes.ConfigurableClass[source]\u00b6
\n

Abstract mixin for classes that can be loaded from config.

\n

This supports a powerful plugin pattern which avoids both a) a lengthy, hard-to-synchronize list\nof conditional imports / optional extras_requires in dagster core and b) a magic directory or\nfile in which third parties can place plugin packages. Instead, the intention is to make, e.g.,\nrun storage, pluggable with a config chunk like:

\n
run_storage:\n    module: very_cool_package.run_storage\n    class: SplendidRunStorage\n    config:\n        magic_word: "quux"\n
\n
\n

This same pattern should eventually be viable for other system components, e.g. engines.

\n

The ConfigurableClass mixin provides the necessary hooks for classes to be instantiated from\nan instance of ConfigurableClassData.

\n

Pieces of the Dagster system which we wish to make pluggable in this way should consume a config\ntype such as:

\n
{'module': str, 'class': str, 'config': Field(Permissive())}\n
\n
\n
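
A minimal sketch of a class implementing this mixin (the class name and config field are hypothetical, and a real plugin would also subclass the relevant system component, e.g. RunStorage):

\n
from dagster import Field\nfrom dagster.serdes import ConfigurableClass\n\n\nclass SplendidRunStorage(ConfigurableClass):  # hypothetical plugin class\n    def __init__(self, magic_word, inst_data=None):\n        self._magic_word = magic_word\n        self._inst_data = inst_data\n\n    @property\n    def inst_data(self):\n        return self._inst_data\n\n    @classmethod\n    def config_type(cls):\n        # a bare dict is returned here, as many built-in implementations do\n        return {"magic_word": Field(str)}\n\n    @staticmethod\n    def from_config_value(inst_data, config_value):\n        return SplendidRunStorage(inst_data=inst_data, **config_value)\n
\n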
\n
\nabstract classmethod config_type()[source]\u00b6
\n

dagster.ConfigType: The config type against which to validate a config yaml fragment\nserialized in an instance of ConfigurableClassData.

\n
\n\n
\n
\nabstract static from_config_value(inst_data, config_value)[source]\u00b6
\n

New up an instance of the ConfigurableClass from a validated config value.

\n

Called by ConfigurableClassData.rehydrate.

\n
\n
Parameters
\n

config_value (dict) \u2013 The validated config value to use. Typically this should be the\nvalue attribute of a\nEvaluateValueResult.

\n
\n
\n

A common pattern is for the implementation to align the config_value with the signature\nof the ConfigurableClass\u2019s constructor:

\n
@staticmethod\ndef from_config_value(inst_data, config_value):\n    return MyConfigurableClass(inst_data=inst_data, **config_value)\n
\n
\n
\n\n
\n
\nabstract property inst_data\u00b6
\n

Subclass must be able to return the inst_data as a property if it has been constructed\nthrough the from_config_value code path.

\n
\n\n
\n\n
\n
\nclass dagster.serdes.ConfigurableClassData(module_name, class_name, config_yaml)[source]\u00b6
\n

Serializable tuple describing where to find a class and the config fragment that should\nbe used to instantiate it.

\n

Users should not instantiate this class directly.

\n

Classes intended to be serialized in this way should implement the\ndagster.serdes.ConfigurableClass mixin.

\n
\n\n
\n
\nclass dagster.core.storage.root.LocalArtifactStorage(base_dir, inst_data=None)[source]\u00b6
\n
\n
\nclassmethod config_type()[source]\u00b6
\n

dagster.ConfigType: The config type against which to validate a config yaml fragment\nserialized in an instance of ConfigurableClassData.

\n
\n\n
\n
\nstatic from_config_value(inst_data, config_value)[source]\u00b6
\n

New up an instance of the ConfigurableClass from a validated config value.

\n

Called by ConfigurableClassData.rehydrate.

\n
\n
Parameters
\n

config_value (dict) \u2013 The validated config value to use. Typically this should be the\nvalue attribute of a\nEvaluateValueResult.

\n
\n
\n

A common pattern is for the implementation to align the config_value with the signature\nof the ConfigurableClass\u2019s constructor:

\n
@staticmethod\ndef from_config_value(inst_data, config_value):\n    return MyConfigurableClass(inst_data=inst_data, **config_value)\n
\n
\n
\n\n
\n
\nproperty inst_data\u00b6
\n

Subclass must be able to return the inst_data as a property if it has been constructed\nthrough the from_config_value code path.

\n
\n\n
\n\n
\n
\n
\n

Run storage\u00b6

\n
\n
\nclass dagster.PipelineRun(pipeline_name, run_id=None, run_config=None, mode=None, solid_selection=None, solids_to_execute=None, step_keys_to_execute=None, status=None, tags=None, root_run_id=None, parent_run_id=None, pipeline_snapshot_id=None, execution_plan_snapshot_id=None, external_pipeline_origin=None, pipeline_code_origin=None)[source]\u00b6
\n

Serializable internal representation of a pipeline run, as stored in a\nRunStorage.

\n
\n\n
\n
\nclass dagster.DagsterRunStatus(value)[source]\u00b6
\n

The status of pipeline execution.

\n
\n
\nCANCELED = 'CANCELED'\u00b6
\n
\n\n
\n
\nCANCELING = 'CANCELING'\u00b6
\n
\n\n
\n
\nFAILURE = 'FAILURE'\u00b6
\n
\n\n
\n
\nMANAGED = 'MANAGED'\u00b6
\n
\n\n
\n
\nNOT_STARTED = 'NOT_STARTED'\u00b6
\n
\n\n
\n
\nQUEUED = 'QUEUED'\u00b6
\n
\n\n
\n
\nSTARTED = 'STARTED'\u00b6
\n
\n\n
\n
\nSTARTING = 'STARTING'\u00b6
\n
\n\n
\n
\nSUCCESS = 'SUCCESS'\u00b6
\n
\n\n
\n\n
\n
\ndagster.PipelineRunStatus\u00b6
\n

alias of dagster.core.storage.pipeline_run.DagsterRunStatus

\n
\n\n
\n
\nclass dagster.core.storage.runs.RunStorage[source]\u00b6
\n

Abstract base class for storing pipeline run history.

\n

Note that run storages using SQL databases as backing stores should implement\nSqlRunStorage.

\n

Users should not directly instantiate concrete subclasses of this class; they are instantiated\nby internal machinery when dagit and dagster-graphql load, based on the values in the\ndagster.yaml file in $DAGSTER_HOME. Configuration of concrete subclasses of this class\nshould be done by setting values in that file.

\n
\n\n
\n
\nclass dagster.core.storage.runs.SqlRunStorage[source]\u00b6
\n

Base class for SQL based run storages

\n
\n\n
\n
\nclass dagster.core.storage.runs.SqliteRunStorage(conn_string, inst_data=None)[source]\u00b6
\n

SQLite-backed run storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagit and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n

This is the default run storage when none is specified in the dagster.yaml.

\n

To explicitly specify SQLite for run storage, you can add a block such as the following to your\ndagster.yaml:

\n
run_storage:\n  module: dagster.core.storage.runs\n  class: SqliteRunStorage\n  config:\n    base_dir: /path/to/dir\n
\n
\n

The base_dir param tells the run storage where on disk to store the database.

\n
\n\n

See also: dagster_postgres.PostgresRunStorage and dagster_mysql.MySQLRunStorage.

\n
\n
\n
\n

Event log storage\u00b6

\n
\n
\nclass dagster.core.storage.event_log.EventLogEntry(error_info, level, user_message, run_id, timestamp, step_key=None, pipeline_name=None, dagster_event=None, job_name=None)[source]\u00b6
\n

Entries in the event log.

\n

These entries may originate from the logging machinery (DagsterLogManager/context.log), from\nframework events (e.g. EngineEvent), or they may correspond to events yielded by user code\n(e.g. Output).

\n
\n
Parameters
\n
    \n
  • error_info (Optional[SerializableErrorInfo]) \u2013 Error info for an associated exception, if\nany, as generated by serializable_error_info_from_exc_info and friends.

  • \n
  • level (Union[str, int]) \u2013 The Python log level at which to log this event. Note that\nframework and user code events are also logged to Python logging. This value may be an\ninteger or a (case-insensitive) string member of PYTHON_LOGGING_LEVELS_NAMES.

  • \n
  • user_message (str) \u2013 For log messages, this is the user-generated message.

  • \n
  • run_id (str) \u2013 The id of the run which generated this event.

  • \n
  • timestamp (float) \u2013 The Unix timestamp of this event.

  • \n
  • step_key (Optional[str]) \u2013 The step key for the step which generated this event. Some events\nare generated outside of a step context.

  • \n
  • job_name (Optional[str]) \u2013 The job which generated this event. Some events are\ngenerated outside of a job context.

  • \n
  • dagster_event (Optional[DagsterEvent]) \u2013 For framework and user events, the associated\nstructured event.

  • \n
  • pipeline_name (Optional[str]) \u2013 (legacy) The pipeline which generated this event. Some events are\ngenerated outside of a pipeline context.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.core.storage.event_log.EventLogRecord(storage_id, event_log_entry)[source]\u00b6
\n

Internal representation of an event record, as stored in a\nEventLogStorage.

\n
\n\n
\n
\nclass dagster.core.storage.event_log.EventRecordsFilter(event_type=None, asset_key=None, asset_partitions=None, after_cursor=None, before_cursor=None, after_timestamp=None, before_timestamp=None)[source]\u00b6
\n

Defines a set of filter fields for fetching a set of event log entries or event log records.

\n
\n
Parameters
\n
    \n
  • event_type (Optional[DagsterEventType]) \u2013 Filter argument for dagster event type

  • \n
  • asset_key (Optional[AssetKey]) \u2013 Asset key for which to get asset materialization event\nentries / records.

  • \n
  • asset_partitions (Optional[List[str]]) \u2013 Filter parameter such that only asset\nmaterialization events with a partition value matching one of the provided values. Only\nvalid when the asset_key parameter is provided.

  • \n
  • after_cursor (Optional[Union[int, RunShardedEventsCursor]]) \u2013 Filter parameter such that only\nrecords with storage_id greater than the provided value are returned. Using a\nrun-sharded events cursor will result in a significant performance gain when run against\na SqliteEventLogStorage implementation (which is run-sharded)

  • \n
  • before_cursor (Optional[Union[int, RunShardedEventsCursor]]) \u2013 Filter parameter such that\nrecords with storage_id less than the provided value are returned. Using a run-sharded\nevents cursor will result in a significant performance gain when run against\na SqliteEventLogStorage implementation (which is run-sharded)

  • \n
  • after_timestamp (Optional[float]) \u2013 Filter parameter such that only event records for\nevents with timestamp greater than the provided value are returned.

  • \n
  • before_timestamp (Optional[float]) \u2013 Filter parameter such that only event records for\nevents with timestamp less than the provided value are returned.

  • \n
\n
\n
\n
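
A sketch of using such a filter to query the event log on an instance (the asset name is illustrative; assumes DagsterInstance.get_event_records accepts this filter):

\n
from dagster import AssetKey, DagsterEventType, DagsterInstance\nfrom dagster.core.storage.event_log import EventRecordsFilter\n\ninstance = DagsterInstance.get()\n\n# fetch the five most recent materialization records for a particular asset\nrecords = instance.get_event_records(\n    EventRecordsFilter(\n        event_type=DagsterEventType.ASSET_MATERIALIZATION,\n        asset_key=AssetKey("my_asset"),\n    ),\n    limit=5,\n)\n
\n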
\n\n
\n
\nclass dagster.core.storage.event_log.RunShardedEventsCursor(id, run_updated_after)[source]\u00b6
\n

Pairs an id-based event log cursor with a timestamp-based run cursor, for improved\nperformance on run-sharded event log storages (e.g. the default SqliteEventLogStorage). For\nrun-sharded storages, the id field is ignored, since they may not be unique across shards

\n
\n\n
\n
\nclass dagster.core.storage.event_log.EventLogStorage[source]\u00b6
\n

Abstract base class for storing structured event logs from pipeline runs.

\n

Note that event log storages using SQL databases as backing stores should implement\nSqlEventLogStorage.

\n

Users should not directly instantiate concrete subclasses of this class; they are instantiated\nby internal machinery when dagit and dagster-graphql load, based on the values in the\ndagster.yaml file in $DAGSTER_HOME. Configuration of concrete subclasses of this class\nshould be done by setting values in that file.

\n
\n\n
\n
\nclass dagster.core.storage.event_log.SqlEventLogStorage[source]\u00b6
\n

Base class for SQL backed event log storages.

\n

Distinguishes between run-based connections and index connections in order to support run-level\nsharding, while maintaining the ability to do cross-run queries

\n
\n\n
\n
\nclass dagster.core.storage.event_log.SqliteEventLogStorage(base_dir, inst_data=None)[source]\u00b6
\n

SQLite-backed event log storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagit and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n

This is the default event log storage when none is specified in the dagster.yaml.

\n

To explicitly specify SQLite for event log storage, you can add a block such as the following\nto your dagster.yaml:

\n
event_log_storage:\n  module: dagster.core.storage.event_log\n  class: SqliteEventLogStorage\n  config:\n    base_dir: /path/to/dir\n
\n
\n

The base_dir param tells the event log storage where on disk to store the databases. To\nimprove concurrent performance, event logs are stored in a separate SQLite database for each\nrun.

\n
\n\n
\n
\nclass dagster.core.storage.event_log.ConsolidatedSqliteEventLogStorage(base_dir, inst_data=None)[source]\u00b6
\n

SQLite-backed consolidated event log storage intended for test cases only.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagit and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n

To explicitly specify the consolidated SQLite for event log storage, you can add a block such as\nthe following to your dagster.yaml:

\n
event_log_storage:\n  module: dagster.core.storage.event_log\n  class: ConsolidatedSqliteEventLogStorage\n  config:\n    base_dir: /path/to/dir\n
\n
\n

The base_dir param tells the event log storage where on disk to store the database.

\n
\n\n

See also: dagster_postgres.PostgresEventLogStorage and dagster_mysql.MySQLEventLogStorage.

\n
\n
\n
\n

Compute log manager\u00b6

\n
\n
\nclass dagster.core.storage.compute_log_manager.ComputeLogManager[source]\u00b6
\n

Abstract base class for storing unstructured compute logs (stdout/stderr) from the compute\nsteps of pipeline solids.

\n
\n\n
\n
\nclass dagster.core.storage.local_compute_log_manager.LocalComputeLogManager(base_dir, polling_timeout=None, inst_data=None)[source]\u00b6
\n

Stores copies of stdout & stderr for each compute step locally on disk.

\n
\n\n

See also: dagster_aws.S3ComputeLogManager.

\n
\n
\n
\n

Run launcher\u00b6

\n
\n
\nclass dagster.core.launcher.RunLauncher[source]\u00b6
\n
\n\n
\n
\nclass dagster.core.launcher.DefaultRunLauncher(inst_data=None, wait_for_processes=False)[source]\u00b6
\n

Launches runs against running GRPC servers.

\n
\n\n
\n
\n
\n

Run coordinator\u00b6

\n
\n
\nclass dagster.core.run_coordinator.DefaultRunCoordinator(inst_data=None)[source]\u00b6
\n

Immediately send runs to the run launcher.

\n
\n\n
\n
\ndagster.core.run_coordinator.QueuedRunCoordinator RunCoordinator[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
max_concurrent_runs (dagster.IntSource, optional)
\n

The maximum number of runs that are allowed to be in progress at once

\n
\n
tag_concurrency_limits (Union[List[strict dict], None], optional)
\n

A set of limits that are applied to runs with particular tags. If a value is set, the limit is applied to only that key-value pair. If no value is set, the limit is applied across all values of that key. If the value is set to a dict with applyLimitPerUniqueValue: true, the limit will apply to the number of unique values for that key.

\n
\n
dequeue_interval_seconds (dagster.IntSource, optional)
\n

The interval in seconds at which the Dagster Daemon should periodically check the run queue for new runs to launch.

\n
\n
\n

Enqueues runs via the run storage, to be dequeued by the Dagster Daemon process. Requires\nthe Dagster Daemon process to be alive in order for runs to be launched.

\n
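
To enable it, a block like the following can be added to dagster.yaml (a sketch using the config keys listed above; the values are illustrative):

\n
run_coordinator:\n  module: dagster.core.run_coordinator\n  class: QueuedRunCoordinator\n  config:\n    max_concurrent_runs: 25\n    dequeue_interval_seconds: 30\n
\n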
\n\n
\n
\n
\n

Scheduling\u00b6

\n
\n
\nclass dagster.core.scheduler.Scheduler[source]\u00b6
\n

Abstract base class for a scheduler. This component is responsible for interfacing with\nan external system such as cron to ensure scheduled repeated execution according to the defined schedule.

\n
\n\n
\n
\nclass dagster.core.storage.schedules.ScheduleStorage[source]\u00b6
\n

Abstract class for managing persistence of scheduler artifacts

\n
\n\n
\n
\nclass dagster.core.storage.schedules.SqlScheduleStorage[source]\u00b6
\n

Base class for SQL backed schedule storage

\n
\n\n
\n
\nclass dagster.core.storage.schedules.SqliteScheduleStorage(conn_string, inst_data=None)[source]\u00b6
\n

Local SQLite backed schedule storage

\n
\n\n

See also: dagster_postgres.PostgresScheduleStorage and dagster_mysql.MySQLScheduleStorage.

\n
\n
\n
\n

Exception handling\u00b6

\n
\n
\ndagster.core.errors.user_code_error_boundary(error_cls, msg_fn, log_manager=None, **kwargs)[source]\u00b6
\n

Wraps the execution of user-space code in an error boundary. This places a uniform\npolicy around any user code invoked by the framework. This ensures that all user\nerrors are wrapped in an exception derived from DagsterUserCodeExecutionError,\nand that the original stack trace of the user error is preserved, so that it\ncan be reported without confusing framework code in the stack trace, if a\ntool author wishes to do so.

\n

Examples:

\n
with user_code_error_boundary(\n    # Pass a class that inherits from DagsterUserCodeExecutionError\n    DagsterExecutionStepExecutionError,\n    # Pass a function that produces a message\n    "Error occurred during step execution"\n):\n    call_user_provided_function()\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/internals", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../jobs/", "title": "Jobs"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../hooks/", "title": "Hooks"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/jobs", "Jobs", "N", "next"], ["sections/api/apidocs/hooks", "Hooks", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/internals.rst.txt", "title": "Internals", "toc": "\n"}, "io-managers": {"alabaster_version": "0.7.12", "body": "
\n

IO Managers\u00b6

\n

IO managers are user-provided objects that store op outputs and load them as inputs to downstream\nops.

\n
\n
\n@dagster.io_manager(config_schema=None, description=None, output_config_schema=None, input_config_schema=None, required_resource_keys=None, version=None)[source]\u00b6
\n

Define an IO manager.

\n

IOManagers are used to store op outputs and load them as inputs to downstream ops.

\n

The decorated function should accept an InitResourceContext and return an\nIOManager.

\n
\n
Parameters
\n
    \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the resource config. Configuration\ndata available in init_context.resource_config. If not set, Dagster will accept any\nconfig provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the resource.

  • \n
  • output_config_schema (Optional[ConfigSchema]) \u2013 The schema for per-output config. If not set,\nno per-output configuration will be allowed.

  • \n
  • input_config_schema (Optional[ConfigSchema]) \u2013 The schema for per-input config. If not set,\nDagster will accept any config provided.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Keys for the resources required by the IO\nmanager.

  • \n
  • version (Optional[str]) \u2013 (Experimental) The version of a resource function. Two wrapped\nresource functions should only have the same version if they produce the same resource\ndefinition when provided with the same inputs.

  • \n
\n
\n
\n

Examples:

\n
class MyIOManager(IOManager):\n    def handle_output(self, context, obj):\n        write_csv("some/path")\n\n    def load_input(self, context):\n        return read_csv("some/path")\n\n@io_manager\ndef my_io_manager(init_context):\n    return MyIOManager()\n\n@op(out=Out(io_manager_key="my_io_manager_key"))\ndef my_op(_):\n    return do_stuff()\n\n@job(resource_defs={"my_io_manager_key": my_io_manager})\ndef my_job():\n    my_op()\n
\n
\n
\n\n
\n
\nclass dagster.IOManager[source]\u00b6
\n

Base class for user-provided IO managers.

\n

IOManagers are used to store op outputs and load them as inputs to downstream ops.

\n

Extend this class to handle how objects are loaded and stored. Users should implement\nhandle_output to store an object and load_input to retrieve an object.

\n
\n
\nget_input_asset_key(context)[source]\u00b6
\n

User-defined method that associates inputs loaded by this IOManager with a particular\nAssetKey.

\n
\n
Parameters
\n

context (InputContext) \u2013 The input context, which describes the input that\u2019s being loaded\nand the upstream output that\u2019s being loaded from.

\n
\n
\n
\n\n
\n
\nget_input_asset_partitions(context)[source]\u00b6
\n

User-defined method that associates inputs loaded by this IOManager with a set of\npartitions of an AssetKey.

\n
\n
Parameters
\n

context (InputContext) \u2013 The input context, which describes the input that\u2019s being loaded\nand the upstream output that\u2019s being loaded from.

\n
\n
\n
\n\n
\n
\nget_output_asset_key(_context)[source]\u00b6
\n

User-defined method that associates outputs handled by this IOManager with a particular\nAssetKey.

\n
\n
Parameters
\n

context (OutputContext) \u2013 The context of the step output that produces this object.

\n
\n
\n
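
A sketch of overriding this method so that each handled output is associated with an asset key derived from the output name (the key prefix is illustrative):

\n
from dagster import AssetKey, IOManager\n\n\nclass MyIOManager(IOManager):\n    def handle_output(self, context, obj):\n        ...\n\n    def load_input(self, context):\n        ...\n\n    def get_output_asset_key(self, context):\n        # associate the handled output with an asset key based on its name\n        return AssetKey(["my_prefix", context.name])\n
\n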
\n\n
\n
\nget_output_asset_partitions(_context)[source]\u00b6
\n

User-defined method that associates outputs handled by this IOManager with a set of\npartitions of an AssetKey.

\n
\n
Parameters
\n

context (OutputContext) \u2013 The context of the step output that produces this object.

\n
\n
\n
\n\n
\n
\nabstract handle_output(context, obj)[source]\u00b6
\n

User-defined method that stores an output of an op.

\n
\n
Parameters
\n
    \n
  • context (OutputContext) \u2013 The context of the step output that produces this object.

  • \n
  • obj (Any) \u2013 The object, returned by the op, to be stored.

  • \n
\n
\n
\n
\n\n
\n
\nabstract load_input(context)[source]\u00b6
\n

User-defined method that loads an input to an op.

\n
\n
Parameters
\n

context (InputContext) \u2013 The input context, which describes the input that\u2019s being loaded\nand the upstream output that\u2019s being loaded from.

\n
\n
Returns
\n

The data object.

\n
\n
Return type
\n

Any

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.IOManagerDefinition(resource_fn=None, config_schema=None, description=None, required_resource_keys=None, version=None, input_config_schema=None, output_config_schema=None)[source]\u00b6
\n

Definition of an IO manager resource.

\n

IOManagers are used to store op outputs and load them as inputs to downstream ops.

\n

An IOManagerDefinition is a ResourceDefinition whose resource_fn returns an\nIOManager.

\n

The easiest way to create an IOManagerDefinition is with the @io_manager\ndecorator.

\n
\n
\nstatic hardcoded_io_manager(value, description=None)[source]\u00b6
\n

A helper function that creates an IOManagerDefinition with a hardcoded IOManager.

\n
\n
Parameters
\n
    \n
  • value (Any) \u2013 A hardcoded IO Manager which helps mock the definition.

  • \n
  • description ([Optional[str]]) \u2013 The description of the IO Manager. Defaults to None.

  • \n
\n
\n
Returns
\n

A hardcoded resource.

\n
\n
Return type
\n

[IOManagerDefinition]

\n
\n
\n
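
A sketch of using this helper, e.g. to stub out IO in a test (names are illustrative):

\n
from dagster import IOManager, IOManagerDefinition\n\n\nclass StubIOManager(IOManager):\n    def handle_output(self, context, obj):\n        pass\n\n    def load_input(self, context):\n        return 5\n\n\n# wrap a pre-built IOManager instance in a definition\nstubbed_io_manager = IOManagerDefinition.hardcoded_io_manager(StubIOManager())\n
\n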
\n\n
\n
\nproperty input_config_schema\u00b6
\n

The schema for per-input configuration for inputs that are managed by this\ninput manager

\n
\n\n
\n
\nproperty output_config_schema\u00b6
\n

The schema for per-output configuration for outputs that are managed by this\nmanager

\n
\n\n
\n\n
\n

Input and Output Contexts\u00b6

\n
\n
\nclass dagster.InputContext(name=None, pipeline_name=None, solid_def=None, config=None, metadata=None, upstream_output=None, dagster_type=None, log_manager=None, resource_config=None, resources=None, step_context=None, op_def=None)[source]\u00b6
\n

The context object available to the load_input method of RootInputManager.

\n
\n
\nname\u00b6
\n

The name of the input that we\u2019re loading.

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\npipeline_name\u00b6
\n

The name of the pipeline.

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nsolid_def\u00b6
\n

The definition of the solid that\u2019s loading the input.

\n
\n
Type
\n

Optional[SolidDefinition]

\n
\n
\n
\n\n
\n
\nconfig\u00b6
\n

The config attached to the input that we\u2019re loading.

\n
\n
Type
\n

Optional[Any]

\n
\n
\n
\n\n
\n
\nmetadata\u00b6
\n

A dict of metadata that is assigned to the\nInputDefinition that we\u2019re loading for.

\n
\n
Type
\n

Optional[Dict[str, Any]]

\n
\n
\n
\n\n
\n
\nupstream_output\u00b6
\n

Info about the output that produced the object\nwe\u2019re loading.

\n
\n
Type
\n

Optional[OutputContext]

\n
\n
\n
\n\n
\n
\ndagster_type\u00b6
\n

The type of this input.

\n
\n
Type
\n

Optional[DagsterType]

\n
\n
\n
\n\n
\n
\nlog\u00b6
\n

The log manager to use for this input.

\n
\n
Type
\n

Optional[DagsterLogManager]

\n
\n
\n
\n\n
\n
\nresource_config\u00b6
\n

The config associated with the resource that\ninitializes the RootInputManager.

\n
\n
Type
\n

Optional[Dict[str, Any]]

\n
\n
\n
\n\n
\n
\nresources\u00b6
\n

The resources required by the resource that initializes the\ninput manager. If using the @root_input_manager() decorator, these resources\ncorrespond to those requested with the required_resource_keys parameter.

\n
\n
Type
\n

Optional[Resources]

\n
\n
\n
\n\n
\n
\nop_def\u00b6
\n

The definition of the op that\u2019s loading the input.

\n
\n
Type
\n

Optional[OpDefinition]

\n
\n
\n
\n\n
\n
\nadd_input_metadata(metadata, description=None)[source]\u00b6
\n

Accepts a dictionary of metadata. Metadata entries will appear on the LOADED_INPUT event.\nIf the input is an asset, metadata will be attached to an asset observation.

\n

The asset observation will be yielded from the run and appear in the event log.\nOnly valid if the context has an asset key.

\n
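
For example (a sketch; read_csv is a hypothetical helper and the metadata keys are arbitrary):

\n
from dagster import IOManager\n\n\nclass MyIOManager(IOManager):\n    def load_input(self, context):\n        df = read_csv("some/path")  # hypothetical data source\n        # attach metadata to the LOADED_INPUT event (and asset observation, if any)\n        context.add_input_metadata({"num_rows": len(df)})\n        return df\n\n    def handle_output(self, context, obj):\n        ...\n
\n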
\n\n
\n
\nproperty asset_partition_key\u00b6
\n

The partition key for the input asset.

\n

Raises an error if the input asset has no partitioning, or if the run covers a partition\nrange for the input asset.

\n
\n\n
\n
\nproperty asset_partition_key_range\u00b6
\n

The partition key range for the input asset.

\n

Raises an error if the input asset has no partitioning.

\n
\n\n
\n
\nproperty asset_partitions_time_window\u00b6
\n

The time window for the partitions of the input asset.

\n

Raises an error if either of the following are true:\n- The input asset has no partitioning.\n- The input asset is not partitioned with a TimeWindowPartitionsDefinition.

\n
\n\n
\n
\nconsume_events()[source]\u00b6
\n

Pops and yields all user-generated events that have been recorded from this context.

\n

If consume_events has not yet been called, this will yield all logged events since the call to handle_input. If consume_events has been called, it will yield all events since the last time consume_events was called. Designed for internal use. Users should never need to invoke this method.

\n
\n\n
\n
\nget_observations()[source]\u00b6
\n

Retrieve the list of user-generated asset observations that were observed via the context.

\n

User-generated events that were yielded will not appear in this list.

\n

Examples:

\n
from dagster import IOManager, build_input_context, AssetObservation\n\nclass MyIOManager(IOManager):\n    def load_input(self, context):\n        ...\n\ndef test_load_input():\n    mgr = MyIOManager()\n    context = build_input_context()\n    mgr.load_input(context)\n    observations = context.get_observations()\n    ...\n
\n
\n
\n\n
\n
\nproperty has_input_name\u00b6
\n

If the InputContext is being used to load the result of a run from outside the run,\nthen it won\u2019t have an input name.

\n
\n\n
\n
\nproperty has_partition_key\u00b6
\n

Whether the current run is a partitioned run

\n
\n\n
\n
\nproperty partition_key\u00b6
\n

The partition key for the current run.

\n

Raises an error if the current run is not a partitioned run.

\n
\n\n
\n\n
\n
\nclass dagster.OutputContext(step_key=None, name=None, pipeline_name=None, run_id=None, metadata=None, mapping_key=None, config=None, solid_def=None, dagster_type=None, log_manager=None, version=None, resource_config=None, resources=None, step_context=None, op_def=None)[source]\u00b6
\n

The context object that is available to the handle_output method of an IOManager.

\n
\n
\nstep_key\u00b6
\n

The step_key for the compute step that produced the output.

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nname\u00b6
\n

The name of the output that produced the output.

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\npipeline_name\u00b6
\n

The name of the pipeline definition.

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nrun_id\u00b6
\n

The id of the run that produced the output.

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nmetadata\u00b6
\n

A dict of the metadata that is assigned to the\nOutputDefinition that produced the output.

\n
\n
Type
\n

Optional[Dict[str, Any]]

\n
\n
\n
\n\n
\n
\nmapping_key\u00b6
\n

The key that identifies a unique mapped output. None for regular outputs.

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nconfig\u00b6
\n

The configuration for the output.

\n
\n
Type
\n

Optional[Any]

\n
\n
\n
\n\n
\n
\nsolid_def\u00b6
\n

The definition of the solid that produced the output.

\n
\n
Type
\n

Optional[SolidDefinition]

\n
\n
\n
\n\n
\n
\ndagster_type\u00b6
\n

The type of this output.

\n
\n
Type
\n

Optional[DagsterType]

\n
\n
\n
\n\n
\n
\nlog\u00b6
\n

The log manager to use for this output.

\n
\n
Type
\n

Optional[DagsterLogManager]

\n
\n
\n
\n\n
\n
\nversion\u00b6
\n

(Experimental) The version of the output.

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nresource_config\u00b6
\n

The config associated with the resource that\ninitializes the RootInputManager.

\n
\n
Type
\n

Optional[Dict[str, Any]]

\n
\n
\n
\n\n
\n
\nresources\u00b6
\n

The resources required by the output manager, specified by the\nrequired_resource_keys parameter.

\n
\n
Type
\n

Optional[Resources]

\n
\n
\n
\n\n
\n
\nop_def\u00b6
\n

The definition of the op that produced the output.

\n
\n
Type
\n

Optional[OpDefinition]

\n
\n
\n
\n\n
\n
\nadd_output_metadata(metadata)[source]\u00b6
\n

Add a dictionary of metadata to the handled output.

\n

Metadata entries added will show up in the HANDLED_OUTPUT and ASSET_MATERIALIZATION events for the run.

\n
\n
Parameters
\n

metadata (Dict[str, Any]) \u2013 A metadata dictionary to log

\n
\n
\n

Examples:

\n
from dagster import IOManager\n\nclass MyIOManager(IOManager):\n    def handle_output(self, context, obj):\n        context.add_output_metadata({"foo": "bar"})\n
\n
\n
\n\n
\n
\nproperty asset_partition_key\u00b6
\n

The partition key for the output asset.

\n

Raises an error if the output asset has no partitioning, or if the run covers a partition\nrange for the output asset.

\n
\n\n
\n
\nproperty asset_partition_key_range\u00b6
\n

The partition key range for the output asset.

\n

Raises an error if the output asset has no partitioning.

\n
\n\n
\n
\nproperty asset_partitions_time_window\u00b6
\n

The time window for the partitions of the output asset.

\n

Raises an error if either of the following are true:\n- The output asset has no partitioning.\n- The output asset is not partitioned with a TimeWindowPartitionsDefinition.

\n
\n\n
\n
\nconsume_events()[source]\u00b6
\n

Pops and yields all user-generated events that have been recorded from this context.

\n

If consume_events has not yet been called, this will yield all logged events since the call to handle_output. If consume_events has been called, it will yield all events since the last time consume_events was called. Designed for internal use. Users should never need to invoke this method.

\n
\n\n
\n
\nconsume_logged_metadata_entries()[source]\u00b6
\n

Pops and yields all user-generated metadata entries that have been recorded from this context.

\n

If consume_logged_metadata_entries has not yet been called, this will yield all logged events since the call to handle_output. If consume_logged_metadata_entries has been called, it will yield all events since the last time consume_logged_metadata_entries was called. Designed for internal use. Users should never need to invoke this method.

\n
\n\n
\n
\nget_logged_events()[source]\u00b6
\n

Retrieve the list of user-generated events that were logged via the context.

\n

User-generated events that were yielded will not appear in this list.

\n

Examples:

\n
from dagster import IOManager, build_output_context, AssetMaterialization\n\nclass MyIOManager(IOManager):\n    def handle_output(self, context, obj):\n        ...\n\ndef test_handle_output():\n    mgr = MyIOManager()\n    context = build_output_context()\n    mgr.handle_output(context)\n    all_user_events = context.get_logged_events()\n    materializations = [event for event in all_user_events if isinstance(event, AssetMaterialization)]\n    ...\n
\n
\n
\n\n
\n
\nget_logged_metadata_entries()[source]\u00b6
\n

Get the list of metadata entries that have been logged for use with this output.

\n
\n\n
\n
\nget_output_identifier()[source]\u00b6
\n

Utility method to get a collection of identifiers that as a whole represent a unique\nstep output.

\n

If not using memoization, the unique identifier collection consists of

\n
    \n
  • \n
    run_id: the id of the run which generates the output.

    Note: This method also handles the re-execution memoization logic. If the step that\ngenerates the output is skipped in the re-execution, the run_id will be the id\nof its parent run.

    \n
    \n
    \n
  • \n
  • step_key: the key for a compute step.

  • \n
  • name: the name of the output. (default: \u2018result\u2019).

  • \n
\n

If using memoization, the version corresponding to the step output is used in place of\nthe run_id.

\n
\n
Returns
\n

A list of identifiers, i.e. (run_id or version), step_key, and output_name

\n
\n
Return type
\n

List[str, ..]

\n
\n
\n
\n\n
\n
\nget_run_scoped_output_identifier()[source]\u00b6
\n

Utility method to get a collection of identifiers that as a whole represent a unique\nstep output.

\n

The unique identifier collection consists of

\n
    \n
  • \n
    run_id: the id of the run which generates the output.

    Note: This method also handles the re-execution memoization logic. If the step that\ngenerates the output is skipped in the re-execution, the run_id will be the id\nof its parent run.

    \n
    \n
    \n
  • \n
  • step_key: the key for a compute step.

  • \n
  • name: the name of the output. (default: \u2018result\u2019).

  • \n
\n
\n
Returns
\n

A list of identifiers, i.e. run id, step key, and output name

\n
\n
Return type
\n

List[str, ..]

\n
\n
\n
\n\n
\n
\nproperty has_partition_key\u00b6
\n

Whether the current run is a partitioned run

\n
\n\n
\n
\nlog_event(event)[source]\u00b6
\n

Log an AssetMaterialization or AssetObservation from within the body of an io manager\u2019s handle_output method.

\n

Events logged with this method will appear in the event log.

\n
\n
Parameters
\n

event (Union[AssetMaterialization, Materialization, AssetObservation]) \u2013 The event to log.

\n
\n
\n

Examples:

\n
from dagster import IOManager, AssetMaterialization\n\nclass MyIOManager(IOManager):\n    def handle_output(self, context, obj):\n        context.log_event(AssetMaterialization("foo"))\n
\n
\n
\n\n
\n
\nproperty partition_key\u00b6
\n

The partition key for the current run.

\n

Raises an error if the current run is not a partitioned run.

\n
\n\n
\n\n
\n
\ndagster.build_input_context(name=None, config=None, metadata=None, upstream_output=None, dagster_type=None, resource_config=None, resources=None, op_def=None, step_context=None)[source]\u00b6
\n

Builds input context from provided parameters.

\n

build_input_context can be used as either a function, or a context manager. If resources\nthat are also context managers are provided, then build_input_context must be used as a\ncontext manager.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name of the input that we\u2019re loading.

  • \n
  • config (Optional[Any]) \u2013 The config attached to the input that we\u2019re loading.

  • \n
  • metadata (Optional[Dict[str, Any]]) \u2013 A dict of metadata that is assigned to the\nInputDefinition that we\u2019re loading for.

  • \n
  • upstream_output (Optional[OutputContext]) \u2013 Info about the output that produced the object\nwe\u2019re loading.

  • \n
  • dagster_type (Optional[DagsterType]) \u2013 The type of this input.

  • \n
  • resource_config (Optional[Dict[str, Any]]) \u2013 The resource config to make available from the\ninput context. This usually corresponds to the config provided to the resource that\nloads the input manager.

  • \n
  • resources (Optional[Dict[str, Any]]) \u2013 The resources to make available from the context.\nFor a given key, you can provide either an actual instance of an object, or a resource\ndefinition.

  • \n
  • asset_key (Optional[AssetKey]) \u2013 The asset key attached to the InputDefinition.

  • \n
  • op_def (Optional[OpDefinition]) \u2013 The definition of the op that\u2019s loading the input.

  • \n
  • step_context (Optional[StepExecutionContext]) \u2013 For internal use.

  • \n
\n
\n
\n

Examples

\n
build_input_context()\n\nwith build_input_context(resources={"foo": context_manager_resource}) as context:\n    do_something\n
\n
\n
\n\n
\n
\ndagster.build_output_context(step_key=None, name=None, metadata=None, run_id=None, mapping_key=None, config=None, dagster_type=None, version=None, resource_config=None, resources=None, solid_def=None, op_def=None)[source]\u00b6
\n

Builds output context from provided parameters.

\n

build_output_context can be used as either a function, or a context manager. If resources\nthat are also context managers are provided, then build_output_context must be used as a\ncontext manager.

\n
\n
Parameters
\n
    \n
  • step_key (Optional[str]) \u2013 The step_key for the compute step that produced the output.

  • \n
  • name (Optional[str]) \u2013 The name of the output that produced the output.

  • \n
  • metadata (Optional[Dict[str, Any]]) \u2013 A dict of the metadata that is assigned to the\nOutputDefinition that produced the output.

  • \n
  • mapping_key (Optional[str]) \u2013 The key that identifies a unique mapped output. None for regular outputs.

  • \n
  • config (Optional[Any]) \u2013 The configuration for the output.

  • \n
  • dagster_type (Optional[DagsterType]) \u2013 The type of this output.

  • \n
  • version (Optional[str]) \u2013 (Experimental) The version of the output.

  • \n
  • resource_config (Optional[Dict[str, Any]]) \u2013 The resource config to make available from the\ninput context. This usually corresponds to the config provided to the resource that\nloads the output manager.

  • \n
  • resources (Optional[Resources]) \u2013 The resources to make available from the context.\nFor a given key, you can provide either an actual instance of an object, or a resource\ndefinition.

  • \n
  • solid_def (Optional[SolidDefinition]) \u2013 The definition of the solid that produced the output.

  • \n
  • op_def (Optional[OpDefinition]) \u2013 The definition of the op that produced the output.

  • \n
\n
\n
\n

Examples

\n
build_output_context()\n\nwith build_output_context(resources={"foo": context_manager_resource}) as context:\n    do_something\n
\n
\n
\n\n
\n
\n

Built-in IO Managers\u00b6

\n
\n
\ndagster.mem_io_manager IOManagerDefinition[source]\u00b6
\n

Built-in IO manager that stores and retrieves values in memory.

\n
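
A sketch of attaching it to a job (useful mainly for tests run in process, since values never leave memory):

\n
from dagster import job, mem_io_manager, op\n\n\n@op\ndef emit_value():\n    return 1\n\n\n@op\ndef add_one(x):\n    return x + 1\n\n\n@job(resource_defs={"io_manager": mem_io_manager})\ndef in_memory_job():\n    add_one(emit_value())\n
\n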
\n\n
\n
\ndagster.fs_io_manager IOManagerDefinition[source]\u00b6
\n

Built-in filesystem IO manager that stores and retrieves values using pickling.

\n

Allows users to specify a base directory where all the step outputs will be stored. By\ndefault, step outputs will be stored in the directory specified by local_artifact_storage in\nyour dagster.yaml file (which will be a temporary directory if not explicitly set).

\n

Serializes and deserializes output values using pickling and automatically constructs\nthe filepaths for the assets.

\n

Example usage:

\n

1. Specify a job-level IO manager using the reserved resource key "io_manager",\nwhich will set the given IO manager on all ops in a job.

\n
from dagster import fs_io_manager, job, op\n\n@op\ndef op_a():\n    # create df ...\n    return df\n\n@op\ndef op_b(df):\n    return df[:5]\n\n@job(\n    resource_defs={\n        "io_manager": fs_io_manager.configured({"base_dir": "/my/base/path"})\n    }\n)\ndef job():\n    op_b(op_a())\n
\n
\n

2. Specify IO manager on Out, which allows the user to set different IO managers on\ndifferent step outputs.

\n
from dagster import fs_io_manager, job, op, Out\n\n@op(out=Out(io_manager_key="my_io_manager"))\ndef op_a():\n    # create df ...\n    return df\n\n@op\ndef op_b(df):\n    return df[:5]\n\n@job(resource_defs={"my_io_manager": fs_io_manager})\ndef job():\n    op_b(op_a())\n
\n
\n
\n\n
\n
\ndagster.custom_path_fs_io_manager IOManagerDefinition[source]\u00b6
\n

Built-in IO manager that allows users to customize the output file path for each output definition.

\n

It requires users to specify a base directory where all the step outputs will be stored. It\nserializes and deserializes output values (assets) using pickling and stores the pickled objects\nin the user-provided file paths.

\n

Example usage:

\n
from dagster import Out, custom_path_fs_io_manager, job, op\n\n@op(out=Out(metadata={"path": "path/to/sample_output"}))\ndef sample_data(df):\n    return df[:5]\n\nmy_custom_path_fs_io_manager = custom_path_fs_io_manager.configured(\n    {"base_dir": "path/to/basedir"}\n)\n\n@job(resource_defs={"io_manager": my_custom_path_fs_io_manager})\ndef my_job():\n    sample_data()\n
\n
\n
\n\n
\n
\n

Root Input Managers (Experimental)\u00b6

\n

Root input managers are user-provided objects that specify how to load inputs that aren\u2019t connected\nto upstream outputs.

\n
\n
\n@dagster.root_input_manager(config_schema=None, description=None, input_config_schema=None, required_resource_keys=None, version=None)[source]\u00b6
\n

Define a root input manager.

\n

Root input managers load op inputs that aren\u2019t connected to upstream outputs.

\n

The decorated function should accept an InputContext and resource config, and return\na loaded object that will be passed into one of the inputs of an op.

\n

The decorator produces a RootInputManagerDefinition.

\n
\n
Parameters
\n
    \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the resource-level config. If not\nset, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the resource.

  • \n
  • input_config_schema (Optional[ConfigSchema]) \u2013 A schema for the input-level config. Each\ninput that uses this input manager can be configured separately using this config.\nIf not set, Dagster will accept any config provided.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Keys for the resources required by the input\nmanager.

  • \n
  • version (Optional[str]) \u2013 (Experimental) the version of the input manager definition.

  • \n
\n
\n
\n

Examples:

\n
from dagster import root_input_manager, op, job, In\n\n@root_input_manager\ndef csv_loader(_):\n    return read_csv("some/path")\n\n@op(ins={"input1": In(root_manager_key="csv_loader_key")})\ndef my_op(_, input1):\n    do_stuff(input1)\n\n@job(resource_defs={"csv_loader_key": csv_loader})\ndef my_job():\n    my_op()\n\n@root_input_manager(config_schema={"base_dir": str})\ndef csv_loader(context):\n    return read_csv(context.resource_config["base_dir"] + "/some/path")\n\n@root_input_manager(input_config_schema={"path": str})\ndef csv_loader(context):\n    return read_csv(context.config["path"])\n
\n
\n
\n\n
\n
\nclass dagster.RootInputManager[source]\u00b6
\n

RootInputManagers are used to load inputs to ops at the root of a job.

\n

The easiest way to define a RootInputManager is with the\n@root_input_manager decorator.

\n
\n
\nabstract load_input(context)[source]\u00b6
\n

The user-defined read method that loads data given its metadata.

\n
\n
Parameters
\n

context (InputContext) \u2013 The context of the input that this manager loads.

\n
\n
Returns
\n

The data object.

\n
\n
Return type
\n

Any

\n
\n
\n
\n\n
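
If the decorator is not flexible enough, a rough sketch (class and variable names are hypothetical) of subclassing RootInputManager directly and wrapping it in a RootInputManagerDefinition:

\n
from dagster import RootInputManager, RootInputManagerDefinition\n\nclass ConstantRootInputManager(RootInputManager):\n    def load_input(self, context):\n        # Supply a fixed value for any unconnected op input.\n        return 42\n\nconstant_root_input_manager = RootInputManagerDefinition(\n    resource_fn=lambda _: ConstantRootInputManager(),\n    description="Loads a constant for unconnected inputs.",\n)\n
\n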
\n\n
\n
\nclass dagster.RootInputManagerDefinition(resource_fn=None, config_schema=None, description=None, input_config_schema=None, required_resource_keys=None, version=None)[source]\u00b6
\n

Definition of a root input manager resource.

\n

Root input managers load op inputs that aren\u2019t connected to upstream outputs.

\n

A RootInputManagerDefinition is a ResourceDefinition whose resource_fn returns a\nRootInputManager.

\n

The easiest way to create a RootInputManagerDefinition is with the\n@root_input_manager decorator.

\n
\n
\nproperty input_config_schema\u00b6
\n

The schema for per-input configuration for inputs that are managed by this\ninput manager.

\n
\n\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/io-managers", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../partitions/", "title": "Partitions"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../ops/", "title": "Ops"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/partitions", "Partitions", "N", "next"], ["sections/api/apidocs/ops", "Ops", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/io-managers.rst.txt", "title": "IO Managers", "toc": "\n"}, "jobs": {"alabaster_version": "0.7.12", "body": "
\n

Jobs\u00b6

\n

The replacement for pipeline / PipelineDefinition, a Job binds a Graph and the resources it needs to be executable.

\n

Jobs are created by calling GraphDefinition.to_job() on a graph instance, or using the job decorator.

\n
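
As a rough sketch (op and graph names are hypothetical), the two paths produce equivalent JobDefinitions:

\n
from dagster import graph, job, op\n\n@op\ndef do_something():\n    return 1\n\n# Path 1: the job decorator builds the job directly.\n@job\ndef do_it_all():\n    do_something()\n\n# Path 2: define a graph, then bind it into a job with to_job().\n@graph\ndef my_graph():\n    do_something()\n\nmy_graph_job = my_graph.to_job(name="my_graph_job")\n
\n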
\n
\n@dagster.job(name=None, description=None, resource_defs=None, config=None, tags=None, logger_defs=None, executor_def=None, hooks=None, op_retry_policy=None, version_strategy=None, partitions_def=None)[source]\u00b6
\n

Creates a job with the specified parameters from the decorated graph/op invocation function.

\n

Using this decorator allows you to build an executable job by writing a function that invokes\nops (or graphs).

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name for the Job. Defaults to the name of this graph.

  • \n
  • resource_defs (Optional[Dict[str, ResourceDefinition]]) \u2013 Resources that are required by this graph for execution.\nIf not defined, io_manager will default to filesystem.

  • \n
  • config \u2013

    Describes how the job is parameterized at runtime.

    \n

    If no value is provided, then the schema for the job\u2019s run config is a standard\nformat based on its ops and resources.

    \n

    If a dictionary is provided, then it must conform to the standard config schema, and\nit will be used as the job\u2019s run config for the job whenever the job is executed.\nThe values provided will be viewable and editable in the Dagit playground, so be\ncareful with secrets.

    \n

    If a ConfigMapping object is provided, then the schema for the job\u2019s run config is\ndetermined by the config mapping, which should return configuration in the standard\nformat used to configure the job.

    \n

    If a PartitionedConfig object is provided, then it defines a discrete set of config\nvalues that can parameterize the pipeline, as well as a function for mapping those\nvalues to the base config. The values provided will be viewable and editable in the\nDagit playground, so be careful with secrets.

    \n

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for any execution of the Job.\nValues that are not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value. These tag values may be overwritten by tag\nvalues provided at invocation time.

  • \n
  • logger_defs (Optional[Dict[str, LoggerDefinition]]) \u2013 A dictionary of string logger identifiers to their implementations.

  • \n
  • executor_def (Optional[ExecutorDefinition]) \u2013 How this Job will be executed. Defaults to multiprocess_executor.

  • \n
  • op_retry_policy (Optional[RetryPolicy]) \u2013 The default retry policy for all ops in this job.\nOnly used if retry policy is not defined on the op definition or op invocation.

  • \n
  • version_strategy (Optional[VersionStrategy]) \u2013 Defines how each op (and optionally, resource) in the job can be versioned. If\nprovided, memoization will be enabled for this job.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines a discrete set of partition keys\nthat can parameterize the job. If this argument is supplied, the config argument\ncan\u2019t also be supplied.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.JobDefinition(mode_def, graph_def, name=None, description=None, preset_defs=None, tags=None, hook_defs=None, op_retry_policy=None, version_strategy=None, _op_selection_data=None)[source]\u00b6
\n
\n
\nexecute_in_process(run_config=None, instance=None, partition_key=None, raise_on_error=True, op_selection=None, run_id=None)[source]\u00b6
\n

Execute the Job in-process, gathering results in-memory.

\n

The executor_def on the Job will be ignored, and replaced with the in-process executor.\nIf using the default io_manager, it will switch from filesystem to in-memory.

\n
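
A small sketch (op and job names are hypothetical) of running a job in-process and inspecting the result:

\n
from dagster import job, op\n\n@op\ndef return_five():\n    return 5\n\n@job\ndef my_job():\n    return_five()\n\n# Runs synchronously with the in-process executor on an ephemeral instance.\nresult = my_job.execute_in_process()\nassert result.success\nassert result.output_for_node("return_five") == 5\n
\n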
\n
Parameters
\n
    \n
  • run_config (Optional[Dict[str, Any]]) \u2013 The configuration for the run

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The instance to execute against, an ephemeral one will be used if none provided.

  • \n
  • partition_key (Optional[str]) \u2013 The string partition key that specifies the run config to execute. Can only be used\nto select run config for jobs with partitioned config.

  • \n
  • raise_on_error (Optional[bool]) \u2013 Whether or not to raise exceptions when they occur.\nDefaults to True.

  • \n
  • op_selection (Optional[List[str]]) \u2013 A list of op selection queries (including single op\nnames) to execute. For example:\n* ['some_op']: selects some_op itself.\n* ['*some_op']: select some_op and all its ancestors (upstream dependencies).\n* ['*some_op+++']: select some_op, all its ancestors, and its descendants\n(downstream dependencies) within 3 levels down.\n* ['*some_op', 'other_op_a', 'other_op_b+']: select some_op and all its\nancestors, other_op_a itself, and other_op_b and its direct child ops.

  • \n
\n
\n
Returns
\n

ExecuteInProcessResult

\n
\n
\n
\n\n
\n
\nwith_hooks(hook_defs)[source]\u00b6
\n

Apply a set of hooks to all op instances within the job.

\n
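
For example, a sketch (hook, op, and job names are hypothetical) of applying a success hook to every op in a job:

\n
from dagster import job, op, success_hook\n\n@success_hook\ndef notify_on_success(context):\n    context.log.info(f"op {context.op.name} succeeded")\n\n@op\ndef do_something():\n    return 1\n\n@job\ndef my_job():\n    do_something()\n\n# Returns a copy of the job with the hook attached to all op instances.\nmy_job_with_hooks = my_job.with_hooks({notify_on_success})\n
\n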
\n\n
\n\n
\n

Reconstructable jobs\u00b6

\n
\n
\nclass dagster.reconstructable(target)[source]
\n

Create a ReconstructablePipeline from a\nfunction that returns a PipelineDefinition/JobDefinition,\nor a function decorated with @pipeline/@job.

\n

When your pipeline/job must cross process boundaries, e.g., for execution on multiple nodes or\nin different systems (like dagstermill), Dagster must know how to reconstruct the pipeline/job\non the other side of the process boundary.

\n

Passing a job created with GraphDefinition.to_job() to reconstructable()\nrequires you to wrap that job\u2019s definition in a module-scoped function, and pass that function\ninstead:

\n
from dagster import graph, reconstructable\n\n@graph\ndef my_graph():\n    ...\n\ndef define_my_job():\n    return my_graph.to_job()\n\nreconstructable(define_my_job)\n
\n
\n

This function implements a very conservative strategy for reconstruction, so that its behavior\nis easy to predict, but as a consequence it is not able to reconstruct certain kinds of pipelines\nor jobs, such as those defined by lambdas, in nested scopes (e.g., dynamically within a method\ncall), or in interactive environments such as the Python REPL or Jupyter notebooks.

\n

If you need to reconstruct objects constructed in these ways, you should use\nbuild_reconstructable_job() instead, which allows you to\nspecify your own reconstruction strategy.

\n

Examples:

\n
from dagster import graph, job, reconstructable\n\n@job\ndef foo_job():\n    ...\n\nreconstructable_foo_job = reconstructable(foo_job)\n\n\n@graph\ndef foo():\n    ...\n\ndef make_bar_job():\n    return foo.to_job()\n\nreconstructable_bar_job = reconstructable(make_bar_job)\n
\n
\n
\n\n
\n
\ndagster.build_reconstructable_job(reconstructor_module_name, reconstructor_function_name, reconstructable_args=None, reconstructable_kwargs=None, reconstructor_working_directory=None)[source]\u00b6
\n

Create a dagster.core.definitions.reconstructable.ReconstructablePipeline.

\n

When your job must cross process boundaries, e.g., for execution on multiple nodes or in\ndifferent systems (like dagstermill), Dagster must know how to reconstruct the job\non the other side of the process boundary.

\n

This function allows you to use the strategy of your choice for reconstructing jobs, so\nthat you can reconstruct certain kinds of jobs that are not supported by\nreconstructable(), such as those defined by lambdas, in nested scopes (e.g.,\ndynamically within a method call), or in interactive environments such as the Python REPL or\nJupyter notebooks.

\n

If you need to reconstruct jobs constructed in these ways, use this function instead of\nreconstructable().

\n
\n
Parameters
\n
    \n
  • reconstructor_module_name (str) \u2013 The name of the module containing the function to use to\nreconstruct the job.

  • \n
  • reconstructor_function_name (str) \u2013 The name of the function to use to reconstruct the\njob.

  • \n
  • reconstructable_args (Tuple) \u2013 Args to the function to use to reconstruct the job.\nValues of the tuple must be JSON serializable.

  • \n
  • reconstructable_kwargs (Dict[str, Any]) \u2013 Kwargs to the function to use to reconstruct the\njob. Values of the dict must be JSON serializable.

  • \n
\n
\n
\n

Examples:

\n
# module: mymodule\n\nfrom dagster import JobDefinition, job, build_reconstructable_job\n\nclass JobFactory:\n    def make_job(*args, **kwargs):\n\n        @job\n        def _job(...):\n            ...\n\n        return _job\n\ndef reconstruct_job(*args):\n    factory = JobFactory()\n    return factory.make_job(*args)\n\nfactory = JobFactory()\n\nfoo_job_args = (...,...)\n\nfoo_job_kwargs = {...:...}\n\nfoo_job = factory.make_job(*foo_job_args, **foo_job_kwargs)\n\nreconstructable_foo_job = build_reconstructable_job(\n    'mymodule',\n    'reconstruct_job',\n    foo_job_args,\n    foo_job_kwargs,\n)\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/jobs", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../loggers/", "title": "Loggers"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../internals/", "title": "Internals"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/loggers", "Loggers", "N", "next"], ["sections/api/apidocs/internals", "Internals", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/jobs.rst.txt", "title": "Jobs", "toc": "\n"}, "libraries": {"dagster-airbyte": {"alabaster_version": "0.7.12", "body": "
\n

Airbyte (dagster-airbyte)\u00b6

\n

This library provides a Dagster integration with Airbyte.

\n
\n

Ops\u00b6

\n
\n
\ndagster_airbyte.airbyte_sync_op = <dagster.core.definitions.op_definition.OpDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
connection_id (String)
\n

The Airbyte Connection ID that this op will sync. You can retrieve this value from the \u201cConnections\u201d tab of a given connector in the Airbyte UI.

\n
\n
poll_interval (Float, optional)
\n

The time (in seconds) that will be waited between successive polls.

\n

Default Value: 10

\n
\n
poll_timeout (Union[Float, None], optional)
\n

The maximum time that will be waited before this operation times out. By default, this will never time out.

\n

Default Value: None

\n
\n
yield_materializations (Bool, optional)
\n

If True, materializations corresponding to the results of the Airbyte sync will be yielded when the op executes.

\n

Default Value: True

\n
\n
asset_key_prefix (List[String], optional)
\n

If provided and yield_materializations is True, these components will be used to prefix the generated asset keys.

\n

Default Value: [\u2018airbyte\u2019]

\n
\n
\n

Executes an Airbyte job sync for a given connection_id, and polls until that sync\ncompletes, raising an error if it is unsuccessful. It outputs an AirbyteOutput which contains\nthe job details for a given connection_id.

\n

It requires the use of the airbyte_resource, which allows it to\ncommunicate with the Airbyte API.

\n

Examples:

\n
from dagster import job\nfrom dagster_airbyte import airbyte_resource, airbyte_sync_op\n\nmy_airbyte_resource = airbyte_resource.configured(\n    {\n        "host": {"env": "AIRBYTE_HOST"},\n        "port": {"env": "AIRBYTE_PORT"},\n    }\n)\n\nsync_foobar = airbyte_sync_op.configured({"connection_id": "foobar"}, name="sync_foobar")\n\n@job(resource_defs={"airbyte": my_airbyte_resource})\ndef my_simple_airbyte_job():\n    sync_foobar()\n\n@job(resource_defs={"airbyte": my_airbyte_resource})\ndef my_composed_airbyte_job():\n    final_foobar_state = sync_foobar(start_after=some_op())\n    other_op(final_foobar_state)\n
\n
\n
\n\n
\n
\n

Resources\u00b6

\n
\n
\ndagster_airbyte.airbyte_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
host (dagster.StringSource)
\n

The Airbyte Server Address.

\n
\n
port (dagster.StringSource, optional)
\n

Port for the Airbyte Server.

\n
\n
use_https (Bool, optional)
\n

Whether to use HTTPS to connect to the Airbyte server.

\n

Default Value: False

\n
\n
request_max_retries (Int, optional)
\n

The maximum number of times requests to the Airbyte API should be retried before failing.

\n

Default Value: 3

\n
\n
request_retry_delay (Float, optional)
\n

Time (in seconds) to wait between each request retry.

\n

Default Value: 0.25

\n
\n
\n

This resource allows users to programmatically interface with the Airbyte REST API to launch\nsyncs and monitor their progress. This currently implements only a subset of the functionality\nexposed by the API.

\n

For a complete set of documentation on the Airbyte REST API, including expected response JSON\nschema, see the Airbyte API Docs.

\n

To configure this resource, we recommend using the configured method.

\n

Examples:

\n
from dagster import job\nfrom dagster_airbyte import airbyte_resource\n\nmy_airbyte_resource = airbyte_resource.configured(\n    {\n        "host": {"env": "AIRBYTE_HOST"},\n        "port": {"env": "AIRBYTE_PORT"},\n    }\n)\n\n@job(resource_defs={"airbyte":my_airbyte_resource})\ndef my_airbyte_job():\n    ...\n
\n
\n
\n\n
\n
\nclass dagster_airbyte.AirbyteResource(host, port, use_https, request_max_retries=3, request_retry_delay=0.25, log=<Logger dagster.builtin (DEBUG)>)[source]\u00b6
\n

This class exposes methods on top of the Airbyte REST API.

\n
\n
\nmake_request(endpoint, data)[source]\u00b6
\n

Creates and sends a request to the desired Airbyte REST API endpoint.

\n
\n
Parameters
\n
    \n
  • endpoint (str) \u2013 The Airbyte API endpoint to send this request to.

  • \n
  • data (Optional[str]) \u2013 JSON-formatted data string to be included in the request.

  • \n
\n
\n
Returns
\n

Parsed json data from the response to this request

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nsync_and_poll(connection_id, poll_interval=10, poll_timeout=None)[source]\u00b6
\n

Initializes a sync operation for the given connector, and polls until it completes.

\n
\n
Parameters
\n
    \n
  • connection_id (str) \u2013 The Airbyte Connector ID. You can retrieve this value from the\n\u201cConnections\u201d tab of a given connector in the Airbyte UI.

  • \n
  • poll_interval (float) \u2013 The time (in seconds) that will be waited between successive polls.

  • \n
  • poll_timeout (float) \u2013 The maximum time that will be waited before this operation times\nout. By default, this will never time out.

  • \n
\n
\n
Returns
\n

Details of the sync job.

\n
\n
Return type
\n

AirbyteOutput

\n
\n
\n
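
For example, a sketch (connection id, host, and port are hypothetical) of calling sync_and_poll through the airbyte resource from inside an op:

\n
from dagster import job, op\nfrom dagster_airbyte import airbyte_resource\n\n@op(required_resource_keys={"airbyte"})\ndef sync_my_connection(context):\n    # Kicks off the sync and blocks until it finishes, returning an AirbyteOutput.\n    return context.resources.airbyte.sync_and_poll("my-connection-id", poll_interval=10)\n\nmy_airbyte_resource = airbyte_resource.configured({"host": "localhost", "port": "8000"})\n\n@job(resource_defs={"airbyte": my_airbyte_resource})\ndef airbyte_sync_job():\n    sync_my_connection()\n
\n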
\n\n
\n\n
\n
\n

Assets\u00b6

\n
\n
\ndagster_airbyte.build_airbyte_assets(connection_id, destination_tables, asset_key_prefix=None)[source]\u00b6
\n

Builds a set of assets representing the tables created by an Airbyte sync operation.

\n
\n
Parameters
\n
    \n
  • connection_id (str) \u2013 The Airbyte Connection ID that this op will sync. You can retrieve this\nvalue from the \u201cConnections\u201d tab of a given connector in the Airbyte UI.

  • \n
  • destination_tables (List[str]) \u2013 The names of the tables that you want to be represented\nin the Dagster asset graph for this sync. This will generally map to the name of the\nstream in Airbyte, unless a stream prefix has been specified in Airbyte.

  • \n
  • asset_key_prefix (Optional[List[str]]) \u2013 A prefix for the asset keys inside this asset.\nIf left blank, assets will have a key of AssetKey([table_name]).

  • \n
\n
\n
\n
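
A sketch (connection id, table names, host, and port are hypothetical) of building software-defined assets for an existing Airbyte connection and grouping them with the resource they require:

\n
from dagster import AssetGroup\nfrom dagster_airbyte import airbyte_resource, build_airbyte_assets\n\nairbyte_assets = build_airbyte_assets(\n    connection_id="my-connection-id",\n    destination_tables=["users", "orders"],\n)\n\n# The generated assets need the airbyte resource in order to run the sync.\nairbyte_asset_group = AssetGroup(\n    airbyte_assets,\n    resource_defs={"airbyte": airbyte_resource.configured({"host": "localhost", "port": "8000"})},\n)\n
\n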
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-airbyte", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../dagster-airflow/", "title": "Airflow (dagster-airflow)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../../memoization/", "title": "Versioning and Memoization"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-airflow", "Airflow (dagster-airflow)", "N", "next"], ["sections/api/apidocs/memoization", "Versioning and Memoization", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-airbyte.rst.txt", "title": "Airbyte (dagster-airbyte)", "toc": "\n"}, "dagster-airflow": {"alabaster_version": "0.7.12", "body": "
\n

Airflow (dagster-airflow)\u00b6

\n
\n
\ndagster_airflow.make_airflow_dag(module_name, job_name, run_config=None, mode=None, instance=None, dag_id=None, dag_description=None, dag_kwargs=None, op_kwargs=None, pipeline_name=None)[source]\u00b6
\n

Construct an Airflow DAG corresponding to a given Dagster job/pipeline.

\n

Tasks in the resulting DAG will execute the Dagster logic they encapsulate as a Python\ncallable, run by an underlying PythonOperator. As a\nconsequence, dagster itself, any Python dependencies required by your solid logic, and the module\ncontaining your pipeline definition must all be available in the Python environment within which\nyour Airflow tasks execute. If you cannot install requirements into this environment, or you\nare looking for a containerized solution to provide better isolation, see instead\nmake_airflow_dag_containerized().

\n

This function should be invoked in an Airflow DAG definition file, such as that created by an\ninvocation of the dagster-airflow scaffold CLI tool.

\n
\n
Parameters
\n
    \n
  • module_name (str) \u2013 The name of the importable module in which the pipeline/job definition can be\nfound.

  • \n
  • job_name (str) \u2013 The name of the job definition.

  • \n
  • run_config (Optional[dict]) \u2013 The config, if any, with which to compile\nthe pipeline/job to an execution plan, as a Python dict.

  • \n
  • mode (Optional[str]) \u2013 The mode in which to execute the pipeline.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The Dagster instance to use to execute the pipeline/job.

  • \n
  • dag_id (Optional[str]) \u2013 The id to use for the compiled Airflow DAG (passed through to\nDAG).

  • \n
  • dag_description (Optional[str]) \u2013 The description to use for the compiled Airflow DAG\n(passed through to DAG)

  • \n
  • dag_kwargs (Optional[dict]) \u2013 Any additional kwargs to pass to the Airflow\nDAG constructor, including default_args.

  • \n
  • op_kwargs (Optional[dict]) \u2013 Any additional kwargs to pass to the underlying Airflow\noperator (a subclass of\nPythonOperator).

  • \n
  • pipeline_name (str) \u2013 (legacy) The name of the pipeline definition.

  • \n
\n
\n
Returns
\n

The generated Airflow DAG, and a\nlist of its constituent tasks.

\n
\n
Return type
\n

(airflow.models.DAG, List[airflow.models.BaseOperator])

\n
\n
\n
\n\n
\n
\ndagster_airflow.make_airflow_dag_for_operator(recon_repo, job_name, operator, run_config=None, mode=None, dag_id=None, dag_description=None, dag_kwargs=None, op_kwargs=None, pipeline_name=None)[source]\u00b6
\n

Construct an Airflow DAG corresponding to a given Dagster job/pipeline and custom operator.

\n

Custom operator template

\n

Tasks in the resulting DAG will execute the Dagster logic they encapsulate, run by the given\noperator (a subclass of BaseOperator). If you\nare looking for a containerized solution to provide better isolation, see instead\nmake_airflow_dag_containerized().

\n

This function should be invoked in an Airflow DAG definition file, such as that created by an\ninvocation of the dagster-airflow scaffold CLI tool.

\n
\n
Parameters
\n
    \n
  • recon_repo (dagster.ReconstructableRepository) \u2013 reference to a Dagster RepositoryDefinition\nthat can be reconstructed in another process

  • \n
  • job_name (str) \u2013 The name of the job definition.

  • \n
  • operator (type) \u2013 The operator to use. Must be a class that inherits from\nBaseOperator

  • \n
  • run_config (Optional[dict]) \u2013 The config, if any, with which to compile\nthe pipeline to an execution plan, as a Python dict.

  • \n
  • mode (Optional[str]) \u2013 The mode in which to execute the pipeline.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The Dagster instance to use to execute the pipeline.

  • \n
  • dag_id (Optional[str]) \u2013 The id to use for the compiled Airflow DAG (passed through to\nDAG).

  • \n
  • dag_description (Optional[str]) \u2013 The description to use for the compiled Airflow DAG\n(passed through to DAG)

  • \n
  • dag_kwargs (Optional[dict]) \u2013 Any additional kwargs to pass to the Airflow\nDAG constructor, including default_args.

  • \n
  • op_kwargs (Optional[dict]) \u2013 Any additional kwargs to pass to the underlying Airflow\noperator.

  • \n
  • pipeline_name (str) \u2013 (legacy) The name of the pipeline definition.

  • \n
\n
\n
Returns
\n

The generated Airflow DAG, and a\nlist of its constituent tasks.

\n
\n
Return type
\n

(airflow.models.DAG, List[airflow.models.BaseOperator])

\n
\n
\n
\n\n
\n
\ndagster_airflow.make_airflow_dag_containerized(module_name, job_name, image, run_config=None, mode=None, dag_id=None, dag_description=None, dag_kwargs=None, op_kwargs=None, pipeline_name=None)[source]\u00b6
\n

Construct a containerized Airflow DAG corresponding to a given Dagster job/pipeline.

\n

Tasks in the resulting DAG will execute the Dagster logic they encapsulate using a subclass of\nDockerOperator. As a\nconsequence, dagster itself, any Python dependencies required by your solid logic, and the module\ncontaining your pipeline definition must all be available in the container spun up by this operator.\nTypically you\u2019ll want to install these requirements onto the image you\u2019re using.

\n

This function should be invoked in an Airflow DAG definition file, such as that created by an\ninvocation of the dagster-airflow scaffold CLI tool.

\n
\n
Parameters
\n
    \n
  • module_name (str) \u2013 The name of the importable module in which the pipeline/job definition can be\nfound.

  • \n
  • job_name (str) \u2013 The name of the job definition.

  • \n
  • image (str) \u2013 The name of the Docker image to use for execution (passed through to\nDockerOperator).

  • \n
  • run_config (Optional[dict]) \u2013 The config, if any, with which to compile\nthe pipeline/job to an execution plan, as a Python dict.

  • \n
  • mode (Optional[str]) \u2013 The mode in which to execute the pipeline.

  • \n
  • dag_id (Optional[str]) \u2013 The id to use for the compiled Airflow DAG (passed through to\nDAG).

  • \n
  • dag_description (Optional[str]) \u2013 The description to use for the compiled Airflow DAG\n(passed through to DAG)

  • \n
  • dag_kwargs (Optional[dict]) \u2013 Any additional kwargs to pass to the Airflow\nDAG constructor, including default_args.

  • \n
  • op_kwargs (Optional[dict]) \u2013 Any additional kwargs to pass to the underlying Airflow\noperator (a subclass of\nDockerOperator).

  • \n
  • pipeline_name (str) \u2013 (legacy) The name of the pipeline definition.

  • \n
\n
\n
Returns
\n

The generated Airflow DAG, and a\nlist of its constituent tasks.

\n
\n
Return type
\n

(airflow.models.DAG, List[airflow.models.BaseOperator])

\n
\n
\n
\n\n
\n
\ndagster_airflow.make_dagster_job_from_airflow_dag(dag, tags=None, use_airflow_template_context=False, unique_id=None)[source]\u00b6
\n

Construct a Dagster job corresponding to a given Airflow DAG.

\n

Tasks in the resulting job will execute the execute() method on the corresponding\nAirflow Operator. Dagster, any dependencies required by Airflow Operators, and the module\ncontaining your DAG definition must be available in the Python environment within which your\nDagster solids execute.

\n

To set Airflow\u2019s execution_date for use with Airflow Operator\u2019s execute() methods,\neither:

\n
    \n
  1. (Best for ad hoc runs) Execute the job directly. This will set execution_date to the
    time (in UTC) of the run.

  2. Add {'airflow_execution_date': utc_date_string} to the job tags. This will override
    behavior from (1).

    my_dagster_job = make_dagster_job_from_airflow_dag(\n        dag=dag,\n        tags={'airflow_execution_date': utc_execution_date_str}\n)\nmy_dagster_job.execute_in_process()\n

  3. (Recommended) Add {'airflow_execution_date': utc_date_string} to the run tags,
    such as in the Dagit UI. This will override behavior from (1) and (2).
\n

We apply normalized_name() to the dag id and task ids when generating job name and op\nnames to ensure that names conform to Dagster\u2019s naming conventions.

\n
\n
Parameters
\n
    \n
  • dag (DAG) \u2013 The Airflow DAG to compile into a Dagster job

  • \n
  • tags (Dict[str, Field]) \u2013 Job tags. Optionally include\ntags={\u2018airflow_execution_date\u2019: utc_date_string} to specify execution_date used within\nexecution of Airflow Operators.

  • \n
  • use_airflow_template_context (bool) \u2013 If True, will call get_template_context() on the\nAirflow TaskInstance model which requires and modifies the DagRun table.\n(default: False)

  • \n
  • unique_id (int) \u2013 If not None, this id will be appended to generated op names. Used by\nframework authors to enforce unique op names within a repo.

  • \n
\n
\n
Returns
\n

The generated Dagster job

\n
\n
Return type
\n

JobDefinition

\n
\n
\n
\n\n
\n
\ndagster_airflow.make_dagster_repo_from_airflow_dags_path(dag_path, repo_name, safe_mode=True, store_serialized_dags=False, use_airflow_template_context=False)[source]\u00b6
\n

Construct a Dagster repository corresponding to Airflow DAGs in dag_path.

\n

DagBag.get_dag() dependency requires Airflow DB to be initialized.

\n
\n
Usage:

Create make_dagster_repo.py:

\n
from dagster_airflow.dagster_pipeline_factory import make_dagster_repo_from_airflow_dags_path\n\ndef make_repo_from_dir():\n    return make_dagster_repo_from_airflow_dags_path(\n        '/path/to/dags/', 'my_repo_name'\n    )\n
\n
\n

Use RepositoryDefinition as usual, for example:\ndagit -f path/to/make_dagster_repo.py -n make_repo_from_dir

\n
\n
\n
\n
Parameters
\n
    \n
  • dag_path (str) \u2013 Path to directory or file that contains Airflow Dags

  • \n
  • repo_name (str) \u2013 Name for generated RepositoryDefinition

  • \n
  • include_examples (bool) \u2013 True to include Airflow\u2019s example DAGs. (default: False)

  • \n
  • safe_mode (bool) \u2013 True to use Airflow\u2019s default heuristic to find files that contain DAGs\n(ie find files that contain both b\u2019DAG\u2019 and b\u2019airflow\u2019) (default: True)

  • \n
  • store_serialized_dags (bool) \u2013 True to read Airflow DAGS from Airflow DB. False to read DAGS\nfrom Python files. (default: False)

  • \n
  • use_airflow_template_context (bool) \u2013 If True, will call get_template_context() on the\nAirflow TaskInstance model which requires and modifies the DagRun table.\n(default: False)

  • \n
\n
\n
Returns
\n

RepositoryDefinition

\n
\n
\n
\n\n
\n
\ndagster_airflow.make_dagster_repo_from_airflow_dag_bag(dag_bag, repo_name, refresh_from_airflow_db=False, use_airflow_template_context=False)[source]\u00b6
\n

Construct a Dagster repository corresponding to Airflow DAGs in DagBag.

\n
\n
Usage:
\n
Create make_dagster_repo.py:

from dagster_airflow.dagster_pipeline_factory import make_dagster_repo_from_airflow_dag_bag\nfrom airflow_home import my_dag_bag\n\ndef make_repo_from_dag_bag():\n    return make_dagster_repo_from_airflow_dag_bag(my_dag_bag, 'my_repo_name')\n

\n
\n
\n
\n
Use RepositoryDefinition as usual, for example:

dagit -f path/to/make_dagster_repo.py -n make_repo_from_dag_bag

\n
\n
\n
\n
\n
\n
Parameters
\n
    \n
  • dag_bag (DagBag) \u2013 The Airflow DagBag to compile into a Dagster repository

  • \n
  • repo_name (str) \u2013 Name for generated RepositoryDefinition

  • \n
  • refresh_from_airflow_db (bool) \u2013 If True, will refresh DAG if expired via DagBag.get_dag(),\nwhich requires access to initialized Airflow DB. If False (recommended), gets dag from\nDagBag\u2019s dags dict without depending on Airflow DB. (default: False)

  • \n
  • use_airflow_template_context (bool) \u2013 If True, will call get_template_context() on the\nAirflow TaskInstance model which requires and modifies the DagRun table.\n(default: False)

  • \n
\n
\n
Returns
\n

RepositoryDefinition

\n
\n
\n
\n\n
\n
\ndagster_airflow.make_dagster_repo_from_airflow_example_dags(repo_name='airflow_example_dags_repo')[source]\u00b6
\n

Construct a Dagster repository for Airflow\u2019s example DAGs.

\n
\n
Execution of the following Airflow example DAGs is not currently supported:

\u2018example_external_task_marker_child\u2019,\n\u2018example_pig_operator\u2019,\n\u2018example_skip_dag\u2019,\n\u2018example_trigger_target_dag\u2019,\n\u2018example_xcom\u2019,\n\u2018test_utils\u2019,

\n
\n
\n

Usage:

\n
\n
\n
Create make_dagster_repo.py:

from dagster_airflow.dagster_pipeline_factory import make_dagster_repo_from_airflow_example_dags\n\ndef make_airflow_example_dags():\n    return make_dagster_repo_from_airflow_example_dags()\n

\n
\n
\n
\n
Use RepositoryDefinition as usual, for example:

dagit -f path/to/make_dagster_repo.py -n make_airflow_example_dags

\n
\n
\n
\n
\n
Parameters
\n

repo_name (str) \u2013 Name for generated RepositoryDefinition

\n
\n
Returns
\n

RepositoryDefinition

\n
\n
\n
\n\n
\n
\ndagster_airflow.make_dagster_pipeline_from_airflow_dag(dag, tags=None, use_airflow_template_context=False, unique_id=None)[source]\u00b6
\n

Construct a Dagster pipeline corresponding to a given Airflow DAG.

\n

Tasks in the resulting pipeline will execute the execute() method on the corresponding\nAirflow Operator. Dagster, any dependencies required by Airflow Operators, and the module\ncontaining your DAG definition must be available in the Python environment within which your\nDagster solids execute.

\n

To set Airflow\u2019s execution_date for use with Airflow Operator\u2019s execute() methods,\neither:

\n
    \n
  1. (Best for ad hoc runs) Run the pipeline with the \u2018default\u2019 preset, which sets execution_date to the
    time (in UTC) of pipeline invocation:

    execute_pipeline(\n    pipeline=make_dagster_pipeline_from_airflow_dag(dag=dag),\n    preset='default')\n

  2. Add {'airflow_execution_date': utc_date_string} to the PipelineDefinition tags. This will
    override behavior from (1).

    execute_pipeline(\n    make_dagster_pipeline_from_airflow_dag(\n        dag=dag,\n        tags={'airflow_execution_date': utc_execution_date_str}\n    )\n)\n

  3. (Recommended) Add {'airflow_execution_date': utc_date_string} to the PipelineRun tags,
    such as in the Dagit UI. This will override behavior from (1) and (2).
\n

We apply normalized_name() to the dag id and task ids when generating pipeline name and solid\nnames to ensure that names conform to Dagster\u2019s naming conventions.

\n
\n
Parameters
\n
    \n
  • dag (DAG) \u2013 The Airflow DAG to compile into a Dagster pipeline

  • \n
  • tags (Dict[str, Field]) \u2013 Pipeline tags. Optionally include\ntags={\u2018airflow_execution_date\u2019: utc_date_string} to specify execution_date used within\nexecution of Airflow Operators.

  • \n
  • use_airflow_template_context (bool) \u2013 If True, will call get_template_context() on the\nAirflow TaskInstance model which requires and modifies the DagRun table.\n(default: False)

  • \n
  • unique_id (int) \u2013 If not None, this id will be appended to generated solid names. Used by\nframework authors to enforce unique solid names within a repo.

  • \n
\n
\n
Returns
\n

The generated Dagster pipeline

\n
\n
Return type
\n

pipeline_def (PipelineDefinition)

\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-airflow", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "", "next": {"link": "../dagster-aws/", "title": "AWS (dagster-aws)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-airbyte/", "title": "Airbyte (dagster-airbyte)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-aws", "AWS (dagster-aws)", "N", "next"], ["sections/api/apidocs/libraries/dagster-airbyte", "Airbyte (dagster-airbyte)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-airflow.rst.txt", "title": "Airflow (dagster-airflow)", "toc": "\n"}, "dagster-aws": {"alabaster_version": "0.7.12", "body": "
\n

AWS (dagster-aws)\u00b6

\n

Utilities for interfacing with AWS from Dagster.

\n
\n

S3\u00b6

\n
\n
\nclass dagster_aws.s3.S3ComputeLogManager(bucket, local_dir=None, inst_data=None, prefix='dagster', use_ssl=True, verify=True, verify_cert_path=None, endpoint_url=None, skip_empty_files=False)[source]\u00b6
\n

Logs compute function stdout and stderr to S3.

\n

Users should not instantiate this class directly. Instead, use a YAML block in dagster.yaml\nsuch as the following:

\n
compute_logs:\n  module: dagster_aws.s3.compute_log_manager\n  class: S3ComputeLogManager\n  config:\n    bucket: "mycorp-dagster-compute-logs"\n    local_dir: "/tmp/cool"\n    prefix: "dagster-test-"\n    use_ssl: true\n    verify: true\n    verify_cert_path: "/path/to/cert/bundle.pem"\n    endpoint_url: "http://alternate-s3-host.io"\n    skip_empty_files: true\n
\n
\n
\n
Parameters
\n
    \n
  • bucket (str) \u2013 The name of the s3 bucket to which to log.

  • \n
  • local_dir (Optional[str]) \u2013 Path to the local directory in which to stage logs. Default:\ndagster.seven.get_system_temp_directory().

  • \n
  • prefix (Optional[str]) \u2013 Prefix for the log file keys.

  • \n
  • use_ssl (Optional[bool]) \u2013 Whether or not to use SSL. Default True.

  • \n
  • verify (Optional[bool]) \u2013 Whether or not to verify SSL certificates. Default True.

  • \n
  • verify_cert_path (Optional[str]) \u2013 A filename of the CA cert bundle to use. Only used if\nverify is set to False.

  • \n
  • endpoint_url (Optional[str]) \u2013 Override for the S3 endpoint url.

  • \n
  • skip_empty_files (Optional[bool]) \u2013 Skip upload of empty log files.

  • \n
  • inst_data (Optional[ConfigurableClassData]) \u2013 Serializable representation of the compute\nlog manager when newed up from config.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster_aws.s3.S3FileCache(s3_bucket, s3_key, s3_session, overwrite=False)[source]\u00b6
\n
\n\n
\n
\nclass dagster_aws.s3.S3FileHandle(s3_bucket, s3_key)[source]\u00b6
\n

A reference to a file on S3.

\n
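
A minimal sketch (bucket and key are hypothetical, and assumes s3_path renders the location as an s3:// URL):

\n
from dagster_aws.s3 import S3FileHandle\n\nhandle = S3FileHandle("my-bucket", "logs/2022-01-01/output.parquet")\n\n# The handle exposes its bucket, key, and full S3 URL.\nassert handle.s3_bucket == "my-bucket"\nassert handle.s3_path == "s3://my-bucket/logs/2022-01-01/output.parquet"\n
\n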
\n
\nproperty path_desc\u00b6
\n

The file\u2019s S3 URL.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty s3_bucket\u00b6
\n

The name of the S3 bucket.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty s3_key\u00b6
\n

The S3 key.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty s3_path\u00b6
\n

The file\u2019s S3 URL.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n\n
\n
\ndagster_aws.s3.s3_file_manager ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
use_unsigned_session (Bool, optional)
\n

Specifies whether to use an unsigned S3 session

\n

Default Value: False

\n
\n
region_name (String, optional)
\n

Specifies a custom region for the S3 session

\n
\n
endpoint_url (dagster.StringSource, optional)
\n

Specifies a custom endpoint for the S3 session

\n
\n
max_attempts (Int, optional)
\n

This provides Boto3\u2019s retry handler with a value of maximum retry attempts, where the initial call counts toward the max_attempts value that you provide

\n

Default Value: 5

\n
\n
profile_name (String, optional)
\n

Specifies a profile to use for the S3 session

\n
\n
s3_bucket (dagster.StringSource)
\n

\n
s3_prefix (dagster.StringSource, optional)
\n

Default Value: \u2018dagster\u2019

\n
\n
\n

FileManager that provides abstract access to S3.

\n

Implements the FileManager API.

\n
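
A sketch (op, bucket, and prefix names are hypothetical) of attaching the file manager to a job and writing bytes through it:

\n
from dagster import job, op\nfrom dagster_aws.s3 import s3_file_manager\n\n@op(required_resource_keys={"file_manager"})\ndef store_report(context):\n    # write_data uploads the bytes and returns an S3FileHandle for the new object.\n    return context.resources.file_manager.write_data(b"some report bytes")\n\n@job(\n    resource_defs={\n        "file_manager": s3_file_manager.configured(\n            {"s3_bucket": "my-bucket", "s3_prefix": "reports"}\n        )\n    }\n)\ndef report_job():\n    store_report()\n
\n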
\n\n
\n
\ndagster_aws.s3.s3_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
use_unsigned_session (Bool, optional)
\n

Specifies whether to use an unsigned S3 session

\n

Default Value: False

\n
\n
region_name (String, optional)
\n

Specifies a custom region for the S3 session

\n
\n
endpoint_url (dagster.StringSource, optional)
\n

Specifies a custom endpoint for the S3 session

\n
\n
max_attempts (Int, optional)
\n

This provides Boto3\u2019s retry handler with a value of maximum retry attempts, where the initial call counts toward the max_attempts value that you provide

\n

Default Value: 5

\n
\n
profile_name (String, optional)
\n

Specifies a profile to use for the S3 session

\n
\n
\n

Resource that gives access to S3.

\n

The underlying S3 session is created by calling\nboto3.session.Session(profile_name).\nThe returned resource object is an S3 client, an instance of botocore.client.S3.

\n

Example

\n
from dagster import job, op\nfrom dagster_aws.s3 import s3_resource\n\n@op(required_resource_keys={'s3'})\ndef example_s3_op(context):\n    return context.resources.s3.list_objects_v2(\n        Bucket='my-bucket',\n        Prefix='some-key'\n    )\n\n@job(resource_defs={'s3': s3_resource})\ndef example_job():\n    example_s3_op()\n\nexample_job.execute_in_process(\n    run_config={\n        'resources': {\n            's3': {\n                'config': {\n                    'region_name': 'us-west-1',\n                }\n            }\n        }\n    }\n)\n
\n
\n

Note that your ops must also declare that they require this resource with\nrequired_resource_keys, or it will not be initialized for the execution of their compute\nfunctions.

\n

You may configure this resource as follows:

\n
resources:\n  s3:\n    config:\n      region_name: "us-west-1"\n      # Optional[str]: Specifies a custom region for the S3 session. Default is chosen\n      # through the ordinary boto credential chain.\n      use_unsigned_session: false\n      # Optional[bool]: Specifies whether to use an unsigned S3 session. Default: False\n      endpoint_url: "http://localhost"\n      # Optional[str]: Specifies a custom endpoint for the S3 session. Default is None.\n      profile_name: "dev"\n      # Optional[str]: Specifies a custom profile for S3 session. Default is default\n      # profile as specified in ~/.aws/credentials file\n
\n
\n
\n\n
\n
\ndagster_aws.s3.S3Coordinate DagsterType\u00b6
\n

A dagster.DagsterType intended to make it easier to pass information about files on S3\nfrom op to op. Objects of this type should be dicts with 'bucket' and 'key' keys,\nand may be hydrated from config in the intuitive way, e.g., for an input with the name\ns3_file:

\n
inputs:\n  s3_file:\n    value:\n      bucket: my-bucket\n      key: my-key\n
\n
\n
\n\n
\n
\ndagster_aws.s3.s3_pickle_io_manager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
s3_bucket (dagster.StringSource)
\n

\n
s3_prefix (dagster.StringSource, optional)
\n

Default Value: \u2018dagster\u2019

\n
\n
\n

Persistent IO manager using S3 for storage.

\n

Serializes objects via pickling. Suitable for object storage for distributed executors, so long\nas each execution node has network connectivity and credentials for S3 and the backing bucket.

\n

Attach this resource definition to your job to make it available to your ops.

\n
@job(resource_defs={'io_manager': s3_pickle_io_manager, "s3": s3_resource, ...})\ndef my_job():\n    ...\n
\n
\n

You may configure this storage as follows:

\n
resources:\n    io_manager:\n        config:\n            s3_bucket: my-cool-bucket\n            s3_prefix: good/prefix-for-files-\n
\n
\n
\n\n
\n
\ndagster_aws.s3.s3_pickle_asset_io_manager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
s3_bucket (dagster.StringSource)
\n

\n
s3_prefix (dagster.StringSource, optional)
\n

Default Value: \u2018dagster\u2019

\n
\n
\n

Persistent IO manager using S3 for storage, meant for use with software-defined assets.

\n

Each asset is assigned to a single filesystem path, so subsequent materializations of an asset\nwill overwrite previous materializations of that asset.

\n

Serializes objects via pickling. Suitable for object storage for distributed executors, so long\nas each execution node has network connectivity and credentials for S3 and the backing bucket.

\n

Attach this resource definition to your job to make it available to your ops.

\n
asset_group = AssetGroup(\n    assets...,\n    resource_defs={'io_manager': s3_pickle_asset_io_manager, "s3": s3_resource, ...},\n)\n
\n
\n

You may configure this IO manager as follows:

\n
resources:\n    io_manager:\n        config:\n            s3_bucket: my-cool-bucket\n            s3_prefix: good/prefix-for-files-\n
\n
\n
\n\n
\n
\n

ECS\u00b6

\n
\n
\ndagster_aws.ecs.EcsRunLauncher RunLauncher[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
task_definition (dagster.StringSource, optional)
\n

The task definition to use when launching new tasks. If none is provided, each run will create its own task definition.

\n
\n
container_name (dagster.StringSource, optional)
\n

The container name to use when launching new tasks. Defaults to \u2018run\u2019.

\n

Default Value: \u2018run\u2019

\n
\n
secrets (List[Union[String, strict dict]], optional)
\n

An array of AWS Secrets Manager secrets. These secrets will be mounted as environment variables in the container. See https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_Secret.html.

\n
\n
secrets_tag (Union[dagster.StringSource, None], optional)
\n

AWS Secrets Manager secrets with this tag will be mounted as environment variables in the container. Defaults to \u2018dagster\u2019.

\n

Default Value: \u2018dagster\u2019

\n
\n
include_sidecars (Bool, optional)
\n

Whether each run should use the same sidecars as the task that launches it. Defaults to False.

\n

Default Value: False

\n
\n
\n

RunLauncher that starts a task in ECS for each Dagster job run.

\n
\n\n
\n
\n

Redshift\u00b6

\n
\n
\ndagster_aws.redshift.redshift_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
host (dagster.StringSource)
\n

Redshift host

\n
\n
port (dagster.IntSource, optional)
\n

Redshift port

\n

Default Value: 5439

\n
\n
user (dagster.StringSource, optional)
\n

Username for Redshift connection

\n
\n
password (dagster.StringSource, optional)
\n

Password for Redshift connection

\n
\n
database (dagster.StringSource, optional)
\n

Name of the default database to use. After login, you can use USE DATABASE to change the database.

\n
\n
schema (dagster.StringSource, optional)
\n

Name of the default schema to use. After login, you can use USE SCHEMA to change the schema.

\n
\n
autocommit (Bool, optional)
\n

None by default, which honors the Redshift parameter AUTOCOMMIT. Set to True or False to enable or disable autocommit mode in the session, respectively.

\n
\n
connect_timeout (Int, optional)
\n

Connection timeout in seconds. 5 seconds by default

\n

Default Value: 5

\n
\n
sslmode (String, optional)
\n

SSL mode to use. See the Redshift documentation for more information on usage: https://docs.aws.amazon.com/redshift/latest/mgmt/connecting-ssl-support.html

\n

Default Value: \u2018require\u2019

\n
\n
\n

This resource enables connecting to a Redshift cluster and issuing queries against that\ncluster.

\n

Example

\n
from dagster import build_op_context, op\nfrom dagster_aws.redshift import redshift_resource\n\n@op(required_resource_keys={'redshift'})\ndef example_redshift_op(context):\n    return context.resources.redshift.execute_query('SELECT 1', fetch_results=True)\n\nredshift_configured = redshift_resource.configured({\n    'host': 'my-redshift-cluster.us-east-1.redshift.amazonaws.com',\n    'port': 5439,\n    'user': 'dagster',\n    'password': 'dagster',\n    'database': 'dev',\n})\ncontext = build_op_context(resources={'redshift': redshift_configured})\nassert example_redshift_op(context) == [(1,)]\n
\n
\n
\n\n
\n

Testing\u00b6

\n
\n
\ndagster_aws.redshift.fake_redshift_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
host (dagster.StringSource)
\n

Redshift host

\n
\n
port (dagster.IntSource, optional)
\n

Redshift port

\n

Default Value: 5439

\n
\n
user (dagster.StringSource, optional)
\n

Username for Redshift connection

\n
\n
password (dagster.StringSource, optional)
\n

Password for Redshift connection

\n
\n
database (dagster.StringSource, optional)
\n

Name of the default database to use. After login, you can use USE DATABASE to change the database.

\n
\n
schema (dagster.StringSource, optional)
\n

Name of the default schema to use. After login, you can use USE SCHEMA to change the schema.

\n
\n
autocommit (Bool, optional)
\n

None by default, which honors the Redshift parameter AUTOCOMMIT. Set to True or False to enable or disable autocommit mode in the session, respectively.

\n
\n
connect_timeout (Int, optional)
\n

Connection timeout in seconds. 5 seconds by default

\n

Default Value: 5

\n
\n
sslmode (String, optional)
\n

SSL mode to use. See the Redshift documentation for more information on usage: https://docs.aws.amazon.com/redshift/latest/mgmt/connecting-ssl-support.html

\n

Default Value: \u2018require\u2019

\n
\n
\n
\n\n
\n
\n
\n

EMR\u00b6

\n
\n
\ndagster_aws.emr.emr_pyspark_step_launcher ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
spark_config (permissive dict, optional)
\n
\nDefault Value:
{\n    "spark": {\n        "app": {},\n        "driver": {\n            "blockManager": {}\n        },\n        "executor": {\n            "pyspark": {},\n            "logs": {\n                "rolling": {\n                    "time": {}\n                }\n            }\n        },\n        "local": {},\n        "submit": {},\n        "log": {},\n        "redaction": {},\n        "python": {\n            "profile": {},\n            "worker": {}\n        },\n        "files": {},\n        "jars": {},\n        "pyspark": {\n            "driver": {}\n        },\n        "reducer": {},\n        "shuffle": {\n            "file": {},\n            "io": {},\n            "service": {\n                "index": {\n                    "cache": {}\n                }\n            },\n            "sort": {},\n            "spill": {},\n            "registration": {}\n        },\n        "eventLog": {\n            "logBlockUpdates": {},\n            "longForm": {},\n            "buffer": {}\n        },\n        "ui": {\n            "dagGraph": {},\n            "liveUpdate": {}\n        },\n        "worker": {\n            "ui": {}\n        },\n        "sql": {\n            "ui": {}\n        },\n        "streaming": {\n            "ui": {},\n            "backpressure": {},\n            "receiver": {\n                "writeAheadLog": {}\n            },\n            "kafka": {},\n            "driver": {\n                "writeAheadLog": {}\n            }\n        },\n        "broadcast": {},\n        "io": {\n            "compression": {\n                "lz4": {},\n                "snappy": {},\n                "zstd": {}\n            }\n        },\n        "kryo": {},\n        "kryoserializer": {\n            "buffer": {}\n        },\n        "rdd": {},\n        "serializer": {},\n        "memory": {\n            "offHeap": {}\n        },\n        "storage": {\n            "replication": {}\n        },\n        "cleaner": {\n            "periodicGC": {},\n            "referenceTracking": {\n                "blocking": {}\n            }\n        },\n        "default": {},\n        "hadoop": {\n            "mapreduce": {\n                "fileoutputcommitter": {\n                    "algorithm": {}\n                }\n            }\n        },\n        "rpc": {\n            "message": {},\n            "retry": {}\n        },\n        "blockManager": {},\n        "network": {},\n        "port": {},\n        "core": {\n            "connection": {\n                "ack": {\n                    "wait": {}\n                }\n            }\n        },\n        "cores": {},\n        "locality": {\n            "wait": {}\n        },\n        "scheduler": {\n            "revive": {},\n            "listenerbus": {\n                "eventqueue": {}\n            }\n        },\n        "blacklist": {\n            "task": {},\n            "stage": {},\n            "application": {\n                "fetchFailure": {}\n            }\n        },\n        "speculation": {},\n        "task": {\n            "reaper": {}\n        },\n        "stage": {},\n        "dynamicAllocation": {},\n        "r": {\n            "driver": {},\n            "shell": {}\n        },\n        "graphx": {\n            "pregel": {}\n        },\n        "deploy": {\n            "zookeeper": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
spark (permissive dict, optional)
\n
\nDefault Value:
{\n    "app": {},\n    "driver": {\n        "blockManager": {}\n    },\n    "executor": {\n        "pyspark": {},\n        "logs": {\n            "rolling": {\n                "time": {}\n            }\n        }\n    },\n    "local": {},\n    "submit": {},\n    "log": {},\n    "redaction": {},\n    "python": {\n        "profile": {},\n        "worker": {}\n    },\n    "files": {},\n    "jars": {},\n    "pyspark": {\n        "driver": {}\n    },\n    "reducer": {},\n    "shuffle": {\n        "file": {},\n        "io": {},\n        "service": {\n            "index": {\n                "cache": {}\n            }\n        },\n        "sort": {},\n        "spill": {},\n        "registration": {}\n    },\n    "eventLog": {\n        "logBlockUpdates": {},\n        "longForm": {},\n        "buffer": {}\n    },\n    "ui": {\n        "dagGraph": {},\n        "liveUpdate": {}\n    },\n    "worker": {\n        "ui": {}\n    },\n    "sql": {\n        "ui": {}\n    },\n    "streaming": {\n        "ui": {},\n        "backpressure": {},\n        "receiver": {\n            "writeAheadLog": {}\n        },\n        "kafka": {},\n        "driver": {\n            "writeAheadLog": {}\n        }\n    },\n    "broadcast": {},\n    "io": {\n        "compression": {\n            "lz4": {},\n            "snappy": {},\n            "zstd": {}\n        }\n    },\n    "kryo": {},\n    "kryoserializer": {\n        "buffer": {}\n    },\n    "rdd": {},\n    "serializer": {},\n    "memory": {\n        "offHeap": {}\n    },\n    "storage": {\n        "replication": {}\n    },\n    "cleaner": {\n        "periodicGC": {},\n        "referenceTracking": {\n            "blocking": {}\n        }\n    },\n    "default": {},\n    "hadoop": {\n        "mapreduce": {\n            "fileoutputcommitter": {\n                "algorithm": {}\n            }\n        }\n    },\n    "rpc": {\n        "message": {},\n        "retry": {}\n    },\n    "blockManager": {},\n    "network": {},\n    "port": {},\n    "core": {\n        "connection": {\n            "ack": {\n                "wait": {}\n            }\n        }\n    },\n    "cores": {},\n    "locality": {\n        "wait": {}\n    },\n    "scheduler": {\n        "revive": {},\n        "listenerbus": {\n            "eventqueue": {}\n        }\n    },\n    "blacklist": {\n        "task": {},\n        "stage": {},\n        "application": {\n            "fetchFailure": {}\n        }\n    },\n    "speculation": {},\n    "task": {\n        "reaper": {}\n    },\n    "stage": {},\n    "dynamicAllocation": {},\n    "r": {\n        "driver": {},\n        "shell": {}\n    },\n    "graphx": {\n        "pregel": {}\n    },\n    "deploy": {\n        "zookeeper": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
app (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
name (dagster.StringSource, optional)
\n

Application Properties: The name of your application. This will appear in the UI and in log data.

\n
\n
\n
\n
driver (permissive dict, optional)
\n
\nDefault Value:
{\n    "blockManager": {}\n}\n
\n
\n
\nConfig Schema:
\n
cores (dagster.IntSource, optional)
\n

Application Properties: Number of cores to use for the driver process, only in cluster mode.

\n
\n
maxResultSize (dagster.StringSource, optional)
\n

Application Properties: Limit of total size of serialized results of all partitions for each Spark action (e.g. collect) in bytes. Should be at least 1M, or 0 for unlimited. Jobs will be aborted if the total size is above this limit. Having a high limit may cause out-of-memory errors in driver (depends on spark.driver.memory and memory overhead of objects in JVM). Setting a proper limit can protect the driver from out-of-memory errors.

\n
\n
memory (dagster.StringSource, optional)
\n

Application Properties: Amount of memory to use for the driver process, i.e. where SparkContext is initialized, in the same format as JVM memory strings with a size unit suffix (\u201ck\u201d, \u201cm\u201d, \u201cg\u201d or \u201ct\u201d) (e.g. 512m, 2g). Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the \u2013driver-memory command line option or in your default properties file.

\n
\n
memoryOverhead (dagster.StringSource, optional)
\n

Application Properties: The amount of off-heap memory to be allocated per driver in cluster mode, in MiB unless otherwise specified. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc. This tends to grow with the container size (typically 6-10%). This option is currently supported on YARN and Kubernetes.

\n
\n
supervise (Bool, optional)
\n

Application Properties: If true, restarts the driver automatically if it fails with a non-zero exit status. Only has effect in Spark standalone mode or Mesos cluster deploy mode.

\n
\n
extraClassPath (dagster.StringSource, optional)
\n

Runtime Environment: Extra classpath entries to prepend to the classpath of the driver. Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the \u2013driver-class-path command line option or in your default properties file.

\n
\n
extraJavaOptions (dagster.StringSource, optional)
\n

Runtime Environment: A string of extra JVM options to pass to the driver. For instance, GC settings or other logging. Note that it is illegal to set maximum heap size (-Xmx) settings with this option. Maximum heap size settings can be set with spark.driver.memory in the cluster mode and through the \u2013driver-memory command line option in the client mode. Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the \u2013driver-java-options command line option or in your default properties file.

\n
\n
extraLibraryPath (dagster.StringSource, optional)
\n

Runtime Environment: Set a special library path to use when launching the driver JVM. Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the \u2013driver-library-path command line option or in your default properties file.

\n
\n
userClassPathFirst (Bool, optional)
\n

Runtime Environment: (Experimental) Whether to give user-added jars precedence over Spark\u2019s own jars when loading classes in the driver. This feature can be used to mitigate conflicts between Spark\u2019s dependencies and user dependencies. It is currently an experimental feature. This is used in cluster mode only.

\n
\n
blockManager (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
port (dagster.StringSource, optional)
\n

Networking: Driver-specific port for the block manager to listen on, for cases where it cannot use the same configuration as executors.

\n
\n
\n
\n
bindAddress (dagster.StringSource, optional)
\n

Networking: Hostname or IP address where to bind listening sockets. This config overrides the SPARK_LOCAL_IP environment variable (see below). It also allows a different address from the local one to be advertised to executors or external systems. This is useful, for example, when running containers with bridged networking. For this to properly work, the different ports used by the driver (RPC, block manager and UI) need to be forwarded from the container\u2019s host.

\n
\n
host (dagster.StringSource, optional)
\n

Networking: Hostname or IP address for the driver. This is used for communicating with the executors and the standalone Master.

\n
\n
port (dagster.StringSource, optional)
\n

Networking: Port for the driver to listen on. This is used for communicating with the executors and the standalone Master.

\n
\n
\n
\n
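Assuming the nested layout sketched above, the driver settings documented above would be written roughly as follows (values are illustrative only; as the descriptions note, in client mode spark.driver.memory must be set via the --driver-memory command line option rather than through SparkConf):

    driver_conf = {
        "driver": {
            "memory": "4g",                     # spark.driver.memory (cluster mode)
            "maxResultSize": "2g",              # spark.driver.maxResultSize
            "blockManager": {"port": "7078"},   # spark.driver.blockManager.port
        }
    }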
executor (permissive dict, optional)
\n
\nDefault Value:
{\n    "pyspark": {},\n    "logs": {\n        "rolling": {\n            "time": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
memory (dagster.StringSource, optional)
\n

Application Properties: Amount of memory to use per executor process, in the same format as JVM memory strings with a size unit suffix (\u201ck\u201d, \u201cm\u201d, \u201cg\u201d or \u201ct\u201d) (e.g. 512m, 2g).

\n
\n
pyspark (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
memory (dagster.StringSource, optional)
\n

Application Properties: The amount of memory to be allocated to PySpark in each executor, in MiB unless otherwise specified. If set, PySpark memory for an executor will be limited to this amount. If not set, Spark will not limit Python\u2019s memory use and it is up to the application to avoid exceeding the overhead memory space shared with other non-JVM processes. When PySpark is run in YARN or Kubernetes, this memory is added to executor resource requests.

\n
\n
\n
\n
memoryOverhead (dagster.StringSource, optional)
\n

Application Properties: The amount of off-heap memory to be allocated per executor, in MiB unless otherwise specified. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc. This tends to grow with the executor size (typically 6-10%). This option is currently supported on YARN and Kubernetes.

\n
\n
extraClassPath (dagster.StringSource, optional)
\n

Runtime Environment: Extra classpath entries to prepend to the classpath of executors. This exists primarily for backwards-compatibility with older versions of Spark. Users typically should not need to set this option.

\n
\n
extraJavaOptions (dagster.StringSource, optional)
\n

Runtime Environment: A string of extra JVM options to pass to executors. For instance, GC settings or other logging. Note that it is illegal to set Spark properties or maximum heap size (-Xmx) settings with this option. Spark properties should be set using a SparkConf object or the spark-defaults.conf file used with the spark-submit script. Maximum heap size settings can be set with spark.executor.memory. The following symbols, if present, will be interpolated: {{APP_ID}} will be replaced by application ID and {{EXECUTOR_ID}} will be replaced by executor ID. For example, to enable verbose gc logging to a file named for the executor ID of the app in /tmp, pass a \u2018value\u2019 of: -verbose:gc -Xloggc:/tmp/{{APP_ID}}-{{EXECUTOR_ID}}.gc

\n
\n
extraLibraryPath (dagster.StringSource, optional)
\n

Runtime Environment: Set a special library path to use when launching executor JVM\u2019s.

\n
\n
logs (permissive dict, optional)
\n
\nDefault Value:
{\n    "rolling": {\n        "time": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
rolling (permissive dict, optional)
\n
\nDefault Value:
{\n    "time": {}\n}\n
\n
\n
\nConfig Schema:
\n
maxRetainedFiles (dagster.IntSource, optional)
\n

Runtime Environment: Sets the number of latest rolling log files that are going to be retained by the system. Older log files will be deleted. Disabled by default.

\n
\n
enableCompression (Bool, optional)
\n

Runtime Environment: Enable executor log compression. If it is enabled, the rolled executor logs will be compressed. Disabled by default.

\n
\n
maxSize (dagster.IntSource, optional)
\n

Runtime Environment: Set the max size of the file in bytes by which the executor logs will be rolled over. Rolling is disabled by default. See spark.executor.logs.rolling.maxRetainedFiles for automatic cleaning of old logs.

\n
\n
strategy (dagster.StringSource, optional)
\n

Runtime Environment: Set the strategy of rolling of executor logs. By default it is disabled. It can be set to \u201ctime\u201d (time-based rolling) or \u201csize\u201d (size-based rolling). For \u201ctime\u201d, use spark.executor.logs.rolling.time.interval to set the rolling interval. For \u201csize\u201d, use spark.executor.logs.rolling.maxSize to set the maximum file size for rolling.

\n
\n
time (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
interval (dagster.StringSource, optional)
\n

Runtime Environment: Set the time interval by which the executor logs will be rolled over. Rolling is disabled by default. Valid values are daily, hourly, minutely or any interval in seconds. See spark.executor.logs.rolling.maxRetainedFiles for automatic cleaning of old logs.

\n
\n
\n
\n
\n
\n
\n
\n
userClassPathFirst (Bool, optional)
\n

Runtime Environment: (Experimental) Same functionality as spark.driver.userClassPathFirst, but applied to executor instances.

\n
\n
cores (dagster.IntSource, optional)
\n

Execution Behavior: The number of cores to use on each executor. For more detail on standalone and Mesos coarse-grained modes, see this description.

\n
\n
heartbeatInterval (dagster.StringSource, optional)
\n

Execution Behavior: Interval between each executor\u2019s heartbeats to the driver. Heartbeats let the driver know that the executor is still alive and update it with metrics for in-progress tasks. spark.executor.heartbeatInterval should be significantly less than spark.network.timeout

\n
\n
\n
\n
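As a hedged sketch of how the executor options above combine, including the nested log-rolling block (values are illustrative, not defaults):

    executor_conf = {
        "executor": {
            "memory": "8g",                         # spark.executor.memory
            "logs": {
                "rolling": {
                    "strategy": "time",             # roll logs by time
                    "maxRetainedFiles": 5,          # keep only the last 5 rolled files
                    "time": {"interval": "hourly"}  # spark.executor.logs.rolling.time.interval
                }
            }
        }
    }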
extraListeners (dagster.StringSource, optional)
\n

Application Properties: A comma-separated list of classes that implement SparkListener; when initializing SparkContext, instances of these classes will be created and registered with Spark\u2019s listener bus. If a class has a single-argument constructor that accepts a SparkConf, that constructor will be called; otherwise, a zero-argument constructor will be called. If no valid constructor can be found, the SparkContext creation will fail with an exception.

\n
\n
local (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
dir (dagster.StringSource, optional)
\n

Application Properties: Directory to use for \u201cscratch\u201d space in Spark, including map output files and RDDs that get stored on disk. This should be on a fast, local disk in your system. It can also be a comma-separated list of multiple directories on different disks. NOTE: In Spark 1.0 and later this will be overridden by SPARK_LOCAL_DIRS (Standalone), MESOS_SANDBOX (Mesos) or LOCAL_DIRS (YARN) environment variables set by the cluster manager.

\n
\n
\n
\n
logConf (Bool, optional)
\n

Application Properties: Logs the effective SparkConf as INFO when a SparkContext is started.

\n
\n
master (dagster.StringSource, optional)
\n

Application Properties: The cluster manager to connect to. See the list of allowed master URLs.

\n
\n
submit (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
deployMode (dagster.StringSource, optional)
\n

Application Properties: The deploy mode of the Spark driver program, either \u201cclient\u201d or \u201ccluster\u201d, which means to launch the driver program locally (\u201cclient\u201d) or remotely (\u201ccluster\u201d) on one of the nodes inside the cluster.

\n
\n
pyFiles (dagster.StringSource, optional)
\n

Runtime Environment: Comma-separated list of .zip, .egg, or .py files to place on the PYTHONPATH for Python apps. Globs are allowed.

\n
\n
\n
\n
log (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
callerContext (dagster.StringSource, optional)
\n

Application Properties: Application information that will be written into Yarn RM log/HDFS audit log when running on Yarn/HDFS. Its length depends on the Hadoop configuration hadoop.caller.context.max.size. It should be concise, and typically can have up to 50 characters.

\n
\n
\n
\n
redaction (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
regex (dagster.StringSource, optional)
\n

Runtime Environment: Regex to decide which Spark configuration properties and environment variables in driver and executor environments contain sensitive information. When this regex matches a property key or value, the value is redacted from the environment UI and various logs like YARN and event logs.

\n
\n
\n
\n
python (permissive dict, optional)
\n
\nDefault Value:
{\n    "profile": {},\n    "worker": {}\n}\n
\n
\n
\nConfig Schema:
\n
profile (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (Bool, optional)
\n

Runtime Environment: Enable profiling in the Python worker; the profile result will show up via sc.show_profiles(), or it will be displayed before the driver exits. It can also be dumped to disk via sc.dump_profiles(path). If some of the profile results have been displayed manually, they will not be displayed automatically before the driver exits. By default the pyspark.profiler.BasicProfiler will be used, but this can be overridden by passing a profiler class in as a parameter to the SparkContext constructor.

\n
\n
dump (dagster.StringSource, optional)
\n

Runtime Environment: The directory used to dump the profile result before the driver exits. The results will be dumped as a separate file for each RDD. They can be loaded by pstats.Stats(). If this is specified, the profile result will not be displayed automatically.

\n
\n
\n
\n
worker (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
memory (dagster.StringSource, optional)
\n

Runtime Environment: Amount of memory to use per python worker process during aggregation, in the same format as JVM memory strings with a size unit suffix (\u201ck\u201d, \u201cm\u201d, \u201cg\u201d or \u201ct\u201d) (e.g. 512m, 2g). If the memory used during aggregation goes above this amount, it will spill the data into disks.

\n
\n
reuse (Bool, optional)
\n

Runtime Environment: Whether to reuse Python workers. If yes, Spark will use a fixed number of Python workers and does not need to fork() a Python process for every task. This is very useful when there is a large broadcast, since the broadcast will not need to be transferred from the JVM to a Python worker for every task.

\n
\n
\n
\n
\n
\n
files (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional)
\n

Runtime Environment: Comma-separated list of files to be placed in the working directory of each executor. Globs are allowed.

\n
\n
fetchTimeout (dagster.StringSource, optional)
\n

Execution Behavior: Communication timeout to use when fetching files added through SparkContext.addFile() from the driver.

\n
\n
useFetchCache (Bool, optional)
\n

Execution Behavior: If set to true (default), file fetching will use a local cache that is shared by executors that belong to the same application, which can improve task launching performance when running many executors on the same host. If set to false, these caching optimizations will be disabled and all executors will fetch their own copies of files. This optimization may be disabled in order to use Spark local directories that reside on NFS filesystems (see SPARK-6313 for more details).

\n
\n
overwrite (Bool, optional)
\n

Execution Behavior: Whether to overwrite files added through SparkContext.addFile() when the target file exists and its contents do not match those of the source.

\n
\n
maxPartitionBytes (dagster.IntSource, optional)
\n

Execution Behavior: The maximum number of bytes to pack into a single partition when reading files.

\n
\n
openCostInBytes (dagster.IntSource, optional)
\n

Execution Behavior: The estimated cost to open a file, measured by the number of bytes that could be scanned at the same time. This is used when putting multiple files into a partition. It is better to overestimate; then the partitions with small files will be faster than partitions with bigger files.

\n
\n
\n
\n
jars (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional)
\n

Runtime Environment: Comma-separated list of jars to include on the driver and executor classpaths. Globs are allowed.

\n
\n
packages (dagster.StringSource, optional)
\n

Runtime Environment: Comma-separated list of Maven coordinates of jars to include on the driver and executor classpaths. The coordinates should be groupId:artifactId:version. If spark.jars.ivySettings is given artifacts will be resolved according to the configuration in the file, otherwise artifacts will be searched for in the local maven repo, then maven central and finally any additional remote repositories given by the command-line option \u2013repositories. For more details, see Advanced Dependency Management.

\n
\n
excludes (dagster.StringSource, optional)
\n

Runtime Environment: Comma-separated list of groupId:artifactId, to exclude while resolving the dependencies provided in spark.jars.packages to avoid dependency conflicts.

\n
\n
ivy (dagster.StringSource, optional)
\n

Runtime Environment: Path to specify the Ivy user directory, used for the local Ivy cache and package files from spark.jars.packages. This will override the Ivy property ivy.default.ivy.user.dir which defaults to ~/.ivy2.

\n
\n
ivySettings (dagster.StringSource, optional)
\n

Runtime Environment: Path to an Ivy settings file to customize resolution of jars specified using spark.jars.packages instead of the built-in defaults, such as maven central. Additional repositories given by the command-line option \u2013repositories or spark.jars.repositories will also be included. Useful for allowing Spark to resolve artifacts from behind a firewall e.g. via an in-house artifact server like Artifactory. Details on the settings file format can be found at http://ant.apache.org/ivy/history/latest-milestone/settings.html

\n
\n
repositories (dagster.StringSource, optional)
\n

Runtime Environment: Comma-separated list of additional remote repositories to search for the maven coordinates given with \u2013packages or spark.jars.packages.

\n
\n
\n
\n
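For example, the Maven-coordinate fields above take plain comma-separated strings; a hedged illustration (the coordinates are arbitrary examples, not recommendations):

    jars_conf = {
        "jars": {
            "packages": "org.apache.spark:spark-avro_2.12:2.4.8",  # groupId:artifactId:version
            "excludes": "org.slf4j:slf4j-log4j12"                  # exclude a conflicting dependency
        }
    }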
pyspark (permissive dict, optional)
\n
\nDefault Value:
{\n    "driver": {}\n}\n
\n
\n
\nConfig Schema:
\n
driver (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
python (dagster.StringSource, optional)
\n

Runtime Environment: Python binary executable to use for PySpark in driver. (default is spark.pyspark.python)

\n
\n
\n
\n
python (dagster.StringSource, optional)
\n

Runtime Environment: Python binary executable to use for PySpark in both driver and executors.

\n
\n
\n
\n
reducer (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxSizeInFlight (dagster.StringSource, optional)
\n

Shuffle Behavior: Maximum size of map outputs to fetch simultaneously from each reduce task, in MiB unless otherwise specified. Since each output requires us to create a buffer to receive it, this represents a fixed memory overhead per reduce task, so keep it small unless you have a large amount of memory.

\n
\n
maxReqsInFlight (dagster.IntSource, optional)
\n

Shuffle Behavior: This configuration limits the number of remote requests to fetch blocks at any given point. When the number of hosts in the cluster increases, it might lead to a very large number of inbound connections to one or more nodes, causing the workers to fail under load. By limiting the number of fetch requests, this scenario can be mitigated.

\n
\n
maxBlocksInFlightPerAddress (dagster.IntSource, optional)
\n

Shuffle Behavior: This configuration limits the number of remote blocks being fetched per reduce task from a given host port. When a large number of blocks are being requested from a given address in a single fetch or simultaneously, this could crash the serving executor or Node Manager. This is especially useful to reduce the load on the Node Manager when external shuffle is enabled. You can mitigate this issue by setting it to a lower value.

\n
\n
\n
\n
maxRemoteBlockSizeFetchToMem (dagster.IntSource, optional)
\n

Shuffle Behavior: The remote block will be fetched to disk when the size of the block is above this threshold in bytes. This is to avoid a giant request that takes too much memory. By default, this is only enabled for blocks > 2GB, as those cannot be fetched directly into memory, no matter what resources are available. But it can be turned down to a much lower value (e.g. 200m) to avoid using too much memory on smaller blocks as well. Note this configuration will affect both shuffle fetch and block manager remote block fetch. For users who have enabled the external shuffle service, this feature can only be used when the external shuffle service is newer than Spark 2.2.

\n
\n
shuffle (permissive dict, optional)
\n
\nDefault Value:
{\n    "file": {},\n    "io": {},\n    "service": {\n        "index": {\n            "cache": {}\n        }\n    },\n    "sort": {},\n    "spill": {},\n    "registration": {}\n}\n
\n
\n
\nConfig Schema:
\n
compress (Bool, optional)
\n

Shuffle Behavior: Whether to compress map output files. Generally a good idea. Compression will use spark.io.compression.codec.

\n
\n
file (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
buffer (dagster.StringSource, optional)
\n

Shuffle Behavior: Size of the in-memory buffer for each shuffle file output stream, in KiB unless otherwise specified. These buffers reduce the number of disk seeks and system calls made in creating intermediate shuffle files.

\n
\n
\n
\n
io (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxRetries (dagster.IntSource, optional)
\n

Shuffle Behavior: (Netty only) Fetches that fail due to IO-related exceptions are automatically retried if this is set to a non-zero value. This retry logic helps stabilize large shuffles in the face of long GC pauses or transient network connectivity issues.

\n
\n
numConnectionsPerPeer (dagster.IntSource, optional)
\n

Shuffle Behavior: (Netty only) Connections between hosts are reused in order to reduce connection buildup for large clusters. For clusters with many hard disks and few hosts, this may result in insufficient concurrency to saturate all disks, and so users may consider increasing this value.

\n
\n
preferDirectBufs (Bool, optional)
\n

Shuffle Behavior: (Netty only) Off-heap buffers are used to reduce garbage collection during shuffle and cache block transfer. For environments where off-heap memory is tightly limited, users may wish to turn this off to force all allocations from Netty to be on-heap.

\n
\n
retryWait (dagster.StringSource, optional)
\n

Shuffle Behavior: (Netty only) How long to wait between retries of fetches. The maximum delay caused by retrying is 15 seconds by default, calculated as maxRetries * retryWait.

\n
\n
\n
\n
service (permissive dict, optional)
\n
\nDefault Value:
{\n    "index": {\n        "cache": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
enabled (Bool, optional)
\n

Shuffle Behavior: Enables the external shuffle service. This service preserves the shuffle files written by executors so the executors can be safely removed. This must be enabled if spark.dynamicAllocation.enabled is \u201ctrue\u201d. The external shuffle service must be set up in order to enable it. See dynamic allocation configuration and setup documentation for more information.

\n
\n
port (dagster.IntSource, optional)
\n

Shuffle Behavior: Port on which the external shuffle service will run.

\n
\n
index (permissive dict, optional)
\n
\nDefault Value:
{\n    "cache": {}\n}\n
\n
\n
\nConfig Schema:
\n
cache (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
size (dagster.StringSource, optional)
\n

Shuffle Behavior: Cache entries limited to the specified memory footprint in bytes.

\n
\n
\n
\n
\n
\n
\n
\n
maxChunksBeingTransferred (dagster.IntSource, optional)
\n

Shuffle Behavior: The max number of chunks allowed to be transferred at the same time on shuffle service. Note that new incoming connections will be closed when the max number is hit. The client will retry according to the shuffle retry configs (see spark.shuffle.io.maxRetries and spark.shuffle.io.retryWait), if those limits are reached the task will fail with fetch failure.

\n
\n
sort (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
bypassMergeThreshold (dagster.IntSource, optional)
\n

Shuffle Behavior: (Advanced) In the sort-based shuffle manager, avoid merge-sorting data if there is no map-side aggregation and there are at most this many reduce partitions.

\n
\n
\n
\n
spill (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
compress (Bool, optional)
\n

Shuffle Behavior: Whether to compress data spilled during shuffles. Compression will use spark.io.compression.codec.

\n
\n
\n
\n
accurateBlockThreshold (dagster.IntSource, optional)
\n

Shuffle Behavior: Threshold in bytes above which the size of shuffle blocks in HighlyCompressedMapStatus is accurately recorded. This helps to prevent OOM by avoiding underestimating shuffle block size when fetching shuffle blocks.

\n
\n
registration (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
timeout (dagster.IntSource, optional)
\n

Shuffle Behavior: Timeout in milliseconds for registration to the external shuffle service.

\n
\n
maxAttempts (dagster.IntSource, optional)
\n

Shuffle Behavior: When we fail to register to the external shuffle service, we will retry up to maxAttempts times.

\n
\n
\n
\n
memoryFraction (Float, optional)
\n

Memory Management: (deprecated) This is read only if spark.memory.useLegacyMode is enabled. Fraction of Java heap to use for aggregation and cogroups during shuffles. At any given time, the collective size of all in-memory maps used for shuffles is bounded by this limit, beyond which the contents will begin to spill to disk. If spills happen often, consider increasing this value at the expense of spark.storage.memoryFraction.

\n
\n
\n
\n
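Tying the shuffle options above together, a hedged example that enables the external shuffle service (required when spark.dynamicAllocation.enabled is true) and tunes fetch retries might look like:

    shuffle_conf = {
        "shuffle": {
            "compress": True,                              # spark.shuffle.compress
            "service": {"enabled": True, "port": 7337},    # external shuffle service
            "io": {"maxRetries": 6, "retryWait": "10s"}    # retry transient fetch failures
        }
    }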
eventLog (permissive dict, optional)
\n
\nDefault Value:
{\n    "logBlockUpdates": {},\n    "longForm": {},\n    "buffer": {}\n}\n
\n
\n
\nConfig Schema:
\n
logBlockUpdates (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional)
\n

Spark UI: Whether to log events for every block update, if spark.eventLog.enabled is true. *Warning*: This will increase the size of the event log considerably.

\n
\n
\n
\n
longForm (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional)
\n

Spark UI: If true, use the long form of call sites in the event log. Otherwise use the short form.

\n
\n
\n
\n
compress (dagster.StringSource, optional)
\n

Spark UI: Whether to compress logged events, if spark.eventLog.enabled is true. Compression will use spark.io.compression.codec.

\n
\n
dir (dagster.StringSource, optional)
\n

Spark UI: Base directory in which Spark events are logged, if spark.eventLog.enabled is true. Within this base directory, Spark creates a sub-directory for each application, and logs the events specific to the application in this directory. Users may want to set this to a unified location like an HDFS directory so history files can be read by the history server.

\n
\n
enabled (dagster.StringSource, optional)
\n

Spark UI: Whether to log Spark events, useful for reconstructing the Web UI after the application has finished.

\n
\n
overwrite (dagster.StringSource, optional)
\n

Spark UI: Whether to overwrite any existing files.

\n
\n
buffer (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
kb (dagster.StringSource, optional)
\n

Spark UI: Buffer size to use when writing to output streams, in KiB unless otherwise specified.

\n
\n
\n
\n
\n
\n
ui (permissive dict, optional)
\n
\nDefault Value:
{\n    "dagGraph": {},\n    "liveUpdate": {}\n}\n
\n
\n
\nConfig Schema:
\n
dagGraph (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedRootRDDs (dagster.StringSource, optional)
\n

Spark UI: How many DAG graph nodes the Spark UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
enabled (dagster.StringSource, optional)
\n

Spark UI: Whether to run the web UI for the Spark application.

\n
\n
killEnabled (dagster.StringSource, optional)
\n

Spark UI: Allows jobs and stages to be killed from the web UI.

\n
\n
liveUpdate (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
period (dagster.StringSource, optional)
\n

Spark UI: How often to update live entities. -1 means \u201cnever update\u201d when replaying applications, meaning only the last write will happen. For live applications, this avoids a few operations that we can live without when rapidly processing incoming task events.

\n
\n
\n
\n
port (dagster.StringSource, optional)
\n

Spark UI: Port for your application\u2019s dashboard, which shows memory and workload data.

\n
\n
retainedJobs (dagster.StringSource, optional)
\n

Spark UI: How many jobs the Spark UI and status APIs remember before garbage collecting. This is a target maximum, and fewer elements may be retained in some circumstances.

\n
\n
retainedStages (dagster.StringSource, optional)
\n

Spark UI: How many stages the Spark UI and status APIs remember before garbage collecting. This is a target maximum, and fewer elements may be retained in some circumstances.

\n
\n
retainedTasks (dagster.StringSource, optional)
\n

Spark UI: How many tasks the Spark UI and status APIs remember before garbage collecting. This is a target maximum, and fewer elements may be retained in some circumstances.

\n
\n
reverseProxy (dagster.StringSource, optional)
\n

Spark UI: Enable running Spark Master as reverse proxy for worker and application UIs. In this mode, Spark master will reverse proxy the worker and application UIs to enable access without requiring direct access to their hosts. Use it with caution, as the worker and application UIs will not be accessible directly; you will only be able to access them through the Spark master/proxy public URL. This setting affects all the workers and application UIs running in the cluster and must be set on all the workers, drivers and masters.

\n
\n
reverseProxyUrl (dagster.StringSource, optional)
\n

Spark UI: This is the URL where your proxy is running. This URL is for the proxy that is running in front of the Spark Master. This is useful when running a proxy for authentication, e.g. an OAuth proxy. Make sure this is a complete URL including scheme (http/https) and port to reach your proxy.

\n
\n
showConsoleProgress (dagster.StringSource, optional)
\n

Spark UI: Show the progress bar in the console. The progress bar shows the progress of stages that run for longer than 500ms. If multiple stages run at the same time, multiple progress bars will be displayed on the same line.

\n
\n
retainedDeadExecutors (dagster.StringSource, optional)
\n

Spark UI: How many dead executors the Spark UI and status APIs remember before garbage collecting.

\n
\n
filters (dagster.StringSource, optional)
\n

Spark UI: Comma-separated list of filter class names to apply to the Spark Web UI. The filter should be a standard javax servlet Filter. Filter parameters can also be specified in the configuration, by setting config entries of the form spark.<class name of filter>.param.<param name>=<value> For example: spark.ui.filters=com.test.filter1 spark.com.test.filter1.param.name1=foo spark.com.test.filter1.param.name2=bar

\n
\n
\n
\n
worker (permissive dict, optional)
\n
\nDefault Value:
{\n    "ui": {}\n}\n
\n
\n
\nConfig Schema:
\n
ui (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedExecutors (dagster.StringSource, optional)
\n

Spark UI: How many finished executors the Spark UI and status APIs remember before garbage collecting.

\n
\n
retainedDrivers (dagster.StringSource, optional)
\n

Spark UI: How many finished drivers the Spark UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
\n
\n
sql (permissive dict, optional)
\n
\nDefault Value:
{\n    "ui": {}\n}\n
\n
\n
\nConfig Schema:
\n
ui (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedExecutions (dagster.StringSource, optional)
\n

Spark UI: How many finished executions the Spark UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
\n
\n
streaming (permissive dict, optional)
\n
\nDefault Value:
{\n    "ui": {},\n    "backpressure": {},\n    "receiver": {\n        "writeAheadLog": {}\n    },\n    "kafka": {},\n    "driver": {\n        "writeAheadLog": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
ui (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedBatches (dagster.StringSource, optional)
\n

Spark Streaming: How many batches the Spark Streaming UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
backpressure (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional)
\n

Spark Streaming: Enables or disables Spark Streaming\u2019s internal backpressure mechanism (since 1.5). This enables Spark Streaming to control the receiving rate based on the current batch scheduling delays and processing times so that the system receives only as fast as the system can process. Internally, this dynamically sets the maximum receiving rate of receivers. This rate is upper bounded by the values spark.streaming.receiver.maxRate and spark.streaming.kafka.maxRatePerPartition if they are set (see below).

\n
\n
initialRate (dagster.StringSource, optional)
\n

Spark Streaming: This is the initial maximum receiving rate at which each receiver will receive data for the first batch when the backpressure mechanism is enabled.

\n
\n
\n
\n
blockInterval (dagster.StringSource, optional)
\n

Spark Streaming: Interval at which data received by Spark Streaming receivers is chunked into blocks of data before storing them in Spark. Minimum recommended - 50 ms. See the performance tuning section in the Spark Streaming programming guide for more details.

\n
\n
receiver (permissive dict, optional)
\n
\nDefault Value:
{\n    "writeAheadLog": {}\n}\n
\n
\n
\nConfig Schema:
\n
maxRate (dagster.StringSource, optional)
\n

Spark Streaming: Maximum rate (number of records per second) at which each receiver will receive data. Effectively, each stream will consume at most this number of records per second. Setting this configuration to 0 or a negative number will put no limit on the rate. See the deployment guide in the Spark Streaming programming guide for more details.

\n
\n
writeAheadLog (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enable (dagster.StringSource, optional)
\n

Spark Streaming: Enable write-ahead logs for receivers. All the input data received through receivers will be saved to write-ahead logs that will allow it to be recovered after driver failures. See the deployment guide in the Spark Streaming programming guide for more details.

\n
\n
closeFileAfterWrite (dagster.StringSource, optional)
\n

Spark Streaming: Whether to close the file after writing a write-ahead log record on the receivers. Set this to \u2018true\u2019 when you want to use S3 (or any file system that does not support flushing) for the data WAL on the receivers.

\n
\n
\n
\n
\n
\n
unpersist (dagster.StringSource, optional)
\n

Spark Streaming: Force RDDs generated and persisted by Spark Streaming to be automatically unpersisted from Spark\u2019s memory. The raw input data received by Spark Streaming is also automatically cleared. Setting this to false will allow the raw data and persisted RDDs to be accessible outside the streaming application as they will not be cleared automatically. But it comes at the cost of higher memory usage in Spark.

\n
\n
stopGracefullyOnShutdown (dagster.StringSource, optional)
\n

Spark Streaming: If true, Spark shuts down the StreamingContext gracefully on JVM shutdown rather than immediately.

\n
\n
kafka (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxRatePerPartition (dagster.StringSource, optional)
\n

Spark Streaming: Maximum rate (number of records per second) at which data will be read from each Kafka partition when using the new Kafka direct stream API. See the Kafka Integration guide for more details.

\n
\n
minRatePerPartition (dagster.StringSource, optional)
\n

Spark Streaming: Minimum rate (number of records per second) at which data will be read from each Kafka partition when using the new Kafka direct stream API.

\n
\n
maxRetries (dagster.StringSource, optional)
\n

Spark Streaming: Maximum number of consecutive retries the driver will make in order to find the latest offsets on the leader of each partition (a default value of 1 means that the driver will make a maximum of 2 attempts). Only applies to the new Kafka direct stream API.

\n
\n
\n
\n
driver (permissive dict, optional)
\n
\nDefault Value:
{\n    "writeAheadLog": {}\n}\n
\n
\n
\nConfig Schema:
\n
writeAheadLog (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
closeFileAfterWrite (dagster.StringSource, optional)
\n

Spark Streaming: Whether to close the file after writing a write-ahead log record on the driver. Set this to \u2018true\u2019 when you want to use S3 (or any file system that does not support flushing) for the metadata WAL on the driver.

\n
\n
\n
\n
\n
\n
\n
\n
broadcast (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
compress (dagster.StringSource, optional)
\n

Compression and Serialization: Whether to compress broadcast variables before sending them. Generally a good idea. Compression will use spark.io.compression.codec.

\n
\n
blockSize (dagster.StringSource, optional)
\n

Execution Behavior: Size of each piece of a block for TorrentBroadcastFactory, in KiB unless otherwise specified. Too large a value decreases parallelism during broadcast (makes it slower); however, if it is too small, BlockManager might take a performance hit.

\n
\n
checksum (dagster.StringSource, optional)
\n

Execution Behavior: Whether to enable checksum for broadcast. If enabled, broadcasts will include a checksum, which can help detect corrupted blocks, at the cost of computing and sending a little more data. It\u2019s possible to disable it if the network has other mechanisms to guarantee data won\u2019t be corrupted during broadcast.

\n
\n
\n
\n
io (permissive dict, optional)
\n
\nDefault Value:
{\n    "compression": {\n        "lz4": {},\n        "snappy": {},\n        "zstd": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
compression (permissive dict, optional)
\n
\nDefault Value:
{\n    "lz4": {},\n    "snappy": {},\n    "zstd": {}\n}\n
\n
\n
\nConfig Schema:
\n
codec (dagster.StringSource, optional)
\n

Compression and Serialization: The codec used to compress internal data such as RDD partitions, event log, broadcast variables and shuffle outputs. By default, Spark provides four codecs: lz4, lzf, snappy, and zstd. You can also use fully qualified class names to specify the codec, e.g. org.apache.spark.io.LZ4CompressionCodec, org.apache.spark.io.LZFCompressionCodec, org.apache.spark.io.SnappyCompressionCodec, and org.apache.spark.io.ZStdCompressionCodec.

\n
\n
lz4 (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
blockSize (dagster.StringSource, optional)
\n

Compression and Serialization: Block size in bytes used in LZ4 compression, in the case when LZ4 compression codec is used. Lowering this block size will also lower shuffle memory usage when LZ4 is used.

\n
\n
\n
\n
snappy (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
blockSize (dagster.StringSource, optional)
\n

Compression and Serialization: Block size in bytes used in Snappy compression, in the case when Snappy compression codec is used. Lowering this block size will also lower shuffle memory usage when Snappy is used.

\n
\n
\n
\n
zstd (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
level (dagster.StringSource, optional)
\n

Compression and Serialization: Compression level for Zstd compression codec. Increasing the compression level will result in better compression at the expense of more CPU and memory.

\n
\n
bufferSize (dagster.StringSource, optional)
\n

Compression and Serialization: Buffer size in bytes used in Zstd compression, in the case when Zstd compression codec is used. Lowering this size will lower the shuffle memory usage when Zstd is used, but it might increase the compression cost because of excessive JNI call overhead.

\n
\n
\n
\n
\n
\n
\n
\n
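A brief, hedged illustration of the compression settings above (codec and buffer values are arbitrary; the codec names come from the description):

    io_conf = {
        "io": {
            "compression": {
                "codec": "zstd",                             # or "lz4", "lzf", "snappy"
                "zstd": {"level": "3", "bufferSize": "32k"}  # only read when zstd is the codec
            }
        }
    }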
kryo (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
classesToRegister (dagster.StringSource, optional)
\n

Compression and Serialization: If you use Kryo serialization, give a comma-separated list of custom class names to register with Kryo. See the tuning guide for more details.

\n
\n
referenceTracking (dagster.StringSource, optional)
\n

Compression and Serialization: Whether to track references to the same object when serializing data with Kryo, which is necessary if your object graphs have loops and useful for efficiency if they contain multiple copies of the same object. Can be disabled to improve performance if you know this is not the case.

\n
\n
registrationRequired (dagster.StringSource, optional)
\n

Compression and Serialization: Whether to require registration with Kryo. If set to \u2018true\u2019, Kryo will throw an exception if an unregistered class is serialized. If set to false (the default), Kryo will write unregistered class names along with each object. Writing class names can cause significant performance overhead, so enabling this option can enforce strictly that a user has not omitted classes from registration.

\n
\n
registrator (dagster.StringSource, optional)
\n

Compression and Serialization: If you use Kryo serialization, give a comma-separated list of classes that register your custom classes with Kryo. This property is useful if you need to register your classes in a custom way, e.g. to specify a custom field serializer. Otherwise spark.kryo.classesToRegister is simpler. It should be set to classes that extend KryoRegistrator. See the tuning guide for more details.

\n
\n
unsafe (dagster.StringSource, optional)
\n

Compression and Serialization: Whether to use unsafe based Kryo serializer. Can be substantially faster by using Unsafe Based IO.

\n
\n
\n
\n
kryoserializer (permissive dict, optional)
\n
\nDefault Value:
{\n    "buffer": {}\n}\n
\n
\n
\nConfig Schema:
\n
buffer (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional)
\n

Compression and Serialization: Initial size of Kryo\u2019s serialization buffer, in KiB unless otherwise specified. Note that there will be one buffer per core on each worker. This buffer will grow up to spark.kryoserializer.buffer.max if needed.

\n
\n
max (dagster.StringSource, optional)
\n

Compression and Serialization: Maximum allowable size of Kryo serialization buffer, in MiB unless otherwise specified. This must be larger than any object you attempt to serialize and must be less than 2048m. Increase this if you get a \u201cbuffer limit exceeded\u201d exception inside Kryo.

\n
\n
\n
\n
\n
\n
rdd (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
compress (dagster.StringSource, optional)
\n

Compression and Serialization: Whether to compress serialized RDD partitions (e.g. for StorageLevel.MEMORY_ONLY_SER in Java and Scala or StorageLevel.MEMORY_ONLY in Python). Can save substantial space at the cost of some extra CPU time. Compression will use spark.io.compression.codec.

\n
\n
\n
\n
serializer (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional)
\n

Compression and Serialization: Class to use for serializing objects that will be sent over the network or need to be cached in serialized form. The default of Java serialization works with any Serializable Java object but is quite slow, so we recommend using org.apache.spark.serializer.KryoSerializer and configuring Kryo serialization when speed is necessary. Can be any subclass of org.apache.spark.Serializer.

\n
\n
objectStreamReset (dagster.StringSource, optional)
\n

Compression and Serialization: When serializing using org.apache.spark.serializer.JavaSerializer, the serializer caches objects to prevent writing redundant data, however that stops garbage collection of those objects. By calling \u2018reset\u2019 you flush that info from the serializer, and allow old objects to be collected. To turn off this periodic reset set it to -1. By default it will reset the serializer every 100 objects.

\n
\n
\n
\n
memory (permissive dict, optional)
\n
\nDefault Value:
{\n    "offHeap": {}\n}\n
\n
\n
\nConfig Schema:
\n
fraction (Float, optional)
\n

Memory Management: Fraction of (heap space - 300MB) used for execution and storage. The lower this is, the more frequently spills and cached data eviction occur. The purpose of this config is to set aside memory for internal metadata, user data structures, and imprecise size estimation in the case of sparse, unusually large records. Leaving this at the default value is recommended. For more detail, including important information about correctly tuning JVM garbage collection when increasing this value, see this description.

\n
\n
storageFraction (Float, optional)
\n

Memory Management: Amount of storage memory immune to eviction, expressed as a fraction of the size of the region set aside by spark.memory.fraction. The higher this is, the less working memory may be available to execution and tasks may spill to disk more often. Leaving this at the default value is recommended. For more detail, see this description.

\n
\n
offHeap (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (Bool, optional)
\n

Memory Management: If true, Spark will attempt to use off-heap memory for certain operations. If off-heap memory use is enabled, then spark.memory.offHeap.size must be positive.

\n
\n
size (dagster.IntSource, optional)
\n

Memory Management: The absolute amount of memory in bytes which can be used for off-heap allocation. This setting has no impact on heap memory usage, so if your executors\u2019 total memory consumption must fit within some hard limit then be sure to shrink your JVM heap size accordingly. This must be set to a positive value when spark.memory.offHeap.enabled=true.

\n
\n
\n
\n
useLegacyMode (Bool, optional)
\n

Memory Management: Whether to enable the legacy memory management mode used in Spark 1.5 and before. The legacy mode rigidly partitions the heap space into fixed-size regions, potentially leading to excessive spilling if the application was not tuned. The following deprecated memory fraction configurations are not read unless this is enabled: spark.shuffle.memoryFraction, spark.storage.memoryFraction, spark.storage.unrollFraction

\n
\n
\n
\n
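Sketching the memory options above (numbers are illustrative; per the description, spark.memory.offHeap.size must be positive whenever offHeap.enabled is true):

    memory_conf = {
        "memory": {
            "fraction": 0.6,                     # spark.memory.fraction
            "offHeap": {
                "enabled": True,                 # spark.memory.offHeap.enabled
                "size": 2 * 1024 * 1024 * 1024   # spark.memory.offHeap.size, in bytes
            }
        }
    }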
storage (permissive dict, optional)
\n
\nDefault Value:
{\n    "replication": {}\n}\n
\n
\n
\nConfig Schema:
\n
memoryFraction (Float, optional)
\n

Memory Management: (deprecated) This is read only if spark.memory.useLegacyMode is enabled. Fraction of Java heap to use for Spark\u2019s memory cache. This should not be larger than the \u201cold\u201d generation of objects in the JVM, which by default is given 0.6 of the heap, but you can increase it if you configure your own old generation size.

\n
\n
unrollFraction (Float, optional)
\n

Memory Management: (deprecated) This is read only if spark.memory.useLegacyMode is enabled. Fraction of spark.storage.memoryFraction to use for unrolling blocks in memory. This is dynamically allocated by dropping existing blocks when there is not enough free storage space to unroll the new block in its entirety.

\n
\n
replication (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
proactive (Bool, optional)
\n

Memory Management: Enables proactive block replication for RDD blocks. Cached RDD block replicas lost due to executor failures are replenished if there are any existing available replicas. This tries to get the replication level of the block to the initial number.

\n
\n
\n
\n
memoryMapThreshold (dagster.StringSource, optional)
\n

Execution Behavior: Size in bytes of a block above which Spark memory maps when reading a block from disk. This prevents Spark from memory mapping very small blocks. In general, memory mapping has high overhead for blocks close to or below the page size of the operating system.

\n
\n
\n
\n
cleaner (permissive dict, optional)
\n
\nDefault Value:
{\n    "periodicGC": {},\n    "referenceTracking": {\n        "blocking": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
periodicGC (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
interval (dagster.StringSource, optional)
\n

Memory Management: Controls how often to trigger a garbage collection. This context cleaner triggers cleanups only when weak references are garbage collected. In long-running applications with large driver JVMs, where there is little memory pressure on the driver, this may happen very occasionally or not at all. Not cleaning at all may lead to executors running out of disk space after a while.

\n
\n
\n
\n
referenceTracking (permissive dict, optional)
\n
\nDefault Value:
{\n    "blocking": {}\n}\n
\n
\n
\nConfig Schema:
\n
root (Bool, optional)
\n

Memory Management: Enables or disables context cleaning.

\n
\n
blocking (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (Bool, optional)
\n

Memory Management: Controls whether the cleaning thread should block on cleanup tasks (other than shuffle, which is controlled by spark.cleaner.referenceTracking.blocking.shuffle Spark property).

\n
\n
shuffle (Bool, optional)
\n

Memory Management: Controls whether the cleaning thread should block on shuffle cleanup tasks.

\n
\n
\n
\n
cleanCheckpoints (Bool, optional)
\n

Memory Management: Controls whether to clean checkpoint files if the reference is out of scope.

\n
\n
\n
\n
\n
\n
default (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
parallelism (dagster.IntSource, optional)
\n

Execution Behavior: Default number of partitions in RDDs returned by transformations like join, reduceByKey, and parallelize when not set by user.

\n
\n
\n
\n
hadoop (permissive dict, optional)
\n
\nDefault Value:
{\n    "mapreduce": {\n        "fileoutputcommitter": {\n            "algorithm": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
cloneConf (Bool, optional)
\n

Execution Behavior: If set to true, clones a new Hadoop Configuration object for each task. This option should be enabled to work around Configuration thread-safety issues (see SPARK-2546 for more details). This is disabled by default in order to avoid unexpected performance regressions for jobs that are not affected by these issues.

\n
\n
validateOutputSpecs (Bool, optional)
\n

Execution Behavior: If set to true, validates the output specification (e.g. checking if the output directory already exists) used in saveAsHadoopFile and other variants. This can be disabled to silence exceptions due to pre-existing output directories. We recommend that users do not disable this except if trying to achieve compatibility with previous versions of Spark. Simply use Hadoop\u2019s FileSystem API to delete output directories by hand. This setting is ignored for jobs generated through Spark Streaming\u2019s StreamingContext, since data may need to be rewritten to pre-existing output directories during checkpoint recovery.

\n
\n
mapreduce (permissive dict, optional)
\n
\nDefault Value:
{\n    "fileoutputcommitter": {\n        "algorithm": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
fileoutputcommitter (permissive dict, optional)
\n
\nDefault Value:
{\n    "algorithm": {}\n}\n
\n
\n
\nConfig Schema:
\n
algorithm (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
version (dagster.IntSource, optional)
\n

Execution Behavior: The file output committer algorithm version, valid algorithm version number: 1 or 2. Version 2 may have better performance, but version 1 may handle failures better in certain situations, as per MAPREDUCE-4815.

\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
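The deeply nested hadoop block above flattens back to a single dotted property. A minimal, hedged sketch of setting it directly on a pyspark SparkConf (assumes pyspark is installed; version 2 trades failure handling for performance, as noted above):

    from pyspark import SparkConf

    # spark.hadoop.* keys are forwarded to the underlying Hadoop configuration.
    conf = SparkConf().set(
        "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version", "2"
    )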
rpc (permissive dict, optional)
\n
\nDefault Value:
{\n    "message": {},\n    "retry": {}\n}\n
\n
\n
\nConfig Schema:
\n
message (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxSize (dagster.StringSource, optional)
\n

Networking: Maximum message size (in MB) to allow in \u201ccontrol plane\u201d communication; generally only applies to map output size information sent between executors and the driver. Increase this if you are running jobs with many thousands of map and reduce tasks and see messages about the RPC message size.

\n
\n
\n
\n
numRetries (dagster.StringSource, optional)
\n

Networking: Number of times to retry before an RPC task gives up. An RPC task will run at most this number of times.

\n
\n
retry (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
wait (dagster.StringSource, optional)
\n

Networking: Duration for an RPC ask operation to wait before retrying.

\n
\n
\n
\n
askTimeout (dagster.StringSource, optional)
\n

Networking: Duration for an RPC ask operation to wait before timing out.

\n
\n
lookupTimeout (dagster.StringSource, optional)
\n

Networking: Duration for an RPC remote endpoint lookup operation to wait before timing out.

\n
\n
\n
\n
blockManager (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
port (dagster.StringSource, optional)
\n

Networking: Port for all block managers to listen on. These exist on both the driver and the executors.

\n
\n
\n
\n
network (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
timeout (dagster.StringSource, optional)
\n

Networking: Default timeout for all network interactions. This config will be used in place of spark.core.connection.ack.wait.timeout, spark.storage.blockManagerSlaveTimeoutMs, spark.shuffle.io.connectionTimeout, spark.rpc.askTimeout or spark.rpc.lookupTimeout if they are not configured.

\n
\n
\n
\n
port (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxRetries (dagster.StringSource, optional)
\n

Networking: Maximum number of retries when binding to a port before giving up. When a port is given a specific value (non 0), each subsequent retry will increment the port used in the previous attempt by 1 before retrying. This essentially allows it to try a range of ports from the start port specified to port + maxRetries.

\n
\n
\n
\n
core (permissive dict, optional)
\n
\nDefault Value:
{\n    "connection": {\n        "ack": {\n            "wait": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
connection (permissive dict, optional)
\n
\nDefault Value:
{\n    "ack": {\n        "wait": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
ack (permissive dict, optional)
\n
\nDefault Value:
{\n    "wait": {}\n}\n
\n
\n
\nConfig Schema:
\n
wait (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
timeout (dagster.StringSource, optional)
\n

Networking: How long the connection should wait for an ack to occur before timing out and giving up. To avoid unwanted timeouts caused by long pauses like GC, you can set a larger value.

\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
cores (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
max (dagster.StringSource, optional)
\n

Scheduling: When running on a standalone deploy cluster or a Mesos cluster in \u201ccoarse-grained\u201d sharing mode, the maximum number of CPU cores to request for the application from across the cluster (not from each machine). If not set, the default will be spark.deploy.defaultCores on Spark\u2019s standalone cluster manager, or infinite (all available cores) on Mesos.

\n
\n
\n
\n
locality (permissive dict, optional)
\n
\nDefault Value:
{\n    "wait": {}\n}\n
\n
\n
\nConfig Schema:
\n
wait (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional)
\n

Scheduling: How long to wait to launch a data-local task before giving up and launching it on a less-local node. The same wait will be used to step through multiple locality levels (process-local, node-local, rack-local and then any). It is also possible to customize the waiting time for each level by setting spark.locality.wait.node, etc. You should increase this setting if your tasks are long and see poor locality, but the default usually works well.

\n
\n
node (dagster.StringSource, optional)
\n

Scheduling: Customize the locality wait for node locality. For example, you can set this to 0 to skip node locality and search immediately for rack locality (if your cluster has rack information).

\n
\n
process (dagster.StringSource, optional)
\n

Scheduling: Customize the locality wait for process locality. This affects tasks that attempt to access cached data in a particular executor process.

\n
\n
rack (dagster.StringSource, optional)
\n

Scheduling: Customize the locality wait for rack locality.

\n
\n
\n
\n
\n
\n
scheduler (permissive dict, optional)
\n
\nDefault Value:
{\n    "revive": {},\n    "listenerbus": {\n        "eventqueue": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
maxRegisteredResourcesWaitingTime (dagster.StringSource, optional)
\n

Scheduling: Maximum amount of time to wait for resources to register before scheduling begins.

\n
\n
minRegisteredResourcesRatio (dagster.StringSource, optional)
\n

Scheduling: The minimum ratio of registered resources (registered resources / total expected resources) (resources are executors in yarn mode and Kubernetes mode, CPU cores in standalone mode and Mesos coarse-grained mode [\u2018spark.cores.max\u2019 value is total expected resources for Mesos coarse-grained mode] ) to wait for before scheduling begins. Specified as a double between 0.0 and 1.0. Regardless of whether the minimum ratio of resources has been reached, the maximum amount of time it will wait before scheduling begins is controlled by config spark.scheduler.maxRegisteredResourcesWaitingTime.

\n
\n
mode (dagster.StringSource, optional)
\n

Scheduling: The scheduling mode between jobs submitted to the same SparkContext. Can be set to FAIR to use fair sharing instead of queueing jobs one after another. Useful for multi-user services.

\n
\n
revive (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
interval (dagster.StringSource, optional)
\n

Scheduling: The interval length for the scheduler to revive the worker resource offers to run tasks.

\n
\n
\n
\n
listenerbus (permissive dict, optional)
\n
\nDefault Value:
{\n    "eventqueue": {}\n}\n
\n
\n
\nConfig Schema:
\n
eventqueue (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
capacity (dagster.StringSource, optional)
\n

Scheduling: Capacity for event queue in Spark listener bus, must be greater than 0. Consider increasing value (e.g. 20000) if listener events are dropped. Increasing this value may result in the driver using more memory.

\n
\n
\n
\n
\n
\n
\n
\n
blacklist (permissive dict, optional)
\n
\nDefault Value:
{\n    "task": {},\n    "stage": {},\n    "application": {\n        "fetchFailure": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional)
\n

Scheduling: If set to \u201ctrue\u201d, prevent Spark from scheduling tasks on executors that have been blacklisted due to too many task failures. The blacklisting algorithm can be further controlled by the other \u201cspark.blacklist\u201d configuration options.

\n
\n
timeout (dagster.StringSource, optional)
\n

Scheduling: (Experimental) How long a node or executor is blacklisted for the entire application, before it is unconditionally removed from the blacklist to attempt running new tasks.

\n
\n
task (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxTaskAttemptsPerExecutor (dagster.StringSource, optional)
\n

Scheduling: (Experimental) For a given task, how many times it can be retried on one executor before the executor is blacklisted for that task.

\n
\n
maxTaskAttemptsPerNode (dagster.StringSource, optional)
\n

Scheduling: (Experimental) For a given task, how many times it can be retried on one node, before the entire node is blacklisted for that task.

\n
\n
\n
\n
stage (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxFailedTasksPerExecutor (dagster.StringSource, optional)
\n

Scheduling: (Experimental) How many different tasks must fail on one executor, within one stage, before the executor is blacklisted for that stage.

\n
\n
maxFailedExecutorsPerNode (dagster.StringSource, optional)
\n

Scheduling: (Experimental) How many different executors are marked as blacklisted for a given stage, before the entire node is marked as failed for the stage.

\n
\n
\n
\n
application (permissive dict, optional)
\n
\nDefault Value:
{\n    "fetchFailure": {}\n}\n
\n
\n
\nConfig Schema:
\n
maxFailedTasksPerExecutor (dagster.StringSource, optional)
\n

Scheduling: (Experimental) How many different tasks must fail on one executor, in successful task sets, before the executor is blacklisted for the entire application. Blacklisted executors will be automatically added back to the pool of available resources after the timeout specified by spark.blacklist.timeout. Note that with dynamic allocation, though, the executors may get marked as idle and be reclaimed by the cluster manager.

\n
\n
maxFailedExecutorsPerNode (dagster.StringSource, optional)
\n

Scheduling: (Experimental) How many different executors must be blacklisted for the entire application, before the node is blacklisted for the entire application. Blacklisted nodes will be automatically added back to the pool of available resources after the timeout specified by spark.blacklist.timeout. Note that with dynamic allocation, though, the executors on the node may get marked as idle and be reclaimed by the cluster manager.

\n
\n
fetchFailure (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional)
\n

Scheduling: (Experimental) If set to \u201ctrue\u201d, Spark will blacklist the executor immediately when a fetch failure happens. If external shuffle service is enabled, then the whole node will be blacklisted.

\n
\n
\n
\n
\n
\n
killBlacklistedExecutors (dagster.StringSource, optional)
\n

Scheduling: (Experimental) If set to \u201ctrue\u201d, allow Spark to automatically kill the executors when they are blacklisted on fetch failure or blacklisted for the entire application, as controlled by spark.blacklist.application.*. Note that, when an entire node is added to the blacklist, all of the executors on that node will be killed.

\n
\n
\n
\n
speculation (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional)
\n

Scheduling: If set to \u201ctrue\u201d, performs speculative execution of tasks. This means if one or more tasks are running slowly in a stage, they will be re-launched.

\n
\n
interval (dagster.StringSource, optional)
\n

Scheduling: How often Spark will check for tasks to speculate.

\n
\n
multiplier (dagster.StringSource, optional)
\n

Scheduling: How many times slower a task is than the median to be considered for speculation.

\n
\n
quantile (dagster.StringSource, optional)
\n

Scheduling: Fraction of tasks which must be complete before speculation is enabled for a particular stage.

\n
\n
\n
\n
task (permissive dict, optional)
\n
\nDefault Value:
{\n    "reaper": {}\n}\n
\n
\n
\nConfig Schema:
\n
cpus (dagster.StringSource, optional)
\n

Scheduling: Number of cores to allocate for each task.

\n
\n
maxFailures (dagster.StringSource, optional)
\n

Scheduling: Number of failures of any particular task before giving up on the job. The total number of failures spread across different tasks will not cause the job to fail; a particular task has to fail this number of attempts. Should be greater than or equal to 1. Number of allowed retries = this value - 1.

\n
\n
reaper (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional)
\n

Scheduling: Enables monitoring of killed / interrupted tasks. When set to true, any task which is killed will be monitored by the executor until that task actually finishes executing. See the other spark.task.reaper.* configurations for details on how to control the exact behavior of this monitoring. When set to false (the default), task killing will use an older code path which lacks such monitoring.

\n
\n
pollingInterval (dagster.StringSource, optional)
\n

Scheduling: When spark.task.reaper.enabled = true, this setting controls the frequency at which executors will poll the status of killed tasks. If a killed task is still running when polled then a warning will be logged and, by default, a thread-dump of the task will be logged (this thread dump can be disabled via the spark.task.reaper.threadDump setting, which is documented below).

\n
\n
threadDump (dagster.StringSource, optional)
\n

Scheduling: When spark.task.reaper.enabled = true, this setting controls whether task thread dumps are logged during periodic polling of killed tasks. Set this to false to disable collection of thread dumps.

\n
\n
killTimeout (dagster.StringSource, optional)
\n

Scheduling: When spark.task.reaper.enabled = true, this setting specifies a timeout after which the executor JVM will kill itself if a killed task has not stopped running. The default value, -1, disables this mechanism and prevents the executor from self-destructing. The purpose of this setting is to act as a safety-net to prevent runaway noncancellable tasks from rendering an executor unusable.

\n
\n
\n
\n
\n
\n
stage (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxConsecutiveAttempts (dagster.StringSource, optional)
\n

Scheduling: Number of consecutive stage attempts allowed before a stage is aborted.

\n
\n
\n
\n
dynamicAllocation (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional)
\n

Dynamic Allocation: Whether to use dynamic resource allocation, which scales the number of executors registered with this application up and down based on the workload. For more detail, see the description here. This requires spark.shuffle.service.enabled to be set. The following configurations are also relevant: spark.dynamicAllocation.minExecutors, spark.dynamicAllocation.maxExecutors, spark.dynamicAllocation.initialExecutors, and spark.dynamicAllocation.executorAllocationRatio.

\n
\n
executorIdleTimeout (dagster.StringSource, optional)
\n

Dynamic Allocation: If dynamic allocation is enabled and an executor has been idle for more than this duration, the executor will be removed. For more detail, see this description.

\n
\n
cachedExecutorIdleTimeout (dagster.StringSource, optional)
\n

Dynamic Allocation: If dynamic allocation is enabled and an executor which has cached data blocks has been idle for more than this duration, the executor will be removed. For more details, see this description.

\n
\n
initialExecutors (dagster.StringSource, optional)
\n

Dynamic Allocation: Initial number of executors to run if dynamic allocation is enabled. If \u2013num-executors (or spark.executor.instances) is set and larger than this value, it will be used as the initial number of executors.

\n
\n
maxExecutors (dagster.StringSource, optional)
\n

Dynamic Allocation: Upper bound for the number of executors if dynamic allocation is enabled.

\n
\n
minExecutors (dagster.StringSource, optional)
\n

Dynamic Allocation: Lower bound for the number of executors if dynamic allocation is enabled.

\n
\n
executorAllocationRatio (dagster.StringSource, optional)
\n

Dynamic Allocation: By default, the dynamic allocation will request enough executors to maximize the parallelism according to the number of tasks to process. While this minimizes the latency of the job, with small tasks this setting can waste a lot of resources due to executor allocation overhead, as some executors might not even do any work. This setting allows you to set a ratio that will be used to reduce the number of executors w.r.t. full parallelism. It defaults to 1.0 to give maximum parallelism; 0.5 will divide the target number of executors by 2. The target number of executors computed by the dynamicAllocation can still be overridden by the spark.dynamicAllocation.minExecutors and spark.dynamicAllocation.maxExecutors settings.

\n
\n
schedulerBacklogTimeout (dagster.StringSource, optional)
\n

Dynamic Allocation: If dynamic allocation is enabled and there have been pending tasks backlogged for more than this duration, new executors will be requested. For more detail, see this description.

\n
\n
sustainedSchedulerBacklogTimeout (dagster.StringSource, optional)
\n

Dynamic Allocation: Same as spark.dynamicAllocation.schedulerBacklogTimeout, but used only for subsequent executor requests. For more detail, see this description.

\n
\n
\n
\n
r (permissive dict, optional)
\n
\nDefault Value:
{\n    "driver": {},\n    "shell": {}\n}\n
\n
\n
\nConfig Schema:
\n
numRBackendThreads (dagster.StringSource, optional)
\n

SparkR: Number of threads used by RBackend to handle RPC calls from SparkR package.

\n
\n
command (dagster.StringSource, optional)
\n

SparkR: Executable for executing R scripts in cluster modes for both driver and workers.

\n
\n
driver (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
command (dagster.StringSource, optional)
\n

SparkR: Executable for executing R scripts in client modes for driver. Ignored in cluster modes.

\n
\n
\n
\n
shell (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
command (dagster.StringSource, optional)
\n

SparkR: Executable for executing sparkR shell in client modes for driver. Ignored in cluster modes. It is the same as environment variable SPARKR_DRIVER_R, but takes precedence over it. spark.r.shell.command is used for sparkR shell while spark.r.driver.command is used for running R script.

\n
\n
\n
\n
backendConnectionTimeout (dagster.StringSource, optional)
\n

SparkR: Connection timeout set by R process on its connection to RBackend in seconds.

\n
\n
heartBeatInterval (dagster.StringSource, optional)
\n

SparkR: Interval for heartbeats sent from SparkR backend to R process to prevent connection timeout.

\n
\n
\n
\n
graphx (permissive dict, optional)
\n
\nDefault Value:
{\n    "pregel": {}\n}\n
\n
\n
\nConfig Schema:
\n
pregel (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
checkpointInterval (dagster.StringSource, optional)
\n

GraphX: Checkpoint interval for graph and message in Pregel. It is used to avoid StackOverflowError due to long lineage chains after many iterations. The checkpoint is disabled by default.

\n
\n
\n
\n
\n
\n
deploy (permissive dict, optional)
\n
\nDefault Value:
{\n    "zookeeper": {}\n}\n
\n
\n
\nConfig Schema:
\n
recoveryMode (dagster.StringSource, optional)
\n

Deploy: The recovery mode setting used to recover submitted Spark jobs in cluster mode when they fail and are relaunched. This is only applicable for cluster mode when running with Standalone or Mesos.

\n
\n
zookeeper (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
url (dagster.StringSource, optional)
\n

Deploy: When spark.deploy.recoveryMode is set to ZOOKEEPER, this configuration is used to set the zookeeper URL to connect to.

\n
\n
dir (dagster.StringSource, optional)
\n

Deploy: When spark.deploy.recoveryMode is set to ZOOKEEPER, this configuration is used to set the zookeeper directory to store recovery state.

\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
cluster_id (dagster.StringSource)
\n

Name of the job flow (cluster) on which to execute.

\n
\n
region_name (dagster.StringSource)
\n

The AWS region that the cluster is in.

\n
\n
action_on_failure (String, optional)
\n

The EMR action to take when the cluster step fails: https://docs.aws.amazon.com/emr/latest/APIReference/API_StepConfig.html

\n

Default Value: \u2018CANCEL_AND_WAIT\u2019

\n
\n
staging_bucket (dagster.StringSource)
\n

S3 bucket to use for passing files between the plan process and EMR process.

\n
\n
staging_prefix (dagster.StringSource, optional)
\n

S3 key prefix inside the staging_bucket to use for files passed between the plan process and the EMR process.

\n

Default Value: \u2018emr_staging\u2019

\n
\n
wait_for_logs (Bool, optional)
\n

If set, the system will wait for EMR logs to appear on S3. Note that logs are copied every 5 minutes, so enabling this will add several minutes to the job runtime.

\n

Default Value: False

\n
\n
local_job_package_path (dagster.StringSource, optional)
\n

Absolute path to the package that contains the job definition(s) whose steps will execute remotely on EMR. This is a path on the local filesystem of the process executing the job. The expectation is that this package will also be available on the Python path of the launched process running the Spark step on EMR, either deployed on step launch via the deploy_local_job_package option, referenced on S3 via the s3_job_package_path option, or installed on the cluster via bootstrap actions.

\n
\n
local_pipeline_package_path (dagster.StringSource, optional)
\n

(legacy) Absolute path to the package that contains the pipeline definition(s) whose steps will execute remotely on EMR. This is a path on the local filesystem of the process executing the pipeline. The expectation is that this package will also be available on the Python path of the launched process running the Spark step on EMR, either deployed on step launch via the deploy_local_pipeline_package option, referenced on S3 via the s3_pipeline_package_path option, or installed on the cluster via bootstrap actions.

\n
\n
deploy_local_job_package (Bool, optional)
\n

If set, before every step run, the launcher will zip up all the code in local_job_package_path, upload it to s3, and pass it to spark-submit\u2019s --py-files option. This gives the remote process access to up-to-date user code. If not set, the assumption is that some other mechanism is used for distributing code to the EMR cluster. If this option is set to True, s3_job_package_path should not also be set.

\n

Default Value: False

\n
\n
deploy_local_pipeline_package (Bool, optional)
\n

(legacy) If set, before every step run, the launcher will zip up all the code in local_pipeline_package_path, upload it to s3, and pass it to spark-submit\u2019s --py-files option. This gives the remote process access to up-to-date user code. If not set, the assumption is that some other mechanism is used for distributing code to the EMR cluster. If this option is set to True, s3_pipeline_package_path should not also be set.

\n

Default Value: False

\n
\n
s3_job_package_path (dagster.StringSource, optional)
\n

If set, this path will be passed to the --py-files option of spark-submit. This should usually be a path to a zip file. If this option is set, deploy_local_job_package should not be set to True.

\n
\n
s3_pipeline_package_path (dagster.StringSource, optional)
\n

If set, this path will be passed to the --py-files option of spark-submit. This should usually be a path to a zip file. If this option is set, deploy_local_pipeline_package should not be set to True.

\n
\n
\n
    \n
\n
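For concreteness, the options above can be supplied as ordinary Dagster resource config. The following is a minimal sketch only: the resource key pyspark_step_launcher and every concrete value (cluster id, region, bucket, paths) are illustrative assumptions rather than values taken from this page, and the nesting of spark_config under a top-level spark key is assumed to mirror the grouped fields shown above.

# Hedged sketch of run config for the EMR step launcher documented above.
# The resource key name and all concrete values are illustrative assumptions.
EMR_RUN_CONFIG = {
    "resources": {
        "pyspark_step_launcher": {
            "config": {
                "cluster_id": "j-EXAMPLECLUSTERID",      # EMR job flow to run on
                "region_name": "us-west-2",              # AWS region of the cluster
                "staging_bucket": "my-staging-bucket",   # S3 bucket for passing files
                "staging_prefix": "emr_staging",         # optional; default shown above
                "wait_for_logs": False,                  # optional; default shown above
                "deploy_local_job_package": True,        # zip local code, pass via --py-files
                "local_job_package_path": "/path/to/my_job_package",
                # Nested keys map onto dotted Spark properties, e.g.
                # spark.dynamicAllocation.enabled -> spark: dynamicAllocation: enabled
                # (the top-level "spark" key is an assumption here).
                "spark_config": {
                    "spark": {
                        "dynamicAllocation": {"enabled": "true"},
                        "task": {"maxFailures": "4"},
                    }
                },
            }
        }
    }
}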
\n\n
\n
\nclass dagster_aws.emr.EmrJobRunner(region, check_cluster_every=30, aws_access_key_id=None, aws_secret_access_key=None)[source]\u00b6
\n
\n\n
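Going only by the constructor signature shown above, a minimal instantiation might look like the sketch below; the region value is an illustrative assumption and credentials are deferred to the ambient AWS credential chain.

from dagster_aws.emr import EmrJobRunner

# Sketch based solely on the signature above; "us-west-2" is an assumed value.
# Omitting aws_access_key_id / aws_secret_access_key defers to the ordinary
# boto credential chain; check_cluster_every keeps its documented default (30).
runner = EmrJobRunner(region="us-west-2")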
\n
\nclass dagster_aws.emr.EmrError[source]\u00b6
\n
\n\n
\n
\ndagster_aws.emr.EmrClusterState = <enum 'EmrClusterState'>[source]\u00b6
\n

An enumeration.

\n
\n\n
\n
\ndagster_aws.emr.EmrStepState = <enum 'EmrStepState'>[source]\u00b6
\n

An enumeration.

\n
\n\n
\n
\n

CloudWatch\u00b6

\n
\n
\ndagster_aws.cloudwatch.cloudwatch_logger LoggerDefinition\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
log_level (String, optional)
\n

Default Value: \u2018INFO\u2019

\n
\n
name (String, optional)
\n

Default Value: \u2018dagster\u2019

\n
\n
log_group_name (String)
\n

The name of the log group

\n
\n
log_stream_name (String)
\n

The name of the log stream

\n
\n
aws_region (dagster.StringSource, optional)
\n

Specifies a custom region for the CloudWatch session. Default is chosen through the ordinary boto3 credential chain.

\n
\n
aws_secret_access_key (dagster.StringSource, optional)
\n

\n
aws_access_key_id (dagster.StringSource, optional)
\n

\n
\n

Core class for defining loggers.

\n

Loggers are job-scoped logging handlers, which will be automatically invoked whenever\ndagster messages are logged from within a job.

\n
\n
Parameters
\n
    \n
  • logger_fn (Callable[[InitLoggerContext], logging.Logger]) \u2013 User-provided function to\ninstantiate the logger. This logger will be automatically invoked whenever the methods\non context.log are called from within job/pipeline compute logic.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.logger_config. If not set, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this logger.

  • \n
\n
\n
\n
\n\n
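The logger above is attached like any other Dagster logger and configured under the loggers section of run config. A minimal sketch, assuming an illustrative logger key (cloudwatch), log group, and log stream; only log_group_name and log_stream_name are required by the schema above.

from dagster import job, op
from dagster_aws.cloudwatch import cloudwatch_logger

@op
def emit_message(context):
    # Messages logged via context.log are forwarded to the configured loggers.
    context.log.info("hello from a CloudWatch-logged op")

# The logger key "cloudwatch" and the group/stream names below are assumptions.
@job(logger_defs={"cloudwatch": cloudwatch_logger})
def cloudwatch_logged_job():
    emit_message()

if __name__ == "__main__":
    cloudwatch_logged_job.execute_in_process(
        run_config={
            "loggers": {
                "cloudwatch": {
                    "config": {
                        "log_group_name": "my-log-group",
                        "log_stream_name": "my-log-stream",
                        # log_level and name fall back to the defaults above.
                    }
                }
            }
        }
    )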
\n
\n

SecretsManager\u00b6

\n

Resources which surface SecretsManager secrets for use in Dagster resources and jobs.

\n
\n
\ndagster_aws.secretsmanager.secretsmanager_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
region_name (String, optional)
\n

Specifies a custom region for the SecretsManager session

\n
\n
max_attempts (Int, optional)
\n

Sets the maximum number of retry attempts for Boto3\u2019s retry handler; note that the initial call counts toward the max_attempts value that you provide.

\n

Default Value: 5

\n
\n
profile_name (String, optional)
\n

Specifies a profile to use for this session.

\n
\n
\n

Resource that gives access to AWS SecretsManager.

\n

The underlying SecretsManager session is created by calling\nboto3.session.Session(profile_name).\nThe returned resource object is a SecretsManager client, an instance of botocore.client.SecretsManager.

\n

Example

\n
from dagster import build_op_context, job, op\nfrom dagster_aws.secretsmanager import secretsmanager_resource\n\n@op(required_resource_keys={'secretsmanager'})\ndef example_secretsmanager_op(context):\n    return context.resources.secretsmanager.get_secret_value(\n        SecretId='arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf'\n    )\n\n@job(resource_defs={'secretsmanager': secretsmanager_resource})\ndef example_job(context):\n    example_secretsmanager_op()\n\nexample_job.execute_in_process(\n    run_config={\n        'resources': {\n            'secretsmanager': {\n                'config': {\n                    'region_name': 'us-west-1',\n                }\n            }\n        }\n    }\n)\n
\n
\n

Note that your ops must also declare that they require this resource with\nrequired_resource_keys, or it will not be initialized for the execution of their compute\nfunctions.

\n

You may configure this resource as follows:

\n
resources:\n  secretsmanager:\n    config:\n      region_name: "us-west-1"\n      # Optional[str]: Specifies a custom region for the SecretsManager session. Default is chosen\n      # through the ordinary boto credential chain.\n      profile_name: "dev"\n      # Optional[str]: Specifies a custom profile for SecretsManager session. Default is default\n      # profile as specified in ~/.aws/credentials file\n
\n
\n
\n\n
\n
\ndagster_aws.secretsmanager.secretsmanager_secrets_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
region_name (String, optional)
\n

Specifies a custom region for the SecretsManager session

\n
\n
max_attempts (Int, optional)
\n

Sets the maximum number of retry attempts for Boto3\u2019s retry handler; note that the initial call counts toward the max_attempts value that you provide.

\n

Default Value: 5

\n
\n
profile_name (String, optional)
\n

Specifies a profile to use for this session.

\n
\n
secrets (List[String], optional)
\n

An array of AWS Secrets Manager secret ARNs to fetch.

\n

Default Value: []

\n
\n
secrets_tag (Union[String, None], optional)
\n

AWS Secrets Manager secrets with this tag will be fetched and made available.

\n

Default Value: None

\n
\n
add_to_environment (Bool, optional)
\n

Whether to mount the secrets as environment variables.

\n

Default Value: False

\n
\n
\n

Resource that provides a dict which maps selected SecretsManager secrets to\ntheir string values. Also optionally sets chosen secrets as environment variables.

\n

Example

\n
import os\nfrom dagster import build_op_context, job, op\nfrom dagster_aws.secretsmanager import secretsmanager_secrets_resource\n\n@op(required_resource_keys={'secrets'})\ndef example_secretsmanager_secrets_op(context):\n    return context.resources.secrets.get("my-secret-name")\n\n@op(required_resource_keys={'secrets'})\ndef example_secretsmanager_secrets_op_2(context):\n    return os.getenv("my-other-secret-name")\n\n@job(resource_defs={'secrets': secretsmanager_secrets_resource})\ndef example_job(context):\n    example_secretsmanager_secrets_op()\n    example_secretsmanager_secrets_op_2()\n\nexample_job.execute_in_process(\n    run_config={\n        'resources': {\n            'secrets': {\n                'config': {\n                    'region_name': 'us-west-1',\n                    'secrets_tag': 'dagster',\n                    'add_to_environment': True,\n                }\n            }\n        }\n    }\n)\n
\n
\n

Note that your ops must also declare that they require this resource with\nrequired_resource_keys, or it will not be initialized for the execution of their compute\nfunctions.

\n

You may configure this resource as follows:

\n
resources:\n  secretsmanager:\n    config:\n      region_name: "us-west-1"\n      # Optional[str]: Specifies a custom region for the SecretsManager session. Default is chosen\n      # through the ordinary boto credential chain.\n      profile_name: "dev"\n      # Optional[str]: Specifies a custom profile for SecretsManager session. Default is default\n      # profile as specified in ~/.aws/credentials file\n      secrets: ["arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf"]\n      # Optional[List[str]]: Specifies a list of secret ARNs to pull from SecretsManager.\n      secrets_tag: "dagster"\n      # Optional[str]: Specifies a tag, all secrets which have the tag set will be pulled\n      # from SecretsManager.\n      add_to_environment: true\n      # Optional[bool]: Whether to set the selected secrets as environment variables. Defaults\n      # to false.\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-aws", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../dagster-azure/", "title": "Azure (dagster-azure)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-airflow/", "title": "Airflow (dagster-airflow)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-azure", "Azure (dagster-azure)", "N", "next"], ["sections/api/apidocs/libraries/dagster-airflow", "Airflow (dagster-airflow)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-aws.rst.txt", "title": "AWS (dagster-aws)", "toc": "\n"}, "dagster-azure": {"alabaster_version": "0.7.12", "body": "
\n

Azure (dagster-azure)\u00b6

\n

Utilities for using Azure Storage Accounts with Dagster. This is mostly aimed at Azure Data Lake\nStorage Gen 2 (ADLS2) but also contains some utilities for Azure Blob Storage.

\n
\n

\n
\n

NOTE: This package is incompatible with dagster-snowflake! This is due to a version conflict\non the underlying azure-storage-blob package: dagster-snowflake has a transitive\ndependency on an old version, via snowflake-connector-python.

\n
\n
\ndagster_azure.adls2.adls2_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
storage_account (dagster.StringSource)
\n

The storage account name.

\n
\n
credential (selector)
\n

The credentials with which to authenticate.

\n
\nConfig Schema:
\n
sas (dagster.StringSource)
\n

SAS token for the account.

\n
\n
key (dagster.StringSource)
\n

Shared Access Key for the account

\n
\n
\n
\n
\n

Resource that gives ops access to Azure Data Lake Storage Gen2.

\n

The underlying client is a DataLakeServiceClient.

\n

Attach this resource definition to a JobDefinition in order to make it\navailable to your ops.

\n

Example

\n
from dagster import job, op\nfrom dagster_azure.adls2 import adls2_resource\n\n@op(required_resource_keys={'adls2'})\ndef example_adls2_op(context):\n    return list(context.resources.adls2.adls2_client.list_file_systems())\n\n@job(resource_defs={"adls2": adls2_resource})\ndef my_job():\n    example_adls2_op()\n
\n
\n

Note that your ops must also declare that they require this resource with\nrequired_resource_keys, or it will not be initialized for the execution of their compute\nfunctions.

\n

You may pass credentials to this resource using either a SAS token or a key, using\nenvironment variables if desired:

\n
resources:\n  adls2:\n    config:\n      storage_account: my_storage_account\n      # str: The storage account name.\n      credential:\n        sas: my_sas_token\n        # str: the SAS token for the account.\n        key:\n          env: AZURE_DATA_LAKE_STORAGE_KEY\n        # str: The shared access key for the account.\n
\n
\n
\n\n
\n
\nclass dagster_azure.adls2.FakeADLS2Resource(account_name, credential='fake-creds')[source]\u00b6
\n

Stateful mock of an ADLS2Resource for testing.

\n

Wraps a mock.MagicMock. Containers are implemented using an in-memory dict.

\n
\n\n
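Since FakeADLS2Resource is intended for testing, a natural place to use it is with build_op_context. The sketch below mirrors the example_adls2_op pattern shown earlier; the account name is an illustrative assumption, and the assertion only checks that the op runs without a real Azure call.

from dagster import build_op_context, op
from dagster_azure.adls2 import FakeADLS2Resource

@op(required_resource_keys={"adls2"})
def example_adls2_op(context):
    return list(context.resources.adls2.adls2_client.list_file_systems())

def test_example_adls2_op():
    # "fake_account" is an assumed name; the fake resource keeps state in memory,
    # so this exercises the op without touching Azure.
    context = build_op_context(resources={"adls2": FakeADLS2Resource("fake_account")})
    result = example_adls2_op(context)
    assert isinstance(result, list)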
\n
\ndagster_azure.adls2.adls2_file_cache ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
storage_account (dagster.StringSource)
\n

The storage account name.

\n
\n
credential (selector)
\n

The credentials with which to authenticate.

\n
\nConfig Schema:
\n
sas (dagster.StringSource)
\n

SAS token for the account.

\n
\n
key (dagster.StringSource)
\n

Shared Access Key for the account

\n
\n
\n
\n
prefix (dagster.StringSource)
\n

The base path prefix to use in ADLS2

\n
\n
file_system (dagster.StringSource)
\n

The storage account filesystem (aka container)

\n
\n
overwrite (Bool, optional)
\n

Default Value: False

\n
\n
\n
\n\n
\n
\nclass dagster_azure.blob.AzureBlobComputeLogManager(storage_account, container, secret_key, local_dir=None, inst_data=None, prefix='dagster')[source]\u00b6
\n

Logs op compute function stdout and stderr to Azure Blob Storage.

\n

This is also compatible with Azure Data Lake Storage.

\n

Users should not instantiate this class directly. Instead, use a YAML block in dagster.yaml\nsuch as the following:

\n
compute_logs:\n  module: dagster_azure.blob.compute_log_manager\n  class: AzureBlobComputeLogManager\n  config:\n    storage_account: my-storage-account\n    container: my-container\n    credential: sas-token-or-secret-key\n    prefix: "dagster-test-"\n    local_dir: "/tmp/cool"\n
\n
\n
\n
Parameters
\n
    \n
  • storage_account (str) \u2013 The storage account name to which to log.

  • \n
  • container (str) \u2013 The container (or ADLS2 filesystem) to which to log.

  • \n
  • secret_key (str) \u2013 Secret key for the storage account. SAS tokens are not\nsupported because we need a secret key to generate a SAS token for a download URL.

  • \n
  • local_dir (Optional[str]) \u2013 Path to the local directory in which to stage logs. Default:\ndagster.seven.get_system_temp_directory().

  • \n
  • prefix (Optional[str]) \u2013 Prefix for the log file keys.

  • \n
  • inst_data (Optional[ConfigurableClassData]) \u2013 Serializable representation of the compute\nlog manager when newed up from config.

  • \n
\n
\n
\n
\n\n
\n
\ndagster_azure.adls2.adls2_file_manager ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
storage_account (dagster.StringSource)
\n

The storage account name.

\n
\n
credential (selector)
\n

The credentials with which to authenticate.

\n
\nConfig Schema:
\n
sas (dagster.StringSource)
\n

SAS token for the account.

\n
\n
key (dagster.StringSource)
\n

Shared Access Key for the account

\n
\n
\n
\n
adls2_file_system (dagster.StringSource)
\n

ADLS Gen2 file system name

\n
\n
adls2_prefix (dagster.StringSource, optional)
\n

Default Value: \u2018dagster\u2019

\n
\n
\n

FileManager that provides abstract access to ADLS2.

\n

Implements the FileManager API.

\n
\n\n
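The file manager above is used through the generic FileManager API (for example write_data, which returns a file handle). A minimal sketch, assuming an illustrative resource key (file_manager), storage account, SAS token, and file system name; none of these values come from this page.

from dagster import job, op
from dagster_azure.adls2 import adls2_file_manager

@op(required_resource_keys={"file_manager"})
def write_report(context):
    # write_data stores raw bytes and returns a handle (here an ADLS2FileHandle).
    handle = context.resources.file_manager.write_data(b"example bytes")
    context.log.info(f"wrote report to {handle.path_desc}")

@job(resource_defs={"file_manager": adls2_file_manager})
def report_job():
    write_report()

# All config values below are illustrative assumptions.
report_job.execute_in_process(
    run_config={
        "resources": {
            "file_manager": {
                "config": {
                    "storage_account": "my_storage_account",
                    "credential": {"sas": "my_sas_token"},
                    "adls2_file_system": "my-file-system",
                    "adls2_prefix": "dagster",
                }
            }
        }
    }
)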
\n
\nclass dagster_azure.adls2.ADLS2FileHandle(account, file_system, key)[source]\u00b6
\n

A reference to a file on ADLS2.

\n
\n
\nproperty account\u00b6
\n

The name of the ADLS2 account.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty adls2_path\u00b6
\n

The file\u2019s ADLS2 URL.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty file_system\u00b6
\n

The name of the ADLS2 file system.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty key\u00b6
\n

The ADLS2 key.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty path_desc\u00b6
\n

The file\u2019s ADLS2 URL.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n\n
\n
\ndagster_azure.adls2.adls2_pickle_io_manager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
adls2_file_system (dagster.StringSource)
\n

ADLS Gen2 file system name

\n
\n
adls2_prefix (dagster.StringSource, optional)
\n

Default Value: \u2018dagster\u2019

\n
\n
\n

Persistent IO manager using Azure Data Lake Storage Gen2 for storage.

\n

Serializes objects via pickling. Suitable for object storage with distributed executors, so long\nas each execution node has network connectivity and credentials for ADLS and the backing\ncontainer.

\n

Attach this resource definition to your job in order to make it available to all your ops:

\n
@job(resource_defs={\n    'io_manager': adls2_pickle_io_manager,\n    'adls2': adls2_resource,\n    ...,\n})\ndef my_job():\n    ...\n
\n
\n

You may configure this storage as follows:

\n
resources:\n    io_manager:\n        config:\n            adls2_file_system: my-cool-file-system\n            adls2_prefix: good/prefix-for-files-\n
\n
\n
\n\n
\n
\ndagster_azure.adls2.adls2_pickle_asset_io_manager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
adls2_file_system (dagster.StringSource)
\n

ADLS Gen2 file system name

\n
\n
adls2_prefix (dagster.StringSource, optional)
\n

Default Value: \u2018dagster\u2019

\n
\n
\n

Persistent IO manager using Azure Data Lake Storage Gen2 for storage, meant for use with\nsoftware-defined assets.

\n

Each asset is assigned to a single filesystem path, so subsequent materializations of an asset\nwill overwrite previous materializations of that asset.

\n

Serializes objects via pickling. Suitable for object storage with distributed executors, so long\nas each execution node has network connectivity and credentials for ADLS and the backing\ncontainer.

\n

Attach this resource definition to your job in order to make it available to all your ops:

\n
asset_group = AssetGroup(\n    assets...,\n    resource_defs={'io_manager': adls2_pickle_io_manager, "adls2": adls2_resource, ...}),\n)\n
\n
\n

You may configure this storage as follows:

\n
resources:\n    io_manager:\n        config:\n            adls2_file_system: my-cool-file-system\n            adls2_prefix: good/prefix-for-files\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-azure", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "", "next": {"link": "../dagster-celery/", "title": "Celery (dagster-celery)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-aws/", "title": "AWS (dagster-aws)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-celery", "Celery (dagster-celery)", "N", "next"], ["sections/api/apidocs/libraries/dagster-aws", "AWS (dagster-aws)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-azure.rst.txt", "title": "Azure (dagster-azure)", "toc": "\n"}, "dagster-celery": {"alabaster_version": "0.7.12", "body": "
\n

Celery (dagster-celery)\u00b6

\n
\n

Quickstart\u00b6

\n

To get a local rabbitmq broker started and available via the default\npyamqp://guest@localhost:5672, in the dagster/python_modules/libraries/dagster-celery/\ndirectory run:

\n
docker-compose up\n
\n
\n

To run a celery worker:

\n
celery -A dagster_celery.app worker -l info\n
\n
\n

To start multiple workers in the background, run:

\n
celery multi start w2 -A dagster_celery.app -l info\n
\n
\n

To execute a job using the celery-backed executor, you\u2019ll need to set the job\u2019s executor_def to\nthe celery_executor.

\n
from dagster import job\nfrom dagster_celery import celery_executor\n\n@job(executor_def=celery_executor)\ndef my_job():\n    pass\n
\n
\n
\n

Monitoring your Celery tasks\u00b6

\n

We advise using Flower (https://celery.readthedocs.io/en/latest/userguide/monitoring.html#flower-real-time-celery-web-monitor):

\n
celery -A dagster_celery.app flower\n
\n
\n
\n
\n

Customizing the Celery broker, backend, and other app configuration\u00b6

\n

By default this will use amqp://guest:**@localhost:5672// as the Celery broker URL and\nrpc:// as the results backend. In production, you will want to change these values. Pending the\nintroduction of a dagster_celery CLI, that would entail writing a Python module my_module as\nfollows:

\n
from celery import Celery\n\nfrom dagster_celery.tasks import create_task\n\napp = Celery('dagster', broker_url='some://custom@value', ...)\n\nexecute_plan = create_task(app)\n\nif __name__ == '__main__':\n    app.worker_main()\n
\n
\n

You can then run the celery worker using:

\n
celery -A my_module worker --loglevel=info\n
\n
\n

This customization mechanism is used to implement dagster_celery_k8s and dagster_celery_docker, which delegate the execution of steps to ephemeral Kubernetes pods and Docker containers, respectively.

\n
\n
\n

Celery best practices\u00b6

\n

Celery is a rich and full-featured system. We\u2019ve found the following resources helpful:

\n\n
\n
\n
\n

API\u00b6

\n
\n
\ndagster_celery.celery_executor ExecutorDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
broker (Union[dagster.StringSource, None], optional)
\n

The URL of the Celery broker. Default: \u2018pyamqp://guest@{os.getenv(\u2018DAGSTER_CELERY_BROKER_HOST\u2019,\u2019localhost\u2019)}//\u2019.

\n
\n
backend (Union[dagster.StringSource, None], optional)
\n

The URL of the Celery results backend. Default: \u2018rpc://\u2019.

\n

Default Value: \u2018rpc://\u2019

\n
\n
include (List[String], optional)
\n

List of modules every worker should import

\n
\n
config_source (Union[permissive dict, None], optional)
\n

Additional settings for the Celery app.

\n
\n
retries (selector, optional)
\n
\nDefault Value:
{\n    "enabled": {}\n}\n
\n
\n
\nConfig Schema:
\n
enabled (strict dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
disabled (strict dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
\n

Celery-based executor.

\n

The Celery executor exposes config settings for the underlying Celery app under\nthe config_source key. This config corresponds to the \u201cnew lowercase settings\u201d introduced\nin Celery version 4.0 and the object constructed from config will be passed to the\ncelery.Celery constructor as its config_source argument.\n(See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)

\n

The executor also exposes the broker, backend, and include arguments to the\ncelery.Celery constructor.

\n

In the most common case, you may want to modify the broker and backend (e.g., to use\nRedis instead of RabbitMQ). We expect that config_source will be less frequently\nmodified, but that when op executions are especially fast or slow, or when there are\ndifferent requirements around idempotence or retry, it may make sense to execute jobs\nwith variations on these settings.

\n

To use the celery_executor, set it as the executor_def when defining a job:

\n
from dagster import job\nfrom dagster_celery import celery_executor\n\n@job(executor_def=celery_executor)\ndef celery_enabled_job():\n    pass\n
\n
\n

Then you can configure the executor as follows:

\n
execution:\n  config:\n    broker: 'pyamqp://guest@localhost//'  # Optional[str]: The URL of the Celery broker\n    backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n    include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n    config_source: # Dict[str, Any]: Any additional parameters to pass to the\n        #...       # Celery workers. This dict will be passed as the `config_source`\n        #...       # argument of celery.Celery().\n
\n
\n

Note that the YAML you provide here must align with the configuration with which the Celery\nworkers on which you hope to run were started. If, for example, you point the executor at a\ndifferent broker than the one your workers are listening to, the workers will never be able to\npick up tasks for execution.

\n
\n\n
\n
\n

CLI\u00b6

\n

The dagster-celery CLI lets you start, monitor, and terminate workers.

\n
\n

dagster-celery worker start\u00b6

\n

Start a dagster celery worker.

\n
dagster-celery worker start [OPTIONS] [ADDITIONAL_ARGS]...\n
\n
\n

Options

\n
\n
\n-n, --name <name>\u00b6
\n

The name of the worker. Defaults to a unique name prefixed with \u201cdagster-\u201d and ending with the hostname.

\n
\n\n
\n
\n-y, --config-yaml <config_yaml>\u00b6
\n

Specify the path to a config YAML file with options for the worker. This is the same config block that you provide to dagster_celery.celery_executor when configuring a job for execution with Celery, with, e.g., the URL of the broker to use.

\n
\n\n
\n
\n-q, --queue <queue>\u00b6
\n

Names of the queues on which this worker should listen for tasks. Provide multiple -q arguments to specify multiple queues. Note that each celery worker may listen on no more than four queues.

\n
\n\n
\n
\n-d, --background\u00b6
\n

Set this flag to run the worker in the background.

\n
\n\n
\n
\n-i, --includes <includes>\u00b6
\n

Python modules the worker should import. Provide multiple -i arguments to specify multiple modules.

\n
\n\n
\n
\n-l, --loglevel <loglevel>\u00b6
\n

Log level for the worker.

\n
\n\n
\n
\n-A, --app <app>\u00b6
\n
\n\n

Arguments

\n
\n
\nADDITIONAL_ARGS\u00b6
\n

Optional argument(s)

\n
\n\n
\n
\n

dagster-celery worker list\u00b6

\n

List running dagster-celery workers. Note that we use the broker to contact the workers.

\n
dagster-celery worker list [OPTIONS]\n
\n
\n

Options

\n
\n
\n-y, --config-yaml <config_yaml>\u00b6
\n

Specify the path to a config YAML file with options for the workers you are trying to manage. This is the same config block that you provide to dagster_celery.celery_executor when configuring a job for execution with Celery, with, e.g., the URL of the broker to use. Without this config file, you will not be able to find your workers (since the CLI won\u2019t know how to reach the broker).

\n
\n\n
\n
\n

dagster-celery worker terminate\u00b6

\n

Shut down dagster-celery workers. Note that we use the broker to send signals to the workers to terminate \u2013 if the broker is not running, this command is a no-op. Provide the argument NAME to terminate a specific worker by name.

\n
dagster-celery worker terminate [OPTIONS] [NAME]\n
\n
\n

Options

\n
\n
\n-a, --all\u00b6
\n

Set this flag to terminate all running workers.

\n
\n\n
\n
\n-y, --config-yaml <config_yaml>\u00b6
\n

Specify the path to a config YAML file with options for the workers you are trying to manage. This is the same config block that you provide to dagster_celery.celery_executor when configuring a job for execution with Celery, with, e.g., the URL of the broker to use. Without this config file, you will not be able to terminate your workers (since the CLI won\u2019t know how to reach the broker).

\n
\n\n

Arguments

\n
\n
\nNAME\u00b6
\n

Optional argument

\n
\n\n
\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-celery", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../dagster-celery-docker/", "title": "Orchestration on Celery + Docker"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-azure/", "title": "Azure (dagster-azure)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-celery-docker", "Orchestration on Celery + Docker", "N", "next"], ["sections/api/apidocs/libraries/dagster-azure", "Azure (dagster-azure)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-celery.rst.txt", "title": "Celery (dagster-celery)", "toc": "\n"}, "dagster-celery-docker": {"alabaster_version": "0.7.12", "body": "
\n

Orchestration on Celery + Docker\u00b6

\n
\n
\n

APIs\u00b6

\n
\n
\ndagster_celery_docker.celery_docker_executor ExecutorDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
broker (Union[dagster.StringSource, None], optional)
\n

The URL of the Celery broker. Default: \u2018pyamqp://guest@{os.getenv(\u2018DAGSTER_CELERY_BROKER_HOST\u2019,\u2019localhost\u2019)}//\u2019.

\n
\n
backend (Union[dagster.StringSource, None], optional)
\n

The URL of the Celery results backend. Default: \u2018rpc://\u2019.

\n

Default Value: \u2018rpc://\u2019

\n
\n
include (List[String], optional)
\n

List of modules every worker should import

\n
\n
config_source (Union[permissive dict, None], optional)
\n

Additional settings for the Celery app.

\n
\n
retries (selector, optional)
\n
\nDefault Value:
{\n    "enabled": {}\n}\n
\n
\n
\nConfig Schema:
\n
enabled (strict dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
disabled (strict dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
docker (strict dict)
\n

The configuration for interacting with docker in the celery worker.

\n
\nConfig Schema:
\n
image (dagster.StringSource, optional)
\n

The docker image to be used for step execution.

\n
\n
registry (strict dict, optional)
\n

Information for using a non local/public docker registry

\n
\nConfig Schema:
\n
url (dagster.StringSource)
\n

\n
username (dagster.StringSource)
\n

\n
password (dagster.StringSource)
\n

\n
\n
\n
env_vars (List[String], optional)
\n

The list of environment variables names to forward from the celery worker in to the docker container

\n
\n
network (String, optional)
\n

Name of the network this container will be connected to at creation time

\n
\n
\n
\n
\n

Celery-based executor which launches tasks in docker containers.

\n

The Celery executor exposes config settings for the underlying Celery app under\nthe config_source key. This config corresponds to the \u201cnew lowercase settings\u201d introduced\nin Celery version 4.0 and the object constructed from config will be passed to the\ncelery.Celery constructor as its config_source argument.\n(See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)

\n

The executor also exposes the broker, backend, and include arguments to the\ncelery.Celery constructor.

\n

In the most common case, you may want to modify the broker and backend (e.g., to use\nRedis instead of RabbitMQ). We expect that config_source will be less frequently\nmodified, but that when op executions are especially fast or slow, or when there are\ndifferent requirements around idempotence or retry, it may make sense to execute jobs\nwith variations on these settings.

\n

To use the celery_docker_executor, set it as the executor_def when defining a job:

\n
from dagster import job\nfrom dagster_celery_docker.executor import celery_docker_executor\n\n@job(executor_def=celery_docker_executor)\ndef celery_enabled_job():\n    pass\n
\n
\n

Then you can configure the executor as follows:

\n
execution:\n  config:\n    docker:\n      image: 'my_repo.com/image_name:latest'\n      registry:\n        url: 'my_repo.com'\n        username: 'my_user'\n        password: {env: 'DOCKER_PASSWORD'}\n      env_vars: ["DAGSTER_HOME"] # environment vars to pass from celery worker to docker\n    broker: 'pyamqp://guest@localhost//'  # Optional[str]: The URL of the Celery broker\n    backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n    include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n    config_source: # Dict[str, Any]: Any additional parameters to pass to the\n        #...       # Celery workers. This dict will be passed as the `config_source`\n        #...       # argument of celery.Celery().\n
\n
\n

Note that the YAML you provide here must align with the configuration with which the Celery\nworkers on which you hope to run were started. If, for example, you point the executor at a\ndifferent broker than the one your workers are listening to, the workers will never be able to\npick up tasks for execution.

\n

In deployments where the celery_docker_executor is used, all appropriate celery and dagster_celery\ncommands must be invoked with the -A dagster_celery_docker.app argument.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-celery-docker", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../dagster-celery-k8s/", "title": "Orchestration on Celery + Kubernetes"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-celery/", "title": "Celery (dagster-celery)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-celery-k8s", "Orchestration on Celery + Kubernetes", "N", "next"], ["sections/api/apidocs/libraries/dagster-celery", "Celery (dagster-celery)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-celery-docker.rst.txt", "title": "Orchestration on Celery + Docker", "toc": "\n"}, "dagster-celery-k8s": {"alabaster_version": "0.7.12", "body": "
\n

Orchestration on Celery + Kubernetes\u00b6

\n
\n
\n

APIs\u00b6

\n
\n
\ndagster_celery_k8s.CeleryK8sRunLauncher RunLauncher[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
instance_config_map (dagster.StringSource)
\n

The name of an existing Volume to mount into the pod in order to provide a ConfigMap for the Dagster instance. This Volume should contain a dagster.yaml with appropriate values for run storage, event log storage, etc.

\n
\n
postgres_password_secret (dagster.StringSource, optional)
\n

The name of the Kubernetes Secret where the postgres password can be retrieved. Will be mounted and supplied as an environment variable to the Job Pod. The Secret must contain the key "postgresql-password", which will be exposed in the Job environment as the environment variable DAGSTER_PG_PASSWORD.

\n
\n
dagster_home (dagster.StringSource, optional)
\n

The location of DAGSTER_HOME in the Job container; this is where the dagster.yaml file will be mounted from the instance ConfigMap specified here. Defaults to /opt/dagster/dagster_home.

\n

Default Value: \u2018/opt/dagster/dagster_home\u2019

\n
\n
load_incluster_config (Bool, optional)
\n

Set this value if you are running the launcher\nwithin a k8s cluster. If True, we assume the launcher is running within the target\ncluster and load config using kubernetes.config.load_incluster_config. Otherwise,\nwe will use the k8s config specified in kubeconfig_file (using\nkubernetes.config.load_kube_config) or fall back to the default kubeconfig.

\n

Default Value: True

\n
\n
kubeconfig_file (Union[String, None], optional)
\n

The kubeconfig file from which to load config. Defaults to using the default kubeconfig.

\n

Default Value: None

\n
\n
fail_pod_on_run_failure (Bool, optional)
\n

Whether the launched Kubernetes Jobs and Pods should fail if the Dagster run fails

\n
\n
job_image (Union[dagster.StringSource, None], optional)
\n

Docker image to use for launched Jobs. If this field is empty, the image that was used to originally load the Dagster repository will be used.(Ex: \u201cmycompany.com/dagster-k8s-image:latest\u201d).

\n
\n
image_pull_policy (Union[dagster.StringSource, None], optional)
\n

Image pull policy to set on the launched task Job Pods. Defaults to \u201cIfNotPresent\u201d.

\n
\n
image_pull_secrets (Union[List[strict dict], None], optional)
\n

(Advanced) Specifies that Kubernetes should get the credentials from the Secrets named in this list.

\n
\n
service_account_name (Union[dagster.StringSource, None], optional)
\n

(Advanced) Override the name of the Kubernetes service account under which to run the Job.

\n
\n
env_config_maps (Union[List[dagster.StringSource], None], optional)
\n

A list of custom ConfigMapEnvSource names from which to draw environment variables (using envFrom) for the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/#define-an-environment-variable-for-a-container

\n
\n
env_secrets (Union[List[dagster.StringSource], None], optional)
\n

A list of custom Secret names from which to draw environment variables (using envFrom) for the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
env_vars (Union[List[String], None], optional)
\n

A list of environment variables to inject into the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
volume_mounts (List[strict dict], optional)
\n

A list of volume mounts to include in the job\u2019s container. Default: []. See: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volumemount-v1-core

\n

Default Value: []

\n
\n
volumes (List[permissive dict], optional)
\n

A list of volumes to include in the Job\u2019s Pod. Default: []. For the many possible volume source types that can be included, see: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volume-v1-core

\n

Default Value: []

\n
\n
labels (permissive dict, optional)
\n

Additional labels that should be included in the Job\u2019s Pod. See: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels

\n
\n
broker (Union[dagster.StringSource, None], optional)
\n

The URL of the Celery broker. Default: \u2018pyamqp://guest@{os.getenv(\u2018DAGSTER_CELERY_BROKER_HOST\u2019,\u2019localhost\u2019)}//\u2019.

\n
\n
backend (Union[dagster.StringSource, None], optional)
\n

The URL of the Celery results backend. Default: \u2018rpc://\u2019.

\n

Default Value: \u2018rpc://\u2019

\n
\n
include (List[String], optional)
\n

List of modules every worker should import

\n
\n
config_source (Union[permissive dict, None], optional)
\n

Additional settings for the Celery app.

\n
\n
retries (selector, optional)
\n
\nDefault Value:
{\n    "enabled": {}\n}\n
\n
\n
\nConfig Schema:
\n
enabled (strict dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
disabled (strict dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
\n

In contrast to the K8sRunLauncher, which launches dagster runs as single K8s\nJobs, this run launcher is intended for use in concert with\ndagster_celery_k8s.celery_k8s_job_executor().

\n

With this run launcher, execution is delegated to:

\n
\n
    \n
  1. A run worker Kubernetes Job, which traverses the dagster run execution plan and\nsubmits steps to Celery queues for execution;

  2. The step executions which are submitted to Celery queues are picked up by Celery workers,\nand each step execution spawns a step execution Kubernetes Job. See the implementation\ndefined in dagster_celery_k8s.executor.create_k8s_job_task().

\n
\n

You can configure a Dagster instance to use this RunLauncher by adding a section to your\ndagster.yaml like the following:

\n
run_launcher:\n  module: dagster_celery_k8s.launcher\n  class: CeleryK8sRunLauncher\n  config:\n    instance_config_map: "dagster-k8s-instance-config-map"\n    dagster_home: "/some/path"\n    postgres_password_secret: "dagster-k8s-pg-password"\n    broker: "some_celery_broker_url"\n    backend: "some_celery_backend_url"\n
\n
\n
\n\n
\n
\ndagster_celery_k8s.celery_k8s_job_executor ExecutorDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
broker (Union[dagster.StringSource, None], optional)
\n

The URL of the Celery broker. Default: \u2018pyamqp://guest@{os.getenv(\u2018DAGSTER_CELERY_BROKER_HOST\u2019,\u2019localhost\u2019)}//\u2019.

\n
\n
backend (Union[dagster.StringSource, None], optional)
\n

The URL of the Celery results backend. Default: \u2018rpc://\u2019.

\n

Default Value: \u2018rpc://\u2019

\n
\n
include (List[String], optional)
\n

List of modules every worker should import

\n
\n
config_source (Union[permissive dict, None], optional)
\n

Additional settings for the Celery app.

\n
\n
retries (selector, optional)
\n
\nDefault Value:
{\n    "enabled": {}\n}\n
\n
\n
\nConfig Schema:
\n
enabled (strict dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
disabled (strict dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
job_image (Union[dagster.StringSource, None], optional)
\n

Docker image to use for launched Jobs. If this field is empty, the image that was used to originally load the Dagster repository will be used (e.g. \u201cmycompany.com/dagster-k8s-image:latest\u201d).

\n
\n
image_pull_policy (Union[dagster.StringSource, None], optional)
\n

Image pull policy to set on the launched task Job Pods. Defaults to \u201cIfNotPresent\u201d.

\n
\n
image_pull_secrets (Union[List[strict dict], None], optional)
\n

(Advanced) Specifies that Kubernetes should get the credentials from the Secrets named in this list.

\n
\n
service_account_name (Union[dagster.StringSource, None], optional)
\n

(Advanced) Override the name of the Kubernetes service account under which to run the Job.

\n
\n
env_config_maps (Union[List[dagster.StringSource], None], optional)
\n

A list of custom ConfigMapEnvSource names from which to draw environment variables (using envFrom) for the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/#define-an-environment-variable-for-a-container

\n
\n
env_secrets (Union[List[dagster.StringSource], None], optional)
\n

A list of custom Secret names from which to draw environment variables (using envFrom) for the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
env_vars (Union[List[String], None], optional)
\n

A list of environment variables to inject into the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
volume_mounts (List[strict dict], optional)
\n

A list of volume mounts to include in the job\u2019s container. Default: []. See: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volumemount-v1-core

\n

Default Value: []

\n
\n
volumes (List[permissive dict], optional)
\n

A list of volumes to include in the Job\u2019s Pod. Default: []. For the many possible volume source types that can be included, see: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volume-v1-core

\n

Default Value: []

\n
\n
labels (permissive dict, optional)
\n

Additional labels that should be included in the Job\u2019s Pod. See: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels

\n
\n
load_incluster_config (Bool, optional)
\n

Set this value if you are running the launcher within a k8s cluster. If\nTrue, we assume the launcher is running within the target cluster and load config\nusing kubernetes.config.load_incluster_config. Otherwise, we will use the k8s config\nspecified in kubeconfig_file (using kubernetes.config.load_kube_config) or fall\nback to the default kubeconfig. Default: True.

\n

Default Value: True

\n
\n
kubeconfig_file (Union[String, None], optional)
\n

Path to a kubeconfig file to use, if not using default kubeconfig.

\n
\n
job_namespace (dagster.StringSource, optional)
\n

The namespace into which to launch new jobs. Note that any other Kubernetes resources the Job requires (such as the service account) must be present in this namespace. Default: "default"

\n

Default Value: \u2018default\u2019

\n
\n
repo_location_name (dagster.StringSource, optional)
\n

The repository location name to use for execution.

\n

Default Value: \u2018<<in_process>>\u2019

\n
\n
job_wait_timeout (Float, optional)
\n

Wait this many seconds for a job to complete before marking the run as failed. Defaults to 86400.0 seconds.

\n

Default Value: 86400.0

\n
\n
\n

Celery-based executor which launches tasks as Kubernetes Jobs.

\n

The Celery executor exposes config settings for the underlying Celery app under\nthe config_source key. This config corresponds to the \u201cnew lowercase settings\u201d introduced\nin Celery version 4.0 and the object constructed from config will be passed to the\ncelery.Celery constructor as its config_source argument.\n(See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)

\n

The executor also exposes the broker, backend, and include arguments to the\ncelery.Celery constructor.

\n

In the most common case, you may want to modify the broker and backend (e.g., to use\nRedis instead of RabbitMQ). We expect that config_source will be less frequently\nmodified, but that when op executions are especially fast or slow, or when there are\ndifferent requirements around idempotence or retry, it may make sense to execute dagster jobs\nwith variations on these settings.

\n

To use the celery_k8s_job_executor, set it as the executor_def when defining a job:

\n
from dagster_celery_k8s.executor import celery_k8s_job_executor\n\nfrom dagster import job\n\n\n@job(executor_def=celery_k8s_job_executor)\ndef celery_enabled_job():\n    pass\n
\n
\n

Then you can configure the executor as follows:

\n
execution:\n  config:\n    job_image: 'my_repo.com/image_name:latest'\n    job_namespace: 'some-namespace'\n    broker: 'pyamqp://guest@localhost//'  # Optional[str]: The URL of the Celery broker\n    backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n    include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n    config_source: # Dict[str, Any]: Any additional parameters to pass to the\n        #...       # Celery workers. This dict will be passed as the `config_source`\n        #...       # argument of celery.Celery().\n
\n
\n
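If you prefer to fix these values in code rather than in run config, the executor can also be pre-configured with configured(); the following is a hedged sketch in which the image, broker, and backend values are placeholders:

\n
from dagster import job\nfrom dagster_celery_k8s.executor import celery_k8s_job_executor\n\n# Illustrative values only; substitute your own image, broker, and backend.\nconfigured_celery_k8s_executor = celery_k8s_job_executor.configured(\n    {\n        "job_image": "my_repo.com/image_name:latest",\n        "broker": "pyamqp://guest@localhost//",\n        "backend": "rpc://",\n    }\n)\n\n@job(executor_def=configured_celery_k8s_executor)\ndef my_celery_k8s_job():\n    pass\n
\n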

Note that the YAML you provide here must align with the configuration with which the Celery\nworkers on which you hope to run were started. If, for example, you point the executor at a\ndifferent broker than the one your workers are listening to, the workers will never be able to\npick up tasks for execution.

\n

In deployments where the celery_k8s_job_executor is used all appropriate celery and dagster_celery\ncommands must be invoked with the -A dagster_celery_k8s.app argument.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-celery-k8s", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../dagster-dask/", "title": "Dask (dagster-dask)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-celery-docker/", "title": "Orchestration on Celery + Docker"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-dask", "Dask (dagster-dask)", "N", "next"], ["sections/api/apidocs/libraries/dagster-celery-docker", "Orchestration on Celery + Docker", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-celery-k8s.rst.txt", "title": "Orchestration on Celery + Kubernetes", "toc": "\n"}, "dagster-dask": {"alabaster_version": "0.7.12", "body": "
\n

Dask (dagster-dask)\u00b6

\n

See also the Dask deployment guide.

\n
\n
\ndagster_dask.dask_executor ExecutorDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
cluster (selector)
\n
\nConfig Schema:
\n
existing (strict dict)
\n

Connect to an existing scheduler.

\n
\nConfig Schema:
\n
address (dagster.StringSource)
\n

\n
\n
\n
local (permissive dict, optional)
\n

Local cluster configuration.

\n
\n
yarn (permissive dict, optional)
\n

YARN cluster configuration.

\n
\n
ssh (permissive dict, optional)
\n

SSH cluster configuration.

\n
\n
pbs (permissive dict, optional)
\n

PBS cluster configuration.

\n
\n
moab (permissive dict, optional)
\n

Moab cluster configuration.

\n
\n
sge (permissive dict, optional)
\n

SGE cluster configuration.

\n
\n
lsf (permissive dict, optional)
\n

LSF cluster configuration.

\n
\n
slurm (permissive dict, optional)
\n

SLURM cluster configuration.

\n
\n
oar (permissive dict, optional)
\n

OAR cluster configuration.

\n
\n
kube (permissive dict, optional)
\n

Kubernetes cluster configuration.

\n
\n
\n
\n
\n

Dask-based executor.

\n

The \u2018cluster\u2019 can be one of the following:\n(\u2018existing\u2019, \u2018local\u2019, \u2018yarn\u2019, \u2018ssh\u2019, \u2018pbs\u2019, \u2018moab\u2019, \u2018sge\u2019, \u2018lsf\u2019, \u2018slurm\u2019, \u2018oar\u2019, \u2018kube\u2019).

\n

If the Dask executor is used without providing executor-specific config, a local Dask cluster\nwill be created (as when calling dask.distributed.Client()\nwith dask.distributed.LocalCluster()).

\n

The Dask executor optionally takes the following config:

\n
cluster:\n    {\n        local?: # takes distributed.LocalCluster parameters\n            {\n                timeout?: 5,  # Timeout duration for initial connection to the scheduler\n                n_workers?: 4  # Number of workers to start\n                threads_per_worker?: 1 # Number of threads per each worker\n            }\n    }\n
\n
\n

To use the dask_executor, set it as the executor_def when defining a job:

\n
from dagster import job\nfrom dagster_dask import dask_executor\n\n@job(executor_def=dask_executor)\ndef dask_enabled_job():\n    pass\n
\n
\n
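To target an existing Dask scheduler instead of a local cluster, the same config can also be supplied in code via configured(); a minimal sketch in which the scheduler address is a placeholder:

\n
from dagster import job\nfrom dagster_dask import dask_executor\n\n# Placeholder address; point this at your running Dask scheduler.\nexisting_cluster_executor = dask_executor.configured(\n    {"cluster": {"existing": {"address": "tcp://127.0.0.1:8786"}}}\n)\n\n@job(executor_def=existing_cluster_executor)\ndef dask_existing_cluster_job():\n    pass\n
\n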
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-dask", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "", "next": {"link": "../dagster-databricks/", "title": "Databricks (dagster-databricks)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-celery-k8s/", "title": "Orchestration on Celery + Kubernetes"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-databricks", "Databricks (dagster-databricks)", "N", "next"], ["sections/api/apidocs/libraries/dagster-celery-k8s", "Orchestration on Celery + Kubernetes", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-dask.rst.txt", "title": "Dask (dagster-dask)", "toc": "\n"}, "dagster-databricks": {"alabaster_version": "0.7.12", "body": "
\n

Databricks (dagster-databricks)\u00b6

\n

The dagster_databricks package provides two main pieces of functionality:

\n\n

Note that, for the databricks_pyspark_step_launcher, either S3 or Azure Data Lake Storage config\nmust be specified for ops to succeed, and the credentials for this storage must also be\nstored as a Databricks Secret and referenced in the resource config so that the Databricks cluster can\naccess storage.

\n
\n
\n

APIs\u00b6

\n
\n
\ndagster_databricks.create_databricks_job_op(name='databricks_job', num_inputs=1, description=None, required_resource_keys=frozenset({'databricks_client'}))[source]\u00b6
\n

Creates an op that launches a databricks job (not to be confused with Dagster\u2019s job API).

\n

As config, the op accepts a blob of the form described in Databricks\u2019 job API:\nhttps://docs.databricks.com/dev-tools/api/latest/jobs.html.

\n
\n
Returns
\n

An op definition.

\n
\n
Return type
\n

OpDefinition

\n
\n
\n

Example

\n
from dagster import graph\nfrom dagster_databricks import create_databricks_job_op, databricks_client\n\nsparkpi = create_databricks_job_op().configured(\n    {\n        "job": {\n            "name": "SparkPi Python job",\n            "new_cluster": {\n                "spark_version": "7.3.x-scala2.12",\n                "node_type_id": "i3.xlarge",\n                "num_workers": 2,\n            },\n            "spark_python_task": {"python_file": "dbfs:/docs/pi.py", "parameters": ["10"]},\n        }\n    },\n    name="sparkpi",\n)\n\n@graph\ndef my_spark():\n    sparkpi()\n\nmy_spark.to_job(\n    resource_defs={\n        "databricks_client": databricks_client.configured(\n            {"host": "my.workspace.url", "token": "my.access.token"}\n        )\n    }\n)\n
\n
\n
\n\n
\n
\ndagster_databricks.databricks_pyspark_step_launcher ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
run_config (strict dict)
\n

Databricks job run configuration

\n
\nConfig Schema:
\n
cluster (selector)
\n
\nConfig Schema:
\n
new (strict dict)
\n
\nConfig Schema:
\n
size (selector)
\n
\nConfig Schema:
\n
autoscale (strict dict)
\n
\nConfig Schema:
\n
min_workers (Int)
\n

The minimum number of workers to which the cluster can scale down when underutilized. It is also the initial number of workers the cluster will have after creation.

\n
\n
max_workers (Int)
\n

The maximum number of workers to which the cluster can scale up when overloaded. max_workers must be strictly greater than min_workers.

\n
\n
\n
\n
num_workers (Int)
\n

The number of worker nodes that this cluster should have. A cluster has one Spark Driver and num_workers Executors for a total of num_workers + 1 Spark nodes.

\n
\n
\n
\n
spark_version (String)
\n

The Spark version of the cluster. A list of available Spark versions can be retrieved by using the Runtime versions API call. This field is required.

\n
\n
spark_conf (permissive dict, optional)
\n

An object containing a set of optional, user-specified Spark configuration key-value pairs. You can also pass in a string of extra JVM options to the driver and the executors via spark.driver.extraJavaOptions and spark.executor.extraJavaOptions respectively. Example Spark confs: {\u201cspark.speculation\u201d: true, \u201cspark.streaming.ui.retainedBatches\u201d: 5} or {\u201cspark.driver.extraJavaOptions\u201d: \u201c-verbose:gc -XX:+PrintGCDetails\u201d}

\n
\n
nodes (selector)
\n

The nodes used in the cluster. Either the node types or an instance pool can be specified.

\n
\nConfig Schema:
\n
node_types (strict dict)
\n
\nConfig Schema:
\n
node_type_id (String)
\n

This field encodes, through a single value, the resources available to each of the Spark nodes in this cluster. For example, the Spark nodes can be provisioned and optimized for memory or compute intensive workloads. A list of available node types can be retrieved by using the List node types API call. This field is required.

\n
\n
driver_node_type_id (String, optional)
\n

The node type of the Spark driver. This field is optional; if unset, the driver node type is set as the same value as node_type_id defined above.

\n
\n
\n
\n
instance_pool_id (String, optional)
\n

The optional ID of the instance pool to which the cluster belongs. Refer to the Instance Pools API for details.

\n
\n
\n
\n
ssh_public_keys (List[String], optional)
\n

SSH public key contents that will be added to each Spark node in this cluster. The corresponding private keys can be used to login with the user name ubuntu on port 2200. Up to 10 keys can be specified.

\n
\n
custom_tags (List[strict dict], optional)
\n

Additional tags for cluster resources. Databricks tags all cluster resources (e.g., AWS instances and EBS volumes) with these tags in addition to default_tags. Note: tags are not supported on legacy node types such as compute-optimized and memory-optimized, and Databricks allows at most 45 custom tags. More restrictions may apply if using Azure Databricks; refer to the official docs for further details.

\n
\n
cluster_log_conf (selector, optional)
\n

Recommended! The configuration for delivering Spark logs to a long-term storage destination. Only one destination can be specified for one cluster. If the conf is given, the logs will be delivered to the destination every 5 mins. The destination of driver logs is <destination>/<cluster-id>/driver, while the destination of executor logs is <destination>/<cluster-id>/executor.

\n
\nConfig Schema:
\n
dbfs (strict dict)
\n

DBFS storage information

\n
\nConfig Schema:
\n
destination (String)
\n

DBFS destination, e.g. dbfs:/my/path

\n
\n
\n
\n
s3 (strict dict)
\n

S3 storage information

\n
\nConfig Schema:
\n
destination (String)
\n

S3 destination, e.g. s3://my-bucket/some-prefix. You must configure the cluster with an instance profile and the instance profile must have write access to the destination. You cannot use AWS keys.

\n
\n
region (String)
\n

S3 region, e.g. us-west-2. Either region or endpoint must be set. If both are set, endpoint is used.

\n
\n
endpoint (String)
\n

S3 endpoint, e.g. https://s3-us-west-2.amazonaws.com. Either region or endpoint must be set. If both are set, endpoint is used.

\n
\n
enable_encryption (Bool, optional)
\n

(Optional) Enable server side encryption, false by default.

\n
\n
encryption_type (String, optional)
\n

(Optional) The encryption type, it could be sse-s3 or sse-kms. It is used only when encryption is enabled and the default type is sse-s3.

\n
\n
kms_key (String, optional)
\n

(Optional) KMS key used if encryption is enabled and encryption type is set to sse-kms.

\n
\n
canned_acl (String, optional)
\n

(Optional) Set canned access control list, e.g. bucket-owner-full-control. If canned_acl is set, the cluster instance profile must have s3:PutObjectAcl permission on the destination bucket and prefix. The full list of possible canned ACLs can be found at https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl. By default only the object owner gets full control. If you are using a cross-account role for writing data, you may want to set bucket-owner-full-control to make the bucket owner able to read the logs.

\n
\n
\n
\n
\n
\n
init_scripts (List[selector], optional)
\n

The configuration for storing init scripts. Any number of scripts can be specified. The scripts are executed sequentially in the order provided. If cluster_log_conf is specified, init script logs are sent to <destination>/<cluster-id>/init_scripts.

\n
\n
spark_env_vars (permissive dict, optional)
\n

An object containing a set of optional, user-specified environment variable key-value pairs. Key-value pair of the form (X,Y) are exported as is (i.e., export X=\u201dY\u201d) while launching the driver and workers. To specify an additional set of SPARK_DAEMON_JAVA_OPTS, we recommend appending them to $SPARK_DAEMON_JAVA_OPTS as shown in the example below. This ensures that all default Databricks managed environmental variables are included as well. Example Spark environment variables: {\u201cSPARK_WORKER_MEMORY\u201d: \u201c28000m\u201d, \u201cSPARK_LOCAL_DIRS\u201d: \u201c/local_disk0\u201d} or {\u201cSPARK_DAEMON_JAVA_OPTS\u201d: \u201c$SPARK_DAEMON_JAVA_OPTS -Dspark.shuffle.service.enabled=true\u201d}

\n
\n
enable_elastic_disk (Bool, optional)
\n

Autoscaling Local Storage: when enabled, this cluster dynamically acquires additional disk space when its Spark workers are running low on disk space. This feature requires specific AWS permissions to function correctly - refer to https://docs.databricks.com/clusters/configure.html#autoscaling-local-storage for details.

\n
\n
\n
\n
existing (String)
\n

The ID of an existing cluster that will be used for all runs of this job. When running jobs on an existing cluster, you may need to manually restart the cluster if it stops responding. Databricks suggests running jobs on new clusters for greater reliability.

\n
\n
\n
\n
run_name (String, optional)
\n

An optional name for the run. The default value is Untitled

\n
\n
libraries (List[selector], optional)
\n

An optional list of libraries to be installed on the cluster that will execute the job. By default dagster, dagster-databricks and dagster-pyspark libraries will be included.

\n
\n
timeout_seconds (Int, optional)
\n

An optional timeout applied to each run of this job. The default behavior is to have no timeout.

\n
\n
idempotency_token (String, optional)
\n

An optional token that can be used to guarantee the idempotency of job run requests. If an active run with the provided token already exists, the request will not create a new run, but will return the ID of the existing run instead. If you specify the idempotency token, upon failure you can retry until the request succeeds. Databricks guarantees that exactly one run will be launched with that idempotency token. This token should have at most 64 characters.

\n
\n
\n
\n
databricks_host (dagster.StringSource)
\n

Databricks host, e.g. uksouth.azuredatabricks.com

\n
\n
databricks_token (dagster.StringSource)
\n

Databricks access token

\n
\n
secrets_to_env_variables (List[strict dict], optional)
\n

Databricks secrets to be exported as environment variables. Since runs will execute in the Databricks runtime environment, environment variables (such as those required for a StringSource config variable) will not be accessible to Dagster. These variables must be stored as Databricks secrets and specified here, which will ensure they are re-exported as environment variables accessible to Dagster upon execution.

\n
\n
storage (selector, optional)
\n

Databricks storage configuration for either S3 or ADLS2. If access credentials for your Databricks storage are stored in Databricks secrets, this config indicates the secret scope and the secret keys used to access either S3 or ADLS2.

\n
\nConfig Schema:
\n
s3 (strict dict)
\n

S3 storage secret configuration

\n
\nConfig Schema:
\n
secret_scope (String)
\n

The Databricks secret scope containing the storage secrets.

\n
\n
access_key_key (String)
\n

The key of a Databricks secret containing the S3 access key ID.

\n
\n
secret_key_key (String)
\n

The key of a Databricks secret containing the S3 secret access key.

\n
\n
\n
\n
adls2 (strict dict)
\n

ADLS2 storage secret configuration

\n
\nConfig Schema:
\n
secret_scope (String)
\n

The Databricks secret scope containing the storage secrets.

\n
\n
storage_account_name (String)
\n

The name of the storage account used to access data.

\n
\n
storage_account_key_key (String)
\n

The key of a Databricks secret containing the storage account secret key.

\n
\n
\n
\n
\n
\n
local_pipeline_package_path (dagster.StringSource, optional)
\n

Absolute path to the package that contains the pipeline definition(s) whose steps will execute remotely on Databricks. This is a path on the local filesystem of the process executing the pipeline. Before every step run, the launcher will zip up the code in this path, upload it to DBFS, and unzip it into the Python path of the remote Spark process. This gives the remote process access to up-to-date user code.

\n
\n
local_dagster_job_package_path (dagster.StringSource, optional)
\n

Absolute path to the package that contains the dagster job definition(s) whose steps will execute remotely on Databricks. This is a path on the local filesystem of the process executing the dagster job. Before every step run, the launcher will zip up the code in this path, upload it to DBFS, and unzip it into the Python path of the remote Spark process. This gives the remote process access to up-to-date user code.

\n
\n
staging_prefix (dagster.StringSource, optional)
\n

Directory in DBFS to use for uploaded job code. Must be absolute.

\n

Default Value: \u2018/dagster_staging\u2019

\n
\n
wait_for_logs (Bool, optional)
\n

If set, and if the specified cluster is configured to export logs, the system will wait after job completion for the logs to appear in the configured location. Note that logs are copied every 5 minutes, so enabling this will add several minutes to the job runtime. NOTE: this integration will export stdout/stderr from the remote Databricks process automatically, so this option is not generally necessary.

\n

Default Value: False

\n
\n
max_completion_wait_time_seconds (dagster.IntSource, optional)
\n

If the Databricks job run takes more than this many seconds, then consider it failed and terminate the step.

\n

Default Value: 86400

\n
\n
poll_interval_sec (Float, optional)
\n

How frequently Dagster will poll Databricks to determine the state of the job.

\n

Default Value: 5.0

\n
\n
\n

Resource for running ops as a Databricks Job.

\n

When this resource is used, the op will be executed in Databricks using the \u2018Run Submit\u2019\nAPI. Pipeline code will be zipped up and copied to a directory in DBFS along with the op\u2019s\nexecution context.

\n

Use the \u2018run_config\u2019 configuration to specify the details of the Databricks cluster used, and\nthe \u2018storage\u2019 key to configure persistent storage on that cluster. Storage is accessed by\nsetting the credentials in the Spark context, as documented here for S3 and here for ADLS.

\n
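As a hedged sketch of how the schema above fits together, a configured step launcher might look like the following; the cluster size, Spark version, node type, environment variable names, secret scope, and key names are all placeholders to adapt to your workspace:

\n
from dagster_databricks import databricks_pyspark_step_launcher\n\n# All values below are placeholders for illustration.\nmy_databricks_step_launcher = databricks_pyspark_step_launcher.configured(\n    {\n        "run_config": {\n            "run_name": "dagster_step",\n            "cluster": {\n                "new": {\n                    "size": {"num_workers": 1},\n                    "spark_version": "7.3.x-scala2.12",\n                    "nodes": {"node_types": {"node_type_id": "i3.xlarge"}},\n                }\n            },\n        },\n        "databricks_host": {"env": "DATABRICKS_HOST"},\n        "databricks_token": {"env": "DATABRICKS_TOKEN"},\n        "local_dagster_job_package_path": ".",\n        "storage": {\n            "s3": {\n                "secret_scope": "my-secret-scope",\n                "access_key_key": "aws-access-key",\n                "secret_key_key": "aws-secret-key",\n            }\n        },\n    }\n)\n
\n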
\n\n
\n
\nclass dagster_databricks.DatabricksError[source]\u00b6
\n
\n\n
\n
\n

Legacy APIs\u00b6

\n
\n
\ndagster_databricks.create_databricks_job_solid(name='databricks_job', num_inputs=1, description=None, required_resource_keys=frozenset({'databricks_client'}))[source]\u00b6
\n

Creates a solid that launches a databricks job.

\n

As config, the solid accepts a blob of the form described in Databricks\u2019 job API:\nhttps://docs.databricks.com/dev-tools/api/latest/jobs.html.

\n
\n
Returns
\n

A solid definition.

\n
\n
Return type
\n

SolidDefinition

\n
\n
\n

Example

\n
from dagster import ModeDefinition, pipeline\nfrom dagster_databricks import create_databricks_job_solid, databricks_client\n\nsparkpi = create_databricks_job_solid().configured(\n    {\n        "job": {\n            "name": "SparkPi Python job",\n            "new_cluster": {\n                "spark_version": "7.3.x-scala2.12",\n                "node_type_id": "i3.xlarge",\n                "num_workers": 2,\n            },\n            "spark_python_task": {"python_file": "dbfs:/docs/pi.py", "parameters": ["10"]},\n        }\n    },\n    name="sparkspi",\n)\n\n\n@pipeline(\n    mode_defs=[\n        ModeDefinition(\n            resource_defs={\n                "databricks_client": databricks_client.configured(\n                    {"host": "my.workspace.url", "token": "my.access.token"}\n                )\n            }\n        )\n    ]\n)\ndef my_pipeline():\n    sparkpi()\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-databricks", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../dagster-datadog/", "title": "Datadog (dagster-datadog)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-dask/", "title": "Dask (dagster-dask)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-datadog", "Datadog (dagster-datadog)", "N", "next"], ["sections/api/apidocs/libraries/dagster-dask", "Dask (dagster-dask)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-databricks.rst.txt", "title": "Databricks (dagster-databricks)", "toc": "\n"}, "dagster-datadog": {"alabaster_version": "0.7.12", "body": "
\n

Datadog (dagster-datadog)\u00b6

\n

This library provides an integration with Datadog, to support publishing metrics to Datadog from\nwithin Dagster ops.

\n
\n

\n
\n

We use the Python datadogpy library. To use it, you\u2019ll\nfirst need to create a DataDog account and get both API and Application keys.

\n
\n

\n
\n

The integration uses DogStatsD, so you\u2019ll need\nto ensure the datadog agent is running on the host you\u2019re sending metrics from.

\n
\n
\ndagster_datadog.datadog_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
api_key (dagster.StringSource)
\n

Datadog API key

\n
\n
app_key (dagster.StringSource)
\n

Datadog application key

\n
\n
\n

This resource is a thin wrapper over the\ndogstatsd library.

\n

As such, we directly mirror the public API methods of DogStatsd here; you can refer to the\nDataDog documentation for how to use this\nresource.

\n

Examples

\n
from dagster import job, op\nfrom dagster_datadog import datadog_resource\n\n@op(required_resource_keys={'datadog'})\ndef datadog_op(context):\n    dd = context.resources.datadog\n\n    dd.event('Man down!', 'This server needs assistance.')\n    dd.gauge('users.online', 1001, tags=["protocol:http"])\n    dd.increment('page.views')\n    dd.decrement('page.views')\n    dd.histogram('album.photo.count', 26, tags=["gender:female"])\n    dd.distribution('album.photo.count', 26, tags=["color:blue"])\n    dd.set('visitors.uniques', 999, tags=["browser:ie"])\n    dd.service_check('svc.check_name', dd.WARNING)\n    dd.timing("query.response.time", 1234)\n\n    # Use timed decorator\n    @dd.timed('run_fn')\n    def run_fn():\n        pass\n\n    run_fn()\n\n@job(resource_defs={'datadog': datadog_resource})\ndef dd_job():\n    datadog_op()\n\nresult = dd_job.execute_in_process(\n    run_config={'resources': {'datadog': {'config': {'api_key': 'YOUR_KEY', 'app_key': 'YOUR_KEY'}}}}\n)\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-datadog", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "", "next": {"link": "../dagster-dbt/", "title": "dbt (dagster-dbt)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-databricks/", "title": "Databricks (dagster-databricks)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-dbt", "dbt (dagster-dbt)", "N", "next"], ["sections/api/apidocs/libraries/dagster-databricks", "Databricks (dagster-databricks)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-datadog.rst.txt", "title": "Datadog (dagster-datadog)", "toc": "\n"}, "dagster-dbt": {"alabaster_version": "0.7.12", "body": "
\n

dbt (dagster-dbt)\u00b6

\n

This library provides a Dagster integration with dbt (data build tool), created by dbt Labs.

\n
\n

Ops\u00b6

\n
\n

dbt Core Ops\u00b6

\n

dagster_dbt provides a set of pre-built ops that work with either the CLI or RPC interfaces. For\nmore advanced use cases, we suggest building your own ops which directly interact with these resources.

\n
\n
\ndagster_dbt.dbt_run_op = <dagster.core.definitions.op_definition.OpDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
yield_materializations (Bool, optional)
\n

If True, materializations corresponding to the results of the dbt operation will be yielded when the op executes. Default: True

\n

Default Value: True

\n
\n
asset_key_prefix (List[String], optional)
\n

If provided and yield_materializations is True, these components will be used to prefix the generated asset keys.

\n

Default Value: [\u2018dbt\u2019]

\n
\n
\n

This op executes a dbt run command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the dbt_cli_resource) or\nover RPC (using the dbt_rpc_sync_resource).

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_run_op, dbt_cli_resource, dbt_rpc_sync_resource\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    dbt_run_op()\n\n@job(resource_defs={"dbt":dbt_rpc_sync_resource})\ndef my_dbt_rpc_job():\n    dbt_run_op()\n
\n
\n
\n\n
\n
\ndagster_dbt.dbt_compile_op(context)[source]\u00b6
\n

This op executes a dbt compile command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the dbt_cli_resource) or\nover RPC (using the dbt_rpc_sync_resource).

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_compile_op, dbt_cli_resource, dbt_rpc_sync_resource\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    dbt_compile_op()\n\n@job(resource_defs={"dbt":dbt_rpc_sync_resource})\ndef my_dbt_rpc_job():\n    dbt_compile_op()\n
\n
\n
\n\n
\n
\ndagster_dbt.dbt_ls_op(context)[source]\u00b6
\n

This op executes a dbt ls command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the dbt_cli_resource) or\nover RPC (using the dbt_rpc_sync_resource).

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_ls_op, dbt_cli_resource, dbt_rpc_sync_resource\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    dbt_ls_op()\n\n@job(resource_defs={"dbt":dbt_rpc_sync_resource})\ndef my_dbt_rpc_job():\n    dbt_ls_op()\n
\n
\n
\n\n
\n
\ndagster_dbt.dbt_test_op(context)[source]\u00b6
\n

This op executes a dbt test command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the dbt_cli_resource) or\nover RPC (using the dbt_rpc_sync_resource).

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_test_op, dbt_cli_resource, dbt_rpc_sync_resource\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    dbt_test_op()\n\n@job(resource_defs={"dbt":dbt_rpc_sync_resource})\ndef my_dbt_rpc_job():\n    dbt_test_op()\n
\n
\n
\n\n
\n
\ndagster_dbt.dbt_snapshot_op(context)[source]\u00b6
\n

This op executes a dbt snapshot command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the dbt_cli_resource) or\nover RPC (using the dbt_rpc_sync_resource).

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_snapshot_op, dbt_cli_resource, dbt_rpc_sync_resource\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    dbt_snapshot_op()\n\n@job(resource_defs={"dbt":dbt_rpc_sync_resource})\ndef my_dbt_rpc_job():\n    dbt_snapshot_op()\n
\n
\n
\n\n
\n
\ndagster_dbt.dbt_seed_op(context)[source]\u00b6
\n

This op executes a dbt seed command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the dbt_cli_resource) or\nover RPC (using the dbt_rpc_sync_resource).

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_seed_op, dbt_cli_resource, dbt_rpc_sync_resource\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    dbt_seed_op()\n\n@job(resource_defs={"dbt":dbt_rpc_sync_resource})\ndef my_dbt_rpc_job():\n    dbt_seed_op()\n
\n
\n
\n\n
\n
\ndagster_dbt.dbt_docs_generate_op(context)[source]\u00b6
\n

This op executes a dbt docs generate command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the dbt_cli_resource) or\nover RPC (using the dbt_rpc_sync_resource).

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_docs_generate_op, dbt_cli_resource, dbt_rpc_sync_resource\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    dbt_docs_generate_op()\n\n@job(resource_defs={"dbt":dbt_rpc_sync_resource})\ndef my_dbt_rpc_job():\n    dbt_docs_generate_op()\n
\n
\n
\n\n
\n
\n

dbt Cloud Ops\u00b6

\n
\n
\ndagster_dbt.dbt_cloud_run_op = <dagster.core.definitions.op_definition.OpDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
job_id (Int)
\n

The integer ID of the relevant dbt Cloud job. You can find this value by going to the details page of your job in the dbt Cloud UI. It will be the final number in the url, e.g.: https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/

\n
\n
poll_interval (Float, optional)
\n

The time (in seconds) that will be waited between successive polls.

\n

Default Value: 10

\n
\n
poll_timeout (Union[Float, None], optional)
\n

The maximum time (in seconds) that will be waited before this operation is timed out. By default, this will never time out.

\n

Default Value: None

\n
\n
yield_materializations (Bool, optional)
\n

If True, materializations corresponding to the results of the dbt operation will be yielded when the op executes.

\n

Default Value: True

\n
\n
asset_key_prefix (List[String], optional)
\n

If provided and yield_materializations is True, these components will be used to prefix the generated asset keys.

\n

Default Value: [\u2018dbt\u2019]

\n
\n
\n

Initiates a run for a dbt Cloud job, then polls until the run completes. If the job\nfails or is otherwise stopped before succeeding, a dagster.Failure exception will be raised,\nand this op will fail.

\n

It requires the use of a \u2018dbt_cloud\u2019 resource, which is used to connect to the dbt Cloud API.

\n

Config Options:

\n
\n
job_id (int)

The integer ID of the relevant dbt Cloud job. You can find this value by going to the details\npage of your job in the dbt Cloud UI. It will be the final number in the url, e.g.:\nhttps://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/

\n
\n
poll_interval (float)

The time (in seconds) that will be waited between successive polls. Defaults to 10.

\n
\n
poll_timeout (float)

The maximum time (in seconds) that will be waited before this operation is timed out. By\ndefault, this will never time out.

\n
\n
yield_materializations (bool)

If True, materializations corresponding to the results of the dbt operation will be\nyielded when the op executes. Defaults to True.

\n
\n
asset_key_prefix (List[str])

If provided and yield_materializations is True, these components will be used to\nprefix the generated asset keys. Defaults to [\u201cdbt\u201d].

\n
\n
\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_cloud_resource, dbt_cloud_run_op\n\nmy_dbt_cloud_resource = dbt_cloud_resource.configured(\n    {"auth_token": {"env": "DBT_CLOUD_AUTH_TOKEN"}, "account_id": 77777}\n)\nrun_dbt_nightly_sync = dbt_cloud_run_op.configured(\n    {"job_id": 54321}, name="run_dbt_nightly_sync"\n)\n\n@job(resource_defs={"dbt_cloud": my_dbt_cloud_resource})\ndef dbt_cloud():\n    run_dbt_nightly_sync()\n
\n
\n
\n\n
\n
\n
\n

Resources\u00b6

\n
\n

CLI Resources\u00b6

\n
\n
\nclass dagster_dbt.DbtCliResource(executable, default_flags, warn_error, ignore_handled_error, target_path, logger=None)[source]\u00b6
\n

A resource that allows you to execute dbt cli commands. For the most up-to-date documentation on\nthe specific parameters available to you for each command, check out the dbt docs:

\n

https://docs.getdbt.com/reference/commands/run

\n

To use this as a dagster resource, we recommend using\ndbt_cli_resource.

\n
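For example, a minimal sketch of calling the resource's methods from within an op (the project path and model selector are placeholders):

\n
from dagster import job, op\nfrom dagster_dbt import dbt_cli_resource\n\n@op(required_resource_keys={"dbt"})\ndef run_my_models(context):\n    # The model selector below is a placeholder for illustration.\n    dbt_output = context.resources.dbt.run(models=["my_model+"])\n    context.log.info(str(dbt_output.result))\n\n@job(resource_defs={"dbt": dbt_cli_resource.configured({"project_dir": "path/to/dbt_project"})})\ndef my_dbt_cli_job():\n    run_my_models()\n
\n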
\n
\ncli(command, **kwargs)[source]\u00b6
\n
\n
Executes a dbt CLI command. Params passed in as keyword arguments will be merged with the\ndefault flags that were configured on resource initialization (if any), overriding the\ndefault values if necessary.

\n
\n
\n
\n
Parameters
\n

command (str) \u2013 The command you wish to run (e.g. \u2018run\u2019, \u2018test\u2019, \u2018docs generate\u2019, etc.)

\n
\n
Returns
\n

\n
An instance of DbtCliOutput containing

parsed log output as well as the contents of run_results.json (if applicable).

\n
\n
\n

\n
\n
Return type
\n

DbtCliOutput

\n
\n
\n
\n\n
\n
\ncompile(models=None, exclude=None, **kwargs)[source]\u00b6
\n

Run the compile command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n
    \n
  • models (List[str], optional) \u2013 the models to include in compilation.

  • \n
  • exclude (List[str], optional) \u2013 the models to exclude from compilation.

  • \n
\n
\n
Returns
\n

\n
An instance of DbtCliOutput containing

parsed log output as well as the contents of run_results.json (if applicable).

\n
\n
\n

\n
\n
Return type
\n

DbtCliOutput

\n
\n
\n
\n\n
\n
\nproperty default_flags\u00b6
\n

A set of params populated from resource config that are passed as flags to each dbt CLI command.

\n
\n\n
\n
\nfreshness(select=None, **kwargs)[source]\u00b6
\n

Run the source snapshot-freshness command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n

select (List[str], optional) \u2013 the sources to include in the run.

\n
\n
Returns
\n

\n
An instance of DbtCliOutput containing

parsed log output as well as the contents of run_results.json (if applicable).

\n
\n
\n

\n
\n
Return type
\n

DbtCliOutput

\n
\n
\n
\n\n
\n
\ngenerate_docs(compile_project=False, **kwargs)[source]\u00b6
\n

Run the docs generate command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n

compile_project (bool, optional) \u2013 If true, compile the project before generating a catalog.

\n
\n
Returns
\n

\n
An instance of DbtCliOutput containing

parsed log output as well as the contents of run_results.json (if applicable).

\n
\n
\n

\n
\n
Return type
\n

DbtCliOutput

\n
\n
\n
\n\n
\n
\nls(select=None, models=None, exclude=None, **kwargs)[source]\u00b6
\n

Run the ls command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n
    \n
  • select (List[str], optional) \u2013 the resources to include in the output.

  • \n
  • models (List[str], optional) \u2013 the models to include in the output.

  • \n
  • exclude (List[str], optional) \u2013 the resources to exclude from the output.

  • \n
\n
\n
Returns
\n

\n
An instance of DbtCliOutput containing

parsed log output as well as the contents of run_results.json (if applicable).

\n
\n
\n

\n
\n
Return type
\n

DbtCliOutput

\n
\n
\n
\n\n
\n
\nrun(models=None, exclude=None, **kwargs)[source]\u00b6
\n

Run the run command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n
    \n
  • models (List[str], optional) \u2013 the models to include in compilation.

  • \n
  • exclude (List[str], optional) \u2013 the models to exclude from compilation.

  • \n
\n
\n
Returns
\n

\n
An instance of DbtCliOutput containing

parsed log output as well as the contents of run_results.json (if applicable).

\n
\n
\n

\n
\n
Return type
\n

DbtCliOutput

\n
\n
\n
\n\n
\n
\nrun_operation(macro, args=None, **kwargs)[source]\u00b6
\n

Run the run-operation command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n
    \n
  • macro (str) \u2013 the dbt macro to invoke.

  • \n
  • args (Dict[str, Any], optional) \u2013 the keyword arguments to be supplied to the macro.

  • \n
\n
\n
Returns
\n

\n
An instance of DbtCliOutput containing

parsed log output as well as the contents of run_results.json (if applicable).

\n
\n
\n

\n
\n
Return type
\n

DbtCliOutput

\n
\n
\n
\n\n
\n
\nseed(show=False, select=None, exclude=None, **kwargs)[source]\u00b6
\n

Run the seed command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n
    \n
  • show (bool, optional) \u2013 If True, then show a sample of the seeded data in the\nresponse. Defaults to False.

  • \n
  • select (List[str], optional) \u2013 the seeds to include in the run.

  • \n
  • exclude (List[str], optional) \u2013 the seeds to exclude from the run.

  • \n
\n
\n
Returns
\n

\n
An instance of DbtCliOutput containing

parsed log output as well as the contents of run_results.json (if applicable).

\n
\n
\n

\n
\n
Return type
\n

DbtCliOutput

\n
\n
\n
\n\n
\n
\nsnapshot(select=None, exclude=None, **kwargs)[source]\u00b6
\n

Run the snapshot command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n
    \n
  • select (List[str], optional) \u2013 the snapshots to include in the run.

  • \n
  • exclude (List[str], optional) \u2013 the snapshots to exclude from the run.

  • \n
\n
\n
Returns
\n

\n
An instance of DbtCliOutput containing

parsed log output as well as the contents of run_results.json (if applicable).

\n
\n
\n

\n
\n
Return type
\n

DbtCliOutput

\n
\n
\n
\n\n
\n
\nproperty strict_flags\u00b6
\n

A set of flags that should not be auto-populated from the default flags unless they are\narguments to the associated function.

\n
\n\n
\n
\ntest(models=None, exclude=None, data=True, schema=True, **kwargs)[source]\u00b6
\n

Run the test command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n
    \n
  • models (List[str], optional) \u2013 the models to include in testing.

  • \n
  • exclude (List[str], optional) \u2013 the models to exclude from testing.

  • \n
  • data (bool, optional) \u2013 If True (default), then run data tests.

  • \n
  • schema (bool, optional) \u2013 If True (default), then run schema tests.

  • \n
\n
\n
Returns
\n

\n
An instance of DbtCliOutput containing

parsed log output as well as the contents of run_results.json (if applicable).

\n
\n
\n

\n
\n
Return type
\n

DbtCliOutput

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_dbt.DbtCliOutput(command, return_code, raw_output, logs, result)[source]\u00b6
\n

The results of executing a dbt command, along with additional metadata about the dbt CLI\nprocess that was run.

\n

Note that users should not construct instances of this class directly. This class is intended\nto be constructed from the JSON output of dbt commands.

\n
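As an illustrative sketch, an op with a configured dbt resource could inspect these attributes as follows ("results" as the node-level key inside run_results.json is an assumption here):

\n
from dagster import op\n\n@op(required_resource_keys={"dbt"})\ndef summarize_dbt_run(context):\n    output = context.resources.dbt.run()\n    context.log.info(f"ran: {output.command} (return code {output.return_code})")\n    if output.result is not None:\n        # "results" is the node-level results list in run_results.json.\n        context.log.info(f"{len(output.result.get('results', []))} node results reported")\n
\n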
\n
\ncommand\u00b6
\n

The full shell command that was executed.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nreturn_code\u00b6
\n

The return code of the dbt CLI process.

\n
\n
Type
\n

int

\n
\n
\n
\n\n
\n
\nraw_output\u00b6
\n

The raw output (stdout) of the dbt CLI process.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nlogs\u00b6
\n

List of parsed JSON logs produced by the dbt command.

\n
\n
Type
\n

List[Dict[str, Any]]

\n
\n
\n
\n\n
\n
\nresult\u00b6
\n

Dictionary containing dbt-reported result information\ncontained in run_results.json. Some dbt commands do not produce results, and will\ntherefore have result = None.

\n
\n
Type
\n

Optional[Dict[str, Any]]

\n
\n
\n
\n\n
\n\n
\n
\ndagster_dbt.dbt_cli_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project_dir (dagster.StringSource, optional)
\n

Which directory to look in for the dbt_project.yml file. Default is the current working directory and its parents.

\n

Default Value: \u2018.\u2019

\n
\n
profiles_dir (dagster.StringSource, optional)
\n

Which directory to look in for the profiles.yml file. Default = $DBT_PROFILES_DIR or $HOME/.dbt

\n
\n
profile (dagster.StringSource, optional)
\n

Which profile to load. Overrides setting in dbt_project.yml.

\n
\n
target (dagster.StringSource, optional)
\n

Which target to load for the given profile.

\n
\n
vars (permissive dict, optional)
\n

Supply variables to the project. This argument overrides variables defined in your dbt_project.yml file. This argument should be a dictionary, eg. {\u2018my_variable\u2019: \u2018my_value\u2019}

\n
\n
bypass_cache (Bool, optional)
\n

If set, bypass the adapter-level cache of database state

\n

Default Value: False

\n
\n
warn_error (Bool, optional)
\n

If dbt would normally warn, instead raise an exception. Examples include --models that selects nothing, deprecations, configurations with no associated models, invalid test configurations, and missing sources/refs in tests.

\n

Default Value: False

\n
\n
dbt_executable (dagster.StringSource, optional)
\n

Path to the dbt executable. Default is dbt

\n

Default Value: \u2018dbt\u2019

\n
\n
ignore_handled_error (Bool, optional)
\n

When True, will not raise an exception when the dbt CLI returns error code 1. Default is False.

\n

Default Value: False

\n
\n
target_path (dagster.StringSource, optional)
\n

The directory path for target if different from the default target-path in your dbt project configuration file.

\n

Default Value: \u2018target\u2019

\n
\n
\n

This resource defines a dbt CLI interface.

\n

To configure this resource, we recommend using the configured method.

\n

Examples:

\n
from dagster import ModeDefinition, pipeline\nfrom dagster_dbt import dbt_cli_resource\n\ncustom_dbt_cli_resource = dbt_cli_resource.configured({"project_dir": "path/to/my/dbt_project"})\n\n@pipeline(mode_defs=[ModeDefinition(resource_defs={"dbt": custom_dbt_cli_resource})])\ndef dbt_cli_pipeline():\n    # Run solids with `required_resource_keys={"dbt", ...}`.\n    ...\n
\n
\n

You may configure this resource as follows:

\n
resources:\n  dbt_cli_resource:\n    config:\n      project_dir: "."\n      # Optional[str]: Which directory to look in for the dbt_project.yml file. Default is\n      # the current working directory and its parents.\n      profiles_dir: $DBT_PROFILES_DIR or $HOME/.dbt\n      # Optional[str]: Which directory to look in for the profiles.yml file.\n      profile: ""\n      # Optional[str]: Which profile to load. Overrides setting in dbt_project.yml.\n      target: ""\n      # Optional[str]: Which target to load for the given profile.\n      vars: {}\n      # Optional[Permissive]: Supply variables to the project. This argument overrides\n      # variables defined in your dbt_project.yml file. This argument should be a\n      # dictionary, eg. "{'my_variable': 'my_value'}"\n      bypass_cache: False\n      # Optional[bool]: If set, bypass the adapter-level cache of database state.\n
\n
\n
\n\n
\n
\n

RPC Resources\u00b6

\n
\n
\nclass dagster_dbt.DbtRpcResource(host='0.0.0.0', port=8580, jsonrpc_version='2.0', logger=None, **_)[source]\u00b6
\n

A client for a dbt RPC server.

\n

To use this as a dagster resource, we recommend using\ndbt_rpc_resource.

\n
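A minimal illustrative sketch of using this client directly against a locally running dbt RPC server (host and port are placeholders):

\n
from dagster_dbt import DbtRpcResource\n\n# Placeholder host and port for a locally running dbt RPC server.\nclient = DbtRpcResource(host="127.0.0.1", port=8580)\nresponse = client.status()  # the HTTP response from the dbt RPC server\n
\n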
\n
\ncli(command, **kwargs)[source]\u00b6
\n

Sends a request with CLI syntax to the dbt RPC server, and returns the response.\nFor more details, see the dbt docs for running CLI commands via RPC.

\n
\n
Parameters
\n

command (str) \u2013 a dbt command in CLI syntax.

\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
\n
\ncompile(models=None, exclude=None, **kwargs)[source]\u00b6
\n

Sends a request with the method compile to the dbt RPC server, and returns the\nresponse. For more details, see the dbt docs for compiling projects via RPC.

\n
\n
Parameters
\n
    \n
  • models (List[str], optional) \u2013 the models to include in compilation.

  • \n
  • exclude (List[str], optional) \u2013 the models to exclude from compilation.

  • \n
\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
\n
\ncompile_sql(sql, name)[source]\u00b6
\n

Sends a request with the method compile_sql to the dbt RPC server, and returns the\nresponse. For more details, see the dbt docs for compiling SQL via RPC.

\n
\n
Parameters
\n
    \n
  • sql (str) \u2013 the SQL to compile in base-64 encoding.

  • \n
  • name (str) \u2013 a name for the compiled SQL.

  • \n
\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
\n
\ngenerate_docs(compile_project=False, **kwargs)[source]\u00b6
\n

Sends a request with the method docs.generate to the dbt RPC server, and returns the\nresponse. For more details, see the dbt docs for the RPC method docs.generate.

\n
\n
Parameters
\n

compile_project (bool, optional) \u2013 If true, compile the project before generating a catalog.

\n
\n
\n
\n\n
\n
\nproperty host\u00b6
\n

The IP address of the host of the dbt RPC server.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty jsonrpc_version\u00b6
\n

The JSON-RPC version to send in RPC requests.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nkill(task_id)[source]\u00b6
\n

Sends a request with the method kill to the dbt RPC server, and returns the response.\nFor more details, see the dbt docs for the RPC method kill.

\n
\n
Parameters
\n

task_id (str) \u2013 the ID of the task to terminate.

\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
\n
\nproperty logger\u00b6
\n

A property for injecting a logger dependency.

\n
\n
Type
\n

Optional[Any]

\n
\n
\n
\n\n
\n
\nls(select=None, models=None, exclude=None, **kwargs)[source]\u00b6
\n

Sends a request with the method list to the dbt RPC server, and returns the\nresponse. For more details, see the dbt docs for list.

\n
\n
Parameters
\n
    \n
  • select (List[str], optional) \u2013 the resources to include in the output.

  • \n
  • models (List[str], optional) \u2013 the models to include in the output.

  • \n
  • exclude (List[str], optional) \u2013 the resources to exclude from the output.

  • \n
\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
\n
\npoll(request_token, logs=False, logs_start=0)[source]\u00b6
\n

Sends a request with the method poll to the dbt RPC server, and returns the response.\nFor more details, see the dbt docs for the RPC method poll.

\n
\n
Parameters
\n
    \n
  • request_token (str) \u2013 the token to poll responses for.

  • \n
  • logs (bool) \u2013 Whether logs should be returned in the response. Defaults to False.

  • \n
  • logs_start (int) \u2013 The zero-indexed log line to fetch logs from. Defaults to 0.

  • \n
\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
\n
\nproperty port\u00b6
\n

The port of the dbt RPC server.

\n
\n
Type
\n

int

\n
\n
\n
\n\n
\n
\nps(completed=False)[source]\u00b6
\n

Sends a request with the method ps to the dbt RPC server, and returns the response.\nFor more details, see the dbt docs for the RPC method ps.

\n
\n
Parameters
\n

completed (bool) \u2013 If True, then also return completed tasks. Defaults to False.

\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
\n
\nrun(models=None, exclude=None, **kwargs)[source]\u00b6
\n

Sends a request with the method run to the dbt RPC server, and returns the response.\nFor more details, see the dbt docs for the RPC method run.

\n
\n
Parameters
\n
    \n
  • models (List[str], optional) \u2013 the models to include in the run.

  • \n
  • exclude (List[str], optional) \u2013 the models to exclude from the run.

  • \n
\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
\n
\nrun_operation(macro, args=None, **kwargs)[source]\u00b6
\n

Sends a request with the method run-operation to the dbt RPC server, and returns the\nresponse. For more details, see the dbt docs for the command run-operation.

\n
\n
Parameters
\n
    \n
  • macro (str) \u2013 the dbt macro to invoke.

  • \n
  • args (Dict[str, Any], optional) \u2013 the keyword arguments to be supplied to the macro.

  • \n
\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
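
A minimal sketch of invoking a dbt macro over RPC (the macro name, arguments, and resource key are illustrative):

\n
from dagster import op\n\n@op(required_resource_keys={"dbt_rpc"})\ndef invoke_macro(context):\n    response = context.resources.dbt_rpc.run_operation("grant_select", args={"role": "reporter"})\n    context.log.info(str(response.status_code))\n
\n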
\n
\nrun_sql(sql, name)[source]\u00b6
\n

Sends a request with the method run_sql to the dbt RPC server, and returns the\nresponse. For more details, see the dbt docs for running SQL via RPC.

\n
\n
Parameters
\n
    \n
  • sql (str) \u2013 the SQL to run in base-64 encoding.

  • \n
  • name (str) \u2013 a name for the compiled SQL.

  • \n
\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
\n
\nseed(show=False, select=None, exclude=None, **kwargs)[source]\u00b6
\n

Sends a request with the method seed to the dbt RPC server, and returns the response.\nFor more details, see the dbt docs for the RPC method seed.

\n
\n
Parameters
\n
    \n
  • show (bool, optional) \u2013 If True, then show a sample of the seeded data in the\nresponse. Defaults to False.

  • \n
  • select (List[str], optional) \u2013 the seeds to include in the run.

  • \n
  • exclude (List[str], optional) \u2013 the seeds to exclude from the run.

  • \n
\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
\n
\nsnapshot(select=None, exclude=None, **kwargs)[source]\u00b6
\n

Sends a request with the method snapshot to the dbt RPC server, and returns the\nresponse. For more details, see the dbt docs for the command snapshot.

\n
\n
Parameters
\n
    \n
  • select (List[str], optional) \u2013 the snapshots to include in the run.

  • \n
  • exclude (List[str], optional) \u2013 the snapshots to exclude from the run.

  • \n
\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
\n
\nsnapshot_freshness(select=None, **kwargs)[source]\u00b6
\n

Sends a request with the method snapshot-freshness to the dbt RPC server, and returns\nthe response. For more details, see the dbt docs for the command source snapshot-freshness.

\n
\n
Parameters
\n

select (List[str], optional) \u2013 the models to include in calculating snapshot freshness.

\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
\n
\nstatus()[source]\u00b6
\n

Sends a request with the method status to the dbt RPC server, and returns the\nresponse. For more details, see the dbt docs for the RPC method status.

\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
\n
\ntest(models=None, exclude=None, data=True, schema=True, **kwargs)[source]\u00b6
\n

Sends a request with the method test to the dbt RPC server, and returns the response.\nFor more details, see the dbt docs for the RPC method test.

\n
\n
Parameters
\n
    \n
  • models (List[str], optional) \u2013 the models to include in testing.

  • \n
  • exclude (List[str], optional) \u2013 the models to exclude from testing.

  • \n
  • data (bool, optional) \u2013 If True (default), then run data tests.

  • \n
  • schema (bool, optional) \u2013 If True (default), then run schema tests.

  • \n
\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
\n
\nproperty url\u00b6
\n

The URL for sending dbt RPC requests.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_dbt.DbtRpcSyncResource(host='0.0.0.0', port=8580, jsonrpc_version='2.0', logger=None, poll_interval=1, **_)[source]\u00b6
\n
\n\n
\n
\nclass dagster_dbt.DbtRpcOutput(response)[source]\u00b6
\n

The output from executing a dbt command via the dbt RPC server.

\n
\n
\nresult\u00b6
\n

The parsed contents of the \u201cresult\u201d field of the JSON response from\nthe rpc server (if any).

\n
\n
Type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nresponse_dict\u00b6
\n

The entire contents of the JSON response from the rpc server.

\n
\n
Type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nresponse\u00b6
\n

The original Response from which this output was generated.

\n
\n
Type
\n

requests.Response

\n
\n
\n
\n\n
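
A minimal sketch of reading these fields, assuming a synchronous dbt RPC resource (bound under the key "dbt_rpc") whose commands return a DbtRpcOutput:

\n
from dagster import op\n\n@op(required_resource_keys={"dbt_rpc"})\ndef inspect_dbt_output(context):\n    dbt_output = context.resources.dbt_rpc.run(models=["my_model"])\n    context.log.info(str(dbt_output.result))         # parsed "result" field of the JSON response\n    context.log.info(str(dbt_output.response_dict))  # the entire JSON response\n    context.log.info(str(dbt_output.response.status_code))  # the underlying requests.Response\n
\n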
\n\n
\n
\ndagster_dbt.local_dbt_rpc_resource ResourceDefinition\u00b6
\n

This resource defines a dbt RPC client for an RPC server running\non 0.0.0.0:8580.

\n
\n\n
\n
\ndagster_dbt.dbt_rpc_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
host (dagster.StringSource)
\n

\n
port (dagster.IntSource, optional)
\n

Default Value: 8580

\n
\n
\n

This resource defines a dbt RPC client.

\n

To configure this resource, we recommend using the configured method.

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_rpc_resource\n\ncustom_dbt_rpc_resource = dbt_rpc_resource.configured({"host": "80.80.80.80", "port": 8080})\n\n@job(resource_defs={"dbt_rpc": custom_dbt_rpc_resource})\ndef dbt_rpc_job():\n    # Run ops with `required_resource_keys={"dbt_rpc", ...}`.\n    ...\n
\n
\n
\n\n
\n
\ndagster_dbt.dbt_rpc_sync_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
host (dagster.StringSource)
\n

\n
port (dagster.IntSource, optional)
\n

Default Value: 8580

\n
\n
poll_interval (dagster.IntSource, optional)
\n

Default Value: 1

\n
\n
\n

This resource defines a synchronous dbt RPC client, which sends requests to a dbt RPC server,\nand waits for the request to complete before returning.

\n

To configure this resource, we recommend using the configured method.

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_rpc_sync_resource\n\ncustom_sync_dbt_rpc_resource = dbt_rpc_sync_resource.configured({"host": "80.80.80.80", "port": 8080})\n\n@job(resource_defs={"dbt_rpc": custom_sync_dbt_rpc_resource})\ndef dbt_rpc_sync_job():\n    # Run ops with `required_resource_keys={"dbt_rpc", ...}`.\n    ...\n
\n
\n
\n\n
\n
\n

dbt Cloud Resources\u00b6

\n
\n
\nclass dagster_dbt.DbtCloudResourceV2(auth_token, account_id, disable_schedule_on_trigger=True, request_max_retries=3, request_retry_delay=0.25, dbt_cloud_host='https://cloud.getdbt.com/', log=<Logger dagster.builtin (DEBUG)>, log_requests=False)[source]\u00b6
\n

This class exposes methods on top of the dbt Cloud REST API v2.

\n

For a complete set of documentation on the dbt Cloud Administrative REST API, including expected\nresponse JSON schemas, see the dbt Cloud API Docs.

\n
\n
\ncancel_run(run_id)[source]\u00b6
\n

Cancels a dbt Cloud run.

\n
\n
Parameters
\n

run_id (int) \u2013 The ID of the relevant dbt Cloud run. You can find this value by going to\nthe details page of your run in the dbt Cloud UI. It will be the final number in the\nurl, e.g.: https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/

\n
\n
Returns
\n

\n
A dictionary containing the parsed contents of the dbt Cloud run details.

See: https://docs.getdbt.com/dbt-cloud/api-v2#operation/getRunById for schema.

\n
\n
\n

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nget_job(job_id)[source]\u00b6
\n

Gets details about a given dbt job from the dbt Cloud API.

\n
\n
Parameters
\n

job_id (int) \u2013 The ID of the relevant dbt Cloud job. You can find this value by going to\nthe details page of your job in the dbt Cloud UI. It will be the final number in the\nurl, e.g.: https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/

\n
\n
Returns
\n

Parsed json data from the response to this request

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
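
A minimal sketch of looking up a job, assuming a DbtCloudResourceV2 instance named my_dbt_cloud_resource (the job ID is illustrative):

\n
# hypothetical job ID; find the real value in the dbt Cloud UI\njob_details = my_dbt_cloud_resource.get_job(12345)\nprint(job_details)\n
\n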
\n
\nget_manifest(run_id, step=None)[source]\u00b6
\n

The parsed contents of a manifest.json file created by a completed run.

\n
\n
Parameters
\n
    \n
  • run_id (int) \u2013 The ID of the relevant dbt Cloud run. You can find this value by going to\nthe details page of your run in the dbt Cloud UI. It will be the final number in the\nurl, e.g.: https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/

  • \n
  • step (int) \u2013 The index of the step in the run to query for artifacts. The first step in\nthe run has the index 1. If the step parameter is omitted, then this endpoint will\nreturn the artifacts compiled for the last step in the run.

  • \n
\n
\n
Returns
\n

Parsed contents of the manifest.json file

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nget_run(run_id, include_related=None)[source]\u00b6
\n

Gets details about a specific job run.

\n
\n
Parameters
\n
    \n
  • run_id (int) \u2013 The ID of the relevant dbt Cloud run. You can find this value by going to\nthe details page of your run in the dbt Cloud UI. It will be the final number in the\nurl, e.g.: https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/

  • \n
  • include_related (List[str]) \u2013 List of related fields to pull with the run. Valid values\nare \u201ctrigger\u201d, \u201cjob\u201d, and \u201cdebug_logs\u201d.

  • \n
\n
\n
Returns
\n

\n
A dictionary containing the parsed contents of the dbt Cloud run details.

See: https://docs.getdbt.com/dbt-cloud/api-v2#operation/getRunById for schema.

\n
\n
\n

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nget_run_artifact(run_id, path, step=None)[source]\u00b6
\n

The string contents of a run artifact from a dbt Cloud run.

\n
\n
Parameters
\n
    \n
  • run_id (int) \u2013 The ID of the relevant dbt Cloud run. You can find this value by going to\nthe details page of your run in the dbt Cloud UI. It will be the final number in the\nurl, e.g.: https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/

  • \n
  • path (str) \u2013 The path to this run artifact (e.g. \u2018run/my_new_project/models/example/my_first_dbt_model.sql\u2019)

  • \n
  • step (int) \u2013 The index of the step in the run to query for artifacts. The first step in\nthe run has the index 1. If the step parameter is omitted, then this endpoint will\nreturn the artifacts compiled for the last step in the run.

  • \n
\n
\n
Returns
\n

The string contents of the run artifact

\n
\n
Return type
\n

str

\n
\n
\n
\n\n
\n
\nget_run_results(run_id, step=None)[source]\u00b6
\n

The parsed contents of a run_results.json file created by a completed run.

\n
\n
Parameters
\n
    \n
  • run_id (int) \u2013 The ID of the relevant dbt Cloud run. You can find this value by going to\nthe details page of your run in the dbt Cloud UI. It will be the final number in the\nurl, e.g.: https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/

  • \n
  • step (int) \u2013 The index of the step in the run to query for artifacts. The first step in\nthe run has the index 1. If the step parameter is omitted, then this endpoint will\nreturn the artifacts compiled for the last step in the run.

  • \n
\n
\n
Returns
\n

Parsed contents of the run_results.json file

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
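
A minimal sketch of fetching run results for a completed run, assuming a DbtCloudResourceV2 instance named my_dbt_cloud_resource (the run ID is illustrative; run_results.json stores per-node outcomes under its "results" key):

\n
run_results = my_dbt_cloud_resource.get_run_results(33333)\nfor node_result in run_results.get("results", []):\n    print(node_result.get("unique_id"), node_result.get("status"))\n
\n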
\n
\nget_run_steps(run_id)[source]\u00b6
\n

Gets the steps of an initialized dbt Cloud run.

\n
\n
Parameters
\n

run_id (int) \u2013 The ID of the relevant dbt Cloud run. You can find this value by going to\nthe details page of your run in the dbt Cloud UI. It will be the final number in the\nurl, e.g.: https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/

\n
\n
Returns
\n

List of commands for each step of the run.

\n
\n
Return type
\n

List[str]

\n
\n
\n
\n\n
\n
\nlist_run_artifacts(run_id, step=None)[source]\u00b6
\n

Lists the paths of the available run artifacts from a completed dbt Cloud run.

\n
\n
Parameters
\n
    \n
  • run_id (int) \u2013 The ID of the relevant dbt Cloud run. You can find this value by going to\nthe details page of your run in the dbt Cloud UI. It will be the final number in the\nurl, e.g.: https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/

  • \n
  • step (int) \u2013 The index of the step in the run to query for artifacts. The first step in\nthe run has the index 1. If the step parameter is omitted, then this endpoint will\nreturn the artifacts compiled for the last step in the run

  • \n
\n
\n
Returns
\n

List of the paths of the available run artifacts

\n
\n
Return type
\n

List[str]

\n
\n
\n
\n\n
\n
\nmake_request(method, endpoint, data=None, return_text=False)[source]\u00b6
\n

Creates and sends a request to the desired dbt Cloud API endpoint.

\n
\n
Parameters
\n
    \n
  • method (str) \u2013 The http method to use for this request (e.g. \u201cPOST\u201d, \u201cGET\u201d, \u201cPATCH\u201d).

  • \n
  • endpoint (str) \u2013 The dbt Cloud API endpoint to send this request to.

  • \n
  • data (Optional[str]) \u2013 JSON-formatted data string to be included in the request.

  • \n
  • return_text (bool) \u2013 Override default behavior and return unparsed {\u201ctext\u201d: response.text}\nblob instead of json.

  • \n
\n
\n
Returns
\n

Parsed json data from the response to this request

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
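
A minimal sketch of issuing a raw request, assuming a DbtCloudResourceV2 instance named my_dbt_cloud_resource; the endpoint path shown is illustrative, so consult the dbt Cloud API Docs for the exact routes:

\n
# hypothetical endpoint path; see the dbt Cloud API Docs for the real route\nresponse_data = my_dbt_cloud_resource.make_request("GET", "jobs/12345/")\nprint(response_data)\n
\n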
\n
\npoll_run(run_id, poll_interval=10, poll_timeout=None, href=None)[source]\u00b6
\n

Polls a dbt Cloud job run until it completes. Will raise a dagster.Failure exception if the\nrun does not complete successfully.

\n
\n
Parameters
\n
    \n
  • run_id (int) \u2013 The ID of the relevant dbt Cloud run. You can find this value by going to\nthe details page of your run in the dbt Cloud UI. It will be the final number in the\nurl, e.g.: https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/

  • \n
  • poll_interval (float) \u2013 The time (in seconds) that should be waited between successive\npolls of the dbt Cloud API.

  • \n
  • poll_timeout (float) \u2013 The maximum time (in seconds) that should be waited for this run\nto complete. If this threshold is exceeded, the run will be cancelled and an\nexception will be thrown. By default, this will poll forever.

  • \n
  • href (str) \u2013 For internal use, generally should not be set manually.

  • \n
\n
\n
Returns
\n

\n
A dictionary containing the parsed contents of the dbt Cloud run details.

See: https://docs.getdbt.com/dbt-cloud/api-v2#operation/getRunById for schema.

\n
\n
\n

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nrun_job(job_id, **kwargs)[source]\u00b6
\n

Initializes a run for a job. Overrides for specific properties can be set by passing in\nvalues to the kwargs. A full list of overridable properties can be found here:\nhttps://docs.getdbt.com/dbt-cloud/api-v2#operation/triggerRun

\n
\n
Parameters
\n
    \n
  • job_id (int) \u2013 The ID of the relevant dbt Cloud job. You can find this value by going to\nthe details page of your job in the dbt Cloud UI. It will be the final number in the\nurl, e.g.: https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/

  • \n
  • kwargs \u2013 Passed in as the properties to be overridden.

  • \n
\n
\n
Returns
\n

Parsed json data from the response to this request

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nrun_job_and_poll(job_id, poll_interval=10, poll_timeout=None)[source]\u00b6
\n

Runs a dbt Cloud job and polls until it completes. Will raise a dagster.Failure exception\nif the run does not complete successfully.

\n
\n
Parameters
\n
    \n
  • job_id (int) \u2013 The ID of the relevant dbt Cloud job. You can find this value by going to\nthe details page of your job in the dbt Cloud UI. It will be the final number in the\nurl, e.g.: https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/

  • \n
  • poll_interval (float) \u2013 The time (in seconds) that should be waited between successive\npolls of the dbt Cloud API.

  • \n
  • poll_timeout (float) \u2013 The maximum time (in seconds) that should be waited for this run\nto complete. If this threshold is exceeded, the run will be cancelled and an\nexception will be thrown. By default, this will poll forever.

  • \n
\n
\n
Returns
\n

\n
Class containing details about the specific job run and the parsed run results.

\n
\n
\n

\n
\n
Return type
\n

DbtCloudOutput

\n
\n
\n
\n\n
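
A minimal sketch of triggering a dbt Cloud job from an op and waiting for it to finish (the job ID, timeouts, and resource key "dbt_cloud" are illustrative):

\n
from dagster import op\n\n@op(required_resource_keys={"dbt_cloud"})\ndef run_my_dbt_cloud_job(context):\n    dbt_cloud_output = context.resources.dbt_cloud.run_job_and_poll(\n        33333, poll_interval=10, poll_timeout=3600\n    )\n    context.log.info(str(dbt_cloud_output))\n
\n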
\n
\nupdate_job(job_id, **kwargs)[source]\u00b6
\n

Updates specific properties of a dbt job. Documentation on the full set of potential\nparameters can be found here: https://docs.getdbt.com/dbt-cloud/api-v2#operation/updateJobById

\n
\n
Parameters
\n
    \n
  • job_id (int) \u2013 The ID of the relevant dbt Cloud job. You can find this value by going to\nthe details page of your job in the dbt Cloud UI. It will be the final number in the\nurl, e.g.: https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/

  • \n
  • kwargs \u2013 Passed in as the properties to be changed.

  • \n
\n
\n
Returns
\n

Parsed json data from the response to this request

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n

Examples:

\n
# disable schedule for job with id=12345\nmy_dbt_cloud_resource.update_job(12345, triggers={"schedule": False})\n
\n
\n
\n\n
\n\n
\n
\ndagster_dbt.dbt_cloud_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
auth_token (dagster.StringSource)
\n

dbt Cloud API Token. User tokens can be found in the [dbt Cloud UI](https://cloud.getdbt.com/#/profile/api/), or see the [dbt Cloud Docs](https://docs.getdbt.com/docs/dbt-cloud/dbt-cloud-api/service-tokens) for instructions on creating a Service Account token.

\n
\n
account_id (Int)
\n

dbt Cloud Account ID. This value can be found in the url of a variety of views in the dbt Cloud UI, e.g. https://cloud.getdbt.com/#/accounts/{account_id}/settings/.

\n
\n
disable_schedule_on_trigger (Bool, optional)
\n

Specifies if you would like any job that is triggered using this resource to automatically disable its schedule.

\n

Default Value: True

\n
\n
request_max_retries (Int, optional)
\n

The maximum number of times requests to the dbt Cloud API should be retried before failing.

\n

Default Value: 3

\n
\n
request_retry_delay (Float, optional)
\n

Time (in seconds) to wait between each request retry.

\n

Default Value: 0.25

\n
\n
\n

This resource allows users to programmatically interface with the dbt Cloud Administrative REST\nAPI (v2) to launch jobs and monitor their progress. This currently implements only a subset of\nthe functionality exposed by the API.

\n

For a complete set of documentation on the dbt Cloud Administrative REST API, including expected\nresponse JSON schemas, see the dbt Cloud API Docs.

\n

To configure this resource, we recommend using the configured method.

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_cloud_resource\n\nmy_dbt_cloud_resource = dbt_cloud_resource.configured(\n    {\n        "auth_token": {"env": "DBT_CLOUD_AUTH_TOKEN"},\n        "account_id": 30000,\n    }\n)\n\n@job(resource_defs={"dbt_cloud":my_dbt_cloud_resource})\ndef my_dbt_cloud_job():\n    ...\n
\n
\n
\n\n
\n
\n
\n

Assets\u00b6

\n
\n
\ndagster_dbt.load_assets_from_dbt_project(project_dir, profiles_dir=None, target_dir=None, select=None, runtime_metadata_fn=None, io_manager_key=None, node_info_to_asset_key=<function _get_node_asset_key>)[source]\u00b6
\n

Loads a set of DBT models from a DBT project into Dagster assets.

\n

Creates one Dagster asset for each dbt model. All assets will be re-materialized using a single\ndbt run command.

\n
\n
Parameters
\n
    \n
  • project_dir (Optional[str]) \u2013 The directory containing the DBT project to load.

  • \n
  • profiles_dir (Optional[str]) \u2013 The profiles directory to use for loading the DBT project.\nDefaults to a directory called \u201cconfig\u201d inside the project_dir.

  • \n
  • target_dir (Optional[str]) \u2013 The target directory where DBT will place compiled artifacts.\nDefaults to \u201ctarget\u201d underneath the project_dir.

  • \n
  • select (str) \u2013 A DBT selection string for the models in a project that you want to include.\nDefaults to \u201c*\u201d.

  • \n
  • runtime_metadata_fn \u2013 (Optional[Callable[[SolidExecutionContext, Mapping[str, Any]], Mapping[str, Any]]]):\nA function that will be run after any of the assets are materialized and returns\nmetadata entries for the asset, to be displayed in the asset catalog for that run.

  • \n
  • io_manager_key (Optional[str]) \u2013 The IO manager key that will be set on each of the returned\nassets. When other ops are downstream of the loaded assets, the IOManager specified\nhere determines how the inputs to those ops are loaded. Defaults to \u201cio_manager\u201d.

  • \n
  • node_info_to_asset_key \u2013 (Mapping[str, Any] -> AssetKey): A function that takes a dictionary\nof dbt node info and returns the AssetKey that you want to represent that node. By\ndefault, the asset key will simply be the name of the dbt model.

  • \n
\n
\n
\n
\n\n
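
A minimal sketch of loading a project's models as assets (the directory paths are illustrative):

\n
from dagster_dbt import load_assets_from_dbt_project\n\ndbt_assets = load_assets_from_dbt_project(\n    project_dir="path/to/dbt_project",\n    profiles_dir="path/to/dbt_project/config",\n    select="*",\n)\n# dbt_assets is a list of asset definitions that can be included in your repository\n
\n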
\n
\ndagster_dbt.load_assets_from_dbt_manifest(manifest_json, runtime_metadata_fn=None, io_manager_key=None, selected_unique_ids=None, node_info_to_asset_key=<function _get_node_asset_key>)[source]\u00b6
\n

Loads a set of dbt models, described in a manifest.json, into Dagster assets.

\n

Creates one Dagster asset for each dbt model. All assets will be re-materialized using a single\ndbt run command.

\n
\n
Parameters
\n
    \n
  • manifest_json (Optional[Mapping[str, Any]]) \u2013 The contents of a DBT manifest.json, which contains\na set of models to load into assets.

  • \n
  • runtime_metadata_fn \u2013 (Optional[Callable[[SolidExecutionContext, Mapping[str, Any]], Mapping[str, Any]]]):\nA function that will be run after any of the assets are materialized and returns\nmetadata entries for the asset, to be displayed in the asset catalog for that run.

  • \n
  • io_manager_key (Optional[str]) \u2013 The IO manager key that will be set on each of the returned\nassets. When other ops are downstream of the loaded assets, the IOManager specified\nhere determines how the inputs to those ops are loaded. Defaults to \u201cio_manager\u201d.

  • \n
  • selected_unique_ids (Optional[Set[str]]) \u2013 The set of dbt unique_ids that you want to load\nas assets.

  • \n
  • node_info_to_asset_key \u2013 (Mapping[str, Any] -> AssetKey): A function that takes a dictionary\nof dbt node info and returns the AssetKey that you want to represent that node. By\ndefault, the asset key will simply be the name of the dbt model.

  • \n
\n
\n
\n
\n\n
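
A minimal sketch of loading assets from a pre-built manifest (the manifest path is illustrative):

\n
import json\n\nfrom dagster_dbt import load_assets_from_dbt_manifest\n\nwith open("path/to/target/manifest.json") as f:\n    manifest_json = json.load(f)\n\ndbt_assets = load_assets_from_dbt_manifest(manifest_json)\n
\n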
\n
\n

Types\u00b6

\n
\n
\nclass dagster_dbt.DbtOutput(result)[source]\u00b6
\n

Base class for both DbtCliOutput and DbtRpcOutput. Contains a single field, result, which\nrepresents the dbt-formatted result of the command that was run (if any).

\n

Used internally, should not be instantiated directly by the user.

\n
\n\n
\n
\nclass dagster_dbt.DbtResource(logger=None)[source]\u00b6
\n

Base class for a resource allowing users to interface with dbt.

\n
\n
\nabstract compile(models=None, exclude=None, **kwargs)[source]\u00b6
\n

Run the compile command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n
    \n
  • models (List[str], optional) \u2013 the models to include in compilation.

  • \n
  • exclude (List[str], optional) \u2013 the models to exclude from compilation.

  • \n
\n
\n
Returns
\n

object containing parsed output from dbt

\n
\n
Return type
\n

DbtOutput

\n
\n
\n
\n\n
\n
\nabstract generate_docs(compile_project=False, **kwargs)[source]\u00b6
\n

Run the docs generate command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n

compile_project (bool, optional) \u2013 If true, compile the project before generating a catalog.

\n
\n
Returns
\n

object containing parsed output from dbt

\n
\n
Return type
\n

DbtOutput

\n
\n
\n
\n\n
\n
\nproperty logger\u00b6
\n

A property for injecting a logger dependency.

\n
\n
Type
\n

logging.Logger

\n
\n
\n
\n\n
\n
\nabstract ls(select=None, models=None, exclude=None, **kwargs)[source]\u00b6
\n

Run the ls command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n
    \n
  • select (List[str], optional) \u2013 the resources to include in the output.

  • \n
  • models (List[str], optional) \u2013 the models to include in the output.

  • \n
  • exclude (List[str], optional) \u2013 the resources to exclude from the output.

  • \n
\n
\n
Returns
\n

object containing parsed output from dbt

\n
\n
Return type
\n

DbtOutput

\n
\n
\n
\n\n
\n
\nabstract run(models=None, exclude=None, **kwargs)[source]\u00b6
\n

Run the run command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n
    \n
  • models (List[str], optional) \u2013 the models to include in the run.

  • \n
  • exclude (List[str], optional) \u2013 the models to exclude from the run.

  • \n
\n
\n
Returns
\n

object containing parsed output from dbt

\n
\n
Return type
\n

DbtOutput

\n
\n
\n
\n\n
\n
\nabstract run_operation(macro, args=None, **kwargs)[source]\u00b6
\n

Run the run-operation command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n
    \n
  • macro (str) \u2013 the dbt macro to invoke.

  • \n
  • args (Dict[str, Any], optional) \u2013 the keyword arguments to be supplied to the macro.

  • \n
\n
\n
Returns
\n

object containing parsed output from dbt

\n
\n
Return type
\n

DbtOutput

\n
\n
\n
\n\n
\n
\nabstract seed(show=False, select=None, exclude=None, **kwargs)[source]\u00b6
\n

Run the seed command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n
    \n
  • show (bool, optional) \u2013 If True, then show a sample of the seeded data in the\nresponse. Defaults to False.

  • \n
  • select (List[str], optional) \u2013 the seeds to include in the run.

  • \n
  • exclude (List[str], optional) \u2013 the seeds to exclude from the run.

  • \n
\n
\n
Returns
\n

object containing parsed output from dbt

\n
\n
Return type
\n

DbtOutput

\n
\n
\n
\n\n
\n
\nabstract snapshot(select=None, exclude=None, **kwargs)[source]\u00b6
\n

Run the snapshot command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n
    \n
  • select (List[str], optional) \u2013 the snapshots to include in the run.

  • \n
  • exclude (List[str], optional) \u2013 the snapshots to exclude from the run.

  • \n
\n
\n
Returns
\n

object containing parsed output from dbt

\n
\n
Return type
\n

DbtOutput

\n
\n
\n
\n\n
\n
\nabstract test(models=None, exclude=None, data=True, schema=True, **kwargs)[source]\u00b6
\n

Run the test command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n
    \n
  • models (List[str], optional) \u2013 the models to include in testing.

  • \n
  • exclude (List[str], optional) \u2013 the models to exclude from testing.

  • \n
  • data (bool, optional) \u2013 If True (default), then run data tests.

  • \n
  • schema (bool, optional) \u2013 If True (default), then run schema tests.

  • \n
\n
\n
Returns
\n

object containing parsed output from dbt

\n
\n
Return type
\n

DbtOutput

\n
\n
\n
\n\n
\n\n
\n
\n

Errors\u00b6

\n
\n
\nexception dagster_dbt.DagsterDbtError(description=None, metadata_entries=None, metadata=None)[source]\u00b6
\n

The base exception of the dagster-dbt library.

\n
\n\n
\n
\nexception dagster_dbt.DagsterDbtCliRuntimeError(description, logs, raw_output)[source]\u00b6
\n

Represents an error while executing a dbt CLI command.

\n
\n\n
\n
\nexception dagster_dbt.DagsterDbtCliFatalRuntimeError(logs, raw_output)[source]\u00b6
\n

Represents a fatal error in the dbt CLI (return code 2).

\n
\n\n
\n
\nexception dagster_dbt.DagsterDbtCliHandledRuntimeError(logs, raw_output)[source]\u00b6
\n

Represents a model error reported by the dbt CLI at runtime (return code 1).

\n
\n\n
\n
\nexception dagster_dbt.DagsterDbtCliOutputsNotFoundError(path)[source]\u00b6
\n

Represents a problem in finding the target/run_results.json artifact when executing a dbt\nCLI command.

\n

For more details on target/run_results.json, see\nhttps://docs.getdbt.com/reference/dbt-artifacts#run_resultsjson.

\n
\n\n
\n
\nexception dagster_dbt.DagsterDbtCliUnexpectedOutputError(invalid_line_nos)[source]\u00b6
\n

Represents an error when parsing the output of a dbt CLI command.

\n
\n\n
\n
\nexception dagster_dbt.DagsterDbtRpcUnexpectedPollOutputError(description=None, metadata_entries=None, metadata=None)[source]\u00b6
\n

Represents an unexpected response when polling the dbt RPC server.

\n
\n\n
\n
\n

Utils\u00b6

\n
\n
\ndagster_dbt.utils.generate_materializations(dbt_output, asset_key_prefix=None)[source]\u00b6
\n

This function yields dagster.AssetMaterialization events for each model created by\na dbt run command (with information parsed from a DbtOutput object).

\n

Note that this will not work with output from the dbt_rpc_resource, because this resource does\nnot wait for a response from the RPC server before returning. Instead, use the\ndbt_rpc_sync_resource, which will wait for execution to complete.

\n

Examples:

\n
from dagster import job, op, Output\nfrom dagster_dbt.utils import generate_materializations\nfrom dagster_dbt import dbt_cli_resource, dbt_rpc_sync_resource\n\n@op(required_resource_keys={"dbt"})\ndef my_custom_dbt_run(context):\n    dbt_output = context.resources.dbt.run()\n    for materialization in generate_materializations(dbt_output):\n        # you can modify the materialization object to add extra metadata, if desired\n        yield materialization\n    yield Output(dbt_output)\n\n@job(resource_defs={"dbt": dbt_cli_resource})\ndef my_dbt_cli_job():\n    my_custom_dbt_run()\n\n@job(resource_defs={"dbt": dbt_rpc_sync_resource})\ndef my_dbt_rpc_job():\n    my_custom_dbt_run()\n
\n
\n
\n\n
\n
\n

Solids [Legacy]\u00b6

\n

dagster_dbt provides a set of solids that may be used in legacy pipelines.

\n
\n

CLI Solids\u00b6

\n
\n
\ndagster_dbt.dbt_cli_compile = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project-dir (dagster.StringSource, optional)
\n

Which directory to look in for the dbt_project.yml file. Default is the current working directory and its parents.

\n

Default Value: \u2018.\u2019

\n
\n
profiles-dir (dagster.StringSource, optional)
\n

Which directory to look in for the profiles.yml file. Default = $DBT_PROFILES_DIR or $HOME/.dbt

\n
\n
profile (dagster.StringSource, optional)
\n

Which profile to load. Overrides setting in dbt_project.yml.

\n
\n
target (dagster.StringSource, optional)
\n

Which target to load for the given profile.

\n
\n
vars (permissive dict, optional)
\n

Supply variables to the project. This argument overrides variables defined in your dbt_project.yml file. This argument should be a dictionary, eg. {\u2018my_variable\u2019: \u2018my_value\u2019}

\n
\n
bypass-cache (Bool, optional)
\n

If set, bypass the adapter-level cache of database state

\n

Default Value: False

\n
\n
warn-error (Bool, optional)
\n

If dbt would normally warn, instead raise an exception. Examples include \u2013models that selects nothing, deprecations, configurations with no associated models, invalid test configurations, and missing sources/refs in tests.

\n

Default Value: False

\n
\n
dbt_executable (dagster.StringSource, optional)
\n

Path to the dbt executable. Default is dbt

\n

Default Value: \u2018dbt\u2019

\n
\n
ignore_handled_error (Bool, optional)
\n

When True, will not raise an exception when the dbt CLI returns error code 1. Default is False.

\n

Default Value: False

\n
\n
target-path (dagster.StringSource, optional)
\n

The directory path for target if different from the default target-path in your dbt project configuration file.

\n

Default Value: \u2018target\u2019

\n
\n
parse-only (Bool, optional)
\n

Default Value: False

\n
\n
threads (Union[Int, None], optional)
\n

Specify number of threads to use while executing models. Overrides settings in profiles.yml.

\n

Default Value: None

\n
\n
no-version-check (Bool, optional)
\n

Skip the check that dbt\u2019s version matches the one specified in the dbt_project.yml file (\u2018require-dbt-version\u2019)

\n

Default Value: False

\n
\n
models (Union[List[String], None], optional)
\n

The dbt models to run.

\n

Default Value: None

\n
\n
exclude (Union[List[String], None], optional)
\n

The dbt models to exclude.

\n

Default Value: None

\n
\n
selector (Union[List[String], None], optional)
\n

The selector name to use, as defined in your selectors.yml

\n

Default Value: None

\n
\n
state (Union[List[String], None], optional)
\n

If set, use the given directory as the source for json files to compare with this project.

\n

Default Value: None

\n
\n
full-refresh (Bool, optional)
\n

If specified, DBT will drop incremental models and fully-recalculate the incremental table from the model definition. (\u2013full-refresh)

\n

Default Value: False

\n
\n
\n

This solid executes dbt compile via the dbt CLI.

\n
\n\n
\n
\ndagster_dbt.dbt_cli_run = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project-dir (dagster.StringSource, optional)
\n

Which directory to look in for the dbt_project.yml file. Default is the current working directory and its parents.

\n

Default Value: \u2018.\u2019

\n
\n
profiles-dir (dagster.StringSource, optional)
\n

Which directory to look in for the profiles.yml file. Default = $DBT_PROFILES_DIR or $HOME/.dbt

\n
\n
profile (dagster.StringSource, optional)
\n

Which profile to load. Overrides setting in dbt_project.yml.

\n
\n
target (dagster.StringSource, optional)
\n

Which target to load for the given profile.

\n
\n
vars (permissive dict, optional)
\n

Supply variables to the project. This argument overrides variables defined in your dbt_project.yml file. This argument should be a dictionary, eg. {\u2018my_variable\u2019: \u2018my_value\u2019}

\n
\n
bypass-cache (Bool, optional)
\n

If set, bypass the adapter-level cache of database state

\n

Default Value: False

\n
\n
warn-error (Bool, optional)
\n

If dbt would normally warn, instead raise an exception. Examples include \u2013models that selects nothing, deprecations, configurations with no associated models, invalid test configurations, and missing sources/refs in tests.

\n

Default Value: False

\n
\n
dbt_executable (dagster.StringSource, optional)
\n

Path to the dbt executable. Default is dbt

\n

Default Value: \u2018dbt\u2019

\n
\n
ignore_handled_error (Bool, optional)
\n

When True, will not raise an exception when the dbt CLI returns error code 1. Default is False.

\n

Default Value: False

\n
\n
target-path (dagster.StringSource, optional)
\n

The directory path for target if different from the default target-path in your dbt project configuration file.

\n

Default Value: \u2018target\u2019

\n
\n
threads (Union[Int, None], optional)
\n

Specify number of threads to use while executing models. Overrides settings in profiles.yml.

\n

Default Value: None

\n
\n
models (Union[List[String], None], optional)
\n

The dbt models to run.

\n

Default Value: None

\n
\n
exclude (Union[List[String], None], optional)
\n

The dbt models to exclude.

\n

Default Value: None

\n
\n
full-refresh (Bool, optional)
\n

If specified, DBT will drop incremental models and fully-recalculate the incremental table from the model definition. (\u2013full-refresh)

\n

Default Value: False

\n
\n
fail-fast (Bool, optional)
\n

Stop execution upon a first failure. (\u2013fail-fast)

\n

Default Value: False

\n
\n
yield_materializations (Bool, optional)
\n

If True, materializations corresponding to the results of the dbt operation will be yielded when the solid executes. Default: True

\n

Default Value: True

\n
\n
asset_key_prefix (List[String], optional)
\n

If provided and yield_materializations is True, these components will be used to prefix the generated asset keys.

\n

Default Value: []

\n
\n
\n

This solid executes dbt run via the dbt CLI. See the solid definition for available\nparameters.

\n
\n\n
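
A minimal sketch of configuring this solid and wiring it into a legacy pipeline (the project directory is illustrative, and this assumes the CLI solids require no resources beyond their config):

\n
from dagster import pipeline\nfrom dagster_dbt import dbt_cli_run\n\nrun_my_dbt_project = dbt_cli_run.configured(\n    {"project-dir": "path/to/dbt_project"}, name="run_my_dbt_project"\n)\n\n@pipeline\ndef my_dbt_pipeline():\n    run_my_dbt_project()\n
\n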
\n
\ndagster_dbt.dbt_cli_run_operation = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project-dir (dagster.StringSource, optional)
\n

Which directory to look in for the dbt_project.yml file. Default is the current working directory and its parents.

\n

Default Value: \u2018.\u2019

\n
\n
profiles-dir (dagster.StringSource, optional)
\n

Which directory to look in for the profiles.yml file. Default = $DBT_PROFILES_DIR or $HOME/.dbt

\n
\n
profile (dagster.StringSource, optional)
\n

Which profile to load. Overrides setting in dbt_project.yml.

\n
\n
target (dagster.StringSource, optional)
\n

Which target to load for the given profile.

\n
\n
vars (permissive dict, optional)
\n

Supply variables to the project. This argument overrides variables defined in your dbt_project.yml file. This argument should be a dictionary, eg. {\u2018my_variable\u2019: \u2018my_value\u2019}

\n
\n
bypass-cache (Bool, optional)
\n

If set, bypass the adapter-level cache of database state

\n

Default Value: False

\n
\n
warn-error (Bool, optional)
\n

If dbt would normally warn, instead raise an exception. Examples include \u2013models that selects nothing, deprecations, configurations with no associated models, invalid test configurations, and missing sources/refs in tests.

\n

Default Value: False

\n
\n
dbt_executable (dagster.StringSource, optional)
\n

Path to the dbt executable. Default is dbt

\n

Default Value: \u2018dbt\u2019

\n
\n
ignore_handled_error (Bool, optional)
\n

When True, will not raise an exception when the dbt CLI returns error code 1. Default is False.

\n

Default Value: False

\n
\n
target-path (dagster.StringSource, optional)
\n

The directory path for target if different from the default target-path in your dbt project configuration file.

\n

Default Value: \u2018target\u2019

\n
\n
macro (dagster.StringSource)
\n

Specify the macro to invoke. dbt will call this macro with the supplied arguments and then exit.

\n
\n
args (permissive dict, optional)
\n

Supply arguments to the macro. This dictionary will be mapped to the keyword arguments defined in the selected macro. This argument should be a dictionary, eg. {\u2018my_variable\u2019: \u2018my_value\u2019}

\n
\n
\n

This solid executes dbt run-operation via the dbt CLI.

\n
\n\n
\n
\ndagster_dbt.dbt_cli_snapshot = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project-dir (dagster.StringSource, optional)
\n

Which directory to look in for the dbt_project.yml file. Default is the current working directory and its parents.

\n

Default Value: \u2018.\u2019

\n
\n
profiles-dir (dagster.StringSource, optional)
\n

Which directory to look in for the profiles.yml file. Default = $DBT_PROFILES_DIR or $HOME/.dbt

\n
\n
profile (dagster.StringSource, optional)
\n

Which profile to load. Overrides setting in dbt_project.yml.

\n
\n
target (dagster.StringSource, optional)
\n

Which target to load for the given profile.

\n
\n
vars (permissive dict, optional)
\n

Supply variables to the project. This argument overrides variables defined in your dbt_project.yml file. This argument should be a dictionary, eg. {\u2018my_variable\u2019: \u2018my_value\u2019}

\n
\n
bypass-cache (Bool, optional)
\n

If set, bypass the adapter-level cache of database state

\n

Default Value: False

\n
\n
warn-error (Bool, optional)
\n

If dbt would normally warn, instead raise an exception. Examples include \u2013models that selects nothing, deprecations, configurations with no associated models, invalid test configurations, and missing sources/refs in tests.

\n

Default Value: False

\n
\n
dbt_executable (dagster.StringSource, optional)
\n

Path to the dbt executable. Default is dbt

\n

Default Value: \u2018dbt\u2019

\n
\n
ignore_handled_error (Bool, optional)
\n

When True, will not raise an exception when the dbt CLI returns error code 1. Default is False.

\n

Default Value: False

\n
\n
target-path (dagster.StringSource, optional)
\n

The directory path for target if different from the default target-path in your dbt project configuration file.

\n

Default Value: \u2018target\u2019

\n
\n
threads (Union[Int, None], optional)
\n

Specify number of threads to use while executing models. Overrides settings in profiles.yml.

\n

Default Value: None

\n
\n
select (Union[List[String], None], optional)
\n

The dbt models to include.

\n

Default Value: None

\n
\n
exclude (Union[List[String], None], optional)
\n

The dbt models to exclude.

\n

Default Value: None

\n
\n
\n

This solid executes dbt snapshot via the dbt CLI.

\n
\n\n
\n
\ndagster_dbt.dbt_cli_snapshot_freshness = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project-dir (dagster.StringSource, optional)
\n

Which directory to look in for the dbt_project.yml file. Default is the current working directory and its parents.

\n

Default Value: \u2018.\u2019

\n
\n
profiles-dir (dagster.StringSource, optional)
\n

Which directory to look in for the profiles.yml file. Default = $DBT_PROFILES_DIR or $HOME/.dbt

\n
\n
profile (dagster.StringSource, optional)
\n

Which profile to load. Overrides setting in dbt_project.yml.

\n
\n
target (dagster.StringSource, optional)
\n

Which target to load for the given profile.

\n
\n
vars (permissive dict, optional)
\n

Supply variables to the project. This argument overrides variables defined in your dbt_project.yml file. This argument should be a dictionary, eg. {\u2018my_variable\u2019: \u2018my_value\u2019}

\n
\n
bypass-cache (Bool, optional)
\n

If set, bypass the adapter-level cache of database state

\n

Default Value: False

\n
\n
warn-error (Bool, optional)
\n

If dbt would normally warn, instead raise an exception. Examples include \u2013models that selects nothing, deprecations, configurations with no associated models, invalid test configurations, and missing sources/refs in tests.

\n

Default Value: False

\n
\n
dbt_executable (dagster.StringSource, optional)
\n

Path to the dbt executable. Default is dbt

\n

Default Value: \u2018dbt\u2019

\n
\n
ignore_handled_error (Bool, optional)
\n

When True, will not raise an exception when the dbt CLI returns error code 1. Default is False.

\n

Default Value: False

\n
\n
target-path (dagster.StringSource, optional)
\n

The directory path for target if different from the default target-path in your dbt project configuration file.

\n

Default Value: \u2018target\u2019

\n
\n
select (Union[List[String], None], optional)
\n

Specify the sources to snapshot freshness for.

\n

Default Value: None

\n
\n
output (dagster.StringSource, optional)
\n

Specify the output path for the json report. By default, outputs to target/sources.json

\n
\n
threads (Union[Int, None], optional)
\n

Specify number of threads to use while executing models. Overrides settings in profiles.yml.

\n

Default Value: None

\n
\n
\n

This solid executes dbt source snapshot-freshness via the dbt CLI.

\n
\n\n
\n
\ndagster_dbt.dbt_cli_test = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project-dir (dagster.StringSource, optional)
\n

Which directory to look in for the dbt_project.yml file. Default is the current working directory and its parents.

\n

Default Value: \u2018.\u2019

\n
\n
profiles-dir (dagster.StringSource, optional)
\n

Which directory to look in for the profiles.yml file. Default = $DBT_PROFILES_DIR or $HOME/.dbt

\n
\n
profile (dagster.StringSource, optional)
\n

Which profile to load. Overrides setting in dbt_project.yml.

\n
\n
target (dagster.StringSource, optional)
\n

Which target to load for the given profile.

\n
\n
vars (permissive dict, optional)
\n

Supply variables to the project. This argument overrides variables defined in your dbt_project.yml file. This argument should be a dictionary, eg. {\u2018my_variable\u2019: \u2018my_value\u2019}

\n
\n
bypass-cache (Bool, optional)
\n

If set, bypass the adapter-level cache of database state

\n

Default Value: False

\n
\n
warn-error (Bool, optional)
\n

If dbt would normally warn, instead raise an exception. Examples include \u2013models that selects nothing, deprecations, configurations with no associated models, invalid test configurations, and missing sources/refs in tests.

\n

Default Value: False

\n
\n
dbt_executable (dagster.StringSource, optional)
\n

Path to the dbt executable. Default is dbt

\n

Default Value: \u2018dbt\u2019

\n
\n
ignore_handled_error (Bool, optional)
\n

When True, will not raise an exception when the dbt CLI returns error code 1. Default is False.

\n

Default Value: False

\n
\n
target-path (dagster.StringSource, optional)
\n

The directory path for target if different from the default target-path in your dbt project configuration file.

\n

Default Value: \u2018target\u2019

\n
\n
data (Bool, optional)
\n

Run data tests defined in \u201ctests\u201d directory.

\n

Default Value: False

\n
\n
schema (Bool, optional)
\n

Run constraint validations from schema.yml files.

\n

Default Value: False

\n
\n
fail-fast (Bool, optional)
\n

Stop execution upon a first test failure.

\n

Default Value: False

\n
\n
threads (Union[Int, None], optional)
\n

Specify number of threads to use while executing models. Overrides settings in profiles.yml.

\n

Default Value: None

\n
\n
models (Union[List[String], None], optional)
\n

The dbt models to run.

\n

Default Value: None

\n
\n
exclude (Union[List[String], None], optional)
\n

The dbt models to exclude.

\n

Default Value: None

\n
\n
\n

This solid executes dbt test via the dbt CLI. See the solid definition for available\nparameters.

\n
\n\n
\n
\n

RPC Solids\u00b6

\n
\n
\ndagster_dbt.create_dbt_rpc_run_sql_solid(name, output_def=None, **kwargs)[source]\u00b6
\n

This function is a factory which constructs a solid that will copy the results of a SQL query\nrun within the context of a dbt project to a pandas DataFrame.

\n

Any kwargs passed to this function will be passed along to the underlying @solid decorator. However, note that overriding config_schema, input_defs, and\nrequired_resource_keys is not allowed and will throw a DagsterInvalidDefinitionError.

\n

If you would like to configure this solid with different config fields, you could consider using\n@composite_solid to wrap this solid.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of this solid.

  • \n
  • output_def (OutputDefinition, optional) \u2013 The OutputDefinition for the solid. This value should always be a representation\nof a pandas DataFrame. If not specified, the solid will default to an\nOutputDefinition named \u201cdf\u201d with a DataFrame\ndagster type.

  • \n
\n
\n
Returns
\n

Returns the constructed solid definition.

\n
\n
Return type
\n

SolidDefinition

\n
\n
\n
\n\n
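
A minimal sketch of constructing such a solid (the name is illustrative; the returned SolidDefinition keeps the factory's fixed inputs and required resources, per the note above):

\n
from dagster_dbt import create_dbt_rpc_run_sql_solid\n\n# hypothetical solid name\nrun_my_query = create_dbt_rpc_run_sql_solid(name="run_my_query")\n
\n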
\n
\ndagster_dbt.dbt_rpc_compile_sql = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
name (String)
\n

\n
interval (Int, optional)
\n

The interval (in seconds) at which to poll the dbt rpc process.

\n

Default Value: 10

\n
\n
logs (Bool, optional)
\n

Whether or not to return logs from the process.

\n

Default Value: True

\n
\n
yield_materializations (Bool, optional)
\n

If True, materializations corresponding to the results of the dbt operation will be yielded when the solid executes. Default: True

\n

Default Value: True

\n
\n
\n

This solid sends the dbt compile command to a dbt RPC server and returns the request\ntoken.

\n

This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\npoll the progress of the running dbt process.

\n
\n\n
\n
\ndagster_dbt.dbt_rpc_run = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
models (Union[List[String], None], optional)
\n

The dbt models to run.

\n

Default Value: None

\n
\n
exclude (Union[List[String], None], optional)
\n

The dbt models to exclude.

\n

Default Value: None

\n
\n
\n

This solid sends the dbt run command to a dbt RPC server and returns the request token.

\n

This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\npoll the progress of the running dbt process.

\n
\n\n
\n
\ndagster_dbt.dbt_rpc_run_and_wait = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
models (Union[List[String], None], optional)
\n

The dbt models to run.

\n

Default Value: None

\n
\n
exclude (Union[List[String], None], optional)
\n

The dbt models to exclude.

\n

Default Value: None

\n
\n
full_refresh (Bool, optional)
\n

Whether or not to perform a \u2013full-refresh.

\n

Default Value: False

\n
\n
fail_fast (Bool, optional)
\n

Whether or not to \u2013fail-fast.

\n

Default Value: False

\n
\n
warn_error (Bool, optional)
\n

Whether or not to \u2013warn-error.

\n

Default Value: False

\n
\n
interval (Int, optional)
\n

The interval (in seconds) at which to poll the dbt rpc process.

\n

Default Value: 10

\n
\n
logs (Bool, optional)
\n

Whether or not to return logs from the process.

\n

Default Value: True

\n
\n
task_tags (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
max_retries (Int, optional)
\n

Default Value: 5

\n
\n
retry_interval (Int, optional)
\n

Default Value: 120

\n
\n
yield_materializations (Bool, optional)
\n

If True, materializations corresponding to the results of the dbt operation will be yielded when the solid executes. Default: True

\n

Default Value: True

\n
\n
\n

This solid sends the dbt run command to a dbt RPC server and returns the result of the\nexecuted dbt process.

\n

This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\nprocess is completed.

\n
\n\n
\n
\ndagster_dbt.dbt_rpc_run_operation = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
macro (String)
\n

The dbt macro to invoke as a run operation

\n
\n
args (Union[permissive dict, None], optional)
\n

Arguments to supply to the invoked macro.

\n

Default Value: None

\n
\n
\n

This solid sends the dbt run-operation command to a dbt RPC server and returns the\nrequest token.

\n

This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\npoll the progress of the running dbt process.

\n
\n\n
\n
\ndagster_dbt.dbt_rpc_run_operation_and_wait = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
macro (String)
\n

The dbt macro to invoke as a run operation

\n
\n
args (Union[permissive dict, None], optional)
\n

Arguments to supply to the invoked macro.

\n

Default Value: None

\n
\n
interval (Int, optional)
\n

The interval (in seconds) at which to poll the dbt rpc process.

\n

Default Value: 10

\n
\n
logs (Bool, optional)
\n

Whether or not to return logs from the process.

\n

Default Value: True

\n
\n
yield_materializations (Bool, optional)
\n

If True, materializations corresponding to the results of the dbt operation will be yielded when the solid executes. Default: True

\n

Default Value: True

\n
\n
\n

This solid sends the dbt run-operation command to a dbt RPC server and returns the\nresult of the executed dbt process.

\n

This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\nprocess is completed.

\n
\n\n
\n
\ndagster_dbt.dbt_rpc_snapshot = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
select (Union[List[String], None], optional)
\n

The dbt snapshot files to snapshot.

\n

Default Value: None

\n
\n
exclude (Union[List[String], None], optional)
\n

The dbt snapshot files to exclude from the snapshot.

\n

Default Value: None

\n
\n
\n

This solid sends the dbt snapshot command to a dbt RPC server and returns the\nrequest token.

\n

This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\npoll the progress of the running dbt process.

\n
\n\n
\n
\ndagster_dbt.dbt_rpc_snapshot_and_wait = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
select (Union[List[String], None], optional)
\n

The dbt snapshot files to snapshot.

\n

Default Value: None

\n
\n
exclude (Union[List[String], None], optional)
\n

The dbt snapshot files to exclude from the snapshot.

\n

Default Value: None

\n
\n
interval (Int, optional)
\n

The interval (in seconds) at which to poll the dbt rpc process.

\n

Default Value: 10

\n
\n
logs (Bool, optional)
\n

Whether or not to return logs from the process.

\n

Default Value: True

\n
\n
task_tags (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
max_retries (Int, optional)
\n

Default Value: 5

\n
\n
retry_interval (Int, optional)
\n

Default Value: 120

\n
\n
yield_materializations (Bool, optional)
\n

If True, materializations corresponding to the results of the dbt operation will be yielded when the solid executes. Default: True

\n

Default Value: True

\n
\n
\n

This solid sends the dbt snapshot command to a dbt RPC server and returns the result of\nthe executed dbt process.

\n

This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\nprocess is completed.

\n
\n\n
\n
\ndagster_dbt.dbt_rpc_snapshot_freshness = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
select (Union[List[String], None], optional)
\n

The dbt sources to snapshot-freshness for.

\n

Default Value: None

\n
\n
warn_error (Bool, optional)
\n

Whether or not to \u2013warn-error.

\n

Default Value: False

\n
\n
\n

This solid sends the dbt source snapshot-freshness command to a dbt RPC server and\nreturns the request token.

\n

This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\npoll the progress of the running dbt process.

\n
\n\n
\n
\ndagster_dbt.dbt_rpc_snapshot_freshness_and_wait = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
select (Union[List[String], None], optional)
\n

The dbt sources to snapshot-freshness for.

\n

Default Value: None

\n
\n
warn_error (Bool, optional)
\n

Whether or not to \u2013warn-error.

\n

Default Value: False

\n
\n
interval (Int, optional)
\n

The interval (in seconds) at which to poll the dbt rpc process.

\n

Default Value: 10

\n
\n
logs (Bool, optional)
\n

Whether or not to return logs from the process.

\n

Default Value: True

\n
\n
yield_materializations (Bool, optional)
\n

If True, materializations corresponding to the results of the dbt operation will be yielded when the solid executes. Default: True

\n

Default Value: True

\n
\n
\n

This solid sends the dbt source snapshot command to a dbt RPC server and returns the\nresult of the executed dbt process.

\n

This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\nprocess is completed.

\n
\n\n
\n
\ndagster_dbt.dbt_rpc_test = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
models (Union[List[String], None], optional)
\n

The dbt models to test.

\n

Default Value: None

\n
\n
exclude (Union[List[String], None], optional)
\n

The dbt models to exclude.

\n

Default Value: None

\n
\n
data (Bool, optional)
\n

Whether or not to run custom data tests.

\n

Default Value: True

\n
\n
schema (Bool, optional)
\n

Whether or not to run schema tests.

\n

Default Value: True

\n
\n
\n

This solid sends the dbt test command to a dbt RPC server and returns the request token.

\n

This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\npoll the progress of the running dbt process.

\n
\n\n
\n
\ndagster_dbt.dbt_rpc_test_and_wait = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
models (Union[List[String], None], optional)
\n

The dbt models to test.

\n

Default Value: None

\n
\n
exclude (Union[List[String], None], optional)
\n

The dbt models to exclude.

\n

Default Value: None

\n
\n
data (Bool, optional)
\n

Whether or not to run custom data tests.

\n

Default Value: True

\n
\n
schema (Bool, optional)
\n

Whether or not to run schema tests.

\n

Default Value: True

\n
\n
interval (Int, optional)
\n

The interval (in seconds) at which to poll the dbt RPC process.

\n

Default Value: 10

\n
\n
logs (Bool, optional)
\n

Whether or not to return logs from the process.

\n

Default Value: True

\n
\n
yield_materializations (Bool, optional)
\n

If True, materializations corresponding to the results of the dbt operation will be yielded when the solid executes. Default: True

\n

Default Value: True

\n
\n
\n

This solid sends the dbt test command to a dbt RPC server and returns the result of the\nexecuted dbt process.

\n

This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\nprocess is completed.

\n
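
For example, a minimal sketch of wiring this solid to a dbt RPC server (the host, port, and pipeline name below are placeholders, and it assumes the solid reads its connection from a resource bound to the dbt_rpc key):

\n
from dagster import ModeDefinition, pipeline\nfrom dagster_dbt import dbt_rpc_resource, dbt_rpc_test_and_wait\n\n# Placeholder host/port for an already-running dbt RPC server.\nmy_dbt_rpc_resource = dbt_rpc_resource.configured({"host": "127.0.0.1", "port": 8580})\n\n@pipeline(mode_defs=[ModeDefinition(resource_defs={"dbt_rpc": my_dbt_rpc_resource})])\ndef dbt_test_pipeline():\n    dbt_rpc_test_and_wait()\n
\n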
\n\n
\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-dbt", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../dagster-fivetran/", "title": "Fivetran (dagster-fivetran)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-datadog/", "title": "Datadog (dagster-datadog)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-fivetran", "Fivetran (dagster-fivetran)", "N", "next"], ["sections/api/apidocs/libraries/dagster-datadog", "Datadog (dagster-datadog)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-dbt.rst.txt", "title": "dbt (dagster-dbt)", "toc": "\n"}, "dagster-docker": {"alabaster_version": "0.7.12", "body": "
\n

Orchestration on Docker\u00b6

\n
\n
\n

APIs\u00b6

\n
\n
\ndagster_docker.DockerRunLauncher RunLauncher[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
image (dagster.StringSource, optional)
\n

The docker image to be used if the repository does not specify one.

\n
\n
registry (strict dict, optional)
\n

Information for using a non-local/public Docker registry.

\n
\nConfig Schema:
\n
url (dagster.StringSource)
\n

\n
username (dagster.StringSource)
\n

\n
password (dagster.StringSource)
\n

\n
\n
\n
env_vars (List[String], optional)
\n

The list of environment variable names to forward to the Docker container.

\n
\n
network (dagster.StringSource, optional)
\n

Name of the network to which to connect the launched container at creation time

\n
\n
networks (List[dagster.StringSource], optional)
\n

Names of the networks to which to connect the launched container at creation time

\n
\n
container_kwargs (permissive dict, optional)
\n

Key-value pairs that can be passed into containers.create. See https://docker-py.readthedocs.io/en/stable/containers.html for the full list of available options.

\n
\n
\n

Launches runs in a Docker container.

\n
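
To use the DockerRunLauncher, configure it as the run launcher in your dagster.yaml. A minimal sketch (the image, network, and environment variable names below are placeholders):

\n
run_launcher:\n  module: dagster_docker\n  class: DockerRunLauncher\n  config:\n    image: my_org/my_image:latest\n    network: my_network\n    env_vars:\n      - DAGSTER_POSTGRES_PASSWORD\n
\n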
\n\n
\n
\ndagster_docker.docker_executor ExecutorDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
image (dagster.StringSource, optional)
\n

The docker image to be used if the repository does not specify one.

\n
\n
registry (strict dict, optional)
\n

Information for using a non-local/public Docker registry.

\n
\nConfig Schema:
\n
url (dagster.StringSource)
\n

\n
username (dagster.StringSource)
\n

\n
password (dagster.StringSource)
\n

\n
\n
\n
env_vars (List[String], optional)
\n

The list of environment variable names to forward to the Docker container.

\n
\n
network (dagster.StringSource, optional)
\n

Name of the network to which to connect the launched container at creation time

\n
\n
networks (List[dagster.StringSource], optional)
\n

Names of the networks to which to connect the launched container at creation time

\n
\n
container_kwargs (permissive dict, optional)
\n

Key-value pairs that can be passed into containers.create. See https://docker-py.readthedocs.io/en/stable/containers.html for the full list of available options.

\n
\n
retries (selector, optional)
\n
\nDefault Value:
{\n    "enabled": {}\n}\n
\n
\n
\nConfig Schema:
\n
enabled (strict dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
disabled (strict dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
\n

Executor which launches steps as Docker containers.

\n

To use the docker_executor, set it as the executor_def when defining a job:

\n
from dagster import job\nfrom dagster_docker import docker_executor\n\n@job(executor_def=docker_executor)\ndef docker_job():\n    pass\n
\n
\n

Then you can configure the executor with run config as follows:

\n
execution:\n  config:\n    registry: ...\n    network: ...\n    networks: ...\n    container_kwargs: ...\n
\n
\n

If you\u2019re using the DockerRunLauncher, configuration set on the containers created by the run\nlauncher will also be set on the containers that are created for each step.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-docker", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../dagster-gcp/", "title": "GCP (dagster-gcp)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-fivetran/", "title": "Fivetran (dagster-fivetran)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-gcp", "GCP (dagster-gcp)", "N", "next"], ["sections/api/apidocs/libraries/dagster-fivetran", "Fivetran (dagster-fivetran)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-docker.rst.txt", "title": "Orchestration on Docker", "toc": "\n"}, "dagster-fivetran": {"alabaster_version": "0.7.12", "body": "
\n

Fivetran (dagster-fivetran)\u00b6

\n

This library provides a Dagster integration with Fivetran.

\n
\n

Ops\u00b6

\n
\n
\ndagster_fivetran.fivetran_sync_op = <dagster.core.definitions.op_definition.OpDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
connector_id (String)
\n

The Fivetran Connector ID that this op will sync. You can retrieve this value from the \u201cSetup\u201d tab of a given connector in the Fivetran UI.

\n
\n
poll_interval (Float, optional)
\n

The time (in seconds) that will be waited between successive polls.

\n

Default Value: 10

\n
\n
poll_timeout (Union[Float, None], optional)
\n

The maximum time that will be waited before this operation is timed out. By default, this will never time out.

\n

Default Value: None

\n
\n
yield_materializations (Bool, optional)
\n

If True, materializations corresponding to the results of the Fivetran sync will be yielded when the op executes.

\n

Default Value: True

\n
\n
asset_key_prefix (List[String], optional)
\n

If provided and yield_materializations is True, these components will be used to prefix the generated asset keys.

\n

Default Value: [\u2018fivetran\u2019]

\n
\n
\n

Executes a Fivetran sync for a given connector_id, and polls until that sync\ncompletes, raising an error if it is unsuccessful. It outputs a FivetranOutput which contains\nthe details of the Fivetran connector after the sync successfully completes, as well as details\nabout which tables the sync updates.

\n

It requires the use of the fivetran_resource, which allows it to\ncommunicate with the Fivetran API.

\n

Examples:

\n
from dagster import job\nfrom dagster_fivetran import fivetran_resource, fivetran_sync_op\n\nmy_fivetran_resource = fivetran_resource.configured(\n    {\n        "api_key": {"env": "FIVETRAN_API_KEY"},\n        "api_secret": {"env": "FIVETRAN_API_SECRET"},\n    }\n)\n\nsync_foobar = fivetran_sync_op.configured({"connector_id": "foobar"}, name="sync_foobar")\n\n@job(resource_defs={"fivetran": my_fivetran_resource})\ndef my_simple_fivetran_job():\n    sync_foobar()\n\n@job(resource_defs={"fivetran": my_fivetran_resource})\ndef my_composed_fivetran_job():\n    final_foobar_state = sync_foobar(start_after=some_op())\n    other_op(final_foobar_state)\n
\n
\n
\n\n
\n
\n

Resources\u00b6

\n
\n
\ndagster_fivetran.fivetran_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
api_key (dagster.StringSource)
\n

Fivetran API Key. You can find this value on the Fivetran settings page: https://fivetran.com/account/settings

\n
\n
api_secret (dagster.StringSource)
\n

Fivetran API Secret. You can find this value on the Fivetran settings page: https://fivetran.com/account/settings

\n
\n
disable_schedule_on_trigger (Bool, optional)
\n

Specifies if you would like any connector that is sync\u2019d using this resource to be automatically taken off its Fivetran schedule.

\n

Default Value: True

\n
\n
request_max_retries (Int, optional)
\n

The maximum number of times requests to the Fivetran API should be retried before failing.

\n

Default Value: 3

\n
\n
request_retry_delay (Float, optional)
\n

Time (in seconds) to wait between each request retry.

\n

Default Value: 0.25

\n
\n
\n

This resource allows users to programmatically interface with the Fivetran REST API to launch\nsyncs and monitor their progress. This currently implements only a subset of the functionality\nexposed by the API.

\n

For a complete set of documentation on the Fivetran REST API, including expected response JSON\nschemas, see the Fivetran API Docs.

\n

To configure this resource, we recommend using the configured method.

\n

Examples:

\n
from dagster import job\nfrom dagster_fivetran import fivetran_resource\n\nmy_fivetran_resource = fivetran_resource.configured(\n    {\n        "api_key": {"env": "FIVETRAN_API_KEY"},\n        "api_secret": {"env": "FIVETRAN_API_SECRET"},\n    }\n)\n\n@job(resource_defs={"fivetran":my_fivetran_resource})\ndef my_fivetran_job():\n    ...\n
\n
\n
\n\n
\n
\nclass dagster_fivetran.FivetranResource(api_key, api_secret, disable_schedule_on_trigger=True, request_max_retries=3, request_retry_delay=0.25, log=<Logger dagster.builtin (DEBUG)>)[source]\u00b6
\n

This class exposes methods on top of the Fivetran REST API.

\n
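
A rough usage sketch of the client methods below (the credentials and connector ID are placeholders; in jobs you would typically configure the fivetran_resource rather than instantiating this class directly):

\n
from dagster_fivetran import FivetranResource\n\n# Placeholder credentials and connector ID.\nfivetran = FivetranResource(api_key="my_api_key", api_secret="my_api_secret")\n\ndetails = fivetran.get_connector_details("foobar")\nsync_output = fivetran.sync_and_poll("foobar", poll_interval=15)\n
\n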
\n
\nget_connector_details(connector_id)[source]\u00b6
\n

Gets details about a given connector from the Fivetran Connector API.

\n
\n
Parameters
\n

connector_id (str) \u2013 The Fivetran Connector ID. You can retrieve this value from the\n\u201cSetup\u201d tab of a given connector in the Fivetran UI.

\n
\n
Returns
\n

Parsed json data from the response to this request

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nget_connector_sync_status(connector_id)[source]\u00b6
\n

Gets details about the status of the most recent Fivetran sync operation for a given\nconnector.

\n
\n
Parameters
\n

connector_id (str) \u2013 The Fivetran Connector ID. You can retrieve this value from the\n\u201cSetup\u201d tab of a given connector in the Fivetran UI.

\n
\n
Returns
\n

Tuple representing the timestamp of the last completed sync, if it succeeded, and\nthe currently reported sync status.

\n
\n
Return type
\n

Tuple[datetime.datetime, bool, str]

\n
\n
\n
\n\n
\n
\nmake_request(method, endpoint, data=None)[source]\u00b6
\n

Creates and sends a request to the desired Fivetran Connector API endpoint.

\n
\n
Parameters
\n
    \n
  • method (str) \u2013 The http method to use for this request (e.g. \u201cPOST\u201d, \u201cGET\u201d, \u201cPATCH\u201d).

  • \n
  • endpoint (str) \u2013 The Fivetran API endpoint to send this request to.

  • \n
  • data (Optional[str]) \u2013 JSON-formatted data string to be included in the request.

  • \n
\n
\n
Returns
\n

Parsed json data from the response to this request

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\npoll_sync(connector_id, initial_last_sync_completion, poll_interval=10, poll_timeout=None)[source]\u00b6
\n

Given a Fivetran connector and the timestamp at which the previous sync completed, poll\nuntil the next sync completes.

\n

The previous sync completion time is necessary because the only way to tell when a sync\ncompletes is when this value changes.

\n
\n
Parameters
\n
    \n
  • connector_id (str) \u2013 The Fivetran Connector ID. You can retrieve this value from the\n\u201cSetup\u201d tab of a given connector in the Fivetran UI.

  • \n
  • initial_last_sync_completion (datetime.datetime) \u2013 The timestamp of the last completed sync\n(successful or otherwise) for this connector, prior to running this method.

  • \n
  • poll_interval (float) \u2013 The time (in seconds) that will be waited between successive polls.

  • \n
  • poll_timeout (float) \u2013 The maximum time that will be waited before this operation is timed\nout. By default, this will never time out.

  • \n
\n
\n
Returns
\n

Parsed json data representing the API response.

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nresync_and_poll(connector_id, resync_parameters, poll_interval=10, poll_timeout=None)[source]\u00b6
\n

Initializes a historical resync operation for the given connector, and polls until it completes.

\n
\n
Parameters
\n
    \n
  • connector_id (str) \u2013 The Fivetran Connector ID. You can retrieve this value from the\n\u201cSetup\u201d tab of a given connector in the Fivetran UI.

  • \n
  • resync_parameters (Dict[str, List[str]]) \u2013 The payload to send to the Fivetran API.\nThis should be a dictionary with schema names as the keys and a list of tables\nto resync as the values.

  • \n
  • poll_interval (float) \u2013 The time (in seconds) that will be waited between successive polls.

  • \n
  • poll_timeout (float) \u2013 The maximum time that will be waited before this operation is timed\nout. By default, this will never time out.

  • \n
\n
\n
Returns
\n

Object containing details about the connector and the tables it updates

\n
\n
Return type
\n

FivetranOutput

\n
\n
\n
\n\n
\n
\nstart_resync(connector_id, resync_parameters)[source]\u00b6
\n

Initiates a historical sync of all data for multiple schema tables within a Fivetran connector.

\n
\n
Parameters
\n
    \n
  • connector_id (str) \u2013 The Fivetran Connector ID. You can retrieve this value from the\n\u201cSetup\u201d tab of a given connector in the Fivetran UI.

  • \n
  • resync_parameters (Dict[str, List[str]]) \u2013 The resync parameters to send to the Fivetran API.\nAn example payload can be found here: https://fivetran.com/docs/rest-api/connectors#request_6

  • \n
\n
\n
Returns
\n

\n
Parsed json data representing the connector details API response after

the resync is started.

\n
\n
\n

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nstart_sync(connector_id)[source]\u00b6
\n

Initiates a sync of a Fivetran connector.

\n
\n
Parameters
\n

connector_id (str) \u2013 The Fivetran Connector ID. You can retrieve this value from the\n\u201cSetup\u201d tab of a given connector in the Fivetran UI.

\n
\n
Returns
\n

\n
Parsed json data representing the connector details API response after

the sync is started.

\n
\n
\n

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nsync_and_poll(connector_id, poll_interval=10, poll_timeout=None)[source]\u00b6
\n

Initializes a sync operation for the given connector, and polls until it completes.

\n
\n
Parameters
\n
    \n
  • connector_id (str) \u2013 The Fivetran Connector ID. You can retrieve this value from the\n\u201cSetup\u201d tab of a given connector in the Fivetran UI.

  • \n
  • poll_interval (float) \u2013 The time (in seconds) that will be waited between successive polls.

  • \n
  • poll_timeout (float) \u2013 The maximum time that will be waited before this operation is timed\nout. By default, this will never time out.

  • \n
\n
\n
Returns
\n

Object containing details about the connector and the tables it updates

\n
\n
Return type
\n

FivetranOutput

\n
\n
\n
\n\n
\n
\nupdate_connector(connector_id, properties=None)[source]\u00b6
\n

Updates properties of a Fivetran Connector.

\n
\n
Parameters
\n
    \n
  • connector_id (str) \u2013 The Fivetran Connector ID. You can retrieve this value from the\n\u201cSetup\u201d tab of a given connector in the Fivetran UI.

  • \n
  • properties (Dict[str, Any]) \u2013 The properties to be updated. For a comprehensive list of\nproperties, see the [Fivetran docs](https://fivetran.com/docs/rest-api/connectors#modifyaconnector).

  • \n
\n
\n
Returns
\n

Parsed json data representing the API response.

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nupdate_schedule_type(connector_id, schedule_type=None)[source]\u00b6
\n

Updates the schedule type property of the connector to either \u201cauto\u201d or \u201cmanual\u201d.

\n
\n
Parameters
\n
    \n
  • connector_id (str) \u2013 The Fivetran Connector ID. You can retrieve this value from the\n\u201cSetup\u201d tab of a given connector in the Fivetran UI.

  • \n
  • schedule_type (Optional[str]) \u2013 Either \u201cauto\u201d (to turn the schedule on) or \u201cmanual\u201d (to\nturn it off).

  • \n
\n
\n
Returns
\n

Parsed json data representing the API response.

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n\n
\n
\n

Assets\u00b6

\n
\n
\ndagster_fivetran.build_fivetran_assets(connector_id, destination_tables, poll_interval=10, poll_timeout=None, io_manager_key=None, asset_key_prefix=None)[source]\u00b6
\n

Build a set of assets for a given Fivetran connector.

\n

Returns an AssetsDefinition which connects the specified asset_keys to the computation that\nwill update them. Internally, executes a Fivetran sync for a given connector_id, and\npolls until that sync completes, raising an error if it is unsuccessful. Requires the use of the\nfivetran_resource, which allows it to communicate with the\nFivetran API.

\n
\n
Parameters
\n
    \n
  • connector_id (str) \u2013 The Fivetran Connector ID that this op will sync. You can retrieve this\nvalue from the \u201cSetup\u201d tab of a given connector in the Fivetran UI.

  • \n
  • destination_tables (List[str]) \u2013 schema_name.table_name for each table that you want to be\nrepresented in the Dagster asset graph for this connection.

  • \n
  • poll_interval (float) \u2013 The time (in seconds) that will be waited between successive polls.

  • \n
  • poll_timeout (Optional[float]) \u2013 The maximum time that will be waited before this operation is\ntimed out. By default, this will never time out.

  • \n
  • io_manager_key (Optional[str]) \u2013 The io_manager to be used to handle each of these assets.

  • \n
  • asset_key_prefix (Optional[List[str]]) \u2013 A prefix for the asset keys inside this asset.\nIf left blank, assets will have a key of AssetKey([schema_name, table_name]).

  • \n
\n
\n
\n

Examples:

\n
from dagster import AssetKey, build_assets_job\n\nfrom dagster_fivetran import fivetran_resource\nfrom dagster_fivetran.assets import build_fivetran_assets\n\nmy_fivetran_resource = fivetran_resource.configured(\n    {\n        "api_key": {"env": "FIVETRAN_API_KEY"},\n        "api_secret": {"env": "FIVETRAN_API_SECRET"},\n    }\n)\n\nfivetran_assets = build_fivetran_assets(\n    connector_id="foobar",\n    destination_tables=["schema1.table1", "schema2.table2"],\n)\n\nmy_fivetran_job = build_assets_job(\n    "my_fivetran_job",\n    assets=[fivetran_assets],\n    resource_defs={"fivetran": my_fivetran_resource}\n)\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-fivetran", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../dagster-docker/", "title": "Orchestration on Docker"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-dbt/", "title": "dbt (dagster-dbt)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-docker", "Orchestration on Docker", "N", "next"], ["sections/api/apidocs/libraries/dagster-dbt", "dbt (dagster-dbt)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-fivetran.rst.txt", "title": "Fivetran (dagster-fivetran)", "toc": "\n"}, "dagster-gcp": {"alabaster_version": "0.7.12", "body": "
\n

GCP (dagster-gcp)\u00b6

\n
\n

BigQuery\u00b6

\n
\n
\nclass dagster_gcp.BigQueryError[source]\u00b6
\n
\n\n
\n
\ndagster_gcp.bigquery_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project (dagster.StringSource, optional)
\n

Project ID for the project which the client acts on behalf of. Will be passed\nwhen creating a dataset / job. If not passed, falls back to the default inferred from the\nenvironment.

\n
\n
location (dagster.StringSource, optional)
\n

(Optional) Default location for jobs / datasets / tables.

\n
\n
\n
\n\n
\n
\ndagster_gcp.bq_create_dataset(context)[source]\u00b6
\n

BigQuery Create Dataset.

\n

This op encapsulates creating a BigQuery dataset.

\n

Expects a BQ client to be provisioned in resources as context.resources.bigquery.

\n
\n\n
\n
\ndagster_gcp.bq_delete_dataset(context)[source]\u00b6
\n

BigQuery Delete Dataset.

\n

This op encapsulates deleting a BigQuery dataset.

\n

Expects a BQ client to be provisioned in resources as context.resources.bigquery.

\n
\n\n
\n
\ndagster_gcp.bq_op_for_queries(sql_queries)[source]\u00b6
\n

Executes BigQuery SQL queries.

\n

Expects a BQ client to be provisioned in resources as context.resources.bigquery.

\n
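
For example, a minimal sketch (the query is a placeholder) that builds an op from a list of queries and supplies the bigquery resource:

\n
from dagster import job\nfrom dagster_gcp import bigquery_resource, bq_op_for_queries\n\n# Placeholder query; the generated op runs each query in the provided list.\nrun_my_queries = bq_op_for_queries(["SELECT 1"])\n\n@job(resource_defs={"bigquery": bigquery_resource})\ndef my_bq_job():\n    run_my_queries()\n
\n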
\n\n
\n
\ndagster_gcp.import_df_to_bq(context, df)[source]\u00b6
\n
\n\n
\n
\ndagster_gcp.import_file_to_bq(context, path)[source]\u00b6
\n
\n\n
\n
\ndagster_gcp.import_gcs_paths_to_bq(context, paths)[source]\u00b6
\n
\n\n
\n
\n

Dataproc\u00b6

\n
\n
\ndagster_gcp.dataproc_op = <dagster.core.definitions.op_definition.OpDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
job_config (strict dict)
\n
\nConfig Schema:
\n
job (strict dict, optional)
\n

A Cloud Dataproc job resource.

\n
\nConfig Schema:
\n
status (strict dict, optional)
\n

Cloud Dataproc job status.

\n
\n
placement (strict dict, optional)
\n

Cloud Dataproc job config.

\n
\nConfig Schema:
\n
clusterName (String, optional)
\n

Required. The name of the cluster where the job will\nbe submitted.

\n
\n
\n
\n
scheduling (strict dict, optional)
\n

Job scheduling options.

\n
\nConfig Schema:
\n
maxFailuresPerHour (Int, optional)
\n

Optional. Maximum number of times per hour a driver\nmay be restarted as a result of the driver terminating with non-zero\ncode before the job is reported failed. A job may be reported as\nthrashing if the driver exits with non-zero code 4 times within a\n10-minute window. Maximum value is 10.

\n
\n
\n
\n
pigJob (strict dict, optional)
\n

A Cloud Dataproc job for running Apache Pig\n(https://pig.apache.org/) queries on YARN.

\n
\nConfig Schema:
\n
queryFileUri (String, optional)
\n

The HCFS URI of the script that contains the Pig\nqueries.

\n
\n
queryList (strict dict, optional)
\n

A list of queries to run on a cluster.

\n
\nConfig Schema:
\n
queries (List[String], optional)
\n

Required. The queries to execute. You do\nnot need to terminate a query with a semicolon. Multiple\nqueries can be specified in one string by separating\neach with a semicolon. Here is an example of a Cloud\nDataproc API snippet that uses a QueryList to specify a\nHiveJob: \u201chiveJob\u201d: { \u201cqueryList\u201d: { \u201cqueries\u201d: [\n\u201cquery1\u201d, \u201cquery2\u201d, \u201cquery3;query4\u201d, ]\n} }

\n
\n
\n
\n
jarFileUris (List[String], optional)
\n

Optional. HCFS URIs of jar files to add to the\nCLASSPATH of the Pig Client and Hadoop MapReduce (MR) tasks. Can\ncontain Pig UDFs.

\n
\n
scriptVariables (permissive dict, optional)
\n

Optional. Mapping of query variable names to values\n(equivalent to the Pig command: name=[value]).

\n
\n
loggingConfig (strict dict, optional)
\n

The runtime logging config of the job.

\n
\nConfig Schema:
\n
driverLogLevels (permissive dict, optional)
\n

The per-package log levels for the\ndriver. This may include \u201croot\u201d package name to\nconfigure rootLogger. Examples: \u2018com.google = FATAL\u2019,\n\u2018root = INFO\u2019, \u2018org.apache = DEBUG\u2019

\n
\n
\n
\n
properties (permissive dict, optional)
\n

Optional. A mapping of property names to values, used\nto configure Pig. Properties that conflict with values set by the\nCloud Dataproc API may be overwritten. Can include properties set in\n/etc/hadoop/conf/*-site.xml, /etc/pig/conf/pig.properties, and\nclasses in user code.

\n
\n
continueOnFailure (Bool, optional)
\n

Optional. Whether to continue executing queries if a\nquery fails. The default value is false. Setting to true can be\nuseful when executing independent parallel queries.

\n
\n
\n
\n
hiveJob (strict dict, optional)
\n

A Cloud Dataproc job for running Apache Hive\n(https://hive.apache.org/) queries on YARN.

\n
\nConfig Schema:
\n
continueOnFailure (Bool, optional)
\n

Optional. Whether to continue executing queries if a\nquery fails. The default value is false. Setting to true can be\nuseful when executing independent parallel queries.

\n
\n
queryFileUri (String, optional)
\n

The HCFS URI of the script that contains Hive\nqueries.

\n
\n
queryList (strict dict, optional)
\n

A list of queries to run on a cluster.

\n
\nConfig Schema:
\n
queries (List[String], optional)
\n

Required. The queries to execute. You do\nnot need to terminate a query with a semicolon. Multiple\nqueries can be specified in one string by separating\neach with a semicolon. Here is an example of a Cloud\nDataproc API snippet that uses a QueryList to specify a\nHiveJob: \u201chiveJob\u201d: { \u201cqueryList\u201d: { \u201cqueries\u201d: [\n\u201cquery1\u201d, \u201cquery2\u201d, \u201cquery3;query4\u201d, ]\n} }

\n
\n
\n
\n
jarFileUris (List[String], optional)
\n

Optional. HCFS URIs of jar files to add to the\nCLASSPATH of the Hive server and Hadoop MapReduce (MR) tasks. Can\ncontain Hive SerDes and UDFs.

\n
\n
scriptVariables (permissive dict, optional)
\n

Optional. Mapping of query variable names to values\n(equivalent to the Hive command: SET name=\u201dvalue\u201d;).

\n
\n
properties (permissive dict, optional)
\n

Optional. A mapping of property names and values,\nused to configure Hive. Properties that conflict with values set by\nthe Cloud Dataproc API may be overwritten. Can include properties\nset in /etc/hadoop/conf/*-site.xml, /etc/hive/conf/hive-site.xml,\nand classes in user code.

\n
\n
\n
\n
labels (permissive dict, optional)
\n

Optional. The labels to associate with this job. Label keys must\ncontain 1 to 63 characters, and must conform to RFC 1035\n(https://www.ietf.org/rfc/rfc1035.txt). Label values may be empty, but, if\npresent, must contain 1 to 63 characters, and must conform to RFC 1035\n(https://www.ietf.org/rfc/rfc1035.txt). No more than 32 labels can be associated\nwith a job.

\n
\n
sparkJob (strict dict, optional)
\n

A Cloud Dataproc job for running Apache Spark\n(http://spark.apache.org/) applications on YARN.

\n
\nConfig Schema:
\n
archiveUris (List[String], optional)
\n

Optional. HCFS URIs of archives to be extracted in\nthe working directory of Spark drivers and tasks. Supported file\ntypes: .jar, .tar, .tar.gz, .tgz, and .zip.

\n
\n
mainJarFileUri (String, optional)
\n

The HCFS URI of the jar file that contains the main\nclass.

\n
\n
jarFileUris (List[String], optional)
\n

Optional. HCFS URIs of jar files to add to the\nCLASSPATHs of the Spark driver and tasks.

\n
\n
loggingConfig (strict dict, optional)
\n

The runtime logging config of the job.

\n
\nConfig Schema:
\n
driverLogLevels (permissive dict, optional)
\n

The per-package log levels for the\ndriver. This may include \u201croot\u201d package name to\nconfigure rootLogger. Examples: \u2018com.google = FATAL\u2019,\n\u2018root = INFO\u2019, \u2018org.apache = DEBUG\u2019

\n
\n
\n
\n
properties (permissive dict, optional)
\n

Optional. A mapping of property names to values, used\nto configure Spark. Properties that conflict with values set by the\nCloud Dataproc API may be overwritten. Can include properties set in\n/etc/spark/conf/spark-defaults.conf and classes in user code.

\n
\n
args (List[String], optional)
\n

Optional. The arguments to pass to the driver. Do not\ninclude arguments, such as \u2013conf, that can be set as job\nproperties, since a collision may occur that causes an incorrect job\nsubmission.

\n
\n
fileUris (List[String], optional)
\n

Optional. HCFS URIs of files to be copied to the\nworking directory of Spark drivers and distributed tasks. Useful for\nnaively parallel tasks.

\n
\n
mainClass (String, optional)
\n

The name of the driver\u2019s main class. The jar file\nthat contains the class must be in the default CLASSPATH or\nspecified in jar_file_uris.

\n
\n
\n
\n
sparkSqlJob (strict dict, optional)
\n

A Cloud Dataproc job for running Apache Spark SQL\n(http://spark.apache.org/sql/) queries.

\n
\nConfig Schema:
\n
queryList (strict dict, optional)
\n

A list of queries to run on a cluster.

\n
\nConfig Schema:
\n
queries (List[String], optional)
\n

Required. The queries to execute. You do\nnot need to terminate a query with a semicolon. Multiple\nqueries can be specified in one string by separating\neach with a semicolon. Here is an example of a Cloud\nDataproc API snippet that uses a QueryList to specify a\nHiveJob: \u201chiveJob\u201d: { \u201cqueryList\u201d: { \u201cqueries\u201d: [\n\u201cquery1\u201d, \u201cquery2\u201d, \u201cquery3;query4\u201d, ]\n} }

\n
\n
\n
\n
queryFileUri (String, optional)
\n

The HCFS URI of the script that contains SQL\nqueries.

\n
\n
scriptVariables (permissive dict, optional)
\n

Optional. Mapping of query variable names to values\n(equivalent to the Spark SQL command: SET name=\u201dvalue\u201d;).

\n
\n
jarFileUris (List[String], optional)
\n

Optional. HCFS URIs of jar files to be added to the\nSpark CLASSPATH.

\n
\n
loggingConfig (strict dict, optional)
\n

The runtime logging config of the job.

\n
\nConfig Schema:
\n
driverLogLevels (permissive dict, optional)
\n

The per-package log levels for the\ndriver. This may include \u201croot\u201d package name to\nconfigure rootLogger. Examples: \u2018com.google = FATAL\u2019,\n\u2018root = INFO\u2019, \u2018org.apache = DEBUG\u2019

\n
\n
\n
\n
properties (permissive dict, optional)
\n

Optional. A mapping of property names to values, used\nto configure Spark SQL\u2019s SparkConf. Properties that conflict with\nvalues set by the Cloud Dataproc API may be overwritten.

\n
\n
\n
\n
pysparkJob (strict dict, optional)
\n

A Cloud Dataproc job for running Apache PySpark\n(https://spark.apache.org/docs/0.9.0/python-programming-guide.html) applications\non YARN.

\n
\nConfig Schema:
\n
jarFileUris (List[String], optional)
\n

Optional. HCFS URIs of jar files to add to the\nCLASSPATHs of the Python driver and tasks.

\n
\n
loggingConfig (strict dict, optional)
\n

The runtime logging config of the job.

\n
\nConfig Schema:
\n
driverLogLevels (permissive dict, optional)
\n

The per-package log levels for the\ndriver. This may include \u201croot\u201d package name to\nconfigure rootLogger. Examples: \u2018com.google = FATAL\u2019,\n\u2018root = INFO\u2019, \u2018org.apache = DEBUG\u2019

\n
\n
\n
\n
properties (permissive dict, optional)
\n

Optional. A mapping of property names to values, used\nto configure PySpark. Properties that conflict with values set by\nthe Cloud Dataproc API may be overwritten. Can include properties\nset in /etc/spark/conf/spark-defaults.conf and classes in user\ncode.

\n
\n
args (List[String], optional)
\n

Optional. The arguments to pass to the driver. Do not\ninclude arguments, such as \u2013conf, that can be set as job\nproperties, since a collision may occur that causes an incorrect job\nsubmission.

\n
\n
fileUris (List[String], optional)
\n

Optional. HCFS URIs of files to be copied to the\nworking directory of Python drivers and distributed tasks. Useful\nfor naively parallel tasks.

\n
\n
pythonFileUris (List[String], optional)
\n

Optional. HCFS file URIs of Python files to pass to\nthe PySpark framework. Supported file types: .py, .egg, and\n.zip.

\n
\n
mainPythonFileUri (String, optional)
\n

Required. The HCFS URI of the main Python file to use\nas the driver. Must be a .py file.

\n
\n
archiveUris (List[String], optional)
\n

Optional. HCFS URIs of archives to be extracted in\nthe working directory of .jar, .tar, .tar.gz, .tgz, and .zip.

\n
\n
\n
\n
reference (strict dict, optional)
\n

Encapsulates the full scoping used to reference a job.

\n
\nConfig Schema:
\n
projectId (String, optional)
\n

Required. The ID of the Google Cloud Platform project\nthat the job belongs to.

\n
\n
jobId (String, optional)
\n

Optional. The job ID, which must be unique within the\nproject.The ID must contain only letters (a-z, A-Z), numbers (0-9),\nunderscores (_), or hyphens (-). The maximum length is 100\ncharacters.If not specified by the caller, the job ID will be\nprovided by the server.

\n
\n
\n
\n
hadoopJob (strict dict, optional)
\n

A Cloud Dataproc job for running Apache Hadoop MapReduce\n(https://hadoop.apache.org/docs/current/hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html)\njobs on Apache Hadoop YARN\n(https://hadoop.apache.org/docs/r2.7.1/hadoop-yarn/hadoop-yarn-site/YARN.html).

\n
\nConfig Schema:
\n
jarFileUris (List[String], optional)
\n

Optional. Jar file URIs to add to the CLASSPATHs of\nthe Hadoop driver and tasks.

\n
\n
loggingConfig (strict dict, optional)
\n

The runtime logging config of the job.

\n
\nConfig Schema:
\n
driverLogLevels (permissive dict, optional)
\n

The per-package log levels for the\ndriver. This may include \u201croot\u201d package name to\nconfigure rootLogger. Examples: \u2018com.google = FATAL\u2019,\n\u2018root = INFO\u2019, \u2018org.apache = DEBUG\u2019

\n
\n
\n
\n
properties (permissive dict, optional)
\n

Optional. A mapping of property names to values, used\nto configure Hadoop. Properties that conflict with values set by the\nCloud Dataproc API may be overwritten. Can include properties set in\n/etc/hadoop/conf/*-site and classes in user code.

\n
\n
args (List[String], optional)
\n

Optional. The arguments to pass to the driver. Do not\ninclude arguments, such as -libjars or -Dfoo=bar, that can be set as\njob properties, since a collision may occur that causes an incorrect\njob submission.

\n
\n
fileUris (List[String], optional)
\n

Optional. HCFS (Hadoop Compatible Filesystem) URIs of\nfiles to be copied to the working directory of Hadoop drivers and\ndistributed tasks. Useful for naively parallel tasks.

\n
\n
mainClass (String, optional)
\n

The name of the driver\u2019s main class. The jar file\ncontaining the class must be in the default CLASSPATH or specified\nin jar_file_uris.

\n
\n
archiveUris (List[String], optional)
\n

Optional. HCFS URIs of archives to be extracted in\nthe working directory of Hadoop drivers and tasks. Supported file\ntypes: .jar, .tar, .tar.gz, .tgz, or .zip.

\n
\n
mainJarFileUri (String, optional)
\n

The HCFS URI of the jar file containing the main\nclass. Examples:\n\u2018gs://foo-bucket/analytics-binaries/extract-useful-metrics-mr.jar\u2019\n\u2018hdfs:/tmp/test-samples/custom-wordcount.jar\u2019\n\u2018file:///home/usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar\u2019

\n
\n
\n
\n
\n
\n
projectId (dagster.StringSource)
\n

Required. Project ID for the project which the client acts on behalf of. Will\nbe passed when creating a dataset / job. If not passed, falls back to the default inferred\nfrom the environment.

\n
\n
region (dagster.StringSource)
\n

\n
\n
\n
job_scoped_cluster (Bool, optional)
\n

Whether to create a cluster or use an existing cluster.

\n

Default Value: True

\n
\n
\n
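
A minimal run config sketch for this op (assuming it is invoked under the name dataproc_op in your job; the project, region, cluster name, and file URI are placeholders):

\n
ops:\n  dataproc_op:\n    config:\n      job_scoped_cluster: true\n      job_config:\n        projectId: my-project\n        region: us-central1\n        job:\n          placement:\n            clusterName: my-cluster\n          pysparkJob:\n            mainPythonFileUri: gs://my-bucket/main.py\n
\n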
\n\n
\n
\ndagster_gcp.dataproc_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
projectId (dagster.StringSource)
\n

Required. Project ID for the project which the client acts on behalf of. Will\nbe passed when creating a dataset / job. If not passed, falls back to the default inferred\nfrom the environment.

\n
\n
region (dagster.StringSource)
\n

\n
clusterName (dagster.StringSource)
\n

Required. The cluster name. Cluster names within a project must be unique.\nNames of deleted clusters can be reused.

\n
\n
cluster_config (strict dict, optional)
\n

The cluster config.

\n
\nConfig Schema:
\n
masterConfig (strict dict, optional)
\n

Optional. The config settings for Compute Engine resources in an\ninstance group, such as a master or worker group.

\n
\nConfig Schema:
\n
accelerators (List[strict dict], optional)
\n

Optional. The Compute Engine accelerator\nconfiguration for these instances. Beta Feature: This feature is\nstill under development. It may be changed before final release.

\n
\n
numInstances (Int, optional)
\n

Optional. The number of VM instances in the instance\ngroup. For master instance groups, must be set to 1.

\n
\n
diskConfig (strict dict, optional)
\n

Specifies the config of disk options for a group of\nVM instances.

\n
\nConfig Schema:
\n
numLocalSsds (Int, optional)
\n

Optional. Number of attached SSDs, from 0\nto 4 (default is 0). If SSDs are not attached, the boot\ndisk is used to store runtime logs and HDFS\n(https://hadoop.apache.org/docs/r1.2.1/hdfs_user_guide.html)\ndata. If one or more SSDs are attached, this runtime\nbulk data is spread across them, and the boot disk\ncontains only basic config and installed binaries.

\n
\n
bootDiskSizeGb (Int, optional)
\n

Optional. Size in GB of the boot disk\n(default is 500GB).

\n
\n
bootDiskType (String, optional)
\n

Optional. Type of the boot disk (default\nis \u201cpd-standard\u201d). Valid values: \u201cpd-ssd\u201d (Persistent\nDisk Solid State Drive) or \u201cpd-standard\u201d (Persistent\nDisk Hard Disk Drive).

\n
\n
\n
\n
managedGroupConfig (strict dict, optional)
\n

Specifies the resources used to actively manage an\ninstance group.

\n
\n
isPreemptible (Bool, optional)
\n

Optional. Specifies that this instance group contains\npreemptible instances.

\n
\n
imageUri (String, optional)
\n

Optional. The Compute Engine image resource used for\ncluster instances. It can be specified or may be inferred from\nSoftwareConfig.image_version.

\n
\n
machineTypeUri (String, optional)
\n

Optional. The Compute Engine machine type used for\ncluster instances. A full URL, partial URI, or short name are valid.\nExamples:\nhttps://www.googleapis.com/compute/v1/projects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2\nprojects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2\nn1-standard-2. Auto Zone Exception: If you are using the Cloud\nDataproc Auto Zone Placement feature, you must use the short name of\nthe machine type resource, for example, n1-standard-2.

\n
\n
\n
\n
secondaryWorkerConfig (strict dict, optional)
\n

Optional. The config settings for Compute Engine resources in an\ninstance group, such as a master or worker group.

\n
\nConfig Schema:
\n
accelerators (List[strict dict], optional)
\n

Optional. The Compute Engine accelerator\nconfiguration for these instances. Beta Feature: This feature is\nstill under development. It may be changed before final release.

\n
\n
numInstances (Int, optional)
\n

Optional. The number of VM instances in the instance\ngroup. For master instance groups, must be set to 1.

\n
\n
diskConfig (strict dict, optional)
\n

Specifies the config of disk options for a group of\nVM instances.

\n
\nConfig Schema:
\n
numLocalSsds (Int, optional)
\n

Optional. Number of attached SSDs, from 0\nto 4 (default is 0). If SSDs are not attached, the boot\ndisk is used to store runtime logs and HDFS\n(https://hadoop.apache.org/docs/r1.2.1/hdfs_user_guide.html)\ndata. If one or more SSDs are attached, this runtime\nbulk data is spread across them, and the boot disk\ncontains only basic config and installed binaries.

\n
\n
bootDiskSizeGb (Int, optional)
\n

Optional. Size in GB of the boot disk\n(default is 500GB).

\n
\n
bootDiskType (String, optional)
\n

Optional. Type of the boot disk (default\nis \u201cpd-standard\u201d). Valid values: \u201cpd-ssd\u201d (Persistent\nDisk Solid State Drive) or \u201cpd-standard\u201d (Persistent\nDisk Hard Disk Drive).

\n
\n
\n
\n
managedGroupConfig (strict dict, optional)
\n

Specifies the resources used to actively manage an\ninstance group.

\n
\n
isPreemptible (Bool, optional)
\n

Optional. Specifies that this instance group contains\npreemptible instances.

\n
\n
imageUri (String, optional)
\n

Optional. The Compute Engine image resource used for\ncluster instances. It can be specified or may be inferred from\nSoftwareConfig.image_version.

\n
\n
machineTypeUri (String, optional)
\n

Optional. The Compute Engine machine type used for\ncluster instances. A full URL, partial URI, or short name are valid.\nExamples:\nhttps://www.googleapis.com/compute/v1/projects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2\nprojects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2\nn1-standard-2. Auto Zone Exception: If you are using the Cloud\nDataproc Auto Zone Placement feature, you must use the short name of\nthe machine type resource, for example, n1-standard-2.

\n
\n
\n
\n
encryptionConfig (strict dict, optional)
\n

Encryption settings for the cluster.

\n
\nConfig Schema:
\n
gcePdKmsKeyName (String, optional)
\n

Optional. The Cloud KMS key name to use for PD disk\nencryption for all instances in the cluster.

\n
\n
\n
\n
securityConfig (strict dict, optional)
\n

Security related configuration, including Kerberos.

\n
\nConfig Schema:
\n
kerberosConfig (strict dict, optional)
\n

Specifies Kerberos related configuration.

\n
\nConfig Schema:
\n
truststorePasswordUri (String, optional)
\n

Optional. The Cloud Storage URI of a KMS\nencrypted file containing the password to the user\nprovided truststore. For the self-signed certificate,\nthis password is generated by Dataproc.

\n
\n
enableKerberos (Bool, optional)
\n

Optional. Flag to indicate whether to\nKerberize the cluster.

\n
\n
truststoreUri (String, optional)
\n

Optional. The Cloud Storage URI of the\ntruststore file used for SSL encryption. If not\nprovided, Dataproc will provide a self-signed\ncertificate.

\n
\n
crossRealmTrustRealm (String, optional)
\n

Optional. The remote realm the Dataproc\non-cluster KDC will trust, should the user enable cross\nrealm trust.

\n
\n
rootPrincipalPasswordUri (String, optional)
\n

Required. The Cloud Storage URI of a KMS\nencrypted file containing the root principal\npassword.

\n
\n
kmsKeyUri (String, optional)
\n

Required. The uri of the KMS key used to\nencrypt various sensitive files.

\n
\n
crossRealmTrustKdc (String, optional)
\n

Optional. The KDC (IP or hostname) for\nthe remote trusted realm in a cross realm trust\nrelationship.

\n
\n
crossRealmTrustSharedPasswordUri (String, optional)
\n

Optional. The Cloud Storage URI of a KMS\nencrypted file containing the shared password between\nthe on-cluster Kerberos realm and the remote trusted\nrealm, in a cross realm trust relationship.

\n
\n
tgtLifetimeHours (Int, optional)
\n

Optional. The lifetime of the ticket\ngranting ticket, in hours. If not specified, or user\nspecifies 0, then default value 10 will be used.

\n
\n
keystoreUri (String, optional)
\n

Optional. The Cloud Storage URI of the\nkeystore file used for SSL encryption. If not provided,\nDataproc will provide a self-signed certificate.

\n
\n
keyPasswordUri (String, optional)
\n

Optional. The Cloud Storage URI of a KMS\nencrypted file containing the password to the user\nprovided key. For the self-signed certificate, this\npassword is generated by Dataproc.

\n
\n
keystorePasswordUri (String, optional)
\n

Optional. The Cloud Storage URI of a KMS\nencrypted file containing the password to the user\nprovided keystore. For the self-signed certificate, this\npassword is generated by Dataproc.

\n
\n
crossRealmTrustAdminServer (String, optional)
\n

Optional. The admin server (IP or\nhostname) for the remote trusted realm in a cross realm\ntrust relationship.

\n
\n
kdcDbKeyUri (String, optional)
\n

Optional. The Cloud Storage URI of a KMS\nencrypted file containing the master key of the KDC\ndatabase.

\n
\n
\n
\n
\n
\n
initializationActions (List[strict dict], optional)
\n

Optional. Commands to execute on each node after config is\ncompleted. By default, executables are run on master and all worker nodes. You\ncan test a node\u2019s role metadata to run an executable on a master or worker\nnode, as shown below using curl (you can also use wget): ROLE=$(curl -H\nMetadata-Flavor:Google\nhttp://metadata/computeMetadata/v1/instance/attributes/dataproc-role) if [[\n\u201c${ROLE}\u201d == \u2018Master\u2019 ]]; then \u2026 master specific actions \u2026 else \u2026\nworker specific actions \u2026 fi

\n
\n
configBucket (String, optional)
\n

Optional. A Google Cloud Storage bucket used to stage job\ndependencies, config files, and job driver console output. If you do not specify\na staging bucket, Cloud Dataproc will determine a Cloud Storage location (US,\nASIA, or EU) for your cluster\u2019s staging bucket according to the Google Compute\nEngine zone where your cluster is deployed, and then create and manage this\nproject-level, per-location bucket (see Cloud Dataproc staging bucket).

\n
\n
workerConfig (strict dict, optional)
\n

Optional. The config settings for Compute Engine resources in an\ninstance group, such as a master or worker group.

\n
\nConfig Schema:
\n
accelerators (List[strict dict], optional)
\n

Optional. The Compute Engine accelerator\nconfiguration for these instances. Beta Feature: This feature is\nstill under development. It may be changed before final release.

\n
\n
numInstances (Int, optional)
\n

Optional. The number of VM instances in the instance\ngroup. For master instance groups, must be set to 1.

\n
\n
diskConfig (strict dict, optional)
\n

Specifies the config of disk options for a group of\nVM instances.

\n
\nConfig Schema:
\n
numLocalSsds (Int, optional)
\n

Optional. Number of attached SSDs, from 0\nto 4 (default is 0). If SSDs are not attached, the boot\ndisk is used to store runtime logs and HDFS\n(https://hadoop.apache.org/docs/r1.2.1/hdfs_user_guide.html)\ndata. If one or more SSDs are attached, this runtime\nbulk data is spread across them, and the boot disk\ncontains only basic config and installed binaries.

\n
\n
bootDiskSizeGb (Int, optional)
\n

Optional. Size in GB of the boot disk\n(default is 500GB).

\n
\n
bootDiskType (String, optional)
\n

Optional. Type of the boot disk (default\nis \u201cpd-standard\u201d). Valid values: \u201cpd-ssd\u201d (Persistent\nDisk Solid State Drive) or \u201cpd-standard\u201d (Persistent\nDisk Hard Disk Drive).

\n
\n
\n
\n
managedGroupConfig (strict dict, optional)
\n

Specifies the resources used to actively manage an\ninstance group.

\n
\n
isPreemptible (Bool, optional)
\n

Optional. Specifies that this instance group contains\npreemptible instances.

\n
\n
imageUri (String, optional)
\n

Optional. The Compute Engine image resource used for\ncluster instances. It can be specified or may be inferred from\nSoftwareConfig.image_version.

\n
\n
machineTypeUri (String, optional)
\n

Optional. The Compute Engine machine type used for\ncluster instances. A full URL, partial URI, or short name are valid.\nExamples:\nhttps://www.googleapis.com/compute/v1/projects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2\nprojects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2\nn1-standard-2. Auto Zone Exception: If you are using the Cloud\nDataproc Auto Zone Placement feature, you must use the short name of\nthe machine type resource, for example, n1-standard-2.

\n
\n
\n
\n
gceClusterConfig (strict dict, optional)
\n

Common config settings for resources of Compute Engine cluster\ninstances, applicable to all instances in the cluster.

\n
\nConfig Schema:
\n
networkUri (String, optional)
\n

Optional. The Compute Engine network to be used for\nmachine communications. Cannot be specified with subnetwork_uri. If\nneither network_uri nor subnetwork_uri is specified, the \u201cdefault\u201d\nnetwork of the project is used, if it exists. Cannot be a \u201cCustom\nSubnet Network\u201d (see Using Subnetworks for more information). A full\nURL, partial URI, or short name are valid. Examples:\nhttps://www.googleapis.com/compute/v1/projects/[project_id]/regions/global/default\nprojects/[project_id]/regions/global/default default

\n
\n
zoneUri (String, optional)
\n

Optional. The zone where the Compute Engine cluster\nwill be located. On a create request, it is required in the \u201cglobal\u201d\nregion. If omitted in a non-global Cloud Dataproc region, the\nservice will pick a zone in the corresponding Compute Engine region.\nOn a get request, zone will always be present. A full URL, partial\nURI, or short name are valid. Examples:\nhttps://www.googleapis.com/compute/v1/projects/[project_id]/zones/[zone]\nprojects/[project_id]/zones/[zone] us-central1-f

\n
\n
metadata (permissive dict, optional)
\n

The Compute Engine metadata entries to add to all\ninstances (see Project and instance metadata\n(https://cloud.google.com/compute/docs/storing-retrieving-metadata#project_and_instance_metadata)).

\n
\n
internalIpOnly (Bool, optional)
\n

Optional. If true, all instances in the cluster will\nonly have internal IP addresses. By default, clusters are not\nrestricted to internal IP addresses, and will have ephemeral\nexternal IP addresses assigned to each instance. This\ninternal_ip_only restriction can only be enabled for subnetwork\nenabled networks, and all off-cluster dependencies must be\nconfigured to be accessible without external IP addresses.

\n
\n
serviceAccountScopes (List[String], optional)
\n

Optional. The URIs of service account scopes to be\nincluded in Compute Engine instances. The following base set of\nscopes is always included:\nhttps://www.googleapis.com/auth/cloud.useraccounts.readonly\nhttps://www.googleapis.com/auth/devstorage.read_write\nhttps://www.googleapis.com/auth/logging.write. If no scopes are\nspecified, the following defaults are also provided:\nhttps://www.googleapis.com/auth/bigquery\nhttps://www.googleapis.com/auth/bigtable.admin.table\nhttps://www.googleapis.com/auth/bigtable.data\nhttps://www.googleapis.com/auth/devstorage.full_control

\n
\n
tags (List[String], optional)
\n

The Compute Engine tags to add to all instances (see\nTagging instances).

\n
\n
serviceAccount (String, optional)
\n

Optional. The service account of the instances.\nDefaults to the default Compute Engine service account. Custom\nservice accounts need permissions equivalent to the following IAM\nroles: roles/logging.logWriter roles/storage.objectAdmin (see\nhttps://cloud.google.com/compute/docs/access/service-accounts#custom_service_accounts\nfor more information). Example:\n[account_id]@[project_id].iam.gserviceaccount.com

\n
\n
subnetworkUri (String, optional)
\n

Optional. The Compute Engine subnetwork to be used\nfor machine communications. Cannot be specified with network_uri. A\nfull URL, partial URI, or short name are valid. Examples:\nhttps://www.googleapis.com/compute/v1/projects/[project_id]/regions/us-east1/subnetworks/sub0\nprojects/[project_id]/regions/us-east1/subnetworks/sub0 sub0

\n
\n
\n
\n
softwareConfig (strict dict, optional)
\n

Specifies the selection and config of software inside the\ncluster.

\n
\nConfig Schema:
\n
properties (permissive dict, optional)
\n

Optional. The properties to set on daemon config\nfiles. Property keys are specified in prefix:property format, for\nexample core:hadoop.tmp.dir. The following are supported prefixes\nand their mappings: capacity-scheduler: capacity-scheduler.xml core:\ncore-site.xml distcp: distcp-default.xml hdfs: hdfs-site.xml hive:\nhive-site.xml mapred: mapred-site.xml pig: pig.properties spark:\nspark-defaults.conf yarn: yarn-site.xml. For more information, see\nCluster properties.

\n
\n
optionalComponents (List[Component], optional)
\n

The set of optional components to activate on the\ncluster.

\n
\n
imageVersion (String, optional)
\n

Optional. The version of software inside the cluster.\nIt must be one of the supported Cloud Dataproc Versions, such as\n\u201c1.2\u201d (including a subminor version, such as \u201c1.2.29\u201d), or the\n\u201cpreview\u201d version. If unspecified, it defaults to the latest Debian\nversion.

\n
\n
\n
\n
\n
\n
\n
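
A minimal resource config sketch (assuming the resource is bound under the dataproc key; the project, region, and cluster name are placeholders):

\n
resources:\n  dataproc:\n    config:\n      projectId: my-project\n      region: us-central1\n      clusterName: my-cluster\n
\n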
\n\n
\n
\n

GCS\u00b6

\n
\n
\ndagster_gcp.gcs_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project (Union[dagster.StringSource, None], optional)
\n

Project name

\n
\n
\n
\n\n
\n
\nclass dagster_gcp.GCSFileHandle(gcs_bucket, gcs_key)[source]\u00b6
\n

A reference to a file on GCS.

\n
\n
\nproperty gcs_bucket\u00b6
\n

The name of the GCS bucket.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty gcs_key\u00b6
\n

The GCS key.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty gcs_path\u00b6
\n

The file\u2019s GCS URL.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty path_desc\u00b6
\n

The file\u2019s GCS URL.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n\n
\n
\ndagster_gcp.gcs_file_manager ResourceDefinition[source]\u00b6
\n

FileManager that provides abstract access to GCS.

\n

Implements the FileManager API.

\n
\n\n
\n
\ndagster_gcp.gcs.gcs_pickle_io_manager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
gcs_bucket (dagster.StringSource)
\n

\n
gcs_prefix (dagster.StringSource, optional)
\n

Default Value: \u2018dagster\u2019

\n
\n
\n

Persistent IO manager using GCS for storage.

\n

Serializes objects via pickling. Suitable for object storage for distributed executors, so long\nas each execution node has network connectivity and credentials for GCS and the backing bucket.

\n

Attach this resource definition to your job to make it available to your ops.

\n
@job(resource_defs={'io_manager': gcs_pickle_io_manager, 'gcs': gcs_resource, ...})\ndef my_job():\n    my_op()\n
\n
\n

You may configure this storage as follows:

\n
resources:\n    io_manager:\n        config:\n            gcs_bucket: my-cool-bucket\n            gcs_prefix: good/prefix-for-files-\n
\n
\n
\n\n
\n
\ndagster_gcp.gcs.gcs_pickle_asset_io_manager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
gcs_bucket (dagster.StringSource)
\n

\n
gcs_prefix (dagster.StringSource, optional)
\n

Default Value: \u2018dagster\u2019

\n
\n
\n

Persistent IO manager using GCS for storage, meant for use with software-defined assets.

\n

Each asset is assigned to a single filesystem path, so subsequent materializations of an asset\nwill overwrite previous materializations of that asset.

\n

Serializes objects via pickling. Suitable for object storage for distributed executors, so long\nas each execution node has network connectivity and credentials for GCS and the backing bucket.

\n

Attach this resource definition to your job to make it available to your ops.

\n
asset_group = AssetGroup(\n    assets...,\n    resource_defs={'io_manager': gcs_pickle_asset_io_manager, "gcs": gcs_resource, ...},\n)\n
\n
\n

You may configure this IO manager as follows:

\n
resources:\n    io_manager:\n        config:\n            gcs_bucket: my-cool-bucket\n            gcs_prefix: good/prefix-for-files-\n
\n
\n
\n\n
\n
\n

Legacy APIs\u00b6

\n
\n
\ndagster_gcp.bq_solid_for_queries(sql_queries)[source]\u00b6
\n

Executes BigQuery SQL queries.

\n

Expects a BQ client to be provisioned in resources as context.resources.bigquery.

\n
\n\n
\n
\ndagster_gcp.dataproc_solid(context)[source]\u00b6
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-gcp", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../dagster-ge/", "title": "Great Expectations (dagster-ge)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-docker/", "title": "Orchestration on Docker"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-ge", "Great Expectations (dagster-ge)", "N", "next"], ["sections/api/apidocs/libraries/dagster-docker", "Orchestration on Docker", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-gcp.rst.txt", "title": "GCP (dagster-gcp)", "toc": "\n"}, "dagster-ge": {"alabaster_version": "0.7.12", "body": "
\n

Great Expectations (dagster-ge)\u00b6

\n
\n
\ndagster_ge.ge_validation_solid_factory(name, datasource_name, suite_name, validation_operator_name=None, input_dagster_type=<dagster.core.types.dagster_type.DagsterType object>, batch_kwargs=None)[source]\u00b6
\n

Generates solids for interacting with GE.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 the name of the solid

  • \n
  • datasource_name (str) \u2013 the name of your DataSource, see your great_expectations.yml

  • \n
  • suite_name (str) \u2013 the name of your expectation suite, see your great_expectations.yml

  • \n
  • validation_operator_name (Optional[str]) \u2013 what validation operator to run \u2013 defaults to None,\nwhich generates an ephemeral validator.\nIf you want to save data docs, use \u2018action_list_operator\u2019.\nSee https://docs.greatexpectations.io/en/latest/reference/core_concepts/validation_operators_and_actions.html

  • \n
  • input_dagster_type (DagsterType) \u2013 the Dagster type used to type check the input to the\nsolid. Defaults to dagster_pandas.DataFrame.

  • \n
  • batch_kwargs (Optional[dict]) \u2013 overrides the batch_kwargs parameter when calling the\nge_data_context\u2019s get_batch method. Defaults to {\u201cdataset\u201d: dataset},\nwhere dataset is the input to the generated solid.

  • \n
\n
\n
Returns
\n

A solid that takes in a set of data and yields both an expectation with relevant metadata\nand an output with all the metadata (for user processing)

\n
\n
\n
\n\n
\n
\ndagster_ge.ge_validation_op_factory(name, datasource_name, suite_name, validation_operator_name=None, input_dagster_type=<dagster.core.types.dagster_type.DagsterType object>, batch_kwargs=None)[source]\u00b6
\n

Generates ops for interacting with GE.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 the name of the op

  • \n
  • datasource_name (str) \u2013 the name of your DataSource, see your great_expectations.yml

  • \n
  • suite_name (str) \u2013 the name of your expectation suite, see your great_expectations.yml

  • \n
  • validation_operator_name (Optional[str]) \u2013 what validation operator to run \u2013 defaults to\nNone, which generates an ephemeral validator. If you want to save data docs, use\n\u2018action_list_operator\u2019.\nSee https://docs.greatexpectations.io/en/latest/reference/core_concepts/validation_operators_and_actions.html

  • \n
  • input_dagster_type (DagsterType) \u2013 the Dagster type used to type check the input to the op.\nDefaults to dagster_pandas.DataFrame.

  • \n
  • batch_kwargs (Optional[dict]) \u2013 overrides the batch_kwargs parameter when calling the\nge_data_context\u2019s get_batch method. Defaults to {\u201cdataset\u201d: dataset}, where\ndataset is the input to the generated op.

  • \n
\n
\n
Returns
\n

An op that takes in a set of data and yields both an expectation with relevant metadata\nand an output with all the metadata (for user processing)

\n
\n
\n
\n\n
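A hedged sketch of wiring the generated op into a job follows. It assumes the op reads a Great Expectations DataContext from a ge_data_context resource (supplied here via ResourceDefinition.hardcoded_resource), and that the datasource name, suite name, and project path already exist; all of those names are placeholders:

import great_expectations as ge
import pandas as pd
from dagster import ResourceDefinition, job, op
from dagster_ge import ge_validation_op_factory

validate_df = ge_validation_op_factory(
    name="validate_df",
    datasource_name="my_datasource",        # placeholder
    suite_name="my_expectation_suite",      # placeholder
)

@op
def load_df():
    # Upstream op supplying the dataframe that the validation op consumes.
    return pd.DataFrame({"col_a": [1, 2, 3]})

@job(
    resource_defs={
        # Assumed resource key: the generated op pulls the GE DataContext from here.
        "ge_data_context": ResourceDefinition.hardcoded_resource(
            ge.data_context.DataContext("/path/to/great_expectations")  # placeholder path
        )
    }
)
def validation_job():
    validate_df(load_df())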
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-ge", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "", "next": {"link": "../dagster-github/", "title": "GitHub (dagster-github)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-gcp/", "title": "GCP (dagster-gcp)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-github", "GitHub (dagster-github)", "N", "next"], ["sections/api/apidocs/libraries/dagster-gcp", "GCP (dagster-gcp)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-ge.rst.txt", "title": "Great Expectations (dagster-ge)", "toc": "\n"}, "dagster-github": {"alabaster_version": "0.7.12", "body": "
\n

GitHub (dagster-github)\u00b6

\n

This library provides an integration with GitHub Apps, to support performing various automation\noperations within your GitHub repositories, using the tighter permission scopes that GitHub Apps\nallow compared to a personal access token.

\n

Presently, it provides a thin wrapper on the GitHub v4 GraphQL API.

\n

To use this integration, you\u2019ll first need to create a GitHub App for it.

\n
    \n
  1. Create App: Follow the instructions in\nhttps://developer.github.com/apps/quickstart-guides/setting-up-your-development-environment/. You will end up with a private key and App ID, which will be used when configuring the\ndagster-github resource. Note that you will need to grant your app the relevant permissions\nfor the API requests you want to make; for example, to post issues it will need read/write access\nfor the issues repository permission. More information on GitHub application permissions can be found\nhere

  2. \n
  3. Install App: Follow the instructions in\nhttps://developer.github.com/apps/quickstart-guides/setting-up-your-development-environment/#step-7-install-the-app-on-your-account

  4. \n
  5. Find your installation_id: You can pull this from the GitHub app administration page,\nhttps://github.com/apps/<app-name>/installations/<installation_id>. Note that if your app is\ninstalled more than once, you can also programmatically retrieve these IDs.

  6. \n
\n

Sharing your App ID and Installation ID is fine, but make sure that the Private Key for your app is\nstored securely.

\n
\n
\n

Posting Issues\u00b6

\n

Now, you can create issues in GitHub from Dagster with the GitHub resource:

\n
import os\n\nfrom dagster import job, op\nfrom dagster_github import github_resource\n\n\n@op(required_resource_keys={'github'})\ndef github_op(context):\n    context.resources.github.create_issue(\n        repo_name='dagster',\n        repo_owner='dagster-io',\n        title='Dagster\\'s first github issue',\n        body='this open source thing seems like a pretty good idea',\n    )\n\n@job(resource_defs={'github': github_resource})\ndef github_job():\n    github_op()\n\ngithub_job.execute_in_process(\n    run_config={'resources': {'github': {'config': {\n        "github_app_id": os.getenv('GITHUB_APP_ID'),\n        "github_app_private_rsa_key": os.getenv('GITHUB_PRIVATE_KEY'),\n        "github_installation_id": os.getenv('GITHUB_INSTALLATION_ID'),\n    }}}}\n)\n
\n
\n

Run the above code, and you\u2019ll see the issue appear in GitHub.\n

\n

GitHub Enterprise users can provide their hostname in the run config. Provide github_hostname\nas part of your GitHub config, as shown below.

\n
github_job.execute_in_process(\n    run_config={'resources': {'github': {'config': {\n        "github_app_id": os.getenv('GITHUB_APP_ID'),\n        "github_app_private_rsa_key": os.getenv('GITHUB_PRIVATE_KEY'),\n        "github_installation_id": os.getenv('GITHUB_INSTALLATION_ID'),\n        "github_hostname": os.getenv('GITHUB_HOSTNAME'),\n    }}}}\n)\n
\n
\n

By provisioning github_resource as a Dagster job resource, you can post to GitHub from\nwithin any op execution.

\n
\n
\n

Executing GraphQL queries\u00b6

\n
import os\n\nfrom dagster import job, op\nfrom dagster_github import github_resource\n\n\n@op(required_resource_keys={'github'})\ndef github_op(context):\n    context.resources.github.execute(\n        query="""\n        query get_repo_id($repo_name: String!, $repo_owner: String!) {\n            repository(name: $repo_name, owner: $repo_owner) {\n                id\n            }\n        }\n        """,\n        variables={"repo_name": "dagster", "repo_owner": "dagster-io"},\n    )\n\n@job(resource_defs={'github': github_resource})\ndef github_job():\n    github_op()\n\ngithub_job.execute_in_process(\n    run_config={'resources': {'github': {'config': {\n        "github_app_id": os.getenv('GITHUB_APP_ID'),\n        "github_app_private_rsa_key": os.getenv('GITHUB_PRIVATE_KEY'),\n        "github_installation_id": os.getenv('GITHUB_INSTALLATION_ID'),\n    }}}}\n)\n
\n
\n
\n
\ndagster_github.github_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
github_app_id (dagster.IntSource)
\n

Github Application ID, for more info see https://developer.github.com/apps/

\n
\n
github_app_private_rsa_key (dagster.StringSource)
\n

Github Application Private RSA key text, for more info see https://developer.github.com/apps/

\n
\n
github_installation_id (dagster.IntSource, optional)
\n

Github Application Installation ID, for more info see https://developer.github.com/apps/

\n
\n
github_hostname (dagster.StringSource, optional)
\n

Github hostname. Defaults to api.github.com, for more info see https://developer.github.com/apps/

\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-github", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../dagster-k8s/", "title": "Kubernetes (dagster-k8s)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-ge/", "title": "Great Expectations (dagster-ge)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-k8s", "Kubernetes (dagster-k8s)", "N", "next"], ["sections/api/apidocs/libraries/dagster-ge", "Great Expectations (dagster-ge)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-github.rst.txt", "title": "GitHub (dagster-github)", "toc": "\n"}, "dagster-graphql": {"alabaster_version": "0.7.12", "body": "
\n

GraphQL (dagster-graphql)\u00b6

\n
\n

Python Client\u00b6

\n
\n
\nclass dagster_graphql.DagsterGraphQLClient(hostname, port_number=None, transport=None, use_https=False)[source]\u00b6
\n

Official Dagster Python Client for GraphQL

\n

Utilizes the gql library to dispatch queries over HTTP to a remote Dagster GraphQL Server

\n

As of now, all operations on this client are synchronous.

\n

Intended usage:

\n
client = DagsterGraphQLClient("localhost", port_number=3000)\nstatus = client.get_run_status(**SOME_RUN_ID**)\n
\n
\n
\n
Parameters
\n
    \n
  • hostname (str) \u2013 Hostname for the Dagster GraphQL API, like localhost or\ndagit.dagster.YOUR_ORG_HERE.

  • \n
  • port_number (Optional[int], optional) \u2013 Optional port number to connect to on the host.\nDefaults to None.

  • \n
  • transport (Optional[Transport], optional) \u2013 A custom transport to use to connect to the\nGraphQL API with (e.g. for custom auth). Defaults to None.

  • \n
  • use_https (bool, optional) \u2013 Whether to use https in the URL connection string for the\nGraphQL API. Defaults to False.

  • \n
\n
\n
Raises
\n

ConnectionError \u2013 if the client cannot connect to the host.

\n
\n
\n
\n
\nget_run_status(run_id)[source]\u00b6
\n

Get the status of a given Pipeline Run

\n
\n
Parameters
\n

run_id (str) \u2013 run id of the requested pipeline run.

\n
\n
Raises
\n
\n
\n
Returns
\n

returns a status Enum describing the state of the requested pipeline run

\n
\n
Return type
\n

PipelineRunStatus

\n
\n
\n
\n\n
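For instance, a client-side script might poll get_run_status until the run reaches a terminal state. This is a minimal sketch: the run ID is a placeholder, and PipelineRunStatus is imported from dagster:

import time

from dagster import PipelineRunStatus
from dagster_graphql import DagsterGraphQLClient

client = DagsterGraphQLClient("localhost", port_number=3000)

run_id = "<some-run-id>"  # placeholder
while True:
    status = client.get_run_status(run_id)
    if status in (
        PipelineRunStatus.SUCCESS,
        PipelineRunStatus.FAILURE,
        PipelineRunStatus.CANCELED,
    ):
        break
    time.sleep(5)  # wait before polling again
print(f"Run {run_id} finished with status {status}")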
\n
\nreload_repository_location(repository_location_name)[source]\u00b6
\n

Reloads a Dagster Repository Location, which reloads all repositories in that repository location.

\n

This is useful in a variety of contexts, including refreshing Dagit without restarting\nthe server.

\n
\n
Parameters
\n

repository_location_name (str) \u2013 The name of the repository location

\n
\n
Returns
\n

Object with information about the result of the reload request

\n
\n
Return type
\n

ReloadRepositoryLocationInfo

\n
\n
\n
\n\n
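A minimal sketch of checking the reload result, using the ReloadRepositoryLocationStatus enum documented below; the location name is a placeholder:

from dagster_graphql import DagsterGraphQLClient, ReloadRepositoryLocationStatus

client = DagsterGraphQLClient("localhost", port_number=3000)

info = client.reload_repository_location("my_repository_location")  # placeholder name
if info.status == ReloadRepositoryLocationStatus.SUCCESS:
    print("Reload succeeded")
else:
    # failure_type and message are populated only on failure (see ReloadRepositoryLocationInfo below)
    print(f"Reload failed: {info.failure_type} - {info.message}")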
\n
\nshutdown_repository_location(repository_location_name)[source]\u00b6
\n

Shuts down the server that is serving metadata for the provided repository location.

\n

This is primarily useful when you want the server to be restarted by the compute environment\nin which it is running (for example, in Kubernetes, the pod in which the server is running\nwill automatically restart when the server is shut down, and the repository metadata will\nbe reloaded).

\n
\n
Parameters
\n

repository_location_name (str) \u2013 The name of the repository location

\n
\n
Returns
\n

Object with information about the result of the shutdown request

\n
\n
Return type
\n

ShutdownRepositoryLocationInfo

\n
\n
\n
\n\n
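A minimal usage sketch; the location name is a placeholder, and the returned object is the ShutdownRepositoryLocationInfo described in the return type above:

from dagster_graphql import DagsterGraphQLClient

client = DagsterGraphQLClient("localhost", port_number=3000)

# After shutdown, an orchestrator such as Kubernetes is expected to restart the
# code server, which causes the repository metadata to be reloaded.
info = client.shutdown_repository_location("my_repository_location")
print(info)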
\n
\nsubmit_job_execution(job_name, repository_location_name=None, repository_name=None, run_config=None, tags=None, op_selection=None)[source]\u00b6
\n

Submits a job with attached configuration for execution.

\n
\n
Parameters
\n
    \n
  • job_name (str) \u2013 The job\u2019s name

  • \n
  • repository_location_name (Optional[str]) \u2013 The name of the repository location where\nthe job is located. If omitted, the client will try to infer the repository location\nfrom the available options on the Dagster deployment. Defaults to None.

  • \n
  • repository_name (Optional[str]) \u2013 The name of the repository where the job is located.\nIf omitted, the client will try to infer the repository from the available options\non the Dagster deployment. Defaults to None.

  • \n
  • run_config (Optional[Dict[str, Any]]) \u2013 This is the run config to execute the job with.\nNote that runConfigData is any-typed in the GraphQL type system. This type is used when passing in\nan arbitrary object for run config. However, it must conform to the constraints of the config\nschema for this job. If it does not, the client will throw a DagsterGraphQLClientError with a message of\nJobConfigValidationInvalid. Defaults to None.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 A set of tags to add to the job execution.

  • \n
\n
\n
Raises
\n
    \n
  • DagsterGraphQLClientError("InvalidStepError", invalid_step_key) \u2013 the job has an invalid step

  • \n
  • DagsterGraphQLClientError("InvalidOutputError", body=error_object) \u2013 some solid has an invalid output within the job.\n The error_object is of type dagster_graphql.InvalidOutputErrorInfo.

  • \n
  • DagsterGraphQLClientError("RunConflict", message) \u2013 a DagsterRunConflict occured during execution.\n This indicates that a conflicting job run already exists in run storage.

  • \n
  • DagsterGraphQLClientError("PipelineConfigurationInvalid", invalid_step_key) \u2013 the run_config is not in the expected format\n for the job

  • \n
  • DagsterGraphQLClientError("JobNotFoundError", message) \u2013 the requested job does not exist

  • \n
  • DagsterGraphQLClientError("PythonError", message) \u2013 an internal framework error occurred

  • \n
\n
\n
Returns
\n

run id of the submitted job run

\n
\n
Return type
\n

str

\n
\n
\n
\n\n
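For example, a submission with run config and basic error handling might look like the following sketch; the job name, op name, and config values are placeholders:

from dagster_graphql import DagsterGraphQLClient, DagsterGraphQLClientError

client = DagsterGraphQLClient("localhost", port_number=3000)

try:
    new_run_id = client.submit_job_execution(
        "my_job",  # placeholder job name
        run_config={"ops": {"my_op": {"config": {"some_key": "some_value"}}}},
        tags={"triggered_by": "graphql_client"},
    )
    print(f"Submitted run {new_run_id}")
except DagsterGraphQLClientError as exc:
    # The error class name (e.g. "JobNotFoundError") is carried in the exception
    # arguments, per the Raises list above.
    print(f"Submission failed: {exc}")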
\n
\nsubmit_pipeline_execution(pipeline_name, repository_location_name=None, repository_name=None, run_config=None, mode=None, preset=None, tags=None, solid_selection=None)[source]\u00b6
\n

Submits a Pipeline with attached configuration for execution.

\n
\n
Parameters
\n
    \n
  • pipeline_name (str) \u2013 The pipeline\u2019s name

  • \n
  • repository_location_name (Optional[str], optional) \u2013 The name of the repository location where\nthe pipeline is located. If omitted, the client will try to infer the repository location\nfrom the available options on the Dagster deployment. Defaults to None.

  • \n
  • repository_name (Optional[str], optional) \u2013 The name of the repository where the pipeline is located.\nIf omitted, the client will try to infer the repository from the available options\non the Dagster deployment. Defaults to None.

  • \n
  • run_config (Optional[Any], optional) \u2013 This is the run config to execute the pipeline with.\nNote that runConfigData is any-typed in the GraphQL type system. This type is used when passing in\nan arbitrary object for run config. However, it must conform to the constraints of the config\nschema for this pipeline. If it does not, the client will throw a DagsterGraphQLClientError with a message of\nRunConfigValidationInvalid. Defaults to None.

  • \n
  • mode (Optional[str], optional) \u2013 The mode to run the pipeline with. If you have not\ndefined any custom modes for your pipeline, the default mode is \u201cdefault\u201d. Defaults to None.

  • \n
  • preset (Optional[str], optional) \u2013 The name of a pre-defined preset to use instead of a\nrun config. Defaults to None.

  • \n
  • tags (Optional[Dict[str, Any]], optional) \u2013 A set of tags to add to the pipeline execution.

  • \n
\n
\n
Raises
\n
    \n
  • DagsterGraphQLClientError("InvalidStepError", invalid_step_key) \u2013 the pipeline has an invalid step

  • \n
  • DagsterGraphQLClientError("InvalidOutputError", body=error_object) \u2013 some solid has an invalid output within the pipeline.\n The error_object is of type dagster_graphql.InvalidOutputErrorInfo.

  • \n
  • DagsterGraphQLClientError("ConflictingExecutionParamsError", invalid_step_key) \u2013 a preset and a run_config & mode are present\n that conflict with one another

  • \n
  • DagsterGraphQLClientError("PresetNotFoundError", message) \u2013 if the provided preset name is not found

  • \n
  • DagsterGraphQLClientError("RunConflict", message) \u2013 a DagsterRunConflict occured during execution.\n This indicates that a conflicting pipeline run already exists in run storage.

  • \n
  • DagsterGraphQLClientError("PipelineConfigurationInvalid", invalid_step_key) \u2013 the run_config is not in the expected format\n for the pipeline

  • \n
  • DagsterGraphQLClientError("PipelineNotFoundError", message) \u2013 the requested pipeline does not exist

  • \n
  • DagsterGraphQLClientError("PythonError", message) \u2013 an internal framework error occurred

  • \n
\n
\n
Returns
\n

run id of the submitted pipeline run

\n
\n
Return type
\n

str

\n
\n
\n
\n\n
\n\n
\n
\nexception dagster_graphql.DagsterGraphQLClientError(*args, body=None)[source]\u00b6
\n
\n\n
\n
\nclass dagster_graphql.InvalidOutputErrorInfo(step_key, invalid_output_name)[source]\u00b6
\n

This class gives information about an InvalidOutputError from submitting a pipeline for execution\nfrom GraphQL.

\n
\n
Parameters
\n
    \n
  • step_key (str) \u2013 key of the step that failed

  • \n
  • invalid_output_name (str) \u2013 the name of the invalid output from the given step

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster_graphql.ReloadRepositoryLocationInfo(status, failure_type=None, message=None)[source]\u00b6
\n

This class gives information about the result of reloading\na Dagster repository location with a GraphQL mutation.

\n
\n
Parameters
\n
    \n
  • status (ReloadRepositoryLocationStatus) \u2013 The status of the reload repository location mutation

  • \n
  • failure_type \u2013 (Optional[str], optional): the failure type if status == ReloadRepositoryLocationStatus.FAILURE.\nCan be one of ReloadNotSupported, RepositoryLocationNotFound, or RepositoryLocationLoadFailure. Defaults to None.

  • \n
  • message (Optional[str], optional) \u2013 the failure message/reason if\nstatus == ReloadRepositoryLocationStatus.FAILURE. Defaults to None.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster_graphql.ReloadRepositoryLocationStatus(value)[source]\u00b6
\n

This enum describes the status of a GraphQL mutation to reload a Dagster repository location

\n
\n
Parameters
\n

Enum (str) \u2013 can be either ReloadRepositoryLocationStatus.SUCCESS\nor ReloadRepositoryLocationStatus.FAILURE.

\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-graphql", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": null, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagstermill/", "title": "Dagstermill"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagstermill", "Dagstermill", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-graphql.rst.txt", "title": "GraphQL (dagster-graphql)", "toc": "\n"}, "dagster-k8s": {"alabaster_version": "0.7.12", "body": "
\n

Kubernetes (dagster-k8s)\u00b6

\n

See also the Kubernetes deployment guide.

\n

This library contains utilities for running Dagster with Kubernetes. This includes a Python API\nallowing Dagit to launch runs as Kubernetes Jobs, as well as a Helm chart you can use as the basis\nfor a Dagster deployment on a Kubernetes cluster.

\n
\n
\n

APIs\u00b6

\n
\n
\ndagster_k8s.K8sRunLauncher RunLauncher[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
instance_config_map (dagster.StringSource)
\n

The name of an existing Volume to mount into the pod in order to provide a ConfigMap for the Dagster instance. This Volume should contain a dagster.yaml with appropriate values for run storage, event log storage, etc.

\n
\n
postgres_password_secret (dagster.StringSource, optional)
\n

The name of the Kubernetes Secret where the postgres password can be retrieved. Will be mounted and supplied as an environment variable to the Job Pod. The Secret must contain the key "postgresql-password" which will be exposed in the Job environment as the environment variable DAGSTER_PG_PASSWORD.

\n
\n
dagster_home (dagster.StringSource, optional)
\n

The location of DAGSTER_HOME in the Job container; this is where the dagster.yaml file will be mounted from the instance ConfigMap specified here. Defaults to /opt/dagster/dagster_home.

\n

Default Value: \u2018/opt/dagster/dagster_home\u2019

\n
\n
load_incluster_config (Bool, optional)
\n

Set this value if you are running the launcher\nwithin a k8s cluster. If True, we assume the launcher is running within the target\ncluster and load config using kubernetes.config.load_incluster_config. Otherwise,\nwe will use the k8s config specified in kubeconfig_file (using\nkubernetes.config.load_kube_config) or fall back to the default kubeconfig.

\n

Default Value: True

\n
\n
kubeconfig_file (Union[String, None], optional)
\n

The kubeconfig file from which to load config. Defaults to using the default kubeconfig.

\n

Default Value: None

\n
\n
fail_pod_on_run_failure (Bool, optional)
\n

Whether the launched Kubernetes Jobs and Pods should fail if the Dagster run fails

\n
\n
job_image (Union[dagster.StringSource, None], optional)
\n

Docker image to use for launched Jobs. If this field is empty, the image that was used to originally load the Dagster repository will be used. (Ex: \u201cmycompany.com/dagster-k8s-image:latest\u201d).

\n
\n
image_pull_policy (Union[dagster.StringSource, None], optional)
\n

Image pull policy to set on the launched task Job Pods. Defaults to \u201cIfNotPresent\u201d.

\n
\n
image_pull_secrets (Union[List[strict dict], None], optional)
\n

(Advanced) Specifies that Kubernetes should get the credentials from the Secrets named in this list.

\n
\n
service_account_name (Union[dagster.StringSource, None], optional)
\n

(Advanced) Override the name of the Kubernetes service account under which to run the Job.

\n
\n
env_config_maps (Union[List[dagster.StringSource], None], optional)
\n

A list of custom ConfigMapEnvSource names from which to draw environment variables (using envFrom) for the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/#define-an-environment-variable-for-a-container

\n
\n
env_secrets (Union[List[dagster.StringSource], None], optional)
\n

A list of custom Secret names from which to draw environment variables (using envFrom) for the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
env_vars (Union[List[String], None], optional)
\n

A list of environment variables to inject into the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
volume_mounts (List[strict dict], optional)
\n

A list of volume mounts to include in the job\u2019s container. Default: []. See: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volumemount-v1-core

\n

Default Value: []

\n
\n
volumes (List[permissive dict], optional)
\n

A list of volumes to include in the Job\u2019s Pod. Default: []. For the many possible volume source types that can be included, see: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volume-v1-core

\n

Default Value: []

\n
\n
labels (permissive dict, optional)
\n

Additional labels that should be included in the Job\u2019s Pod. See: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels

\n
\n
job_namespace (dagster.StringSource, optional)
\n

Default Value: \u2018default\u2019

\n
\n
\n

RunLauncher that starts a Kubernetes Job for each Dagster job run.

\n

Encapsulates each run in a separate, isolated invocation of dagster-graphql.

\n

You can configure a Dagster instance to use this RunLauncher by adding a section to your\ndagster.yaml like the following:

\n
run_launcher:\n  module: dagster_k8s.launcher\n  class: K8sRunLauncher\n  config:\n    service_account_name: your_service_account\n    job_image: my_project/dagster_image:latest\n    instance_config_map: dagster-instance\n    postgres_password_secret: dagster-postgresql-secret\n
\n
\n
\n\n
\n
\ndagster_k8s.k8s_job_executor ExecutorDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
job_image (Union[dagster.StringSource, None], optional)
\n

Docker image to use for launched Jobs. If this field is empty, the image that was used to originally load the Dagster repository will be used. (Ex: \u201cmycompany.com/dagster-k8s-image:latest\u201d).

\n
\n
image_pull_policy (Union[dagster.StringSource, None], optional)
\n

Image pull policy to set on the launched task Job Pods. Defaults to \u201cIfNotPresent\u201d.

\n
\n
image_pull_secrets (Union[List[strict dict], None], optional)
\n

(Advanced) Specifies that Kubernetes should get the credentials from the Secrets named in this list.

\n
\n
service_account_name (Union[dagster.StringSource, None], optional)
\n

(Advanced) Override the name of the Kubernetes service account under which to run the Job.

\n
\n
env_config_maps (Union[List[dagster.StringSource], None], optional)
\n

A list of custom ConfigMapEnvSource names from which to draw environment variables (using envFrom) for the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/#define-an-environment-variable-for-a-container

\n
\n
env_secrets (Union[List[dagster.StringSource], None], optional)
\n

A list of custom Secret names from which to draw environment variables (using envFrom) for the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
env_vars (Union[List[String], None], optional)
\n

A list of environment variables to inject into the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
volume_mounts (List[strict dict], optional)
\n

A list of volume mounts to include in the job\u2019s container. Default: []. See: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volumemount-v1-core

\n

Default Value: []

\n
\n
volumes (List[permissive dict], optional)
\n

A list of volumes to include in the Job\u2019s Pod. Default: []. For the many possible volume source types that can be included, see: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volume-v1-core

\n

Default Value: []

\n
\n
labels (permissive dict, optional)
\n

Additional labels that should be included in the Job\u2019s Pod. See: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels

\n
\n
job_namespace (dagster.StringSource, optional)
\n

\n
retries (selector, optional)
\n
\nDefault Value:
{\n    "enabled": {}\n}\n
\n
\n
\nConfig Schema:
\n
enabled (strict dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
disabled (strict dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
\n

Executor which launches steps as Kubernetes Jobs.

\n

To use the k8s_job_executor, set it as the executor_def when defining a job:

\n
from dagster import job\nfrom dagster_k8s import k8s_job_executor\n\n@job(executor_def=k8s_job_executor)\ndef k8s_job():\n    pass\n
\n
\n

Then you can configure the executor with run config as follows:

\n
execution:\n  config:\n    job_namespace: 'some-namespace'\n    image_pull_policy: ...\n    image_pull_secrets: ...\n    service_account_name: ...\n    env_config_maps: ...\n    env_secrets: ...\n    env_vars: ...\n    job_image: ... # leave out if using userDeployments\n
\n
\n

Configuration set on the Kubernetes Jobs and Pods created by the K8sRunLauncher will also be\nset on Kubernetes Jobs and Pods created by the k8s_job_executor.

\n
\n\n
\n

Python API\u00b6

\n

The K8sRunLauncher allows Dagit instances to be configured to launch new runs by starting\nper-run Kubernetes Jobs. To configure the K8sRunLauncher, your dagster.yaml should\ninclude a section like:

\n
run_launcher:\n  module: dagster_k8s.launcher\n  class: K8sRunLauncher\n  config:\n    image_pull_secrets:\n    service_account_name: dagster\n    job_image: "my-company.com/image:latest"\n    dagster_home: "/opt/dagster/dagster_home"\n    postgres_password_secret: "dagster-postgresql-secret"\n    image_pull_policy: "IfNotPresent"\n    job_namespace: "dagster"\n    instance_config_map: "dagster-instance"\n    env_config_maps:\n      - "dagster-k8s-job-runner-env"\n    env_secrets:\n      - "dagster-k8s-some-secret"\n
\n
\n
\n
\n

Helm chart\u00b6

\n

For local dev (e.g., on kind or minikube):

\n
helm install \\\n    --set dagit.image.repository="dagster.io/buildkite-test-image" \\\n    --set dagit.image.tag="py37-latest" \\\n    --set job_runner.image.repository="dagster.io/buildkite-test-image" \\\n    --set job_runner.image.tag="py37-latest" \\\n    --set imagePullPolicy="IfNotPresent" \\\n    dagster \\\n    helm/dagster/\n
\n
\n

Upon installation, the Helm chart will provide instructions for port forwarding Dagit and Flower (if\nconfigured).

\n
\n
\n

Running tests\u00b6

\n

To run the unit tests:

\n
pytest -m "not integration"\n
\n
\n

To run the integration tests, you must have Docker,\nkind,\nand helm installed.

\n

On macOS:

\n
brew install kind\nbrew install helm\n
\n
\n

Docker must be running.

\n

You may experience slow first test runs thanks to image pulls (run pytest -svv --fulltrace for\nvisibility). Building images and loading them to the kind cluster is slow, and there is\nno visibility into the progress of the load.

\n

NOTE: This process is quite slow, as it requires bootstrapping a local kind cluster with\nDocker images and the dagster-k8s Helm chart. For faster development, you can either:

\n
    \n
  1. Keep a warm kind cluster

  2. \n
  3. Use a remote K8s cluster, e.g. via AWS EKS or GCP GKE

  4. \n
\n

Instructions are below.

\n
\n

Faster local development (with kind)\u00b6

\n

You may find that the loop of kind cluster creation, image loading, and Helm chart installation\nis too slow for effective local dev.

\n

You may bypass cluster creation and image loading in the following way. First add the --no-cleanup\nflag to your pytest invocation:

\n
pytest --no-cleanup -s -vvv -m "not integration"\n
\n
\n

The tests will run as before, but the kind cluster will be left running after the tests are completed.

\n

For subsequent test runs, you can run:

\n
pytest --kind-cluster="cluster-d9971c84d44d47f382a2928c8c161faa" --existing-helm-namespace="dagster-test-95590a" -s -vvv -m "not integration"\n
\n
\n

This will bypass cluster creation, image loading, and Helm chart installation, for much faster tests.

\n

The kind cluster name and Helm namespace for this command can be found in the logs, or retrieved\nvia the respective CLIs, using kind get clusters and kubectl get namespaces. Note that\nfor kubectl and helm to work correctly with a kind cluster, you should override your\nkubeconfig file location with:

\n
kind get kubeconfig --name kind-test > /tmp/kubeconfig\nexport KUBECONFIG=/tmp/kubeconfig\n
\n
\n
\n
\n

Manual kind cluster setup\u00b6

\n

The test fixtures provided by dagster-k8s automate the process described below, but sometimes\nit\u2019s useful to manually configure a kind cluster and load images onto it.

\n

First, ensure you have a Docker image appropriate for your Python version. Run, from the root of\nthe repo:

\n
./python_modules/dagster-test/dagster_test/test_project/build.sh 3.7.6 \\\n    dagster.io.priv/buildkite-test-image:py37-latest\n
\n
\n

In the above invocation, the Python major.minor version should be appropriate for your desired tests.

\n

Then run the following commands to create the cluster and load the image. Note that there is no\nfeedback from the loading process.

\n
kind create cluster --name kind-test\nkind load docker-image --name kind-test dagster.io/dagster-docker-buildkite:py37-latest\n
\n
\n

If you are deploying the Helm chart with an in-cluster Postgres (rather than an external database),\nand/or with dagster-celery workers (and a RabbitMQ), you\u2019ll also want to have images present for\nrabbitmq and postgresql:

\n
docker pull docker.io/bitnami/rabbitmq\ndocker pull docker.io/bitnami/postgresql\n\nkind load docker-image --name kind-test docker.io/bitnami/rabbitmq:latest\nkind load docker-image --name kind-test docker.io/bitnami/postgresql:latest\n
\n
\n

Then you can run pytest as follows:

\n
pytest --kind-cluster=kind-test\n
\n
\n
\n
\n
\n

Faster local development (with an existing K8s cluster)\u00b6

\n

If you already have a development K8s cluster available, you can run tests on that cluster vs.\nrunning locally in kind.

\n

For this to work, first build and deploy the test image to a registry available to your cluster.\nFor example, with a private ECR repository:

\n
./python_modules/dagster-test/dagster_test/test_project/build.sh 3.7.6\ndocker tag dagster-docker-buildkite:latest $AWS_ACCOUNT_ID.dkr.ecr.us-west-2.amazonaws.com/dagster-k8s-tests:2020-04-21T21-04-06\n\naws ecr get-login --no-include-email --region us-west-2 | sh\ndocker push $AWS_ACCOUNT_ID.dkr.ecr.us-west-2.amazonaws.com/dagster-k8s-tests:2020-04-21T21-04-06\n
\n
\n

Then, you can run tests on EKS with:

\n
export DAGSTER_DOCKER_IMAGE_TAG="2020-04-21T21-04-06"\nexport DAGSTER_DOCKER_REPOSITORY="$AWS_ACCOUNT_ID.dkr.ecr.us-west-2.amazonaws.com"\nexport DAGSTER_DOCKER_IMAGE="dagster-k8s-tests"\n\n# First run with --no-cleanup to leave Helm chart in place\npytest --cluster-provider="kubeconfig" --no-cleanup -s -vvv\n\n# Subsequent runs against existing Helm chart\npytest --cluster-provider="kubeconfig" --existing-helm-namespace="dagster-test-<some id>" -s -vvv\n
\n
\n
\n
\n

Validating Helm charts\u00b6

\n

To test / validate Helm charts, you can run:

\n
helm install dagster --dry-run --debug helm/dagster\nhelm lint\n
\n
\n
\n
\n

Enabling GCR access from Minikube\u00b6

\n

To enable GCR access from Minikube:

\n
kubectl create secret docker-registry element-dev-key \\\n    --docker-server=https://gcr.io \\\n    --docker-username=oauth2accesstoken \\\n    --docker-password="$(gcloud auth print-access-token)" \\\n    --docker-email=my@email.com\n
\n
\n
\n
\n

A note about PVCs\u00b6

\n

Both the Postgres and the RabbitMQ Helm charts will store credentials using Persistent Volume\nClaims, which will outlive test invocations and calls to helm uninstall. These must be deleted if\nyou want to change credentials. To view your PVCs, run:

\n
kubectl get pvc\n
\n
\n
\n
\n

Testing Redis\u00b6

\n

The Redis Helm chart installs with a randomly-generated password by default; turn this off:

\n
helm install dagredis stable/redis --set usePassword=false\n
\n
\n

Then, to connect to your database from outside the cluster, execute the following commands:

\n
kubectl port-forward --namespace default svc/dagredis-master 6379:6379\nredis-cli -h 127.0.0.1 -p 6379\n
\n
\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-k8s", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../dagster-mlflow/", "title": "MLflow (dagster-mlflow)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-github/", "title": "GitHub (dagster-github)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-mlflow", "MLflow (dagster-mlflow)", "N", "next"], ["sections/api/apidocs/libraries/dagster-github", "GitHub (dagster-github)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-k8s.rst.txt", "title": "Kubernetes (dagster-k8s)", "toc": "\n"}, "dagster-mlflow": {"alabaster_version": "0.7.12", "body": "
\n

MLflow (dagster-mlflow)\u00b6

\n
\n
\ndagster_mlflow.mlflow_tracking ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
experiment_name (String)
\n

MLflow experiment name.

\n
\n
mlflow_tracking_uri (Union[String, None], optional)
\n

MLflow tracking server URI.

\n

Default Value: None

\n
\n
parent_run_id (Union[String, None], optional)
\n

MLflow run ID of the parent run if this is a nested run.

\n

Default Value: None

\n
\n
env (permissive dict, optional)
\n

Environment variables for mlflow setup.

\n
\nDefault Value:
{}\n
\n
\n
\n
env_to_tag (Union[List[Any], None], optional)
\n

List of environment variables to log as tags in mlflow.

\n

Default Value: None

\n
\n
extra_tags (permissive dict, optional)
\n

Any extra key-value tags to log to mlflow.

\n
\nDefault Value:
{}\n
\n
\n
\n
\n

This resource initializes an MLflow run that\u2019s used for all steps within a Dagster run.

\n

This resource provides access to all of mlflow\u2019s methods as well as the mlflow tracking client\u2019s\nmethods.

\n

Usage:

\n
    \n
  1. Add the mlflow resource to any ops or solids in which you want to invoke mlflow tracking APIs.

  2. \n
  3. Add the end_mlflow_on_run_finished hook to your pipeline to end the MLflow run\nwhen the Dagster run is finished.

  4. \n
\n

Examples

\n
import mlflow\n\nfrom dagster import job, op\nfrom dagster_mlflow import end_mlflow_on_run_finished, mlflow_tracking\n\n@op(required_resource_keys={"mlflow"})\ndef mlflow_op(context):\n    mlflow.log_params(some_params)\n    mlflow.tracking.MlflowClient().create_registered_model(some_model_name)\n\n@end_mlflow_on_run_finished\n@job(resource_defs={"mlflow": mlflow_tracking})\ndef mlf_example():\n    mlflow_op()\n\n# example using an mlflow instance with s3 storage\nmlf_example.execute_in_process(run_config={\n    "resources": {\n        "mlflow": {\n            "config": {\n                "experiment_name": my_experiment,\n                "mlflow_tracking_uri": "http://localhost:5000",\n\n                # if you want to run a nested run, provide parent_run_id\n                "parent_run_id": an_existing_mlflow_run_id,\n\n                # env variables to pass to mlflow\n                "env": {\n                    "MLFLOW_S3_ENDPOINT_URL": my_s3_endpoint,\n                    "AWS_ACCESS_KEY_ID": my_aws_key_id,\n                    "AWS_SECRET_ACCESS_KEY": my_secret,\n                },\n\n                # env variables you want to log as mlflow tags\n                "env_to_tag": ["DOCKER_IMAGE_TAG"],\n\n                # key-value tags to add to your experiment\n                "extra_tags": {"super": "experiment"},\n            }\n        }\n    }\n})\n
\n
\n
\n\n
\n
\ndagster_mlflow.end_mlflow_on_run_finished HookDefinition\u00b6
\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_mlflow.end_mlflow_run_on_pipeline_finished HookDefinition\u00b6
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-mlflow", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../dagster-msteams/", "title": "Microsoft Teams (dagster-msteams)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-k8s/", "title": "Kubernetes (dagster-k8s)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-msteams", "Microsoft Teams (dagster-msteams)", "N", "next"], ["sections/api/apidocs/libraries/dagster-k8s", "Kubernetes (dagster-k8s)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-mlflow.rst.txt", "title": "MLflow (dagster-mlflow)", "toc": "\n"}, "dagster-msteams": {"alabaster_version": "0.7.12", "body": "
\n

Microsoft Teams (dagster-msteams)\u00b6

\n
\n
\ndagster_msteams.msteams_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
hook_url (dagster.StringSource)
\n

To send messages to an MS Teams channel, an incoming webhook has to\nbe created. The incoming webhook URL must be given as a part of the\nresource config to the msteams_resource in dagster.

\n
\n
http_proxy (dagster.StringSource, optional)
\n

\n
https_proxy (dagster.StringSource, optional)
\n

\n
timeout (Float, optional)
\n

Default Value: 60

\n
\n
Verify (Bool, optional)
\n

\n
\n

This resource is for connecting to Microsoft Teams.

\n

The resource object is a dagster_msteams.TeamsClient.

\n

By configuring this resource, you can post messages to MS Teams from any Dagster solid:

\n

Examples:

\n
import os\n\nfrom dagster import ModeDefinition, execute_pipeline, pipeline, solid\nfrom dagster_msteams import Card, msteams_resource\n\n\n@solid(required_resource_keys={"msteams"})\ndef teams_solid(context):\n    card = Card()\n    card.add_attachment(text_message="Hello There !!")\n    context.resources.msteams.post_message(payload=card.payload)\n\n\n@pipeline(\n    mode_defs=[ModeDefinition(resource_defs={"msteams": msteams_resource})],\n)\ndef teams_pipeline():\n    teams_solid()\n\n\nexecute_pipeline(\n    teams_pipeline,\n    {"resources": {"msteams": {"config": {"hook_url": os.getenv("TEAMS_WEBHOOK_URL")}}}},\n)\n
\n
\n
\n\n
\n
\ndagster_msteams.teams_on_failure HookDefinition[source]\u00b6
\n

Create a hook on step failure events that will message the given MS Teams webhook URL.

\n
\n
Parameters
\n
    \n
  • message_fn (Optional(Callable[[HookContext], str])) \u2013 Function which takes in the\nHookContext and outputs the message you want to send.

  • \n
  • dagit_base_url \u2013 (Optional[str]): The base url of your Dagit instance. Specify this\nto allow messages to include deeplinks to the specific pipeline run that triggered\nthe hook.

  • \n
\n
\n
\n

Examples

\n
@teams_on_failure(dagit_base_url="http://localhost:3000")\n@pipeline(...)\ndef my_pipeline():\n    pass\n
\n
\n
def my_message_fn(context: HookContext) -> str:\n    return "Solid {solid_name} failed!".format(\n        solid_name=context.solid\n    )\n\n@solid\ndef a_solid(context):\n    pass\n\n@pipeline(...)\ndef my_pipeline():\n    a_solid.with_hooks(hook_defs={teams_on_failure(my_message_fn)})\n
\n
\n
\n\n
\n
\ndagster_msteams.teams_on_success HookDefinition[source]\u00b6
\n

Create a hook on step success events that will message the given MS Teams webhook URL.

\n
\n
Parameters
\n
    \n
  • message_fn (Optional(Callable[[HookContext], str])) \u2013 Function which takes in the\nHookContext and outputs the message you want to send.

  • \n
  • dagit_base_url \u2013 (Optional[str]): The base url of your Dagit instance. Specify this\nto allow messages to include deeplinks to the specific pipeline run that triggered\nthe hook.

  • \n
\n
\n
\n

Examples

\n
@teams_on_success(dagit_base_url="http://localhost:3000")\n@pipeline(...)\ndef my_pipeline():\n    pass\n
\n
\n
def my_message_fn(context: HookContext) -> str:\n    return "Solid {solid_name} succeeded!".format(\n        solid_name=context.solid\n    )\n\n@solid\ndef a_solid(context):\n    pass\n\n@pipeline(...)\ndef my_pipeline():\n    a_solid.with_hooks(hook_defs={teams_on_success(my_message_fn)})\n
\n
\n
\n\n
\n
\ndagster_msteams.make_teams_on_pipeline_failure_sensor(hook_url, message_fn=<function _default_failure_message>, http_proxy=None, https_proxy=None, timeout=60, verify=None, name=None, dagit_base_url=None, default_status=<DefaultSensorStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Create a sensor on pipeline failures that will message the given MS Teams webhook URL.

\n
\n
Parameters
\n
    \n
  • hook_url (str) \u2013 MS Teams incoming webhook URL.

  • \n
  • message_fn (Optional(Callable[[PipelineFailureSensorContext], str])) \u2013 Function which\ntakes in the PipelineFailureSensorContext and outputs the message you want to send.\nDefaults to a text message that contains error message, pipeline name, and run ID.

  • \n
  • http_proxy \u2013 (Optional[str]): Proxy for requests using http protocol.

  • \n
  • https_proxy \u2013 (Optional[str]): Proxy for requests using https protocol.

  • \n
  • timeout \u2013 (Optional[float]): Connection timeout in seconds. Defaults to 60.

  • \n
  • verify \u2013 (Optional[bool]): Whether to verify the server's TLS certificate.

  • \n
  • name \u2013 (Optional[str]): The name of the sensor. Defaults to \u201cteams_on_pipeline_failure\u201d.

  • \n
  • dagit_base_url \u2013 (Optional[str]): The base url of your Dagit instance. Specify this to allow\nmessages to include deeplinks to the failed pipeline run.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
\n
\n
\n

Examples

\n
teams_on_pipeline_failure = make_teams_on_pipeline_failure_sensor(\n    hook_url=os.getenv("TEAMS_WEBHOOK_URL")\n)\n\n@repository\ndef my_repo():\n    return [my_pipeline + teams_on_pipeline_failure]\n
\n
\n
def my_message_fn(context: PipelineFailureSensorContext) -> str:\n    return "Pipeline {pipeline_name} failed! Error: {error}".format(\n        pipeline_name=context.pipeline_run.pipeline_name,\n        error=context.failure_event.message,\n    )\n\nteams_on_pipeline_failure = make_teams_on_pipeline_failure_sensor(\n    hook_url=os.getenv("TEAMS_WEBHOOK_URL"),\n    message_fn=my_message_fn,\n    dagit_base_url="http://localhost:3000",\n)\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-msteams", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "", "next": {"link": "../dagster-mysql/", "title": "MySQL (dagster-mysql)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-mlflow/", "title": "MLflow (dagster-mlflow)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-mysql", "MySQL (dagster-mysql)", "N", "next"], ["sections/api/apidocs/libraries/dagster-mlflow", "MLflow (dagster-mlflow)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-msteams.rst.txt", "title": "Microsoft Teams (dagster-msteams)", "toc": "\n"}, "dagster-mysql": {"alabaster_version": "0.7.12", "body": "
\n

MySQL (dagster-mysql)\u00b6

\n
\n
\nclass dagster_mysql.MySQLEventLogStorage(mysql_url, inst_data=None)[source]\u00b6
\n

MySQL-backed event log storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagit and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n
\n
dagster.yaml\u00b6
\n
event_log_storage:\n module: dagster_mysql.event_log\n class: MySQLEventLogStorage\n config:\n  mysql_db:\n   username: { username }\n   password: { password }\n   hostname: { hostname }\n   db_name: { db_name }\n   port: { port }\n
\n
\n
\n

Note that the fields in this config are StringSource and\nIntSource and can be configured from environment variables.

\n
\n\n
\n
\nclass dagster_mysql.MySQLRunStorage(mysql_url, inst_data=None)[source]\u00b6
\n

MySQL-backed run storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagit and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n
\n
dagster.yaml\u00b6
\n
run_storage:\n module: dagster_mysql.run_storage\n class: MySQLRunStorage\n config:\n  mysql_db:\n   username: { username }\n   password: { password }\n   hostname: { hostname }\n   db_name: { database }\n   port: { port }\n
\n
\n
\n

Note that the fields in this config are StringSource and\nIntSource and can be configured from environment variables.

\n
\n\n
\n
\nclass dagster_mysql.MySQLScheduleStorage(mysql_url, inst_data=None)[source]\u00b6
\n

MySQL-backed schedule storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagit and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n
\n
dagster.yaml\u00b6
\n
schedule_storage:\n module: dagster_mysql.schedule_storage\n class: MySQLScheduleStorage\n config:\n  mysql_db:\n   username: { username }\n   password: { password }\n   hostname: { hostname }\n   db_name: { db_name }\n   port: { port }\n
\n
\n
\n

Note that the fields in this config are StringSource and\nIntSource and can be configured from environment variables.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-mysql", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "", "next": {"link": "../dagster-pagerduty/", "title": "PagerDuty (dagster-pagerduty)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-msteams/", "title": "Microsoft Teams (dagster-msteams)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-pagerduty", "PagerDuty (dagster-pagerduty)", "N", "next"], ["sections/api/apidocs/libraries/dagster-msteams", "Microsoft Teams (dagster-msteams)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-mysql.rst.txt", "title": "MySQL (dagster-mysql)", "toc": "\n"}, "dagster-pagerduty": {"alabaster_version": "0.7.12", "body": "
\n

PagerDuty (dagster-pagerduty)\u00b6

\n

This library provides an integration with PagerDuty, to support creating alerts from your Dagster\ncode.

\n

Presently, it provides a thin wrapper on the Events V2 API.

\n
\n
\n

Getting Started\u00b6

\n

You can install this library with:

\n
pip install dagster_pagerduty\n
\n
\n

To use this integration, you\u2019ll first need to create a PagerDuty integration. There are instructions\nhere for\ncreating a new PagerDuty service & integration.

\n

As noted in the PagerDuty documentation, you\u2019ll find an integration key (also referred to as a\n\u201crouting key\u201d) on the Integrations tab for your new service. This key is used to authorize events\ncreated from the PagerDuty events API.

\n

Once your service/integration is created, you can provision a PagerDuty resource and issue PagerDuty\nalerts from within your ops.

\n
\n
\ndagster_pagerduty.pagerduty_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
routing_key (String)
\n

The routing key provisions access to your PagerDuty service. You\nwill need to include the integration key for your new integration, as a\nrouting_key in the event payload.

\n
\n
\n

A resource for posting events (alerts) to PagerDuty.

\n

Example:

\n
from dagster import job, op\nfrom dagster_pagerduty import pagerduty_resource\n\n\n@op(required_resource_keys={'pagerduty'})\ndef pagerduty_op(context):\n    context.resources.pagerduty.EventV2_create(\n        summary='alert from dagster',\n        source='localhost',\n        severity='error',\n        event_action='trigger',\n    )\n\n@job(resource_defs={ 'pagerduty': pagerduty_resource })\ndef pagerduty_test():\n    pagerduty_op()\n\npagerduty_test.execute_in_process(\n    run_config={\n        "resources": {\n            'pagerduty': {'config': {'routing_key': '0123456789abcdef0123456789abcdef'}}\n        }\n    }\n)\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-pagerduty", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../dagster-pandas/", "title": "Pandas (dagster-pandas)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-mysql/", "title": "MySQL (dagster-mysql)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-pandas", "Pandas (dagster-pandas)", "N", "next"], ["sections/api/apidocs/libraries/dagster-mysql", "MySQL (dagster-mysql)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-pagerduty.rst.txt", "title": "PagerDuty (dagster-pagerduty)", "toc": "\n"}, "dagster-pandas": {"alabaster_version": "0.7.12", "body": "
\n

Pandas (dagster-pandas)\u00b6

\n

The dagster_pandas library provides utilities for using pandas with Dagster and for implementing\nvalidation on pandas DataFrames. A good place to start with dagster_pandas is the validation\nguide.

\n
\n
\ndagster_pandas.create_dagster_pandas_dataframe_type(name, description=None, columns=None, event_metadata_fn=None, dataframe_constraints=None, loader=None, materializer=None)[source]\u00b6
\n

Constructs a custom pandas dataframe dagster type.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name of the dagster pandas type.

  • \n
  • description (Optional[str]) \u2013 A markdown-formatted string, displayed in tooling.

  • \n
  • columns (Optional[List[PandasColumn]]) \u2013 A list of PandasColumn objects\nwhich express dataframe column schemas and constraints.

  • \n
  • event_metadata_fn (Optional[Callable[[], Union[Dict[str, Union[str, float, int, Dict, MetadataValue]], List[MetadataEntry]]]]) \u2013 A callable which takes your dataframe and returns a dict with string label keys and\nMetadataValue values. Can optionally return a List[MetadataEntry].

  • \n
  • dataframe_constraints (Optional[List[DataFrameConstraint]]) \u2013 A list of objects that inherit from\nDataFrameConstraint. This allows you to express dataframe-level constraints.

  • \n
  • loader (Optional[DagsterTypeLoader]) \u2013 An instance of a class that\ninherits from DagsterTypeLoader. If None, we will default\nto using dataframe_loader.

  • \n
  • materializer (Optional[DagsterTypeMaterializer]) \u2013 An instance of a class\nthat inherits from DagsterTypeMaterializer. If None, we will\ndefault to using dataframe_materializer.

  • \n
\n
\n
\n
\n\n
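As a brief sketch, a custom dataframe type built with this factory can be attached to an op output so the returned frame is validated against a column schema. The type name, column names, and the op below are illustrative:

import pandas as pd
from dagster import Out, op
from dagster_pandas import PandasColumn, create_dagster_pandas_dataframe_type

TripDataFrame = create_dagster_pandas_dataframe_type(
    name="TripDataFrame",
    columns=[
        PandasColumn.integer_column("bike_id", min_value=0),
        PandasColumn.datetime_column("start_time"),
    ],
)

@op(out=Out(TripDataFrame))
def load_trips():
    # The returned frame is type-checked against the column schema above.
    return pd.DataFrame(
        {"bike_id": [1, 2], "start_time": pd.to_datetime(["2021-01-01", "2021-01-02"])}
    )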
\n
\nclass dagster_pandas.RowCountConstraint(num_allowed_rows, error_tolerance=0)[source]\u00b6
\n

A dataframe constraint that validates the expected count of rows.

\n
\n
Parameters
\n
    \n
  • num_allowed_rows (int) \u2013 The number of allowed rows in your dataframe.

  • \n
  • error_tolerance (Optional[int]) \u2013 The acceptable threshold if you are not completely certain. Defaults to 0.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster_pandas.StrictColumnsConstraint(strict_column_list, enforce_ordering=False)[source]\u00b6
\n

A dataframe constraint that validates column existence and ordering.

\n
\n
Parameters
\n
    \n
  • strict_column_list (List[str]) \u2013 The exact list of columns that your dataframe must have.

  • \n
  • enforce_ordering (Optional[bool]) \u2013 If true, will enforce that the ordering of column names must match.\nDefault is False.

  • \n
\n
\n
\n
\n\n
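Dataframe-level constraints plug into the same factory via the dataframe_constraints argument. A minimal sketch, with illustrative type and column names:

from dagster_pandas import (
    RowCountConstraint,
    StrictColumnsConstraint,
    create_dagster_pandas_dataframe_type,
)

# Validates that the frame has exactly the two listed columns (ordering not enforced)
# and roughly 100 rows (within a tolerance of 5).
ShapedDataFrame = create_dagster_pandas_dataframe_type(
    name="ShapedDataFrame",
    dataframe_constraints=[
        StrictColumnsConstraint(["user_id", "amount"], enforce_ordering=False),
        RowCountConstraint(100, error_tolerance=5),
    ],
)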
\n
\nclass dagster_pandas.PandasColumn(name, constraints=None, is_required=None)[source]\u00b6
\n

The main API for expressing column level schemas and constraints for your custom dataframe\ntypes.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.

  • \n
  • is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.

  • \n
  • constraints (Optional[List[Constraint]]) \u2013 List of constraint objects that indicate the\nvalidation rules for the pandas column.

  • \n
\n
\n
\n
\n
\nstatic boolean_column(name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None)[source]\u00b6
\n

Simple constructor for PandasColumns that expresses boolean constraints on boolean dtypes.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.

  • \n
  • non_nullable (Optional[bool]) \u2013 If true, this column will enforce a constraint that all values in the column\nought to be non null values.

  • \n
  • unique (Optional[bool]) \u2013 If true, this column will enforce a uniqueness constraint on the column values.

  • \n
  • ignore_missing_vals (Optional[bool]) \u2013 A flag that is passed into most constraints. If true, the constraint will\nonly evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.

  • \n
  • is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.

  • \n
\n
\n
\n
\n\n
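A one-line sketch with an assumed column name:

from dagster_pandas import PandasColumn

# Illustrative: a non-null, boolean-typed column.
is_active = PandasColumn.boolean_column("is_active", non_nullable=True)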
\n
\nstatic categorical_column(name, categories, of_types=frozenset({'category', 'object'}), non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None)[source]\u00b6
\n

Simple constructor for PandasColumns that expresses categorical constraints on specified dtypes.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.

  • \n
  • categories (List[Any]) \u2013 The valid set of buckets that all values in the column must match.

  • \n
  • of_types (Optional[Union[str, Set[str]]]) \u2013 The expected dtype[s] that your categories and values must\nabide by.

  • \n
  • non_nullable (Optional[bool]) \u2013 If true, this column will enforce a constraint that all values in\nthe column ought to be non null values.

  • \n
  • unique (Optional[bool]) \u2013 If true, this column will enforce a uniqueness constraint on the column values.

  • \n
  • ignore_missing_vals (Optional[bool]) \u2013 A flag that is passed into most constraints. If true, the\nconstraint will only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.

  • \n
  • is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.

  • \n
\n
\n
\n
\n\n
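A short sketch with an assumed column name and category set:

from dagster_pandas import PandasColumn

# Illustrative: every value in the column must be one of these categories.
color = PandasColumn.categorical_column("color", categories=["red", "green", "blue"])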
\n
\nstatic datetime_column(name, min_datetime=Timestamp('1677-09-21 00:12:43.145224193'), max_datetime=Timestamp('2262-04-11 23:47:16.854775807'), non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None, tz=None)[source]\u00b6
\n

Simple constructor for PandasColumns that expresses datetime constraints on \u2018datetime64[ns]\u2019 dtypes.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.

  • \n
  • min_datetime (Optional[Timestamp]) \u2013 The lower bound for values you expect in this column.\nDefaults to pandas.Timestamp.min.

  • \n
  • max_datetime (Optional[Timestamp]) \u2013 The upper bound for values you expect in this column.\nDefaults to pandas.Timestamp.max.

  • \n
  • non_nullable (Optional[bool]) \u2013 If true, this column will enforce a constraint that all values in the column\nought to be non null values.

  • \n
  • unique (Optional[bool]) \u2013 If true, this column will enforce a uniqueness constraint on the column values.

  • \n
  • ignore_missing_vals (Optional[bool]) \u2013 A flag that is passed into most constraints. If true, the constraint will\nonly evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.

  • \n
  • is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.

  • \n
  • tz (Optional[str]) \u2013 Required timezone for values, e.g. tz=\u2019UTC\u2019, tz=\u2019Europe/Dublin\u2019, tz=\u2019US/Eastern\u2019.\nDefaults to None, meaning naive datetime values.

  • \n
\n
\n
\n
\n\n
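A short sketch with assumed column name and bounds; pass tz (e.g. tz='UTC') in addition if the column holds timezone-aware values:

from pandas import Timestamp
from dagster_pandas import PandasColumn

# Illustrative: naive datetimes restricted to a fixed window.
created_at = PandasColumn.datetime_column(
    "created_at",
    min_datetime=Timestamp("2021-01-01"),
    max_datetime=Timestamp("2022-01-01"),
)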
\n
\nstatic exists(name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None)[source]\u00b6
\n

Simple constructor for PandasColumns that expresses existence constraints.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.

  • \n
  • non_nullable (Optional[bool]) \u2013 If true, this column will enforce a constraint that all values in the column\nought to be non null values.

  • \n
  • unique (Optional[bool]) \u2013 If true, this column will enforce a uniqueness constraint on the column values.

  • \n
  • ignore_missing_vals (Optional[bool]) \u2013 A flag that is passed into most constraints. If true, the constraint will\nonly evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.

  • \n
  • is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.

  • \n
\n
\n
\n
\n\n
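A one-line sketch with an assumed column name; unlike the typed constructors, this places no dtype constraint on the column:

from dagster_pandas import PandasColumn

# Illustrative: only require that the column is present.
notes = PandasColumn.exists("notes")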
\n
\nstatic float_column(name, min_value=- inf, max_value=inf, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None)[source]\u00b6
\n

Simple constructor for PandasColumns that expresses numeric constraints on float dtypes.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.

  • \n
  • min_value (Optional[Union[int,float]]) \u2013 The lower bound for values you expect in this column. Defaults to -float(\u2018inf\u2019)

  • \n
  • max_value (Optional[Union[int,float]]) \u2013 The upper bound for values you expect in this column. Defaults to float(\u2018inf\u2019)

  • \n
  • non_nullable (Optional[bool]) \u2013 If true, this column will enforce a constraint that all values in the column\nought to be non null values.

  • \n
  • unique (Optional[bool]) \u2013 If true, this column will enforce a uniqueness constraint on the column values.

  • \n
  • ignore_missing_vals (Optional[bool]) \u2013 A flag that is passed into most constraints. If true, the constraint will\nonly evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.

  • \n
  • is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.

  • \n
\n
\n
\n
\n\n
\n
\nstatic integer_column(name, min_value=- inf, max_value=inf, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None)[source]\u00b6
\n

Simple constructor for PandasColumns that expresses numeric constraints on integer dtypes.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.

  • \n
  • min_value (Optional[Union[int,float]]) \u2013 The lower bound for values you expect in this column. Defaults to -float(\u2018inf\u2019)

  • \n
  • max_value (Optional[Union[int,float]]) \u2013 The upper bound for values you expect in this column. Defaults to float(\u2018inf\u2019)

  • \n
  • non_nullable (Optional[bool]) \u2013 If true, this column will enforce a constraint that all values in the column\nought to be non null values.

  • \n
  • unique (Optional[bool]) \u2013 If true, this column will enforce a uniqueness constraint on the column values.

  • \n
  • ignore_missing_vals (Optional[bool]) \u2013 A flag that is passed into most constraints. If true, the constraint will\nonly evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.

  • \n
  • is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.

  • \n
\n
\n
\n
\n\n
\n
\nstatic numeric_column(name, min_value=- inf, max_value=inf, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None)[source]\u00b6
\n

Simple constructor for PandasColumns that expresses numeric constraints on numeric dtypes.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.

  • \n
  • min_value (Optional[Union[int,float]]) \u2013 The lower bound for values you expect in this column. Defaults to -float(\u2018inf\u2019)

  • \n
  • max_value (Optional[Union[int,float]]) \u2013 The upper bound for values you expect in this column. Defaults to float(\u2018inf\u2019)

  • \n
  • non_nullable (Optional[bool]) \u2013 If true, this column will enforce a constraint that all values in the column\nought to be non null values.

  • \n
  • unique (Optional[bool]) \u2013 If true, this column will enforce a uniqueness constraint on the column values.

  • \n
  • ignore_missing_vals (Optional[bool]) \u2013 A flag that is passed into most constraints. If true, the constraint will\nonly evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.

  • \n
  • is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.

  • \n
\n
\n
\n
\n\n
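The three numeric constructors (float_column, integer_column, numeric_column) differ only in the dtypes they accept; a short sketch with assumed names and bounds:

from dagster_pandas import PandasColumn

# Illustrative bounds; float_column/integer_column pin the dtype,
# numeric_column accepts either numeric dtype.
score = PandasColumn.float_column("score", min_value=0.0, max_value=1.0)
passenger_count = PandasColumn.integer_column("passenger_count", min_value=0)
amount = PandasColumn.numeric_column("amount", min_value=0)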
\n
\nstatic string_column(name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None)[source]\u00b6
\n

Simple constructor for PandasColumns that expresses constraints on string dtypes.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.

  • \n
  • non_nullable (Optional[bool]) \u2013 If true, this column will enforce a constraint that all values in the column\nought to be non null values.

  • \n
  • unique (Optional[bool]) \u2013 If true, this column will enforce a uniqueness constraint on the column values.

  • \n
  • ignore_missing_vals (Optional[bool]) \u2013 A flag that is passed into most constraints. If true, the constraint will\nonly evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.

  • \n
  • is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.

  • \n
\n
\n
\n
\n\n
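Putting the constructors together, a sketch of a full column schema with assumed names; a list like this is what the columns argument at the top of this page expects:

from dagster_pandas import PandasColumn

columns = [
    PandasColumn.string_column("vendor_id", unique=True, non_nullable=True),
    PandasColumn.boolean_column("is_active"),
    PandasColumn.numeric_column("amount", min_value=0),
    PandasColumn.exists("notes", is_required=False),
]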
\n\n
\n
\ndagster_pandas.DataFrame = <dagster.core.types.dagster_type.DagsterType object>\u00b6
\n

Define a type in dagster. These can be used in the inputs and outputs of ops.

\n
\n
Parameters
\n
    \n
  • type_check_fn (Callable[[TypeCheckContext, Any], [Union[bool, TypeCheck]]]) \u2013 The function that defines the type check. It takes the value flowing\nthrough the input or output of the op. If it passes, return either\nTrue or a TypeCheck with success set to True. If it fails,\nreturn either False or a TypeCheck with success set to False.\nThe first argument must be named context (or, if unused, _, _context, or context_).\nUse required_resource_keys for access to resources.

  • \n
  • key (Optional[str]) \u2013

    The unique key to identify types programmatically.\nThe key property always has a value. If you omit the key argument\nto the init function, it instead receives the value of name. If\nneither key nor name is provided, a CheckError is thrown.

    \n

    In the case of a generic type such as List or Optional, this is\ngenerated programmatically based on the type parameters.

    \n

    For most use cases, name should be set and the key argument should\nnot be specified.

    \n

  • \n
  • name (Optional[str]) \u2013 A unique name given by a user. If key is None, key\nbecomes this value. name is not given when the user does not\nspecify a unique name for this type, such as for a generic class.

  • \n
  • description (Optional[str]) \u2013 A markdown-formatted string, displayed in tooling.

  • \n
  • loader (Optional[DagsterTypeLoader]) \u2013 An instance of a class that\ninherits from DagsterTypeLoader and can map config data to a value of\nthis type. Specify this argument if you will need to shim values of this type using the\nconfig machinery. As a rule, you should use the\n@dagster_type_loader decorator to construct\nthese arguments.

  • \n
  • materializer (Optional[DagsterTypeMaterializer]) \u2013 An instance of a class\nthat inherits from DagsterTypeMaterializer and can persist values of\nthis type. As a rule, you should use the\n@dagster_type_materializer\ndecorator to construct these arguments.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Resource keys required by the type_check_fn.

  • \n
  • is_builtin (bool) \u2013 Defaults to False. This is used by tools to display or\nfilter built-in types (such as String, Int) to visually distinguish\nthem from user-defined types. Meant for internal use.

  • \n
  • kind (DagsterTypeKind) \u2013 Defaults to None. This is used to determine the kind of runtime type\nfor InputDefinition and OutputDefinition type checking.

  • \n
  • typing_type \u2013 Defaults to None. A valid python typing type (e.g. Optional[List[int]]) for the\nvalue contained within the DagsterType. Meant for internal use.

  • \n
\n
\n
\n
\n\n
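A minimal sketch of using the pre-built DataFrame type on an op's input and output; the op body and column names are illustrative only:

import pandas as pd
from dagster import In, Out, op
from dagster_pandas import DataFrame

@op(ins={"trips": In(DataFrame)}, out=Out(DataFrame))
def add_total(trips):
    # Illustrative transformation on a type-checked dataframe.
    trips["total"] = trips["fare"] + trips["tip"]
    return trips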
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-pandas", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "", "next": {"link": "../dagster-papertrail/", "title": "Papertrail (dagster-papertrail)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-pagerduty/", "title": "PagerDuty (dagster-pagerduty)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-papertrail", "Papertrail (dagster-papertrail)", "N", "next"], ["sections/api/apidocs/libraries/dagster-pagerduty", "PagerDuty (dagster-pagerduty)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-pandas.rst.txt", "title": "Pandas (dagster-pandas)", "toc": "\n"}, "dagster-papertrail": {"alabaster_version": "0.7.12", "body": "
\n

Papertrail (dagster-papertrail)\u00b6

\n

This library provides an integration with Papertrail for logging.

\n

You can easily set up your Dagster job to log to Papertrail. You\u2019ll need an active Papertrail\naccount, and to have your Papertrail URL and port handy.

\n
\n
\ndagster_papertrail.papertrail_logger LoggerDefinition\u00b6
\n

Core class for defining loggers.

\n

Loggers are job-scoped logging handlers, which will be automatically invoked whenever\ndagster messages are logged from within a job.

\n
\n
Parameters
\n
    \n
  • logger_fn (Callable[[InitLoggerContext], logging.Logger]) \u2013 User-provided function to\ninstantiate the logger. This logger will be automatically invoked whenever the methods\non context.log are called from within job/pipeline compute logic.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.logger_config. If not set, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this logger.

  • \n
\n
\n
\n
\n\n
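A sketch of attaching the logger to a job; the config keys (log_level, name, papertrail_address, papertrail_port) are assumed from the logger's config schema, and the address and port are placeholders you must replace with your own Papertrail values:

from dagster import job, op
from dagster_papertrail import papertrail_logger

@op
def hello_op(context):
    context.log.info("Hello, Papertrail!")

@job(logger_defs={"papertrail": papertrail_logger})
def hello_job():
    hello_op()

# Assumed run config shape; supply your own Papertrail address and port.
run_config = {
    "loggers": {
        "papertrail": {
            "config": {
                "log_level": "INFO",
                "name": "hello_job",
                "papertrail_address": "logs.papertrailapp.com",
                "papertrail_port": 12345,
            }
        }
    }
}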
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-papertrail", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "", "next": {"link": "../dagster-postgres/", "title": "PostgreSQL (dagster-postgres)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-pandas/", "title": "Pandas (dagster-pandas)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-postgres", "PostgreSQL (dagster-postgres)", "N", "next"], ["sections/api/apidocs/libraries/dagster-pandas", "Pandas (dagster-pandas)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-papertrail.rst.txt", "title": "Papertrail (dagster-papertrail)", "toc": "\n"}, "dagster-postgres": {"alabaster_version": "0.7.12", "body": "
\n

PostgreSQL (dagster-postgres)\u00b6

\n
\n
\ndagster_postgres.PostgresEventLogStorage = <class 'dagster_postgres.event_log.event_log.PostgresEventLogStorage'>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
postgres_url (dagster.StringSource, optional)
\n

\n
postgres_db (strict dict, optional)
\n
\nConfig Schema:
\n
username (dagster.StringSource)
\n

\n
password (dagster.StringSource)
\n

\n
hostname (dagster.StringSource)
\n

\n
db_name (dagster.StringSource)
\n

\n
port (dagster.IntSource, optional)
\n

Default Value: 5432

\n
\n
params (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
should_autocreate_tables (Bool, optional)
\n

Default Value: True

\n
\n
\n

Postgres-backed event log storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagit and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n

To use Postgres for event log storage, you can add a block such as the following to your\ndagster.yaml:

\n
\n
dagster.yaml\u00b6
\n
event_log_storage:\n  module: dagster_postgres.event_log\n  class: PostgresEventLogStorage\n  config:\n    postgres_db:\n      username: { username }\n      password: { password }\n      hostname: { hostname }\n      db_name: { db_name }\n      port: { port }\n
\n
\n
\n

Note that the fields in this config are StringSource and\nIntSource and can be configured from environment variables.

\n
\n\n
\n
\ndagster_postgres.PostgresRunStorage = <class 'dagster_postgres.run_storage.run_storage.PostgresRunStorage'>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
postgres_url (dagster.StringSource, optional)
\n

\n
postgres_db (strict dict, optional)
\n
\nConfig Schema:
\n
username (dagster.StringSource)
\n

\n
password (dagster.StringSource)
\n

\n
hostname (dagster.StringSource)
\n

\n
db_name (dagster.StringSource)
\n

\n
port (dagster.IntSource, optional)
\n

Default Value: 5432

\n
\n
params (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
should_autocreate_tables (Bool, optional)
\n

Default Value: True

\n
\n
\n

Postgres-backed run storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagit and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n

To use Postgres for run storage, you can add a block such as the following to your\ndagster.yaml:

\n
\n
dagster.yaml\u00b6
\n
run_storage:\n  module: dagster_postgres.run_storage\n  class: PostgresRunStorage\n  config:\n    postgres_db:\n      username: { username }\n      password: { password }\n      hostname: { hostname }\n      db_name: { database }\n      port: { port }\n
\n
\n
\n

Note that the fields in this config are StringSource and\nIntSource and can be configured from environment variables.

\n
\n\n
\n
\ndagster_postgres.PostgresScheduleStorage = <class 'dagster_postgres.schedule_storage.schedule_storage.PostgresScheduleStorage'>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
postgres_url (dagster.StringSource, optional)
\n

\n
postgres_db (strict dict, optional)
\n
\nConfig Schema:
\n
username (dagster.StringSource)
\n

\n
password (dagster.StringSource)
\n

\n
hostname (dagster.StringSource)
\n

\n
db_name (dagster.StringSource)
\n

\n
port (dagster.IntSource, optional)
\n

Default Value: 5432

\n
\n
params (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
should_autocreate_tables (Bool, optional)
\n

Default Value: True

\n
\n
\n

Postgres-backed schedule storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagit and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n

To use Postgres for schedule storage, you can add a block such as the following to your\ndagster.yaml:

\n
\n
dagster.yaml\u00b6
\n
schedule_storage:\n  module: dagster_postgres.schedule_storage\n  class: PostgresScheduleStorage\n  config:\n    postgres_db:\n      username: { username }\n      password: { password }\n      hostname: { hostname }\n      db_name: { db_name }\n      port: { port }\n
\n
\n
\n

Note that the fields in this config are StringSource and\nIntSource and can be configured from environment variables.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-postgres", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "", "next": {"link": "../dagster-prometheus/", "title": "Prometheus (dagster-prometheus)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-papertrail/", "title": "Papertrail (dagster-papertrail)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-prometheus", "Prometheus (dagster-prometheus)", "N", "next"], ["sections/api/apidocs/libraries/dagster-papertrail", "Papertrail (dagster-papertrail)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-postgres.rst.txt", "title": "PostgreSQL (dagster-postgres)", "toc": "\n"}, "dagster-prometheus": {"alabaster_version": "0.7.12", "body": "
\n

Prometheus (dagster-prometheus)\u00b6

\n
\n
\nclass dagster_prometheus.resources.PrometheusResource(gateway, timeout)[source]\u00b6
\n

Integrates with Prometheus via the prometheus_client library.

\n
\n\n
\n
\ndagster_prometheus.prometheus_resource ResourceDefinition[source]\u00b6
\n
\n\n
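A sketch of wiring the resource into a job; it assumes the resource is configured with a gateway address (plus an optional timeout) and exposes the prometheus_client CollectorRegistry as .registry and a push_to_gateway(...) helper, and the gateway URL and metric names are placeholders:

from dagster import job, op
from dagster_prometheus import prometheus_resource
from prometheus_client import Counter

@op(required_resource_keys={"prometheus"})
def count_run(context):
    # Assumed attributes on the resource: .registry and .push_to_gateway(...).
    runs = Counter(
        "dagster_runs_total",
        "Runs observed",
        registry=context.resources.prometheus.registry,
    )
    runs.inc()
    context.resources.prometheus.push_to_gateway(job="example_job")

@job(resource_defs={"prometheus": prometheus_resource.configured({"gateway": "localhost:9091"})})
def prometheus_job():
    count_run()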
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-prometheus", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "", "next": {"link": "../dagster-pyspark/", "title": "Pyspark (dagster-pyspark)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-postgres/", "title": "PostgreSQL (dagster-postgres)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-pyspark", "Pyspark (dagster-pyspark)", "N", "next"], ["sections/api/apidocs/libraries/dagster-postgres", "PostgreSQL (dagster-postgres)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-prometheus.rst.txt", "title": "Prometheus (dagster-prometheus)", "toc": "\n"}, "dagster-pyspark": {"alabaster_version": "0.7.12", "body": "
\n

Pyspark (dagster-pyspark)\u00b6

\n
\n
\ndagster_pyspark.pyspark_resource = <dagster.core.definitions.resource_definition.ResourceDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
spark_conf (permissive dict, optional)
\n
\nDefault Value:
{\n    "spark": {\n        "app": {},\n        "driver": {\n            "blockManager": {}\n        },\n        "executor": {\n            "pyspark": {},\n            "logs": {\n                "rolling": {\n                    "time": {}\n                }\n            }\n        },\n        "local": {},\n        "submit": {},\n        "log": {},\n        "redaction": {},\n        "python": {\n            "profile": {},\n            "worker": {}\n        },\n        "files": {},\n        "jars": {},\n        "pyspark": {\n            "driver": {}\n        },\n        "reducer": {},\n        "shuffle": {\n            "file": {},\n            "io": {},\n            "service": {\n                "index": {\n                    "cache": {}\n                }\n            },\n            "sort": {},\n            "spill": {},\n            "registration": {}\n        },\n        "eventLog": {\n            "logBlockUpdates": {},\n            "longForm": {},\n            "buffer": {}\n        },\n        "ui": {\n            "dagGraph": {},\n            "liveUpdate": {}\n        },\n        "worker": {\n            "ui": {}\n        },\n        "sql": {\n            "ui": {}\n        },\n        "streaming": {\n            "ui": {},\n            "backpressure": {},\n            "receiver": {\n                "writeAheadLog": {}\n            },\n            "kafka": {},\n            "driver": {\n                "writeAheadLog": {}\n            }\n        },\n        "broadcast": {},\n        "io": {\n            "compression": {\n                "lz4": {},\n                "snappy": {},\n                "zstd": {}\n            }\n        },\n        "kryo": {},\n        "kryoserializer": {\n            "buffer": {}\n        },\n        "rdd": {},\n        "serializer": {},\n        "memory": {\n            "offHeap": {}\n        },\n        "storage": {\n            "replication": {}\n        },\n        "cleaner": {\n            "periodicGC": {},\n            "referenceTracking": {\n                "blocking": {}\n            }\n        },\n        "default": {},\n        "hadoop": {\n            "mapreduce": {\n                "fileoutputcommitter": {\n                    "algorithm": {}\n                }\n            }\n        },\n        "rpc": {\n            "message": {},\n            "retry": {}\n        },\n        "blockManager": {},\n        "network": {},\n        "port": {},\n        "core": {\n            "connection": {\n                "ack": {\n                    "wait": {}\n                }\n            }\n        },\n        "cores": {},\n        "locality": {\n            "wait": {}\n        },\n        "scheduler": {\n            "revive": {},\n            "listenerbus": {\n                "eventqueue": {}\n            }\n        },\n        "blacklist": {\n            "task": {},\n            "stage": {},\n            "application": {\n                "fetchFailure": {}\n            }\n        },\n        "speculation": {},\n        "task": {\n            "reaper": {}\n        },\n        "stage": {},\n        "dynamicAllocation": {},\n        "r": {\n            "driver": {},\n            "shell": {}\n        },\n        "graphx": {\n            "pregel": {}\n        },\n        "deploy": {\n            "zookeeper": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
spark (permissive dict, optional)
\n
\nDefault Value:
{\n    "app": {},\n    "driver": {\n        "blockManager": {}\n    },\n    "executor": {\n        "pyspark": {},\n        "logs": {\n            "rolling": {\n                "time": {}\n            }\n        }\n    },\n    "local": {},\n    "submit": {},\n    "log": {},\n    "redaction": {},\n    "python": {\n        "profile": {},\n        "worker": {}\n    },\n    "files": {},\n    "jars": {},\n    "pyspark": {\n        "driver": {}\n    },\n    "reducer": {},\n    "shuffle": {\n        "file": {},\n        "io": {},\n        "service": {\n            "index": {\n                "cache": {}\n            }\n        },\n        "sort": {},\n        "spill": {},\n        "registration": {}\n    },\n    "eventLog": {\n        "logBlockUpdates": {},\n        "longForm": {},\n        "buffer": {}\n    },\n    "ui": {\n        "dagGraph": {},\n        "liveUpdate": {}\n    },\n    "worker": {\n        "ui": {}\n    },\n    "sql": {\n        "ui": {}\n    },\n    "streaming": {\n        "ui": {},\n        "backpressure": {},\n        "receiver": {\n            "writeAheadLog": {}\n        },\n        "kafka": {},\n        "driver": {\n            "writeAheadLog": {}\n        }\n    },\n    "broadcast": {},\n    "io": {\n        "compression": {\n            "lz4": {},\n            "snappy": {},\n            "zstd": {}\n        }\n    },\n    "kryo": {},\n    "kryoserializer": {\n        "buffer": {}\n    },\n    "rdd": {},\n    "serializer": {},\n    "memory": {\n        "offHeap": {}\n    },\n    "storage": {\n        "replication": {}\n    },\n    "cleaner": {\n        "periodicGC": {},\n        "referenceTracking": {\n            "blocking": {}\n        }\n    },\n    "default": {},\n    "hadoop": {\n        "mapreduce": {\n            "fileoutputcommitter": {\n                "algorithm": {}\n            }\n        }\n    },\n    "rpc": {\n        "message": {},\n        "retry": {}\n    },\n    "blockManager": {},\n    "network": {},\n    "port": {},\n    "core": {\n        "connection": {\n            "ack": {\n                "wait": {}\n            }\n        }\n    },\n    "cores": {},\n    "locality": {\n        "wait": {}\n    },\n    "scheduler": {\n        "revive": {},\n        "listenerbus": {\n            "eventqueue": {}\n        }\n    },\n    "blacklist": {\n        "task": {},\n        "stage": {},\n        "application": {\n            "fetchFailure": {}\n        }\n    },\n    "speculation": {},\n    "task": {\n        "reaper": {}\n    },\n    "stage": {},\n    "dynamicAllocation": {},\n    "r": {\n        "driver": {},\n        "shell": {}\n    },\n    "graphx": {\n        "pregel": {}\n    },\n    "deploy": {\n        "zookeeper": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
app (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
name (dagster.StringSource, optional)
\n

Application Properties: The name of your application. This will appear in the UI and in log data.

\n
\n
\n
\n
driver (permissive dict, optional)
\n
\nDefault Value:
{\n    "blockManager": {}\n}\n
\n
\n
\nConfig Schema:
\n
cores (dagster.IntSource, optional)
\n

Application Properties: Number of cores to use for the driver process, only in cluster mode.

\n
\n
maxResultSize (dagster.StringSource, optional)
\n

Application Properties: Limit of total size of serialized results of all partitions for each Spark action (e.g. collect) in bytes. Should be at least 1M, or 0 for unlimited. Jobs will be aborted if the total size is above this limit. Having a high limit may cause out-of-memory errors in driver (depends on spark.driver.memory and memory overhead of objects in JVM). Setting a proper limit can protect the driver from out-of-memory errors.

\n
\n
memory (dagster.StringSource, optional)
\n

Application Properties: Amount of memory to use for the driver process, i.e. where SparkContext is initialized, in the same format as JVM memory strings with a size unit suffix (\u201ck\u201d, \u201cm\u201d, \u201cg\u201d or \u201ct\u201d) (e.g. 512m, 2g). Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the \u2013driver-memory command line option or in your default properties file.

\n
\n
memoryOverhead (dagster.StringSource, optional)
\n

Application Properties: The amount of off-heap memory to be allocated per driver in cluster mode, in MiB unless otherwise specified. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc. This tends to grow with the container size (typically 6-10%). This option is currently supported on YARN and Kubernetes.

\n
\n
supervise (Bool, optional)
\n

Application Properties: If true, restarts the driver automatically if it fails with a non-zero exit status. Only has effect in Spark standalone mode or Mesos cluster deploy mode.

\n
\n
extraClassPath (dagster.StringSource, optional)
\n

Runtime Environment: Extra classpath entries to prepend to the classpath of the driver. Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the \u2013driver-class-path command line option or in your default properties file.

\n
\n
extraJavaOptions (dagster.StringSource, optional)
\n

Runtime Environment: A string of extra JVM options to pass to the driver. For instance, GC settings or other logging. Note that it is illegal to set maximum heap size (-Xmx) settings with this option. Maximum heap size settings can be set with spark.driver.memory in the cluster mode and through the \u2013driver-memory command line option in the client mode. Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the \u2013driver-java-options command line option or in your default properties file.

\n
\n
extraLibraryPath (dagster.StringSource, optional)
\n

Runtime Environment: Set a special library path to use when launching the driver JVM. Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the \u2013driver-library-path command line option or in your default properties file.

\n
\n
userClassPathFirst (Bool, optional)
\n

Runtime Environment: (Experimental) Whether to give user-added jars precedence over Spark\u2019s own jars when loading classes in the driver. This feature can be used to mitigate conflicts between Spark\u2019s dependencies and user dependencies. It is currently an experimental feature. This is used in cluster mode only.

\n
\n
blockManager (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
port (dagster.StringSource, optional)
\n

Networking: Driver-specific port for the block manager to listen on, for cases where it cannot use the same configuration as executors.

\n
\n
\n
\n
bindAddress (dagster.StringSource, optional)
\n

Networking: Hostname or IP address where to bind listening sockets. This config overrides the SPARK_LOCAL_IP environment variable (see below). It also allows a different address from the local one to be advertised to executors or external systems. This is useful, for example, when running containers with bridged networking. For this to properly work, the different ports used by the driver (RPC, block manager and UI) need to be forwarded from the container\u2019s host.

\n
\n
host (dagster.StringSource, optional)
\n

Networking: Hostname or IP address for the driver. This is used for communicating with the executors and the standalone Master.

\n
\n
port (dagster.StringSource, optional)
\n

Networking: Port for the driver to listen on. This is used for communicating with the executors and the standalone Master.

\n
\n
\n
\n
executor (permissive dict, optional)
\n
\nDefault Value:
{\n    "pyspark": {},\n    "logs": {\n        "rolling": {\n            "time": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
memory (dagster.StringSource, optional)
\n

Application Properties: Amount of memory to use per executor process, in the same format as JVM memory strings with a size unit suffix (\u201ck\u201d, \u201cm\u201d, \u201cg\u201d or \u201ct\u201d) (e.g. 512m, 2g).

\n
\n
pyspark (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
memory (dagster.StringSource, optional)
\n

Application Properties: The amount of memory to be allocated to PySpark in each executor, in MiB unless otherwise specified. If set, PySpark memory for an executor will be limited to this amount. If not set, Spark will not limit Python\u2019s memory use and it is up to the application to avoid exceeding the overhead memory space shared with other non-JVM processes. When PySpark is run in YARN or Kubernetes, this memory is added to executor resource requests.

\n
\n
\n
\n
memoryOverhead (dagster.StringSource, optional)
\n

Application Properties: The amount of off-heap memory to be allocated per executor, in MiB unless otherwise specified. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc. This tends to grow with the executor size (typically 6-10%). This option is currently supported on YARN and Kubernetes.

\n
\n
extraClassPath (dagster.StringSource, optional)
\n

Runtime Environment: Extra classpath entries to prepend to the classpath of executors. This exists primarily for backwards-compatibility with older versions of Spark. Users typically should not need to set this option.

\n
\n
extraJavaOptions (dagster.StringSource, optional)
\n

Runtime Environment: A string of extra JVM options to pass to executors. For instance, GC settings or other logging. Note that it is illegal to set Spark properties or maximum heap size (-Xmx) settings with this option. Spark properties should be set using a SparkConf object or the spark-defaults.conf file used with the spark-submit script. Maximum heap size settings can be set with spark.executor.memory. The following symbols, if present will be interpolated: {{APP_ID}} will be replaced by application ID and {{EXECUTOR_ID}} will be replaced by executor ID. For example, to enable verbose gc logging to a file named for the executor ID of the app in /tmp, pass a \u2018value\u2019 of: -verbose:gc -Xloggc:/tmp/{{APP_ID}}-{{EXECUTOR_ID}}.gc

\n
\n
extraLibraryPath (dagster.StringSource, optional)
\n

Runtime Environment: Set a special library path to use when launching executor JVM\u2019s.

\n
\n
logs (permissive dict, optional)
\n
\nDefault Value:
{\n    "rolling": {\n        "time": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
rolling (permissive dict, optional)
\n
\nDefault Value:
{\n    "time": {}\n}\n
\n
\n
\nConfig Schema:
\n
maxRetainedFiles (dagster.IntSource, optional)
\n

Runtime Environment: Sets the number of latest rolling log files that are going to be retained by the system. Older log files will be deleted. Disabled by default.

\n
\n
enableCompression (Bool, optional)
\n

Runtime Environment: Enable executor log compression. If it is enabled, the rolled executor logs will be compressed. Disabled by default.

\n
\n
maxSize (dagster.IntSource, optional)
\n

Runtime Environment: Set the max size of the file in bytes by which the executor logs will be rolled over. Rolling is disabled by default. See spark.executor.logs.rolling.maxRetainedFiles for automatic cleaning of old logs.

\n
\n
strategy (dagster.StringSource, optional)
\n

Runtime Environment: Set the strategy of rolling of executor logs. By default it is disabled. It can be set to \u201ctime\u201d (time-based rolling) or \u201csize\u201d (size-based rolling). For \u201ctime\u201d, use spark.executor.logs.rolling.time.interval to set the rolling interval. For \u201csize\u201d, use spark.executor.logs.rolling.maxSize to set the maximum file size for rolling.

\n
\n
time (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
interval (dagster.StringSource, optional)
\n

Runtime Environment: Set the time interval by which the executor logs will be rolled over. Rolling is disabled by default. Valid values are daily, hourly, minutely or any interval in seconds. See spark.executor.logs.rolling.maxRetainedFiles for automatic cleaning of old logs.

\n
\n
\n
\n
\n
\n
\n
\n
userClassPathFirst (Bool, optional)
\n

Runtime Environment: (Experimental) Same functionality as spark.driver.userClassPathFirst, but applied to executor instances.

\n
\n
cores (dagster.IntSource, optional)
\n

Execution Behavior: The number of cores to use on each executor. In standalone and Mesos coarse-grained modes, for more detail, see this description.

\n
\n
heartbeatInterval (dagster.StringSource, optional)
\n

Execution Behavior: Interval between each executor\u2019s heartbeats to the driver. Heartbeats let the driver know that the executor is still alive and update it with metrics for in-progress tasks. spark.executor.heartbeatInterval should be significantly less than spark.network.timeout

\n
\n
\n
\n
extraListeners (dagster.StringSource, optional)
\n

Application Properties: A comma-separated list of classes that implement SparkListener; when initializing SparkContext, instances of these classes will be created and registered with Spark\u2019s listener bus. If a class has a single-argument constructor that accepts a SparkConf, that constructor will be called; otherwise, a zero-argument constructor will be called. If no valid constructor can be found, the SparkContext creation will fail with an exception.

\n
\n
local (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
dir (dagster.StringSource, optional)
\n

Application Properties: Directory to use for \u201cscratch\u201d space in Spark, including map output files and RDDs that get stored on disk. This should be on a fast, local disk in your system. It can also be a comma-separated list of multiple directories on different disks. NOTE: In Spark 1.0 and later this will be overridden by SPARK_LOCAL_DIRS (Standalone), MESOS_SANDBOX (Mesos) or LOCAL_DIRS (YARN) environment variables set by the cluster manager.

\n
\n
\n
\n
logConf (Bool, optional)
\n

Application Properties: Logs the effective SparkConf as INFO when a SparkContext is started.

\n
\n
master (dagster.StringSource, optional)
\n

Application Properties: The cluster manager to connect to. See the list of allowed master URL\u2019s.

\n
\n
submit (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
deployMode (dagster.StringSource, optional)
\n

Application Properties: The deploy mode of Spark driver program, either \u201cclient\u201d or \u201ccluster\u201d, Which means to launch driver program locally (\u201cclient\u201d) or remotely (\u201ccluster\u201d) on one of the nodes inside the cluster.

\n
\n
pyFiles (dagster.StringSource, optional)
\n

Runtime Environment: Comma-separated list of .zip, .egg, or .py files to place on the PYTHONPATH for Python apps. Globs are allowed.

\n
\n
\n
\n
log (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
callerContext (dagster.StringSource, optional)
\n

Application Properties: Application information that will be written into Yarn RM log/HDFS audit log when running on Yarn/HDFS. Its length depends on the Hadoop configuration hadoop.caller.context.max.size. It should be concise, and typically can have up to 50 characters.

\n
\n
\n
\n
redaction (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
regex (dagster.StringSource, optional)
\n

Runtime Environment: Regex to decide which Spark configuration properties and environment variables in driver and executor environments contain sensitive information. When this regex matches a property key or value, the value is redacted from the environment UI and various logs like YARN and event logs.

\n
\n
\n
\n
python (permissive dict, optional)
\n
\nDefault Value:
{\n    "profile": {},\n    "worker": {}\n}\n
\n
\n
\nConfig Schema:
\n
profile (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (Bool, optional)
\n

Runtime Environment: Enable profiling in Python worker, the profile result will show up by sc.show_profiles(), or it will be displayed before the driver exits. It also can be dumped into disk by sc.dump_profiles(path). If some of the profile results had been displayed manually, they will not be displayed automatically before driver exiting. By default the pyspark.profiler.BasicProfiler will be used, but this can be overridden by passing a profiler class in as a parameter to the SparkContext constructor.

\n
\n
dump (dagster.StringSource, optional)
\n

Runtime Environment: The directory which is used to dump the profile result before driver exiting. The results will be dumped as separated file for each RDD. They can be loaded by ptats.Stats(). If this is specified, the profile result will not be displayed automatically.

\n
\n
\n
\n
worker (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
memory (dagster.StringSource, optional)
\n

Runtime Environment: Amount of memory to use per python worker process during aggregation, in the same format as JVM memory strings with a size unit suffix (\u201ck\u201d, \u201cm\u201d, \u201cg\u201d or \u201ct\u201d) (e.g. 512m, 2g). If the memory used during aggregation goes above this amount, it will spill the data into disks.

\n
\n
reuse (Bool, optional)
\n

Runtime Environment: Reuse Python worker or not. If yes, it will use a fixed number of Python workers, does not need to fork() a Python process for every task. It will be very useful if there is large broadcast, then the broadcast will not be needed to transferred from JVM to Python worker for every task.

\n
\n
\n
\n
\n
\n
files (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional)
\n

Runtime Environment: Comma-separated list of files to be placed in the working directory of each executor. Globs are allowed.

\n
\n
fetchTimeout (dagster.StringSource, optional)
\n

Execution Behavior: Communication timeout to use when fetching files added through SparkContext.addFile() from the driver.

\n
\n
useFetchCache (Bool, optional)
\n

Execution Behavior: If set to true (default), file fetching will use a local cache that is shared by executors that belong to the same application, which can improve task launching performance when running many executors on the same host. If set to false, these caching optimizations will be disabled and all executors will fetch their own copies of files. This optimization may be disabled in order to use Spark local directories that reside on NFS filesystems (see SPARK-6313 for more details).

\n
\n
overwrite (Bool, optional)
\n

Execution Behavior: Whether to overwrite files added through SparkContext.addFile() when the target file exists and its contents do not match those of the source.

\n
\n
maxPartitionBytes (dagster.IntSource, optional)
\n

Execution Behavior: The maximum number of bytes to pack into a single partition when reading files.

\n
\n
openCostInBytes (dagster.IntSource, optional)
\n

Execution Behavior: The estimated cost to open a file, measured by the number of bytes could be scanned at the same time. This is used when putting multiple files into a partition. It is better to overestimate, then the partitions with small files will be faster than partitions with bigger files.

\n
\n
\n
\n
jars (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional)
\n

Runtime Environment: Comma-separated list of jars to include on the driver and executor classpaths. Globs are allowed.

\n
\n
packages (dagster.StringSource, optional)
\n

Runtime Environment: Comma-separated list of Maven coordinates of jars to include on the driver and executor classpaths. The coordinates should be groupId:artifactId:version. If spark.jars.ivySettings is given artifacts will be resolved according to the configuration in the file, otherwise artifacts will be searched for in the local maven repo, then maven central and finally any additional remote repositories given by the command-line option \u2013repositories. For more details, see Advanced Dependency Management.

\n
\n
excludes (dagster.StringSource, optional)
\n

Runtime Environment: Comma-separated list of groupId:artifactId, to exclude while resolving the dependencies provided in spark.jars.packages to avoid dependency conflicts.

\n
\n
ivy (dagster.StringSource, optional)
\n

Runtime Environment: Path to specify the Ivy user directory, used for the local Ivy cache and package files from spark.jars.packages. This will override the Ivy property ivy.default.ivy.user.dir which defaults to ~/.ivy2.

\n
\n
ivySettings (dagster.StringSource, optional)
\n

Runtime Environment: Path to an Ivy settings file to customize resolution of jars specified using spark.jars.packages instead of the built-in defaults, such as maven central. Additional repositories given by the command-line option \u2013repositories or spark.jars.repositories will also be included. Useful for allowing Spark to resolve artifacts from behind a firewall e.g. via an in-house artifact server like Artifactory. Details on the settings file format can be found at http://ant.apache.org/ivy/history/latest-milestone/settings.html

\n
\n
repositories (dagster.StringSource, optional)
\n

Runtime Environment: Comma-separated list of additional remote repositories to search for the maven coordinates given with \u2013packages or spark.jars.packages.

\n
\n
\n
\n
pyspark (permissive dict, optional)
\n
\nDefault Value:
{\n    "driver": {}\n}\n
\n
\n
\nConfig Schema:
\n
driver (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
python (dagster.StringSource, optional)
\n

Runtime Environment: Python binary executable to use for PySpark in driver. (default is spark.pyspark.python)

\n
\n
\n
\n
python (dagster.StringSource, optional)
\n

Runtime Environment: Python binary executable to use for PySpark in both driver and executors.

\n
\n
\n
\n
reducer (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxSizeInFlight (dagster.StringSource, optional)
\n

Shuffle Behavior: Maximum size of map outputs to fetch simultaneously from each reduce task, in MiB unless otherwise specified. Since each output requires us to create a buffer to receive it, this represents a fixed memory overhead per reduce task, so keep it small unless you have a large amount of memory.

\n
\n
maxReqsInFlight (dagster.IntSource, optional)
\n

Shuffle Behavior: This configuration limits the number of remote requests to fetch blocks at any given point. When the number of hosts in the cluster increase, it might lead to very large number of inbound connections to one or more nodes, causing the workers to fail under load. By allowing it to limit the number of fetch requests, this scenario can be mitigated.

\n
\n
maxBlocksInFlightPerAddress (dagster.IntSource, optional)
\n

Shuffle Behavior: This configuration limits the number of remote blocks being fetched per reduce task from a given host port. When a large number of blocks are being requested from a given address in a single fetch or simultaneously, this could crash the serving executor or Node Manager. This is especially useful to reduce the load on the Node Manager when external shuffle is enabled. You can mitigate this issue by setting it to a lower value.

\n
\n
\n
\n
maxRemoteBlockSizeFetchToMem (dagster.IntSource, optional)
\n

Shuffle Behavior: The remote block will be fetched to disk when size of the block is above this threshold in bytes. This is to avoid a giant request that takes too much memory. By default, this is only enabled for blocks > 2GB, as those cannot be fetched directly into memory, no matter what resources are available. But it can be turned down to a much lower value (eg. 200m) to avoid using too much memory on smaller blocks as well. Note this configuration will affect both shuffle fetch and block manager remote block fetch. For users who enabled external shuffle service, this feature can only be used when external shuffle service is newer than Spark 2.2.

\n
\n
shuffle (permissive dict, optional)
\n
\nDefault Value:
{\n    "file": {},\n    "io": {},\n    "service": {\n        "index": {\n            "cache": {}\n        }\n    },\n    "sort": {},\n    "spill": {},\n    "registration": {}\n}\n
\n
\n
\nConfig Schema:
\n
compress (Bool, optional)
\n

Shuffle Behavior: Whether to compress map output files. Generally a good idea. Compression will use spark.io.compression.codec.

\n
\n
file (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
buffer (dagster.StringSource, optional)
\n

Shuffle Behavior: Size of the in-memory buffer for each shuffle file output stream, in KiB unless otherwise specified. These buffers reduce the number of disk seeks and system calls made in creating intermediate shuffle files.

\n
\n
\n
\n
io (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxRetries (dagster.IntSource, optional)
\n

Shuffle Behavior: (Netty only) Fetches that fail due to IO-related exceptions are automatically retried if this is set to a non-zero value. This retry logic helps stabilize large shuffles in the face of long GC pauses or transient network connectivity issues.

\n
\n
numConnectionsPerPeer (dagster.IntSource, optional)
\n

Shuffle Behavior: (Netty only) Connections between hosts are reused in order to reduce connection buildup for large clusters. For clusters with many hard disks and few hosts, this may result in insufficient concurrency to saturate all disks, and so users may consider increasing this value.

\n
\n
preferDirectBufs (Bool, optional)
\n

Shuffle Behavior: (Netty only) Off-heap buffers are used to reduce garbage collection during shuffle and cache block transfer. For environments where off-heap memory is tightly limited, users may wish to turn this off to force all allocations from Netty to be on-heap.

\n
\n
retryWait (dagster.StringSource, optional)
\n

Shuffle Behavior: (Netty only) How long to wait between retries of fetches. The maximum delay caused by retrying is 15 seconds by default, calculated as maxRetries * retryWait.

\n
\n
\n
\n
service (permissive dict, optional)
\n
\nDefault Value:
{\n    "index": {\n        "cache": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
enabled (Bool, optional)
\n

Shuffle Behavior: Enables the external shuffle service. This service preserves the shuffle files written by executors so the executors can be safely removed. This must be enabled if spark.dynamicAllocation.enabled is \u201ctrue\u201d. The external shuffle service must be set up in order to enable it. See dynamic allocation configuration and setup documentation for more information.

\n
\n
port (dagster.IntSource, optional)
\n

Shuffle Behavior: Port on which the external shuffle service will run.

\n
\n
index (permissive dict, optional)
\n
\nDefault Value:
{\n    "cache": {}\n}\n
\n
\n
\nConfig Schema:
\n
cache (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
size (dagster.StringSource, optional)
\n

Shuffle Behavior: Cache entries limited to the specified memory footprint in bytes.

\n
\n
\n
\n
\n
\n
\n
\n
maxChunksBeingTransferred (dagster.IntSource, optional)
\n

Shuffle Behavior: The max number of chunks allowed to be transferred at the same time on shuffle service. Note that new incoming connections will be closed when the max number is hit. The client will retry according to the shuffle retry configs (see spark.shuffle.io.maxRetries and spark.shuffle.io.retryWait), if those limits are reached the task will fail with fetch failure.

\n
\n
sort (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
bypassMergeThreshold (dagster.IntSource, optional)
\n

Shuffle Behavior: (Advanced) In the sort-based shuffle manager, avoid merge-sorting data if there is no map-side aggregation and there are at most this many reduce partitions.

\n
\n
\n
\n
spill (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
compress (Bool, optional)
\n

Shuffle Behavior: Whether to compress data spilled during shuffles. Compression will use spark.io.compression.codec.

\n
\n
\n
\n
accurateBlockThreshold (dagster.IntSource, optional)
\n

Shuffle Behavior: Threshold in bytes above which the size of shuffle blocks in HighlyCompressedMapStatus is accurately recorded. This helps to prevent OOM by avoiding underestimating shuffle block size when fetch shuffle blocks.

\n
\n
registration (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
timeout (dagster.IntSource, optional)
\n

Shuffle Behavior: Timeout in milliseconds for registration to the external shuffle service.

\n
\n
maxAttempts (dagster.IntSource, optional)
\n

Shuffle Behavior: When we fail to register to the external shuffle service, we will retry for maxAttempts times.

\n
\n
\n
\n
memoryFraction (Float, optional)
\n

Memory Management: (deprecated) This is read only if spark.memory.useLegacyMode is enabled. Fraction of Java heap to use for aggregation and cogroups during shuffles. At any given time, the collective size of all in-memory maps used for shuffles is bounded by this limit, beyond which the contents will begin to spill to disk. If spills are often, consider increasing this value at the expense of spark.storage.memoryFraction.

\n
\n
\n
\n
eventLog (permissive dict, optional)
\n
\nDefault Value:
{\n    "logBlockUpdates": {},\n    "longForm": {},\n    "buffer": {}\n}\n
\n
\n
\nConfig Schema:
\n
logBlockUpdates (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional)
\n

Spark UI: Whether to log events for every block update, if spark.eventLog.enabled is true. *Warning*: This will increase the size of the event log considerably.

\n
\n
\n
\n
longForm (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional)
\n

Spark UI: If true, use the long form of call sites in the event log. Otherwise use the short form.

\n
\n
\n
\n
compress (dagster.StringSource, optional)
\n

Spark UI: Whether to compress logged events, if spark.eventLog.enabled is true. Compression will use spark.io.compression.codec.

\n
\n
dir (dagster.StringSource, optional)
\n

Spark UI: Base directory in which Spark events are logged, if spark.eventLog.enabled is true. Within this base directory, Spark creates a sub-directory for each application, and logs the events specific to the application in this directory. Users may want to set this to a unified location like an HDFS directory so history files can be read by the history server.

\n
\n
enabled (dagster.StringSource, optional)
\n

Spark UI: Whether to log Spark events, useful for reconstructing the Web UI after the application has finished.

\n
\n
overwrite (dagster.StringSource, optional)
\n

Spark UI: Whether to overwrite any existing files.

\n
\n
buffer (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
kb (dagster.StringSource, optional)
\n

Spark UI: Buffer size to use when writing to output streams, in KiB unless otherwise specified.

\n
\n
\n
\n
\n
\n
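The event-log settings above can be wired up the same way through the pyspark_resource; a minimal sketch with assumed values (the HDFS path is purely illustrative):
\n
from dagster_pyspark import pyspark_resource\n\n# Illustrative values only.\nevent_logging_pyspark = pyspark_resource.configured(\n    {\n        "spark_conf": {\n            "spark.eventLog.enabled": "true",\n            "spark.eventLog.dir": "hdfs://namenode/shared/spark-logs",  # assumed location\n        }\n    }\n)\n
\n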
ui (permissive dict, optional)
\n
\nDefault Value:
{\n    "dagGraph": {},\n    "liveUpdate": {}\n}\n
\n
\n
\nConfig Schema:
\n
dagGraph (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedRootRDDs (dagster.StringSource, optional)
\n

Spark UI: How many DAG graph nodes the Spark UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
enabled (dagster.StringSource, optional)
\n

Spark UI: Whether to run the web UI for the Spark application.

\n
\n
killEnabled (dagster.StringSource, optional)
\n

Spark UI: Allows jobs and stages to be killed from the web UI.

\n
\n
liveUpdate (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
period (dagster.StringSource, optional)
\n

Spark UI: How often to update live entities. -1 means \u201cnever update\u201d when replaying applications, meaning only the last write will happen. For live applications, this avoids a few operations that we can live without when rapidly processing incoming task events.

\n
\n
\n
\n
port (dagster.StringSource, optional)
\n

Spark UI: Port for your application\u2019s dashboard, which shows memory and workload data.

\n
\n
retainedJobs (dagster.StringSource, optional)
\n

Spark UI: How many jobs the Spark UI and status APIs remember before garbage collecting. This is a target maximum, and fewer elements may be retained in some circumstances.

\n
\n
retainedStages (dagster.StringSource, optional)
\n

Spark UI: How many stages the Spark UI and status APIs remember before garbage collecting. This is a target maximum, and fewer elements may be retained in some circumstances.

\n
\n
retainedTasks (dagster.StringSource, optional)
\n

Spark UI: How many tasks the Spark UI and status APIs remember before garbage collecting. This is a target maximum, and fewer elements may be retained in some circumstances.

\n
\n
reverseProxy (dagster.StringSource, optional)
\n

Spark UI: Enable running the Spark master as a reverse proxy for worker and application UIs. In this mode, the Spark master will reverse proxy the worker and application UIs to enable access without requiring direct access to their hosts. Use it with caution, as the worker and application UIs will not be accessible directly; you will only be able to access them through the Spark master/proxy public URL. This setting affects all the workers and application UIs running in the cluster and must be set on all the workers, drivers and masters.

\n
\n
reverseProxyUrl (dagster.StringSource, optional)
\n

Spark UI: This is the URL where your proxy is running, i.e. the proxy that runs in front of the Spark master. This is useful when running a proxy for authentication, e.g. an OAuth proxy. Make sure this is a complete URL including scheme (http/https) and port to reach your proxy.

\n
\n
showConsoleProgress (dagster.StringSource, optional)
\n

Spark UI: Show the progress bar in the console. The progress bar shows the progress of stages that run for longer than 500ms. If multiple stages run at the same time, multiple progress bars will be displayed on the same line.

\n
\n
retainedDeadExecutors (dagster.StringSource, optional)
\n

Spark UI: How many dead executors the Spark UI and status APIs remember before garbage collecting.

\n
\n
filters (dagster.StringSource, optional)
\n

Spark UI: Comma separated list of filter class names to apply to the Spark Web UI. The filter should be a standard javax servlet Filter. Filter parameters can also be specified in the configuration, by setting config entries of the form spark.<class name of filter>.param.<param name>=<value> For example: spark.ui.filters=com.test.filter1 spark.com.test.filter1.param.name1=foo spark.com.test.filter1.param.name2=bar

\n
\n
\n
\n
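For example, the UI retention knobs above could be tightened for a long-running application via the pyspark_resource; a minimal sketch with assumed values:
\n
from dagster_pyspark import pyspark_resource\n\n# Illustrative values only.\nui_tuned_pyspark = pyspark_resource.configured(\n    {\n        "spark_conf": {\n            "spark.ui.retainedJobs": "200",\n            "spark.ui.retainedStages": "200",\n            "spark.ui.retainedTasks": "10000",\n        }\n    }\n)\n
\n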
worker (permissive dict, optional)
\n
\nDefault Value:
{\n    "ui": {}\n}\n
\n
\n
\nConfig Schema:
\n
ui (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedExecutors (dagster.StringSource, optional)
\n

Spark UI: How many finished executors the Spark UI and status APIs remember before garbage collecting.

\n
\n
retainedDrivers (dagster.StringSource, optional)
\n

Spark UI: How many finished drivers the Spark UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
\n
\n
sql (permissive dict, optional)
\n
\nDefault Value:
{\n    "ui": {}\n}\n
\n
\n
\nConfig Schema:
\n
ui (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedExecutions (dagster.StringSource, optional)
\n

Spark UI: How many finished executions the Spark UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
\n
\n
streaming (permissive dict, optional)
\n
\nDefault Value:
{\n    "ui": {},\n    "backpressure": {},\n    "receiver": {\n        "writeAheadLog": {}\n    },\n    "kafka": {},\n    "driver": {\n        "writeAheadLog": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
ui (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedBatches (dagster.StringSource, optional)
\n

Spark Streaming: How many batches the Spark Streaming UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
backpressure (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional)
\n

Spark Streaming: Enables or disables Spark Streaming\u2019s internal backpressure mechanism (since 1.5). This enables the Spark Streaming to control the receiving rate based on the current batch scheduling delays and processing times so that the system receives only as fast as the system can process. Internally, this dynamically sets the maximum receiving rate of receivers. This rate is upper bounded by the values spark.streaming.receiver.maxRate and spark.streaming.kafka.maxRatePerPartition if they are set (see below).

\n
\n
initialRate (dagster.StringSource, optional)
\n

Spark Streaming: This is the initial maximum receiving rate at which each receiver will receive data for the first batch when the backpressure mechanism is enabled.

\n
\n
\n
\n
blockInterval (dagster.StringSource, optional)
\n

Spark Streaming: Interval at which data received by Spark Streaming receivers is chunked into blocks of data before storing them in Spark. Minimum recommended: 50 ms. See the performance tuning section in the Spark Streaming programming guide for more details.

\n
\n
receiver (permissive dict, optional)
\n
\nDefault Value:
{\n    "writeAheadLog": {}\n}\n
\n
\n
\nConfig Schema:
\n
maxRate (dagster.StringSource, optional)
\n

Spark Streaming: Maximum rate (number of records per second) at which each receiver will receive data. Effectively, each stream will consume at most this number of records per second. Setting this configuration to 0 or a negative number will put no limit on the rate. See the deployment guide in the Spark Streaming programming guide for more details.

\n
\n
writeAheadLog (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enable (dagster.StringSource, optional)
\n

Spark Streaming: Enable write-ahead logs for receivers. All the input data received through receivers will be saved to write-ahead logs that will allow it to be recovered after driver failures. See the deployment guide in the Spark Streaming programming guide for more details.

\n
\n
closeFileAfterWrite (dagster.StringSource, optional)
\n

Spark Streaming: Whether to close the file after writing a write-ahead log record on the receivers. Set this to \u2018true\u2019 when you want to use S3 (or any file system that does not support flushing) for the data WAL on the receivers.

\n
\n
\n
\n
\n
\n
unpersist (dagster.StringSource, optional)
\n

Spark Streaming: Force RDDs generated and persisted by Spark Streaming to be automatically unpersisted from Spark\u2019s memory. The raw input data received by Spark Streaming is also automatically cleared. Setting this to false will allow the raw data and persisted RDDs to be accessible outside the streaming application as they will not be cleared automatically. But it comes at the cost of higher memory usage in Spark.

\n
\n
stopGracefullyOnShutdown (dagster.StringSource, optional)
\n

Spark Streaming: If true, Spark shuts down the StreamingContext gracefully on JVM shutdown rather than immediately.

\n
\n
kafka (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxRatePerPartition (dagster.StringSource, optional)
\n

Spark Streaming: Maximum rate (number of records per second) at which data will be read from each Kafka partition when using the new Kafka direct stream API. See the Kafka Integration guide for more details.

\n
\n
minRatePerPartition (dagster.StringSource, optional)
\n

Spark Streaming: Minimum rate (number of records per second) at which data will be read from each Kafka partition when using the new Kafka direct stream API.

\n
\n
maxRetries (dagster.StringSource, optional)
\n

Spark Streaming: Maximum number of consecutive retries the driver will make in order to find the latest offsets on the leader of each partition (a default value of 1 means that the driver will make a maximum of 2 attempts). Only applies to the new Kafka direct stream API.

\n
\n
\n
\n
driver (permissive dict, optional)
\n
\nDefault Value:
{\n    "writeAheadLog": {}\n}\n
\n
\n
\nConfig Schema:
\n
writeAheadLog (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
closeFileAfterWrite (dagster.StringSource, optional)
\n

Spark Streaming: Whether to close the file after writing a write-ahead log record on the driver. Set this to \u2018true\u2019 when you want to use S3 (or any file system that does not support flushing) for the metadata WAL on the driver.

\n
\n
\n
\n
\n
\n
\n
\n
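A minimal sketch of passing the streaming options above through the pyspark_resource config (the values are assumptions for illustration):
\n
from dagster_pyspark import pyspark_resource\n\n# Illustrative values only.\nstreaming_tuned_pyspark = pyspark_resource.configured(\n    {\n        "spark_conf": {\n            "spark.streaming.backpressure.enabled": "true",\n            "spark.streaming.kafka.maxRatePerPartition": "1000",\n        }\n    }\n)\n
\n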
broadcast (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
compress (dagster.StringSource, optional)
\n

Compression and Serialization: Whether to compress broadcast variables before sending them. Generally a good idea. Compression will use spark.io.compression.codec.

\n
\n
blockSize (dagster.StringSource, optional)
\n

Execution Behavior: Size of each piece of a block for TorrentBroadcastFactory, in KiB unless otherwise specified. Too large a value decreases parallelism during broadcast (makes it slower); however, if it is too small, BlockManager might take a performance hit.

\n
\n
checksum (dagster.StringSource, optional)
\n

Execution Behavior: Whether to enable checksum for broadcast. If enabled, broadcasts will include a checksum, which can help detect corrupted blocks, at the cost of computing and sending a little more data. It\u2019s possible to disable it if the network has other mechanisms to guarantee data won\u2019t be corrupted during broadcast.

\n
\n
\n
\n
io (permissive dict, optional)
\n
\nDefault Value:
{\n    "compression": {\n        "lz4": {},\n        "snappy": {},\n        "zstd": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
compression (permissive dict, optional)
\n
\nDefault Value:
{\n    "lz4": {},\n    "snappy": {},\n    "zstd": {}\n}\n
\n
\n
\nConfig Schema:
\n
codec (dagster.StringSource, optional)
\n

Compression and Serialization: The codec used to compress internal data such as RDD partitions, event log, broadcast variables and shuffle outputs. By default, Spark provides four codecs: lz4, lzf, snappy, and zstd. You can also use fully qualified class names to specify the codec, e.g. org.apache.spark.io.LZ4CompressionCodec, org.apache.spark.io.LZFCompressionCodec, org.apache.spark.io.SnappyCompressionCodec, and org.apache.spark.io.ZStdCompressionCodec.

\n
\n
lz4 (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
blockSize (dagster.StringSource, optional)
\n

Compression and Serialization: Block size in bytes used in LZ4 compression, in the case when LZ4 compression codec is used. Lowering this block size will also lower shuffle memory usage when LZ4 is used.

\n
\n
\n
\n
snappy (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
blockSize (dagster.StringSource, optional)
\n

Compression and Serialization: Block size in bytes used in Snappy compression, in the case when Snappy compression codec is used. Lowering this block size will also lower shuffle memory usage when Snappy is used.

\n
\n
\n
\n
zstd (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
level (dagster.StringSource, optional)
\n

Compression and Serialization: Compression level for Zstd compression codec. Increasing the compression level will result in better compression at the expense of more CPU and memory.

\n
\n
bufferSize (dagster.StringSource, optional)
\n

Compression and Serialization: Buffer size in bytes used in Zstd compression, in the case when Zstd compression codec is used. Lowering this size will lower the shuffle memory usage when Zstd is used, but it might increase the compression cost because of excessive JNI call overhead.

\n
\n
\n
\n
\n
\n
\n
\n
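Switching the compression codec described above is just another spark_conf entry on the pyspark_resource; a minimal sketch with assumed values:
\n
from dagster_pyspark import pyspark_resource\n\n# Illustrative values only.\nzstd_pyspark = pyspark_resource.configured(\n    {\n        "spark_conf": {\n            "spark.io.compression.codec": "zstd",\n            "spark.io.compression.zstd.level": "3",\n        }\n    }\n)\n
\n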
kryo (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
classesToRegister (dagster.StringSource, optional)
\n

Compression and Serialization: If you use Kryo serialization, give a comma-separated list of custom class names to register with Kryo. See the tuning guide for more details.

\n
\n
referenceTracking (dagster.StringSource, optional)
\n

Compression and Serialization: Whether to track references to the same object when serializing data with Kryo, which is necessary if your object graphs have loops and useful for efficiency if they contain multiple copies of the same object. Can be disabled to improve performance if you know this is not the case.

\n
\n
registrationRequired (dagster.StringSource, optional)
\n

Compression and Serialization: Whether to require registration with Kryo. If set to \u2018true\u2019, Kryo will throw an exception if an unregistered class is serialized. If set to false (the default), Kryo will write unregistered class names along with each object. Writing class names can cause significant performance overhead, so enabling this option strictly enforces that a user has not omitted classes from registration.

\n
\n
registrator (dagster.StringSource, optional)
\n

Compression and Serialization: If you use Kryo serialization, give a comma-separated list of classes that register your custom classes with Kryo. This property is useful if you need to register your classes in a custom way, e.g. to specify a custom field serializer. Otherwise spark.kryo.classesToRegister is simpler. It should be set to classes that extend KryoRegistrator. See the tuning guide for more details.

\n
\n
unsafe (dagster.StringSource, optional)
\n

Compression and Serialization: Whether to use the unsafe-based Kryo serializer. It can be substantially faster by using unsafe-based IO.

\n
\n
\n
\n
kryoserializer (permissive dict, optional)
\n
\nDefault Value:
{\n    "buffer": {}\n}\n
\n
\n
\nConfig Schema:
\n
buffer (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional)
\n

Compression and Serialization: Initial size of Kryo\u2019s serialization buffer, in KiB unless otherwise specified. Note that there will be one buffer per core on each worker. This buffer will grow up to spark.kryoserializer.buffer.max if needed.

\n
\n
max (dagster.StringSource, optional)
\n

Compression and Serialization: Maximum allowable size of Kryo serialization buffer, in MiB unless otherwise specified. This must be larger than any object you attempt to serialize and must be less than 2048m. Increase this if you get a \u201cbuffer limit exceeded\u201d exception inside Kryo.

\n
\n
\n
\n
\n
\n
rdd (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
compress (dagster.StringSource, optional)
\n

Compression and Serialization: Whether to compress serialized RDD partitions (e.g. for StorageLevel.MEMORY_ONLY_SER in Java and Scala or StorageLevel.MEMORY_ONLY in Python). Can save substantial space at the cost of some extra CPU time. Compression will use spark.io.compression.codec.

\n
\n
\n
\n
serializer (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional)
\n

Compression and Serialization: Class to use for serializing objects that will be sent over the network or need to be cached in serialized form. The default of Java serialization works with any Serializable Java object but is quite slow, so we recommend using org.apache.spark.serializer.KryoSerializer and configuring Kryo serialization when speed is necessary. Can be any subclass of org.apache.spark.Serializer.

\n
\n
objectStreamReset (dagster.StringSource, optional)
\n

Compression and Serialization: When serializing using org.apache.spark.serializer.JavaSerializer, the serializer caches objects to prevent writing redundant data; however, that stops garbage collection of those objects. By calling \u2018reset\u2019 you flush that info from the serializer and allow old objects to be collected. To turn off this periodic reset, set it to -1. By default it will reset the serializer every 100 objects.

\n
\n
\n
\n
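A common combination of the serialization options above is enabling Kryo with a larger buffer; a minimal sketch via the pyspark_resource, with values assumed for illustration:
\n
from dagster_pyspark import pyspark_resource\n\n# Illustrative values only.\nkryo_pyspark = pyspark_resource.configured(\n    {\n        "spark_conf": {\n            "spark.serializer": "org.apache.spark.serializer.KryoSerializer",\n            "spark.kryoserializer.buffer.max": "128m",\n        }\n    }\n)\n
\n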
memory (permissive dict, optional)
\n
\nDefault Value:
{\n    "offHeap": {}\n}\n
\n
\n
\nConfig Schema:
\n
fraction (Float, optional)
\n

Memory Management: Fraction of (heap space - 300MB) used for execution and storage. The lower this is, the more frequently spills and cached data eviction occur. The purpose of this config is to set aside memory for internal metadata, user data structures, and imprecise size estimation in the case of sparse, unusually large records. Leaving this at the default value is recommended. For more detail, including important information about correctly tuning JVM garbage collection when increasing this value, see this description.

\n
\n
storageFraction (Float, optional)
\n

Memory Management: Amount of storage memory immune to eviction, expressed as a fraction of the size of the region set aside by spark.memory.fraction. The higher this is, the less working memory may be available to execution and tasks may spill to disk more often. Leaving this at the default value is recommended. For more detail, see this description.

\n
\n
offHeap (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (Bool, optional)
\n

Memory Management: If true, Spark will attempt to use off-heap memory for certain operations. If off-heap memory use is enabled, then spark.memory.offHeap.size must be positive.

\n
\n
size (dagster.IntSource, optional)
\n

Memory Management: The absolute amount of memory in bytes which can be used for off-heap allocation. This setting has no impact on heap memory usage, so if your executors\u2019 total memory consumption must fit within some hard limit then be sure to shrink your JVM heap size accordingly. This must be set to a positive value when spark.memory.offHeap.enabled=true.

\n
\n
\n
\n
useLegacyMode (Bool, optional)
\n

Memory Management: Whether to enable the legacy memory management mode used in Spark 1.5 and before. The legacy mode rigidly partitions the heap space into fixed-size regions, potentially leading to excessive spilling if the application was not tuned. The following deprecated memory fraction configurations are not read unless this is enabled: spark.shuffle.memoryFraction, spark.storage.memoryFraction, spark.storage.unrollFraction.

\n
\n
\n
\n
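As the descriptions above note, spark.memory.offHeap.size must be positive whenever spark.memory.offHeap.enabled is true; a minimal sketch setting both through the pyspark_resource (the size is an assumed value in bytes):
\n
from dagster_pyspark import pyspark_resource\n\n# Illustrative values only; the size is in bytes and must be positive when enabled.\noffheap_pyspark = pyspark_resource.configured(\n    {\n        "spark_conf": {\n            "spark.memory.offHeap.enabled": "true",\n            "spark.memory.offHeap.size": "2147483648",\n        }\n    }\n)\n
\n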
storage (permissive dict, optional)
\n
\nDefault Value:
{\n    "replication": {}\n}\n
\n
\n
\nConfig Schema:
\n
memoryFraction (Float, optional)
\n

Memory Management: (deprecated) This is read only if spark.memory.useLegacyMode is enabled. Fraction of Java heap to use for Spark\u2019s memory cache. This should not be larger than the \u201cold\u201d generation of objects in the JVM, which by default is given 0.6 of the heap, but you can increase it if you configure your own old generation size.

\n
\n
unrollFraction (Float, optional)
\n

Memory Management: (deprecated) This is read only if spark.memory.useLegacyMode is enabled. Fraction of spark.storage.memoryFraction to use for unrolling blocks in memory. This is dynamically allocated by dropping existing blocks when there is not enough free storage space to unroll the new block in its entirety.

\n
\n
replication (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
proactive (Bool, optional)
\n

Memory Management: Enables proactive block replication for RDD blocks. Cached RDD block replicas lost due to executor failures are replenished if there are any existing available replicas. This tries to get the replication level of the block to the initial number.

\n
\n
\n
\n
memoryMapThreshold (dagster.StringSource, optional)
\n

Execution Behavior: Size in bytes of a block above which Spark memory maps when reading a block from disk. This prevents Spark from memory mapping very small blocks. In general, memory mapping has high overhead for blocks close to or below the page size of the operating system.

\n
\n
\n
\n
cleaner (permissive dict, optional)
\n
\nDefault Value:
{\n    "periodicGC": {},\n    "referenceTracking": {\n        "blocking": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
periodicGC (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
interval (dagster.StringSource, optional)
\n

Memory Management: Controls how often to trigger a garbage collection. This context cleaner triggers cleanups only when weak references are garbage collected. In long-running applications with large driver JVMs, where there is little memory pressure on the driver, this may happen very occasionally or not at all. Not cleaning at all may lead to executors running out of disk space after a while.

\n
\n
\n
\n
referenceTracking (permissive dict, optional)
\n
\nDefault Value:
{\n    "blocking": {}\n}\n
\n
\n
\nConfig Schema:
\n
root (Bool, optional)
\n

Memory Management: Enables or disables context cleaning.

\n
\n
blocking (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (Bool, optional)
\n

Memory Management: Controls whether the cleaning thread should block on cleanup tasks (other than shuffle, which is controlled by spark.cleaner.referenceTracking.blocking.shuffle Spark property).

\n
\n
shuffle (Bool, optional)
\n

Memory Management: Controls whether the cleaning thread should block on shuffle cleanup tasks.

\n
\n
\n
\n
cleanCheckpoints (Bool, optional)
\n

Memory Management: Controls whether to clean checkpoint files if the reference is out of scope.

\n
\n
\n
\n
\n
\n
default (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
parallelism (dagster.IntSource, optional)
\n

Execution Behavior: Default number of partitions in RDDs returned by transformations like join, reduceByKey, and parallelize when not set by user.

\n
\n
\n
\n
hadoop (permissive dict, optional)
\n
\nDefault Value:
{\n    "mapreduce": {\n        "fileoutputcommitter": {\n            "algorithm": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
cloneConf (Bool, optional)
\n

Execution Behavior: If set to true, clones a new Hadoop Configuration object for each task. This option should be enabled to work around Configuration thread-safety issues (see SPARK-2546 for more details). This is disabled by default in order to avoid unexpected performance regressions for jobs that are not affected by these issues.

\n
\n
validateOutputSpecs (Bool, optional)
\n

Execution Behavior: If set to true, validates the output specification (e.g. checking if the output directory already exists) used in saveAsHadoopFile and other variants. This can be disabled to silence exceptions due to pre-existing output directories. We recommend that users do not disable this except if trying to achieve compatibility with previous versions of Spark. Simply use Hadoop\u2019s FileSystem API to delete output directories by hand. This setting is ignored for jobs generated through Spark Streaming\u2019s StreamingContext, since data may need to be rewritten to pre-existing output directories during checkpoint recovery.

\n
\n
mapreduce (permissive dict, optional)
\n
\nDefault Value:
{\n    "fileoutputcommitter": {\n        "algorithm": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
fileoutputcommitter (permissive dict, optional)
\n
\nDefault Value:
{\n    "algorithm": {}\n}\n
\n
\n
\nConfig Schema:
\n
algorithm (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
version (dagster.IntSource, optional)
\n

Execution Behavior: The file output committer algorithm version, valid algorithm version number: 1 or 2. Version 2 may have better performance, but version 1 may handle failures better in certain situations, as per MAPREDUCE-4815.

\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
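A minimal sketch of selecting the file output committer algorithm described above via the pyspark_resource (the choice of version 2 is an illustrative assumption, not a recommendation):
\n
from dagster_pyspark import pyspark_resource\n\n# Illustrative value only; see MAPREDUCE-4815 for the trade-offs between versions 1 and 2.\ncommitter_pyspark = pyspark_resource.configured(\n    {"spark_conf": {"spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version": "2"}}\n)\n
\n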
rpc (permissive dict, optional)
\n
\nDefault Value:
{\n    "message": {},\n    "retry": {}\n}\n
\n
\n
\nConfig Schema:
\n
message (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxSize (dagster.StringSource, optional)
\n

Networking: Maximum message size (in MB) to allow in \u201ccontrol plane\u201d communication; generally only applies to map output size information sent between executors and the driver. Increase this if you are running jobs with many thousands of map and reduce tasks and see messages about the RPC message size.

\n
\n
\n
\n
numRetries (dagster.StringSource, optional)
\n

Networking: Number of times to retry before an RPC task gives up. An RPC task will run at most this number of times.

\n
\n
retry (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
wait (dagster.StringSource, optional)
\n

Networking: Duration for an RPC ask operation to wait before retrying.

\n
\n
\n
\n
askTimeout (dagster.StringSource, optional)
\n

Networking: Duration for an RPC ask operation to wait before timing out.

\n
\n
lookupTimeout (dagster.StringSource, optional)
\n

Networking: Duration for an RPC remote endpoint lookup operation to wait before timing out.

\n
\n
\n
\n
blockManager (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
port (dagster.StringSource, optional)
\n

Networking: Port for all block managers to listen on. These exist on both the driver and the executors.

\n
\n
\n
\n
network (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
timeout (dagster.StringSource, optional)
\n

Networking: Default timeout for all network interactions. This config will be used in place of spark.core.connection.ack.wait.timeout, spark.storage.blockManagerSlaveTimeoutMs, spark.shuffle.io.connectionTimeout, spark.rpc.askTimeout or spark.rpc.lookupTimeout if they are not configured.

\n
\n
\n
\n
port (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxRetries (dagster.StringSource, optional)
\n

Networking: Maximum number of retries when binding to a port before giving up. When a port is given a specific value (non 0), each subsequent retry will increment the port used in the previous attempt by 1 before retrying. This essentially allows it to try a range of ports from the start port specified to port + maxRetries.

\n
\n
\n
\n
core (permissive dict, optional)
\n
\nDefault Value:
{\n    "connection": {\n        "ack": {\n            "wait": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
connection (permissive dict, optional)
\n
\nDefault Value:
{\n    "ack": {\n        "wait": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
ack (permissive dict, optional)
\n
\nDefault Value:
{\n    "wait": {}\n}\n
\n
\n
\nConfig Schema:
\n
wait (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
timeout (dagster.StringSource, optional)
\n

Networking: How long the connection waits for an ack to occur before timing out and giving up. To avoid unwanted timeouts caused by long pauses such as GC, you can set a larger value.

\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
cores (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
max (dagster.StringSource, optional)
\n

Scheduling: When running on a standalone deploy cluster or a Mesos cluster in \u201ccoarse-grained\u201d sharing mode, the maximum amount of CPU cores to request for the application from across the cluster (not from each machine). If not set, the default will be spark.deploy.defaultCores on Spark\u2019s standalone cluster manager, or infinite (all available cores) on Mesos.

\n
\n
\n
\n
locality (permissive dict, optional)
\n
\nDefault Value:
{\n    "wait": {}\n}\n
\n
\n
\nConfig Schema:
\n
wait (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional)
\n

Scheduling: How long to wait to launch a data-local task before giving up and launching it on a less-local node. The same wait will be used to step through multiple locality levels (process-local, node-local, rack-local and then any). It is also possible to customize the waiting time for each level by setting spark.locality.wait.node, etc. You should increase this setting if your tasks are long and see poor locality, but the default usually works well.

\n
\n
node (dagster.StringSource, optional)
\n

Scheduling: Customize the locality wait for node locality. For example, you can set this to 0 to skip node locality and search immediately for rack locality (if your cluster has rack information).

\n
\n
process (dagster.StringSource, optional)
\n

Scheduling: Customize the locality wait for process locality. This affects tasks that attempt to access cached data in a particular executor process.

\n
\n
rack (dagster.StringSource, optional)
\n

Scheduling: Customize the locality wait for rack locality.

\n
\n
\n
\n
\n
\n
scheduler (permissive dict, optional)
\n
\nDefault Value:
{\n    "revive": {},\n    "listenerbus": {\n        "eventqueue": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
maxRegisteredResourcesWaitingTime (dagster.StringSource, optional)
\n

Scheduling: Maximum amount of time to wait for resources to register before scheduling begins.

\n
\n
minRegisteredResourcesRatio (dagster.StringSource, optional)
\n

Scheduling: The minimum ratio of registered resources (registered resources / total expected resources) (resources are executors in yarn mode and Kubernetes mode, CPU cores in standalone mode and Mesos coarse-grained mode [\u2018spark.cores.max\u2019 value is total expected resources for Mesos coarse-grained mode] ) to wait for before scheduling begins. Specified as a double between 0.0 and 1.0. Regardless of whether the minimum ratio of resources has been reached, the maximum amount of time it will wait before scheduling begins is controlled by config spark.scheduler.maxRegisteredResourcesWaitingTime.

\n
\n
mode (dagster.StringSource, optional)
\n

Scheduling: The scheduling mode between jobs submitted to the same SparkContext. Can be set to FAIR to use fair sharing instead of queueing jobs one after another. Useful for multi-user services.

\n
\n
revive (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
interval (dagster.StringSource, optional)
\n

Scheduling: The interval length for the scheduler to revive the worker resource offers to run tasks.

\n
\n
\n
\n
listenerbus (permissive dict, optional)
\n
\nDefault Value:
{\n    "eventqueue": {}\n}\n
\n
\n
\nConfig Schema:
\n
eventqueue (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
capacity (dagster.StringSource, optional)
\n

Scheduling: Capacity for the event queue in the Spark listener bus; must be greater than 0. Consider increasing the value (e.g. to 20000) if listener events are dropped. Increasing this value may result in the driver using more memory.

\n
\n
\n
\n
\n
\n
\n
\n
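For example, fair scheduling between jobs in the same SparkContext can be requested through the same pyspark_resource mechanism; a minimal sketch with assumed values:
\n
from dagster_pyspark import pyspark_resource\n\n# Illustrative values only.\nfair_scheduling_pyspark = pyspark_resource.configured(\n    {\n        "spark_conf": {\n            "spark.scheduler.mode": "FAIR",\n            "spark.scheduler.listenerbus.eventqueue.capacity": "20000",\n        }\n    }\n)\n
\n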
blacklist (permissive dict, optional)
\n
\nDefault Value:
{\n    "task": {},\n    "stage": {},\n    "application": {\n        "fetchFailure": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional)
\n

Scheduling: If set to \u201ctrue\u201d, prevent Spark from scheduling tasks on executors that have been blacklisted due to too many task failures. The blacklisting algorithm can be further controlled by the other \u201cspark.blacklist\u201d configuration options.

\n
\n
timeout (dagster.StringSource, optional)
\n

Scheduling: (Experimental) How long a node or executor is blacklisted for the entire application, before it is unconditionally removed from the blacklist to attempt running new tasks.

\n
\n
task (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxTaskAttemptsPerExecutor (dagster.StringSource, optional)
\n

Scheduling: (Experimental) For a given task, how many times it can be retried on one executor before the executor is blacklisted for that task.

\n
\n
maxTaskAttemptsPerNode (dagster.StringSource, optional)
\n

Scheduling: (Experimental) For a given task, how many times it can be retried on one node, before the entire node is blacklisted for that task.

\n
\n
\n
\n
stage (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxFailedTasksPerExecutor (dagster.StringSource, optional)
\n

Scheduling: (Experimental) How many different tasks must fail on one executor, within one stage, before the executor is blacklisted for that stage.

\n
\n
maxFailedExecutorsPerNode (dagster.StringSource, optional)
\n

Scheduling: (Experimental) How many different executors must be marked as blacklisted for a given stage before the entire node is marked as failed for the stage.

\n
\n
\n
\n
application (permissive dict, optional)
\n
\nDefault Value:
{\n    "fetchFailure": {}\n}\n
\n
\n
\nConfig Schema:
\n
maxFailedTasksPerExecutor (dagster.StringSource, optional)
\n

Scheduling: (Experimental) How many different tasks must fail on one executor, in successful task sets, before the executor is blacklisted for the entire application. Blacklisted executors will be automatically added back to the pool of available resources after the timeout specified by spark.blacklist.timeout. Note that with dynamic allocation, though, the executors may get marked as idle and be reclaimed by the cluster manager.

\n
\n
maxFailedExecutorsPerNode (dagster.StringSource, optional)
\n

Scheduling: (Experimental) How many different executors must be blacklisted for the entire application, before the node is blacklisted for the entire application. Blacklisted nodes will be automatically added back to the pool of available resources after the timeout specified by spark.blacklist.timeout. Note that with dynamic allocation, though, the executors on the node may get marked as idle and be reclaimed by the cluster manager.

\n
\n
fetchFailure (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional)
\n

Scheduling: (Experimental) If set to \u201ctrue\u201d, Spark will blacklist the executor immediately when a fetch failure happens. If external shuffle service is enabled, then the whole node will be blacklisted.

\n
\n
\n
\n
\n
\n
killBlacklistedExecutors (dagster.StringSource, optional)
\n

Scheduling: (Experimental) If set to \u201ctrue\u201d, allow Spark to automatically kill the executors when they are blacklisted on fetch failure or blacklisted for the entire application, as controlled by spark.blacklist.application.*. Note that, when an entire node is added to the blacklist, all of the executors on that node will be killed.

\n
\n
\n
\n
speculation (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional)
\n

Scheduling: If set to \u201ctrue\u201d, performs speculative execution of tasks. This means if one or more tasks are running slowly in a stage, they will be re-launched.

\n
\n
interval (dagster.StringSource, optional)
\n

Scheduling: How often Spark will check for tasks to speculate.

\n
\n
multiplier (dagster.StringSource, optional)
\n

Scheduling: How many times slower a task is than the median to be considered for speculation.

\n
\n
quantile (dagster.StringSource, optional)
\n

Scheduling: Fraction of tasks which must be complete before speculation is enabled for a particular stage.

\n
\n
\n
\n
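A minimal sketch of turning on speculative execution as described above via the pyspark_resource (the thresholds are illustrative assumptions):
\n
from dagster_pyspark import pyspark_resource\n\n# Illustrative values only.\nspeculative_pyspark = pyspark_resource.configured(\n    {\n        "spark_conf": {\n            "spark.speculation": "true",\n            "spark.speculation.quantile": "0.9",\n        }\n    }\n)\n
\n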
task (permissive dict, optional)
\n
\nDefault Value:
{\n    "reaper": {}\n}\n
\n
\n
\nConfig Schema:
\n
cpus (dagster.StringSource, optional)
\n

Scheduling: Number of cores to allocate for each task.

\n
\n
maxFailures (dagster.StringSource, optional)
\n

Scheduling: Number of failures of any particular task before giving up on the job. The total number of failures spread across different tasks will not cause the job to fail; a particular task has to fail this number of attempts. Should be greater than or equal to 1. Number of allowed retries = this value - 1.

\n
\n
reaper (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional)
\n

Scheduling: Enables monitoring of killed / interrupted tasks. When set to true, any task which is killed will be monitored by the executor until that task actually finishes executing. See the other spark.task.reaper.* configurations for details on how to control the exact behavior of this monitoring. When set to false (the default), task killing will use an older code path which lacks such monitoring.

\n
\n
pollingInterval (dagster.StringSource, optional)
\n

Scheduling: When spark.task.reaper.enabled = true, this setting controls the frequency at which executors will poll the status of killed tasks. If a killed task is still running when polled then a warning will be logged and, by default, a thread-dump of the task will be logged (this thread dump can be disabled via the spark.task.reaper.threadDump setting, which is documented below).

\n
\n
threadDump (dagster.StringSource, optional)
\n

Scheduling: When spark.task.reaper.enabled = true, this setting controls whether task thread dumps are logged during periodic polling of killed tasks. Set this to false to disable collection of thread dumps.

\n
\n
killTimeout (dagster.StringSource, optional)
\n

Scheduling: When spark.task.reaper.enabled = true, this setting specifies a timeout after which the executor JVM will kill itself if a killed task has not stopped running. The default value, -1, disables this mechanism and prevents the executor from self-destructing. The purpose of this setting is to act as a safety-net to prevent runaway noncancellable tasks from rendering an executor unusable.

\n
\n
\n
\n
\n
\n
stage (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxConsecutiveAttempts (dagster.StringSource, optional)
\n

Scheduling: Number of consecutive stage attempts allowed before a stage is aborted.

\n
\n
\n
\n
dynamicAllocation (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional)
\n

Dynamic Allocation: Whether to use dynamic resource allocation, which scales the number of executors registered with this application up and down based on the workload. For more detail, see the description here. This requires spark.shuffle.service.enabled to be set. The following configurations are also relevant: spark.dynamicAllocation.minExecutors, spark.dynamicAllocation.maxExecutors, spark.dynamicAllocation.initialExecutors, and spark.dynamicAllocation.executorAllocationRatio.

\n
\n
executorIdleTimeout (dagster.StringSource, optional)
\n

Dynamic Allocation: If dynamic allocation is enabled and an executor has been idle for more than this duration, the executor will be removed. For more detail, see this description.

\n
\n
cachedExecutorIdleTimeout (dagster.StringSource, optional)
\n

Dynamic Allocation: If dynamic allocation is enabled and an executor which has cached data blocks has been idle for more than this duration, the executor will be removed. For more details, see this description.

\n
\n
initialExecutors (dagster.StringSource, optional)
\n

Dynamic Allocation: Initial number of executors to run if dynamic allocation is enabled. If --num-executors (or spark.executor.instances) is set and larger than this value, it will be used as the initial number of executors.

\n
\n
maxExecutors (dagster.StringSource, optional)
\n

Dynamic Allocation: Upper bound for the number of executors if dynamic allocation is enabled.

\n
\n
minExecutors (dagster.StringSource, optional)
\n

Dynamic Allocation: Lower bound for the number of executors if dynamic allocation is enabled.

\n
\n
executorAllocationRatio (dagster.StringSource, optional)
\n

Dynamic Allocation: By default, dynamic allocation will request enough executors to maximize the parallelism according to the number of tasks to process. While this minimizes the latency of the job, with small tasks this setting can waste a lot of resources due to executor allocation overhead, as some executors might not even do any work. This setting allows you to set a ratio that will be used to reduce the number of executors w.r.t. full parallelism. Defaults to 1.0 to give maximum parallelism. 0.5 will divide the target number of executors by 2. The target number of executors computed by dynamic allocation can still be overridden by the spark.dynamicAllocation.minExecutors and spark.dynamicAllocation.maxExecutors settings.

\n
\n
schedulerBacklogTimeout (dagster.StringSource, optional)
\n

Dynamic Allocation: If dynamic allocation is enabled and there have been pending tasks backlogged for more than this duration, new executors will be requested. For more detail, see this description.

\n
\n
sustainedSchedulerBacklogTimeout (dagster.StringSource, optional)
\n

Dynamic Allocation: Same as spark.dynamicAllocation.schedulerBacklogTimeout, but used only for subsequent executor requests. For more detail, see this description.

\n
\n
\n
\n
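Because spark.dynamicAllocation.enabled also requires spark.shuffle.service.enabled (as noted above), both are typically set together; a minimal sketch via the pyspark_resource, with assumed bounds:
\n
from dagster_pyspark import pyspark_resource\n\n# Illustrative values only.\ndynamic_allocation_pyspark = pyspark_resource.configured(\n    {\n        "spark_conf": {\n            "spark.dynamicAllocation.enabled": "true",\n            "spark.dynamicAllocation.minExecutors": "2",\n            "spark.dynamicAllocation.maxExecutors": "10",\n            "spark.shuffle.service.enabled": "true",\n        }\n    }\n)\n
\n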
r (permissive dict, optional)
\n
\nDefault Value:
{\n    "driver": {},\n    "shell": {}\n}\n
\n
\n
\nConfig Schema:
\n
numRBackendThreads (dagster.StringSource, optional)
\n

SparkR: Number of threads used by RBackend to handle RPC calls from SparkR package.

\n
\n
command (dagster.StringSource, optional)
\n

SparkR: Executable for executing R scripts in cluster modes for both driver and workers.

\n
\n
driver (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
command (dagster.StringSource, optional)
\n

SparkR: Executable for executing R scripts in client modes for driver. Ignored in cluster modes.

\n
\n
\n
\n
shell (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
command (dagster.StringSource, optional)
\n

SparkR: Executable for executing the sparkR shell in client modes for the driver. Ignored in cluster modes. It is the same as the environment variable SPARKR_DRIVER_R, but takes precedence over it. spark.r.shell.command is used for the sparkR shell while spark.r.driver.command is used for running R scripts.

\n
\n
\n
\n
backendConnectionTimeout (dagster.StringSource, optional)
\n

SparkR: Connection timeout set by R process on its connection to RBackend in seconds.

\n
\n
heartBeatInterval (dagster.StringSource, optional)
\n

SparkR: Interval for heartbeats sent from SparkR backend to R process to prevent connection timeout.

\n
\n
\n
\n
graphx (permissive dict, optional)
\n
\nDefault Value:
{\n    "pregel": {}\n}\n
\n
\n
\nConfig Schema:
\n
pregel (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
checkpointInterval (dagster.StringSource, optional)
\n

GraphX: Checkpoint interval for graph and message in Pregel. It is used to avoid StackOverflowError due to long lineage chains after many iterations. The checkpoint is disabled by default.

\n
\n
\n
\n
\n
\n
deploy (permissive dict, optional)
\n
\nDefault Value:
{\n    "zookeeper": {}\n}\n
\n
\n
\nConfig Schema:
\n
recoveryMode (dagster.StringSource, optional)
\n

Deploy: The recovery mode setting used to recover submitted Spark jobs in cluster mode when the driver fails and relaunches. This is only applicable for cluster mode when running with Standalone or Mesos.

\n
\n
zookeeper (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
url (dagster.StringSource, optional)
\n

Deploy: When spark.deploy.recoveryMode is set to ZOOKEEPER, this configuration is used to set the zookeeper URL to connect to.

\n
\n
dir (dagster.StringSource, optional)
\n

Deploy: When spark.deploy.recoveryMode is set to ZOOKEEPER, this configuration is used to set the zookeeper directory to store recovery state.

\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n

This resource provides access to a PySpark SparkSession for executing PySpark code within Dagster.

\n

Example:

\n
@op(required_resource_keys={"pyspark"})\ndef my_op(context):\n    spark_session = context.resources.pyspark.spark_session\n    dataframe = spark_session.read.json("examples/src/main/resources/people.json")\n\nmy_pyspark_resource = pyspark_resource.configured(\n    {"spark_conf": {"spark.executor.memory": "2g"}}\n)\n\n@job(resource_defs={"pyspark": my_pyspark_resource})\ndef my_spark_job():\n    my_op()\n
\n
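The same spark_conf values can also be supplied at launch time via run_config instead of .configured; a minimal sketch (the op and job names are assumptions for illustration):
\n
from dagster import job, op\nfrom dagster_pyspark import pyspark_resource\n\n\n@op(required_resource_keys={"pyspark"})\ndef count_people(context):\n    df = context.resources.pyspark.spark_session.read.json("examples/src/main/resources/people.json")\n    return df.count()\n\n\n@job(resource_defs={"pyspark": pyspark_resource})\ndef people_job():\n    count_people()\n\n\npeople_job.execute_in_process(\n    run_config={\n        "resources": {"pyspark": {"config": {"spark_conf": {"spark.executor.memory": "2g"}}}}\n    }\n)\n
\n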
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-pyspark", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "", "next": {"link": "../dagster-shell/", "title": "Shell (dagster-shell)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-prometheus/", "title": "Prometheus (dagster-prometheus)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-shell", "Shell (dagster-shell)", "N", "next"], ["sections/api/apidocs/libraries/dagster-prometheus", "Prometheus (dagster-prometheus)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-pyspark.rst.txt", "title": "Pyspark (dagster-pyspark)", "toc": "\n"}, "dagster-shell": {"alabaster_version": "0.7.12", "body": "
\n

Shell (dagster-shell)\u00b6

\n

The Dagster shell library provides op factories for executing inline shell scripts or script files.

\n
\n
\n

APIs\u00b6

\n
\n
\ndagster_shell.create_shell_command_op(shell_command, name, description=None, required_resource_keys=None, tags=None)[source]\u00b6
\n

This function is a factory that constructs ops to execute a shell command.

\n

Note that you can only use create_shell_command_op if you know the command you\u2019d like to execute\nat pipeline construction time. If you\u2019d like to construct shell commands dynamically during\npipeline execution and pass them between ops, you should use shell_op instead.

\n

Examples:

\n
# pylint: disable=no-value-for-parameter\nfrom dagster_shell import create_shell_command_op\n\nfrom dagster import graph\n\n\n@graph\ndef my_graph():\n    a = create_shell_command_op('echo "hello, world!"', name="a")\n    a()\n
\n
\n
\n
Parameters
\n
    \n
  • shell_command (str) \u2013 The shell command that the constructed op will execute.

  • \n
  • name (str) \u2013 The name of the constructed op.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of this op.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by this op.\nSetting this ensures that resource spin up for the required resources will occur before\nthe shell command is executed.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for the op. Frameworks may\nexpect and require certain metadata to be attached to a op. Users should generally\nnot set metadata directly. Values that are not strings will be json encoded and must meet\nthe criteria that json.loads(json.dumps(value)) == value.

  • \n
\n
\n
Raises
\n

Failure \u2013 Raised when the shell command returns a non-zero exit code.

\n
\n
Returns
\n

Returns the constructed op definition.

\n
\n
Return type
\n

OpDefinition

\n
\n
\n
\n\n
\n
\ndagster_shell.create_shell_script_op(shell_script_path, name='create_shell_script_op', input_defs=None, **kwargs)[source]\u00b6
\n

This function is a factory which constructs an op that will execute a shell command read\nfrom a script file.

\n

Any kwargs passed to this function will be passed along to the underlying @op decorator. However, note that overriding config or output_defs is not\nsupported.

\n

You might consider using @graph to wrap this op\nin the cases where you\u2019d like to configure the shell op with different config fields.

\n

Examples:

\n
# pylint: disable=no-value-for-parameter\nfrom dagster_shell import create_shell_script_op\n\nfrom dagster import file_relative_path, graph\n\n\n@graph\ndef my_graph():\n    a = create_shell_script_op(file_relative_path(__file__, "hello_world.sh"), name="a")\n    a()\n
\n
\n
\n
Parameters
\n
    \n
  • shell_script_path (str) \u2013 The script file to execute.

  • \n
  • name (str, optional) \u2013 The name of this op. Defaults to \u201ccreate_shell_script_op\u201d.

  • \n
  • input_defs (List[InputDefinition], optional) \u2013 input definitions for the op. Defaults to\na single Nothing input.

  • \n
\n
\n
Raises
\n

Failure \u2013 Raised when the shell command returns a non-zero exit code.

\n
\n
Returns
\n

Returns the constructed op definition.

\n
\n
Return type
\n

OpDefinition

\n
\n
\n
\n\n
\n
\ndagster_shell.shell_op(context, shell_command)\u00b6
\n
\n\n
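Since shell_op is itself an op rather than a factory, it can be wired into a graph and fed a command constructed at runtime, as suggested in the note on create_shell_command_op above. A minimal sketch; the op and graph names are assumptions for illustration:
\n
from dagster import graph, op\nfrom dagster_shell import shell_op\n\n\n@op\ndef build_command():\n    # Build the command string at runtime and pass it downstream to shell_op.\n    return 'echo "hello from a dynamically built command"'\n\n\n@graph\ndef echo_graph():\n    shell_op(build_command())\n
\n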
\n
\n

Legacy APIs\u00b6

\n
\n
\ndagster_shell.create_shell_command_solid(shell_command, name, description=None, required_resource_keys=None, tags=None)[source]\u00b6
\n

This function is a factory that constructs solids to execute a shell command.

\n

Note that you can only use create_shell_command_solid if you know the command you\u2019d like to execute\nat pipeline construction time. If you\u2019d like to construct shell commands dynamically during\npipeline execution and pass them between solids, you should use shell_solid instead.

\n

Examples:

\n
# pylint: disable=no-value-for-parameter\nfrom dagster_shell import create_shell_command_solid\n\nfrom dagster import pipeline\n\n\n@pipeline\ndef pipe():\n    a = create_shell_command_solid('echo "hello, world!"', name="a")\n    a()\n
\n
\n
\n
Parameters
\n
    \n
  • shell_command (str) \u2013 The shell command that the constructed solid will execute.

  • \n
  • name (str) \u2013 The name of the constructed solid.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of this solid.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by this solid.\nSetting this ensures that resource spin up for the required resources will occur before\nthe shell command is executed.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for the solid. Frameworks may\nexpect and require certain metadata to be attached to a solid. Users should generally\nnot set metadata directly. Values that are not strings will be json encoded and must meet\nthe criteria that json.loads(json.dumps(value)) == value.

  • \n
\n
\n
Raises
\n

Failure \u2013 Raised when the shell command returns a non-zero exit code.

\n
\n
Returns
\n

Returns the constructed solid definition.

\n
\n
Return type
\n

SolidDefinition

\n
\n
\n
\n\n
\n
\ndagster_shell.create_shell_script_solid(shell_script_path, name='create_shell_script_solid', input_defs=None, **kwargs)[source]\u00b6
\n

This function is a factory which constructs a solid that will execute a shell command read\nfrom a script file.

\n

Any kwargs passed to this function will be passed along to the underlying @solid decorator. However, note that overriding config or output_defs is not\nsupported.

\n

You might consider using @composite_solid to wrap this solid\nin the cases where you\u2019d like to configure the shell solid with different config fields.

\n

Examples:

\n
# pylint: disable=no-value-for-parameter\nfrom dagster_shell import create_shell_script_solid\n\nfrom dagster import file_relative_path, pipeline\n\n\n@pipeline\ndef pipe():\n    a = create_shell_script_solid(file_relative_path(__file__, "hello_world.sh"), name="a")\n    a()\n
\n
\n
\n
Parameters
\n
    \n
  • shell_script_path (str) \u2013 The script file to execute.

  • \n
  • name (str, optional) \u2013 The name of this solid. Defaults to \u201ccreate_shell_script_solid\u201d.

  • \n
  • input_defs (List[InputDefinition], optional) \u2013 input definitions for the solid. Defaults to\na single Nothing input.

  • \n
\n
\n
Raises
\n

Failure \u2013 Raised when the shell command returns a non-zero exit code.

\n
\n
Returns
\n

Returns the constructed solid definition.

\n
\n
Return type
\n

SolidDefinition

\n
\n
\n
\n\n
\n
\ndagster_shell.shell_solid(context, shell_command)\u00b6
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-shell", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../dagster-slack/", "title": "Slack (dagster-slack)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-pyspark/", "title": "Pyspark (dagster-pyspark)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-slack", "Slack (dagster-slack)", "N", "next"], ["sections/api/apidocs/libraries/dagster-pyspark", "Pyspark (dagster-pyspark)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-shell.rst.txt", "title": "Shell (dagster-shell)", "toc": "\n"}, "dagster-slack": {"alabaster_version": "0.7.12", "body": "
\n

Slack (dagster-slack)\u00b6

\n

\n
\n

\n
\n

This library provides an integration with Slack, to support posting messages in your company\u2019s Slack workspace.

\n
\n

\n
\n

Presently, it provides a thin wrapper on the Slack client API chat.postMessage.

\n
\n

\n
\n

To use this integration, you\u2019ll first need to create a Slack App for it.

\n
    \n
  1. Create App: Go to https://api.slack.com/apps and click \u201cCreate New App\u201d:

    \n

    \n
  2. \n
  3. Install App: After creating an app, on the left-hand side of the app configuration, click \u201cBot Users\u201d, and then create a bot user. Then, click \u201cInstall App\u201d on the left hand side, and finally \u201cInstall App to Workspace\u201d.

  4. \n
  5. Bot Token: Once finished, this will create a new bot token for your bot/workspace:

    \n

    \n
  6. \n
\n

Copy this bot token and put it somewhere safe; see Safely Storing Credentials for more on this topic.

\n
\n
\ndagster_slack.slack_resource ResourceDefinition[source]\u00b6
\n

This resource is for connecting to Slack.

\n

The resource object is a slack_sdk.WebClient.

\n

By configuring this Slack resource, you can post messages to Slack from any Dagster op:

\n

Examples:

\n
import os\n\nfrom dagster import job, op\nfrom dagster_slack import slack_resource\n\n\n@op(required_resource_keys={'slack'})\ndef slack_op(context):\n    context.resources.slack.chat_postMessage(channel='#noise', text=':wave: hey there!')\n\n@job(resource_defs={'slack': slack_resource})\ndef slack_job():\n    slack_op()\n\nslack_job.execute_in_process(\n    run_config={'resources': {'slack': {'config': {'token': os.getenv('SLACK_TOKEN')}}}}\n)\n
\n
\n
\n\n
\n
\ndagster_slack.slack_on_failure HookDefinition[source]\u00b6
\n

Create a hook on step failure events that will message the given Slack channel.

\n
\n
Parameters
\n
    \n
  • channel (str) \u2013 The channel to send the message to (e.g. \u201c#my_channel\u201d)

  • \n
  • message_fn (Optional(Callable[[HookContext], str])) \u2013 Function which takes in the HookContext and\noutputs the message you want to send.

  • \n
  • dagit_base_url \u2013 (Optional[str]): The base url of your Dagit instance. Specify this to allow\nmessages to include deeplinks to the specific pipeline run that triggered the hook.

  • \n
\n
\n
\n

Examples

\n
@slack_on_failure("#foo", dagit_base_url="http://localhost:3000")\n@job(...)\ndef my_job():\n    pass\n
\n
\n
def my_message_fn(context: HookContext) -> str:\n    return f"Op {context.op} failed!"\n\n@op\ndef an_op(context):\n    pass\n\n@job(...)\ndef my_job():\n    an_op.with_hooks(hook_defs={slack_on_failure("#foo", my_message_fn)})\n
\n
\n
\n\n
\n
\ndagster_slack.slack_on_success HookDefinition[source]\u00b6
\n

Create a hook on step success events that will message the given Slack channel.

\n
\n
Parameters
\n
    \n
  • channel (str) \u2013 The channel to send the message to (e.g. \u201c#my_channel\u201d)

  • \n
  • message_fn (Optional[Callable[[HookContext], str]]) \u2013 Function which takes in the HookContext and\noutputs the message you want to send.

  • \n
  • dagit_base_url \u2013 (Optional[str]): The base url of your Dagit instance. Specify this to allow\nmessages to include deeplinks to the specific pipeline run that triggered the hook.

  • \n
\n
\n
\n

Examples

\n
@slack_on_success("#foo", dagit_base_url="http://localhost:3000")\n@job(...)\ndef my_job():\n    pass\n
\n
\n
def my_message_fn(context: HookContext) -> str:\n    return f"Op {context.op} worked!"\n\n@op\ndef an_op(context):\n    pass\n\n@job(...)\ndef my_job():\n    an_op.with_hooks(hook_defs={slack_on_success("#foo", my_message_fn)})\n
\n
\n
\n\n
\n
\ndagster_slack.make_slack_on_run_failure_sensor(channel, slack_token, text_fn=<function _default_failure_message_text_fn>, blocks_fn=None, name=None, dagit_base_url=None, job_selection=None, default_status=<DefaultSensorStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Create a sensor on job failures that will message the given Slack channel.

\n
\n
Parameters
\n
    \n
  • channel (str) \u2013 The channel to send the message to (e.g. \u201c#my_channel\u201d)

  • \n
  • slack_token (str) \u2013 The slack token.\nTokens are typically either user tokens or bot tokens. More in the Slack API\ndocumentation here: https://api.slack.com/docs/token-types

  • \n
  • text_fn (Optional(Callable[[RunFailureSensorContext], str])) \u2013 Function which\ntakes in the RunFailureSensorContext and outputs the message you want to send.\nDefaults to a text message that contains error message, job name, and run ID.\nThe usage of the text_fn changes depending on whether you\u2019re using blocks_fn. If you\nare using blocks_fn, this is used as a fallback string to display in notifications. If\nyou aren\u2019t, this is the main body text of the message. It can be formatted as plain text,\nor with mrkdwn.\nSee more details in https://api.slack.com/methods/chat.postMessage#text_usage

  • \n
  • blocks_fn (Callable[[RunFailureSensorContext], List[Dict]]) \u2013 Function which takes in\nthe RunFailureSensorContext and outputs the message blocks you want to send.\nSee information about Blocks in https://api.slack.com/reference/block-kit/blocks

  • \n
  • name \u2013 (Optional[str]): The name of the sensor. Defaults to \u201cslack_on_run_failure\u201d.

  • \n
  • dagit_base_url \u2013 (Optional[str]): The base url of your Dagit instance. Specify this to allow\nmessages to include deeplinks to the failed job run.

  • \n
  • job_selection (Optional[List[Union[PipelineDefinition, GraphDefinition]]]) \u2013 The jobs that\nwill be monitored by this failure sensor. Defaults to None, which means the alert will\nbe sent when any job in the repository fails.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
\n
\n
\n

Examples

\n
slack_on_run_failure = make_slack_on_run_failure_sensor(\n    "#my_channel",\n    os.getenv("MY_SLACK_TOKEN")\n)\n\n@repository\ndef my_repo():\n    return [my_job + slack_on_run_failure]\n
\n
\n
def my_message_fn(context: RunFailureSensorContext) -> str:\n    return (\n        f"Job {context.pipeline_run.pipeline_name} failed! "\n        f"Error: {context.failure_event.message}"\n    )\n\nslack_on_run_failure = make_slack_on_run_failure_sensor(\n    channel="#my_channel",\n    slack_token=os.getenv("MY_SLACK_TOKEN"),\n    text_fn=my_message_fn,\n    dagit_base_url="http://mycoolsite.com",\n)\n
\n
\n
\n\n
\n
\ndagster_slack.make_slack_on_pipeline_failure_sensor(channel, slack_token, text_fn=<function _default_failure_message_text_fn>, blocks_fn=None, pipeline_selection=None, name=None, dagit_base_url=None, default_status=<DefaultSensorStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Create a sensor on pipeline failures that will message the given Slack channel.

\n
\n
Parameters
\n
    \n
  • channel (str) \u2013 The channel to send the message to (e.g. \u201c#my_channel\u201d)

  • \n
  • slack_token (str) \u2013 The slack token.\nTokens are typically either user tokens or bot tokens. More in the Slack API\ndocumentation here: https://api.slack.com/docs/token-types

  • \n
  • text_fn (Optional(Callable[[PipelineFailureSensorContext], str])) \u2013 Function which\ntakes in the PipelineFailureSensorContext and outputs the message you want to send.\nDefaults to a text message that contains error message, pipeline name, and run ID.\nThe usage of the text_fn changes depending on whether you\u2019re using blocks_fn. If you\nare using blocks_fn, this is used as a fallback string to display in notifications. If\nyou aren\u2019t, this is the main body text of the message. It can be formatted as plain text,\nor with mrkdwn.\nSee more details in https://api.slack.com/methods/chat.postMessage#text_usage

  • \n
  • blocks_fn (Callable[[PipelineFailureSensorContext], List[Dict]]) \u2013 Function which takes in\nthe PipelineFailureSensorContext and outputs the message blocks you want to send.\nSee information about Blocks in https://api.slack.com/reference/block-kit/blocks

  • \n
  • pipeline_selection (Optional[List[str]]) \u2013 Names of the pipelines that will be monitored by\nthis failure sensor. Defaults to None, which means the alert will be sent when any\npipeline in the repository fails.

  • \n
  • name \u2013 (Optional[str]): The name of the sensor. Defaults to \u201cslack_on_pipeline_failure\u201d.

  • \n
  • dagit_base_url \u2013 (Optional[str]): The base url of your Dagit instance. Specify this to allow\nmessages to include deeplinks to the failed pipeline run.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
\n
\n
\n

Examples

\n
slack_on_pipeline_failure = make_slack_on_pipeline_failure_sensor(\n    "#my_channel",\n    os.getenv("MY_SLACK_TOKEN")\n)\n\n@repository\ndef my_repo():\n    return [my_pipeline + slack_on_pipeline_failure]\n
\n
\n
def my_message_fn(context: PipelineFailureSensorContext) -> str:\n    return "Pipeline {pipeline_name} failed! Error: {error}".format(\n        pipeline_name=context.pipeline_run.pipeline_name,\n        error=context.failure_event.message,\n    )\n\nslack_on_pipeline_failure = make_slack_on_pipeline_failure_sensor(\n    channel="#my_channel",\n    slack_token=os.getenv("MY_SLACK_TOKEN"),\n    text_fn=my_message_fn,\n    dagit_base_url="http://mycoolsite.com",\n)\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-slack", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "", "next": {"link": "../dagster-snowflake/", "title": "Snowflake (dagster-snowflake)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-shell/", "title": "Shell (dagster-shell)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-snowflake", "Snowflake (dagster-snowflake)", "N", "next"], ["sections/api/apidocs/libraries/dagster-shell", "Shell (dagster-shell)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-slack.rst.txt", "title": "Slack (dagster-slack)", "toc": "\n"}, "dagster-snowflake": {"alabaster_version": "0.7.12", "body": "
\n

Snowflake (dagster-snowflake)\u00b6

\n

This library provides an integration with the Snowflake data\nwarehouse.

\n

It provides a snowflake_resource, which is a Dagster resource for configuring\nSnowflake connections and issuing queries, as well as a snowflake_op_for_query function for\nconstructing ops that execute Snowflake queries.

\n

To use this library, you should first ensure that you have an appropriate Snowflake user configured to access\nyour data warehouse.

\n
\n
\ndagster_snowflake.snowflake_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
account (dagster.StringSource, optional)
\n

Your Snowflake account name. For more details, see https://bit.ly/2FBL320.

\n
\n
user (dagster.StringSource)
\n

User login name.

\n
\n
password (dagster.StringSource)
\n

User password.

\n
\n
database (dagster.StringSource, optional)
\n

Name of the default database to use. After login, you can use USE DATABASE to change the database.

\n
\n
schema (dagster.StringSource, optional)
\n

Name of the default schema to use. After login, you can use USE SCHEMA to change the schema.

\n
\n
role (dagster.StringSource, optional)
\n

Name of the default role to use. After login, you can use USE ROLE to change the role.

\n
\n
warehouse (dagster.StringSource, optional)
\n

Name of the default warehouse to use. After login, you can use USE WAREHOUSE to change the warehouse.

\n
\n
autocommit (Bool, optional)
\n

None by default, which honors the Snowflake parameter AUTOCOMMIT. Set to True or False to enable or disable autocommit mode in the session, respectively.

\n
\n
client_prefetch_threads (dagster.IntSource, optional)
\n

Number of threads used to download the results sets (4 by default). Increasing the value improves fetch performance but requires more memory.

\n
\n
client_session_keep_alive (dagster.StringSource, optional)
\n

False by default. Set this to True to keep the session active indefinitely, even if there is no activity from the user. Make certain to call the close method to terminate the thread properly or the process may hang.

\n
\n
login_timeout (dagster.IntSource, optional)
\n

Timeout in seconds for login. By default, 60 seconds. The login request gives up after the timeout length if the HTTP response is \u201csuccess\u201d.

\n
\n
network_timeout (dagster.IntSource, optional)
\n

Timeout in seconds for all other operations. By default, none/infinite. A general request gives up after the timeout length if the HTTP response is not \u2018success\u2019.

\n
\n
ocsp_response_cache_filename (dagster.StringSource, optional)
\n

URI for the OCSP response cache file. By default, the OCSP response cache file is created in the cache directory.

\n
\n
validate_default_parameters (Bool, optional)
\n

False by default. If True, raise an exception if any of the specified database, schema, or warehouse does not exist.

\n
\n
paramstyle (dagster.StringSource, optional)
\n

pyformat by default for client side binding. Specify qmark or numeric to change bind variable formats for server side binding.

\n
\n
timezone (dagster.StringSource, optional)
\n

None by default, which honors the Snowflake parameter TIMEZONE. Set to a valid time zone (e.g. America/Los_Angeles) to set the session time zone.

\n
\n
connector (dagster.StringSource, optional)
\n

Indicates an alternative database connection engine. The only permissible option is \u2018sqlalchemy\u2019; otherwise the Snowflake Connector for Python is used by default.

\n
\n
cache_column_metadata (dagster.StringSource, optional)
\n

Optional parameter when connector is set to sqlalchemy. Snowflake SQLAlchemy takes a flag cache_column_metadata=True so that all column metadata for all tables is \u201ccached\u201d.

\n
\n
numpy (dagster.StringSource, optional)
\n

Optional parameter when connector is set to sqlalchemy. To enable fetching NumPy data types, add numpy=True to the connection parameters.

\n
\n
\n

A resource for connecting to the Snowflake data warehouse.

\n

A simple example of loading data into Snowflake and subsequently querying that data is shown below:

\n

Examples:

\n
from dagster import job, op\nfrom dagster_snowflake import snowflake_resource\n\n@op(required_resource_keys={'snowflake'})\ndef get_one(context):\n    context.resources.snowflake.execute_query('SELECT 1')\n\n@job(resource_defs={'snowflake': snowflake_resource})\ndef my_snowflake_job():\n    get_one()\n\nmy_snowflake_job.execute_in_process(\n    run_config={\n        'resources': {\n            'snowflake': {\n                'config': {\n                    'account': {'env': 'SNOWFLAKE_ACCOUNT'},\n                    'user': {'env': 'SNOWFLAKE_USER'},\n                    'password': {'env': 'SNOWFLAKE_PASSWORD'},\n                    'database': {'env': 'SNOWFLAKE_DATABASE'},\n                    'schema': {'env': 'SNOWFLAKE_SCHEMA'},\n                    'warehouse': {'env': 'SNOWFLAKE_WAREHOUSE'},\n                }\n            }\n        }\n    }\n)\n
\n
\n
\n\n
\n
\ndagster_snowflake.snowflake_op_for_query(sql, parameters=None)[source]\u00b6
\n

This function is an op factory that constructs an op to execute a snowflake query.

\n

Note that you can only use snowflake_op_for_query if you know the query you\u2019d like to\nexecute at graph construction time. If you\u2019d like to execute queries dynamically during\njob execution, you should manually execute those queries in your custom op using the\nsnowflake resource.

\n
\n
Parameters
\n
    \n
  • sql (str) \u2013 The sql query that will execute against the provided snowflake resource.

  • \n
  • parameters (dict) \u2013 The parameters for the sql query.

  • \n
\n
\n
Returns
\n

Returns the constructed op definition.

\n
\n
Return type
\n

OpDefinition

\n
\n
\n
\n\n
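
For example, a query known at graph construction time can be turned into an op and wired into a job alongside the snowflake_resource. The sketch below is illustrative only; the table name is a hypothetical placeholder.

\n
from dagster import job\nfrom dagster_snowflake import snowflake_op_for_query, snowflake_resource\n\n# Hypothetical table; the query string must be known when the graph is built.\ncount_rows = snowflake_op_for_query("SELECT COUNT(*) FROM my_table")\n\n@job(resource_defs={"snowflake": snowflake_resource})\ndef count_rows_job():\n    count_rows()\n
\n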
\n
\nclass dagster_snowflake.SnowflakeConnection(context)[source]\u00b6
\n
\n
\nexecute_queries(sql_queries, parameters=None, fetch_results=False)[source]\u00b6
\n
\n\n
\n
\nexecute_query(sql, parameters=None, fetch_results=False)[source]\u00b6
\n
\n\n
\n
\nget_connection(raw_conn=True)[source]\u00b6
\n
\n\n
\n
\nload_table_from_local_parquet(src, table)[source]\u00b6
\n
\n\n
\n\n
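
As a sketch of how these methods are reached from user code, an op that requires the snowflake resource can call execute_query directly; the query below is illustrative, and it is assumed here that fetch_results=True returns the fetched rows.

\n
from dagster import op\n\n@op(required_resource_keys={"snowflake"})\ndef read_sample(context):\n    # The snowflake resource object is a SnowflakeConnection; fetch_results=True\n    # is assumed to return the rows fetched by the query.\n    rows = context.resources.snowflake.execute_query(\n        "SELECT * FROM my_table LIMIT 10",\n        fetch_results=True,\n    )\n    context.log.info(f"fetched {len(rows)} rows")\n    return rows\n
\n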
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-snowflake", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "", "next": {"link": "../dagster-spark/", "title": "Spark (dagster-spark)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-slack/", "title": "Slack (dagster-slack)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-spark", "Spark (dagster-spark)", "N", "next"], ["sections/api/apidocs/libraries/dagster-slack", "Slack (dagster-slack)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-snowflake.rst.txt", "title": "Snowflake (dagster-snowflake)", "toc": "\n"}, "dagster-spark": {"alabaster_version": "0.7.12", "body": "
\n

Spark (dagster-spark)\u00b6

\n
\n
\nclass dagster_spark.SparkOpError[source]\u00b6
\n
\n\n
\n
\ndagster_spark.define_spark_config()[source]\u00b6
\n

Spark configuration.

\n
\n
See the Spark documentation for reference:

https://spark.apache.org/docs/latest/submitting-applications.html

\n
\n
\n
\n\n
\n
\ndagster_spark.create_spark_op(name, main_class, description=None, required_resource_keys=frozenset({'spark'}))[source]\u00b6
\n
\n\n
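
As a sketch, the factory can be used to build an op that is attached to a job together with spark_resource; the main class and job wiring below are hypothetical, and Spark run config (see define_spark_config) is still supplied at execution time.

\n
from dagster import job\nfrom dagster_spark import create_spark_op, spark_resource\n\n# Hypothetical Spark application entry point.\nrun_my_spark_app = create_spark_op(\n    name="run_my_spark_app",\n    main_class="com.example.MySparkApp",\n)\n\n@job(resource_defs={"spark": spark_resource})\ndef spark_job():\n    run_my_spark_app()\n
\n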
\n
\ndagster_spark.construct_spark_shell_command(application_jar, main_class, master_url=None, spark_conf=None, deploy_mode=None, application_arguments=None, spark_home=None)[source]\u00b6
\n

Constructs the spark-submit command for a Spark job.

\n
\n\n
\n
\ndagster_spark.spark_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-spark", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "", "next": {"link": "../dagster-ssh/", "title": "SSH / SFTP (dagster-ssh)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-snowflake/", "title": "Snowflake (dagster-snowflake)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-ssh", "SSH / SFTP (dagster-ssh)", "N", "next"], ["sections/api/apidocs/libraries/dagster-snowflake", "Snowflake (dagster-snowflake)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-spark.rst.txt", "title": "Spark (dagster-spark)", "toc": "\n"}, "dagster-ssh": {"alabaster_version": "0.7.12", "body": "
\n

SSH / SFTP (dagster-ssh)\u00b6

\n

This library provides an integration with SSH and SFTP.

\n
\n
\nclass dagster_ssh.SSHResource(remote_host, remote_port, username=None, password=None, key_file=None, key_string=None, timeout=10, keepalive_interval=30, compress=True, no_host_key_check=True, allow_host_key_change=False, logger=None)[source]\u00b6
\n

Resource for ssh remote execution using Paramiko.\nref: https://github.com/paramiko/paramiko

\n
\n\n
\n
\ndagster_ssh.ssh_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
remote_host (dagster.StringSource)
\n

remote host to connect to

\n
\n
remote_port (Int, optional)
\n

port of remote host to connect (Default is paramiko SSH_PORT)

\n

Default Value: 22

\n
\n
username (dagster.StringSource, optional)
\n

username to connect to the remote_host

\n
\n
password (dagster.StringSource, optional)
\n

password of the username to connect to the remote_host

\n
\n
key_file (dagster.StringSource, optional)
\n

key file to use to connect to the remote_host.

\n
\n
key_string (dagster.StringSource, optional)
\n

key string to use to connect to remote_host

\n
\n
timeout (Int, optional)
\n

timeout for the attempt to connect to the remote_host.

\n

Default Value: 10

\n
\n
keepalive_interval (Int, optional)
\n

send a keepalive packet to remote host every keepalive_interval seconds

\n

Default Value: 30

\n
\n
compress (Bool, optional)
\n

Default Value: True

\n
\n
no_host_key_check (Bool, optional)
\n

Default Value: True

\n
\n
allow_host_key_change (Bool, optional)
\n

Default Value: False

\n
\n
\n
\n\n
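
A minimal configuration sketch follows; the op body is a placeholder and the connection details are hypothetical, drawn from the config fields listed above.

\n
from dagster import job, op\nfrom dagster_ssh import ssh_resource\n\n@op(required_resource_keys={"ssh"})\ndef ssh_op(context):\n    # Use context.resources.ssh here, e.g. to move files over SFTP.\n    ...\n\n@job(resource_defs={"ssh": ssh_resource})\ndef ssh_job():\n    ssh_op()\n\n# Hypothetical connection details supplied via run config:\n# ssh_job.execute_in_process(\n#     run_config={"resources": {"ssh": {"config": {\n#         "remote_host": "example.com",\n#         "username": "me",\n#         "key_file": "/path/to/id_rsa",\n#     }}}}\n# )\n
\n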
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-ssh", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "", "next": {"link": "../dagster-twilio/", "title": "Twilio (dagster-twilio)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-spark/", "title": "Spark (dagster-spark)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-twilio", "Twilio (dagster-twilio)", "N", "next"], ["sections/api/apidocs/libraries/dagster-spark", "Spark (dagster-spark)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-ssh.rst.txt", "title": "SSH / SFTP (dagster-ssh)", "toc": "\n"}, "dagster-twilio": {"alabaster_version": "0.7.12", "body": "
\n

Twilio (dagster-twilio)\u00b6

\n

This library provides an integration with Twilio.

\n
\n
\ndagster_twilio.twilio_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
account_sid (dagster.StringSource)
\n

Twilio Account SID

\n
\n
auth_token (dagster.StringSource)
\n

Twilio Auth Token

\n
\n
\n
\n\n
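
A usage sketch follows; it assumes the resource object behaves like a Twilio REST client, and the phone numbers and message are placeholders.

\n
import os\n\nfrom dagster import job, op\nfrom dagster_twilio import twilio_resource\n\n@op(required_resource_keys={"twilio"})\ndef send_sms(context):\n    # Assumes the resource object behaves like a Twilio REST client.\n    context.resources.twilio.messages.create(\n        body="Job finished", from_="+15551234567", to="+15557654321"\n    )\n\n@job(resource_defs={"twilio": twilio_resource})\ndef twilio_job():\n    send_sms()\n\n# twilio_job.execute_in_process(\n#     run_config={"resources": {"twilio": {"config": {\n#         "account_sid": os.getenv("TWILIO_ACCOUNT_SID"),\n#         "auth_token": os.getenv("TWILIO_AUTH_TOKEN"),\n#     }}}}\n# )\n
\n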
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-twilio", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "", "next": {"link": "../dagstermill/", "title": "Dagstermill"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-ssh/", "title": "SSH / SFTP (dagster-ssh)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagstermill", "Dagstermill", "N", "next"], ["sections/api/apidocs/libraries/dagster-ssh", "SSH / SFTP (dagster-ssh)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-twilio.rst.txt", "title": "Twilio (dagster-twilio)", "toc": "\n"}, "dagstermill": {"alabaster_version": "0.7.12", "body": "
\n

Dagstermill\u00b6

\n
\n
\ndagstermill.define_dagstermill_solid(name, notebook_path, input_defs=None, output_defs=None, config_schema=None, required_resource_keys=None, output_notebook=None, output_notebook_name=None, asset_key_prefix=None, description=None, tags=None)[source]\u00b6
\n

Wrap a Jupyter notebook in a solid.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of the solid.

  • \n
  • notebook_path (str) \u2013 Path to the backing notebook.

  • \n
  • input_defs (Optional[List[InputDefinition]]) \u2013 The solid\u2019s inputs.

  • \n
  • output_defs (Optional[List[OutputDefinition]]) \u2013 The solid\u2019s outputs. Your notebook should\ncall yield_result() to yield each of these outputs.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 The string names of any required resources.

  • \n
  • output_notebook (Optional[str]) \u2013 If set, will be used as the name of an injected output of\ntype FileHandle that will point to the executed notebook (in\naddition to the AssetMaterialization that is always created). This\nrespects the FileManager configured on\nthe pipeline resources via the \u201cfile_manager\u201d resource key, so, e.g.,\nif s3_file_manager is configured, the output will be a :\npy:class:~dagster_aws.s3.S3FileHandle.

  • \n
  • output_notebook_name \u2013 (Optional[str]): If set, will be used as the name of an injected output\nof type of BufferedIOBase that is the file object of the executed\nnotebook (in addition to the AssetMaterialization that is always\ncreated). It allows the downstream solids to access the executed notebook via a file\nobject.

  • \n
  • asset_key_prefix (Optional[Union[List[str], str]]) \u2013 If set, will be used to prefix the\nasset keys for materialized notebooks.

  • \n
  • description (Optional[str]) \u2013 If set, description used for solid.

  • \n
  • tags (Optional[Dict[str, str]]) \u2013 If set, additional tags used to annotate solid.\nDagster uses the tag keys notebook_path and kind, which cannot be\noverwritten by the user.

  • \n
\n
\n
Returns
\n

SolidDefinition

\n
\n
\n
\n\n
\n
\ndagstermill.define_dagstermill_op(name, notebook_path, input_defs=None, output_defs=None, config_schema=None, required_resource_keys=None, output_notebook_name=None, asset_key_prefix=None, description=None, tags=None)[source]\u00b6
\n

Wrap a Jupyter notebook in an op.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of the solid.

  • \n
  • notebook_path (str) \u2013 Path to the backing notebook.

  • \n
  • input_defs (Optional[List[InputDefinition]]) \u2013 The solid\u2019s inputs.

  • \n
  • output_defs (Optional[List[OutputDefinition]]) \u2013 The solid\u2019s outputs. Your notebook should\ncall yield_result() to yield each of these outputs.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 The string names of any required resources.

  • \n
  • output_notebook_name \u2013 (Optional[str]): If set, will be used as the name of an injected output\nof type of BufferedIOBase that is the file object of the executed\nnotebook (in addition to the AssetMaterialization that is always\ncreated). It allows the downstream solids to access the executed notebook via a file\nobject.

  • \n
  • asset_key_prefix (Optional[Union[List[str], str]]) \u2013 If set, will be used to prefix the\nasset keys for materialized notebooks.

  • \n
  • description (Optional[str]) \u2013 If set, description used for solid.

  • \n
  • tags (Optional[Dict[str, str]]) \u2013 If set, additional tags used to annotate solid.\nDagster uses the tag keys notebook_path and kind, which cannot be\noverwritten by the user.

  • \n
\n
\n
Returns
\n

SolidDefinition

\n
\n
\n
\n\n
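
A wiring sketch is shown below; the notebook path is a placeholder, and the output_notebook_io_manager resource key is assumed as the convention for pairing with local_output_notebook_io_manager (documented below).

\n
from dagster import job\nfrom dagstermill import define_dagstermill_op, local_output_notebook_io_manager\n\n# Hypothetical notebook path.\nmy_notebook_op = define_dagstermill_op(\n    name="my_notebook_op",\n    notebook_path="notebooks/my_notebook.ipynb",\n    output_notebook_name="output_notebook",\n)\n\n@job(resource_defs={"output_notebook_io_manager": local_output_notebook_io_manager})\ndef notebook_job():\n    my_notebook_op()\n
\n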
\n
\ndagstermill.local_output_notebook_io_manager(init_context)[source]\u00b6
\n

Built-in IO Manager that handles output notebooks.

\n
\n\n
\n
\ndagstermill.get_context(solid_config=None, mode_def=None, run_config=None)\u00b6
\n

Get a dagstermill execution context for interactive exploration and development.

\n
\n
Parameters
\n
    \n
  • solid_config (Optional[Any]) \u2013 If specified, this value will be made available on the\ncontext as its solid_config property.

  • \n
  • mode_def (Optional[dagster.ModeDefinition]) \u2013 If specified, defines the mode to\nuse to construct the context. Specify this if you would like a context constructed\nwith specific resource_defs or logger_defs. By default, an ephemeral mode\nwith a console logger will be constructed.

  • \n
  • run_config (Optional[dict]) \u2013 The config dict with which to construct\nthe context.

  • \n
\n
\n
Returns
\n

DagstermillExecutionContext

\n
\n
\n
\n\n
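
For interactive development in a notebook, a context can be constructed directly; the config value in the sketch below is a placeholder.

\n
import dagstermill\n\ncontext = dagstermill.get_context(solid_config={"date": "2021-01-01"})\ncontext.log.info(f"run_id: {context.run_id}")\n
\n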
\n
\ndagstermill.yield_event(dagster_event)\u00b6
\n

Yield a dagster event directly from notebook code.

\n

When called interactively or in development, returns its input.

\n
\n
Parameters
\n

dagster_event (Union[dagster.AssetMaterialization, dagster.ExpectationResult, dagster.TypeCheck, dagster.Failure, dagster.RetryRequested]) \u2013 An event to yield back to Dagster.

\n
\n
\n
\n\n
\n
\ndagstermill.yield_result(value, output_name='result')\u00b6
\n

Yield a result directly from notebook code.

\n

When called interactively or in development, returns its input.

\n
\n
Parameters
\n
    \n
  • value (Any) \u2013 The value to yield.

  • \n
  • output_name (Optional[str]) \u2013 The name of the result to yield (default: 'result').

  • \n
\n
\n
\n
\n\n
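
A sketch of calling these functions from a notebook cell; the asset key and value are illustrative.

\n
import dagstermill\nfrom dagster import AssetMaterialization\n\n# Record a side effect and report an output back to Dagster from a notebook cell.\ndagstermill.yield_event(AssetMaterialization(asset_key="my_dataset"))\ndagstermill.yield_result(42, output_name="result")\n
\n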
\n
\nclass dagstermill.DagstermillExecutionContext(pipeline_context, pipeline_def, resource_keys_to_init, solid_name, solid_handle, solid_config=None)[source]\u00b6
\n

Dagstermill-specific execution context.

\n

Do not initialize directly: use dagstermill.get_context().

\n
\n
\nget_tag(key)[source]\u00b6
\n

Get a logging tag defined on the context.

\n
\n
Parameters
\n

key (str) \u2013 The key to get.

\n
\n
Returns
\n

str

\n
\n
\n
\n\n
\n
\nhas_tag(key)[source]\u00b6
\n

Check if a logging tag is defined on the context.

\n
\n
Parameters
\n

key (str) \u2013 The key to check.

\n
\n
Returns
\n

bool

\n
\n
\n
\n\n
\n
\nproperty log\u00b6
\n

The log manager for the context.

\n

Call, e.g., log.info() to log messages through the Dagster machinery.

\n
\n
Type
\n

dagster.DagsterLogManager

\n
\n
\n
\n\n
\n
\nproperty logging_tags\u00b6
\n

The logging tags for the context.

\n
\n
Type
\n

dict

\n
\n
\n
\n\n
\n
\nproperty pipeline_def\u00b6
\n

The pipeline definition for the context.

\n

This will be a dagstermill-specific shim.

\n
\n
Type
\n

dagster.PipelineDefinition

\n
\n
\n
\n\n
\n
\nproperty pipeline_run\u00b6
\n

The pipeline run for the context.

\n
\n
Type
\n

dagster.PipelineRun

\n
\n
\n
\n\n
\n
\nproperty resolved_run_config\u00b6
\n

The resolved_run_config for the context.

\n
\n
Type
\n

dagster.ResolvedRunConfig

\n
\n
\n
\n\n
\n
\nproperty resources\u00b6
\n

A dynamically-created type whose properties allow access to\nresources.

\n
\n
Type
\n

collections.namedtuple

\n
\n
\n
\n\n
\n
\nproperty run_config\u00b6
\n

The run_config for the context.

\n
\n
Type
\n

dict

\n
\n
\n
\n\n
\n
\nproperty run_id\u00b6
\n

The run_id for the context.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty solid\u00b6
\n

The solid for the context.

\n

In interactive contexts, this may be a dagstermill-specific shim, depending on whether a\nsolid definition was passed to dagstermill.get_context.

\n
\n
Type
\n

dagster.Node

\n
\n
\n
\n\n
\n
\nproperty solid_config\u00b6
\n

A dynamically-created type whose properties allow access to\nsolid-specific config.

\n
\n
Type
\n

collections.namedtuple

\n
\n
\n
\n\n
\n
\nproperty solid_def\u00b6
\n

The solid definition for the context.

\n

In interactive contexts, this may be a dagstermill-specific shim, depending on whether a\nsolid definition was passed to dagstermill.get_context.

\n
\n
Type
\n

dagster.SolidDefinition

\n
\n
\n
\n\n
\n\n
\n
\nclass dagstermill.DagstermillError[source]\u00b6
\n

Base class for errors raised by dagstermill.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagstermill", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "", "next": {"link": "../dagster-graphql/", "title": "GraphQL (dagster-graphql)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-twilio/", "title": "Twilio (dagster-twilio)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-graphql", "GraphQL (dagster-graphql)", "N", "next"], ["sections/api/apidocs/libraries/dagster-twilio", "Twilio (dagster-twilio)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagstermill.rst.txt", "title": "Dagstermill", "toc": "\n"}}, "loggers": {"alabaster_version": "0.7.12", "body": "
\n

Loggers\u00b6

\n
\n

Built-in loggers\u00b6

\n
\n
\ndagster.loggers.colored_console_logger(*args, **kwargs)\u00b6
\n

Core class for defining loggers.

\n

Loggers are job-scoped logging handlers, which will be automatically invoked whenever\ndagster messages are logged from within a job.

\n
\n
Parameters
\n
    \n
  • logger_fn (Callable[[InitLoggerContext], logging.Logger]) \u2013 User-provided function to\ninstantiate the logger. This logger will be automatically invoked whenever the methods\non context.log are called from within job/pipeline compute logic.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.logger_config. If not set, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this logger.

  • \n
\n
\n
\n
\n\n
\n
\ndagster.loggers.json_console_logger(*args, **kwargs)\u00b6
\n

Core class for defining loggers.

\n

Loggers are job-scoped logging handlers, which will be automatically invoked whenever\ndagster messages are logged from within a job.

\n
\n
Parameters
\n
    \n
  • logger_fn (Callable[[InitLoggerContext], logging.Logger]) \u2013 User-provided function to\ninstantiate the logger. This logger will be automatically invoked whenever the methods\non context.log are called from within job/pipeline compute logic.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.logger_config. If not set, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this logger.

  • \n
\n
\n
\n
\n\n
\n
\n

Logging from an @op\u00b6

\n
\n
\nclass dagster.DagsterLogManager(dagster_handler, level=0, managed_loggers=None)[source]\u00b6
\n

Centralized dispatch for logging from user code.

\n

Handles the construction of uniform structured log messages and passes them through to the\nunderlying loggers/handlers.

\n

An instance of the log manager is made available to ops as context.log. Users should not\ninitialize instances of the log manager directly. To configure custom loggers, set the\nlogger_defs argument in an @job decorator or when calling the to_job() method on a\nGraphDefinition.

\n

The log manager inherits standard convenience methods like those exposed by the Python standard\nlibrary logging module (i.e., within the body of an op,\ncontext.log.{debug, info, warning, warn, error, critical, fatal}).

\n

The underlying integer API can also be called directly using, e.g.\ncontext.log.log(5, msg), and the log manager will delegate to the log method\ndefined on each of the loggers it manages.

\n

User-defined custom log levels are not supported, and calls to, e.g.,\ncontext.log.trace or context.log.notice will result in hard exceptions at runtime.

\n
\n\n
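
For example, logging from within an op looks like the following minimal sketch:

\n
from dagster import op\n\n@op\ndef my_logging_op(context):\n    # context.log is a DagsterLogManager instance.\n    context.log.info("starting work")\n    context.log.warning("something looks odd")\n    return 1\n
\n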
\n
\n

Defining custom loggers\u00b6

\n
\n
\n@dagster.logger(config_schema=None, description=None)[source]\u00b6
\n

Define a logger.

\n

The decorated function should accept an InitLoggerContext and return an instance of\nlogging.Logger. This function will become the logger_fn of an underlying\nLoggerDefinition.

\n
\n
Parameters
\n
    \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.logger_config. If not set, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the logger.

  • \n
\n
\n
\n
\n\n
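
A sketch of a custom console logger defined with this decorator; the config schema and format string are illustrative.

\n
import logging\n\nfrom dagster import logger\n\n@logger(\n    config_schema={"log_level": str, "name": str},\n    description="A simple console logger.",\n)\ndef my_console_logger(init_context):\n    level = init_context.logger_config["log_level"]\n    name = init_context.logger_config["name"]\n\n    logger_ = logging.Logger(name, level=level)\n    handler = logging.StreamHandler()\n    handler.setFormatter(logging.Formatter("%(name)s - %(levelname)s - %(message)s"))\n    logger_.addHandler(handler)\n    return logger_\n
\n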
\n
\nclass dagster.LoggerDefinition(logger_fn, config_schema=None, description=None)[source]\u00b6
\n

Core class for defining loggers.

\n

Loggers are job-scoped logging handlers, which will be automatically invoked whenever\ndagster messages are logged from within a job.

\n
\n
Parameters
\n
    \n
  • logger_fn (Callable[[InitLoggerContext], logging.Logger]) \u2013 User-provided function to\ninstantiate the logger. This logger will be automatically invoked whenever the methods\non context.log are called from within job/pipeline compute logic.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.logger_config. If not set, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this logger.

  • \n
\n
\n
\n
\n
\nconfigured(config_or_config_fn, config_schema=None, description=None)\u00b6
\n

Wraps this object in an object of the same type that provides configuration to the inner\nobject.

\n
\n
Parameters
\n
    \n
  • config_or_config_fn (Union[Any, Callable[[Any], Any]]) \u2013 Either (1) Run configuration\nthat fully satisfies this object\u2019s config schema or (2) A function that accepts run\nconfiguration and returns run configuration that fully satisfies this object\u2019s\nconfig schema. In the latter case, config_schema must be specified. When\npassing a function, it\u2019s easiest to use configured().

  • \n
  • config_schema (ConfigSchema) \u2013 If config_or_config_fn is a function, the config schema\nthat its input must satisfy.

  • \n
  • description (Optional[str]) \u2013 Description of the new definition. If not specified,\ninherits the description of the definition being configured.

  • \n
\n
\n
\n

Returns (ConfigurableDefinition): A configured version of this object.

\n
\n\n
\n\n
\n
\nclass dagster.InitLoggerContext(logger_config, logger_def=None, pipeline_def=None, run_id=None)[source]\u00b6
\n

Logger-specific initialization context.

\n

An instance of this class is made available as the first argument to the logger_fn decorated\nby @logger or set on a LoggerDefinition.

\n

Users should not instantiate this class.

\n
\n
\nlogger_config\u00b6
\n

The configuration data provided by the run config. The\nschema for this data is defined by config_schema on the LoggerDefinition

\n
\n
Type
\n

Any

\n
\n
\n
\n\n
\n
\npipeline_def\u00b6
\n

The pipeline/job definition currently being executed.

\n
\n
Type
\n

Optional[PipelineDefinition]

\n
\n
\n
\n\n
\n
\nlogger_def\u00b6
\n

The logger definition for the logger being constructed.

\n
\n
Type
\n

Optional[LoggerDefinition]

\n
\n
\n
\n\n
\n
\nrun_id\u00b6
\n

The ID for this run of the pipeline.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n\n
\n
\ndagster.build_init_logger_context(logger_config=None, pipeline_def=None, job_def=None)[source]\u00b6
\n

Builds logger initialization context from provided parameters.

\n

This function can be used to provide the context argument to the invocation of a logger\ndefinition.

\n

Note that you may only specify one of pipeline_def and job_def.

\n
\n
Parameters
\n
    \n
  • logger_config (Any) \u2013 The config to provide during initialization of logger.

  • \n
  • pipeline_def (Optional[PipelineDefinition]) \u2013 The pipeline definition that the logger will be\nused with.

  • \n
  • job_def (Optional[JobDefinition]) \u2013 The job definition that the logger will be used with.

  • \n
\n
\n
\n

Examples

\n
context = build_init_logger_context()\nlogger_to_init(context)\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/loggers", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../modes/", "title": "[Legacy] Modes"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../jobs/", "title": "Jobs"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/modes", "[Legacy] Modes", "N", "next"], ["sections/api/apidocs/jobs", "Jobs", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/loggers.rst.txt", "title": "Loggers", "toc": "\n"}, "memoization": {"alabaster_version": "0.7.12", "body": "
\n

Versioning and Memoization\u00b6

\n

Dagster allows for code versioning and memoization of previous outputs based upon that versioning.\nListed here are APIs related to versioning and memoization.

\n
\n

Versioning\u00b6

\n
\n
\nclass dagster.VersionStrategy[source]\u00b6
\n

Abstract class for defining a strategy to version solids and resources.

\n

When subclassing, get_solid_version must be implemented, and get_resource_version can be\noptionally implemented.

\n

get_solid_version should ingest a SolidVersionContext, and get_resource_version should ingest a\nResourceVersionContext. From that, each synthesizes a unique string called a version, which will\nbe tagged to outputs of that solid in the pipeline. Providing a VersionStrategy instance to a\njob will enable memoization on that job, such that only steps whose outputs do not have an\nup-to-date version will run.

\n
\n\n
\n
\nclass dagster.SourceHashVersionStrategy[source]\u00b6
\n
\n\n
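
For example, a job can opt into memoization by attaching the built-in strategy; this is a sketch, and the job's IO managers must also support memoization (see below).

\n
from dagster import SourceHashVersionStrategy, job, op\n\n@op\ndef my_op():\n    return 1\n\n# Attaching a VersionStrategy enables memoization on the job: outputs are only\n# recomputed when their version (here, a hash of the op's source) changes.\n@job(version_strategy=SourceHashVersionStrategy())\ndef memoized_job():\n    my_op()\n
\n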
\n
\n

Memoization\u00b6

\n
\n
\nclass dagster.MemoizableIOManager[source]\u00b6
\n

Base class for IO managers enabled to work with memoized execution. Users should implement\nthe load_input and handle_output methods described in the IOManager API, and the\nhas_output method, which returns a boolean representing whether a data object can be found.

\n
\n
\nabstract has_output(context)[source]\u00b6
\n

The user-defined method that returns whether data exists given the metadata.

\n
\n
Parameters
\n

context (OutputContext) \u2013 The context of the step performing this check.

\n
\n
Returns
\n

True if there is data present that matches the provided context. False otherwise.

\n
\n
Return type
\n

bool

\n
\n
\n
\n\n
\n\n

See also: dagster.IOManager.

\n
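
A filesystem-backed sketch of a MemoizableIOManager is shown below; the path scheme and base directory are illustrative only.

\n
import os\nimport pickle\n\nfrom dagster import MemoizableIOManager\n\nclass LocalFileMemoizableIOManager(MemoizableIOManager):\n    def __init__(self, base_dir="/tmp/dagster_memoized"):\n        self.base_dir = base_dir\n\n    def _path(self, context):\n        # One file per step output, keyed by step key and output name.\n        return os.path.join(self.base_dir, context.step_key, context.name)\n\n    def handle_output(self, context, obj):\n        path = self._path(context)\n        os.makedirs(os.path.dirname(path), exist_ok=True)\n        with open(path, "wb") as f:\n            pickle.dump(obj, f)\n\n    def load_input(self, context):\n        with open(self._path(context.upstream_output), "rb") as f:\n            return pickle.load(f)\n\n    def has_output(self, context):\n        # True if a previously materialized object exists for this output.\n        return os.path.exists(self._path(context))\n
\n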
\n
\ndagster.MEMOIZED_RUN_TAG\u00b6
\n

Provide this tag to a run to toggle memoization on or off. {MEMOIZED_RUN_TAG: "true"} toggles memoization on, while {MEMOIZED_RUN_TAG: "false"} toggles memoization off.

\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/memoization", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../libraries/dagster-airbyte/", "title": "Airbyte (dagster-airbyte)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../utilities/", "title": "Utilities"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-airbyte", "Airbyte (dagster-airbyte)", "N", "next"], ["sections/api/apidocs/utilities", "Utilities", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/memoization.rst.txt", "title": "Versioning and Memoization", "toc": "\n"}, "modes": {"alabaster_version": "0.7.12", "body": "
\n

[Legacy] Modes\u00b6

\n

Modes are only used in the creation of PipelineDefinition objects, which are now\ndeprecated in favor of JobDefinition.

\n
\n
\nclass dagster.ModeDefinition(name=None, resource_defs=None, logger_defs=None, executor_defs=None, description=None, _config_mapping=None, _partitioned_config=None)[source]\u00b6
\n

Define a mode in which a pipeline can operate.

\n

A mode provides pipelines with a set of resource implementations, loggers, system storages,\nand executors.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name of the mode. Must be unique within the\nPipelineDefinition to which the mode is attached. (default: \u201cdefault\u201d).

  • \n
  • resource_defs (Optional[Dict[str, ResourceDefinition]]) \u2013 A dictionary of string resource\nkeys to their implementations. Individual solids may require resources to be present by\nthese keys.

  • \n
  • logger_defs (Optional[Dict[str, LoggerDefinition]]) \u2013 A dictionary of string logger\nidentifiers to their implementations.

  • \n
  • executor_defs (Optional[List[ExecutorDefinition]]) \u2013 The set of executors available when\nexecuting in this mode. By default, this will be the \u2018in_process\u2019 and \u2018multiprocess\u2019\nexecutors (default_executors).

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the mode.

  • \n
  • _config_mapping (Optional[ConfigMapping]) \u2013 Only for internal use.

  • \n
  • _partitioned_config (Optional[PartitionedConfig]) \u2013 Only for internal use.

  • \n
\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/modes", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "", "next": {"link": "../ops/", "title": "Ops"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../loggers/", "title": "Loggers"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/ops", "Ops", "N", "next"], ["sections/api/apidocs/loggers", "Loggers", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/modes.rst.txt", "title": "[Legacy] Modes", "toc": "\n"}, "ops": {"alabaster_version": "0.7.12", "body": "
\n

Ops\u00b6

\n

The foundational unit of computation in Dagster.

\n
\n
\n

Defining ops\u00b6

\n
\n
\n@dagster.op(name=None, description=None, ins=None, out=None, config_schema=None, required_resource_keys=None, tags=None, version=None, retry_policy=None, input_defs=None, output_defs=None)[source]\u00b6
\n

Create an op with the specified parameters from the decorated function.

\n

Ins and outs will be inferred from the type signature of the decorated function\nif not explicitly provided.

\n

The decorated function will be used as the op\u2019s compute function. The signature of the\ndecorated function is more flexible than that of the compute_fn in the core API; it may:

\n
    \n
  1. Return a value. This value will be wrapped in an Output and yielded by the compute function.

  2. Return an Output. This output will be yielded by the compute function.

  3. Yield Output or other event objects. Same as default compute behavior.

\n

Note that options 1) and 2) are incompatible with yielding other events \u2013 if you would like\nto decorate a function that yields events, it must also wrap its eventual output in an\nOutput and yield it.

\n

@op supports async def functions as well, including async generators when yielding multiple\nevents or outputs. Note that async ops will generally be run on their own unless using a custom\nExecutor implementation that supports running them together.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 Name of op. Must be unique within any GraphDefinition\nusing the op.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of this op. If not provided, and\nthe decorated function has docstring, that docstring will be used as the description.

  • \n
  • ins (Optional[Dict[str, In]]) \u2013 Information about the inputs to the op. Information provided here will be combined\nwith what can be inferred from the function signature.

  • \n
  • out (Optional[Union[Out, Dict[str, Out]]]) \u2013 Information about the op outputs. Information provided here will be combined with\nwhat can be inferred from the return type signature if the function does not use yield.

  • \n
  • config_schema (Optional[ConfigSchema) \u2013 The schema for the config. If set, Dagster will check\nthat config provided for the op matches this schema and fail if it does not. If not\nset, Dagster will accept any config provided for the op.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by this op.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for the op. Frameworks may\nexpect and require certain metadata to be attached to an op. Values that are not strings\nwill be json encoded and must meet the criteria that json.loads(json.dumps(value)) == value.

  • \n
  • version (Optional[str]) \u2013 (Experimental) The version of the op\u2019s compute_fn. Two ops should have\nthe same version if and only if they deterministically produce the same outputs when\nprovided the same inputs.

  • \n
  • retry_policy (Optional[RetryPolicy]) \u2013 The retry policy for this op.

  • \n
  • input_defs (Optional[List[InputDefinition]]) \u2013 (legacy) Preserved to ease migration from solid. Can be used in place of ins argument.

  • \n
  • output_defs (Optional[List[OutputDefinition]]) \u2013 (legacy) Preserved to ease migration from solid. Can be used in place of out argument.

  • \n
\n
\n
\n

Examples

\n
@op\ndef hello_world():\n    print('hello')\n\n@op\ndef echo(msg: str) -> str:\n    return msg\n\n@op(\n    ins={'msg': In(str)},\n    out=Out(str)\n)\ndef echo_2(msg): # same as above\n    return msg\n\n@op(\n    out={'word': Out(), 'num': Out()}\n)\ndef multi_out() -> Tuple[str, int]:\n    return 'cool', 4\n
\n
\n
\n\n
\n
\nclass dagster.OpDefinition(name, input_defs, compute_fn, output_defs, config_schema=None, description=None, tags=None, required_resource_keys=None, version=None, retry_policy=None)[source]\u00b6
\n

Defines an op, the functional unit of user-defined computation.

\n

For more details on what an op is, refer to the\nOps Overview.

\n

End users should prefer the @op decorator. OpDefinition is generally intended to be\nused by framework authors or for programmatically generated ops.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name of the op. Must be unique within any GraphDefinition or\nJobDefinition that contains the op.

  • \n
  • input_defs (List[InputDefinition]) \u2013 Inputs of the op.

  • \n
  • compute_fn (Callable) \u2013

    The core of the op, the function that performs the actual\ncomputation. The signature of this function is determined by input_defs, and\noptionally, an injected first argument, context, a collection of information\nprovided by the system.

    \n

    This function will be coerced into a generator or an async generator, which must yield\none Output for each of the op\u2019s output_defs, and additionally may\nyield other types of Dagster events, including AssetMaterialization and\nExpectationResult.

    \n

  • \n
  • output_defs (List[OutputDefinition]) \u2013 Outputs of the op.

  • \n
  • config_schema (Optional[ConfigSchema) \u2013 The schema for the config. If set, Dagster will check\nthat the config provided for the op matches this schema and will fail if it does not. If\nnot set, Dagster will accept any config provided for the op.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of the op.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for the op. Frameworks may\nexpect and require certain metadata to be attached to an op. Users should generally\nnot set metadata directly. Values that are not strings will be json encoded and must meet\nthe criteria that json.loads(json.dumps(value)) == value.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by this op.

  • \n
  • version (Optional[str]) \u2013 (Experimental) The version of the op\u2019s compute_fn. Two ops should\nhave the same version if and only if they deterministically produce the same outputs\nwhen provided the same inputs.

  • \n
  • retry_policy (Optional[RetryPolicy]) \u2013 The retry policy for this op.

  • \n
\n
\n
\n

Examples

\n
def _add_one(_context, inputs):\n    yield Output(inputs["num"] + 1)\n\nOpDefinition(\n    name="add_one",\n    input_defs=[InputDefinition("num", Int)],\n    output_defs=[OutputDefinition(Int)], # default name ("result")\n    compute_fn=_add_one,\n)\n
\n
\n
\n
\nconfigured(config_or_config_fn, name, config_schema=None, description=None)\u00b6
\n

Wraps this object in an object of the same type that provides configuration to the inner\nobject.

\n
\n
Parameters
\n
    \n
  • config_or_config_fn (Union[Any, Callable[[Any], Any]]) \u2013 Either (1) Run configuration\nthat fully satisfies this object\u2019s config schema or (2) A function that accepts run\nconfiguration and returns run configuration that fully satisfies this object\u2019s\nconfig schema. In the latter case, config_schema must be specified. When\npassing a function, it\u2019s easiest to use configured().

  • \n
  • name (str) \u2013 Name of the new definition. This is a required argument, as this definition\ntype has a name uniqueness constraint.

  • \n
  • config_schema (ConfigSchema) \u2013 If config_or_config_fn is a function, the config schema\nthat its input must satisfy.

  • \n
  • description (Optional[str]) \u2013 Description of the new definition. If not specified,\ninherits the description of the definition being configured.

  • \n
\n
\n
\n

Returns (ConfigurableDefinition): A configured version of this object.

\n
\n\n
\n\n
\n
\n
\n

Ins & outs\u00b6

\n
\n
\nclass dagster.In(dagster_type=<class 'dagster.core.definitions.utils.NoValueSentinel'>, description=None, default_value=<class 'dagster.core.definitions.utils.NoValueSentinel'>, root_manager_key=None, metadata=None, asset_key=None, asset_partitions=None)[source]\u00b6
\n

Defines an argument to an op\u2019s compute function.

\n

Inputs may flow from previous ops\u2019 outputs, or be stubbed using config. They may optionally\nbe typed using the Dagster type system.

\n
\n
Parameters
\n
    \n
  • dagster_type (Optional[Union[Type, DagsterType]]]) \u2013 The type of this input. Should only be set if the correct type can not\nbe inferred directly from the type signature of the decorated function.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of the input.

  • \n
  • default_value (Optional[Any]) \u2013 The default value to use if no input is provided.

  • \n
  • root_manager_key (Optional[str]) \u2013 (Experimental) The resource key for the\nRootInputManager used for loading this input when it is not connected to an\nupstream output.

  • \n
  • metadata (Optional[Dict[str, Any]]) \u2013 A dict of metadata for the input.

  • \n
  • asset_key (Optional[Union[AssetKey, InputContext -> AssetKey]]) \u2013 (Experimental) An AssetKey\n(or function that produces an AssetKey from the InputContext) which should be associated\nwith this In. Used for tracking lineage information through Dagster.

  • \n
  • asset_partitions (Optional[Union[Set[str], InputContext -> Set[str]]]) \u2013 (Experimental) A\nset of partitions of the given asset_key (or a function that produces this list of\npartitions from the InputContext) which should be associated with this In.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.Out(dagster_type=<class 'dagster.core.definitions.utils.NoValueSentinel'>, description=None, is_required=True, io_manager_key=None, metadata=None, asset_key=None, asset_partitions=None, asset_partitions_def=None)[source]\u00b6
\n

Defines an output from an op\u2019s compute function.

\n

Ops can have multiple outputs, in which case outputs cannot be anonymous.

\n

Many ops have only one output, in which case the user can provide a single output definition\nthat will be given the default name, \u201cresult\u201d.

\n

Outs may be typed using the Dagster type system.

\n
\n
Parameters
\n
    \n
  • dagster_type (Optional[Union[Type, DagsterType]]]) \u2013 The type of this output. Should only be set if the correct type can not\nbe inferred directly from the type signature of the decorated function.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of the output.

  • \n
  • is_required (bool) \u2013 Whether the presence of this field is required. (default: True)

  • \n
  • io_manager_key (Optional[str]) \u2013 The resource key of the output manager used for this output.\n(default: \u201cio_manager\u201d).

  • \n
  • metadata (Optional[Dict[str, Any]]) \u2013 A dict of the metadata for the output.\nFor example, users can provide a file path if the data object will be stored in a\nfilesystem, or provide information of a database table when it is going to load the data\ninto the table.

  • \n
  • asset_key (Optional[AssetKey]) \u2013 (Experimental) An AssetKey which should be associated\nwith this Out. Used for tracking lineage information through Dagster.

  • \n
  • asset_partitions (Optional[Union[Set[str], OutputContext -> Set[str]]]) \u2013 (Experimental) A\nset of partitions of the given asset_key (or a function that produces this list of\npartitions from the OutputContext) which should be associated with this Out.

  • \n
\n
\n
\n
\n\n
\n
\n
\n

Execution\u00b6

\n
\n
\nclass dagster.RetryPolicy(max_retries=1, delay=None, backoff=None, jitter=None)[source]\u00b6
\n

A declarative policy for when to request retries when an exception occurs during op execution.

\n
\n
Parameters
\n
    \n
  • max_retries (int) \u2013 The maximum number of retries to attempt. Defaults to 1.

  • \n
  • delay (Optional[Union[int,float]]) \u2013 The time in seconds to wait between the retry being requested and the next attempt\nbeing started. This unit of time can be modulated as a function of attempt number\nwith backoff and randomly with jitter.

  • \n
  • backoff (Optional[Backoff]) \u2013 A modifier for delay as a function of retry attempt number.

  • \n
  • jitter (Optional[Jitter]) \u2013 A randomizing modifier for delay, applied after backoff calculation.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.Backoff(value)[source]\u00b6
\n

A modifier for delay as a function of attempt number.

\n

LINEAR: attempt_num * delay\nEXPONENTIAL: ((2 ^ attempt_num) - 1) * delay

\n
\n\n
\n
\nclass dagster.Jitter(value)[source]\u00b6
\n

A randomizing modifier for delay, applied after backoff calculation.

\n

FULL: between 0 and the calculated delay based on backoff: random() * backoff_delay\nPLUS_MINUS: +/- the delay: backoff_delay + ((2 * (random() * delay)) - delay)

\n
\n\n
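
Combining the three classes above on an op might look like the following sketch:

\n
from dagster import Backoff, Jitter, RetryPolicy, op\n\n@op(\n    retry_policy=RetryPolicy(\n        max_retries=3,\n        delay=0.5,  # seconds before the first retry attempt\n        backoff=Backoff.EXPONENTIAL,  # scale the delay with the attempt number\n        jitter=Jitter.FULL,  # randomize the computed delay\n    )\n)\ndef flaky_op():\n    raise Exception("transient failure")\n
\n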
\n
\n
\n

Events\u00b6

\n

The objects that can be yielded by the body of ops\u2019 compute functions to communicate with the\nDagster framework.

\n

(Note that Failure and RetryRequested are intended to be raised from ops rather than yielded.)

\n
\n

Event types\u00b6

\n
\n
\nclass dagster.Output(value, output_name='result', metadata_entries=None, metadata=None)[source]\u00b6
\n

Event corresponding to one of an op\u2019s outputs.

\n

Op compute functions must explicitly yield events of this type when they have more than\none output, or when they also yield events of other types, or when defining an op using the\nOpDefinition API directly.

\n

Outputs are values produced by ops that will be consumed by downstream ops in a job.\nThey are type-checked at op boundaries when their corresponding Out\nor the downstream In is typed.

\n
\n
Parameters
\n
    \n
  • value (Any) \u2013 The value returned by the compute function.

  • \n
  • output_name (Optional[str]) \u2013 Name of the corresponding out. (default:\n\u201cresult\u201d)

  • \n
  • metadata_entries (Optional[Union[MetadataEntry, PartitionMetadataEntry]]) \u2013 (Experimental) A set of metadata entries to attach to events related to this Output.

  • \n
  • metadata (Optional[Dict[str, Union[str, float, int, Dict, MetadataValue]]]) \u2013 Arbitrary metadata about the output. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.AssetMaterialization(asset_key, description=None, metadata_entries=None, partition=None, tags=None, metadata=None)[source]\u00b6
\n

Event indicating that an op has materialized an asset.

\n

Op compute functions may yield events of this type whenever they wish to indicate to the\nDagster framework (and the end user) that they have produced a materialized value as a\nside effect of computation. Unlike outputs, asset materializations cannot be passed to other\nops, and their persistence is controlled by op logic, rather than by the Dagster\nframework.

\n

Op authors should use these events to organize metadata about the side effects of their\ncomputations, enabling tooling like the Assets dashboard in Dagit.

\n
\n
Parameters
\n
    \n
  • asset_key (Union[str, List[str], AssetKey]) \u2013 A key to identify the materialized asset across job\nruns

  • \n
  • description (Optional[str]) \u2013 A longer human-readable description of the materialized value.

  • \n
  • metadata_entries (Optional[List[Union[MetadataEntry, PartitionMetadataEntry]]]) \u2013 Arbitrary metadata about the\nmaterialized value.

  • \n
  • partition (Optional[str]) \u2013 The name of the partition that was materialized.

  • \n
  • tags (Optional[Dict[str, str]]) \u2013 (Experimental) Tag metadata for a given asset\nmaterialization. Used for search and organization of the asset entry in the asset\ncatalog in Dagit.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 Arbitrary metadata about the asset. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
\n
\nstatic file(path, description=None, asset_key=None)[source]\u00b6
\n

Static constructor for standard materializations corresponding to files on disk.

\n
\n
Parameters
\n
    \n
  • path (str) \u2013 The path to the file.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the materialization.

  • \n
\n
\n
\n
\n\n
\n\n
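
For example, a minimal sketch that records a materialization alongside an op\u2019s output, using the file constructor above (the op name, the /tmp path, and the pandas-like df argument are illustrative assumptions):

\n
from dagster import AssetMaterialization, Output, op\n\n@op\ndef write_report(context, df):\n    path = "/tmp/report.csv"  # illustrative location\n    df.to_csv(path)  # assumes a pandas-like dataframe\n    yield AssetMaterialization.file(path, description="Daily report")\n    yield Output(path)\n
\n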
\n
\nclass dagster.ExpectationResult(success, label=None, description=None, metadata_entries=None, metadata=None)[source]\u00b6
\n

Event corresponding to a data quality test.

\n

Op compute functions may yield events of this type whenever they wish to indicate to the\nDagster framework (and the end user) that a data quality test has produced a (positive or\nnegative) result.

\n
\n
Parameters
\n
    \n
  • success (bool) \u2013 Whether the expectation passed or not.

  • \n
  • label (Optional[str]) \u2013 Short display name for expectation. Defaults to \u201cresult\u201d.

  • \n
  • description (Optional[str]) \u2013 A longer human-readable description of the expectation.

  • \n
  • metadata_entries (Optional[List[MetadataEntry]]) \u2013 Arbitrary metadata about the\nexpectation.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 Arbitrary metadata about the expectation. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
\n\n
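
For example, a minimal sketch of an op that reports a data quality check before yielding its output (the op name and metadata key are illustrative):

\n
from dagster import ExpectationResult, Output, op\n\n@op\ndef check_non_empty(context, df):\n    yield ExpectationResult(\n        success=len(df) > 0,\n        label="non_empty",\n        description="The dataframe should contain at least one row.",\n        metadata={"row_count": len(df)},\n    )\n    yield Output(df)\n
\n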
\n
\nclass dagster.TypeCheck(success, description=None, metadata_entries=None, metadata=None)[source]\u00b6
\n

Event corresponding to the result of a type check.

\n

Events of this type should be returned by user-defined type checks when they need to encapsulate\nadditional metadata about a type check\u2019s success or failure. (i.e., when using\nas_dagster_type(), @usable_as_dagster_type, or the underlying\nPythonObjectDagsterType() API.)

\n

Op compute functions should generally avoid yielding events of this type, to avoid confusion.

\n
\n
Parameters
\n
    \n
  • success (bool) \u2013 True if the type check succeeded, False otherwise.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the type check.

  • \n
  • metadata_entries (Optional[List[MetadataEntry]]) \u2013 Arbitrary metadata about the\ntype check.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 Arbitrary metadata about the type check. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
\n\n
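
For example, a minimal sketch of a type check function that returns a TypeCheck with metadata (the type and function names are illustrative):

\n
from dagster import DagsterType, TypeCheck\n\ndef positive_number_check(_context, value):\n    return TypeCheck(\n        success=isinstance(value, (int, float)) and value > 0,\n        description="Value must be a positive number.",\n        metadata={"observed_value": str(value)},\n    )\n\nPositiveNumber = DagsterType(name="PositiveNumber", type_check_fn=positive_number_check)\n
\n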
\n
\nclass dagster.Failure(description=None, metadata_entries=None, metadata=None)[source]\u00b6
\n

Event indicating op failure.

\n

Raise events of this type from within op compute functions or custom type checks in order to\nindicate an unrecoverable failure in user code to the Dagster machinery and return\nstructured metadata about the failure.

\n
\n
Parameters
\n
    \n
  • description (Optional[str]) \u2013 A human-readable description of the failure.

  • \n
  • metadata_entries (Optional[List[MetadataEntry]]) \u2013 Arbitrary metadata about the\nfailure.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 Arbitrary metadata about the failure. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
\n\n
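
For example, a minimal sketch of raising Failure with structured metadata (the op name and the fetch_remote_config helper are illustrative):

\n
from dagster import Failure, op\n\n@op\ndef load_config(context):\n    config = fetch_remote_config()  # illustrative helper\n    if "required_key" not in config:\n        raise Failure(\n            description="Remote config is missing required_key.",\n            metadata={"received_keys": list(config.keys())},\n        )\n    return config\n
\n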
\n
\nclass dagster.RetryRequested(max_retries=1, seconds_to_wait=None)[source]\u00b6
\n

An exception to raise from an op to indicate that it should be retried.

\n
\n
Parameters
\n
    \n
  • max_retries (Optional[int]) \u2013 The max number of retries this step should attempt before failing

  • \n
  • seconds_to_wait (Optional[Union[float,int]]) \u2013 Seconds to wait before restarting the step after putting the step into\nthe up_for_retry state.

  • \n
\n
\n
\n

Example

\n
@op\ndef flakes():\n    try:\n        flakey_operation()\n    except Exception as e:\n        raise RetryRequested(max_retries=3) from e\n
\n
\n
\n\n
\n
\n
\n

Event metadata\u00b6

\n

Dagster uses metadata to communicate arbitrary user-specified metadata about structured\nevents.

\n
\n
\nclass dagster.MetadataValue[source]\u00b6
\n

Utility class to wrap metadata values passed into Dagster events so that they can be\ndisplayed in Dagit and other tooling.

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "my_text_label": "hello",\n            "dashboard_url": MetadataValue.url("http://mycoolsite.com/my_dashboard"),\n            "num_rows": 0,\n        },\n    )\n
\n
\n
\n
\nstatic asset(asset_key)[source]\u00b6
\n

Static constructor for a metadata value referencing a Dagster asset, by key.

\n

For example:

\n
@op\ndef validate_table(context, df):\n    yield AssetMaterialization(\n        asset_key=AssetKey("my_table"),\n        metadata={\n            "Related asset": MetadataValue.asset(AssetKey('my_other_table')),\n        },\n    )\n
\n
\n
\n
Parameters
\n

asset_key (AssetKey) \u2013 The asset key referencing the asset.

\n
\n
\n
\n\n
\n
\nstatic dagster_run(run_id)[source]\u00b6
\n

Static constructor for a metadata value wrapping a reference to a Dagster run.

\n
\n
Parameters
\n

run_id (str) \u2013 The ID of the run.

\n
\n
\n
\n\n
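
For example, a minimal sketch that tags a materialization with the current run (the op name and asset key are illustrative):

\n
from dagster import AssetMaterialization, MetadataValue, Output, op\n\n@op\ndef tag_with_run(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={"created by run": MetadataValue.dagster_run(context.run_id)},\n    )\n    yield Output(df)\n
\n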
\n
\nstatic float(value)[source]\u00b6
\n

Static constructor for a metadata value wrapping a float as\nFloatMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "size (bytes)": MetadataValue.float(calculate_bytes(df)),\n        }\n    )\n
\n
\n
\n
Parameters
\n

value (float) \u2013 The float value for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic int(value)[source]\u00b6
\n

Static constructor for a metadata value wrapping an int as\nIntMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "number of rows": MetadataValue.int(len(df)),\n        },\n    )\n
\n
\n
\n
Parameters
\n

value (int) \u2013 The int value for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic json(data)[source]\u00b6
\n

Static constructor for a metadata value wrapping JSON data as\nJsonMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context):\n    yield ExpectationResult(\n        success=not missing_things,\n        label="is_present",\n        metadata={\n            "about my dataset": MetadataValue.json({"missing_columns": missing_things})\n        },\n    )\n
\n
\n
\n
Parameters
\n

data (Dict[str, Any]) \u2013 The JSON data for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic md(data)[source]\u00b6
\n

Static constructor for a metadata value wrapping markdown data as\nMarkdownMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context, md_str):\n    yield AssetMaterialization(\n        asset_key="info",\n        metadata={\n            'Details': MetadataValue.md(md_str)\n        },\n    )\n
\n
\n
\n
Parameters
\n

data (str) \u2013 The markdown for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic path(path)[source]\u00b6
\n

Static constructor for a metadata value wrapping a path as\nPathMetadataValue. For example:

\n
@op\ndef emit_metadata(context):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "filepath": MetadataValue.path("path/to/file"),\n        }\n    )\n
\n
\n
\n
Parameters
\n

path (str) \u2013 The path for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic python_artifact(python_artifact)[source]\u00b6
\n

Static constructor for a metadata value wrapping a python artifact as\nPythonArtifactMetadataValue. Can be used as the value type for the\nmetadata parameter for supported events. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "class": MetadataValue.python_artifact(MyClass),\n            "function": MetadataValue.python_artifact(my_function),\n        }\n    )\n
\n
\n
\n
Parameters
\n

python_artifact (Callable) \u2013 The python class or function for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic table(records, schema=None)[source]\u00b6
\n

Static constructor for a metadata value wrapping arbitrary tabular data as\nTableMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context):\n    yield ExpectationResult(\n        success=not has_errors,\n        label="is_valid",\n        metadata={\n            "errors": MetadataValue.table(\n                records=[\n                    TableRecord(code="invalid-data-type", row=2, col="name"),\n                ],\n                schema=TableSchema(\n                    columns=[\n                        TableColumn(name="code", type="string"),\n                        TableColumn(name="row", type="int"),\n                        TableColumn(name="col", type="string"),\n                    ]\n                )\n            ),\n        },\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • records (List[TableRecord]) \u2013 The data as a list of records (i.e. rows).

  • \n
  • schema (Optional[TableSchema]) \u2013 A schema for the table.

  • \n
\n
\n
\n
\n\n
\n
\nstatic table_schema(schema)[source]\u00b6
\n

Static constructor for a metadata value wrapping a table schema as\nTableSchemaMetadataValue. Can be used as the value type\nfor the metadata parameter for supported events. For example:

\n
schema = TableSchema(\n    columns = [\n        TableColumn(name="id", type="int"),\n        TableColumn(name="status", type="bool"),\n    ]\n)\n\nDagsterType(\n    type_check_fn=some_validation_fn,\n    name='MyTable',\n    metadata={\n        'my_table_schema': MetadataValue.table_schema(schema),\n    }\n)\n
\n
\n
\n
Parameters
\n

schema (TableSchema) \u2013 The table schema for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic text(text)[source]\u00b6
\n

Static constructor for a metadata value wrapping text as\nTextMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "my_text_label": MetadataValue.text("hello")\n        },\n    )\n
\n
\n
\n
Parameters
\n

text (str) \u2013 The text string for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic url(url)[source]\u00b6
\n

Static constructor for a metadata value wrapping a URL as\nUrlMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context):\n    yield AssetMaterialization(\n        asset_key="my_dashboard",\n        metadata={\n            "dashboard_url": MetadataValue.url("http://mycoolsite.com/my_dashboard"),\n        }\n    )\n
\n
\n
\n
Parameters
\n

url (str) \u2013 The URL for a metadata entry.

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.MetadataEntry(label, description=None, entry_data=None, value=None)[source]\u00b6
\n

The standard structure for describing metadata for Dagster events.

\n

Lists of objects of this type can be passed as arguments to Dagster events and will be displayed\nin Dagit and other tooling.

\n

Should be yielded from within an IO manager to append metadata for a given input/output event.\nFor other event types, passing a dict with MetadataValue values to the metadata argument\nis preferred.

\n
\n
Parameters
\n
    \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
  • value (MetadataValue) \u2013 Typed metadata entry data. The different types allow\nfor customized display in tools like dagit.

  • \n
\n
\n
\n
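
For example, a minimal sketch of an IO manager that yields MetadataEntry objects from handle_output (the CSV format, /tmp location, and pandas-like dataframe are illustrative assumptions):

\n
from dagster import IOManager, MetadataEntry, MetadataValue, io_manager\n\nclass CsvIOManager(IOManager):\n    def handle_output(self, context, obj):\n        path = f"/tmp/{context.step_key}.csv"  # illustrative location\n        obj.to_csv(path)  # assumes a pandas-like dataframe\n        yield MetadataEntry("path", value=MetadataValue.path(path))\n        yield MetadataEntry("num_rows", value=MetadataValue.int(len(obj)))\n\n    def load_input(self, context):\n        import pandas as pd\n        return pd.read_csv(f"/tmp/{context.upstream_output.step_key}.csv")\n\n@io_manager\ndef csv_io_manager(_):\n    return CsvIOManager()\n
\n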
\n
\nstatic asset(asset_key, label, description=None)[source]\u00b6
\n

Static constructor for a metadata entry referencing a Dagster asset, by key.

\n

For example:

\n
@op\ndef validate_table(context, df):\n    yield AssetMaterialization(\n        asset_key=AssetKey("my_table"),\n        metadata_entries=[\n             MetadataEntry.asset(AssetKey('my_other_table'), "Related asset"),\n        ],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • asset_key (AssetKey) \u2013 The asset key referencing the asset.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic float(value, label, description=None)[source]\u00b6
\n

Static constructor for a metadata entry containing float as\nFloatMetadataValue. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata_entries=[MetadataEntry.float(calculate_bytes(df), "size (bytes)")],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • value (Optional[float]) \u2013 The float value contained by this metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic fspath(path, label=None, description=None)[source]\u00b6
\n

Static constructor for a metadata entry containing a filesystem path as\nPathMetadataValue. For example:

\n
@op\ndef emit_metadata(context):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata_entries=[MetadataEntry.fspath("path/to/file")],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • path (Optional[str]) \u2013 The path contained by this metadata entry.

  • \n
  • label (Optional[str]) \u2013 Short display label for this metadata entry. Defaults to the\nbase name of the path.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic int(value, label, description=None)[source]\u00b6
\n

Static constructor for a metadata entry containing int as\nIntMetadataValue. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata_entries=[MetadataEntry.int(len(df), "number of rows")],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • value (Optional[int]) \u2013 The int value contained by this metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic json(data, label, description=None)[source]\u00b6
\n

Static constructor for a metadata entry containing JSON data as\nJsonMetadataValue. For example:

\n
@op\ndef emit_metadata(context):\n    yield ExpectationResult(\n        success=not missing_things,\n        label="is_present",\n        metadata_entries=[\n            MetadataEntry.json(\n                label="metadata", data={"missing_columns": missing_things},\n            )\n        ],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • data (Optional[Dict[str, Any]]) \u2013 The JSON data contained by this metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic md(md_str, label, description=None)[source]\u00b6
\n

Static constructor for a metadata entry containing markdown data as\nMarkdownMetadataValue. For example:

\n
@op\ndef emit_metadata(context, md_str):\n    yield AssetMaterialization(\n        asset_key="info",\n        metadata_entries=[MetadataEntry.md(md_str, label="Details")],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • md_str (Optional[str]) \u2013 The markdown contained by this metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic path(path, label, description=None)[source]\u00b6
\n

Static constructor for a metadata entry containing a path as\nPathMetadataValue. For example:

\n
@op\ndef emit_metadata(context):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata_entries=[MetadataEntry.path("path/to/file", label="filepath")],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • path (Optional[str]) \u2013 The path contained by this metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic table(records, label, description=None, schema=None)[source]\u00b6
\n

Static constructor for a metadata entry containing tabular data as\nTableMetadataValue. For example:

\n
@op\ndef emit_metadata(context):\n    yield ExpectationResult(\n        success=not has_errors,\n        label="is_valid",\n        metadata_entries=[\n            MetadataEntry.table(\n                label="errors",\n                records=[\n                    TableRecord(code="invalid-data-type", row=2, col="name"),\n                ],\n                schema=TableSchema(\n                    columns=[\n                        TableColumn(name="code", type="string"),\n                        TableColumn(name="row", type="int"),\n                        TableColumn(name="col", type="string"),\n                    ]\n                )\n            ),\n        ],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • records (List[TableRecord]) \u2013 The data as a list of records (i.e. rows).

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
  • schema (Optional[TableSchema]) \u2013 A schema for the table. If none is provided, one will be\nautomatically generated by examining the first record. The schema will include as columns all\nfield names present in the first record, with a type of \u201cstring\u201d, \u201cint\u201d,\n\u201cbool\u201d or \u201cfloat\u201d inferred from the first record\u2019s values. If a value does\nnot directly match one of the above types, it will be treated as a string.

  • \n
\n
\n
\n
\n\n
\n
\nstatic table_schema(schema, label, description=None)[source]\u00b6
\n

Static constructor for a metadata entry containing a table schema as\nTableSchemaMetadataValue. For example:

\n
schema = TableSchema(\n    columns = [\n        TableColumn(name="id", type="int"),\n        TableColumn(name="status", type="bool"),\n    ]\n)\n\nDagsterType(\n    type_check_fn=some_validation_fn,\n    name='MyTable',\n    metadata_entries=[\n        MetadataEntry.table_schema(\n            schema,\n            label='schema',\n        )\n    ]\n)\n
\n
\n
\n
Parameters
\n
    \n
  • schema (TableSchema) \u2013 The table schema for a metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic text(text, label, description=None)[source]\u00b6
\n

Static constructor for a metadata entry containing text as\nTextMetadataValue. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata_entries=[\n            MetadataEntry.text("Text-based metadata for this event", "text_metadata")\n        ],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • text (Optional[str]) \u2013 The text of this metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic url(url, label, description=None)[source]\u00b6
\n

Static constructor for a metadata entry containing a URL as\nUrlMetadataValue. For example:

\n
@op\ndef emit_metadata(context):\n    yield AssetMaterialization(\n        asset_key="my_dashboard",\n        metadata_entries=[\n            MetadataEntry.url(\n                "http://mycoolsite.com/my_dashboard", label="dashboard_url"\n            ),\n        ],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • url (Optional[str]) \u2013 The URL contained by this metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nproperty value\u00b6
\n

Alias of entry_data.

\n
\n\n
\n\n
\n
\n

Metadata types\u00b6

\n

All metadata types inherit from MetadataValue. The following types are defined:

\n
\n
\nclass dagster.DagsterAssetMetadataValue(asset_key)[source]\u00b6
\n

Representation of a dagster asset.

\n
\n
Parameters
\n

asset_key (AssetKey) \u2013 The dagster asset key

\n
\n
\n
\n\n
\n
\nclass dagster.DagsterPipelineRunMetadataValue(run_id)[source]\u00b6
\n

Representation of a dagster pipeline run.

\n
\n
Parameters
\n

run_id (str) \u2013 The pipeline run id

\n
\n
\n
\n\n
\n
\nclass dagster.FloatMetadataValue(value)[source]\u00b6
\n

Container class for float metadata entry data.

\n
\n
Parameters
\n

value (Optional[float]) \u2013 The float value.

\n
\n
\n
\n\n
\n
\nclass dagster.IntMetadataValue(value)[source]\u00b6
\n

Container class for int metadata entry data.

\n
\n
Parameters
\n

value (Optional[int]) \u2013 The int value.

\n
\n
\n
\n\n
\n
\nclass dagster.JsonMetadataValue(data)[source]\u00b6
\n

Container class for JSON metadata entry data.

\n
\n
Parameters
\n

data (Dict[str, Any]) \u2013 The JSON data.

\n
\n
\n
\n\n
\n
\nclass dagster.MarkdownMetadataValue(md_str)[source]\u00b6
\n

Container class for markdown metadata entry data.

\n
\n
Parameters
\n

md_str (Optional[str]) \u2013 The markdown as a string.

\n
\n
\n
\n\n
\n
\nclass dagster.PathMetadataValue(path)[source]\u00b6
\n

Container class for path metadata entry data.

\n
\n
Parameters
\n

path (Optional[str]) \u2013 The path as a string or conforming to os.PathLike.

\n
\n
\n
\n\n
\n
\nclass dagster.PythonArtifactMetadataValue(module, name)[source]\u00b6
\n

Container class for python artifact metadata entry data.

\n
\n
Parameters
\n
    \n
  • module (str) \u2013 The module where the python artifact can be found

  • \n
  • name (str) \u2013 The name of the python artifact

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.TableMetadataValue(records, schema)[source]\u00b6
\n

Container class for table metadata entry data.

\n
\n
Parameters
\n
    \n
  • records (List[TableRecord]) \u2013 The data as a list of records (i.e. rows).

  • \n
  • schema (Optional[TableSchema]) \u2013 A schema for the table.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.TableSchemaMetadataValue(schema)[source]\u00b6
\n

Representation of a schema for arbitrary tabular data.

\n
\n
Parameters
\n

schema (TableSchema) \u2013 The TableSchema object containing the schema representation.

\n
\n
\n
\n\n
\n
\nclass dagster.TextMetadataValue(text)[source]\u00b6
\n

Container class for text metadata entry data.

\n
\n
Parameters
\n

text (Optional[str]) \u2013 The text data.

\n
\n
\n
\n\n
\n
\nclass dagster.UrlMetadataValue(url)[source]\u00b6
\n

Container class for URL metadata entry data.

\n
\n
Parameters
\n

url (Optional[str]) \u2013 The URL as a string.

\n
\n
\n
\n\n
\n
\n

Tables\u00b6

\n

These APIs provide the ability to express table schemas (TableSchema) and table rows/records (TableRecord) in Dagster. Currently the only use case for TableSchemas and TableRecords is to wrap them in their corresponding metadata classes TableMetadataValue and TableSchemaMetadataValue for attachment to events or Dagster types.

\n
\n
\nclass dagster.TableRecord(**data)[source]\u00b6
\n

Represents one record in a table. All passed keyword arguments are treated as field key/value\npairs in the record. Field keys are arbitrary strings; field values must be strings, integers,\nfloats, or bools.

\n
\n\n
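
For example, a minimal sketch of constructing a record (the field names and values are illustrative):

\n
from dagster import TableRecord\n\n# Field keys are arbitrary strings; values must be strings, ints, floats, or bools.\nrecord = TableRecord(name="alice", age=34, active=True, score=0.97)\n
\n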
\n
\nclass dagster.TableSchema(columns, constraints=None)[source]\u00b6
\n

Representation of a schema for tabular data. Schema is composed of two parts:

\n
    \n
  • A required list of columns (TableColumn). Each column specifies a\nname, type, set of constraints, and (optional) description. type\ndefaults to string if unspecified. Column constraints\n(TableColumnConstraints) consist of boolean properties unique and\nnullable, as well as a list of strings other containing string\ndescriptions of all additional constraints (e.g. \u201c<= 5\u201d).

  • \n
  • An optional list of table-level constraints (TableConstraints). A\ntable-level constraint cannot be expressed in terms of a single column,\ne.g. col a > col b. Presently, all table-level constraints must be\nexpressed as strings under the other attribute of a TableConstraints\nobject.

  • \n
\n
# example schema\nTableSchema(\n    constraints = TableConstraints(\n        other = [\n            "foo > bar",\n        ],\n    ),\n    columns = [\n        TableColumn(\n            name = "foo",\n            type = "string",\n            description = "Foo description",\n            constraints = TableColumnConstraints(\n                nullable = False,\n                other = [\n                    "starts with the letter 'a'",\n                ],\n            ),\n        ),\n        TableColumn(\n            name = "bar",\n            type = "string",\n        ),\n        TableColumn(\n            name = "baz",\n            type = "custom_type",\n            constraints = TableColumnConstraints(\n                unique = True,\n            )\n        ),\n    ],\n)\n
\n
\n
\n
Parameters
\n
    \n
  • columns (List[TableColumn]) \u2013 The columns of the table.

  • \n
  • constraints (Optional[TableConstraints]) \u2013 The constraints of the table.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.TableConstraints(other)[source]\u00b6
\n

Descriptor for \u201ctable-level\u201d constraints. Presently only one property,\nother is supported. This contains strings describing arbitrary\ntable-level constraints. A table-level constraint is a constraint defined\nin terms of multiple columns (e.g. col_A > col_B) or in terms of rows.

\n
\n
Parameters
\n

other (List[str]) \u2013 Descriptions of arbitrary table-level constraints.

\n
\n
\n
\n\n
\n
\nclass dagster.TableColumn(name, type='string', description=None, constraints=None)[source]\u00b6
\n

Descriptor for a table column. The only property that must be specified\nby the user is name. If no type is specified, string is assumed. If\nno constraints are specified, the column is assumed to be nullable\n(i.e. required = False) and have no other constraints beyond the data type.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of the column.

  • \n
  • type (Optional[str]) \u2013 The type of the column. Can be an arbitrary\nstring. Defaults to \u201cstring\u201d.

  • \n
  • description (Optional[str]) \u2013 Description of this column. Defaults to None.

  • \n
  • constraints (Optional[TableColumnConstraints]) \u2013 Column-level constraints.\nIf unspecified, column is nullable with no constraints.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.TableColumnConstraints(nullable=True, unique=False, other=None)[source]\u00b6
\n

Descriptor for a table column\u2019s constraints. Nullability and uniqueness are specified with\nboolean properties. All other constraints are described using arbitrary strings under the\nother property.

\n
\n
Parameters
\n
    \n
  • nullable (Optional[bool]) \u2013 If true, this column can hold null values.

  • \n
  • unique (Optional[bool]) \u2013 If true, all values in this column must be unique.

  • \n
  • other (List[str]) \u2013 Descriptions of arbitrary column-level constraints\nnot expressible by the predefined properties.

  • \n
\n
\n
\n
\n\n
\n
\n
\n

Asset key\u00b6

\n

Dagster uses AssetKey to build an index on Materialization events.\nAssets materialized with an AssetKey are highlighted in dagit on the Assets\ndashboard.

\n
\n
\nclass dagster.AssetKey(path)[source]\u00b6
\n

Object representing the structure of an asset key. Takes in a sanitized string, list of\nstrings, or tuple of strings.

\n

Example usage:

\n
from dagster import op\n\n@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key=AssetKey('flat_asset_key'),\n        metadata={"text_metadata": "Text-based metadata for this event"},\n    )\n\n@op\ndef structured_asset_key(context, df):\n    yield AssetMaterialization(\n        asset_key=AssetKey(['parent', 'child', 'grandchild']),\n        metadata={"text_metadata": "Text-based metadata for this event"},\n    )\n\n@op\ndef structured_asset_key_2(context, df):\n    yield AssetMaterialization(\n        asset_key=AssetKey(('parent', 'child', 'grandchild')),\n        metadata={"text_metadata": "Text-based metadata for this event"},\n    )\n
\n
\n
\n
Parameters
\n

path (Sequence[str]) \u2013 String, list of strings, or tuple of strings. A list of strings\nrepresents the hierarchical structure of the asset_key.

\n
\n
\n
\n\n
\n
\n
\n", "current_page_name": "sections/api/apidocs/ops", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../io-managers/", "title": "IO Managers"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../modes/", "title": "[Legacy] Modes"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/io-managers", "IO Managers", "N", "next"], ["sections/api/apidocs/modes", "[Legacy] Modes", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/ops.rst.txt", "title": "Ops", "toc": "\n"}, "partitions": {"alabaster_version": "0.7.12", "body": "
\n

Partitions\u00b6

\n
\n
\nclass dagster.PartitionedConfig(partitions_def, run_config_for_partition_fn, decorated_fn=None, tags_for_partition_fn=None)[source]\u00b6
\n

Defines a way of configuring a job where the job can be run on one of a discrete set of\npartitions, and each partition corresponds to run configuration for the job.

\n

Setting PartitionedConfig as the config for a job allows you to launch backfills for that job\nand view the run history across partitions.

\n
\n
\nget_run_config_for_partition_key(partition_key)[source]\u00b6
\n

Generates the run config corresponding to a partition key.

\n
\n
Parameters
\n

partition_key (str) \u2013 the key for a partition that should be used to generate a run config.

\n
\n
\n
\n\n
\n\n
\n
\ndagster.static_partitioned_config(partition_keys, tags_for_partition_fn=None)[source]\u00b6
\n

Creates a static partitioned config for a job.

\n

The provided partition_keys is a static list of strings identifying the set of partitions. The list of\npartitions is static, so while the run config returned by the decorated function may change over\ntime, the list of valid partition keys does not.

\n

This has performance advantages over dynamic_partitioned_config in terms of loading different\npartition views in Dagit.

\n

The decorated function takes in a partition key and returns a valid run config for a particular\ntarget job.

\n
\n
Parameters
\n

partition_keys (List[str]) \u2013 A list of valid partition keys, which serve as the range of\nvalues that can be provided to the decorated run config function.

\n
\n
Returns
\n

PartitionedConfig

\n
\n
\n
\n\n
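
For example, a minimal sketch of a statically partitioned job (the partition keys, op, and job names are illustrative):

\n
from dagster import job, op, static_partitioned_config\n\nCOLORS = ["red", "green", "blue"]\n\n@static_partitioned_config(partition_keys=COLORS)\ndef color_config(partition_key: str):\n    return {"ops": {"process_color": {"config": {"color": partition_key}}}}\n\n@op(config_schema={"color": str})\ndef process_color(context):\n    context.log.info(context.op_config["color"])\n\n@job(config=color_config)\ndef color_job():\n    process_color()\n\n# color_config is a PartitionedConfig, so per-partition run config can be generated:\n# color_config.get_run_config_for_partition_key("red")\n
\n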
\n
\ndagster.dynamic_partitioned_config(partition_fn, tags_for_partition_fn=None)[source]\u00b6
\n

Creates a dynamic partitioned config for a job.

\n

The provided partition_fn returns a list of strings identifying the set of partitions, given\nan optional datetime argument (representing the current time). The list of partitions returned\nmay change over time.

\n

The decorated function takes in a partition key and returns a valid run config for a particular\ntarget job.

\n
\n
Parameters
\n

partition_fn (Callable[[datetime.datetime], Sequence[str]]) \u2013 A function that generates a\nlist of valid partition keys, which serve as the range of values that can be provided\nto the decorated run config function.

\n
\n
Returns
\n

PartitionedConfig

\n
\n
\n
\n\n
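
For example, a minimal sketch in which the partition keys are recomputed each time they are requested (the seven-day window and op name are illustrative):

\n
from datetime import datetime, timedelta\n\nfrom dagster import dynamic_partitioned_config\n\ndef recent_dates(current_time=None):\n    # Recompute the partition keys relative to the (optionally provided) current time.\n    end = current_time or datetime.now()\n    return [(end - timedelta(days=n)).strftime("%Y-%m-%d") for n in range(7)]\n\n@dynamic_partitioned_config(partition_fn=recent_dates)\ndef date_config(partition_key: str):\n    return {"ops": {"process_date": {"config": {"date": partition_key}}}}\n
\n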
\n
\ndagster.hourly_partitioned_config(start_date, minute_offset=0, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]\u00b6
\n

Defines run config over a set of hourly partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the date\npartition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at the start_date at midnight. The last partition in\nthe set will end before the current time, unless the end_offset argument is set to a positive\nnumber. If minute_offset is provided, the start and end times of each partition will be\nminute_offset past the hour.

\n
\n
Parameters
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions. Can\nprovide in either a datetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
\n
\n
\n
@hourly_partitioned_config(start_date=datetime(2022, 3, 12))\n# creates partitions (2022-03-12-00:00, 2022-03-12-01:00), (2022-03-12-01:00, 2022-03-12-02:00), ...\n\n@hourly_partitioned_config(start_date=datetime(2022, 3, 12), minute_offset=15)\n# creates partitions (2022-03-12-00:15, 2022-03-12-01:15), (2022-03-12-01:15, 2022-03-12-02:15), ...\n
\n
\n
\n\n
\n
\ndagster.daily_partitioned_config(start_date, minute_offset=0, hour_offset=0, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]\u00b6
\n

Defines run config over a set of daily partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the bounds\nof the date partition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at the start_date at midnight. The last partition in\nthe set will end before the current time, unless the end_offset argument is set to a positive\nnumber. If minute_offset and/or hour_offset are used, the start and end times of each partition\nwill be hour_offset:minute_offset of each day.

\n
\n
Parameters
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions. Can\nprovide in either a datetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
\n
\n
\n
@daily_partitioned_config(start_date="2022-03-12")\n# creates partitions (2022-03-12-00:00, 2022-03-13-00:00), (2022-03-13-00:00, 2022-03-14-00:00), ...\n\n@daily_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=16)\n# creates partitions (2022-03-12-16:15, 2022-03-13-16:15), (2022-03-13-16:15, 2022-03-14-16:15), ...\n
\n
\n
\n\n
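
For example, a minimal sketch of a daily partitioned job in which the decorated function receives the start and end of each day\u2019s window (the op and job names are illustrative):

\n
from dagster import daily_partitioned_config, job, op\n\n@daily_partitioned_config(start_date="2022-03-12")\ndef my_daily_config(start, end):\n    # start and end are datetimes bounding the partition's day\n    return {"ops": {"process_day": {"config": {"date": start.strftime("%Y-%m-%d")}}}}\n\n@op(config_schema={"date": str})\ndef process_day(context):\n    context.log.info(context.op_config["date"])\n\n@job(config=my_daily_config)\ndef daily_job():\n    process_day()\n
\n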
\n
\ndagster.weekly_partitioned_config(start_date, minute_offset=0, hour_offset=0, day_offset=0, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]\u00b6
\n

Defines run config over a set of weekly partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the date\npartition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at the start_date. The last partition in the set will\nend before the current time, unless the end_offset argument is set to a positive number. If\nday_offset is provided, the start and end date of each partition will be day of the week\ncorresponding to day_offset (0 indexed with Sunday as the start of the week). If\nminute_offset and/or hour_offset are used, the start and end times of each partition will be\nhour_offset:minute_offset of each day.

\n
\n
Parameters
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions will be the\nSunday at midnight following start_date. Can provide in either a datetime or string\nformat.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • day_offset (int) \u2013 Day of the week to \u201csplit\u201d the partition. Defaults to 0 (Sunday).

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
\n
\n
\n
@weekly_partitioned_config(start_date="2022-03-12")\n# creates partitions (2022-03-13-00:00, 2022-03-20-00:00), (2022-03-20-00:00, 2022-03-27-00:00), ...\n\n@weekly_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=6)\n# creates partitions (2022-03-12-03:15, 2022-03-19-03:15), (2022-03-19-03:15, 2022-03-26-03:15), ...\n
\n
\n
\n\n
\n
\ndagster.monthly_partitioned_config(start_date, minute_offset=0, hour_offset=0, day_offset=1, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]\u00b6
\n

Defines run config over a set of monthly partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the date\npartition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at midnight on the soonest first of the month after\nstart_date. The last partition in the set will end before the current time, unless the\nend_offset argument is set to a positive number. If day_offset is provided, the start and end\ndate of each partition will fall on the day of the month given by day_offset. If minute_offset and/or\nhour_offset are used, the start and end times of each partition will be hour_offset:minute_offset of each day.

\n
\n
Parameters
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions will be\nmidnight on the soonest first of the month following start_date. Can provide in either a\ndatetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • day_offset (int) \u2013 Day of the month to \u201csplit\u201d the partition. Defaults to 1.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
\n
\n
\n
@monthly_partitioned_config(start_date="2022-03-12")\n# creates partitions (2022-04-01-00:00, 2022-05-01-00:00), (2022-05-01-00:00, 2022-06-01-00:00), ...\n\n@monthly_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=5)\n# creates partitions (2022-04-05-03:15, 2022-05-05-03:15), (2022-05-05-03:15, 2022-06-05-03:15), ...\n
\n
\n
\n\n
\n
\ndagster.build_schedule_from_partitioned_job(job, description=None, name=None, minute_of_hour=None, hour_of_day=None, day_of_week=None, day_of_month=None, default_status=<DefaultScheduleStatus.STOPPED: 'STOPPED'>)[source]
\n

Creates a schedule from a time window-partitioned job.

\n

The schedule executes at the cadence specified by the partitioning of the given job.

\n
\n\n
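
For example, a minimal sketch that builds a daily schedule from a partitioned job (the op, config, and job names are illustrative):

\n
from dagster import build_schedule_from_partitioned_job, daily_partitioned_config, job, op\n\n@op\ndef do_work(context):\n    context.log.info("working")\n\n@daily_partitioned_config(start_date="2022-03-12")\ndef my_partitioned_config(start, end):\n    return {}  # run config for the window [start, end); empty here for brevity\n\n@job(config=my_partitioned_config)\ndef my_partitioned_job():\n    do_work()\n\n# The schedule runs once per partition, at the cadence implied by the daily partitioning.\nmy_schedule = build_schedule_from_partitioned_job(my_partitioned_job)\n
\n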
\n
\n

Legacy Functions\u00b6

\n

The following functions are useful for working with partitions on legacy pipelines.

\n
\n
\nclass dagster.Partition(value, name=None)[source]\u00b6
\n

A Partition represents a single slice of the entire set of a job\u2019s possible work. It consists\nof a value, which is an object that represents that partition, and an optional name, which is\nused to label the partition in a human-readable way.

\n
\n
Parameters
\n
    \n
  • value (Any) \u2013 The object for this partition

  • \n
  • name (str) \u2013 Name for this partition

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.PartitionSetDefinition(name, pipeline_name=None, partition_fn=None, solid_selection=None, mode=None, run_config_fn_for_partition=<function PartitionSetDefinition.<lambda>>, tags_fn_for_partition=<function PartitionSetDefinition.<lambda>>, partitions_def=None, job_name=None)[source]\u00b6
\n

Defines a partition set, representing the set of slices making up an axis of a pipeline

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name for this partition set

  • \n
  • pipeline_name (str) \u2013 The name of the pipeline definition

  • \n
  • partition_fn (Optional[Callable[[], List[Partition]]]) \u2013 User-provided function to define\nthe set of valid partition objects.

  • \n
  • solid_selection (Optional[List[str]]) \u2013 A list of solid subselection (including single\nsolid names) to execute with this partition. e.g. ['*some_solid+', 'other_solid']

  • \n
  • mode (Optional[str]) \u2013 The mode to apply when executing this partition. (default: \u2018default\u2019)

  • \n
  • run_config_fn_for_partition (Callable[[Partition], Any]) \u2013 A\nfunction that takes a Partition and returns the run\nconfiguration that parameterizes the execution for this partition.

  • \n
  • tags_fn_for_partition (Callable[[Partition], Optional[dict[str, str]]]) \u2013 A function that\ntakes a Partition and returns a dictionary of key-value pairs that will\nbe added to the generated run for this partition.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 A set of parameters used to construct the set\nof valid partition objects.

  • \n
\n
\n
\n
\n
\ncreate_schedule_definition(schedule_name, cron_schedule, partition_selector, should_execute=None, environment_vars=None, execution_timezone=None, description=None, decorated_fn=None, job=None, default_status=<DefaultScheduleStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Create a ScheduleDefinition from a PartitionSetDefinition.

\n
\n
Parameters
\n
    \n
  • schedule_name (str) \u2013 The name of the schedule.

  • \n
  • cron_schedule (str) \u2013 A valid cron string for the schedule

  • \n
  • partition_selector (Callable[ScheduleEvaluationContext, PartitionSetDefinition], Union[Partition, List[Partition]]) \u2013 Function that determines the partition to use at a given execution time. Can return\neither a single Partition or a list of Partitions. For time-based partition sets,\nwill likely be either identity_partition_selector or a selector returned by\ncreate_offset_partition_selector.

  • \n
  • should_execute (Optional[function]) \u2013 Function that runs at schedule execution time that\ndetermines whether a schedule should execute. Defaults to a function that always returns\nTrue.

  • \n
  • environment_vars (Optional[dict]) \u2013 The environment variables to set for the schedule.

  • \n
  • execution_timezone (Optional[str]) \u2013 Timezone in which the schedule should run.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the schedule.

  • \n
  • default_status (DefaultScheduleStatus) \u2013 Whether the schedule starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
\n
\n
Returns
\n

\n
The generated PartitionScheduleDefinition for the partition selector.

\n
\n
\n

\n
\n
Return type
\n

PartitionScheduleDefinition

\n
\n
\n
\n\n
\n
\nget_partitions(current_time=None)[source]\u00b6
\n

Return the set of known partitions.

\n
\n
Parameters
\n

current_time (Optional[datetime]) \u2013 The evaluation time for the partition function, which\nis passed through to the partition_fn (if it accepts a parameter). Defaults to\nthe current time in UTC.

\n
\n
\n
\n\n
\n\n
\n
\ndagster.date_partition_range(start, end=None, delta_range='days', fmt=None, inclusive=False, timezone=None)[source]\u00b6
\n

Utility function that returns a partition generating function to be used in creating a\nPartitionSet definition.

\n
\n
Parameters
\n
    \n
  • start (datetime) \u2013 Datetime capturing the start of the time range.

  • \n
  • end (Optional(datetime)) \u2013 Datetime capturing the end of the partition. By default, the\ncurrent time is used. The range is not inclusive of the end\nvalue.

  • \n
  • delta_range (Optional(str)) \u2013 string representing the time duration of each partition.\nMust be a valid argument to pendulum.period.range (\u201cdays\u201d, \u201chours\u201d, \u201cmonths\u201d, etc.).

  • \n
  • fmt (Optional(str)) \u2013 Format string to represent each partition by its start time

  • \n
  • inclusive (Optional(bool)) \u2013 By default, the partition set only contains date interval\npartitions for which the end time of the interval is less than current time. In other\nwords, the partition set contains date interval partitions that are completely in the\npast. If inclusive is set to True, then the partition set will include all date\ninterval partitions for which the start time of the interval is less than the\ncurrent time.

  • \n
  • timezone (Optional(str)) \u2013 Timezone in which the partition values should be expressed.

  • \n
\n
\n
Returns
\n

Callable[[], List[Partition]]

\n
\n
\n
\n\n
\n
\ndagster.identity_partition_selector(context, partition_set_def)[source]\u00b6
\n

Utility function for supplying a partition selector when creating a schedule from a\npartition set made of datetime objects that assumes the schedule always executes at the\npartition time.

\n

It\u2019s important that the cron string passed into create_schedule_definition match\nthe partition set times. For example, a schedule created from a partition set with partitions for each day at\nmidnight would create its partition selector as follows:

\n
partition_set = PartitionSetDefinition(\n    name='hello_world_partition_set',\n    pipeline_name='hello_world_pipeline',\n    partition_fn=date_partition_range(\n        start=datetime.datetime(2021, 1, 1),\n        delta_range="days",\n        timezone="US/Central",\n    ),\n    run_config_fn_for_partition=my_run_config_fn,\n)\n\nschedule_definition = partition_set.create_schedule_definition(\n    "hello_world_daily_schedule",\n    "0 0 * * *",\n    partition_selector=identity_partition_selector,\n    execution_timezone="US/Central",\n)\n
\n
\n
\n\n
\n
\ndagster.create_offset_partition_selector(execution_time_to_partition_fn)[source]\u00b6
\n

Utility function for supplying a partition selector when creating a schedule from a\npartition set made of datetime objects that assumes a fixed time offset between the\npartition time and the time at which the schedule executes.

\n

It\u2019s important to keep the cron string that\u2019s supplied to\nPartitionSetDefinition.create_schedule_definition in sync with the offset that\u2019s\nsupplied to this function. For example, a schedule created from a partition set with\npartitions for each day at midnight that fills in the partition for day N at day N+1 at\n10:00AM would create the partition selector as follows:

\n
partition_set = PartitionSetDefinition(\n    name='hello_world_partition_set',\n    pipeline_name='hello_world_pipeline',\n    partition_fn=date_partition_range(\n        start=datetime.datetime(2021, 1, 1),\n        delta_range="days",\n        timezone="US/Central",\n    ),\n    run_config_fn_for_partition=my_run_config_fn,\n)\n\nschedule_definition = partition_set.create_schedule_definition(\n    "daily_10am_schedule",\n    "0 10 * * *",\n    partition_selector=create_offset_partition_selector(lambda d: d.subtract(hours=10, days=1)),\n    execution_timezone="US/Central",\n)\n
\n
\n
\n
Parameters
\n

execution_time_to_partition_fn (Callable[[datetime.datetime], datetime.datetime]) \u2013 A\nfunction that maps the execution time of the schedule to the partition time.

\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/partitions", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../pipeline/", "title": "[Legacy] Pipelines"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../io-managers/", "title": "IO Managers"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/pipeline", "[Legacy] Pipelines", "N", "next"], ["sections/api/apidocs/io-managers", "IO Managers", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/partitions.rst.txt", "title": "Partitions", "toc": "\n"}, "pipeline": {"alabaster_version": "0.7.12", "body": "
\n

[Legacy] Pipelines\u00b6

\n

As of Dagster 0.13.0, we recommend using Jobs as an alternative to Pipelines.

\n
\n

Pipeline definitions\u00b6

\n
\n
\n@dagster.pipeline(name=None, description=None, mode_defs=None, preset_defs=None, tags=None, hook_defs=None, input_defs=None, output_defs=None, config_schema=None, config_fn=None, solid_retry_policy=None, version_strategy=None)[source]\u00b6
\n

Create a pipeline with the specified parameters from the decorated composition function.

\n

Using this decorator allows you to build up the dependency graph of the pipeline by writing a\nfunction that invokes solids and passes the output to other solids.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name of the pipeline. Must be unique within any\nRepositoryDefinition containing the pipeline.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the pipeline.

  • \n
  • mode_defs (Optional[List[ModeDefinition]]) \u2013 The set of modes in which this pipeline can\noperate. Modes are used to attach resources, custom loggers, custom system storage\noptions, and custom executors to a pipeline. Modes can be used, e.g., to vary\navailable resource and logging implementations between local test and production runs.

  • \n
  • preset_defs (Optional[List[PresetDefinition]]) \u2013 A set of preset collections of configuration\noptions that may be used to execute a pipeline. A preset consists of an environment\ndict, an optional subset of solids to execute, and a mode selection. Presets can be used\nto ship common combinations of options to pipeline end users in Python code, and can\nbe selected by tools like Dagit.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for any execution run of the pipeline.\nValues that are not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value. These tag values may be overwritten by tag\nvalues provided at invocation time.

  • \n
  • hook_defs (Optional[Set[HookDefinition]]) \u2013 A set of hook definitions applied to the\npipeline. When a hook is applied to a pipeline, it will be attached to all solid\ninstances within the pipeline.

  • \n
  • solid_retry_policy (Optional[RetryPolicy]) \u2013 The default retry policy for all solids in\nthis pipeline. Only used if retry policy is not defined on the solid definition or\nsolid invocation.

  • \n
  • version_strategy (Optional[VersionStrategy]) \u2013 The version strategy to use with this\npipeline. Providing a VersionStrategy will enable memoization on the pipeline.

  • \n
\n
\n
\n

Example

\n
@solid(output_defs=[OutputDefinition(int, "two"), OutputDefinition(int, "four")])\ndef emit_two_four(_) -> int:\n    yield Output(2, "two")\n    yield Output(4, "four")\n\n\n@lambda_solid\ndef add_one(num: int) -> int:\n    return num + 1\n\n\n@lambda_solid\ndef mult_two(num: int) -> int:\n    return num * 2\n\n\n@pipeline\ndef math_pipeline():\n    two, four = emit_two_four()\n    add_one(two)\n    mult_two(four)\n
\n
\n
\n\n
\n
\nclass dagster.PipelineDefinition(solid_defs=None, name=None, description=None, dependencies=None, mode_defs=None, preset_defs=None, tags=None, hook_defs=None, solid_retry_policy=None, graph_def=None, _parent_pipeline_def=None, version_strategy=None)[source]\u00b6
\n

Defines a Dagster pipeline.

\n

A pipeline is made up of

\n
    \n
  • Solids, each of which is a single functional unit of data computation.

  • \n
  • Dependencies, which determine how the values produced by solids as their outputs flow from\none solid to another. This tells Dagster how to arrange solids, and potentially multiple\naliased instances of solids, into a directed, acyclic graph (DAG) of compute.

  • \n
  • Modes, which can be used to attach resources, custom loggers, custom system storage\noptions, and custom executors to a pipeline, and to switch between them.

  • \n
  • Presets, which can be used to ship common combinations of pipeline config options in Python\ncode, and to switch between them.

  • \n
\n
\n
Parameters
\n
    \n
  • solid_defs (List[SolidDefinition]) \u2013 The set of solids used in this pipeline.

  • \n
  • name (str) \u2013 The name of the pipeline. Must be unique within any\nRepositoryDefinition containing the pipeline.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the pipeline.

  • \n
  • dependencies (Optional[Dict[Union[str, NodeInvocation], Dict[str, DependencyDefinition]]]) \u2013 A structure that declares the dependencies of each solid\u2019s inputs on the outputs of\nother solids in the pipeline. Keys of the top level dict are either the string names of\nsolids in the pipeline or, in the case of aliased solids,\nNodeInvocations. Values of the top level dict are\nthemselves dicts, which map input names belonging to the solid or aliased solid to\nDependencyDefinitions.

  • \n
  • mode_defs (Optional[List[ModeDefinition]]) \u2013 The set of modes in which this pipeline can\noperate. Modes are used to attach resources, custom loggers, custom system storage\noptions, and custom executors to a pipeline. Modes can be used, e.g., to vary available\nresource and logging implementations between local test and production runs.

  • \n
  • preset_defs (Optional[List[PresetDefinition]]) \u2013 A set of preset collections of configuration\noptions that may be used to execute a pipeline. A preset consists of an environment\ndict, an optional subset of solids to execute, and a mode selection. Presets can be used\nto ship common combinations of options to pipeline end users in Python code, and can\nbe selected by tools like Dagit.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for any execution run of the pipeline.\nValues that are not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value. These tag values may be overwritten by tag\nvalues provided at invocation time.

  • \n
  • hook_defs (Optional[AbstractSet[HookDefinition]]) \u2013 A set of hook definitions applied to the\npipeline. When a hook is applied to a pipeline, it will be attached to all solid\ninstances within the pipeline.

  • \n
  • solid_retry_policy (Optional[RetryPolicy]) \u2013 The default retry policy for all solids in\nthis pipeline. Only used if retry policy is not defined on the solid definition or\nsolid invocation.

  • \n
  • _parent_pipeline_def (INTERNAL ONLY) \u2013 Used for tracking pipelines created using solid subsets.

  • \n
\n
\n
\n

Examples

\n
@solid\ndef return_one(_):\n    return 1\n\n\n@solid(input_defs=[InputDefinition('num')], required_resource_keys={'op'})\ndef apply_op(context, num):\n    return context.resources.op(num)\n\n@resource(config_schema=Int)\ndef adder_resource(init_context):\n    return lambda x: x + init_context.resource_config\n\n\nadd_mode = ModeDefinition(\n    name='add_mode',\n    resource_defs={'op': adder_resource},\n    description='Mode that adds things',\n)\n\n\nadd_three_preset = PresetDefinition(\n    name='add_three_preset',\n    run_config={'resources': {'op': {'config': 3}}},\n    mode='add_mode',\n)\n\n\npipeline_def = PipelineDefinition(\n    name='basic',\n    solid_defs=[return_one, apply_op],\n    dependencies={'apply_op': {'num': DependencyDefinition('return_one')}},\n    mode_defs=[add_mode],\n    preset_defs=[add_three_preset],\n)\n
\n
\n
\n\n
\n
\n

Executing pipelines\u00b6

\n
\n
\ndagster.execute_pipeline(pipeline, run_config=None, mode=None, preset=None, tags=None, solid_selection=None, instance=None, raise_on_error=True)[source]\u00b6
\n

Execute a pipeline synchronously.

\n

Users will typically call this API when testing pipeline execution, or running standalone\nscripts.

\n
\n
Parameters
\n
    \n
  • pipeline (Union[IPipeline, PipelineDefinition]) \u2013 The pipeline to execute.

  • \n
  • run_config (Optional[dict]) \u2013 The configuration that parametrizes this run,\nas a dict.

  • \n
  • mode (Optional[str]) \u2013 The name of the pipeline mode to use. You may not set both mode\nand preset.

  • \n
  • preset (Optional[str]) \u2013 The name of the pipeline preset to use. You may not set both\nmode and preset.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary key-value pairs that will be added to pipeline\nlogs.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The instance to execute against. If this is None,\nan ephemeral instance will be used, and no artifacts will be persisted from the run.

  • \n
  • raise_on_error (Optional[bool]) \u2013 Whether or not to raise exceptions when they occur.\nDefaults to True, since this is the most useful behavior in tests.

  • \n
  • solid_selection (Optional[List[str]]) \u2013

    A list of solid selection queries (including single\nsolid names) to execute. For example:

    \n
      \n
    • ['some_solid']: selects some_solid itself.

    • \n
    • ['*some_solid']: select some_solid and all its ancestors (upstream dependencies).

    • \n
    • ['*some_solid+++']: select some_solid, all its ancestors, and its descendants\n(downstream dependencies) within 3 levels down.

    • \n
    • ['*some_solid', 'other_solid_a', 'other_solid_b+']: select some_solid and all its\nancestors, other_solid_a itself, and other_solid_b and its direct child solids.

    • \n
    \n

  • \n
\n
\n
Returns
\n

The result of pipeline execution.

\n
\n
Return type
\n

PipelineExecutionResult

\n
\n
\n

For the asynchronous version, see execute_pipeline_iterator().

\n
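
For example, a minimal synchronous execution and result inspection might look like the following sketch (hello and hello_pipeline are hypothetical definitions, not part of the API):

\n
from dagster import execute_pipeline, pipeline, solid\n\n@solid\ndef hello(_):\n    return "hello"\n\n@pipeline\ndef hello_pipeline():\n    hello()\n\n# Execute synchronously on an ephemeral instance and inspect the result.\nresult = execute_pipeline(hello_pipeline)\nassert result.success\nprint(result.result_for_solid("hello").output_value())\n
\n
\n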
\n\n
\n
\ndagster.execute_pipeline_iterator(pipeline, run_config=None, mode=None, preset=None, tags=None, solid_selection=None, instance=None)[source]\u00b6
\n

Execute a pipeline iteratively.

\n

Rather than package up the result of running a pipeline into a single object, like\nexecute_pipeline(), this function yields the stream of events resulting from pipeline\nexecution.

\n

This is intended to allow the caller to handle these events on a streaming basis in whatever\nway is appropriate.

\n
\n
Parameters
\n
    \n
  • pipeline (Union[IPipeline, PipelineDefinition]) \u2013 The pipeline to execute.

  • \n
  • run_config (Optional[dict]) \u2013 The configuration that parametrizes this run,\nas a dict.

  • \n
  • mode (Optional[str]) \u2013 The name of the pipeline mode to use. You may not set both mode\nand preset.

  • \n
  • preset (Optional[str]) \u2013 The name of the pipeline preset to use. You may not set both\nmode and preset.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary key-value pairs that will be added to pipeline\nlogs.

  • \n
  • solid_selection (Optional[List[str]]) \u2013

    A list of solid selection queries (including single\nsolid names) to execute. For example:

    \n
      \n
    • ['some_solid']: selects some_solid itself.

    • \n
    • ['*some_solid']: select some_solid and all its ancestors (upstream dependencies).

    • \n
    • ['*some_solid+++']: select some_solid, all its ancestors, and its descendants\n(downstream dependencies) within 3 levels down.

    • \n
    • ['*some_solid', 'other_solid_a', 'other_solid_b+']: select some_solid and all its\nancestors, other_solid_a itself, and other_solid_b and its direct child solids.

    • \n
    \n

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The instance to execute against. If this is None,\nan ephemeral instance will be used, and no artifacts will be persisted from the run.

  • \n
\n
\n
Returns
\n

The stream of events resulting from pipeline execution.

\n
\n
Return type
\n

Iterator[DagsterEvent]

\n
\n
\n
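
For example, a sketch of handling events as they stream (reusing the hypothetical hello_pipeline from the execute_pipeline() example above):

\n
from dagster import execute_pipeline_iterator\n\n# Handle events as they are produced rather than waiting for a final result object.\nfor event in execute_pipeline_iterator(hello_pipeline):\n    if event.is_step_success:\n        print("step succeeded:", event.step_key)\n
\n
\n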
\n\n
\n
\nclass dagster.PipelineExecutionResult(pipeline_def, run_id, event_list, reconstruct_context, output_capture=None)[source]\u00b6
\n

The result of executing a pipeline.

\n

Returned by execute_pipeline(). Users should not instantiate this class directly.

\n
\n
\noutput_for_solid(handle_str, output_name='result')\u00b6
\n

Get the output of a solid by its solid handle string and output name.

\n
\n
Parameters
\n
    \n
  • handle_str (str) \u2013 The string handle for the solid.

  • \n
  • output_name (str) \u2013 Optional. The name of the output, defaults to DEFAULT_OUTPUT.

  • \n
\n
\n
Returns
\n

The output value for the handle and output_name.

\n
\n
\n
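
For example, using the math_pipeline defined earlier on this page, a named output can be retrieved as follows (a sketch):

\n
from dagster import execute_pipeline\n\n# Assumes math_pipeline from the @pipeline example above is in scope.\nresult = execute_pipeline(math_pipeline)\n\n# Retrieve the output named "two" produced by the emit_two_four solid.\nassert result.output_for_solid("emit_two_four", output_name="two") == 2\n
\n
\n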
\n\n
\n
\nresult_for_handle(handle)\u00b6
\n

Get the result of a solid by its solid handle.

\n

This allows indexing into top-level solids to retrieve the results of children of\ncomposite solids.

\n
\n
Parameters
\n

handle (Union[str,NodeHandle]) \u2013 The handle for the solid.

\n
\n
Returns
\n

The result of the given\nsolid.

\n
\n
Return type
\n

Union[CompositeSolidExecutionResult, SolidExecutionResult]

\n
\n
\n
\n\n
\n
\nresult_for_solid(name)\u00b6
\n

Get the result of a top level solid.

\n
\n
Parameters
\n

name (str) \u2013 The name of the top-level solid or aliased solid for which to retrieve the\nresult.

\n
\n
Returns
\n

The result of the solid\nexecution within the pipeline.

\n
\n
Return type
\n

Union[CompositeSolidExecutionResult, SolidExecutionResult]

\n
\n
\n
\n\n
\n
\nproperty solid_result_list\u00b6
\n

The results for each\ntop level solid.

\n
\n
Type
\n

List[Union[CompositeSolidExecutionResult, SolidExecutionResult]]

\n
\n
\n
\n\n
\n
\nproperty step_event_list\u00b6
\n

List[DagsterEvent] The full list of events generated by steps in the execution.

\n

Excludes events generated by the pipeline lifecycle, e.g., PIPELINE_START.

\n
\n\n
\n
\nproperty success\u00b6
\n

Whether all steps in the execution were successful.

\n
\n
Type
\n

bool

\n
\n
\n
\n\n
\n\n
\n
\ndagster.default_executors List[ExecutorDefinition]\u00b6
\n


\n

The default executors available on any ModeDefinition that does not provide custom\nexecutors. These are currently [in_process_executor,\nmultiprocess_executor].

\n
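
For example, a sketch of passing the default executors to a ModeDefinition explicitly (extend the list to add custom executors alongside the built-in ones):

\n
from dagster import ModeDefinition, default_executors\n\n# default_executors is a plain list of ExecutorDefinitions.\nmode = ModeDefinition(name="default", executor_defs=[*default_executors])\nprint([executor_def.name for executor_def in default_executors])\n
\n
\n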
\n\n
\n
\n

Re-executing pipelines\u00b6

\n
\n
\ndagster.reexecute_pipeline(pipeline, parent_run_id, run_config=None, step_selection=None, mode=None, preset=None, tags=None, instance=None, raise_on_error=True)[source]\u00b6
\n

Reexecute an existing pipeline run.

\n

Users will typically call this API when testing pipeline reexecution, or running standalone\nscripts.

\n
\n
Parameters
\n
    \n
  • pipeline (Union[IPipeline, PipelineDefinition]) \u2013 The pipeline to execute.

  • \n
  • parent_run_id (str) \u2013 The id of the previous run to reexecute. The run must exist in the\ninstance.

  • \n
  • run_config (Optional[dict]) \u2013 The configuration that parametrizes this run,\nas a dict.

  • \n
  • solid_selection (Optional[List[str]]) \u2013

    A list of solid selection queries (including single\nsolid names) to execute. For example:

    \n
      \n
    • ['some_solid']: selects some_solid itself.

    • \n
    • ['*some_solid']: select some_solid and all its ancestors (upstream dependencies).

    • \n
    • ['*some_solid+++']: select some_solid, all its ancestors, and its descendants\n(downstream dependencies) within 3 levels down.

    • \n
    • ['*some_solid', 'other_solid_a', 'other_solid_b+']: select some_solid and all its\nancestors, other_solid_a itself, and other_solid_b and its direct child solids.

    • \n
    \n

  • \n
  • mode (Optional[str]) \u2013 The name of the pipeline mode to use. You may not set both mode\nand preset.

  • \n
  • preset (Optional[str]) \u2013 The name of the pipeline preset to use. You may not set both\nmode and preset.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary key-value pairs that will be added to pipeline\nlogs.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The instance to execute against. If this is None,\nan ephemeral instance will be used, and no artifacts will be persisted from the run.

  • \n
  • raise_on_error (Optional[bool]) \u2013 Whether or not to raise exceptions when they occur.\nDefaults to True, since this is the most useful behavior in tests.

  • \n
\n
\n
Returns
\n

The result of pipeline execution.

\n
\n
Return type
\n

PipelineExecutionResult

\n
\n
\n

For the asynchronous version, see reexecute_pipeline_iterator().

\n
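
For example, a sketch of re-executing a prior run against the same instance (hello_pipeline is the hypothetical pipeline from the execute_pipeline() example):

\n
from dagster import DagsterInstance, execute_pipeline, reexecute_pipeline\n\n# The parent run must exist in the instance passed to reexecute_pipeline.\ninstance = DagsterInstance.ephemeral()\nparent_result = execute_pipeline(hello_pipeline, instance=instance)\n\nresult = reexecute_pipeline(\n    hello_pipeline,\n    parent_run_id=parent_result.run_id,\n    instance=instance,\n)\nassert result.success\n
\n
\n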
\n\n
\n
\ndagster.reexecute_pipeline_iterator(pipeline, parent_run_id, run_config=None, step_selection=None, mode=None, preset=None, tags=None, instance=None)[source]\u00b6
\n

Reexecute a pipeline iteratively.

\n

Rather than package up the result of running a pipeline into a single object, like\nreexecute_pipeline(), this function yields the stream of events resulting from pipeline\nreexecution.

\n

This is intended to allow the caller to handle these events on a streaming basis in whatever\nway is appropriate.

\n
\n
Parameters
\n
    \n
  • pipeline (Union[IPipeline, PipelineDefinition]) \u2013 The pipeline to execute.

  • \n
  • parent_run_id (str) \u2013 The id of the previous run to reexecute. The run must exist in the\ninstance.

  • \n
  • run_config (Optional[dict]) \u2013 The configuration that parametrizes this run,\nas a dict.

  • \n
  • solid_selection (Optional[List[str]]) \u2013

    A list of solid selection queries (including single\nsolid names) to execute. For example:

    \n
      \n
    • ['some_solid']: selects some_solid itself.

    • \n
    • ['*some_solid']: select some_solid and all its ancestors (upstream dependencies).

    • \n
    • ['*some_solid+++']: select some_solid, all its ancestors, and its descendants\n(downstream dependencies) within 3 levels down.

    • \n
    • ['*some_solid', 'other_solid_a', 'other_solid_b+']: select some_solid and all its\nancestors, other_solid_a itself, and other_solid_b and its direct child solids.

    • \n
    \n

  • \n
  • mode (Optional[str]) \u2013 The name of the pipeline mode to use. You may not set both mode\nand preset.

  • \n
  • preset (Optional[str]) \u2013 The name of the pipeline preset to use. You may not set both\nmode and preset.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary key-value pairs that will be added to pipeline\nlogs.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The instance to execute against. If this is None,\nan ephemeral instance will be used, and no artifacts will be persisted from the run.

  • \n
\n
\n
Returns
\n

The stream of events resulting from pipeline reexecution.

\n
\n
Return type
\n

Iterator[DagsterEvent]

\n
\n
\n
\n\n
\n
\n

Reconstructable pipelines\u00b6

\n
\n
\nclass dagster.reconstructable(target)[source]
\n

Create a ReconstructablePipeline from a\nfunction that returns a PipelineDefinition/JobDefinition,\nor a function decorated with @pipeline/@job.

\n

When your pipeline/job must cross process boundaries, e.g., for execution on multiple nodes or\nin different systems (like dagstermill), Dagster must know how to reconstruct the pipeline/job\non the other side of the process boundary.

\n

Passing a job created with GraphDefinition.to_job to reconstructable()\nrequires you to wrap that job\u2019s definition in a module-scoped function, and pass that function\ninstead:

\n
from dagster import graph, reconstructable\n\n@graph\ndef my_graph():\n    ...\n\ndef define_my_job():\n    return my_graph.to_job()\n\nreconstructable(define_my_job)\n
\n
\n

This function implements a very conservative strategy for reconstruction, so that its behavior\nis easy to predict, but as a consequence it is not able to reconstruct certain kinds of pipelines\nor jobs, such as those defined by lambdas, in nested scopes (e.g., dynamically within a method\ncall), or in interactive environments such as the Python REPL or Jupyter notebooks.

\n

If you need to reconstruct objects constructed in these ways, you should use\nbuild_reconstructable_job() instead, which allows you to\nspecify your own reconstruction strategy.

\n

Examples:

\n
from dagster import job, reconstructable\n\n@job\ndef foo_job():\n    ...\n\nreconstructable_foo_job = reconstructable(foo_job)\n\n\n@graph\ndef foo():\n    ...\n\ndef make_bar_job():\n    return foo.to_job()\n\nreconstructable_bar_job = reconstructable(make_bar_job)\n
\n
\n
\n\n
\n
\nclass dagster.core.definitions.reconstruct.ReconstructablePipeline(repository, pipeline_name, solid_selection_str=None, solids_to_execute=None)[source]\u00b6
\n

Defines a reconstructable pipeline. When your pipeline/job must cross process boundaries,\nDagster must know how to reconstruct the pipeline/job on the other side of the process boundary.

\n
\n
Parameters
\n
    \n
  • repository (ReconstructableRepository) \u2013 The reconstructable representation of the repository\nthe pipeline/job belongs to.

  • \n
  • pipeline_name (str) \u2013 The name of the pipeline/job.

  • \n
  • solid_selection_str (Optional[str]) \u2013 The string value of a comma separated list of user-input\nsolid/op selection. None if no selection is specified, i.e. the entire pipeline/job will\nbe run.

  • \n
  • solids_to_execute (Optional[FrozenSet[str]]) \u2013 A set of solid/op names to execute. None if no selection\nis specified, i.e. the entire pipeline/job will be run.

  • \n
\n
\n
\n
\n
\nget_module()[source]\u00b6
\n

Return the module the pipeline is found in, if the origin is a module code pointer.

\n
\n\n
\n\n
\n
\n

Dependencies and aliases\u00b6

\n
\n
\nclass dagster.DependencyDefinition(solid=None, output='result', description=None, node=None)[source]\u00b6
\n

Represents an edge in the DAG of nodes (ops or graphs) forming a job.

\n

This object is used at the leaves of a dictionary structure that represents the complete\ndependency structure of a job whose keys represent the dependent node and dependent\ninput, so this object only contains information about the dependee.

\n

Concretely, if the input named \u2018input\u2019 of op_b depends on the output named \u2018result\u2019 of\nop_a, and the input named \u2018input\u2019 of op_c depends on the output named \u2018other_result\u2019 of\ngraph_a, the structure will look as follows:

\n
dependency_structure = {\n    'op_b': {\n        'input': DependencyDefinition('op_a', 'result')\n    },\n    'op_c': {\n        'input': DependencyDefinition('graph_a', 'other_result')\n    }\n}\n
\n
\n

In general, users should prefer not to construct this class directly or use the\nJobDefinition API that requires instances of this class. Instead, use the\n@job API:

\n
@job\ndef the_job():\n    node_b(node_a())\n
\n
\n
\n
Parameters
\n
    \n
  • solid (str) \u2013 (legacy) The name of the solid that is depended on, that is, from which the value\npassed between the two nodes originates.

  • \n
  • output (Optional[str]) \u2013 The name of the output that is depended on. (default: \u201cresult\u201d)

  • \n
  • description (Optional[str]) \u2013 Human-readable description of this dependency.

  • \n
  • node (str) \u2013 The name of the node (op or graph) that is depended on, that is, from which the value\npassed between the two nodes originates.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.MultiDependencyDefinition(dependencies)[source]\u00b6
\n

Represents a fan-in edge in the DAG of op instances forming a job.

\n

This object is used only when an input of type List[T] is assembled by fanning-in multiple\nupstream outputs of type T.

\n

This object is used at the leaves of a dictionary structure that represents the complete\ndependency structure of a job or pipeline whose keys represent the dependent ops or graphs and dependent\ninput, so this object only contains information about the dependee.

\n

Concretely, if the input named \u2018input\u2019 of op_c depends on the outputs named \u2018result\u2019 of\nop_a and op_b, this structure will look as follows:

\n
dependency_structure = {\n    'op_c': {\n        'input': MultiDependencyDefinition(\n            [\n                DependencyDefinition('op_a', 'result'),\n                DependencyDefinition('op_b', 'result')\n            ]\n        )\n    }\n}\n
\n
\n

In general, users should prefer not to construct this class directly or use the\nJobDefinition API that requires instances of this class. Instead, use the\n@job API:

\n
@job\ndef the_job():\n    op_c(op_a(), op_b())\n
\n
\n
\n
Parameters
\n

dependencies (List[Union[DependencyDefinition, Type[MappedInputPlaceHolder]]]) \u2013 List of\nupstream dependencies fanned in to this input.

\n
\n
\n
\n\n
\n
\ndagster.SolidInvocation\u00b6
\n

alias of dagster.core.definitions.dependency.NodeInvocation

\n
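
For example, a sketch of using invocation objects as dependency keys to alias the same solid twice (the solid names here are hypothetical):

\n
from dagster import (\n    DependencyDefinition,\n    PipelineDefinition,\n    SolidInvocation,\n    lambda_solid,\n)\n\n@lambda_solid\ndef return_one() -> int:\n    return 1\n\n@lambda_solid\ndef add_one(num: int) -> int:\n    return num + 1\n\n# Invoke add_one twice under different aliases via SolidInvocation keys.\npipeline_def = PipelineDefinition(\n    name="aliased",\n    solid_defs=[return_one, add_one],\n    dependencies={\n        SolidInvocation("add_one", alias="first_add"): {\n            "num": DependencyDefinition("return_one")\n        },\n        SolidInvocation("add_one", alias="second_add"): {\n            "num": DependencyDefinition("first_add")\n        },\n    },\n)\n
\n
\n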
\n\n
\n
\n

Pipeline configuration\u00b6

\n
\n

Run Config Schema\u00b6

\n
\n

The run_config used by execute_pipeline() and\nexecute_pipeline_iterator() has the following schema:

\n
{\n  # configuration for execution, required if executors require config\n  execution: {\n    # the name of one, and only one available executor, typically 'in_process' or 'multiprocess'\n    __executor_name__: {\n      # executor-specific config, if required or permitted\n      config: {\n        ...\n      }\n    }\n  },\n\n  # configuration for loggers, required if loggers require config\n  loggers: {\n    # the name of an available logger\n    __logger_name__: {\n      # logger-specific config, if required or permitted\n      config: {\n        ...\n      }\n    },\n    ...\n  },\n\n  # configuration for resources, required if resources require config\n  resources: {\n    # the name of a resource\n    __resource_name__: {\n      # resource-specific config, if required or permitted\n      config: {\n        ...\n      }\n    },\n    ...\n  },\n\n  # configuration for solids, required if solids require config\n  solids: {\n\n    # these keys align with the names of the solids, or their alias in this pipeline\n    __solid_name__: {\n\n      # pass any data that was defined via config_field\n      config: ...,\n\n      # configurably specify input values, keyed by input name\n      inputs: {\n        __input_name__: {\n          # if an dagster_type_loader is specified, that schema must be satisfied here;\n          # scalar, built-in types will generally allow their values to be specified directly:\n          value: ...\n        }\n      },\n\n    }\n  },\n\n}\n
\n
\n
\n
\n
\n
\n", "current_page_name": "sections/api/apidocs/pipeline", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../presets/", "title": "[Legacy] Presets"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../partitions/", "title": "Partitions"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/presets", "[Legacy] Presets", "N", "next"], ["sections/api/apidocs/partitions", "Partitions", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/pipeline.rst.txt", "title": "[Legacy] Pipelines", "toc": "\n"}, "presets": {"alabaster_version": "0.7.12", "body": "
\n

[Legacy] Presets\u00b6

\n
\n
\nclass dagster.PresetDefinition(name, run_config=None, solid_selection=None, mode=None, tags=None)[source]\u00b6
\n

Defines a preset configuration in which a pipeline can execute.

\n

Presets can be used in Dagit to load predefined configurations into the tool.

\n

Presets may also be used from the Python API (in a script, or in test) as follows:

\n
execute_pipeline(pipeline_def, preset='example_preset')\n
\n
\n

Presets may also be used with the command line tools:

\n
$ dagster pipeline execute example_pipeline --preset example_preset\n
\n
\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of this preset. Must be unique in the presets defined on a given\npipeline.

  • \n
  • run_config (Optional[dict]) \u2013 A dict representing the config to set with the preset.\nThis is equivalent to the run_config argument to execute_pipeline().

  • \n
  • solid_selection (Optional[List[str]]) \u2013 A list of solid subselection (including single\nsolid names) to execute with the preset. e.g. ['*some_solid+', 'other_solid']

  • \n
  • mode (Optional[str]) \u2013 The mode to apply when executing this preset. (default: \u2018default\u2019)

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 The tags to apply when executing this preset.

  • \n
\n
\n
\n
\n
\nstatic from_files(name, config_files=None, solid_selection=None, mode=None, tags=None)[source]\u00b6
\n

Static constructor for presets from YAML files.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of this preset. Must be unique in the presets defined on a given\npipeline.

  • \n
  • config_files (Optional[List[str]]) \u2013 List of paths or glob patterns for yaml files\nto load and parse as the run config for this preset.

  • \n
  • solid_selection (Optional[List[str]]) \u2013 A list of solid subselection (including single\nsolid names) to execute with the preset. e.g. ['*some_solid+', 'other_solid']

  • \n
  • mode (Optional[str]) \u2013 The mode to apply when executing this preset. (default:\n\u2018default\u2019)

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 The tags to apply when executing this preset.

  • \n
\n
\n
Returns
\n

A PresetDefinition constructed from the provided YAML files.

\n
\n
Return type
\n

PresetDefinition

\n
\n
Raises
\n

DagsterInvariantViolationError \u2013 When one of the YAML files is invalid and has a parse\n error.

\n
\n
\n
\n\n
\n
\nstatic from_pkg_resources(name, pkg_resource_defs=None, solid_selection=None, mode=None, tags=None)[source]\u00b6
\n

Load a preset from a package resource, using pkg_resources.resource_string().

\n

Example:

\n
PresetDefinition.from_pkg_resources(\n    name='local',\n    mode='local',\n    pkg_resource_defs=[\n        ('dagster_examples.airline_demo.environments', 'local_base.yaml'),\n        ('dagster_examples.airline_demo.environments', 'local_warehouse.yaml'),\n    ],\n)\n
\n
\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of this preset. Must be unique in the presets defined on a given\npipeline.

  • \n
  • pkg_resource_defs (Optional[List[(str, str)]]) \u2013 List of pkg_resource modules/files to\nload as the run config for this preset.

  • \n
  • solid_selection (Optional[List[str]]) \u2013 A list of solid subselection (including single\nsolid names) to execute with the preset. e.g.\n['*some_solid+', 'other_solid']

  • \n
  • mode (Optional[str]) \u2013 The mode to apply when executing this preset. (default:\n\u2018default\u2019)

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 The tags to apply when executing this preset.

  • \n
\n
\n
Returns
\n

A PresetDefinition constructed from the provided package resources.

\n
\n
Return type
\n

PresetDefinition

\n
\n
Raises
\n

DagsterInvariantViolationError \u2013 When one of the YAML documents is invalid and has a\n parse error.

\n
\n
\n
\n\n
\n
\nstatic from_yaml_strings(name, yaml_strings=None, solid_selection=None, mode=None, tags=None)[source]\u00b6
\n

Static constructor for presets from YAML strings.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of this preset. Must be unique in the presets defined on a given\npipeline.

  • \n
  • yaml_strings (Optional[List[str]]) \u2013 List of yaml strings to parse as the environment\nconfig for this preset.

  • \n
  • solid_selection (Optional[List[str]]) \u2013 A list of solid subselection (including single\nsolid names) to execute with the preset. e.g. ['*some_solid+', 'other_solid']

  • \n
  • mode (Optional[str]) \u2013 The mode to apply when executing this preset. (default:\n\u2018default\u2019)

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 The tags to apply when executing this preset.

  • \n
\n
\n
Returns
\n

A PresetDefinition constructed from the provided YAML strings

\n
\n
Return type
\n

PresetDefinition

\n
\n
Raises
\n

DagsterInvariantViolationError \u2013 When one of the YAML documents is invalid and has a\n parse error.

\n
\n
\n
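
For example, a sketch of building a preset from an inline YAML string (mirroring the add_mode example earlier on this page; flow-style YAML keeps the config on one line):

\n
from dagster import PresetDefinition\n\nadd_three_preset = PresetDefinition.from_yaml_strings(\n    name="add_three_preset",\n    yaml_strings=["{resources: {op: {config: 3}}}"],\n    mode="add_mode",\n)\nprint(add_three_preset.run_config)  # {'resources': {'op': {'config': 3}}}\n
\n
\n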
\n\n
\n
\nget_environment_yaml()[source]\u00b6
\n

Get the environment dict set on a preset as YAML.

\n
\n
Returns
\n

The environment dict as YAML.

\n
\n
Return type
\n

str

\n
\n
\n
\n\n
\n
\nwith_additional_config(run_config)[source]\u00b6
\n

Return a new PresetDefinition with additional config merged in to the existing config.

\n
\n\n
\n\n
\n", "current_page_name": "sections/api/apidocs/presets", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "", "next": {"link": "../repositories/", "title": "Repositories"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../pipeline/", "title": "[Legacy] Pipelines"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/repositories", "Repositories", "N", "next"], ["sections/api/apidocs/pipeline", "[Legacy] Pipelines", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/presets.rst.txt", "title": "[Legacy] Presets", "toc": "\n"}, "repositories": {"alabaster_version": "0.7.12", "body": "
\n

Repositories\u00b6

\n
\n
\ndagster.repository RepositoryDefinition[source]\u00b6
\n

Create a repository from the decorated function.

\n

The decorated function should take no arguments and its return value should be one of:

\n

1. List[Union[JobDefinition, PipelineDefinition, PartitionSetDefinition, ScheduleDefinition, SensorDefinition]].\nUse this form when you have no need to lazy load pipelines or other definitions. This is the\ntypical use case.

\n
    \n
  2. A dict of the form:

  \n
\n
{\n    'jobs': Dict[str, Callable[[], JobDefinition]],\n    'pipelines': Dict[str, Callable[[], PipelineDefinition]],\n    'partition_sets': Dict[str, Callable[[], PartitionSetDefinition]],\n    'schedules': Dict[str, Callable[[], ScheduleDefinition]]\n    'sensors': Dict[str, Callable[[], SensorDefinition]]\n}\n
\n
\n

This form is intended to allow definitions to be created lazily when accessed by name,\nwhich can be helpful for performance when there are many definitions in a repository, or\nwhen constructing the definitions is costly.

\n

3. A RepositoryData. Return this object if you need fine-grained\ncontrol over the construction and indexing of definitions within the repository, e.g., to\ncreate definitions dynamically from .yaml files in a directory.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name of the repository. Defaults to the name of the decorated\nfunction.

  • \n
  • description (Optional[str]) \u2013 A string description of the repository.

  • \n
\n
\n
\n

Example:

\n
######################################################################\n# A simple repository using the first form of the decorated function\n######################################################################\n\n@op(config_schema={'n': Field(Int)})\ndef return_n(context):\n    return context.op_config['n']\n\n@job\ndef simple_job():\n    return_n()\n\n@job\ndef some_job():\n    ...\n\n@sensor(job=some_job)\ndef some_sensor():\n    if foo():\n        yield RunRequest(\n            run_key=...,\n            run_config={\n                'ops': {'return_n': {'config': {'n': bar()}}}\n            }\n        )\n\n@job\ndef my_job():\n    ...\n\nmy_schedule = ScheduleDefinition(cron_schedule="0 0 * * *", job=my_job)\n\n@repository\ndef simple_repository():\n    return [simple_job, some_sensor, my_schedule]\n\n\n######################################################################\n# A lazy-loaded repository\n######################################################################\n\ndef make_expensive_job():\n    @job\n    def expensive_job():\n        for i in range(10000):\n            return_n.alias(f'return_n_{i}')()\n\n    return expensive_job\n\ndef make_expensive_schedule():\n    @job\n    def other_expensive_job():\n        for i in range(11000):\n            return_n.alias(f'my_return_n_{i}')()\n\n    return ScheduleDefinition(cron_schedule="0 0 * * *", job=other_expensive_job)\n\n@repository\ndef lazy_loaded_repository():\n    return {\n        'jobs': {'expensive_job': make_expensive_job},\n        'schedules': {'expensive_schedule': make_expensive_schedule}\n    }\n\n\n######################################################################\n# A complex repository that lazily constructs jobs from a directory\n# of files in a bespoke YAML format\n######################################################################\n\nclass ComplexRepositoryData(RepositoryData):\n    def __init__(self, yaml_directory):\n        self._yaml_directory = yaml_directory\n\n    def get_all_pipelines(self):\n        return [\n            self._construct_job_def_from_yaml_file(\n              self._yaml_file_for_job_name(file_name)\n            )\n            for file_name in os.listdir(self._yaml_directory)\n        ]\n\n    ...\n\n@repository\ndef complex_repository():\n    return ComplexRepositoryData('some_directory')\n
\n
\n
\n\n
\n
\nclass dagster.RepositoryDefinition(name, repository_data, description=None)[source]\u00b6
\n

Define a repository that contains a group of definitions.

\n

Users should typically not create objects of this class directly. Instead, use the\n@repository() decorator.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of the repository.

  • \n
  • repository_data (RepositoryData) \u2013 Contains the definitions making up the repository.

  • \n
  • description (Optional[str]) \u2013 A string description of the repository.

  • \n
\n
\n
\n
\n
\nget_all_jobs()[source]\u00b6
\n

Return all jobs in the repository as a list.

\n

Note that this will construct any job in the lazily evaluated dictionary that has\nnot yet been constructed.

\n
\n
Returns
\n

All jobs in the repository.

\n
\n
Return type
\n

List[JobDefinition]

\n
\n
\n
\n\n
\n
\nget_job(name)[source]\u00b6
\n

Get a job by name.

\n

If this job is present in the lazily evaluated dictionary passed to the\nconstructor, but has not yet been constructed, only this job is constructed, and\nwill be cached for future calls.

\n
\n
Parameters
\n

name (str) \u2013 Name of the job to retrieve.

\n
\n
Returns
\n

The job definition corresponding to\nthe given name.

\n
\n
Return type
\n

JobDefinition

\n
\n
\n
\n\n
\n
\nhas_job(name)[source]\u00b6
\n

Check if a job with a given name is present in the repository.

\n
\n
Parameters
\n

name (str) \u2013 The name of the job.

\n
\n
Returns
\n

bool

\n
\n
\n
\n\n
\n
\nproperty job_names\u00b6
\n

Names of all jobs in the repository

\n
\n
Type
\n

List[str]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.RepositoryData[source]\u00b6
\n

Users should usually rely on the @repository decorator to create new\nrepositories, which will in turn call the static constructors on this class. However, users may\nsubclass RepositoryData for fine-grained control over access to and lazy creation\nof repository members.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/repositories", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "", "next": {"link": "../resources/", "title": "Resources"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../presets/", "title": "[Legacy] Presets"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/resources", "Resources", "N", "next"], ["sections/api/apidocs/presets", "[Legacy] Presets", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/repositories.rst.txt", "title": "Repositories", "toc": "\n"}, "resources": {"alabaster_version": "0.7.12", "body": "
\n

Resources\u00b6

\n
\n
\n@dagster.resource(config_schema=None, description=None, required_resource_keys=None, version=None)[source]\u00b6
\n

Define a resource.

\n

The decorated function should accept an InitResourceContext and return an instance of\nthe resource. This function will become the resource_fn of an underlying\nResourceDefinition.

\n

If the decorated function yields once rather than returning (in the manner of functions\ndecorable with @contextlib.contextmanager) then\nthe body of the function after the yield will be run after execution resolves, allowing users\nto write their own teardown/cleanup logic.

\n
\n
Parameters
\n
    \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.resource_config. If not set, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the resource.

  • \n
  • version (Optional[str]) \u2013 (Experimental) The version of a resource function. Two wrapped\nresource functions should only have the same version if they produce the same resource\ndefinition when provided with the same inputs.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Keys for the resources required by this resource.

  • \n
\n
\n
\n
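
For example, a sketch of a configurable resource that uses the yield form so teardown logic runs after execution resolves (FakeConnection is a stand-in for a real client):

\n
from dagster import resource\n\nclass FakeConnection:\n    def __init__(self, conn_string):\n        self.conn_string = conn_string\n\n    def close(self):\n        pass\n\n@resource(config_schema={"conn_string": str})\ndef database_resource(init_context):\n    conn = FakeConnection(init_context.resource_config["conn_string"])\n    try:\n        # Code after the yield runs as teardown once execution resolves.\n        yield conn\n    finally:\n        conn.close()\n
\n
\n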
\n\n
\n
\nclass dagster.ResourceDefinition(resource_fn, config_schema=None, description=None, required_resource_keys=None, version=None)[source]\u00b6
\n

Core class for defining resources.

\n

Resources are scoped ways to make external resources (like database connections) available\nduring job execution and to clean up after execution resolves.

\n

If resource_fn yields once rather than returning (in the manner of functions decorable with\n@contextlib.contextmanager) then the body of the\nfunction after the yield will be run after execution resolves, allowing users to write their\nown teardown/cleanup logic.

\n

Depending on your executor, resources may be instantiated and cleaned up more than once in a\njob execution.

\n
\n
Parameters
\n
    \n
  • resource_fn (Callable[[InitResourceContext], Any]) \u2013 User-provided function to instantiate\nthe resource, which will be made available to executions keyed on the\ncontext.resources object.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. If set, Dagster will check\nthat config provided for the resource matches this schema and fail if it does not. If\nnot set, Dagster will accept any config provided for the resource.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the resource.

  • \n
  • required_resource_keys \u2013 (Optional[Set[str]]) Keys for the resources required by this\nresource. A DagsterInvariantViolationError will be raised during initialization if\ndependencies are cyclic.

  • \n
  • version (Optional[str]) \u2013 (Experimental) The version of the resource\u2019s definition fn. Two\nwrapped resource functions should only have the same version if they produce the same\nresource definition when provided with the same inputs.

  • \n
\n
\n
\n
\n
\nconfigured(config_or_config_fn, config_schema=None, description=None)\u00b6
\n

Wraps this object in an object of the same type that provides configuration to the inner\nobject.

\n
\n
Parameters
\n
    \n
  • config_or_config_fn (Union[Any, Callable[[Any], Any]]) \u2013 Either (1) Run configuration\nthat fully satisfies this object\u2019s config schema or (2) A function that accepts run\nconfiguration and returns run configuration that fully satisfies this object\u2019s\nconfig schema. In the latter case, config_schema must be specified. When\npassing a function, it\u2019s easiest to use configured().

  • \n
  • config_schema (ConfigSchema) \u2013 If config_or_config_fn is a function, the config schema\nthat its input must satisfy.

  • \n
  • description (Optional[str]) \u2013 Description of the new definition. If not specified,\ninherits the description of the definition being configured.

  • \n
\n
\n
\n

Returns (ConfigurableDefinition): A configured version of this object.

\n
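
For example, a sketch of pinning a resource's config up front so runs no longer need to supply it (client_stub and sized_client are hypothetical names):

\n
from dagster import resource\n\n@resource(config_schema={"pool_size": int})\ndef client_stub(init_context):\n    return {"pool_size": init_context.resource_config["pool_size"]}\n\n# The returned definition has its config baked in.\nsized_client = client_stub.configured({"pool_size": 10})\n
\n
\n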
\n\n
\n
\nstatic hardcoded_resource(value, description=None)[source]\u00b6
\n

A helper function that creates a ResourceDefinition with a hardcoded object.

\n
\n
Parameters
\n
    \n
  • value (Any) \u2013 The value that will be accessible via context.resources.resource_name.

  • \n
  • description ([Optional[str]]) \u2013 The description of the resource. Defaults to None.

  • \n
\n
\n
Returns
\n

A hardcoded resource.

\n
\n
Return type
\n

[ResourceDefinition]

\n
\n
\n
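
For example, a sketch of exposing a fixed value (here a plain dict standing in for a client) as a resource:

\n
from dagster import ResourceDefinition\n\nstub_client = ResourceDefinition.hardcoded_resource(\n    {"base_url": "https://example.com"},\n    description="A hardcoded stand-in for a real client.",\n)\n
\n
\n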
\n\n
\n
\nstatic mock_resource(description=None)[source]\u00b6
\n

A helper function that creates a ResourceDefinition which wraps a mock.MagicMock.

\n
\n
Parameters
\n

description ([Optional[str]]) \u2013 The description of the resource. Defaults to None.

\n
\n
Returns
\n

\n
A resource that creates the magic methods automatically and helps you mock existing resources.

\n
\n
\n

\n
\n
Return type
\n

[ResourceDefinition]

\n
\n
\n
\n\n
\n
\nstatic none_resource(description=None)[source]\u00b6
\n

A helper function that returns a none resource.

\n
\n
Parameters
\n

description ([Optional[str]]) \u2013 The description of the resource. Defaults to None.

\n
\n
Returns
\n

A resource that does nothing.

\n
\n
Return type
\n

[ResourceDefinition]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.InitResourceContext(resource_config, resources, resource_def=None, instance=None, dagster_run=None, pipeline_run=None, log_manager=None, pipeline_def_for_backwards_compat=None)[source]\u00b6
\n

Resource-specific initialization context.

\n
\n
\nresource_config\u00b6
\n

The configuration data provided by the run config. The schema\nfor this data is defined by the config_field argument to\nResourceDefinition.

\n
\n
Type
\n

Any

\n
\n
\n
\n\n
\n
\nresource_def\u00b6
\n

The definition of the resource currently being\nconstructed.

\n
\n
Type
\n

ResourceDefinition

\n
\n
\n
\n\n
\n
\nlog_manager\u00b6
\n

The log manager for this run of the job or pipeline

\n
\n
Type
\n

DagsterLogManager

\n
\n
\n
\n\n
\n
\nresources\u00b6
\n

The resources that are available to the resource that we are\ninitializing.

\n
\n
Type
\n

ScopedResources

\n
\n
\n
\n\n
\n
\ndagster_run\u00b6
\n

The dagster run to use. When initializing resources\noutside of execution context, this will be None.

\n
\n
Type
\n

Optional[PipelineRun]

\n
\n
\n
\n\n
\n
\nrun_id\u00b6
\n

The id for this run of the job or pipeline. When initializing resources\noutside of execution context, this will be None.

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\npipeline_run\u00b6
\n

(legacy) The dagster run to use. When initializing resources\noutside of execution context, this will be None.

\n
\n
Type
\n

Optional[PipelineRun]

\n
\n
\n
\n\n
\n\n
\n
\ndagster.make_values_resource(**kwargs)[source]\u00b6
\n

A helper function that creates a ResourceDefinition to take in user-defined values.

\n
\n

This is useful for sharing values between ops.

\n
\n
\n
Parameters
\n

**kwargs \u2013 Arbitrary keyword arguments that will be passed to the config schema of the\nreturned resource definition. If not set, Dagster will accept any config provided for\nthe resource.

\n
\n
\n

For example:

\n
@op(required_resource_keys={"globals"})\ndef my_op(context):\n    print(context.resources.globals["my_str_var"])\n\n@job(resource_defs={"globals": make_values_resource(my_str_var=str, my_int_var=int)})\ndef my_job():\n    my_op()\n
\n
\n
\n
Returns
\n

A resource that passes in user-defined values.

\n
\n
Return type
\n

ResourceDefinition

\n
\n
\n
\n\n
\n
\ndagster.build_init_resource_context(config=None, resources=None, instance=None)[source]\u00b6
\n

Builds resource initialization context from provided parameters.

\n

build_init_resource_context can be used as either a function or a context manager. If any\nresource provided to build_init_resource_context is a context manager, then\nbuild_init_resource_context must itself be used as a context manager. This function can be used\nto provide the context argument to the invocation of a resource.

\n
\n
Parameters
\n
    \n
  • resources (Optional[Dict[str, Any]]) \u2013 The resources to provide to the context. These can be\neither values or resource definitions.

  • \n
  • config (Optional[Any]) \u2013 The resource config to provide to the context.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The dagster instance configured for the context.\nDefaults to DagsterInstance.ephemeral().

  • \n
\n
\n
\n

Examples

\n
context = build_init_resource_context()\nresource_to_init(context)\n\nwith build_init_resource_context(\n    resources={"foo": context_manager_resource}\n) as context:\n    resource_to_init(context)\n
\n
\n
\n\n
\n
\ndagster.build_resources(resources, instance=None, resource_config=None, pipeline_run=None, log_manager=None)[source]\u00b6
\n

Context manager that yields resources using provided resource definitions and run config.

\n

This API allows for using resources in an independent context. Resources will be initialized\nwith the provided run config, and optionally, pipeline_run. The resulting resources will be\nyielded on an object whose keys match those of the provided resources dictionary. Upon exiting the\ncontext, resources will also be torn down safely.

\n
\n
Parameters
\n
    \n
  • resources (Dict[str, Any]) \u2013 Resource instances or definitions to build. All\nrequired resource dependencies to a given resource must be contained within this\ndictionary, or the resource build will fail.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The dagster instance configured to instantiate\nresources on.

  • \n
  • resource_config (Optional[Dict[str, Any]]) \u2013 A dict representing the config to be\nprovided to each resource during initialization and teardown.

  • \n
  • pipeline_run (Optional[PipelineRun]) \u2013 The pipeline run to provide during resource\ninitialization and teardown. If the provided resources require either the pipeline_run\nor run_id attributes of the provided context during resource initialization and/or\nteardown, this must be provided, or initialization will fail.

  • \n
  • log_manager (Optional[DagsterLogManager]) \u2013 Log Manager to use during resource\ninitialization. Defaults to system log manager.

  • \n
\n
\n
\n

Examples:

\n
from dagster import resource, build_resources\n\n@resource\ndef the_resource():\n    return "foo"\n\nwith build_resources(resources={"from_def": the_resource, "from_val": "bar"}) as resources:\n    assert resources.from_def == "foo"\n    assert resources.from_val == "bar"\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/resources", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "", "next": {"link": "../schedules-sensors/", "title": "Run Requests"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../repositories/", "title": "Repositories"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/schedules-sensors", "Run Requests", "N", "next"], ["sections/api/apidocs/repositories", "Repositories", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/resources.rst.txt", "title": "Resources", "toc": "\n"}, "schedules-sensors": {"alabaster_version": "0.7.12", "body": "
\n

Run Requests\u00b6

\n
\n
\nclass dagster.RunRequest(run_key, run_config=None, tags=None, job_name=None)[source]\u00b6
\n

Represents all the information required to launch a single run. Must be returned by a\nSensorDefinition or ScheduleDefinition\u2019s evaluation function for a run to be launched.

\n
\n
\nrun_key\u00b6
\n

A string key to identify this launched run. For sensors, ensures that\nonly one run is created per run key across all sensor evaluations. For schedules,\nensures that one run is created per tick, across failure recoveries. Passing in a None\nvalue means that a run will always be launched per evaluation.

\n
\n
Type
\n

str | None

\n
\n
\n
\n\n
\n
\nrun_config\u00b6
\n

The config that parameterizes the run execution to\nbe launched, as a dict.

\n
\n
Type
\n

Optional[Dict]

\n
\n
\n
\n\n
\n
\ntags\u00b6
\n

A dictionary of tags (string key-value pairs) to attach\nto the launched run.

\n
\n
Type
\n

Optional[Dict[str, str]]

\n
\n
\n
\n\n
\n
\njob_name\u00b6
\n

(Experimental) The name of the job this run request will launch.\nRequired for sensors that target multiple jobs.

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
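
For example, a sketch of a sensor that yields a RunRequest when there is work to do and a SkipReason otherwise (process_job and the polling logic are hypothetical):

\n
from dagster import RunRequest, SkipReason, job, op, sensor\n\n@op\ndef process():\n    return 1\n\n@job\ndef process_job():\n    process()\n\n@sensor(job=process_job)\ndef my_sensor(_context):\n    new_keys = []  # stand-in for real polling/cursor logic\n    if not new_keys:\n        yield SkipReason("No new work to process.")\n        return\n    for key in new_keys:\n        # One run is created per unique run_key across sensor evaluations.\n        yield RunRequest(run_key=key, run_config={})\n
\n
\n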
\n\n
\n
\nclass dagster.SkipReason(skip_message=None)[source]\u00b6
\n

Represents a skipped evaluation, where no runs are requested. May contain a message to indicate\nwhy no runs were requested.

\n
\n
\nskip_message\u00b6
\n

A message displayed in dagit for why this evaluation resulted\nin no requested runs.

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n\n
\n
\n

Schedules\u00b6

\n
\n
\n@dagster.schedule(cron_schedule, pipeline_name=None, name=None, tags=None, tags_fn=None, solid_selection=None, mode='default', should_execute=None, environment_vars=None, execution_timezone=None, description=None, job=None, default_status=<DefaultScheduleStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Creates a schedule following the provided cron schedule and requests runs for the provided job.

\n

The decorated function takes in a ScheduleEvaluationContext as its only\nargument, and does one of the following:

\n
    \n
  1. Return a RunRequest object.

  \n
  2. Yield multiple RunRequest objects.

  \n
  3. Return or yield a SkipReason object, providing a descriptive message of why no runs were\nrequested.

  \n
  4. Return or yield nothing (skipping without providing a reason).

  \n
  5. Return a run config dictionary.

  \n
\n

Returns a ScheduleDefinition.

\n
\n
Parameters
\n
    \n
  • cron_schedule (str) \u2013 A valid cron string specifying when the schedule will run, e.g.,\n'45 23 * * 6' for a schedule that runs at 11:45 PM every Saturday.

  • \n
  • pipeline_name (Optional[str]) \u2013 (legacy) The name of the pipeline to execute when the\nschedule runs.

  • \n
  • name (Optional[str]) \u2013 The name of the schedule to create.

  • \n
  • tags (Optional[Dict[str, str]]) \u2013 A dictionary of tags (string key-value pairs) to attach\nto the scheduled runs.

  • \n
  • tags_fn (Optional[Callable[[ScheduleEvaluationContext], Optional[Dict[str, str]]]]) \u2013 A function\nthat generates tags to attach to the schedule\u2019s runs. Takes a\nScheduleEvaluationContext and returns a dictionary of tags (string\nkey-value pairs). You may set only one of tags and tags_fn.

  • \n
  • solid_selection (Optional[List[str]]) \u2013 A list of solid subselection (including single\nsolid names) to execute when the schedule runs. e.g. ['*some_solid+', 'other_solid']

  • \n
  • mode (Optional[str]) \u2013 The pipeline mode in which to execute this schedule.\n(Default: \u2018default\u2019)

  • \n
  • should_execute (Optional[Callable[[ScheduleEvaluationContext], bool]]) \u2013 A function that runs at\nschedule execution time to determine whether a schedule should execute or skip. Takes a\nScheduleEvaluationContext and returns a boolean (True if the\nschedule should execute). Defaults to a function that always returns True.

  • \n
  • environment_vars (Optional[Dict[str, str]]) \u2013 Any environment variables to set when executing\nthe schedule.

  • \n
  • execution_timezone (Optional[str]) \u2013 Timezone in which the schedule should run.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the schedule.

  • \n
  • job (Optional[Union[GraphDefinition, JobDefinition]]) \u2013 The job that should execute when this\nschedule runs.

  • \n
  • default_status (DefaultScheduleStatus) \u2013 Whether the schedule starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
\n
\n
\n
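
For example, a sketch of a schedule that runs a job every day at midnight UTC (say_hello and hello_job are hypothetical):

\n
from dagster import job, op, schedule\n\n@op\ndef say_hello(context):\n    context.log.info("hello")\n\n@job\ndef hello_job():\n    say_hello()\n\n@schedule(cron_schedule="0 0 * * *", job=hello_job, execution_timezone="UTC")\ndef daily_hello_schedule(_context):\n    # Returning a dict provides the run config for the launched run.\n    return {}\n
\n
\n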
\n\n
\n
\nclass dagster.ScheduleDefinition(name=None, cron_schedule=None, pipeline_name=None, run_config=None, run_config_fn=None, tags=None, tags_fn=None, solid_selection=None, mode='default', should_execute=None, environment_vars=None, execution_timezone=None, execution_fn=None, description=None, job=None, default_status=<DefaultScheduleStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Define a schedule that targets a job

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name of the schedule to create. Defaults to the job name plus\n\u201c_schedule\u201d.

  • \n
  • cron_schedule (str) \u2013 A valid cron string specifying when the schedule will run, e.g.,\n\u201845 23 * * 6\u2019 for a schedule that runs at 11:45 PM every Saturday.

  • \n
  • pipeline_name (Optional[str]) \u2013 (legacy) The name of the pipeline to execute when the schedule runs.

  • \n
  • execution_fn (Callable[ScheduleEvaluationContext]) \u2013

    The core evaluation function for the\nschedule, which is run at an interval to determine whether a run should be launched or\nnot. Takes a ScheduleEvaluationContext.

    \n

    This function must return a generator, which must yield either a single SkipReason\nor one or more RunRequest objects.

    \n

  • \n
  • run_config (Optional[Dict]) \u2013 The config that parameterizes this execution,\nas a dict.

  • \n
  • run_config_fn (Optional[Callable[[ScheduleEvaluationContext], [Dict]]]) \u2013 A function that\ntakes a ScheduleEvaluationContext object and returns the run configuration that\nparameterizes this execution, as a dict. You may set only one of run_config,\nrun_config_fn, and execution_fn.

  • \n
  • tags (Optional[Dict[str, str]]) \u2013 A dictionary of tags (string key-value pairs) to attach\nto the scheduled runs.

  • \n
  • tags_fn (Optional[Callable[[ScheduleEvaluationContext], Optional[Dict[str, str]]]]) \u2013 A\nfunction that generates tags to attach to the schedule\u2019s runs. Takes a\nScheduleEvaluationContext and returns a dictionary of tags (string\nkey-value pairs). You may set only one of tags, tags_fn, and execution_fn.

  • \n
  • solid_selection (Optional[List[str]]) \u2013 A list of solid subselection (including single\nsolid names) to execute when the schedule runs. e.g. ['*some_solid+', 'other_solid']

  • \n
  • mode (Optional[str]) \u2013 (legacy) The mode to apply when executing this schedule. (default: \u2018default\u2019)

  • \n
  • should_execute (Optional[Callable[[ScheduleEvaluationContext], bool]]) \u2013 A function that runs\nat schedule execution time to determine whether a schedule should execute or skip. Takes\na ScheduleEvaluationContext and returns a boolean (True if the\nschedule should execute). Defaults to a function that always returns True.

  • \n
  • environment_vars (Optional[dict[str, str]]) \u2013 The environment variables to set for the\nschedule

  • \n
  • execution_timezone (Optional[str]) \u2013 Timezone in which the schedule should run.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the schedule.

  • \n
  • job (Optional[Union[GraphDefinition, JobDefinition]]) \u2013 The job that should execute when this\nschedule runs.

  • \n
  • default_status (DefaultScheduleStatus) \u2013 Whether the schedule starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.ScheduleEvaluationContext(instance_ref, scheduled_execution_time)[source]\u00b6
\n

Schedule-specific execution context.

\n

An instance of this class is made available as the first argument to various ScheduleDefinition\nfunctions. It is passed as the first argument to run_config_fn, tags_fn,\nand should_execute.

\n
\n
\ninstance_ref\u00b6
\n

The serialized instance configured to run the schedule

\n
\n
Type
\n

Optional[InstanceRef]

\n
\n
\n
\n\n
\n
\nscheduled_execution_time\u00b6
\n

The time in which the execution was scheduled to happen. May differ slightly\nfrom both the actual execution time and the time at which the run config is computed.\nNot available in all schedulers - currently only set in deployments using\nDagsterDaemonScheduler.

\n
\n
Type
\n

datetime

\n
\n
\n
\n\n
\n\n
\n
\ndagster.build_schedule_context(instance=None, scheduled_execution_time=None)[source]\u00b6
\n

Builds schedule execution context using the provided parameters.

\n

The instance provided to build_schedule_context must be persistent;\nDagsterInstance.ephemeral() will result in an error.

\n
\n
Parameters
\n
    \n
  • instance (Optional[DagsterInstance]) \u2013 The dagster instance configured to run the schedule.

  • \n
  • scheduled_execution_time (datetime) \u2013 The time in which the execution was scheduled to\nhappen. May differ slightly from both the actual execution time and the time at which\nthe run config is computed.

  • \n
\n
\n
\n

Examples

\n
context = build_schedule_context(instance)\ndaily_schedule.evaluate_tick(context)\n
\n
\n
\n\n
\n
\ndagster.core.scheduler.DagsterDaemonScheduler Scheduler[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
max_catchup_runs (dagster.IntSource, optional)
\n

For partitioned schedules, controls the maximum number of past\npartitions for each schedule that will be considered when looking for missing\nruns. Generally this parameter will only come into play if the scheduler\nfalls behind or launches after experiencing downtime. This parameter will not be checked for\nschedules without partition sets (for example, schedules created using the @schedule\ndecorator) - only the most recent execution time will be considered for those schedules.

\n

Note that no matter what this value is, the scheduler will never launch a run from a time\nbefore the schedule was turned on (even if the start_date on the schedule is earlier) - if\nyou want to launch runs for earlier partitions, launch a backfill.

\n

Default Value: 5

\n
\n
max_tick_retries (dagster.IntSource, optional)
\n

For each schedule tick that raises an error, how many times to retry that tick

\n

Default Value: 0

\n
\n
\n

Default scheduler implementation that submits runs from the dagster-daemon\nlong-lived process. Periodically checks each running schedule for execution times that don\u2019t\nhave runs yet and launches them.

\n
\n\n
\n
\n

Partitioned Schedules\u00b6

\n
\n
\ndagster.build_schedule_from_partitioned_job(job, description=None, name=None, minute_of_hour=None, hour_of_day=None, day_of_week=None, day_of_month=None, default_status=<DefaultScheduleStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Creates a schedule from a time window-partitioned job.

\n

The schedule executes at the cadence specified by the partitioning of the given job.

\n
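
A hedged sketch of building such a schedule (the op name, config shape, and start date are assumptions for illustration):

\n
from dagster import build_schedule_from_partitioned_job, daily_partitioned_config, job, op\n\n\n@op(config_schema={"date": str})\ndef process_date(context):\n    context.log.info(context.op_config["date"])\n\n\n@daily_partitioned_config(start_date="2022-03-12")\ndef my_partitioned_config(start, _end):\n    return {"ops": {"process_date": {"config": {"date": start.strftime("%Y-%m-%d")}}}}\n\n\n@job(config=my_partitioned_config)\ndef process_date_job():\n    process_date()\n\n\n# The schedule ticks daily, matching the job's daily partitioning.\nprocess_date_schedule = build_schedule_from_partitioned_job(process_date_job)\n
\n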
\n\n
\n
\nclass dagster.PartitionScheduleDefinition(name, cron_schedule, pipeline_name, tags_fn, solid_selection, mode, should_execute, environment_vars, partition_set, run_config_fn=None, execution_timezone=None, execution_fn=None, description=None, decorated_fn=None, job=None, default_status=<DefaultScheduleStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n
\n\n
\n
\n@dagster.hourly_partitioned_config(start_date, minute_offset=0, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]
\n

Defines run config over a set of hourly partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the date\npartition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at the start_date at midnight. The last partition in\nthe set will end before the current time, unless the end_offset argument is set to a positive\nnumber. If minute_offset is provided, the start and end times of each partition will be\nminute_offset past the hour.

\n
\n
Parameters
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions. Can\nprovide in either a datetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
\n
\n
\n
@hourly_partitioned_config(start_date=datetime(2022, 3, 12))\n# creates partitions (2022-03-12-00:00, 2022-03-12-01:00), (2022-03-12-01:00, 2022-03-12-02:00), ...\n\n@hourly_partitioned_config(start_date=datetime(2022, 3, 12), minute_offset=15)\n# creates partitions (2022-03-12-00:15, 2022-03-12-01:15), (2022-03-12-01:15, 2022-03-12-02:15), ...\n
\n
\n
\n\n
\n
\n@dagster.daily_partitioned_config(start_date, minute_offset=0, hour_offset=0, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]
\n

Defines run config over a set of daily partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the bounds\nof the date partition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at the start_date at midnight. The last partition in\nthe set will end before the current time, unless the end_offset argument is set to a positive\nnumber. If minute_offset and/or hour_offset are used, the start and end times of each partition\nwill be hour_offset:minute_offset of each day.

\n
\n
Parameters
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions. Can\nprovide in either a datetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
\n
\n
\n
@daily_partitioned_config(start_date="2022-03-12")\n# creates partitions (2022-03-12-00:00, 2022-03-13-00:00), (2022-03-13-00:00, 2022-03-14-00:00), ...\n\n@daily_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=16)\n# creates partitions (2022-03-12-16:15, 2022-03-13-16:15), (2022-03-13-16:15, 2022-03-14-16:15), ...\n
\n
\n
\n\n
\n
\n@dagster.weekly_partitioned_config(start_date, minute_offset=0, hour_offset=0, day_offset=0, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]
\n

Defines run config over a set of weekly partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the date\npartition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at the start_date. The last partition in the set will\nend before the current time, unless the end_offset argument is set to a positive number. If\nday_offset is provided, the start and end date of each partition will be day of the week\ncorresponding to day_offset (0 indexed with Sunday as the start of the week). If\nminute_offset and/or hour_offset are used, the start and end times of each partition will be\nhour_offset:minute_offset of each day.

\n
\n
Parameters
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions will be the\nSunday at midnight following start_date. Can provide in either a datetime or string\nformat.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • day_offset (int) \u2013 Day of the week to \u201csplit\u201d the partition. Defaults to 0 (Sunday).

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
\n
\n
\n
@weekly_partitioned_config(start_date="2022-03-12")\n# creates partitions (2022-03-13-00:00, 2022-03-20-00:00), (2022-03-20-00:00, 2022-03-27-00:00), ...\n\n@weekly_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=6)\n# creates partitions (2022-03-12-03:15, 2022-03-19-03:15), (2022-03-19-03:15, 2022-03-26-03:15), ...\n
\n
\n
\n\n
\n
\n@dagster.monthly_partitioned_config(start_date, minute_offset=0, hour_offset=0, day_offset=1, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]
\n

Defines run config over a set of monthly partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the date\npartition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at midnight on the soonest first of the month after\nstart_date. The last partition in the set will end before the current time, unless the\nend_offset argument is set to a positive number. If day_offset is provided, the start and end\ndate of each partition will fall on the day of the month given by day_offset. If minute_offset and/or hour_offset are used, the\nstart and end times of each partition will be hour_offset:minute_offset of each day.

\n
\n
Parameters
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions will be\nmidnight on the soonest first of the month following start_date. Can provide in either a\ndatetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • day_offset (int) \u2013 Day of the month to \u201csplit\u201d the partition. Defaults to 1.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
\n
\n
\n
@monthly_partitioned_config(start_date="2022-03-12")\n# creates partitions (2022-04-01-00:00, 2022-05-01-00:00), (2022-05-01-00:00, 2022-06-01-00:00), ...\n\n@monthly_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=5)\n# creates partitions (2022-04-05-03:15, 2022-05-05-03:15), (2022-05-05-03:15, 2022-06-05-03:15), ...\n
\n
\n
\n\n
\n
\n

Sensors\u00b6

\n
\n
\n@dagster.sensor(pipeline_name=None, name=None, solid_selection=None, mode=None, minimum_interval_seconds=None, description=None, job=None, jobs=None, default_status=<DefaultSensorStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Creates a sensor where the decorated function is used as the sensor\u2019s evaluation function. The\ndecorated function may:

\n
    \n
  1. Return a RunRequest object.

  2. Yield multiple RunRequest objects.

  3. Return or yield a SkipReason object, providing a descriptive message of why no runs were\nrequested.

  4. Return or yield nothing (skipping without providing a reason).
\n

Takes a SensorEvaluationContext.

\n
\n
Parameters
\n
    \n
  • pipeline_name (Optional[str]) \u2013 (legacy) Name of the target pipeline. Cannot be used in\nconjunction with job or jobs parameters.

  • \n
  • name (Optional[str]) \u2013 The name of the sensor. Defaults to the name of the decorated\nfunction.

  • \n
  • solid_selection (Optional[List[str]]) \u2013 (legacy) A list of solid subselection (including single\nsolid names) to execute for runs for this sensor e.g.\n['*some_solid+', 'other_solid'].\nCannot be used in conjunction with job or jobs parameters.

  • \n
  • mode (Optional[str]) \u2013 (legacy) The mode to apply when executing runs for this sensor. Cannot be used\nin conjunction with job or jobs parameters.\n(default: \u2018default\u2019)

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • job (Optional[Union[GraphDefinition, JobDefinition]]) \u2013 The job to be executed when the sensor fires.

  • \n
  • jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition]]]) \u2013 (experimental) A list of jobs to be executed when the sensor fires.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
\n
\n
\n
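
A minimal sketch of the decorator in use (the op, job, and file-listing placeholder are assumptions for illustration):

\n
from dagster import RunRequest, SkipReason, job, op, sensor\n\n\n@op(config_schema={"filename": str})\ndef process_file(context):\n    context.log.info(context.op_config["filename"])\n\n\n@job\ndef log_file_job():\n    process_file()\n\n\n@sensor(job=log_file_job, minimum_interval_seconds=30)\ndef my_directory_sensor(context):\n    # A real sensor would inspect external state here (e.g. list a directory).\n    new_files = []  # placeholder standing in for that external check\n    if not new_files:\n        yield SkipReason("No new files found")\n        return\n    for filename in new_files:\n        yield RunRequest(\n            run_key=filename,\n            run_config={"ops": {"process_file": {"config": {"filename": filename}}}},\n        )\n
\n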
\n\n
\n
\nclass dagster.SensorDefinition(name=None, evaluation_fn=None, pipeline_name=None, solid_selection=None, mode=None, minimum_interval_seconds=None, description=None, job=None, jobs=None, default_status=<DefaultSensorStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Define a sensor that initiates a set of runs based on some external state.

\n
\n
Parameters
\n
    \n
  • evaluation_fn (Callable[[SensorEvaluationContext]]) \u2013

    The core evaluation function for the\nsensor, which is run at an interval to determine whether a run should be launched or\nnot. Takes a SensorEvaluationContext.

    \n

    This function must return a generator, which must yield either a single SkipReason\nor one or more RunRequest objects.

    \n

  • \n
  • name (Optional[str]) \u2013 The name of the sensor to create. Defaults to the name of evaluation_fn.

  • \n
  • pipeline_name (Optional[str]) \u2013 (legacy) The name of the pipeline to execute when the sensor\nfires. Cannot be used in conjunction with job or jobs parameters.

  • \n
  • solid_selection (Optional[List[str]]) \u2013 (legacy) A list of solid subselection (including single\nsolid names) to execute when the sensor runs. e.g. ['*some_solid+', 'other_solid'].\nCannot be used in conjunction with job or jobs parameters.

  • \n
  • mode (Optional[str]) \u2013 (legacy) The mode to apply when executing runs triggered by this\nsensor. Cannot be used in conjunction with job or jobs parameters. (default:\n\u2018default\u2019)

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • job (Optional[GraphDefinition, JobDefinition]) \u2013 The job to execute when this sensor fires.

  • \n
  • jobs (Optional[Sequence[GraphDefinition, JobDefinition]]) \u2013 (experimental) A list of jobs to execute when this sensor fires.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.SensorEvaluationContext(instance_ref, last_completion_time, last_run_key, cursor, repository_name, instance=None)[source]\u00b6
\n

Sensor execution context.

\n

An instance of this class is made available as the first argument to the evaluation function\non SensorDefinition.

\n
\n
\ninstance_ref\u00b6
\n

The serialized instance configured to run the schedule

\n
\n
Type
\n

Optional[InstanceRef]

\n
\n
\n
\n\n
\n
\ncursor\u00b6
\n

The cursor, passed back from the last sensor evaluation via\nthe cursor attribute of SkipReason and RunRequest

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nlast_completion_time\u00b6
\n

DEPRECATED The last time that the sensor was evaluated (UTC).

\n
\n
Type
\n

float

\n
\n
\n
\n\n
\n
\nlast_run_key\u00b6
\n

DEPRECATED The run key of the RunRequest most recently created by this\nsensor. Use the preferred cursor attribute instead.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nrepository_name\u00b6
\n

The name of the repository that the sensor belongs to.

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\ninstance\u00b6
\n

The deserialized instance can also be passed in\ndirectly (primarily useful in testing contexts).

\n
\n
Type
\n

Optional[DagsterInstance]

\n
\n
\n
\n\n
\n\n
\n
\ndagster.build_sensor_context(instance=None, cursor=None, repository_name=None)[source]\u00b6
\n

Builds sensor execution context using the provided parameters.

\n

This function can be used to provide a context to the invocation of a sensor definition. If\nprovided, the dagster instance must be persistent; DagsterInstance.ephemeral() will result in an\nerror.

\n
\n
Parameters
\n
    \n
  • instance (Optional[DagsterInstance]) \u2013 The dagster instance configured to run the sensor.

  • \n
  • cursor (Optional[str]) \u2013 A cursor value to provide to the evaluation of the sensor.

  • \n
  • repository_name (Optional[str]) \u2013 The name of the repository that the sensor belongs to.

  • \n
\n
\n
\n

Examples

\n
context = build_sensor_context()\nmy_sensor(context)\n
\n
\n
\n\n
\n
\nclass dagster.AssetSensorDefinition(name, asset_key, pipeline_name, asset_materialization_fn, solid_selection=None, mode=None, minimum_interval_seconds=None, description=None, job=None, jobs=None, default_status=<DefaultSensorStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Define an asset sensor that initiates a set of runs based on the materialization of a given\nasset.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of the sensor to create.

  • \n
  • asset_key (AssetKey) \u2013 The asset_key this sensor monitors.

  • \n
  • pipeline_name (Optional[str]) \u2013 (legacy) The name of the pipeline to execute when the sensor\nfires. Cannot be used in conjunction with job or jobs parameters.

  • \n
  • asset_materialization_fn (Callable[[SensorEvaluationContext, EventLogEntry], Union[Generator[Union[RunRequest, SkipReason], None, None], RunRequest, SkipReason]]) \u2013

    The core\nevaluation function for the sensor, which is run at an interval to determine whether a\nrun should be launched or not. Takes a SensorEvaluationContext and\nan EventLogEntry corresponding to an AssetMaterialization event.

    \n

    This function must return a generator, which must yield either a single SkipReason\nor one or more RunRequest objects.

    \n

  • \n
  • solid_selection (Optional[List[str]]) \u2013 (legacy) A list of solid subselection (including single\nsolid names) to execute when the sensor runs. e.g. ['*some_solid+', 'other_solid'].\nCannot be used in conjunction with job or jobs parameters.

  • \n
  • mode (Optional[str]) \u2013 (legacy) The mode to apply when executing runs triggered by this sensor.\n(default: \u2018default\u2019).\nCannot be used in conjunction with job or jobs parameters.

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • job (Optional[Union[GraphDefinition, JobDefinition]]) \u2013 The job object to target with this sensor.

  • \n
  • jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition]]]) \u2013 (experimental) A list of jobs to be executed when the sensor fires.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
\n
\n
\n
\n\n
\n
\n@dagster.asset_sensor(asset_key, pipeline_name=None, name=None, solid_selection=None, mode=None, minimum_interval_seconds=None, description=None, job=None, jobs=None, default_status=<DefaultSensorStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Creates an asset sensor where the decorated function is used as the asset sensor\u2019s evaluation\nfunction. The decorated function may:

\n
    \n
  1. Return a RunRequest object.

  2. Yield multiple RunRequest objects.

  3. Return or yield a SkipReason object, providing a descriptive message of why no runs were\nrequested.

  4. Return or yield nothing (skipping without providing a reason).
\n

Takes a SensorEvaluationContext and an EventLogEntry corresponding to an\nAssetMaterialization event.

\n
\n
Parameters
\n
    \n
  • asset_key (AssetKey) \u2013 The asset_key this sensor monitors.

  • \n
  • pipeline_name (Optional[str]) \u2013 (legacy) Name of the target pipeline. Cannot be used in conjunction with job or jobs parameters.

  • \n
  • name (Optional[str]) \u2013 The name of the sensor. Defaults to the name of the decorated\nfunction.

  • \n
  • solid_selection (Optional[List[str]]) \u2013 (legacy) A list of solid subselection (including single\nsolid names) to execute for runs for this sensor e.g.\n['*some_solid+', 'other_solid']. Cannot be used in conjunction with job or jobs\nparameters.

  • \n
  • mode (Optional[str]) \u2013 (legacy) The mode to apply when executing runs for this sensor. Cannot be used\nin conjunction with job or jobs parameters.\n(default: \u2018default\u2019)

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • job (Optional[Union[GraphDefinition, JobDefinition]]) \u2013 The job to be executed when the sensor fires.

  • \n
  • jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition]]]) \u2013 (experimental) A list of jobs to be executed when the sensor fires.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
\n
\n
\n
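
A minimal sketch, assuming an asset with key "my_table" and a downstream job to launch (the op and job names are illustrative):

\n
from dagster import AssetKey, RunRequest, asset_sensor, job, op\n\n\n@op\ndef rebuild_report(context):\n    context.log.info("rebuilding downstream report")\n\n\n@job\ndef report_job():\n    rebuild_report()\n\n\n@asset_sensor(asset_key=AssetKey("my_table"), job=report_job)\ndef my_table_sensor(context, asset_event):\n    # Fires a run each time a new materialization of "my_table" is observed.\n    yield RunRequest(run_key=context.cursor, run_config={})\n
\n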
\n\n
\n
\nclass dagster.RunStatusSensorDefinition(name, pipeline_run_status, run_status_sensor_fn, pipeline_selection=None, minimum_interval_seconds=None, description=None, job_selection=None, default_status=<DefaultSensorStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Define a sensor that reacts to a given status of pipeline execution, where the decorated\nfunction will be evaluated when a run is at the given status.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of the sensor. Defaults to the name of the decorated function.

  • \n
  • pipeline_run_status (PipelineRunStatus) \u2013 The status of a run which will be\nmonitored by the sensor.

  • \n
  • run_status_sensor_fn (Callable[[RunStatusSensorContext], Union[SkipReason, PipelineRunReaction]]) \u2013 The core\nevaluation function for the sensor. Takes a RunStatusSensorContext.

  • \n
  • pipeline_selection (Optional[List[str]]) \u2013 (legacy) Names of the pipelines that will be monitored by\nthis sensor. Defaults to None, which means the alert will be sent when any pipeline in\nthe repository fails.

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • job_selection (Optional[List[Union[JobDefinition, GraphDefinition]]]) \u2013 The jobs that\nwill be monitored by this sensor. Defaults to None, which means the alert will be sent\nwhen any job in the repository fails.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.RunStatusSensorContext(sensor_name, dagster_run, dagster_event, instance)[source]\u00b6
\n

The context object available to a decorated function of run_status_sensor.

\n
\n
\nsensor_name\u00b6
\n

the name of the sensor.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\ndagster_run\u00b6
\n

the run of the job or pipeline.

\n
\n
Type
\n

DagsterRun

\n
\n
\n
\n\n
\n
\ndagster_event\u00b6
\n

the event associated with the job or pipeline run status.

\n
\n
Type
\n

DagsterEvent

\n
\n
\n
\n\n
\n
\ninstance\u00b6
\n

the current instance.

\n
\n
Type
\n

DagsterInstance

\n
\n
\n
\n\n
\n
\nfor_run_failure()[source]\u00b6
\n

Converts RunStatusSensorContext to RunFailureSensorContext.

\n
\n\n
\n\n
\n
\nclass dagster.RunFailureSensorContext(sensor_name, dagster_run, dagster_event, instance)[source]\u00b6
\n

The context object available to a decorated function of run_failure_sensor.

\n
\n
\nsensor_name\u00b6
\n

the name of the sensor.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\npipeline_run\u00b6
\n

the failed pipeline run.

\n
\n
Type
\n

PipelineRun

\n
\n
\n
\n\n
\n
\nfailure_event\u00b6
\n

the pipeline failure event.

\n
\n
Type
\n

DagsterEvent

\n
\n
\n
\n\n
\n\n
\n
\ndagster.build_run_status_sensor_context(sensor_name, dagster_event, dagster_instance, dagster_run)[source]\u00b6
\n

Builds run status sensor context from provided parameters.

\n

This function can be used to provide the context argument when directly invoking a function\ndecorated with @run_status_sensor or @run_failure_sensor, such as when writing unit tests.

\n
\n
Parameters
\n
    \n
  • sensor_name (str) \u2013 The name of the sensor the context is being constructed for.

  • \n
  • dagster_event (DagsterEvent) \u2013 A DagsterEvent with the same event type as the one that\ntriggers the run_status_sensor

  • \n
  • dagster_instance (DagsterInstance) \u2013 The dagster instance configured for the context.

  • \n
  • dagster_run (DagsterRun) \u2013 DagsterRun object from running a job

  • \n
\n
\n
\n

Examples

\n
instance = DagsterInstance.ephemeral()\nresult = my_job.execute_in_process(instance=instance)\n\ndagster_run = result.dagster_run\ndagster_event = result.get_job_success_event() # or get_job_failure_event()\n\ncontext = build_run_status_sensor_context(\n    sensor_name="run_status_sensor_to_invoke",\n    dagster_instance=instance,\n    dagster_run=dagster_run,\n    dagster_event=dagster_event,\n)\nrun_status_sensor_to_invoke(context)\n
\n
\n
\n\n
\n
\n@dagster.run_status_sensor(pipeline_run_status, pipeline_selection=None, name=None, minimum_interval_seconds=None, description=None, job_selection=None, default_status=<DefaultSensorStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Creates a sensor that reacts to a given status of pipeline execution, where the decorated\nfunction will be run when a pipeline is at the given status.

\n

Takes a RunStatusSensorContext.

\n
\n
Parameters
\n
    \n
  • pipeline_run_status (PipelineRunStatus) \u2013 The status of pipeline execution which will be\nmonitored by the sensor.

  • \n
  • pipeline_selection (Optional[List[str]]) \u2013 Names of the pipelines that will be monitored by\nthis sensor. Defaults to None, which means the alert will be sent when any pipeline in\nthe repository fails.

  • \n
  • name (Optional[str]) \u2013 The name of the sensor. Defaults to the name of the decorated function.

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • job_selection (Optional[List[Union[PipelineDefinition, GraphDefinition]]]) \u2013 Jobs that will\nbe monitored by this sensor. Defaults to None, which means the alert will be sent when\nany job in the repository fails.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
\n
\n
\n
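
A minimal sketch, assuming PipelineRunStatus is importable from the top-level dagster package in this version; the print call stands in for real alerting logic:

\n
from dagster import PipelineRunStatus, run_status_sensor\n\n\n@run_status_sensor(pipeline_run_status=PipelineRunStatus.SUCCESS)\ndef announce_successful_runs(context):\n    # context is a RunStatusSensorContext; dagster_run is documented above.\n    print(f"Run {context.dagster_run.run_id} succeeded")\n
\n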
\n\n
\n
\n@dagster.run_failure_sensor(name=None, minimum_interval_seconds=None, description=None, job_selection=None, pipeline_selection=None, default_status=<DefaultSensorStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Creates a sensor that reacts to job failure events, where the decorated function will be\nrun when a run fails.

\n

Takes a RunFailureSensorContext.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name of the job failure sensor. Defaults to the name of the\ndecorated function.

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • job_selection (Optional[List[Union[JobDefinition, GraphDefinition]]]) \u2013 The jobs that\nwill be monitored by this failure sensor. Defaults to None, which means the alert will\nbe sent when any job in the repository fails.

  • \n
  • pipeline_selection (Optional[List[str]]) \u2013 (legacy) Names of the pipelines that will be monitored by\nthis sensor. Defaults to None, which means the alert will be sent when any pipeline in\nthe repository fails.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
\n
\n
\n
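
A minimal sketch; the print call stands in for real alerting logic (Slack, email, etc.):

\n
from dagster import RunFailureSensorContext, run_failure_sensor\n\n\n@run_failure_sensor()\ndef log_run_failures(context: RunFailureSensorContext):\n    # pipeline_run and failure_event are documented attributes of the context.\n    print(f"Run {context.pipeline_run.run_id} failed: {context.failure_event.message}")\n
\n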
\n\n
\n

Legacy APIs\u00b6

\n
\n
\n@dagster.pipeline_failure_sensor(name=None, minimum_interval_seconds=None, description=None, pipeline_selection=None, default_status=<DefaultSensorStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Creates a sensor that reacts to pipeline failure events, where the decorated function will be\nrun when a pipeline run fails.

\n

Takes a PipelineFailureSensorContext.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name of the pipeline failure sensor. Defaults to the name of the\ndecorated function.

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • pipeline_selection (Optional[List[str]]) \u2013 Names of the pipelines that will be monitored by\nthis failure sensor. Defaults to None, which means the alert will be sent when any\npipeline in the repository fails.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.PipelineFailureSensorContext(sensor_name, dagster_run, dagster_event, instance)[source]\u00b6
\n

The context object available to a decorated function of pipeline_failure_sensor.

\n
\n
\nsensor_name\u00b6
\n

the name of the sensor.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\npipeline_run\u00b6
\n

the failed pipeline run.

\n
\n
Type
\n

PipelineRun

\n
\n
\n
\n\n
\n
\nfailure_event\u00b6
\n

the pipeline failure event.

\n
\n
Type
\n

DagsterEvent

\n
\n
\n
\n\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/schedules-sensors", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../solids/", "title": "[Legacy] Solids"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../resources/", "title": "Resources"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/solids", "[Legacy] Solids", "N", "next"], ["sections/api/apidocs/resources", "Resources", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/schedules-sensors.rst.txt", "title": "Run Requests", "toc": "\n"}, "solids": {"alabaster_version": "0.7.12", "body": "
\n

[Legacy] Solids\u00b6

\n

As of Dagster 0.13.0, we recommend Ops as an alternative to Solids. They can generally be used\ninterchangeably.

\n
\n
\n

Defining solids\u00b6

\n
\n
\n@dagster.solid(name=None, description=None, input_defs=None, output_defs=None, config_schema=None, required_resource_keys=None, tags=None, version=None, retry_policy=None)[source]\u00b6
\n

Create a solid with the specified parameters from the decorated function.

\n

This shortcut simplifies the core SolidDefinition API by exploding arguments into\nkwargs of the decorated compute function and omitting additional parameters when they are not\nneeded.

\n

Input and output definitions will be inferred from the type signature of the decorated function\nif not explicitly provided.

\n

The decorated function will be used as the solid\u2019s compute function. The signature of the\ndecorated function is more flexible than that of the compute_fn in the core API; it may:

\n
    \n
  1. Return a value. This value will be wrapped in an Output and yielded by the compute function.

  2. Return an Output. This output will be yielded by the compute function.

  3. Yield Output or other event objects. Same as default compute behavior.
\n

Note that options 1) and 2) are incompatible with yielding other events \u2013 if you would like\nto decorate a function that yields events, it must also wrap its eventual output in an\nOutput and yield it.

\n

@solid supports async def functions as well, including async generators when yielding multiple\nevents or outputs. Note that async solids will generally be run on their own unless using a custom\nExecutor implementation that supports running them together.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 Name of solid. Must be unique within any PipelineDefinition\nusing the solid.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of this solid. If not provided, and\nthe decorated function has a docstring, that docstring will be used as the description.

  • \n
  • input_defs (Optional[List[InputDefinition]]) \u2013 Information about the inputs to the solid. Information provided here will be combined\nwith what can be inferred from the function signature, with these explicit InputDefinitions\ntaking precedence.

  • \n
  • output_defs (Optional[List[OutputDefinition]]) \u2013 Information about the solids outputs. Information provided here will be combined with\nwhat can be inferred from the return type signature if there is only one OutputDefinition\nand the function does not use yield.

  • \n
  • config_schema (Optional[ConfigSchema) \u2013 The schema for the config. If set, Dagster will check\nthat config provided for the solid matches this schema and fail if it does not. If not\nset, Dagster will accept any config provided for the solid.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by this solid.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for the solid. Frameworks may\nexpect and require certain metadata to be attached to a solid. Values that are not strings\nwill be json encoded and must meet the criteria that json.loads(json.dumps(value)) == value.

  • \n
  • version (Optional[str]) \u2013 (Experimental) The version of the solid\u2019s compute_fn. Two solids should have\nthe same version if and only if they deterministically produce the same outputs when\nprovided the same inputs.

  • \n
  • retry_policy (Optional[RetryPolicy]) \u2013 The retry policy for this solid.

  • \n
\n
\n
\n

Examples

\n
@solid\ndef hello_world():\n    print('hello')\n\n@solid\ndef hello_world():\n    return {'foo': 'bar'}\n\n@solid\ndef hello_world():\n    return Output(value={'foo': 'bar'})\n\n@solid\ndef hello_world():\n    yield Output(value={'foo': 'bar'})\n\n@solid\ndef hello_world(foo):\n    return foo\n\n@solid(\n    input_defs=[InputDefinition("foo", str)],\n    output_defs=[OutputDefinition(str)]\n)\ndef hello_world(foo):\n    # explicitly type and name inputs and outputs\n    return foo\n\n@solid\ndef hello_world(foo: str) -> str:\n    # same as above inferred from signature\n    return foo\n\n@solid\ndef hello_world(context, foo):\n    context.log.info('log something')\n    return foo\n\n@solid(\n    config_schema={'str_value': Field(str)}\n)\ndef hello_world(context, foo):\n    # context.solid_config is a dictionary with 'str_value' key\n    return foo + context.solid_config['str_value']\n
\n
\n
\n\n
\n
\nclass dagster.SolidDefinition(name, input_defs, compute_fn, output_defs, config_schema=None, description=None, tags=None, required_resource_keys=None, version=None, retry_policy=None)[source]\u00b6
\n

The definition of a Solid that performs a user-defined computation.

\n

For more details on what a solid is, refer to the\nSolid Overview .

\n

End users should prefer the @solid and @lambda_solid\ndecorators. SolidDefinition is generally intended to be used by framework authors.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name of the solid. Must be unique within any PipelineDefinition\nusing the solid.

  • \n
  • input_defs (List[InputDefinition]) \u2013 Inputs of the solid.

  • \n
  • compute_fn (Callable) \u2013

    The core of the solid, the function that does the actual\ncomputation. The signature of this function is determined by input_defs, and\noptionally, an injected first argument, context, a collection of information provided\nby the system.

    \n

    This function will be coerced into a generator or an async generator, which must yield\none Output for each of the solid\u2019s output_defs, and additionally may\nyield other types of Dagster events, including Materialization and\nExpectationResult.

    \n

  • \n
  • output_defs (List[OutputDefinition]) \u2013 Outputs of the solid.

  • \n
  • config_schema (Optional[ConfigSchema) \u2013 The schema for the config. If set, Dagster will check\nthat config provided for the solid matches this schema and fail if it does not. If not\nset, Dagster will accept any config provided for the solid.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of the solid.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for the solid. Frameworks may\nexpect and require certain metadata to be attached to a solid. Users should generally\nnot set metadata directly. Values that are not strings will be json encoded and must meet\nthe criteria that json.loads(json.dumps(value)) == value.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by this\nsolid.

  • \n
  • version (Optional[str]) \u2013 (Experimental) The version of the solid\u2019s compute_fn. Two solids should have\nthe same version if and only if they deterministically produce the same outputs when\nprovided the same inputs.

  • \n
  • retry_policy (Optional[RetryPolicy]) \u2013 The retry policy for this solid.

  • \n
\n
\n
\n

Examples

\n
def _add_one(_context, inputs):\n    yield Output(inputs["num"] + 1)\n\nSolidDefinition(\n    name="add_one",\n    input_defs=[InputDefinition("num", Int)],\n    output_defs=[OutputDefinition(Int)], # default name ("result")\n    compute_fn=_add_one,\n)\n
\n
\n
\n
\nconfigured(config_or_config_fn, name, config_schema=None, description=None)\u00b6
\n

Wraps this object in an object of the same type that provides configuration to the inner\nobject.

\n
\n
Parameters
\n
    \n
  • config_or_config_fn (Union[Any, Callable[[Any], Any]]) \u2013 Either (1) Run configuration\nthat fully satisfies this object\u2019s config schema or (2) A function that accepts run\nconfiguration and returns run configuration that fully satisfies this object\u2019s\nconfig schema. In the latter case, config_schema must be specified. When\npassing a function, it\u2019s easiest to use configured().

  • \n
  • name (str) \u2013 Name of the new definition. This is a required argument, as this definition\ntype has a name uniqueness constraint.

  • \n
  • config_schema (ConfigSchema) \u2013 If config_or_config_fn is a function, the config schema\nthat its input must satisfy.

  • \n
  • description (Optional[str]) \u2013 Description of the new definition. If not specified,\ninherits the description of the definition being configured.

  • \n
\n
\n
\n

Returns (ConfigurableDefinition): A configured version of this object.

\n
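
A hedged sketch of configuring a solid (the solid and config key are assumptions for illustration):

\n
from dagster import Field, solid\n\n\n@solid(config_schema={"iterations": Field(int)})\ndef do_stuff(context):\n    for _ in range(context.solid_config["iterations"]):\n        context.log.info("doing stuff")\n\n\n# A name is required because definition names must be unique.\ndo_stuff_ten_times = do_stuff.configured({"iterations": 10}, name="do_stuff_ten_times")\n
\n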
\n\n
\n\n
\n
\n
\n

Inputs & outputs\u00b6

\n
\n
\nclass dagster.InputDefinition(name=None, dagster_type=None, description=None, default_value=<class 'dagster.core.definitions.utils.NoValueSentinel'>, root_manager_key=None, metadata=None, asset_key=None, asset_partitions=None)[source]\u00b6
\n

Defines an argument to a solid\u2019s compute function.

\n

Inputs may flow from previous solids\u2019 outputs, or be stubbed using config. They may optionally\nbe typed using the Dagster type system.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name of the input.

  • \n
  • dagster_type (Optional[Union[Type, DagsterType]]]) \u2013 The type of this input.\nUsers should provide the Python type of the objects that they expect to be passed for\nthis input, or a DagsterType that defines a runtime check that they want\nto be run on this input. Defaults to Any.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of the input.

  • \n
  • default_value (Optional[Any]) \u2013 The default value to use if no input is provided.

  • \n
  • root_manager_key (Optional[str]) \u2013 (Experimental) The resource key for the\nRootInputManager used for loading this input when it is not connected to an\nupstream output.

  • \n
  • metadata (Optional[Dict[str, Any]]) \u2013 A dict of metadata for the input.

  • \n
  • asset_key (Optional[Union[AssetKey, InputContext -> AssetKey]]) \u2013 (Experimental) An AssetKey\n(or function that produces an AssetKey from the InputContext) which should be associated\nwith this InputDefinition. Used for tracking lineage information through Dagster.

  • \n
  • asset_partitions (Optional[Union[Set[str], InputContext -> Set[str]]]) \u2013 (Experimental) A\nset of partitions of the given asset_key (or a function that produces this list of\npartitions from the InputContext) which should be associated with this InputDefinition.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.OutputDefinition(dagster_type=None, name=None, description=None, is_required=True, io_manager_key=None, metadata=None, asset_key=None, asset_partitions=None, asset_partitions_def=None)[source]\u00b6
\n

Defines an output from a solid\u2019s compute function.

\n

Solids can have multiple outputs, in which case outputs cannot be anonymous.

\n

Many solids have only one output, in which case the user can provide a single output definition\nthat will be given the default name, \u201cresult\u201d.

\n

Output definitions may be typed using the Dagster type system.

\n
\n
Parameters
\n
    \n
  • dagster_type (Optional[Union[Type, DagsterType]]]) \u2013 The type of this output.\nUsers should provide the Python type of the objects that they expect the solid to yield\nfor this output, or a DagsterType that defines a runtime check that they\nwant to be run on this output. Defaults to Any.

  • \n
  • name (Optional[str]) \u2013 Name of the output. (default: \u201cresult\u201d)

  • \n
  • description (Optional[str]) \u2013 Human-readable description of the output.

  • \n
  • is_required (Optional[bool]) \u2013 Whether the presence of this field is required. (default: True)

  • \n
  • io_manager_key (Optional[str]) \u2013 The resource key of the IOManager used for storing this\noutput and loading it in downstream steps (default: \u201cio_manager\u201d).

  • \n
  • metadata (Optional[Dict[str, Any]]) \u2013 A dict of the metadata for the output.\nFor example, users can provide a file path if the data object will be stored in a\nfilesystem, or provide information of a database table when it is going to load the data\ninto the table.

  • \n
  • asset_key (Optional[AssetKey]]) \u2013 (Experimental) An AssetKey which should be associated\nwith this OutputDefinition. Used for tracking lineage information through Dagster.

  • \n
  • asset_partitions (Optional[Union[Set[str], OutputContext -> Set[str]]]) \u2013 (Experimental) A\nset of partitions of the given asset_key (or a function that produces this list of\npartitions from the OutputContext) which should be associated with this OutputDefinition.

  • \n
\n
\n
\n
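
A minimal sketch combining explicit input and output definitions on a solid (the solid name and description text are illustrative):

\n
from dagster import InputDefinition, Int, OutputDefinition, solid\n\n\n@solid(\n    input_defs=[InputDefinition("num", Int, description="The number to increment")],\n    output_defs=[OutputDefinition(Int, name="result")],\n)\ndef add_one(_, num):\n    return num + 1\n
\n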
\n\n
\n
\n
\n

Retries\u00b6

\n
\n
\nclass dagster.RetryPolicy(max_retries=1, delay=None, backoff=None, jitter=None)[source]
\n

A declarative policy for when to request retries when an exception occurs during op execution.

\n
\n
Parameters
\n
    \n
  • max_retries (int) \u2013 The maximum number of retries to attempt. Defaults to 1.

  • \n
  • delay (Optional[Union[int,float]]) \u2013 The time in seconds to wait between the retry being requested and the next attempt\nbeing started. This unit of time can be modulated as a function of attempt number\nwith backoff and randomly with jitter.

  • \n
  • backoff (Optional[Backoff]) \u2013 A modifier for delay as a function of retry attempt number.

  • \n
  • jitter (Optional[Jitter]) \u2013 A randomizing modifier for delay, applied after backoff calculation.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.Backoff(value)[source]
\n

A modifier for delay as a function of attempt number.

\n

LINEAR: attempt_num * delay\nEXPONENTIAL: ((2 ^ attempt_num) - 1) * delay

\n
\n\n
\n
\nclass dagster.Jitter(value)[source]
\n

A randomizing modifier for delay, applied after backoff calculation.

\n

FULL: between 0 and the calculated delay based on backoff: random() * backoff_delay\nPLUS_MINUS: +/- the delay: backoff_delay + ((2 * (random() * delay)) - delay)

\n
\n\n
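
A minimal sketch attaching a retry policy to a solid (the solid body is an assumption for illustration):

\n
from dagster import Backoff, Jitter, RetryPolicy, solid\n\n\n@solid(\n    retry_policy=RetryPolicy(\n        max_retries=3,\n        delay=1,  # seconds between attempts, before backoff and jitter are applied\n        backoff=Backoff.EXPONENTIAL,\n        jitter=Jitter.FULL,\n    )\n)\ndef flaky_solid(context):\n    raise Exception("transient failure")  # will be retried up to 3 times\n
\n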
\n
\n

Execution\u00b6

\n
\n
\ndagster.execute_solid(solid_def, mode_def=None, input_values=None, tags=None, run_config=None, raise_on_error=True)[source]\u00b6
\n

Execute a single solid in an ephemeral pipeline.

\n

Intended to support unit tests. Input values may be passed directly, and no pipeline need be\nspecified \u2013 an ephemeral pipeline will be constructed.

\n
\n
Parameters
\n
    \n
  • solid_def (SolidDefinition) \u2013 The solid to execute.

  • \n
  • mode_def (Optional[ModeDefinition]) \u2013 The mode within which to execute the solid. Use this\nif, e.g., custom resources, loggers, or executors are desired.

  • \n
  • input_values (Optional[Dict[str, Any]]) \u2013 A dict of input names to input values, used to\npass inputs to the solid directly. You may also use the run_config to\nconfigure any inputs that are configurable.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary key-value pairs that will be added to pipeline\nlogs.

  • \n
  • run_config (Optional[dict]) \u2013 The configuration that parameterized this\nexecution, as a dict.

  • \n
  • raise_on_error (Optional[bool]) \u2013 Whether or not to raise exceptions when they occur.\nDefaults to True, since this is the most useful behavior in test.

  • \n
\n
\n
Returns
\n

The result of executing the\nsolid.

\n
\n
Return type
\n

Union[CompositeSolidExecutionResult, SolidExecutionResult]

\n
\n
\n
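
A minimal sketch of the standard unit-test pattern with execute_solid:

\n
from dagster import execute_solid, solid\n\n\n@solid\ndef add_one(_, num: int) -> int:\n    return num + 1\n\n\nresult = execute_solid(add_one, input_values={"num": 5})\nassert result.success\nassert result.output_value() == 6\n
\n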
\n\n
\n
\ndagster.execute_solid_within_pipeline(pipeline_def, solid_name, inputs=None, run_config=None, mode=None, preset=None, tags=None, instance=None)[source]\u00b6
\n

Execute a single solid within an existing pipeline.

\n

Intended to support tests. Input values may be passed directly.

\n
\n
Parameters
\n
    \n
  • pipeline_def (PipelineDefinition) \u2013 The pipeline within which to execute the solid.

  • \n
  • solid_name (str) \u2013 The name of the solid, or the aliased solid, to execute.

  • \n
  • inputs (Optional[Dict[str, Any]]) \u2013 A dict of input names to input values, used to\npass input values to the solid directly. You may also use the run_config to\nconfigure any inputs that are configurable.

  • \n
  • run_config (Optional[dict]) \u2013 The configuration that parameterized this\nexecution, as a dict.

  • \n
  • mode (Optional[str]) \u2013 The name of the pipeline mode to use. You may not set both mode\nand preset.

  • \n
  • preset (Optional[str]) \u2013 The name of the pipeline preset to use. You may not set both\nmode and preset.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary key-value pairs that will be added to pipeline\nlogs.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The instance to execute against. If this is None,\nan ephemeral instance will be used, and no artifacts will be persisted from the run.

  • \n
\n
\n
Returns
\n

The result of executing the\nsolid.

\n
\n
Return type
\n

Union[CompositeSolidExecutionResult, SolidExecutionResult]

\n
\n
\n
\n\n
\n
\ndagster.execute_solids_within_pipeline(pipeline_def, solid_names, inputs=None, run_config=None, mode=None, preset=None, tags=None, instance=None)[source]\u00b6
\n

Execute a set of solids within an existing pipeline.

\n

Intended to support tests. Input values may be passed directly.

\n
\n
Parameters
\n
    \n
  • pipeline_def (PipelineDefinition) \u2013 The pipeline within which to execute the solid.

  • \n
  • solid_names (FrozenSet[str]) \u2013 A set of the solid names, or the aliased solids, to execute.

  • \n
  • inputs (Optional[Dict[str, Dict[str, Any]]]) \u2013 A dict keyed on solid names, whose values are\ndicts of input names to input values, used to pass input values to the solids directly.\nYou may also use the run_config to configure any inputs that are configurable.

  • \n
  • run_config (Optional[dict]) \u2013 The configuration that parameterized this\nexecution, as a dict.

  • \n
  • mode (Optional[str]) \u2013 The name of the pipeline mode to use. You may not set both mode\nand preset.

  • \n
  • preset (Optional[str]) \u2013 The name of the pipeline preset to use. You may not set both\nmode and preset.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary key-value pairs that will be added to pipeline\nlogs.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The instance to execute against. If this is None,\nan ephemeral instance will be used, and no artifacts will be persisted from the run.

  • \n
\n
\n
Returns
\n

The results of\nexecuting the solids, keyed by solid name.

\n
\n
Return type
\n

Dict[str, Union[CompositeSolidExecutionResult, SolidExecutionResult]]

\n
\n
\n
\n\n
\n
\nclass dagster.SolidExecutionResult(solid, step_events_by_kind, reconstruct_context, pipeline_def, output_capture=None)[source]\u00b6
\n

Execution result for a leaf solid in a pipeline.

\n

Users should not instantiate this class.

\n
\n
\nproperty compute_input_event_dict\u00b6
\n

All events of type STEP_INPUT, keyed by input name.

\n
\n
Type
\n

Dict[str, DagsterEvent]

\n
\n
\n
\n\n
\n
\nproperty compute_output_events_dict\u00b6
\n

All events of type STEP_OUTPUT, keyed by output name

\n
\n
Type
\n

Dict[str, List[DagsterEvent]]

\n
\n
\n
\n\n
\n
\nproperty compute_step_events\u00b6
\n

All events generated by execution of the solid compute function.

\n
\n
Type
\n

List[DagsterEvent]

\n
\n
\n
\n\n
\n
\nproperty compute_step_failure_event\u00b6
\n

The STEP_FAILURE event, throws if it did not fail.

\n
\n
Type
\n

DagsterEvent

\n
\n
\n
\n\n
\n
\nproperty expectation_events_during_compute\u00b6
\n

All events of type STEP_EXPECTATION_RESULT.

\n
\n
Type
\n

List[DagsterEvent]

\n
\n
\n
\n\n
\n
\nproperty expectation_results_during_compute\u00b6
\n

All expectation results yielded by the solid

\n
\n
Type
\n

List[ExpectationResult]

\n
\n
\n
\n\n
\n
\nproperty failure_data\u00b6
\n

Any data corresponding to this step\u2019s failure, if it\nfailed.

\n
\n
Type
\n

Union[None, StepFailureData]

\n
\n
\n
\n\n
\n
\nget_output_event_for_compute(output_name='result')[source]\u00b6
\n

The STEP_OUTPUT event for the given output name.

\n

Throws if not present.

\n
\n
Parameters
\n

output_name (Optional[str]) \u2013 The name of the output. (default: \u2018result\u2019)

\n
\n
Returns
\n

The corresponding event.

\n
\n
Return type
\n

DagsterEvent

\n
\n
\n
\n\n
\n
\nget_output_events_for_compute(output_name='result')[source]\u00b6
\n

The STEP_OUTPUT event for the given output name.

\n

Throws if not present.

\n
\n
Parameters
\n

output_name (Optional[str]) \u2013 The name of the output. (default: \u2018result\u2019)

\n
\n
Returns
\n

The corresponding events.

\n
\n
Return type
\n

List[DagsterEvent]

\n
\n
\n
\n\n
\n
\nget_step_success_event()[source]\u00b6
\n

DagsterEvent: The STEP_SUCCESS event, throws if not present.

\n
\n\n
\n
\nproperty input_events_during_compute\u00b6
\n

All events of type STEP_INPUT.

\n
\n
Type
\n

List[DagsterEvent]

\n
\n
\n
\n\n
\n
\nproperty materialization_events_during_compute\u00b6
\n

All events of type ASSET_MATERIALIZATION.

\n
\n
Type
\n

List[DagsterEvent]

\n
\n
\n
\n\n
\n
\nproperty materializations_during_compute\u00b6
\n

All materializations yielded by the solid.

\n
\n
Type
\n

List[Materialization]

\n
\n
\n
\n\n
\n
\nproperty output_events_during_compute\u00b6
\n

All events of type STEP_OUTPUT.

\n
\n
Type
\n

List[DagsterEvent]

\n
\n
\n
\n\n
\n
\noutput_value(output_name='result')[source]\u00b6
\n

Get a computed output value.

\n

Note that calling this method will reconstruct the pipeline context (including, e.g.,\nresources) to retrieve materialized output values.

\n
\n
Parameters
\n

output_name (str) \u2013 The output name for which to retrieve the value. (default: \u2018result\u2019)

\n
\n
Returns
\n

\n
None if execution did not succeed, the output value

in the normal case, and a dict of mapping keys to values in the mapped case.

\n
\n
\n

\n
\n
Return type
\n

Union[None, Any, Dict[str, Any]]

\n
\n
\n
\n\n
\n
\nproperty output_values\u00b6
\n

The computed output values.

\n

Returns None if execution did not succeed.

\n
\n
Returns a dictionary where keys are output names and the values are:
    \n
  • the output values in the normal case

  • \n
  • a dictionary from mapping key to corresponding value in the mapped case

  • \n
\n
\n
\n

Note that accessing this property will reconstruct the pipeline context (including, e.g.,\nresources) to retrieve materialized output values.

\n
\n
Type
\n

Union[None, Dict[str, Union[Any, Dict[str, Any]]]

\n
\n
\n
\n\n
\n
\nproperty retry_attempts\u00b6
\n

The number of times this step was retried.

\n
\n\n
\n
\nproperty skipped\u00b6
\n

Whether solid execution was skipped.

\n
\n
Type
\n

bool

\n
\n
\n
\n\n
\n
\nproperty success\u00b6
\n

Whether solid execution was successful.

\n
\n
Type
\n

bool

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.CompositeSolidExecutionResult(solid, event_list, step_events_by_kind, reconstruct_context, pipeline_def, handle=None, output_capture=None)[source]\u00b6
\n

Execution result for a composite solid in a pipeline.

\n

Users should not instantiate this class directly.

\n
\n
\noutput_for_solid(handle_str, output_name='result')\u00b6
\n

Get the output of a solid by its solid handle string and output name.

\n
\n
Parameters
\n
    \n
  • handle_str (str) \u2013 The string handle for the solid.

  • \n
  • output_name (str) \u2013 Optional. The name of the output, default to DEFAULT_OUTPUT.

  • \n
\n
\n
Returns
\n

The output value for the handle and output_name.

\n
\n
\n
\n\n
\n
\nresult_for_handle(handle)\u00b6
\n

Get the result of a solid by its solid handle.

\n

This allows indexing into top-level solids to retrieve the results of children of\ncomposite solids.

\n
\n
Parameters
\n

handle (Union[str,NodeHandle]) \u2013 The handle for the solid.

\n
\n
Returns
\n

The result of the given\nsolid.

\n
\n
Return type
\n

Union[CompositeSolidExecutionResult, SolidExecutionResult]

\n
\n
\n
\n\n
\n
\nresult_for_solid(name)\u00b6
\n

Get the result of a top level solid.

\n
\n
Parameters
\n

name (str) \u2013 The name of the top-level solid or aliased solid for which to retrieve the\nresult.

\n
\n
Returns
\n

The result of the solid\nexecution within the pipeline.

\n
\n
Return type
\n

Union[CompositeSolidExecutionResult, SolidExecutionResult]

\n
\n
\n
\n\n
\n
\nproperty solid_result_list\u00b6
\n

The results for each\ntop level solid.

\n
\n
Type
\n

List[Union[CompositeSolidExecutionResult, SolidExecutionResult]]

\n
\n
\n
\n\n
\n
\nproperty step_event_list\u00b6
\n

List[DagsterEvent] The full list of events generated by steps in the execution.

\n

Excludes events generated by the pipeline lifecycle, e.g., PIPELINE_START.

\n
\n\n
\n
\nproperty success\u00b6
\n

Whether all steps in the execution were successful.

\n
\n
Type
\n

bool

\n
\n
\n
\n\n
\n\n
\n
\n

Execution context\u00b6

\n
\n
\nclass dagster.SolidExecutionContext(step_execution_context)[source]\u00b6
\n

The context object that can be made available as the first argument to a solid\u2019s compute\nfunction.

\n

The context object provides system information such as resources, config, and logging to a\nsolid\u2019s compute function. Users should not instantiate this object directly.

\n

Example:

\n
@solid\ndef hello_world(context: SolidExecutionContext):\n    context.log.info("Hello, world!")\n
\n
\n
\n
\nadd_output_metadata(metadata, output_name=None, mapping_key=None)[source]\u00b6
\n

Add metadata to one of the outputs of an op.

\n

This can only be used once per output in the body of an op. Using this method with the same output_name more than once within an op will result in an error.

\n
\n
Parameters
\n
    \n
  • metadata (Mapping[str, Any]) \u2013 The metadata to attach to the output

  • \n
  • output_name (Optional[str]) \u2013 The name of the output to attach metadata to. If there is only one output on the op, then this argument does not need to be provided. The metadata will automatically be attached to the only output.

  • \n
\n
\n
\n

Examples:

\n
from dagster import Out, op\nfrom typing import Tuple\n\n@op\ndef add_metadata(context):\n    context.add_output_metadata({"foo": "bar"})\n    return 5 # Since the default output is called "result", metadata will be attached to the output "result".\n\n@op(out={"a": Out(), "b": Out()})\ndef add_metadata_two_outputs(context) -> Tuple[str, int]:\n    context.add_output_metadata({"foo": "bar"}, output_name="b")\n    context.add_output_metadata({"baz": "bat"}, output_name="a")\n\n    return ("dog", 5)\n
\n
\n
\n\n
\n
\nconsume_events()[source]\u00b6
\n

Pops and yields all user-generated events that have been recorded from this context.

\n

If consume_events has not yet been called, this will yield all logged events since the beginning of the op\u2019s computation. If consume_events has been called, it will yield all events since the last time consume_events was called. Designed for internal use. Users should never need to invoke this method.

\n
\n\n
\n
\nget_mapping_key()[source]\u00b6
\n

Which mapping_key this execution is for if downstream of a DynamicOutput, otherwise None.

\n
\n\n
\n
\nget_tag(key)[source]\u00b6
\n

Get a logging tag.

\n
\n
Parameters
\n

key (str) \u2013 The tag to get.

\n
\n
Returns
\n

The value of the tag, if present.

\n
\n
Return type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nproperty has_partition_key\u00b6
\n

Whether the current run is a partitioned run

\n
\n\n
\n
\nhas_tag(key)[source]\u00b6
\n

Check if a logging tag is set.

\n
\n
Parameters
\n

key (str) \u2013 The tag to check.

\n
\n
Returns
\n

Whether the tag is set.

\n
\n
Return type
\n

bool

\n
\n
\n
\n\n
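For example, a minimal sketch of reading logging tags from the context inside a solid, assuming a run launched with a hypothetical "owner" tag:
\n
@solid\ndef report_owner(context):\n    # has_tag guards against missing tags; get_tag returns the value (or None).\n    if context.has_tag("owner"):\n        context.log.info("Run owner: {}".format(context.get_tag("owner")))\n
\n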
\n
\nproperty instance\u00b6
\n

The current Dagster instance

\n
\n
Type
\n

DagsterInstance

\n
\n
\n
\n\n
\n
\nproperty log\u00b6
\n

The log manager available in the execution context.

\n
\n
Type
\n

DagsterLogManager

\n
\n
\n
\n\n
\n
\nlog_event(event)[source]\u00b6
\n

Log an AssetMaterialization, AssetObservation, or ExpectationResult from within the body of an op.

\n

Events logged with this method will appear in the list of DagsterEvents, as well as the event log.

\n
\n
Parameters
\n

event (Union[AssetMaterialization, Materialization, AssetObservation, ExpectationResult]) \u2013 The event to log.

\n
\n
\n

Examples:

\n
from dagster import op, AssetMaterialization\n\n@op\ndef log_materialization(context):\n    context.log_event(AssetMaterialization("foo"))\n
\n
\n
\n\n
\n
\nproperty mode_def\u00b6
\n

The mode of the current execution.

\n
\n
Type
\n

ModeDefinition

\n
\n
\n
\n\n
\n
\noutput_asset_partition_key(output_name='result')[source]\u00b6
\n

Returns the asset partition key for the given output. Defaults to \u201cresult\u201d, which is the\nname of the default output.

\n
\n\n
\n
\noutput_asset_partitions_time_window(output_name='result')[source]\u00b6
\n

The time window for the partitions of the output asset.

\n

Raises an error if either of the following are true:\n- The output asset has no partitioning.\n- The output asset is not partitioned with a TimeWindowPartitionsDefinition.

\n
\n\n
\n
\nproperty partition_key\u00b6
\n

The partition key for the current run.

\n

Raises an error if the current run is not a partitioned run.

\n
\n\n
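For example, a minimal sketch of reading the partition key during a direct invocation, using build_solid_context (documented below) to supply a hypothetical partition:
\n
from dagster import build_solid_context, solid\n\n@solid\ndef partitioned_solid(context):\n    return context.partition_key\n\n# Supply a partition key when building the context for direct invocation.\ncontext = build_solid_context(partition_key="2022-01-01")\nassert partitioned_solid(context) == "2022-01-01"\n
\n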
\n
\nproperty pdb\u00b6
\n

Gives access to pdb debugging from within the op.

\n

Example:

\n
@op\ndef debug(context):\n    context.pdb.set_trace()\n
\n
\n
\n
Type
\n

dagster.utils.forked_pdb.ForkedPdb

\n
\n
\n
\n\n
\n
\nproperty pipeline_def\u00b6
\n

The currently executing pipeline.

\n
\n
Type
\n

PipelineDefinition

\n
\n
\n
\n\n
\n
\nproperty pipeline_name\u00b6
\n

The name of the currently executing pipeline.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty pipeline_run\u00b6
\n

The current pipeline run

\n
\n
Type
\n

PipelineRun

\n
\n
\n
\n\n
\n
\nproperty resources\u00b6
\n

The currently available resources.

\n
\n
Type
\n

Resources

\n
\n
\n
\n\n
\n
\nproperty retry_number\u00b6
\n

Which retry attempt is currently executing, i.e., 0 for the initial attempt, 1 for the first retry, etc.

\n
\n\n
\n
\nproperty run_config\u00b6
\n

The run config for the current execution.

\n
\n
Type
\n

dict

\n
\n
\n
\n\n
\n
\nproperty run_id\u00b6
\n

The id of the current execution\u2019s run.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty solid_config\u00b6
\n

The parsed config specific to this solid.

\n
\n\n
\n
\nproperty solid_def\u00b6
\n

The current solid definition.

\n
\n
Type
\n

SolidDefinition

\n
\n
\n
\n\n
\n
\nproperty step_launcher\u00b6
\n

The current step launcher, if any.

\n
\n
Type
\n

Optional[StepLauncher]

\n
\n
\n
\n\n
\n\n
\n
\ndagster.build_solid_context(resources=None, solid_config=None, resources_config=None, instance=None, config=None, partition_key=None)[source]\u00b6
\n

Builds solid execution context from provided parameters.

\n

build_solid_context can be used as either a function or context manager. If there is a\nprovided resource that is a context manager, then build_solid_context must be used as a\ncontext manager. This function can be used to provide the context argument when directly\ninvoking a solid.

\n
\n
Parameters
\n
    \n
  • resources (Optional[Dict[str, Any]]) \u2013 The resources to provide to the context. These can be\neither values or resource definitions.

  • \n
  • solid_config (Optional[Any]) \u2013 The solid config to provide to the context. The value provided\nhere will be available as context.solid_config.

  • \n
  • resources_config (Optional[Dict[str, Any]]) \u2013 Configuration for any resource definitions\nprovided to the resources arg. The configuration under a specific key should match the\nresource under a specific key in the resources dictionary.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The dagster instance configured for the context.\nDefaults to DagsterInstance.ephemeral().

  • \n
\n
\n
\n

Examples

\n
context = build_solid_context()\nsolid_to_invoke(context)\n\nwith build_solid_context(resources={"foo": context_manager_resource}) as context:\n    solid_to_invoke(context)\n
\n
\n
\n\n
\n
\n
\n

Composing solids\u00b6

\n
\n
\n@dagster.composite_solid(name=None, input_defs=None, output_defs=None, description=None, config_schema=None, config_fn=None)[source]\u00b6
\n

Create a composite solid with the specified parameters from the decorated composition\nfunction.

\n

Using this decorator allows you to build up the dependency graph of the composite by writing a\nfunction that invokes solids and passes the output to other solids. This is similar to the use\nof the @pipeline decorator, with the additional ability to remap inputs,\noutputs, and config across the composite boundary.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 Name for the new composite solid. Must be unique within any\nPipelineDefinition using the solid.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of the new composite solid.

  • \n
  • input_defs (Optional[List[InputDefinition]]) \u2013

    Information about the inputs that this composite solid maps. Information provided here\nwill be combined with what can be inferred from the function signature, with these\nexplicit InputDefinitions taking precedence.

    \n

    Uses of inputs in the body of the decorated composition function will determine\nthe InputMappings passed to the underlying\nCompositeSolidDefinition.

    \n

  • \n
  • output_defs (Optional[List[OutputDefinition]]) \u2013

    Information about the outputs this composite solid maps. Information provided here\nwill be combined with what can be inferred from the return type signature if there\nis only one OutputDefinition.

    \n

    Uses of these outputs in the body of the decorated composition function, as well as the\nreturn value of the decorated function, will be used to infer the appropriate set of\nOutputMappings for the underlying\nCompositeSolidDefinition.

    \n

    To map multiple outputs, return a dictionary from the composition function.

    \n

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 If the config_fn argument is provided, this\nargument can be provided to set the schema for outer config that is passed to the\nconfig_fn. If config_fn is provided, but this argument is not provided, any config\nwill be accepted.

  • \n
  • config_fn (Callable[[dict], dict]) \u2013

    By specifying a config mapping\nfunction, you can override the configuration for the child solids contained within this\ncomposite solid. config_fn maps the config provided to the\ncomposite solid to the config that will be provided to the child solids.

    \n

    If this argument is provided, the config_schema argument can also be provided to limit\nwhat config values can be passed to the composite solid.

    \n

  • \n
\n
\n
\n

Examples

\n
@lambda_solid\ndef add_one(num: int) -> int:\n    return num + 1\n\n@composite_solid\ndef add_two(num: int) -> int:\n    adder_1 = add_one.alias('adder_1')\n    adder_2 = add_one.alias('adder_2')\n\n    return adder_2(adder_1(num))\n
\n
\n
\n\n
\n
\nclass dagster.CompositeSolidDefinition(name, solid_defs, input_mappings=None, output_mappings=None, config_mapping=None, dependencies=None, description=None, tags=None, positional_inputs=None)[source]\u00b6
\n

The core unit of composition and abstraction, composite solids allow you to\ndefine a solid from a graph of solids.

\n

In the same way that you would refactor a block of code into a function to deduplicate, organize,\nor manage complexity, you can refactor solids in a pipeline into a composite solid.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of this composite solid. Must be unique within any\nPipelineDefinition using the solid.

  • \n
  • solid_defs (List[Union[SolidDefinition, CompositeSolidDefinition]]) \u2013 The set of solid\ndefinitions used in this composite solid. Composites may be arbitrarily nested.

  • \n
  • input_mappings (Optional[List[InputMapping]]) \u2013 Define the inputs to the composite solid,\nand how they map to the inputs of its constituent solids.

  • \n
  • output_mappings (Optional[List[OutputMapping]]) \u2013 Define the outputs of the composite solid,\nand how they map from the outputs of its constituent solids.

  • \n
  • config_mapping (Optional[ConfigMapping]) \u2013 By specifying a config mapping, you can override\nthe configuration for the child solids contained within this composite solid. Config\nmappings require both a configuration field to be specified, which is exposed as the\nconfiguration for the composite solid, and a configuration mapping function, which\nis called to map the configuration of the composite solid into the configuration that\nis applied to any child solids.

  • \n
  • dependencies (Optional[Dict[Union[str, NodeInvocation], Dict[str, DependencyDefinition]]]) \u2013 A structure that declares where each solid gets its inputs. The keys at the top\nlevel dict are either string names of solids or NodeInvocations. The values\nare dicts that map input names to DependencyDefinitions.

  • \n
  • description (Optional[str]) \u2013 Human readable description of this composite solid.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for the solid. Frameworks may\nexpect and require certain metadata to be attached to a solid. Users should generally\nnot set metadata directly. Values that are not strings will be json encoded and must meet\nthe criteria that json.loads(json.dumps(value)) == value.

  • \n
  • positional_inputs (Optional[List[str]]) \u2013 The positional order of the inputs if it\ndiffers from the order of the input mappings

  • \n
\n
\n
\n

Examples

\n
@lambda_solid\ndef add_one(num: int) -> int:\n    return num + 1\n\nadd_two = CompositeSolidDefinition(\n    'add_two',\n    solid_defs=[add_one],\n    dependencies={\n        NodeInvocation('add_one', 'adder_1'): {},\n        NodeInvocation('add_one', 'adder_2'): {'num': DependencyDefinition('adder_1')},\n    },\n    input_mappings=[InputDefinition('num', Int).mapping_to('adder_1', 'num')],\n    output_mappings=[OutputDefinition(Int).mapping_from('adder_2')],\n)\n
\n
\n
\n
\nconfigured(config_or_config_fn, name, config_schema=None, description=None)\u00b6
\n

Wraps this object in an object of the same type that provides configuration to the inner\nobject.

\n
\n
Parameters
\n
    \n
  • config_or_config_fn (Union[Any, Callable[[Any], Any]]) \u2013 Either (1) Run configuration\nthat fully satisfies this object\u2019s config schema or (2) A function that accepts run\nconfiguration and returns run configuration that fully satisfies this object\u2019s\nconfig schema. In the latter case, config_schema must be specified. When\npassing a function, it\u2019s easiest to use configured().

  • \n
  • name (str) \u2013 Name of the new definition. This is a required argument, as this definition\ntype has a name uniqueness constraint.

  • \n
  • config_schema (ConfigSchema) \u2013 If config_or_config_fn is a function, the config schema\nthat its input must satisfy.

  • \n
  • description (Optional[str]) \u2013 Description of the new definition. If not specified,\ninherits the description of the definition being configured.

  • \n
\n
\n
\n

Returns (ConfigurableDefinition): A configured version of this object.

\n
\n\n
\n\n
\n
\nclass dagster.InputMapping(definition, maps_to)[source]\u00b6
\n

Defines an input mapping for a composite solid.

\n
\n
Parameters
\n
    \n
  • definition (InputDefinition) \u2013 Defines the input to the composite solid.

  • \n
  • solid_name (str) \u2013 The name of the child solid onto which to map the input.

  • \n
  • input_name (str) \u2013 The name of the input to the child solid onto which to map the input.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.OutputMapping(definition, maps_from)[source]\u00b6
\n

Defines an output mapping for a composite solid.

\n
\n
Parameters
\n
    \n
  • definition (OutputDefinition) \u2013 Defines the output of the composite solid.

  • \n
  • solid_name (str) \u2013 The name of the child solid from which to map the output.

  • \n
  • output_name (str) \u2013 The name of the child solid\u2019s output from which to map the output.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.ConfigMapping(config_fn, config_schema=None, receive_processed_config_values=None)[source]
\n

Defines a config mapping for a graph (or job).

\n

By specifying a config mapping function, you can override the configuration for the child\nops and graphs contained within a graph.

\n

Config mappings require the configuration schema to be specified as config_schema, which will\nbe exposed as the configuration schema for the graph, as well as a configuration mapping\nfunction, config_fn, which maps the config provided to the composite solid to the config\nthat will be provided to the child nodes.

\n
\n
Parameters
\n
    \n
  • config_fn (Callable[[dict], dict]) \u2013 The function that will be called\nto map the graph config to a config appropriate for the child nodes.

  • \n
  • config_schema (ConfigSchema) \u2013 The schema of the graph config.

  • \n
  • receive_processed_config_values (Optional[bool]) \u2013 If true, config values provided to the config_fn\nwill be converted to their dagster types before being passed in. For example, if this\nvalue is true, enum config passed to config_fn will be actual enums, while if false,\nthen enum config passed to config_fn will be strings.

  • \n
\n
\n
\n
\n\n
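A hedged sketch of a ConfigMapping that exposes a simplified schema on a job and fans it out to a child op's config (the op and job names here are hypothetical):
\n
from dagster import ConfigMapping, job, op\n\n@op(config_schema={"iterations": int})\ndef do_work(context):\n    return context.op_config["iterations"]\n\ndef simplified_config_fn(outer):\n    # Map the simplified outer config onto the child op's config.\n    return {"ops": {"do_work": {"config": {"iterations": outer["n"]}}}}\n\n@job(config=ConfigMapping(config_fn=simplified_config_fn, config_schema={"n": int}))\ndef mapped_job():\n    do_work()\n\n# mapped_job.execute_in_process(run_config={"n": 3})\n
\n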
\n
\n

Events\u00b6

\n

The objects that can be yielded by the body of solids\u2019 compute functions to communicate with the\nDagster framework.

\n

(Note that Failure and RetryRequested are intended to be raised from solids rather than yielded.)

\n
\n

Event types\u00b6

\n
\n
\nclass dagster.Output(value, output_name='result', metadata_entries=None, metadata=None)[source]
\n

Event corresponding to one of an op\u2019s outputs.

\n

Op compute functions must explicitly yield events of this type when they have more than\none output, or when they also yield events of other types, or when defining an op using the\nOpDefinition API directly.

\n

Outputs are values produced by ops that will be consumed by downstream ops in a job.\nThey are type-checked at op boundaries when their corresponding Out\nor the downstream In is typed.

\n
\n
Parameters
\n
    \n
  • value (Any) \u2013 The value returned by the compute function.

  • \n
  • output_name (Optional[str]) \u2013 Name of the corresponding out. (default:\n\u201cresult\u201d)

  • \n
  • metadata_entries (Optional[List[Union[MetadataEntry, PartitionMetadataEntry]]]) \u2013 (Experimental) A set of metadata entries to attach to events related to this Output.

  • \n
  • metadata (Optional[Dict[str, Union[str, float, int, Dict, MetadataValue]]]) \u2013 Arbitrary metadata about the output. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
\n\n
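For example, a minimal sketch of explicitly yielding Output events from an op with more than one output (the op and output names here are hypothetical):
\n
from dagster import Out, Output, op\n\n@op(out={"ints": Out(int), "strs": Out(str)})\ndef multi_out():\n    # With multiple outputs, each value must be yielded as an Output event.\n    yield Output(1, output_name="ints")\n    yield Output("hello", output_name="strs", metadata={"length": 5})\n
\n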
\n
\nclass dagster.AssetMaterialization(asset_key, description=None, metadata_entries=None, partition=None, tags=None, metadata=None)[source]
\n

Event indicating that an op has materialized an asset.

\n

Op compute functions may yield events of this type whenever they wish to indicate to the\nDagster framework (and the end user) that they have produced a materialized value as a\nside effect of computation. Unlike outputs, asset materializations can not be passed to other\nops, and their persistence is controlled by op logic, rather than by the Dagster\nframework.

\n

Op authors should use these events to organize metadata about the side effects of their\ncomputations, enabling tooling like the Assets dashboard in Dagit.

\n
\n
Parameters
\n
    \n
  • asset_key (Union[str, List[str], AssetKey]) \u2013 A key to identify the materialized asset across job\nruns

  • \n
  • description (Optional[str]) \u2013 A longer human-readable description of the materialized value.

  • \n
  • metadata_entries (Optional[List[Union[MetadataEntry, PartitionMetadataEntry]]]) \u2013 Arbitrary metadata about the\nmaterialized value.

  • \n
  • partition (Optional[str]) \u2013 The name of the partition that was materialized.

  • \n
  • tags (Optional[Dict[str, str]]) \u2013 (Experimental) Tag metadata for a given asset\nmaterialization. Used for search and organization of the asset entry in the asset\ncatalog in Dagit.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 Arbitrary metadata about the asset. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
\n
\nstatic file(path, description=None, asset_key=None)[source]
\n

Static constructor for standard materializations corresponding to files on disk.

\n
\n
Parameters
\n
    \n
  • path (str) \u2013 The path to the file.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the materialization.

  • \n
\n
\n
\n
\n\n
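For example, a minimal sketch of using this constructor, assuming the op has already written a file to a hypothetical path:
\n
from dagster import AssetMaterialization, Output, op\n\n@op\ndef write_report():\n    path = "/tmp/report.csv"  # hypothetical output location\n    # ... write the report to the file at path here ...\n    yield AssetMaterialization.file(path, description="Daily report written to disk")\n    yield Output(path)\n
\n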
\n\n
\n
\nclass dagster.ExpectationResult(success, label=None, description=None, metadata_entries=None, metadata=None)[source]
\n

Event corresponding to a data quality test.

\n

Op compute functions may yield events of this type whenever they wish to indicate to the\nDagster framework (and the end user) that a data quality test has produced a (positive or\nnegative) result.

\n
\n
Parameters
\n
    \n
  • success (bool) \u2013 Whether the expectation passed or not.

  • \n
  • label (Optional[str]) \u2013 Short display name for expectation. Defaults to \u201cresult\u201d.

  • \n
  • description (Optional[str]) \u2013 A longer human-readable description of the expectation.

  • \n
  • metadata_entries (Optional[List[MetadataEntry]]) \u2013 Arbitrary metadata about the\nexpectation.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 Arbitrary metadata about the expectation. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
\n\n
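For example, a minimal sketch of yielding an ExpectationResult alongside an op's output (the op and its dataframe input are hypothetical):
\n
from dagster import ExpectationResult, Output, op\n\n@op\ndef validate_row_count(df):\n    yield ExpectationResult(\n        success=len(df) > 0,\n        label="has_rows",\n        description="Checks that the upstream dataframe is not empty.",\n        metadata={"row_count": len(df)},\n    )\n    # Because other events are yielded, the output must be yielded explicitly too.\n    yield Output(df)\n
\n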
\n
\nclass dagster.TypeCheck(success, description=None, metadata_entries=None, metadata=None)[source]
\n

Event corresponding to the result of a typecheck.

\n

Events of this type should be returned by user-defined type checks when they need to encapsulate\nadditional metadata about a type check\u2019s success or failure. (i.e., when using\nas_dagster_type(), @usable_as_dagster_type, or the underlying\nPythonObjectDagsterType() API.)

\n

Solid compute functions should generally avoid yielding events of this type to avoid confusion.

\n
\n
Parameters
\n
    \n
  • success (bool) \u2013 True if the type check succeeded, False otherwise.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the type check.

  • \n
  • metadata_entries (Optional[List[MetadataEntry]]) \u2013 Arbitrary metadata about the\ntype check.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 Arbitrary metadata about the type check. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
\n\n
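For example, a minimal sketch of a user-defined type check that returns TypeCheck in order to attach metadata (the type name and check logic here are hypothetical):
\n
from dagster import DagsterType, TypeCheck\n\ndef positive_int_check(_context, value):\n    if isinstance(value, int) and value > 0:\n        return TypeCheck(success=True, metadata={"value": value})\n    return TypeCheck(\n        success=False,\n        description="Expected a positive int, got {!r}".format(value),\n    )\n\nPositiveInt = DagsterType(name="PositiveInt", type_check_fn=positive_int_check)\n
\n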
\n
\nclass dagster.Failure(description=None, metadata_entries=None, metadata=None)[source]
\n

Event indicating op failure.

\n

Raise events of this type from within op compute functions or custom type checks in order to\nindicate an unrecoverable failure in user code to the Dagster machinery and return\nstructured metadata about the failure.

\n
\n
Parameters
\n
    \n
  • description (Optional[str]) \u2013 A human-readable description of the failure.

  • \n
  • metadata_entries (Optional[List[MetadataEntry]]) \u2013 Arbitrary metadata about the\nfailure.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 Arbitrary metadata about the failure. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
\n\n
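For example, a minimal sketch of raising Failure with structured metadata from inside an op (the endpoint and health check here are hypothetical):
\n
from dagster import Failure, op\n\n@op\ndef fetch_remote_data(context):\n    response_ok = False  # placeholder for a real health check\n    if not response_ok:\n        raise Failure(\n            description="Upstream service returned an error response.",\n            metadata={"endpoint": "https://example.com/api", "status": 503},\n        )\n
\n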
\n
\nclass dagster.RetryRequested(max_retries=1, seconds_to_wait=None)[source]
\n

An exception to raise from an op to indicate that it should be retried.

\n
\n
Parameters
\n
    \n
  • max_retries (Optional[int]) \u2013 The max number of retries this step should attempt before failing

  • \n
  • seconds_to_wait (Optional[Union[float,int]]) \u2013 Seconds to wait before restarting the step after putting the step in\nto the up_for_retry state

  • \n
\n
\n
\n

Example

\n
@op\ndef flakes():\n    try:\n        flakey_operation()\n    except Exception as e:\n        raise RetryRequested(max_retries=3) from e\n
\n
\n
\n\n
\n
\n
\n

Metadata\u00b6

\n

Dagster uses metadata to communicate arbitrary user-specified metadata about structured\nevents.

\n
\n
\nclass dagster.MetadataValue[source]
\n

Utility class to wrap metadata values passed into Dagster events so that they can be\ndisplayed in Dagit and other tooling.

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "my_text_label": "hello",\n            "dashboard_url": MetadataValue.url("http://mycoolsite.com/my_dashboard"),\n            "num_rows": 0,\n        },\n    )\n
\n
\n
\n
\nstatic asset(asset_key)[source]
\n

Static constructor for a metadata value referencing a Dagster asset, by key.

\n

For example:

\n
@op\ndef validate_table(context, df):\n    yield AssetMaterialization(\n        asset_key=AssetKey("my_table"),\n        metadata={\n            "Related asset": MetadataValue.asset(AssetKey('my_other_table')),\n        },\n    )\n
\n
\n
\n
Parameters
\n

asset_key (AssetKey) \u2013 The asset key referencing the asset.

\n
\n
\n
\n\n
\n
\nstatic dagster_run(run_id)[source]
\n

Static constructor for a metadata value wrapping a reference to a Dagster run.

\n
\n
Parameters
\n

run_id (str) \u2013 The ID of the run.

\n
\n
\n
\n\n
\n
\nstatic float(value)[source]
\n

Static constructor for a metadata value wrapping a float as\nFloatMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "size (bytes)": MetadataValue.float(calculate_bytes(df)),\n        }\n    )\n
\n
\n
\n
Parameters
\n

value (float) \u2013 The float value for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic int(value)[source]
\n

Static constructor for a metadata value wrapping an int as\nIntMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "number of rows": MetadataValue.int(len(df)),\n        },\n    )\n
\n
\n
\n
Parameters
\n

value (int) \u2013 The int value for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic json(data)[source]
\n

Static constructor for a metadata value wrapping JSON data as\nJsonMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context):\n    yield ExpectationResult(\n        success=not missing_things,\n        label="is_present",\n        metadata={\n            "about my dataset": MetadataValue.json({"missing_columns": missing_things})\n        },\n    )\n
\n
\n
\n
Parameters
\n

data (Dict[str, Any]) \u2013 The JSON data for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic md(data)[source]
\n

Static constructor for a metadata value wrapping markdown data as\nMarkdownMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context, md_str):\n    yield AssetMaterialization(\n        asset_key="info",\n        metadata={\n            'Details': MetadataValue.md(md_str)\n        },\n    )\n
\n
\n
\n
Parameters
\n

md_str (str) \u2013 The markdown for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic path(path)[source]
\n

Static constructor for a metadata value wrapping a path as\nPathMetadataValue. For example:

\n
@op\ndef emit_metadata(context):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "filepath": MetadataValue.path("path/to/file"),\n        }\n    )\n
\n
\n
\n
Parameters
\n

path (str) \u2013 The path for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic python_artifact(python_artifact)[source]
\n

Static constructor for a metadata value wrapping a python artifact as\nPythonArtifactMetadataValue. Can be used as the value type for the\nmetadata parameter for supported events. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "class": MetadataValue.python_artifact(MyClass),\n            "function": MetadataValue.python_artifact(my_function),\n        }\n    )\n
\n
\n
\n
Parameters
\n

value (Callable) \u2013 The python class or function for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic table(records, schema=None)[source]
\n

Static constructor for a metadata value wrapping arbitrary tabular data as\nTableMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context):\n    yield ExpectationResult(\n        success=not has_errors,\n        label="is_valid",\n        metadata={\n            "errors": MetadataValue.table(\n                records=[\n                    TableRecord(code="invalid-data-type", row=2, col="name")\n                ],\n                schema=TableSchema(\n                    columns=[\n                        TableColumn(name="code", type="string"),\n                        TableColumn(name="row", type="int"),\n                        TableColumn(name="col", type="string"),\n                    ]\n                )\n            ),\n        },\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • records (List[TableRecord]) \u2013 The data as a list of records (i.e. rows).

  • \n
  • schema (Optional[TableSchema]) \u2013 A schema for the table.

  • \n
\n
\n
\n
\n\n
\n
\nstatic table_schema(schema)[source]
\n

Static constructor for a metadata value wrapping a table schema as\nTableSchemaMetadataValue. Can be used as the value type\nfor the metadata parameter for supported events. For example:

\n
schema = TableSchema(\n    columns = [\n        TableColumn(name="id", type="int"),\n        TableColumn(name="status", type="bool"),\n    ]\n)\n\nDagsterType(\n    type_check_fn=some_validation_fn,\n    name='MyTable',\n    metadata={\n        'my_table_schema': MetadataValue.table_schema(schema),\n    }\n)\n
\n
\n
\n
Parameters
\n

schema (TableSchema) \u2013 The table schema for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic text(text)[source]
\n

Static constructor for a metadata value wrapping text as\nTextMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "my_text_label": MetadataValue.text("hello")\n        },\n    )\n
\n
\n
\n
Parameters
\n

text (str) \u2013 The text string for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic url(url)[source]
\n

Static constructor for a metadata value wrapping a URL as\nUrlMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context):\n    yield AssetMaterialization(\n        asset_key="my_dashboard",\n        metadata={\n            "dashboard_url": MetadataValue.url("http://mycoolsite.com/my_dashboard"),\n        }\n    )\n
\n
\n
\n
Parameters
\n

url (str) \u2013 The URL for a metadata entry.

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.MetadataEntry(label, description=None, entry_data=None, value=None)[source]
\n

The standard structure for describing metadata for Dagster events.

\n

Lists of objects of this type can be passed as arguments to Dagster events and will be displayed\nin Dagit and other tooling.

\n

Should be yielded from within an IO manager to append metadata for a given input/output event.\nFor other event types, passing a dict with MetadataValue values to the metadata argument\nis preferred.

\n
\n
Parameters
\n
    \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
  • value (MetadataValue) \u2013 Typed metadata entry data. The different types allow\nfor customized display in tools like dagit.

  • \n
\n
\n
\n
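A hedged sketch of yielding MetadataEntry objects from an IO manager's handle_output, as described above; the storage helper functions are hypothetical stand-ins, not part of the API:
\n
from dagster import IOManager, MetadataEntry, io_manager\n\nclass MyIOManager(IOManager):\n    def handle_output(self, context, obj):\n        write_to_storage(obj)  # hypothetical persistence helper\n        # Yield entries to attach metadata to the handled-output event.\n        yield MetadataEntry.int(len(obj), "num_records")\n        yield MetadataEntry.text("my_storage", "storage_target")\n\n    def load_input(self, context):\n        return read_from_storage()  # hypothetical retrieval helper\n\n@io_manager\ndef my_io_manager(_):\n    return MyIOManager()\n
\n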
\n
\nstatic asset(asset_key, label, description=None)[source]
\n

Static constructor for a metadata entry referencing a Dagster asset, by key.

\n

For example:

\n
@op\ndef validate_table(context, df):\n    yield AssetMaterialization(\n        asset_key=AssetKey("my_table"),\n        metadata_entries=[\n             MetadataEntry.asset(AssetKey('my_other_table'), "Related asset"),\n        ],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • asset_key (AssetKey) \u2013 The asset key referencing the asset.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic float(value, label, description=None)[source]
\n

Static constructor for a metadata entry containing float as\nFloatMetadataValue. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata_entries=[MetadataEntry.float(calculate_bytes(df), "size (bytes)")],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • value (Optional[float]) \u2013 The float value contained by this metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic fspath(path, label=None, description=None)[source]
\n

Static constructor for a metadata entry containing a filesystem path as\nPathMetadataValue. For example:

\n
@op\ndef emit_metadata(context):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata_entries=[MetadataEntry.fspath("path/to/file")],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • path (Optional[str]) \u2013 The path contained by this metadata entry.

  • \n
  • label (Optional[str]) \u2013 Short display label for this metadata entry. Defaults to the\nbase name of the path.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic int(value, label, description=None)[source]
\n

Static constructor for a metadata entry containing int as\nIntMetadataValue. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata_entries=[MetadataEntry.int(len(df), "number of rows")],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • value (Optional[int]) \u2013 The int value contained by this metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic json(data, label, description=None)[source]
\n

Static constructor for a metadata entry containing JSON data as\nJsonMetadataValue. For example:

\n
@op\ndef emit_metadata(context):\n    yield ExpectationResult(\n        success=not missing_things,\n        label="is_present",\n        metadata_entries=[\n            MetadataEntry.json(\n                label="metadata", data={"missing_columns": missing_things},\n            )\n        ],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • data (Optional[Dict[str, Any]]) \u2013 The JSON data contained by this metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic md(md_str, label, description=None)[source]
\n

Static constructor for a metadata entry containing markdown data as\nMarkdownMetadataValue. For example:

\n
@op\ndef emit_metadata(context, md_str):\n    yield AssetMaterialization(\n        asset_key="info",\n        metadata_entries=[MetadataEntry.md(md_str=md_str)],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • md_str (Optional[str]) \u2013 The markdown contained by this metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic path(path, label, description=None)[source]
\n

Static constructor for a metadata entry containing a path as\nPathMetadataValue. For example:

\n
@op\ndef emit_metadata(context):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata_entries=[MetadataEntry.path("path/to/file", label="filepath")],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • path (Optional[str]) \u2013 The path contained by this metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic table(records, label, description=None, schema=None)[source]
\n

Static constructor for a metadata entry containing tabular data as\nTableMetadataValue. For example:

\n
@op\ndef emit_metadata(context):\n    yield ExpectationResult(\n        success=not has_errors,\n        label="is_valid",\n        metadata_entries=[\n            MetadataEntry.table(\n                label="errors",\n                records=[\n                    TableRecord(code="invalid-data-type", row=2, col="name")\n                ],\n                schema=TableSchema(\n                    columns=[\n                        TableColumn(name="code", type="string"),\n                        TableColumn(name="row", type="int"),\n                        TableColumn(name="col", type="string"),\n                    ]\n                )\n            ),\n        ],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • records (List[TableRecord]) \u2013 The data as a list of records (i.e. rows).

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
  • schema (Optional[TableSchema]) \u2013 A schema for the table. If none is provided, one will be\nautomatically generated by examining the first record. The schema will include as columns all\nfield names present in the first record, with a type of \u201cstring\u201d, \u201cint\u201d,\n\u201cbool\u201d or \u201cfloat\u201d inferred from the first record\u2019s values. If a value does\nnot directly match one of the above types, it will be treated as a string.

  • \n
\n
\n
\n
\n\n
\n
\nstatic table_schema(schema, label, description=None)[source]
\n

Static constructor for a metadata entry containing a table schema as\nTableSchemaMetadataValue. For example:

\n
schema = TableSchema(\n    columns = [\n        TableColumn(name="id", type="int"),\n        TableColumn(name="status", type="bool"),\n    ]\n)\n\nDagsterType(\n    type_check_fn=some_validation_fn,\n    name='MyTable',\n    metadata_entries=[\n        MetadataEntry.table_schema(\n            schema,\n            label='schema',\n        )\n    ]\n)\n
\n
\n
\n
Parameters
\n
    \n
  • schema (TableSchema) \u2013 The table schema for a metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic text(text, label, description=None)[source]
\n

Static constructor for a metadata entry containing text as\nTextMetadataValue. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata_entries=[\n            MetadataEntry.text("Text-based metadata for this event", "text_metadata")\n        ],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • text (Optional[str]) \u2013 The text of this metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic url(url, label, description=None)[source]
\n

Static constructor for a metadata entry containing a URL as\nUrlMetadataValue. For example:

\n
@op\ndef emit_metadata(context):\n    yield AssetMaterialization(\n        asset_key="my_dashboard",\n        metadata_entries=[\n            MetadataEntry.url(\n                "http://mycoolsite.com/my_dashboard", label="dashboard_url"\n            ),\n        ],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • url (Optional[str]) \u2013 The URL contained by this metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nproperty value
\n

Alias of entry_data.

\n
\n\n
\n\n
\n
\n

Metadata types\u00b6

\n

All metadata types inherit from MetadataValue. The following types are defined:

\n
\n
\nclass dagster.DagsterAssetMetadataValue(asset_key)[source]
\n

Representation of a dagster asset.

\n
\n
Parameters
\n

asset_key (AssetKey) \u2013 The dagster asset key

\n
\n
\n
\n\n
\n
\nclass dagster.DagsterPipelineRunMetadataValue(run_id)[source]
\n

Representation of a dagster pipeline run.

\n
\n
Parameters
\n

run_id (str) \u2013 The pipeline run id

\n
\n
\n
\n\n
\n
\nclass dagster.FloatMetadataValue(value)[source]
\n

Container class for float metadata entry data.

\n
\n
Parameters
\n

value (Optional[float]) \u2013 The float value.

\n
\n
\n
\n\n
\n
\nclass dagster.IntMetadataValue(value)[source]
\n

Container class for int metadata entry data.

\n
\n
Parameters
\n

value (Optional[int]) \u2013 The int value.

\n
\n
\n
\n\n
\n
\nclass dagster.JsonMetadataValue(data)[source]
\n

Container class for JSON metadata entry data.

\n
\n
Parameters
\n

data (Dict[str, Any]) \u2013 The JSON data.

\n
\n
\n
\n\n
\n
\nclass dagster.MarkdownMetadataValue(md_str)[source]
\n

Container class for markdown metadata entry data.

\n
\n
Parameters
\n

md_str (Optional[str]) \u2013 The markdown as a string.

\n
\n
\n
\n\n
\n
\nclass dagster.PathMetadataValue(path)[source]
\n

Container class for path metadata entry data.

\n
\n
Parameters
\n

path (Optional[str]) \u2013 The path as a string or conforming to os.PathLike.

\n
\n
\n
\n\n
\n
\nclass dagster.PythonArtifactMetadataValue(module, name)[source]
\n

Container class for python artifact metadata entry data.

\n
\n
Parameters
\n
    \n
  • module (str) \u2013 The module where the python artifact can be found

  • \n
  • name (str) \u2013 The name of the python artifact

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.TableMetadataValue(records, schema)[source]
\n

Container class for table metadata entry data.

\n
\n
Parameters
\n
    \n
  • records (TableRecord) \u2013 The data as a list of records (i.e. rows).

  • \n
  • schema (Optional[TableSchema]) \u2013 A schema for the table.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.TableSchemaMetadataValue(schema)[source]
\n

Representation of a schema for arbitrary tabular data.

\n
\n
Parameters
\n

schema (TableSchema) \u2013 The dictionary containing the schema representation.

\n
\n
\n
\n\n
\n
\nclass dagster.TextMetadataValue(text)[source]
\n

Container class for text metadata entry data.

\n
\n
Parameters
\n

text (Optional[str]) \u2013 The text data.

\n
\n
\n
\n\n
\n
\nclass dagster.UrlMetadataValue(url)[source]
\n

Container class for URL metadata entry data.

\n
\n
Parameters
\n

url (Optional[str]) \u2013 The URL as a string.

\n
\n
\n
\n\n
\n
\n
\n

Asset key\u00b6

\n

Dagster uses AssetKey to build an index on Materialization events.\nAssets materialized with an AssetKey are highlighted in dagit on the Assets\ndashboard.

\n
\n
\nclass dagster.AssetKey(path)[source]
\n

Object representing the structure of an asset key. Takes in a sanitized string, list of\nstrings, or tuple of strings.

\n

Example usage:

\n
from dagster import op\n\n@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key=AssetKey('flat_asset_key'),\n        metadata={"text_metadata": "Text-based metadata for this event"},\n    )\n\n@op\ndef structured_asset_key(context, df):\n    yield AssetMaterialization(\n        asset_key=AssetKey(['parent', 'child', 'grandchild']),\n        metadata={"text_metadata": "Text-based metadata for this event"},\n    )\n\n@op\ndef structured_asset_key_2(context, df):\n    yield AssetMaterialization(\n        asset_key=AssetKey(('parent', 'child', 'grandchild')),\n        metadata={"text_metadata": "Text-based metadata for this event"},\n    )\n
\n
\n
\n
Parameters
\n

path (Sequence[str]) \u2013 String, list of strings, or tuple of strings. A list of strings\nrepresent the hierarchical structure of the asset_key.

\n
\n
\n
\n\n
\n
\n
\n", "current_page_name": "sections/api/apidocs/solids", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../dynamic/", "title": "Dynamic Mapping & Collect"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../schedules-sensors/", "title": "Run Requests"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/dynamic", "Dynamic Mapping & Collect", "N", "next"], ["sections/api/apidocs/schedules-sensors", "Run Requests", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/solids.rst.txt", "title": "[Legacy] Solids", "toc": "\n"}, "types": {"alabaster_version": "0.7.12", "body": "
\n

Types\u00b6

\n

Dagster includes facilities for typing the input and output values of ops (\u201cruntime\u201d types).

\n
\n

Built-in primitive types\u00b6

\n
\n
\ndagster.Any\u00b6
\n

Use this type for any input, output, or config field whose type is unconstrained

\n

All values are considered to be instances of Any.

\n

Examples:

\n
@op\ndef identity(_, x: Any) -> Any:\n    return x\n\n# Untyped inputs and outputs are implicitly typed Any\n@op\ndef identity_imp(_, x):\n    return x\n\n# Explicitly typed\n@op(\n    ins={'x': In(dagster_type=Any)},\n    out=Out(dagster_type=Any),\n)\ndef identity(_, x):\n    return x\n\n@op(config_schema=Field(Any))\ndef any_config(context):\n    return context.op_config\n
\n
\n
\n\n
\n
\ndagster.Bool\u00b6
\n

Use this type for any boolean input, output, or config field. At runtime, this will perform an\nisinstance(value, bool) check. You may also use the ordinary bool\ntype as an alias.

\n

Examples:

\n
@op\ndef boolean(_, x: Bool) -> String:\n    return 'true' if x else 'false'\n\n@op\ndef empty_string(_, x: String) -> bool:\n    return len(x) == 0\n\n# Explicit\n@op(\n    ins={'x': In(dagster_type=Bool)},\n    out=Out(dagster_type=String),\n)\ndef boolean(_, x):\n    return 'true' if x else 'false'\n\n@op(\n    ins={'x': In(dagster_type=String)},\n    out=Out(dagster_type=bool),\n)\ndef empty_string(_, x):\n    return len(x) == 0\n\n@op(config_schema=Field(Bool))\ndef bool_config(context):\n    return 'true' if context.op_config else 'false'\n
\n
\n
\n\n
\n
\ndagster.Int\u00b6
\n

Use this type for any integer input or output. At runtime, this will perform an\nisinstance(value, int) check. You may also use the ordinary int\ntype as an alias.

\n

Examples:

\n
@op\ndef add_3(_, x: Int) -> int:\n    return x + 3\n\n# Explicit\n@op(\n    ins={'x': In(dagster_type=Int)},\n    out=Out(dagster_type=Int),\n)\ndef add_3(_, x):\n    return x + 3\n
\n
\n
\n\n
\n
\ndagster.Float\u00b6
\n

Use this type for any float input, output, or config value. At runtime, this will perform an\nisinstance(value, float) check. You may also use the ordinary float\ntype as an alias.

\n

Examples:

\n
@op\ndef div_2(_, x: Float) -> float:\n    return x / 2\n\n# Explicit\n@op(\n    ins={'x': In(dagster_type=Float)},\n    out=Out(dagster_type=float),\n)\ndef div_2(_, x):\n    return x / 2\n\n@op(config_schema=Field(Float))\ndef div_y(context, x: Float) -> float:\n    return x / context.op_config\n
\n
\n
\n\n
\n
\ndagster.String\u00b6
\n

Use this type for any string input, output, or config value. At runtime, this will perform an\nisinstance(value, str) check. You may also use the ordinary str type\nas an alias.

\n

Examples:

\n
@op\ndef concat(_, x: String, y: str) -> str:\n    return x + y\n\n# Explicit\n@op(\n    ins= {\n        'x': In(dagster_type=String),\n        'y': In(dagster_type=str),\n    },\n    out= Out(dagster_type=str),\n)\ndef concat(_, x, y):\n    return x + y\n\n@op(config_schema=Field(String))\ndef hello(context) -> str:\n    return 'Hello, {friend}!'.format(friend=context.op_config)\n
\n
\n
\n\n
\n
\ndagster.Nothing\u00b6
\n

Use this type only for inputs and outputs, in order to establish an execution dependency without\ncommunicating a value. Inputs of this type will not be passed to the op compute function, so\nit is necessary to use the explicit InputDefinition API to define them rather than\nthe Python 3 type hint syntax.

\n

All values are considered to be instances of Nothing.

\n

Examples:

\n
@op\ndef wait(_) -> Nothing:\n    time.sleep(1)\n    return\n\n@op(\n    ins={"ready": In(dagster_type=Nothing)},\n)\ndef done(_) -> str:\n    return 'done'\n\n@job\ndef nothing_job():\n    done(wait())\n\n# Any value will pass the type check for Nothing\n@op\ndef wait_int(_) -> Int:\n    time.sleep(1)\n    return 1\n\n@job\ndef nothing_int_job():\n    done(wait_int())\n
\n
\n
\n\n
\n
\ndagster.Optional\u00b6
\n

Use this type only for inputs and outputs, if the value can also be None.

\n

Examples:

\n
@op\ndef nullable_concat(_, x: str, y: Optional[str]) -> str:\n    return x + (y or '')\n\n# Explicit\n@op(\n    ins={\n        'x': In(String),\n        'y': In(Optional[String]),\n    },\n    out=Out(String),\n)\ndef nullable_concat(_, x, y):\n    return x + (y or '')\n
\n
\n
\n\n
\n
\ndagster.List\u00b6
\n

Use this type for inputs or outputs.

\n

Lists are also the appropriate input types when fanning in multiple outputs using a\nMultiDependencyDefinition or the equivalent composition function syntax.

\n

Examples:

\n
@op\ndef concat_list(_, xs: List[str]) -> str:\n    return ''.join(xs)\n\n# Explicit\n@op(\n    ins={'xs': In(dagster_type=List[str])},\n    out=Out(dagster_type=String),\n)\ndef concat_list(_, xs) -> str:\n    return ''.join(xs)\n\n# Fanning in multiple outputs\n@op\ndef emit_1(_) -> int:\n    return 1\n\n@op\ndef emit_2(_) -> int:\n    return 2\n\n@op\ndef emit_3(_) -> int:\n    return 3\n\n@op\ndef sum_op(_, xs: List[int]) -> int:\n    return sum(xs)\n\n@job\ndef sum_job():\n    sum_op([emit_1(), emit_2(), emit_3()])\n
\n
\n
\n\n
\n
\ndagster.Dict\u00b6
\n

Use this type for inputs or outputs that are dicts.

\n

For Ins and Outs, you must specify the key and value types using the square\nbrackets syntax for Python typing.

\n

Examples:

\n
@op\ndef repeat(_, spec: Dict) -> str:\n    return spec['word'] * spec['times']\n\n# Explicit\n@op(\n    ins={'spec': In(Dict[String, String])},\n    out=Out(String),\n)\ndef repeat(_, spec):\n    return spec['word'] * spec['times']\n
\n
\n
\n\n
\n
\ndagster.Set\u00b6
\n

Use this type for inputs or outputs that are sets. Alias for\ntyping.Set.

\n

You may optionally specify the inner type using the square brackets syntax for Python typing.

\n

Examples:

\n
@op\ndef set_op(_, set_input: Set[String]) -> List[str]:\n    return sorted([x for x in set_input])\n\n# Explicit\n@op(\n    ins={"set_input": In(dagster_type=Set[String])},\n    out=Out(List[String]),\n)\ndef set_op(_, set_input):\n    return sorted([x for x in set_input])\n
\n
\n
\n\n
\n
\ndagster.Tuple\u00b6
\n

Use this type for inputs or outputs that are tuples. Alias for\ntyping.Tuple.

\n

You may optionally specify the inner types using the square brackets syntax for Python typing.

\n

Config values should be passed as a list (in YAML or the Python config dict).

\n

Examples:

\n
@op\ndef tuple_op(_, tuple_input: Tuple[str, int, float]) -> List:\n    return [x for x in tuple_input]\n\n# Explicit\n@op(\n    ins={'tuple_input': In(dagster_type=Tuple[String, Int, Float])},\n    out=Out(List),\n)\ndef tuple_op(_, tuple_input):\n    return [x for x in tuple_input]\n
\n
\n
\n\n
\n
\nclass dagster.FileHandle[source]\u00b6
\n

A reference to a file as manipulated by a FileManager

\n

Subclasses may handle files that are resident on the local file system, in an object store, or\nin any arbitrary place where a file can be stored.

\n

This exists to handle the very common case where you wish to write a computation that reads,\ntransforms, and writes files, but where you also want the same code to work in local development\nas well as on a cluster where the files will be stored in a globally available object store\nsuch as S3.

\n
\n
\nabstract property path_desc\u00b6
\n

A representation of the file path for display purposes only.

\n
\n\n
\n\n
\n
\nclass dagster.LocalFileHandle(path)[source]\u00b6
\n

A reference to a file on a local filesystem.

\n
\n\n
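For example, a minimal sketch of constructing a LocalFileHandle and inspecting its display path (the path here is hypothetical):
\n
from dagster import LocalFileHandle\n\nhandle = LocalFileHandle("/tmp/example.csv")\n# path_desc is a display-only representation of where the file lives.\nprint(handle.path_desc)\n
\n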
\n
\n

Making New Types\u00b6

\n
\n
\nclass dagster.DagsterType(type_check_fn, key=None, name=None, is_builtin=False, description=None, loader=None, materializer=None, required_resource_keys=None, kind=<DagsterTypeKind.REGULAR: 'REGULAR'>, typing_type=None, metadata_entries=None, metadata=None)[source]\u00b6
\n

Define a type in dagster. These can be used in the inputs and outputs of ops.

\n
\n
Parameters
\n
    \n
  • type_check_fn (Callable[[TypeCheckContext, Any], [Union[bool, TypeCheck]]]) \u2013 The function that defines the type check. It takes the value flowing\nthrough the input or output of the op. If it passes, return either\nTrue or a TypeCheck with success set to True. If it fails,\nreturn either False or a TypeCheck with success set to False.\nThe first argument must be named context (or, if unused, _, _context, or context_).\nUse required_resource_keys for access to resources.

  • \n
  • key (Optional[str]) \u2013

    The unique key to identify types programmatically.\nThe key property always has a value. If you omit the key argument\nto the init function, it instead receives the value of name. If\nneither key nor name is provided, a CheckError is thrown.

    \n

    In the case of a generic type such as List or Optional, this is\ngenerated programmatically based on the type parameters.

    \n

    For most use cases, name should be set and the key argument should\nnot be specified.

    \n

  • \n
  • name (Optional[str]) \u2013 A unique name given by a user. If key is None, key\nbecomes this value. Name is not given in a case where the user does\nnot specify a unique name for this type, such as a generic class.

  • \n
  • description (Optional[str]) \u2013 A markdown-formatted string, displayed in tooling.

  • \n
  • loader (Optional[DagsterTypeLoader]) \u2013 An instance of a class that\ninherits from DagsterTypeLoader and can map config data to a value of\nthis type. Specify this argument if you will need to shim values of this type using the\nconfig machinery. As a rule, you should use the\n@dagster_type_loader decorator to construct\nthese arguments.

  • \n
  • materializer (Optional[DagsterTypeMaterializer]) \u2013 An instance of a class\nthat inherits from DagsterTypeMaterializer and can persist values of\nthis type. As a rule, you should use the\n@dagster_type_materializer\ndecorator to construct these arguments.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Resource keys required by the type_check_fn.

  • \n
  • is_builtin (bool) \u2013 Defaults to False. This is used by tools to display or\nfilter built-in types (such as String, Int) to visually distinguish\nthem from user-defined types. Meant for internal use.

  • \n
  • kind (DagsterTypeKind) \u2013 Defaults to None. This is used to determine the kind of runtime type\nfor InputDefinition and OutputDefinition type checking.

  • \n
  • typing_type \u2013 Defaults to None. A valid python typing type (e.g. Optional[List[int]]) for the\nvalue contained within the DagsterType. Meant for internal use.

  • \n
\n
\n
\n
\n\n
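For example, a minimal sketch of defining a DagsterType and using it to type an op input (the type name and op here are hypothetical):
\n
from dagster import DagsterType, In, op\n\nEvenDagsterType = DagsterType(\n    name="EvenDagsterType",\n    type_check_fn=lambda _context, value: isinstance(value, int) and value % 2 == 0,\n)\n\n@op(ins={"num": In(dagster_type=EvenDagsterType)})\ndef halve(num):\n    return num // 2\n
\n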
\n
\ndagster.PythonObjectDagsterType(python_type, key=None, name=None, **kwargs)[source]\u00b6
\n

Define a type in dagster whose typecheck is an isinstance check.

\n

Specifically, the type can either be a single python type (e.g. int),\nor a tuple of types (e.g. (int, float)) which is treated as a union.

\n

Examples

\n
ntype = PythonObjectDagsterType(python_type=int)\nassert ntype.name == 'int'\nassert_success(ntype, 1)\nassert_failure(ntype, 'a')\n
\n
\n
ntype = PythonObjectDagsterType(python_type=(int, float))\nassert ntype.name == 'Union[int, float]'\nassert_success(ntype, 1)\nassert_success(ntype, 1.5)\nassert_failure(ntype, 'a')\n
\n
\n
\n
Parameters
\n
    \n
  • python_type (Union[Type, Tuple[Type, ...]]) \u2013 The dagster typecheck function calls isinstance on\nthis type.

  • \n
  • name (Optional[str]) \u2013 Name the type. Defaults to the name of python_type.

  • \n
  • key (Optional[str]) \u2013 Key of the type. Defaults to name.

  • \n
  • description (Optional[str]) \u2013 A markdown-formatted string, displayed in tooling.

  • \n
  • loader (Optional[DagsterTypeLoader]) \u2013 An instance of a class that\ninherits from DagsterTypeLoader and can map config data to a value of\nthis type. Specify this argument if you will need to shim values of this type using the\nconfig machinery. As a rule, you should use the\n@dagster_type_loader decorator to construct\nthese arguments.

  • \n
  • materializer (Optional[DagsterTypeMaterializer]) \u2013 An instance of a class\nthat inherits from DagsterTypeMaterializer and can persist values of\nthis type. As a rule, you should use the\n@dagster_type_materializer\ndecorator to construct these arguments.

  • \n
\n
\n
\n
\n\n
\n
\ndagster.dagster_type_loader(config_schema, required_resource_keys=None, loader_version=None, external_version_fn=None)[source]\u00b6
\n

Create a dagster type loader that maps config data to a runtime value.

\n

The decorated function should take the execution context and parsed config value and return the\nappropriate runtime value.

\n
\n
Parameters
\n
    \n
  • config_schema (ConfigSchema) \u2013 The schema for the config that\u2019s passed to the decorated\nfunction.

  • \n
  • loader_version (str) \u2013 (Experimental) The version of the decorated compute function. Two\nloading functions should have the same version if and only if they deterministically\nproduce the same outputs when provided the same inputs.

  • \n
  • external_version_fn (Callable) \u2013 (Experimental) A function that takes in the same parameters as the loader\nfunction (config_value) and returns a representation of the version of the external\nasset (str). Two external assets with identical versions are treated as identical to one\nanother.

  • \n
\n
\n
\n

Examples:

\n
@dagster_type_loader(Permissive())\ndef load_dict(_context, value):\n    return value\n
\n
\n
\n\n
\n
\nclass dagster.DagsterTypeLoader[source]\u00b6
\n

Dagster type loaders are used to load unconnected inputs of the dagster type they are attached\nto.

\n

The recommended way to define a type loader is with the\n@dagster_type_loader decorator.

\n
\n\n
\n
\ndagster.dagster_type_materializer(config_schema, required_resource_keys=None)[source]\u00b6
\n

Create an output materialization hydration config that configurably materializes a runtime\nvalue.

\n

The decorated function should take the execution context, the parsed config value, and the\nruntime value. It should materialize the runtime value, and should\nreturn an appropriate AssetMaterialization.

\n
\n
Parameters
\n

config_schema (object) \u2013 The type of the config data expected by the decorated function.

\n
\n
\n

Examples:

\n
import csv\n\nfrom dagster import AssetMaterialization, dagster_type_materializer\n\n\n# Takes a config value (a path to write to) and a runtime value (a list of dicts, such as\n# might be read in using csv.DictReader), and writes the dicts to the path as a CSV file.\n@dagster_type_materializer(str)\ndef materialize_df(_context, path, value):\n    with open(path, 'w') as fd:\n        writer = csv.DictWriter(fd, fieldnames=value[0].keys())\n        writer.writeheader()\n        writer.writerows(rowdicts=value)\n\n    return AssetMaterialization.file(path)\n
\n
\n
\n\n
\n
\nclass dagster.DagsterTypeMaterializer[source]\u00b6
\n

Dagster type materializers are used to materialize outputs of the dagster type they are attached\nto.

\n

The recommended way to define a type materializer is with the\n@dagster_type_materializer decorator.

\n
\n\n
\n
\ndagster.usable_as_dagster_type(name=None, description=None, loader=None, materializer=None)[source]\u00b6
\n

Decorate a Python class to make it usable as a Dagster Type.

\n

This is intended to make it straightforward to annotate existing business logic classes to\nmake them dagster types whose typecheck is an isinstance check against that python class.

\n
\n
Parameters
\n
    \n
  • python_type (cls) \u2013 The python type to make usable as a dagster type.

  • \n
  • name (Optional[str]) \u2013 Name of the new Dagster type. If None, the name (__name__) of\nthe python_type will be used.

  • \n
  • description (Optional[str]) \u2013 A user-readable description of the type.

  • \n
  • loader (Optional[DagsterTypeLoader]) \u2013 An instance of a class that\ninherits from DagsterTypeLoader and can map config data to a value of\nthis type. Specify this argument if you will need to shim values of this type using the\nconfig machinery. As a rule, you should use the\n@dagster_type_loader decorator to construct\nthese arguments.

  • \n
  • materializer (Optional[DagsterTypeMaterializer]) \u2013 An instance of a class\nthat inherits from DagsterTypeMaterializer and can persist values of\nthis type. As a rule, you should use the\n@dagster_type_materializer\ndecorator to construct these arguments.

  • \n
\n
\n
\n

Examples:

\n
# dagster_aws.s3.file_manager.S3FileHandle\n@usable_as_dagster_type\nclass S3FileHandle(FileHandle):\n    def __init__(self, s3_bucket, s3_key):\n        self._s3_bucket = check.str_param(s3_bucket, 's3_bucket')\n        self._s3_key = check.str_param(s3_key, 's3_key')\n\n    @property\n    def s3_bucket(self):\n        return self._s3_bucket\n\n    @property\n    def s3_key(self):\n        return self._s3_key\n\n    @property\n    def path_desc(self):\n        return self.s3_path\n\n    @property\n    def s3_path(self):\n        return 's3://{bucket}/{key}'.format(bucket=self.s3_bucket, key=self.s3_key)\n
\n
\n
\n\n
\n
\ndagster.make_python_type_usable_as_dagster_type(python_type, dagster_type)[source]\u00b6
\n

Take any existing python type and map it to a dagster type (generally created with\nDagsterType). This can only be called once\non a given python type.

\n
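
Example (a minimal sketch; MyBusinessObject is an illustrative class, not part of the API):

\n
from dagster import PythonObjectDagsterType, make_python_type_usable_as_dagster_type\n\n# Illustrative business-logic class.\nclass MyBusinessObject:\n    pass\n\nMyBusinessObjectDagsterType = PythonObjectDagsterType(MyBusinessObject)\nmake_python_type_usable_as_dagster_type(MyBusinessObject, MyBusinessObjectDagsterType)\n
\n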
\n\n
\n

Testing Types\u00b6

\n
\n
\ndagster.check_dagster_type(dagster_type, value)[source]\u00b6
\n

Test a custom Dagster type.

\n
\n
Parameters
\n
    \n
  • dagster_type (Any) \u2013 The Dagster type to test. Should be one of the\nbuilt-in types, a dagster type explicitly constructed with\nas_dagster_type(), @usable_as_dagster_type, or\nPythonObjectDagsterType(), or a Python type.

  • \n
  • value (Any) \u2013 The runtime value to test.

  • \n
\n
\n
Returns
\n

The result of the type check.

\n
\n
Return type
\n

TypeCheck

\n
\n
\n

Examples

\n
assert check_dagster_type(Dict[Any, Any], {'foo': 'bar'}).success\n
\n
\n
\n\n
\n
\n
\n", "current_page_name": "sections/api/apidocs/types", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "", "next": {"link": "../utilities/", "title": "Utilities"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dynamic/", "title": "Dynamic Mapping & Collect"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/utilities", "Utilities", "N", "next"], ["sections/api/apidocs/dynamic", "Dynamic Mapping & Collect", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/types.rst.txt", "title": "Types", "toc": "\n"}, "utilities": {"alabaster_version": "0.7.12", "body": "
\n

Utilities\u00b6

\n
\n
\ndagster.file_relative_path(dunderfile, relative_path)[source]\u00b6
\n

Get a path relative to the currently executing Python file.

\n

This function is useful when one needs to load a file that is relative to the position of\nthe current file, such as when you encode a configuration file path in a source file and want\nit to be runnable from any current working directory.

\n
\n
Parameters
\n
    \n
  • dunderfile (str) \u2013 Should always be __file__.

  • \n
  • relative_path (str) \u2013 Path to get relative to the currently executing file.

  • \n
\n
\n
\n

Examples:

\n
file_relative_path(__file__, 'path/relative/to/file')\n
\n
\n
\n\n
\n
\ndagster.config_from_files(config_files)[source]\u00b6
\n

Constructs run config from YAML files.

\n
\n
Parameters
\n

config_files (List[str]) \u2013 List of paths or glob patterns for yaml files\nto load and parse as the run config.

\n
\n
Returns
\n

A run config dictionary constructed from provided YAML files.

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
Raises
\n

DagsterInvariantViolationError \u2013 When one of the YAML documents is invalid and has a\n    parse error.

\n
\n
\n
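
Example (a minimal sketch; the YAML file names are illustrative):

\n
from dagster import config_from_files, file_relative_path\n\n# Illustrative paths: a base config plus any matching override files.\nrun_config = config_from_files(\n    [\n        file_relative_path(__file__, "run_config/base.yaml"),\n        file_relative_path(__file__, "run_config/overrides_*.yaml"),\n    ],\n)\n
\n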
\n\n
\n
\ndagster.config_from_pkg_resources(pkg_resource_defs)[source]\u00b6
\n

Load a run config from a package resource, using pkg_resources.resource_string().

\n

Example:

\n
config_from_pkg_resources(\n    pkg_resource_defs=[\n        ('dagster_examples.airline_demo.environments', 'local_base.yaml'),\n        ('dagster_examples.airline_demo.environments', 'local_warehouse.yaml'),\n    ],\n)\n
\n
\n
\n
Parameters
\n

pkg_resource_defs (List[(str, str)]) \u2013 List of pkg_resource modules/files to\nload as the run config.

\n
\n
Returns
\n

A run config dictionary constructed from the provided yaml strings

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
Raises
\n

DagsterInvariantViolationError \u2013 When one of the YAML documents is invalid and has a\n parse error.

\n
\n
\n
\n\n
\n
\ndagster.config_from_yaml_strings(yaml_strings)[source]\u00b6
\n

Static constructor for run configs from YAML strings.

\n
\n
Parameters
\n

yaml_strings (List[str]) \u2013 List of yaml strings to parse as the run config.

\n
\n
Returns
\n

A run config dictionary constructed from the provided yaml strings

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
Raises
\n

DagsterInvariantViolationError \u2013 When one of the YAML documents is invalid and has a\n parse error.

\n
\n
\n
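
Example (a minimal sketch; my_op and its config key are illustrative):

\n
from dagster import config_from_yaml_strings\n\n# Illustrative YAML document supplying config for an op named my_op.\nrun_config = config_from_yaml_strings(\n    [\n        "ops: {my_op: {config: {param: 5}}}",\n    ],\n)\n
\n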
\n\n
\n
\ndagster.get_dagster_logger(name=None)[source]\u00b6
\n

Creates a python logger whose output messages will be captured and converted into Dagster log\nmessages. This means they will have structured information such as the step_key, run_id, etc.\nembedded into them, and will show up in the Dagster event log.

\n

This can be used as a more convenient alternative to context.log in most cases. If log level\nis not set explicitly, defaults to DEBUG.

\n
\n
Parameters
\n

name (Optional[str]) \u2013 If supplied, will create a logger with the name \u201cdagster.builtin.{name}\u201d,\nwith properties inherited from the base Dagster logger. If omitted, the returned logger\nwill be named \u201cdagster.builtin\u201d.

\n
\n
Returns
\n

A logger whose output will be captured by Dagster.

\n
\n
Return type
\n

logging.Logger

\n
\n
\n

Example

\n
from dagster import get_dagster_logger, op\n\n@op\ndef hello_op():\n    log = get_dagster_logger()\n    for i in range(5):\n        # do something\n        log.info(f"Did {i+1} things!")\n
\n
\n
\n\n
\n
\nclass dagster.ExperimentalWarning[source]\u00b6
\n
\n\n
\n
\nclass dagster.utils.forked_pdb.ForkedPdb(completekey='tab', stdin=None, stdout=None, skip=None, nosigint=False, readrc=True)[source]\u00b6
\n

A pdb subclass that may be used from a forked multiprocessing child

\n

Examples:

\n
from dagster import solid\nfrom dagster.utils.forked_pdb import ForkedPdb\n\n@solid\ndef complex_solid(_):\n    # some complicated stuff\n\n    ForkedPdb().set_trace()\n\n    # some other complicated stuff\n
\n
\n

You can initiate pipeline execution via dagit and use the pdb debugger to examine/step through\nexecution at the breakpoint.

\n
\n\n
\n
\ndagster.utils.make_email_on_run_failure_sensor(*args, **kwargs)[source]\u00b6
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/utilities", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "", "next": {"link": "../memoization/", "title": "Versioning and Memoization"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../types/", "title": "Types"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/memoization", "Versioning and Memoization", "N", "next"], ["sections/api/apidocs/types", "Types", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/utilities.rst.txt", "title": "Utilities", "toc": "\n"}}}} \ No newline at end of file +{"api": {"apidocs": {"assets": {"alabaster_version": "0.7.12", "body": "
\n

Software-Defined Assets (Experimental)\u00b6

\n

Software-defined assets sit on top of the graph/job/op APIs and enable a novel way of constructing Dagster jobs that puts assets at the forefront.

\n

Conceptually, software-defined assets invert the typical relationship between assets and computation. Instead of defining a graph of ops and recording which assets those ops end up materializing, you define a set of assets, each of which knows how to compute its contents from upstream assets.

\n

A software-defined asset combines:\n- An asset key, e.g. the name of a table.\n- A function, which can be run to compute the contents of the asset.\n- A set of upstream assets that are provided as inputs to the function when computing the asset.

\n
\n
\n@dagster.asset(name=None, namespace=None, ins=None, non_argument_deps=None, metadata=None, description=None, required_resource_keys=None, resource_defs=None, io_manager_def=None, io_manager_key=None, compute_kind=None, dagster_type=None, partitions_def=None, partition_mappings=None, op_tags=None)[source]\u00b6
\n

Create a definition for how to compute an asset.

\n

A software-defined asset is the combination of:\n1. An asset key, e.g. the name of a table.\n2. A function, which can be run to compute the contents of the asset.\n3. A set of upstream assets that are provided as inputs to the function when computing the asset.

\n

Unlike an op, whose dependencies are determined by the graph it lives inside, an asset knows\nabout the upstream assets it depends on. The upstream assets are inferred from the arguments\nto the decorated function. The name of the argument designates the name of the upstream asset.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name of the asset. If not provided, defaults to the name of the\ndecorated function.

  • \n
  • namespace (Optional[Sequence[str]]) \u2013 The namespace that the asset resides in. The namespace + the\nname forms the asset key.

  • \n
  • ins (Optional[Mapping[str, AssetIn]]) \u2013 A dictionary that maps input names to their metadata\nand namespaces.

  • \n
  • non_argument_deps (Optional[Union[Set[AssetKey], Set[str]]]) \u2013 Set of asset keys that are\nupstream dependencies, but do not pass an input to the asset.

  • \n
  • metadata (Optional[Dict[str, Any]]) \u2013 A dict of metadata entries for the asset.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by the op.

  • \n
  • io_manager_key (Optional[str]) \u2013 The resource key of the IOManager used\nfor storing the output of the op as an asset, and for loading it in downstream ops (default: \u201cio_manager\u201d). Only one of io_manager_key and io_manager_def can be provided.

  • \n
  • io_manager_def (Optional[IOManagerDefinition]) \u2013 The definition of the IOManager used for\nstoring the output of the op as an asset, and for loading it in\ndownstream ops. Only one of io_manager_def and io_manager_key can be provided.

  • \n
  • compute_kind (Optional[str]) \u2013 A string to represent the kind of computation that produces\nthe asset, e.g. \u201cdbt\u201d or \u201cspark\u201d. It will be displayed in Dagit as a badge on the asset.

  • \n
  • dagster_type (Optional[DagsterType]) \u2013 Allows specifying type validation functions that\nwill be executed on the output of the decorated function after it runs.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines the set of partition keys that\ncompose the asset.

  • \n
  • partition_mappings (Optional[Mapping[str, PartitionMapping]]) \u2013 Defines how to map partition\nkeys for this asset to partition keys of upstream assets. Each key in the dictionary\ncorresponds to one of the input assets, and each value is a PartitionMapping.\nIf no entry is provided for a particular asset dependency, the partition mapping defaults\nto the default partition mapping for the partitions definition, which typically maps\npartition keys to the same partition keys in upstream assets.

  • \n
  • op_tags (Optional[Dict[str, Any]]) \u2013 A dictionary of tags for the op that computes the asset.\nFrameworks may expect and require certain metadata to be attached to an op. Values that\nare not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value.

  • \n
\n
\n
\n

Examples

\n
@asset\ndef my_asset(my_upstream_asset: int) -> int:\n    return my_upstream_asset + 1\n
\n
\n
\n\n
\n
\nclass dagster.AssetGroup(assets, source_assets=None, resource_defs=None, executor_def=None)[source]\u00b6
\n

Defines a group of assets, along with environment information in the\nform of resources and an executor.

\n

An AssetGroup can be provided to a RepositoryDefinition. When\nprovided to a repository, the constituent assets can be materialized from\nDagit. The AssetGroup also provides an interface for creating jobs from\nsubselections of assets, which can then be provided to a\nScheduleDefinition or SensorDefinition.

\n

There can only be one AssetGroup per repository.

\n
\n
Parameters
\n
    \n
  • assets (Sequence[AssetsDefinition]) \u2013 The set of software-defined assets\nto group.

  • \n
  • source_assets (Optional[Sequence[SourceAsset]]) \u2013 The set of source\nassets that the software-defined assets may depend on.

  • \n
  • resource_defs (Optional[Mapping[str, ResourceDefinition]]) \u2013 A\ndictionary of resource definitions. When the AssetGroup is\nconstructed, if there are any unsatisfied resource requirements\nfrom the assets, it will result in an error. Note that the\nroot_manager key is a reserved resource key, and will result in\nan error if provided by the user.

  • \n
  • executor_def (Optional[ExecutorDefinition]) \u2013 The executor definition to\nuse when re-materializing assets in this group.

  • \n
\n
\n
\n

Examples

\n
from dagster import AssetGroup, asset, AssetIn, AssetKey, SourceAsset, resource\n\nsource_asset = SourceAsset("source")\n\n@asset(required_resource_keys={"foo"})\ndef start_asset(context, source):\n    ...\n\n@asset\ndef next_asset(start_asset):\n    ...\n\n@resource\ndef foo_resource():\n    ...\n\nasset_group = AssetGroup(\n    assets=[start_asset, next_asset],\n    source_assets=[source_asset],\n    resource_defs={"foo": foo_resource},\n)\n...\n
\n
\n
\n
\nbuild_job(name, selection=None, executor_def=None, tags=None, description=None, _asset_selection_data=None)[source]\u00b6
\n

Defines an executable job from the provided assets, resources, and executor.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name to give the job.

  • \n
  • selection (Union[str, List[str]]) \u2013

    A single selection query or list of selection queries\nto execute. For example:

    \n
    \n
      \n
    • ['some_asset_key'] select some_asset_key itself.

    • \n
    • ['*some_asset_key'] select some_asset_key and all its ancestors (upstream dependencies).

    • \n
    • ['*some_asset_key+++'] select some_asset_key, all its ancestors, and its descendants (downstream dependencies) within 3 levels down.

    • \n
    • ['*some_asset_key', 'other_asset_key_a', 'other_asset_key_b+'] select some_asset_key and all its ancestors, other_asset_key_a itself, and other_asset_key_b and its direct child asset keys. When subselecting into a multi-asset, all of the asset keys in that multi-asset must be selected.

    • \n
    \n
    \n

  • \n
  • executor_def (Optional[ExecutorDefinition]) \u2013 The executor\ndefinition to use when executing the job. Defaults to the\nexecutor on the AssetGroup. If no executor was provided on the\nAssetGroup, then it defaults to multi_or_in_process_executor.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for any execution of the job.\nValues that are not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value. These tag values may be overwritten by\ntag values provided at invocation time.

  • \n
  • description (Optional[str]) \u2013 A description of the job.

  • \n
\n
\n
\n

Examples

\n
from dagster import AssetGroup\n\nthe_asset_group = AssetGroup(...)\n\njob_with_all_assets = the_asset_group.build_job()\n\njob_with_one_selection = the_asset_group.build_job(selection="some_asset")\n\njob_with_multiple_selections = the_asset_group.build_job(selection=["*some_asset", "other_asset++"])\n
\n
\n
\n\n
\n
\nstatic from_current_module(resource_defs=None, executor_def=None, extra_source_assets=None)[source]\u00b6
\n

Constructs an AssetGroup that includes all asset definitions and source assets in the module\nwhere this is called from.

\n
\n
Parameters
\n
    \n
  • resource_defs (Optional[Mapping[str, ResourceDefinition]]) \u2013 A dictionary of resource\ndefinitions to include on the returned asset group.

  • \n
  • executor_def (Optional[ExecutorDefinition]) \u2013 An executor to include on the returned\nasset group.

  • \n
  • extra_source_assets (Optional[Sequence[SourceAsset]]) \u2013 Source assets to include in the\ngroup in addition to the source assets found in the module.

  • \n
\n
\n
Returns
\n

An asset group with all the assets defined in the module.

\n
\n
Return type
\n

AssetGroup

\n
\n
\n
\n\n
\n
\nstatic from_modules(modules, resource_defs=None, executor_def=None, extra_source_assets=None)[source]\u00b6
\n

Constructs an AssetGroup that includes all asset definitions and source assets in the given\nmodules.

\n
\n
Parameters
\n
    \n
  • modules (Iterable[ModuleType]) \u2013 The Python modules to look for assets inside.

  • \n
  • resource_defs (Optional[Mapping[str, ResourceDefinition]]) \u2013 A dictionary of resource\ndefinitions to include on the returned asset group.

  • \n
  • executor_def (Optional[ExecutorDefinition]) \u2013 An executor to include on the returned\nasset group.

  • \n
  • extra_source_assets (Optional[Sequence[SourceAsset]]) \u2013 Source assets to include in the\ngroup in addition to the source assets found in the modules.

  • \n
\n
\n
Returns
\n

An asset group with all the assets defined in the given modules.

\n
\n
Return type
\n

AssetGroup

\n
\n
\n
\n\n
\n
\nstatic from_package_module(package_module, resource_defs=None, executor_def=None, extra_source_assets=None)[source]\u00b6
\n

Constructs an AssetGroup that includes all asset definitions and source assets in all\nsub-modules of the given package module.

\n

A package module is the result of importing a package.

\n
\n
Parameters
\n
    \n
  • package_module (ModuleType) \u2013 The package module to look for assets inside.

  • \n
  • resource_defs (Optional[Mapping[str, ResourceDefinition]]) \u2013 A dictionary of resource\ndefinitions to include on the returned asset group.

  • \n
  • executor_def (Optional[ExecutorDefinition]) \u2013 An executor to include on the returned\nasset group.

  • \n
  • extra_source_assets (Optional[Sequence[SourceAsset]]) \u2013 Source assets to include in the\ngroup in addition to the source assets found in the package.

  • \n
\n
\n
Returns
\n

An asset group with all the assets in the package.

\n
\n
Return type
\n

AssetGroup

\n
\n
\n
\n\n
\n
\nstatic from_package_name(package_name, resource_defs=None, executor_def=None, extra_source_assets=None)[source]\u00b6
\n

Constructs an AssetGroup that includes all asset definitions and source assets in all\nsub-modules of the given package.

\n
\n
Parameters
\n
    \n
  • package_name (str) \u2013 The name of a Python package to look for assets inside.

  • \n
  • resource_defs (Optional[Mapping[str, ResourceDefinition]]) \u2013 A dictionary of resource\ndefinitions to include on the returned asset group.

  • \n
  • executor_def (Optional[ExecutorDefinition]) \u2013 An executor to include on the returned\nasset group.

  • \n
  • extra_source_assets (Optional[Sequence[SourceAsset]]) \u2013 Source assets to include in the\ngroup in addition to the source assets found in the package.

  • \n
\n
\n
Returns
\n

An asset group with all the assets in the package.

\n
\n
Return type
\n

AssetGroup

\n
\n
\n
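
Example (a minimal sketch; "my_project.assets" is an illustrative package name assumed to contain @asset definitions):

\n
from dagster import AssetGroup\n\n# Illustrative package name; Dagster imports it and collects the assets it finds.\nasset_group = AssetGroup.from_package_name("my_project.assets")\n
\n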
\n\n
\n
\nget_base_jobs()[source]\u00b6
\n

For internal use only.

\n
\n\n
\n
\nmaterialize(selection=None)[source]\u00b6
\n

Executes an in-process run that materializes all assets in the group.

\n

The execution proceeds serially, in a single thread. Only supported by AssetGroups that have\nno executor_def or that use the in-process executor.

\n
\n
Parameters
\n

selection (Union[str, List[str]]) \u2013

A single selection query or list of selection queries\nfor assets in the group. For example:

\n
\n
    \n
  • ['some_asset_key'] select some_asset_key itself.

  • \n
  • ['*some_asset_key'] select some_asset_key and all its ancestors (upstream dependencies).

  • \n
  • ['*some_asset_key+++'] select some_asset_key, all its ancestors, and its descendants (downstream dependencies) within 3 levels down.

  • \n
  • ['*some_asset_key', 'other_asset_key_a', 'other_asset_key_b+'] select some_asset_key and all its ancestors, other_asset_key_a itself, and other_asset_key_b and its direct child asset keys. When subselecting into a multi-asset, all of the asset keys in that multi-asset must be selected.

  • \n
\n
\n

\n
\n
Returns
\n

The result of the execution.

\n
\n
Return type
\n

ExecuteInProcessResult

\n
\n
\n
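
Example (a minimal sketch, assuming asset1 and asset2 are defined with @asset as in the examples above):

\n
# asset1 and asset2 are assumed to be @asset definitions from the surrounding examples.\nresult = AssetGroup([asset1, asset2]).materialize()\nassert result.success\n
\n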
\n\n
\n
\nprefixed(key_prefix)[source]\u00b6
\n

Returns an AssetGroup that\u2019s identical to this AssetGroup, but with prefixes on all the\nasset keys. The prefix is not added to source assets.

\n

Input asset keys that reference other assets within the group are \u201cbrought along\u201d -\ni.e. prefixed as well.

\n

Example with a single asset:

\n
\n
@asset\ndef asset1():\n    ...\n\nresult = AssetGroup([asset1]).prefixed("my_prefix")\nassert result.assets[0].asset_key == AssetKey(["my_prefix", "asset1"])\n
\n
\n
\n

Example with dependencies within the list of assets:

\n
\n
@asset\ndef asset1():\n    ...\n\n@asset\ndef asset2(asset1):\n    ...\n\nresult = AssetGroup([asset1, asset2]).prefixed("my_prefix")\nassert result.assets[0].asset_key == AssetKey(["my_prefix", "asset1"])\nassert result.assets[1].asset_key == AssetKey(["my_prefix", "asset2"])\nassert result.assets[1].dependency_asset_keys == {AssetKey(["my_prefix", "asset1"])}\n
\n
\n
\n

Examples with input prefixes provided by source assets:

\n
\n
asset1 = SourceAsset(AssetKey(["upstream_prefix", "asset1"]))\n\n@asset\ndef asset2(asset1):\n    ...\n\nresult = AssetGroup([asset2], source_assets=[asset1]).prefixed("my_prefix")\nassert len(result.assets) == 1\nassert result.assets[0].asset_key == AssetKey(["my_prefix", "asset2"])\nassert result.assets[0].dependency_asset_keys == {AssetKey(["upstream_prefix", "asset1"])}\nassert result.source_assets[0].key == AssetKey(["upstream_prefix", "asset1"])\n
\n
\n
\n
\n\n
\n
\nto_source_assets()[source]\u00b6
\n

Returns a list of source assets corresponding to all the non-source assets in this group.

\n
\n\n
\n\n
\n
\n@dagster.multi_asset(outs, name=None, ins=None, non_argument_deps=None, description=None, required_resource_keys=None, compute_kind=None, internal_asset_deps=None, partitions_def=None, partition_mappings=None, op_tags=None, can_subset=False)[source]\u00b6
\n

Create a combined definition of multiple assets that are computed using the same op and same\nupstream assets.

\n

Each argument to the decorated function references an upstream asset that this asset depends on.\nThe name of the argument designates the name of the upstream asset.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name of the op.

  • \n
  • outs (Optional[Dict[str, Out]]) \u2013 The Outs representing the produced assets.

  • \n
  • ins (Optional[Mapping[str, AssetIn]]) \u2013 A dictionary that maps input names to their metadata\nand namespaces.

  • \n
  • non_argument_deps (Optional[Union[Set[AssetKey], Set[str]]]) \u2013 Set of asset keys that are upstream dependencies,\nbut do not pass an input to the multi_asset.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by the op.

  • \n
  • io_manager_key (Optional[str]) \u2013 The resource key of the IOManager used for storing the\noutput of the op as an asset, and for loading it in downstream ops\n(default: \u201cio_manager\u201d).

  • \n
  • compute_kind (Optional[str]) \u2013 A string to represent the kind of computation that produces\nthe asset, e.g. \u201cdbt\u201d or \u201cspark\u201d. It will be displayed in Dagit as a badge on the asset.

  • \n
  • internal_asset_deps (Optional[Mapping[str, Set[AssetKey]]]) \u2013 By default, it is assumed\nthat all assets produced by a multi_asset depend on all assets that are consumed by that\nmulti_asset. If this default is not correct, you can pass in a map of output names to a\ncorrected set of AssetKeys that they depend on. Any AssetKeys in this list must be either\nused as input to the asset or produced within the op.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines the set of partition keys that\ncompose the assets.

  • \n
  • partition_mappings (Optional[Mapping[str, PartitionMapping]]) \u2013 Defines how to map partition\nkeys for this asset to partition keys of upstream assets. Each key in the dictionary\ncorresponds to one of the input assets, and each value is a PartitionMapping.\nIf no entry is provided for a particular asset dependency, the partition mapping defaults\nto the default partition mapping for the partitions definition, which typically maps\npartition keys to the same partition keys in upstream assets.

  • \n
  • op_tags (Optional[Dict[str, Any]]) \u2013 A dictionary of tags for the op that computes the asset.\nFrameworks may expect and require certain metadata to be attached to an op. Values that\nare not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value.

  • \n
  • can_subset (bool) \u2013 If this asset\u2019s computation can emit a subset of the asset\nkeys based on the context.selected_assets argument. Defaults to False.

  • \n
\n
\n
\n
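
Example (a minimal sketch; the asset names, the upstream_asset dependency, and the arithmetic are illustrative):

\n
from dagster import Out, multi_asset\n\n# Two assets computed by a single op; upstream_asset is an illustrative upstream asset.\n@multi_asset(\n    outs={\n        "asset_one": Out(),\n        "asset_two": Out(),\n    }\n)\ndef my_assets(upstream_asset):\n    return upstream_asset + 1, upstream_asset + 2\n
\n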
\n\n
\n
\ndagster.build_assets_job(name, assets, source_assets=None, resource_defs=None, description=None, config=None, tags=None, executor_def=None, _asset_selection_data=None)[source]\u00b6
\n

Builds a job that materializes the given assets.

\n

The dependencies between the ops in the job are determined by the asset dependencies defined\nin the metadata on the provided asset nodes.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of the job.

  • \n
  • assets (List[AssetsDefinition]) \u2013 A list of assets or\nmulti-assets - usually constructed using the @asset() or @multi_asset()\ndecorator.

  • \n
  • source_assets (Optional[Sequence[Union[SourceAsset, AssetsDefinition]]]) \u2013 A list of\nassets that are not materialized by this job, but that assets in this job depend on.

  • \n
  • resource_defs (Optional[Dict[str, ResourceDefinition]]) \u2013 Resource defs to be included in\nthis job.

  • \n
  • description (Optional[str]) \u2013 A description of the job.

  • \n
\n
\n
\n

Examples

\n
@asset\ndef asset1():\n    return 5\n\n@asset\ndef asset2(asset1):\n    return asset1 + 1\n\nmy_assets_job = build_assets_job("my_assets_job", assets=[asset1, asset2])\n
\n
\n
\n
Returns
\n

A job that materializes the given assets.

\n
\n
Return type
\n

JobDefinition

\n
\n
\n
\n\n
\n
\nclass dagster.AssetIn(asset_key=None, metadata=None, namespace=None)[source]\u00b6
\n
\n\n
\n
\nclass dagster.SourceAsset(key, metadata=None, io_manager_key='io_manager', description=None, partitions_def=None)[source]\u00b6
\n

A SourceAsset represents an asset that will be loaded by (but not updated by) Dagster.

\n
\n
\nkey\u00b6
\n

The key of the asset.

\n
\n
Type
\n

Union[AssetKey, Sequence[str], str]

\n
\n
\n
\n\n
\n
\nmetadata_entries\u00b6
\n

Metadata associated with the asset.

\n
\n
Type
\n

List[MetadataEntry]

\n
\n
\n
\n\n
\n
\nio_manager_key\u00b6
\n

The key for the IOManager that will be used to load the contents of\nthe asset when it\u2019s used as an input to other assets inside a job.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\ndescription\u00b6
\n

The description of the asset.

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\npartitions_def\u00b6
\n

Defines the set of partition keys that\ncompose the asset.

\n
\n
Type
\n

Optional[PartitionsDefinition]

\n
\n
\n
\n\n
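
Example (a minimal sketch; the asset key and description are illustrative):

\n
from dagster import AssetKey, SourceAsset\n\n# Illustrative source asset: a table produced outside of Dagster.\nmy_table = SourceAsset(\n    key=AssetKey("my_upstream_table"),\n    description="A table that is loaded, but not updated, by Dagster.",\n)\n
\n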
\n\n
\n", "current_page_name": "sections/api/apidocs/assets", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "\n", "next": {"link": "../cli/", "title": "Dagster CLI"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../../../../", "title": "<no title>"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/cli", "Dagster CLI", "N", "next"], ["index", "<no title>", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/assets.rst.txt", "title": "Software-Defined Assets (Experimental)", "toc": "\n"}, "cli": {"alabaster_version": "0.7.12", "body": "
\n

Dagster CLI\u00b6

\n
\n

dagster asset\u00b6

\n
\n

Commands for working with Dagster assets.

\n
\n
dagster asset [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\nwipe
\n

Eliminate asset key indexes from event logs.

\n
\n\n
\n
\n

dagster debug\u00b6

\n
\n

Commands for debugging Dagster pipeline/job runs.

\n
\n
dagster debug [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\nexport
\n

Export the relevant artifacts for a\u2026

\n
\n\n
\n
\nimport
\n

Import the relevant artifacts for a\u2026

\n
\n\n
\n
\n

dagster instance\u00b6

\n
\n

Commands for working with the current Dagster instance.

\n
\n
dagster instance [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\ninfo
\n

List the information about the current\u2026

\n
\n\n
\n
\nmigrate
\n

Automatically migrate an out of date\u2026

\n
\n\n
\n
\nreindex
\n

Rebuild index over historical runs for\u2026

\n
\n\n
\n
\n

dagster job\u00b6

\n
\n

Commands for working with Dagster jobs.

\n
\n
dagster job [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\nbackfill
\n

Backfill a partitioned job.

\n
\n\n
\n
\nexecute
\n

Execute a job.

\n
\n\n
\n
\nlaunch
\n

Launch a job using the run launcher\u2026

\n
\n\n
\n
\nlist
\n

List the jobs in a repository.

\n
\n\n
\n
\nlist_versions
\n

Display the freshness of memoized results\u2026

\n
\n\n
\n
\nprint
\n

Print a job.

\n
\n\n
\n
\nscaffold_config
\n

Scaffold the config for a job.

\n
\n\n
\n
\n

dagster run\u00b6

\n
\n

Commands for working with Dagster pipeline/job runs.

\n
\n
dagster run [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\ndelete
\n

Delete a run by id and its associated\u2026

\n
\n\n
\n
\nlist
\n

List the runs in the current Dagster\u2026

\n
\n\n
\n
\nwipe
\n

Eliminate all run history and event logs.

\n
\n\n
\n
\n

dagster schedule\u00b6

\n
\n

Commands for working with Dagster schedules.

\n
\n
dagster schedule [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\ndebug
\n

Debug information about the scheduler.

\n
\n\n
\n
\nlist
\n

List all schedules that correspond to a\u2026

\n
\n\n
\n
\nlogs
\n

Get logs for a schedule.

\n
\n\n
\n
\npreview
\n

Preview changes that will be performed by\u2026

\n
\n\n
\n
\nrestart
\n

Restart a running schedule.

\n
\n\n
\n
\nstart
\n

Start an existing schedule.

\n
\n\n
\n
\nstop
\n

Stop an existing schedule.

\n
\n\n
\n
\nwipe
\n

Delete the schedule history and turn off\u2026

\n
\n\n
\n
\n

dagster sensor\u00b6

\n
\n

Commands for working with Dagster sensors.

\n
\n
dagster sensor [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\ncursor
\n

Set the cursor value for an existing sensor.

\n
\n\n
\n
\nlist
\n

List all sensors that correspond to a\u2026

\n
\n\n
\n
\npreview
\n

Preview an existing sensor execution.

\n
\n\n
\n
\nstart
\n

Start an existing sensor.

\n
\n\n
\n
\nstop
\n

Stop an existing sensor.

\n
\n\n
\n
\n

dagster-graphql\u00b6

\n

Run a GraphQL query against the dagster interface to a specified repository or pipeline/job.

\n

Can only use ONE of \u2013workspace/-w, \u2013python-file/-f, \u2013module-name/-m, \u2013grpc-port, \u2013grpc-socket.

\n

Examples:

\n
    \n
  1. dagster-graphql

  2. \n
  3. dagster-graphql -y path/to/workspace.yaml

  4. \n
  5. dagster-graphql -f path/to/file.py -a define_repo

  6. \n
  7. dagster-graphql -m some_module -a define_repo

  8. \n
  9. dagster-graphql -f path/to/file.py -a define_pipeline

  10. \n
  11. dagster-graphql -m some_module -a define_pipeline

  12. \n
\n
dagster-graphql [OPTIONS]\n
\n
\n

Options

\n
\n
\n--version\u00b6
\n

Show the version and exit.

\n
\n\n
\n
\n-t, --text <text>\u00b6
\n

GraphQL document to execute passed as a string

\n
\n\n
\n
\n-f, --file <file>\u00b6
\n

GraphQL document to execute passed as a file

\n
\n\n
\n
\n-p, --predefined <predefined>\u00b6
\n

GraphQL document to execute, from a predefined set provided by dagster-graphql.

\n
\n
Options
\n

launchPipelineExecution

\n
\n
\n
\n\n
\n
\n-v, --variables <variables>\u00b6
\n

A JSON encoded string containing the variables for GraphQL execution.

\n
\n\n
\n
\n-r, --remote <remote>\u00b6
\n

A URL for a remote instance running dagit server to send the GraphQL request to.

\n
\n\n
\n
\n-o, --output <output>\u00b6
\n

A file path to store the GraphQL response to. This flag is useful when making pipeline/job execution queries, since pipeline/job execution causes logs to print to stdout and stderr.

\n
\n\n
\n
\n--ephemeral-instance\u00b6
\n

Use an ephemeral DagsterInstance instead of resolving via DAGSTER_HOME

\n
\n\n
\n
\n--empty-workspace\u00b6
\n

Allow an empty workspace

\n
\n\n
\n
\n-w, --workspace <workspace>\u00b6
\n

Path to workspace file. Argument can be provided multiple times.

\n
\n\n
\n
\n-d, --working-directory <working_directory>\u00b6
\n

Specify working directory to use when loading the repository or pipeline/job.

\n
\n\n
\n
\n-f, --python-file <python_file>\u00b6
\n

Specify python file where repository or pipeline/job function lives

\n
\n\n
\n
\n--package-name <package_name>\u00b6
\n

Specify Python package where repository or pipeline/job function lives

\n
\n\n
\n
\n-m, --module-name <module_name>\u00b6
\n

Specify module where repository or pipeline/job function lives

\n
\n\n
\n
\n-a, --attribute <attribute>\u00b6
\n

Attribute that is either a 1) repository or pipeline/job or 2) a function that returns a repository or pipeline/job

\n
\n\n
\n
\n--grpc-port <grpc_port>\u00b6
\n

Port to use to connect to gRPC server

\n
\n\n
\n
\n--grpc-socket <grpc_socket>\u00b6
\n

Named socket to use to connect to gRPC server

\n
\n\n
\n
\n--grpc-host <grpc_host>\u00b6
\n

Host to use to connect to gRPC server, defaults to localhost

\n
\n\n
\n
\n--use-ssl\u00b6
\n

Use a secure channel when connecting to the gRPC server

\n
\n\n

Environment variables

\n
\n
\nDAGSTER_WORKING_DIRECTORY
\n
\n

Provide a default for --working-directory

\n
\n
\n\n
\n
\nDAGSTER_PYTHON_FILE
\n
\n

Provide a default for --python-file

\n
\n
\n\n
\n
\nDAGSTER_PACKAGE_NAME
\n
\n

Provide a default for --package-name

\n
\n
\n\n
\n
\nDAGSTER_MODULE_NAME
\n
\n

Provide a default for --module-name

\n
\n
\n\n
\n
\nDAGSTER_ATTRIBUTE
\n
\n

Provide a default for --attribute

\n
\n
\n\n
\n
\n

dagit\u00b6

\n

Run dagit. Loads a repository or pipeline/job.

\n

Can only use ONE of \u2013workspace/-w, \u2013python-file/-f, \u2013module-name/-m, \u2013grpc-port, \u2013grpc-socket.

\n

Examples:

\n
    \n
  1. dagit (works if .workspace.yaml exists)

  2. \n
  3. dagit -w path/to/workspace.yaml

  4. \n
  5. dagit -f path/to/file.py

  6. \n
  7. dagit -f path/to/file.py -d path/to/working_directory

  8. \n
  9. dagit -m some_module

  10. \n
  11. dagit -f path/to/file.py -a define_repo

  12. \n
  13. dagit -m some_module -a define_repo

  14. \n
  15. dagit -p 3333

  16. \n
\n

Options can also provide arguments via environment variables prefixed with DAGIT

\n

For example, DAGIT_PORT=3333 dagit

\n
dagit [OPTIONS]\n
\n
\n

Options

\n
\n
\n--use-ssl\u00b6
\n

Use a secure channel when connecting to the gRPC server

\n
\n\n
\n
\n--grpc-host <grpc_host>\u00b6
\n

Host to use to connect to gRPC server, defaults to localhost

\n
\n\n
\n
\n--grpc-socket <grpc_socket>\u00b6
\n

Named socket to use to connect to gRPC server

\n
\n\n
\n
\n--grpc-port <grpc_port>\u00b6
\n

Port to use to connect to gRPC server

\n
\n\n
\n
\n-a, --attribute <attribute>\u00b6
\n

Attribute that is either a 1) repository or pipeline/job or 2) a function that returns a repository or pipeline/job

\n
\n\n
\n
\n-m, --module-name <module_name>\u00b6
\n

Specify module where repository or pipeline/job function lives

\n
\n\n
\n
\n--package-name <package_name>\u00b6
\n

Specify Python package where repository or pipeline/job function lives

\n
\n\n
\n
\n-f, --python-file <python_file>\u00b6
\n

Specify python file where repository or pipeline/job function lives

\n
\n\n
\n
\n-d, --working-directory <working_directory>\u00b6
\n

Specify working directory to use when loading the repository or pipeline/job.

\n
\n\n
\n
\n-w, --workspace <workspace>\u00b6
\n

Path to workspace file. Argument can be provided multiple times.

\n
\n\n
\n
\n--empty-workspace\u00b6
\n

Allow an empty workspace

\n
\n\n
\n
\n-h, --host <host>\u00b6
\n

Host to run server on

\n
\n
Default
\n

127.0.0.1

\n
\n
\n
\n\n
\n
\n-p, --port <port>\u00b6
\n

Port to run server on.

\n
\n
Default
\n

3000

\n
\n
\n
\n\n
\n
\n-l, --path-prefix <path_prefix>\u00b6
\n

The path prefix where Dagit will be hosted (eg: /dagit)

\n
\n
Default
\n

\n
\n
\n\n
\n
\n--db-statement-timeout <db_statement_timeout>\u00b6
\n

The timeout in milliseconds to set on database statements sent to the DagsterInstance. Not respected in all configurations.

\n
\n
Default
\n

15000

\n
\n
\n
\n\n
\n
\n--read-only\u00b6
\n

Start Dagit in read-only mode, where all mutations such as launching runs and turning schedules on/off are turned off.

\n
\n\n
\n
\n--suppress-warnings\u00b6
\n

Filter all warnings when hosting Dagit.

\n
\n\n
\n
\n--log-level <log_level>\u00b6
\n

Set the log level for the uvicorn web server.

\n
\n
Default
\n

warning

\n
\n
Options
\n

critical|error|warning|info|debug|trace

\n
\n
\n
\n\n
\n
\n--version\u00b6
\n

Show the version and exit.

\n
\n\n

Environment variables

\n
\n
\nDAGSTER_ATTRIBUTE
\n
\n

Provide a default for --attribute

\n
\n
\n\n
\n
\nDAGSTER_MODULE_NAME
\n
\n

Provide a default for --module-name

\n
\n
\n\n
\n
\nDAGSTER_PACKAGE_NAME
\n
\n

Provide a default for --package-name

\n
\n
\n\n
\n
\nDAGSTER_PYTHON_FILE
\n
\n

Provide a default for --python-file

\n
\n
\n\n
\n
\nDAGSTER_WORKING_DIRECTORY
\n
\n

Provide a default for --working-directory

\n
\n
\n\n
\n
\n

dagster-daemon run\u00b6

\n

Run any daemons configured on the DagsterInstance.

\n
dagster-daemon run [OPTIONS]\n
\n
\n

Options

\n
\n
\n--use-ssl\u00b6
\n

Use a secure channel when connecting to the gRPC server

\n
\n\n
\n
\n--grpc-host <grpc_host>\u00b6
\n

Host to use to connect to gRPC server, defaults to localhost

\n
\n\n
\n
\n--grpc-socket <grpc_socket>\u00b6
\n

Named socket to use to connect to gRPC server

\n
\n\n
\n
\n--grpc-port <grpc_port>\u00b6
\n

Port to use to connect to gRPC server

\n
\n\n
\n
\n-a, --attribute <attribute>\u00b6
\n

Attribute that is either a 1) repository or pipeline/job or 2) a function that returns a repository or pipeline/job

\n
\n\n
\n
\n-m, --module-name <module_name>\u00b6
\n

Specify module where repository or pipeline/job function lives

\n
\n\n
\n
\n--package-name <package_name>\u00b6
\n

Specify Python package where repository or pipeline/job function lives

\n
\n\n
\n
\n-f, --python-file <python_file>\u00b6
\n

Specify python file where repository or pipeline/job function lives

\n
\n\n
\n
\n-d, --working-directory <working_directory>\u00b6
\n

Specify working directory to use when loading the repository or pipeline/job.

\n
\n\n
\n
\n-w, --workspace <workspace>\u00b6
\n

Path to workspace file. Argument can be provided multiple times.

\n
\n\n
\n
\n--empty-workspace\u00b6
\n

Allow an empty workspace

\n
\n\n

Environment variables

\n
\n
\nDAGSTER_ATTRIBUTE
\n
\n

Provide a default for --attribute

\n
\n
\n\n
\n
\nDAGSTER_MODULE_NAME
\n
\n

Provide a default for --module-name

\n
\n
\n\n
\n
\nDAGSTER_PACKAGE_NAME
\n
\n

Provide a default for --package-name

\n
\n
\n\n
\n
\nDAGSTER_PYTHON_FILE
\n
\n

Provide a default for --python-file

\n
\n
\n\n
\n
\nDAGSTER_WORKING_DIRECTORY
\n
\n

Provide a default for --working-directory

\n
\n
\n\n
\n
\n

dagster-daemon wipe\u00b6

\n

Wipe all heartbeats from storage.

\n
dagster-daemon wipe [OPTIONS]\n
\n
\n
\n
\n

dagster-daemon debug heartbeat-dump\u00b6

\n

Log all heartbeat statuses

\n
dagster-daemon debug heartbeat-dump [OPTIONS]\n
\n
\n
\n
\n

dagster api grpc\u00b6

\n

Serve the Dagster inter-process API over GRPC

\n
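
Example (a sketch of one possible invocation; the file path, attribute name, and port are illustrative, and every flag used here is listed under Options below):

\n
dagster api grpc --python-file path/to/repo.py --attribute my_repository --host 0.0.0.0 --port 4266\n
\n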
dagster api grpc [OPTIONS]\n
\n
\n

Options

\n
\n
\n-p, --port <port>\u00b6
\n

Port over which to serve. You must pass one and only one of \u2013port/-p or \u2013socket/-s.

\n
\n\n
\n
\n-s, --socket <socket>\u00b6
\n

Serve over a UDS socket. You must pass one and only one of \u2013port/-p or \u2013socket/-s.

\n
\n\n
\n
\n-h, --host <host>\u00b6
\n

Hostname at which to serve. Default is localhost.

\n
\n\n
\n
\n-n, --max_workers <max_workers>\u00b6
\n

Maximum number of (threaded) workers to use in the GRPC server

\n
\n\n
\n
\n--heartbeat\u00b6
\n

If set, the GRPC server will shut itself down when it fails to receive a heartbeat after a timeout configurable with \u2013heartbeat-timeout.

\n
\n\n
\n
\n--heartbeat-timeout <heartbeat_timeout>\u00b6
\n

Timeout after which to shutdown if \u2013heartbeat is set and a heartbeat is not received

\n
\n\n
\n
\n--lazy-load-user-code\u00b6
\n

Wait until the first LoadRepositories call to actually load the repositories, instead of loading them when the server is launched. Useful for surfacing errors when the server is managed directly from Dagit

\n
\n\n
\n
\n-a, --attribute <attribute>\u00b6
\n

Attribute that is either a 1) repository or pipeline/job or 2) a function that returns a repository or pipeline/job

\n
\n\n
\n
\n-m, --module-name <module_name>\u00b6
\n

Specify module where repository or pipeline/job function lives

\n
\n\n
\n
\n--package-name <package_name>\u00b6
\n

Specify Python package where repository or pipeline/job function lives

\n
\n\n
\n
\n-f, --python-file <python_file>\u00b6
\n

Specify python file where repository or pipeline/job function lives

\n
\n\n
\n
\n-d, --working-directory <working_directory>\u00b6
\n

Specify working directory to use when loading the repository or pipeline/job.

\n
\n\n
\n
\n--use-python-environment-entry-point\u00b6
\n

If this flag is set, the server will signal to clients that they should launch dagster commands using <this server\u2019s python executable> -m dagster, instead of the default dagster entry point. This is useful when there are multiple Python environments running in the same machine, so a single dagster entry point is not enough to uniquely determine the environment.

\n
\n\n
\n
\n--empty-working-directory\u00b6
\n

Indicates that the working directory should be empty and should not be set to the current directory as a default

\n
\n\n
\n
\n--ipc-output-file <ipc_output_file>\u00b6
\n

[INTERNAL] This option should generally not be used by users. Internal param used by dagster when it automatically spawns gRPC servers to communicate the success or failure of the server launching.

\n
\n\n
\n
\n--fixed-server-id <fixed_server_id>\u00b6
\n

[INTERNAL] This option should generally not be used by users. Internal param used by dagster to spawn a gRPC server with the specified server id.

\n
\n\n
\n
\n--override-system-timezone <override_system_timezone>\u00b6
\n

[INTERNAL] This option should generally not be used by users. Override the system timezone for tests.

\n
\n\n
\n
\n--log-level <log_level>\u00b6
\n

Level at which to log output from the gRPC server process

\n
\n\n
\n
\n--container-image <container_image>\u00b6
\n

Container image to use to run code from this server.

\n
\n\n
\n
\n--container-context <container_context>\u00b6
\n

Serialized JSON with configuration for any containers created to run the code from this server.

\n
\n\n

Environment variables

\n
\n
\nDAGSTER_GRPC_PORT
\n
\n

Provide a default for --port

\n
\n
\n\n
\n
\nDAGSTER_GRPC_SOCKET
\n
\n

Provide a default for --socket

\n
\n
\n\n
\n
\nDAGSTER_GRPC_HOST
\n
\n

Provide a default for --host

\n
\n
\n\n
\n
\nDAGSTER_LAZY_LOAD_USER_CODE
\n
\n

Provide a default for --lazy-load-user-code

\n
\n
\n\n
\n
\nDAGSTER_ATTRIBUTE
\n
\n

Provide a default for --attribute

\n
\n
\n\n
\n
\nDAGSTER_MODULE_NAME
\n
\n

Provide a default for --module-name

\n
\n
\n\n
\n
\nDAGSTER_PACKAGE_NAME
\n
\n

Provide a default for --package-name

\n
\n
\n\n
\n
\nDAGSTER_PYTHON_FILE
\n
\n

Provide a default for --python-file

\n
\n
\n\n
\n
\nDAGSTER_WORKING_DIRECTORY
\n
\n

Provide a default for --working-directory

\n
\n
\n\n
\n
\nDAGSTER_USE_PYTHON_ENVIRONMENT_ENTRY_POINT
\n
\n

Provide a default for --use-python-environment-entry-point

\n
\n
\n\n
\n
\nDAGSTER_EMPTY_WORKING_DIRECTORY
\n
\n

Provide a default for --empty-working-directory

\n
\n
\n\n
\n
\nDAGSTER_CONTAINER_IMAGE
\n
\n

Provide a default for --container-image

\n
\n
\n\n
\n
\nDAGSTER_CONTAINER_CONTEXT
\n
\n

Provide a default for --container-context

\n
\n
\n\n
\n
\n

Legacy APIs\u00b6

\n
\n

dagster pipeline\u00b6

\n
\n

Commands for working with Dagster pipelines/jobs.

\n
\n
dagster pipeline [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\nbackfill
\n

Backfill a partitioned pipeline/job.

\n
\n\n
\n
\nexecute
\n

Execute a pipeline.

\n
\n\n
\n
\nlaunch
\n

Launch a pipeline using the run launcher\u2026

\n
\n\n
\n
\nlist
\n

List the pipelines/jobs in a repository.

\n
\n\n
\n
\nlist_versions
\n

Display the freshness of memoized results\u2026

\n
\n\n
\n
\nprint
\n

Print a pipeline/job.

\n
\n\n
\n
\nscaffold_config
\n

Scaffold the config for a pipeline.

\n
\n\n
\n
\n
\n", "current_page_name": "sections/api/apidocs/cli", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../config/", "title": "Config"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../assets/", "title": "Software-Defined Assets (Experimental)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/config", "Config", "N", "next"], ["sections/api/apidocs/assets", "Software-Defined Assets (Experimental)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/cli.rst.txt", "title": "Dagster CLI", "toc": "\n"}, "config": {"alabaster_version": "0.7.12", "body": "
\n

Config\u00b6

\n
\n

Config Types\u00b6

\n

The following types are used to describe the schema of configuration\ndata via config. They are used in conjunction with the\nbuiltin types above.

\n
\n
\nclass dagster.ConfigSchema[source]\u00b6
\n

This is a placeholder type. Any time that it appears in documentation, it means that any of\nthe following types are acceptable:

\n
    \n
  1. A Python scalar type that resolves to a Dagster config type\n(int, float, bool,\nor str). For example:

    \n
      \n
    • @op(config_schema=int)

    • \n
    • @op(config_schema=str)

    • \n
    \n
  2. \n
  3. A built-in python collection (list, or dict).\nlist is exactly equivalent to Array [\nAny ] and dict is equivalent to\nPermissive. For example:

    \n
      \n
    • @op(config_schema=list)

    • \n
    • @op(config_schema=dict)

    • \n
    \n
  4. \n
  5. A Dagster config type:

    \n\n
  6. \n
  7. A bare python dictionary, which will be automatically wrapped in\nShape. Values of the dictionary are resolved recursively\naccording to the same rules. For example:

    \n
      \n
    • {'some_config': str} is equivalent to Shape({'some_config: str}).

    • \n
    • \n
      {'some_config1': {'some_config2': str}} is equivalent to

      Shape({'some_config1: Shape({'some_config2: str})}).

      \n
      \n
      \n
    • \n
    \n
  8. \n
  9. A bare python list of length one, whose single element will be wrapped in a\nArray is resolved recursively according to the same\nrules. For example:

    \n
      \n
    • [str] is equivalent to Array[str].

    • \n
    • [[str]] is equivalent to Array[Array[str]].

    • \n
    • [{'some_config': str}] is equivalent to Array(Shape({'some_config: str})).

    • \n
    \n
  10. \n
  11. An instance of Field.

  12. \n
\n
\n\n
\n
\nclass dagster.Field(config, default_value=<class 'dagster.config.field_utils.__FieldValueSentinel'>, is_required=None, description=None)[source]\u00b6
\n

Defines the schema for a configuration field.

\n

Fields are used in config schema instead of bare types when one wants to add a description,\na default value, or to mark it as not required.

\n

Config fields are parsed according to their schemas in order to yield values available at\njob execution time through the config system. Config fields can be set on ops, on\nloaders and materializers for custom dagster types, and on other pluggable components of the system, such as\nresources, loggers, and executors.

\n
\n
Parameters
\n
    \n
  • config (Any) \u2013

    The schema for the config. This value can be any of:

    \n
      \n
    1. A Python primitive type that resolves to a Dagster config type\n(int, float, bool,\nstr, or list).

    2. \n
    3. A Dagster config type:

      \n\n
    4. \n
    5. A bare python dictionary, which will be automatically wrapped in\nShape. Values of the dictionary are resolved recursively\naccording to the same rules.

    6. \n
    7. A bare python list of length one which itself is config type.\nBecomes Array with list element as an argument.

    8. \n
    \n

  • \n
  • default_value (Any) \u2013

    A default value for this field, conformant to the schema set by the dagster_type\nargument. If a default value is provided, is_required should be False.

    \n

    Note: for config types that do post processing such as Enum, this value must be\nthe pre processed version, ie use ExampleEnum.VALUE.name instead of\nExampleEnum.VALUE

    \n

  • \n
  • is_required (bool) \u2013 Whether the presence of this field is required. Defaults to true. If is_required\nis True, no default value should be provided.

  • \n
  • description (str) \u2013 A human-readable description of this config field.

  • \n
\n
\n
\n

Examples:

\n
@op(\n    config_schema={\n        'word': Field(str, description='I am a word.'),\n        'repeats': Field(Int, default_value=1, is_required=False),\n    }\n)\ndef repeat_word(context):\n    return context.op_config['word'] * context.op_config['repeats']\n
\n
\n
\n\n
\n
\nclass dagster.Selector(fields, description=None)[source]\u00b6
\n

Define a config field requiring the user to select one option.

\n

Selectors are used when you want to be able to present several different options in config but\nallow only one to be selected. For example, a single input might be read in from either a csv\nfile or a parquet file, but not both at once.

\n

Note that in some other type systems this might be called an \u2018input union\u2019.

\n

Functionally, a selector is like a Dict, except that only one key from the dict can\nbe specified in valid config.

\n
\n
Parameters
\n

fields (Dict[str, Field]) \u2013 The fields from which the user must select.

\n
\n
\n

Examples:

\n
@op(\n    config_schema=Field(\n        Selector(\n            {\n                'haw': {'whom': Field(String, default_value='honua', is_required=False)},\n                'cn': {'whom': Field(String, default_value='\u4e16\u754c', is_required=False)},\n                'en': {'whom': Field(String, default_value='world', is_required=False)},\n            }\n        ),\n        is_required=False,\n        default_value={'en': {'whom': 'world'}},\n    )\n)\ndef hello_world_with_default(context):\n    if 'haw' in context.op_config:\n        return 'Aloha {whom}!'.format(whom=context.op_config['haw']['whom'])\n    if 'cn' in context.op_config:\n        return '\u4f60\u597d\uff0c{whom}!'.format(whom=context.op_config['cn']['whom'])\n    if 'en' in context.op_config:\n        return 'Hello, {whom}!'.format(whom=context.op_config['en']['whom'])\n
\n
\n
\n\n
\n
\nclass dagster.Permissive(fields=None, description=None)[source]\u00b6
\n

Defines a config dict with a partially specified schema.

\n

A permissive dict allows partial specification of the config schema. Any fields with a\nspecified schema will be type checked. Other fields will be allowed, but will be ignored by\nthe type checker.

\n
\n
Parameters
\n

fields (Dict[str, Field]) \u2013 The partial specification of the config dict.

\n
\n
\n

Examples:

\n
@op(config_schema=Field(Permissive({'required': Field(String)})))\ndef map_config_op(context) -> List:\n    return sorted(list(context.op_config.items()))\n
\n
\n
\n\n
\n
\nclass dagster.Shape(fields, description=None, field_aliases=None)[source]\u00b6
\n

Schema for configuration data with string keys and typed values via Field.

\n

Unlike Permissive, unspecified fields are not allowed and will throw a\nDagsterInvalidConfigError.

\n
\n
Parameters
\n
    \n
  • fields (Dict[str, Field]) \u2013 The specification of the config dict.

  • \n
  • field_aliases (Dict[str, str]) \u2013 Maps a string key to an alias that can be used instead of the original key. For example,\nan entry {\u201csolids\u201d: \u201cops\u201d} means that someone could use \u201cops\u201d instead of \u201csolids\u201d as a\ntop level string key.

  • \n
\n
\n
\n
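For example, a minimal sketch (the op and field names here are illustrative, not part of the API):

@op(config_schema=Shape({'person_name': Field(str), 'age': Field(int, is_required=False, default_value=0)}))\ndef greet(context):\n    return 'hello ' + context.op_config['person_name']\n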
\n\n
\n
\nclass dagster.Map(key_type, inner_type, key_label_name=None)[source]\u00b6
\n

Defines a config dict with arbitrary scalar keys and typed values.

\n

A map can contain arbitrary keys of the specified scalar type, each of which has\ntype checked values. Unlike Shape and Permissive, scalar\nkeys other than strings can be used, and unlike Permissive, all\nvalues are type checked.

Parameters

  • key_type (type) \u2013 The type of keys this map can contain. Must be a scalar type.

  • inner_type (type) \u2013 The type of the values that this map type can contain.

  • key_label_name (string) \u2013 Optional name which describes the role of keys in the map.

\n

Examples:

\n
@op(config_schema=Field(Map(str, int)))\ndef partially_specified_config(context) -> List:\n    return sorted(list(context.op_config.items()))\n
\n
\n
\n\n
\n
\nclass dagster.Array(inner_type)[source]\u00b6
\n

Defines an array (list) configuration type that contains values of type inner_type.

\n
\n
Parameters
\n

inner_type (type) \u2013 The type of the values that this configuration type can contain.

\n
\n
\n
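For example, a minimal sketch (the op name is illustrative):

@op(config_schema=Array(int))\ndef sum_values(context):\n    return sum(context.op_config)\n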
\n\n
\n
\nclass dagster.Noneable(inner_type)[source]\u00b6
\n

Defines a configuration type that is the union of NoneType and the type inner_type.

\n
\n
Parameters
\n

inner_type (type) \u2013 The type of the values that this configuration type can contain.

\n
\n
\n

Examples:

\n
config_schema={"name": Noneable(str)}\n\nconfig={"name": "Hello"}  # Ok\nconfig={"name": None}     # Ok\nconfig={}                 # Error\n
\n
\n
\n\n
\n
\nclass dagster.Enum(name, enum_values)[source]\u00b6
\n

Defines an enum configuration type that allows one of a defined set of possible values.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of the enum configuration type.

  • \n
  • enum_values (List[EnumValue]) \u2013 The set of possible values for the enum configuration type.

  • \n
\n
\n
\n

Examples:

\n
@op(\n    config_schema=Field(\n        Enum(\n            'CowboyType',\n            [\n                EnumValue('good'),\n                EnumValue('bad'),\n                EnumValue('ugly'),\n            ]\n        )\n    )\n)\ndef resolve_standoff(context):\n    # ...\n
\n
\n
\n
\nclassmethod from_python_enum(enum, name=None)[source]\u00b6
\n

Create a Dagster enum corresponding to an existing Python enum.

\n
\n
Parameters
\n
    \n
  • enum (enum.EnumMeta) \u2013 The class representing the enum.

  • \n
  • name (Optional[str]) \u2013 The name for the enum. If not present, enum.__name__ will be used.

  • \n
\n
\n
\n

Example:

\n
class Color(enum.Enum):\n    RED = enum.auto()\n    GREEN = enum.auto()\n    BLUE = enum.auto()\n\n@op(\n    config_schema={"color": Field(Enum.from_python_enum(Color))}\n)\ndef select_color(context):\n    # ...\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.EnumValue(config_value, python_value=None, description=None)[source]\u00b6
\n

Define an entry in an Enum.

\n
\n
Parameters
\n
    \n
  • config_value (str) \u2013 The string representation of the config to accept when passed.

  • \n
  • python_value (Optional[Any]) \u2013 The python value to convert the enum entry into. Defaults to the config_value.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the enum entry.

  • \n
\n
\n
\n
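For example, a minimal sketch pairing config values with python values (the enum name and values are illustrative):

color_enum = Enum(\n    'Color',\n    [\n        EnumValue('RED', python_value=1),\n        EnumValue('GREEN', python_value=2, description='the green one'),\n    ],\n)\n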
\n\n
\n
\nclass dagster.ScalarUnion(scalar_type, non_scalar_schema, _key=None)[source]\u00b6
\n

Defines a configuration type that accepts a scalar value OR a non-scalar value like a\nList, Dict, or Selector.

\n

This allows runtime scalars to be configured directly, rather than as a dictionary with the\nkey value, while still leaving the option to\nload scalars from a json or pickle file.

\n
\n
Parameters
\n
    \n
  • scalar_type (type) \u2013 The scalar type of values that this configuration type can hold. For example,\nint, float, bool,\nor str.

  • \n
  • non_scalar_schema (ConfigSchema) \u2013 The schema of a non-scalar Dagster configuration type. For example, List,\nDict, or Selector.

  • \n
  • key (Optional[str]) \u2013 The configuration type\u2019s unique key. If not set, then the key will be set to\nScalarUnion.{scalar_type}-{non_scalar_schema}.

  • \n
\n
\n
\n

Examples:

\n
graph:\n  transform_word:\n    inputs:\n      word:\n        value: foobar\n
\n
\n

becomes, optionally,

\n
graph:\n  transform_word:\n    inputs:\n      word: foobar\n
\n
\n
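A corresponding schema could be declared as in the sketch below (the op and field names are assumptions for illustration):

@op(\n    config_schema=ScalarUnion(\n        scalar_type=str,\n        non_scalar_schema={'value': str},\n    )\n)\ndef transform_word(context):\n    return context.op_config\n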
\n\n
\n
\ndagster.StringSource\u00b6
\n

Use this type when you want to read a string config value from an environment variable. The value\npassed to a config field of this type may either be a string literal, or a selector describing\nhow to look up the value from the executing process\u2019s environment variables.

\n

Examples:

\n
from dagster import job, op, StringSource\n\n@op(config_schema=StringSource)\ndef secret_op(context) -> str:\n    return context.op_config\n\n@job\ndef secret_job():\n    secret_op()\n\nsecret_job.execute_in_process(\n    run_config={\n        'ops': {'secret_op': {'config': 'test_value'}}\n    }\n)\n\nsecret_job.execute_in_process(\n    run_config={\n        'ops': {'secret_op': {'config': {'env': 'VERY_SECRET_ENV_VARIABLE'}}}\n    }\n)\n
\n
\n
\n\n
\n
\ndagster.IntSource\u00b6
\n

Use this type when you want to read an integer config value from an environment variable. The\nvalue passed to a config field of this type may either be an integer literal, or a selector\ndescribing how to look up the value from the executing process\u2019s environment variables.

\n

Examples:

\n
from dagster import job, op, IntSource\n\n@op(config_schema=IntSource)\ndef secret_int_op(context) -> int:\n    return context.op_config\n\n@job\ndef secret_job():\n    secret_int_op()\n\nsecret_job.execute_in_process(\n    run_config={\n        'ops': {'secret_int_op': {'config': 1234}}\n    }\n)\n\nsecret_job.execute_in_process(\n    run_config={\n        'ops': {'secret_int_op': {'config': {'env': 'VERY_SECRET_ENV_VARIABLE_INT'}}}\n    }\n)\n
\n
\n
\n\n
\n
\ndagster.BoolSource\u00b6
\n

Use this type when you want to read a boolean config value from an environment variable. The\nvalue passed to a config field of this type may either be a boolean literal, or a selector\ndescribing how to look up the value from the executing process\u2019s environment variables. Set the\nvalue of the corresponding environment variable to "" to indicate False.

\n

Examples:

\n
from dagster import job, op, BoolSource\n\n@op(config_schema=BoolSource)\ndef secret_bool_op(context) -> bool:\n    return context.op_config\n\n@job\ndef secret_job():\n    secret_bool_op()\n\nsecret_job.execute_in_process(\n    run_config={\n        'ops': {'secret_bool_op': {'config': False}}\n    }\n)\n\nsecret_job.execute_in_process(\n    run_config={\n        'ops': {'secret_bool_op': {'config': {'env': 'VERY_SECRET_ENV_VARIABLE_BOOL'}}}\n    }\n)\n
\n
\n
\n\n
\n
\n

Config Utilities\u00b6

\n
\n
\nclass dagster.ConfigMapping(config_fn, config_schema=None, receive_processed_config_values=None)[source]\u00b6
\n

Defines a config mapping for a graph (or job).

\n

By specifying a config mapping function, you can override the configuration for the child\nops and graphs contained within a graph.

\n

Config mappings require the configuration schema to be specified as config_schema, which will\nbe exposed as the configuration schema for the graph, as well as a configuration mapping\nfunction, config_fn, which maps the config provided to the graph to the config\nthat will be provided to the child nodes.

\n
\n
Parameters
\n
    \n
  • config_fn (Callable[[dict], dict]) \u2013 The function that will be called\nto map the graph config to a config appropriate for the child nodes.

  • \n
  • config_schema (ConfigSchema) \u2013 The schema of the graph config.

  • \n
  • receive_processed_config_values (Optional[bool]) \u2013 If true, config values provided to the config_fn\nwill be converted to their dagster types before being passed in. For example, if this\nvalue is true, enum config passed to config_fn will be actual enums, while if false,\nthen enum config passed to config_fn will be strings.

  • \n
\n
\n
\n
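A minimal sketch of a graph-level config mapping (the op, graph, and key names are hypothetical):

@op(config_schema={'phrase': str})\ndef do_it(context):\n    return context.op_config['phrase']\n\ndef simplified_fn(val):\n    return {'do_it': {'config': {'phrase': 'Hello, ' + val['name']}}}\n\n@graph(config=ConfigMapping(config_fn=simplified_fn, config_schema={'name': str}))\ndef greeting_graph():\n    do_it()\n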
\n\n
\n
\n@dagster.configured(configurable, config_schema=None, **kwargs)[source]\u00b6
\n

A decorator that makes it easy to create a function-configured version of an object.\nThe following definition types can be configured using this function:

\n\n

If the config that will be supplied to the object is constant, you may alternatively invoke this\nand call the result with a dict of config values to be curried. Examples of both strategies\nbelow.

\n
\n
Parameters
\n
    \n
  • configurable (ConfigurableDefinition) \u2013 An object that can be configured.

  • \n
  • config_schema (ConfigSchema) \u2013 The config schema that the inputs to the decorated function\nmust satisfy.

  • \n
  • **kwargs \u2013 Arbitrary keyword arguments that will be passed to the initializer of the returned\nobject.

  • \n
\n
\n
Returns
\n

(Callable[[Union[Any, Callable[[Any], Any]]], ConfigurableDefinition])

\n
\n
\n

Examples:

\n
dev_s3 = configured(s3_resource, name="dev_s3")({'bucket': 'dev'})\n\n@configured(s3_resource)\ndef dev_s3(_):\n    return {'bucket': 'dev'}\n\n@configured(s3_resource, {'bucket_prefix': str})\ndef dev_s3(config):\n    return {'bucket': config['bucket_prefix'] + 'dev'}\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/config", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../errors/", "title": "Errors"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../cli/", "title": "Dagster CLI"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/errors", "Errors", "N", "next"], ["sections/api/apidocs/cli", "Dagster CLI", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/config.rst.txt", "title": "Config", "toc": "\n"}, "dynamic": {"alabaster_version": "0.7.12", "body": "
\n

Dynamic Mapping & Collect\u00b6

\n

These APIs provide the means for a simple kind of dynamic orchestration \u2014 where the work to be orchestrated is determined not at pipeline definition time but at runtime, dependent on data that\u2019s observed as part of pipeline execution.

\n
\n
\nclass dagster.DynamicOut(dagster_type=<class 'dagster.core.definitions.utils.NoValueSentinel'>, description=None, is_required=True, io_manager_key=None, metadata=None, asset_key=None, asset_partitions=None, asset_partitions_def=None)[source]\u00b6
\n

Variant of Out for an output that will dynamically alter the graph at\nruntime.

\n

When used in a composition function such as @graph,\ndynamic outputs must be used with either

\n
    \n
  • map - clone downstream ops for each separate DynamicOutput

  • \n
  • collect - gather across all DynamicOutputs into a list

  • \n
\n

Uses the same constructor as Out

\n
\n
@op(\n    config_schema={\n        "path": Field(str, default_value=file_relative_path(__file__, "sample"))\n    },\n    out=DynamicOut(str),\n)\ndef files_in_directory(context):\n    path = context.op_config["path"]\n    dirname, _, filenames = next(os.walk(path))\n    for file in filenames:\n        yield DynamicOutput(os.path.join(dirname, file), mapping_key=_clean(file))\n\n@job\ndef process_directory():\n    files = files_in_directory()\n\n    # use map to invoke an op on each dynamic output\n    file_results = files.map(process_file)\n\n    # use collect to gather the results in to a list\n    summarize_directory(file_results.collect())\n
\n
\n
\n
\n\n
\n
\nclass dagster.DynamicOutput(value, mapping_key, output_name='result', metadata_entries=None, metadata=None)[source]\u00b6
\n

Variant of Output used to support\ndynamic mapping & collect. Each DynamicOutput produced by an op represents\none item in a set that can be processed individually with map or gathered\nwith collect.

\n

Each DynamicOutput must have a unique mapping_key to distinguish it within its set.

\n
\n
Parameters
\n
    \n
  • value (Any) \u2013 The value returned by the compute function.

  • \n
  • mapping_key (str) \u2013 The key that uniquely identifies this dynamic value relative to its peers.\nThis key will be used to identify the downstream ops when mapped, ie\nmapped_op[example_mapping_key]

  • \n
  • output_name (Optional[str]) \u2013 Name of the corresponding DynamicOut defined on the op.\n(default: \u201cresult\u201d)

  • \n
  • metadata_entries (Optional[Union[MetadataEntry, PartitionMetadataEntry]]) \u2013 (Experimental) A set of metadata entries to attach to events related to this output.

  • \n
  • metadata (Optional[Dict[str, Union[str, float, int, Dict, MetadataValue]]]) \u2013 Arbitrary metadata about the output. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
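For example, a minimal sketch of an op yielding DynamicOutputs (the op name is illustrative):

@op(out=DynamicOut(int))\ndef emit_values(context):\n    for i in range(3):\n        yield DynamicOutput(i, mapping_key=str(i))\n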
\n\n
\n", "current_page_name": "sections/api/apidocs/dynamic", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "\n", "next": {"link": "../types/", "title": "Types"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../solids/", "title": "[Legacy] Solids"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/types", "Types", "N", "next"], ["sections/api/apidocs/solids", "[Legacy] Solids", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/dynamic.rst.txt", "title": "Dynamic Mapping & Collect", "toc": "\n"}, "errors": {"alabaster_version": "0.7.12", "body": "
\n

Errors\u00b6

\n

Core Dagster error classes.

\n

All errors thrown by the Dagster framework inherit from DagsterError. Users\nshould not subclass this base class for their own exceptions.

\n

There is another exception base class, DagsterUserCodeExecutionError, which is\nused by the framework in concert with the user_code_error_boundary().

\n

Dagster uses this construct to wrap the user code that it calls into. User code can perform arbitrary\ncomputations and may itself throw exceptions. The error boundary catches these user code-generated\nexceptions, and then reraises them wrapped in a subclass of\nDagsterUserCodeExecutionError.

\n

The wrapped exceptions include additional context for the original exceptions, injected by the\nDagster runtime.

\n
\n
\nexception dagster.DagsterError[source]\u00b6
\n

Base class for all errors thrown by the Dagster framework.

\n

Users should not subclass this base class for their own exceptions.

\n
\n\n
\n
\nexception dagster.DagsterConfigMappingFunctionError(*args, **kwargs)[source]\u00b6
\n

Indicates that an unexpected error occurred while executing the body of a config mapping\nfunction defined in a JobDefinition or GraphDefinition during\nconfig parsing.

\n
\n\n
\n
\nexception dagster.DagsterEventLogInvalidForRun(run_id)[source]\u00b6
\n

Raised when the event logs for a historical run are malformed or invalid.

\n
\n\n
\n
\nexception dagster.DagsterExecutionStepExecutionError(*args, **kwargs)[source]\u00b6
\n

Indicates an error occurred while executing the body of an execution step.

\n
\n\n
\n
\nexception dagster.DagsterExecutionStepNotFoundError(*args, **kwargs)[source]\u00b6
\n

Thrown when the user specifies execution step keys that do not exist.

\n
\n\n
\n
\nexception dagster.DagsterInvalidConfigError(preamble, errors, config_value, *args, **kwargs)[source]\u00b6
\n

Thrown when provided config is invalid (does not type check against the relevant config\nschema).

\n
\n\n
\n
\nexception dagster.DagsterInvalidConfigDefinitionError(original_root, current_value, stack, reason=None, **kwargs)[source]\u00b6
\n

Indicates that you have attempted to construct a config with an invalid value.

\n
\n
Acceptable values for config types are any of:
    \n
  1. A Python primitive type that resolves to a Dagster config type\n(int, float, bool,\nstr, or list).

  2. A Dagster config type: Int, Float,\nBool, String,\nStringSource, Any,\nArray, Noneable, Enum,\nSelector, Shape, or\nPermissive.

  3. A bare python dictionary, which will be automatically wrapped in\nShape. Values of the dictionary are resolved recursively\naccording to the same rules.

  4. A bare python list of length one, whose single element is itself a config type.\nBecomes Array with that element as its argument.

  5. An instance of Field.
\n
\n
\n
\n\n
\n
\nexception dagster.DagsterInvalidDefinitionError[source]\u00b6
\n

Indicates that the rules for a definition have been violated by the user.

\n
\n\n
\n
\nexception dagster.DagsterInvariantViolationError[source]\u00b6
\n

Indicates the user has violated a well-defined invariant that can only be enforced\nat runtime.

\n
\n\n
\n
\nexception dagster.DagsterResourceFunctionError(*args, **kwargs)[source]\u00b6
\n

Indicates an error occurred while executing the body of the resource_fn in a\nResourceDefinition during resource initialization.

\n
\n\n
\n
\nexception dagster.DagsterRunNotFoundError(*args, **kwargs)[source]\u00b6
\n

Thrown when a run cannot be found in run storage.

\n
\n\n
\n
\nexception dagster.DagsterStepOutputNotFoundError(*args, **kwargs)[source]\u00b6
\n

Indicates that previous step outputs required for an execution step to proceed are not\navailable.

\n
\n\n
\n
\nexception dagster.DagsterSubprocessError(*args, **kwargs)[source]\u00b6
\n

An exception has occurred in one or more of the child processes dagster manages.\nThis error forwards the message and stack trace for all of the collected errors.

\n
\n\n
\n
\nexception dagster.DagsterTypeCheckDidNotPass(description=None, metadata_entries=None, dagster_type=None)[source]\u00b6
\n

Indicates that a type check failed.

\n

This is raised when raise_on_error is True in calls to the synchronous job and\ngraph execution APIs (e.g. graph.execute_in_process(), job.execute_in_process() \u2013 typically\nwithin a test), and a DagsterType\u2019s type check fails by returning either\nFalse or an instance of TypeCheck whose success member is False.

\n
\n\n
\n
\nexception dagster.DagsterTypeCheckError(*args, **kwargs)[source]\u00b6
\n

Indicates an error in the op type system at runtime. E.g. an op receives an\nunexpected input, or produces an output that does not match the type of the output definition.

\n
\n\n
\n
\nexception dagster.DagsterUnknownResourceError(resource_name, *args, **kwargs)[source]\u00b6
\n

Indicates that an unknown resource was accessed in the body of an execution step. May often\nhappen by accessing a resource in the compute function of an op without first supplying the\nop with the correct required_resource_keys argument.

\n
\n\n
\n
\nexception dagster.DagsterUnmetExecutorRequirementsError[source]\u00b6
\n

Indicates the resolved executor is incompatible with the state of other systems\nsuch as the DagsterInstance or system storage configuration.

\n
\n\n
\n
\nexception dagster.DagsterUserCodeExecutionError(*args, **kwargs)[source]\u00b6
\n

This is the base class for any exception that is meant to wrap an\nException thrown by user code. It wraps that existing user code.\nThe original_exc_info argument to the constructor is meant to be a tuple of the type\nreturned by sys.exc_info at the call site of the constructor.

\n

Users should not subclass this base class for their own exceptions and should instead throw\nfreely from user code. User exceptions will be automatically wrapped and rethrown.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/errors", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "\n", "next": {"link": "../execution/", "title": "Execution"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../config/", "title": "Config"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/execution", "Execution", "N", "next"], ["sections/api/apidocs/config", "Config", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/errors.rst.txt", "title": "Errors", "toc": "\n"}, "execution": {"alabaster_version": "0.7.12", "body": "
\n

Execution\u00b6

\n
\n

Executing Jobs\u00b6

\n
\n
\nclass dagster.JobDefinition(graph_def, resource_defs=None, executor_def=None, logger_defs=None, config_mapping=None, partitioned_config=None, name=None, description=None, preset_defs=None, tags=None, hook_defs=None, op_retry_policy=None, version_strategy=None, _subset_selection_data=None, asset_layer=None, _input_values=None)[source]
\n
\n
\nexecute_in_process(run_config=None, instance=None, partition_key=None, raise_on_error=True, op_selection=None, asset_selection=None, run_id=None, input_values=None)[source]
\n

Execute the Job in-process, gathering results in-memory.

\n

The executor_def on the Job will be ignored, and replaced with the in-process executor.\nIf using the default io_manager, it will switch from filesystem to in-memory.

\n
\n
Parameters
\n
    \n
  • run_config (Optional[Dict[str, Any]]) \u2013 The configuration for the run

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The instance to execute against, an ephemeral one will be used if none provided.

  • \n
  • partition_key (Optional[str]) \u2013 The string partition key that specifies the run config to execute. Can only be used\nto select run config for jobs with partitioned config.

  • \n
  • raise_on_error (Optional[bool]) \u2013 Whether or not to raise exceptions when they occur.\nDefaults to True.

  • \n
  • op_selection (Optional[List[str]]) \u2013 A list of op selection queries (including single op\nnames) to execute. For example:\n* ['some_op']: selects some_op itself.\n* ['*some_op']: select some_op and all its ancestors (upstream dependencies).\n* ['*some_op+++']: select some_op, all its ancestors, and its descendants\n(downstream dependencies) within 3 levels down.\n* ['*some_op', 'other_op_a', 'other_op_b+']: select some_op and all its\nancestors, other_op_a itself, and other_op_b and its direct child ops.

  • \n
  • input_values (Optional[Mapping[str, Any]]) \u2013 A dictionary that maps python objects to the top-level inputs of the job. Input values provided here will override input values that have been provided to the job directly.

  • \n
\n
\n
Returns
\n

ExecuteInProcessResult

\n
\n
\n
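A minimal sketch of in-process execution (the job and op names are hypothetical):

from dagster import job, op\n\n@op\ndef emit_one():\n    return 1\n\n@job\ndef my_job():\n    emit_one()\n\nresult = my_job.execute_in_process()\nassert result.success\n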
\n\n
\n\n
\n
\n

Executing Graphs\u00b6

\n
\n
\nclass dagster.GraphDefinition(name, description=None, node_defs=None, dependencies=None, input_mappings=None, output_mappings=None, config=None, tags=None, **kwargs)[source]
\n

Defines a Dagster graph.

\n

A graph is made up of

\n
    \n
  • Nodes, which can either be an op (the functional unit of computation), or another graph.

  • \n
  • Dependencies, which determine how the values produced by nodes as outputs flow from\none node to another. This tells Dagster how to arrange nodes into a directed, acyclic graph\n(DAG) of compute.

  • \n
\n

End users should prefer the @graph decorator. GraphDefinition is generally\nintended to be used by framework authors or for programmatically generated graphs.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of the graph. Must be unique within any GraphDefinition\nor JobDefinition containing the graph.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the graph.

  • \n
  • node_defs (Optional[List[NodeDefinition]]) \u2013 The set of ops / graphs used in this graph.

  • \n
  • dependencies (Optional[Dict[Union[str, NodeInvocation], Dict[str, DependencyDefinition]]]) \u2013 A structure that declares the dependencies of each op\u2019s inputs on the outputs of other\nops in the graph. Keys of the top level dict are either the string names of ops in the\ngraph or, in the case of aliased ops, NodeInvocations.\nValues of the top level dict are themselves dicts, which map input names belonging to\nthe op or aliased op to DependencyDefinitions.

  • \n
  • input_mappings (Optional[List[InputMapping]]) \u2013 Defines the inputs to the nested graph, and\nhow they map to the inputs of its constituent ops.

  • \n
  • output_mappings (Optional[List[OutputMapping]]) \u2013 Defines the outputs of the nested graph,\nand how they map from the outputs of its constituent ops.

  • \n
  • config (Optional[ConfigMapping]) \u2013 Defines the config of the graph, and how its schema maps\nto the config of its constituent ops.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for any execution of the graph.\nValues that are not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value. These tag values may be overwritten by tag\nvalues provided at invocation time.

  • \n
\n
\n
\n

Examples

\n
@op\ndef return_one():\n    return 1\n\n@op\ndef add_one(num):\n    return num + 1\n\ngraph_def = GraphDefinition(\n    name='basic',\n    node_defs=[return_one, add_one],\n    dependencies={'add_one': {'num': DependencyDefinition('return_one')}},\n)\n
\n
\n
\n
\nexecute_in_process(run_config=None, instance=None, resources=None, raise_on_error=True, op_selection=None, run_id=None, input_values=None)[source]
\n

Execute this graph in-process, collecting results in-memory.

\n
\n
Parameters
\n
    \n
  • run_config (Optional[Dict[str, Any]]) \u2013 Run config to provide to execution. The configuration for the underlying graph\nshould exist under the \u201cops\u201d key.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The instance to execute against, an ephemeral one will be used if none provided.

  • \n
  • resources (Optional[Dict[str, Any]]) \u2013 The resources needed if any are required. Can provide resource instances directly,\nor resource definitions.

  • \n
  • raise_on_error (Optional[bool]) \u2013 Whether or not to raise exceptions when they occur.\nDefaults to True.

  • \n
  • op_selection (Optional[List[str]]) \u2013 A list of op selection queries (including single op\nnames) to execute. For example:\n* ['some_op']: selects some_op itself.\n* ['*some_op']: select some_op and all its ancestors (upstream dependencies).\n* ['*some_op+++']: select some_op, all its ancestors, and its descendants\n(downstream dependencies) within 3 levels down.\n* ['*some_op', 'other_op_a', 'other_op_b+']: select some_op and all its\nancestors, other_op_a itself, and other_op_b and its direct child ops.

  • \n
  • input_values (Optional[Mapping[str, Any]]) \u2013 A dictionary that maps python objects to the top-level inputs of the graph.

  • \n
\n
\n
Returns
\n

ExecuteInProcessResult

\n
\n
\n
\n\n
\n\n
\n
\n

Execution results\u00b6

\n
\n
\nclass dagster.ExecuteInProcessResult(node_def, all_events, dagster_run, output_capture)[source]\u00b6
\n
\n
\nproperty all_events\u00b6
\n

All dagster events emitted during in-process execution.

\n
\n
Type
\n

List[DagsterEvent]

\n
\n
\n
\n\n
\n
\nproperty all_node_events\u00b6
\n

All dagster events from the in-process execution.

\n
\n
Type
\n

List[DagsterEvent]

\n
\n
\n
\n\n
\n
\nproperty dagster_run\u00b6
\n

the DagsterRun object for the completed execution.

\n
\n
Type
\n

DagsterRun

\n
\n
\n
\n\n
\n
\nevents_for_node(node_name)[source]\u00b6
\n

Retrieves all dagster events for a specific node.

\n
\n
Parameters
\n

node_name (str) \u2013 The name of the node for which outputs should be retrieved.

\n
\n
Returns
\n

A list of all dagster events associated with provided node name.

\n
\n
Return type
\n

List[DagsterEvent]

\n
\n
\n
\n\n
\n
\nget_job_failure_event()[source]\u00b6
\n

Returns a DagsterEvent with type DagsterEventType.PIPELINE_FAILURE if it occurred during\nexecution.

\n
\n\n
\n
\nget_job_success_event()[source]\u00b6
\n

Returns a DagsterEvent with type DagsterEventType.PIPELINE_SUCCESS if it occurred during\nexecution.

\n
\n\n
\n
\noutput_for_node(node_str, output_name='result')[source]\u00b6
\n

Retrieves output value with a particular name from the in-process run of the job.

\n
\n
Parameters
\n
    \n
  • node_str (str) \u2013 Name of the op/graph whose output should be retrieved. If the intended\ngraph/op is nested within another graph, the syntax is outer_graph.inner_node.

  • \n
  • output_name (Optional[str]) \u2013 Name of the output on the op/graph to retrieve. Defaults to\nresult, the default output name in dagster.

  • \n
\n
\n
Returns
\n

The value of the retrieved output.

\n
\n
Return type
\n

Any

\n
\n
\n
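For example, a sketch assuming a job my_job containing an op named emit_one that returns 1 (both names are hypothetical):

result = my_job.execute_in_process()\nassert result.output_for_node('emit_one') == 1\n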
\n\n
\n
\noutput_value(output_name='result')[source]\u00b6
\n

Retrieves output of top-level job, if an output is returned.

\n

If the top-level job has no output, calling this method will result in a\nDagsterInvariantViolationError.

\n
\n
Parameters
\n

output_name (Optional[str]) \u2013 The name of the output to retrieve. Defaults to result,\nthe default output name in dagster.

\n
\n
Returns
\n

The value of the retrieved output.

\n
\n
Return type
\n

Any

\n
\n
\n
\n\n
\n
\nproperty run_id\u00b6
\n

The run id for the executed run

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty success\u00b6
\n

Whether execution was successful.

\n
\n
Type
\n

bool

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.DagsterEvent(event_type_value, pipeline_name, step_handle=None, solid_handle=None, step_kind_value=None, logging_tags=None, event_specific_data=None, message=None, pid=None, step_key=None)[source]\u00b6
\n

Events yielded by solid and pipeline execution.

\n

Users should not instantiate this class.

\n
\n
\nevent_type_value\u00b6
\n

Value for a DagsterEventType.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\npipeline_name\u00b6
\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nsolid_handle\u00b6
\n
\n
Type
\n

NodeHandle

\n
\n
\n
\n\n
\n
\nstep_kind_value\u00b6
\n

Value for a StepKind.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nlogging_tags\u00b6
\n
\n
Type
\n

Dict[str, str]

\n
\n
\n
\n\n
\n
\nevent_specific_data\u00b6
\n

Type must correspond to event_type_value.

\n
\n
Type
\n

Any

\n
\n
\n
\n\n
\n
\nmessage\u00b6
\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\npid\u00b6
\n
\n
Type
\n

int

\n
\n
\n
\n\n
\n
\nstep_key\u00b6
\n

DEPRECATED

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nproperty event_type\u00b6
\n

The type of this event.

\n
\n
Type
\n

DagsterEventType

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.DagsterEventType(value)[source]\u00b6
\n

The types of events that may be yielded by solid and pipeline execution.

\n
\n
\nALERT_FAILURE = 'ALERT_FAILURE'\u00b6
\n
\n\n
\n
\nALERT_START = 'ALERT_START'\u00b6
\n
\n\n
\n
\nALERT_SUCCESS = 'ALERT_SUCCESS'\u00b6
\n
\n\n
\n
\nASSET_MATERIALIZATION = 'ASSET_MATERIALIZATION'\u00b6
\n
\n\n
\n
\nASSET_MATERIALIZATION_PLANNED = 'ASSET_MATERIALIZATION_PLANNED'\u00b6
\n
\n\n
\n
\nASSET_OBSERVATION = 'ASSET_OBSERVATION'\u00b6
\n
\n\n
\n
\nASSET_STORE_OPERATION = 'ASSET_STORE_OPERATION'\u00b6
\n
\n\n
\n
\nENGINE_EVENT = 'ENGINE_EVENT'\u00b6
\n
\n\n
\n
\nHANDLED_OUTPUT = 'HANDLED_OUTPUT'\u00b6
\n
\n\n
\n
\nHOOK_COMPLETED = 'HOOK_COMPLETED'\u00b6
\n
\n\n
\n
\nHOOK_ERRORED = 'HOOK_ERRORED'\u00b6
\n
\n\n
\n
\nHOOK_SKIPPED = 'HOOK_SKIPPED'\u00b6
\n
\n\n
\n
\nLOADED_INPUT = 'LOADED_INPUT'\u00b6
\n
\n\n
\n
\nLOGS_CAPTURED = 'LOGS_CAPTURED'\u00b6
\n
\n\n
\n
\nOBJECT_STORE_OPERATION = 'OBJECT_STORE_OPERATION'\u00b6
\n
\n\n
\n
\nPIPELINE_CANCELED = 'PIPELINE_CANCELED'\u00b6
\n
\n\n
\n
\nPIPELINE_CANCELING = 'PIPELINE_CANCELING'\u00b6
\n
\n\n
\n
\nPIPELINE_DEQUEUED = 'PIPELINE_DEQUEUED'\u00b6
\n
\n\n
\n
\nPIPELINE_ENQUEUED = 'PIPELINE_ENQUEUED'\u00b6
\n
\n\n
\n
\nPIPELINE_FAILURE = 'PIPELINE_FAILURE'\u00b6
\n
\n\n
\n
\nPIPELINE_START = 'PIPELINE_START'\u00b6
\n
\n\n
\n
\nPIPELINE_STARTING = 'PIPELINE_STARTING'\u00b6
\n
\n\n
\n
\nPIPELINE_SUCCESS = 'PIPELINE_SUCCESS'\u00b6
\n
\n\n
\n
\nRUN_CANCELED = 'PIPELINE_CANCELED'\u00b6
\n
\n\n
\n
\nRUN_CANCELING = 'PIPELINE_CANCELING'\u00b6
\n
\n\n
\n
\nRUN_DEQUEUED = 'PIPELINE_DEQUEUED'\u00b6
\n
\n\n
\n
\nRUN_ENQUEUED = 'PIPELINE_ENQUEUED'\u00b6
\n
\n\n
\n
\nRUN_FAILURE = 'PIPELINE_FAILURE'\u00b6
\n
\n\n
\n
\nRUN_START = 'PIPELINE_START'\u00b6
\n
\n\n
\n
\nRUN_STARTING = 'PIPELINE_STARTING'\u00b6
\n
\n\n
\n
\nRUN_SUCCESS = 'PIPELINE_SUCCESS'\u00b6
\n
\n\n
\n
\nSTEP_EXPECTATION_RESULT = 'STEP_EXPECTATION_RESULT'\u00b6
\n
\n\n
\n
\nSTEP_FAILURE = 'STEP_FAILURE'\u00b6
\n
\n\n
\n
\nSTEP_INPUT = 'STEP_INPUT'\u00b6
\n
\n\n
\n
\nSTEP_OUTPUT = 'STEP_OUTPUT'\u00b6
\n
\n\n
\n
\nSTEP_RESTARTED = 'STEP_RESTARTED'\u00b6
\n
\n\n
\n
\nSTEP_SKIPPED = 'STEP_SKIPPED'\u00b6
\n
\n\n
\n
\nSTEP_START = 'STEP_START'\u00b6
\n
\n\n
\n
\nSTEP_SUCCESS = 'STEP_SUCCESS'\u00b6
\n
\n\n
\n
\nSTEP_UP_FOR_RETRY = 'STEP_UP_FOR_RETRY'\u00b6
\n
\n\n
\n\n
\n
\n

Reconstructable jobs\u00b6

\n
\n
\nclass dagster.reconstructable(target)[source]\u00b6
\n

Create a ReconstructablePipeline from a\nfunction that returns a PipelineDefinition/JobDefinition,\nor a function decorated with @pipeline/@job.

\n

When your pipeline/job must cross process boundaries, e.g., for execution on multiple nodes or\nin different systems (like dagstermill), Dagster must know how to reconstruct the pipeline/job\non the other side of the process boundary.

\n

Passing a job created with GraphDefinition.to_job to reconstructable()\nrequires you to wrap that job\u2019s definition in a module-scoped function, and pass that function\ninstead:

\n
from dagster import graph, reconstructable\n\n@graph\ndef my_graph():\n    ...\n\ndef define_my_job():\n    return my_graph.to_job()\n\nreconstructable(define_my_job)\n
\n
\n

This function implements a very conservative strategy for reconstruction, so that its behavior\nis easy to predict, but as a consequence it is not able to reconstruct certain kinds of pipelines\nor jobs, such as those defined by lambdas, in nested scopes (e.g., dynamically within a method\ncall), or in interactive environments such as the Python REPL or Jupyter notebooks.

\n

If you need to reconstruct objects constructed in these ways, you should use\nbuild_reconstructable_job() instead, which allows you to\nspecify your own reconstruction strategy.

\n

Examples:

\n
from dagster import job, reconstructable\n\n@job\ndef foo_job():\n    ...\n\nreconstructable_foo_job = reconstructable(foo_job)\n\n\n@graph\ndef foo():\n    ...\n\ndef make_bar_job():\n    return foo.to_job()\n\nreconstructable_bar_job = reconstructable(make_bar_job)\n
\n
\n
\n\n
\n
\n

Executors\u00b6

\n
\n
\ndagster.in_process_executor ExecutorDefinition[source]\u00b6
\n

The in-process executor executes all steps in a single process.

\n

For legacy pipelines, this will be the default executor. To select it explicitly,\ninclude the following top-level fragment in config:

\n
execution:\n  in_process:\n
\n
\n

Execution priority can be configured using the dagster/priority tag via solid/op metadata,\nwhere the higher the number the higher the priority. 0 is the default and both positive\nand negative numbers can be used.

\n
\n\n
\n
\ndagster.multiprocess_executor ExecutorDefinition[source]\u00b6
\n

The multiprocess executor executes each step in an individual process.

\n

Any job that does not specify custom executors will use the multiprocess_executor by default.\nFor jobs or legacy pipelines, to configure the multiprocess executor, include a fragment such\nas the following in your run config:

\n
execution:\n  config:\n    multiprocess:\n      max_concurrent: 4\n
\n
\n

The max_concurrent arg is optional and tells the execution engine how many processes may run\nconcurrently. By default, or if you set max_concurrent to be 0, this is the return value of\nmultiprocessing.cpu_count().

\n

Execution priority can be configured using the dagster/priority tag via solid/op metadata,\nwhere the higher the number the higher the priority. 0 is the default and both positive\nand negative numbers can be used.

\n
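A sketch of selecting this executor explicitly when defining a job (the job and op names are illustrative):

from dagster import job, multiprocess_executor, op\n\n@op\ndef do_work():\n    ...\n\n@job(executor_def=multiprocess_executor)\ndef parallel_job():\n    do_work()\n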
\n\n
\n
\n

Contexts\u00b6

\n
\n
\nclass dagster.OpExecutionContext(step_execution_context)[source]\u00b6
\n
\n
\nadd_output_metadata(metadata, output_name=None, mapping_key=None)\u00b6
\n

Add metadata to one of the outputs of an op.

\n

This can only be used once per output in the body of an op. Using this method with the same output_name more than once within an op will result in an error.

\n
\n
Parameters
\n
    \n
  • metadata (Mapping[str, Any]) \u2013 The metadata to attach to the output

  • \n
  • output_name (Optional[str]) \u2013 The name of the output to attach metadata to. If there is only one output on the op, then this argument does not need to be provided. The metadata will automatically be attached to the only output.

  • \n
\n
\n
\n

Examples:

\n
from dagster import Out, op\nfrom typing import Tuple\n\n@op\ndef add_metadata(context):\n    context.add_output_metadata({"foo": "bar"})\n    return 5 # Since the default output is called "result", metadata will be attached to the output "result".\n\n@op(out={"a": Out(), "b": Out()})\ndef add_metadata_two_outputs(context) -> Tuple[str, int]:\n    context.add_output_metadata({"foo": "bar"}, output_name="b")\n    context.add_output_metadata({"baz": "bat"}, output_name="a")\n\n    return ("dog", 5)\n
\n
\n
\n\n
\n
\nconsume_events()\u00b6
\n

Pops and yields all user-generated events that have been recorded from this context.

\n

If consume_events has not yet been called, this will yield all logged events since the beginning of the op\u2019s computation. If consume_events has been called, it will yield all events since the last time consume_events was called. Designed for internal use. Users should never need to invoke this method.

\n
\n\n
\n
\nget_mapping_key()\u00b6
\n

Which mapping_key this execution is for if downstream of a DynamicOutput, otherwise None.

\n
\n\n
\n
\nget_tag(key)\u00b6
\n

Get a logging tag.

\n
\n
Parameters
\n

key (str) \u2013 The tag to get.

\n
\n
Returns
\n

The value of the tag, if present.

\n
\n
Return type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nproperty has_partition_key\u00b6
\n

Whether the current run is a partitioned run

\n
\n\n
\n
\nhas_tag(key)\u00b6
\n

Check if a logging tag is set.

\n
\n
Parameters
\n

key (str) \u2013 The tag to check.

\n
\n
Returns
\n

Whether the tag is set.

\n
\n
Return type
\n

bool

\n
\n
\n
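For example, a sketch combining has_tag and get_tag (the op name and tag key are hypothetical):

@op\ndef report_owner(context):\n    if context.has_tag('owner'):\n        context.log.info('owner is ' + context.get_tag('owner'))\n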
\n\n
\n
\nproperty instance\u00b6
\n

The current Dagster instance

\n
\n
Type
\n

DagsterInstance

\n
\n
\n
\n\n
\n
\nproperty job_def\u00b6
\n

The currently executing job.

\n
\n
Type
\n

JobDefinition

\n
\n
\n
\n\n
\n
\nproperty job_name\u00b6
\n

The name of the currently executing job.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty log\u00b6
\n

The log manager available in the execution context.

\n
\n
Type
\n

DagsterLogManager

\n
\n
\n
\n\n
\n
\nlog_event(event)\u00b6
\n

Log an AssetMaterialization, AssetObservation, or ExpectationResult from within the body of an op.

\n

Events logged with this method will appear in the list of DagsterEvents, as well as the event log.

\n
\n
Parameters
\n

event (Union[AssetMaterialization, Materialization, AssetObservation, ExpectationResult]) \u2013 The event to log.

\n
\n
\n

Examples:

\n
from dagster import op, AssetMaterialization\n\n@op\ndef log_materialization(context):\n    context.log_event(AssetMaterialization("foo"))\n
\n
\n
\n\n
\n
\nproperty mode_def\u00b6
\n

The mode of the current execution.

\n
\n
Type
\n

ModeDefinition

\n
\n
\n
\n\n
\n
\nproperty op_def\u00b6
\n

The current op definition.

\n
\n
Type
\n

OpDefinition

\n
\n
\n
\n\n
\n
\noutput_asset_partition_key(output_name='result')\u00b6
\n

Returns the asset partition key for the given output. Defaults to \u201cresult\u201d, which is the\nname of the default output.

\n
\n\n
\n
\noutput_asset_partitions_time_window(output_name='result')\u00b6
\n

The time window for the partitions of the output asset.

\n

Raises an error if either of the following are true:\n- The output asset has no partitioning.\n- The output asset is not partitioned with a TimeWindowPartitionsDefinition.

\n
\n\n
\n
\nproperty partition_key\u00b6
\n

The partition key for the current run.

\n

Raises an error if the current run is not a partitioned run.

\n
\n\n
\n
\nproperty partition_time_window\u00b6
\n

The partition time window for the current run.

\n

Raises an error if the current run is not a partitioned run, or if the job\u2019s partition\ndefinition is not a TimeWindowPartitionsDefinition.

\n
\n\n
\n
\nproperty pdb\u00b6
\n

Gives access to pdb debugging from within the op.

\n

Example:

\n
@op\ndef debug(context):\n    context.pdb.set_trace()\n
\n
\n
\n
Type
\n

dagster.utils.forked_pdb.ForkedPdb

\n
\n
\n
\n\n
\n
\nproperty pipeline_def\u00b6
\n

The currently executing pipeline.

\n
\n
Type
\n

PipelineDefinition

\n
\n
\n
\n\n
\n
\nproperty pipeline_name\u00b6
\n

The name of the currently executing pipeline.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty pipeline_run\u00b6
\n

The current pipeline run

\n
\n
Type
\n

PipelineRun

\n
\n
\n
\n\n
\n
\nproperty resources\u00b6
\n

The currently available resources.

\n
\n
Type
\n

Resources

\n
\n
\n
\n\n
\n
\nproperty retry_number\u00b6
\n

Which retry attempt is currently executing i.e. 0 for initial attempt, 1 for first retry, etc.

\n
\n\n
\n
\nproperty run\u00b6
\n

The current run

\n
\n
Type
\n

DagsterRun

\n
\n
\n
\n\n
\n
\nproperty run_config\u00b6
\n

The run config for the current execution.

\n
\n
Type
\n

dict

\n
\n
\n
\n\n
\n
\nproperty run_id\u00b6
\n

The id of the current execution\u2019s run.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty solid_config\u00b6
\n

The parsed config specific to this solid.

\n
\n\n
\n
\nproperty solid_def\u00b6
\n

The current solid definition.

\n
\n
Type
\n

SolidDefinition

\n
\n
\n
\n\n
\n
\nproperty step_launcher\u00b6
\n

The current step launcher, if any.

\n
\n
Type
\n

Optional[StepLauncher]

\n
\n
\n
\n\n
\n\n
\n
\ndagster.build_op_context(resources=None, op_config=None, resources_config=None, instance=None, config=None, partition_key=None, mapping_key=None)[source]\u00b6
\n

Builds op execution context from provided parameters.

\n

op is currently built on top of solid, and thus this function creates a SolidExecutionContext.\nbuild_op_context can be used as either a function or context manager. If there is a\nprovided resource that is a context manager, then build_op_context must be used as a\ncontext manager. This function can be used to provide the context argument when directly\ninvoking an op.

\n
\n
Parameters
\n
    \n
  • resources (Optional[Dict[str, Any]]) \u2013 The resources to provide to the context. These can be\neither values or resource definitions.

  • \n
  • config (Optional[Any]) \u2013 The op config to provide to the context.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The dagster instance configured for the context.\nDefaults to DagsterInstance.ephemeral().

  • \n
  • mapping_key (Optional[str]) \u2013 A key representing the mapping key from an upstream dynamic output. Can be accessed using context.get_mapping_key().

  • \n
\n
\n
\n

Examples

\n
context = build_op_context()\nop_to_invoke(context)\n\nwith build_op_context(resources={"foo": context_manager_resource}) as context:\n    op_to_invoke(context)\n
\n
\n
\n\n
\n
\nclass dagster.TypeCheckContext(run_id, log_manager, scoped_resources_builder, dagster_type)[source]\u00b6
\n

The context object available to a type check function on a DagsterType.

\n
\n
\nlog\u00b6
\n

Centralized log dispatch from user code.

\n
\n
Type
\n

DagsterLogManager

\n
\n
\n
\n\n
\n
\nresources\u00b6
\n

An object whose attributes contain the resources available to this op.

\n
\n
Type
\n

Any

\n
\n
\n
\n\n
\n
\nrun_id\u00b6
\n

The id of this job run.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
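A minimal sketch of a type check function that receives this context (the type and function names are hypothetical):

from dagster import DagsterType\n\ndef positive_int_check(context, value):\n    context.log.info('type checking in run ' + context.run_id)\n    return isinstance(value, int) and value > 0\n\nPositiveInt = DagsterType(name='PositiveInt', type_check_fn=positive_int_check)\n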
\n\n
\n
\n

Job configuration\u00b6

\n
\n
\ndagster.validate_run_config(job_def=None, run_config=None, mode=None, pipeline_def=None)[source]\u00b6
\n

Function to validate a provided run config blob against a given job. For legacy APIs, a\npipeline/mode can also be passed in.

\n

If validation is successful, this function will return a dictionary representation of the\nvalidated config actually used during execution.

\n
\n
Parameters
\n
    \n
  • job_def (Union[PipelineDefinition, JobDefinition]) \u2013 The job definition to validate run\nconfig against

  • \n
  • run_config (Optional[Dict[str, Any]]) \u2013 The run config to validate

  • \n
  • mode (str) \u2013 The mode of the pipeline to validate against (different modes may require\ndifferent config)

  • \n
  • pipeline_def (PipelineDefinition) \u2013 The pipeline definition to validate run config against.

  • \n
\n
\n
Returns
\n

A dictionary representation of the validated config.

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
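A minimal sketch of validating a run config blob (the job, op, and config key names are hypothetical):

from dagster import job, op, validate_run_config\n\n@op(config_schema={'word': str})\ndef print_word(context):\n    context.log.info(context.op_config['word'])\n\n@job\ndef wordy_job():\n    print_word()\n\nvalidate_run_config(\n    wordy_job,\n    run_config={'ops': {'print_word': {'config': {'word': 'hello'}}}},\n)\n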
\n\n
\n

Run Config Schema\u00b6

\n
\n

The run_config used for jobs has the following schema:

\n
{\n  # configuration for execution, required if executors require config\n  execution: {\n    # the name of one, and only one available executor, typically 'in_process' or 'multiprocess'\n    __executor_name__: {\n      # executor-specific config, if required or permitted\n      config: {\n        ...\n      }\n    }\n  },\n\n  # configuration for loggers, required if loggers require config\n  loggers: {\n    # the name of an available logger\n    __logger_name__: {\n      # logger-specific config, if required or permitted\n      config: {\n        ...\n      }\n    },\n    ...\n  },\n\n  # configuration for resources, required if resources require config\n  resources: {\n    # the name of a resource\n    __resource_name__: {\n      # resource-specific config, if required or permitted\n      config: {\n        ...\n      }\n    },\n    ...\n  },\n\n  # configuration for underlying ops, required if ops require config\n  ops: {\n\n    # these keys align with the names of the ops, or their alias in this job\n    __op_name__: {\n\n      # pass any data that was defined via config_field\n      config: ...,\n\n      # configurably specify input values, keyed by input name\n      inputs: {\n        __input_name__: {\n          # if an dagster_type_loader is specified, that schema must be satisfied here;\n          # scalar, built-in types will generally allow their values to be specified directly:\n          value: ...\n        }\n      },\n\n    }\n  },\n\n}\n
\n
\n
\n
\n
\n
\n", "current_page_name": "sections/api/apidocs/execution", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../graphs/", "title": "Graphs"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../errors/", "title": "Errors"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/graphs", "Graphs", "N", "next"], ["sections/api/apidocs/errors", "Errors", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/execution.rst.txt", "title": "Execution", "toc": "\n"}, "graphs": {"alabaster_version": "0.7.12", "body": "
\n

Graphs\u00b6

\n

The core of a job is a graph of ops, connected via data dependencies.

\n
\n
\n@dagster.graph(name=None, description=None, input_defs=None, output_defs=None, ins=None, out=None, tags=None, config=None)[source]\u00b6
\n

Create a graph with the specified parameters from the decorated composition function.

\n

Using this decorator allows you to build up a dependency graph by writing a\nfunction that invokes ops (or other graphs) and passes the output to subsequent invocations.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name of the graph. Must be unique within any RepositoryDefinition containing the graph.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the graph.

  • \n
  • input_defs (Optional[List[InputDefinition]]) \u2013

    Information about the inputs that this graph maps. Information provided here\nwill be combined with what can be inferred from the function signature, with these\nexplicit InputDefinitions taking precedence.

    \n

    Uses of inputs in the body of the decorated composition function will determine\nthe InputMappings passed to the underlying\nGraphDefinition.

    \n

  • \n
  • output_defs (Optional[List[OutputDefinition]]) \u2013

    Output definitions for the graph. If not provided explicitly, these will be inferred from typehints.

    \n

    Uses of these outputs in the body of the decorated composition function, as well as the\nreturn value of the decorated function, will be used to infer the appropriate set of\nOutputMappings for the underlying\nGraphDefinition.

    \n

    To map multiple outputs, return a dictionary from the composition function.

    \n

  • \n
  • ins (Optional[Dict[str, GraphIn]]) \u2013 Information about the inputs that this graph maps. Information provided here\nwill be combined with what can be inferred from the function signature, with these\nexplicit GraphIn taking precedence.

  • \n
  • out \u2013

    Information about the outputs that this graph maps. Information provided here will be\ncombined with what can be inferred from the return type signature if the function does\nnot use yield.

    \n

    To map multiple outputs, return a dictionary from the composition function.

    \n

  • \n
\n
\n
\n
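A minimal sketch of composing ops with the decorator (the op and graph names are illustrative):

@op\ndef return_one():\n    return 1\n\n@op\ndef add_one(num):\n    return num + 1\n\n@graph\ndef one_plus_one():\n    return add_one(return_one())\n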
\n\n
\n
\nclass dagster.GraphDefinition(name, description=None, node_defs=None, dependencies=None, input_mappings=None, output_mappings=None, config=None, tags=None, **kwargs)[source]\u00b6
\n

Defines a Dagster graph.

\n

A graph is made up of

\n
    \n
  • Nodes, which can either be an op (the functional unit of computation), or another graph.

  • \n
  • Dependencies, which determine how the values produced by nodes as outputs flow from\none node to another. This tells Dagster how to arrange nodes into a directed, acyclic graph\n(DAG) of compute.

  • \n
\n

End users should prefer the @graph decorator. GraphDefinition is generally\nintended to be used by framework authors or for programmatically generated graphs.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of the graph. Must be unique within any GraphDefinition\nor JobDefinition containing the graph.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the graph.

  • \n
  • node_defs (Optional[List[NodeDefinition]]) \u2013 The set of ops / graphs used in this graph.

  • \n
  • dependencies (Optional[Dict[Union[str, NodeInvocation], Dict[str, DependencyDefinition]]]) \u2013 A structure that declares the dependencies of each op\u2019s inputs on the outputs of other\nops in the graph. Keys of the top level dict are either the string names of ops in the\ngraph or, in the case of aliased ops, NodeInvocations.\nValues of the top level dict are themselves dicts, which map input names belonging to\nthe op or aliased op to DependencyDefinitions.

  • \n
  • input_mappings (Optional[List[InputMapping]]) \u2013 Defines the inputs to the nested graph, and\nhow they map to the inputs of its constituent ops.

  • \n
  • output_mappings (Optional[List[OutputMapping]]) \u2013 Defines the outputs of the nested graph,\nand how they map from the outputs of its constituent ops.

  • \n
  • config (Optional[ConfigMapping]) \u2013 Defines the config of the graph, and how its schema maps\nto the config of its constituent ops.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for any execution of the graph.\nValues that are not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value. These tag values may be overwritten by tag\nvalues provided at invocation time.

  • \n
\n
\n
\n

Examples

\n
@op\ndef return_one():\n    return 1\n\n@op\ndef add_one(num):\n    return num + 1\n\ngraph_def = GraphDefinition(\n    name='basic',\n    node_defs=[return_one, add_one],\n    dependencies={'add_one': {'num': DependencyDefinition('return_one')}},\n)\n
\n
\n
\n
\nexecute_in_process(run_config=None, instance=None, resources=None, raise_on_error=True, op_selection=None, run_id=None, input_values=None)[source]\u00b6
\n

Execute this graph in-process, collecting results in-memory.

\n
\n
Parameters
\n
    \n
  • run_config (Optional[Dict[str, Any]]) \u2013 Run config to provide to execution. The configuration for the underlying graph\nshould exist under the \u201cops\u201d key.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The instance to execute against, an ephemeral one will be used if none provided.

  • \n
  • resources (Optional[Dict[str, Any]]) \u2013 The resources needed if any are required. Can provide resource instances directly,\nor resource definitions.

  • \n
  • raise_on_error (Optional[bool]) \u2013 Whether or not to raise exceptions when they occur.\nDefaults to True.

  • \n
  • op_selection (Optional[List[str]]) \u2013 A list of op selection queries (including single op\nnames) to execute. For example:\n* ['some_op']: selects some_op itself.\n* ['*some_op']: select some_op and all its ancestors (upstream dependencies).\n* ['*some_op+++']: select some_op, all its ancestors, and its descendants\n(downstream dependencies) within 3 levels down.\n* ['*some_op', 'other_op_a', 'other_op_b+']: select some_op and all its\nancestors, other_op_a itself, and other_op_b and its direct child ops.

  • \n
  • input_values (Optional[Mapping[str, Any]]) \u2013 A dictionary that maps python objects to the top-level inputs of the graph.

  • \n
\n
\n
Returns
\n

ExecuteInProcessResult

\n
\n
\n
\n\n
\n
\nto_job(name=None, description=None, resource_defs=None, config=None, tags=None, logger_defs=None, executor_def=None, hooks=None, op_retry_policy=None, version_strategy=None, op_selection=None, partitions_def=None, asset_layer=None, input_values=None, _asset_selection_data=None)[source]\u00b6
\n

Make this graph into an executable Job by providing the remaining components required for execution.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name for the Job. Defaults to the name of this graph.

  • \n
  • resource_defs (Optional[Dict[str, ResourceDefinition]]) \u2013 Resources that are required by this graph for execution.\nIf not defined, io_manager will default to filesystem.

  • \n
  • config \u2013

    Describes how the job is parameterized at runtime.

    \n

    If no value is provided, then the schema for the job\u2019s run config is a standard\nformat based on its ops and resources.

    \n

    If a dictionary is provided, then it must conform to the standard config schema, and\nit will be used as the job\u2019s run config whenever the job is executed.\nThe values provided will be viewable and editable in the Dagit playground, so be\ncareful with secrets.

    \n

    If a ConfigMapping object is provided, then the schema for the job\u2019s run config is\ndetermined by the config mapping, whose config_fn should return\nconfiguration in the standard format to configure the job.

    \n

    If a PartitionedConfig object is provided, then it defines a discrete set of config\nvalues that can parameterize the job, as well as a function for mapping those\nvalues to the base config. The values provided will be viewable and editable in the\nDagit playground, so be careful with secrets.

    \n

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for any execution of the Job.\nValues that are not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value. These tag values may be overwritten by tag\nvalues provided at invocation time.

  • \n
  • logger_defs (Optional[Dict[str, LoggerDefinition]]) \u2013 A dictionary of string logger identifiers to their implementations.

  • \n
  • executor_def (Optional[ExecutorDefinition]) \u2013 How this Job will be executed. Defaults to multi_or_in_process_executor,\nwhich can be switched between multi-process and in-process modes of execution. The\ndefault mode of execution is multi-process.

  • \n
  • op_retry_policy (Optional[RetryPolicy]) \u2013 The default retry policy for all ops in this job.\nOnly used if retry policy is not defined on the op definition or op invocation.

  • \n
  • version_strategy (Optional[VersionStrategy]) \u2013 Defines how each solid (and optionally, resource) in the job can be versioned. If\nprovided, memoization will be enabled for this job.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines a discrete set of partition\nkeys that can parameterize the job. If this argument is supplied, the config\nargument can\u2019t also be supplied.

  • \n
  • asset_layer (Optional[AssetLayer]) \u2013 Top level information about the assets this job\nwill produce. Generally should not be set manually.

  • \n
  • input_values (Optional[Mapping[str, Any]]) \u2013 A dictionary that maps python objects to the top-level inputs of a job.

  • \n
\n
\n
Returns
\n

JobDefinition

\n
\n
\n
\n\n
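For example, a minimal sketch of turning the graph defined above into a job and executing it (graph_def is the example graph from the top of this page; the tag values are illustrative):
basic_job = graph_def.to_job(name="basic_job", tags={"team": "data"})\nresult = basic_job.execute_in_process()\n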
\n\n
\n
\nclass dagster.GraphIn(description=None)[source]\u00b6
\n

Represents information about an input that a graph maps.

\n
\n
Parameters
\n

description (Optional[str]) \u2013 Human-readable description of the input.

\n
\n
\n
\n\n
\n
\nclass dagster.GraphOut(description=None)[source]\u00b6
\n

Represents information about the outputs that a graph maps.

\n
\n
Parameters
\n

description (Optional[str]) \u2013 Human-readable description of the output.

\n
\n
\n
\n\n
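As a sketch, GraphIn and GraphOut can be passed to the @graph decorator to describe the input and output that a graph maps (add_one is the example op from the top of this page):
from dagster import GraphIn, GraphOut, graph\n\n@graph(\n    ins={"num": GraphIn(description="The number to increment")},\n    out=GraphOut(description="The incremented number"),\n)\ndef add_one_graph(num):\n    return add_one(num)\n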
\n

Explicit dependencies\u00b6

\n
\n
\nclass dagster.DependencyDefinition(solid=None, output='result', description=None, node=None)[source]\u00b6
\n

Represents an edge in the DAG of nodes (ops or graphs) forming a job.

\n

This object is used at the leaves of a dictionary structure that represents the complete\ndependency structure of a job whose keys represent the dependent node and dependent\ninput, so this object only contains information about the dependee.

\n

Concretely, if the input named \u2018input\u2019 of op_b depends on the output named \u2018result\u2019 of\nop_a, and the input named \u2018input\u2019 of op_c depends on the output named \u2018other_result\u2019 of\ngraph_a, the structure will look as follows:

\n
dependency_structure = {\n    'op_b': {\n        'input': DependencyDefinition('op_a', 'result')\n    },\n    'op_c': {\n        'input': DependencyDefinition('graph_a', 'other_result')\n    },\n}\n
\n
\n

In general, users should prefer not to construct this class directly or use the\nJobDefinition API that requires instances of this class. Instead, use the\n@job API:

\n
@job\ndef the_job():\n    node_b(node_a())\n
\n
\n
\n
Parameters
\n
    \n
  • solid (str) \u2013 (legacy) The name of the solid that is depended on, that is, from which the value\npassed between the two nodes originates.

  • \n
  • output (Optional[str]) \u2013 The name of the output that is depended on. (default: \u201cresult\u201d)

  • \n
  • description (Optional[str]) \u2013 Human-readable description of this dependency.

  • \n
  • node (str) \u2013 The name of the node (op or graph) that is depended on, that is, from which the value\npassed between the two nodes originates.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.MultiDependencyDefinition(dependencies)[source]\u00b6
\n

Represents a fan-in edge in the DAG of op instances forming a job.

\n

This object is used only when an input of type List[T] is assembled by fanning-in multiple\nupstream outputs of type T.

\n

This object is used at the leaves of a dictionary structure that represents the complete\ndependency structure of a job or pipeline whose keys represent the dependent ops or graphs and dependent\ninput, so this object only contains information about the dependee.

\n

Concretely, if the input named \u2018input\u2019 of op_c depends on the outputs named \u2018result\u2019 of\nop_a and op_b, this structure will look as follows:

\n
dependency_structure = {\n    'op_c': {\n        'input': MultiDependencyDefinition(\n            [\n                DependencyDefinition('op_a', 'result'),\n                DependencyDefinition('op_b', 'result')\n            ]\n        )\n    }\n}\n
\n
\n

In general, users should prefer not to construct this class directly or use the\nJobDefinition API that requires instances of this class. Instead, use the\n@job API:

\n
@job\ndef the_job():\n    op_c(op_a(), op_b())\n
\n
\n
\n
Parameters
\n

dependencies (List[Union[DependencyDefinition, Type[MappedInputPlaceHolder]]]) \u2013 List of\nupstream dependencies fanned in to this input.

\n
\n
\n
\n\n
\n
\nclass dagster.NodeInvocation(name, alias=None, tags=None, hook_defs=None, retry_policy=None)[source]\u00b6
\n

Identifies an instance of a node in a graph dependency structure.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name of the solid of which this is an instance.

  • \n
  • alias (Optional[str]) \u2013 Name specific to this instance of the solid. Necessary when there are\nmultiple instances of the same solid.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Optional tags values to extend or override those\nset on the solid definition.

  • \n
  • hook_defs (Optional[AbstractSet[HookDefinition]]) \u2013 A set of hook definitions applied to the\nsolid instance.

  • \n
\n
\n
\n

Examples:

\n

In general, users should prefer not to construct this class directly or use the\nJobDefinition API that requires instances of this class. Instead, use the\n@job API:

\n
from dagster import job\n\n@job\ndef my_job():\n    other_name = some_op.alias('other_name')\n    some_graph(other_name(some_op()))\n
\n
\n
\n\n
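NodeInvocation instances can also be used as keys of the dependency dictionary passed to GraphDefinition, for example to alias a node; a minimal sketch reusing return_one and add_one from the example at the top of this page:
from dagster import DependencyDefinition, GraphDefinition, NodeInvocation\n\ngraph_def = GraphDefinition(\n    name="aliased",\n    node_defs=[return_one, add_one],\n    dependencies={\n        NodeInvocation("add_one", alias="add_one_renamed"): {\n            "num": DependencyDefinition("return_one")\n        }\n    },\n)\n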
\n
\n", "current_page_name": "sections/api/apidocs/graphs", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../hooks/", "title": "Hooks"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../execution/", "title": "Execution"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/hooks", "Hooks", "N", "next"], ["sections/api/apidocs/execution", "Execution", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/graphs.rst.txt", "title": "Graphs", "toc": "\n"}, "hooks": {"alabaster_version": "0.7.12", "body": "
\n

Hooks\u00b6

\n
\n
\n@dagster.success_hook(name=None, required_resource_keys=None)[source]\u00b6
\n

Create a hook on step success events with the specified parameters from the decorated function.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name of this hook.

  • \n
  • required_resource_keys (Optional[AbstractSet[str]]) \u2013 Keys for the resources required by the\nhook.

  • \n
\n
\n
\n

Examples

\n
@success_hook(required_resource_keys={'slack'})\ndef slack_message_on_success(context):\n    message = 'op {} succeeded'.format(context.op.name)\n    context.resources.slack.send_message(message)\n\n@success_hook\ndef do_something_on_success(context):\n    do_something()\n
\n
\n
\n\n
\n
\n@dagster.failure_hook(name=None, required_resource_keys=None)[source]\u00b6
\n

Create a hook on step failure events with the specified parameters from the decorated function.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name of this hook.

  • \n
  • required_resource_keys (Optional[AbstractSet[str]]) \u2013 Keys for the resources required by the\nhook.

  • \n
\n
\n
\n

Examples

\n
@failure_hook(required_resource_keys={'slack'})\ndef slack_message_on_failure(context):\n    message = 'op {} failed'.format(context.op.name)\n    context.resources.slack.send_message(message)\n\n@failure_hook\ndef do_something_on_failure(context):\n    do_something()\n
\n
\n
\n\n
\n
\nclass dagster.HookDefinition(name, hook_fn, required_resource_keys=None, decorated_fn=None)[source]\u00b6
\n

Define a hook which can be triggered during an op execution (e.g. a callback on the step\nexecution failure event during an op execution).

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of this hook.

  • \n
  • hook_fn (Callable) \u2013 The callback function that will be triggered.

  • \n
  • required_resource_keys (Optional[AbstractSet[str]]) \u2013 Keys for the resources required by the\nhook.

  • \n
\n
\n
\n
\n\n
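Hooks are usually created with the decorators above rather than by constructing HookDefinition directly, and can then be attached to every op in a job; a minimal sketch reusing slack_message_on_success from the example above (my_op is a hypothetical op):
from dagster import job\n\n@job(hooks={slack_message_on_success})\ndef my_hooked_job():\n    my_op()\n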
\n
\nclass dagster.HookContext(step_execution_context, hook_def)[source]\u00b6
\n

The context object available to a hook function on a DagsterEvent.

\n
\n
\nlog\u00b6
\n

Centralized log dispatch from user code.

\n
\n
Type
\n

DagsterLogManager

\n
\n
\n
\n\n
\n
\nhook_def\u00b6
\n

The hook that the context object belongs to.

\n
\n
Type
\n

HookDefinition

\n
\n
\n
\n\n
\n
\nsolid\u00b6
\n

The solid instance associated with the hook.

\n
\n
Type
\n

Solid

\n
\n
\n
\n\n
\n
\nop\u00b6
\n

The op instance associated with the hook.

\n
\n
Type
\n

Op

\n
\n
\n
\n\n
\n
\nstep_key\u00b6
\n

The key for the step where this hook is being triggered.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nrequired_resource_keys\u00b6
\n

Resources required by this hook.

\n
\n
Type
\n

Set[str]

\n
\n
\n
\n\n
\n
\nresources\u00b6
\n

Resources available in the hook context.

\n
\n
Type
\n

Resources

\n
\n
\n
\n\n
\n
\nsolid_config\u00b6
\n

The parsed config specific to this solid.

\n
\n
Type
\n

Any

\n
\n
\n
\n\n
\n
\nop_config\u00b6
\n

The parsed config specific to this op.

\n
\n
Type
\n

Any

\n
\n
\n
\n\n
\n
\npipeline_name\u00b6
\n

The name of the pipeline where this hook is being triggered.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\njob_name\u00b6
\n

The name of the job where this hook is being triggered.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nrun_id\u00b6
\n

The id of the run where this hook is being triggered.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nmode_def\u00b6
\n

The mode with which the pipeline is being run.

\n
\n
Type
\n

ModeDefinition

\n
\n
\n
\n\n
\n
\nop_exception\u00b6
\n

The thrown exception in a failed op.

\n
\n
Type
\n

Optional[BaseException]

\n
\n
\n
\n\n
\n
\nop_output_values\u00b6
\n

Computed output values in an op.

\n
\n
Type
\n

Dict

\n
\n
\n
\n\n
\n
\nproperty solid_exception\u00b6
\n

The thrown exception in a failed solid.

\n
\n
Returns
\n

the exception object, None if the solid execution succeeds.

\n
\n
Return type
\n

Optional[BaseException]

\n
\n
\n
\n\n
\n
\nproperty solid_output_values\u00b6
\n

The computed output values.

\n
\n
Returns a dictionary where keys are output names and the values are:
    \n
  • the output values in the normal case

  • \n
  • a dictionary from mapping key to corresponding value in the mapped case

  • \n
\n
\n
\n
\n\n
\n\n
\n
\ndagster.build_hook_context(resources=None, mode_def=None, solid=None, op=None, run_id=None, job_name=None, op_exception=None)[source]\u00b6
\n

Builds hook context from provided parameters.

\n

build_hook_context can be used as either a function or a context manager. If there is a\nprovided resource to build_hook_context that is a context manager, then it must be used as a\ncontext manager. This function can be used to provide the context argument to the invocation of\na hook definition.

\n
\n
Parameters
\n
    \n
  • resources (Optional[Dict[str, Any]]) \u2013 The resources to provide to the context. These can\neither be values or resource definitions.

  • \n
  • mode_def (Optional[ModeDefinition]) \u2013 The mode definition used with the context.

  • \n
  • op (Optional[OpDefinition, PendingNodeInvocation]) \u2013 The op definition which the\nhook may be associated with.

  • \n
  • solid (Optional[SolidDefinition, PendingNodeInvocation]) \u2013 (legacy) The solid definition which the\nhook may be associated with.

  • \n
  • run_id (Optional[str]) \u2013 The id of the run in which the hook is invoked (provided for mocking purposes).

  • \n
  • job_name (Optional[str]) \u2013 The name of the job in which the hook is used (provided for mocking purposes).

  • \n
  • op_exception (Optional[Exception]) \u2013 The exception that caused the hook to be triggered.

  • \n
\n
\n
\n

Examples

\n
context = build_hook_context()\nhook_to_invoke(context)\n\nwith build_hook_context(resources={"foo": context_manager_resource}) as context:\n    hook_to_invoke(context)\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/hooks", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "\n", "next": {"link": "../internals/", "title": "Internals"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../graphs/", "title": "Graphs"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/internals", "Internals", "N", "next"], ["sections/api/apidocs/graphs", "Graphs", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/hooks.rst.txt", "title": "Hooks", "toc": "\n"}, "internals": {"alabaster_version": "0.7.12", "body": "
\n

Internals\u00b6

\n

Please note that internal APIs are likely to be in much greater flux pre-1.0 than user-facing APIs,\nparticularly if not exported in the top level dagster module.

\n

If you find yourself consulting these docs because you are writing custom components and plug-ins,\nplease get in touch with the core team on our Slack.\nWe\u2019re curious what you\u2019re up to, happy to help, excited for new community contributions, and eager\nto make the system as easy to work with as possible \u2013 including for teams who are looking to\ncustomize it.

\n
\n

Executors\u00b6

\n
\n
\n@dagster.executor(name=None, config_schema=None, requirements=None)[source]\u00b6
\n

Define an executor.

\n

The decorated function should accept an InitExecutorContext and return an instance\nof Executor.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name of the executor.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.executor_config. If not set, Dagster will accept any config provided.

  • \n
  • requirements (Optional[List[ExecutorRequirement]]) \u2013 Any requirements that must\nbe met in order for the executor to be usable for a particular pipeline execution.

  • \n
\n
\n
\n
\n\n
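For example, a minimal sketch of the decorator form, where MyCustomExecutor is a hypothetical Executor subclass:
from dagster import executor\n\n@executor(name="my_custom_executor")\ndef my_custom_executor(init_context):\n    # init_context is an InitExecutorContext; MyCustomExecutor is a hypothetical Executor subclass\n    return MyCustomExecutor()\n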
\n
\nclass dagster.ExecutorDefinition(name, config_schema=None, requirements=None, executor_creation_fn=None, description=None)[source]\u00b6
\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of the executor.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data\navailable in init_context.executor_config. If not set, Dagster will accept any config\nprovided.

  • \n
  • requirements (Optional[List[ExecutorRequirement]]) \u2013 Any requirements that must\nbe met in order for the executor to be usable for a particular pipeline execution.

  • \n
  • executor_creation_fn (Optional[Callable]) \u2013 Should accept an InitExecutorContext\nand return an instance of Executor

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Keys for the resources required by the\nexecutor.

  • \n
\n
\n
\n
\n
\nconfigured(config_or_config_fn, name=None, config_schema=None, description=None)[source]\u00b6
\n

Wraps this object in an object of the same type that provides configuration to the inner\nobject.

\n
\n
Parameters
\n
    \n
  • config_or_config_fn (Union[Any, Callable[[Any], Any]]) \u2013 Either (1) Run configuration\nthat fully satisfies this object\u2019s config schema or (2) A function that accepts run\nconfiguration and returns run configuration that fully satisfies this object\u2019s\nconfig schema. In the latter case, config_schema must be specified. When\npassing a function, it\u2019s easiest to use configured().

  • \n
  • name (Optional[str]) \u2013 Name of the new definition. If not provided, the emitted\ndefinition will inherit the name of the ExecutorDefinition upon which this\nfunction is called.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 If config_or_config_fn is a function, the config\nschema that its input must satisfy. If not set, Dagster will accept any config\nprovided.

  • \n
  • description (Optional[str]) \u2013 Description of the new definition. If not specified,\ninherits the description of the definition being configured.

  • \n
\n
\n
\n

Returns (ConfigurableDefinition): A configured version of this object.

\n
\n\n
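For example, a sketch of configuring the built-in multiprocess executor with a fixed concurrency limit (assuming max_concurrent is the relevant key in its config schema):
from dagster import multiprocess_executor\n\nlimited_executor = multiprocess_executor.configured(\n    {"max_concurrent": 4}, name="limited_multiprocess_executor"\n)\n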
\n\n
\n
\nclass dagster.InitExecutorContext(job, executor_def, executor_config, instance)[source]\u00b6
\n

Executor-specific initialization context.

\n
\n
\njob\u00b6
\n

The job to be executed.

\n
\n
Type
\n

IPipeline

\n
\n
\n
\n\n
\n
\nexecutor_def\u00b6
\n

The definition of the executor currently being\nconstructed.

\n
\n
Type
\n

ExecutorDefinition

\n
\n
\n
\n\n
\n
\nexecutor_config\u00b6
\n

The parsed config passed to the executor.

\n
\n
Type
\n

dict

\n
\n
\n
\n\n
\n
\ninstance\u00b6
\n

The current instance.

\n
\n
Type
\n

DagsterInstance

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.Executor[source]\u00b6
\n
\n
\nabstract execute(plan_context, execution_plan)[source]\u00b6
\n

For the given context and execution plan, orchestrate a series of sub plan executions in a way that satisfies the whole plan being executed.

\n
\n
Parameters
\n
    \n
  • plan_context (PlanOrchestrationContext) \u2013 The plan\u2019s orchestration context.

  • \n
  • execution_plan (ExecutionPlan) \u2013 The plan to execute.

  • \n
\n
\n
Returns
\n

A stream of dagster events.

\n
\n
\n
\n\n
\n
\nabstract property retries\u00b6
\n

Whether retries are enabled or disabled for this instance of the executor.

\n

Executors should allow this to be controlled via configuration if possible.

\n

Returns: RetryMode

\n
\n\n
\n\n
\n
\n
\n

File Manager\u00b6

\n
\n
\nclass dagster.core.storage.file_manager.FileManager[source]\u00b6
\n

Base class for all file managers in dagster.

\n

The file manager is an interface that can be implemented by resources to provide abstract\naccess to a file system such as local disk, S3, or other cloud storage.

\n

For examples of usage, see the documentation of the concrete file manager implementations.

\n
\n
\nabstract copy_handle_to_local_temp(file_handle)[source]\u00b6
\n

Copy a file represented by a file handle to a temp file.

\n

In an implementation built around an object store such as S3, this method would be expected\nto download the file from S3 to local filesystem in a location assigned by the standard\nlibrary\u2019s tempfile module.

\n

Temp files returned by this method are not guaranteed to be reusable across solid\nboundaries. For files that must be available across solid boundaries, use the\nread(),\nread_data(),\nwrite(), and\nwrite_data() methods.

\n
\n
Parameters
\n

file_handle (FileHandle) \u2013 The handle to the file to make available as a local temp file.

\n
\n
Returns
\n

Path to the local temp file.

\n
\n
Return type
\n

str

\n
\n
\n
\n\n
\n
\nabstract delete_local_temp()[source]\u00b6
\n

Delete all local temporary files created by previous calls to\ncopy_handle_to_local_temp().

\n

Should typically only be called by framework implementors.

\n
\n\n
\n
\nabstract read(file_handle, mode='rb')[source]\u00b6
\n

Return a file-like stream for the file handle.

\n

This may incur an expensive network call for file managers backed by object stores\nsuch as S3.

\n
\n
Parameters
\n
    \n
  • file_handle (FileHandle) \u2013 The file handle to make available as a stream.

  • \n
  • mode (str) \u2013 The mode in which to open the file. Default: "rb".

  • \n
\n
\n
Returns
\n

A file-like stream.

\n
\n
Return type
\n

Union[TextIO, BinaryIO]

\n
\n
\n
\n\n
\n
\nabstract read_data(file_handle)[source]\u00b6
\n

Return the bytes for a given file handle. This may incur an expensive network\ncall for file managers backed by object stores such as S3.

\n
\n
Parameters
\n

file_handle (FileHandle) \u2013 The file handle for which to return bytes.

\n
\n
Returns
\n

Bytes for a given file handle.

\n
\n
Return type
\n

bytes

\n
\n
\n
\n\n
\n
\nabstract write(file_obj, mode='wb', ext=None)[source]\u00b6
\n

Write the bytes contained within the given file object into the file manager.

\n
\n
Parameters
\n
    \n
  • file_obj (Union[TextIO, StringIO]) \u2013 A file-like object.

  • \n
  • mode (Optional[str]) \u2013 The mode in which to write the file into the file manager.\nDefault: "wb".

  • \n
  • ext (Optional[str]) \u2013 For file managers that support file extensions, the extension with\nwhich to write the file. Default: None.

  • \n
\n
\n
Returns
\n

A handle to the newly created file.

\n
\n
Return type
\n

FileHandle

\n
\n
\n
\n\n
\n
\nabstract write_data(data, ext=None)[source]\u00b6
\n

Write raw bytes into the file manager.

\n
\n
Parameters
\n
    \n
  • data (bytes) \u2013 The bytes to write into the file manager.

  • \n
  • ext (Optional[str]) \u2013 For file managers that support file extensions, the extension with\nwhich to write the file. Default: None.

  • \n
\n
\n
Returns
\n

A handle to the newly created file.

\n
\n
Return type
\n

FileHandle

\n
\n
\n
\n\n
\n\n
\n
\ndagster.local_file_manager ResourceDefinition[source]\u00b6
\n

FileManager that provides abstract access to a local filesystem.

\n

By default, files will be stored in <local_artifact_storage>/storage/file_manager where\n<local_artifact_storage> can be configured in the dagster.yaml file in $DAGSTER_HOME.

\n

Implements the FileManager API.

\n

Examples:

\n
import tempfile\n\nfrom dagster import ModeDefinition, local_file_manager, pipeline, solid\n\n\n@solid(required_resource_keys={"file_manager"})\ndef write_files(context):\n    fh_1 = context.resources.file_manager.write_data(b"foo")\n\n    with tempfile.NamedTemporaryFile("w+") as fd:\n        fd.write("bar")\n        fd.seek(0)\n        fh_2 = context.resources.file_manager.write(fd, mode="w", ext=".txt")\n\n    return (fh_1, fh_2)\n\n\n@solid(required_resource_keys={"file_manager"})\ndef read_files(context, file_handles):\n    fh_1, fh_2 = file_handles\n    assert context.resources.file_manager.read_data(fh_1) == b"foo"\n    fd = context.resources.file_manager.read(fh_2, mode="r")\n    assert fd.read() == "bar"\n    fd.close()\n\n\n@pipeline(mode_defs=[ModeDefinition(resource_defs={"file_manager": local_file_manager})])\ndef files_pipeline():\n    read_files(write_files())\n
\n
\n

Or to specify the file directory:

\n
@pipeline(\n    mode_defs=[\n        ModeDefinition(\n            resource_defs={\n                "file_manager": local_file_manager.configured({"base_dir": "/my/base/dir"})\n            }\n        )\n    ]\n)\ndef files_pipeline():\n    read_files(write_files())\n
\n
\n
\n\n
\n
\n
\n

Instance\u00b6

\n
\n
\nclass dagster.DagsterInstance(instance_type, local_artifact_storage, run_storage, event_storage, compute_log_manager, run_coordinator, run_launcher, scheduler=None, schedule_storage=None, settings=None, ref=None)[source]\u00b6
\n

Core abstraction for managing Dagster\u2019s access to storage and other resources.

\n

Use DagsterInstance.get() to grab the current DagsterInstance which will load based on\nthe values in the dagster.yaml file in $DAGSTER_HOME.

\n

Alternatively, DagsterInstance.ephemeral() can be used, which provides a set of\ntransient in-memory components.

\n
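For example, a minimal sketch:
from dagster import DagsterInstance\n\n# Transient, in-memory instance, useful for tests and scripts\nephemeral_instance = DagsterInstance.ephemeral()\n\n# Persistent instance loaded from $DAGSTER_HOME/dagster.yaml\npersistent_instance = DagsterInstance.get()\n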

Configuration of this class should be done by setting values in $DAGSTER_HOME/dagster.yaml.\nFor example, to use Postgres for run and event log storage, you can write a dagster.yaml\nsuch as the following:

\n
\n
dagster.yaml\u00b6
\n
run_storage:\n  module: dagster_postgres.run_storage\n  class: PostgresRunStorage\n  config:\n    postgres_db:\n      username: { username }\n      password: { password }\n      hostname: { hostname }\n      db_name: { database }\n      port: { port }\n\nevent_log_storage:\n  module: dagster_postgres.event_log\n  class: PostgresEventLogStorage\n  config:\n    postgres_db:\n      username: { username }\n      password: { password }\n      hostname: { hostname }\n      db_name: { database }\n      port: { port }\n\nschedule_storage:\n  module: dagster_postgres.event_log\n  class: PostgresScheduleStorage\n  config:\n    postgres_db:\n      username: { username }\n      password: { password }\n      hostname: { hostname }\n      db_name: { database }\n      port: { port }\n
\n
\n
\n
\n
Parameters
\n
    \n
  • instance_type (InstanceType) \u2013 Indicates whether the instance is ephemeral or persistent.\nUsers should not attempt to set this value directly or in their dagster.yaml files.

  • \n
  • local_artifact_storage (LocalArtifactStorage) \u2013 The local artifact storage is used to\nconfigure storage for any artifacts that require a local disk, such as schedules, or\nwhen using the filesystem system storage to manage files and intermediates. By default,\nthis will be a dagster.core.storage.root.LocalArtifactStorage. Configurable\nin dagster.yaml using the ConfigurableClass\nmachinery.

  • \n
  • run_storage (RunStorage) \u2013 The run storage is used to store metadata about ongoing and past\npipeline runs. By default, this will be a\ndagster.core.storage.runs.SqliteRunStorage. Configurable in dagster.yaml\nusing the ConfigurableClass machinery.

  • \n
  • event_storage (EventLogStorage) \u2013 Used to store the structured event logs generated by\npipeline runs. By default, this will be a\ndagster.core.storage.event_log.SqliteEventLogStorage. Configurable in\ndagster.yaml using the ConfigurableClass machinery.

  • \n
  • compute_log_manager (ComputeLogManager) \u2013 The compute log manager handles stdout and stderr\nlogging for solid compute functions. By default, this will be a\ndagster.core.storage.local_compute_log_manager.LocalComputeLogManager.\nConfigurable in dagster.yaml using the\nConfigurableClass machinery.

  • \n
  • run_coordinator (RunCoordinator) \u2013 A runs coordinator may be used to manage the execution\nof pipeline runs.

  • \n
  • run_launcher (Optional[RunLauncher]) \u2013 Optionally, a run launcher may be used to enable\na Dagster instance to launch pipeline runs, e.g. on a remote Kubernetes cluster, in\naddition to running them locally.

  • \n
  • settings (Optional[Dict]) \u2013 Specifies certain per-instance settings,\nsuch as feature flags. These are set in the dagster.yaml under a set of whitelisted\nkeys.

  • \n
  • ref (Optional[InstanceRef]) \u2013 Used by internal machinery to pass instances across process\nboundaries.

  • \n
\n
\n
\n
\n
\nadd_daemon_heartbeat(daemon_heartbeat)[source]\u00b6
\n

Called on a regular interval by the daemon

\n
\n\n
\n
\nget_daemon_heartbeats()[source]\u00b6
\n

Latest heartbeats of all daemon types

\n
\n\n
\n
\nlaunch_run(run_id, workspace)[source]\u00b6
\n

Launch a pipeline run.

\n

This method is typically called using instance.submit_run rather than being invoked\ndirectly. This method delegates to the RunLauncher, if any, configured on the instance,\nand will call its implementation of RunLauncher.launch_run() to begin the execution of\nthe specified run. Runs should be created in the instance (e.g., by calling\nDagsterInstance.create_run()) before this method is called, and should be in the\nPipelineRunStatus.NOT_STARTED state.

\n
\n
Parameters
\n

run_id (str) \u2013 The id of the run to launch.

\n
\n
\n
\n\n
\n
\nreport_engine_event(message, pipeline_run=None, engine_event_data=None, cls=None, step_key=None, pipeline_name=None, run_id=None)[source]\u00b6
\n

Report a EngineEvent that occurred outside of a pipeline execution context.

\n
\n\n
\n
\nresume_run(run_id, workspace, attempt_number)[source]\u00b6
\n

Resume a pipeline run.

\n

This method should be called on runs which have already been launched, but whose run workers\nhave died.

\n
\n
Parameters
\n

run_id (str) \u2013 The id of the run to resume.

\n
\n
\n
\n\n
\n
\nproperty should_start_background_run_thread\u00b6
\n

Gates an experimental feature that starts a thread monitoring whether the run should be canceled.

\n
\n\n
\n
\nsubmit_run(run_id, workspace)[source]\u00b6
\n

Submit a pipeline run to the coordinator.

\n

This method delegates to the RunCoordinator, configured on the instance, and will\ncall its implementation of RunCoordinator.submit_run() to send the run to the\ncoordinator for execution. Runs should be created in the instance (e.g., by calling\nDagsterInstance.create_run()) before this method is called, and\nshould be in the PipelineRunStatus.NOT_STARTED state. They also must have a non-null\nExternalPipelineOrigin.

\n
\n
Parameters
\n

run_id (str) \u2013 The id of the run.

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.core.instance.InstanceRef(local_artifact_storage_data, run_storage_data, event_storage_data, compute_logs_data, schedule_storage_data, scheduler_data, run_coordinator_data, run_launcher_data, settings, custom_instance_class_data=None)[source]\u00b6
\n

Serializable representation of a DagsterInstance.

\n

Users should not instantiate this class directly.

\n
\n\n
\n
\nclass dagster.serdes.ConfigurableClass[source]\u00b6
\n

Abstract mixin for classes that can be loaded from config.

\n

This supports a powerful plugin pattern which avoids both a) a lengthy, hard-to-synchronize list\nof conditional imports / optional extras_requires in dagster core and b) a magic directory or\nfile in which third parties can place plugin packages. Instead, the intention is to make, e.g.,\nrun storage, pluggable with a config chunk like:

\n
run_storage:\n    module: very_cool_package.run_storage\n    class: SplendidRunStorage\n    config:\n        magic_word: "quux"\n
\n
\n

This same pattern should eventually be viable for other system components, e.g. engines.

\n

The ConfigurableClass mixin provides the necessary hooks for classes to be instantiated from\nan instance of ConfigurableClassData.

\n

Pieces of the Dagster system which we wish to make pluggable in this way should consume a config\ntype such as:

\n
{'module': str, 'class': str, 'config': Field(Permissive())}\n
\n
\n
\n
\nabstract classmethod config_type()[source]\u00b6
\n

dagster.ConfigType: The config type against which to validate a config yaml fragment\nserialized in an instance of ConfigurableClassData.

\n
\n\n
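A typical implementation returns a config schema describing the constructor arguments; a sketch for a hypothetical class configured with a single base directory:
from dagster import StringSource\n\n# Inside a ConfigurableClass subclass (e.g. the SplendidRunStorage sketched above):\n@classmethod\ndef config_type(cls):\n    # Hypothetical schema: a single base_dir string, optionally sourced from an env var\n    return {"base_dir": StringSource}\n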
\n
\nabstract static from_config_value(inst_data, config_value)[source]\u00b6
\n

New up an instance of the ConfigurableClass from a validated config value.

\n

Called by ConfigurableClassData.rehydrate.

\n
\n
Parameters
\n

config_value (dict) \u2013 The validated config value to use. Typically this should be the\nvalue attribute of an\nEvaluateValueResult.

\n
\n
\n

A common pattern is for the implementation to align the config_value with the signature\nof the ConfigurableClass\u2019s constructor:

\n
@staticmethod\ndef from_config_value(inst_data, config_value):\n    return MyConfigurableClass(inst_data=inst_data, **config_value)\n
\n
\n
\n\n
\n
\nabstract property inst_data\u00b6
\n

A subclass must be able to return its inst_data as a property if it has been constructed\nthrough the from_config_value code path.

\n
\n\n
\n\n
\n
\nclass dagster.serdes.ConfigurableClassData(module_name, class_name, config_yaml)[source]\u00b6
\n

Serializable tuple describing where to find a class and the config fragment that should\nbe used to instantiate it.

\n

Users should not instantiate this class directly.

\n

Classes intended to be serialized in this way should implement the\ndagster.serdes.ConfigurableClass mixin.

\n
\n\n
\n
\nclass dagster.core.storage.root.LocalArtifactStorage(base_dir, inst_data=None)[source]\u00b6
\n
\n
\nclassmethod config_type()[source]\u00b6
\n

dagster.ConfigType: The config type against which to validate a config yaml fragment\nserialized in an instance of ConfigurableClassData.

\n
\n\n
\n
\nstatic from_config_value(inst_data, config_value)[source]\u00b6
\n

New up an instance of the ConfigurableClass from a validated config value.

\n

Called by ConfigurableClassData.rehydrate.

\n
\n
Parameters
\n

config_value (dict) \u2013 The validated config value to use. Typically this should be the\nvalue attribute of an\nEvaluateValueResult.

\n
\n
\n

A common pattern is for the implementation to align the config_value with the signature\nof the ConfigurableClass\u2019s constructor:

\n
@staticmethod\ndef from_config_value(inst_data, config_value):\n    return MyConfigurableClass(inst_data=inst_data, **config_value)\n
\n
\n
\n\n
\n
\nproperty inst_data\u00b6
\n

A subclass must be able to return its inst_data as a property if it has been constructed\nthrough the from_config_value code path.

\n
\n\n
\n\n
\n
\n
\n

Run storage\u00b6

\n
\n
\nclass dagster.PipelineRun(pipeline_name, run_id=None, run_config=None, mode=None, asset_selection=None, solid_selection=None, solids_to_execute=None, step_keys_to_execute=None, status=None, tags=None, root_run_id=None, parent_run_id=None, pipeline_snapshot_id=None, execution_plan_snapshot_id=None, external_pipeline_origin=None, pipeline_code_origin=None)[source]\u00b6
\n

Serializable internal representation of a pipeline run, as stored in a\nRunStorage.

\n
\n\n
\n
\nclass dagster.DagsterRunStatus(value)[source]\u00b6
\n

The status of pipeline execution.

\n
\n
\nCANCELED = 'CANCELED'\u00b6
\n
\n\n
\n
\nCANCELING = 'CANCELING'\u00b6
\n
\n\n
\n
\nFAILURE = 'FAILURE'\u00b6
\n
\n\n
\n
\nMANAGED = 'MANAGED'\u00b6
\n
\n\n
\n
\nNOT_STARTED = 'NOT_STARTED'\u00b6
\n
\n\n
\n
\nQUEUED = 'QUEUED'\u00b6
\n
\n\n
\n
\nSTARTED = 'STARTED'\u00b6
\n
\n\n
\n
\nSTARTING = 'STARTING'\u00b6
\n
\n\n
\n
\nSUCCESS = 'SUCCESS'\u00b6
\n
\n\n
\n\n
\n
\ndagster.PipelineRunStatus\u00b6
\n

alias of dagster.core.storage.pipeline_run.DagsterRunStatus

\n
\n\n
\n
\nclass dagster.core.storage.runs.RunStorage[source]\u00b6
\n

Abstract base class for storing pipeline run history.

\n

Note that run storages using SQL databases as backing stores should implement\nSqlRunStorage.

\n

Users should not directly instantiate concrete subclasses of this class; they are instantiated\nby internal machinery when dagit and dagster-graphql load, based on the values in the\ndagster.yaml file in $DAGSTER_HOME. Configuration of concrete subclasses of this class\nshould be done by setting values in that file.

\n
\n\n
\n
\nclass dagster.core.storage.runs.SqlRunStorage[source]\u00b6
\n

Base class for SQL based run storages

\n
\n\n
\n
\nclass dagster.core.storage.runs.SqliteRunStorage(conn_string, inst_data=None)[source]\u00b6
\n

SQLite-backed run storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagit and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n

This is the default run storage when none is specified in the dagster.yaml.

\n

To explicitly specify SQLite for run storage, you can add a block such as the following to your\ndagster.yaml:

\n
run_storage:\n  module: dagster.core.storage.runs\n  class: SqliteRunStorage\n  config:\n    base_dir: /path/to/dir\n
\n
\n

The base_dir param tells the run storage where on disk to store the database.

\n
\n\n

See also: dagster_postgres.PostgresRunStorage and dagster_mysql.MySQLRunStorage.

\n
\n
\n
\n

Event log storage\u00b6

\n
\n
\nclass dagster.core.storage.event_log.EventLogEntry(error_info, level, user_message, run_id, timestamp, step_key=None, pipeline_name=None, dagster_event=None, job_name=None)[source]\u00b6
\n

Entries in the event log.

\n

These entries may originate from the logging machinery (DagsterLogManager/context.log), from\nframework events (e.g. EngineEvent), or they may correspond to events yielded by user code\n(e.g. Output).

\n
\n
Parameters
\n
    \n
  • error_info (Optional[SerializableErrorInfo]) \u2013 Error info for an associated exception, if\nany, as generated by serializable_error_info_from_exc_info and friends.

  • \n
  • level (Union[str, int]) \u2013 The Python log level at which to log this event. Note that\nframework and user code events are also logged to Python logging. This value may be an\ninteger or a (case-insensitive) string member of PYTHON_LOGGING_LEVELS_NAMES.

  • \n
  • user_message (str) \u2013 For log messages, this is the user-generated message.

  • \n
  • run_id (str) \u2013 The id of the run which generated this event.

  • \n
  • timestamp (float) \u2013 The Unix timestamp of this event.

  • \n
  • step_key (Optional[str]) \u2013 The step key for the step which generated this event. Some events\nare generated outside of a step context.

  • \n
  • job_name (Optional[str]) \u2013 The job which generated this event. Some events are\ngenerated outside of a job context.

  • \n
  • dagster_event (Optional[DagsterEvent]) \u2013 For framework and user events, the associated\nstructured event.

  • \n
  • pipeline_name (Optional[str]) \u2013 (legacy) The pipeline which generated this event. Some events are\ngenerated outside of a pipeline context.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.core.storage.event_log.EventLogRecord(storage_id, event_log_entry)[source]\u00b6
\n

Internal representation of an event record, as stored in a\nEventLogStorage.

\n
\n\n
\n
\nclass dagster.core.storage.event_log.EventRecordsFilter(event_type=None, asset_key=None, asset_partitions=None, after_cursor=None, before_cursor=None, after_timestamp=None, before_timestamp=None)[source]\u00b6
\n

Defines a set of filter fields for fetching a set of event log entries or event log records.

\n
\n
Parameters
\n
    \n
  • event_type (Optional[DagsterEventType]) \u2013 Filter argument for dagster event type

  • \n
  • asset_key (Optional[AssetKey]) \u2013 Asset key for which to get asset materialization event\nentries / records.

  • \n
  • asset_partitions (Optional[List[str]]) \u2013 Filter parameter such that only asset\nmaterialization events with a partition value matching one of the provided values. Only\nvalid when the asset_key parameter is provided.

  • \n
  • after_cursor (Optional[Union[int, RunShardedEventsCursor]]) \u2013 Filter parameter such that only\nrecords with storage_id greater than the provided value are returned. Using a\nrun-sharded events cursor will result in a significant performance gain when run against\na SqliteEventLogStorage implementation (which is run-sharded)

  • \n
  • before_cursor (Optional[Union[int, RunShardedEventsCursor]]) \u2013 Filter parameter such that\nrecords with storage_id less than the provided value are returned. Using a run-sharded\nevents cursor will result in a significant performance gain when run against\na SqliteEventLogStorage implementation (which is run-sharded)

  • \n
  • after_timestamp (Optional[float]) \u2013 Filter parameter such that only event records for\nevents with timestamp greater than the provided value are returned.

  • \n
  • before_timestamp (Optional[float]) \u2013 Filter parameter such that only event records for\nevents with timestamp less than the provided value are returned.

  • \n
\n
\n
\n
\n\n
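For example, a sketch of fetching recent asset materialization records from an instance using this filter (my_asset is a hypothetical asset key):
from dagster import AssetKey, DagsterEventType, DagsterInstance\nfrom dagster.core.storage.event_log import EventRecordsFilter\n\ninstance = DagsterInstance.get()\nrecords = instance.get_event_records(\n    EventRecordsFilter(\n        event_type=DagsterEventType.ASSET_MATERIALIZATION,\n        asset_key=AssetKey("my_asset"),\n    ),\n    limit=10,\n)\n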
\n
\nclass dagster.core.storage.event_log.RunShardedEventsCursor(id, run_updated_after)[source]\u00b6
\n

Pairs an id-based event log cursor with a timestamp-based run cursor, for improved\nperformance on run-sharded event log storages (e.g. the default SqliteEventLogStorage). For\nrun-sharded storages, the id field is ignored, since ids may not be unique across shards.

\n
\n\n
\n
\nclass dagster.core.storage.event_log.EventLogStorage[source]\u00b6
\n

Abstract base class for storing structured event logs from pipeline runs.

\n

Note that event log storages using SQL databases as backing stores should implement\nSqlEventLogStorage.

\n

Users should not directly instantiate concrete subclasses of this class; they are instantiated\nby internal machinery when dagit and dagster-graphql load, based on the values in the\ndagster.yaml file in $DAGSTER_HOME. Configuration of concrete subclasses of this class\nshould be done by setting values in that file.

\n
\n\n
\n
\nclass dagster.core.storage.event_log.SqlEventLogStorage[source]\u00b6
\n

Base class for SQL backed event log storages.

\n

Distinguishes between run-based connections and index connections in order to support run-level\nsharding, while maintaining the ability to do cross-run queries

\n
\n\n
\n
\nclass dagster.core.storage.event_log.SqliteEventLogStorage(base_dir, inst_data=None)[source]\u00b6
\n

SQLite-backed event log storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagit and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n

This is the default event log storage when none is specified in the dagster.yaml.

\n

To explicitly specify SQLite for event log storage, you can add a block such as the following\nto your dagster.yaml:

\n
event_log_storage:\n  module: dagster.core.storage.event_log\n  class: SqliteEventLogStorage\n  config:\n    base_dir: /path/to/dir\n
\n
\n

The base_dir param tells the event log storage where on disk to store the databases. To\nimprove concurrent performance, event logs are stored in a separate SQLite database for each\nrun.

\n
\n\n
\n
\nclass dagster.core.storage.event_log.ConsolidatedSqliteEventLogStorage(base_dir, inst_data=None)[source]\u00b6
\n

SQLite-backed consolidated event log storage intended for test cases only.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagit and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n

To explicitly specify the consolidated SQLite for event log storage, you can add a block such as\nthe following to your dagster.yaml:

\n
event_log_storage:\n  module: dagster.core.storage.event_log\n  class: ConsolidatedSqliteEventLogStorage\n  config:\n    base_dir: /path/to/dir\n
\n
\n

The base_dir param tells the event log storage where on disk to store the database.

\n
\n\n

See also: dagster_postgres.PostgresEventLogStorage and dagster_mysql.MySQLEventLogStorage.

\n
\n
\n
\n

Compute log manager\u00b6

\n
\n
\nclass dagster.core.storage.compute_log_manager.ComputeLogManager[source]\u00b6
\n

Abstract base class for storing unstructured compute logs (stdout/stderr) from the compute\nsteps of pipeline solids.

\n
\n\n
\n
\nclass dagster.core.storage.local_compute_log_manager.LocalComputeLogManager(base_dir, polling_timeout=None, inst_data=None)[source]\u00b6
\n

Stores copies of stdout & stderr for each compute step locally on disk.

\n
\n\n

See also: dagster_aws.S3ComputeLogManager.

\n
\n
\n
\n

Run launcher\u00b6

\n
\n
\nclass dagster.core.launcher.RunLauncher[source]\u00b6
\n
\n\n
\n
\nclass dagster.core.launcher.DefaultRunLauncher(inst_data=None, wait_for_processes=False)[source]\u00b6
\n

Launches runs against running GRPC servers.

\n
\n\n
\n
\n
\n

Run coordinator\u00b6

\n
\n
\nclass dagster.core.run_coordinator.DefaultRunCoordinator(inst_data=None)[source]\u00b6
\n

Immediately sends runs to the run launcher.

\n
\n\n
\n
\ndagster.core.run_coordinator.QueuedRunCoordinator RunCoordinator[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
max_concurrent_runs (dagster.IntSource, optional)
\n

The maximum number of runs that are allowed to be in progress at once. Defaults to 10. Set to -1 to disable the limit. Set to 0 to stop any runs from launching. Any other negative values are disallowed.

\n
\n
tag_concurrency_limits (Union[List[strict dict], None], optional)
\n

A set of limits that are applied to runs with particular tags. If a value is set, the limit is applied to only that key-value pair. If no value is set, the limit is applied across all values of that key. If the value is set to a dict with applyLimitPerUniqueValue: true, the limit will apply to the number of unique values for that key.

\n
\n
dequeue_interval_seconds (dagster.IntSource, optional)
\n

The interval in seconds at which the Dagster Daemon should periodically check the run queue for new runs to launch.

\n
\n
\n

Enqueues runs via the run storage, to be dequeued by the Dagster Daemon process. Requires\nthe Dagster Daemon process to be alive in order for runs to be launched.

\n
\n\n
\n
\n
\n

Scheduling\u00b6

\n
\n
\nclass dagster.core.scheduler.Scheduler[source]\u00b6
\n

Abstract base class for a scheduler. This component is responsible for interfacing with\nan external system such as cron to ensure repeated execution according to a schedule.

\n
\n\n
\n
\nclass dagster.core.storage.schedules.ScheduleStorage[source]\u00b6
\n

Abstract class for managing persistence of scheduler artifacts.

\n
\n\n
\n
\nclass dagster.core.storage.schedules.SqlScheduleStorage[source]\u00b6
\n

Base class for SQL backed schedule storage

\n
\n\n
\n
\nclass dagster.core.storage.schedules.SqliteScheduleStorage(conn_string, inst_data=None)[source]\u00b6
\n

Local SQLite backed schedule storage

\n
\n\n

See also: dagster_postgres.PostgresScheduleStorage and dagster_mysql.MySQLScheduleStorage.

\n
\n
\n
\n

Exception handling\u00b6

\n
\n
\ndagster.core.errors.user_code_error_boundary(error_cls, msg_fn, log_manager=None, **kwargs)[source]\u00b6
\n

Wraps the execution of user-space code in an error boundary. This places a uniform\npolicy around any user code invoked by the framework. This ensures that all user\nerrors are wrapped in an exception derived from DagsterUserCodeExecutionError,\nand that the original stack trace of the user error is preserved, so that it\ncan be reported without confusing framework code in the stack trace, if a\ntool author wishes to do so.

\n

Examples:

\n
with user_code_error_boundary(\n    # Pass a class that inherits from DagsterUserCodeExecutionError\n    DagsterExecutionStepExecutionError,\n    # Pass a function that produces a message\n    lambda: "Error occurred during step execution",\n):\n    call_user_provided_function()\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/internals", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../jobs/", "title": "Jobs"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../hooks/", "title": "Hooks"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/jobs", "Jobs", "N", "next"], ["sections/api/apidocs/hooks", "Hooks", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/internals.rst.txt", "title": "Internals", "toc": "\n"}, "io-managers": {"alabaster_version": "0.7.12", "body": "
\n

IO Managers\u00b6

\n

IO managers are user-provided objects that store op outputs and load them as inputs to downstream\nops.

\n
\n
\n@dagster.io_manager(config_schema=None, description=None, output_config_schema=None, input_config_schema=None, required_resource_keys=None, version=None)[source]\u00b6
\n

Define an IO manager.

\n

IOManagers are used to store op outputs and load them as inputs to downstream ops.

\n

The decorated function should accept an InitResourceContext and return an\nIOManager.

\n
\n
Parameters
\n
    \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the resource config. Configuration\ndata available in init_context.resource_config. If not set, Dagster will accept any\nconfig provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the resource.

  • \n
  • output_config_schema (Optional[ConfigSchema]) \u2013 The schema for per-output config. If not set,\nno per-output configuration will be allowed.

  • \n
  • input_config_schema (Optional[ConfigSchema]) \u2013 The schema for per-input config. If not set,\nDagster will accept any config provided.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Keys for the resources required by the object\nmanager.

  • \n
  • version (Optional[str]) \u2013 (Experimental) The version of a resource function. Two wrapped\nresource functions should only have the same version if they produce the same resource\ndefinition when provided with the same inputs.

  • \n
\n
\n
\n

Examples:

\n
class MyIOManager(IOManager):\n    def handle_output(self, context, obj):\n        write_csv("some/path")\n\n    def load_input(self, context):\n        return read_csv("some/path")\n\n@io_manager\ndef my_io_manager(init_context):\n    return MyIOManager()\n\n@op(out=Out(io_manager_key="my_io_manager_key"))\ndef my_op(_):\n    return do_stuff()\n\n@job(resource_defs={"my_io_manager_key": my_io_manager})\ndef my_job():\n    my_op()\n
\n
\n
\n\n
\n
\nclass dagster.IOManager[source]\u00b6
\n

Base class for user-provided IO managers.

\n

IOManagers are used to store op outputs and load them as inputs to downstream ops.

\n

Extend this class to handle how objects are loaded and stored. Users should implement\nhandle_output to store an object and load_input to retrieve an object.

\n
\n
\nget_input_asset_key(context)[source]\u00b6
\n

User-defined method that associates inputs loaded by this IOManager with a particular\nAssetKey.

\n
\n
Parameters
\n

context (InputContext) \u2013 The input context, which describes the input that\u2019s being loaded\nand the upstream output that\u2019s being loaded from.

\n
\n
\n
\n\n
\n
\nget_input_asset_partitions(context)[source]\u00b6
\n

User-defined method that associates inputs loaded by this IOManager with a set of\npartitions of an AssetKey.

\n
\n
Parameters
\n

context (InputContext) \u2013 The input context, which describes the input that\u2019s being loaded\nand the upstream output that\u2019s being loaded from.

\n
\n
\n
\n\n
\n
\nget_output_asset_key(_context)[source]\u00b6
\n

User-defined method that associates outputs handled by this IOManager with a particular\nAssetKey.

\n
\n
Parameters
\n

context (OutputContext) \u2013 The context of the step output that produces this object.

\n
\n
\n
\n\n
\n
\nget_output_asset_partitions(_context)[source]\u00b6
\n

User-defined method that associates outputs handled by this IOManager with a set of\npartitions of an AssetKey.

\n
\n
Parameters
\n

context (OutputContext) \u2013 The context of the step output that produces this object.

\n
\n
\n
\n\n
\n
\nabstract handle_output(context, obj)[source]\u00b6
\n

User-defined method that stores an output of an op.

\n
\n
Parameters
\n
    \n
  • context (OutputContext) \u2013 The context of the step output that produces this object.

  • \n
  • obj (Any) \u2013 The object, returned by the op, to be stored.

  • \n
\n
\n
\n
\n\n
\n
\nabstract load_input(context)[source]\u00b6
\n

User-defined method that loads an input to an op.

\n
\n
Parameters
\n

context (InputContext) \u2013 The input context, which describes the input that\u2019s being loaded\nand the upstream output that\u2019s being loaded from.

\n
\n
Returns
\n

The data object.

\n
\n
Return type
\n

Any

\n
\n
\n
\n\n
\n\n
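A sketch of unit-testing an IOManager directly by constructing contexts with build_output_context and build_input_context (MyIOManager is the example manager from the @io_manager docs above; the step and output names are illustrative):
from dagster import build_input_context, build_output_context\n\ndef test_my_io_manager():\n    manager = MyIOManager()\n    output_context = build_output_context(step_key="my_op", name="result")\n    manager.handle_output(output_context, obj={"some": "data"})\n    input_context = build_input_context(upstream_output=output_context)\n    loaded = manager.load_input(input_context)\n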
\n
\nclass dagster.IOManagerDefinition(resource_fn=None, config_schema=None, description=None, required_resource_keys=None, version=None, input_config_schema=None, output_config_schema=None)[source]\u00b6
\n

Definition of an IO manager resource.

\n

IOManagers are used to store op outputs and load them as inputs to downstream ops.

\n

An IOManagerDefinition is a ResourceDefinition whose resource_fn returns an\nIOManager.

\n

The easiest way to create an IOManagerDefinition is with the @io_manager\ndecorator.

\n
\n
\nstatic hardcoded_io_manager(value, description=None)[source]\u00b6
\n

A helper function that creates an IOManagerDefinition with a hardcoded IOManager.

\n
\n
Parameters
\n
    \n
  • value (Any) \u2013 A hardcoded IO Manager which helps mock the definition.

  • \n
  • description ([Optional[str]]) \u2013 The description of the IO Manager. Defaults to None.

  • \n
\n
\n
Returns
\n

A hardcoded resource.

\n
\n
Return type
\n

[IOManagerDefinition]

\n
\n
\n
\n\n
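For example, a sketch that wires a hardcoded manager into the example job from the @io_manager docs above for testing:
from dagster import IOManagerDefinition, job\n\ntest_io_manager = IOManagerDefinition.hardcoded_io_manager(MyIOManager())\n\n@job(resource_defs={"my_io_manager_key": test_io_manager})\ndef my_test_job():\n    my_op()\n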
\n
\nproperty input_config_schema\u00b6
\n

The schema for per-input configuration for inputs that are managed by this\ninput manager

\n
\n\n
\n
\nproperty output_config_schema\u00b6
\n

The schema for per-output configuration for outputs that are managed by this\nmanager

\n
\n\n
\n\n
\n

Input and Output Contexts\u00b6

\n
\n
\nclass dagster.InputContext(name=None, pipeline_name=None, solid_def=None, config=None, metadata=None, upstream_output=None, dagster_type=None, log_manager=None, resource_config=None, resources=None, step_context=None, op_def=None)[source]\u00b6
\n

The context object available to the load_input method of RootInputManager.

\n
\n
\nname\u00b6
\n

The name of the input that we\u2019re loading.

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\npipeline_name\u00b6
\n

The name of the pipeline.

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nsolid_def\u00b6
\n

The definition of the solid that\u2019s loading the input.

\n
\n
Type
\n

Optional[SolidDefinition]

\n
\n
\n
\n\n
\n
\nconfig\u00b6
\n

The config attached to the input that we\u2019re loading.

\n
\n
Type
\n

Optional[Any]

\n
\n
\n
\n\n
\n
\nmetadata\u00b6
\n

A dict of metadata that is assigned to the\nInputDefinition that we\u2019re loading for.

\n
\n
Type
\n

Optional[Dict[str, Any]]

\n
\n
\n
\n\n
\n
\nupstream_output\u00b6
\n

Info about the output that produced the object\nwe\u2019re loading.

\n
\n
Type
\n

Optional[OutputContext]

\n
\n
\n
\n\n
\n
\ndagster_type\u00b6
\n

The type of this input.

\n
\n
Type
\n

Optional[DagsterType]

\n
\n
\n
\n\n
\n
\nlog\u00b6
\n

The log manager to use for this input.

\n
\n
Type
\n

Optional[DagsterLogManager]

\n
\n
\n
\n\n
\n
\nresource_config\u00b6
\n

The config associated with the resource that\ninitializes the RootInputManager.

\n
\n
Type
\n

Optional[Dict[str, Any]]

\n
\n
\n
\n\n
\n
\nresources\u00b6
\n

The resources required by the resource that initializes the\ninput manager. If using the @root_input_manager() decorator, these resources\ncorrespond to those requested with the required_resource_keys parameter.

\n
\n
Type
\n

Optional[Resources]

\n
\n
\n
\n\n
\n
\nop_def\u00b6
\n

The definition of the op that\u2019s loading the input.

\n
\n
Type
\n

Optional[OpDefinition]

\n
\n
\n
\n\n
\n
\nadd_input_metadata(metadata, description=None)[source]\u00b6
\n

Accepts a dictionary of metadata. Metadata entries will appear on the LOADED_INPUT event.\nIf the input is an asset, metadata will be attached to an asset observation.

\n

The asset observation will be yielded from the run and appear in the event log.\nOnly valid if the context has an asset key.

\n
\n\n
\n
\nproperty asset_partition_key\u00b6
\n

The partition key for input asset.

\n

Raises an error if the input asset has no partitioning, or if the run covers a partition\nrange for the input asset.

\n
\n\n
\n
\nproperty asset_partition_key_range\u00b6
\n

The partition key range for input asset.

\n

Raises an error if the input asset has no partitioning.

\n
\n\n
\n
\nproperty asset_partitions_time_window\u00b6
\n

The time window for the partitions of the input asset.

\n

Raises an error if either of the following are true:\n- The input asset has no partitioning.\n- The input asset is not partitioned with a TimeWindowPartitionsDefinition.

\n
\n\n
\n
\nconsume_events()[source]\u00b6
\n

Pops and yields all user-generated events that have been recorded from this context.

\n

If consume_events has not yet been called, this will yield all logged events since the call to load_input. If consume_events has been called, it will yield all events since the last time consume_events was called. Designed for internal use. Users should never need to invoke this method.

\n
\n\n
\n
\nget_identifier()[source]\u00b6
\n

Utility method to get a collection of identifiers that as a whole represent a unique\nstep input.

\n

If not using memoization, the unique identifier collection consists of

\n
    \n
  • \n
    run_id: the id of the run which generates the input.

    Note: This method also handles the re-execution memoization logic. If the step that\ngenerates the input is skipped in the re-execution, the run_id will be the id\nof its parent run.

    \n
    \n
    \n
  • \n
  • step_key: the key for a compute step.

  • \n
  • name: the name of the output. (default: \u2018result\u2019).

  • \n
\n

If using memoization, the version corresponding to the step output is used in place of\nthe run_id.

\n
\n
Returns
\n

A list of identifiers, i.e. (run_id or version), step_key, and output_name

\n
\n
Return type
\n

List[str]

\n
\n
\n
\n\n
\n
\nget_observations()[source]\u00b6
\n

Retrieve the list of user-generated asset observations that were observed via the context.

\n

User-generated events that were yielded will not appear in this list.

\n

Examples:

\n
from dagster import IOManager, build_input_context, AssetObservation\n\nclass MyIOManager(IOManager):\n    def load_input(self, context, obj):\n        ...\n\ndef test_load_input():\n    mgr = MyIOManager()\n    context = build_input_context()\n    mgr.load_input(context)\n    observations = context.get_observations()\n    ...\n
\n
\n
\n\n
\n
\nproperty has_input_name\u00b6
\n

If the InputContext is being used to load the result of a run from outside the run,\nthen it won\u2019t have an input name.

\n
\n\n
\n
\nproperty has_partition_key\u00b6
\n

Whether the current run is a partitioned run

\n
\n\n
\n
\nproperty partition_key\u00b6
\n

The partition key for the current run.

\n

Raises an error if the current run is not a partitioned run.

\n
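
As an illustration, a partition-aware load_input sketch (the IO manager and read helpers are hypothetical):

\n
from dagster import IOManager\n\nclass MyPartitionedIOManager(IOManager):\n    def handle_output(self, context, obj):\n        ...\n\n    def load_input(self, context):\n        if context.has_partition_key:\n            # load only the partition that this run covers\n            return read_partition(context.partition_key)  # hypothetical helper\n        return read_all()  # hypothetical helper\n
\n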
\n\n
\n\n
\n
\nclass dagster.OutputContext(step_key=None, name=None, pipeline_name=None, run_id=None, metadata=None, mapping_key=None, config=None, solid_def=None, dagster_type=None, log_manager=None, version=None, resource_config=None, resources=None, step_context=None, op_def=None, asset_info=None, warn_on_step_context_use=False)[source]\u00b6
\n

The context object that is available to the handle_output method of an IOManager.

\n
\n
\nstep_key\u00b6
\n

The step_key for the compute step that produced the output.

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nname\u00b6
\n

The name of the output that is being handled.

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\npipeline_name\u00b6
\n

The name of the pipeline definition.

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nrun_id\u00b6
\n

The id of the run that produced the output.

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nmetadata\u00b6
\n

A dict of the metadata that is assigned to the\nOutputDefinition that produced the output.

\n
\n
Type
\n

Optional[Dict[str, Any]]

\n
\n
\n
\n\n
\n
\nmapping_key\u00b6
\n

The key that identifies a unique mapped output. None for regular outputs.

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nconfig\u00b6
\n

The configuration for the output.

\n
\n
Type
\n

Optional[Any]

\n
\n
\n
\n\n
\n
\nsolid_def\u00b6
\n

The definition of the solid that produced the output.

\n
\n
Type
\n

Optional[SolidDefinition]

\n
\n
\n
\n\n
\n
\ndagster_type\u00b6
\n

The type of this output.

\n
\n
Type
\n

Optional[DagsterType]

\n
\n
\n
\n\n
\n
\nlog\u00b6
\n

The log manager to use for this output.

\n
\n
Type
\n

Optional[DagsterLogManager]

\n
\n
\n
\n\n
\n
\nversion\u00b6
\n

(Experimental) The version of the output.

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nresource_config\u00b6
\n

The config associated with the resource that\ninitializes the IOManager.

\n
\n
Type
\n

Optional[Dict[str, Any]]

\n
\n
\n
\n\n
\n
\nresources\u00b6
\n

The resources required by the output manager, specified by the\nrequired_resource_keys parameter.

\n
\n
Type
\n

Optional[Resources]

\n
\n
\n
\n\n
\n
\nop_def\u00b6
\n

The definition of the op that produced the output.

\n
\n
Type
\n

Optional[OpDefinition]

\n
\n
\n
\n\n
\n
\nasset_info\u00b6
\n

Optional[AssetOutputInfo]: (Experimental) Asset info corresponding to the\noutput.

\n
\n\n
\n
\nadd_output_metadata(metadata)[source]\u00b6
\n

Add a dictionary of metadata to the handled output.

\n

Metadata entries added will show up in the HANDLED_OUTPUT and ASSET_MATERIALIZATION events for the run.

\n
\n
Parameters
\n

metadata (Dict[str, Any]) \u2013 A metadata dictionary to log

\n
\n
\n

Examples:

\n
from dagster import IOManager\n\nclass MyIOManager(IOManager):\n    def handle_output(self, context, obj):\n        context.add_output_metadata({"foo": "bar"})\n
\n
\n
\n\n
\n
\nproperty asset_partition_key\u00b6
\n

The partition key for output asset.

\n

Raises an error if the output asset has no partitioning, or if the run covers a partition\nrange for the output asset.

\n
\n\n
\n
\nproperty asset_partition_key_range\u00b6
\n

The partition key range for output asset.

\n

Raises an error if the output asset has no partitioning.

\n
\n\n
\n
\nproperty asset_partitions_time_window\u00b6
\n

The time window for the partitions of the output asset.

\n

Raises an error if either of the following are true:\n- The output asset has no partitioning.\n- The output asset is not partitioned with a TimeWindowPartitionsDefinition.

\n
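
For example, a sketch (the IO manager and writer helper are hypothetical) that scopes a write to the partition's time window:

\n
from dagster import IOManager\n\nclass TimePartitionedIOManager(IOManager):\n    def handle_output(self, context, obj):\n        window = context.asset_partitions_time_window\n        # write only rows that fall inside this partition's time window (hypothetical helper)\n        write_rows_between(obj, window.start, window.end)\n\n    def load_input(self, context):\n        ...\n
\n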
\n\n
\n
\nconsume_events()[source]\u00b6
\n

Pops and yields all user-generated events that have been recorded from this context.

\n

If consume_events has not yet been called, this will yield all logged events since the call to handle_output. If consume_events has been called, it will yield all events since the last time consume_events was called. Designed for internal use. Users should never need to invoke this method.

\n
\n\n
\n
\nconsume_logged_metadata_entries()[source]\u00b6
\n

Pops and yields all user-generated metadata entries that have been recorded from this context.

\n

If consume_logged_metadata_entries has not yet been called, this will yield all logged events since the call to handle_output. If consume_logged_metadata_entries has been called, it will yield all events since the last time consume_logged_metadata_entries was called. Designed for internal use. Users should never need to invoke this method.

\n
\n\n
\n
\nget_identifier()[source]\u00b6
\n

Utility method to get a collection of identifiers that as a whole represent a unique\nstep output.

\n

If not using memoization, the unique identifier collection consists of

\n
    \n
  • \n
    run_id: the id of the run which generates the output.

    Note: This method also handles the re-execution memoization logic. If the step that\ngenerates the output is skipped in the re-execution, the run_id will be the id\nof its parent run.

    \n
    \n
    \n
  • \n
  • step_key: the key for a compute step.

  • \n
  • name: the name of the output. (default: \u2018result\u2019).

  • \n
\n

If using memoization, the version corresponding to the step output is used in place of\nthe run_id.

\n
\n
Returns
\n

A list of identifiers, i.e. (run_id or version), step_key, and output_name

\n
\n
Return type
\n

List[str]

\n
\n
\n
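
For example, a filesystem-style manager might join the identifier into a unique path, roughly as sketched here (the base directory and pickle helpers are hypothetical):

\n
import os\n\nfrom dagster import IOManager\n\nclass MyPathIOManager(IOManager):\n    def _get_path(self, context):\n        # e.g. ["<run_id or version>", "<step_key>", "<output name>"] -> one file per output\n        return os.path.join("/tmp/dagster-storage", *context.get_identifier())\n\n    def handle_output(self, context, obj):\n        write_pickle(self._get_path(context), obj)  # hypothetical helper\n\n    def load_input(self, context):\n        return read_pickle(self._get_path(context.upstream_output))  # hypothetical helper\n
\n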
\n\n
\n
\nget_logged_events()[source]\u00b6
\n

Retrieve the list of user-generated events that were logged via the context.

\n

User-generated events that were yielded will not appear in this list.

\n

Examples:

\n
from dagster import IOManager, build_output_context, AssetMaterialization\n\nclass MyIOManager(IOManager):\n    def handle_output(self, context, obj):\n        ...\n\ndef test_handle_output():\n    mgr = MyIOManager()\n    context = build_output_context()\n    mgr.handle_output(context)\n    all_user_events = context.get_logged_events()\n    materializations = [event for event in all_user_events if isinstance(event, AssetMaterialization)]\n    ...\n
\n
\n
\n\n
\n
\nget_logged_metadata_entries()[source]\u00b6
\n

Get the list of metadata entries that have been logged for use with this output.

\n
\n\n
\n
\nget_run_scoped_output_identifier()[source]\u00b6
\n

Utility method to get a collection of identifiers that as a whole represent a unique\nstep output.

\n

The unique identifier collection consists of

\n
    \n
  • \n
    run_id: the id of the run which generates the output.

    Note: This method also handles the re-execution memoization logic. If the step that\ngenerates the output is skipped in the re-execution, the run_id will be the id\nof its parent run.

    \n
    \n
    \n
  • \n
  • step_key: the key for a compute step.

  • \n
  • name: the name of the output. (default: \u2018result\u2019).

  • \n
\n
\n
Returns
\n

A list of identifiers, i.e. run id, step key, and output name

\n
\n
Return type
\n

List[str]

\n
\n
\n
\n\n
\n
\nproperty has_partition_key\u00b6
\n

Whether the current run is a partitioned run

\n
\n\n
\n
\nlog_event(event)[source]\u00b6
\n

Log an AssetMaterialization or AssetObservation from within the body of an io manager\u2019s handle_output method.

\n

Events logged with this method will appear in the event log.

\n
\n
Parameters
\n

event (Union[AssetMaterialization, Materialization, AssetObservation]) \u2013 The event to log.

\n
\n
\n

Examples:

\n
from dagster import IOManager, AssetMaterialization\n\nclass MyIOManager(IOManager):\n    def handle_output(self, context, obj):\n        context.log_event(AssetMaterialization("foo"))\n
\n
\n
\n\n
\n
\nproperty partition_key\u00b6
\n

The partition key for the current run.

\n

Raises an error if the current run is not a partitioned run.

\n
\n\n
\n\n
\n
\ndagster.build_input_context(name=None, config=None, metadata=None, upstream_output=None, dagster_type=None, resource_config=None, resources=None, op_def=None, step_context=None)[source]\u00b6
\n

Builds input context from provided parameters.

\n

build_input_context can be used as either a function, or a context manager. If resources\nthat are also context managers are provided, then build_input_context must be used as a\ncontext manager.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name of the input that we\u2019re loading.

  • \n
  • config (Optional[Any]) \u2013 The config attached to the input that we\u2019re loading.

  • \n
  • metadata (Optional[Dict[str, Any]]) \u2013 A dict of metadata that is assigned to the\nInputDefinition that we\u2019re loading for.

  • \n
  • upstream_output (Optional[OutputContext]) \u2013 Info about the output that produced the object\nwe\u2019re loading.

  • \n
  • dagster_type (Optional[DagsterType]) \u2013 The type of this input.

  • \n
  • resource_config (Optional[Dict[str, Any]]) \u2013 The resource config to make available from the\ninput context. This usually corresponds to the config provided to the resource that\nloads the input manager.

  • \n
  • resources (Optional[Dict[str, Any]]) \u2013 The resources to make available from the context.\nFor a given key, you can provide either an actual instance of an object, or a resource\ndefinition.

  • \n
  • asset_key (Optional[AssetKey]) \u2013 The asset key attached to the InputDefinition.

  • \n
  • op_def (Optional[OpDefinition]) \u2013 The definition of the op that\u2019s loading the input.

  • \n
  • step_context (Optional[StepExecutionContext]) \u2013 For internal use.

  • \n
\n
\n
\n

Examples

\n
build_input_context()\n\nwith build_input_context(resources={"foo": context_manager_resource}) as context:\n    do_something\n
\n
\n
\n\n
\n
\ndagster.build_output_context(step_key=None, name=None, metadata=None, run_id=None, mapping_key=None, config=None, dagster_type=None, version=None, resource_config=None, resources=None, solid_def=None, op_def=None, asset_key=None)[source]\u00b6
\n

Builds output context from provided parameters.

\n

build_output_context can be used as either a function, or a context manager. If resources\nthat are also context managers are provided, then build_output_context must be used as a\ncontext manager.

\n
\n
Parameters
\n
    \n
  • step_key (Optional[str]) \u2013 The step_key for the compute step that produced the output.

  • \n
  • name (Optional[str]) \u2013 The name of the output that is being handled.

  • \n
  • metadata (Optional[Dict[str, Any]]) \u2013 A dict of the metadata that is assigned to the\nOutputDefinition that produced the output.

  • \n
  • mapping_key (Optional[str]) \u2013 The key that identifies a unique mapped output. None for regular outputs.

  • \n
  • config (Optional[Any]) \u2013 The configuration for the output.

  • \n
  • dagster_type (Optional[DagsterType]) \u2013 The type of this output.

  • \n
  • version (Optional[str]) \u2013 (Experimental) The version of the output.

  • \n
  • resource_config (Optional[Dict[str, Any]]) \u2013 The resource config to make available from the\noutput context. This usually corresponds to the config provided to the resource that\nloads the output manager.

  • \n
  • resources (Optional[Resources]) \u2013 The resources to make available from the context.\nFor a given key, you can provide either an actual instance of an object, or a resource\ndefinition.

  • \n
  • solid_def (Optional[SolidDefinition]) \u2013 The definition of the solid that produced the output.

  • \n
  • op_def (Optional[OpDefinition]) \u2013 The definition of the op that produced the output.

  • \n
  • asset_key \u2013 Optional[Union[AssetKey, Sequence[str], str]]: The asset key corresponding to the\noutput.

  • \n
\n
\n
\n

Examples

\n
build_output_context()\n\nwith build_output_context(resources={"foo": context_manager_resource}) as context:\n    do_something\n
\n
\n
\n\n
\n
\n

Built-in IO Managers\u00b6

\n
\n
\ndagster.mem_io_manager IOManagerDefinition[source]\u00b6
\n

Built-in IO manager that stores and retrieves values in memory.

\n
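
A minimal sketch of attaching it via the reserved io_manager resource key; because values live in memory, it is only useful within a single process (for example with execute_in_process):

\n
from dagster import job, mem_io_manager, op\n\n@op\ndef return_one():\n    return 1\n\n@op\ndef add_one(x):\n    return x + 1\n\n@job(resource_defs={"io_manager": mem_io_manager})\ndef in_memory_job():\n    add_one(return_one())\n\n# values never touch disk when run in-process\nresult = in_memory_job.execute_in_process()\n
\n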
\n\n
\n
\ndagster.fs_io_manager IOManagerDefinition[source]\u00b6
\n

Built-in filesystem IO manager that stores and retrieves values using pickling.

\n

Allows users to specify a base directory where all the step outputs will be stored. By\ndefault, step outputs will be stored in the directory specified by local_artifact_storage in\nyour dagster.yaml file (which will be a temporary directory if not explicitly set).

\n

Serializes and deserializes output values using pickling and automatically constructs\nthe filepaths for ops and assets.

\n

Assigns each op output to a unique filepath containing run ID, step key, and output name.\nAssigns each asset to a single filesystem path, at \u201c<base_dir>/<asset_key>\u201d. If the asset key\nhas multiple components, the final component is used as the name of the file, and the preceding\ncomponents as parent directories under the base_dir.

\n

Subsequent materializations of an asset will overwrite previous materializations of that asset.\nSo, with a base directory of \u201c/my/base/path\u201d, an asset with key\nAssetKey([\u201cone\u201d, \u201ctwo\u201d, \u201cthree\u201d]) would be stored in a file called \u201cthree\u201d in a directory\nwith path \u201c/my/base/path/one/two/\u201d.

\n

Example usage:

\n

1. Specify a job-level IO manager using the reserved resource key "io_manager",\nwhich will set the given IO manager on all ops in a job.

\n
from dagster import fs_io_manager, job, op\n\n@op\ndef op_a():\n    # create df ...\n    return df\n\n@op\ndef op_b(df):\n    return df[:5]\n\n@job(\n    resource_defs={\n        "io_manager": fs_io_manager.configured({"base_dir": "/my/base/path"})\n    }\n)\ndef my_job():\n    op_b(op_a())\n
\n
\n

2. Specify IO manager on Out, which allows the user to set different IO managers on\ndifferent step outputs.

\n
from dagster import fs_io_manager, job, op, Out\n\n@op(out=Out(io_manager_key="my_io_manager"))\ndef op_a():\n    # create df ...\n    return df\n\n@op\ndef op_b(df):\n    return df[:5]\n\n@job(resource_defs={"my_io_manager": fs_io_manager})\ndef my_job():\n    op_b(op_a())\n
\n
\n
\n\n
\n
\ndagster.custom_path_fs_io_manager IOManagerDefinition[source]\u00b6
\n

Built-in IO manager that allows users to specify a custom output file path per output definition.

\n

It requires users to specify a base directory where all the step outputs will be stored. It\nserializes and deserializes output values (assets) using pickling and stores the pickled object\nin the user-provided file paths.

\n

Example usage:

\n
from dagster import custom_path_fs_io_manager, job, op, Out\n\n@op(out=Out(metadata={"path": "path/to/sample_output"}))\ndef sample_data(df):\n    return df[:5]\n\nmy_custom_path_fs_io_manager = custom_path_fs_io_manager.configured(\n    {"base_dir": "path/to/basedir"}\n)\n\n@job(resource_defs={"io_manager": my_custom_path_fs_io_manager})\ndef my_job():\n    sample_data()\n
\n
\n
\n\n
\n
\n

Root Input Managers (Experimental)\u00b6

\n

Root input managers are user-provided objects that specify how to load inputs that aren\u2019t connected\nto upstream outputs.

\n
\n
\n@dagster.root_input_manager(config_schema=None, description=None, input_config_schema=None, required_resource_keys=None, version=None)[source]\u00b6
\n

Define a root input manager.

\n

Root input managers load op inputs that aren\u2019t connected to upstream outputs.

\n

The decorated function should accept an InputContext and resource config, and return\na loaded object that will be passed into one of the inputs of an op.

\n

The decorator produces a RootInputManagerDefinition.

\n
\n
Parameters
\n
    \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the resource-level config. If not\nset, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the resource.

  • \n
  • input_config_schema (Optional[ConfigSchema]) \u2013 A schema for the input-level config. Each\ninput that uses this input manager can be configured separately using this config.\nIf not set, Dagster will accept any config provided.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Keys for the resources required by the input\nmanager.

  • \n
  • version (Optional[str]) \u2013 (Experimental) the version of the input manager definition.

  • \n
\n
\n
\n

Examples:

\n
from dagster import root_input_manager, op, job, In\n\n@root_input_manager\ndef csv_loader(_):\n    return read_csv("some/path")\n\n@op(ins={"input1": In(root_manager_key="csv_loader_key")})\ndef my_op(_, input1):\n    do_stuff(input1)\n\n@job(resource_defs={"csv_loader_key": csv_loader})\ndef my_job():\n    my_op()\n\n@root_input_manager(config_schema={"base_dir": str})\ndef csv_loader(context):\n    return read_csv(context.resource_config["base_dir"] + "/some/path")\n\n@root_input_manager(input_config_schema={"path": str})\ndef csv_loader(context):\n    return read_csv(context.config["path"])\n
\n
\n
\n\n
\n
\nclass dagster.RootInputManager[source]\u00b6
\n

RootInputManagers are used to load inputs to ops at the root of a job.

\n

The easiest way to define a RootInputManager is with the\n@root_input_manager decorator.

\n
\n
\nabstract load_input(context)[source]\u00b6
\n

The user-defined read method that loads data given its metadata.

\n
\n
Parameters
\n

context (InputContext) \u2013 The context of the step output that produces this asset.

\n
\n
Returns
\n

The data object.

\n
\n
Return type
\n

Any

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.RootInputManagerDefinition(resource_fn=None, config_schema=None, description=None, input_config_schema=None, required_resource_keys=None, version=None)[source]\u00b6
\n

Definition of a root input manager resource.

\n

Root input managers load op inputs that aren\u2019t connected to upstream outputs.

\n

A RootInputManagerDefinition is a ResourceDefinition whose resource_fn returns a\nRootInputManager.

\n

The easiest way to create a RootInputManagerDefinition is with the\n@root_input_manager decorator.

\n
\n
\nproperty input_config_schema\u00b6
\n

The schema for per-input configuration for inputs that are managed by this\ninput manager

\n
\n\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/io-managers", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../partitions/", "title": "Partitioned Config"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../ops/", "title": "Ops"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/partitions", "Partitioned Config", "N", "next"], ["sections/api/apidocs/ops", "Ops", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/io-managers.rst.txt", "title": "IO Managers", "toc": "\n"}, "jobs": {"alabaster_version": "0.7.12", "body": "
\n

Jobs\u00b6

\n

A Job binds a Graph and the resources it needs to be executable.

\n

Jobs are created by calling GraphDefinition.to_job() on a graph instance, or using the job decorator.

\n
\n
\n@dagster.job(name=None, description=None, resource_defs=None, config=None, tags=None, logger_defs=None, executor_def=None, hooks=None, op_retry_policy=None, version_strategy=None, partitions_def=None, input_values=None)[source]\u00b6
\n

Creates a job with the specified parameters from the decorated graph/op invocation function.

\n

Using this decorator allows you to build an executable job by writing a function that invokes\nops (or graphs).

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name for the Job. Defaults to the name of this graph.

  • \n
  • resource_defs (Optional[Dict[str, ResourceDefinition]]) \u2013 Resources that are required by this graph for execution.\nIf not defined, io_manager will default to filesystem.

  • \n
  • config \u2013

    Describes how the job is parameterized at runtime.

    \n

    If no value is provided, then the schema for the job\u2019s run config is a standard\nformat based on its ops and resources.

    \n

    If a dictionary is provided, then it must conform to the standard config schema, and\nit will be used as the job\u2019s run config for the job whenever the job is executed.\nThe values provided will be viewable and editable in the Dagit playground, so be\ncareful with secrets.

    \n

    If a ConfigMapping object is provided, then the schema for the job\u2019s run config is\ndetermined by the config mapping, and the ConfigMapping, which should return\nconfiguration in the standard format to configure the job.

    \n

    If a PartitionedConfig object is provided, then it defines a discrete set of config\nvalues that can parameterize the pipeline, as well as a function for mapping those\nvalues to the base config. The values provided will be viewable and editable in the\nDagit playground, so be careful with secrets.

    \n

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for any execution of the Job.\nValues that are not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value. These tag values may be overwritten by tag\nvalues provided at invocation time.

  • \n
  • logger_defs (Optional[Dict[str, LoggerDefinition]]) \u2013 A dictionary of string logger identifiers to their implementations.

  • \n
  • executor_def (Optional[ExecutorDefinition]) \u2013 How this Job will be executed. Defaults to multiprocess_executor .

  • \n
  • op_retry_policy (Optional[RetryPolicy]) \u2013 The default retry policy for all ops in this job.\nOnly used if retry policy is not defined on the op definition or op invocation.

  • \n
  • version_strategy (Optional[VersionStrategy]) \u2013 Defines how each op (and optionally, resource) in the job can be versioned. If\nprovided, memoization will be enabled for this job.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines a discrete set of partition keys\nthat can parameterize the job. If this argument is supplied, the config argument\ncan\u2019t also be supplied.

  • \n
  • input_values (Optional[Mapping[str, Any]]) \u2013 A dictionary that maps python objects to the top-level inputs of a job.

  • \n
\n
\n
\n
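
A minimal sketch of the decorator in use:

\n
from dagster import job, op\n\n@op\ndef return_five():\n    return 5\n\n@op\ndef add_one(arg):\n    return arg + 1\n\n@job\ndef do_stuff():\n    add_one(return_five())\n
\n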
\n\n
\n
\nclass dagster.JobDefinition(graph_def, resource_defs=None, executor_def=None, logger_defs=None, config_mapping=None, partitioned_config=None, name=None, description=None, preset_defs=None, tags=None, hook_defs=None, op_retry_policy=None, version_strategy=None, _subset_selection_data=None, asset_layer=None, _input_values=None)[source]\u00b6
\n
\n
\nexecute_in_process(run_config=None, instance=None, partition_key=None, raise_on_error=True, op_selection=None, asset_selection=None, run_id=None, input_values=None)[source]\u00b6
\n

Execute the Job in-process, gathering results in-memory.

\n

The executor_def on the Job will be ignored, and replaced with the in-process executor.\nIf using the default io_manager, it will switch from filesystem to in-memory.

\n
\n
Parameters
\n
    \n
  • run_config (Optional[Dict[str, Any]]) \u2013 The configuration for the run

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The instance to execute against, an ephemeral one will be used if none provided.

  • \n
  • partition_key \u2013 (Optional[str])\nThe string partition key that specifies the run config to execute. Can only be used\nto select run config for jobs with partitioned config.

  • \n
  • raise_on_error (Optional[bool]) \u2013 Whether or not to raise exceptions when they occur.\nDefaults to True.

  • \n
  • op_selection (Optional[List[str]]) \u2013 A list of op selection queries (including single op\nnames) to execute. For example:\n* ['some_op']: selects some_op itself.\n* ['*some_op']: select some_op and all its ancestors (upstream dependencies).\n* ['*some_op+++']: select some_op, all its ancestors, and its descendants\n(downstream dependencies) within 3 levels down.\n* ['*some_op', 'other_op_a', 'other_op_b+']: select some_op and all its\nancestors, other_op_a itself, and other_op_b and its direct child ops.

  • \n
  • input_values (Optional[Mapping[str, Any]]) \u2013 A dictionary that maps python objects to the top-level inputs of the job. Input values provided here will override input values that have been provided to the job directly.

  • \n
\n
\n
Returns
\n

ExecuteInProcessResult

\n
\n
\n
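
For example (assuming the do_stuff job sketched above):

\n
result = do_stuff.execute_in_process()\nassert result.success\nassert result.output_for_node("add_one") == 6\n
\n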
\n\n
\n
\nwith_hooks(hook_defs)[source]\u00b6
\n

Apply a set of hooks to all op instances within the job.

\n
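
A sketch, assuming the do_stuff job from above and a simple failure hook:

\n
from dagster import failure_hook\n\n@failure_hook\ndef announce_failure(context):\n    # send an alert however you like; context.op.name identifies the failed op\n    print(f"Op {context.op.name} failed")\n\ndo_stuff_with_alerts = do_stuff.with_hooks({announce_failure})\n
\n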
\n\n
\n\n
\n

Reconstructable jobs\u00b6

\n
\n
\nclass dagster.reconstructable(target)[source]
\n

Create a ReconstructablePipeline from a\nfunction that returns a PipelineDefinition/JobDefinition,\nor a function decorated with @pipeline/@job.

\n

When your pipeline/job must cross process boundaries, e.g., for execution on multiple nodes or\nin different systems (like dagstermill), Dagster must know how to reconstruct the pipeline/job\non the other side of the process boundary.

\n

Passing a job created with ~dagster.GraphDefinition.to_job to reconstructable(),\nrequires you to wrap that job\u2019s definition in a module-scoped function, and pass that function\ninstead:

\n
from dagster import graph, reconstructable\n\n@graph\ndef my_graph():\n    ...\n\ndef define_my_job():\n    return my_graph.to_job()\n\nreconstructable(define_my_job)\n
\n
\n

This function implements a very conservative strategy for reconstruction, so that its behavior\nis easy to predict, but as a consequence it is not able to reconstruct certain kinds of pipelines\nor jobs, such as those defined by lambdas, in nested scopes (e.g., dynamically within a method\ncall), or in interactive environments such as the Python REPL or Jupyter notebooks.

\n

If you need to reconstruct objects constructed in these ways, you should use\nbuild_reconstructable_job() instead, which allows you to\nspecify your own reconstruction strategy.

\n

Examples:

\n
from dagster import job, reconstructable\n\n@job\ndef foo_job():\n    ...\n\nreconstructable_foo_job = reconstructable(foo_job)\n\n\n@graph\ndef foo():\n    ...\n\ndef make_bar_job():\n    return foo.to_job()\n\nreconstructable_bar_job = reconstructable(make_bar_job)\n
\n
\n
\n\n
\n
\ndagster.build_reconstructable_job(reconstructor_module_name, reconstructor_function_name, reconstructable_args=None, reconstructable_kwargs=None, reconstructor_working_directory=None)[source]\u00b6
\n

Create a dagster.core.definitions.reconstructable.ReconstructablePipeline.

\n

When your job must cross process boundaries, e.g., for execution on multiple nodes or in\ndifferent systems (like dagstermill), Dagster must know how to reconstruct the job\non the other side of the process boundary.

\n

This function allows you to use the strategy of your choice for reconstructing jobs, so\nthat you can reconstruct certain kinds of jobs that are not supported by\nreconstructable(), such as those defined by lambdas, in nested scopes (e.g.,\ndynamically within a method call), or in interactive environments such as the Python REPL or\nJupyter notebooks.

\n

If you need to reconstruct jobs constructed in these ways, use this function instead of\nreconstructable().

\n
\n
Parameters
\n
    \n
  • reconstructor_module_name (str) \u2013 The name of the module containing the function to use to\nreconstruct the job.

  • \n
  • reconstructor_function_name (str) \u2013 The name of the function to use to reconstruct the\njob.

  • \n
  • reconstructable_args (Tuple) \u2013 Args to the function to use to reconstruct the job.\nValues of the tuple must be JSON serializable.

  • \n
  • reconstructable_kwargs (Dict[str, Any]) \u2013 Kwargs to the function to use to reconstruct the\njob. Values of the dict must be JSON serializable.

  • \n
\n
\n
\n

Examples:

\n
# module: mymodule\n\nfrom dagster import JobDefinition, job, build_reconstructable_job\n\nclass JobFactory:\n    def make_job(*args, **kwargs):\n\n        @job\n        def _job(...):\n            ...\n\n        return _job\n\ndef reconstruct_job(*args):\n    factory = JobFactory()\n    return factory.make_job(*args)\n\nfactory = JobFactory()\n\nfoo_job_args = (...,...)\n\nfoo_job_kwargs = {...:...}\n\nfoo_job = factory.make_job(*foo_job_args, **foo_job_kwargs)\n\nreconstructable_foo_job = build_reconstructable_job(\n    'mymodule',\n    'reconstruct_job',\n    foo_job_args,\n    foo_job_kwargs,\n)\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/jobs", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../loggers/", "title": "Loggers"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../internals/", "title": "Internals"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/loggers", "Loggers", "N", "next"], ["sections/api/apidocs/internals", "Internals", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/jobs.rst.txt", "title": "Jobs", "toc": "\n"}, "libraries": {"dagster-airbyte": {"alabaster_version": "0.7.12", "body": "
\n

Airbyte (dagster-airbyte)\u00b6

\n

This library provides a Dagster integration with Airbyte.

\n
\n

Ops\u00b6

\n
\n
\ndagster_airbyte.airbyte_sync_op = <dagster.core.definitions.op_definition.OpDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
connection_id (String)
\n

The Airbyte Connection ID that this op will sync. You can retrieve this value from the \u201cConnections\u201d tab of a given connector in the Airbyte UI.

\n
\n
poll_interval (Float, optional)
\n

The time (in seconds) that will be waited between successive polls.

\n

Default Value: 10

\n
\n
poll_timeout (Union[Float, None], optional)
\n

The maximum time that will be waited before this operation is timed out. By default, this will never time out.

\n

Default Value: None

\n
\n
yield_materializations (Bool, optional)
\n

If True, materializations corresponding to the results of the Airbyte sync will be yielded when the op executes.

\n

Default Value: True

\n
\n
asset_key_prefix (List[String], optional)
\n

If provided and yield_materializations is True, these components will be used to prefix the generated asset keys.

\n

Default Value: [\u2018airbyte\u2019]

\n
\n
\n

Executes an Airbyte job sync for a given connection_id, and polls until that sync\ncompletes, raising an error if it is unsuccessful. It outputs an AirbyteOutput which contains\nthe job details for a given connection_id.

\n

It requires the use of the airbyte_resource, which allows it to\ncommunicate with the Airbyte API.

\n

Examples:

\n
from dagster import job\nfrom dagster_airbyte import airbyte_resource, airbyte_sync_op\n\nmy_airbyte_resource = airbyte_resource.configured(\n    {\n        "host": {"env": "AIRBYTE_HOST"},\n        "port": {"env": "AIRBYTE_PORT"},\n    }\n)\n\nsync_foobar = airbyte_sync_op.configured({"connection_id": "foobar"}, name="sync_foobar")\n\n@job(resource_defs={"airbyte": my_airbyte_resource})\ndef my_simple_airbyte_job():\n    sync_foobar()\n\n@job(resource_defs={"airbyte": my_airbyte_resource})\ndef my_composed_airbyte_job():\n    final_foobar_state = sync_foobar(start_after=some_op())\n    other_op(final_foobar_state)\n
\n
\n
\n\n
\n
\n

Resources\u00b6

\n
\n
\ndagster_airbyte.airbyte_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
host (dagster.StringSource)
\n

The Airbyte Server Address.

\n
\n
port (dagster.StringSource, optional)
\n

Port for the Airbyte Server.

\n
\n
use_https (Bool, optional)
\n

Whether to use HTTPS to connect to the Airbyte Server.

\n

Default Value: False

\n
\n
request_max_retries (Int, optional)
\n

The maximum number of times requests to the Airbyte API should be retried before failing.

\n

Default Value: 3

\n
\n
request_retry_delay (Float, optional)
\n

Time (in seconds) to wait between each request retry.

\n

Default Value: 0.25

\n
\n
\n

This resource allows users to programmatically interface with the Airbyte REST API to launch\nsyncs and monitor their progress. This currently implements only a subset of the functionality\nexposed by the API.

\n

For a complete set of documentation on the Airbyte REST API, including expected response JSON\nschema, see the Airbyte API Docs.

\n

To configure this resource, we recommend using the configured method.

\n

Examples:

\n
from dagster import job\nfrom dagster_airbyte import airbyte_resource\n\nmy_airbyte_resource = airbyte_resource.configured(\n    {\n        "host": {"env": "AIRBYTE_HOST"},\n        "port": {"env": "AIRBYTE_PORT"},\n    }\n)\n\n@job(resource_defs={"airbyte":my_airbyte_resource})\ndef my_airbyte_job():\n    ...\n
\n
\n
\n\n
\n
\nclass dagster_airbyte.AirbyteResource(host, port, use_https, request_max_retries=3, request_retry_delay=0.25, log=<Logger dagster.builtin (DEBUG)>)[source]\u00b6
\n

This class exposes methods on top of the Airbyte REST API.

\n
\n
\nmake_request(endpoint, data)[source]\u00b6
\n

Creates and sends a request to the desired Airbyte REST API endpoint.

\n
\n
Parameters
\n
    \n
  • endpoint (str) \u2013 The Airbyte API endpoint to send this request to.

  • \n
  • data (Optional[str]) \u2013 JSON-formatted data string to be included in the request.

  • \n
\n
\n
Returns
\n

Parsed json data from the response to this request

\n
\n
Return type
\n

Optional[Dict[str, Any]]

\n
\n
\n
\n\n
\n
\nsync_and_poll(connection_id, poll_interval=10, poll_timeout=None)[source]\u00b6
\n

Initializes a sync operation for the given connector, and polls until it completes.

\n
\n
Parameters
\n
    \n
  • connection_id (str) \u2013 The Airbyte Connector ID. You can retrieve this value from the\n\u201cConnection\u201d tab of a given connection in the Airbyte UI.

  • \n
  • poll_interval (float) \u2013 The time (in seconds) that will be waited between successive polls.

  • \n
  • poll_timeout (float) \u2013 The maximum time that will be waited before this operation is timed\nout. By default, this will never time out.

  • \n
\n
\n
Returns
\n

Details of the sync job.

\n
\n
Return type
\n

AirbyteOutput

\n
\n
\n
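
For illustration, calling the resource directly (host, port, and connection id are placeholders):

\n
from dagster_airbyte import AirbyteResource\n\nairbyte = AirbyteResource(host="localhost", port="8000", use_https=False)\n# blocks until the sync finishes and returns an AirbyteOutput with the job details\nsync_result = airbyte.sync_and_poll("your-connection-id", poll_interval=10)\n
\n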
\n\n
\n\n
\n
\n

Assets\u00b6

\n
\n
\ndagster_airbyte.build_airbyte_assets(connection_id, destination_tables, asset_key_prefix=None)[source]\u00b6
\n

Builds a set of assets representing the tables created by an Airbyte sync operation.

\n
\n
Parameters
\n
    \n
  • connection_id (str) \u2013 The Airbyte Connection ID that this op will sync. You can retrieve this\nvalue from the \u201cConnections\u201d tab of a given connector in the Airbyte UI.

  • \n
  • destination_tables (List[str]) \u2013 The names of the tables that you want to be represented\nin the Dagster asset graph for this sync. This will generally map to the name of the\nstream in Airbyte, unless a stream prefix has been specified in Airbyte.

  • \n
  • asset_key_prefix (Optional[List[str]]) \u2013 A prefix for the asset keys inside this asset.\nIf left blank, assets will have a key of AssetKey([table_name]).

  • \n
\n
\n
\n
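
A minimal sketch (the connection id and table names are placeholders):

\n
from dagster_airbyte import build_airbyte_assets\n\nairbyte_assets = build_airbyte_assets(\n    connection_id="your-airbyte-connection-id",\n    destination_tables=["users", "orders"],\n)\n
\n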
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-airbyte", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../dagster-airflow/", "title": "Airflow (dagster-airflow)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../../memoization/", "title": "Versioning and Memoization"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-airflow", "Airflow (dagster-airflow)", "N", "next"], ["sections/api/apidocs/memoization", "Versioning and Memoization", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-airbyte.rst.txt", "title": "Airbyte (dagster-airbyte)", "toc": "\n"}, "dagster-airflow": {"alabaster_version": "0.7.12", "body": "
\n

Airflow (dagster-airflow)\u00b6

\n
\n
\ndagster_airflow.make_airflow_dag(module_name, job_name, run_config=None, mode=None, instance=None, dag_id=None, dag_description=None, dag_kwargs=None, op_kwargs=None, pipeline_name=None)[source]\u00b6
\n

Construct an Airflow DAG corresponding to a given Dagster job/pipeline.

\n

Tasks in the resulting DAG will execute the Dagster logic they encapsulate as a Python\ncallable, run by an underlying PythonOperator. As a\nconsequence, dagster, any Python dependencies required by your solid logic, and the module\ncontaining your pipeline definition must be available in the Python environment within which\nyour Airflow tasks execute. If you cannot install requirements into this environment, or you\nare looking for a containerized solution to provide better isolation, see instead\nmake_airflow_dag_containerized().

\n

This function should be invoked in an Airflow DAG definition file, such as that created by an\ninvocation of the dagster-airflow scaffold CLI tool.

\n
\n
Parameters
\n
    \n
  • module_name (str) \u2013 The name of the importable module in which the pipeline/job definition can be\nfound.

  • \n
  • job_name (str) \u2013 The name of the job definition.

  • \n
  • run_config (Optional[dict]) \u2013 The config, if any, with which to compile\nthe pipeline/job to an execution plan, as a Python dict.

  • \n
  • mode (Optional[str]) \u2013 The mode in which to execute the pipeline.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The Dagster instance to use to execute the pipeline/job.

  • \n
  • dag_id (Optional[str]) \u2013 The id to use for the compiled Airflow DAG (passed through to\nDAG).

  • \n
  • dag_description (Optional[str]) \u2013 The description to use for the compiled Airflow DAG\n(passed through to DAG)

  • \n
  • dag_kwargs (Optional[dict]) \u2013 Any additional kwargs to pass to the Airflow\nDAG constructor, including default_args.

  • \n
  • op_kwargs (Optional[dict]) \u2013 Any additional kwargs to pass to the underlying Airflow\noperator (a subclass of\nPythonOperator).

  • \n
  • pipeline_name (str) \u2013 (legacy) The name of the pipeline definition.

  • \n
\n
\n
Returns
\n

The generated Airflow DAG, and a\nlist of its constituent tasks.

\n
\n
Return type
\n

(airflow.models.DAG, List[airflow.models.BaseOperator])

\n
\n
\n
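
A sketch of typical usage inside an Airflow DAG definition file (the module and job names are placeholders):

\n
from dagster_airflow import make_airflow_dag\n\ndag, tasks = make_airflow_dag(\n    module_name="my_dagster_package.jobs",  # placeholder: module containing the job\n    job_name="my_job",  # placeholder: name of the job definition\n    dag_kwargs={"schedule_interval": "@daily"},\n)\n
\n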
\n\n
\n
\ndagster_airflow.make_airflow_dag_for_operator(recon_repo, job_name, operator, run_config=None, mode=None, dag_id=None, dag_description=None, dag_kwargs=None, op_kwargs=None, pipeline_name=None)[source]\u00b6
\n

Construct an Airflow DAG corresponding to a given Dagster job/pipeline and custom operator.

\n

Custom operator template

\n

Tasks in the resulting DAG will execute the Dagster logic they encapsulate, run by the given\nOperator (a subclass of BaseOperator). If you\nare looking for a containerized solution to provide better isolation, see instead\nmake_airflow_dag_containerized().

\n

This function should be invoked in an Airflow DAG definition file, such as that created by an\ninvocation of the dagster-airflow scaffold CLI tool.

\n
\n
Parameters
\n
    \n
  • recon_repo (dagster.ReconstructableRepository) \u2013 reference to a Dagster RepositoryDefinition\nthat can be reconstructed in another process

  • \n
  • job_name (str) \u2013 The name of the job definition.

  • \n
  • operator (type) \u2013 The operator to use. Must be a class that inherits from\nBaseOperator

  • \n
  • run_config (Optional[dict]) \u2013 The config, if any, with which to compile\nthe pipeline to an execution plan, as a Python dict.

  • \n
  • mode (Optional[str]) \u2013 The mode in which to execute the pipeline.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The Dagster instance to use to execute the pipeline.

  • \n
  • dag_id (Optional[str]) \u2013 The id to use for the compiled Airflow DAG (passed through to\nDAG).

  • \n
  • dag_description (Optional[str]) \u2013 The description to use for the compiled Airflow DAG\n(passed through to DAG)

  • \n
  • dag_kwargs (Optional[dict]) \u2013 Any additional kwargs to pass to the Airflow\nDAG constructor, including default_args.

  • \n
  • op_kwargs (Optional[dict]) \u2013 Any additional kwargs to pass to the underlying Airflow\noperator.

  • \n
  • pipeline_name (str) \u2013 (legacy) The name of the pipeline definition.

  • \n
\n
\n
Returns
\n

The generated Airflow DAG, and a\nlist of its constituent tasks.

\n
\n
Return type
\n

(airflow.models.DAG, List[airflow.models.BaseOperator])

\n
\n
\n
\n\n
\n
\ndagster_airflow.make_airflow_dag_containerized(module_name, job_name, image, run_config=None, mode=None, dag_id=None, dag_description=None, dag_kwargs=None, op_kwargs=None, pipeline_name=None)[source]\u00b6
\n

Construct a containerized Airflow DAG corresponding to a given Dagster job/pipeline.

\n

Tasks in the resulting DAG will execute the Dagster logic they encapsulate using a subclass of\nDockerOperator. As a\nconsequence, dagster, any Python dependencies required by your solid logic, and the module\ncontaining your pipeline definition must be available in the container spun up by this operator.\nTypically you\u2019ll want to install these requirements onto the image you\u2019re using.

\n

This function should be invoked in an Airflow DAG definition file, such as that created by an\ninvocation of the dagster-airflow scaffold CLI tool.

\n
\n
Parameters
\n
    \n
  • module_name (str) \u2013 The name of the importable module in which the pipeline/job definition can be\nfound.

  • \n
  • job_name (str) \u2013 The name of the job definition.

  • \n
  • image (str) \u2013 The name of the Docker image to use for execution (passed through to\nDockerOperator).

  • \n
  • run_config (Optional[dict]) \u2013 The config, if any, with which to compile\nthe pipeline/job to an execution plan, as a Python dict.

  • \n
  • mode (Optional[str]) \u2013 The mode in which to execute the pipeline.

  • \n
  • dag_id (Optional[str]) \u2013 The id to use for the compiled Airflow DAG (passed through to\nDAG).

  • \n
  • dag_description (Optional[str]) \u2013 The description to use for the compiled Airflow DAG\n(passed through to DAG)

  • \n
  • dag_kwargs (Optional[dict]) \u2013 Any additional kwargs to pass to the Airflow\nDAG constructor, including default_args.

  • \n
  • op_kwargs (Optional[dict]) \u2013 Any additional kwargs to pass to the underlying Airflow\noperator (a subclass of\nDockerOperator).

  • \n
  • pipeline_name (str) \u2013 (legacy) The name of the pipeline definition.

  • \n
\n
\n
Returns
\n

The generated Airflow DAG, and a\nlist of its constituent tasks.

\n
\n
Return type
\n

(airflow.models.DAG, List[airflow.models.BaseOperator])

\n
\n
\n
\n\n
\n
\ndagster_airflow.make_dagster_job_from_airflow_dag(dag, tags=None, use_airflow_template_context=False, unique_id=None)[source]\u00b6
\n

Construct a Dagster job corresponding to a given Airflow DAG.

\n

Tasks in the resulting job will execute the execute() method on the corresponding\nAirflow Operator. Dagster, any dependencies required by Airflow Operators, and the module\ncontaining your DAG definition must be available in the Python environment within which your\nDagster solids execute.

\n

To set Airflow\u2019s execution_date for use with Airflow Operator\u2019s execute() methods,\neither:

\n
    \n
  1. \n
    (Best for ad hoc runs) Execute job directly. This will set execution_date to the

    time (in UTC) of the run.

    \n
    \n
    \n
  2. \n
  3. \n
    Add {'airflow_execution_date': utc_date_string} to the job tags. This will override

    behavior from (1).

    \n
    my_dagster_job = make_dagster_job_from_airflow_dag(\n        dag=dag,\n        tags={'airflow_execution_date': utc_execution_date_str}\n)\nmy_dagster_job.execute_in_process()\n
    \n
    \n
    \n
    \n
  4. \n
  5. \n
    (Recommended) Add {'airflow_execution_date': utc_date_string} to the run tags,

    such as in the Dagit UI. This will override behavior from (1) and (2)

    \n
    \n
    \n
  6. \n
\n

We apply normalized_name() to the dag id and task ids when generating job name and op\nnames to ensure that names conform to Dagster\u2019s naming conventions.

\n
\n
Parameters
\n
    \n
  • dag (DAG) \u2013 The Airflow DAG to compile into a Dagster job

  • \n
  • tags (Dict[str, Field]) \u2013 Job tags. Optionally include\ntags={\u2018airflow_execution_date\u2019: utc_date_string} to specify execution_date used within\nexecution of Airflow Operators.

  • \n
  • use_airflow_template_context (bool) \u2013 If True, will call get_template_context() on the\nAirflow TaskInstance model which requires and modifies the DagRun table.\n(default: False)

  • \n
  • unique_id (int) \u2013 If not None, this id will be postpended to generated op names. Used by\nframework authors to enforce unique op names within a repo.

  • \n
\n
\n
Returns
\n

The generated Dagster job

\n
\n
Return type
\n

JobDefinition

\n
\n
\n
\n\n
\n
\ndagster_airflow.make_dagster_repo_from_airflow_dags_path(dag_path, repo_name, safe_mode=True, store_serialized_dags=False, use_airflow_template_context=False)[source]\u00b6
\n

Construct a Dagster repository corresponding to Airflow DAGs in dag_path.

\n

DagBag.get_dag() dependency requires Airflow DB to be initialized.

\n
\n
Usage:

Create make_dagster_repo.py:

\n
from dagster_airflow.dagster_pipeline_factory import make_dagster_repo_from_airflow_dags_path\n\ndef make_repo_from_dir():\n    return make_dagster_repo_from_airflow_dags_path(\n        '/path/to/dags/', 'my_repo_name'\n    )\n
\n
\n

Use RepositoryDefinition as usual, for example:\ndagit -f path/to/make_dagster_repo.py -n make_repo_from_dir

\n
\n
\n
\n
Parameters
\n
    \n
  • dag_path (str) \u2013 Path to directory or file that contains Airflow Dags

  • \n
  • repo_name (str) \u2013 Name for generated RepositoryDefinition

  • \n
  • include_examples (bool) \u2013 True to include Airflow\u2019s example DAGs. (default: False)

  • \n
  • safe_mode (bool) \u2013 True to use Airflow\u2019s default heuristic to find files that contain DAGs\n(ie find files that contain both b\u2019DAG\u2019 and b\u2019airflow\u2019) (default: True)

  • \n
  • store_serialized_dags (bool) \u2013 True to read Airflow DAGS from Airflow DB. False to read DAGS\nfrom Python files. (default: False)

  • \n
  • use_airflow_template_context (bool) \u2013 If True, will call get_template_context() on the\nAirflow TaskInstance model which requires and modifies the DagRun table.\n(default: False)

  • \n
\n
\n
Returns
\n

RepositoryDefinition

\n
\n
\n
\n\n
\n
\ndagster_airflow.make_dagster_repo_from_airflow_dag_bag(dag_bag, repo_name, refresh_from_airflow_db=False, use_airflow_template_context=False)[source]\u00b6
\n

Construct a Dagster repository corresponding to Airflow DAGs in DagBag.

\n
\n
Usage:
\n
Create make_dagster_repo.py:

\n
from dagster_airflow.dagster_pipeline_factory import make_dagster_repo_from_airflow_dag_bag\nfrom airflow_home import my_dag_bag\n\ndef make_repo_from_dag_bag():\n    return make_dagster_repo_from_airflow_dag_bag(my_dag_bag, 'my_repo_name')\n
\n
\n

Use RepositoryDefinition as usual, for example:\ndagit -f path/to/make_dagster_repo.py -n make_repo_from_dag_bag

\n
\n
\n
\n
\n
\n
Parameters
\n
    \n
  • dag_bag (DagBag) \u2013 The Airflow DagBag containing the DAGs to convert into a Dagster repository

  • \n
  • repo_name (str) \u2013 Name for generated RepositoryDefinition

  • \n
  • refresh_from_airflow_db (bool) \u2013 If True, will refresh DAG if expired via DagBag.get_dag(),\nwhich requires access to initialized Airflow DB. If False (recommended), gets dag from\nDagBag\u2019s dags dict without depending on Airflow DB. (default: False)

  • \n
  • use_airflow_template_context (bool) \u2013 If True, will call get_template_context() on the\nAirflow TaskInstance model which requires and modifies the DagRun table.\n(default: False)

  • \n
\n
\n
Returns
\n

RepositoryDefinition

\n
\n
\n
\n\n
\n
\ndagster_airflow.make_dagster_repo_from_airflow_example_dags(repo_name='airflow_example_dags_repo')[source]\u00b6
\n

Construct a Dagster repository for Airflow\u2019s example DAGs.

\n
\n
Execution of the following Airflow example DAGs is not currently supported:

\u2018example_external_task_marker_child\u2019,\n\u2018example_pig_operator\u2019,\n\u2018example_skip_dag\u2019,\n\u2018example_trigger_target_dag\u2019,\n\u2018example_xcom\u2019,\n\u2018test_utils\u2019,

\n
\n
\n

Usage:

\n
\n
\n
Create make_dagster_repo.py:

\n
from dagster_airflow.dagster_pipeline_factory import make_dagster_repo_from_airflow_example_dags\n\ndef make_airflow_example_dags():\n    return make_dagster_repo_from_airflow_example_dags()\n
\n
\n

Use RepositoryDefinition as usual, for example:\ndagit -f path/to/make_dagster_repo.py -n make_airflow_example_dags

\n
\n
\n
\n
\n
Parameters
\n

repo_name (str) \u2013 Name for generated RepositoryDefinition

\n
\n
Returns
\n

RepositoryDefinition

\n
\n
\n
\n\n
\n
\ndagster_airflow.make_dagster_pipeline_from_airflow_dag(dag, tags=None, use_airflow_template_context=False, unique_id=None)[source]\u00b6
\n

Construct a Dagster pipeline corresponding to a given Airflow DAG.

\n

Tasks in the resulting pipeline will execute the execute() method on the corresponding\nAirflow Operator. Dagster, any dependencies required by Airflow Operators, and the module\ncontaining your DAG definition must be available in the Python environment within which your\nDagster solids execute.

\n

To set Airflow\u2019s execution_date for use with Airflow Operator\u2019s execute() methods,\neither:

\n
    \n
  1. \n
    (Best for ad hoc runs) Run Pipeline with \u2018default\u2019 preset, which sets execution_date to the

    time (in UTC) of pipeline invocation:

    \n
    execute_pipeline(\n    pipeline=make_dagster_pipeline_from_airflow_dag(dag=dag),\n    preset='default')\n
    \n
    \n
    \n
    \n
  2. \n
  3. Add {'airflow_execution_date': utc_date_string} to the PipelineDefinition tags. This will\noverride behavior from (1).

    \n
    \n
    execute_pipeline(\n    make_dagster_pipeline_from_airflow_dag(\n        dag=dag,\n        tags={'airflow_execution_date': utc_execution_date_str}\n    )\n)\n
    \n
    \n
    \n
  4. \n
  5. \n
    (Recommended) Add {'airflow_execution_date': utc_date_string} to the PipelineRun tags,

    such as in the Dagit UI. This will override behavior from (1) and (2)

    \n
    \n
    \n
  6. \n
\n

We apply normalized_name() to the dag id and task ids when generating pipeline name and solid\nnames to ensure that names conform to Dagster\u2019s naming conventions.

\n
\n
Parameters
\n
    \n
  • dag (DAG) \u2013 The Airflow DAG to compile into a Dagster pipeline

  • \n
  • tags (Dict[str, Field]) \u2013 Pipeline tags. Optionally include\ntags={\u2018airflow_execution_date\u2019: utc_date_string} to specify execution_date used within\nexecution of Airflow Operators.

  • \n
  • use_airflow_template_context (bool) \u2013 If True, will call get_template_context() on the\nAirflow TaskInstance model which requires and modifies the DagRun table.\n(default: False)

  • \n
  • unique_id (int) \u2013 If not None, this id will be postpended to generated solid names. Used by\nframework authors to enforce unique solid names within a repo.

  • \n
\n
\n
Returns
\n

The generated Dagster pipeline

\n
\n
Return type
\n

pipeline_def (PipelineDefinition)

\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-airflow", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "\n", "next": {"link": "../dagster-aws/", "title": "AWS (dagster-aws)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-airbyte/", "title": "Airbyte (dagster-airbyte)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-aws", "AWS (dagster-aws)", "N", "next"], ["sections/api/apidocs/libraries/dagster-airbyte", "Airbyte (dagster-airbyte)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-airflow.rst.txt", "title": "Airflow (dagster-airflow)", "toc": "\n"}, "dagster-aws": {"alabaster_version": "0.7.12", "body": "
\n

AWS (dagster-aws)\u00b6

\n

Utilities for interfacing with AWS with Dagster.

\n
\n

S3\u00b6

\n
\n
\nclass dagster_aws.s3.S3ComputeLogManager(bucket, local_dir=None, inst_data=None, prefix='dagster', use_ssl=True, verify=True, verify_cert_path=None, endpoint_url=None, skip_empty_files=False)[source]\u00b6
\n

Logs compute function stdout and stderr to S3.

\n

Users should not instantiate this class directly. Instead, use a YAML block in dagster.yaml\nsuch as the following:

\n
compute_logs:\n  module: dagster_aws.s3.compute_log_manager\n  class: S3ComputeLogManager\n  config:\n    bucket: "mycorp-dagster-compute-logs"\n    local_dir: "/tmp/cool"\n    prefix: "dagster-test-"\n    use_ssl: true\n    verify: true\n    verify_cert_path: "/path/to/cert/bundle.pem"\n    endpoint_url: "http://alternate-s3-host.io"\n    skip_empty_files: true\n
\n
\n
\n
Parameters
\n
    \n
  • bucket (str) \u2013 The name of the s3 bucket to which to log.

  • \n
  • local_dir (Optional[str]) \u2013 Path to the local directory in which to stage logs. Default:\ndagster.seven.get_system_temp_directory().

  • \n
  • prefix (Optional[str]) \u2013 Prefix for the log file keys.

  • \n
  • use_ssl (Optional[bool]) \u2013 Whether or not to use SSL. Default True.

  • \n
  • verify (Optional[bool]) \u2013 Whether or not to verify SSL certificates. Default True.

  • \n
  • verify_cert_path (Optional[str]) \u2013 A filename of the CA cert bundle to use. Only used if\nverify is set to False.

  • \n
  • endpoint_url (Optional[str]) \u2013 Override for the S3 endpoint url.

  • \n
  • skip_empty_files \u2013 (Optional[bool]): Skip upload of empty log files.

  • \n
  • inst_data (Optional[ConfigurableClassData]) \u2013 Serializable representation of the compute\nlog manager when newed up from config.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster_aws.s3.S3FileCache(s3_bucket, s3_key, s3_session, overwrite=False)[source]\u00b6
\n
\n\n
\n
\nclass dagster_aws.s3.S3FileHandle(s3_bucket, s3_key)[source]\u00b6
\n

A reference to a file on S3.

\n
\n
\nproperty path_desc\u00b6
\n

The file\u2019s S3 URL.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty s3_bucket\u00b6
\n

The name of the S3 bucket.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty s3_key\u00b6
\n

The S3 key.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty s3_path\u00b6
\n

The file\u2019s S3 URL.

\n
\n
Type
\n

str

\n
\n
\n
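
For instance (the bucket and key are placeholders):

\n
from dagster_aws.s3 import S3FileHandle\n\nhandle = S3FileHandle("my-bucket", "some/key/file.csv")\n# handle.s3_path gives the file's S3 URL, e.g. "s3://my-bucket/some/key/file.csv"\nprint(handle.s3_path)\n
\n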
\n\n
\n\n
\n
\ndagster_aws.s3.s3_file_manager ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
use_unsigned_session (Bool, optional)
\n

Specifies whether to use an unsigned S3 session

\n

Default Value: False

\n
\n
region_name (String, optional)
\n

Specifies a custom region for the S3 session

\n
\n
endpoint_url (dagster.StringSource, optional)
\n

Specifies a custom endpoint for the S3 session

\n
\n
max_attempts (Int, optional)
\n

This provides Boto3\u2019s retry handler with a value of maximum retry attempts, where the initial call counts toward the max_attempts value that you provide

\n

Default Value: 5

\n
\n
profile_name (String, optional)
\n

Specifies a profile to connect to that session

\n
\n
s3_bucket (dagster.StringSource)
\n

\n
s3_prefix (dagster.StringSource, optional)
\n

Default Value: \u2018dagster\u2019

\n
\n
\n

FileManager that provides abstract access to S3.

\n

Implements the FileManager API.

\n
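As an illustration only, a hedged sketch of attaching s3_file_manager to a job and writing bytes through the generic FileManager API; the bucket name is hypothetical, and the write_data call reflects the FileManager interface rather than anything documented on this page:

\n
from dagster import job, op\nfrom dagster_aws.s3 import s3_file_manager\n\n@op(required_resource_keys={"file_manager"})\ndef example_file_op(context):\n    # write_data is part of the generic FileManager API and returns a file handle.\n    return context.resources.file_manager.write_data(b"hello, world")\n\n@job(resource_defs={"file_manager": s3_file_manager})\ndef example_file_job():\n    example_file_op()\n\nexample_file_job.execute_in_process(\n    run_config={\n        "resources": {\n            "file_manager": {\n                "config": {"s3_bucket": "my-bucket"}  # hypothetical bucket\n            }\n        }\n    }\n)\n
\n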
\n\n
\n
\ndagster_aws.s3.s3_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
use_unsigned_session (Bool, optional)
\n

Specifies whether to use an unsigned S3 session

\n

Default Value: False

\n
\n
region_name (String, optional)
\n

Specifies a custom region for the S3 session

\n
\n
endpoint_url (dagster.StringSource, optional)
\n

Specifies a custom endpoint for the S3 session

\n
\n
max_attempts (Int, optional)
\n

This provides Boto3\u2019s retry handler with the maximum number of retry attempts; the initial call counts toward the max_attempts value that you provide

\n

Default Value: 5

\n
\n
profile_name (String, optional)
\n

Specifies a profile to use for the session

\n
\n
\n

Resource that gives access to S3.

\n

The underlying S3 session is created by calling\nboto3.session.Session(profile_name).\nThe returned resource object is an S3 client, an instance of botocore.client.S3.

\n

Example

\n
from dagster import job, op\nfrom dagster_aws.s3 import s3_resource\n\n@op(required_resource_keys={'s3'})\ndef example_s3_op(context):\n    return context.resources.s3.list_objects_v2(\n        Bucket='my-bucket',\n        Prefix='some-key'\n    )\n\n@job(resource_defs={'s3': s3_resource})\ndef example_job():\n    example_s3_op()\n\nexample_job.execute_in_process(\n    run_config={\n        'resources': {\n            's3': {\n                'config': {\n                    'region_name': 'us-west-1',\n                }\n            }\n        }\n    }\n)\n
\n
\n

Note that your ops must also declare that they require this resource with\nrequired_resource_keys, or it will not be initialized for the execution of their compute\nfunctions.

\n

You may configure this resource as follows:

\n
resources:\n  s3:\n    config:\n      region_name: "us-west-1"\n      # Optional[str]: Specifies a custom region for the S3 session. Default is chosen\n      # through the ordinary boto credential chain.\n      use_unsigned_session: false\n      # Optional[bool]: Specifies whether to use an unsigned S3 session. Default: False\n      endpoint_url: "http://localhost"\n      # Optional[str]: Specifies a custom endpoint for the S3 session. Default is None.\n      profile_name: "dev"\n      # Optional[str]: Specifies a custom profile for the S3 session. Default is default\n      # profile as specified in ~/.aws/credentials file\n
\n
\n
\n\n
\n
\ndagster_aws.s3.S3Coordinate DagsterType\u00b6
\n

A dagster.DagsterType intended to make it easier to pass information about files on S3\nfrom op to op. Objects of this type should be dicts with 'bucket' and 'key' keys,\nand may be hydrated from config in the intuitive way, e.g., for an input with the name\ns3_file:

\n
inputs:\n  s3_file:\n    value:\n      bucket: my-bucket\n      key: my-key\n
\n
\n
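For illustration, a hedged sketch of an op whose s3_file input is typed as S3Coordinate; the returned URL string simply shows reading the 'bucket' and 'key' keys:

\n
from dagster import In, op\nfrom dagster_aws.s3 import S3Coordinate\n\n@op(ins={"s3_file": In(S3Coordinate)})\ndef consume_s3_file(s3_file) -> str:\n    # An S3Coordinate is a dict with 'bucket' and 'key' keys.\n    return "s3://{}/{}".format(s3_file["bucket"], s3_file["key"])\n
\n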
\n\n
\n
\ndagster_aws.s3.s3_pickle_io_manager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
s3_bucket (dagster.StringSource)
\n

\n
s3_prefix (dagster.StringSource, optional)
\n

Default Value: \u2018dagster\u2019

\n
\n
\n

Persistent IO manager using S3 for storage.

\n

Serializes objects via pickling. Suitable for object storage for distributed executors, so long\nas each execution node has network connectivity and credentials for S3 and the backing bucket.

\n

Attach this resource definition to your job to make it available to your ops.

\n
@job(resource_defs={'io_manager': s3_pickle_io_manager, "s3": s3_resource, ...})\ndef my_job():\n    ...\n
\n
\n

You may configure this storage as follows:

\n
resources:\n    io_manager:\n        config:\n            s3_bucket: my-cool-bucket\n            s3_prefix: good/prefix-for-files-\n
\n
\n
\n\n
\n
\ndagster_aws.s3.s3_pickle_asset_io_manager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
s3_bucket (dagster.StringSource)
\n

\n
s3_prefix (dagster.StringSource, optional)
\n

Default Value: \u2018dagster\u2019

\n
\n
\n

Persistent IO manager using S3 for storage, meant for use with software-defined assets.

\n

Each asset is assigned to a single filesystem path, so subsequent materializations of an asset\nwill overwrite previous materializations of that asset.

\n

Serializes objects via pickling. Suitable for object storage for distributed executors, so long\nas each execution node has network connectivity and credentials for S3 and the backing bucket.

\n

Attach this resource definition to your job to make it available to your ops.

\n
asset_group = AssetGroup(\n    assets...,\n    resource_defs={'io_manager': s3_pickle_asset_io_manager, "s3": s3_resource, ...},\n)\n
\n
\n

You may configure this IO manager as follows:

\n
resources:\n    io_manager:\n        config:\n            s3_bucket: my-cool-bucket\n            s3_prefix: good/prefix-for-files-\n
\n
\n
\n\n
\n
\n

ECS\u00b6

\n
\n
\ndagster_aws.ecs.EcsRunLauncher RunLauncher[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
task_definition (dagster.StringSource, optional)
\n

The task definition to use when launching new tasks. If none is provided, each run will create its own task definition.

\n
\n
container_name (dagster.StringSource, optional)
\n

The container name to use when launching new tasks. Defaults to \u2018run\u2019.

\n

Default Value: \u2018run\u2019

\n
\n
secrets (List[Union[String, strict dict]], optional)
\n

An array of AWS Secrets Manager secrets. These secrets will be mounted as environment variables in the container. See https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_Secret.html.

\n
\n
secrets_tag (Union[dagster.StringSource, None], optional)
\n

AWS Secrets Manager secrets with this tag will be mounted as environment variables in the container. Defaults to \u2018dagster\u2019.

\n

Default Value: \u2018dagster\u2019

\n
\n
include_sidecars (Bool, optional)
\n

Whether each run should use the same sidecars as the task that launches it. Defaults to False.

\n

Default Value: False

\n
\n
\n

RunLauncher that starts a task in ECS for each Dagster job run.

\n
\n\n
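As a hedged illustration, the run launcher is typically enabled through a run_launcher block in dagster.yaml such as the following; the field values shown are hypothetical:

\n
run_launcher:\n  module: dagster_aws.ecs\n  class: EcsRunLauncher\n  config:\n    container_name: "run"\n    include_sidecars: false\n
\n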
\n
\n

Redshift\u00b6

\n
\n
\ndagster_aws.redshift.redshift_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
host (dagster.StringSource)
\n

Redshift host

\n
\n
port (dagster.IntSource, optional)
\n

Redshift port

\n

Default Value: 5439

\n
\n
user (dagster.StringSource, optional)
\n

Username for Redshift connection

\n
\n
password (dagster.StringSource, optional)
\n

Password for Redshift connection

\n
\n
database (dagster.StringSource, optional)
\n

Name of the default database to use. After login, you can use USE DATABASE to change the database.

\n
\n
schema (dagster.StringSource, optional)
\n

Name of the default schema to use. After login, you can use USE SCHEMA to change the schema.

\n
\n
autocommit (Bool, optional)
\n

None by default, which honors the Redshift parameter AUTOCOMMIT. Set to True or False to enable or disable autocommit mode in the session, respectively.

\n
\n
connect_timeout (Int, optional)
\n

Connection timeout in seconds. 5 seconds by default

\n

Default Value: 5

\n
\n
sslmode (String, optional)
\n

SSL mode to use. See the Redshift documentation for more information on usage: https://docs.aws.amazon.com/redshift/latest/mgmt/connecting-ssl-support.html

\n

Default Value: \u2018require\u2019

\n
\n
\n

This resource enables connecting to a Redshift cluster and issuing queries against that\ncluster.

\n

Example

\n
from dagster import build_op_context, op\nfrom dagster_aws.redshift import redshift_resource\n\n@op(required_resource_keys={'redshift'})\ndef example_redshift_op(context):\n    return context.resources.redshift.execute_query('SELECT 1', fetch_results=True)\n\nredshift_configured = redshift_resource.configured({\n    'host': 'my-redshift-cluster.us-east-1.redshift.amazonaws.com',\n    'port': 5439,\n    'user': 'dagster',\n    'password': 'dagster',\n    'database': 'dev',\n})\ncontext = build_op_context(resources={'redshift': redshift_configured})\nassert example_redshift_op(context) == [(1,)]\n
\n
\n
\n\n
\n

Testing\u00b6

\n
\n
\ndagster_aws.redshift.fake_redshift_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
host (dagster.StringSource)
\n

Redshift host

\n
\n
port (dagster.IntSource, optional)
\n

Redshift port

\n

Default Value: 5439

\n
\n
user (dagster.StringSource, optional)
\n

Username for Redshift connection

\n
\n
password (dagster.StringSource, optional)
\n

Password for Redshift connection

\n
\n
database (dagster.StringSource, optional)
\n

Name of the default database to use. After login, you can use USE DATABASE to change the database.

\n
\n
schema (dagster.StringSource, optional)
\n

Name of the default schema to use. After login, you can use USE SCHEMA to change the schema.

\n
\n
autocommit (Bool, optional)
\n

None by default, which honors the Redshift parameter AUTOCOMMIT. Set to True or False to enable or disable autocommit mode in the session, respectively.

\n
\n
connect_timeout (Int, optional)
\n

Connection timeout in seconds. 5 seconds by default

\n

Default Value: 5

\n
\n
sslmode (String, optional)
\n

SSL mode to use. See the Redshift documentation for more information on usage: https://docs.aws.amazon.com/redshift/latest/mgmt/connecting-ssl-support.html

\n

Default Value: \u2018require\u2019

\n
\n
\n
\n\n
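For illustration, a hedged sketch of swapping the fake resource into a test, mirroring the redshift_resource example above; the exact canned result returned by the fake client is an assumption and is not asserted here:

\n
from dagster import build_op_context, op\nfrom dagster_aws.redshift import fake_redshift_resource\n\n@op(required_resource_keys={"redshift"})\ndef example_redshift_op(context):\n    return context.resources.redshift.execute_query("SELECT 1", fetch_results=True)\n\nfake_configured = fake_redshift_resource.configured({\n    "host": "fake-host",  # hypothetical value; host is required by the config schema\n    "user": "dagster",\n    "password": "dagster",\n    "database": "dev",\n})\ncontext = build_op_context(resources={"redshift": fake_configured})\n# Returns a canned result without connecting to a real cluster (assumption).\nresult = example_redshift_op(context)\n
\n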
\n
\n
\n

EMR\u00b6

\n
\n
\ndagster_aws.emr.emr_pyspark_step_launcher ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
spark_config (permissive dict, optional)
\n
\nDefault Value:
{\n    "spark": {\n        "app": {},\n        "driver": {\n            "blockManager": {}\n        },\n        "executor": {\n            "pyspark": {},\n            "logs": {\n                "rolling": {\n                    "time": {}\n                }\n            }\n        },\n        "local": {},\n        "submit": {},\n        "log": {},\n        "redaction": {},\n        "python": {\n            "profile": {},\n            "worker": {}\n        },\n        "files": {},\n        "jars": {},\n        "pyspark": {\n            "driver": {}\n        },\n        "reducer": {},\n        "shuffle": {\n            "file": {},\n            "io": {},\n            "service": {\n                "index": {\n                    "cache": {}\n                }\n            },\n            "sort": {},\n            "spill": {},\n            "registration": {}\n        },\n        "eventLog": {\n            "logBlockUpdates": {},\n            "longForm": {},\n            "buffer": {}\n        },\n        "ui": {\n            "dagGraph": {},\n            "liveUpdate": {}\n        },\n        "worker": {\n            "ui": {}\n        },\n        "sql": {\n            "ui": {}\n        },\n        "streaming": {\n            "ui": {},\n            "backpressure": {},\n            "receiver": {\n                "writeAheadLog": {}\n            },\n            "kafka": {},\n            "driver": {\n                "writeAheadLog": {}\n            }\n        },\n        "broadcast": {},\n        "io": {\n            "compression": {\n                "lz4": {},\n                "snappy": {},\n                "zstd": {}\n            }\n        },\n        "kryo": {},\n        "kryoserializer": {\n            "buffer": {}\n        },\n        "rdd": {},\n        "serializer": {},\n        "memory": {\n            "offHeap": {}\n        },\n        "storage": {\n            "replication": {}\n        },\n        "cleaner": {\n            "periodicGC": {},\n            "referenceTracking": {\n                "blocking": {}\n            }\n        },\n        "default": {},\n        "hadoop": {\n            "mapreduce": {\n                "fileoutputcommitter": {\n                    "algorithm": {}\n                }\n            }\n        },\n        "rpc": {\n            "message": {},\n            "retry": {}\n        },\n        "blockManager": {},\n        "network": {},\n        "port": {},\n        "core": {\n            "connection": {\n                "ack": {\n                    "wait": {}\n                }\n            }\n        },\n        "cores": {},\n        "locality": {\n            "wait": {}\n        },\n        "scheduler": {\n            "revive": {},\n            "listenerbus": {\n                "eventqueue": {}\n            }\n        },\n        "blacklist": {\n            "task": {},\n            "stage": {},\n            "application": {\n                "fetchFailure": {}\n            }\n        },\n        "speculation": {},\n        "task": {\n            "reaper": {}\n        },\n        "stage": {},\n        "dynamicAllocation": {},\n        "r": {\n            "driver": {},\n            "shell": {}\n        },\n        "graphx": {\n            "pregel": {}\n        },\n        "deploy": {\n            "zookeeper": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
spark (permissive dict, optional)
\n
\nDefault Value:
{\n    "app": {},\n    "driver": {\n        "blockManager": {}\n    },\n    "executor": {\n        "pyspark": {},\n        "logs": {\n            "rolling": {\n                "time": {}\n            }\n        }\n    },\n    "local": {},\n    "submit": {},\n    "log": {},\n    "redaction": {},\n    "python": {\n        "profile": {},\n        "worker": {}\n    },\n    "files": {},\n    "jars": {},\n    "pyspark": {\n        "driver": {}\n    },\n    "reducer": {},\n    "shuffle": {\n        "file": {},\n        "io": {},\n        "service": {\n            "index": {\n                "cache": {}\n            }\n        },\n        "sort": {},\n        "spill": {},\n        "registration": {}\n    },\n    "eventLog": {\n        "logBlockUpdates": {},\n        "longForm": {},\n        "buffer": {}\n    },\n    "ui": {\n        "dagGraph": {},\n        "liveUpdate": {}\n    },\n    "worker": {\n        "ui": {}\n    },\n    "sql": {\n        "ui": {}\n    },\n    "streaming": {\n        "ui": {},\n        "backpressure": {},\n        "receiver": {\n            "writeAheadLog": {}\n        },\n        "kafka": {},\n        "driver": {\n            "writeAheadLog": {}\n        }\n    },\n    "broadcast": {},\n    "io": {\n        "compression": {\n            "lz4": {},\n            "snappy": {},\n            "zstd": {}\n        }\n    },\n    "kryo": {},\n    "kryoserializer": {\n        "buffer": {}\n    },\n    "rdd": {},\n    "serializer": {},\n    "memory": {\n        "offHeap": {}\n    },\n    "storage": {\n        "replication": {}\n    },\n    "cleaner": {\n        "periodicGC": {},\n        "referenceTracking": {\n            "blocking": {}\n        }\n    },\n    "default": {},\n    "hadoop": {\n        "mapreduce": {\n            "fileoutputcommitter": {\n                "algorithm": {}\n            }\n        }\n    },\n    "rpc": {\n        "message": {},\n        "retry": {}\n    },\n    "blockManager": {},\n    "network": {},\n    "port": {},\n    "core": {\n        "connection": {\n            "ack": {\n                "wait": {}\n            }\n        }\n    },\n    "cores": {},\n    "locality": {\n        "wait": {}\n    },\n    "scheduler": {\n        "revive": {},\n        "listenerbus": {\n            "eventqueue": {}\n        }\n    },\n    "blacklist": {\n        "task": {},\n        "stage": {},\n        "application": {\n            "fetchFailure": {}\n        }\n    },\n    "speculation": {},\n    "task": {\n        "reaper": {}\n    },\n    "stage": {},\n    "dynamicAllocation": {},\n    "r": {\n        "driver": {},\n        "shell": {}\n    },\n    "graphx": {\n        "pregel": {}\n    },\n    "deploy": {\n        "zookeeper": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
app (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
name (dagster.StringSource, optional)
\n

Application Properties: The name of your application. This will appear in the UI and in log data.

\n
\n
\n
\n
driver (permissive dict, optional)
\n
\nDefault Value:
{\n    "blockManager": {}\n}\n
\n
\n
\nConfig Schema:
\n
cores (dagster.IntSource, optional)
\n

Application Properties: Number of cores to use for the driver process, only in cluster mode.

\n
\n
maxResultSize (dagster.StringSource, optional)
\n

Application Properties: Limit of total size of serialized results of all partitions for each Spark action (e.g. collect) in bytes. Should be at least 1M, or 0 for unlimited. Jobs will be aborted if the total size is above this limit. Having a high limit may cause out-of-memory errors in driver (depends on spark.driver.memory and memory overhead of objects in JVM). Setting a proper limit can protect the driver from out-of-memory errors.

\n
\n
memory (dagster.StringSource, optional)
\n

Application Properties: Amount of memory to use for the driver process, i.e. where SparkContext is initialized, in the same format as JVM memory strings with a size unit suffix (\u201ck\u201d, \u201cm\u201d, \u201cg\u201d or \u201ct\u201d) (e.g. 512m, 2g). Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the \u2013driver-memory command line option or in your default properties file.

\n
\n
memoryOverhead (dagster.StringSource, optional)
\n

Application Properties: The amount of off-heap memory to be allocated per driver in cluster mode, in MiB unless otherwise specified. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc. This tends to grow with the container size (typically 6-10%). This option is currently supported on YARN and Kubernetes.

\n
\n
supervise (Bool, optional)
\n

Application Properties: If true, restarts the driver automatically if it fails with a non-zero exit status. Only has effect in Spark standalone mode or Mesos cluster deploy mode.

\n
\n
extraClassPath (dagster.StringSource, optional)
\n

Runtime Environment: Extra classpath entries to prepend to the classpath of the driver. Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the \u2013driver-class-path command line option or in your default properties file.

\n
\n
extraJavaOptions (dagster.StringSource, optional)
\n

Runtime Environment: A string of extra JVM options to pass to the driver. For instance, GC settings or other logging. Note that it is illegal to set maximum heap size (-Xmx) settings with this option. Maximum heap size settings can be set with spark.driver.memory in the cluster mode and through the \u2013driver-memory command line option in the client mode. Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the \u2013driver-java-options command line option or in your default properties file.

\n
\n
extraLibraryPath (dagster.StringSource, optional)
\n

Runtime Environment: Set a special library path to use when launching the driver JVM. Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the \u2013driver-library-path command line option or in your default properties file.

\n
\n
userClassPathFirst (Bool, optional)
\n

Runtime Environment: (Experimental) Whether to give user-added jars precedence over Spark\u2019s own jars when loading classes in the driver. This feature can be used to mitigate conflicts between Spark\u2019s dependencies and user dependencies. It is currently an experimental feature. This is used in cluster mode only.

\n
\n
blockManager (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
port (dagster.StringSource, optional)
\n

Networking: Driver-specific port for the block manager to listen on, for cases where it cannot use the same configuration as executors.

\n
\n
\n
\n
bindAddress (dagster.StringSource, optional)
\n

Networking: Hostname or IP address where to bind listening sockets. This config overrides the SPARK_LOCAL_IP environment variable (see below). It also allows a different address from the local one to be advertised to executors or external systems. This is useful, for example, when running containers with bridged networking. For this to properly work, the different ports used by the driver (RPC, block manager and UI) need to be forwarded from the container\u2019s host.

\n
\n
host (dagster.StringSource, optional)
\n

Networking: Hostname or IP address for the driver. This is used for communicating with the executors and the standalone Master.

\n
\n
port (dagster.StringSource, optional)
\n

Networking: Port for the driver to listen on. This is used for communicating with the executors and the standalone Master.

\n
\n
\n
\n
executor (permissive dict, optional)
\n
\nDefault Value:
{\n    "pyspark": {},\n    "logs": {\n        "rolling": {\n            "time": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
memory (dagster.StringSource, optional)
\n

Application Properties: Amount of memory to use per executor process, in the same format as JVM memory strings with a size unit suffix (\u201ck\u201d, \u201cm\u201d, \u201cg\u201d or \u201ct\u201d) (e.g. 512m, 2g).

\n
\n
pyspark (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
memory (dagster.StringSource, optional)
\n

Application Properties: The amount of memory to be allocated to PySpark in each executor, in MiB unless otherwise specified. If set, PySpark memory for an executor will be limited to this amount. If not set, Spark will not limit Python\u2019s memory use and it is up to the application to avoid exceeding the overhead memory space shared with other non-JVM processes. When PySpark is run in YARN or Kubernetes, this memory is added to executor resource requests.

\n
\n
\n
\n
memoryOverhead (dagster.StringSource, optional)
\n

Application Properties: The amount of off-heap memory to be allocated per executor, in MiB unless otherwise specified. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc. This tends to grow with the executor size (typically 6-10%). This option is currently supported on YARN and Kubernetes.

\n
\n
extraClassPath (dagster.StringSource, optional)
\n

Runtime Environment: Extra classpath entries to prepend to the classpath of executors. This exists primarily for backwards-compatibility with older versions of Spark. Users typically should not need to set this option.

\n
\n
extraJavaOptions (dagster.StringSource, optional)
\n

Runtime Environment: A string of extra JVM options to pass to executors. For instance, GC settings or other logging. Note that it is illegal to set Spark properties or maximum heap size (-Xmx) settings with this option. Spark properties should be set using a SparkConf object or the spark-defaults.conf file used with the spark-submit script. Maximum heap size settings can be set with spark.executor.memory. The following symbols, if present will be interpolated: {{APP_ID}} will be replaced by application ID and {{EXECUTOR_ID}} will be replaced by executor ID. For example, to enable verbose gc logging to a file named for the executor ID of the app in /tmp, pass a \u2018value\u2019 of: -verbose:gc -Xloggc:/tmp/{{APP_ID}}-{{EXECUTOR_ID}}.gc

\n
\n
extraLibraryPath (dagster.StringSource, optional)
\n

Runtime Environment: Set a special library path to use when launching executor JVM\u2019s.

\n
\n
logs (permissive dict, optional)
\n
\nDefault Value:
{\n    "rolling": {\n        "time": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
rolling (permissive dict, optional)
\n
\nDefault Value:
{\n    "time": {}\n}\n
\n
\n
\nConfig Schema:
\n
maxRetainedFiles (dagster.IntSource, optional)
\n

Runtime Environment: Sets the number of latest rolling log files that are going to be retained by the system. Older log files will be deleted. Disabled by default.

\n
\n
enableCompression (Bool, optional)
\n

Runtime Environment: Enable executor log compression. If it is enabled, the rolled executor logs will be compressed. Disabled by default.

\n
\n
maxSize (dagster.IntSource, optional)
\n

Runtime Environment: Set the max size of the file in bytes by which the executor logs will be rolled over. Rolling is disabled by default. See spark.executor.logs.rolling.maxRetainedFiles for automatic cleaning of old logs.

\n
\n
strategy (dagster.StringSource, optional)
\n

Runtime Environment: Set the strategy of rolling of executor logs. By default it is disabled. It can be set to \u201ctime\u201d (time-based rolling) or \u201csize\u201d (size-based rolling). For \u201ctime\u201d, use spark.executor.logs.rolling.time.interval to set the rolling interval. For \u201csize\u201d, use spark.executor.logs.rolling.maxSize to set the maximum file size for rolling.

\n
\n
time (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
interval (dagster.StringSource, optional)
\n

Runtime Environment: Set the time interval by which the executor logs will be rolled over. Rolling is disabled by default. Valid values are daily, hourly, minutely or any interval in seconds. See spark.executor.logs.rolling.maxRetainedFiles for automatic cleaning of old logs.

\n
\n
\n
\n
\n
\n
\n
\n
userClassPathFirst (Bool, optional)
\n

Runtime Environment: (Experimental) Same functionality as spark.driver.userClassPathFirst, but applied to executor instances.

\n
\n
cores (dagster.IntSource, optional)
\n

Execution Behavior: The number of cores to use on each executor. In standalone and Mesos coarse-grained modes, for more detail, see this description.

\n
\n
heartbeatInterval (dagster.StringSource, optional)
\n

Execution Behavior: Interval between each executor\u2019s heartbeats to the driver. Heartbeats let the driver know that the executor is still alive and update it with metrics for in-progress tasks. spark.executor.heartbeatInterval should be significantly less than spark.network.timeout

\n
\n
\n
\n
extraListeners (dagster.StringSource, optional)
\n

Application Properties: A comma-separated list of classes that implement SparkListener; when initializing SparkContext, instances of these classes will be created and registered with Spark\u2019s listener bus. If a class has a single-argument constructor that accepts a SparkConf, that constructor will be called; otherwise, a zero-argument constructor will be called. If no valid constructor can be found, the SparkContext creation will fail with an exception.

\n
\n
local (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
dir (dagster.StringSource, optional)
\n

Application Properties: Directory to use for \u201cscratch\u201d space in Spark, including map output files and RDDs that get stored on disk. This should be on a fast, local disk in your system. It can also be a comma-separated list of multiple directories on different disks. NOTE: In Spark 1.0 and later this will be overridden by SPARK_LOCAL_DIRS (Standalone), MESOS_SANDBOX (Mesos) or LOCAL_DIRS (YARN) environment variables set by the cluster manager.

\n
\n
\n
\n
logConf (Bool, optional)
\n

Application Properties: Logs the effective SparkConf as INFO when a SparkContext is started.

\n
\n
master (dagster.StringSource, optional)
\n

Application Properties: The cluster manager to connect to. See the list of allowed master URL\u2019s.

\n
\n
submit (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
deployMode (dagster.StringSource, optional)
\n

Application Properties: The deploy mode of Spark driver program, either \u201cclient\u201d or \u201ccluster\u201d, Which means to launch driver program locally (\u201cclient\u201d) or remotely (\u201ccluster\u201d) on one of the nodes inside the cluster.

\n
\n
pyFiles (dagster.StringSource, optional)
\n

Runtime Environment: Comma-separated list of .zip, .egg, or .py files to place on the PYTHONPATH for Python apps. Globs are allowed.

\n
\n
\n
\n
log (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
callerContext (dagster.StringSource, optional)
\n

Application Properties: Application information that will be written into Yarn RM log/HDFS audit log when running on Yarn/HDFS. Its length depends on the Hadoop configuration hadoop.caller.context.max.size. It should be concise, and typically can have up to 50 characters.

\n
\n
\n
\n
redaction (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
regex (dagster.StringSource, optional)
\n

Runtime Environment: Regex to decide which Spark configuration properties and environment variables in driver and executor environments contain sensitive information. When this regex matches a property key or value, the value is redacted from the environment UI and various logs like YARN and event logs.

\n
\n
\n
\n
python (permissive dict, optional)
\n
\nDefault Value:
{\n    "profile": {},\n    "worker": {}\n}\n
\n
\n
\nConfig Schema:
\n
profile (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (Bool, optional)
\n

Runtime Environment: Enable profiling in Python worker, the profile result will show up by sc.show_profiles(), or it will be displayed before the driver exits. It also can be dumped into disk by sc.dump_profiles(path). If some of the profile results had been displayed manually, they will not be displayed automatically before driver exiting. By default the pyspark.profiler.BasicProfiler will be used, but this can be overridden by passing a profiler class in as a parameter to the SparkContext constructor.

\n
\n
dump (dagster.StringSource, optional)
\n

Runtime Environment: The directory which is used to dump the profile result before driver exiting. The results will be dumped as separated file for each RDD. They can be loaded by ptats.Stats(). If this is specified, the profile result will not be displayed automatically.

\n
\n
\n
\n
worker (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
memory (dagster.StringSource, optional)
\n

Runtime Environment: Amount of memory to use per python worker process during aggregation, in the same format as JVM memory strings with a size unit suffix (\u201ck\u201d, \u201cm\u201d, \u201cg\u201d or \u201ct\u201d) (e.g. 512m, 2g). If the memory used during aggregation goes above this amount, it will spill the data into disks.

\n
\n
reuse (Bool, optional)
\n

Runtime Environment: Reuse Python worker or not. If yes, it will use a fixed number of Python workers and does not need to fork() a Python process for every task. This is very useful if there is a large broadcast, since the broadcast will not need to be transferred from the JVM to the Python worker for every task.

\n
\n
\n
\n
\n
\n
files (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional)
\n

Runtime Environment: Comma-separated list of files to be placed in the working directory of each executor. Globs are allowed.

\n
\n
fetchTimeout (dagster.StringSource, optional)
\n

Execution Behavior: Communication timeout to use when fetching files added through SparkContext.addFile() from the driver.

\n
\n
useFetchCache (Bool, optional)
\n

Execution Behavior: If set to true (default), file fetching will use a local cache that is shared by executors that belong to the same application, which can improve task launching performance when running many executors on the same host. If set to false, these caching optimizations will be disabled and all executors will fetch their own copies of files. This optimization may be disabled in order to use Spark local directories that reside on NFS filesystems (see SPARK-6313 for more details).

\n
\n
overwrite (Bool, optional)
\n

Execution Behavior: Whether to overwrite files added through SparkContext.addFile() when the target file exists and its contents do not match those of the source.

\n
\n
maxPartitionBytes (dagster.IntSource, optional)
\n

Execution Behavior: The maximum number of bytes to pack into a single partition when reading files.

\n
\n
openCostInBytes (dagster.IntSource, optional)
\n

Execution Behavior: The estimated cost to open a file, measured by the number of bytes that could be scanned at the same time. This is used when putting multiple files into a partition. It is better to overestimate; then the partitions with small files will be faster than partitions with bigger files.

\n
\n
\n
\n
jars (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional)
\n

Runtime Environment: Comma-separated list of jars to include on the driver and executor classpaths. Globs are allowed.

\n
\n
packages (dagster.StringSource, optional)
\n

Runtime Environment: Comma-separated list of Maven coordinates of jars to include on the driver and executor classpaths. The coordinates should be groupId:artifactId:version. If spark.jars.ivySettings is given artifacts will be resolved according to the configuration in the file, otherwise artifacts will be searched for in the local maven repo, then maven central and finally any additional remote repositories given by the command-line option \u2013repositories. For more details, see Advanced Dependency Management.

\n
\n
excludes (dagster.StringSource, optional)
\n

Runtime Environment: Comma-separated list of groupId:artifactId, to exclude while resolving the dependencies provided in spark.jars.packages to avoid dependency conflicts.

\n
\n
ivy (dagster.StringSource, optional)
\n

Runtime Environment: Path to specify the Ivy user directory, used for the local Ivy cache and package files from spark.jars.packages. This will override the Ivy property ivy.default.ivy.user.dir which defaults to ~/.ivy2.

\n
\n
ivySettings (dagster.StringSource, optional)
\n

Runtime Environment: Path to an Ivy settings file to customize resolution of jars specified using spark.jars.packages instead of the built-in defaults, such as maven central. Additional repositories given by the command-line option \u2013repositories or spark.jars.repositories will also be included. Useful for allowing Spark to resolve artifacts from behind a firewall e.g. via an in-house artifact server like Artifactory. Details on the settings file format can be found at http://ant.apache.org/ivy/history/latest-milestone/settings.html

\n
\n
repositories (dagster.StringSource, optional)
\n

Runtime Environment: Comma-separated list of additional remote repositories to search for the maven coordinates given with \u2013packages or spark.jars.packages.

\n
\n
\n
\n
pyspark (permissive dict, optional)
\n
\nDefault Value:
{\n    "driver": {}\n}\n
\n
\n
\nConfig Schema:
\n
driver (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
python (dagster.StringSource, optional)
\n

Runtime Environment: Python binary executable to use for PySpark in driver. (default is spark.pyspark.python)

\n
\n
\n
\n
python (dagster.StringSource, optional)
\n

Runtime Environment: Python binary executable to use for PySpark in both driver and executors.

\n
\n
\n
\n
reducer (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxSizeInFlight (dagster.StringSource, optional)
\n

Shuffle Behavior: Maximum size of map outputs to fetch simultaneously from each reduce task, in MiB unless otherwise specified. Since each output requires us to create a buffer to receive it, this represents a fixed memory overhead per reduce task, so keep it small unless you have a large amount of memory.

\n
\n
maxReqsInFlight (dagster.IntSource, optional)
\n

Shuffle Behavior: This configuration limits the number of remote requests to fetch blocks at any given point. When the number of hosts in the cluster increase, it might lead to very large number of inbound connections to one or more nodes, causing the workers to fail under load. By allowing it to limit the number of fetch requests, this scenario can be mitigated.

\n
\n
maxBlocksInFlightPerAddress (dagster.IntSource, optional)
\n

Shuffle Behavior: This configuration limits the number of remote blocks being fetched per reduce task from a given host port. When a large number of blocks are being requested from a given address in a single fetch or simultaneously, this could crash the serving executor or Node Manager. This is especially useful to reduce the load on the Node Manager when external shuffle is enabled. You can mitigate this issue by setting it to a lower value.

\n
\n
\n
\n
maxRemoteBlockSizeFetchToMem (dagster.IntSource, optional)
\n

Shuffle Behavior: The remote block will be fetched to disk when size of the block is above this threshold in bytes. This is to avoid a giant request that takes too much memory. By default, this is only enabled for blocks > 2GB, as those cannot be fetched directly into memory, no matter what resources are available. But it can be turned down to a much lower value (eg. 200m) to avoid using too much memory on smaller blocks as well. Note this configuration will affect both shuffle fetch and block manager remote block fetch. For users who enabled external shuffle service, this feature can only be used when external shuffle service is newer than Spark 2.2.

\n
\n
shuffle (permissive dict, optional)
\n
\nDefault Value:
{\n    "file": {},\n    "io": {},\n    "service": {\n        "index": {\n            "cache": {}\n        }\n    },\n    "sort": {},\n    "spill": {},\n    "registration": {}\n}\n
\n
\n
\nConfig Schema:
\n
compress (Bool, optional)
\n

Shuffle Behavior: Whether to compress map output files. Generally a good idea. Compression will use spark.io.compression.codec.

\n
\n
file (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
buffer (dagster.StringSource, optional)
\n

Shuffle Behavior: Size of the in-memory buffer for each shuffle file output stream, in KiB unless otherwise specified. These buffers reduce the number of disk seeks and system calls made in creating intermediate shuffle files.

\n
\n
\n
\n
io (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxRetries (dagster.IntSource, optional)
\n

Shuffle Behavior: (Netty only) Fetches that fail due to IO-related exceptions are automatically retried if this is set to a non-zero value. This retry logic helps stabilize large shuffles in the face of long GC pauses or transient network connectivity issues.

\n
\n
numConnectionsPerPeer (dagster.IntSource, optional)
\n

Shuffle Behavior: (Netty only) Connections between hosts are reused in order to reduce connection buildup for large clusters. For clusters with many hard disks and few hosts, this may result in insufficient concurrency to saturate all disks, and so users may consider increasing this value.

\n
\n
preferDirectBufs (Bool, optional)
\n

Shuffle Behavior: (Netty only) Off-heap buffers are used to reduce garbage collection during shuffle and cache block transfer. For environments where off-heap memory is tightly limited, users may wish to turn this off to force all allocations from Netty to be on-heap.

\n
\n
retryWait (dagster.StringSource, optional)
\n

Shuffle Behavior: (Netty only) How long to wait between retries of fetches. The maximum delay caused by retrying is 15 seconds by default, calculated as maxRetries * retryWait.

\n
\n
\n
\n
service (permissive dict, optional)
\n
\nDefault Value:
{\n    "index": {\n        "cache": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
enabled (Bool, optional)
\n

Shuffle Behavior: Enables the external shuffle service. This service preserves the shuffle files written by executors so the executors can be safely removed. This must be enabled if spark.dynamicAllocation.enabled is \u201ctrue\u201d. The external shuffle service must be set up in order to enable it. See dynamic allocation configuration and setup documentation for more information.

\n
\n
port (dagster.IntSource, optional)
\n

Shuffle Behavior: Port on which the external shuffle service will run.

\n
\n
index (permissive dict, optional)
\n
\nDefault Value:
{\n    "cache": {}\n}\n
\n
\n
\nConfig Schema:
\n
cache (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
size (dagster.StringSource, optional)
\n

Shuffle Behavior: Cache entries limited to the specified memory footprint in bytes.

\n
\n
\n
\n
\n
\n
\n
\n
maxChunksBeingTransferred (dagster.IntSource, optional)
\n

Shuffle Behavior: The max number of chunks allowed to be transferred at the same time on shuffle service. Note that new incoming connections will be closed when the max number is hit. The client will retry according to the shuffle retry configs (see spark.shuffle.io.maxRetries and spark.shuffle.io.retryWait), if those limits are reached the task will fail with fetch failure.

\n
\n
sort (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
bypassMergeThreshold (dagster.IntSource, optional)
\n

Shuffle Behavior: (Advanced) In the sort-based shuffle manager, avoid merge-sorting data if there is no map-side aggregation and there are at most this many reduce partitions.

\n
\n
\n
\n
spill (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
compress (Bool, optional)
\n

Shuffle Behavior: Whether to compress data spilled during shuffles. Compression will use spark.io.compression.codec.

\n
\n
\n
\n
accurateBlockThreshold (dagster.IntSource, optional)
\n

Shuffle Behavior: Threshold in bytes above which the size of shuffle blocks in HighlyCompressedMapStatus is accurately recorded. This helps to prevent OOM by avoiding underestimating shuffle block size when fetch shuffle blocks.

\n
\n
registration (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
timeout (dagster.IntSource, optional)
\n

Shuffle Behavior: Timeout in milliseconds for registration to the external shuffle service.

\n
\n
maxAttempts (dagster.IntSource, optional)
\n

Shuffle Behavior: When we fail to register to the external shuffle service, we will retry for maxAttempts times.

\n
\n
\n
\n
memoryFraction (Float, optional)
\n

Memory Management: (deprecated) This is read only if spark.memory.useLegacyMode is enabled. Fraction of Java heap to use for aggregation and cogroups during shuffles. At any given time, the collective size of all in-memory maps used for shuffles is bounded by this limit, beyond which the contents will begin to spill to disk. If spills are often, consider increasing this value at the expense of spark.storage.memoryFraction.

\n
\n
\n
\n
eventLog (permissive dict, optional)
\n
\nDefault Value:
{\n    "logBlockUpdates": {},\n    "longForm": {},\n    "buffer": {}\n}\n
\n
\n
\nConfig Schema:
\n
logBlockUpdates (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional)
\n

Spark UI: Whether to log events for every block update, if spark.eventLog.enabled is true. *Warning*: This will increase the size of the event log considerably.

\n
\n
\n
\n
longForm (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional)
\n

Spark UI: If true, use the long form of call sites in the event log. Otherwise use the short form.

\n
\n
\n
\n
compress (dagster.StringSource, optional)
\n

Spark UI: Whether to compress logged events, if spark.eventLog.enabled is true. Compression will use spark.io.compression.codec.

\n
\n
dir (dagster.StringSource, optional)
\n

Spark UI: Base directory in which Spark events are logged, if spark.eventLog.enabled is true. Within this base directory, Spark creates a sub-directory for each application, and logs the events specific to the application in this directory. Users may want to set this to a unified location like an HDFS directory so history files can be read by the history server.

\n
\n
enabled (dagster.StringSource, optional)
\n

Spark UI: Whether to log Spark events, useful for reconstructing the Web UI after the application has finished.

\n
\n
overwrite (dagster.StringSource, optional)
\n

Spark UI: Whether to overwrite any existing files.

\n
\n
buffer (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
kb (dagster.StringSource, optional)
\n

Spark UI: Buffer size to use when writing to output streams, in KiB unless otherwise specified.

\n
\n
\n
\n
\n
\n
ui (permissive dict, optional)
\n
\nDefault Value:
{\n    "dagGraph": {},\n    "liveUpdate": {}\n}\n
\n
\n
\nConfig Schema:
\n
dagGraph (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedRootRDDs (dagster.StringSource, optional)
\n

Spark UI: How many DAG graph nodes the Spark UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
enabled (dagster.StringSource, optional)
\n

Spark UI: Whether to run the web UI for the Spark application.

\n
\n
killEnabled (dagster.StringSource, optional)
\n

Spark UI: Allows jobs and stages to be killed from the web UI.

\n
\n
liveUpdate (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
period (dagster.StringSource, optional)
\n

Spark UI: How often to update live entities. -1 means \u201cnever update\u201d when replaying applications, meaning only the last write will happen. For live applications, this avoids a few operations that we can live without when rapidly processing incoming task events.

\n
\n
\n
\n
port (dagster.StringSource, optional)
\n

Spark UI: Port for your application\u2019s dashboard, which shows memory and workload data.

\n
\n
retainedJobs (dagster.StringSource, optional)
\n

Spark UI: How many jobs the Spark UI and status APIs remember before garbage collecting. This is a target maximum, and fewer elements may be retained in some circumstances.

\n
\n
retainedStages (dagster.StringSource, optional)
\n

Spark UI: How many stages the Spark UI and status APIs remember before garbage collecting. This is a target maximum, and fewer elements may be retained in some circumstances.

\n
\n
retainedTasks (dagster.StringSource, optional)
\n

Spark UI: How many tasks the Spark UI and status APIs remember before garbage collecting. This is a target maximum, and fewer elements may be retained in some circumstances.

\n
\n
reverseProxy (dagster.StringSource, optional)
\n

Spark UI: Enable running Spark Master as reverse proxy for worker and application UIs. In this mode, Spark master will reverse proxy the worker and application UIs to enable access without requiring direct access to their hosts. Use it with caution, as worker and application UI will not be accessible directly, you will only be able to access them through spark master/proxy public URL. This setting affects all the workers and application UIs running in the cluster and must be set on all the workers, drivers and masters.

\n
\n
reverseProxyUrl (dagster.StringSource, optional)
\n

Spark UI: This is the URL where your proxy is running. This URL is for proxy which is running in front of Spark Master. This is useful when running proxy for authentication e.g. OAuth proxy. Make sure this is a complete URL including scheme (http/https) and port to reach your proxy.

\n
\n
showConsoleProgress (dagster.StringSource, optional)
\n

Spark UI: Show the progress bar in the console. The progress bar shows the progress of stages that run for longer than 500ms. If multiple stages run at the same time, multiple progress bars will be displayed on the same line.

\n
\n
retainedDeadExecutors (dagster.StringSource, optional)
\n

Spark UI: How many dead executors the Spark UI and status APIs remember before garbage collecting.

\n
\n
filters (dagster.StringSource, optional)
\n

Spark UI: Comma separated list of filter class names to apply to the Spark Web UI. The filter should be a standard javax servlet Filter. Filter parameters can also be specified in the configuration, by setting config entries of the form spark.<class name of filter>.param.<param name>=<value> For example: spark.ui.filters=com.test.filter1 spark.com.test.filter1.param.name1=foo spark.com.test.filter1.param.name2=bar

\n
\n
\n
\n
worker (permissive dict, optional)
\n
\nDefault Value:
{\n    "ui": {}\n}\n
\n
\n
\nConfig Schema:
\n
ui (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedExecutors (dagster.StringSource, optional)
\n

Spark UI: How many finished executors the Spark UI and status APIs remember before garbage collecting.

\n
\n
retainedDrivers (dagster.StringSource, optional)
\n

Spark UI: How many finished drivers the Spark UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
\n
\n
sql (permissive dict, optional)
\n
\nDefault Value:
{\n    "ui": {}\n}\n
\n
\n
\nConfig Schema:
\n
ui (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedExecutions (dagster.StringSource, optional)
\n

Spark UI: How many finished executions the Spark UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
\n
\n
streaming (permissive dict, optional)
\n
\nDefault Value:
{\n    "ui": {},\n    "backpressure": {},\n    "receiver": {\n        "writeAheadLog": {}\n    },\n    "kafka": {},\n    "driver": {\n        "writeAheadLog": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
ui (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedBatches (dagster.StringSource, optional)
\n

Spark Streaming: How many batches the Spark Streaming UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
backpressure (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional)
\n

Spark Streaming: Enables or disables Spark Streaming\u2019s internal backpressure mechanism (since 1.5). This enables the Spark Streaming to control the receiving rate based on the current batch scheduling delays and processing times so that the system receives only as fast as the system can process. Internally, this dynamically sets the maximum receiving rate of receivers. This rate is upper bounded by the values spark.streaming.receiver.maxRate and spark.streaming.kafka.maxRatePerPartition if they are set (see below).

\n
\n
initialRate (dagster.StringSource, optional)
\n

Spark Streaming: This is the initial maximum receiving rate at which each receiver will receive data for the first batch when the backpressure mechanism is enabled.

\n
\n
\n
\n
blockInterval (dagster.StringSource, optional)
\n

Spark Streaming: Interval at which data received by Spark Streaming receivers is chunked into blocks of data before storing them in Spark. Minimum recommended - 50 ms. See the performance tuning section in the Spark Streaming programing guide for more details.

\n
\n
receiver (permissive dict, optional)
\n
\nDefault Value:
{\n    "writeAheadLog": {}\n}\n
\n
\n
\nConfig Schema:
\n
maxRate (dagster.StringSource, optional)
\n

Spark Streaming: Maximum rate (number of records per second) at which each receiver will receive data. Effectively, each stream will consume at most this number of records per second. Setting this configuration to 0 or a negative number will put no limit on the rate. See the deployment guide in the Spark Streaming programing guide for mode details.

\n
\n
writeAheadLog (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enable (dagster.StringSource, optional)
\n

Spark Streaming: Enable write-ahead logs for receivers. All the input data received through receivers will be saved to write-ahead logs that will allow it to be recovered after driver failures. See the deployment guide in the Spark Streaming programing guide for more details.

\n
\n
closeFileAfterWrite (dagster.StringSource, optional)
\n

Spark Streaming: Whether to close the file after writing a write-ahead log record on the receivers. Set this to \u2018true\u2019 when you want to use S3 (or any file system that does not support flushing) for the data WAL on the receivers.

\n
\n
\n
\n
\n
\n
unpersist (dagster.StringSource, optional)
\n

Spark Streaming: Force RDDs generated and persisted by Spark Streaming to be automatically unpersisted from Spark\u2019s memory. The raw input data received by Spark Streaming is also automatically cleared. Setting this to false will allow the raw data and persisted RDDs to be accessible outside the streaming application as they will not be cleared automatically. But it comes at the cost of higher memory usage in Spark.

\n
\n
stopGracefullyOnShutdown (dagster.StringSource, optional)
\n

Spark Streaming: If true, Spark shuts down the StreamingContext gracefully on JVM shutdown rather than immediately.

\n
\n
kafka (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxRatePerPartition (dagster.StringSource, optional)
\n

Spark Streaming: Maximum rate (number of records per second) at which data will be read from each Kafka partition when using the new Kafka direct stream API. See the Kafka Integration guide for more details.

\n
\n
minRatePerPartition (dagster.StringSource, optional)
\n

Spark Streaming: Minimum rate (number of records per second) at which data will be read from each Kafka partition when using the new Kafka direct stream API.

\n
\n
maxRetries (dagster.StringSource, optional)
\n

Spark Streaming: Maximum number of consecutive retries the driver will make in order to find the latest offsets on the leader of each partition (a default value of 1 means that the driver will make a maximum of 2 attempts). Only applies to the new Kafka direct stream API.

\n
\n
\n
\n
driver (permissive dict, optional)
\n
\nDefault Value:
{\n    "writeAheadLog": {}\n}\n
\n
\n
\nConfig Schema:
\n
writeAheadLog (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
closeFileAfterWrite (dagster.StringSource, optional)
\n

Spark Streaming: Whether to close the file after writing a write-ahead log record on the driver. Set this to \u2018true\u2019 when you want to use S3 (or any file system that does not support flushing) for the metadata WAL on the driver.

\n
\n
\n
\n
\n
\n
\n
\n
broadcast (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
compress (dagster.StringSource, optional)
\n

Compression and Serialization: Whether to compress broadcast variables before sending them. Generally a good idea. Compression will use spark.io.compression.codec.

\n
\n
blockSize (dagster.StringSource, optional)
\n

Execution Behavior: Size of each piece of a block for TorrentBroadcastFactory, in KiB unless otherwise specified. Too large a value decreases parallelism during broadcast (makes it slower); however, if it is too small, BlockManager might take a performance hit.

\n
\n
checksum (dagster.StringSource, optional)
\n

Execution Behavior: Whether to enable checksum for broadcast. If enabled, broadcasts will include a checksum, which can help detect corrupted blocks, at the cost of computing and sending a little more data. It\u2019s possible to disable it if the network has other mechanisms to guarantee data won\u2019t be corrupted during broadcast.

\n
\n
\n
\n
io (permissive dict, optional)
\n
\nDefault Value:
{\n    "compression": {\n        "lz4": {},\n        "snappy": {},\n        "zstd": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
compression (permissive dict, optional)
\n
\nDefault Value:
{\n    "lz4": {},\n    "snappy": {},\n    "zstd": {}\n}\n
\n
\n
\nConfig Schema:
\n
codec (dagster.StringSource, optional)
\n

Compression and Serialization: The codec used to compress internal data such as RDD partitions, event log, broadcast variables and shuffle outputs. By default, Spark provides four codecs: lz4, lzf, snappy, and zstd. You can also use fully qualified class names to specify the codec, e.g. org.apache.spark.io.LZ4CompressionCodec, org.apache.spark.io.LZFCompressionCodec, org.apache.spark.io.SnappyCompressionCodec, and org.apache.spark.io.ZStdCompressionCodec.

\n
\n
lz4 (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
blockSize (dagster.StringSource, optional)
\n

Compression and Serialization: Block size in bytes used in LZ4 compression, in the case when LZ4 compression codec is used. Lowering this block size will also lower shuffle memory usage when LZ4 is used.

\n
\n
\n
\n
snappy (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
blockSize (dagster.StringSource, optional)
\n

Compression and Serialization: Block size in bytes used in Snappy compression, in the case when Snappy compression codec is used. Lowering this block size will also lower shuffle memory usage when Snappy is used.

\n
\n
\n
\n
zstd (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
level (dagster.StringSource, optional)
\n

Compression and Serialization: Compression level for Zstd compression codec. Increasing the compression level will result in better compression at the expense of more CPU and memory.

\n
\n
bufferSize (dagster.StringSource, optional)
\n

Compression and Serialization: Buffer size in bytes used in Zstd compression, in the case when Zstd compression codec is used. Lowering this size will lower the shuffle memory usage when Zstd is used, but it might increase the compression cost because of excessive JNI call overhead.

\n
\n
\n
\n
\n
\n
\n
\n
kryo (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
classesToRegister (dagster.StringSource, optional)
\n

Compression and Serialization: If you use Kryo serialization, give a comma-separated list of custom class names to register with Kryo. See the tuning guide for more details.

\n
\n
referenceTracking (dagster.StringSource, optional)
\n

Compression and Serialization: Whether to track references to the same object when serializing data with Kryo, which is necessary if your object graphs have loops and useful for efficiency if they contain multiple copies of the same object. Can be disabled to improve performance if you know this is not the case.

\n
\n
registrationRequired (dagster.StringSource, optional)
\n

Compression and Serialization: Whether to require registration with Kryo. If set to \u2018true\u2019, Kryo will throw an exception if an unregistered class is serialized. If set to false (the default), Kryo will write unregistered class names along with each object. Writing class names can cause significant performance overhead, so enabling this option can enforce strictly that a user has not omitted classes from registration.

\n
\n
registrator (dagster.StringSource, optional)
\n

Compression and Serialization: If you use Kryo serialization, give a comma-separated list of classes that register your custom classes with Kryo. This property is useful if you need to register your classes in a custom way, e.g. to specify a custom field serializer. Otherwise spark.kryo.classesToRegister is simpler. It should be set to classes that extend KryoRegistrator. See the tuning guide for more details.

\n
\n
unsafe (dagster.StringSource, optional)
\n

Compression and Serialization: Whether to use unsafe based Kryo serializer. Can be substantially faster by using Unsafe Based IO.

\n
\n
\n
\n
kryoserializer (permissive dict, optional)
\n
\nDefault Value:
{\n    "buffer": {}\n}\n
\n
\n
\nConfig Schema:
\n
buffer (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional)
\n

Compression and Serialization: Initial size of Kryo\u2019s serialization buffer, in KiB unless otherwise specified. Note that there will be one buffer per core on each worker. This buffer will grow up to spark.kryoserializer.buffer.max if needed.

\n
\n
max (dagster.StringSource, optional)
\n

Compression and Serialization: Maximum allowable size of Kryo serialization buffer, in MiB unless otherwise specified. This must be larger than any object you attempt to serialize and must be less than 2048m. Increase this if you get a \u201cbuffer limit exceeded\u201d exception inside Kryo.

\n
\n
\n
\n
\n
\n
rdd (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
compress (dagster.StringSource, optional)
\n

Compression and Serialization: Whether to compress serialized RDD partitions (e.g. for StorageLevel.MEMORY_ONLY_SER in Java and Scala or StorageLevel.MEMORY_ONLY in Python). Can save substantial space at the cost of some extra CPU time. Compression will use spark.io.compression.codec.

\n
\n
\n
\n
serializer (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional)
\n

Compression and Serialization: Class to use for serializing objects that will be sent over the network or need to be cached in serialized form. The default of Java serialization works with any Serializable Java object but is quite slow, so we recommend using org.apache.spark.serializer.KryoSerializer and configuring Kryo serialization when speed is necessary. Can be any subclass of org.apache.spark.Serializer.

\n
\n
objectStreamReset (dagster.StringSource, optional)
\n

Compression and Serialization: When serializing using org.apache.spark.serializer.JavaSerializer, the serializer caches objects to prevent writing redundant data, however that stops garbage collection of those objects. By calling \u2018reset\u2019 you flush that info from the serializer, and allow old objects to be collected. To turn off this periodic reset set it to -1. By default it will reset the serializer every 100 objects.

\n
\n
\n
\n
memory (permissive dict, optional)
\n
\nDefault Value:
{\n    "offHeap": {}\n}\n
\n
\n
\nConfig Schema:
\n
fraction (Float, optional)
\n

Memory Management: Fraction of (heap space - 300MB) used for execution and storage. The lower this is, the more frequently spills and cached data eviction occur. The purpose of this config is to set aside memory for internal metadata, user data structures, and imprecise size estimation in the case of sparse, unusually large records. Leaving this at the default value is recommended. For more detail, including important information about correctly tuning JVM garbage collection when increasing this value, see this description.

\n
\n
storageFraction (Float, optional)
\n

Memory Management: Amount of storage memory immune to eviction, expressed as a fraction of the size of the region set aside by spark.memory.fraction. The higher this is, the less working memory may be available to execution and tasks may spill to disk more often. Leaving this at the default value is recommended. For more detail, see this description.

\n
\n
offHeap (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (Bool, optional)
\n

Memory Management: If true, Spark will attempt to use off-heap memory for certain operations. If off-heap memory use is enabled, then spark.memory.offHeap.size must be positive.

\n
\n
size (dagster.IntSource, optional)
\n

Memory Management: The absolute amount of memory in bytes which can be used for off-heap allocation. This setting has no impact on heap memory usage, so if your executors\u2019 total memory consumption must fit within some hard limit then be sure to shrink your JVM heap size accordingly. This must be set to a positive value when spark.memory.offHeap.enabled=true.

\n
\n
\n
\n
useLegacyMode (Bool, optional)
\n

Memory Management: Whether to enable the legacy memory management mode used in Spark 1.5 and before. The legacy mode rigidly partitions the heap space into fixed-size regions, potentially leading to excessive spilling if the application was not tuned. The following deprecated memory fraction configurations are not read unless this is enabled: spark.shuffle.memoryFraction spark.storage.memoryFraction spark.storage.unrollFraction

\n
\n
\n
\n
storage (permissive dict, optional)
\n
\nDefault Value:
{\n    "replication": {}\n}\n
\n
\n
\nConfig Schema:
\n
memoryFraction (Float, optional)
\n

Memory Management: (deprecated) This is read only if spark.memory.useLegacyMode is enabled. Fraction of Java heap to use for Spark\u2019s memory cache. This should not be larger than the \u201cold\u201d generation of objects in the JVM, which by default is given 0.6 of the heap, but you can increase it if you configure your own old generation size.

\n
\n
unrollFraction (Float, optional)
\n

Memory Management: (deprecated) This is read only if spark.memory.useLegacyMode is enabled. Fraction of spark.storage.memoryFraction to use for unrolling blocks in memory. This is dynamically allocated by dropping existing blocks when there is not enough free storage space to unroll the new block in its entirety.

\n
\n
replication (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
proactive (Bool, optional)
\n

Memory Management: Enables proactive block replication for RDD blocks. Cached RDD block replicas lost due to executor failures are replenished if there are any existing available replicas. This tries to get the replication level of the block to the initial number.

\n
\n
\n
\n
memoryMapThreshold (dagster.StringSource, optional)
\n

Execution Behavior: Size in bytes of a block above which Spark memory maps when reading a block from disk. This prevents Spark from memory mapping very small blocks. In general, memory mapping has high overhead for blocks close to or below the page size of the operating system.

\n
\n
\n
\n
cleaner (permissive dict, optional)
\n
\nDefault Value:
{\n    "periodicGC": {},\n    "referenceTracking": {\n        "blocking": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
periodicGC (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
interval (dagster.StringSource, optional)
\n

Memory Management: Controls how often to trigger a garbage collection. This context cleaner triggers cleanups only when weak references are garbage collected. In long-running applications with large driver JVMs, where there is little memory pressure on the driver, this may happen very occasionally or not at all. Not cleaning at all may lead to executors running out of disk space after a while.

\n
\n
\n
\n
referenceTracking (permissive dict, optional)
\n
\nDefault Value:
{\n    "blocking": {}\n}\n
\n
\n
\nConfig Schema:
\n
root (Bool, optional)
\n

Memory Management: Enables or disables context cleaning.

\n
\n
blocking (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (Bool, optional)
\n

Memory Management: Controls whether the cleaning thread should block on cleanup tasks (other than shuffle, which is controlled by spark.cleaner.referenceTracking.blocking.shuffle Spark property).

\n
\n
shuffle (Bool, optional)
\n

Memory Management: Controls whether the cleaning thread should block on shuffle cleanup tasks.

\n
\n
\n
\n
cleanCheckpoints (Bool, optional)
\n

Memory Management: Controls whether to clean checkpoint files if the reference is out of scope.

\n
\n
\n
\n
\n
\n
default (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
parallelism (dagster.IntSource, optional)
\n

Execution Behavior: Default number of partitions in RDDs returned by transformations like join, reduceByKey, and parallelize when not set by user.

\n
\n
\n
\n
hadoop (permissive dict, optional)
\n
\nDefault Value:
{\n    "mapreduce": {\n        "fileoutputcommitter": {\n            "algorithm": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
cloneConf (Bool, optional)
\n

Execution Behavior: If set to true, clones a new Hadoop Configuration object for each task. This option should be enabled to work around Configuration thread-safety issues (see SPARK-2546 for more details). This is disabled by default in order to avoid unexpected performance regressions for jobs that are not affected by these issues.

\n
\n
validateOutputSpecs (Bool, optional)
\n

Execution Behavior: If set to true, validates the output specification (e.g. checking if the output directory already exists) used in saveAsHadoopFile and other variants. This can be disabled to silence exceptions due to pre-existing output directories. We recommend that users do not disable this except if trying to achieve compatibility with previous versions of Spark. Simply use Hadoop\u2019s FileSystem API to delete output directories by hand. This setting is ignored for jobs generated through Spark Streaming\u2019s StreamingContext, since data may need to be rewritten to pre-existing output directories during checkpoint recovery.

\n
\n
mapreduce (permissive dict, optional)
\n
\nDefault Value:
{\n    "fileoutputcommitter": {\n        "algorithm": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
fileoutputcommitter (permissive dict, optional)
\n
\nDefault Value:
{\n    "algorithm": {}\n}\n
\n
\n
\nConfig Schema:
\n
algorithm (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
version (dagster.IntSource, optional)
\n

Execution Behavior: The file output committer algorithm version, valid algorithm version number: 1 or 2. Version 2 may have better performance, but version 1 may handle failures better in certain situations, as per MAPREDUCE-4815.

\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
rpc (permissive dict, optional)
\n
\nDefault Value:
{\n    "message": {},\n    "retry": {}\n}\n
\n
\n
\nConfig Schema:
\n
message (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxSize (dagster.StringSource, optional)
\n

Networking: Maximum message size (in MB) to allow in \u201ccontrol plane\u201d communication; generally only applies to map output size information sent between executors and the driver. Increase this if you are running jobs with many thousands of map and reduce tasks and see messages about the RPC message size.

\n
\n
\n
\n
numRetries (dagster.StringSource, optional)
\n

Networking: Number of times to retry before an RPC task gives up. An RPC task will run at most this number of times.

\n
\n
retry (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
wait (dagster.StringSource, optional)
\n

Networking: Duration for an RPC ask operation to wait before retrying.

\n
\n
\n
\n
askTimeout (dagster.StringSource, optional)
\n

Networking: Duration for an RPC ask operation to wait before timing out.

\n
\n
lookupTimeout (dagster.StringSource, optional)
\n

Networking: Duration for an RPC remote endpoint lookup operation to wait before timing out.

\n
\n
\n
\n
blockManager (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
port (dagster.StringSource, optional)
\n

Networking: Port for all block managers to listen on. These exist on both the driver and the executors.

\n
\n
\n
\n
network (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
timeout (dagster.StringSource, optional)
\n

Networking: Default timeout for all network interactions. This config will be used in place of spark.core.connection.ack.wait.timeout, spark.storage.blockManagerSlaveTimeoutMs, spark.shuffle.io.connectionTimeout, spark.rpc.askTimeout or spark.rpc.lookupTimeout if they are not configured.

\n
\n
\n
\n
port (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxRetries (dagster.StringSource, optional)
\n

Networking: Maximum number of retries when binding to a port before giving up. When a port is given a specific value (non 0), each subsequent retry will increment the port used in the previous attempt by 1 before retrying. This essentially allows it to try a range of ports from the start port specified to port + maxRetries.

\n
\n
\n
\n
core (permissive dict, optional)
\n
\nDefault Value:
{\n    "connection": {\n        "ack": {\n            "wait": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
connection (permissive dict, optional)
\n
\nDefault Value:
{\n    "ack": {\n        "wait": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
ack (permissive dict, optional)
\n
\nDefault Value:
{\n    "wait": {}\n}\n
\n
\n
\nConfig Schema:
\n
wait (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
timeout (dagster.StringSource, optional)
\n

Networking: How long the connection should wait for an ack to occur before timing out and giving up. To avoid unwanted timeouts caused by long pauses such as GC, you can set a larger value.

\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
cores (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
max (dagster.StringSource, optional)
\n

Scheduling: When running on a standalone deploy cluster or a Mesos cluster in \u201ccoarse-grained\u201d sharing mode, the maximum number of CPU cores to request for the application from across the cluster (not from each machine). If not set, the default will be spark.deploy.defaultCores on Spark\u2019s standalone cluster manager, or infinite (all available cores) on Mesos.

\n
\n
\n
\n
locality (permissive dict, optional)
\n
\nDefault Value:
{\n    "wait": {}\n}\n
\n
\n
\nConfig Schema:
\n
wait (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional)
\n

Scheduling: How long to wait to launch a data-local task before giving up and launching it on a less-local node. The same wait will be used to step through multiple locality levels (process-local, node-local, rack-local and then any). It is also possible to customize the waiting time for each level by setting spark.locality.wait.node, etc. You should increase this setting if your tasks are long and see poor locality, but the default usually works well.

\n
\n
node (dagster.StringSource, optional)
\n

Scheduling: Customize the locality wait for node locality. For example, you can set this to 0 to skip node locality and search immediately for rack locality (if your cluster has rack information).

\n
\n
process (dagster.StringSource, optional)
\n

Scheduling: Customize the locality wait for process locality. This affects tasks that attempt to access cached data in a particular executor process.

\n
\n
rack (dagster.StringSource, optional)
\n

Scheduling: Customize the locality wait for rack locality.

\n
\n
\n
\n
\n
\n
scheduler (permissive dict, optional)
\n
\nDefault Value:
{\n    "revive": {},\n    "listenerbus": {\n        "eventqueue": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
maxRegisteredResourcesWaitingTime (dagster.StringSource, optional)
\n

Scheduling: Maximum amount of time to wait for resources to register before scheduling begins.

\n
\n
minRegisteredResourcesRatio (dagster.StringSource, optional)
\n

Scheduling: The minimum ratio of registered resources (registered resources / total expected resources) (resources are executors in yarn mode and Kubernetes mode, CPU cores in standalone mode and Mesos coarse-grained mode [\u2018spark.cores.max\u2019 value is total expected resources for Mesos coarse-grained mode] ) to wait for before scheduling begins. Specified as a double between 0.0 and 1.0. Regardless of whether the minimum ratio of resources has been reached, the maximum amount of time it will wait before scheduling begins is controlled by config spark.scheduler.maxRegisteredResourcesWaitingTime.

\n
\n
mode (dagster.StringSource, optional)
\n

Scheduling: The scheduling mode between jobs submitted to the same SparkContext. Can be set to FAIR to use fair sharing instead of queueing jobs one after another. Useful for multi-user services.

\n
\n
revive (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
interval (dagster.StringSource, optional)
\n

Scheduling: The interval length for the scheduler to revive the worker resource offers to run tasks.

\n
\n
\n
\n
listenerbus (permissive dict, optional)
\n
\nDefault Value:
{\n    "eventqueue": {}\n}\n
\n
\n
\nConfig Schema:
\n
eventqueue (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
capacity (dagster.StringSource, optional)
\n

Scheduling: Capacity for event queue in Spark listener bus, must be greater than 0. Consider increasing value (e.g. 20000) if listener events are dropped. Increasing this value may result in the driver using more memory.

\n
\n
\n
\n
\n
\n
\n
\n
blacklist (permissive dict, optional)
\n
\nDefault Value:
{\n    "task": {},\n    "stage": {},\n    "application": {\n        "fetchFailure": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional)
\n

Scheduling: If set to \u201ctrue\u201d, prevent Spark from scheduling tasks on executors that have been blacklisted due to too many task failures. The blacklisting algorithm can be further controlled by the other \u201cspark.blacklist\u201d configuration options.

\n
\n
timeout (dagster.StringSource, optional)
\n

Scheduling: (Experimental) How long a node or executor is blacklisted for the entire application, before it is unconditionally removed from the blacklist to attempt running new tasks.

\n
\n
task (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxTaskAttemptsPerExecutor (dagster.StringSource, optional)
\n

Scheduling: (Experimental) For a given task, how many times it can be retried on one executor before the executor is blacklisted for that task.

\n
\n
maxTaskAttemptsPerNode (dagster.StringSource, optional)
\n

Scheduling: (Experimental) For a given task, how many times it can be retried on one node, before the entire node is blacklisted for that task.

\n
\n
\n
\n
stage (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxFailedTasksPerExecutor (dagster.StringSource, optional)
\n

Scheduling: (Experimental) How many different tasks must fail on one executor, within one stage, before the executor is blacklisted for that stage.

\n
\n
maxFailedExecutorsPerNode (dagster.StringSource, optional)
\n

Scheduling: (Experimental) How many different executors are marked as blacklisted for a given stage, before the entire node is marked as failed for the stage.

\n
\n
\n
\n
application (permissive dict, optional)
\n
\nDefault Value:
{\n    "fetchFailure": {}\n}\n
\n
\n
\nConfig Schema:
\n
maxFailedTasksPerExecutor (dagster.StringSource, optional)
\n

Scheduling: (Experimental) How many different tasks must fail on one executor, in successful task sets, before the executor is blacklisted for the entire application. Blacklisted executors will be automatically added back to the pool of available resources after the timeout specified by spark.blacklist.timeout. Note that with dynamic allocation, though, the executors may get marked as idle and be reclaimed by the cluster manager.

\n
\n
maxFailedExecutorsPerNode (dagster.StringSource, optional)
\n

Scheduling: (Experimental) How many different executors must be blacklisted for the entire application, before the node is blacklisted for the entire application. Blacklisted nodes will be automatically added back to the pool of available resources after the timeout specified by spark.blacklist.timeout. Note that with dynamic allocation, though, the executors on the node may get marked as idle and be reclaimed by the cluster manager.

\n
\n
fetchFailure (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional)
\n

Scheduling: (Experimental) If set to \u201ctrue\u201d, Spark will blacklist the executor immediately when a fetch failure happens. If external shuffle service is enabled, then the whole node will be blacklisted.

\n
\n
\n
\n
\n
\n
killBlacklistedExecutors (dagster.StringSource, optional)
\n

Scheduling: (Experimental) If set to \u201ctrue\u201d, allow Spark to automatically kill the executors when they are blacklisted on fetch failure or blacklisted for the entire application, as controlled by spark.blacklist.application.*. Note that, when an entire node is added to the blacklist, all of the executors on that node will be killed.

\n
\n
\n
\n
speculation (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional)
\n

Scheduling: If set to \u201ctrue\u201d, performs speculative execution of tasks. This means if one or more tasks are running slowly in a stage, they will be re-launched.

\n
\n
interval (dagster.StringSource, optional)
\n

Scheduling: How often Spark will check for tasks to speculate.

\n
\n
multiplier (dagster.StringSource, optional)
\n

Scheduling: How many times slower a task is than the median to be considered for speculation.

\n
\n
quantile (dagster.StringSource, optional)
\n

Scheduling: Fraction of tasks which must be complete before speculation is enabled for a particular stage.

\n
\n
\n
\n
task (permissive dict, optional)
\n
\nDefault Value:
{\n    "reaper": {}\n}\n
\n
\n
\nConfig Schema:
\n
cpus (dagster.StringSource, optional)
\n

Scheduling: Number of cores to allocate for each task.

\n
\n
maxFailures (dagster.StringSource, optional)
\n

Scheduling: Number of failures of any particular task before giving up on the job. The total number of failures spread across different tasks will not cause the job to fail; a particular task has to fail this number of attempts. Should be greater than or equal to 1. Number of allowed retries = this value - 1.

\n
\n
reaper (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional)
\n

Scheduling: Enables monitoring of killed / interrupted tasks. When set to true, any task which is killed will be monitored by the executor until that task actually finishes executing. See the other spark.task.reaper.* configurations for details on how to control the exact behavior of this monitoring. When set to false (the default), task killing will use an older code path which lacks such monitoring.

\n
\n
pollingInterval (dagster.StringSource, optional)
\n

Scheduling: When spark.task.reaper.enabled = true, this setting controls the frequency at which executors will poll the status of killed tasks. If a killed task is still running when polled then a warning will be logged and, by default, a thread-dump of the task will be logged (this thread dump can be disabled via the spark.task.reaper.threadDump setting, which is documented below).

\n
\n
threadDump (dagster.StringSource, optional)
\n

Scheduling: When spark.task.reaper.enabled = true, this setting controls whether task thread dumps are logged during periodic polling of killed tasks. Set this to false to disable collection of thread dumps.

\n
\n
killTimeout (dagster.StringSource, optional)
\n

Scheduling: When spark.task.reaper.enabled = true, this setting specifies a timeout after which the executor JVM will kill itself if a killed task has not stopped running. The default value, -1, disables this mechanism and prevents the executor from self-destructing. The purpose of this setting is to act as a safety-net to prevent runaway noncancellable tasks from rendering an executor unusable.

\n
\n
\n
\n
\n
\n
stage (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxConsecutiveAttempts (dagster.StringSource, optional)
\n

Scheduling: Number of consecutive stage attempts allowed before a stage is aborted.

\n
\n
\n
\n
dynamicAllocation (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional)
\n

Dynamic Allocation: Whether to use dynamic resource allocation, which scales the number of executors registered with this application up and down based on the workload. For more detail, see the description here. This requires spark.shuffle.service.enabled to be set. The following configurations are also relevant: spark.dynamicAllocation.minExecutors, spark.dynamicAllocation.maxExecutors, spark.dynamicAllocation.initialExecutors, and spark.dynamicAllocation.executorAllocationRatio.

\n
\n
executorIdleTimeout (dagster.StringSource, optional)
\n

Dynamic Allocation: If dynamic allocation is enabled and an executor has been idle for more than this duration, the executor will be removed. For more detail, see this description.

\n
\n
cachedExecutorIdleTimeout (dagster.StringSource, optional)
\n

Dynamic Allocation: If dynamic allocation is enabled and an executor which has cached data blocks has been idle for more than this duration, the executor will be removed. For more details, see this description.

\n
\n
initialExecutors (dagster.StringSource, optional)
\n

Dynamic Allocation: Initial number of executors to run if dynamic allocation is enabled. If --num-executors (or spark.executor.instances) is set and larger than this value, it will be used as the initial number of executors.

\n
\n
maxExecutors (dagster.StringSource, optional)
\n

Dynamic Allocation: Upper bound for the number of executors if dynamic allocation is enabled.

\n
\n
minExecutors (dagster.StringSource, optional)
\n

Dynamic Allocation: Lower bound for the number of executors if dynamic allocation is enabled.

\n
\n
executorAllocationRatio (dagster.StringSource, optional)
\n

Dynamic Allocation: By default, dynamic allocation will request enough executors to maximize the parallelism according to the number of tasks to process. While this minimizes the latency of the job, with small tasks this setting can waste a lot of resources due to executor allocation overhead, as some executors might not even do any work. This setting allows you to set a ratio that will be used to reduce the number of executors with respect to full parallelism. Defaults to 1.0 to give maximum parallelism; 0.5 will divide the target number of executors by 2. The target number of executors computed by dynamic allocation can still be overridden by the spark.dynamicAllocation.minExecutors and spark.dynamicAllocation.maxExecutors settings.

\n
\n
schedulerBacklogTimeout (dagster.StringSource, optional)
\n

Dynamic Allocation: If dynamic allocation is enabled and there have been pending tasks backlogged for more than this duration, new executors will be requested. For more detail, see this description.

\n
\n
sustainedSchedulerBacklogTimeout (dagster.StringSource, optional)
\n

Dynamic Allocation: Same as spark.dynamicAllocation.schedulerBacklogTimeout, but used only for subsequent executor requests. For more detail, see this description.

\n
\n
\n
\n
r (permissive dict, optional)
\n
\nDefault Value:
{\n    "driver": {},\n    "shell": {}\n}\n
\n
\n
\nConfig Schema:
\n
numRBackendThreads (dagster.StringSource, optional)
\n

SparkR: Number of threads used by RBackend to handle RPC calls from SparkR package.

\n
\n
command (dagster.StringSource, optional)
\n

SparkR: Executable for executing R scripts in cluster modes for both driver and workers.

\n
\n
driver (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
command (dagster.StringSource, optional)
\n

SparkR: Executable for executing R scripts in client modes for driver. Ignored in cluster modes.

\n
\n
\n
\n
shell (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
command (dagster.StringSource, optional)
\n

SparkR: Executable for executing sparkR shell in client modes for driver. Ignored in cluster modes. It is the same as the environment variable SPARKR_DRIVER_R, but takes precedence over it. spark.r.shell.command is used for the sparkR shell while spark.r.driver.command is used for running an R script.

\n
\n
\n
\n
backendConnectionTimeout (dagster.StringSource, optional)
\n

SparkR: Connection timeout set by R process on its connection to RBackend in seconds.

\n
\n
heartBeatInterval (dagster.StringSource, optional)
\n

SparkR: Interval for heartbeats sent from SparkR backend to R process to prevent connection timeout.

\n
\n
\n
\n
graphx (permissive dict, optional)
\n
\nDefault Value:
{\n    "pregel": {}\n}\n
\n
\n
\nConfig Schema:
\n
pregel (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
checkpointInterval (dagster.StringSource, optional)
\n

GraphX: Checkpoint interval for graph and message in Pregel. It is used to avoid StackOverflowError due to long lineage chains after many iterations. The checkpoint is disabled by default.

\n
\n
\n
\n
\n
\n
deploy (permissive dict, optional)
\n
\nDefault Value:
{\n    "zookeeper": {}\n}\n
\n
\n
\nConfig Schema:
\n
recoveryMode (dagster.StringSource, optional)
\n

Deploy: The recovery mode setting used to recover submitted Spark jobs when they fail and are relaunched. This is only applicable to cluster mode when running with Standalone or Mesos.

\n
\n
zookeeper (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
url (dagster.StringSource, optional)
\n

Deploy: When spark.deploy.recoveryMode is set to ZOOKEEPER, this configuration is used to set the zookeeper URL to connect to.

\n
\n
dir (dagster.StringSource, optional)
\n

Deploy: When spark.deploy.recoveryMode is set to ZOOKEEPER, this configuration is used to set the zookeeper directory to store recovery state.

\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
cluster_id (dagster.StringSource)
\n

Name of the job flow (cluster) on which to execute.

\n
\n
region_name (dagster.StringSource)
\n

The AWS region that the cluster is in.

\n
\n
action_on_failure (String, optional)
\n

The EMR action to take when the cluster step fails: https://docs.aws.amazon.com/emr/latest/APIReference/API_StepConfig.html

\n

Default Value: \u2018CANCEL_AND_WAIT\u2019

\n
\n
staging_bucket (dagster.StringSource)
\n

S3 bucket to use for passing files between the plan process and EMR process.

\n
\n
staging_prefix (dagster.StringSource, optional)
\n

S3 key prefix inside the staging_bucket to use for files passed between the plan process and the EMR process.

\n

Default Value: \u2018emr_staging\u2019

\n
\n
wait_for_logs (Bool, optional)
\n

If set, the system will wait for EMR logs to appear on S3. Note that logs are copied every 5 minutes, so enabling this will add several minutes to the job runtime.

\n

Default Value: False

\n
\n
local_job_package_path (dagster.StringSource, optional)
\n

Absolute path to the package that contains the job definition(s) whose steps will execute remotely on EMR. This is a path on the local filesystem of the process executing the job. The expectation is that this package will also be available on the python path of the launched process running the Spark step on EMR, either deployed on step launch via the deploy_local_job_package option, referenced on s3 via the s3_job_package_path option, or installed on the cluster via bootstrap actions.

\n
\n
local_pipeline_package_path (dagster.StringSource, optional)
\n

(legacy) Absolute path to the package that contains the pipeline definition(s) whose steps will execute remotely on EMR. This is a path on the local filesystem of the process executing the pipeline. The expectation is that this package will also be available on the python path of the launched process running the Spark step on EMR, either deployed on step launch via the deploy_local_pipeline_package option, referenced on s3 via the s3_pipeline_package_path option, or installed on the cluster via bootstrap actions.

\n
\n
deploy_local_job_package (Bool, optional)
\n

If set, before every step run, the launcher will zip up all the code in local_job_package_path, upload it to s3, and pass it to spark-submit\u2019s --py-files option. This gives the remote process access to up-to-date user code. If not set, the assumption is that some other mechanism is used for distributing code to the EMR cluster. If this option is set to True, s3_job_package_path should not also be set.

\n

Default Value: False

\n
\n
deploy_local_pipeline_package (Bool, optional)
\n

(legacy) If set, before every step run, the launcher will zip up all the code in local_pipeline_package_path, upload it to s3, and pass it to spark-submit\u2019s --py-files option. This gives the remote process access to up-to-date user code. If not set, the assumption is that some other mechanism is used for distributing code to the EMR cluster. If this option is set to True, s3_pipeline_package_path should not also be set.

\n

Default Value: False

\n
\n
s3_job_package_path (dagster.StringSource, optional)
\n

If set, this path will be passed to the --py-files option of spark-submit. This should usually be a path to a zip file. If this option is set, deploy_local_job_package should not be set to True.

\n
\n
s3_pipeline_package_path (dagster.StringSource, optional)
\n

If set, this path will be passed to the --py-files option of spark-submit. This should usually be a path to a zip file. If this option is set, deploy_local_pipeline_package should not be set to True.

\n
\n
\n
    \n
  • spark_config:

  • \n
  • cluster_id: Name of the job flow (cluster) on which to execute.

  • \n
  • region_name: The AWS region that the cluster is in.

  • \n
  • action_on_failure: The EMR action to take when the cluster step fails: https://docs.aws.amazon.com/emr/latest/APIReference/API_StepConfig.html

  • \n
  • staging_bucket: S3 bucket to use for passing files between the plan process and EMR process.

  • \n
  • staging_prefix: S3 key prefix inside the staging_bucket to use for files passed between the plan process and the EMR process.

  • \n
  • wait_for_logs: If set, the system will wait for EMR logs to appear on S3. Note that logs are copied every 5 minutes, so enabling this will add several minutes to the job runtime.

  • \n
  • local_job_package_path: Absolute path to the package that contains the job definition(s) whose steps will execute remotely on EMR. This is a path on the local filesystem of the process executing the job. The expectation is that this package will also be available on the python path of the launched process running the Spark step on EMR, either deployed on step launch via the deploy_local_job_package option, referenced on s3 via the s3_job_package_path option, or installed on the cluster via bootstrap actions.

  • \n
  • local_pipeline_package_path: (legacy) Absolute path to the package that contains the pipeline definition(s) whose steps will execute remotely on EMR. This is a path on the local filesystem of the process executing the pipeline. The expectation is that this package will also be available on the python path of the launched process running the Spark step on EMR, either deployed on step launch via the deploy_local_pipeline_package option, referenced on s3 via the s3_pipeline_package_path option, or installed on the cluster via bootstrap actions.

  • \n
  • deploy_local_job_package: If set, before every step run, the launcher will zip up all the code in local_job_package_path, upload it to s3, and pass it to spark-submit\u2019s --py-files option. This gives the remote process access to up-to-date user code. If not set, the assumption is that some other mechanism is used for distributing code to the EMR cluster. If this option is set to True, s3_job_package_path should not also be set.

  • \n
  • deploy_local_pipeline_package: (legacy) If set, before every step run, the launcher will zip up all the code in local_pipeline_package_path, upload it to s3, and pass it to spark-submit\u2019s --py-files option. This gives the remote process access to up-to-date user code. If not set, the assumption is that some other mechanism is used for distributing code to the EMR cluster. If this option is set to True, s3_pipeline_package_path should not also be set.

  • \n
  • s3_job_package_path: If set, this path will be passed to the --py-files option of spark-submit. This should usually be a path to a zip file. If this option is set, deploy_local_job_package should not be set to True.

  • \n
  • s3_pipeline_package_path: If set, this path will be passed to the --py-files option of spark-submit. This should usually be a path to a zip file. If this option is set, deploy_local_pipeline_package should not be set to True.

  • \n
\n
\n\n
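
The options above are typically supplied as resource config. The sketch below is illustrative only: it assumes this schema belongs to the EMR PySpark step launcher resource (dagster_aws.emr.emr_pyspark_step_launcher), and the cluster id, bucket, package path, and Spark values are placeholders.

\n
# Hedged sketch: assumes the schema above configures dagster_aws.emr.emr_pyspark_step_launcher;
# the cluster id, bucket, package path, and Spark settings below are placeholders.
from dagster import job, op
from dagster_aws.emr import emr_pyspark_step_launcher

@op(required_resource_keys={"pyspark_step_launcher"})
def my_spark_op(context):
    # Compute intended to run as an EMR Spark step goes here.
    context.log.info("running on EMR")

@job(
    resource_defs={
        "pyspark_step_launcher": emr_pyspark_step_launcher.configured(
            {
                "cluster_id": "j-XXXXXXXXXXXXX",                   # placeholder job flow id
                "region_name": "us-west-1",
                "staging_bucket": "my-staging-bucket",             # placeholder bucket
                "local_job_package_path": "/path/to/my_package",   # placeholder path
                "deploy_local_job_package": True,
                "wait_for_logs": False,
                # Nested spark_config keys are assumed to mirror the dotted Spark properties
                # documented above, e.g. spark.dynamicAllocation.enabled.
                "spark_config": {"spark": {"dynamicAllocation": {"enabled": "true"}}},
            }
        )
    }
)
def my_emr_job():
    my_spark_op()
\n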
\n
\nclass dagster_aws.emr.EmrJobRunner(region, check_cluster_every=30, aws_access_key_id=None, aws_secret_access_key=None)[source]\u00b6
\n
\n\n
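
A short example of constructing the runner directly from the signature above; the region is a placeholder, and check_cluster_every is assumed to be a polling interval in seconds.

\n
# Constructing an EmrJobRunner from the documented signature; values are placeholders.
from dagster_aws.emr import EmrJobRunner

runner = EmrJobRunner(
    region="us-west-1",      # AWS region of the target EMR cluster
    check_cluster_every=30,  # assumed: polling interval, in seconds
)
\n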
\n
\nclass dagster_aws.emr.EmrError[source]\u00b6
\n
\n\n
\n
\ndagster_aws.emr.EmrClusterState = <enum 'EmrClusterState'>[source]\u00b6
\n

An enumeration.

\n
\n\n
\n
\ndagster_aws.emr.EmrStepState = <enum 'EmrStepState'>[source]\u00b6
\n

An enumeration.

\n
\n\n
\n
\n

CloudWatch\u00b6

\n
\n
\ndagster_aws.cloudwatch.cloudwatch_logger LoggerDefinition\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
log_level (String, optional)
\n

Default Value: \u2018INFO\u2019

\n
\n
name (String, optional)
\n

Default Value: \u2018dagster\u2019

\n
\n
log_group_name (String)
\n

The name of the log group

\n
\n
log_stream_name (String)
\n

The name of the log stream

\n
\n
aws_region (dagster.StringSource, optional)
\n

Specifies a custom region for the CloudWatch Logs session. Default is chosen through the ordinary boto3 credential chain.

\n
\n
aws_secret_access_key (dagster.StringSource, optional)
\n

\n
aws_access_key_id (dagster.StringSource, optional)
\n

\n
\n

Core class for defining loggers.

\n

Loggers are job-scoped logging handlers, which will be automatically invoked whenever\ndagster messages are logged from within a job.

\n
\n
Parameters
\n
    \n
  • logger_fn (Callable[[InitLoggerContext], logging.Logger]) \u2013 User-provided function to\ninstantiate the logger. This logger will be automatically invoked whenever the methods\non context.log are called from within job/pipeline compute logic.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.logger_config. If not set, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this logger.

  • \n
\n
\n
\n
\n\n
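
A minimal sketch of attaching this logger to a job and configuring it at run time; the log group, log stream, and region values are placeholders, and the op is illustrative.

\n
# Hedged sketch: attaching dagster_aws.cloudwatch.cloudwatch_logger to a job.
from dagster import job, op
from dagster_aws.cloudwatch import cloudwatch_logger

@op
def emit_log(context):
    context.log.info("hello from a CloudWatch-logged op")

@job(logger_defs={"cloudwatch": cloudwatch_logger})
def logged_job():
    emit_log()

logged_job.execute_in_process(
    run_config={
        "loggers": {
            "cloudwatch": {
                "config": {
                    "log_level": "INFO",
                    "log_group_name": "my-log-group",    # placeholder
                    "log_stream_name": "my-log-stream",  # placeholder
                    "aws_region": "us-west-1",
                }
            }
        }
    }
)
\n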
\n
\n

SecretsManager\u00b6

\n

Resources which surface SecretsManager secrets for use in Dagster resources and jobs.

\n
\n
\ndagster_aws.secretsmanager.secretsmanager_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
region_name (String, optional)
\n

Specifies a custom region for the SecretsManager session

\n
\n
max_attempts (Int, optional)
\n

This provides Boto3\u2019s retry handler with the maximum number of retry attempts; the initial call counts toward the max_attempts value that you provide.

\n

Default Value: 5

\n
\n
profile_name (String, optional)
\n

Specifies a profile to use for the session.

\n
\n
\n

Resource that gives access to AWS SecretsManager.

\n

The underlying SecretsManager session is created by calling\nboto3.session.Session(profile_name).\nThe returned resource object is a SecretsManager client, an instance of botocore.client.SecretsManager.

\n

Example

\n
from dagster import build_op_context, job, op\nfrom dagster_aws.secretsmanager import secretsmanager_resource\n\n@op(required_resource_keys={'secretsmanager'})\ndef example_secretsmanager_op(context):\n    return context.resources.secretsmanager.get_secret_value(\n        SecretId='arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf'\n    )\n\n@job(resource_defs={'secretsmanager': secretsmanager_resource})\ndef example_job():\n    example_secretsmanager_op()\n\nexample_job.execute_in_process(\n    run_config={\n        'resources': {\n            'secretsmanager': {\n                'config': {\n                    'region_name': 'us-west-1',\n                }\n            }\n        }\n    }\n)\n
\n
\n

Note that your ops must also declare that they require this resource with\nrequired_resource_keys, or it will not be initialized for the execution of their compute\nfunctions.

\n

You may configure this resource as follows:

\n
resources:\n  secretsmanager:\n    config:\n      region_name: "us-west-1"\n      # Optional[str]: Specifies a custom region for the SecretsManager session. Default is chosen\n      # through the ordinary boto credential chain.\n      profile_name: "dev"\n      # Optional[str]: Specifies a custom profile for SecretsManager session. Default is default\n      # profile as specified in ~/.aws/credentials file\n
\n
\n
\n\n
\n
\ndagster_aws.secretsmanager.secretsmanager_secrets_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
region_name (String, optional)
\n

Specifies a custom region for the SecretsManager session

\n
\n
max_attempts (Int, optional)
\n

This provides Boto3\u2019s retry handler with the maximum number of retry attempts; the initial call counts toward the max_attempts value that you provide.

\n

Default Value: 5

\n
\n
profile_name (String, optional)
\n

Specifies a profile to use for the session.

\n
\n
secrets (List[String], optional)
\n

An array of AWS Secrets Manager secret ARNs to fetch.

\n

Default Value: []

\n
\n
secrets_tag (Union[String, None], optional)
\n

AWS Secrets Manager secrets with this tag will be fetched and made available.

\n

Default Value: None

\n
\n
add_to_environment (Bool, optional)
\n

Whether to mount the secrets as environment variables.

\n

Default Value: False

\n
\n
\n

Resource that provides a dict which maps selected SecretsManager secrets to\ntheir string values. Also optionally sets chosen secrets as environment variables.

\n

Example

\n
import os\nfrom dagster import build_op_context, job, op\nfrom dagster_aws.secretsmanager import secretsmanager_secrets_resource\n\n@op(required_resource_keys={'secrets'})\ndef example_secretsmanager_secrets_op(context):\n    return context.resources.secrets.get("my-secret-name")\n\n@op(required_resource_keys={'secrets'})\ndef example_secretsmanager_secrets_op_2(context):\n    return os.getenv("my-other-secret-name")\n\n@job(resource_defs={'secrets': secretsmanager_secrets_resource})\ndef example_job():\n    example_secretsmanager_secrets_op()\n    example_secretsmanager_secrets_op_2()\n\nexample_job.execute_in_process(\n    run_config={\n        'resources': {\n            'secrets': {\n                'config': {\n                    'region_name': 'us-west-1',\n                    'secrets_tag': 'dagster',\n                    'add_to_environment': True,\n                }\n            }\n        }\n    }\n)\n
\n
\n

Note that your ops must also declare that they require this resource with\nrequired_resource_keys, or it will not be initialized for the execution of their compute\nfunctions.

\n

You may configure this resource as follows:

\n
resources:\n  secretsmanager:\n    config:\n      region_name: "us-west-1"\n      # Optional[str]: Specifies a custom region for the SecretsManager session. Default is chosen\n      # through the ordinary boto credential chain.\n      profile_name: "dev"\n      # Optional[str]: Specifies a custom profile for SecretsManager session. Default is default\n      # profile as specified in ~/.aws/credentials file\n      secrets: ["arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf"]\n      # Optional[List[str]]: Specifies a list of secret ARNs to pull from SecretsManager.\n      secrets_tag: "dagster"\n      # Optional[str]: Specifies a tag, all secrets which have the tag set will be pulled\n      # from SecretsManager.\n      add_to_environment: true\n      # Optional[bool]: Whether to set the selected secrets as environment variables. Defaults\n      # to false.\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-aws", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../dagster-azure/", "title": "Azure (dagster-azure)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-airflow/", "title": "Airflow (dagster-airflow)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-azure", "Azure (dagster-azure)", "N", "next"], ["sections/api/apidocs/libraries/dagster-airflow", "Airflow (dagster-airflow)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-aws.rst.txt", "title": "AWS (dagster-aws)", "toc": "\n"}, "dagster-azure": {"alabaster_version": "0.7.12", "body": "
\n

Azure (dagster-azure)\u00b6

\n

Utilities for using Azure Storage Accounts with Dagster. This is mostly aimed at Azure Data Lake\nStorage Gen 2 (ADLS2) but also contains some utilities for Azure Blob Storage.

\n
\n

\n
\n

NOTE: This package is incompatible with dagster-snowflake! This is due to a version conflict\non the underlying azure-storage-blob package: dagster-snowflake has a transitive\ndependency on an old version, via snowflake-connector-python.

\n
\n
\ndagster_azure.adls2.adls2_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
storage_account (dagster.StringSource)
\n

The storage account name.

\n
\n
credential (selector)
\n

The credentials with which to authenticate.

\n
\nConfig Schema:
\n
sas (dagster.StringSource)
\n

SAS token for the account.

\n
\n
key (dagster.StringSource)
\n

Shared Access Key for the account

\n
\n
\n
\n
\n

Resource that gives ops access to Azure Data Lake Storage Gen2.

\n

The underlying client is a DataLakeServiceClient.

\n

Attach this resource definition to a JobDefinition in order to make it\navailable to your ops.

\n

Example

\n
from dagster import job, op\nfrom dagster_azure.adls2 import adls2_resource\n\n@op(required_resource_keys={'adls2'})\ndef example_adls2_op(context):\n    return list(context.resources.adls2.adls2_client.list_file_systems())\n\n@job(resource_defs={"adls2": adls2_resource})\ndef my_job():\n    example_adls2_op()\n
\n
\n

Note that your ops must also declare that they require this resource with\nrequired_resource_keys, or it will not be initialized for the execution of their compute\nfunctions.

\n

You may pass credentials to this resource using either a SAS token or a key, using\nenvironment variables if desired:

\n
resources:\n  adls2:\n    config:\n      storage_account: my_storage_account\n      # str: The storage account name.\n      credential:\n        sas: my_sas_token\n        # str: the SAS token for the account.\n        key:\n          env: AZURE_DATA_LAKE_STORAGE_KEY\n        # str: The shared access key for the account.\n
\n
\n
\n\n
\n
\nclass dagster_azure.adls2.FakeADLS2Resource(account_name, credential='fake-creds')[source]\u00b6
\n

Stateful mock of an ADLS2Resource for testing.

\n

Wraps a mock.MagicMock. Containers are implemented using an in-memory dict.

\n
\n\n
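
A hedged sketch of exercising an ADLS2-dependent op against this mock in a unit test; build_op_context and the op below are illustrative and not part of this API entry.

\n
# Hedged sketch: unit-testing an op against FakeADLS2Resource (no Azure calls are made).
from dagster import build_op_context, op
from dagster_azure.adls2 import FakeADLS2Resource

@op(required_resource_keys={"adls2"})
def list_file_systems_op(context):
    return list(context.resources.adls2.adls2_client.list_file_systems())

def test_list_file_systems_op():
    context = build_op_context(
        resources={"adls2": FakeADLS2Resource(account_name="test-account")}
    )
    # The fake wraps a MagicMock, so the call succeeds without network access.
    assert list_file_systems_op(context) is not None
\n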
\n
\ndagster_azure.adls2.adls2_file_cache ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
storage_account (dagster.StringSource)
\n

The storage account name.

\n
\n
credential (selector)
\n

The credentials with which to authenticate.

\n
\nConfig Schema:
\n
sas (dagster.StringSource)
\n

SAS token for the account.

\n
\n
key (dagster.StringSource)
\n

Shared Access Key for the account

\n
\n
\n
\n
prefix (dagster.StringSource)
\n

The base path prefix to use in ADLS2

\n
\n
file_system (dagster.StringSource)
\n

The storage account filesystem (aka container)

\n
\n
overwrite (Bool, optional)
\n

Default Value: False

\n
\n
\n
\n\n
\n
\nclass dagster_azure.blob.AzureBlobComputeLogManager(storage_account, container, secret_key, local_dir=None, inst_data=None, prefix='dagster')[source]\u00b6
\n

Logs op compute function stdout and stderr to Azure Blob Storage.

\n

This is also compatible with Azure Data Lake Storage.

\n

Users should not instantiate this class directly. Instead, use a YAML block in dagster.yaml\nsuch as the following:

\n
compute_logs:\n  module: dagster_azure.blob.compute_log_manager\n  class: AzureBlobComputeLogManager\n  config:\n    storage_account: my-storage-account\n    container: my-container\n    credential: sas-token-or-secret-key\n    prefix: "dagster-test-"\n    local_dir: "/tmp/cool"\n
\n
\n
\n
Parameters
\n
    \n
  • storage_account (str) \u2013 The storage account name to which to log.

  • \n
  • container (str) \u2013 The container (or ADLS2 filesystem) to which to log.

  • \n
  • secret_key (str) \u2013 Secret key for the storage account. SAS tokens are not\nsupported because we need a secret key to generate a SAS token for a download URL.

  • \n
  • local_dir (Optional[str]) \u2013 Path to the local directory in which to stage logs. Default:\ndagster.seven.get_system_temp_directory().

  • \n
  • prefix (Optional[str]) \u2013 Prefix for the log file keys.

  • \n
  • inst_data (Optional[ConfigurableClassData]) \u2013 Serializable representation of the compute\nlog manager when newed up from config.

  • \n
\n
\n
\n
\n\n
\n
\ndagster_azure.adls2.adls2_file_manager ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
storage_account (dagster.StringSource)
\n

The storage account name.

\n
\n
credential (selector)
\n

The credentials with which to authenticate.

\n
\nConfig Schema:
\n
sas (dagster.StringSource)
\n

SAS token for the account.

\n
\n
key (dagster.StringSource)
\n

Shared Access Key for the account

\n
\n
\n
\n
adls2_file_system (dagster.StringSource)
\n

ADLS Gen2 file system name

\n
\n
adls2_prefix (dagster.StringSource, optional)
\n

Default Value: \u2018dagster\u2019

\n
\n
\n

FileManager that provides abstract access to ADLS2.

\n

Implements the FileManager API.

\n
\n\n
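
A hedged sketch of configuring the file manager and writing bytes from an op; the storage account, credential, and file system names are placeholders, and write_data is the generic FileManager call assumed here.

\n
# Hedged sketch: configuring adls2_file_manager and using it from an op.
from dagster import job, op
from dagster_azure.adls2 import adls2_file_manager

@op(required_resource_keys={"file_manager"})
def write_report(context):
    # Assumed FileManager API call; returns a handle to the uploaded bytes.
    return context.resources.file_manager.write_data(b"hello, ADLS2")

@job(
    resource_defs={
        "file_manager": adls2_file_manager.configured(
            {
                "storage_account": "my_storage_account",            # placeholder
                "credential": {"sas": {"env": "AZURE_SAS_TOKEN"}},   # placeholder env var
                "adls2_file_system": "my-file-system",               # placeholder
                "adls2_prefix": "dagster",
            }
        )
    }
)
def report_job():
    write_report()
\n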
\n
\nclass dagster_azure.adls2.ADLS2FileHandle(account, file_system, key)[source]\u00b6
\n

A reference to a file on ADLS2.

\n
\n
\nproperty account\u00b6
\n

The name of the ADLS2 account.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty adls2_path\u00b6
\n

The file\u2019s ADLS2 URL.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty file_system\u00b6
\n

The name of the ADLS2 file system.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty key\u00b6
\n

The ADLS2 key.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty path_desc\u00b6
\n

The file\u2019s ADLS2 URL.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n\n
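
A small illustrative example of constructing a handle from the signature above and reading its documented properties; the account, file system, and key values are placeholders.

\n
# Constructing an ADLS2FileHandle from the documented signature; values are placeholders.
from dagster_azure.adls2 import ADLS2FileHandle

handle = ADLS2FileHandle(
    account="my_storage_account",
    file_system="my-file-system",
    key="path/to/data.parquet",
)

print(handle.account)      # the ADLS2 account name
print(handle.file_system)  # the ADLS2 file system name
print(handle.key)          # the ADLS2 key
print(handle.path_desc)    # the file's ADLS2 URL
\n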
\n
\ndagster_azure.adls2.adls2_pickle_io_manager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
adls2_file_system (dagster.StringSource)
\n

ADLS Gen2 file system name

\n
\n
adls2_prefix (dagster.StringSource, optional)
\n

Default Value: \u2018dagster\u2019

\n
\n
\n

Persistent IO manager using Azure Data Lake Storage Gen2 for storage.

\n

Serializes objects via pickling. Suitable for object storage for distributed executors, so long\nas each execution node has network connectivity and credentials for ADLS and the backing\ncontainer.

\n

Attach this resource definition to your job in order to make it available to all your ops:

\n
@job(resource_defs={\n    'io_manager': adls2_pickle_io_manager,\n    'adls2': adls2_resource,\n    ...,\n})\ndef my_job():\n    ...\n
\n
\n

You may configure this storage as follows:

\n
resources:\n    io_manager:\n        config:\n            adls2_file_system: my-cool-file-system\n            adls2_prefix: good/prefix-for-files-\n
\n
\n
\n\n
\n
\ndagster_azure.adls2.adls2_pickle_asset_io_manager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
adls2_file_system (dagster.StringSource)
\n

ADLS Gen2 file system name

\n
\n
adls2_prefix (dagster.StringSource, optional)
\n

Default Value: \u2018dagster\u2019

\n
\n
\n

Persistent IO manager using Azure Data Lake Storage Gen2 for storage, meant for use with\nsoftware-defined assets.

\n

Each asset is assigned to a single filesystem path, so subsequent materializations of an asset\nwill overwrite previous materializations of that asset.

\n

Serializes objects via pickling. Suitable for object storage for distributed executors, so long\nas each execution node has network connectivity and credentials for ADLS and the backing\ncontainer.

\n

Attach this resource definition to your job in order to make it available to all your ops:

\n
asset_group = AssetGroup(\n    assets...,\n    resource_defs={'io_manager': adls2_pickle_io_manager, "adls2": adls2_resource, ...}),\n)\n
\n
\n

You may configure this storage as follows:

\n
resources:\n    io_manager:\n        config:\n            adls2_file_system: my-cool-file-system\n            adls2_prefix: good/prefix-for-files\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-azure", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "\n", "next": {"link": "../dagster-celery/", "title": "Celery (dagster-celery)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-aws/", "title": "AWS (dagster-aws)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-celery", "Celery (dagster-celery)", "N", "next"], ["sections/api/apidocs/libraries/dagster-aws", "AWS (dagster-aws)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-azure.rst.txt", "title": "Azure (dagster-azure)", "toc": "\n"}, "dagster-celery": {"alabaster_version": "0.7.12", "body": "
\n

Celery (dagster-celery)\u00b6

\n
\n

Quickstart\u00b6

\n

To get a local rabbitmq broker started and available via the default\npyamqp://guest@localhost:5672, in the dagster/python_modules/libraries/dagster-celery/\ndirectory run:

\n
docker-compose up\n
\n
\n

To run a celery worker:

\n
celery -A dagster_celery.app worker -l info\n
\n
\n

To start multiple workers in the background, run:

\n
celery multi start w2 -A dagster_celery.app -l info\n
\n
\n

To execute a job using the celery-backed executor, you\u2019ll need to set the job\u2019s executor_def to\nthe celery_executor.

\n
from dagster import job\nfrom dagster_celery import celery_executor\n\n@job(executor_def=celery_executor)\ndef my_job():\n    pass\n
\n
\n
\n

Monitoring your Celery tasks\u00b6

\n

We advise using Flower (https://celery.readthedocs.io/en/latest/userguide/monitoring.html#flower-real-time-celery-web-monitor):

\n
celery -A dagster_celery.app flower\n
\n
\n
\n
\n

Customizing the Celery broker, backend, and other app configuration\u00b6

\n

By default, this will use amqp://guest:**@localhost:5672// as the Celery broker URL and\nrpc:// as the results backend. In production, you will want to change these values. Pending the\nintroduction of a dagster_celery CLI, doing so entails writing a Python module my_module as\nfollows:

\n
from celery import Celery\n\nfrom dagster_celery.tasks import create_task\n\napp = Celery('dagster', broker_url='some://custom@value', ...)\n\nexecute_plan = create_task(app)\n\nif __name__ == '__main__':\n    app.worker_main()\n
\n
\n

You can then run the celery worker using:

\n
celery -A my_module worker --loglevel=info\n
\n
\n

This customization mechanism is used to implement dagster_celery_k8s and dagster_celery_docker, which delegate the execution of steps to ephemeral Kubernetes pods and Docker containers, respectively.

\n
\n
\n

Celery best practices\u00b6

\n

Celery is a rich and full-featured system. We\u2019ve found the following resources helpful:

\n\n
\n
\n
\n

API\u00b6

\n
\n
\ndagster_celery.celery_executor ExecutorDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
broker (Union[dagster.StringSource, None], optional)
\n

The URL of the Celery broker. Default: \u2018pyamqp://guest@{os.getenv(\u2018DAGSTER_CELERY_BROKER_HOST\u2019,\u2019localhost\u2019)}//\u2019.

\n
\n
backend (Union[dagster.StringSource, None], optional)
\n

The URL of the Celery results backend. Default: \u2018rpc://\u2019.

\n

Default Value: \u2018rpc://\u2019

\n
\n
include (List[String], optional)
\n

List of modules every worker should import

\n
\n
config_source (Union[permissive dict, None], optional)
\n

Additional settings for the Celery app.

\n
\n
retries (selector, optional)
\n
\nDefault Value:
{\n    "enabled": {}\n}\n
\n
\n
\nConfig Schema:
\n
enabled (strict dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
disabled (strict dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
\n

Celery-based executor.

\n

The Celery executor exposes config settings for the underlying Celery app under\nthe config_source key. This config corresponds to the \u201cnew lowercase settings\u201d introduced\nin Celery version 4.0 and the object constructed from config will be passed to the\ncelery.Celery constructor as its config_source argument.\n(See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)

\n

The executor also exposes the broker, backend, and include arguments to the\ncelery.Celery constructor.

\n

In the most common case, you may want to modify the broker and backend (e.g., to use\nRedis instead of RabbitMQ). We expect that config_source will be less frequently\nmodified, but that when op executions are especially fast or slow, or when there are\ndifferent requirements around idempotence or retry, it may make sense to execute jobs\nwith variations on these settings.

\n

To use the celery_executor, set it as the executor_def when defining a job:

\n
from dagster import job\nfrom dagster_celery import celery_executor\n\n@job(executor_def=celery_executor)\ndef celery_enabled_job():\n    pass\n
\n
\n

Then you can configure the executor as follows:

\n
execution:\n  config:\n    broker: 'pyamqp://guest@localhost//'  # Optional[str]: The URL of the Celery broker\n    backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n    include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n    config_source: # Dict[str, Any]: Any additional parameters to pass to the\n        #...       # Celery workers. This dict will be passed as the `config_source`\n        #...       # argument of celery.Celery().\n
\n
\n

Note that the YAML you provide here must align with the configuration used to start the Celery\nworkers on which you hope to run. If, for example, you point the executor at a\ndifferent broker than the one your workers are listening to, the workers will never be able to\npick up tasks for execution.

\n
\n\n
\n
\n

CLI\u00b6

\n

The dagster-celery CLI lets you start, monitor, and terminate workers.

\n
\n

dagster-celery worker start\u00b6

\n

Start a dagster celery worker.

\n
dagster-celery worker start [OPTIONS] [ADDITIONAL_ARGS]...\n
\n
\n

Options

\n
\n
\n-n, --name <name>\u00b6
\n

The name of the worker. Defaults to a unique name prefixed with \u201cdagster-\u201d and ending with the hostname.

\n
\n\n
\n
\n-y, --config-yaml <config_yaml>\u00b6
\n

Specify the path to a config YAML file with options for the worker. This is the same config block that you provide to dagster_celery.celery_executor when configuring a job for execution with Celery, with, e.g., the URL of the broker to use.

\n
\n\n
\n
\n-q, --queue <queue>\u00b6
\n

Names of the queues on which this worker should listen for tasks. Provide multiple -q arguments to specify multiple queues. Note that each celery worker may listen on no more than four queues.

\n
\n\n
\n
\n-d, --background\u00b6
\n

Set this flag to run the worker in the background.

\n
\n\n
\n
\n-i, --includes <includes>\u00b6
\n

Python modules the worker should import. Provide multiple -i arguments to specify multiple modules.

\n
\n\n
\n
\n-l, --loglevel <loglevel>\u00b6
\n

Log level for the worker.

\n
\n\n
\n
\n-A, --app <app>\u00b6
\n
\n\n

Arguments

\n
\n
\nADDITIONAL_ARGS\u00b6
\n

Optional argument(s)

\n
\n\n
\n
\n

dagster-celery worker list\u00b6

\n

List running dagster-celery workers. Note that we use the broker to contact the workers.

\n
dagster-celery worker list [OPTIONS]\n
\n
\n

Options

\n
\n
\n-y, --config-yaml <config_yaml>\u00b6
\n

Specify the path to a config YAML file with options for the workers you are trying to manage. This is the same config block that you provide to dagster_celery.celery_executor when configuring a job for execution with Celery, with, e.g., the URL of the broker to use. Without this config file, you will not be able to find your workers (since the CLI won\u2019t know how to reach the broker).

\n
\n\n
\n
\n

dagster-celery worker terminate\u00b6

\n

Shut down dagster-celery workers. Note that we use the broker to send signals to the workers to terminate \u2013 if the broker is not running, this command is a no-op. Provide the argument NAME to terminate a specific worker by name.

\n
dagster-celery worker terminate [OPTIONS] [NAME]\n
\n
\n

Options

\n
\n
\n-a, --all\u00b6
\n

Set this flag to terminate all running workers.

\n
\n\n
\n
\n-y, --config-yaml <config_yaml>\u00b6
\n

Specify the path to a config YAML file with options for the workers you are trying to manage. This is the same config block that you provide to dagster_celery.celery_executor when configuring a job for execution with Celery, with, e.g., the URL of the broker to use. Without this config file, you will not be able to terminate your workers (since the CLI won\u2019t know how to reach the broker).

\n
\n\n

Arguments

\n
\n
\nNAME\u00b6
\n

Optional argument

\n
\n\n
\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-celery", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../dagster-celery-docker/", "title": "Orchestration on Celery + Docker"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-azure/", "title": "Azure (dagster-azure)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-celery-docker", "Orchestration on Celery + Docker", "N", "next"], ["sections/api/apidocs/libraries/dagster-azure", "Azure (dagster-azure)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-celery.rst.txt", "title": "Celery (dagster-celery)", "toc": "\n"}, "dagster-celery-docker": {"alabaster_version": "0.7.12", "body": "
\n

Orchestration on Celery + Docker\u00b6

\n
\n
\n

APIs\u00b6

\n
\n
\ndagster_celery_docker.celery_docker_executor ExecutorDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
broker (Union[dagster.StringSource, None], optional)
\n

The URL of the Celery broker. Default: \u2018pyamqp://guest@{os.getenv(\u2018DAGSTER_CELERY_BROKER_HOST\u2019,\u2019localhost\u2019)}//\u2019.

\n
\n
backend (Union[dagster.StringSource, None], optional)
\n

The URL of the Celery results backend. Default: \u2018rpc://\u2019.

\n

Default Value: \u2018rpc://\u2019

\n
\n
include (List[String], optional)
\n

List of modules every worker should import

\n
\n
config_source (Union[permissive dict, None], optional)
\n

Additional settings for the Celery app.

\n
\n
retries (selector, optional)
\n
\nDefault Value:
{\n    "enabled": {}\n}\n
\n
\n
\nConfig Schema:
\n
enabled (strict dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
disabled (strict dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
docker (strict dict)
\n

The configuration for interacting with docker in the celery worker.

\n
\nConfig Schema:
\n
image (dagster.StringSource, optional)
\n

The docker image to be used for step execution.

\n
\n
registry (strict dict, optional)
\n

Information for using a non local/public docker registry

\n
\nConfig Schema:
\n
url (dagster.StringSource)
\n

\n
username (dagster.StringSource)
\n

\n
password (dagster.StringSource)
\n

\n
\n
\n
env_vars (List[String], optional)
\n

The list of environment variable names to forward from the Celery worker into the Docker container

\n
\n
network (String, optional)
\n

Name of the network this container will be connected to at creation time

\n
\n
container_kwargs (permissive dict, optional)
\n

Additional keyword args for the docker container

\n
\n
\n
\n
\n

Celery-based executor which launches tasks in docker containers.

\n

The Celery executor exposes config settings for the underlying Celery app under\nthe config_source key. This config corresponds to the \u201cnew lowercase settings\u201d introduced\nin Celery version 4.0 and the object constructed from config will be passed to the\ncelery.Celery constructor as its config_source argument.\n(See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)

\n

The executor also exposes the broker, backend, and include arguments to the\ncelery.Celery constructor.

\n

In the most common case, you may want to modify the broker and backend (e.g., to use\nRedis instead of RabbitMQ). We expect that config_source will be less frequently\nmodified, but that when op executions are especially fast or slow, or when there are\ndifferent requirements around idempotence or retry, it may make sense to execute jobs\nwith variations on these settings.

\n

To use the celery_docker_executor, set it as the executor_def when defining a job:

\n
from dagster import job\nfrom dagster_celery_docker.executor import celery_docker_executor\n\n@job(executor_def=celery_docker_executor)\ndef celery_enabled_job():\n    pass\n
\n
\n

Then you can configure the executor as follows:

\n
execution:\n  config:\n    docker:\n      image: 'my_repo.com/image_name:latest'\n      registry:\n        url: 'my_repo.com'\n        username: 'my_user'\n        password: {env: 'DOCKER_PASSWORD'}\n      env_vars: ["DAGSTER_HOME"] # environment vars to pass from celery worker to docker\n      container_kwargs: # keyword args to be passed to the container. example:\n        volumes: ['/home/user1/:/mnt/vol2','/var/www:/mnt/vol1']\n\n    broker: 'pyamqp://guest@localhost//'  # Optional[str]: The URL of the Celery broker\n    backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n    include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n    config_source: # Dict[str, Any]: Any additional parameters to pass to the\n        #...       # Celery workers. This dict will be passed as the `config_source`\n        #...       # argument of celery.Celery().\n
\n
\n

Note that the YAML you provide here must align with the configuration used to start the Celery\nworkers on which you hope to run. If, for example, you point the executor at a\ndifferent broker than the one your workers are listening to, the workers will never be able to\npick up tasks for execution.

\n

In deployments where the celery_docker_executor is used, all appropriate celery and dagster_celery\ncommands must be invoked with the -A dagster_celery_docker.app argument.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-celery-docker", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../dagster-celery-k8s/", "title": "Orchestration on Celery + Kubernetes"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-celery/", "title": "Celery (dagster-celery)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-celery-k8s", "Orchestration on Celery + Kubernetes", "N", "next"], ["sections/api/apidocs/libraries/dagster-celery", "Celery (dagster-celery)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-celery-docker.rst.txt", "title": "Orchestration on Celery + Docker", "toc": "\n"}, "dagster-celery-k8s": {"alabaster_version": "0.7.12", "body": "
\n

Orchestration on Celery + Kubernetes\u00b6

\n
\n
\n

APIs\u00b6

\n
\n
\ndagster_celery_k8s.CeleryK8sRunLauncher RunLauncher[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
job_image (Union[dagster.StringSource, None], optional)
\n

Docker image to use for launched Jobs. If this field is empty, the image that was used to originally load the Dagster repository will be used. (Ex: \u201cmycompany.com/dagster-k8s-image:latest\u201d).

\n
\n
image_pull_policy (Union[dagster.StringSource, None], optional)
\n

Image pull policy to set on launched Pods.

\n
\n
image_pull_secrets (Union[List[strict dict], None], optional)
\n

Specifies that Kubernetes should get the credentials from the Secrets named in this list.

\n
\n
service_account_name (Union[dagster.StringSource, None], optional)
\n

The name of the Kubernetes service account under which to run.

\n
\n
env_config_maps (Union[List[dagster.StringSource], None], optional)
\n

A list of custom ConfigMapEnvSource names from which to draw environment variables (using envFrom) for the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/#define-an-environment-variable-for-a-container

\n
\n
env_secrets (Union[List[dagster.StringSource], None], optional)
\n

A list of custom Secret names from which to draw environment variables (using envFrom) for the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
env_vars (Union[List[String], None], optional)
\n

A list of environment variables to inject into the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
volume_mounts (List[permissive dict], optional)
\n

A list of volume mounts to include in the job\u2019s container. Default: []. See: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volumemount-v1-core

\n

Default Value: []

\n
\n
volumes (List[permissive dict], optional)
\n

A list of volumes to include in the Job\u2019s Pod. Default: []. For the many possible volume source types that can be included, see: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volume-v1-core

\n

Default Value: []

\n
\n
labels (permissive dict, optional)
\n

Labels to apply to all created pods. See: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels

\n
\n
resources (Union[strict dict, None], optional)
\n

Compute resource requirements for the container. See: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/

\n
\n
instance_config_map (dagster.StringSource)
\n

The name of an existing Volume to mount into the pod in order to provide a ConfigMap for the Dagster instance. This Volume should contain a dagster.yaml with appropriate values for run storage, event log storage, etc.

\n
\n
postgres_password_secret (dagster.StringSource, optional)
\n

The name of the Kubernetes Secret where the postgres password can be retrieved. It will be mounted and supplied as an environment variable to the Job Pod. The Secret must contain the key "postgresql-password", which will be exposed in the Job environment as the environment variable DAGSTER_PG_PASSWORD.

\n
\n
dagster_home (dagster.StringSource, optional)
\n

The location of DAGSTER_HOME in the Job container; this is where the dagster.yaml file will be mounted from the instance ConfigMap specified here. Defaults to /opt/dagster/dagster_home.

\n

Default Value: \u2018/opt/dagster/dagster_home\u2019

\n
\n
load_incluster_config (Bool, optional)
\n

Set this value if you are running the launcher\nwithin a k8s cluster. If True, we assume the launcher is running within the target\ncluster and load config using kubernetes.config.load_incluster_config. Otherwise,\nwe will use the k8s config specified in kubeconfig_file (using\nkubernetes.config.load_kube_config) or fall back to the default kubeconfig.

\n

Default Value: True

\n
\n
kubeconfig_file (Union[String, None], optional)
\n

The kubeconfig file from which to load config. Defaults to using the default kubeconfig.

\n

Default Value: None

\n
\n
fail_pod_on_run_failure (Bool, optional)
\n

Whether the launched Kubernetes Jobs and Pods should fail if the Dagster run fails

\n
\n
broker (Union[dagster.StringSource, None], optional)
\n

The URL of the Celery broker. Default: \u2018pyamqp://guest@{os.getenv(\u2018DAGSTER_CELERY_BROKER_HOST\u2019,\u2019localhost\u2019)}//\u2019.

\n
\n
backend (Union[dagster.StringSource, None], optional)
\n

The URL of the Celery results backend. Default: \u2018rpc://\u2019.

\n

Default Value: \u2018rpc://\u2019

\n
\n
include (List[String], optional)
\n

List of modules every worker should import

\n
\n
config_source (Union[permissive dict, None], optional)
\n

Additional settings for the Celery app.

\n
\n
retries (selector, optional)
\n
\nDefault Value:
{\n    "enabled": {}\n}\n
\n
\n
\nConfig Schema:
\n
enabled (strict dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
disabled (strict dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
\n

In contrast to the K8sRunLauncher, which launches dagster runs as single K8s\nJobs, this run launcher is intended for use in concert with\ndagster_celery_k8s.celery_k8s_job_executor().

\n

With this run launcher, execution is delegated to:

\n
\n
    \n
  1. A run worker Kubernetes Job, which traverses the dagster run execution plan and\nsubmits steps to Celery queues for execution;

  2. The step executions which are submitted to Celery queues are picked up by Celery workers,\nand each step execution spawns a step execution Kubernetes Job. See the implementation\ndefined in dagster_celery_k8s.executor.create_k8s_job_task().
\n
\n

You can configure a Dagster instance to use this RunLauncher by adding a section to your\ndagster.yaml like the following:

\n
run_launcher:\n  module: dagster_celery_k8s.launcher\n  class: CeleryK8sRunLauncher\n  config:\n    instance_config_map: "dagster-k8s-instance-config-map"\n    dagster_home: "/some/path"\n    postgres_password_secret: "dagster-k8s-pg-password"\n    broker: "some_celery_broker_url"\n    backend: "some_celery_backend_url"\n
\n
\n
\n\n
\n
\ndagster_celery_k8s.celery_k8s_job_executor ExecutorDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
broker (Union[dagster.StringSource, None], optional)
\n

The URL of the Celery broker. Default: \u2018pyamqp://guest@{os.getenv(\u2018DAGSTER_CELERY_BROKER_HOST\u2019,\u2019localhost\u2019)}//\u2019.

\n
\n
backend (Union[dagster.StringSource, None], optional)
\n

The URL of the Celery results backend. Default: \u2018rpc://\u2019.

\n

Default Value: \u2018rpc://\u2019

\n
\n
include (List[String], optional)
\n

List of modules every worker should import

\n
\n
config_source (Union[permissive dict, None], optional)
\n

Additional settings for the Celery app.

\n
\n
retries (selector, optional)
\n
\nDefault Value:
{\n    "enabled": {}\n}\n
\n
\n
\nConfig Schema:
\n
enabled (strict dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
disabled (strict dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
job_image (Union[dagster.StringSource, None], optional)
\n

Docker image to use for launched Jobs. If this field is empty, the image that was used to originally load the Dagster repository will be used. (Ex: \u201cmycompany.com/dagster-k8s-image:latest\u201d).

\n
\n
image_pull_policy (Union[dagster.StringSource, None], optional)
\n

Image pull policy to set on launched Pods.

\n
\n
image_pull_secrets (Union[List[strict dict], None], optional)
\n

Specifies that Kubernetes should get the credentials from the Secrets named in this list.

\n
\n
service_account_name (Union[dagster.StringSource, None], optional)
\n

The name of the Kubernetes service account under which to run.

\n
\n
env_config_maps (Union[List[dagster.StringSource], None], optional)
\n

A list of custom ConfigMapEnvSource names from which to draw environment variables (using envFrom) for the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/#define-an-environment-variable-for-a-container

\n
\n
env_secrets (Union[List[dagster.StringSource], None], optional)
\n

A list of custom Secret names from which to draw environment variables (using envFrom) for the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
env_vars (Union[List[String], None], optional)
\n

A list of environment variables to inject into the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
volume_mounts (List[permissive dict], optional)
\n

A list of volume mounts to include in the job\u2019s container. Default: []. See: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volumemount-v1-core

\n

Default Value: []

\n
\n
volumes (List[permissive dict], optional)
\n

A list of volumes to include in the Job\u2019s Pod. Default: []. For the many possible volume source types that can be included, see: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volume-v1-core

\n

Default Value: []

\n
\n
labels (permissive dict, optional)
\n

Labels to apply to all created pods. See: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels

\n
\n
resources (Union[strict dict, None], optional)
\n

Compute resource requirements for the container. See: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/

\n
\n
load_incluster_config (Bool, optional)
\n

Set this value if you are running the launcher within a k8s cluster. If\nTrue, we assume the launcher is running within the target cluster and load config\nusing kubernetes.config.load_incluster_config. Otherwise, we will use the k8s config\nspecified in kubeconfig_file (using kubernetes.config.load_kube_config) or fall\nback to the default kubeconfig. Default: True.

\n

Default Value: True

\n
\n
kubeconfig_file (Union[String, None], optional)
\n

Path to a kubeconfig file to use, if not using default kubeconfig.

\n
\n
job_namespace (dagster.StringSource, optional)
\n

The namespace into which to launch new jobs. Note that any other Kubernetes resources the Job requires (such as the service account) must be present in this namespace. Default: "default"

\n

Default Value: \u2018default\u2019

\n
\n
repo_location_name (dagster.StringSource, optional)
\n

The repository location name to use for execution.

\n

Default Value: \u2018<<in_process>>\u2019

\n
\n
job_wait_timeout (Float, optional)
\n

Wait this many seconds for a job to complete before marking the run as failed. Defaults to 86400.0 seconds.

\n

Default Value: 86400.0

\n
\n
\n

Celery-based executor which launches tasks as Kubernetes Jobs.

\n

The Celery executor exposes config settings for the underlying Celery app under\nthe config_source key. This config corresponds to the \u201cnew lowercase settings\u201d introduced\nin Celery version 4.0 and the object constructed from config will be passed to the\ncelery.Celery constructor as its config_source argument.\n(See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)

\n

The executor also exposes the broker, backend, and include arguments to the\ncelery.Celery constructor.

\n

In the most common case, you may want to modify the broker and backend (e.g., to use\nRedis instead of RabbitMQ). We expect that config_source will be less frequently\nmodified, but that when op executions are especially fast or slow, or when there are\ndifferent requirements around idempotence or retry, it may make sense to execute dagster jobs\nwith variations on these settings.

\n

To use the celery_k8s_job_executor, set it as the executor_def when defining a job:

\n
from dagster_celery_k8s.executor import celery_k8s_job_executor\n\nfrom dagster import job\n\n\n@job(executor_def=celery_k8s_job_executor)\ndef celery_enabled_job():\n    pass\n
\n
\n

Then you can configure the executor as follows:

\n
execution:\n  config:\n    job_image: 'my_repo.com/image_name:latest'\n    job_namespace: 'some-namespace'\n    broker: 'pyamqp://guest@localhost//'  # Optional[str]: The URL of the Celery broker\n    backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n    include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n    config_source: # Dict[str, Any]: Any additional parameters to pass to the\n        #...       # Celery workers. This dict will be passed as the `config_source`\n        #...       # argument of celery.Celery().\n
\n
\n

Note that the YAML you provide here must align with the configuration used to start the Celery\nworkers on which you hope to run. If, for example, you point the executor at a\ndifferent broker than the one your workers are listening to, the workers will never be able to\npick up tasks for execution.

\n

In deployments where the celery_k8s_job_executor is used, all appropriate celery and dagster_celery\ncommands must be invoked with the -A dagster_celery_k8s.app argument.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-celery-k8s", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../dagster-dask/", "title": "Dask (dagster-dask)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-celery-docker/", "title": "Orchestration on Celery + Docker"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-dask", "Dask (dagster-dask)", "N", "next"], ["sections/api/apidocs/libraries/dagster-celery-docker", "Orchestration on Celery + Docker", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-celery-k8s.rst.txt", "title": "Orchestration on Celery + Kubernetes", "toc": "\n"}, "dagster-dask": {"alabaster_version": "0.7.12", "body": "
\n

Dask (dagster-dask)\u00b6

\n

See also the Dask deployment guide.

\n
\n
\ndagster_dask.dask_executor ExecutorDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
cluster (selector)
\n
\nConfig Schema:
\n
existing (strict dict)
\n

Connect to an existing scheduler.

\n
\nConfig Schema:
\n
address (dagster.StringSource)
\n

\n
\n
\n
local (permissive dict, optional)
\n

Local cluster configuration.

\n
\n
yarn (permissive dict, optional)
\n

YARN cluster configuration.

\n
\n
ssh (permissive dict, optional)
\n

SSH cluster configuration.

\n
\n
pbs (permissive dict, optional)
\n

PBS cluster configuration.

\n
\n
moab (permissive dict, optional)
\n

Moab cluster configuration.

\n
\n
sge (permissive dict, optional)
\n

SGE cluster configuration.

\n
\n
lsf (permissive dict, optional)
\n

LSF cluster configuration.

\n
\n
slurm (permissive dict, optional)
\n

SLURM cluster configuration.

\n
\n
oar (permissive dict, optional)
\n

OAR cluster configuration.

\n
\n
kube (permissive dict, optional)
\n

Kubernetes cluster configuration.

\n
\n
\n
\n
\n

Dask-based executor.

\n

The \u2018cluster\u2019 can be one of the following:\n(\u2018existing\u2019, \u2018local\u2019, \u2018yarn\u2019, \u2018ssh\u2019, \u2018pbs\u2019, \u2018moab\u2019, \u2018sge\u2019, \u2018lsf\u2019, \u2018slurm\u2019, \u2018oar\u2019, \u2018kube\u2019).

\n

If the Dask executor is used without providing executor-specific config, a local Dask cluster\nwill be created (as when calling dask.distributed.Client()\nwith dask.distributed.LocalCluster()).

\n

The Dask executor optionally takes the following config:

\n
cluster:\n    {\n        local?: # takes distributed.LocalCluster parameters\n            {\n                timeout?: 5,  # Timeout duration for initial connection to the scheduler\n                n_workers?: 4  # Number of workers to start\n                threads_per_worker?: 1 # Number of threads per each worker\n            }\n    }\n
\n
\n

To use the dask_executor, set it as the executor_def when defining a job:

\n
from dagster import job\nfrom dagster_dask import dask_executor\n\n@job(executor_def=dask_executor)\ndef dask_enabled_job():\n    pass\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-dask", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "\n", "next": {"link": "../dagster-databricks/", "title": "Databricks (dagster-databricks)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-celery-k8s/", "title": "Orchestration on Celery + Kubernetes"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-databricks", "Databricks (dagster-databricks)", "N", "next"], ["sections/api/apidocs/libraries/dagster-celery-k8s", "Orchestration on Celery + Kubernetes", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-dask.rst.txt", "title": "Dask (dagster-dask)", "toc": "\n"}, "dagster-databricks": {"alabaster_version": "0.7.12", "body": "
\n

Databricks (dagster-databricks)\u00b6

\n

The dagster_databricks package provides two main pieces of functionality:

\n\n

Note that, for the databricks_pyspark_step_launcher, either S3 or Azure Data Lake Storage config\nmust be specified for ops to succeed, and the credentials for this storage must also be\nstored as a Databricks Secret and referenced in the resource config so that the Databricks cluster can\naccess storage.

\n
\n
\n

APIs\u00b6

\n
\n
\ndagster_databricks.create_databricks_job_op(name='databricks_job', num_inputs=1, description=None, required_resource_keys=frozenset({'databricks_client'}))[source]\u00b6
\n

Creates an op that launches a databricks job (not to be confused with Dagster\u2019s job API).

\n

As config, the op accepts a blob of the form described in Databricks\u2019 job API:\nhttps://docs.databricks.com/dev-tools/api/latest/jobs.html.

\n
\n
Returns
\n

An op definition.

\n
\n
Return type
\n

OpDefinition

\n
\n
\n

Example

\n
from dagster import graph\nfrom dagster_databricks import create_databricks_job_op, databricks_client\n\nsparkpi = create_databricks_job_op().configured(\n    {\n        "job": {\n            "name": "SparkPi Python job",\n            "new_cluster": {\n                "spark_version": "7.3.x-scala2.12",\n                "node_type_id": "i3.xlarge",\n                "num_workers": 2,\n            },\n            "spark_python_task": {"python_file": "dbfs:/docs/pi.py", "parameters": ["10"]},\n        }\n    },\n    name="sparkpi",\n)\n\n@graph\ndef my_spark():\n    sparkpi()\n\nmy_spark.to_job(\n    resource_defs={\n        "databricks_client": databricks_client.configured(\n            {"host": "my.workspace.url", "token": "my.access.token"}\n        )\n    }\n)\n
\n
\n
\n\n
\n
\ndagster_databricks.databricks_pyspark_step_launcher ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
run_config (strict dict)
\n

Databricks job run configuration

\n
\nConfig Schema:
\n
cluster (selector)
\n
\nConfig Schema:
\n
new (strict dict)
\n
\nConfig Schema:
\n
size (selector)
\n
\nConfig Schema:
\n
autoscale (strict dict)
\n
\nConfig Schema:
\n
min_workers (Int)
\n

The minimum number of workers to which the cluster can scale down when underutilized. It is also the initial number of workers the cluster will have after creation.

\n
\n
max_workers (Int)
\n

The maximum number of workers to which the cluster can scale up when overloaded. max_workers must be strictly greater than min_workers.

\n
\n
\n
\n
num_workers (Int)
\n

The number of worker nodes that this cluster should have. A cluster has one Spark Driver and num_workers Executors, for a total of num_workers + 1 Spark nodes.

\n
\n
\n
\n
spark_version (String)
\n

The Spark version of the cluster. A list of available Spark versions can be retrieved by using the Runtime versions API call. This field is required.

\n
\n
spark_conf (permissive dict, optional)
\n

An object containing a set of optional, user-specified Spark configuration key-value pairs. You can also pass in a string of extra JVM options to the driver and the executors via spark.driver.extraJavaOptions and spark.executor.extraJavaOptions respectively. Example Spark confs: {\u201cspark.speculation\u201d: true, \u201cspark.streaming.ui.retainedBatches\u201d: 5} or {\u201cspark.driver.extraJavaOptions\u201d: \u201c-verbose:gc -XX:+PrintGCDetails\u201d}

\n
\n
nodes (selector)
\n

The nodes used in the cluster. Either the node types or an instance pool can be specified.

\n
\nConfig Schema:
\n
node_types (strict dict)
\n
\nConfig Schema:
\n
node_type_id (String)
\n

This field encodes, through a single value, the resources available to each of the Spark nodes in this cluster. For example, the Spark nodes can be provisioned and optimized for memory or compute intensive workloads. A list of available node types can be retrieved by using the List node types API call. This field is required.

\n
\n
driver_node_type_id (String, optional)
\n

The node type of the Spark driver. This field is optional; if unset, the driver node type is set as the same value as node_type_id defined above.

\n
\n
\n
\n
instance_pool_id (String, optional)
\n

The optional ID of the instance pool to which the cluster belongs. Refer to the Instance Pools API for details.

\n
\n
\n
\n
ssh_public_keys (List[String], optional)
\n

SSH public key contents that will be added to each Spark node in this cluster. The corresponding private keys can be used to login with the user name ubuntu on port 2200. Up to 10 keys can be specified.

\n
\n
custom_tags (List[strict dict], optional)
\n

Additional tags for cluster resources. Databricks tags all cluster resources (e.g., AWS instances and EBS volumes) with these tags in addition to default_tags. Note: tags are not supported on legacy node types such as compute-optimized and memory-optimized, and Databricks allows at most 45 custom tags. More restrictions may apply if using Azure Databricks; refer to the official docs for further details.

\n
\n
cluster_log_conf (selector, optional)
\n

Recommended! The configuration for delivering Spark logs to a long-term storage destination. Only one destination can be specified for one cluster. If the conf is given, the logs will be delivered to the destination every 5 mins. The destination of driver logs is <destination>/<cluster-id>/driver, while the destination of executor logs is <destination>/<cluster-id>/executor.

\n
\nConfig Schema:
\n
dbfs (strict dict)
\n

DBFS storage information

\n
\nConfig Schema:
\n
destination (String)
\n

DBFS destination, e.g. dbfs:/my/path

\n
\n
\n
\n
s3 (strict dict)
\n

S3 storage information

\n
\nConfig Schema:
\n
destination (String)
\n

S3 destination, e.g. s3://my-bucket/some-prefix. You must configure the cluster with an instance profile and the instance profile must have write access to the destination. You cannot use AWS keys.

\n
\n
region (String)
\n

S3 region, e.g. us-west-2. Either region or endpoint must be set. If both are set, endpoint is used.

\n
\n
endpoint (String)
\n

S3 endpoint, e.g. https://s3-us-west-2.amazonaws.com. Either region or endpoint must be set. If both are set, endpoint is used.

\n
\n
enable_encryption (Bool, optional)
\n

(Optional) Enable server side encryption, false by default.

\n
\n
encryption_type (String, optional)
\n

(Optional) The encryption type, it could be sse-s3 or sse-kms. It is used only when encryption is enabled and the default type is sse-s3.

\n
\n
kms_key (String, optional)
\n

(Optional) KMS key used if encryption is enabled and encryption type is set to sse-kms.

\n
\n
canned_acl (String, optional)
\n

(Optional) Set a canned access control list, e.g. bucket-owner-full-control. If canned_acl is set, the cluster instance profile must have s3:PutObjectAcl permission on the destination bucket and prefix. The full list of possible canned ACLs can be found at https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl. By default only the object owner gets full control. If you are using a cross-account role for writing data, you may want to set bucket-owner-full-control so that the bucket owner is able to read the logs.

\n
\n
\n
\n
\n
\n
init_scripts (List[selector], optional)
\n

The configuration for storing init scripts. Any number of scripts can be specified. The scripts are executed sequentially in the order provided. If cluster_log_conf is specified, init script logs are sent to <destination>/<cluster-id>/init_scripts.

\n
\n
spark_env_vars (permissive dict, optional)
\n

An object containing a set of optional, user-specified environment variable key-value pairs. Key-value pair of the form (X,Y) are exported as is (i.e., export X=\u201dY\u201d) while launching the driver and workers. To specify an additional set of SPARK_DAEMON_JAVA_OPTS, we recommend appending them to $SPARK_DAEMON_JAVA_OPTS as shown in the example below. This ensures that all default Databricks managed environmental variables are included as well. Example Spark environment variables: {\u201cSPARK_WORKER_MEMORY\u201d: \u201c28000m\u201d, \u201cSPARK_LOCAL_DIRS\u201d: \u201c/local_disk0\u201d} or {\u201cSPARK_DAEMON_JAVA_OPTS\u201d: \u201c$SPARK_DAEMON_JAVA_OPTS -Dspark.shuffle.service.enabled=true\u201d}

\n
\n
enable_elastic_disk (Bool, optional)
\n

Autoscaling Local Storage: when enabled, this cluster dynamically acquires additional disk space when its Spark workers are running low on disk space. This feature requires specific AWS permissions to function correctly - refer to https://docs.databricks.com/clusters/configure.html#autoscaling-local-storage for details.

\n
\n
\n
\n
existing (String)
\n

The ID of an existing cluster that will be used for all runs of this job. When running jobs on an existing cluster, you may need to manually restart the cluster if it stops responding. Databricks suggests running jobs on new clusters for greater reliability.

\n
\n
\n
\n
run_name (String, optional)
\n

An optional name for the run. The default value is Untitled

\n
\n
libraries (List[selector], optional)
\n

An optional list of libraries to be installed on the cluster that will execute the job. By default dagster, dagster-databricks and dagster-pyspark libraries will be included.

\n
\n
timeout_seconds (Int, optional)
\n

An optional timeout applied to each run of this job. The default behavior is to have no timeout.

\n
\n
idempotency_token (String, optional)
\n

An optional token that can be used to guarantee the idempotency of job run requests. If an active run with the provided token already exists, the request will not create a new run, but will return the ID of the existing run instead. If you specify the idempotency token, upon failure you can retry until the request succeeds. Databricks guarantees that exactly one run will be launched with that idempotency token. This token should have at most 64 characters.

\n
\n
\n
\n
databricks_host (dagster.StringSource)
\n

Databricks host, e.g. uksouth.azuredatabricks.com

\n
\n
databricks_token (dagster.StringSource)
\n

Databricks access token

\n
\n
secrets_to_env_variables (List[strict dict], optional)
\n

Databricks secrets to be exported as environment variables. Since runs will execute in the Databricks runtime environment, environment variables (such as those required for a StringSource config variable) will not be accessible to Dagster. These variables must be stored as Databricks secrets and specified here, which will ensure they are re-exported as environment variables accessible to Dagster upon execution.

\n
\n
storage (selector, optional)
\n

Databricks storage configuration for either S3 or ADLS2. If access credentials for your Databricks storage are stored in Databricks secrets, this config indicates the secret scope and the secret keys used to access either S3 or ADLS2.

\n
\nConfig Schema:
\n
s3 (strict dict)
\n

S3 storage secret configuration

\n
\nConfig Schema:
\n
secret_scope (String)
\n

The Databricks secret scope containing the storage secrets.

\n
\n
access_key_key (String)
\n

The key of a Databricks secret containing the S3 access key ID.

\n
\n
secret_key_key (String)
\n

The key of a Databricks secret containing the S3 secret access key.

\n
\n
\n
\n
adls2 (strict dict)
\n

ADLS2 storage secret configuration

\n
\nConfig Schema:
\n
secret_scope (String)
\n

The Databricks secret scope containing the storage secrets.

\n
\n
storage_account_name (String)
\n

The name of the storage account used to access data.

\n
\n
storage_account_key_key (String)
\n

The key of a Databricks secret containing the storage account secret key.

\n
\n
\n
\n
\n
\n
local_pipeline_package_path (dagster.StringSource, optional)
\n

Absolute path to the package that contains the pipeline definition(s) whose steps will execute remotely on Databricks. This is a path on the local filesystem of the process executing the pipeline. Before every step run, the launcher will zip up the code in this path, upload it to DBFS, and unzip it into the Python path of the remote Spark process. This gives the remote process access to up-to-date user code.

\n
\n
local_dagster_job_package_path (dagster.StringSource, optional)
\n

Absolute path to the package that contains the dagster job definition(s) whose steps will execute remotely on Databricks. This is a path on the local filesystem of the process executing the dagster job. Before every step run, the launcher will zip up the code in this path, upload it to DBFS, and unzip it into the Python path of the remote Spark process. This gives the remote process access to up-to-date user code.

\n
\n
staging_prefix (dagster.StringSource, optional)
\n

Directory in DBFS to use for uploaded job code. Must be absolute.

\n

Default Value: \u2018/dagster_staging\u2019

\n
\n
wait_for_logs (Bool, optional)
\n

If set, and if the specified cluster is configured to export logs, the system will wait after job completion for the logs to appear in the configured location. Note that logs are copied every 5 minutes, so enabling this will add several minutes to the job runtime. NOTE: this integration will export stdout/stderr from the remote Databricks process automatically, so this option is not generally necessary.

\n

Default Value: False

\n
\n
max_completion_wait_time_seconds (dagster.IntSource, optional)
\n

If the Databricks job run takes more than this many seconds, then consider it failed and terminate the step.

\n

Default Value: 86400

\n
\n
poll_interval_sec (Float, optional)
\n

How frequently Dagster will poll Databricks to determine the state of the job.

\n

Default Value: 5.0

\n
\n
\n

Resource for running ops as a Databricks Job.

\n

When this resource is used, the op will be executed in Databricks using the \u2018Run Submit\u2019\nAPI. Pipeline code will be zipped up and copied to a directory in DBFS along with the op\u2019s\nexecution context.

\n

Use the \u2018run_config\u2019 configuration to specify the details of the Databricks cluster used, and\nthe \u2018storage\u2019 key to configure persistent storage on that cluster. Storage is accessed by\nsetting the credentials in the Spark context, as documented here for S3 and here for ADLS.

\n
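
For illustration only, here is a minimal sketch of attaching this step launcher to a job. The resource key pyspark_step_launcher, the cluster ID, workspace host, package path, and secret names below are hypothetical placeholders; see the config schema above for the full set of options.

\n
from dagster import job, op\nfrom dagster_databricks import databricks_pyspark_step_launcher\n\n@op(required_resource_keys={"pyspark_step_launcher"})\ndef my_databricks_op(context):\n    # The body of this op is executed remotely on Databricks because the op\n    # requires a step launcher resource.\n    ...\n\n@job(\n    resource_defs={\n        "pyspark_step_launcher": databricks_pyspark_step_launcher.configured(\n            {\n                "run_config": {\n                    "run_name": "my_databricks_run",  # hypothetical run name\n                    "cluster": {"existing": "1234-567890-abcde123"},  # hypothetical cluster ID\n                },\n                "databricks_host": "uksouth.azuredatabricks.com",  # hypothetical workspace host\n                "databricks_token": {"env": "DATABRICKS_TOKEN"},\n                "local_dagster_job_package_path": "/path/to/my_package",  # hypothetical package path\n                "storage": {\n                    "adls2": {\n                        "secret_scope": "dagster-scope",  # hypothetical secret scope\n                        "storage_account_name": "mystorageaccount",  # hypothetical account name\n                        "storage_account_key_key": "adls2-key",  # hypothetical secret key name\n                    }\n                },\n            }\n        )\n    }\n)\ndef my_databricks_job():\n    my_databricks_op()\n
\n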
\n\n
\n
\nclass dagster_databricks.DatabricksError[source]\u00b6
\n
\n\n
\n
\n

Legacy APIs\u00b6

\n
\n
\ndagster_databricks.create_databricks_job_solid(name='databricks_job', num_inputs=1, description=None, required_resource_keys=frozenset({'databricks_client'}))[source]\u00b6
\n

Creates a solid that launches a databricks job.

\n

As config, the solid accepts a blob of the form described in Databricks\u2019 job API:\nhttps://docs.databricks.com/dev-tools/api/latest/jobs.html.

\n
\n
Returns
\n

A solid definition.

\n
\n
Return type
\n

SolidDefinition

\n
\n
\n

Example

\n
from dagster import ModeDefinition, pipeline\nfrom dagster_databricks import create_databricks_job_solid, databricks_client\n\nsparkpi = create_databricks_job_solid().configured(\n    {\n        "job": {\n            "name": "SparkPi Python job",\n            "new_cluster": {\n                "spark_version": "7.3.x-scala2.12",\n                "node_type_id": "i3.xlarge",\n                "num_workers": 2,\n            },\n            "spark_python_task": {"python_file": "dbfs:/docs/pi.py", "parameters": ["10"]},\n        }\n    },\n    name="sparkspi",\n)\n\n\n@pipeline(\n    mode_defs=[\n        ModeDefinition(\n            resource_defs={\n                "databricks_client": databricks_client.configured(\n                    {"host": "my.workspace.url", "token": "my.access.token"}\n                )\n            }\n        )\n    ]\n)\ndef my_pipeline():\n    sparkpi()\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-databricks", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../dagster-datadog/", "title": "Datadog (dagster-datadog)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-dask/", "title": "Dask (dagster-dask)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-datadog", "Datadog (dagster-datadog)", "N", "next"], ["sections/api/apidocs/libraries/dagster-dask", "Dask (dagster-dask)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-databricks.rst.txt", "title": "Databricks (dagster-databricks)", "toc": "\n"}, "dagster-datadog": {"alabaster_version": "0.7.12", "body": "
\n

Datadog (dagster-datadog)\u00b6

\n

This library provides an integration with Datadog, to support publishing metrics to Datadog from\nwithin Dagster ops.

\n
\n

\n
\n

We use the Python datadogpy library. To use it, you\u2019ll\nfirst need to create a DataDog account and get both API and Application keys.

\n
\n

\n
\n

The integration uses DogStatsD, so you\u2019ll need\nto ensure the datadog agent is running on the host you\u2019re sending metrics from.

\n
\n
\ndagster_datadog.datadog_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
api_key (dagster.StringSource)
\n

Datadog API key

\n
\n
app_key (dagster.StringSource)
\n

Datadog application key

\n
\n
\n

This resource is a thin wrapper over the\ndogstatsd library.

\n

As such, we directly mirror the public API methods of DogStatsd here; you can refer to the\nDataDog documentation for how to use this\nresource.

\n

Examples

\n
@op(required_resource_keys={'datadog'})\ndef datadog_op(context):\n    dd = context.resources.datadog\n\n    dd.event('Man down!', 'This server needs assistance.')\n    dd.gauge('users.online', 1001, tags=["protocol:http"])\n    dd.increment('page.views')\n    dd.decrement('page.views')\n    dd.histogram('album.photo.count', 26, tags=["gender:female"])\n    dd.distribution('album.photo.count', 26, tags=["color:blue"])\n    dd.set('visitors.uniques', 999, tags=["browser:ie"])\n    dd.service_check('svc.check_name', dd.WARNING)\n    dd.timing("query.response.time", 1234)\n\n    # Use timed decorator\n    @dd.timed('run_fn')\n    def run_fn():\n        pass\n\n    run_fn()\n\n@job(resource_defs={'datadog': datadog_resource})\ndef dd_job():\n    datadog_op()\n\nresult = dd_job.execute_in_process(\n    run_config={'resources': {'datadog': {'config': {'api_key': 'YOUR_KEY', 'app_key': 'YOUR_KEY'}}}}\n)\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-datadog", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "\n", "next": {"link": "../dagster-dbt/", "title": "dbt (dagster-dbt)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-databricks/", "title": "Databricks (dagster-databricks)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-dbt", "dbt (dagster-dbt)", "N", "next"], ["sections/api/apidocs/libraries/dagster-databricks", "Databricks (dagster-databricks)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-datadog.rst.txt", "title": "Datadog (dagster-datadog)", "toc": "\n"}, "dagster-dbt": {"alabaster_version": "0.7.12", "body": "
\n

dbt (dagster-dbt)\u00b6

\n

This library provides a Dagster integration with dbt (data build tool), created by dbt Labs.

\n
\n

Ops\u00b6

\n
\n

dbt Core Ops\u00b6

\n

dagster_dbt provides a set of pre-built ops that work with either the CLI or RPC interfaces. For\nmore advanced use cases, we suggest building your own ops which directly interact with these resources.

\n
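
As an illustrative sketch of such a custom op (assuming a hypothetical dbt project at /path/to/dbt/project and the resource bound under the key "dbt"), you might invoke the CLI resource directly via its cli() method, documented under DbtCliResource below:

\n
from dagster import job, op\nfrom dagster_dbt import dbt_cli_resource\n\n@op(required_resource_keys={"dbt"})\ndef my_custom_dbt_op(context):\n    # Invoke an arbitrary dbt command through the CLI resource; "run" is just an example.\n    dbt_output = context.resources.dbt.cli("run")\n    context.log.info(f"dbt finished with return code {dbt_output.return_code}")\n\n@job(\n    resource_defs={\n        "dbt": dbt_cli_resource.configured(\n            {"project_dir": "/path/to/dbt/project"}  # hypothetical project location\n        )\n    }\n)\ndef my_custom_dbt_job():\n    my_custom_dbt_op()\n
\n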
\n
\ndagster_dbt.dbt_run_op = <dagster.core.definitions.op_definition.OpDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
yield_materializations (Bool, optional)
\n

If True, materializations corresponding to the results of the dbt operation will be yielded when the op executes. Default: True

\n

Default Value: True

\n
\n
asset_key_prefix (List[String], optional)
\n

If provided and yield_materializations is True, these components will be used to prefix the generated asset keys.

\n

Default Value: [\u2018dbt\u2019]

\n
\n
\n

This op executes a dbt run command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the dbt_cli_resource) or\nover RPC (using the dbt_rpc_sync_resource).

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_run_op, dbt_cli_resource, dbt_rpc_sync_resource\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    dbt_run_op()\n\n@job(resource_defs={"dbt":dbt_rpc_sync_resource})\ndef my_dbt_rpc_job():\n    dbt_run_op()\n
\n
\n
\n\n
\n
\ndagster_dbt.dbt_compile_op(context)[source]\u00b6
\n

This op executes a dbt compile command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the dbt_cli_resource) or\nover RPC (using the dbt_rpc_sync_resource).

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_compile_op, dbt_cli_resource, dbt_rpc_sync_resource\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    dbt_compile_op()\n\n@job(resource_defs={"dbt":dbt_rpc_sync_resource})\ndef my_dbt_rpc_job():\n    dbt_compile_op()\n
\n
\n
\n\n
\n
\ndagster_dbt.dbt_ls_op(context)[source]\u00b6
\n

This op executes a dbt ls command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the dbt_cli_resource) or\nover RPC (using the dbt_rpc_sync_resource).

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_ls_op, dbt_cli_resource, dbt_rpc_sync_resource\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    dbt_ls_op()\n\n@job(resource_defs={"dbt":dbt_rpc_sync_resource})\ndef my_dbt_rpc_job():\n    dbt_ls_op()\n
\n
\n
\n\n
\n
\ndagster_dbt.dbt_test_op(context)[source]\u00b6
\n

This op executes a dbt test command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the dbt_cli_resource) or\nover RPC (using the dbt_rpc_sync_resource).

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_test_op, dbt_cli_resource, dbt_rpc_sync_resource\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    dbt_test_op()\n\n@job(resource_defs={"dbt":dbt_rpc_sync_resource})\ndef my_dbt_rpc_job():\n    dbt_test_op()\n
\n
\n
\n\n
\n
\ndagster_dbt.dbt_snapshot_op(context)[source]\u00b6
\n

This op executes a dbt snapshot command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the dbt_cli_resource) or\nover RPC (using the dbt_rpc_sync_resource).

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_snapshot_op, dbt_cli_resource, dbt_rpc_sync_resource\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    dbt_snapshot_op()\n\n@job(resource_defs={"dbt":dbt_rpc_sync_resource})\ndef my_dbt_rpc_job():\n    dbt_snapshot_op()\n
\n
\n
\n\n
\n
\ndagster_dbt.dbt_seed_op(context)[source]\u00b6
\n

This op executes a dbt seed command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the dbt_cli_resource) or\nover RPC (using the dbt_rpc_sync_resource).

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_seed_op, dbt_cli_resource, dbt_rpc_sync_resource\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    dbt_seed_op()\n\n@job(resource_defs={"dbt":dbt_rpc_sync_resource})\ndef my_dbt_rpc_job():\n    dbt_seed_op()\n
\n
\n
\n\n
\n
\ndagster_dbt.dbt_docs_generate_op(context)[source]\u00b6
\n

This op executes a dbt docs generate command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the dbt_cli_resource) or\nover RPC (using the dbt_rpc_sync_resource).

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_docs_generate_op, dbt_cli_resource, dbt_rpc_sync_resource\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    dbt_docs_generate_op()\n\n@job(resource_defs={"dbt":dbt_rpc_sync_resource})\ndef my_dbt_rpc_job():\n    dbt_docs_generate_op()\n
\n
\n
\n\n
\n
\n

dbt Cloud Ops\u00b6

\n
\n
\ndagster_dbt.dbt_cloud_run_op = <dagster.core.definitions.op_definition.OpDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
job_id (Int)
\n

The integer ID of the relevant dbt Cloud job. You can find this value by going to the details page of your job in the dbt Cloud UI. It will be the final number in the url, e.g.: https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/

\n
\n
poll_interval (Float, optional)
\n

The time (in seconds) that will be waited between successive polls.

\n

Default Value: 10

\n
\n
poll_timeout (Union[Float, None], optional)
\n

The maximum time (in seconds) that will be waited before this operation is timed out. By default, this will never time out.

\n

Default Value: None

\n
\n
yield_materializations (Bool, optional)
\n

If True, materializations corresponding to the results of the dbt operation will be yielded when the op executes.

\n

Default Value: True

\n
\n
asset_key_prefix (List[String], optional)
\n

If provided and yield_materializations is True, these components will be used to prefix the generated asset keys.

\n

Default Value: [\u2018dbt\u2019]

\n
\n
\n

Initiates a run for a dbt Cloud job, then polls until the run completes. If the job\nfails or is otherwise stopped before succeeding, a dagster.Failure exception will be raised,\nand this op will fail.

\n

It requires the use of a \u2018dbt_cloud\u2019 resource, which is used to connect to the dbt Cloud API.

\n

Config Options:

\n
\n
job_id (int)

The integer ID of the relevant dbt Cloud job. You can find this value by going to the details\npage of your job in the dbt Cloud UI. It will be the final number in the url, e.g.:\nhttps://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/

\n
\n
poll_interval (float)

The time (in seconds) that will be waited between successive polls. Defaults to 10.

\n
\n
poll_timeout (float)

The maximum time (in seconds) that will be waited before this operation is timed out. By\ndefault, this will never time out.

\n
\n
yield_materializations (bool)

If True, materializations corresponding to the results of the dbt operation will be\nyielded when the op executes. Defaults to True.

\n
\n
asset_key_prefix (List[str])

If provided and yield_materializations is True, these components will be used to\nprefix the generated asset keys. Defaults to [“dbt”].

\n
\n
\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_cloud_resource, dbt_cloud_run_op\n\nmy_dbt_cloud_resource = dbt_cloud_resource.configured(\n    {"auth_token": {"env": "DBT_CLOUD_AUTH_TOKEN"}, "account_id": 77777}\n)\nrun_dbt_nightly_sync = dbt_cloud_run_op.configured(\n    {"job_id": 54321}, name="run_dbt_nightly_sync"\n)\n\n@job(resource_defs={"dbt_cloud": my_dbt_cloud_resource})\ndef dbt_cloud():\n    run_dbt_nightly_sync()\n
\n
\n
\n\n
\n
\n
\n

Resources\u00b6

\n
\n

CLI Resources\u00b6

\n
\n
\nclass dagster_dbt.DbtCliResource(executable, default_flags, warn_error, ignore_handled_error, target_path, logger=None, docs_url=None)[source]\u00b6
\n

A resource that allows you to execute dbt cli commands. For the most up-to-date documentation on\nthe specific parameters available to you for each command, check out the dbt docs:

\n

https://docs.getdbt.com/reference/commands/run

\n

To use this as a dagster resource, we recommend using\ndbt_cli_resource.

\n
\n
\nbuild(select=None, **kwargs)[source]\u00b6
\n

Run the build command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n

select (List[str], optional) \u2013 the models/resources to include in the run.

\n
\n
Returns
\n

\n
An instance of DbtCliOutput containing

parsed log output as well as the contents of run_results.json (if applicable).

\n
\n
\n

\n
\n
Return type
\n

DbtCliOutput

\n
\n
\n
\n\n
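As a hedged illustration (not part of the upstream dagster-dbt docs: the project path, selector, op, and job names below are placeholders), build can be called from an op that requires the "dbt" resource key:

from dagster import job, op
from dagster_dbt import dbt_cli_resource

@op(required_resource_keys={"dbt"})
def run_dbt_build(context):
    # context.resources.dbt is a DbtCliResource when dbt_cli_resource is used.
    dbt_output = context.resources.dbt.build(select=["my_model+"])  # placeholder selector
    context.log.info(f"dbt build exited with return code {dbt_output.return_code}")

@job(resource_defs={"dbt": dbt_cli_resource.configured({"project_dir": "path/to/dbt_project"})})
def dbt_build_job():
    run_dbt_build()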
\n
\ncli(command, **kwargs)[source]\u00b6
\n
\n
Executes a dbt CLI command. Params passed in as keyword arguments will be merged with the

default flags that were configured on resource initialization (if any), overriding the\ndefault values if necessary.

\n
\n
\n
\n
Parameters
\n

command (str) \u2013 The command you wish to run (e.g. \u2018run\u2019, \u2018test\u2019, \u2018docs generate\u2019, etc.)

\n
\n
Returns
\n

\n
An instance of DbtCliOutput containing

parsed log output as well as the contents of run_results.json (if applicable).

\n
\n
\n

\n
\n
Return type
\n

DbtCliOutput

\n
\n
\n
\n\n
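For illustration only (the command string and names here are assumptions, not from the source docs), an op might invoke an arbitrary dbt CLI command through this method:

from dagster import op

@op(required_resource_keys={"dbt"})
def install_dbt_packages(context):
    # "deps" is just an example command string; any dbt CLI command may be passed.
    dbt_output = context.resources.dbt.cli("deps")
    context.log.info(dbt_output.raw_output)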
\n
\ncompile(models=None, exclude=None, **kwargs)[source]\u00b6
\n

Run the compile command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n
    \n
  • models (List[str], optional) \u2013 the models to include in compilation.

  • \n
  • exclude (List[str], optional) – the models to exclude from compilation.

  • \n
\n
\n
Returns
\n

\n
An instance of DbtCliOutput containing

parsed log output as well as the contents of run_results.json (if applicable).

\n
\n
\n

\n
\n
Return type
\n

DbtCliOutput

\n
\n
\n
\n\n
\n
\nproperty default_flags\u00b6
\n

A set of params populated from resource config that are passed as flags to each dbt CLI command.

\n
\n\n
\n
\nfreshness(select=None, **kwargs)[source]\u00b6
\n

Run the source snapshot-freshness command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n

select (List[str], optional) \u2013 the sources to include in the run.

\n
\n
Returns
\n

\n
An instance of DbtCliOutput containing

parsed log output as well as the contents of run_results.json (if applicable).

\n
\n
\n

\n
\n
Return type
\n

DbtCliOutput

\n
\n
\n
\n\n
\n
\ngenerate_docs(compile_project=False, **kwargs)[source]\u00b6
\n

Run the docs generate command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n

compile_project (bool, optional) \u2013 If true, compile the project before generating a catalog.

\n
\n
Returns
\n

\n
An instance of DbtCliOutput containing

parsed log output as well as the contents of run_results.json (if applicable).

\n
\n
\n

\n
\n
Return type
\n

DbtCliOutput

\n
\n
\n
\n\n
\n
\nget_manifest_json(**kwargs)[source]\u00b6
\n

Get a parsed version of the manifest.json file for the relevant dbt project.

\n
\n
Returns
\n

\n
dictionary containing the parsed contents of the manifest json file

for this dbt project.

\n
\n
\n

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nget_run_results_json(**kwargs)[source]\u00b6
\n

Get a parsed version of the run_results.json file for the relevant dbt project.

\n
\n
Returns
\n

\n
dictionary containing the parsed contents of the run_results json file

for this dbt project.

\n
\n
\n

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
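As a hedged sketch (the fields accessed below follow dbt's documented run_results.json layout and are assumptions about your dbt version), the parsed run results can be inspected from an op:

from dagster import op

@op(required_resource_keys={"dbt"})
def summarize_last_dbt_run(context):
    run_results = context.resources.dbt.get_run_results_json()
    # In recent dbt versions, "results" holds one entry per executed node.
    for node_result in run_results.get("results", []):
        context.log.info(f"{node_result.get('unique_id')}: {node_result.get('status')}")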
\n
\nls(select=None, models=None, exclude=None, **kwargs)[source]\u00b6
\n

Run the ls command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n
    \n
  • select (List[str], optional) \u2013 the resources to include in the output.

  • \n
  • models (List[str], optional) \u2013 the models to include in the output.

  • \n
  • exclude (List[str], optional) \u2013 the resources to exclude from the output.

  • \n
\n
\n
Returns
\n

\n
An instance of DbtCliOutput containing

parsed log output as well as the contents of run_results.json (if applicable).

\n
\n
\n

\n
\n
Return type
\n

DbtCliOutput

\n
\n
\n
\n\n
\n
\nrun(models=None, exclude=None, **kwargs)[source]\u00b6
\n

Run the run command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n
    \n
  • models (List[str], optional) – the models to include in the run.

  • \n
  • exclude (List[str], optional) – the models to exclude from the run.

  • \n
\n
\n
Returns
\n

\n
An instance of DbtCliOutput containing

parsed log output as well as the contents of run_results.json (if applicable).

\n
\n
\n

\n
\n
Return type
\n

DbtCliOutput

\n
\n
\n
\n\n
\n
\nrun_operation(macro, args=None, **kwargs)[source]\u00b6
\n

Run the run-operation command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n
    \n
  • macro (str) \u2013 the dbt macro to invoke.

  • \n
  • args (Dict[str, Any], optional) \u2013 the keyword arguments to be supplied to the macro.

  • \n
\n
\n
Returns
\n

\n
An instance of DbtCliOutput containing

parsed log output as well as the contents of run_results.json (if applicable).

\n
\n
\n

\n
\n
Return type
\n

DbtCliOutput

\n
\n
\n
\n\n
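For example (a sketch only: grant_select_on_schemas is a hypothetical macro that would have to exist in your dbt project), a macro can be invoked with keyword arguments:

from dagster import op

@op(required_resource_keys={"dbt"})
def grant_reporting_access(context):
    # grant_select_on_schemas is a hypothetical macro defined in the dbt project.
    context.resources.dbt.run_operation(
        "grant_select_on_schemas",
        args={"schemas": ["analytics"], "role": "reporter"},
    )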
\n
\nseed(show=False, select=None, exclude=None, **kwargs)[source]\u00b6
\n

Run the seed command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n
    \n
  • show (bool, optional) \u2013 If True, then show a sample of the seeded data in the\nresponse. Defaults to False.

  • \n
  • select (List[str], optional) – the seeds to include in the run.

  • \n
  • exclude (List[str], optional) – the seeds to exclude from the run.

  • \n
\n
\n
Returns
\n

\n
An instance of DbtCliOutput containing

parsed log output as well as the contents of run_results.json (if applicable).

\n
\n
\n

\n
\n
Return type
\n

DbtCliOutput

\n
\n
\n
\n\n
\n
\nsnapshot(select=None, exclude=None, **kwargs)[source]\u00b6
\n

Run the snapshot command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n
    \n
  • select (List[str], optional) \u2013 the snapshots to include in the run.

  • \n
  • exclude (List[str], optional) \u2013 the snapshots to exclude from the run.

  • \n
\n
\n
Returns
\n

\n
An instance of DbtCliOutput containing

parsed log output as well as the contents of run_results.json (if applicable).

\n
\n
\n

\n
\n
Return type
\n

DbtCliOutput

\n
\n
\n
\n\n
\n
\nproperty strict_flags\u00b6
\n

A set of flags that should not be auto-populated from the default flags unless they are\narguments to the associated function.

\n
\n\n
\n
\ntest(models=None, exclude=None, data=True, schema=True, **kwargs)[source]\u00b6
\n

Run the test command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n
    \n
  • models (List[str], optional) \u2013 the models to include in testing.

  • \n
  • exclude (List[str], optional) \u2013 the models to exclude from testing.

  • \n
  • data (bool, optional) \u2013 If True (default), then run data tests.

  • \n
  • schema (bool, optional) \u2013 If True (default), then run schema tests.

  • \n
\n
\n
Returns
\n

\n
An instance of DbtCliOutput containing

parsed log output as well as the contents of run_results.json (if applicable).

\n
\n
\n

\n
\n
Return type
\n

DbtCliOutput

\n
\n
\n
\n\n
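As an illustrative sketch (the model selector below is a placeholder), schema tests can be run for a subset of models while skipping data tests:

from dagster import op

@op(required_resource_keys={"dbt"})
def run_schema_tests(context):
    dbt_output = context.resources.dbt.test(
        models=["staging.*"],  # placeholder selector
        data=False,
        schema=True,
    )
    context.log.info(f"dbt test exited with return code {dbt_output.return_code}")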
\n\n
\n
\nclass dagster_dbt.DbtCliOutput(command, return_code, raw_output, logs, result, docs_url=None)[source]\u00b6
\n

The results of executing a dbt command, along with additional metadata about the dbt CLI\nprocess that was run.

\n

Note that users should not construct instances of this class directly. This class is intended\nto be constructed from the JSON output of dbt commands.

\n
\n
\ncommand\u00b6
\n

The full shell command that was executed.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nreturn_code\u00b6
\n

The return code of the dbt CLI process.

\n
\n
Type
\n

int

\n
\n
\n
\n\n
\n
\nraw_output\u00b6
\n

The raw output (stdout) of the dbt CLI process.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nlogs\u00b6
\n

List of parsed JSON logs produced by the dbt command.

\n
\n
Type
\n

List[Dict[str, Any]]

\n
\n
\n
\n\n
\n
\nresult\u00b6
\n

Dictionary containing dbt-reported result information\ncontained in run_results.json. Some dbt commands do not produce results, and will\ntherefore have result = None.

\n
\n
Type
\n

Optional[Dict[str, Any]]

\n
\n
\n
\n\n
\n
\ndocs_url\u00b6
\n

Hostname where dbt docs are being served for this project.

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n\n
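As a hedged example of how these attributes might be used (the elapsed_time key comes from dbt's run_results.json schema and is an assumption about your dbt version):

from dagster import op

@op(required_resource_keys={"dbt"})
def log_dbt_run_summary(context):
    dbt_output = context.resources.dbt.run()
    # result mirrors run_results.json and is None for commands that produce no results.
    if dbt_output.result is not None:
        context.log.info(f"dbt run took {dbt_output.result.get('elapsed_time')} seconds")
    # logs holds the structured JSON log lines emitted by the dbt CLI process.
    context.log.debug(f"captured {len(dbt_output.logs)} dbt log lines")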
\n
\ndagster_dbt.dbt_cli_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project_dir (dagster.StringSource, optional)
\n

Which directory to look in for the dbt_project.yml file. Default is the current working directory and its parents.

\n

Default Value: \u2018.\u2019

\n
\n
profiles_dir (dagster.StringSource, optional)
\n

Which directory to look in for the profiles.yml file. Default = $DBT_PROFILES_DIR or $HOME/.dbt

\n
\n
profile (dagster.StringSource, optional)
\n

Which profile to load. Overrides setting in dbt_project.yml.

\n
\n
target (dagster.StringSource, optional)
\n

Which target to load for the given profile.

\n
\n
vars (permissive dict, optional)
\n

Supply variables to the project. This argument overrides variables defined in your dbt_project.yml file. This argument should be a dictionary, eg. {\u2018my_variable\u2019: \u2018my_value\u2019}

\n
\n
bypass_cache (Bool, optional)
\n

If set, bypass the adapter-level cache of database state

\n

Default Value: False

\n
\n
warn_error (Bool, optional)
\n

If dbt would normally warn, instead raise an exception. Examples include \u2013models that selects nothing, deprecations, configurations with no associated models, invalid test configurations, and missing sources/refs in tests.

\n

Default Value: False

\n
\n
dbt_executable (dagster.StringSource, optional)
\n

Path to the dbt executable. Default is dbt

\n

Default Value: \u2018dbt\u2019

\n
\n
ignore_handled_error (Bool, optional)
\n

When True, will not raise an exception when the dbt CLI returns error code 1. Default is False.

\n

Default Value: False

\n
\n
target_path (dagster.StringSource, optional)
\n

The directory path for target if different from the default target-path in your dbt project configuration file.

\n

Default Value: \u2018target\u2019

\n
\n
docs_url (dagster.StringSource, optional)
\n

The url for where dbt docs are being served for this project.

\n
\n
\n

This resource defines a dbt CLI interface.

\n

To configure this resource, we recommend using the configured method.

\n

Examples:

\n
custom_dbt_cli_resource = dbt_cli_resource.configured({"project-dir": "path/to/my/dbt_project"})\n\n@pipeline(mode_defs=[ModeDefinition(resource_defs={"dbt": custom_dbt_cli_resource})])\ndef dbt_cli_pipeline():\n    # Run solids with `required_resource_keys={"dbt", ...}`.\n
\n
\n

You may configure this resource as follows:

\n
resources:\n  dbt_cli_resource:\n    config:\n      project_dir: "."\n      # Optional[str]: Which directory to look in for the dbt_project.yml file. Default is\n      # the current working directory and its parents.\n      profiles_dir: $DBT_PROFILES_DIR or $HOME/.dbt\n      # Optional[str]: Which directory to look in for the profiles.yml file.\n      profile: ""\n      # Optional[str]: Which profile to load. Overrides setting in dbt_project.yml.\n      target: ""\n      # Optional[str]: Which target to load for the given profile.\n      vars: {}\n      # Optional[Permissive]: Supply variables to the project. This argument overrides\n      # variables defined in your dbt_project.yml file. This argument should be a\n      # dictionary, eg. "{'my_variable': 'my_value'}"\n      bypass_cache: False\n      # Optional[bool]: If set, bypass the adapter-level cache of database state.\n
\n
\n
\n\n
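A job-based equivalent of the pipeline example above might look like the following sketch (the paths are placeholders, and the prebuilt dbt_run_op is used as the job's only step):

from dagster import job
from dagster_dbt import dbt_cli_resource, dbt_run_op

my_dbt_cli_resource = dbt_cli_resource.configured(
    {"project_dir": "path/to/dbt_project", "profiles_dir": "path/to/profiles"}  # placeholder paths
)

@job(resource_defs={"dbt": my_dbt_cli_resource})
def my_dbt_cli_job():
    dbt_run_op()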
\n
\n

RPC Resources\u00b6

\n
\n
\nclass dagster_dbt.DbtRpcResource(host='0.0.0.0', port=8580, jsonrpc_version='2.0', logger=None, **_)[source]\u00b6
\n

A client for a dbt RPC server.

\n

To use this as a dagster resource, we recommend using\ndbt_rpc_resource.

\n
\n
\nbuild(select=None, **kwargs)[source]\u00b6
\n

Run the build command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n

select (List[str], optional) \u2013 the models/resources to include in the run.

\n
\n
Returns
\n

object containing parsed output from dbt

\n
\n
Return type
\n

DbtOutput

\n
\n
\n
\n\n
\n
\ncli(command, **kwargs)[source]\u00b6
\n

Sends a request with CLI syntax to the dbt RPC server, and returns the response.\nFor more details, see the dbt docs for running CLI commands via RPC.

\n
\n
Parameters
\n

cli (str) \u2013 a dbt command in CLI syntax.

\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
\n
\ncompile(models=None, exclude=None, **kwargs)[source]\u00b6
\n

Sends a request with the method compile to the dbt RPC server, and returns the\nresponse. For more details, see the dbt docs for compiling projects via RPC.

\n
\n
Parameters
\n
    \n
  • models (List[str], optional) \u2013 the models to include in compilation.

  • \n
  • exclude (List[str], optional) – the models to exclude from compilation.

  • \n
\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
\n
\ncompile_sql(sql, name)[source]\u00b6
\n

Sends a request with the method compile_sql to the dbt RPC server, and returns the\nresponse. For more details, see the dbt docs for compiling SQL via RPC.

\n
\n
Parameters
\n
    \n
  • sql (str) \u2013 the SQL to compile in base-64 encoding.

  • \n
  • name (str) \u2013 a name for the compiled SQL.

  • \n
\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
\n
\ngenerate_docs(compile_project=False, **kwargs)[source]\u00b6
\n

Sends a request with the method docs.generate to the dbt RPC server, and returns the\nresponse. For more details, see the dbt docs for the RPC method docs.generate.

\n
\n
Parameters
\n

compile_project (bool, optional) \u2013 If true, compile the project before generating a catalog.

\n
\n
\n
\n\n
\n
\nget_manifest_json(**kwargs)[source]\u00b6
\n

Get a parsed version of the manifest.json file for the relevant dbt project.

\n
\n
Returns
\n

\n
dictionary containing the parsed contents of the manifest json file

for this dbt project.

\n
\n
\n

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nget_run_results_json(**kwargs)[source]\u00b6
\n

Get a parsed version of the run_results.json file for the relevant dbt project.

\n
\n
Returns
\n

\n
dictionary containing the parsed contents of the run_results json file

for this dbt project.

\n
\n
\n

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nproperty host\u00b6
\n

The IP address of the host of the dbt RPC server.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty jsonrpc_version\u00b6
\n

The JSON-RPC version to send in RPC requests.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nkill(task_id)[source]\u00b6
\n

Sends a request with the method kill to the dbt RPC server, and returns the response.\nFor more details, see the dbt docs for the RPC method kill.

\n
\n
Parameters
\n

task_id (str) \u2013 the ID of the task to terminate.

\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
\n
\nproperty logger\u00b6
\n

A property for injecting a logger dependency.

\n
\n
Type
\n

logging.Logger

\n
\n
\n
\n\n
\n
\nls(select=None, models=None, exclude=None, **kwargs)[source]\u00b6
\n

Sends a request with the method list to the dbt RPC server, and returns the\nresponse. For more details, see the dbt docs for list.

\n
\n
Parameters
\n
    \n
  • select (List[str], optional) \u2013 the resources to include in the output.

  • \n
  • models (List[str], optional) \u2013 the models to include in the output.

  • \n
  • exclude (List[str], optional) – the resources to exclude from the output.

  • \n
\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
\n
\npoll(request_token, logs=False, logs_start=0)[source]\u00b6
\n

Sends a request with the method poll to the dbt RPC server, and returns the response.\nFor more details, see the dbt docs for the RPC method poll.

\n
\n
Parameters
\n
    \n
  • request_token (str) \u2013 the token to poll responses for.

  • \n
  • logs (bool) \u2013 Whether logs should be returned in the response. Defaults to False.

  • \n
  • logs_start (int) \u2013 The zero-indexed log line to fetch logs from. Defaults to 0.

  • \n
\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
\n
\nproperty port\u00b6
\n

The port of the dbt RPC server.

\n
\n
Type
\n

int

\n
\n
\n
\n\n
\n
\nps(completed=False)[source]\u00b6
\n

Sends a request with the method ps to the dbt RPC server, and returns the response.\nFor more details, see the dbt docs for the RPC method ps.

\n
\n
Parameters
\n

completed (bool) – If True, then also return completed tasks. Defaults to False.

\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
\n
\nrun(models=None, exclude=None, **kwargs)[source]\u00b6
\n

Sends a request with the method run to the dbt RPC server, and returns the response.\nFor more details, see the dbt docs for the RPC method run.

\n
\n
Parameters
\n
    \n
  • models (List[str], optional) \u2013 the models to include in the run.

  • \n
  • exclude (List[str], optional) – the models to exclude from the run.

  • \n
\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
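As a hedged sketch (host, port, and model name are placeholders, and a dbt RPC server is assumed to already be running), an op can kick off an asynchronous run through this client:

from dagster import job, op
from dagster_dbt import dbt_rpc_resource

my_rpc_resource = dbt_rpc_resource.configured({"host": "127.0.0.1", "port": 8580})

@op(required_resource_keys={"dbt_rpc"})
def kick_off_dbt_run(context):
    # The request returns immediately; the dbt run itself executes asynchronously on the server.
    response = context.resources.dbt_rpc.run(models=["my_model"])  # placeholder model
    context.log.info(response.text)

@job(resource_defs={"dbt_rpc": my_rpc_resource})
def dbt_rpc_kickoff_job():
    kick_off_dbt_run()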
\n
\nrun_operation(macro, args=None, **kwargs)[source]\u00b6
\n

Sends a request with the method run-operation to the dbt RPC server, and returns the\nresponse. For more details, see the dbt docs for the command run-operation.

\n
\n
Parameters
\n
    \n
  • macro (str) \u2013 the dbt macro to invoke.

  • \n
  • args (Dict[str, Any], optional) \u2013 the keyword arguments to be supplied to the macro.

  • \n
\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
\n
\nrun_sql(sql, name)[source]\u00b6
\n

Sends a request with the method run_sql to the dbt RPC server, and returns the\nresponse. For more details, see the dbt docs for running SQL via RPC.

\n
\n
Parameters
\n
    \n
  • sql (str) \u2013 the SQL to run in base-64 encoding.

  • \n
  • name (str) \u2013 a name for the compiled SQL.

  • \n
\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
\n
\nseed(show=False, select=None, exclude=None, **kwargs)[source]\u00b6
\n

Sends a request with the method seed to the dbt RPC server, and returns the response.\nFor more details, see the dbt docs for the RPC method seed.

\n
\n
Parameters
\n
    \n
  • show (bool, optional) \u2013 If True, then show a sample of the seeded data in the\nresponse. Defaults to False.

  • \n
  • select (List[str], optional) – the seeds to include in the run.

  • \n
  • exclude (List[str], optional) – the seeds to exclude from the run.

  • \n
\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
\n
\nsnapshot(select=None, exclude=None, **kwargs)[source]\u00b6
\n

Sends a request with the method snapshot to the dbt RPC server, and returns the\nresponse. For more details, see the dbt docs for the command snapshot.

\n
\n
Parameters
\n
    \n
  • select (List[str], optional) \u2013 the snapshots to include in the run.

  • \n
  • exclude (List[str], optional) \u2013 the snapshots to exclude from the run.

  • \n
\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
\n
\nsnapshot_freshness(select=None, **kwargs)[source]\u00b6
\n

Sends a request with the method snapshot-freshness to the dbt RPC server, and returns\nthe response. For more details, see the dbt docs for the command source snapshot-freshness.

\n
\n
Parameters
\n

select (List[str], optional) \u2013 the models to include in calculating snapshot freshness.

\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
\n
\nstatus()[source]\u00b6
\n

Sends a request with the method status to the dbt RPC server, and returns the\nresponse. For more details, see the dbt docs for the RPC method status.

\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
\n
\ntest(models=None, exclude=None, data=True, schema=True, **kwargs)[source]\u00b6
\n

Sends a request with the method test to the dbt RPC server, and returns the response.\nFor more details, see the dbt docs for the RPC method test.

\n
\n
Parameters
\n
    \n
  • models (List[str], optional) \u2013 the models to include in testing.

  • \n
  • exclude (List[str], optional) \u2013 the models to exclude from testing.

  • \n
  • data (bool, optional) \u2013 If True (default), then run data tests.

  • \n
  • schema (bool, optional) \u2013 If True (default), then run schema tests.

  • \n
\n
\n
Returns
\n

the HTTP response from the dbt RPC server.

\n
\n
Return type
\n

Response

\n
\n
\n
\n\n
\n
\nproperty url\u00b6
\n

The URL for sending dbt RPC requests.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_dbt.DbtRpcSyncResource(host='0.0.0.0', port=8580, jsonrpc_version='2.0', logger=None, poll_interval=1, **_)[source]\u00b6
\n
\n\n
\n
\nclass dagster_dbt.DbtRpcOutput(response)[source]\u00b6
\n

The output from executing a dbt command via the dbt RPC server.

\n
\n
\nresult\u00b6
\n

The parsed contents of the \u201cresult\u201d field of the JSON response from\nthe rpc server (if any).

\n
\n
Type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nresponse_dict\u00b6
\n

The entire contents of the JSON response from the rpc server.

\n
\n
Type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nresponse\u00b6
\n

The original Response from which this output was generated.

\n
\n
Type
\n

requests.Response

\n
\n
\n
\n\n
\n\n
\n
\ndagster_dbt.local_dbt_rpc_resource ResourceDefinition\u00b6
\n

This resource defines a dbt RPC client for an RPC server running\non 0.0.0.0:8580.

\n
\n\n
\n
\ndagster_dbt.dbt_rpc_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
host (dagster.StringSource)
\n

\n
port (dagster.IntSource, optional)
\n

Default Value: 8580

\n
\n
\n

This resource defines a dbt RPC client.

\n

To configure this resource, we recommend using the configured method.

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_rpc_resource\n\ncustom_dbt_rpc_resource = dbt_rpc_resource.configured({"host": "80.80.80.80", "port": 8080})\n\n@job(resource_defs={"dbt_rpc": custom_dbt_rpc_resource})\ndef dbt_rpc_job():\n    # Run ops with `required_resource_keys={"dbt_rpc", ...}`.\n
\n
\n
\n\n
\n
\ndagster_dbt.dbt_rpc_sync_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
host (dagster.StringSource)
\n

\n
port (dagster.IntSource, optional)
\n

Default Value: 8580

\n
\n
poll_interval (dagster.IntSource, optional)
\n

Default Value: 1

\n
\n
\n

This resource defines a synchronous dbt RPC client, which sends requests to a dbt RPC server,\nand waits for the request to complete before returning.

\n

To configure this resource, we recommend using the configured method.

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_rpc_sync_resource\n\ncustom_sync_dbt_rpc_resource = dbt_rpc_sync_resource.configured({"host": "80.80.80.80", "port": 8080})\n\n@job(resource_defs={"dbt_rpc": custom_sync_dbt_rpc_resource})\ndef dbt_rpc_sync_job():\n    # Run ops with `required_resource_keys={"dbt_rpc", ...}`.\n
\n
\n
\n\n
\n
\n

dbt Cloud Resources\u00b6

\n
\n
\nclass dagster_dbt.DbtCloudResourceV2(auth_token, account_id, disable_schedule_on_trigger=True, request_max_retries=3, request_retry_delay=0.25, dbt_cloud_host='https://cloud.getdbt.com/', log=<Logger dagster.builtin (DEBUG)>, log_requests=False)[source]\u00b6
\n

This class exposes methods on top of the dbt Cloud REST API v2.

\n

For a complete set of documentation on the dbt Cloud Administrative REST API, including expected\nresponse JSON schemas, see the dbt Cloud API Docs.

\n
\n
\ncancel_run(run_id)[source]\u00b6
\n

Cancels a dbt Cloud run.

\n
\n
Parameters
\n

run_id (int) \u2013 The ID of the relevant dbt Cloud run. You can find this value by going to\nthe details page of your run in the dbt Cloud UI. It will be the final number in the\nurl, e.g.: https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/

\n
\n
Returns
\n

\n
A dictionary containing the parsed contents of the dbt Cloud run details.

See: https://docs.getdbt.com/dbt-cloud/api-v2#operation/getRunById for schema.

\n
\n
\n

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nget_job(job_id)[source]\u00b6
\n

Gets details about a given dbt job from the dbt Cloud API.

\n
\n
Parameters
\n

job_id (int) \u2013 The ID of the relevant dbt Cloud job. You can find this value by going to\nthe details page of your job in the dbt Cloud UI. It will be the final number in the\nurl, e.g.: https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/

\n
\n
Returns
\n

Parsed json data from the response to this request

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nget_manifest(run_id, step=None)[source]\u00b6
\n

The parsed contents of a manifest.json file created by a completed run.

\n
\n
Parameters
\n
    \n
  • run_id (int) \u2013 The ID of the relevant dbt Cloud run. You can find this value by going to\nthe details page of your run in the dbt Cloud UI. It will be the final number in the\nurl, e.g.: https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/

  • \n
  • step (int) \u2013 The index of the step in the run to query for artifacts. The first step in\nthe run has the index 1. If the step parameter is omitted, then this endpoint will\nreturn the artifacts compiled for the last step in the run.

  • \n
\n
\n
Returns
\n

Parsed contents of the manifest.json file

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nget_run(run_id, include_related=None)[source]\u00b6
\n

Gets details about a specific job run.

\n
\n
Parameters
\n
    \n
  • run_id (int) \u2013 The ID of the relevant dbt Cloud run. You can find this value by going to\nthe details page of your run in the dbt Cloud UI. It will be the final number in the\nurl, e.g.: https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/

  • \n
  • include_related (List[str]) \u2013 List of related fields to pull with the run. Valid values\nare \u201ctrigger\u201d, \u201cjob\u201d, and \u201cdebug_logs\u201d.

  • \n
\n
\n
Returns
\n

\n
A dictionary containing the parsed contents of the dbt Cloud run details.

See: https://docs.getdbt.com/dbt-cloud/api-v2#operation/getRunById for schema.

\n
\n
\n

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nget_run_artifact(run_id, path, step=None)[source]\u00b6
\n

The string contents of a run artifact from a dbt Cloud run.

\n
\n
Parameters
\n
    \n
  • run_id (int) \u2013 The ID of the relevant dbt Cloud run. You can find this value by going to\nthe details page of your run in the dbt Cloud UI. It will be the final number in the\nurl, e.g.: https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/

  • \n
  • path (str) \u2013 The path to this run artifact (e.g. \u2018run/my_new_project/models/example/my_first_dbt_model.sql\u2019)

  • \n
  • step (int) \u2013 The index of the step in the run to query for artifacts. The first step in\nthe run has the index 1. If the step parameter is omitted, then this endpoint will\nreturn the artifacts compiled for the last step in the run.

  • \n
\n
\n
Returns
\n

The string contents of the specified run artifact

\n
\n
Return type
\n

str

\n
\n
\n
\n\n
\n
\nget_run_results(run_id, step=None)[source]\u00b6
\n

The parsed contents of a run_results.json file created by a completed run.

\n
\n
Parameters
\n
    \n
  • run_id (int) \u2013 The ID of the relevant dbt Cloud run. You can find this value by going to\nthe details page of your run in the dbt Cloud UI. It will be the final number in the\nurl, e.g.: https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/

  • \n
  • step (int) \u2013 The index of the step in the run to query for artifacts. The first step in\nthe run has the index 1. If the step parameter is omitted, then this endpoint will\nreturn the artifacts compiled for the last step in the run.

  • \n
\n
\n
Returns
\n

Parsed contents of the run_results.json file

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nget_run_steps(run_id)[source]\u00b6
\n

Gets the steps of an initialized dbt Cloud run.

\n
\n
Parameters
\n

run_id (int) \u2013 The ID of the relevant dbt Cloud run. You can find this value by going to\nthe details page of your run in the dbt Cloud UI. It will be the final number in the\nurl, e.g.: https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/

\n
\n
Returns
\n

List of commands for each step of the run.

\n
\n
Return type
\n

List[str]

\n
\n
\n
\n\n
\n
\nget_runs(include_related=None, job_id=None, order_by='-id', offset=0, limit=100)[source]\u00b6
\n

Returns a list of runs from dbt Cloud. This can be optionally filtered to a specific job\nusing the job_id parameter. It supports pagination using offset and limit as well and\ncan be configured to load a variety of related information about the runs.

\n
\n
Parameters
\n
    \n
  • include_related (Optional[List[str]]) \u2013 A list of resources to include in the response\nfrom dbt Cloud. This is technically a required field according to the API, but it\ncan be passed with an empty list where it will only load the default run\ninformation. Valid values are \u201ctrigger\u201d, \u201cjob\u201d, \u201crepository\u201d, and \u201cenvironment\u201d.

  • \n
  • job_id (Optional[int]) – This method can be optionally filtered to only\nload runs for a specific job id if it is included here. If omitted it will pull\nruns for every job.

  • \n
  • order_by (Optional[str]) \u2013 An identifier designated by dbt Cloud in which to sort the\nresults before returning them. Useful when combined with offset and limit to load\nruns for a job. Defaults to \u201c-id\u201d where \u201c-\u201d designates reverse order and \u201cid\u201d is\nthe key to filter on.

  • \n
  • offset (int) \u2013 An offset to apply when listing runs. Can be used to paginate results\nwhen combined with order_by and limit. Defaults to 0.

  • \n
  • limit (int) \u2013 Limits the amount of rows returned by the API. Defaults to 100.

  • \n
\n
\n
Returns
\n

\n
A list of dictionaries containing the runs and any included

related information.

\n
\n
\n

\n
\n
Return type
\n

List[Dict[str, Any]]

\n
\n
\n
\n\n
\n
\nlist_run_artifacts(run_id, step=None)[source]\u00b6
\n

Lists the paths of the available run artifacts from a completed dbt Cloud run.

\n
\n
Parameters
\n
    \n
  • run_id (int) \u2013 The ID of the relevant dbt Cloud run. You can find this value by going to\nthe details page of your run in the dbt Cloud UI. It will be the final number in the\nurl, e.g.: https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/

  • \n
  • step (int) \u2013 The index of the step in the run to query for artifacts. The first step in\nthe run has the index 1. If the step parameter is omitted, then this endpoint will\nreturn the artifacts compiled for the last step in the run

  • \n
\n
\n
Returns
\n

List of the paths of the available run artifacts

\n
\n
Return type
\n

List[str]

\n
\n
\n
\n\n
\n
\nmake_request(method, endpoint, data=None, return_text=False)[source]\u00b6
\n

Creates and sends a request to the desired dbt Cloud API endpoint.

\n
\n
Parameters
\n
    \n
  • method (str) \u2013 The http method to use for this request (e.g. \u201cPOST\u201d, \u201cGET\u201d, \u201cPATCH\u201d).

  • \n
  • endpoint (str) \u2013 The dbt Cloud API endpoint to send this request to.

  • \n
  • data (Optional[str]) \u2013 JSON-formatted data string to be included in the request.

  • \n
  • return_text (bool) \u2013 Override default behavior and return unparsed {\u201ctext\u201d: response.text}\nblob instead of json.

  • \n
\n
\n
Returns
\n

Parsed json data from the response to this request

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\npoll_run(run_id, poll_interval=10, poll_timeout=None, href=None)[source]\u00b6
\n

Polls a dbt Cloud job run until it completes. Will raise a dagster.Failure exception if the\nrun does not complete successfully.

\n
\n
Parameters
\n
    \n
  • run_id (int) \u2013 The ID of the relevant dbt Cloud run. You can find this value by going to\nthe details page of your run in the dbt Cloud UI. It will be the final number in the\nurl, e.g.: https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/

  • \n
  • poll_interval (float) \u2013 The time (in seconds) that should be waited between successive\npolls of the dbt Cloud API.

  • \n
  • poll_timeout (float) – The maximum time (in seconds) that should be waited for this run\nto complete. If this threshold is exceeded, the run will be cancelled and an\nexception will be thrown. By default, this will poll forever.

  • \n
  • href (str) \u2013 For internal use, generally should not be set manually.

  • \n
\n
\n
Returns
\n

\n
A dictionary containing the parsed contents of the dbt Cloud run details.

See: https://docs.getdbt.com/dbt-cloud/api-v2#operation/getRunById for schema.

\n
\n
\n

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nrun_job(job_id, **kwargs)[source]\u00b6
\n

Initializes a run for a job. Overrides for specific properties can be set by passing in\nvalues to the kwargs. A full list of overridable properties can be found here:\nhttps://docs.getdbt.com/dbt-cloud/api-v2#operation/triggerRun

\n
\n
Parameters
\n
    \n
  • job_id (int) \u2013 The ID of the relevant dbt Cloud job. You can find this value by going to\nthe details page of your job in the dbt Cloud UI. It will be the final number in the\nurl, e.g.: https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/

  • \n
  • kwargs \u2013 Passed in as the properties to be overridden.

  • \n
\n
\n
Returns
\n

Parsed json data from the response to this request

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nrun_job_and_poll(job_id, poll_interval=10, poll_timeout=None)[source]\u00b6
\n

Runs a dbt Cloud job and polls until it completes. Will raise a dagster.Failure exception\nif the run does not complete successfully.

\n
\n
Parameters
\n
    \n
  • job_id (int) \u2013 The ID of the relevant dbt Cloud job. You can find this value by going to\nthe details page of your job in the dbt Cloud UI. It will be the final number in the\nurl, e.g.: https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/

  • \n
  • poll_interval (float) \u2013 The time (in seconds) that should be waited between successive\npolls of the dbt Cloud API.

  • \n
  • poll_timeout (float) – The maximum time (in seconds) that should be waited for this run\nto complete. If this threshold is exceeded, the run will be cancelled and an\nexception will be thrown. By default, this will poll forever.

  • \n
\n
\n
Returns
\n

\n
Class containing details about the specific job run and the

parsed run results.

\n
\n
\n

\n
\n
Return type
\n

DbtCloudOutput

\n
\n
\n
\n\n
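For illustration (the account id, job id, and polling settings below are placeholders; this is a sketch rather than canonical usage), the method can be called from an op wired to the dbt_cloud resource:

from dagster import job, op
from dagster_dbt import dbt_cloud_resource

my_dbt_cloud_resource = dbt_cloud_resource.configured(
    {"auth_token": {"env": "DBT_CLOUD_AUTH_TOKEN"}, "account_id": 77777}  # placeholder account
)

@op(required_resource_keys={"dbt_cloud"})
def trigger_and_wait(context):
    # 54321 is a placeholder job id; poll every 30 seconds, give up after one hour.
    context.resources.dbt_cloud.run_job_and_poll(job_id=54321, poll_interval=30, poll_timeout=3600)
    context.log.info("dbt Cloud job completed successfully")

@job(resource_defs={"dbt_cloud": my_dbt_cloud_resource})
def dbt_cloud_nightly_job():
    trigger_and_wait()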
\n
\nupdate_job(job_id, **kwargs)[source]\u00b6
\n

Updates specific properties of a dbt job. Documentation on the full set of potential\nparameters can be found here: https://docs.getdbt.com/dbt-cloud/api-v2#operation/updateJobById

\n
\n
Parameters
\n
    \n
  • job_id (int) \u2013 The ID of the relevant dbt Cloud job. You can find this value by going to\nthe details page of your job in the dbt Cloud UI. It will be the final number in the\nurl, e.g.: https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/

  • \n
  • kwargs \u2013 Passed in as the properties to be changed.

  • \n
\n
\n
Returns
\n

Parsed json data from the response to this request

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n

Examples:

\n
# disable schedule for job with id=12345\nmy_dbt_cloud_resource.update_job(12345, triggers={"schedule": False})\n
\n
\n
\n\n
\n\n
\n
\ndagster_dbt.dbt_cloud_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
auth_token (dagster.StringSource)
\n

dbt Cloud API Token. User tokens can be found in the [dbt Cloud UI](https://cloud.getdbt.com/#/profile/api/), or see the [dbt Cloud Docs](https://docs.getdbt.com/docs/dbt-cloud/dbt-cloud-api/service-tokens) for instructions on creating a Service Account token.

\n
\n
account_id (Int)
\n

dbt Cloud Account ID. This value can be found in the url of a variety of views in the dbt Cloud UI, e.g. https://cloud.getdbt.com/#/accounts/{account_id}/settings/.

\n
\n
disable_schedule_on_trigger (Bool, optional)
\n

Specifies if you would like any job that is triggered using this resource to automatically disable its schedule.

\n

Default Value: True

\n
\n
request_max_retries (Int, optional)
\n

The maximum number of times requests to the dbt Cloud API should be retried before failing.

\n

Default Value: 3

\n
\n
request_retry_delay (Float, optional)
\n

Time (in seconds) to wait between each request retry.

\n

Default Value: 0.25

\n
\n
dbt_cloud_host (dagster.StringSource, optional)
\n

The hostname where dbt cloud is being hosted (e.g. https://my_org.cloud.getdbt.com/).

\n

Default Value: \u2018https://cloud.getdbt.com/\u2019

\n
\n
\n

This resource allows users to programmatically interface with the dbt Cloud Administrative REST\nAPI (v2) to launch jobs and monitor their progress. This currently implements only a subset of\nthe functionality exposed by the API.

\n

For a complete set of documentation on the dbt Cloud Administrative REST API, including expected\nresponse JSON schemas, see the dbt Cloud API Docs.

\n

To configure this resource, we recommend using the configured method.

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_cloud_resource\n\nmy_dbt_cloud_resource = dbt_cloud_resource.configured(\n    {\n        "auth_token": {"env": "DBT_CLOUD_AUTH_TOKEN"},\n        "account_id": 30000,\n    }\n)\n\n@job(resource_defs={"dbt_cloud":my_dbt_cloud_resource})\ndef my_dbt_cloud_job():\n    ...\n
\n
\n
\n\n
\n
\n
\n

Assets\u00b6

\n
\n
\ndagster_dbt.load_assets_from_dbt_project(project_dir, profiles_dir=None, target_dir=None, select=None, runtime_metadata_fn=None, io_manager_key=None, node_info_to_asset_key=<function _get_node_asset_key>, use_build_command=False)[source]\u00b6
\n

Loads a set of DBT models from a DBT project into Dagster assets.

\n

Creates one Dagster asset for each dbt model. All assets will be re-materialized using a single\ndbt run command.

\n
\n
Parameters
\n
    \n
  • project_dir (Optional[str]) \u2013 The directory containing the DBT project to load.

  • \n
  • profiles_dir (Optional[str]) \u2013 The profiles directory to use for loading the DBT project.\nDefaults to a directory called \u201cconfig\u201d inside the project_dir.

  • \n
  • target_dir (Optional[str]) \u2013 The target directory where DBT will place compiled artifacts.\nDefaults to \u201ctarget\u201d underneath the project_dir.

  • \n
  • select (str) \u2013 A DBT selection string for the models in a project that you want to include.\nDefaults to \u201c*\u201d.

  • \n
  • runtime_metadata_fn \u2013 (Optional[Callable[[SolidExecutionContext, Mapping[str, Any]], Mapping[str, Any]]]):\nA function that will be run after any of the assets are materialized and returns\nmetadata entries for the asset, to be displayed in the asset catalog for that run.

  • \n
  • io_manager_key (Optional[str]) \u2013 The IO manager key that will be set on each of the returned\nassets. When other ops are downstream of the loaded assets, the IOManager specified\nhere determines how the inputs to those ops are loaded. Defaults to \u201cio_manager\u201d.

  • \n
  • node_info_to_asset_key \u2013 (Mapping[str, Any] -> AssetKey): A function that takes a dictionary\nof dbt node info and returns the AssetKey that you want to represent that node. By\ndefault, the asset key will simply be the name of the dbt model.

  • \n
  • use_build_command \u2013 (bool): Flag indicating if you want to use dbt build as the core computation\nfor this asset, rather than dbt run.

  • \n
\n
\n
\n
\n\n
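A minimal sketch of calling this function (the paths and selection string are placeholders):

from dagster_dbt import load_assets_from_dbt_project

# Placeholder paths; if profiles_dir is omitted it defaults to a "config" directory
# inside project_dir, per the parameter documentation above.
dbt_assets = load_assets_from_dbt_project(
    project_dir="path/to/dbt_project",
    profiles_dir="path/to/profiles",
    select="tag:nightly",
)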
\n
\ndagster_dbt.load_assets_from_dbt_manifest(manifest_json, runtime_metadata_fn=None, io_manager_key=None, selected_unique_ids=None, select=None, node_info_to_asset_key=<function _get_node_asset_key>, use_build_command=False)[source]\u00b6
\n

Loads a set of dbt models, described in a manifest.json, into Dagster assets.

\n

Creates one Dagster asset for each dbt model. All assets will be re-materialized using a single\ndbt run command.

\n
\n
Parameters
\n
    \n
  • manifest_json (Optional[Mapping[str, Any]]) \u2013 The contents of a DBT manifest.json, which contains\na set of models to load into assets.

  • \n
  • runtime_metadata_fn \u2013 (Optional[Callable[[SolidExecutionContext, Mapping[str, Any]], Mapping[str, Any]]]):\nA function that will be run after any of the assets are materialized and returns\nmetadata entries for the asset, to be displayed in the asset catalog for that run.

  • \n
  • io_manager_key (Optional[str]) \u2013 The IO manager key that will be set on each of the returned\nassets. When other ops are downstream of the loaded assets, the IOManager specified\nhere determines how the inputs to those ops are loaded. Defaults to \u201cio_manager\u201d.

  • \n
  • selected_unique_ids (Optional[Set[str]]) \u2013 The set of dbt unique_ids that you want to load\nas assets.

  • \n
  • node_info_to_asset_key \u2013 (Mapping[str, Any] -> AssetKey): A function that takes a dictionary\nof dbt node info and returns the AssetKey that you want to represent that node. By\ndefault, the asset key will simply be the name of the dbt model.

  • \n
  • use_build_command \u2013 (bool): Flag indicating if you want to use dbt build as the core computation\nfor this asset, rather than dbt run.

  • \n
\n
\n
\n
\n\n
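A minimal sketch of loading assets from an existing manifest (the path is a placeholder, and the manifest.json is assumed to have been produced by a prior dbt invocation):

import json

from dagster_dbt import load_assets_from_dbt_manifest

# Parse a manifest.json previously generated by dbt (placeholder path).
with open("path/to/target/manifest.json") as f:
    manifest_json = json.load(f)

dbt_assets = load_assets_from_dbt_manifest(manifest_json)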
\n
\n

Types\u00b6

\n
\n
\nclass dagster_dbt.DbtOutput(result)[source]\u00b6
\n

Base class for both DbtCliOutput and DbtRPCOutput. Contains a single field, result, which\nrepresents the dbt-formatted result of the command that was run (if any).

\n

Used internally, should not be instantiated directly by the user.

\n
\n\n
\n
\nclass dagster_dbt.DbtResource(logger=None)[source]\u00b6
\n

Base class for a resource allowing users to interface with dbt

\n
\n
\nabstract build(select=None, **kwargs)[source]\u00b6
\n

Run the build command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n

select (List[str], optional) \u2013 the models/resources to include in the run.

\n
\n
Returns
\n

object containing parsed output from dbt

\n
\n
Return type
\n

DbtOutput

\n
\n
\n
\n\n
\n
\nabstract compile(models=None, exclude=None, **kwargs)[source]\u00b6
\n

Run the compile command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n
    \n
  • models (List[str], optional) \u2013 the models to include in compilation.

  • \n
  • exclude (List[str], optional) – the models to exclude from compilation.

  • \n
\n
\n
Returns
\n

object containing parsed output from dbt

\n
\n
Return type
\n

DbtOutput

\n
\n
\n
\n\n
\n
\nabstract generate_docs(compile_project=False, **kwargs)[source]\u00b6
\n

Run the docs generate command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n

compile_project (bool, optional) \u2013 If true, compile the project before generating a catalog.

\n
\n
Returns
\n

object containing parsed output from dbt

\n
\n
Return type
\n

DbtOutput

\n
\n
\n
\n\n
\n
\nabstract get_manifest_json(**kwargs)[source]\u00b6
\n

Get a parsed version of the manifest.json file for the relevant dbt project.

\n
\n
Returns
\n

\n
dictionary containing the parsed contents of the manifest json file

for this dbt project.

\n
\n
\n

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nabstract get_run_results_json(**kwargs)[source]\u00b6
\n

Get a parsed version of the run_results.json file for the relevant dbt project.

\n
\n
Returns
\n

\n
dictionary containing the parsed contents of the run_results json file

for this dbt project.

\n
\n
\n

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nproperty logger\u00b6
\n

A property for injecting a logger dependency.

\n
\n
Type
\n

logging.Logger

\n
\n
\n
\n\n
\n
\nabstract ls(select=None, models=None, exclude=None, **kwargs)[source]\u00b6
\n

Run the ls command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n
    \n
  • select (List[str], optional) \u2013 the resources to include in the output.

  • \n
  • models (List[str], optional) \u2013 the models to include in the output.

  • \n
  • exclude (List[str], optional) \u2013 the resources to exclude from the output.

  • \n
\n
\n
Returns
\n

object containing parsed output from dbt

\n
\n
Return type
\n

DbtOutput

\n
\n
\n
\n\n
\n
\nabstract run(models=None, exclude=None, **kwargs)[source]\u00b6
\n

Run the run command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n
    \n
  • models (List[str], optional) \u2013 the models to include in the run.

  • \n
  • exclude (List[str], optional) – the models to exclude from the run.

  • \n
\n
\n
Returns
\n

object containing parsed output from dbt

\n
\n
Return type
\n

DbtOutput

\n
\n
\n
\n\n
\n
\nabstract run_operation(macro, args=None, **kwargs)[source]\u00b6
\n

Run the run-operation command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n
    \n
  • macro (str) \u2013 the dbt macro to invoke.

  • \n
  • args (Dict[str, Any], optional) \u2013 the keyword arguments to be supplied to the macro.

  • \n
\n
\n
Returns
\n

object containing parsed output from dbt

\n
\n
Return type
\n

DbtOutput

\n
\n
\n
\n\n
\n
\nabstract seed(show=False, select=None, exclude=None, **kwargs)[source]\u00b6
\n

Run the seed command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n
    \n
  • show (bool, optional) \u2013 If True, then show a sample of the seeded data in the\nresponse. Defaults to False.

  • \n
  • select (List[str], optional) – the seeds to include in the run.

  • \n
  • exclude (List[str], optional) – the seeds to exclude from the run.

  • \n
\n
\n
Returns
\n

object containing parsed output from dbt

\n
\n
Return type
\n

DbtOutput

\n
\n
\n
\n\n
\n
\nabstract snapshot(select=None, exclude=None, **kwargs)[source]\u00b6
\n

Run the snapshot command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n
    \n
  • select (List[str], optional) \u2013 the snapshots to include in the run.

  • \n
  • exclude (List[str], optional) \u2013 the snapshots to exclude from the run.

  • \n
\n
\n
Returns
\n

object containing parsed output from dbt

\n
\n
Return type
\n

DbtOutput

\n
\n
\n
\n\n
\n
\nabstract test(models=None, exclude=None, data=True, schema=True, **kwargs)[source]\u00b6
\n

Run the test command on a dbt project. kwargs are passed in as additional parameters.

\n
\n
Parameters
\n
    \n
  • models (List[str], optional) \u2013 the models to include in testing.

  • \n
  • exclude (List[str], optional) \u2013 the models to exclude from testing.

  • \n
  • data (bool, optional) \u2013 If True (default), then run data tests.

  • \n
  • schema (bool, optional) \u2013 If True (default), then run schema tests.

  • \n
\n
\n
Returns
\n

object containing parsed output from dbt

\n
\n
Return type
\n

DbtOutput

\n
\n
\n
\n\n
\n\n
\n
\n

Errors\u00b6

\n
\n
\nexception dagster_dbt.DagsterDbtError(description=None, metadata_entries=None, metadata=None)[source]\u00b6
\n

The base exception of the dagster-dbt library.

\n
\n\n
\n
\nexception dagster_dbt.DagsterDbtCliRuntimeError(description, logs, raw_output)[source]\u00b6
\n

Represents an error while executing a dbt CLI command.

\n
\n\n
\n
\nexception dagster_dbt.DagsterDbtCliFatalRuntimeError(logs, raw_output)[source]\u00b6
\n

Represents a fatal error in the dbt CLI (return code 2).

\n
\n\n
\n
\nexception dagster_dbt.DagsterDbtCliHandledRuntimeError(logs, raw_output)[source]\u00b6
\n

Represents a model error reported by the dbt CLI at runtime (return code 1).

\n
\n\n
\n
\nexception dagster_dbt.DagsterDbtCliOutputsNotFoundError(path)[source]\u00b6
\n

Represents a problem in finding the target/run_results.json artifact when executing a dbt\nCLI command.

\n

For more details on target/run_results.json, see\nhttps://docs.getdbt.com/reference/dbt-artifacts#run_resultsjson.

\n
\n\n
\n
\nexception dagster_dbt.DagsterDbtCliUnexpectedOutputError(invalid_line_nos)[source]\u00b6
\n

Represents an error when parsing the output of a dbt CLI command.

\n
\n\n
\n
\nexception dagster_dbt.DagsterDbtRpcUnexpectedPollOutputError(description=None, metadata_entries=None, metadata=None)[source]\u00b6
\n

Represents an unexpected response when polling the dbt RPC server.

\n
\n\n
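As a hedged sketch of how these exceptions might be handled (assuming a dbt_cli_resource bound to the "dbt" resource key), an op can tolerate handled model errors while still letting fatal errors propagate:

from dagster import op
from dagster_dbt import DagsterDbtCliHandledRuntimeError

@op(required_resource_keys={"dbt"})
def run_dbt_tolerating_model_errors(context):
    try:
        context.resources.dbt.run()
    except DagsterDbtCliHandledRuntimeError as err:
        # Return code 1: dbt itself ran, but one or more models failed.
        context.log.warning(f"dbt reported model errors: {err}")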
\n
\n

Utils\u00b6

\n
\n
\ndagster_dbt.utils.generate_materializations(dbt_output, asset_key_prefix=None)[source]\u00b6
\n

This function yields dagster.AssetMaterialization events for each model updated by\na dbt command.

\n

This information is parsed from the DbtOutput object produced by the dbt command.

\n

Note that this will not work with output from the dbt_rpc_resource, because this resource does\nnot wait for a response from the RPC server before returning. Instead, use the\ndbt_rpc_sync_resource, which will wait for execution to complete.

\n

Examples:

\n
from dagster import job, op, Output\nfrom dagster_dbt.utils import generate_materializations\nfrom dagster_dbt import dbt_cli_resource, dbt_rpc_sync_resource\n\n@op(required_resource_keys={"dbt"})\ndef my_custom_dbt_run(context):\n    dbt_output = context.resources.dbt.run()\n    for materialization in generate_materializations(dbt_output):\n        # you can modify the materialization object to add extra metadata, if desired\n        yield materialization\n    yield Output(dbt_output)\n\n@job(resource_defs={"dbt": dbt_cli_resource})\ndef my_dbt_cli_job():\n    my_custom_dbt_run()\n\n@job(resource_defs={"dbt": dbt_rpc_sync_resource})\ndef my_dbt_rpc_job():\n    my_custom_dbt_run()\n
\n
\n
\n\n
\n
\n

Solids [Legacy]\u00b6

\n

dagster_dbt provides a set of solids that may be used in legacy pipelines.

\n
\n

CLI Solids\u00b6

\n
\n
\ndagster_dbt.dbt_cli_compile = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project-dir (dagster.StringSource, optional)
\n

Which directory to look in for the dbt_project.yml file. Default is the current working directory and its parents.

\n

Default Value: \u2018.\u2019

\n
\n
profiles-dir (dagster.StringSource, optional)
\n

Which directory to look in for the profiles.yml file. Default = $DBT_PROFILES_DIR or $HOME/.dbt

\n
\n
profile (dagster.StringSource, optional)
\n

Which profile to load. Overrides setting in dbt_project.yml.

\n
\n
target (dagster.StringSource, optional)
\n

Which target to load for the given profile.

\n
\n
vars (permissive dict, optional)
\n

Supply variables to the project. This argument overrides variables defined in your dbt_project.yml file. This argument should be a dictionary, eg. {\u2018my_variable\u2019: \u2018my_value\u2019}

\n
\n
bypass-cache (Bool, optional)
\n

If set, bypass the adapter-level cache of database state

\n

Default Value: False

\n
\n
warn-error (Bool, optional)
\n

If dbt would normally warn, instead raise an exception. Examples include \u2013models that selects nothing, deprecations, configurations with no associated models, invalid test configurations, and missing sources/refs in tests.

\n

Default Value: False

\n
\n
dbt_executable (dagster.StringSource, optional)
\n

Path to the dbt executable. Default is dbt

\n

Default Value: \u2018dbt\u2019

\n
\n
ignore_handled_error (Bool, optional)
\n

When True, will not raise an exception when the dbt CLI returns error code 1. Default is False.

\n

Default Value: False

\n
\n
target-path (dagster.StringSource, optional)
\n

The directory path for target if different from the default target-path in your dbt project configuration file.

\n

Default Value: \u2018target\u2019

\n
\n
docs_url (dagster.StringSource, optional)
\n

The url for where dbt docs are being served for this project.

\n
\n
parse-only (Bool, optional)
\n

Default Value: False

\n
\n
threads (Union[Int, None], optional)
\n

Specify number of threads to use while executing models. Overrides settings in profiles.yml.

\n

Default Value: None

\n
\n
no-version-check (Bool, optional)
\n

Skip the check that dbt\u2019s version matches the one specified in the dbt_project.yml file (\u2018require-dbt-version\u2019)

\n

Default Value: False

\n
\n
models (Union[List[String], None], optional)
\n

The dbt models to run.

\n

Default Value: None

\n
\n
exclude (Union[List[String], None], optional)
\n

The dbt models to exclude.

\n

Default Value: None

\n
\n
selector (Union[List[String], None], optional)
\n

The selector name to use, as defined in your selectors.yml

\n

Default Value: None

\n
\n
state (Union[List[String], None], optional)
\n

If set, use the given directory as the source for json files to compare with this project.

\n

Default Value: None

\n
\n
full-refresh (Bool, optional)
\n

If specified, DBT will drop incremental models and fully-recalculate the incremental table from the model definition. (\u2013full-refresh)

\n

Default Value: False

\n
\n
\n

This solid executes dbt compile via the dbt CLI.

\n
\n\n
\n
\ndagster_dbt.dbt_cli_run = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project-dir (dagster.StringSource, optional)
\n

Which directory to look in for the dbt_project.yml file. Default is the current working directory and its parents.

\n

Default Value: \u2018.\u2019

\n
\n
profiles-dir (dagster.StringSource, optional)
\n

Which directory to look in for the profiles.yml file. Default = $DBT_PROFILES_DIR or $HOME/.dbt

\n
\n
profile (dagster.StringSource, optional)
\n

Which profile to load. Overrides setting in dbt_project.yml.

\n
\n
target (dagster.StringSource, optional)
\n

Which target to load for the given profile.

\n
\n
vars (permissive dict, optional)
\n

Supply variables to the project. This argument overrides variables defined in your dbt_project.yml file. This argument should be a dictionary, e.g. {\u2018my_variable\u2019: \u2018my_value\u2019}

\n
\n
bypass-cache (Bool, optional)
\n

If set, bypass the adapter-level cache of database state

\n

Default Value: False

\n
\n
warn-error (Bool, optional)
\n

If dbt would normally warn, instead raise an exception. Examples include --models that selects nothing, deprecations, configurations with no associated models, invalid test configurations, and missing sources/refs in tests.

\n

Default Value: False

\n
\n
dbt_executable (dagster.StringSource, optional)
\n

Path to the dbt executable. Default is dbt

\n

Default Value: \u2018dbt\u2019

\n
\n
ignore_handled_error (Bool, optional)
\n

When True, will not raise an exception when the dbt CLI returns error code 1. Default is False.

\n

Default Value: False

\n
\n
target-path (dagster.StringSource, optional)
\n

The directory path for target if different from the default target-path in your dbt project configuration file.

\n

Default Value: \u2018target\u2019

\n
\n
docs_url (dagster.StringSource, optional)
\n

The url for where dbt docs are being served for this project.

\n
\n
threads (Union[Int, None], optional)
\n

Specify number of threads to use while executing models. Overrides settings in profiles.yml.

\n

Default Value: None

\n
\n
models (Union[List[String], None], optional)
\n

The dbt models to run.

\n

Default Value: None

\n
\n
exclude (Union[List[String], None], optional)
\n

The dbt models to exclude.

\n

Default Value: None

\n
\n
full-refresh (Bool, optional)
\n

If specified, DBT will drop incremental models and fully-recalculate the incremental table from the model definition. (--full-refresh)

\n

Default Value: False

\n
\n
fail-fast (Bool, optional)
\n

Stop execution upon a first failure. (--fail-fast)

\n

Default Value: False

\n
\n
yield_materializations (Bool, optional)
\n

If True, materializations corresponding to the results of the dbt operation will be yielded when the solid executes. Default: True

\n

Default Value: True

\n
\n
asset_key_prefix (List[String], optional)
\n

If provided and yield_materializations is True, these components will be used to prefix the generated asset keys.

\n

Default Value: []

\n
\n
\n

This solid executes dbt run via the dbt CLI. See the solid definition for available\nparameters.

\n
\n\n
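
For example, here is a minimal sketch (not taken from the library itself) of wiring this solid into a pipeline and supplying its config through run config; the paths are placeholders for your own dbt project and profiles directories, and the solid is assumed to keep its default name dbt_cli_run:

\n
from dagster import execute_pipeline, pipeline\nfrom dagster_dbt import dbt_cli_run\n\n\n@pipeline\ndef my_dbt_pipeline():\n    dbt_cli_run()\n\n\n# Placeholder paths; point these at your own dbt project and profiles directories.\nrun_config = {\n    "solids": {\n        "dbt_cli_run": {\n            "config": {\n                "project-dir": "path/to/dbt_project",\n                "profiles-dir": "path/to/profiles",\n            }\n        }\n    }\n}\n\nif __name__ == "__main__":\n    execute_pipeline(my_dbt_pipeline, run_config=run_config)\n
\n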
\n
\ndagster_dbt.dbt_cli_run_operation = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project-dir (dagster.StringSource, optional)
\n

Which directory to look in for the dbt_project.yml file. Default is the current working directory and its parents.

\n

Default Value: \u2018.\u2019

\n
\n
profiles-dir (dagster.StringSource, optional)
\n

Which directory to look in for the profiles.yml file. Default = $DBT_PROFILES_DIR or $HOME/.dbt

\n
\n
profile (dagster.StringSource, optional)
\n

Which profile to load. Overrides setting in dbt_project.yml.

\n
\n
target (dagster.StringSource, optional)
\n

Which target to load for the given profile.

\n
\n
vars (permissive dict, optional)
\n

Supply variables to the project. This argument overrides variables defined in your dbt_project.yml file. This argument should be a dictionary, e.g. {\u2018my_variable\u2019: \u2018my_value\u2019}

\n
\n
bypass-cache (Bool, optional)
\n

If set, bypass the adapter-level cache of database state

\n

Default Value: False

\n
\n
warn-error (Bool, optional)
\n

If dbt would normally warn, instead raise an exception. Examples include --models that selects nothing, deprecations, configurations with no associated models, invalid test configurations, and missing sources/refs in tests.

\n

Default Value: False

\n
\n
dbt_executable (dagster.StringSource, optional)
\n

Path to the dbt executable. Default is dbt

\n

Default Value: \u2018dbt\u2019

\n
\n
ignore_handled_error (Bool, optional)
\n

When True, will not raise an exception when the dbt CLI returns error code 1. Default is False.

\n

Default Value: False

\n
\n
target-path (dagster.StringSource, optional)
\n

The directory path for target if different from the default target-path in your dbt project configuration file.

\n

Default Value: \u2018target\u2019

\n
\n
docs_url (dagster.StringSource, optional)
\n

The url for where dbt docs are being served for this project.

\n
\n
macro (dagster.StringSource)
\n

Specify the macro to invoke. dbt will call this macro with the supplied arguments and then exit.

\n
\n
args (permissive dict, optional)
\n

Supply arguments to the macro. This dictionary will be mapped to the keyword arguments defined in the selected macro. This argument should be a dictionary, e.g. {\u2018my_variable\u2019: \u2018my_value\u2019}

\n
\n
\n

This solid executes dbt run-operation via the dbt CLI.

\n
\n\n
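
As a sketch, the macro and its arguments can be fixed ahead of time with configured; the macro name grant_select and its role argument below are hypothetical and should be replaced with a macro from your own project:

\n
from dagster import pipeline\nfrom dagster_dbt import dbt_cli_run_operation\n\n# "grant_select" and its "role" argument are illustrative placeholders.\nrun_grant_select = dbt_cli_run_operation.configured(\n    {"macro": "grant_select", "args": {"role": "reporter"}},\n    name="run_grant_select",\n)\n\n\n@pipeline\ndef run_operation_pipeline():\n    run_grant_select()\n
\n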
\n
\ndagster_dbt.dbt_cli_snapshot = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project-dir (dagster.StringSource, optional)
\n

Which directory to look in for the dbt_project.yml file. Default is the current working directory and its parents.

\n

Default Value: \u2018.\u2019

\n
\n
profiles-dir (dagster.StringSource, optional)
\n

Which directory to look in for the profiles.yml file. Default = $DBT_PROFILES_DIR or $HOME/.dbt

\n
\n
profile (dagster.StringSource, optional)
\n

Which profile to load. Overrides setting in dbt_project.yml.

\n
\n
target (dagster.StringSource, optional)
\n

Which target to load for the given profile.

\n
\n
vars (permissive dict, optional)
\n

Supply variables to the project. This argument overrides variables defined in your dbt_project.yml file. This argument should be a dictionary, e.g. {\u2018my_variable\u2019: \u2018my_value\u2019}

\n
\n
bypass-cache (Bool, optional)
\n

If set, bypass the adapter-level cache of database state

\n

Default Value: False

\n
\n
warn-error (Bool, optional)
\n

If dbt would normally warn, instead raise an exception. Examples include --models that selects nothing, deprecations, configurations with no associated models, invalid test configurations, and missing sources/refs in tests.

\n

Default Value: False

\n
\n
dbt_executable (dagster.StringSource, optional)
\n

Path to the dbt executable. Default is dbt

\n

Default Value: \u2018dbt\u2019

\n
\n
ignore_handled_error (Bool, optional)
\n

When True, will not raise an exception when the dbt CLI returns error code 1. Default is False.

\n

Default Value: False

\n
\n
target-path (dagster.StringSource, optional)
\n

The directory path for target if different from the default target-path in your dbt project configuration file.

\n

Default Value: \u2018target\u2019

\n
\n
docs_url (dagster.StringSource, optional)
\n

The url for where dbt docs are being served for this project.

\n
\n
threads (Union[Int, None], optional)
\n

Specify number of threads to use while executing models. Overrides settings in profiles.yml.

\n

Default Value: None

\n
\n
select (Union[List[String], None], optional)
\n

The dbt models to include.

\n

Default Value: None

\n
\n
exclude (Union[List[String], None], optional)
\n

The dbt models to exclude.

\n

Default Value: None

\n
\n
\n

This solid executes dbt snapshot via the dbt CLI.

\n
\n\n
\n
\ndagster_dbt.dbt_cli_snapshot_freshness = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project-dir (dagster.StringSource, optional)
\n

Which directory to look in for the dbt_project.yml file. Default is the current working directory and its parents.

\n

Default Value: \u2018.\u2019

\n
\n
profiles-dir (dagster.StringSource, optional)
\n

Which directory to look in for the profiles.yml file. Default = $DBT_PROFILES_DIR or $HOME/.dbt

\n
\n
profile (dagster.StringSource, optional)
\n

Which profile to load. Overrides setting in dbt_project.yml.

\n
\n
target (dagster.StringSource, optional)
\n

Which target to load for the given profile.

\n
\n
vars (permissive dict, optional)
\n

Supply variables to the project. This argument overrides variables defined in your dbt_project.yml file. This argument should be a dictionary, e.g. {\u2018my_variable\u2019: \u2018my_value\u2019}

\n
\n
bypass-cache (Bool, optional)
\n

If set, bypass the adapter-level cache of database state

\n

Default Value: False

\n
\n
warn-error (Bool, optional)
\n

If dbt would normally warn, instead raise an exception. Examples include --models that selects nothing, deprecations, configurations with no associated models, invalid test configurations, and missing sources/refs in tests.

\n

Default Value: False

\n
\n
dbt_executable (dagster.StringSource, optional)
\n

Path to the dbt executable. Default is dbt

\n

Default Value: \u2018dbt\u2019

\n
\n
ignore_handled_error (Bool, optional)
\n

When True, will not raise an exception when the dbt CLI returns error code 1. Default is False.

\n

Default Value: False

\n
\n
target-path (dagster.StringSource, optional)
\n

The directory path for target if different from the default target-path in your dbt project configuration file.

\n

Default Value: \u2018target\u2019

\n
\n
docs_url (dagster.StringSource, optional)
\n

The url for where dbt docs are being served for this project.

\n
\n
select (Union[List[String], None], optional)
\n

Specify the sources to snapshot freshness.

\n

Default Value: None

\n
\n
output (dagster.StringSource, optional)
\n

Specify the output path for the json report. By default, outputs to target/sources.json

\n
\n
threads (Union[Int, None], optional)
\n

Specify number of threads to use while executing models. Overrides settings in profiles.yml.

\n

Default Value: None

\n
\n
\n

This solid executes dbt source snapshot-freshness via the dbt CLI.

\n
\n\n
\n
\ndagster_dbt.dbt_cli_test = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project-dir (dagster.StringSource, optional)
\n

Which directory to look in for the dbt_project.yml file. Default is the current working directory and its parents.

\n

Default Value: \u2018.\u2019

\n
\n
profiles-dir (dagster.StringSource, optional)
\n

Which directory to look in for the profiles.yml file. Default = $DBT_PROFILES_DIR or $HOME/.dbt

\n
\n
profile (dagster.StringSource, optional)
\n

Which profile to load. Overrides setting in dbt_project.yml.

\n
\n
target (dagster.StringSource, optional)
\n

Which target to load for the given profile.

\n
\n
vars (permissive dict, optional)
\n

Supply variables to the project. This argument overrides variables defined in your dbt_project.yml file. This argument should be a dictionary, e.g. {\u2018my_variable\u2019: \u2018my_value\u2019}

\n
\n
bypass-cache (Bool, optional)
\n

If set, bypass the adapter-level cache of database state

\n

Default Value: False

\n
\n
warn-error (Bool, optional)
\n

If dbt would normally warn, instead raise an exception. Examples include --models that selects nothing, deprecations, configurations with no associated models, invalid test configurations, and missing sources/refs in tests.

\n

Default Value: False

\n
\n
dbt_executable (dagster.StringSource, optional)
\n

Path to the dbt executable. Default is dbt

\n

Default Value: \u2018dbt\u2019

\n
\n
ignore_handled_error (Bool, optional)
\n

When True, will not raise an exception when the dbt CLI returns error code 1. Default is False.

\n

Default Value: False

\n
\n
target-path (dagster.StringSource, optional)
\n

The directory path for target if different from the default target-path in your dbt project configuration file.

\n

Default Value: \u2018target\u2019

\n
\n
docs_url (dagster.StringSource, optional)
\n

The url for where dbt docs are being served for this project.

\n
\n
data (Bool, optional)
\n

Run data tests defined in \u201ctests\u201d directory.

\n

Default Value: False

\n
\n
schema (Bool, optional)
\n

Run constraint validations from schema.yml files.

\n

Default Value: False

\n
\n
fail-fast (Bool, optional)
\n

Stop execution upon a first test failure.

\n

Default Value: False

\n
\n
threads (Union[Int, None], optional)
\n

Specify number of threads to use while executing models. Overrides settings in profiles.yml.

\n

Default Value: None

\n
\n
models (Union[List[String], None], optional)
\n

The dbt models to run.

\n

Default Value: None

\n
\n
exclude (Union[List[String], None], optional)
\n

The dbt models to exclude.

\n

Default Value: None

\n
\n
\n

This solid executes dbt test via the dbt CLI. See the solid definition for available\nparameters.

\n
\n\n
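
As with the other CLI solids, a model selection and flags can be baked in up front with configured; the project path and model selection below are placeholders for your own project:

\n
from dagster import pipeline\nfrom dagster_dbt import dbt_cli_test\n\n# Placeholder project path and model selection.\ntest_staging_models = dbt_cli_test.configured(\n    {\n        "project-dir": "path/to/dbt_project",\n        "models": ["staging"],\n        "fail-fast": True,\n    },\n    name="test_staging_models",\n)\n\n\n@pipeline\ndef dbt_test_pipeline():\n    test_staging_models()\n
\n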
\n
\n

RPC Solids\u00b6

\n
\n
\ndagster_dbt.create_dbt_rpc_run_sql_solid(name, output_def=None, **kwargs)[source]\u00b6
\n

This function is a factory which constructs a solid that will copy the results of a SQL query\nrun within the context of a dbt project to a pandas DataFrame.

\n

Any kwargs passed to this function will be passed along to the underlying @solid decorator. However, note that overriding config_schema, input_defs, and\nrequired_resource_keys is not allowed and will throw a DagsterInvalidDefinitionError.

\n

If you would like to configure this solid with different config fields, you could consider using\n@composite_solid to wrap this solid.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of this solid.

  • \n
  • output_def (OutputDefinition, optional) \u2013 The OutputDefinition for the solid. This value should always be a representation\nof a pandas DataFrame. If not specified, the solid will default to an\nOutputDefinition named \u201cdf\u201d with a DataFrame\ndagster type.

  • \n
\n
\n
Returns
\n

Returns the constructed solid definition.

\n
\n
Return type
\n

SolidDefinition

\n
\n
\n
\n\n
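
A minimal sketch of using the factory; the solid name is arbitrary, and the resulting solid falls back to the default OutputDefinition named "df" described above:

\n
from dagster_dbt import create_dbt_rpc_run_sql_solid\n\n# "run_my_query" is an arbitrary name for this sketch; extra kwargs are forwarded\n# to the underlying @solid decorator.\nrun_my_query = create_dbt_rpc_run_sql_solid(\n    name="run_my_query",\n    description="Runs a SQL query against the dbt RPC server and returns a DataFrame.",\n)\n
\n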
\n
\ndagster_dbt.dbt_rpc_compile_sql = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
name (String)
\n

\n
interval (Int, optional)
\n

The interval (in seconds) at which to poll the dbt rpc process.

\n

Default Value: 10

\n
\n
logs (Bool, optional)
\n

Whether or not to return logs from the process.

\n

Default Value: True

\n
\n
yield_materializations (Bool, optional)
\n

If True, materializations corresponding to the results of the dbt operation will be yielded when the solid executes. Default: True

\n

Default Value: True

\n
\n
\n

This solid sends the dbt compile command to a dbt RPC server and returns the request\ntoken.

\n

This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\npoll the progress of the running dbt process.

\n
\n\n
\n
\ndagster_dbt.dbt_rpc_run = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
models (Union[List[String], None], optional)
\n

The dbt models to run.

\n

Default Value: None

\n
\n
exclude (Union[List[String], None], optional)
\n

The dbt models to exclude.

\n

Default Value: None

\n
\n
\n

This solid sends the dbt run command to a dbt RPC server and returns the request token.

\n

This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\npoll the progress of the running dbt process.

\n
\n\n
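
A sketch of wiring this solid into a pipeline, assuming the companion dbt_rpc_resource accepts a host and port for a running dbt RPC server and is supplied under the dbt_rpc resource key; the address below is a placeholder:

\n
from dagster import ModeDefinition, pipeline\nfrom dagster_dbt import dbt_rpc_resource, dbt_rpc_run\n\n# Placeholder address for a running dbt RPC server.\nmy_dbt_rpc_resource = dbt_rpc_resource.configured({"host": "127.0.0.1", "port": 8580})\n\n\n@pipeline(mode_defs=[ModeDefinition(resource_defs={"dbt_rpc": my_dbt_rpc_resource})])\ndef dbt_rpc_pipeline():\n    dbt_rpc_run()\n
\n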
\n
\ndagster_dbt.dbt_rpc_run_and_wait = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
models (Union[List[String], None], optional)
\n

The dbt models to run.

\n

Default Value: None

\n
\n
exclude (Union[List[String], None], optional)
\n

The dbt models to exclude.

\n

Default Value: None

\n
\n
full_refresh (Bool, optional)
\n

Whether or not to perform a --full-refresh.

\n

Default Value: False

\n
\n
fail_fast (Bool, optional)
\n

Whether or not to --fail-fast.

\n

Default Value: False

\n
\n
warn_error (Bool, optional)
\n

Whether or not to --warn-error.

\n

Default Value: False

\n
\n
interval (Int, optional)
\n

The interval (in seconds) at which to poll the dbt rpc process.

\n

Default Value: 10

\n
\n
logs (Bool, optional)
\n

Whether or not to return logs from the process.

\n

Default Value: True

\n
\n
task_tags (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
max_retries (Int, optional)
\n

Default Value: 5

\n
\n
retry_interval (Int, optional)
\n

Default Value: 120

\n
\n
yield_materializations (Bool, optional)
\n

If True, materializations corresponding to the results of the dbt operation will be yielded when the solid executes. Default: True

\n

Default Value: True

\n
\n
\n

This solid sends the dbt run command to a dbt RPC server and returns the result of the\nexecuted dbt process.

\n

This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\nprocess is completed.

\n
\n\n
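
Polling behaviour can be tuned through the config fields above, for example via configured; the model selection and values below are illustrative only, and the solid still requires the dbt_rpc resource at execution time:

\n
from dagster_dbt import dbt_rpc_run_and_wait\n\n# Illustrative values; the solid still requires a dbt_rpc resource when executed.\nquick_poll_run = dbt_rpc_run_and_wait.configured(\n    {"models": ["my_model"], "interval": 5, "logs": False, "max_retries": 3},\n    name="quick_poll_run",\n)\n
\n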
\n
\ndagster_dbt.dbt_rpc_run_operation = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
macro (String)
\n

The dbt macro to invoke as a run operation

\n
\n
args (Union[permissive dict, None], optional)
\n

Arguments to supply to the invoked macro.

\n

Default Value: None

\n
\n
\n

This solid sends the dbt run-operation command to a dbt RPC server and returns the\nrequest token.

\n

This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\npoll the progress of the running dbt process.

\n
\n\n
\n
\ndagster_dbt.dbt_rpc_run_operation_and_wait = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
macro (String)
\n

The dbt macro to invoke as a run operation

\n
\n
args (Union[permissive dict, None], optional)
\n

Arguments to supply to the invoked macro.

\n

Default Value: None

\n
\n
interval (Int, optional)
\n

The interval (in seconds) at which to poll the dbt rpc process.

\n

Default Value: 10

\n
\n
logs (Bool, optional)
\n

Whether or not to return logs from the process.

\n

Default Value: True

\n
\n
yield_materializations (Bool, optional)
\n

If True, materializations corresponding to the results of the dbt operation will be yielded when the solid executes. Default: True

\n

Default Value: True

\n
\n
\n

This solid sends the dbt run-operation command to a dbt RPC server and returns the\nresult of the executed dbt process.

\n

This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\nprocess is completed.

\n
\n\n
\n
\ndagster_dbt.dbt_rpc_snapshot = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
select (Union[List[String], None], optional)
\n

The dbt snapshot files to snapshot.

\n

Default Value: None

\n
\n
exclude (Union[List[String], None], optional)
\n

The dbt snapshot files to exclude from the snapshot.

\n

Default Value: None

\n
\n
\n

This solid sends the dbt snapshot command to a dbt RPC server and returns the\nrequest token.

\n

This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\npoll the progress of the running dbt process.

\n
\n\n
\n
\ndagster_dbt.dbt_rpc_snapshot_and_wait = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
select (Union[List[String], None], optional)
\n

The dbt snapshot files to snapshot.

\n

Default Value: None

\n
\n
exclude (Union[List[String], None], optional)
\n

The dbt snapshot files to exclude from the snapshot.

\n

Default Value: None

\n
\n
interval (Int, optional)
\n

The interval (in seconds) at which to poll the dbt rpc process.

\n

Default Value: 10

\n
\n
logs (Bool, optional)
\n

Whether or not to return logs from the process.

\n

Default Value: True

\n
\n
task_tags (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
max_retries (Int, optional)
\n

Default Value: 5

\n
\n
retry_interval (Int, optional)
\n

Default Value: 120

\n
\n
yield_materializations (Bool, optional)
\n

If True, materializations corresponding to the results of the dbt operation will be yielded when the solid executes. Default: True

\n

Default Value: True

\n
\n
\n

This solid sends the dbt snapshot command to a dbt RPC server and returns the result of\nthe executed dbt process.

\n

This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\nprocess is completed.

\n
\n\n
\n
\ndagster_dbt.dbt_rpc_snapshot_freshness = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
select (Union[List[String], None], optional)
\n

The dbt sources to snapshot-freshness for.

\n

Default Value: None

\n
\n
warn_error (Bool, optional)
\n

Whether or not to --warn-error.

\n

Default Value: False

\n
\n
\n

This solid sends the dbt source snapshot-freshness command to a dbt RPC server and\nreturns the request token.

\n

This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\npoll the progress of the running dbt process.

\n
\n\n
\n
\ndagster_dbt.dbt_rpc_snapshot_freshness_and_wait = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
select (Union[List[String], None], optional)
\n

The dbt sources to snapshot-freshness for.

\n

Default Value: None

\n
\n
warn_error (Bool, optional)
\n

Whether or not to --warn-error.

\n

Default Value: False

\n
\n
interval (Int, optional)
\n

The interval (in seconds) at which to poll the dbt rpc process.

\n

Default Value: 10

\n
\n
logs (Bool, optional)
\n

Whether or not to return logs from the process.

\n

Default Value: True

\n
\n
yield_materializations (Bool, optional)
\n

If True, materializations corresponding to the results of the dbt operation will be yielded when the solid executes. Default: True

\n

Default Value: True

\n
\n
\n

This solid sends the dbt source snapshot command to a dbt RPC server and returns the\nresult of the executed dbt process.

\n

This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\nprocess is completed.

\n
\n\n
\n
\ndagster_dbt.dbt_rpc_test = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
models (Union[List[String], None], optional)
\n

The dbt models to test.

\n

Default Value: None

\n
\n
exclude (Union[List[String], None], optional)
\n

The dbt models to exclude.

\n

Default Value: None

\n
\n
data (Bool, optional)
\n

Whether or not to run custom data tests.

\n

Default Value: True

\n
\n
schema (Bool, optional)
\n

Whether or not to run schema tests.

\n

Default Value: True

\n
\n
\n

This solid sends the dbt test command to a dbt RPC server and returns the request token.

\n

This dbt RPC solid is asynchronous. The request token can be used in subsequent RPC requests to\npoll the progress of the running dbt process.

\n
\n\n
\n
\ndagster_dbt.dbt_rpc_test_and_wait = <dagster.core.definitions.solid_definition.SolidDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
models (Union[List[String], None], optional)
\n

The dbt models to test.

\n

Default Value: None

\n
\n
exclude (Union[List[String], None], optional)
\n

The dbt models to exclude.

\n

Default Value: None

\n
\n
data (Bool, optional)
\n

Whether or not to run custom data tests.

\n

Default Value: True

\n
\n
schema (Bool, optional)
\n

Whether or not to run schema tests.

\n

Default Value: True

\n
\n
interval (Int, optional)
\n

The interval (in seconds) at which to poll the dbt rpc process.

\n

Default Value: 10

\n
\n
logs (Bool, optional)
\n

Whether or not to return logs from the process.

\n

Default Value: True

\n
\n
yield_materializations (Bool, optional)
\n

If True, materializations corresponding to the results of the dbt operation will be yielded when the solid executes. Default: True

\n

Default Value: True

\n
\n
\n

This solid sends the dbt test command to a dbt RPC server and returns the result of the\nexecuted dbt process.

\n

This dbt RPC solid is synchronous, and will periodically poll the dbt RPC server until the dbt\nprocess is completed.

\n
\n\n
\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-dbt", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../dagster-fivetran/", "title": "Fivetran (dagster-fivetran)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-datadog/", "title": "Datadog (dagster-datadog)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-fivetran", "Fivetran (dagster-fivetran)", "N", "next"], ["sections/api/apidocs/libraries/dagster-datadog", "Datadog (dagster-datadog)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-dbt.rst.txt", "title": "dbt (dagster-dbt)", "toc": "\n"}, "dagster-docker": {"alabaster_version": "0.7.12", "body": "
\n

Orchestration on Docker\u00b6

\n
\n
\n

APIs\u00b6

\n
\n
\ndagster_docker.DockerRunLauncher RunLauncher[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
image (dagster.StringSource, optional)
\n

The docker image to be used if the repository does not specify one.

\n
\n
network (dagster.StringSource, optional)
\n

Name of the network to which to connect the launched container at creation time

\n
\n
registry (strict dict, optional)
\n

Information for using a non local/public docker registry

\n
\nConfig Schema:
\n
url (dagster.StringSource)
\n

\n
username (dagster.StringSource)
\n

\n
password (dagster.StringSource)
\n

\n
\n
\n
env_vars (List[String], optional)
\n

The list of environment variable names to include in the docker container. Each can be of the form KEY=VALUE or just KEY (in which case the value will be pulled from the local environment)

\n
\n
container_kwargs (permissive dict, optional)
\n

key-value pairs that can be passed into containers.create. See https://docker-py.readthedocs.io/en/stable/containers.html for the full list of available options.

\n
\n
networks (List[dagster.StringSource], optional)
\n

Names of the networks to which to connect the launched container at creation time

\n
\n
\n

Launches runs in a Docker container.

\n
\n\n
\n
\ndagster_docker.docker_executor ExecutorDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
image (dagster.StringSource, optional)
\n

The docker image to be used if the repository does not specify one.

\n
\n
network (dagster.StringSource, optional)
\n

Name of the network to which to connect the launched container at creation time

\n
\n
registry (strict dict, optional)
\n

Information for using a non local/public docker registry

\n
\nConfig Schema:
\n
url (dagster.StringSource)
\n

\n
username (dagster.StringSource)
\n

\n
password (dagster.StringSource)
\n

\n
\n
\n
env_vars (List[String], optional)
\n

The list of environment variable names to include in the docker container. Each can be of the form KEY=VALUE or just KEY (in which case the value will be pulled from the local environment)

\n
\n
container_kwargs (permissive dict, optional)
\n

key-value pairs that can be passed into containers.create. See https://docker-py.readthedocs.io/en/stable/containers.html for the full list of available options.

\n
\n
networks (List[dagster.StringSource], optional)
\n

Names of the networks to which to connect the launched container at creation time

\n
\n
retries (selector, optional)
\n
\nDefault Value:
{\n    "enabled": {}\n}\n
\n
\n
\nConfig Schema:
\n
enabled (strict dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
disabled (strict dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
\n

Executor which launches steps as Docker containers.

\n

To use the docker_executor, set it as the executor_def when defining a job:

\n
from dagster import job\nfrom dagster_docker import docker_executor\n\n@job(executor_def=docker_executor)\ndef docker_job():\n    pass\n
\n
\n

Then you can configure the executor with run config as follows:

\n
execution:\n  config:\n    registry: ...\n    network: ...\n    networks: ...\n    container_kwargs: ...\n
\n
\n

If you\u2019re using the DockerRunLauncher, configuration set on the containers created by the run\nlauncher will also be set on the containers that are created for each step.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-docker", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../dagster-gcp/", "title": "GCP (dagster-gcp)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-fivetran/", "title": "Fivetran (dagster-fivetran)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-gcp", "GCP (dagster-gcp)", "N", "next"], ["sections/api/apidocs/libraries/dagster-fivetran", "Fivetran (dagster-fivetran)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-docker.rst.txt", "title": "Orchestration on Docker", "toc": "\n"}, "dagster-fivetran": {"alabaster_version": "0.7.12", "body": "
\n

Fivetran (dagster-fivetran)\u00b6

\n

This library provides a Dagster integration with Fivetran.

\n
\n

Ops\u00b6

\n
\n
\ndagster_fivetran.fivetran_sync_op = <dagster.core.definitions.op_definition.OpDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
connector_id (String)
\n

The Fivetran Connector ID that this op will sync. You can retrieve this value from the \u201cSetup\u201d tab of a given connector in the Fivetran UI.

\n
\n
poll_interval (Float, optional)
\n

The time (in seconds) that will be waited between successive polls.

\n

Default Value: 10

\n
\n
poll_timeout (Union[Float, None], optional)
\n

The maximum time that will be waited before this operation is timed out. By default, this will never time out.

\n

Default Value: None

\n
\n
yield_materializations (Bool, optional)
\n

If True, materializations corresponding to the results of the Fivetran sync will be yielded when the op executes.

\n

Default Value: True

\n
\n
asset_key_prefix (List[String], optional)
\n

If provided and yield_materializations is True, these components will be used to prefix the generated asset keys.

\n

Default Value: [\u2018fivetran\u2019]

\n
\n
\n

Executes a Fivetran sync for a given connector_id, and polls until that sync\ncompletes, raising an error if it is unsuccessful. It outputs a FivetranOutput which contains\nthe details of the Fivetran connector after the sync successfully completes, as well as details\nabout which tables the sync updates.

\n

It requires the use of the fivetran_resource, which allows it to\ncommunicate with the Fivetran API.

\n

Examples:

\n
from dagster import job\nfrom dagster_fivetran import fivetran_resource, fivetran_sync_op\n\nmy_fivetran_resource = fivetran_resource.configured(\n    {\n        "api_key": {"env": "FIVETRAN_API_KEY"},\n        "api_secret": {"env": "FIVETRAN_API_SECRET"},\n    }\n)\n\nsync_foobar = fivetran_sync_op.configured({"connector_id": "foobar"}, name="sync_foobar")\n\n@job(resource_defs={"fivetran": my_fivetran_resource})\ndef my_simple_fivetran_job():\n    sync_foobar()\n\n@job(resource_defs={"fivetran": my_fivetran_resource})\ndef my_composed_fivetran_job():\n    final_foobar_state = sync_foobar(start_after=some_op())\n    other_op(final_foobar_state)\n
\n
\n
\n\n
\n
\n

Resources\u00b6

\n
\n
\ndagster_fivetran.fivetran_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
api_key (dagster.StringSource)
\n

Fivetran API Key. You can find this value on the Fivetran settings page: https://fivetran.com/account/settings

\n
\n
api_secret (dagster.StringSource)
\n

Fivetran API Secret. You can find this value on the Fivetran settings page: https://fivetran.com/account/settings

\n
\n
disable_schedule_on_trigger (Bool, optional)
\n

Specifies if you would like any connector that is sync\u2019d using this resource to be automatically taken off its Fivetran schedule.

\n

Default Value: True

\n
\n
request_max_retries (Int, optional)
\n

The maximum number of times requests to the Fivetran API should be retried before failing.

\n

Default Value: 3

\n
\n
request_retry_delay (Float, optional)
\n

Time (in seconds) to wait between each request retry.

\n

Default Value: 0.25

\n
\n
\n

This resource allows users to programmatically interface with the Fivetran REST API to launch\nsyncs and monitor their progress. This currently implements only a subset of the functionality\nexposed by the API.

\n

For a complete set of documentation on the Fivetran REST API, including expected response JSON\nschemas, see the Fivetran API Docs.

\n

To configure this resource, we recommend using the configured method.

\n

Examples:

\n
from dagster import job\nfrom dagster_fivetran import fivetran_resource\n\nmy_fivetran_resource = fivetran_resource.configured(\n    {\n        "api_key": {"env": "FIVETRAN_API_KEY"},\n        "api_secret": {"env": "FIVETRAN_API_SECRET"},\n    }\n)\n\n@job(resource_defs={"fivetran":my_fivetran_resource})\ndef my_fivetran_job():\n    ...\n
\n
\n
\n\n
\n
\nclass dagster_fivetran.FivetranResource(api_key, api_secret, disable_schedule_on_trigger=True, request_max_retries=3, request_retry_delay=0.25, log=<Logger dagster.builtin (DEBUG)>)[source]\u00b6
\n

This class exposes methods on top of the Fivetran REST API.

\n
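
A minimal sketch of using the class directly, outside of the resource machinery; the environment variable names and connector id are placeholders:

\n
import os\n\nfrom dagster_fivetran import FivetranResource\n\n# Placeholder credentials and connector id.\nfivetran = FivetranResource(\n    api_key=os.environ["FIVETRAN_API_KEY"],\n    api_secret=os.environ["FIVETRAN_API_SECRET"],\n)\n\ndetails = fivetran.get_connector_details("my_connector_id")\nsync_output = fivetran.sync_and_poll("my_connector_id", poll_interval=10)\n
\n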
\n
\nget_connector_details(connector_id)[source]\u00b6
\n

Gets details about a given connector from the Fivetran Connector API.

\n
\n
Parameters
\n

connector_id (str) \u2013 The Fivetran Connector ID. You can retrieve this value from the\n\u201cSetup\u201d tab of a given connector in the Fivetran UI.

\n
\n
Returns
\n

Parsed json data from the response to this request

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nget_connector_sync_status(connector_id)[source]\u00b6
\n

Gets details about the status of the most recent Fivetran sync operation for a given\nconnector.

\n
\n
Parameters
\n

connector_id (str) \u2013 The Fivetran Connector ID. You can retrieve this value from the\n\u201cSetup\u201d tab of a given connector in the Fivetran UI.

\n
\n
Returns
\n

Tuple representing the timestamp of the last completed sync, if it succeeded, and\nthe currently reported sync status.

\n
\n
Return type
\n

Tuple[datetime.datetime, bool, str]

\n
\n
\n
\n\n
\n
\nmake_request(method, endpoint, data=None)[source]\u00b6
\n

Creates and sends a request to the desired Fivetran Connector API endpoint.

\n
\n
Parameters
\n
    \n
  • method (str) \u2013 The http method to use for this request (e.g. \u201cPOST\u201d, \u201cGET\u201d, \u201cPATCH\u201d).

  • \n
  • endpoint (str) \u2013 The Fivetran API endpoint to send this request to.

  • \n
  • data (Optional[str]) \u2013 JSON-formatted data string to be included in the request.

  • \n
\n
\n
Returns
\n

Parsed json data from the response to this request

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\npoll_sync(connector_id, initial_last_sync_completion, poll_interval=10, poll_timeout=None)[source]\u00b6
\n

Given a Fivetran connector and the timestamp at which the previous sync completed, poll\nuntil the next sync completes.

\n

The previous sync completion time is necessary because the only way to tell when a sync\ncompletes is when this value changes.

\n
\n
Parameters
\n
    \n
  • connector_id (str) \u2013 The Fivetran Connector ID. You can retrieve this value from the\n\u201cSetup\u201d tab of a given connector in the Fivetran UI.

  • \n
  • initial_last_sync_completion (datetime.datetime) \u2013 The timestamp of the last completed sync\n(successful or otherwise) for this connector, prior to running this method.

  • \n
  • poll_interval (float) \u2013 The time (in seconds) that will be waited between successive polls.

  • \n
  • poll_timeout (float) \u2013 The maximum time that will be waited before this operation is timed\nout. By default, this will never time out.

  • \n
\n
\n
Returns
\n

Parsed json data representing the API response.

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nresync_and_poll(connector_id, resync_parameters, poll_interval=10, poll_timeout=None)[source]\u00b6
\n

Initializes a historical resync operation for the given connector, and polls until it completes.

\n
\n
Parameters
\n
    \n
  • connector_id (str) \u2013 The Fivetran Connector ID. You can retrieve this value from the\n\u201cSetup\u201d tab of a given connector in the Fivetran UI.

  • \n
  • resync_parameters (Dict[str, List[str]]) \u2013 The payload to send to the Fivetran API.\nThis should be a dictionary with schema names as the keys and a list of tables\nto resync as the values.

  • \n
  • poll_interval (float) \u2013 The time (in seconds) that will be waited between successive polls.

  • \n
  • poll_timeout (float) \u2013 The maximum time that will be waited before this operation is timed\nout. By default, this will never time out.

  • \n
\n
\n
Returns
\n

Object containing details about the connector and the tables it updates

\n
\n
Return type
\n

FivetranOutput

\n
\n
\n
\n\n
\n
\nstart_resync(connector_id, resync_parameters)[source]\u00b6
\n

Initiates a historical sync of all data for multiple schema tables within a Fivetran connector.

\n
\n
Parameters
\n
    \n
  • connector_id (str) \u2013 The Fivetran Connector ID. You can retrieve this value from the\n\u201cSetup\u201d tab of a given connector in the Fivetran UI.

  • \n
  • resync_parameters (Dict[str, List[str]]) \u2013 The resync parameters to send to the Fivetran API.\nAn example payload can be found here: https://fivetran.com/docs/rest-api/connectors#request_6

  • \n
\n
\n
Returns
\n

\n
Parsed json data representing the connector details API response after the resync is started.

\n
\n
\n

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nstart_sync(connector_id)[source]\u00b6
\n

Initiates a sync of a Fivetran connector.

\n
\n
Parameters
\n

connector_id (str) \u2013 The Fivetran Connector ID. You can retrieve this value from the\n\u201cSetup\u201d tab of a given connector in the Fivetran UI.

\n
\n
Returns
\n

\n
Parsed json data representing the connector details API response after the sync is started.

\n
\n
\n

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nsync_and_poll(connector_id, poll_interval=10, poll_timeout=None)[source]\u00b6
\n

Initializes a sync operation for the given connector, and polls until it completes.

\n
\n
Parameters
\n
    \n
  • connector_id (str) \u2013 The Fivetran Connector ID. You can retrieve this value from the\n\u201cSetup\u201d tab of a given connector in the Fivetran UI.

  • \n
  • poll_interval (float) \u2013 The time (in seconds) that will be waited between successive polls.

  • \n
  • poll_timeout (float) \u2013 The maximum time that will be waited before this operation is timed\nout. By default, this will never time out.

  • \n
\n
\n
Returns
\n

Object containing details about the connector and the tables it updates

\n
\n
Return type
\n

FivetranOutput

\n
\n
\n
\n\n
\n
\nupdate_connector(connector_id, properties=None)[source]\u00b6
\n

Updates properties of a Fivetran Connector.

\n
\n
Parameters
\n
    \n
  • connector_id (str) \u2013 The Fivetran Connector ID. You can retrieve this value from the\n\u201cSetup\u201d tab of a given connector in the Fivetran UI.

  • \n
  • properties (Dict[str, Any]) \u2013 The properties to be updated. For a comprehensive list of\nproperties, see the [Fivetran docs](https://fivetran.com/docs/rest-api/connectors#modifyaconnector).

  • \n
\n
\n
Returns
\n

Parsed json data representing the API response.

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nupdate_schedule_type(connector_id, schedule_type=None)[source]\u00b6
\n

Updates the schedule type property of the connector to either \u201cauto\u201d or \u201cmanual\u201d.

\n
\n
Parameters
\n
    \n
  • connector_id (str) \u2013 The Fivetran Connector ID. You can retrieve this value from the\n\u201cSetup\u201d tab of a given connector in the Fivetran UI.

  • \n
  • schedule_type (Optional[str]) \u2013 Either \u201cauto\u201d (to turn the schedule on) or \u201cmanual\u201d (to\nturn it off).

  • \n
\n
\n
Returns
\n

Parsed json data representing the API response.

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n\n
\n
\n

Assets\u00b6

\n
\n
\ndagster_fivetran.build_fivetran_assets(connector_id, destination_tables, poll_interval=10, poll_timeout=None, io_manager_key=None, asset_key_prefix=None)[source]\u00b6
\n

Build a set of assets for a given Fivetran connector.

\n

Returns an AssetsDefinition which connects the specified asset_keys to the computation that\nwill update them. Internally, executes a Fivetran sync for a given connector_id, and\npolls until that sync completes, raising an error if it is unsuccessful. Requires the use of the\nfivetran_resource, which allows it to communicate with the\nFivetran API.

\n
\n
Parameters
\n
    \n
  • connector_id (str) \u2013 The Fivetran Connector ID that this op will sync. You can retrieve this\nvalue from the \u201cSetup\u201d tab of a given connector in the Fivetran UI.

  • \n
  • destination_tables (List[str]) \u2013 schema_name.table_name for each table that you want to be\nrepresented in the Dagster asset graph for this connection.

  • \n
  • poll_interval (float) \u2013 The time (in seconds) that will be waited between successive polls.

  • \n
  • poll_timeout (Optional[float]) \u2013 The maximum time that will be waited before this operation is\ntimed out. By default, this will never time out.

  • \n
  • io_manager_key (Optional[str]) \u2013 The io_manager to be used to handle each of these assets.

  • \n
  • asset_key_prefix (Optional[List[str]]) \u2013 A prefix for the asset keys inside this asset.\nIf left blank, assets will have a key of AssetKey([schema_name, table_name]).

  • \n
\n
\n
\n

Examples:

\n
from dagster import AssetKey, build_assets_job\n\nfrom dagster_fivetran import fivetran_resource\nfrom dagster_fivetran.assets import build_fivetran_assets\n\nmy_fivetran_resource = fivetran_resource.configured(\n    {\n        "api_key": {"env": "FIVETRAN_API_KEY"},\n        "api_secret": {"env": "FIVETRAN_API_SECRET"},\n    }\n)\n\nfivetran_assets = build_fivetran_assets(\n    connector_id="foobar",\n    destination_tables=["schema1.table1", "schema2.table2"],\n)\n\nmy_fivetran_job = build_assets_job(\n    "my_fivetran_job",\n    assets=[fivetran_assets],\n    resource_defs={"fivetran": my_fivetran_resource}\n)\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-fivetran", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../dagster-docker/", "title": "Orchestration on Docker"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-dbt/", "title": "dbt (dagster-dbt)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-docker", "Orchestration on Docker", "N", "next"], ["sections/api/apidocs/libraries/dagster-dbt", "dbt (dagster-dbt)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-fivetran.rst.txt", "title": "Fivetran (dagster-fivetran)", "toc": "\n"}, "dagster-gcp": {"alabaster_version": "0.7.12", "body": "
\n

GCP (dagster-gcp)\u00b6

\n
\n

BigQuery\u00b6

\n
\n
\nclass dagster_gcp.BigQueryError[source]\u00b6
\n
\n\n
\n
\ndagster_gcp.bigquery_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project (dagster.StringSource, optional)
\n

Project ID for the project which the client acts on behalf of. Will be passed\nwhen creating a dataset / job. If not passed, falls back to the default inferred from the\nenvironment.

\n
\n
location (dagster.StringSource, optional)
\n

(Optional) Default location for jobs / datasets / tables.

\n
\n
\n
\n\n
\n
\ndagster_gcp.bq_create_dataset(context)[source]\u00b6
\n

BigQuery Create Dataset.

\n

This op encapsulates creating a BigQuery dataset.

\n

Expects a BQ client to be provisioned in resources as context.resources.bigquery.

\n
\n\n
\n
\ndagster_gcp.bq_delete_dataset(context)[source]\u00b6
\n

BigQuery Delete Dataset.

\n

This op encapsulates deleting a BigQuery dataset.

\n

Expects a BQ client to be provisioned in resources as context.resources.bigquery.

\n
\n\n
\n
\ndagster_gcp.bq_op_for_queries(sql_queries)[source]\u00b6
\n

Executes BigQuery SQL queries.

\n

Expects a BQ client to be provisioned in resources as context.resources.bigquery.

\n
\n\n
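
A sketch of putting the resource and ops together in a job; the project id, dataset, and query are placeholders, and the ops look for the client under the bigquery resource key as described above:

\n
from dagster import job\nfrom dagster_gcp import bigquery_resource, bq_op_for_queries\n\n# Placeholder project, dataset, and query.\ncount_rows = bq_op_for_queries(["SELECT COUNT(*) FROM `my_project.my_dataset.my_table`"])\n\n\n@job(resource_defs={"bigquery": bigquery_resource.configured({"project": "my_project"})})\ndef bigquery_job():\n    count_rows()\n
\n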
\n
\ndagster_gcp.import_df_to_bq(context, df)[source]\u00b6
\n
\n\n
\n
\ndagster_gcp.import_file_to_bq(context, path)[source]\u00b6
\n
\n\n
\n
\ndagster_gcp.import_gcs_paths_to_bq(context, paths)[source]\u00b6
\n
\n\n
\n
\n

Dataproc\u00b6

\n
\n
\ndagster_gcp.dataproc_op = <dagster.core.definitions.op_definition.OpDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
job_timeout_in_seconds (Int, optional)
\n

Optional. Maximum time in seconds to wait for the job to be\ncompleted. Default is set to 1200 seconds (20 minutes).

\n

Default Value: 1200

\n
\n
job_config (strict dict)
\n
\nConfig Schema:
\n
job (strict dict, optional)
\n

A Cloud Dataproc job resource.

\n
\nConfig Schema:
\n
status (strict dict, optional)
\n

Cloud Dataproc job status.

\n
\n
placement (strict dict, optional)
\n

Cloud Dataproc job config.

\n
\nConfig Schema:
\n
clusterName (String, optional)
\n

Required. The name of the cluster where the job will\nbe submitted.

\n
\n
\n
\n
scheduling (strict dict, optional)
\n

Job scheduling options.

\n
\nConfig Schema:
\n
maxFailuresPerHour (Int, optional)
\n

Optional. Maximum number of times per hour a driver\nmay be restarted as a result of driver terminating with non-zero\ncode before job is reported failed. A job may be reported as\nthrashing if driver exits with non-zero code 4 times within 10\nminute window. Maximum value is 10.

\n
\n
\n
\n
pigJob (strict dict, optional)
\n

A Cloud Dataproc job for running Apache Pig\n(https://pig.apache.org/) queries on YARN.

\n
\nConfig Schema:
\n
queryFileUri (String, optional)
\n

The HCFS URI of the script that contains the Pig\nqueries.

\n
\n
queryList (strict dict, optional)
\n

A list of queries to run on a cluster.

\n
\nConfig Schema:
\n
queries (List[String], optional)
\n

Required. The queries to execute. You do\nnot need to terminate a query with a semicolon. Multiple\nqueries can be specified in one string by separating\neach with a semicolon. Here is an example of a Cloud\nDataproc API snippet that uses a QueryList to specify a\nHiveJob: \u201chiveJob\u201d: { \u201cqueryList\u201d: { \u201cqueries\u201d: [\n\u201cquery1\u201d, \u201cquery2\u201d, \u201cquery3;query4\u201d, ]\n} }

\n
\n
\n
\n
jarFileUris (List[String], optional)
\n

Optional. HCFS URIs of jar files to add to the\nCLASSPATH of the Pig Client and Hadoop MapReduce (MR) tasks. Can\ncontain Pig UDFs.

\n
\n
scriptVariables (permissive dict, optional)
\n

Optional. Mapping of query variable names to values\n(equivalent to the Pig command: name=[value]).

\n
\n
loggingConfig (strict dict, optional)
\n

The runtime logging config of the job.

\n
\nConfig Schema:
\n
driverLogLevels (permissive dict, optional)
\n

The per-package log levels for the\ndriver. This may include \u201croot\u201d package name to\nconfigure rootLogger. Examples: \u2018com.google = FATAL\u2019,\n\u2018root = INFO\u2019, \u2018org.apache = DEBUG\u2019

\n
\n
\n
\n
properties (permissive dict, optional)
\n

Optional. A mapping of property names to values, used\nto configure Pig. Properties that conflict with values set by the\nCloud Dataproc API may be overwritten. Can include properties set in\n/etc/hadoop/conf/*-site.xml, /etc/pig/conf/pig.properties, and\nclasses in user code.

\n
\n
continueOnFailure (Bool, optional)
\n

Optional. Whether to continue executing queries if a\nquery fails. The default value is false. Setting to true can be\nuseful when executing independent parallel queries.

\n
\n
\n
\n
hiveJob (strict dict, optional)
\n

A Cloud Dataproc job for running Apache Hive\n(https://hive.apache.org/) queries on YARN.

\n
\nConfig Schema:
\n
continueOnFailure (Bool, optional)
\n

Optional. Whether to continue executing queries if a\nquery fails. The default value is false. Setting to true can be\nuseful when executing independent parallel queries.

\n
\n
queryFileUri (String, optional)
\n

The HCFS URI of the script that contains Hive\nqueries.

\n
\n
queryList (strict dict, optional)
\n

A list of queries to run on a cluster.

\n
\nConfig Schema:
\n
queries (List[String], optional)
\n

Required. The queries to execute. You do\nnot need to terminate a query with a semicolon. Multiple\nqueries can be specified in one string by separating\neach with a semicolon. Here is an example of a Cloud\nDataproc API snippet that uses a QueryList to specify a\nHiveJob: \u201chiveJob\u201d: { \u201cqueryList\u201d: { \u201cqueries\u201d: [\n\u201cquery1\u201d, \u201cquery2\u201d, \u201cquery3;query4\u201d, ]\n} }

\n
\n
\n
\n
jarFileUris (List[String], optional)
\n

Optional. HCFS URIs of jar files to add to the\nCLASSPATH of the Hive server and Hadoop MapReduce (MR) tasks. Can\ncontain Hive SerDes and UDFs.

\n
\n
scriptVariables (permissive dict, optional)
\n

Optional. Mapping of query variable names to values\n(equivalent to the Hive command: SET name=\u201dvalue\u201d;).

\n
\n
properties (permissive dict, optional)
\n

Optional. A mapping of property names and values,\nused to configure Hive. Properties that conflict with values set by\nthe Cloud Dataproc API may be overwritten. Can include properties\nset in /etc/hadoop/conf/*-site.xml, /etc/hive/conf/hive-site.xml,\nand classes in user code.

\n
\n
\n
\n
labels (permissive dict, optional)
\n

Optional. The labels to associate with this job. Label keys must\ncontain 1 to 63 characters, and must conform to RFC 1035\n(https://www.ietf.org/rfc/rfc1035.txt). Label values may be empty, but, if\npresent, must contain 1 to 63 characters, and must conform to RFC 1035\n(https://www.ietf.org/rfc/rfc1035.txt). No more than 32 labels can be associated\nwith a job.

\n
\n
sparkJob (strict dict, optional)
\n

A Cloud Dataproc job for running Apache Spark\n(http://spark.apache.org/) applications on YARN.

\n
\nConfig Schema:
\n
archiveUris (List[String], optional)
\n

Optional. HCFS URIs of archives to be extracted in\nthe working directory of Spark drivers and tasks. Supported file\ntypes: .jar, .tar, .tar.gz, .tgz, and .zip.

\n
\n
mainJarFileUri (String, optional)
\n

The HCFS URI of the jar file that contains the main\nclass.

\n
\n
jarFileUris (List[String], optional)
\n

Optional. HCFS URIs of jar files to add to the\nCLASSPATHs of the Spark driver and tasks.

\n
\n
loggingConfig (strict dict, optional)
\n

The runtime logging config of the job.

\n
\nConfig Schema:
\n
driverLogLevels (permissive dict, optional)
\n

The per-package log levels for the\ndriver. This may include \u201croot\u201d package name to\nconfigure rootLogger. Examples: \u2018com.google = FATAL\u2019,\n\u2018root = INFO\u2019, \u2018org.apache = DEBUG\u2019

\n
\n
\n
\n
properties (permissive dict, optional)
\n

Optional. A mapping of property names to values, used\nto configure Spark. Properties that conflict with values set by the\nCloud Dataproc API may be overwritten. Can include properties set in\n/etc/spark/conf/spark-defaults.conf and classes in user code.

\n
\n
args (List[String], optional)
\n

Optional. The arguments to pass to the driver. Do not\ninclude arguments, such as --conf, that can be set as job\nproperties, since a collision may occur that causes an incorrect job\nsubmission.

\n
\n
fileUris (List[String], optional)
\n

Optional. HCFS URIs of files to be copied to the\nworking directory of Spark drivers and distributed tasks. Useful for\nnaively parallel tasks.

\n
\n
mainClass (String, optional)
\n

The name of the driver\u2019s main class. The jar file\nthat contains the class must be in the default CLASSPATH or\nspecified in jar_file_uris.

\n
\n
\n
\n
sparkSqlJob (strict dict, optional)
\n

A Cloud Dataproc job for running Apache Spark SQL\n(http://spark.apache.org/sql/) queries.

\n
\nConfig Schema:
\n
queryList (strict dict, optional)
\n

A list of queries to run on a cluster.

\n
\nConfig Schema:
\n
queries (List[String], optional)
\n

Required. The queries to execute. You do\nnot need to terminate a query with a semicolon. Multiple\nqueries can be specified in one string by separating\neach with a semicolon. Here is an example of a Cloud\nDataproc API snippet that uses a QueryList to specify a\nHiveJob: \u201chiveJob\u201d: { \u201cqueryList\u201d: { \u201cqueries\u201d: [\n\u201cquery1\u201d, \u201cquery2\u201d, \u201cquery3;query4\u201d, ]\n} }

\n
\n
\n
\n
queryFileUri (String, optional)
\n

The HCFS URI of the script that contains SQL\nqueries.

\n
\n
scriptVariables (permissive dict, optional)
\n

Optional. Mapping of query variable names to values\n(equivalent to the Spark SQL command: SET name=\u201dvalue\u201d;).

\n
\n
jarFileUris (List[String], optional)
\n

Optional. HCFS URIs of jar files to be added to the\nSpark CLASSPATH.

\n
\n
loggingConfig (strict dict, optional)
\n

The runtime logging config of the job.

\n
\nConfig Schema:
\n
driverLogLevels (permissive dict, optional)
\n

The per-package log levels for the\ndriver. This may include \u201croot\u201d package name to\nconfigure rootLogger. Examples: \u2018com.google = FATAL\u2019,\n\u2018root = INFO\u2019, \u2018org.apache = DEBUG\u2019

\n
\n
\n
\n
properties (permissive dict, optional)
\n

Optional. A mapping of property names to values, used\nto configure Spark SQL\u2019s SparkConf. Properties that conflict with\nvalues set by the Cloud Dataproc API may be overwritten.

\n
\n
\n
\n
pysparkJob (strict dict, optional)
\n

A Cloud Dataproc job for running Apache PySpark\n(https://spark.apache.org/docs/0.9.0/python-programming-guide.html) applications\non YARN.

\n
\nConfig Schema:
\n
jarFileUris (List[String], optional)
\n

Optional. HCFS URIs of jar files to add to the\nCLASSPATHs of the Python driver and tasks.

\n
\n
loggingConfig (strict dict, optional)
\n

The runtime logging config of the job.

\n
\nConfig Schema:
\n
driverLogLevels (permissive dict, optional)
\n

The per-package log levels for the\ndriver. This may include \u201croot\u201d package name to\nconfigure rootLogger. Examples: \u2018com.google = FATAL\u2019,\n\u2018root = INFO\u2019, \u2018org.apache = DEBUG\u2019

\n
\n
\n
\n
properties (permissive dict, optional)
\n

Optional. A mapping of property names to values, used\nto configure PySpark. Properties that conflict with values set by\nthe Cloud Dataproc API may be overwritten. Can include properties\nset in /etc/spark/conf/spark-defaults.conf and classes in user\ncode.

\n
\n
args (List[String], optional)
\n

Optional. The arguments to pass to the driver. Do not\ninclude arguments, such as --conf, that can be set as job\nproperties, since a collision may occur that causes an incorrect job\nsubmission.

\n
\n
fileUris (List[String], optional)
\n

Optional. HCFS URIs of files to be copied to the\nworking directory of Python drivers and distributed tasks. Useful\nfor naively parallel tasks.

\n
\n
pythonFileUris (List[String], optional)
\n

Optional. HCFS file URIs of Python files to pass to\nthe PySpark framework. Supported file types: .py, .egg, and\n.zip.

\n
\n
mainPythonFileUri (String, optional)
\n

Required. The HCFS URI of the main Python file to use\nas the driver. Must be a .py file.

\n
\n
archiveUris (List[String], optional)
\n

Optional. HCFS URIs of archives to be extracted in\nthe working directory of Python drivers and distributed tasks. Supported file\ntypes: .jar, .tar, .tar.gz, .tgz, and .zip.

\n
\n
\n
\n
reference (strict dict, optional)
\n

Encapsulates the full scoping used to reference a job.

\n
\nConfig Schema:
\n
projectId (String, optional)
\n

Required. The ID of the Google Cloud Platform project\nthat the job belongs to.

\n
\n
jobId (String, optional)
\n

Optional. The job ID, which must be unique within the\nproject. The ID must contain only letters (a-z, A-Z), numbers (0-9),\nunderscores (_), or hyphens (-). The maximum length is 100\ncharacters. If not specified by the caller, the job ID will be\nprovided by the server.

\n
\n
\n
\n
hadoopJob (strict dict, optional)
\n

A Cloud Dataproc job for running Apache Hadoop MapReduce\n(https://hadoop.apache.org/docs/current/hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html)\njobs on Apache Hadoop YARN\n(https://hadoop.apache.org/docs/r2.7.1/hadoop-yarn/hadoop-yarn-site/YARN.html).

\n
\nConfig Schema:
\n
jarFileUris (List[String], optional)
\n

Optional. Jar file URIs to add to the CLASSPATHs of\nthe Hadoop driver and tasks.

\n
\n
loggingConfig (strict dict, optional)
\n

The runtime logging config of the job.

\n
\nConfig Schema:
\n
driverLogLevels (permissive dict, optional)
\n

The per-package log levels for the\ndriver. This may include \u201croot\u201d package name to\nconfigure rootLogger. Examples: \u2018com.google = FATAL\u2019,\n\u2018root = INFO\u2019, \u2018org.apache = DEBUG\u2019

\n
\n
\n
\n
properties (permissive dict, optional)
\n

Optional. A mapping of property names to values, used\nto configure Hadoop. Properties that conflict with values set by the\nCloud Dataproc API may be overwritten. Can include properties set in\n/etc/hadoop/conf/*-site and classes in user code.

\n
\n
args (List[String], optional)
\n

Optional. The arguments to pass to the driver. Do not\ninclude arguments, such as -libjars or -Dfoo=bar, that can be set as\njob properties, since a collision may occur that causes an incorrect\njob submission.

\n
\n
fileUris (List[String], optional)
\n

Optional. HCFS (Hadoop Compatible Filesystem) URIs of\nfiles to be copied to the working directory of Hadoop drivers and\ndistributed tasks. Useful for naively parallel tasks.

\n
\n
mainClass (String, optional)
\n

The name of the driver\u2019s main class. The jar file\ncontaining the class must be in the default CLASSPATH or specified\nin jar_file_uris.

\n
\n
archiveUris (List[String], optional)
\n

Optional. HCFS URIs of archives to be extracted in\nthe working directory of Hadoop drivers and tasks. Supported file\ntypes: .jar, .tar, .tar.gz, .tgz, or .zip.

\n
\n
mainJarFileUri (String, optional)
\n

The HCFS URI of the jar file containing the main\nclass. Examples:\n\u2018gs://foo-bucket/analytics-binaries/extract-useful-metrics-mr.jar\u2019\n\u2018hdfs:/tmp/test-samples/custom-wordcount.jar\u2019\n\u2018file:///home/usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar\u2019

\n
\n
\n
\n
\n
\n
projectId (dagster.StringSource)
\n

Required. Project ID for the project which the client acts on behalf of. Will\nbe passed when creating a dataset / job. If not passed, falls back to the default inferred\nfrom the environment.

\n
\n
region (dagster.StringSource)
\n

\n
\n
\n
job_scoped_cluster (Bool, optional)
\n

Whether to create a cluster or use an existing cluster.

\n

Default Value: True

\n
\n
\n
\n\n
\n
\ndagster_gcp.dataproc_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
projectId (dagster.StringSource)
\n

Required. Project ID for the project which the client acts on behalf of. Will\nbe passed when creating a dataset / job. If not passed, falls back to the default inferred\nfrom the environment.

\n
\n
region (dagster.StringSource)
\n

\n
clusterName (dagster.StringSource)
\n

Required. The cluster name. Cluster names within a project must be unique.\nNames of deleted clusters can be reused.

\n
\n
cluster_config (strict dict, optional)
\n

The cluster config.

\n
\nConfig Schema:
\n
masterConfig (strict dict, optional)
\n

Optional. The config settings for Compute Engine resources in an\ninstance group, such as a master or worker group.

\n
\nConfig Schema:
\n
accelerators (List[strict dict], optional)
\n

Optional. The Compute Engine accelerator\nconfiguration for these instances.Beta Feature: This feature is\nstill under development. It may be changed before final release.

\n
\n
numInstances (Int, optional)
\n

Optional. The number of VM instances in the instance\ngroup. For master instance groups, must be set to 1.

\n
\n
diskConfig (strict dict, optional)
\n

Specifies the config of disk options for a group of\nVM instances.

\n
\nConfig Schema:
\n
numLocalSsds (Int, optional)
\n

Optional. Number of attached SSDs, from 0\nto 4 (default is 0). If SSDs are not attached, the boot\ndisk is used to store runtime logs and HDFS\n(https://hadoop.apache.org/docs/r1.2.1/hdfs_user_guide.html)\ndata. If one or more SSDs are attached, this runtime\nbulk data is spread across them, and the boot disk\ncontains only basic config and installed binaries.

\n
\n
bootDiskSizeGb (Int, optional)
\n

Optional. Size in GB of the boot disk\n(default is 500GB).

\n
\n
bootDiskType (String, optional)
\n

Optional. Type of the boot disk (default\nis \u201cpd-standard\u201d). Valid values: \u201cpd-ssd\u201d (Persistent\nDisk Solid State Drive) or \u201cpd-standard\u201d (Persistent\nDisk Hard Disk Drive).

\n
\n
\n
\n
managedGroupConfig (strict dict, optional)
\n

Specifies the resources used to actively manage an\ninstance group.

\n
\n
isPreemptible (Bool, optional)
\n

Optional. Specifies that this instance group contains\npreemptible instances.

\n
\n
imageUri (String, optional)
\n

Optional. The Compute Engine image resource used for\ncluster instances. It can be specified or may be inferred from\nSoftwareConfig.image_version.

\n
\n
machineTypeUri (String, optional)
\n

Optional. The Compute Engine machine type used for\ncluster instances.A full URL, partial URI, or short name are valid.\nExamples:\nhttps://www.googleapis.com/compute/v1/projects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2\nprojects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2\nn1-standard-2Auto Zone Exception: If you are using the Cloud\nDataproc Auto Zone Placement feature, you must use the short name of\nthe machine type resource, for example, n1-standard-2.

\n
\n
\n
\n
secondaryWorkerConfig (strict dict, optional)
\n

Optional. The config settings for Compute Engine resources in an\ninstance group, such as a master or worker group.

\n
\nConfig Schema:
\n
accelerators (List[strict dict], optional)
\n

Optional. The Compute Engine accelerator\nconfiguration for these instances.Beta Feature: This feature is\nstill under development. It may be changed before final release.

\n
\n
numInstances (Int, optional)
\n

Optional. The number of VM instances in the instance\ngroup. For master instance groups, must be set to 1.

\n
\n
diskConfig (strict dict, optional)
\n

Specifies the config of disk options for a group of\nVM instances.

\n
\nConfig Schema:
\n
numLocalSsds (Int, optional)
\n

Optional. Number of attached SSDs, from 0\nto 4 (default is 0). If SSDs are not attached, the boot\ndisk is used to store runtime logs and HDFS\n(https://hadoop.apache.org/docs/r1.2.1/hdfs_user_guide.html)\ndata. If one or more SSDs are attached, this runtime\nbulk data is spread across them, and the boot disk\ncontains only basic config and installed binaries.

\n
\n
bootDiskSizeGb (Int, optional)
\n

Optional. Size in GB of the boot disk\n(default is 500GB).

\n
\n
bootDiskType (String, optional)
\n

Optional. Type of the boot disk (default\nis \u201cpd-standard\u201d). Valid values: \u201cpd-ssd\u201d (Persistent\nDisk Solid State Drive) or \u201cpd-standard\u201d (Persistent\nDisk Hard Disk Drive).

\n
\n
\n
\n
managedGroupConfig (strict dict, optional)
\n

Specifies the resources used to actively manage an\ninstance group.

\n
\n
isPreemptible (Bool, optional)
\n

Optional. Specifies that this instance group contains\npreemptible instances.

\n
\n
imageUri (String, optional)
\n

Optional. The Compute Engine image resource used for\ncluster instances. It can be specified or may be inferred from\nSoftwareConfig.image_version.

\n
\n
machineTypeUri (String, optional)
\n

Optional. The Compute Engine machine type used for\ncluster instances.A full URL, partial URI, or short name are valid.\nExamples:\nhttps://www.googleapis.com/compute/v1/projects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2\nprojects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2\nn1-standard-2Auto Zone Exception: If you are using the Cloud\nDataproc Auto Zone Placement feature, you must use the short name of\nthe machine type resource, for example, n1-standard-2.

\n
\n
\n
\n
encryptionConfig (strict dict, optional)
\n

Encryption settings for the cluster.

\n
\nConfig Schema:
\n
gcePdKmsKeyName (String, optional)
\n

Optional. The Cloud KMS key name to use for PD disk\nencryption for all instances in the cluster.

\n
\n
\n
\n
securityConfig (strict dict, optional)
\n

Security related configuration, including Kerberos.

\n
\nConfig Schema:
\n
kerberosConfig (strict dict, optional)
\n

Specifies Kerberos related configuration.

\n
\nConfig Schema:
\n
truststorePasswordUri (String, optional)
\n

Optional. The Cloud Storage URI of a KMS\nencrypted file containing the password to the user\nprovided truststore. For the self-signed certificate,\nthis password is generated by Dataproc.

\n
\n
enableKerberos (Bool, optional)
\n

Optional. Flag to indicate whether to\nKerberize the cluster.

\n
\n
truststoreUri (String, optional)
\n

Optional. The Cloud Storage URI of the\ntruststore file used for SSL encryption. If not\nprovided, Dataproc will provide a self-signed\ncertificate.

\n
\n
crossRealmTrustRealm (String, optional)
\n

Optional. The remote realm the Dataproc\non-cluster KDC will trust, should the user enable cross\nrealm trust.

\n
\n
rootPrincipalPasswordUri (String, optional)
\n

Required. The Cloud Storage URI of a KMS\nencrypted file containing the root principal\npassword.

\n
\n
kmsKeyUri (String, optional)
\n

Required. The uri of the KMS key used to\nencrypt various sensitive files.

\n
\n
crossRealmTrustKdc (String, optional)
\n

Optional. The KDC (IP or hostname) for\nthe remote trusted realm in a cross realm trust\nrelationship.

\n
\n
crossRealmTrustSharedPasswordUri (String, optional)
\n

Optional. The Cloud Storage URI of a KMS\nencrypted file containing the shared password between\nthe on-cluster Kerberos realm and the remote trusted\nrealm, in a cross realm trust relationship.

\n
\n
tgtLifetimeHours (Int, optional)
\n

Optional. The lifetime of the ticket\ngranting ticket, in hours. If not specified, or user\nspecifies 0, then default value 10 will be used.

\n
\n
keystoreUri (String, optional)
\n

Optional. The Cloud Storage URI of the\nkeystore file used for SSL encryption. If not provided,\nDataproc will provide a self-signed certificate.

\n
\n
keyPasswordUri (String, optional)
\n

Optional. The Cloud Storage URI of a KMS\nencrypted file containing the password to the user\nprovided key. For the self-signed certificate, this\npassword is generated by Dataproc.

\n
\n
keystorePasswordUri (String, optional)
\n

Optional. The Cloud Storage URI of a KMS\nencrypted file containing the password to the user\nprovided keystore. For the self-signed certificate, this\npassword is generated by Dataproc.

\n
\n
crossRealmTrustAdminServer (String, optional)
\n

Optional. The admin server (IP or\nhostname) for the remote trusted realm in a cross realm\ntrust relationship.

\n
\n
kdcDbKeyUri (String, optional)
\n

Optional. The Cloud Storage URI of a KMS\nencrypted file containing the master key of the KDC\ndatabase.

\n
\n
\n
\n
\n
\n
initializationActions (List[strict dict], optional)
\n

Optional. Commands to execute on each node after config is\ncompleted. By default, executables are run on master and all worker nodes. You\ncan test a node\u2019s role metadata to run an executable on a master or worker\nnode, as shown below using curl (you can also use wget): ROLE=$(curl -H\nMetadata-Flavor:Google\nhttp://metadata/computeMetadata/v1/instance/attributes/dataproc-role) if [[\n\u201c${ROLE}\u201d == \u2018Master\u2019 ]]; then \u2026 master specific actions \u2026 else \u2026\nworker specific actions \u2026 fi

\n
\n
configBucket (String, optional)
\n

Optional. A Google Cloud Storage bucket used to stage job\ndependencies, config files, and job driver console output. If you do not specify\na staging bucket, Cloud Dataproc will determine a Cloud Storage location (US,\nASIA, or EU) for your cluster\u2019s staging bucket according to the Google Compute\nEngine zone where your cluster is deployed, and then create and manage this\nproject-level, per-location bucket (see Cloud Dataproc staging bucket).

\n
\n
workerConfig (strict dict, optional)
\n

Optional. The config settings for Compute Engine resources in an\ninstance group, such as a master or worker group.

\n
\nConfig Schema:
\n
accelerators (List[strict dict], optional)
\n

Optional. The Compute Engine accelerator\nconfiguration for these instances.Beta Feature: This feature is\nstill under development. It may be changed before final release.

\n
\n
numInstances (Int, optional)
\n

Optional. The number of VM instances in the instance\ngroup. For master instance groups, must be set to 1.

\n
\n
diskConfig (strict dict, optional)
\n

Specifies the config of disk options for a group of\nVM instances.

\n
\nConfig Schema:
\n
numLocalSsds (Int, optional)
\n

Optional. Number of attached SSDs, from 0\nto 4 (default is 0). If SSDs are not attached, the boot\ndisk is used to store runtime logs and HDFS\n(https://hadoop.apache.org/docs/r1.2.1/hdfs_user_guide.html)\ndata. If one or more SSDs are attached, this runtime\nbulk data is spread across them, and the boot disk\ncontains only basic config and installed binaries.

\n
\n
bootDiskSizeGb (Int, optional)
\n

Optional. Size in GB of the boot disk\n(default is 500GB).

\n
\n
bootDiskType (String, optional)
\n

Optional. Type of the boot disk (default\nis \u201cpd-standard\u201d). Valid values: \u201cpd-ssd\u201d (Persistent\nDisk Solid State Drive) or \u201cpd-standard\u201d (Persistent\nDisk Hard Disk Drive).

\n
\n
\n
\n
managedGroupConfig (strict dict, optional)
\n

Specifies the resources used to actively manage an\ninstance group.

\n
\n
isPreemptible (Bool, optional)
\n

Optional. Specifies that this instance group contains\npreemptible instances.

\n
\n
imageUri (String, optional)
\n

Optional. The Compute Engine image resource used for\ncluster instances. It can be specified or may be inferred from\nSoftwareConfig.image_version.

\n
\n
machineTypeUri (String, optional)
\n

Optional. The Compute Engine machine type used for\ncluster instances.A full URL, partial URI, or short name are valid.\nExamples:\nhttps://www.googleapis.com/compute/v1/projects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2\nprojects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2\nn1-standard-2Auto Zone Exception: If you are using the Cloud\nDataproc Auto Zone Placement feature, you must use the short name of\nthe machine type resource, for example, n1-standard-2.

\n
\n
\n
\n
gceClusterConfig (strict dict, optional)
\n

Common config settings for resources of Compute Engine cluster\ninstances, applicable to all instances in the cluster.

\n
\nConfig Schema:
\n
networkUri (String, optional)
\n

Optional. The Compute Engine network to be used for\nmachine communications. Cannot be specified with subnetwork_uri. If\nneither network_uri nor subnetwork_uri is specified, the \u201cdefault\u201d\nnetwork of the project is used, if it exists. Cannot be a \u201cCustom\nSubnet Network\u201d (see Using Subnetworks for more information).A full\nURL, partial URI, or short name are valid. Examples:\nhttps://www.googleapis.com/compute/v1/projects/[project_id]/regions/global/default\nprojects/[project_id]/regions/global/default default

\n
\n
zoneUri (String, optional)
\n

Optional. The zone where the Compute Engine cluster\nwill be located. On a create request, it is required in the \u201cglobal\u201d\nregion. If omitted in a non-global Cloud Dataproc region, the\nservice will pick a zone in the corresponding Compute Engine region.\nOn a get request, zone will always be present.A full URL, partial\nURI, or short name are valid. Examples:\nhttps://www.googleapis.com/compute/v1/projects/[project_id]/zones/[zone]\nprojects/[project_id]/zones/[zone] us-central1-f

\n
\n
metadata (permissive dict, optional)
\n

The Compute Engine metadata entries to add to all\ninstances (see Project and instance metadata\n(https://cloud.google.com/compute/docs/storing-retrieving-metadata#project_and_instance_metadata)).

\n
\n
internalIpOnly (Bool, optional)
\n

Optional. If true, all instances in the cluster will\nonly have internal IP addresses. By default, clusters are not\nrestricted to internal IP addresses, and will have ephemeral\nexternal IP addresses assigned to each instance. This\ninternal_ip_only restriction can only be enabled for subnetwork\nenabled networks, and all off-cluster dependencies must be\nconfigured to be accessible without external IP addresses.

\n
\n
serviceAccountScopes (List[String], optional)
\n

Optional. The URIs of service account scopes to be\nincluded in Compute Engine instances. The following base set of\nscopes is always included:\nhttps://www.googleapis.com/auth/cloud.useraccounts.readonly\nhttps://www.googleapis.com/auth/devstorage.read_write\nhttps://www.googleapis.com/auth/logging.writeIf no scopes are\nspecified, the following defaults are also provided:\nhttps://www.googleapis.com/auth/bigquery\nhttps://www.googleapis.com/auth/bigtable.admin.table\nhttps://www.googleapis.com/auth/bigtable.data\nhttps://www.googleapis.com/auth/devstorage.full_control

\n
\n
tags (List[String], optional)
\n

The Compute Engine tags to add to all instances (see\nTagging instances).

\n
\n
serviceAccount (String, optional)
\n

Optional. The service account of the instances.\nDefaults to the default Compute Engine service account. Custom\nservice accounts need permissions equivalent to the following IAM\nroles: roles/logging.logWriter roles/storage.objectAdmin(see\nhttps://cloud.google.com/compute/docs/access/service-accounts#custom_service_accounts\nfor more information). Example:\n[account_id]@[project_id].iam.gserviceaccount.com

\n
\n
subnetworkUri (String, optional)
\n

Optional. The Compute Engine subnetwork to be used\nfor machine communications. Cannot be specified with network_uri.A\nfull URL, partial URI, or short name are valid. Examples:\nhttps://www.googleapis.com/compute/v1/projects/[project_id]/regions/us-east1/subnetworks/sub0\nprojects/[project_id]/regions/us-east1/subnetworks/sub0 sub0

\n
\n
\n
\n
softwareConfig (strict dict, optional)
\n

Specifies the selection and config of software inside the\ncluster.

\n
\nConfig Schema:
\n
properties (permissive dict, optional)
\n

Optional. The properties to set on daemon config\nfiles.Property keys are specified in prefix:property format, for\nexample core:hadoop.tmp.dir. The following are supported prefixes\nand their mappings: capacity-scheduler: capacity-scheduler.xml core:\ncore-site.xml distcp: distcp-default.xml hdfs: hdfs-site.xml hive:\nhive-site.xml mapred: mapred-site.xml pig: pig.properties spark:\nspark-defaults.conf yarn: yarn-site.xmlFor more information, see\nCluster properties.

\n
\n
optionalComponents (List[Component], optional)
\n

The set of optional components to activate on the\ncluster.

\n
\n
imageVersion (String, optional)
\n

Optional. The version of software inside the cluster.\nIt must be one of the supported Cloud Dataproc Versions, such as\n\u201c1.2\u201d (including a subminor version, such as \u201c1.2.29\u201d), or the\n\u201cpreview\u201d version. If unspecified, it defaults to the latest Debian\nversion.

\n
\n
\n
\n
\n
\n
\n
\n\n
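
A minimal, non-authoritative sketch of attaching and configuring the dataproc_resource; the project, region, and cluster names below are hypothetical placeholders that follow the schema above, and actually executing this requires GCP credentials and network access.

\n
from dagster import job, op\nfrom dagster_gcp import dataproc_resource\n\n\n@op(required_resource_keys={'dataproc'})\ndef example_dataproc_op(context):\n    # The 'dataproc' resource is a client scoped to the configured cluster.\n    context.log.info('Dataproc resource configured')\n\n\n@job(resource_defs={'dataproc': dataproc_resource})\ndef example_dataproc_job():\n    example_dataproc_op()\n\n\n# Hypothetical values that follow the schema above; cluster_config is optional\n# and omitted here.\nexample_dataproc_job.execute_in_process(\n    run_config={\n        'resources': {\n            'dataproc': {\n                'config': {\n                    'projectId': 'my-project',\n                    'region': 'us-west1',\n                    'clusterName': 'my-cluster',\n                }\n            }\n        }\n    }\n)\n
\n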
\n
\n

GCS\u00b6

\n
\n
\ndagster_gcp.gcs_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project (Union[dagster.StringSource, None], optional)
\n

Project name

\n
\n
\n
\n\n
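
As a rough sketch (not canonical usage), the gcs_resource can be attached to a job and used from an op via context.resources.gcs, assuming the resource exposes a google.cloud.storage client; the bucket name is hypothetical.

\n
from dagster import job, op\nfrom dagster_gcp import gcs_resource\n\n\n@op(required_resource_keys={'gcs'})\ndef list_bucket(context):\n    # Assumes the resource exposes a google.cloud.storage client.\n    for blob in context.resources.gcs.list_blobs('my-bucket'):  # hypothetical bucket\n        context.log.info(blob.name)\n\n\n@job(resource_defs={'gcs': gcs_resource})\ndef gcs_job():\n    list_bucket()\n
\n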
\n
\nclass dagster_gcp.GCSFileHandle(gcs_bucket, gcs_key)[source]\u00b6
\n

A reference to a file on GCS.

\n
\n
\nproperty gcs_bucket\u00b6
\n

The name of the GCS bucket.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty gcs_key\u00b6
\n

The GCS key.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty gcs_path\u00b6
\n

The file\u2019s GCS URL.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty path_desc\u00b6
\n

The file\u2019s GCS URL.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n\n
\n
\ndagster_gcp.gcs_file_manager ResourceDefinition[source]\u00b6
\n

FileManager that provides abstract access to GCS.

\n

Implements the FileManager API.

\n
\n\n
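
A minimal sketch of using this file manager from an op, assuming it is attached under a file_manager resource key and that the standard FileManager write_data method is used; configuration of the target bucket is omitted here.

\n
from dagster import job, op\nfrom dagster_gcp import gcs_file_manager\n\n\n@op(required_resource_keys={'file_manager'})\ndef store_bytes(context):\n    # write_data uploads the bytes and returns a GCSFileHandle.\n    handle = context.resources.file_manager.write_data(b'hello, GCS')\n    context.log.info('wrote %s' % handle.gcs_path)\n\n\n@job(resource_defs={'file_manager': gcs_file_manager})\ndef file_manager_job():\n    store_bytes()\n
\n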
\n
\ndagster_gcp.gcs.gcs_pickle_io_manager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
gcs_bucket (dagster.StringSource)
\n

\n
gcs_prefix (dagster.StringSource, optional)
\n

Default Value: \u2018dagster\u2019

\n
\n
\n

Persistent IO manager using GCS for storage.

\n

Serializes objects via pickling. Suitable for object storage for distributed executors, so long\nas each execution node has network connectivity and credentials for GCS and the backing bucket.

\n

Attach this resource definition to your job to make it available to your ops.

\n
@job(resource_defs={'io_manager': gcs_pickle_io_manager, 'gcs': gcs_resource, ...})\ndef my_job():\n    my_op()\n
\n
\n

You may configure this storage as follows:

\n
resources:\n    io_manager:\n        config:\n            gcs_bucket: my-cool-bucket\n            gcs_prefix: good/prefix-for-files-\n
\n
\n
\n\n
\n
\ndagster_gcp.gcs.gcs_pickle_asset_io_manager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
gcs_bucket (dagster.StringSource)
\n

\n
gcs_prefix (dagster.StringSource, optional)
\n

Default Value: \u2018dagster\u2019

\n
\n
\n

Persistent IO manager using GCS for storage, meant for use with software-defined assets.

\n

Each asset is assigned to a single filesystem path, so subsequent materializations of an asset\nwill overwrite previous materializations of that asset.

\n

Serializes objects via pickling. Suitable for object storage for distributed executors, so long\nas each execution node has network connectivity and credentials for GCS and the backing bucket.

\n

Attach this resource definition to your job to make it available to your ops.

\n
asset_group = AssetGroup(\n    assets=...,\n    resource_defs={'io_manager': gcs_pickle_asset_io_manager, "gcs": gcs_resource, ...},\n)\n
\n
\n

You may configure this IO manager as follows:

\n
resources:\n    io_manager:\n        config:\n            gcs_bucket: my-cool-bucket\n            gcs_prefix: good/prefix-for-files-\n
\n
\n
\n\n
\n
\n

Legacy APIs\u00b6

\n
\n
\ndagster_gcp.bq_solid_for_queries(sql_queries)[source]\u00b6
\n

Executes BigQuery SQL queries.

\n

Expects a BQ client to be provisioned in resources as context.resources.bigquery.

\n
\n\n
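
A hedged sketch of one way to wire the generated solid into a legacy pipeline, assuming dagster_gcp's bigquery_resource is used to provide the context.resources.bigquery client mentioned above; the query is a placeholder.

\n
from dagster import ModeDefinition, pipeline\nfrom dagster_gcp import bigquery_resource, bq_solid_for_queries\n\n# Placeholder query; the factory returns a solid that runs each query in order.\nselect_one = bq_solid_for_queries(['SELECT 1']).alias('select_one')\n\n\n@pipeline(mode_defs=[ModeDefinition(resource_defs={'bigquery': bigquery_resource})])\ndef bq_pipeline():\n    select_one()\n
\n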
\n
\ndagster_gcp.dataproc_solid(context)[source]\u00b6
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-gcp", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../dagster-ge/", "title": "Great Expectations (dagster-ge)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-docker/", "title": "Orchestration on Docker"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-ge", "Great Expectations (dagster-ge)", "N", "next"], ["sections/api/apidocs/libraries/dagster-docker", "Orchestration on Docker", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-gcp.rst.txt", "title": "GCP (dagster-gcp)", "toc": "\n"}, "dagster-ge": {"alabaster_version": "0.7.12", "body": "
\n

Great Expectations (dagster-ge)\u00b6

\n
\n
\ndagster_ge.ge_validation_solid_factory(name, datasource_name, suite_name, validation_operator_name=None, input_dagster_type=<dagster.core.types.dagster_type.DagsterType object>, batch_kwargs=None)[source]\u00b6
\n

Generates solids for interacting with GE.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 the name of the solid

  • \n
  • datasource_name (str) \u2013 the name of your DataSource, see your great_expectations.yml

  • \n
  • suite_name (str) \u2013 the name of your expectation suite, see your great_expectations.yml

  • \n
  • validation_operator_name (Optional[str]) \u2013 what validation operator to run \u2013 defaults to None,\nwhich generates an ephemeral validator.\nIf you want to save data docs, use \u2018action_list_operator\u2019.\nSee https://docs.greatexpectations.io/en/latest/reference/core_concepts/validation_operators_and_actions.html

  • \n
  • input_dagster_type (DagsterType) \u2013 the Dagster type used to type check the input to the\nsolid. Defaults to dagster_pandas.DataFrame.

  • \n
  • batch_kwargs (Optional[dict]) \u2013 overrides the batch_kwargs parameter when calling the\nge_data_context\u2019s get_batch method. Defaults to {\u201cdataset\u201d: dataset},\nwhere dataset is the input to the generated solid.

  • \n
\n
\n
Returns
\n

A solid that takes in a set of data and yields both an expectation with relevant metadata\nand an output with all the metadata (for user processing)

\n
\n
\n
\n\n
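
A brief, non-authoritative sketch of calling the factory; the datasource and suite names are hypothetical and should match your great_expectations.yml.

\n
from dagster_ge import ge_validation_solid_factory\n\n# Hypothetical names; use the datasource and suite defined in your\n# great_expectations.yml.\nge_validation_solid = ge_validation_solid_factory(\n    name='validate_dataframe',\n    datasource_name='my_datasource',\n    suite_name='my_expectation_suite',\n)\n
\n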
\n
\ndagster_ge.ge_validation_op_factory(name, datasource_name, suite_name, validation_operator_name=None, input_dagster_type=<dagster.core.types.dagster_type.DagsterType object>, batch_kwargs=None)[source]\u00b6
\n

Generates ops for interacting with GE.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 the name of the op

  • \n
  • datasource_name (str) \u2013 the name of your DataSource, see your great_expectations.yml

  • \n
  • suite_name (str) \u2013 the name of your expectation suite, see your great_expectations.yml

  • \n
  • validation_operator_name (Optional[str]) \u2013 what validation operator to run \u2013 defaults to\nNone, which generates an ephemeral validator. If you want to save data docs, use\n\u2018action_list_operator\u2019.\nSee https://docs.greatexpectations.io/en/latest/reference/core_concepts/validation_operators_and_actions.html

  • \n
  • input_dagster_type (DagsterType) \u2013 the Dagster type used to type check the input to the op.\nDefaults to dagster_pandas.DataFrame.

  • \n
  • batch_kwargs (Optional[dict]) \u2013 overrides the batch_kwargs parameter when calling the\nge_data_context\u2019s get_batch method. Defaults to {\u201cdataset\u201d: dataset}, where\ndataset is the input to the generated op.

  • \n
\n
\n
Returns
\n

An op that takes in a set of data and yields both an expectation with relevant metadata\nand an output with all the metadata (for user processing)

\n
\n
\n
\n\n
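
A hedged sketch of wiring the generated op into a job; the upstream op and the datasource and suite names are hypothetical, and it assumes the Great Expectations data context resource (dagster_ge.ge_data_context) is what the generated op expects to find in resources.

\n
import pandas as pd\n\nfrom dagster import job, op\nfrom dagster_ge import ge_data_context, ge_validation_op_factory\n\nvalidate_frame = ge_validation_op_factory(\n    name='validate_frame',\n    datasource_name='my_datasource',  # hypothetical\n    suite_name='my_expectation_suite',  # hypothetical\n)\n\n\n@op\ndef load_frame():\n    return pd.DataFrame({'x': [1, 2, 3]})\n\n\n# The ge_data_context resource still needs to be configured with your\n# Great Expectations root directory before this job can actually run.\n@job(resource_defs={'ge_data_context': ge_data_context})\ndef validation_job():\n    validate_frame(load_frame())\n
\n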
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-ge", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "\n", "next": {"link": "../dagster-github/", "title": "GitHub (dagster-github)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-gcp/", "title": "GCP (dagster-gcp)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-github", "GitHub (dagster-github)", "N", "next"], ["sections/api/apidocs/libraries/dagster-gcp", "GCP (dagster-gcp)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-ge.rst.txt", "title": "Great Expectations (dagster-ge)", "toc": "\n"}, "dagster-github": {"alabaster_version": "0.7.12", "body": "
\n

GitHub (dagster-github)\u00b6

\n

This library provides an integration with GitHub Apps, to support performing various automation\noperations within your GitHub repositories with the tighter permission scopes that GitHub Apps\nallow compared to using a personal token.

\n

Presently, it provides a thin wrapper on the GitHub v4 GraphQL API.

\n

To use this integration, you\u2019ll first need to create a GitHub App for it.

\n
    \n
1. Create App: Follow the instructions in\nhttps://developer.github.com/apps/quickstart-guides/setting-up-your-development-environment/. You will end up with a private key and App ID, which will be used when configuring the\ndagster-github resource. Note that you will need to grant your app the relevant permissions\nfor the API requests you want to make; for example, to post issues it will need read/write access\nfor the issues repository permission. More info on GitHub application permissions can be found\nhere.

  2. \n
  3. Install App: Follow the instructions in\nhttps://developer.github.com/apps/quickstart-guides/setting-up-your-development-environment/#step-7-install-the-app-on-your-account

  4. \n
3. Find your installation_id: You can pull this from the GitHub app administration page,\nhttps://github.com/apps/<app-name>/installations/<installation_id>. Note that if your app is\ninstalled more than once, you can also programmatically retrieve these IDs.

  6. \n
\n

Sharing your App ID and Installation ID is fine, but make sure that the Private Key for your app is\nstored securely.

\n
\n
\n

Posting Issues\u00b6

\n

Now, you can create issues in GitHub from Dagster with the GitHub resource:

\n
import os\n\nfrom dagster import job, op\nfrom dagster_github import github_resource\n\n\n@op(required_resource_keys={'github'})\ndef github_op(context):\n    context.resources.github.create_issue(\n        repo_name='dagster',\n        repo_owner='dagster-io',\n        title='Dagster\\'s first github issue',\n        body='this open source thing seems like a pretty good idea',\n    )\n\n@job(resource_defs={'github': github_resource})\ndef github_job():\n    github_op()\n\ngithub_job.execute_in_process(\n    run_config={'resources': {'github': {'config': {\n        "github_app_id": os.getenv('GITHUB_APP_ID'),\n        "github_app_private_rsa_key": os.getenv('GITHUB_PRIVATE_KEY'),\n        "github_installation_id": os.getenv('GITHUB_INSTALLATION_ID'),\n    }}}}\n)\n
\n
\n

Run the above code, and you\u2019ll see the issue appear in GitHub:\n

\n

GitHub Enterprise users can provide their hostname in the run config. Provide github_hostname\nas part of your GitHub config, as shown below.

\n
github_job.execute_in_process(\n    run_config={'resources': {'github': {'config': {\n        "github_app_id": os.getenv('GITHUB_APP_ID'),\n        "github_app_private_rsa_key": os.getenv('GITHUB_PRIVATE_KEY'),\n        "github_installation_id": os.getenv('GITHUB_INSTALLATION_ID'),\n        "github_hostname": os.getenv('GITHUB_HOSTNAME'),\n    }}}}\n)\n
\n
\n

By provisioning github_resource as a Dagster job resource, you can post to GitHub from\nwithin any op execution.

\n
\n
\n

Executing GraphQL queries\u00b6

\n
import os\n\nfrom dagster import job, op\nfrom dagster_github import github_resource\n\n\n@op(required_resource_keys={'github'})\ndef github_op(context):\n    context.resources.github.execute(\n        query="""\n        query get_repo_id($repo_name: String!, $repo_owner: String!) {\n            repository(name: $repo_name, owner: $repo_owner) {\n                id\n            }\n        }\n        """,\n        variables={"repo_name": "dagster", "repo_owner": "dagster-io"},\n    )\n\n@job(resource_defs={'github': github_resource})\ndef github_job():\n    github_op()\n\ngithub_job.execute_in_process(\n    run_config={'resources': {'github': {'config': {\n        "github_app_id": os.getenv('GITHUB_APP_ID'),\n        "github_app_private_rsa_key": os.getenv('GITHUB_PRIVATE_KEY'),\n        "github_installation_id": os.getenv('GITHUB_INSTALLATION_ID'),\n    }}}}\n)\n
\n
\n
\n
\ndagster_github.github_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
github_app_id (dagster.IntSource)
\n

Github Application ID, for more info see https://developer.github.com/apps/

\n
\n
github_app_private_rsa_key (dagster.StringSource)
\n

Github Application Private RSA key text, for more info see https://developer.github.com/apps/

\n
\n
github_installation_id (dagster.IntSource, optional)
\n

Github Application Installation ID, for more info see https://developer.github.com/apps/

\n
\n
github_hostname (dagster.StringSource, optional)
\n

Github hostname. Defaults to api.github.com, for more info see https://developer.github.com/apps/

\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-github", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../dagster-k8s/", "title": "Kubernetes (dagster-k8s)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-ge/", "title": "Great Expectations (dagster-ge)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-k8s", "Kubernetes (dagster-k8s)", "N", "next"], ["sections/api/apidocs/libraries/dagster-ge", "Great Expectations (dagster-ge)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-github.rst.txt", "title": "GitHub (dagster-github)", "toc": "\n"}, "dagster-graphql": {"alabaster_version": "0.7.12", "body": "
\n

GraphQL (dagster-graphql)\u00b6

\n
\n

Python Client\u00b6

\n
\n
\nclass dagster_graphql.DagsterGraphQLClient(hostname, port_number=None, transport=None, use_https=False)[source]\u00b6
\n

Official Dagster Python Client for GraphQL

\n

Utilizes the gql library to dispatch queries over HTTP to a remote Dagster GraphQL Server

\n

As of now, all operations on this client are synchronous.

\n

Intended usage:

\n
client = DagsterGraphQLClient("localhost", port_number=3000)\nstatus = client.get_run_status(RUN_ID)\n
\n
\n
\n
Parameters
\n
    \n
  • hostname (str) \u2013 Hostname for the Dagster GraphQL API, like localhost or\ndagit.dagster.YOUR_ORG_HERE.

  • \n
  • port_number (Optional[int], optional) \u2013 Optional port number to connect to on the host.\nDefaults to None.

  • \n
  • transport (Optional[Transport], optional) \u2013 A custom transport to use to connect to the\nGraphQL API with (e.g. for custom auth). Defaults to None.

  • \n
  • use_https (bool, optional) \u2013 Whether to use https in the URL connection string for the\nGraphQL API. Defaults to False.

  • \n
\n
\n
Raises
\n

ConnectionError \u2013 if the client cannot connect to the host.

\n
\n
\n
\n
\nget_run_status(run_id)[source]\u00b6
\n

Get the status of a given Pipeline Run

\n
\n
Parameters
\n

run_id (str) \u2013 run id of the requested pipeline run.

\n
\n
Raises
\n
\n
\n
Returns
\n

returns a status Enum describing the state of the requested pipeline run

\n
\n
Return type
\n

PipelineRunStatus

\n
\n
\n
\n\n
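
For illustration only, the returned status can be compared against the run status enum, assuming PipelineRunStatus is importable from the top-level dagster package; the run id is a placeholder.

\n
from dagster import PipelineRunStatus\nfrom dagster_graphql import DagsterGraphQLClient\n\nclient = DagsterGraphQLClient('localhost', port_number=3000)\n\n# Placeholder run id.\nstatus = client.get_run_status('some-run-id')\nif status == PipelineRunStatus.SUCCESS:\n    print('run succeeded')\n
\n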
\n
\nreload_repository_location(repository_location_name)[source]\u00b6
\n

Reloads a Dagster Repository Location, which reloads all repositories in that repository location.

\n

This is useful in a variety of contexts, including refreshing Dagit without restarting\nthe server.

\n
\n
Parameters
\n

repository_location_name (str) \u2013 The name of the repository location

\n
\n
Returns
\n

Object with information about the result of the reload request

\n
\n
Return type
\n

ReloadRepositoryLocationInfo

\n
\n
\n
\n\n
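
A short, non-authoritative sketch of checking the reload result against the ReloadRepositoryLocationStatus enum documented below; the repository location name is hypothetical.

\n
from dagster_graphql import (\n    DagsterGraphQLClient,\n    ReloadRepositoryLocationStatus,\n)\n\nclient = DagsterGraphQLClient('localhost', port_number=3000)\n\nresult = client.reload_repository_location('my_repository_location')  # hypothetical name\nif result.status == ReloadRepositoryLocationStatus.FAILURE:\n    print(result.message)\n
\n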
\n
\nshutdown_repository_location(repository_location_name)[source]\u00b6
\n

Shuts down the server that is serving metadata for the provided repository location.

\n

This is primarily useful when you want the server to be restarted by the compute environment\nin which it is running (for example, in Kubernetes, the pod in which the server is running\nwill automatically restart when the server is shut down, and the repository metadata will\nbe reloaded).

\n
\n
Parameters
\n

repository_location_name (str) \u2013 The name of the repository location

\n
\n
Returns
\n

Object with information about the result of the shutdown request

\n
\n
Return type
\n

ShutdownRepositoryLocationInfo

\n
\n
\n
\n\n
\n
\nsubmit_job_execution(job_name, repository_location_name=None, repository_name=None, run_config=None, tags=None, op_selection=None)[source]\u00b6
\n

Submits a job with attached configuration for execution.

\n
\n
Parameters
\n
    \n
  • job_name (str) \u2013 The job\u2019s name

  • \n
  • repository_location_name (Optional[str]) \u2013 The name of the repository location where\nthe job is located. If omitted, the client will try to infer the repository location\nfrom the available options on the Dagster deployment. Defaults to None.

  • \n
  • repository_name (Optional[str]) \u2013 The name of the repository where the job is located.\nIf omitted, the client will try to infer the repository from the available options\non the Dagster deployment. Defaults to None.

  • \n
  • run_config (Optional[Dict[str, Any]]) \u2013 This is the run config to execute the job with.\nNote that runConfigData is any-typed in the GraphQL type system. This type is used when passing in\nan arbitrary object for run config. However, it must conform to the constraints of the config\nschema for this job. If it does not, the client will throw a DagsterGraphQLClientError with a message of\nJobConfigValidationInvalid. Defaults to None.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 A set of tags to add to the job execution.

  • \n
\n
\n
Raises
\n
    \n
  • DagsterGraphQLClientError("InvalidStepError", invalid_step_key) \u2013 the job has an invalid step

  • \n
  • DagsterGraphQLClientError("InvalidOutputError", body=error_object) \u2013 some solid has an invalid output within the job.\n The error_object is of type dagster_graphql.InvalidOutputErrorInfo.

  • \n
  • DagsterGraphQLClientError("RunConflict", message) \u2013 a DagsterRunConflict occured during execution.\n This indicates that a conflicting job run already exists in run storage.

  • \n
  • DagsterGraphQLClientError("PipelineConfigurationInvalid", invalid_step_key) \u2013 the run_config is not in the expected format\n for the job

  • \n
  • DagsterGraphQLClientError("JobNotFoundError", message) \u2013 the requested job does not exist

  • \n
  • DagsterGraphQLClientError("PythonError", message) \u2013 an internal framework error occurred

  • \n
\n
\n
Returns
\n

run id of the submitted pipeline run

\n
\n
Return type
\n

str

\n
\n
\n
\n\n
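
A hedged sketch of submitting a job and handling the documented DagsterGraphQLClientError; the job name, run config, and tags are hypothetical.

\n
from dagster_graphql import DagsterGraphQLClient, DagsterGraphQLClientError\n\nclient = DagsterGraphQLClient('localhost', port_number=3000)\n\ntry:\n    run_id = client.submit_job_execution(\n        'my_job',  # hypothetical job name\n        run_config={'ops': {'my_op': {'config': {'param': 'value'}}}},  # hypothetical\n        tags={'triggered-by': 'external-system'},\n    )\n    print('submitted run %s' % run_id)\nexcept DagsterGraphQLClientError as exc:\n    print('submission failed: %s' % exc)\n
\n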
\n
\nsubmit_pipeline_execution(pipeline_name, repository_location_name=None, repository_name=None, run_config=None, mode=None, preset=None, tags=None, solid_selection=None)[source]\u00b6
\n

Submits a Pipeline with attached configuration for execution.

\n
\n
Parameters
\n
    \n
  • pipeline_name (str) \u2013 The pipeline\u2019s name

  • \n
  • repository_location_name (Optional[str], optional) \u2013 The name of the repository location where\nthe pipeline is located. If omitted, the client will try to infer the repository location\nfrom the available options on the Dagster deployment. Defaults to None.

  • \n
  • repository_name (Optional[str], optional) \u2013 The name of the repository where the pipeline is located.\nIf omitted, the client will try to infer the repository from the available options\non the Dagster deployment. Defaults to None.

  • \n
  • run_config (Optional[Any], optional) \u2013 This is the run config to execute the pipeline with.\nNote that runConfigData is any-typed in the GraphQL type system. This type is used when passing in\nan arbitrary object for run config. However, it must conform to the constraints of the config\nschema for this pipeline. If it does not, the client will throw a DagsterGraphQLClientError with a message of\nRunConfigValidationInvalid. Defaults to None.

  • \n
  • mode (Optional[str], optional) \u2013 The mode to run the pipeline with. If you have not\ndefined any custom modes for your pipeline, the default mode is \u201cdefault\u201d. Defaults to None.

  • \n
  • preset (Optional[str], optional) \u2013 The name of a pre-defined preset to use instead of a\nrun config. Defaults to None.

  • \n
  • tags (Optional[Dict[str, Any]], optional) \u2013 A set of tags to add to the pipeline execution.

  • \n
\n
\n
Raises
\n
    \n
  • DagsterGraphQLClientError("InvalidStepError", invalid_step_key) \u2013 the pipeline has an invalid step

  • \n
  • DagsterGraphQLClientError("InvalidOutputError", body=error_object) \u2013 some solid has an invalid output within the pipeline.\n The error_object is of type dagster_graphql.InvalidOutputErrorInfo.

  • \n
  • DagsterGraphQLClientError("ConflictingExecutionParamsError", invalid_step_key) \u2013 a preset and a run_config & mode are present\n that conflict with one another

  • \n
  • DagsterGraphQLClientError("PresetNotFoundError", message) \u2013 if the provided preset name is not found

  • \n
  • DagsterGraphQLClientError("RunConflict", message) \u2013 a DagsterRunConflict occured during execution.\n This indicates that a conflicting pipeline run already exists in run storage.

  • \n
  • DagsterGraphQLClientError("PipelineConfigurationInvalid", invalid_step_key) \u2013 the run_config is not in the expected format\n for the pipeline

  • \n
  • DagsterGraphQLClientError("PipelineNotFoundError", message) \u2013 the requested pipeline does not exist

  • \n
  • DagsterGraphQLClientError("PythonError", message) \u2013 an internal framework error occurred

  • \n
\n
\n
Returns
\n

run id of the submitted pipeline run

\n
\n
Return type
\n

str

\n
\n
\n
\n\n
\n
\nterminate_run(run_id)[source]\u00b6
\n

Terminates a pipeline run. This method is useful when you would like to stop a pipeline run\nbased on an external event.

\n
\n
Parameters
\n

run_id (str) \u2013 The run id of the pipeline run to terminate

\n
\n
\n
\n\n
\n\n
\n
\nexception dagster_graphql.DagsterGraphQLClientError(*args, body=None)[source]\u00b6
\n
\n\n
\n
\nclass dagster_graphql.InvalidOutputErrorInfo(step_key, invalid_output_name)[source]\u00b6
\n

This class gives information about an InvalidOutputError from submitting a pipeline for execution\nfrom GraphQL.

\n
\n
Parameters
\n
    \n
  • step_key (str) \u2013 key of the step that failed

  • \n
  • invalid_output_name (str) \u2013 the name of the invalid output from the given step

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster_graphql.ReloadRepositoryLocationInfo(status, failure_type=None, message=None)[source]\u00b6
\n

This class gives information about the result of reloading\na Dagster repository location with a GraphQL mutation.

\n
\n
Parameters
\n
    \n
  • status (ReloadRepositoryLocationStatus) \u2013 The status of the reload repository location mutation

  • \n
• failure_type (Optional[str], optional) \u2013 the failure type if status == ReloadRepositoryLocationStatus.FAILURE.\nCan be one of ReloadNotSupported, RepositoryLocationNotFound, or RepositoryLocationLoadFailure. Defaults to None.

  • \n
  • message (Optional[str], optional) \u2013 the failure message/reason if\nstatus == ReloadRepositoryLocationStatus.FAILURE. Defaults to None.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster_graphql.ReloadRepositoryLocationStatus(value)[source]\u00b6
\n

This enum describes the status of a GraphQL mutation to reload a Dagster repository location

\n
\n
Parameters
\n

Enum (str) \u2013 can be either ReloadRepositoryLocationStatus.SUCCESS\nor ReloadRepositoryLocationStatus.FAILURE.

\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-graphql", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": null, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagstermill/", "title": "Dagstermill"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagstermill", "Dagstermill", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-graphql.rst.txt", "title": "GraphQL (dagster-graphql)", "toc": "\n"}, "dagster-k8s": {"alabaster_version": "0.7.12", "body": "
\n

Kubernetes (dagster-k8s)\u00b6

\n

See also the Kubernetes deployment guide.

\n

This library contains utilities for running Dagster with Kubernetes. This includes a Python API\nallowing Dagit to launch runs as Kubernetes Jobs, as well as a Helm chart you can use as the basis\nfor a Dagster deployment on a Kubernetes cluster.

\n
\n
\n

APIs\u00b6

\n
\n
\ndagster_k8s.K8sRunLauncher RunLauncher[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
job_image (Union[dagster.StringSource, None], optional)
\n

Docker image to use for launched Jobs. If this field is empty, the image that was used to originally load the Dagster repository will be used. (Ex: \u201cmycompany.com/dagster-k8s-image:latest\u201d).

\n
\n
image_pull_policy (Union[dagster.StringSource, None], optional)
\n

Image pull policy to set on launched Pods.

\n
\n
image_pull_secrets (Union[List[strict dict], None], optional)
\n

Specifies that Kubernetes should get the credentials from the Secrets named in this list.

\n
\n
service_account_name (Union[dagster.StringSource, None], optional)
\n

The name of the Kubernetes service account under which to run.

\n
\n
env_config_maps (Union[List[dagster.StringSource], None], optional)
\n

A list of custom ConfigMapEnvSource names from which to draw environment variables (using envFrom) for the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/#define-an-environment-variable-for-a-container

\n
\n
env_secrets (Union[List[dagster.StringSource], None], optional)
\n

A list of custom Secret names from which to draw environment variables (using envFrom) for the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
env_vars (Union[List[String], None], optional)
\n

A list of environment variables to inject into the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
volume_mounts (List[permissive dict], optional)
\n

A list of volume mounts to include in the job\u2019s container. Default: []. See: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volumemount-v1-core

\n

Default Value: []

\n
\n
volumes (List[permissive dict], optional)
\n

A list of volumes to include in the Job\u2019s Pod. Default: []. For the many possible volume source types that can be included, see: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volume-v1-core

\n

Default Value: []

\n
\n
labels (permissive dict, optional)
\n

Labels to apply to all created pods. See: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels

\n
\n
resources (Union[strict dict, None], optional)
\n

Compute resource requirements for the container. See: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/

\n
\n
instance_config_map (dagster.StringSource)
\n

The name of an existing Volume to mount into the pod in order to provide a ConfigMap for the Dagster instance. This Volume should contain a dagster.yaml with appropriate values for run storage, event log storage, etc.

\n
\n
postgres_password_secret (dagster.StringSource, optional)
\n

The name of the Kubernetes Secret where the postgres password can be retrieved. Will be mounted and supplied as an environment variable to the Job Pod.Secret must contain the key "postgresql-password" which will be exposed in the Job environment as the environment variable DAGSTER_PG_PASSWORD.

\n
\n
dagster_home (dagster.StringSource, optional)
\n

The location of DAGSTER_HOME in the Job container; this is where the dagster.yaml file will be mounted from the instance ConfigMap specified here. Defaults to /opt/dagster/dagster_home.

\n

Default Value: \u2018/opt/dagster/dagster_home\u2019

\n
\n
load_incluster_config (Bool, optional)
\n

Set this value if you are running the launcher\nwithin a k8s cluster. If True, we assume the launcher is running within the target\ncluster and load config using kubernetes.config.load_incluster_config. Otherwise,\nwe will use the k8s config specified in kubeconfig_file (using\nkubernetes.config.load_kube_config) or fall back to the default kubeconfig.

\n

Default Value: True

\n
\n
kubeconfig_file (Union[String, None], optional)
\n

The kubeconfig file from which to load config. Defaults to using the default kubeconfig.

\n

Default Value: None

\n
\n
fail_pod_on_run_failure (Bool, optional)
\n

Whether the launched Kubernetes Jobs and Pods should fail if the Dagster run fails

\n
\n
job_namespace (dagster.StringSource, optional)
\n

Default Value: \u2018default\u2019

\n
\n
\n

RunLauncher that starts a Kubernetes Job for each Dagster job run.

\n

Encapsulates each run in a separate, isolated invocation of dagster-graphql.

\n

You can configure a Dagster instance to use this RunLauncher by adding a section to your\ndagster.yaml like the following:

\n
run_launcher:\n  module: dagster_k8s.launcher\n  class: K8sRunLauncher\n  config:\n    service_account_name: your_service_account\n    job_image: my_project/dagster_image:latest\n    instance_config_map: dagster-instance\n    postgres_password_secret: dagster-postgresql-secret\n
\n
\n
\n\n
\n
\ndagster_k8s.k8s_job_executor ExecutorDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
job_image (Union[dagster.StringSource, None], optional)
\n

Docker image to use for launched Jobs. If this field is empty, the image that was used to originally load the Dagster repository will be used. (Ex: \u201cmycompany.com/dagster-k8s-image:latest\u201d).

\n
\n
image_pull_policy (Union[dagster.StringSource, None], optional)
\n

Image pull policy to set on launched Pods.

\n
\n
image_pull_secrets (Union[List[strict dict], None], optional)
\n

Specifies that Kubernetes should get the credentials from the Secrets named in this list.

\n
\n
service_account_name (Union[dagster.StringSource, None], optional)
\n

The name of the Kubernetes service account under which to run.

\n
\n
env_config_maps (Union[List[dagster.StringSource], None], optional)
\n

A list of custom ConfigMapEnvSource names from which to draw environment variables (using envFrom) for the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/#define-an-environment-variable-for-a-container

\n
\n
env_secrets (Union[List[dagster.StringSource], None], optional)
\n

A list of custom Secret names from which to draw environment variables (using envFrom) for the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
env_vars (Union[List[String], None], optional)
\n

A list of environment variables to inject into the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
volume_mounts (List[permissive dict], optional)
\n

A list of volume mounts to include in the job\u2019s container. Default: []. See: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volumemount-v1-core

\n

Default Value: []

\n
\n
volumes (List[permissive dict], optional)
\n

A list of volumes to include in the Job\u2019s Pod. Default: []. For the many possible volume source types that can be included, see: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volume-v1-core

\n

Default Value: []

\n
\n
labels (permissive dict, optional)
\n

Labels to apply to all created pods. See: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels

\n
\n
resources (Union[strict dict, None], optional)
\n

Compute resource requirements for the container. See: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/

\n
\n
job_namespace (dagster.StringSource, optional)
\n

\n
retries (selector, optional)
\n
\nDefault Value:
{\n    "enabled": {}\n}\n
\n
\n
\nConfig Schema:
\n
enabled (strict dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
disabled (strict dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
max_concurrency (dagster.IntSource, optional)
\n

Limit on the number of pods that will run concurrently within the scope of a Dagster run. Note that this limit is per run, not global.

\n
\n
\n

Executor which launches steps as Kubernetes Jobs.

\n

To use the k8s_job_executor, set it as the executor_def when defining a job:

\n
from dagster import job\nfrom dagster_k8s import k8s_job_executor\n\n@job(executor_def=k8s_job_executor)\ndef k8s_job():\n    pass\n
\n
\n

Then you can configure the executor with run config as follows:

\n
execution:\n  config:\n    job_namespace: 'some-namespace'\n    image_pull_policy: ...\n    image_pull_secrets: ...\n    service_account_name: ...\n    env_config_maps: ...\n    env_secrets: ...\n    env_vars: ...\n    job_image: ... # leave out if using userDeployments\n    max_concurrent: ...\n
\n
\n

max_concurrent limits the number of pods that will execute concurrently for one run. By default\nthere is no limit; execution will be maximally parallel as allowed by the DAG. Note that this is not a\nglobal limit.

\n

Configuration set on the Kubernetes Jobs and Pods created by the K8sRunLauncher will also be\nset on Kubernetes Jobs and Pods created by the k8s_job_executor.

\n
\n\n
\n

Python API\u00b6

\n

The K8sRunLauncher allows Dagit instances to be configured to launch new runs by starting\nper-run Kubernetes Jobs. To configure the K8sRunLauncher, your dagster.yaml should\ninclude a section like:

\n
run_launcher:\n  module: dagster_k8s.launcher\n  class: K8sRunLauncher\n  config:\n    image_pull_secrets:\n    service_account_name: dagster\n    job_image: "my-company.com/image:latest"\n    dagster_home: "/opt/dagster/dagster_home"\n    postgres_password_secret: "dagster-postgresql-secret"\n    image_pull_policy: "IfNotPresent"\n    job_namespace: "dagster"\n    instance_config_map: "dagster-instance"\n    env_config_maps:\n      - "dagster-k8s-job-runner-env"\n    env_secrets:\n      - "dagster-k8s-some-secret"\n
\n
\n
\n
\n

Helm chart\u00b6

\n

For local dev (e.g., on kind or minikube):

\n
helm install \\\n    --set dagit.image.repository="dagster.io/buildkite-test-image" \\\n    --set dagit.image.tag="py37-latest" \\\n    --set job_runner.image.repository="dagster.io/buildkite-test-image" \\\n    --set job_runner.image.tag="py37-latest" \\\n    --set imagePullPolicy="IfNotPresent" \\\n    dagster \\\n    helm/dagster/\n
\n
\n

Upon installation, the Helm chart will provide instructions for port forwarding Dagit and Flower (if\nconfigured).

\n
\n
\n

Running tests\u00b6

\n

To run the unit tests:

\n
pytest -m "not integration"\n
\n
\n

To run the integration tests, you must have Docker,\nkind,\nand helm installed.

\n

On macOS:

\n
brew install kind\nbrew install helm\n
\n
\n

Docker must be running.

\n

You may experience slow first test runs thanks to image pulls (run pytest -svv --fulltrace for\nvisibility). Building images and loading them to the kind cluster is slow, and there is\nno visibility into the progress of the load.

\n

NOTE: This process is quite slow, as it requires bootstrapping a local kind cluster with\nDocker images and the dagster-k8s Helm chart. For faster development, you can either:

\n
    \n
  1. Keep a warm kind cluster

  2. \n
  3. Use a remote K8s cluster, e.g. via AWS EKS or GCP GKE

  4. \n
\n

Instructions are below.

\n
\n

Faster local development (with kind)\u00b6

\n

You may find that the kind cluster creation, image loading, and Helm chart installation loop\nis too slow for effective local dev.

\n

You may bypass cluster creation and image loading in the following way. First add the --no-cleanup\nflag to your pytest invocation:

\n
pytest --no-cleanup -s -vvv -m "not integration"\n
\n
\n

The tests will run as before, but the kind cluster will be left running after the tests are completed.

\n

For subsequent test runs, you can run:

\n
pytest --kind-cluster="cluster-d9971c84d44d47f382a2928c8c161faa" --existing-helm-namespace="dagster-test-95590a" -s -vvv -m "not integration"\n
\n
\n

This will bypass cluster creation, image loading, and Helm chart installation, for much faster tests.

\n

The kind cluster name and Helm namespace for this command can be found in the logs, or retrieved\nvia the respective CLIs, using kind get clusters and kubectl get namespaces. Note that\nfor kubectl and helm to work correctly with a kind cluster, you should override your\nkubeconfig file location with:

\n
kind get kubeconfig --name kind-test > /tmp/kubeconfig\nexport KUBECONFIG=/tmp/kubeconfig\n
\n
\n
\n
\n

Manual kind cluster setup\u00b6

\n

The test fixtures provided by dagster-k8s automate the process described below, but sometimes\nit\u2019s useful to manually configure a kind cluster and load images onto it.

\n

First, ensure you have a Docker image appropriate for your Python version. Run, from the root of\nthe repo:

\n
./python_modules/dagster-test/dagster_test/test_project/build.sh 3.7.6 \\\n    dagster.io.priv/buildkite-test-image:py37-latest\n
\n
\n

In the above invocation, the Python major/minor version should be appropriate for your desired tests.

\n

Then run the following commands to create the cluster and load the image. Note that there is no\nfeedback from the loading process.

\n
kind create cluster --name kind-test\nkind load docker-image --name kind-test dagster.io/dagster-docker-buildkite:py37-latest\n
\n
\n

If you are deploying the Helm chart with an in-cluster Postgres (rather than an external database),\nand/or with dagster-celery workers (and a RabbitMQ), you\u2019ll also want to have images present for\nrabbitmq and postgresql:

\n
docker pull docker.io/bitnami/rabbitmq\ndocker pull docker.io/bitnami/postgresql\n\nkind load docker-image --name kind-test docker.io/bitnami/rabbitmq:latest\nkind load docker-image --name kind-test docker.io/bitnami/postgresql:latest\n
\n
\n

Then you can run pytest as follows:

\n
pytest --kind-cluster=kind-test\n
\n
\n
\n
\n
\n

Faster local development (with an existing K8s cluster)\u00b6

\n

If you already have a development K8s cluster available, you can run tests against that cluster\ninstead of running locally in kind.

\n

For this to work, first build and deploy the test image to a registry available to your cluster.\nFor example, with a private ECR repository:

\n
./python_modules/dagster-test/dagster_test/test_project/build.sh 3.7.6\ndocker tag dagster-docker-buildkite:latest $AWS_ACCOUNT_ID.dkr.ecr.us-west-2.amazonaws.com/dagster-k8s-tests:2020-04-21T21-04-06\n\naws ecr get-login --no-include-email --region us-west-2 | sh\ndocker push $AWS_ACCOUNT_ID.dkr.ecr.us-west-2.amazonaws.com/dagster-k8s-tests:2020-04-21T21-04-06\n
\n
\n

Then, you can run tests on EKS with:

\n
export DAGSTER_DOCKER_IMAGE_TAG="2020-04-21T21-04-06"\nexport DAGSTER_DOCKER_REPOSITORY="$AWS_ACCOUNT_ID.dkr.ecr.us-west-2.amazonaws.com"\nexport DAGSTER_DOCKER_IMAGE="dagster-k8s-tests"\n\n# First run with --no-cleanup to leave Helm chart in place\npytest --cluster-provider="kubeconfig" --no-cleanup -s -vvv\n\n# Subsequent runs against existing Helm chart\npytest --cluster-provider="kubeconfig" --existing-helm-namespace="dagster-test-<some id>" -s -vvv\n
\n
\n
\n
\n

Validating Helm charts\u00b6

\n

To test / validate Helm charts, you can run:

\n
helm install dagster --dry-run --debug helm/dagster\nhelm lint helm/dagster\n
\n
\n
\n
\n

Enabling GCR access from Minikube\u00b6

\n

To enable GCR access from Minikube:

\n
kubectl create secret docker-registry element-dev-key \\\n    --docker-server=https://gcr.io \\\n    --docker-username=oauth2accesstoken \\\n    --docker-password="$(gcloud auth print-access-token)" \\\n    --docker-email=my@email.com\n
\n
\n
\n
\n

A note about PVCs\u00b6

\n

Both the Postgres and the RabbitMQ Helm charts will store credentials using Persistent Volume\nClaims, which will outlive test invocations and calls to helm uninstall. These must be deleted if\nyou want to change credentials. To view your PVCs, run:

\n
kubectl get pvc\n
\n
\n
\n
\n

Testing Redis\u00b6

\n

The Redis Helm chart installs with a randomly generated password by default; to turn this off:

\n
helm install dagredis stable/redis --set usePassword=false\n
\n
\n

Then, to connect to your database from outside the cluster, execute the following commands:

\n
kubectl port-forward --namespace default svc/dagredis-master 6379:6379\nredis-cli -h 127.0.0.1 -p 6379\n
\n
\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-k8s", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../dagster-mlflow/", "title": "MLflow (dagster-mlflow)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-github/", "title": "GitHub (dagster-github)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-mlflow", "MLflow (dagster-mlflow)", "N", "next"], ["sections/api/apidocs/libraries/dagster-github", "GitHub (dagster-github)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-k8s.rst.txt", "title": "Kubernetes (dagster-k8s)", "toc": "\n"}, "dagster-mlflow": {"alabaster_version": "0.7.12", "body": "
\n

MLflow (dagster-mlflow)\u00b6

\n
\n
\ndagster_mlflow.mlflow_tracking ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
experiment_name (String)
\n

MLflow experiment name.

\n
\n
mlflow_tracking_uri (Union[String, None], optional)
\n

MLflow tracking server URI.

\n

Default Value: None

\n
\n
parent_run_id (Union[String, None], optional)
\n

MLflow run ID of the parent run if this is a nested run.

\n

Default Value: None

\n
\n
env (permissive dict, optional)
\n

Environment variables for mlflow setup.

\n
\nDefault Value:
{}\n
\n
\n
\n
env_to_tag (Union[List[Any], None], optional)
\n

List of environment variables to log as tags in mlflow.

\n

Default Value: None

\n
\n
extra_tags (permissive dict, optional)
\n

Any extra key-value tags to log to mlflow.

\n
\nDefault Value:
{}\n
\n
\n
\n
\n

This resource initializes an MLflow run that\u2019s used for all steps within a Dagster run.

\n

This resource provides access to all of mlflow\u2019s methods as well as the mlflow tracking client\u2019s\nmethods.

\n

Usage:

\n
    \n
  1. Add the mlflow resource to any ops in which you want to invoke mlflow tracking APIs.

  2. \n
  3. Add the end_mlflow_on_run_finished hook to your pipeline to end the MLflow run\nwhen the Dagster run is finished.

  4. \n
\n

Examples

\n
import mlflow\nfrom dagster import job, op\nfrom dagster_mlflow import end_mlflow_on_run_finished, mlflow_tracking\n\n@op(required_resource_keys={"mlflow"})\ndef mlflow_op(context):\n    mlflow.log_params(some_params)\n    mlflow.tracking.MlflowClient().create_registered_model(some_model_name)\n\n@end_mlflow_on_run_finished\n@job(resource_defs={"mlflow": mlflow_tracking})\ndef mlf_example():\n    mlflow_op()\n\n# example using an mlflow instance with s3 storage\nmlf_example.execute_in_process(run_config={\n    "resources": {\n        "mlflow": {\n            "config": {\n                "experiment_name": my_experiment,\n                "mlflow_tracking_uri": "http://localhost:5000",\n\n                # if you want to run a nested run, provide parent_run_id\n                "parent_run_id": an_existing_mlflow_run_id,\n\n                # env variables to pass to mlflow\n                "env": {\n                    "MLFLOW_S3_ENDPOINT_URL": my_s3_endpoint,\n                    "AWS_ACCESS_KEY_ID": my_aws_key_id,\n                    "AWS_SECRET_ACCESS_KEY": my_secret,\n                },\n\n                # env variables you want to log as mlflow tags\n                "env_to_tag": ["DOCKER_IMAGE_TAG"],\n\n                # key-value tags to add to your experiment\n                "extra_tags": {"super": "experiment"},\n            }\n        }\n    }\n})\n
\n
\n
\n\n
\n
\ndagster_mlflow.end_mlflow_on_run_finished HookDefinition\u00b6
\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_mlflow.end_mlflow_run_on_pipeline_finished HookDefinition\u00b6
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-mlflow", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../dagster-msteams/", "title": "Microsoft Teams (dagster-msteams)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-k8s/", "title": "Kubernetes (dagster-k8s)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-msteams", "Microsoft Teams (dagster-msteams)", "N", "next"], ["sections/api/apidocs/libraries/dagster-k8s", "Kubernetes (dagster-k8s)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-mlflow.rst.txt", "title": "MLflow (dagster-mlflow)", "toc": "\n"}, "dagster-msteams": {"alabaster_version": "0.7.12", "body": "
\n

Microsoft Teams (dagster-msteams)\u00b6

\n
\n
\ndagster_msteams.msteams_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
hook_url (dagster.StringSource)
\n

To send messages to an MS Teams channel, an incoming webhook has to\nbe created. The incoming webhook URL must be given as a part of the\nresource config to the msteams_resource in Dagster.

\n
\n
http_proxy (dagster.StringSource, optional)
\n

\n
https_proxy (dagster.StringSource, optional)
\n

\n
timeout (Float, optional)
\n

Default Value: 60

\n
\n
Verify (Bool, optional)
\n

\n
\n

This resource is for connecting to Microsoft Teams.

\n

The resource object is a dagster_msteams.TeamsClient.

\n

By configuring this resource, you can post messages to MS Teams from any Dagster solid:

\n

Examples:

\n
import os\n\nfrom dagster import ModeDefinition, execute_pipeline, pipeline, solid\nfrom dagster_msteams import Card, msteams_resource\n\n\n@solid(required_resource_keys={"msteams"})\ndef teams_solid(context):\n    card = Card()\n    card.add_attachment(text_message="Hello There !!")\n    context.resources.msteams.post_message(payload=card.payload)\n\n\n@pipeline(\n    mode_defs=[ModeDefinition(resource_defs={"msteams": msteams_resource})],\n)\ndef teams_pipeline():\n    teams_solid()\n\n\nexecute_pipeline(\n    teams_pipeline,\n    {"resources": {"msteams": {"config": {"hook_url": os.getenv("TEAMS_WEBHOOK_URL")}}}},\n)\n
\n
\n
\n\n
\n
\ndagster_msteams.teams_on_failure HookDefinition[source]\u00b6
\n

Create a hook on step failure events that will message the given MS Teams webhook URL.

\n
\n
Parameters
\n
    \n
  • message_fn (Optional(Callable[[HookContext], str])) \u2013 Function which takes in the\nHookContext and outputs the message you want to send.

  • \n
  • dagit_base_url \u2013 (Optional[str]): The base url of your Dagit instance. Specify this\nto allow messages to include deeplinks to the specific pipeline run that triggered\nthe hook.

  • \n
\n
\n
\n

Examples

\n
@teams_on_failure(dagit_base_url="http://localhost:3000")\n@pipeline(...)\ndef my_pipeline():\n    pass\n
\n
\n
def my_message_fn(context: HookContext) -> str:\n    return "Solid {solid_name} failed!".format(\n        solid_name=context.solid\n    )\n\n@solid\ndef a_solid(context):\n    pass\n\n@pipeline(...)\ndef my_pipeline():\n    a_solid.with_hooks(hook_defs={teams_on_failure(my_message_fn)})\n
\n
\n
\n\n
\n
\ndagster_msteams.teams_on_success HookDefinition[source]\u00b6
\n

Create a hook on step success events that will message the given MS Teams webhook URL.

\n
\n
Parameters
\n
    \n
  • message_fn (Optional(Callable[[HookContext], str])) \u2013 Function which takes in the\nHookContext and outputs the message you want to send.

  • \n
  • dagit_base_url \u2013 (Optional[str]): The base url of your Dagit instance. Specify this\nto allow messages to include deeplinks to the specific pipeline run that triggered\nthe hook.

  • \n
\n
\n
\n

Examples

\n
@teams_on_success(dagit_base_url="http://localhost:3000")\n@pipeline(...)\ndef my_pipeline():\n    pass\n
\n
\n
def my_message_fn(context: HookContext) -> str:\n    return "Solid {solid_name} succeeded!".format(\n        solid_name=context.solid\n    )\n\n@solid\ndef a_solid(context):\n    pass\n\n@pipeline(...)\ndef my_pipeline():\n    a_solid.with_hooks(hook_defs={teams_on_success(my_message_fn)})\n
\n
\n
\n\n
\n
\ndagster_msteams.make_teams_on_pipeline_failure_sensor(hook_url, message_fn=<function _default_failure_message>, http_proxy=None, https_proxy=None, timeout=60, verify=None, name=None, dagit_base_url=None, default_status=<DefaultSensorStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Create a sensor on pipeline failures that will message the given MS Teams webhook URL.

\n
\n
Parameters
\n
    \n
  • hook_url (str) \u2013 MS Teams incoming webhook URL.

  • \n
  • message_fn (Optional(Callable[[PipelineFailureSensorContext], str])) \u2013 Function which\ntakes in the PipelineFailureSensorContext and outputs the message you want to send.\nDefaults to a text message that contains error message, pipeline name, and run ID.

  • \n
  • http_proxy \u2013 (Optional[str]): Proxy for requests using http protocol.

  • \n
  • https_proxy \u2013 (Optional[str]): Proxy for requests using https protocol.

  • \n
  • timeout \u2013 (Optional[float]): Connection timeout in seconds. Defaults to 60.

  • \n
  • verify \u2013 (Optional[bool]): Whether to verify the server's TLS certificate.

  • \n
  • name \u2013 (Optional[str]): The name of the sensor. Defaults to \u201cteams_on_pipeline_failure\u201d.

  • \n
  • dagit_base_url \u2013 (Optional[str]): The base url of your Dagit instance. Specify this to allow\nmessages to include deeplinks to the failed pipeline run.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
\n
\n
\n

Examples

\n
teams_on_pipeline_failure = make_teams_on_pipeline_failure_sensor(\n    hook_url=os.getenv("TEAMS_WEBHOOK_URL")\n)\n\n@repository\ndef my_repo():\n    return [my_pipeline + teams_on_pipeline_failure]\n
\n
\n
def my_message_fn(context: PipelineFailureSensorContext) -> str:\n    return "Pipeline {pipeline_name} failed! Error: {error}".format(\n        pipeline_name=context.pipeline_run.pipeline_name,\n        error=context.failure_event.message,\n    )\n\nteams_on_pipeline_failure = make_teams_on_pipeline_failure_sensor(\n    hook_url=os.getenv("TEAMS_WEBHOOK_URL"),\n    message_fn=my_message_fn,\n    dagit_base_url="http://localhost:3000",\n)\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-msteams", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "\n", "next": {"link": "../dagster-mysql/", "title": "MySQL (dagster-mysql)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-mlflow/", "title": "MLflow (dagster-mlflow)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-mysql", "MySQL (dagster-mysql)", "N", "next"], ["sections/api/apidocs/libraries/dagster-mlflow", "MLflow (dagster-mlflow)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-msteams.rst.txt", "title": "Microsoft Teams (dagster-msteams)", "toc": "\n"}, "dagster-mysql": {"alabaster_version": "0.7.12", "body": "
\n

MySQL (dagster-mysql)\u00b6

\n
\n
\nclass dagster_mysql.MySQLEventLogStorage(mysql_url, inst_data=None)[source]\u00b6
\n

MySQL-backed event log storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagit and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n
\n
dagster.yaml\u00b6
\n
event_log_storage:\n  module: dagster_mysql.event_log\n  class: MySQLEventLogStorage\n  config:\n    mysql_db:\n      username: { username }\n      password: { password }\n      hostname: { hostname }\n      db_name: { db_name }\n      port: { port }\n
\n
\n
\n

Note that the fields in this config are StringSource and\nIntSource and can be configured from environment variables.

\n
\n\n
\n
\nclass dagster_mysql.MySQLRunStorage(mysql_url, inst_data=None)[source]\u00b6
\n

MySQL-backed run storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagit and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n
\n
dagster.yaml\u00b6
\n
run_storage:\n  module: dagster_mysql.run_storage\n  class: MySQLRunStorage\n  config:\n    mysql_db:\n      username: { username }\n      password: { password }\n      hostname: { hostname }\n      db_name: { database }\n      port: { port }\n
\n
\n
\n

Note that the fields in this config are StringSource and\nIntSource and can be configured from environment variables.

\n
\n\n
\n
\nclass dagster_mysql.MySQLScheduleStorage(mysql_url, inst_data=None)[source]\u00b6
\n

MySQL-backed schedule storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagit and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n
\n
dagster.yaml\u00b6
\n
schedule_storage:\n  module: dagster_mysql.schedule_storage\n  class: MySQLScheduleStorage\n  config:\n    mysql_db:\n      username: { username }\n      password: { password }\n      hostname: { hostname }\n      db_name: { db_name }\n      port: { port }\n
\n
\n
\n

Note that the fields in this config are StringSource and\nIntSource and can be configured from environment variables.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-mysql", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "\n", "next": {"link": "../dagster-pagerduty/", "title": "PagerDuty (dagster-pagerduty)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-msteams/", "title": "Microsoft Teams (dagster-msteams)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-pagerduty", "PagerDuty (dagster-pagerduty)", "N", "next"], ["sections/api/apidocs/libraries/dagster-msteams", "Microsoft Teams (dagster-msteams)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-mysql.rst.txt", "title": "MySQL (dagster-mysql)", "toc": "\n"}, "dagster-pagerduty": {"alabaster_version": "0.7.12", "body": "
\n

PagerDuty (dagster-pagerduty)\u00b6

\n

This library provides an integration with PagerDuty, to support creating alerts from your Dagster\ncode.

\n

Presently, it provides a thin wrapper on the Events V2 API.

\n
\n
\n

Getting Started\u00b6

\n

You can install this library with:

\n
pip install dagster_pagerduty\n
\n
\n

To use this integration, you\u2019ll first need to create a PagerDuty integration. The PagerDuty\ndocumentation includes instructions for creating a new PagerDuty service & integration.

\n

As noted in the PagerDuty documentation, you\u2019ll find an integration key (also referred to as a\n\u201crouting key\u201d) on the Integrations tab for your new service. This key is used to authorize events\ncreated from the PagerDuty events API.

\n

Once your service/integration is created, you can provision a PagerDuty resource and issue PagerDuty\nalerts from within your ops.

\n
\n
\ndagster_pagerduty.pagerduty_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
routing_key (String)
\n

The routing key provisions access to your PagerDuty service. You\nwill need to include the integration key for your new integration, as a\nrouting_key in the event payload.

\n
\n
\n

A resource for posting events (alerts) to PagerDuty.

\n

Example:

\n
@op(required_resource_keys={'pagerduty'})\ndef pagerduty_op(context):\n    context.resources.pagerduty.EventV2_create(\n        summary='alert from dagster',\n        source='localhost',\n        severity='error',\n        event_action='trigger',\n    )\n\n@job(resource_defs={ 'pagerduty': pagerduty_resource })\ndef pagerduty_test():\n    pagerduty_op()\n\npagerduty_test.execute_in_process(\n    run_config={\n        "resources": {\n            'pagerduty': {'config': {'routing_key': '0123456789abcdef0123456789abcdef'}}\n        }\n    }\n)\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-pagerduty", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../dagster-pandas/", "title": "Pandas (dagster-pandas)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-mysql/", "title": "MySQL (dagster-mysql)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-pandas", "Pandas (dagster-pandas)", "N", "next"], ["sections/api/apidocs/libraries/dagster-mysql", "MySQL (dagster-mysql)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-pagerduty.rst.txt", "title": "PagerDuty (dagster-pagerduty)", "toc": "\n"}, "dagster-pandas": {"alabaster_version": "0.7.12", "body": "
\n

Pandas (dagster-pandas)\u00b6

\n

The dagster_pandas library provides utilities for using pandas with Dagster and for implementing\nvalidation on pandas DataFrames. A good place to start with dagster_pandas is the validation\nguide.

\n
\n
\ndagster_pandas.create_dagster_pandas_dataframe_type(name, description=None, columns=None, event_metadata_fn=None, dataframe_constraints=None, loader=None, materializer=None)[source]\u00b6
\n

Constructs a custom pandas dataframe dagster type.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name of the dagster pandas type.

  • \n
  • description (Optional[str]) \u2013 A markdown-formatted string, displayed in tooling.

  • \n
  • columns (Optional[List[PandasColumn]]) \u2013 A list of PandasColumn objects\nwhich express dataframe column schemas and constraints.

  • \n
  • event_metadata_fn (Optional[Callable[[], Union[Dict[str, Union[str, float, int, Dict, MetadataValue]], List[MetadataEntry]]]]) \u2013 A callable which takes your dataframe and returns a dict with string label keys and\nMetadataValue values. Can optionally return a List[MetadataEntry].

  • \n
  • dataframe_constraints (Optional[List[DataFrameConstraint]]) \u2013 A list of objects that inherit from\nDataFrameConstraint. This allows you to express dataframe-level constraints.

  • \n
  • loader (Optional[DagsterTypeLoader]) \u2013 An instance of a class that\ninherits from DagsterTypeLoader. If None, we will default\nto using dataframe_loader.

  • \n
  • materializer (Optional[DagsterTypeMaterializer]) \u2013 An instance of a class\nthat inherits from DagsterTypeMaterializer. If None, we will\ndefault to using dataframe_materializer.

  • \n
\n
\n
\n
\n\n
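
For example, a custom dataframe type can be assembled from PandasColumn constructors and used as\nan op output type. This is a minimal sketch; the column names and data are hypothetical.

\n
import pandas as pd\n\nfrom dagster import Out, op\nfrom dagster_pandas import PandasColumn, create_dagster_pandas_dataframe_type\n\n# A hypothetical dataframe type validating three columns.\nTripDataFrame = create_dagster_pandas_dataframe_type(\n    name="TripDataFrame",\n    description="Validated trip records.",\n    columns=[\n        PandasColumn.integer_column("bike_id", min_value=0),\n        PandasColumn.datetime_column("start_time"),\n        PandasColumn.string_column("rider_name", non_nullable=True),\n    ],\n)\n\n@op(out=Out(TripDataFrame))\ndef load_trips():\n    # The type check (and any column constraints) runs when this output is handled.\n    return pd.DataFrame(\n        {\n            "bike_id": [1, 2],\n            "start_time": pd.to_datetime(["2021-01-01", "2021-01-02"]),\n            "rider_name": ["alice", "bob"],\n        }\n    )\n
\n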
\n
\nclass dagster_pandas.RowCountConstraint(num_allowed_rows, error_tolerance=0)[source]\u00b6
\n

A dataframe constraint that validates the expected count of rows.

\n
\n
Parameters
\n
    \n
  • num_allowed_rows (int) \u2013 The number of allowed rows in your dataframe.

  • \n
  • error_tolerance (Optional[int]) \u2013 The acceptable threshold if you are not completely certain. Defaults to 0.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster_pandas.StrictColumnsConstraint(strict_column_list, enforce_ordering=False)[source]\u00b6
\n

A dataframe constraint that validates column existence and ordering.

\n
\n
Parameters
\n
    \n
  • strict_column_list (List[str]) \u2013 The exact list of columns that your dataframe must have.

  • \n
  • enforce_ordering (Optional[bool]) \u2013 If true, will enforce that the ordering of column names must match.\nDefault is False.

  • \n
\n
\n
\n
\n\n
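
Dataframe-level constraints like these are passed to create_dagster_pandas_dataframe_type via its\ndataframe_constraints argument. A minimal sketch, with hypothetical names and limits:

\n
from dagster_pandas import (\n    RowCountConstraint,\n    StrictColumnsConstraint,\n    create_dagster_pandas_dataframe_type,\n)\n\n# A hypothetical type that must have exactly the columns "id" and "value", in order,\n# and roughly 100 rows (within the error tolerance).\nShapedDataFrame = create_dagster_pandas_dataframe_type(\n    name="ShapedDataFrame",\n    dataframe_constraints=[\n        RowCountConstraint(num_allowed_rows=100, error_tolerance=5),\n        StrictColumnsConstraint(["id", "value"], enforce_ordering=True),\n    ],\n)\n
\n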
\n
\nclass dagster_pandas.PandasColumn(name, constraints=None, is_required=None)[source]\u00b6
\n

The main API for expressing column level schemas and constraints for your custom dataframe\ntypes.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.

  • \n
  • is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.

  • \n
  • constraints (Optional[List[Constraint]]) \u2013 List of constraint objects that indicate the\nvalidation rules for the pandas column.

  • \n
\n
\n
\n
\n
\nstatic boolean_column(name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None)[source]\u00b6
\n

Simple constructor for PandasColumns that expresses boolean constraints on boolean dtypes.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.

  • \n
  • non_nullable (Optional[bool]) \u2013 If true, this column will enforce a constraint that all values in the column\nought to be non null values.

  • \n
  • unique (Optional[bool]) \u2013 If true, this column will enforce a uniqueness constraint on the column values.

  • \n
  • ignore_missing_vals (Optional[bool]) \u2013 A flag that is passed into most constraints. If true, the constraint will\nonly evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.

  • \n
  • is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.

  • \n
\n
\n
\n
\n\n
\n
\nstatic categorical_column(name, categories, of_types=frozenset({'category', 'object'}), non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None)[source]\u00b6
\n

Simple constructor for PandasColumns that expresses categorical constraints on specified dtypes.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.

  • \n
  • categories (List[Any]) \u2013 The valid set of buckets that all values in the column must match.

  • \n
  • of_types (Optional[Union[str, Set[str]]]) \u2013 The expected dtype[s] that your categories and values must\nabide by.

  • \n
  • non_nullable (Optional[bool]) \u2013 If true, this column will enforce a constraint that all values in\nthe column ought to be non null values.

  • \n
  • unique (Optional[bool]) \u2013 If true, this column will enforce a uniqueness constraint on the column values.

  • \n
  • ignore_missing_vals (Optional[bool]) \u2013 A flag that is passed into most constraints. If true, the\nconstraint will only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.

  • \n
  • is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.

  • \n
\n
\n
\n
\n\n
\n
\nstatic datetime_column(name, min_datetime=Timestamp('1677-09-21 00:12:43.145224193'), max_datetime=Timestamp('2262-04-11 23:47:16.854775807'), non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None, tz=None)[source]\u00b6
\n

Simple constructor for PandasColumns that expresses datetime constraints on \u2018datetime64[ns]\u2019 dtypes.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.

  • \n
  • min_datetime (Optional[Union[int,float]]) \u2013 The lower bound for values you expect in this column.\nDefaults to pandas.Timestamp.min.

  • \n
  • max_datetime (Optional[Union[int,float]]) \u2013 The upper bound for values you expect in this column.\nDefaults to pandas.Timestamp.max.

  • \n
  • non_nullable (Optional[bool]) \u2013 If true, this column will enforce a constraint that all values in the column\nought to be non null values.

  • \n
  • unique (Optional[bool]) \u2013 If true, this column will enforce a uniqueness constraint on the column values.

  • \n
  • ignore_missing_vals (Optional[bool]) \u2013 A flag that is passed into most constraints. If true, the constraint will\nonly evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.

  • \n
  • is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.

  • \n
  • tz (Optional[str]) \u2013 Required timezone for values eg: tz=\u2019UTC\u2019, tz=\u2019Europe/Dublin\u2019, tz=\u2019US/Eastern\u2019.\nDefaults to None, meaning naive datetime values.

  • \n
\n
\n
\n
\n\n
\n
\nstatic exists(name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None)[source]\u00b6
\n

Simple constructor for PandasColumns that expresses existence constraints.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.

  • \n
  • non_nullable (Optional[bool]) \u2013 If true, this column will enforce a constraint that all values in the column\nought to be non null values.

  • \n
  • unique (Optional[bool]) \u2013 If true, this column will enforce a uniqueness constraint on the column values.

  • \n
  • ignore_missing_vals (Optional[bool]) \u2013 A flag that is passed into most constraints. If true, the constraint will\nonly evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.

  • \n
  • is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.

  • \n
\n
\n
\n
\n\n
\n
\nstatic float_column(name, min_value=- inf, max_value=inf, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None)[source]\u00b6
\n

Simple constructor for PandasColumns that expresses numeric constraints on float dtypes.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.

  • \n
  • min_value (Optional[Union[int,float]]) \u2013 The lower bound for values you expect in this column. Defaults to -float(\u2018inf\u2019)

  • \n
  • max_value (Optional[Union[int,float]]) \u2013 The upper bound for values you expect in this column. Defaults to float(\u2018inf\u2019)

  • \n
  • non_nullable (Optional[bool]) \u2013 If true, this column will enforce a constraint that all values in the column\nought to be non null values.

  • \n
  • unique (Optional[bool]) \u2013 If true, this column will enforce a uniqueness constraint on the column values.

  • \n
  • ignore_missing_vals (Optional[bool]) \u2013 A flag that is passed into most constraints. If true, the constraint will\nonly evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.

  • \n
  • is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.

  • \n
\n
\n
\n
\n\n
\n
\nstatic integer_column(name, min_value=- inf, max_value=inf, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None)[source]\u00b6
\n

Simple constructor for PandasColumns that expresses numeric constraints on integer dtypes.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.

  • \n
  • min_value (Optional[Union[int,float]]) \u2013 The lower bound for values you expect in this column. Defaults to -float(\u2018inf\u2019)

  • \n
  • max_value (Optional[Union[int,float]]) \u2013 The upper bound for values you expect in this column. Defaults to float(\u2018inf\u2019)

  • \n
  • non_nullable (Optional[bool]) \u2013 If true, this column will enforce a constraint that all values in the column\nought to be non null values.

  • \n
  • unique (Optional[bool]) \u2013 If true, this column will enforce a uniqueness constraint on the column values.

  • \n
  • ignore_missing_vals (Optional[bool]) \u2013 A flag that is passed into most constraints. If true, the constraint will\nonly evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.

  • \n
  • is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.

  • \n
\n
\n
\n
\n\n
\n
\nstatic numeric_column(name, min_value=- inf, max_value=inf, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None)[source]\u00b6
\n

Simple constructor for PandasColumns that expresses numeric constraints on numeric dtypes.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.

  • \n
  • min_value (Optional[Union[int,float]]) \u2013 The lower bound for values you expect in this column. Defaults to -float(\u2018inf\u2019)

  • \n
  • max_value (Optional[Union[int,float]]) \u2013 The upper bound for values you expect in this column. Defaults to float(\u2018inf\u2019)

  • \n
  • non_nullable (Optional[bool]) \u2013 If true, this column will enforce a constraint that all values in the column\nought to be non null values.

  • \n
  • unique (Optional[bool]) \u2013 If true, this column will enforce a uniqueness constraint on the column values.

  • \n
  • ignore_missing_vals (Optional[bool]) \u2013 A flag that is passed into most constraints. If true, the constraint will\nonly evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.

  • \n
  • is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.

  • \n
\n
\n
\n
\n\n
\n
\nstatic string_column(name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None)[source]\u00b6
\n

Simple constructor for PandasColumns that expresses constraints on string dtypes.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.

  • \n
  • non_nullable (Optional[bool]) \u2013 If true, this column will enforce a constraint that all values in the column\nought to be non null values.

  • \n
  • unique (Optional[bool]) \u2013 If true, this column will enforce a uniqueness constraint on the column values.

  • \n
  • ignore_missing_vals (Optional[bool]) \u2013 A flag that is passed into most constraints. If true, the constraint will\nonly evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.

  • \n
  • is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\ndagster_pandas.DataFrame = <dagster.core.types.dagster_type.DagsterType object>\u00b6
\n

Define a type in dagster. These can be used in the inputs and outputs of ops.

\n
\n
Parameters
\n
    \n
  • type_check_fn (Callable[[TypeCheckContext, Any], [Union[bool, TypeCheck]]]) \u2013 The function that defines the type check. It takes the value flowing\nthrough the input or output of the op. If it passes, return either\nTrue or a TypeCheck with success set to True. If it fails,\nreturn either False or a TypeCheck with success set to False.\nThe first argument must be named context (or, if unused, _, _context, or context_).\nUse required_resource_keys for access to resources.

  • \n
  • key (Optional[str]) \u2013

The unique key to identify types programmatically.\nThe key property always has a value. If you omit the key argument\nto the init function, it instead receives the value of name. If\nneither key nor name is provided, a CheckError is thrown.

    \n

    In the case of a generic type such as List or Optional, this is\ngenerated programmatically based on the type parameters.

    \n

    For most use cases, name should be set and the key argument should\nnot be specified.

    \n

  • \n
  • name (Optional[str]) \u2013 A unique name given by a user. If key is None, key\nbecomes this value. Name is not given in a case where the user does\nnot specify a unique name for this type, such as a generic class.

  • \n
  • description (Optional[str]) \u2013 A markdown-formatted string, displayed in tooling.

  • \n
  • loader (Optional[DagsterTypeLoader]) \u2013 An instance of a class that\ninherits from DagsterTypeLoader and can map config data to a value of\nthis type. Specify this argument if you will need to shim values of this type using the\nconfig machinery. As a rule, you should use the\n@dagster_type_loader decorator to construct\nthese arguments.

  • \n
  • materializer (Optional[DagsterTypeMaterializer]) \u2013 An instance of a class\nthat inherits from DagsterTypeMaterializer and can persist values of\nthis type. As a rule, you should use the\n@dagster_type_materializer\ndecorator to construct these arguments.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Resource keys required by the type_check_fn.

  • \n
  • is_builtin (bool) \u2013 Defaults to False. This is used by tools to display or\nfilter built-in types (such as String, Int) to visually distinguish\nthem from user-defined types. Meant for internal use.

  • \n
  • kind (DagsterTypeKind) \u2013 Defaults to None. This is used to determine the kind of runtime type\nfor InputDefinition and OutputDefinition type checking.

  • \n
  • typing_type \u2013 Defaults to None. A valid python typing type (e.g. Optional[List[int]]) for the\nvalue contained within the DagsterType. Meant for internal use.

  • \n
\n
\n
\n
\n\n
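
For example, a DagsterType can be defined directly with a type_check_fn. This is a minimal sketch\nwith hypothetical names; dagster_pandas.DataFrame itself is built along these lines, with a\npandas-specific check plus a loader and materializer.

\n
import pandas as pd\n\nfrom dagster import DagsterType, TypeCheck\n\ndef non_empty_frame_check(_context, value):\n    # Return a TypeCheck (or a bool) describing whether the value passes.\n    if not isinstance(value, pd.DataFrame):\n        return TypeCheck(success=False, description="Expected a pandas DataFrame")\n    return TypeCheck(success=len(value) > 0, description="DataFrame must be non-empty")\n\nNonEmptyDataFrame = DagsterType(\n    name="NonEmptyDataFrame",\n    type_check_fn=non_empty_frame_check,\n    description="A pandas DataFrame with at least one row.",\n)\n
\n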
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-pandas", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "\n", "next": {"link": "../dagster-papertrail/", "title": "Papertrail (dagster-papertrail)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-pagerduty/", "title": "PagerDuty (dagster-pagerduty)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-papertrail", "Papertrail (dagster-papertrail)", "N", "next"], ["sections/api/apidocs/libraries/dagster-pagerduty", "PagerDuty (dagster-pagerduty)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-pandas.rst.txt", "title": "Pandas (dagster-pandas)", "toc": "\n"}, "dagster-papertrail": {"alabaster_version": "0.7.12", "body": "
\n

Papertrail (dagster-papertrail)\u00b6

\n

This library provides an integration with Papertrail for logging.

\n

You can easily set up your Dagster job to log to Papertrail. You\u2019ll need an active Papertrail\naccount and your Papertrail URL and port handy.

\n
\n
\ndagster_papertrail.papertrail_logger LoggerDefinition\u00b6
\n

Core class for defining loggers.

\n

Loggers are job-scoped logging handlers, which will be automatically invoked whenever\ndagster messages are logged from within a job.

\n
\n
Parameters
\n
    \n
  • logger_fn (Callable[[InitLoggerContext], logging.Logger]) \u2013 User-provided function to\ninstantiate the logger. This logger will be automatically invoked whenever the methods\non context.log are called from within job/pipeline compute logic.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.logger_config. If not set, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this logger.

  • \n
\n
\n
\n
\n\n
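
For example, the provided logger can be attached to a job through logger_defs; its configuration\n(for instance, your Papertrail connection details) is then supplied under loggers in the run\nconfig. A minimal sketch with illustrative names:

\n
from dagster import job, op\nfrom dagster_papertrail import papertrail_logger\n\n@op\ndef hello_op(context):\n    context.log.info("Hello, Papertrail!")\n\n# Messages logged via context.log inside this job are routed to the configured logger(s).\n@job(logger_defs={"papertrail": papertrail_logger})\ndef hello_job():\n    hello_op()\n
\n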
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-papertrail", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "\n", "next": {"link": "../dagster-postgres/", "title": "PostgreSQL (dagster-postgres)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-pandas/", "title": "Pandas (dagster-pandas)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-postgres", "PostgreSQL (dagster-postgres)", "N", "next"], ["sections/api/apidocs/libraries/dagster-pandas", "Pandas (dagster-pandas)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-papertrail.rst.txt", "title": "Papertrail (dagster-papertrail)", "toc": "\n"}, "dagster-postgres": {"alabaster_version": "0.7.12", "body": "
\n

PostgreSQL (dagster-postgres)\u00b6

\n
\n
\ndagster_postgres.PostgresEventLogStorage = <class 'dagster_postgres.event_log.event_log.PostgresEventLogStorage'>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
postgres_url (dagster.StringSource, optional)
\n

\n
postgres_db (strict dict, optional)
\n
\nConfig Schema:
\n
username (dagster.StringSource)
\n

\n
password (dagster.StringSource)
\n

\n
hostname (dagster.StringSource)
\n

\n
db_name (dagster.StringSource)
\n

\n
port (dagster.IntSource, optional)
\n

Default Value: 5432

\n
\n
params (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
should_autocreate_tables (Bool, optional)
\n

Default Value: True

\n
\n
\n

Postgres-backed event log storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagit and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n

To use Postgres for event log storage, you can add a block such as the following to your\ndagster.yaml:

\n
\n
dagster.yaml\u00b6
\n
event_log_storage:\n  module: dagster_postgres.event_log\n  class: PostgresEventLogStorage\n  config:\n    postgres_db:\n      username: { username }\n      password: { password }\n      hostname: { hostname }\n      db_name: { db_name }\n      port: { port }\n
\n
\n
\n

Note that the fields in this config are StringSource and\nIntSource and can be configured from environment variables.

\n
\n\n
\n
\ndagster_postgres.PostgresRunStorage = <class 'dagster_postgres.run_storage.run_storage.PostgresRunStorage'>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
postgres_url (dagster.StringSource, optional)
\n

\n
postgres_db (strict dict, optional)
\n
\nConfig Schema:
\n
username (dagster.StringSource)
\n

\n
password (dagster.StringSource)
\n

\n
hostname (dagster.StringSource)
\n

\n
db_name (dagster.StringSource)
\n

\n
port (dagster.IntSource, optional)
\n

Default Value: 5432

\n
\n
params (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
should_autocreate_tables (Bool, optional)
\n

Default Value: True

\n
\n
\n

Postgres-backed run storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagit and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n

To use Postgres for run storage, you can add a block such as the following to your\ndagster.yaml:

\n
\n
dagster.yaml\u00b6
\n
run_storage:\n  module: dagster_postgres.run_storage\n  class: PostgresRunStorage\n  config:\n    postgres_db:\n      username: { username }\n      password: { password }\n      hostname: { hostname }\n      db_name: { database }\n      port: { port }\n
\n
\n
\n

Note that the fields in this config are StringSource and\nIntSource and can be configured from environment variables.

\n
\n\n
\n
\ndagster_postgres.PostgresScheduleStorage = <class 'dagster_postgres.schedule_storage.schedule_storage.PostgresScheduleStorage'>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
postgres_url (dagster.StringSource, optional)
\n

\n
postgres_db (strict dict, optional)
\n
\nConfig Schema:
\n
username (dagster.StringSource)
\n

\n
password (dagster.StringSource)
\n

\n
hostname (dagster.StringSource)
\n

\n
db_name (dagster.StringSource)
\n

\n
port (dagster.IntSource, optional)
\n

Default Value: 5432

\n
\n
params (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
should_autocreate_tables (Bool, optional)
\n

Default Value: True

\n
\n
\n

Postgres-backed schedule storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagit and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n

To use Postgres for schedule storage, you can add a block such as the following to your\ndagster.yaml:

\n
\n
dagster.yaml\u00b6
\n
schedule_storage:\n  module: dagster_postgres.schedule_storage\n  class: PostgresScheduleStorage\n  config:\n    postgres_db:\n      username: { username }\n      password: { password }\n      hostname: { hostname }\n      db_name: { db_name }\n      port: { port }\n
\n
\n
\n

Note that the fields in this config are StringSource and\nIntSource and can be configured from environment variables.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-postgres", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "\n", "next": {"link": "../dagster-prometheus/", "title": "Prometheus (dagster-prometheus)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-papertrail/", "title": "Papertrail (dagster-papertrail)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-prometheus", "Prometheus (dagster-prometheus)", "N", "next"], ["sections/api/apidocs/libraries/dagster-papertrail", "Papertrail (dagster-papertrail)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-postgres.rst.txt", "title": "PostgreSQL (dagster-postgres)", "toc": "\n"}, "dagster-prometheus": {"alabaster_version": "0.7.12", "body": "
\n

Prometheus (dagster-prometheus)\u00b6

\n
\n
\nclass dagster_prometheus.resources.PrometheusResource(gateway, timeout)[source]\u00b6
\n

Integrates with Prometheus via the prometheus_client library.

\n
\n\n
\n
\ndagster_prometheus.prometheus_resource ResourceDefinition[source]\u00b6
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-prometheus", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "\n", "next": {"link": "../dagster-pyspark/", "title": "Pyspark (dagster-pyspark)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-postgres/", "title": "PostgreSQL (dagster-postgres)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-pyspark", "Pyspark (dagster-pyspark)", "N", "next"], ["sections/api/apidocs/libraries/dagster-postgres", "PostgreSQL (dagster-postgres)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-prometheus.rst.txt", "title": "Prometheus (dagster-prometheus)", "toc": "\n"}, "dagster-pyspark": {"alabaster_version": "0.7.12", "body": "
\n

Pyspark (dagster-pyspark)\u00b6

\n
\n
\ndagster_pyspark.pyspark_resource = <dagster.core.definitions.resource_definition.ResourceDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
spark_conf (permissive dict, optional)
\n
\nDefault Value:
{\n    "spark": {\n        "app": {},\n        "driver": {\n            "blockManager": {}\n        },\n        "executor": {\n            "pyspark": {},\n            "logs": {\n                "rolling": {\n                    "time": {}\n                }\n            }\n        },\n        "local": {},\n        "submit": {},\n        "log": {},\n        "redaction": {},\n        "python": {\n            "profile": {},\n            "worker": {}\n        },\n        "files": {},\n        "jars": {},\n        "pyspark": {\n            "driver": {}\n        },\n        "reducer": {},\n        "shuffle": {\n            "file": {},\n            "io": {},\n            "service": {\n                "index": {\n                    "cache": {}\n                }\n            },\n            "sort": {},\n            "spill": {},\n            "registration": {}\n        },\n        "eventLog": {\n            "logBlockUpdates": {},\n            "longForm": {},\n            "buffer": {}\n        },\n        "ui": {\n            "dagGraph": {},\n            "liveUpdate": {}\n        },\n        "worker": {\n            "ui": {}\n        },\n        "sql": {\n            "ui": {}\n        },\n        "streaming": {\n            "ui": {},\n            "backpressure": {},\n            "receiver": {\n                "writeAheadLog": {}\n            },\n            "kafka": {},\n            "driver": {\n                "writeAheadLog": {}\n            }\n        },\n        "broadcast": {},\n        "io": {\n            "compression": {\n                "lz4": {},\n                "snappy": {},\n                "zstd": {}\n            }\n        },\n        "kryo": {},\n        "kryoserializer": {\n            "buffer": {}\n        },\n        "rdd": {},\n        "serializer": {},\n        "memory": {\n            "offHeap": {}\n        },\n        "storage": {\n            "replication": {}\n        },\n        "cleaner": {\n            "periodicGC": {},\n            "referenceTracking": {\n                "blocking": {}\n            }\n        },\n        "default": {},\n        "hadoop": {\n            "mapreduce": {\n                "fileoutputcommitter": {\n                    "algorithm": {}\n                }\n            }\n        },\n        "rpc": {\n            "message": {},\n            "retry": {}\n        },\n        "blockManager": {},\n        "network": {},\n        "port": {},\n        "core": {\n            "connection": {\n                "ack": {\n                    "wait": {}\n                }\n            }\n        },\n        "cores": {},\n        "locality": {\n            "wait": {}\n        },\n        "scheduler": {\n            "revive": {},\n            "listenerbus": {\n                "eventqueue": {}\n            }\n        },\n        "blacklist": {\n            "task": {},\n            "stage": {},\n            "application": {\n                "fetchFailure": {}\n            }\n        },\n        "speculation": {},\n        "task": {\n            "reaper": {}\n        },\n        "stage": {},\n        "dynamicAllocation": {},\n        "r": {\n            "driver": {},\n            "shell": {}\n        },\n        "graphx": {\n            "pregel": {}\n        },\n        "deploy": {\n            "zookeeper": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
spark (permissive dict, optional)
\n
\nDefault Value:
{\n    "app": {},\n    "driver": {\n        "blockManager": {}\n    },\n    "executor": {\n        "pyspark": {},\n        "logs": {\n            "rolling": {\n                "time": {}\n            }\n        }\n    },\n    "local": {},\n    "submit": {},\n    "log": {},\n    "redaction": {},\n    "python": {\n        "profile": {},\n        "worker": {}\n    },\n    "files": {},\n    "jars": {},\n    "pyspark": {\n        "driver": {}\n    },\n    "reducer": {},\n    "shuffle": {\n        "file": {},\n        "io": {},\n        "service": {\n            "index": {\n                "cache": {}\n            }\n        },\n        "sort": {},\n        "spill": {},\n        "registration": {}\n    },\n    "eventLog": {\n        "logBlockUpdates": {},\n        "longForm": {},\n        "buffer": {}\n    },\n    "ui": {\n        "dagGraph": {},\n        "liveUpdate": {}\n    },\n    "worker": {\n        "ui": {}\n    },\n    "sql": {\n        "ui": {}\n    },\n    "streaming": {\n        "ui": {},\n        "backpressure": {},\n        "receiver": {\n            "writeAheadLog": {}\n        },\n        "kafka": {},\n        "driver": {\n            "writeAheadLog": {}\n        }\n    },\n    "broadcast": {},\n    "io": {\n        "compression": {\n            "lz4": {},\n            "snappy": {},\n            "zstd": {}\n        }\n    },\n    "kryo": {},\n    "kryoserializer": {\n        "buffer": {}\n    },\n    "rdd": {},\n    "serializer": {},\n    "memory": {\n        "offHeap": {}\n    },\n    "storage": {\n        "replication": {}\n    },\n    "cleaner": {\n        "periodicGC": {},\n        "referenceTracking": {\n            "blocking": {}\n        }\n    },\n    "default": {},\n    "hadoop": {\n        "mapreduce": {\n            "fileoutputcommitter": {\n                "algorithm": {}\n            }\n        }\n    },\n    "rpc": {\n        "message": {},\n        "retry": {}\n    },\n    "blockManager": {},\n    "network": {},\n    "port": {},\n    "core": {\n        "connection": {\n            "ack": {\n                "wait": {}\n            }\n        }\n    },\n    "cores": {},\n    "locality": {\n        "wait": {}\n    },\n    "scheduler": {\n        "revive": {},\n        "listenerbus": {\n            "eventqueue": {}\n        }\n    },\n    "blacklist": {\n        "task": {},\n        "stage": {},\n        "application": {\n            "fetchFailure": {}\n        }\n    },\n    "speculation": {},\n    "task": {\n        "reaper": {}\n    },\n    "stage": {},\n    "dynamicAllocation": {},\n    "r": {\n        "driver": {},\n        "shell": {}\n    },\n    "graphx": {\n        "pregel": {}\n    },\n    "deploy": {\n        "zookeeper": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
app (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
name (dagster.StringSource, optional)
\n

Application Properties: The name of your application. This will appear in the UI and in log data.

\n
\n
\n
\n
driver (permissive dict, optional)
\n
\nDefault Value:
{\n    "blockManager": {}\n}\n
\n
\n
\nConfig Schema:
\n
cores (dagster.IntSource, optional)
\n

Application Properties: Number of cores to use for the driver process, only in cluster mode.

\n
\n
maxResultSize (dagster.StringSource, optional)
\n

Application Properties: Limit of total size of serialized results of all partitions for each Spark action (e.g. collect) in bytes. Should be at least 1M, or 0 for unlimited. Jobs will be aborted if the total size is above this limit. Having a high limit may cause out-of-memory errors in driver (depends on spark.driver.memory and memory overhead of objects in JVM). Setting a proper limit can protect the driver from out-of-memory errors.

\n
\n
memory (dagster.StringSource, optional)
\n

Application Properties: Amount of memory to use for the driver process, i.e. where SparkContext is initialized, in the same format as JVM memory strings with a size unit suffix (\u201ck\u201d, \u201cm\u201d, \u201cg\u201d or \u201ct\u201d) (e.g. 512m, 2g). Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the \u2013driver-memory command line option or in your default properties file.

\n
\n
memoryOverhead (dagster.StringSource, optional)
\n

Application Properties: The amount of off-heap memory to be allocated per driver in cluster mode, in MiB unless otherwise specified. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc. This tends to grow with the container size (typically 6-10%). This option is currently supported on YARN and Kubernetes.

\n
\n
supervise (Bool, optional)
\n

Application Properties: If true, restarts the driver automatically if it fails with a non-zero exit status. Only has effect in Spark standalone mode or Mesos cluster deploy mode.

\n
\n
extraClassPath (dagster.StringSource, optional)
\n

Runtime Environment: Extra classpath entries to prepend to the classpath of the driver. Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the \u2013driver-class-path command line option or in your default properties file.

\n
\n
extraJavaOptions (dagster.StringSource, optional)
\n

Runtime Environment: A string of extra JVM options to pass to the driver. For instance, GC settings or other logging. Note that it is illegal to set maximum heap size (-Xmx) settings with this option. Maximum heap size settings can be set with spark.driver.memory in the cluster mode and through the \u2013driver-memory command line option in the client mode. Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the \u2013driver-java-options command line option or in your default properties file.

\n
\n
extraLibraryPath (dagster.StringSource, optional)
\n

Runtime Environment: Set a special library path to use when launching the driver JVM. Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the --driver-library-path command line option or in your default properties file.

\n
\n
userClassPathFirst (Bool, optional)
\n

Runtime Environment: (Experimental) Whether to give user-added jars precedence over Spark\u2019s own jars when loading classes in the driver. This feature can be used to mitigate conflicts between Spark\u2019s dependencies and user dependencies. It is currently an experimental feature. This is used in cluster mode only.

\n
\n
blockManager (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
port (dagster.StringSource, optional)
\n

Networking: Driver-specific port for the block manager to listen on, for cases where it cannot use the same configuration as executors.

\n
\n
\n
\n
bindAddress (dagster.StringSource, optional)
\n

Networking: Hostname or IP address where to bind listening sockets. This config overrides the SPARK_LOCAL_IP environment variable (see below). It also allows a different address from the local one to be advertised to executors or external systems. This is useful, for example, when running containers with bridged networking. For this to properly work, the different ports used by the driver (RPC, block manager and UI) need to be forwarded from the container\u2019s host.

\n
\n
host (dagster.StringSource, optional)
\n

Networking: Hostname or IP address for the driver. This is used for communicating with the executors and the standalone Master.

\n
\n
port (dagster.StringSource, optional)
\n

Networking: Port for the driver to listen on. This is used for communicating with the executors and the standalone Master.

\n
\n
\n
\n
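To make the driver section above concrete, an illustrative sketch of how its fields nest (values are examples only, not recommendations):

# Illustrative driver-level settings from the schema above.
driver_conf = {
    "driver": {
        "cores": 2,                        # spark.driver.cores (cluster mode only)
        "memory": "2g",                    # spark.driver.memory
        "maxResultSize": "1g",             # spark.driver.maxResultSize
        "blockManager": {"port": "7005"},  # spark.driver.blockManager.port
    }
}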
executor (permissive dict, optional)
\n
\nDefault Value:
{\n    "pyspark": {},\n    "logs": {\n        "rolling": {\n            "time": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
memory (dagster.StringSource, optional)
\n

Application Properties: Amount of memory to use per executor process, in the same format as JVM memory strings with a size unit suffix (\u201ck\u201d, \u201cm\u201d, \u201cg\u201d or \u201ct\u201d) (e.g. 512m, 2g).

\n
\n
pyspark (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
memory (dagster.StringSource, optional)
\n

Application Properties: The amount of memory to be allocated to PySpark in each executor, in MiB unless otherwise specified. If set, PySpark memory for an executor will be limited to this amount. If not set, Spark will not limit Python\u2019s memory use and it is up to the application to avoid exceeding the overhead memory space shared with other non-JVM processes. When PySpark is run in YARN or Kubernetes, this memory is added to executor resource requests.

\n
\n
\n
\n
memoryOverhead (dagster.StringSource, optional)
\n

Application Properties: The amount of off-heap memory to be allocated per executor, in MiB unless otherwise specified. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc. This tends to grow with the executor size (typically 6-10%). This option is currently supported on YARN and Kubernetes.

\n
\n
extraClassPath (dagster.StringSource, optional)
\n

Runtime Environment: Extra classpath entries to prepend to the classpath of executors. This exists primarily for backwards-compatibility with older versions of Spark. Users typically should not need to set this option.

\n
\n
extraJavaOptions (dagster.StringSource, optional)
\n

Runtime Environment: A string of extra JVM options to pass to executors. For instance, GC settings or other logging. Note that it is illegal to set Spark properties or maximum heap size (-Xmx) settings with this option. Spark properties should be set using a SparkConf object or the spark-defaults.conf file used with the spark-submit script. Maximum heap size settings can be set with spark.executor.memory. The following symbols, if present will be interpolated: {{APP_ID}} will be replaced by application ID and {{EXECUTOR_ID}} will be replaced by executor ID. For example, to enable verbose gc logging to a file named for the executor ID of the app in /tmp, pass a \u2018value\u2019 of: -verbose:gc -Xloggc:/tmp/{{APP_ID}}-{{EXECUTOR_ID}}.gc

\n
\n
extraLibraryPath (dagster.StringSource, optional)
\n

Runtime Environment: Set a special library path to use when launching executor JVM\u2019s.

\n
\n
logs (permissive dict, optional)
\n
\nDefault Value:
{\n    "rolling": {\n        "time": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
rolling (permissive dict, optional)
\n
\nDefault Value:
{\n    "time": {}\n}\n
\n
\n
\nConfig Schema:
\n
maxRetainedFiles (dagster.IntSource, optional)
\n

Runtime Environment: Sets the number of latest rolling log files that are going to be retained by the system. Older log files will be deleted. Disabled by default.

\n
\n
enableCompression (Bool, optional)
\n

Runtime Environment: Enable executor log compression. If it is enabled, the rolled executor logs will be compressed. Disabled by default.

\n
\n
maxSize (dagster.IntSource, optional)
\n

Runtime Environment: Set the max size of the file in bytes by which the executor logs will be rolled over. Rolling is disabled by default. See spark.executor.logs.rolling.maxRetainedFiles for automatic cleaning of old logs.

\n
\n
strategy (dagster.StringSource, optional)
\n

Runtime Environment: Set the strategy of rolling of executor logs. By default it is disabled. It can be set to \u201ctime\u201d (time-based rolling) or \u201csize\u201d (size-based rolling). For \u201ctime\u201d, use spark.executor.logs.rolling.time.interval to set the rolling interval. For \u201csize\u201d, use spark.executor.logs.rolling.maxSize to set the maximum file size for rolling.

\n
\n
time (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
interval (dagster.StringSource, optional)
\n

Runtime Environment: Set the time interval by which the executor logs will be rolled over. Rolling is disabled by default. Valid values are daily, hourly, minutely or any interval in seconds. See spark.executor.logs.rolling.maxRetainedFiles for automatic cleaning of old logs.

\n
\n
\n
\n
\n
\n
\n
\n
userClassPathFirst (Bool, optional)
\n

Runtime Environment: (Experimental) Same functionality as spark.driver.userClassPathFirst, but applied to executor instances.

\n
\n
cores (dagster.IntSource, optional)
\n

Execution Behavior: The number of cores to use on each executor. In standalone and Mesos coarse-grained modes, for more detail, see this description.

\n
\n
heartbeatInterval (dagster.StringSource, optional)
\n

Execution Behavior: Interval between each executor\u2019s heartbeats to the driver. Heartbeats let the driver know that the executor is still alive and update it with metrics for in-progress tasks. spark.executor.heartbeatInterval should be significantly less than spark.network.timeout

\n
\n
\n
\n
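An illustrative sketch of the executor section above, including the nested log-rolling settings (values are examples only):

# Illustrative executor settings, including rolling-log options.
executor_conf = {
    "executor": {
        "memory": "4g",                         # spark.executor.memory
        "cores": 2,                             # spark.executor.cores
        "logs": {
            "rolling": {
                "strategy": "time",             # spark.executor.logs.rolling.strategy
                "time": {"interval": "daily"},  # spark.executor.logs.rolling.time.interval
                "maxRetainedFiles": 72,         # spark.executor.logs.rolling.maxRetainedFiles
            }
        },
    }
}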
extraListeners (dagster.StringSource, optional)
\n

Application Properties: A comma-separated list of classes that implement SparkListener; when initializing SparkContext, instances of these classes will be created and registered with Spark\u2019s listener bus. If a class has a single-argument constructor that accepts a SparkConf, that constructor will be called; otherwise, a zero-argument constructor will be called. If no valid constructor can be found, the SparkContext creation will fail with an exception.

\n
\n
local (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
dir (dagster.StringSource, optional)
\n

Application Properties: Directory to use for \u201cscratch\u201d space in Spark, including map output files and RDDs that get stored on disk. This should be on a fast, local disk in your system. It can also be a comma-separated list of multiple directories on different disks. NOTE: In Spark 1.0 and later this will be overridden by SPARK_LOCAL_DIRS (Standalone), MESOS_SANDBOX (Mesos) or LOCAL_DIRS (YARN) environment variables set by the cluster manager.

\n
\n
\n
\n
logConf (Bool, optional)
\n

Application Properties: Logs the effective SparkConf as INFO when a SparkContext is started.

\n
\n
master (dagster.StringSource, optional)
\n

Application Properties: The cluster manager to connect to. See the list of allowed master URL\u2019s.

\n
\n
submit (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
deployMode (dagster.StringSource, optional)
\n

Application Properties: The deploy mode of the Spark driver program, either "client" or "cluster", which determines whether the driver program is launched locally ("client") or remotely ("cluster") on one of the nodes inside the cluster.

\n
\n
pyFiles (dagster.StringSource, optional)
\n

Runtime Environment: Comma-separated list of .zip, .egg, or .py files to place on the PYTHONPATH for Python apps. Globs are allowed.

\n
\n
\n
\n
log (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
callerContext (dagster.StringSource, optional)
\n

Application Properties: Application information that will be written into Yarn RM log/HDFS audit log when running on Yarn/HDFS. Its length depends on the Hadoop configuration hadoop.caller.context.max.size. It should be concise, and typically can have up to 50 characters.

\n
\n
\n
\n
redaction (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
regex (dagster.StringSource, optional)
\n

Runtime Environment: Regex to decide which Spark configuration properties and environment variables in driver and executor environments contain sensitive information. When this regex matches a property key or value, the value is redacted from the environment UI and various logs like YARN and event logs.

\n
\n
\n
\n
python (permissive dict, optional)
\n
\nDefault Value:
{\n    "profile": {},\n    "worker": {}\n}\n
\n
\n
\nConfig Schema:
\n
profile (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (Bool, optional)
\n

Runtime Environment: Enable profiling in Python workers. The profile result will show up via sc.show_profiles(), or it will be displayed before the driver exits. It can also be dumped to disk with sc.dump_profiles(path). If some of the profile results have been displayed manually, they will not be displayed automatically before the driver exits. By default the pyspark.profiler.BasicProfiler will be used, but this can be overridden by passing a profiler class in as a parameter to the SparkContext constructor.

\n
\n
dump (dagster.StringSource, optional)
\n

Runtime Environment: The directory used to dump the profile result before the driver exits. The results will be dumped as a separate file for each RDD. They can be loaded with pstats.Stats(). If this is specified, the profile result will not be displayed automatically.

\n
\n
\n
\n
worker (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
memory (dagster.StringSource, optional)
\n

Runtime Environment: Amount of memory to use per python worker process during aggregation, in the same format as JVM memory strings with a size unit suffix (\u201ck\u201d, \u201cm\u201d, \u201cg\u201d or \u201ct\u201d) (e.g. 512m, 2g). If the memory used during aggregation goes above this amount, it will spill the data into disks.

\n
\n
reuse (Bool, optional)
\n

Runtime Environment: Whether to reuse Python workers. If yes, a fixed number of Python workers is used, and a Python process does not need to be fork()ed for every task. This is very useful when there is a large broadcast, since the broadcast will not need to be transferred from the JVM to a Python worker for every task.

\n
\n
\n
\n
\n
\n
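An illustrative sketch of the python section above; the root key under profile appears to stand for the bare spark.python.profile property (values are examples only):

# Illustrative Python worker settings.
python_conf = {
    "python": {
        "worker": {
            "memory": "512m",        # spark.python.worker.memory
            "reuse": True,           # spark.python.worker.reuse
        },
        "profile": {"root": False},  # spark.python.profile
    }
}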
files (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional)
\n

Runtime Environment: Comma-separated list of files to be placed in the working directory of each executor. Globs are allowed.

\n
\n
fetchTimeout (dagster.StringSource, optional)
\n

Execution Behavior: Communication timeout to use when fetching files added through SparkContext.addFile() from the driver.

\n
\n
useFetchCache (Bool, optional)
\n

Execution Behavior: If set to true (default), file fetching will use a local cache that is shared by executors that belong to the same application, which can improve task launching performance when running many executors on the same host. If set to false, these caching optimizations will be disabled and all executors will fetch their own copies of files. This optimization may be disabled in order to use Spark local directories that reside on NFS filesystems (see SPARK-6313 for more details).

\n
\n
overwrite (Bool, optional)
\n

Execution Behavior: Whether to overwrite files added through SparkContext.addFile() when the target file exists and its contents do not match those of the source.

\n
\n
maxPartitionBytes (dagster.IntSource, optional)
\n

Execution Behavior: The maximum number of bytes to pack into a single partition when reading files.

\n
\n
openCostInBytes (dagster.IntSource, optional)
\n

Execution Behavior: The estimated cost to open a file, measured by the number of bytes that could be scanned in the same time. This is used when putting multiple files into a partition. It is better to over-estimate; then the partitions with small files will be faster than partitions with bigger files.

\n
\n
\n
\n
jars (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional)
\n

Runtime Environment: Comma-separated list of jars to include on the driver and executor classpaths. Globs are allowed.

\n
\n
packages (dagster.StringSource, optional)
\n

Runtime Environment: Comma-separated list of Maven coordinates of jars to include on the driver and executor classpaths. The coordinates should be groupId:artifactId:version. If spark.jars.ivySettings is given, artifacts will be resolved according to the configuration in the file; otherwise artifacts will be searched for in the local maven repo, then maven central and finally any additional remote repositories given by the command-line option --repositories. For more details, see Advanced Dependency Management.

\n
\n
excludes (dagster.StringSource, optional)
\n

Runtime Environment: Comma-separated list of groupId:artifactId, to exclude while resolving the dependencies provided in spark.jars.packages to avoid dependency conflicts.

\n
\n
ivy (dagster.StringSource, optional)
\n

Runtime Environment: Path to specify the Ivy user directory, used for the local Ivy cache and package files from spark.jars.packages. This will override the Ivy property ivy.default.ivy.user.dir which defaults to ~/.ivy2.

\n
\n
ivySettings (dagster.StringSource, optional)
\n

Runtime Environment: Path to an Ivy settings file to customize resolution of jars specified using spark.jars.packages instead of the built-in defaults, such as maven central. Additional repositories given by the command-line option --repositories or spark.jars.repositories will also be included. Useful for allowing Spark to resolve artifacts from behind a firewall, e.g. via an in-house artifact server like Artifactory. Details on the settings file format can be found at http://ant.apache.org/ivy/history/latest-milestone/settings.html

\n
\n
repositories (dagster.StringSource, optional)
\n

Runtime Environment: Comma-separated list of additional remote repositories to search for the maven coordinates given with --packages or spark.jars.packages.

\n
\n
\n
\n
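The files and jars sections above carry the dependency-related properties; the root keys appear to stand for the bare spark.files and spark.jars properties. An illustrative sketch (coordinates, URLs, and file names are hypothetical):

# Illustrative dependency settings.
dependency_conf = {
    "jars": {
        "packages": "org.postgresql:postgresql:42.2.5",    # spark.jars.packages (example coordinate)
        "repositories": "https://repo.example.com/maven",  # spark.jars.repositories (hypothetical URL)
    },
    "files": {"root": "extra.conf,lookup.csv"},             # spark.files (hypothetical file list)
}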
pyspark (permissive dict, optional)
\n
\nDefault Value:
{\n    "driver": {}\n}\n
\n
\n
\nConfig Schema:
\n
driver (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
python (dagster.StringSource, optional)
\n

Runtime Environment: Python binary executable to use for PySpark in driver. (default is spark.pyspark.python)

\n
\n
\n
\n
python (dagster.StringSource, optional)
\n

Runtime Environment: Python binary executable to use for PySpark in both driver and executors.

\n
\n
\n
\n
reducer (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxSizeInFlight (dagster.StringSource, optional)
\n

Shuffle Behavior: Maximum size of map outputs to fetch simultaneously from each reduce task, in MiB unless otherwise specified. Since each output requires us to create a buffer to receive it, this represents a fixed memory overhead per reduce task, so keep it small unless you have a large amount of memory.

\n
\n
maxReqsInFlight (dagster.IntSource, optional)
\n

Shuffle Behavior: This configuration limits the number of remote requests to fetch blocks at any given point. When the number of hosts in the cluster increases, it might lead to a very large number of inbound connections to one or more nodes, causing the workers to fail under load. By limiting the number of fetch requests, this scenario can be mitigated.

\n
\n
maxBlocksInFlightPerAddress (dagster.IntSource, optional)
\n

Shuffle Behavior: This configuration limits the number of remote blocks being fetched per reduce task from a given host port. When a large number of blocks are being requested from a given address in a single fetch or simultaneously, this could crash the serving executor or Node Manager. This is especially useful to reduce the load on the Node Manager when external shuffle is enabled. You can mitigate this issue by setting it to a lower value.

\n
\n
\n
\n
maxRemoteBlockSizeFetchToMem (dagster.IntSource, optional)
\n

Shuffle Behavior: The remote block will be fetched to disk when the size of the block is above this threshold in bytes. This is to avoid a giant request that takes too much memory. By default, this is only enabled for blocks > 2GB, as those cannot be fetched directly into memory, no matter what resources are available. But it can be turned down to a much lower value (e.g. 200m) to avoid using too much memory on smaller blocks as well. Note that this configuration will affect both shuffle fetch and block manager remote block fetch. For users who have enabled the external shuffle service, this feature can only be used when the external shuffle service is newer than Spark 2.2.

\n
\n
shuffle (permissive dict, optional)
\n
\nDefault Value:
{\n    "file": {},\n    "io": {},\n    "service": {\n        "index": {\n            "cache": {}\n        }\n    },\n    "sort": {},\n    "spill": {},\n    "registration": {}\n}\n
\n
\n
\nConfig Schema:
\n
compress (Bool, optional)
\n

Shuffle Behavior: Whether to compress map output files. Generally a good idea. Compression will use spark.io.compression.codec.

\n
\n
file (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
buffer (dagster.StringSource, optional)
\n

Shuffle Behavior: Size of the in-memory buffer for each shuffle file output stream, in KiB unless otherwise specified. These buffers reduce the number of disk seeks and system calls made in creating intermediate shuffle files.

\n
\n
\n
\n
io (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxRetries (dagster.IntSource, optional)
\n

Shuffle Behavior: (Netty only) Fetches that fail due to IO-related exceptions are automatically retried if this is set to a non-zero value. This retry logic helps stabilize large shuffles in the face of long GC pauses or transient network connectivity issues.

\n
\n
numConnectionsPerPeer (dagster.IntSource, optional)
\n

Shuffle Behavior: (Netty only) Connections between hosts are reused in order to reduce connection buildup for large clusters. For clusters with many hard disks and few hosts, this may result in insufficient concurrency to saturate all disks, and so users may consider increasing this value.

\n
\n
preferDirectBufs (Bool, optional)
\n

Shuffle Behavior: (Netty only) Off-heap buffers are used to reduce garbage collection during shuffle and cache block transfer. For environments where off-heap memory is tightly limited, users may wish to turn this off to force all allocations from Netty to be on-heap.

\n
\n
retryWait (dagster.StringSource, optional)
\n

Shuffle Behavior: (Netty only) How long to wait between retries of fetches. The maximum delay caused by retrying is 15 seconds by default, calculated as maxRetries * retryWait.

\n
\n
\n
\n
service (permissive dict, optional)
\n
\nDefault Value:
{\n    "index": {\n        "cache": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
enabled (Bool, optional)
\n

Shuffle Behavior: Enables the external shuffle service. This service preserves the shuffle files written by executors so the executors can be safely removed. This must be enabled if spark.dynamicAllocation.enabled is \u201ctrue\u201d. The external shuffle service must be set up in order to enable it. See dynamic allocation configuration and setup documentation for more information.

\n
\n
port (dagster.IntSource, optional)
\n

Shuffle Behavior: Port on which the external shuffle service will run.

\n
\n
index (permissive dict, optional)
\n
\nDefault Value:
{\n    "cache": {}\n}\n
\n
\n
\nConfig Schema:
\n
cache (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
size (dagster.StringSource, optional)
\n

Shuffle Behavior: Cache entries limited to the specified memory footprint in bytes.

\n
\n
\n
\n
\n
\n
\n
\n
maxChunksBeingTransferred (dagster.IntSource, optional)
\n

Shuffle Behavior: The max number of chunks allowed to be transferred at the same time on shuffle service. Note that new incoming connections will be closed when the max number is hit. The client will retry according to the shuffle retry configs (see spark.shuffle.io.maxRetries and spark.shuffle.io.retryWait), if those limits are reached the task will fail with fetch failure.

\n
\n
sort (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
bypassMergeThreshold (dagster.IntSource, optional)
\n

Shuffle Behavior: (Advanced) In the sort-based shuffle manager, avoid merge-sorting data if there is no map-side aggregation and there are at most this many reduce partitions.

\n
\n
\n
\n
spill (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
compress (Bool, optional)
\n

Shuffle Behavior: Whether to compress data spilled during shuffles. Compression will use spark.io.compression.codec.

\n
\n
\n
\n
accurateBlockThreshold (dagster.IntSource, optional)
\n

Shuffle Behavior: Threshold in bytes above which the size of shuffle blocks in HighlyCompressedMapStatus is accurately recorded. This helps to prevent OOM by avoiding underestimating shuffle block size when fetching shuffle blocks.

\n
\n
registration (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
timeout (dagster.IntSource, optional)
\n

Shuffle Behavior: Timeout in milliseconds for registration to the external shuffle service.

\n
\n
maxAttempts (dagster.IntSource, optional)
\n

Shuffle Behavior: When registration with the external shuffle service fails, it will be retried up to maxAttempts times.

\n
\n
\n
\n
memoryFraction (Float, optional)
\n

Memory Management: (deprecated) This is read only if spark.memory.useLegacyMode is enabled. Fraction of Java heap to use for aggregation and cogroups during shuffles. At any given time, the collective size of all in-memory maps used for shuffles is bounded by this limit, beyond which the contents will begin to spill to disk. If spills occur often, consider increasing this value at the expense of spark.storage.memoryFraction.

\n
\n
\n
\n
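An illustrative sketch of the shuffle section above (values are examples only; per the descriptions, the external shuffle service must be enabled when dynamic allocation is used):

# Illustrative shuffle settings.
shuffle_conf = {
    "shuffle": {
        "compress": True,                             # spark.shuffle.compress
        "service": {"enabled": True},                 # spark.shuffle.service.enabled
        "io": {"maxRetries": 6, "retryWait": "10s"},  # spark.shuffle.io.maxRetries / retryWait
    }
}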
eventLog (permissive dict, optional)
\n
\nDefault Value:
{\n    "logBlockUpdates": {},\n    "longForm": {},\n    "buffer": {}\n}\n
\n
\n
\nConfig Schema:
\n
logBlockUpdates (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional)
\n

Spark UI: Whether to log events for every block update, if spark.eventLog.enabled is true. *Warning*: This will increase the size of the event log considerably.

\n
\n
\n
\n
longForm (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional)
\n

Spark UI: If true, use the long form of call sites in the event log. Otherwise use the short form.

\n
\n
\n
\n
compress (dagster.StringSource, optional)
\n

Spark UI: Whether to compress logged events, if spark.eventLog.enabled is true. Compression will use spark.io.compression.codec.

\n
\n
dir (dagster.StringSource, optional)
\n

Spark UI: Base directory in which Spark events are logged, if spark.eventLog.enabled is true. Within this base directory, Spark creates a sub-directory for each application, and logs the events specific to the application in this directory. Users may want to set this to a unified location like an HDFS directory so history files can be read by the history server.

\n
\n
enabled (dagster.StringSource, optional)
\n

Spark UI: Whether to log Spark events, useful for reconstructing the Web UI after the application has finished.

\n
\n
overwrite (dagster.StringSource, optional)
\n

Spark UI: Whether to overwrite any existing files.

\n
\n
buffer (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
kb (dagster.StringSource, optional)
\n

Spark UI: Buffer size to use when writing to output streams, in KiB unless otherwise specified.

\n
\n
\n
\n
\n
\n
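An illustrative sketch of the eventLog section above; note that these fields are dagster.StringSource, so boolean-like values are given as strings (the log directory is hypothetical):

# Illustrative event log settings.
eventlog_conf = {
    "eventLog": {
        "enabled": "true",              # spark.eventLog.enabled
        "dir": "hdfs:///spark-events",  # spark.eventLog.dir (hypothetical location)
        "compress": "true",             # spark.eventLog.compress
    }
}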
ui (permissive dict, optional)
\n
\nDefault Value:
{\n    "dagGraph": {},\n    "liveUpdate": {}\n}\n
\n
\n
\nConfig Schema:
\n
dagGraph (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedRootRDDs (dagster.StringSource, optional)
\n

Spark UI: How many DAG graph nodes the Spark UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
enabled (dagster.StringSource, optional)
\n

Spark UI: Whether to run the web UI for the Spark application.

\n
\n
killEnabled (dagster.StringSource, optional)
\n

Spark UI: Allows jobs and stages to be killed from the web UI.

\n
\n
liveUpdate (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
period (dagster.StringSource, optional)
\n

Spark UI: How often to update live entities. -1 means \u201cnever update\u201d when replaying applications, meaning only the last write will happen. For live applications, this avoids a few operations that we can live without when rapidly processing incoming task events.

\n
\n
\n
\n
port (dagster.StringSource, optional)
\n

Spark UI: Port for your application\u2019s dashboard, which shows memory and workload data.

\n
\n
retainedJobs (dagster.StringSource, optional)
\n

Spark UI: How many jobs the Spark UI and status APIs remember before garbage collecting. This is a target maximum, and fewer elements may be retained in some circumstances.

\n
\n
retainedStages (dagster.StringSource, optional)
\n

Spark UI: How many stages the Spark UI and status APIs remember before garbage collecting. This is a target maximum, and fewer elements may be retained in some circumstances.

\n
\n
retainedTasks (dagster.StringSource, optional)
\n

Spark UI: How many tasks the Spark UI and status APIs remember before garbage collecting. This is a target maximum, and fewer elements may be retained in some circumstances.

\n
\n
reverseProxy (dagster.StringSource, optional)
\n

Spark UI: Enable running the Spark Master as a reverse proxy for worker and application UIs. In this mode, the Spark master will reverse proxy the worker and application UIs to enable access without requiring direct access to their hosts. Use it with caution, as the worker and application UIs will not be accessible directly; you will only be able to access them through the Spark master/proxy public URL. This setting affects all the workers and application UIs running in the cluster and must be set on all the workers, drivers and masters.

\n
\n
reverseProxyUrl (dagster.StringSource, optional)
\n

Spark UI: This is the URL where your proxy is running, i.e. the proxy running in front of the Spark Master. This is useful when running a proxy for authentication, e.g. an OAuth proxy. Make sure this is a complete URL including scheme (http/https) and port to reach your proxy.

\n
\n
showConsoleProgress (dagster.StringSource, optional)
\n

Spark UI: Show the progress bar in the console. The progress bar shows the progress of stages that run for longer than 500ms. If multiple stages run at the same time, multiple progress bars will be displayed on the same line.

\n
\n
retainedDeadExecutors (dagster.StringSource, optional)
\n

Spark UI: How many dead executors the Spark UI and status APIs remember before garbage collecting.

\n
\n
filters (dagster.StringSource, optional)
\n

Spark UI: Comma separated list of filter class names to apply to the Spark Web UI. The filter should be a standard javax servlet Filter. Filter parameters can also be specified in the configuration, by setting config entries of the form spark.<class name of filter>.param.<param name>=<value>. For example: spark.ui.filters=com.test.filter1 together with spark.com.test.filter1.param.name1=foo and spark.com.test.filter1.param.name2=bar

\n
\n
\n
\n
worker (permissive dict, optional)
\n
\nDefault Value:
{\n    "ui": {}\n}\n
\n
\n
\nConfig Schema:
\n
ui (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedExecutors (dagster.StringSource, optional)
\n

Spark UI: How many finished executors the Spark UI and status APIs remember before garbage collecting.

\n
\n
retainedDrivers (dagster.StringSource, optional)
\n

Spark UI: How many finished drivers the Spark UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
\n
\n
sql (permissive dict, optional)
\n
\nDefault Value:
{\n    "ui": {}\n}\n
\n
\n
\nConfig Schema:
\n
ui (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedExecutions (dagster.StringSource, optional)
\n

Spark UI: How many finished executions the Spark UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
\n
\n
streaming (permissive dict, optional)
\n
\nDefault Value:
{\n    "ui": {},\n    "backpressure": {},\n    "receiver": {\n        "writeAheadLog": {}\n    },\n    "kafka": {},\n    "driver": {\n        "writeAheadLog": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
ui (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedBatches (dagster.StringSource, optional)
\n

Spark Streaming: How many batches the Spark Streaming UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
backpressure (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional)
\n

Spark Streaming: Enables or disables Spark Streaming\u2019s internal backpressure mechanism (since 1.5). This enables the Spark Streaming to control the receiving rate based on the current batch scheduling delays and processing times so that the system receives only as fast as the system can process. Internally, this dynamically sets the maximum receiving rate of receivers. This rate is upper bounded by the values spark.streaming.receiver.maxRate and spark.streaming.kafka.maxRatePerPartition if they are set (see below).

\n
\n
initialRate (dagster.StringSource, optional)
\n

Spark Streaming: This is the initial maximum receiving rate at which each receiver will receive data for the first batch when the backpressure mechanism is enabled.

\n
\n
\n
\n
blockInterval (dagster.StringSource, optional)
\n

Spark Streaming: Interval at which data received by Spark Streaming receivers is chunked into blocks of data before storing them in Spark. Minimum recommended - 50 ms. See the performance tuning section in the Spark Streaming programming guide for more details.

\n
\n
receiver (permissive dict, optional)
\n
\nDefault Value:
{\n    "writeAheadLog": {}\n}\n
\n
\n
\nConfig Schema:
\n
maxRate (dagster.StringSource, optional)
\n

Spark Streaming: Maximum rate (number of records per second) at which each receiver will receive data. Effectively, each stream will consume at most this number of records per second. Setting this configuration to 0 or a negative number will put no limit on the rate. See the deployment guide in the Spark Streaming programming guide for more details.

\n
\n
writeAheadLog (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enable (dagster.StringSource, optional)
\n

Spark Streaming: Enable write-ahead logs for receivers. All the input data received through receivers will be saved to write-ahead logs that will allow it to be recovered after driver failures. See the deployment guide in the Spark Streaming programming guide for more details.

\n
\n
closeFileAfterWrite (dagster.StringSource, optional)
\n

Spark Streaming: Whether to close the file after writing a write-ahead log record on the receivers. Set this to \u2018true\u2019 when you want to use S3 (or any file system that does not support flushing) for the data WAL on the receivers.

\n
\n
\n
\n
\n
\n
unpersist (dagster.StringSource, optional)
\n

Spark Streaming: Force RDDs generated and persisted by Spark Streaming to be automatically unpersisted from Spark\u2019s memory. The raw input data received by Spark Streaming is also automatically cleared. Setting this to false will allow the raw data and persisted RDDs to be accessible outside the streaming application as they will not be cleared automatically. But it comes at the cost of higher memory usage in Spark.

\n
\n
stopGracefullyOnShutdown (dagster.StringSource, optional)
\n

Spark Streaming: If true, Spark shuts down the StreamingContext gracefully on JVM shutdown rather than immediately.

\n
\n
kafka (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxRatePerPartition (dagster.StringSource, optional)
\n

Spark Streaming: Maximum rate (number of records per second) at which data will be read from each Kafka partition when using the new Kafka direct stream API. See the Kafka Integration guide for more details.

\n
\n
minRatePerPartition (dagster.StringSource, optional)
\n

Spark Streaming: Minimum rate (number of records per second) at which data will be read from each Kafka partition when using the new Kafka direct stream API.

\n
\n
maxRetries (dagster.StringSource, optional)
\n

Spark Streaming: Maximum number of consecutive retries the driver will make in order to find the latest offsets on the leader of each partition (a default value of 1 means that the driver will make a maximum of 2 attempts). Only applies to the new Kafka direct stream API.

\n
\n
\n
\n
driver (permissive dict, optional)
\n
\nDefault Value:
{\n    "writeAheadLog": {}\n}\n
\n
\n
\nConfig Schema:
\n
writeAheadLog (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
closeFileAfterWrite (dagster.StringSource, optional)
\n

Spark Streaming: Whether to close the file after writing a write-ahead log record on the driver. Set this to \u2018true\u2019 when you want to use S3 (or any file system that does not support flushing) for the metadata WAL on the driver.

\n
\n
\n
\n
\n
\n
\n
\n
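An illustrative sketch of the streaming section above (values are examples only; these fields are dagster.StringSource, so numbers are given as strings):

# Illustrative streaming rate-control settings.
streaming_conf = {
    "streaming": {
        "backpressure": {"enabled": "true"},      # spark.streaming.backpressure.enabled
        "receiver": {"maxRate": "1000"},          # spark.streaming.receiver.maxRate
        "kafka": {"maxRatePerPartition": "500"},  # spark.streaming.kafka.maxRatePerPartition
    }
}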
broadcast (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
compress (dagster.StringSource, optional)
\n

Compression and Serialization: Whether to compress broadcast variables before sending them. Generally a good idea. Compression will use spark.io.compression.codec.

\n
\n
blockSize (dagster.StringSource, optional)
\n

Execution Behavior: Size of each piece of a block for TorrentBroadcastFactory, in KiB unless otherwise specified. Too large a value decreases parallelism during broadcast (makes it slower); however, if it is too small, BlockManager might take a performance hit.

\n
\n
checksum (dagster.StringSource, optional)
\n

Execution Behavior: Whether to enable checksum for broadcast. If enabled, broadcasts will include a checksum, which can help detect corrupted blocks, at the cost of computing and sending a little more data. It\u2019s possible to disable it if the network has other mechanisms to guarantee data won\u2019t be corrupted during broadcast.

\n
\n
\n
\n
io (permissive dict, optional)
\n
\nDefault Value:
{\n    "compression": {\n        "lz4": {},\n        "snappy": {},\n        "zstd": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
compression (permissive dict, optional)
\n
\nDefault Value:
{\n    "lz4": {},\n    "snappy": {},\n    "zstd": {}\n}\n
\n
\n
\nConfig Schema:
\n
codec (dagster.StringSource, optional)
\n

Compression and Serialization: The codec used to compress internal data such as RDD partitions, event log, broadcast variables and shuffle outputs. By default, Spark provides four codecs: lz4, lzf, snappy, and zstd. You can also use fully qualified class names to specify the codec, e.g. org.apache.spark.io.LZ4CompressionCodec, org.apache.spark.io.LZFCompressionCodec, org.apache.spark.io.SnappyCompressionCodec, and org.apache.spark.io.ZStdCompressionCodec.

\n
\n
lz4 (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
blockSize (dagster.StringSource, optional)
\n

Compression and Serialization: Block size in bytes used in LZ4 compression, in the case when LZ4 compression codec is used. Lowering this block size will also lower shuffle memory usage when LZ4 is used.

\n
\n
\n
\n
snappy (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
blockSize (dagster.StringSource, optional)
\n

Compression and Serialization: Block size in bytes used in Snappy compression, in the case when Snappy compression codec is used. Lowering this block size will also lower shuffle memory usage when Snappy is used.

\n
\n
\n
\n
zstd (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
level (dagster.StringSource, optional)
\n

Compression and Serialization: Compression level for Zstd compression codec. Increasing the compression level will result in better compression at the expense of more CPU and memory.

\n
\n
bufferSize (dagster.StringSource, optional)
\n

Compression and Serialization: Buffer size in bytes used in Zstd compression, in the case when Zstd compression codec is used. Lowering this size will lower the shuffle memory usage when Zstd is used, but it might increase the compression cost because of excessive JNI call overhead.

\n
\n
\n
\n
\n
\n
\n
\n
kryo (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
classesToRegister (dagster.StringSource, optional)
\n

Compression and Serialization: If you use Kryo serialization, give a comma-separated list of custom class names to register with Kryo. See the tuning guide for more details.

\n
\n
referenceTracking (dagster.StringSource, optional)
\n

Compression and Serialization: Whether to track references to the same object when serializing data with Kryo, which is necessary if your object graphs have loops and useful for efficiency if they contain multiple copies of the same object. Can be disabled to improve performance if you know this is not the case.

\n
\n
registrationRequired (dagster.StringSource, optional)
\n

Compression and Serialization: Whether to require registration with Kryo. If set to \u2018true\u2019, Kryo will throw an exception if an unregistered class is serialized. If set to false (the default), Kryo will write unregistered class names along with each object. Writing class names can cause significant performance overhead, so enabling this option can enforce strictly that a user has not omitted classes from registration.

\n
\n
registrator (dagster.StringSource, optional)
\n

Compression and Serialization: If you use Kryo serialization, give a comma-separated list of classes that register your custom classes with Kryo. This property is useful if you need to register your classes in a custom way, e.g. to specify a custom field serializer. Otherwise spark.kryo.classesToRegister is simpler. It should be set to classes that extend KryoRegistrator. See the tuning guide for more details.

\n
\n
unsafe (dagster.StringSource, optional)
\n

Compression and Serialization: Whether to use unsafe based Kryo serializer. Can be substantially faster by using Unsafe Based IO.

\n
\n
\n
\n
kryoserializer (permissive dict, optional)
\n
\nDefault Value:
{\n    "buffer": {}\n}\n
\n
\n
\nConfig Schema:
\n
buffer (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional)
\n

Compression and Serialization: Initial size of Kryo\u2019s serialization buffer, in KiB unless otherwise specified. Note that there will be one buffer per core on each worker. This buffer will grow up to spark.kryoserializer.buffer.max if needed.

\n
\n
max (dagster.StringSource, optional)
\n

Compression and Serialization: Maximum allowable size of Kryo serialization buffer, in MiB unless otherwise specified. This must be larger than any object you attempt to serialize and must be less than 2048m. Increase this if you get a \u201cbuffer limit exceeded\u201d exception inside Kryo.

\n
\n
\n
\n
\n
\n
rdd (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
compress (dagster.StringSource, optional)
\n

Compression and Serialization: Whether to compress serialized RDD partitions (e.g. for StorageLevel.MEMORY_ONLY_SER in Java and Scala or StorageLevel.MEMORY_ONLY in Python). Can save substantial space at the cost of some extra CPU time. Compression will use spark.io.compression.codec.

\n
\n
\n
\n
serializer (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional)
\n

Compression and Serialization: Class to use for serializing objects that will be sent over the network or need to be cached in serialized form. The default of Java serialization works with any Serializable Java object but is quite slow, so we recommend using org.apache.spark.serializer.KryoSerializer and configuring Kryo serialization when speed is necessary. Can be any subclass of org.apache.spark.Serializer.

\n
\n
objectStreamReset (dagster.StringSource, optional)
\n

Compression and Serialization: When serializing using org.apache.spark.serializer.JavaSerializer, the serializer caches objects to prevent writing redundant data, however that stops garbage collection of those objects. By calling \u2018reset\u2019 you flush that info from the serializer, and allow old objects to be collected. To turn off this periodic reset set it to -1. By default it will reset the serializer every 100 objects.

\n
\n
\n
\n
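An illustrative sketch combining the kryo, kryoserializer, and serializer sections above; the root keys appear to denote the bare spark.serializer and spark.kryoserializer.buffer properties (values are examples only):

# Illustrative serialization settings.
serialization_conf = {
    "serializer": {"root": "org.apache.spark.serializer.KryoSerializer"},  # spark.serializer
    "kryoserializer": {"buffer": {"max": "128m"}},                         # spark.kryoserializer.buffer.max
    "kryo": {"registrationRequired": "false"},                             # spark.kryo.registrationRequired
}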
memory (permissive dict, optional)
\n
\nDefault Value:
{\n    "offHeap": {}\n}\n
\n
\n
\nConfig Schema:
\n
fraction (Float, optional)
\n

Memory Management: Fraction of (heap space - 300MB) used for execution and storage. The lower this is, the more frequently spills and cached data eviction occur. The purpose of this config is to set aside memory for internal metadata, user data structures, and imprecise size estimation in the case of sparse, unusually large records. Leaving this at the default value is recommended. For more detail, including important information about correctly tuning JVM garbage collection when increasing this value, see this description.

\n
\n
storageFraction (Float, optional)
\n

Memory Management: Amount of storage memory immune to eviction, expressed as a fraction of the size of the region set aside by spark.memory.fraction. The higher this is, the less working memory may be available to execution and tasks may spill to disk more often. Leaving this at the default value is recommended. For more detail, see this description.

\n
\n
offHeap (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (Bool, optional)
\n

Memory Management: If true, Spark will attempt to use off-heap memory for certain operations. If off-heap memory use is enabled, then spark.memory.offHeap.size must be positive.

\n
\n
size (dagster.IntSource, optional)
\n

Memory Management: The absolute amount of memory in bytes which can be used for off-heap allocation. This setting has no impact on heap memory usage, so if your executors\u2019 total memory consumption must fit within some hard limit then be sure to shrink your JVM heap size accordingly. This must be set to a positive value when spark.memory.offHeap.enabled=true.

\n
\n
\n
\n
useLegacyMode (Bool, optional)
\n

Memory Management: Whether to enable the legacy memory management mode used in Spark 1.5 and before. The legacy mode rigidly partitions the heap space into fixed-size regions, potentially leading to excessive spilling if the application was not tuned. The following deprecated memory fraction configurations are not read unless this is enabled: spark.shuffle.memoryFraction, spark.storage.memoryFraction, and spark.storage.unrollFraction.

\n
\n
\n
\n
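An illustrative sketch of the memory section above; per the description, spark.memory.offHeap.size must be positive whenever off-heap use is enabled (values are examples only):

# Illustrative memory management settings.
memory_conf = {
    "memory": {
        "fraction": 0.6,        # spark.memory.fraction
        "offHeap": {
            "enabled": True,    # spark.memory.offHeap.enabled
            "size": 536870912,  # spark.memory.offHeap.size, in bytes
        },
    }
}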
storage (permissive dict, optional)
\n
\nDefault Value:
{\n    "replication": {}\n}\n
\n
\n
\nConfig Schema:
\n
memoryFraction (Float, optional)
\n

Memory Management: (deprecated) This is read only if spark.memory.useLegacyMode is enabled. Fraction of Java heap to use for Spark\u2019s memory cache. This should not be larger than the \u201cold\u201d generation of objects in the JVM, which by default is given 0.6 of the heap, but you can increase it if you configure your own old generation size.

\n
\n
unrollFraction (Float, optional)
\n

Memory Management: (deprecated) This is read only if spark.memory.useLegacyMode is enabled. Fraction of spark.storage.memoryFraction to use for unrolling blocks in memory. This is dynamically allocated by dropping existing blocks when there is not enough free storage space to unroll the new block in its entirety.

\n
\n
replication (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
proactive (Bool, optional)
\n

Memory Management: Enables proactive block replication for RDD blocks. Cached RDD block replicas lost due to executor failures are replenished if there are any existing available replicas. This tries to get the replication level of the block to the initial number.

\n
\n
\n
\n
memoryMapThreshold (dagster.StringSource, optional)
\n

Execution Behavior: Size in bytes of a block above which Spark memory maps when reading a block from disk. This prevents Spark from memory mapping very small blocks. In general, memory mapping has high overhead for blocks close to or below the page size of the operating system.

\n
\n
\n
\n
cleaner (permissive dict, optional)
\n
\nDefault Value:
{\n    "periodicGC": {},\n    "referenceTracking": {\n        "blocking": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
periodicGC (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
interval (dagster.StringSource, optional)
\n

Memory Management: Controls how often to trigger a garbage collection. This context cleaner triggers cleanups only when weak references are garbage collected. In long-running applications with large driver JVMs, where there is little memory pressure on the driver, this may happen very occasionally or not at all. Not cleaning at all may lead to executors running out of disk space after a while.

\n
\n
\n
\n
referenceTracking (permissive dict, optional)
\n
\nDefault Value:
{\n    "blocking": {}\n}\n
\n
\n
\nConfig Schema:
\n
root (Bool, optional)
\n

Memory Management: Enables or disables context cleaning.

\n
\n
blocking (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (Bool, optional)
\n

Memory Management: Controls whether the cleaning thread should block on cleanup tasks (other than shuffle, which is controlled by spark.cleaner.referenceTracking.blocking.shuffle Spark property).

\n
\n
shuffle (Bool, optional)
\n

Memory Management: Controls whether the cleaning thread should block on shuffle cleanup tasks.

\n
\n
\n
\n
cleanCheckpoints (Bool, optional)
\n

Memory Management: Controls whether to clean checkpoint files if the reference is out of scope.

\n
\n
\n
\n
\n
\n
default (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
parallelism (dagster.IntSource, optional)
\n

Execution Behavior: Default number of partitions in RDDs returned by transformations like join, reduceByKey, and parallelize when not set by user.

\n
\n
\n
\n
hadoop (permissive dict, optional)
\n
\nDefault Value:
{\n    "mapreduce": {\n        "fileoutputcommitter": {\n            "algorithm": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
cloneConf (Bool, optional)
\n

Execution Behavior: If set to true, clones a new Hadoop Configuration object for each task. This option should be enabled to work around Configuration thread-safety issues (see SPARK-2546 for more details). This is disabled by default in order to avoid unexpected performance regressions for jobs that are not affected by these issues.

\n
\n
validateOutputSpecs (Bool, optional)
\n

Execution Behavior: If set to true, validates the output specification (e.g. checking if the output directory already exists) used in saveAsHadoopFile and other variants. This can be disabled to silence exceptions due to pre-existing output directories. We recommend that users do not disable this except if trying to achieve compatibility with previous versions of Spark. Simply use Hadoop\u2019s FileSystem API to delete output directories by hand. This setting is ignored for jobs generated through Spark Streaming\u2019s StreamingContext, since data may need to be rewritten to pre-existing output directories during checkpoint recovery.

\n
\n
mapreduce (permissive dict, optional)
\n
\nDefault Value:
{\n    "fileoutputcommitter": {\n        "algorithm": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
fileoutputcommitter (permissive dict, optional)
\n
\nDefault Value:
{\n    "algorithm": {}\n}\n
\n
\n
\nConfig Schema:
\n
algorithm (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
version (dagster.IntSource, optional)
\n

Execution Behavior: The file output committer algorithm version, valid algorithm version number: 1 or 2. Version 2 may have better performance, but version 1 may handle failures better in certain situations, as per MAPREDUCE-4815.

\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
rpc (permissive dict, optional)
\n
\nDefault Value:
{\n    "message": {},\n    "retry": {}\n}\n
\n
\n
\nConfig Schema:
\n
message (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxSize (dagster.StringSource, optional)
\n

Networking: Maximum message size (in MB) to allow in \u201ccontrol plane\u201d communication; generally only applies to map output size information sent between executors and the driver. Increase this if you are running jobs with many thousands of map and reduce tasks and see messages about the RPC message size.

\n
\n
\n
\n
numRetries (dagster.StringSource, optional)
\n

Networking: Number of times to retry before an RPC task gives up. An RPC task will run at most this number of times.

\n
\n
retry (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
wait (dagster.StringSource, optional)
\n

Networking: Duration for an RPC ask operation to wait before retrying.

\n
\n
\n
\n
askTimeout (dagster.StringSource, optional)
\n

Networking: Duration for an RPC ask operation to wait before timing out.

\n
\n
lookupTimeout (dagster.StringSource, optional)
\n

Networking: Duration for an RPC remote endpoint lookup operation to wait before timing out.

\n
\n
\n
\n
blockManager (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
port (dagster.StringSource, optional)
\n

Networking: Port for all block managers to listen on. These exist on both the driver and the executors.

\n
\n
\n
\n
network (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
timeout (dagster.StringSource, optional)
\n

Networking: Default timeout for all network interactions. This config will be used in place of spark.core.connection.ack.wait.timeout, spark.storage.blockManagerSlaveTimeoutMs, spark.shuffle.io.connectionTimeout, spark.rpc.askTimeout or spark.rpc.lookupTimeout if they are not configured.

\n
\n
\n
\n
port (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxRetries (dagster.StringSource, optional)
\n

Networking: Maximum number of retries when binding to a port before giving up. When a port is given a specific value (non 0), each subsequent retry will increment the port used in the previous attempt by 1 before retrying. This essentially allows it to try a range of ports from the start port specified to port + maxRetries.

\n
\n
\n
\n
core (permissive dict, optional)
\n
\nDefault Value:
{\n    "connection": {\n        "ack": {\n            "wait": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
connection (permissive dict, optional)
\n
\nDefault Value:
{\n    "ack": {\n        "wait": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
ack (permissive dict, optional)
\n
\nDefault Value:
{\n    "wait": {}\n}\n
\n
\n
\nConfig Schema:
\n
wait (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
timeout (dagster.StringSource, optional)
\n

Networking: How long the connection waits for an ack to occur before timing out and giving up. To avoid unwanted timeouts caused by long pauses such as GC, you can set a larger value.

\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
cores (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
max (dagster.StringSource, optional)
\n

Scheduling: When running on a standalone deploy cluster or a Mesos cluster in \u201ccoarse-grained\u201d sharing mode, the maximum amount of CPU cores to request for the application from across the cluster (not from each machine). If not set, the default will be spark.deploy.defaultCores on Spark\u2019s standalone cluster manager, or infinite (all available cores) on Mesos.

\n
\n
\n
\n
locality (permissive dict, optional)
\n
\nDefault Value:
{\n    "wait": {}\n}\n
\n
\n
\nConfig Schema:
\n
wait (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional)
\n

Scheduling: How long to wait to launch a data-local task before giving up and launching it on a less-local node. The same wait will be used to step through multiple locality levels (process-local, node-local, rack-local and then any). It is also possible to customize the waiting time for each level by setting spark.locality.wait.node, etc. You should increase this setting if your tasks are long and see poor locality, but the default usually works well.

\n
\n
node (dagster.StringSource, optional)
\n

Scheduling: Customize the locality wait for node locality. For example, you can set this to 0 to skip node locality and search immediately for rack locality (if your cluster has rack information).

\n
\n
process (dagster.StringSource, optional)
\n

Scheduling: Customize the locality wait for process locality. This affects tasks that attempt to access cached data in a particular executor process.

\n
\n
rack (dagster.StringSource, optional)
\n

Scheduling: Customize the locality wait for rack locality.

\n
\n
\n
\n
\n
\n
scheduler (permissive dict, optional)
\n
\nDefault Value:
{\n    "revive": {},\n    "listenerbus": {\n        "eventqueue": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
maxRegisteredResourcesWaitingTime (dagster.StringSource, optional)
\n

Scheduling: Maximum amount of time to wait for resources to register before scheduling begins.

\n
\n
minRegisteredResourcesRatio (dagster.StringSource, optional)
\n

Scheduling: The minimum ratio of registered resources (registered resources / total expected resources) (resources are executors in yarn mode and Kubernetes mode, CPU cores in standalone mode and Mesos coarse-grained mode [\u2018spark.cores.max\u2019 value is total expected resources for Mesos coarse-grained mode] ) to wait for before scheduling begins. Specified as a double between 0.0 and 1.0. Regardless of whether the minimum ratio of resources has been reached, the maximum amount of time it will wait before scheduling begins is controlled by config spark.scheduler.maxRegisteredResourcesWaitingTime.

\n
\n
mode (dagster.StringSource, optional)
\n

Scheduling: The scheduling mode between jobs submitted to the same SparkContext. Can be set to FAIR to use fair sharing instead of queueing jobs one after another. Useful for multi-user services.

\n
\n
revive (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
interval (dagster.StringSource, optional)
\n

Scheduling: The interval length for the scheduler to revive the worker resource offers to run tasks.

\n
\n
\n
\n
listenerbus (permissive dict, optional)
\n
\nDefault Value:
{\n    "eventqueue": {}\n}\n
\n
\n
\nConfig Schema:
\n
eventqueue (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
capacity (dagster.StringSource, optional)
\n

Scheduling: Capacity for the event queue in the Spark listener bus; must be greater than 0. Consider increasing the value (e.g. 20000) if listener events are dropped. Increasing this value may result in the driver using more memory.

\n
\n
\n
\n
\n
\n
\n
\n
blacklist (permissive dict, optional)
\n
\nDefault Value:
{\n    "task": {},\n    "stage": {},\n    "application": {\n        "fetchFailure": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional)
\n

Scheduling: If set to \u201ctrue\u201d, prevent Spark from scheduling tasks on executors that have been blacklisted due to too many task failures. The blacklisting algorithm can be further controlled by the other \u201cspark.blacklist\u201d configuration options.

\n
\n
timeout (dagster.StringSource, optional)
\n

Scheduling: (Experimental) How long a node or executor is blacklisted for the entire application, before it is unconditionally removed from the blacklist to attempt running new tasks.

\n
\n
task (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxTaskAttemptsPerExecutor (dagster.StringSource, optional)
\n

Scheduling: (Experimental) For a given task, how many times it can be retried on one executor before the executor is blacklisted for that task.

\n
\n
maxTaskAttemptsPerNode (dagster.StringSource, optional)
\n

Scheduling: (Experimental) For a given task, how many times it can be retried on one node, before the entire node is blacklisted for that task.

\n
\n
\n
\n
stage (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxFailedTasksPerExecutor (dagster.StringSource, optional)
\n

Scheduling: (Experimental) How many different tasks must fail on one executor, within one stage, before the executor is blacklisted for that stage.

\n
\n
maxFailedExecutorsPerNode (dagster.StringSource, optional)
\n

Scheduling: (Experimental) How many different executors are marked as blacklisted for a given stage, before the entire node is marked as failed for the stage.

\n
\n
\n
\n
application (permissive dict, optional)
\n
\nDefault Value:
{\n    "fetchFailure": {}\n}\n
\n
\n
\nConfig Schema:
\n
maxFailedTasksPerExecutor (dagster.StringSource, optional)
\n

Scheduling: (Experimental) How many different tasks must fail on one executor, in successful task sets, before the executor is blacklisted for the entire application. Blacklisted executors will be automatically added back to the pool of available resources after the timeout specified by spark.blacklist.timeout. Note that with dynamic allocation, though, the executors may get marked as idle and be reclaimed by the cluster manager.

\n
\n
maxFailedExecutorsPerNode (dagster.StringSource, optional)
\n

Scheduling: (Experimental) How many different executors must be blacklisted for the entire application, before the node is blacklisted for the entire application. Blacklisted nodes will be automatically added back to the pool of available resources after the timeout specified by spark.blacklist.timeout. Note that with dynamic allocation, though, the executors on the node may get marked as idle and be reclaimed by the cluster manager.

\n
\n
fetchFailure (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional)
\n

Scheduling: (Experimental) If set to \u201ctrue\u201d, Spark will blacklist the executor immediately when a fetch failure happens. If external shuffle service is enabled, then the whole node will be blacklisted.

\n
\n
\n
\n
\n
\n
killBlacklistedExecutors (dagster.StringSource, optional)
\n

Scheduling: (Experimental) If set to \u201ctrue\u201d, allow Spark to automatically kill the executors when they are blacklisted on fetch failure or blacklisted for the entire application, as controlled by spark.blacklist.application.*. Note that, when an entire node is added to the blacklist, all of the executors on that node will be killed.

\n
\n
\n
\n
speculation (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional)
\n

Scheduling: If set to \u201ctrue\u201d, performs speculative execution of tasks. This means if one or more tasks are running slowly in a stage, they will be re-launched.

\n
\n
interval (dagster.StringSource, optional)
\n

Scheduling: How often Spark will check for tasks to speculate.

\n
\n
multiplier (dagster.StringSource, optional)
\n

Scheduling: How many times slower a task is than the median to be considered for speculation.

\n
\n
quantile (dagster.StringSource, optional)
\n

Scheduling: Fraction of tasks which must be complete before speculation is enabled for a particular stage.

\n
\n
\n
\n
task (permissive dict, optional)
\n
\nDefault Value:
{\n    "reaper": {}\n}\n
\n
\n
\nConfig Schema:
\n
cpus (dagster.StringSource, optional)
\n

Scheduling: Number of cores to allocate for each task.

\n
\n
maxFailures (dagster.StringSource, optional)
\n

Scheduling: Number of failures of any particular task before giving up on the job. The total number of failures spread across different tasks will not cause the job to fail; a particular task has to fail this number of attempts. Should be greater than or equal to 1. Number of allowed retries = this value - 1.

\n
\n
reaper (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional)
\n

Scheduling: Enables monitoring of killed / interrupted tasks. When set to true, any task which is killed will be monitored by the executor until that task actually finishes executing. See the other spark.task.reaper.* configurations for details on how to control the exact behavior of this monitoring. When set to false (the default), task killing will use an older code path which lacks such monitoring.

\n
\n
pollingInterval (dagster.StringSource, optional)
\n

Scheduling: When spark.task.reaper.enabled = true, this setting controls the frequency at which executors will poll the status of killed tasks. If a killed task is still running when polled then a warning will be logged and, by default, a thread-dump of the task will be logged (this thread dump can be disabled via the spark.task.reaper.threadDump setting, which is documented below).

\n
\n
threadDump (dagster.StringSource, optional)
\n

Scheduling: When spark.task.reaper.enabled = true, this setting controls whether task thread dumps are logged during periodic polling of killed tasks. Set this to false to disable collection of thread dumps.

\n
\n
killTimeout (dagster.StringSource, optional)
\n

Scheduling: When spark.task.reaper.enabled = true, this setting specifies a timeout after which the executor JVM will kill itself if a killed task has not stopped running. The default value, -1, disables this mechanism and prevents the executor from self-destructing. The purpose of this setting is to act as a safety-net to prevent runaway noncancellable tasks from rendering an executor unusable.

\n
\n
\n
\n
\n
\n
stage (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxConsecutiveAttempts (dagster.StringSource, optional)
\n

Scheduling: Number of consecutive stage attempts allowed before a stage is aborted.

\n
\n
\n
\n
dynamicAllocation (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional)
\n

Dynamic Allocation: Whether to use dynamic resource allocation, which scales the number of executors registered with this application up and down based on the workload. For more detail, see the description here. This requires spark.shuffle.service.enabled to be set. The following configurations are also relevant: spark.dynamicAllocation.minExecutors, spark.dynamicAllocation.maxExecutors, spark.dynamicAllocation.initialExecutors, and spark.dynamicAllocation.executorAllocationRatio.

\n
\n
executorIdleTimeout (dagster.StringSource, optional)
\n

Dynamic Allocation: If dynamic allocation is enabled and an executor has been idle for more than this duration, the executor will be removed. For more detail, see this description.

\n
\n
cachedExecutorIdleTimeout (dagster.StringSource, optional)
\n

Dynamic Allocation: If dynamic allocation is enabled and an executor which has cached data blocks has been idle for more than this duration, the executor will be removed. For more details, see this description.

\n
\n
initialExecutors (dagster.StringSource, optional)
\n

Dynamic Allocation: Initial number of executors to run if dynamic allocation is enabled. If --num-executors (or spark.executor.instances) is set and larger than this value, it will be used as the initial number of executors.

\n
\n
maxExecutors (dagster.StringSource, optional)
\n

Dynamic Allocation: Upper bound for the number of executors if dynamic allocation is enabled.

\n
\n
minExecutors (dagster.StringSource, optional)
\n

Dynamic Allocation: Lower bound for the number of executors if dynamic allocation is enabled.

\n
\n
executorAllocationRatio (dagster.StringSource, optional)
\n

Dynamic Allocation: By default, dynamic allocation will request enough executors to maximize the parallelism according to the number of tasks to process. While this minimizes the latency of the job, with small tasks this setting can waste a lot of resources due to executor allocation overhead, as some executors might not even do any work. This setting allows you to set a ratio that will be used to reduce the number of executors with respect to full parallelism. Defaults to 1.0 to give maximum parallelism; 0.5 will divide the target number of executors by 2. The target number of executors computed by dynamic allocation can still be overridden by the spark.dynamicAllocation.minExecutors and spark.dynamicAllocation.maxExecutors settings.

\n
\n
schedulerBacklogTimeout (dagster.StringSource, optional)
\n

Dynamic Allocation: If dynamic allocation is enabled and there have been pending tasks backlogged for more than this duration, new executors will be requested. For more detail, see this description.

\n
\n
sustainedSchedulerBacklogTimeout (dagster.StringSource, optional)
\n

Dynamic Allocation: Same as spark.dynamicAllocation.schedulerBacklogTimeout, but used only for subsequent executor requests. For more detail, see this description.

\n
\n
\n
\n
r (permissive dict, optional)
\n
\nDefault Value:
{\n    "driver": {},\n    "shell": {}\n}\n
\n
\n
\nConfig Schema:
\n
numRBackendThreads (dagster.StringSource, optional)
\n

SparkR: Number of threads used by RBackend to handle RPC calls from SparkR package.

\n
\n
command (dagster.StringSource, optional)
\n

SparkR: Executable for executing R scripts in cluster modes for both driver and workers.

\n
\n
driver (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
command (dagster.StringSource, optional)
\n

SparkR: Executable for executing R scripts in client modes for driver. Ignored in cluster modes.

\n
\n
\n
\n
shell (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
command (dagster.StringSource, optional)
\n

SparkR: Executable for executing the sparkR shell in client modes for the driver. Ignored in cluster modes. It is the same as the environment variable SPARKR_DRIVER_R, but takes precedence over it. spark.r.shell.command is used for the sparkR shell while spark.r.driver.command is used for running R scripts.

\n
\n
\n
\n
backendConnectionTimeout (dagster.StringSource, optional)
\n

SparkR: Connection timeout set by R process on its connection to RBackend in seconds.

\n
\n
heartBeatInterval (dagster.StringSource, optional)
\n

SparkR: Interval for heartbeats sent from SparkR backend to R process to prevent connection timeout.

\n
\n
\n
\n
graphx (permissive dict, optional)
\n
\nDefault Value:
{\n    "pregel": {}\n}\n
\n
\n
\nConfig Schema:
\n
pregel (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
checkpointInterval (dagster.StringSource, optional)
\n

GraphX: Checkpoint interval for graph and message in Pregel. It is used to avoid StackOverflowError due to long lineage chains after many iterations. The checkpoint is disabled by default.

\n
\n
\n
\n
\n
\n
deploy (permissive dict, optional)
\n
\nDefault Value:
{\n    "zookeeper": {}\n}\n
\n
\n
\nConfig Schema:
\n
recoveryMode (dagster.StringSource, optional)
\n

Deploy: The recovery mode setting used to recover submitted Spark jobs in cluster mode when they fail and are relaunched. This is only applicable to cluster mode when running with Standalone or Mesos.

\n
\n
zookeeper (permissive dict, optional)
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
url (dagster.StringSource, optional)
\n

Deploy: When spark.deploy.recoveryMode is set to ZOOKEEPER, this configuration is used to set the zookeeper URL to connect to.

\n
\n
dir (dagster.StringSource, optional)
\n

Deploy: When spark.deploy.recoveryMode is set to ZOOKEEPER, this configuration is used to set the zookeeper directory to store recovery state.

\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n

This resource provides access to a PySpark SparkSession for executing PySpark code within Dagster.

\n

Example:

\n
@op(required_resource_keys={"pyspark"})\ndef my_op(context):\n    spark_session = context.resources.pyspark.spark_session\n    dataframe = spark_session.read.json("examples/src/main/resources/people.json")\n\nmy_pyspark_resource = pyspark_resource.configured(\n    {"spark_conf": {"spark.executor.memory": "2g"}}\n)\n\n@job(resource_defs={"pyspark": my_pyspark_resource})\ndef my_spark_job():\n    my_op()\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-pyspark", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "\n", "next": {"link": "../dagster-shell/", "title": "Shell (dagster-shell)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-prometheus/", "title": "Prometheus (dagster-prometheus)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-shell", "Shell (dagster-shell)", "N", "next"], ["sections/api/apidocs/libraries/dagster-prometheus", "Prometheus (dagster-prometheus)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-pyspark.rst.txt", "title": "Pyspark (dagster-pyspark)", "toc": "\n"}, "dagster-shell": {"alabaster_version": "0.7.12", "body": "
\n

Shell (dagster-shell)\u00b6

\n

The Dagster shell library provides op factories for executing inline shell scripts or script files.

\n
\n
\n

APIs\u00b6

\n
\n
\ndagster_shell.create_shell_command_op(shell_command, name, description=None, required_resource_keys=None, tags=None)[source]\u00b6
\n

This function is a factory that constructs ops to execute a shell command.

\n

Note that you can only use create_shell_command_op if you know the command you\u2019d like to execute\nat job construction time. If you\u2019d like to construct shell commands dynamically during\njob execution and pass them between ops, you should use shell_op instead.

\n

Examples:

\n
# pylint: disable=no-value-for-parameter\nfrom dagster_shell import create_shell_command_op\n\nfrom dagster import graph\n\n\n@graph\ndef my_graph():\n    a = create_shell_command_op('echo "hello, world!"', name="a")\n    a()\n
\n
\n
\n
Parameters
\n
    \n
  • shell_command (str) \u2013 The shell command that the constructed op will execute.

  • \n
  • name (str) \u2013 The name of the constructed op.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of this op.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by this op.\nSetting this ensures that resource spin up for the required resources will occur before\nthe shell command is executed.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for the op. Frameworks may\nexpect and require certain metadata to be attached to an op. Users should generally\nnot set metadata directly. Values that are not strings will be json encoded and must meet\nthe criteria that json.loads(json.dumps(value)) == value.

  • \n
\n
\n
Raises
\n

Failure \u2013 Raised when the shell command returns a non-zero exit code.

\n
\n
Returns
\n

Returns the constructed op definition.

\n
\n
Return type
\n

OpDefinition

\n
\n
\n
\n\n
\n
\ndagster_shell.create_shell_script_op(shell_script_path, name='create_shell_script_op', input_defs=None, **kwargs)[source]\u00b6
\n

This function is a factory which constructs an op that will execute a shell command read\nfrom a script file.

\n

Any kwargs passed to this function will be passed along to the underlying @op decorator. However, note that overriding config or output_defs is not\nsupported.

\n

You might consider using @graph to wrap this op\nin the cases where you\u2019d like to configure the shell op with different config fields.

\n

Examples:

\n
# pylint: disable=no-value-for-parameter\nfrom dagster_shell import create_shell_script_op\n\nfrom dagster import file_relative_path, graph\n\n\n@graph\ndef my_graph():\n    a = create_shell_script_op(file_relative_path(__file__, "hello_world.sh"), name="a")\n    a()\n
\n
\n
\n
Parameters
\n
    \n
  • shell_script_path (str) \u2013 The script file to execute.

  • \n
  • name (str, optional) \u2013 The name of this op. Defaults to \u201ccreate_shell_script_op\u201d.

  • \n
  • input_defs (List[InputDefinition], optional) \u2013 input definitions for the op. Defaults to\na single Nothing input.

  • \n
\n
\n
Raises
\n

Failure \u2013 Raised when the shell command returns a non-zero exit code.

\n
\n
Returns
\n

Returns the constructed op definition.

\n
\n
Return type
\n

OpDefinition

\n
\n
\n
\n\n
\n
\ndagster_shell.shell_op(context, shell_command)\u00b6
\n
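
A minimal sketch of the dynamic pattern described above (the op and graph names are illustrative, and shell_op's default config is assumed):

\n
from dagster import graph, op\nfrom dagster_shell import shell_op\n\n\n@op\ndef build_command():\n    # Construct the command at runtime and pass it to shell_op as an input.\n    return 'echo "hello, world!"'\n\n\n@graph\ndef my_shell_graph():\n    shell_op(build_command())\n
\n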
\n\n
\n
\n

Legacy APIs\u00b6

\n
\n
\ndagster_shell.create_shell_command_solid(shell_command, name, description=None, required_resource_keys=None, tags=None)[source]\u00b6
\n

This function is a factory that constructs solids to execute a shell command.

\n

Note that you can only use shell_command_solid if you know the command you\u2019d like to execute\nat pipeline construction time. If you\u2019d like to construct shell commands dynamically during\npipeline execution and pass them between solids, you should use shell_solid instead.

\n

Examples:

\n
# pylint: disable=no-value-for-parameter\nfrom dagster_shell import create_shell_command_solid\n\nfrom dagster import pipeline\n\n\n@pipeline\ndef pipe():\n    a = create_shell_command_solid('echo "hello, world!"', name="a")\n    a()\n
\n
\n
\n
Parameters
\n
    \n
  • shell_command (str) \u2013 The shell command that the constructed solid will execute.

  • \n
  • name (str) \u2013 The name of the constructed solid.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of this solid.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by this solid.\nSetting this ensures that resource spin up for the required resources will occur before\nthe shell command is executed.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for the solid. Frameworks may\nexpect and require certain metadata to be attached to a solid. Users should generally\nnot set metadata directly. Values that are not strings will be json encoded and must meet\nthe criteria that json.loads(json.dumps(value)) == value.

  • \n
\n
\n
Raises
\n

Failure \u2013 Raised when the shell command returns a non-zero exit code.

\n
\n
Returns
\n

Returns the constructed solid definition.

\n
\n
Return type
\n

SolidDefinition

\n
\n
\n
\n\n
\n
\ndagster_shell.create_shell_script_solid(shell_script_path, name='create_shell_script_solid', input_defs=None, **kwargs)[source]\u00b6
\n

This function is a factory which constructs a solid that will execute a shell command read\nfrom a script file.

\n

Any kwargs passed to this function will be passed along to the underlying @solid decorator. However, note that overriding config or output_defs is not\nsupported.

\n

You might consider using @composite_solid to wrap this solid\nin the cases where you\u2019d like to configure the shell solid with different config fields.

\n

Examples:

\n
# pylint: disable=no-value-for-parameter\nfrom dagster_shell import create_shell_script_solid\n\nfrom dagster import file_relative_path, pipeline\n\n\n@pipeline\ndef pipe():\n    a = create_shell_script_solid(file_relative_path(__file__, "hello_world.sh"), name="a")\n    a()\n
\n
\n
\n
Parameters
\n
    \n
  • shell_script_path (str) \u2013 The script file to execute.

  • \n
  • name (str, optional) \u2013 The name of this solid. Defaults to \u201ccreate_shell_script_solid\u201d.

  • \n
  • input_defs (List[InputDefinition], optional) \u2013 input definitions for the solid. Defaults to\na single Nothing input.

  • \n
\n
\n
Raises
\n

Failure \u2013 Raised when the shell command returns a non-zero exit code.

\n
\n
Returns
\n

Returns the constructed solid definition.

\n
\n
Return type
\n

SolidDefinition

\n
\n
\n
\n\n
\n
\ndagster_shell.shell_solid(context, shell_command)\u00b6
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-shell", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../dagster-slack/", "title": "Slack (dagster-slack)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-pyspark/", "title": "Pyspark (dagster-pyspark)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-slack", "Slack (dagster-slack)", "N", "next"], ["sections/api/apidocs/libraries/dagster-pyspark", "Pyspark (dagster-pyspark)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-shell.rst.txt", "title": "Shell (dagster-shell)", "toc": "\n"}, "dagster-slack": {"alabaster_version": "0.7.12", "body": "
\n

Slack (dagster-slack)\u00b6

\n

\n
\n

\n
\n

This library provides an integration with Slack, to support posting messages in your company\u2019s Slack workspace.

\n
\n

\n
\n

Presently, it provides a thin wrapper on the Slack client API chat.postMessage.

\n
\n

\n
\n

To use this integration, you\u2019ll first need to create a Slack App for it.

\n
    \n
  1. Create App: Go to https://api.slack.com/apps and click \u201cCreate New App\u201d:

    \n

    \n
  2. \n
  3. Install App: After creating an app, on the left-hand side of the app configuration, click \u201cBot Users\u201d, and then create a bot user. Then, click \u201cInstall App\u201d on the left hand side, and finally \u201cInstall App to Workspace\u201d.

  4. \n
  5. Bot Token: Once finished, this will create a new bot token for your bot/workspace:

    \n

    \n
  6. \n
\n

Copy this bot token and put it somewhere safe; see Safely Storing Credentials for more on this topic.

\n
\n
\ndagster_slack.slack_resource ResourceDefinition[source]\u00b6
\n

This resource is for connecting to Slack.

\n

The resource object is a slack_sdk.WebClient.

\n

By configuring this Slack resource, you can post messages to Slack from any Dagster op:

\n

Examples:

\n
import os\n\nfrom dagster import job, op\nfrom dagster_slack import slack_resource\n\n\n@op(required_resource_keys={'slack'})\ndef slack_op(context):\n    context.resources.slack.chat_postMessage(channel='#noise', text=':wave: hey there!')\n\n@job(resource_defs={'slack': slack_resource})\ndef slack_job():\n    slack_op()\n\nslack_job.execute_in_process(\n    run_config={'resources': {'slack': {'config': {'token': os.getenv('SLACK_TOKEN')}}}}\n)\n
\n
\n
\n\n
\n
\ndagster_slack.slack_on_failure HookDefinition[source]\u00b6
\n

Create a hook on step failure events that will message the given Slack channel.

\n
\n
Parameters
\n
    \n
  • channel (str) \u2013 The channel to send the message to (e.g. \u201c#my_channel\u201d)

  • \n
  • message_fn (Optional[Callable[[HookContext], str]]) \u2013 Function which takes in the HookContext\nand outputs the message you want to send.

  • \n
  • dagit_base_url \u2013 (Optional[str]): The base url of your Dagit instance. Specify this to allow\nmessages to include deeplinks to the specific pipeline run that triggered the hook.

  • \n
\n
\n
\n

Examples

\n
@slack_on_failure("#foo", dagit_base_url="http://localhost:3000")\n@job(...)\ndef my_job():\n    pass\n
\n
\n
def my_message_fn(context: HookContext) -> str:\n    return f"Op {context.op} failed!"\n\n@op\ndef an_op(context):\n    pass\n\n@job(...)\ndef my_job():\n    an_op.with_hooks(hook_defs={slack_on_failure("#foo", my_message_fn)})\n
\n
\n
\n\n
\n
\ndagster_slack.slack_on_success HookDefinition[source]\u00b6
\n

Create a hook on step success events that will message the given Slack channel.

\n
\n
Parameters
\n
    \n
  • channel (str) \u2013 The channel to send the message to (e.g. \u201c#my_channel\u201d)

  • \n
  • message_fn (Optional[Callable[[HookContext], str]]) \u2013 Function which takes in the HookContext\nand outputs the message you want to send.

  • \n
  • dagit_base_url \u2013 (Optional[str]): The base url of your Dagit instance. Specify this to allow\nmessages to include deeplinks to the specific pipeline run that triggered the hook.

  • \n
\n
\n
\n

Examples

\n
@slack_on_success("#foo", dagit_base_url="http://localhost:3000")\n@job(...)\ndef my_job():\n    pass\n
\n
\n
def my_message_fn(context: HookContext) -> str:\n    return f"Op {context.op} worked!"\n\n@op\ndef an_op(context):\n    pass\n\n@job(...)\ndef my_job():\n    an_op.with_hooks(hook_defs={slack_on_success("#foo", my_message_fn)})\n
\n
\n
\n\n
\n
\ndagster_slack.make_slack_on_run_failure_sensor(channel, slack_token, text_fn=<function _default_failure_message_text_fn>, blocks_fn=None, name=None, dagit_base_url=None, job_selection=None, default_status=<DefaultSensorStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Create a sensor on job failures that will message the given Slack channel.

\n
\n
Parameters
\n
    \n
  • channel (str) \u2013 The channel to send the message to (e.g. \u201c#my_channel\u201d)

  • \n
  • slack_token (str) \u2013 The slack token.\nTokens are typically either user tokens or bot tokens. More in the Slack API\ndocumentation here: https://api.slack.com/docs/token-types

  • \n
  • text_fn (Optional(Callable[[RunFailureSensorContext], str])) \u2013 Function which\ntakes in the RunFailureSensorContext and outputs the message you want to send.\nDefaults to a text message that contains error message, job name, and run ID.\nThe usage of the text_fn changes depending on whether you\u2019re using blocks_fn. If you\nare using blocks_fn, this is used as a fallback string to display in notifications. If\nyou aren\u2019t, this is the main body text of the message. It can be formatted as plain text,\nor with mrkdwn.\nSee more details in https://api.slack.com/methods/chat.postMessage#text_usage

  • \n
  • blocks_fn (Callable[[RunFailureSensorContext], List[Dict]]) \u2013 Function which takes in\nthe RunFailureSensorContext and outputs the message blocks you want to send.\nSee information about Blocks in https://api.slack.com/reference/block-kit/blocks

  • \n
  • name \u2013 (Optional[str]): The name of the sensor. Defaults to \u201cslack_on_run_failure\u201d.

  • \n
  • dagit_base_url \u2013 (Optional[str]): The base url of your Dagit instance. Specify this to allow\nmessages to include deeplinks to the failed job run.

  • \n
  • job_selection (Optional[List[Union[PipelineDefinition, GraphDefinition]]]) \u2013 The jobs that\nwill be monitored by this failure sensor. Defaults to None, which means the alert will\nbe sent when any job in the repository fails.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
\n
\n
\n

Examples

\n
slack_on_run_failure = make_slack_on_run_failure_sensor(\n    "#my_channel",\n    os.getenv("MY_SLACK_TOKEN")\n)\n\n@repository\ndef my_repo():\n    return [my_job + slack_on_run_failure]\n
\n
\n
def my_message_fn(context: RunFailureSensorContext) -> str:\n    return (\n        f"Job {context.pipeline_run.pipeline_name} failed! "\n        f"Error: {context.failure_event.message}"\n    )\n\nslack_on_run_failure = make_slack_on_run_failure_sensor(\n    channel="#my_channel",\n    slack_token=os.getenv("MY_SLACK_TOKEN"),\n    text_fn=my_message_fn,\n    dagit_base_url="http://mycoolsite.com",\n)\n
\n
\n
\n\n
\n
\ndagster_slack.make_slack_on_pipeline_failure_sensor(channel, slack_token, text_fn=<function _default_failure_message_text_fn>, blocks_fn=None, pipeline_selection=None, name=None, dagit_base_url=None, default_status=<DefaultSensorStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Create a sensor on pipeline failures that will message the given Slack channel.

\n
\n
Parameters
\n
    \n
  • channel (str) \u2013 The channel to send the message to (e.g. \u201c#my_channel\u201d)

  • \n
  • slack_token (str) \u2013 The slack token.\nTokens are typically either user tokens or bot tokens. More in the Slack API\ndocumentation here: https://api.slack.com/docs/token-types

  • \n
  • text_fn (Optional(Callable[[PipelineFailureSensorContext], str])) \u2013 Function which\ntakes in the PipelineFailureSensorContext and outputs the message you want to send.\nDefaults to a text message that contains error message, pipeline name, and run ID.\nThe usage of the text_fn changes depending on whether you\u2019re using blocks_fn. If you\nare using blocks_fn, this is used as a fallback string to display in notifications. If\nyou aren\u2019t, this is the main body text of the message. It can be formatted as plain text,\nor with mrkdwn.\nSee more details in https://api.slack.com/methods/chat.postMessage#text_usage

  • \n
  • blocks_fn (Callable[[PipelineFailureSensorContext], List[Dict]]) \u2013 Function which takes in\nthe PipelineFailureSensorContext and outputs the message blocks you want to send.\nSee information about Blocks in https://api.slack.com/reference/block-kit/blocks

  • \n
  • pipeline_selection (Optional[List[str]]) \u2013 Names of the pipelines that will be monitored by\nthis failure sensor. Defaults to None, which means the alert will be sent when any\npipeline in the repository fails.

  • \n
  • name \u2013 (Optional[str]): The name of the sensor. Defaults to \u201cslack_on_pipeline_failure\u201d.

  • \n
  • dagit_base_url \u2013 (Optional[str]): The base url of your Dagit instance. Specify this to allow\nmessages to include deeplinks to the failed pipeline run.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
\n
\n
\n

Examples

\n
slack_on_pipeline_failure = make_slack_on_pipeline_failure_sensor(\n    "#my_channel",\n    os.getenv("MY_SLACK_TOKEN")\n)\n\n@repository\ndef my_repo():\n    return [my_pipeline + slack_on_pipeline_failure]\n
\n
\n
def my_message_fn(context: PipelineFailureSensorContext) -> str:\n    return "Pipeline {pipeline_name} failed! Error: {error}".format(\n        pipeline_name=context.pipeline_run.pipeline_name,\n        error=context.failure_event.message,\n    )\n\nslack_on_pipeline_failure = make_slack_on_pipeline_failure_sensor(\n    channel="#my_channel",\n    slack_token=os.getenv("MY_SLACK_TOKEN"),\n    text_fn=my_message_fn,\n    dagit_base_url="http://mycoolsite.com",\n)\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-slack", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "\n", "next": {"link": "../dagster-snowflake/", "title": "Snowflake (dagster-snowflake)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-shell/", "title": "Shell (dagster-shell)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-snowflake", "Snowflake (dagster-snowflake)", "N", "next"], ["sections/api/apidocs/libraries/dagster-shell", "Shell (dagster-shell)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-slack.rst.txt", "title": "Slack (dagster-slack)", "toc": "\n"}, "dagster-snowflake": {"alabaster_version": "0.7.12", "body": "
\n

Snowflake (dagster-snowflake)\u00b6

\n

This library provides an integration with the Snowflake data\nwarehouse.

\n

To use this library, you should first ensure that you have an appropriate Snowflake user configured to access\nyour data warehouse.

\n
\n
\ndagster_snowflake.snowflake_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
account (dagster.StringSource, optional)
\n

Your Snowflake account name. For more details, see https://bit.ly/2FBL320.

\n
\n
user (dagster.StringSource)
\n

User login name.

\n
\n
password (dagster.StringSource)
\n

User password.

\n
\n
database (dagster.StringSource, optional)
\n

Name of the default database to use. After login, you can use USE DATABASE to change the database.

\n
\n
schema (dagster.StringSource, optional)
\n

Name of the default schema to use. After login, you can use USE SCHEMA to change the schema.

\n
\n
role (dagster.StringSource, optional)
\n

Name of the default role to use. After login, you can use USE ROLE to change the role.

\n
\n
warehouse (dagster.StringSource, optional)
\n

Name of the default warehouse to use. After login, you can use USE WAREHOUSE to change the warehouse.

\n
\n
autocommit (Bool, optional)
\n

None by default, which honors the Snowflake parameter AUTOCOMMIT. Set to True or False to enable or disable autocommit mode in the session, respectively.

\n
\n
client_prefetch_threads (dagster.IntSource, optional)
\n

Number of threads used to download the results sets (4 by default). Increasing the value improves fetch performance but requires more memory.

\n
\n
client_session_keep_alive (dagster.StringSource, optional)
\n

False by default. Set this to True to keep the session active indefinitely, even if there is no activity from the user. Make certain to call the close method to terminate the thread properly or the process may hang.

\n
\n
login_timeout (dagster.IntSource, optional)
\n

Timeout in seconds for login. By default, 60 seconds. The login request gives up after the timeout length if the HTTP response is \u201csuccess\u201d.

\n
\n
network_timeout (dagster.IntSource, optional)
\n

Timeout in seconds for all other operations. By default, none/infinite. A general request gives up after the timeout length if the HTTP response is not \u2018success\u2019.

\n
\n
ocsp_response_cache_filename (dagster.StringSource, optional)
\n

URI for the OCSP response cache file. By default, the OCSP response cache file is created in the cache directory.

\n
\n
validate_default_parameters (Bool, optional)
\n

False by default. If True, raise an exception if any of the specified database, schema, or warehouse doesn\u2019t exist.

\n
\n
paramstyle (dagster.StringSource, optional)
\n

pyformat by default for client side binding. Specify qmark or numeric to change bind variable formats for server side binding.

\n
\n
timezone (dagster.StringSource, optional)
\n

None by default, which honors the Snowflake parameter TIMEZONE. Set to a valid time zone (e.g. America/Los_Angeles) to set the session time zone.

\n
\n
connector (dagster.StringSource, optional)
\n

Indicates an alternative database connection engine. The permissible option is \u2018sqlalchemy\u2019; otherwise the Snowflake Connector for Python is used by default.

\n
\n
cache_column_metadata (dagster.StringSource, optional)
\n

Optional parameter when connector is set to sqlalchemy. Snowflake SQLAlchemy takes a flag cache_column_metadata=True such that all column metadata for all tables is \u201ccached\u201d.

\n
\n
numpy (dagster.StringSource, optional)
\n

Optional parameter when connector is set to sqlalchemy. To enable fetching NumPy data types, add numpy=True to the connection parameters.

\n
\n
authenticator (dagster.StringSource, optional)
\n

Optional parameter to specify the authentication mechanism to use.

\n
\n
\n

A resource for connecting to the Snowflake data warehouse.

\n

A simple example of loading data into Snowflake and subsequently querying that data is shown below:

\n

Examples:

\n
from dagster import job, op\nfrom dagster_snowflake import snowflake_resource\n\n@op(required_resource_keys={'snowflake'})\ndef get_one(context):\n    context.resources.snowflake.execute_query('SELECT 1')\n\n@job(resource_defs={'snowflake': snowflake_resource})\ndef my_snowflake_job():\n    get_one()\n\nmy_snowflake_job.execute_in_process(\n    run_config={\n        'resources': {\n            'snowflake': {\n                'config': {\n                    'account': {'env': 'SNOWFLAKE_ACCOUNT'},\n                    'user': {'env': 'SNOWFLAKE_USER'},\n                    'password': {'env': 'SNOWFLAKE_PASSWORD'},\n                    'database': {'env': 'SNOWFLAKE_DATABASE'},\n                    'schema': {'env': 'SNOWFLAKE_SCHEMA'},\n                    'warehouse': {'env': 'SNOWFLAKE_WAREHOUSE'},\n                }\n            }\n        }\n    }\n)\n
\n
\n
\n\n
\n
\ndagster_snowflake.build_snowflake_io_manager(type_handlers)[source]\u00b6
\n

Builds an IO manager definition that reads inputs from and writes outputs to Snowflake.

\n
\n
Parameters
\n

type_handlers (Sequence[DbTypeHandler]) \u2013 Each handler defines how to translate between\nslices of Snowflake tables and an in-memory type - e.g. a Pandas DataFrame.

\n
\n
Returns
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_snowflake import build_snowflake_io_manager\nfrom dagster_snowflake_pandas import SnowflakePandasTypeHandler\n\nsnowflake_io_manager = build_snowflake_io_manager([SnowflakePandasTypeHandler()])\n\n@job(resource_defs={'io_manager': snowflake_io_manager})\ndef my_job():\n    ...\n
\n
\n
\n\n
\n
\ndagster_snowflake.snowflake_op_for_query(sql, parameters=None)[source]\u00b6
\n

This function is an op factory that constructs an op to execute a snowflake query.

\n

Note that you can only use snowflake_op_for_query if you know the query you\u2019d like to\nexecute at graph construction time. If you\u2019d like to execute queries dynamically during\njob execution, you should manually execute those queries in your custom op using the\nsnowflake resource.

\n
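
A minimal sketch (the table and job names below are hypothetical; the constructed op requires the snowflake resource):

\n
from dagster import job\nfrom dagster_snowflake import snowflake_op_for_query, snowflake_resource\n\ncount_rows = snowflake_op_for_query("SELECT COUNT(*) FROM my_table")\n\n@job(resource_defs={"snowflake": snowflake_resource})\ndef my_query_job():\n    count_rows()\n
\n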
\n
Parameters
\n
    \n
  • sql (str) \u2013 The sql query that will execute against the provided snowflake resource.

  • \n
  • parameters (dict) \u2013 The parameters for the sql query.

  • \n
\n
\n
Returns
\n

Returns the constructed op definition.

\n
\n
Return type
\n

OpDefinition

\n
\n
\n
\n\n
\n
\nclass dagster_snowflake.SnowflakeConnection(config, log)[source]\u00b6
\n
\n
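
Instances of this class are typically obtained through the snowflake resource rather than constructed directly. A minimal sketch (the table name is hypothetical):

\n
from dagster import op\n\n@op(required_resource_keys={"snowflake"})\ndef fetch_rows(context):\n    # context.resources.snowflake is a SnowflakeConnection\n    results = context.resources.snowflake.execute_query(\n        "SELECT * FROM my_table LIMIT 10", fetch_results=True\n    )\n    context.log.info(f"Fetched {len(results)} rows")\n
\n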
\nexecute_queries(sql_queries, parameters=None, fetch_results=False)[source]\u00b6
\n
\n\n
\n
\nexecute_query(sql, parameters=None, fetch_results=False)[source]\u00b6
\n
\n\n
\n
\nget_connection(raw_conn=True)[source]\u00b6
\n
\n\n
\n
\nload_table_from_local_parquet(src, table)[source]\u00b6
\n
\n\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-snowflake", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "\n", "next": {"link": "../dagster-snowflake-pandas/", "title": "Snowflake with Pandas (dagster-snowflake-pandas)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-slack/", "title": "Slack (dagster-slack)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-snowflake-pandas", "Snowflake with Pandas (dagster-snowflake-pandas)", "N", "next"], ["sections/api/apidocs/libraries/dagster-slack", "Slack (dagster-slack)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-snowflake.rst.txt", "title": "Snowflake (dagster-snowflake)", "toc": "\n"}, "dagster-snowflake-pandas": {"alabaster_version": "0.7.12", "body": "
\n

Snowflake with Pandas (dagster-snowflake-pandas)\u00b6

\n

This library provides an integration with the Snowflake data\nwarehouse and Pandas data processing library.

\n

To use this library, you should first ensure that you have an appropriate Snowflake user configured to access\nyour data warehouse.

\n
\n
\nclass dagster_snowflake_pandas.SnowflakePandasTypeHandler(*args, **kwds)[source]\u00b6
\n

Defines how to translate between slices of Snowflake tables and Pandas DataFrames.

\n

Examples:

\n
from dagster_snowflake import build_snowflake_io_manager\nfrom dagster_snowflake_pandas import SnowflakePandasTypeHandler\n\nsnowflake_io_manager = build_snowflake_io_manager([SnowflakePandasTypeHandler()])\n\n@job(resource_defs={'io_manager': snowflake_io_manager})\ndef my_job():\n    ...\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-snowflake-pandas", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "\n", "next": {"link": "../dagster-spark/", "title": "Spark (dagster-spark)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-snowflake/", "title": "Snowflake (dagster-snowflake)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-spark", "Spark (dagster-spark)", "N", "next"], ["sections/api/apidocs/libraries/dagster-snowflake", "Snowflake (dagster-snowflake)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-snowflake-pandas.rst.txt", "title": "Snowflake with Pandas (dagster-snowflake-pandas)", "toc": "\n"}, "dagster-spark": {"alabaster_version": "0.7.12", "body": "
\n

Spark (dagster-spark)\u00b6

\n
\n
\nclass dagster_spark.SparkOpError[source]\u00b6
\n
\n\n
\n
\ndagster_spark.define_spark_config()[source]\u00b6
\n

Spark configuration.

\n
\n
See the Spark documentation for reference:

https://spark.apache.org/docs/latest/submitting-applications.html

\n
\n
\n
\n\n
\n
\ndagster_spark.create_spark_op(name, main_class, description=None, required_resource_keys=frozenset({'spark'}))[source]\u00b6
\n
\n\n
\n
\ndagster_spark.construct_spark_shell_command(application_jar, main_class, master_url=None, spark_conf=None, deploy_mode=None, application_arguments=None, spark_home=None)[source]\u00b6
\n

Constructs the spark-submit command for a Spark job.

\n
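
A minimal sketch (the jar path, class name, and master URL are placeholders; the assembled command is assumed to be suitable for passing to a subprocess):

\n
from dagster_spark import construct_spark_shell_command\n\n# Assemble the spark-submit invocation for inspection or execution.\ncommand = construct_spark_shell_command(\n    application_jar="path/to/app.jar",\n    main_class="com.example.Main",\n    master_url="local[*]",\n    deploy_mode="client",\n)\nprint(command)\n
\n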
\n\n
\n
\ndagster_spark.spark_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-spark", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "\n", "next": {"link": "../dagster-ssh/", "title": "SSH / SFTP (dagster-ssh)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-snowflake-pandas/", "title": "Snowflake with Pandas (dagster-snowflake-pandas)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-ssh", "SSH / SFTP (dagster-ssh)", "N", "next"], ["sections/api/apidocs/libraries/dagster-snowflake-pandas", "Snowflake with Pandas (dagster-snowflake-pandas)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-spark.rst.txt", "title": "Spark (dagster-spark)", "toc": "\n"}, "dagster-ssh": {"alabaster_version": "0.7.12", "body": "
\n

SSH / SFTP (dagster-ssh)\u00b6

\n

This library provides an integration with SSH and SFTP.

\n
\n
\nclass dagster_ssh.SSHResource(remote_host, remote_port, username=None, password=None, key_file=None, key_string=None, timeout=10, keepalive_interval=30, compress=True, no_host_key_check=True, allow_host_key_change=False, logger=None)[source]\u00b6
\n

Resource for ssh remote execution using Paramiko.\nref: https://github.com/paramiko/paramiko

\n
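
A minimal sketch of wiring the companion ssh_resource into a job (the host, paths, and the sftp_get call are assumptions; consult the SSHResource source for the exact methods available):

\n
from dagster import job, op\nfrom dagster_ssh import ssh_resource\n\n@op(required_resource_keys={"ssh"})\ndef fetch_remote_file(context):\n    # sftp_get is assumed here; the resource object is an SSHResource.\n    context.resources.ssh.sftp_get("/remote/data.csv", "/tmp/data.csv")\n\nmy_ssh_resource = ssh_resource.configured(\n    {"remote_host": "example.com", "username": "me", "key_file": "~/.ssh/id_rsa"}\n)\n\n@job(resource_defs={"ssh": my_ssh_resource})\ndef my_ssh_job():\n    fetch_remote_file()\n
\n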
\n\n
\n
\ndagster_ssh.ssh_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
remote_host (dagster.StringSource)
\n

remote host to connect to

\n
\n
remote_port (Int, optional)
\n

port of remote host to connect (Default is paramiko SSH_PORT)

\n

Default Value: 22

\n
\n
username (dagster.StringSource, optional)
\n

username to connect to the remote_host

\n
\n
password (dagster.StringSource, optional)
\n

password of the username to connect to the remote_host

\n
\n
key_file (dagster.StringSource, optional)
\n

key file to use to connect to the remote_host.

\n
\n
key_string (dagster.StringSource, optional)
\n

key string to use to connect to remote_host

\n
\n
timeout (Int, optional)
\n

timeout for the attempt to connect to the remote_host.

\n

Default Value: 10

\n
\n
keepalive_interval (Int, optional)
\n

send a keepalive packet to remote host every keepalive_interval seconds

\n

Default Value: 30

\n
\n
compress (Bool, optional)
\n

Default Value: True

\n
\n
no_host_key_check (Bool, optional)
\n

Default Value: True

\n
\n
allow_host_key_change (Bool, optional)
\n

Default Value: False

\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-ssh", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "\n", "next": {"link": "../dagster-twilio/", "title": "Twilio (dagster-twilio)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-spark/", "title": "Spark (dagster-spark)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-twilio", "Twilio (dagster-twilio)", "N", "next"], ["sections/api/apidocs/libraries/dagster-spark", "Spark (dagster-spark)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-ssh.rst.txt", "title": "SSH / SFTP (dagster-ssh)", "toc": "\n"}, "dagster-twilio": {"alabaster_version": "0.7.12", "body": "
\n

Twilio (dagster-twilio)\u00b6

\n

This library provides an integration with Twilio.

\n
\n
\ndagster_twilio.twilio_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
account_sid (dagster.StringSource)
\n

Twilio Account SID

\n
\n
auth_token (dagster.StringSource)
\n

Twilio Auth Token

\n
\n
\n
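
A minimal sketch (the resource object is expected to be a Twilio REST client; the phone numbers and environment variable names are placeholders):

\n
import os\n\nfrom dagster import job, op\nfrom dagster_twilio import twilio_resource\n\n@op(required_resource_keys={"twilio"})\ndef send_sms(context):\n    # Assumes the resource exposes the Twilio client's messages API.\n    context.resources.twilio.messages.create(\n        body="Dagster run finished", from_="+15550001111", to="+15552223333"\n    )\n\n@job(resource_defs={"twilio": twilio_resource})\ndef my_twilio_job():\n    send_sms()\n\nmy_twilio_job.execute_in_process(\n    run_config={\n        "resources": {\n            "twilio": {\n                "config": {\n                    "account_sid": {"env": "TWILIO_ACCOUNT_SID"},\n                    "auth_token": {"env": "TWILIO_AUTH_TOKEN"},\n                }\n            }\n        }\n    }\n)\n
\n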
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-twilio", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "\n", "next": {"link": "../dagstermill/", "title": "Dagstermill"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-ssh/", "title": "SSH / SFTP (dagster-ssh)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagstermill", "Dagstermill", "N", "next"], ["sections/api/apidocs/libraries/dagster-ssh", "SSH / SFTP (dagster-ssh)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagster-twilio.rst.txt", "title": "Twilio (dagster-twilio)", "toc": "\n"}, "dagstermill": {"alabaster_version": "0.7.12", "body": "
\n

Dagstermill\u00b6

\n
\n
\ndagstermill.define_dagstermill_solid(name, notebook_path, input_defs=None, output_defs=None, config_schema=None, required_resource_keys=None, output_notebook=None, output_notebook_name=None, asset_key_prefix=None, description=None, tags=None)[source]\u00b6
\n

Wrap a Jupyter notebook in a solid.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of the solid.

  • \n
  • notebook_path (str) \u2013 Path to the backing notebook.

  • \n
  • input_defs (Optional[List[InputDefinition]]) \u2013 The solid\u2019s inputs.

  • \n
  • output_defs (Optional[List[OutputDefinition]]) \u2013 The solid\u2019s outputs. Your notebook should\ncall yield_result() to yield each of these outputs.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 The string names of any required resources.

  • \n
  • output_notebook (Optional[str]) \u2013 If set, will be used as the name of an injected output of\ntype FileHandle that will point to the executed notebook (in\naddition to the AssetMaterialization that is always created). This\nrespects the FileManager configured on\nthe pipeline resources via the \u201cfile_manager\u201d resource key, so, e.g.,\nif s3_file_manager is configured, the output will be a :\npy:class:~dagster_aws.s3.S3FileHandle.

  • \n
  • output_notebook_name \u2013 (Optional[str]): If set, will be used as the name of an injected output\nof type BufferedIOBase that is the file object of the executed\nnotebook (in addition to the AssetMaterialization that is always\ncreated). It allows the downstream solids to access the executed notebook via a file\nobject.

  • \n
  • asset_key_prefix (Optional[Union[List[str], str]]) \u2013 If set, will be used to prefix the\nasset keys for materialized notebooks.

  • \n
  • description (Optional[str]) \u2013 If set, description used for solid.

  • \n
  • tags (Optional[Dict[str, str]]) \u2013 If set, additional tags used to annotate solid.\nDagster uses the tag keys notebook_path and kind, which cannot be\noverwritten by the user.

  • \n
\n
\n
Returns
\n

SolidDefinition

\n
\n
\n
\n\n
\n
\ndagstermill.define_dagstermill_op(name, notebook_path, input_defs=None, output_defs=None, config_schema=None, required_resource_keys=None, output_notebook_name=None, asset_key_prefix=None, description=None, tags=None)[source]\u00b6
\n

Wrap a Jupyter notebook in an op.

\n
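
A minimal sketch (the notebook path and names are hypothetical; the output_notebook_io_manager resource key follows the dagstermill examples and should be verified against your version):

\n
from dagster import file_relative_path, job\nfrom dagstermill import define_dagstermill_op, local_output_notebook_io_manager\n\nrun_notebook = define_dagstermill_op(\n    name="run_notebook",\n    notebook_path=file_relative_path(__file__, "my_notebook.ipynb"),\n    output_notebook_name="output_notebook",\n)\n\n@job(resource_defs={"output_notebook_io_manager": local_output_notebook_io_manager})\ndef my_notebook_job():\n    run_notebook()\n
\n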
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of the op.

  • \n
  • notebook_path (str) \u2013 Path to the backing notebook.

  • \n
  • input_defs (Optional[List[InputDefinition]]) \u2013 The op\u2019s inputs.

  • \n
  • output_defs (Optional[List[OutputDefinition]]) \u2013 The op\u2019s outputs. Your notebook should\ncall yield_result() to yield each of these outputs.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 The string names of any required resources.

  • \n
  • output_notebook_name \u2013 (Optional[str]): If set, will be used as the name of an injected output\nof type BufferedIOBase that is the file object of the executed\nnotebook (in addition to the AssetMaterialization that is always\ncreated). It allows the downstream ops to access the executed notebook via a file\nobject.

  • \n
  • asset_key_prefix (Optional[Union[List[str], str]]) \u2013 If set, will be used to prefix the\nasset keys for materialized notebooks.

  • \n
  • description (Optional[str]) \u2013 If set, description used for the op.

  • \n
  • tags (Optional[Dict[str, str]]) \u2013 If set, additional tags used to annotate the op.\nDagster uses the tag keys notebook_path and kind, which cannot be\noverwritten by the user.

  • \n
\n
\n
Returns
\n

SolidDefinition

\n
\n
\n
\n\n
\n
\ndagstermill.local_output_notebook_io_manager(init_context)[source]\u00b6
\n

Built-in IO Manager that handles output notebooks.

\n
\n\n
\n
\ndagstermill.get_context(solid_config=None, mode_def=None, run_config=None)\u00b6
\n

Get a dagstermill execution context for interactive exploration and development.

\n
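
A minimal sketch of interactive use inside a notebook cell:

\n
import dagstermill\n\n# Build an ephemeral context for developing notebook logic interactively.\ncontext = dagstermill.get_context()\ncontext.log.info(f"Interactive context with run_id {context.run_id}")\n
\n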
\n
Parameters
\n
    \n
  • solid_config (Optional[Any]) \u2013 If specified, this value will be made available on the\ncontext as its solid_config property.

  • \n
  • mode_def (Optional[dagster.ModeDefinition]) \u2013 If specified, defines the mode to\nuse to construct the context. Specify this if you would like a context constructed\nwith specific resource_defs or logger_defs. By default, an ephemeral mode\nwith a console logger will be constructed.

  • \n
  • run_config (Optional[dict]) \u2013 The config dict with which to construct\nthe context.

  • \n
\n
\n
Returns
\n

DagstermillExecutionContext

\n
\n
\n
\n\n
\n
\ndagstermill.yield_event(dagster_event)\u00b6
\n

Yield a dagster event directly from notebook code.

\n

When called interactively or in development, returns its input.

\n
\n
Parameters
\n

dagster_event (Union[dagster.AssetMaterialization, dagster.ExpectationResult, dagster.TypeCheck, dagster.Failure, dagster.RetryRequested]) \u2013 An event to yield back to Dagster.

\n
\n
\n
\n\n
\n
\ndagstermill.yield_result(value, output_name='result')\u00b6
\n

Yield a result directly from notebook code.

\n

When called interactively or in development, returns its input.

\n
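
A minimal sketch of a notebook cell backing a dagstermill op or solid:

\n
import dagstermill\n\n# Yield the value that should become the notebook's "result" output.\ndagstermill.yield_result(42, output_name="result")\n
\n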
\n
Parameters
\n
    \n
  • value (Any) \u2013 The value to yield.

  • \n
  • output_name (Optional[str]) \u2013 The name of the result to yield (default: 'result').

  • \n
\n
\n
\n
\n\n
\n
\nclass dagstermill.DagstermillExecutionContext(pipeline_context, pipeline_def, resource_keys_to_init, solid_name, solid_handle, solid_config=None)[source]\u00b6
\n

Dagstermill-specific execution context.

\n

Do not initialize directly: use dagstermill.get_context().

\n
\n
\nget_tag(key)[source]\u00b6
\n

Get a logging tag defined on the context.

\n
\n
Parameters
\n

key (str) \u2013 The key to get.

\n
\n
Returns
\n

str

\n
\n
\n
\n\n
\n
\nhas_tag(key)[source]\u00b6
\n

Check if a logging tag is defined on the context.

\n
\n
Parameters
\n

key (str) \u2013 The key to check.

\n
\n
Returns
\n

bool

\n
\n
\n
\n\n
\n
\nproperty log\u00b6
\n

The log manager for the context.

\n

Call, e.g., log.info() to log messages through the Dagster machinery.

\n
\n
Type
\n

dagster.DagsterLogManager

\n
\n
\n
\n\n
\n
\nproperty logging_tags\u00b6
\n

The logging tags for the context.

\n
\n
Type
\n

dict

\n
\n
\n
\n\n
\n
\nproperty pipeline_def\u00b6
\n

The pipeline definition for the context.

\n

This will be a dagstermill-specific shim.

\n
\n
Type
\n

dagster.PipelineDefinition

\n
\n
\n
\n\n
\n
\nproperty pipeline_run\u00b6
\n

The pipeline run for the context.

\n
\n
Type
\n

dagster.PipelineRun

\n
\n
\n
\n\n
\n
\nproperty resolved_run_config\u00b6
\n

The resolved_run_config for the context

\n
\n
Type
\n

dagster.ResolvedRunConfig

\n
\n
\n
\n\n
\n
\nproperty resources\u00b6
\n

A dynamically-created type whose properties allow access to\nresources.

\n
\n
Type
\n

collections.namedtuple

\n
\n
\n
\n\n
\n
\nproperty run_config\u00b6
\n

The run_config for the context.

\n
\n
Type
\n

dict

\n
\n
\n
\n\n
\n
\nproperty run_id\u00b6
\n

The run_id for the context.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty solid\u00b6
\n

The solid for the context.

\n

In interactive contexts, this may be a dagstermill-specific shim, depending on whether a\nsolid definition was passed to dagstermill.get_context.

\n
\n
Type
\n

dagster.Node

\n
\n
\n
\n\n
\n
\nproperty solid_config\u00b6
\n

A dynamically-created type whose properties allow access to\nsolid-specific config.

\n
\n
Type
\n

collections.namedtuple

\n
\n
\n
\n\n
\n
\nproperty solid_def\u00b6
\n

The solid definition for the context.

\n

In interactive contexts, this may be a dagstermill-specific shim, depending on whether a\nsolid definition was passed to dagstermill.get_context.

\n
\n
Type
\n

dagster.SolidDefinition

\n
\n
\n
\n\n
\n\n
\n
\nclass dagstermill.DagstermillError[source]\u00b6
\n

Base class for errors raised by dagstermill.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagstermill", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "\n", "next": {"link": "../dagster-graphql/", "title": "GraphQL (dagster-graphql)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-twilio/", "title": "Twilio (dagster-twilio)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-graphql", "GraphQL (dagster-graphql)", "N", "next"], ["sections/api/apidocs/libraries/dagster-twilio", "Twilio (dagster-twilio)", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/libraries/dagstermill.rst.txt", "title": "Dagstermill", "toc": "\n"}}, "loggers": {"alabaster_version": "0.7.12", "body": "
\n

Loggers\u00b6

\n
\n

Built-in loggers\u00b6

\n
\n
\ndagster.loggers.colored_console_logger(*args, **kwargs)\u00b6
\n

A built-in logger definition that emits colored log output to the console.

\n

Loggers are job-scoped logging handlers, which will be automatically invoked whenever\ndagster messages are logged from within a job.

\n
\n
Parameters
\n
    \n
  • logger_fn (Callable[[InitLoggerContext], logging.Logger]) \u2013 User-provided function to\ninstantiate the logger. This logger will be automatically invoked whenever the methods\non context.log are called from within job/pipeline compute logic.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.logger_config. If not set, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this logger.

  • \n
\n
\n
\n
\n\n
\n
\ndagster.loggers.json_console_logger(*args, **kwargs)\u00b6
\n

Core class for defining loggers.

\n

Loggers are job-scoped logging handlers, which will be automatically invoked whenever\ndagster messages are logged from within a job.

\n
\n
Parameters
\n
    \n
  • logger_fn (Callable[[InitLoggerContext], logging.Logger]) \u2013 User-provided function to\ninstantiate the logger. This logger will be automatically invoked whenever the methods\non context.log are called from within job/pipeline compute logic.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.logger_config. If not set, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this logger.

  • \n
\n
\n
\n
\n\n
\n
\n

Logging from an @op\u00b6

\n
\n
\nclass dagster.DagsterLogManager(dagster_handler, level=0, managed_loggers=None)[source]\u00b6
\n

Centralized dispatch for logging from user code.

\n

Handles the construction of uniform structured log messages and passes them through to the\nunderlying loggers/handlers.

\n

An instance of the log manager is made available to ops as context.log. Users should not\ninitialize instances of the log manager directly. To configure custom loggers, set the\nlogger_defs argument in an @job decorator or when calling the to_job() method on a\nGraphDefinition.

\n

The log manager inherits standard convenience methods like those exposed by the Python standard\nlibrary logging module (i.e., within the body of an op,\ncontext.log.{debug, info, warning, warn, error, critical, fatal}).

\n

The underlying integer API can also be called directly using, e.g.\ncontext.log.log(5, msg), and the log manager will delegate to the log method\ndefined on each of the loggers it manages.

\n

User-defined custom log levels are not supported, and calls to, e.g.,\ncontext.log.trace or context.log.notice will result in hard exceptions at runtime.

\n
\n\n
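\n

A short sketch of logging from op compute logic through the log manager; the op, job, and message text are illustrative:

\n
import logging\n\nfrom dagster import job, op\n\n\n@op\ndef greet(context):\n    # Standard convenience methods are available on context.log.\n    context.log.debug("debug-level detail")\n    context.log.info("hello from greet")\n    context.log.warning("something to keep an eye on")\n    # The integer API delegates to the log method of each managed logger.\n    context.log.log(logging.INFO, "message via the integer API")\n\n\n@job\ndef greeting_job():\n    greet()\n
\n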
\n
\n

Defining custom loggers\u00b6

\n
\n
\n@dagster.logger(config_schema=None, description=None)[source]\u00b6
\n

Define a logger.

\n

The decorated function should accept an InitLoggerContext and return an instance of\nlogging.Logger. This function will become the logger_fn of an underlying\nLoggerDefinition.

\n
\n
Parameters
\n
    \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.logger_config. If not set, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the logger.

  • \n
\n
\n
\n
\n\n
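\n

A minimal sketch of a custom logger built with this decorator and attached to a job via logger_defs; the config keys, op, and job names are hypothetical:

\n
import logging\n\nfrom dagster import job, logger, op\n\n\n@logger(\n    config_schema={"log_level": str, "name": str},\n    description="A plain Python logger writing to stderr.",\n)\ndef my_console_logger(init_context):\n    level = init_context.logger_config["log_level"]\n    name = init_context.logger_config["name"]\n\n    logger_ = logging.Logger(name=name, level=level)\n    logger_.addHandler(logging.StreamHandler())\n    return logger_\n\n\n@op\ndef hello_op(context):\n    context.log.info("hello")\n\n\n@job(logger_defs={"my_console_logger": my_console_logger})\ndef hello_job():\n    hello_op()\n
\n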
\n
\nclass dagster.LoggerDefinition(logger_fn, config_schema=None, description=None)[source]\u00b6
\n

Core class for defining loggers.

\n

Loggers are job-scoped logging handlers, which will be automatically invoked whenever\ndagster messages are logged from within a job.

\n
\n
Parameters
\n
    \n
  • logger_fn (Callable[[InitLoggerContext], logging.Logger]) \u2013 User-provided function to\ninstantiate the logger. This logger will be automatically invoked whenever the methods\non context.log are called from within job/pipeline compute logic.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.logger_config. If not set, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this logger.

  • \n
\n
\n
\n
\n
\nconfigured(config_or_config_fn, config_schema=None, description=None)\u00b6
\n

Wraps this object in an object of the same type that provides configuration to the inner\nobject.

\n
\n
Parameters
\n
    \n
  • config_or_config_fn (Union[Any, Callable[[Any], Any]]) \u2013 Either (1) Run configuration\nthat fully satisfies this object\u2019s config schema or (2) A function that accepts run\nconfiguration and returns run configuration that fully satisfies this object\u2019s\nconfig schema. In the latter case, config_schema must be specified. When\npassing a function, it\u2019s easiest to use configured().

  • \n
  • config_schema (ConfigSchema) \u2013 If config_or_config_fn is a function, the config schema\nthat its input must satisfy.

  • \n
  • description (Optional[str]) \u2013 Description of the new definition. If not specified,\ninherits the description of the definition being configured.

  • \n
\n
\n
\n

Returns (ConfigurableDefinition): A configured version of this object.

\n
\n\n
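\n

A minimal sketch of configured() applied to a built-in logger definition, assuming colored_console_logger accepts "log_level" and "name" config keys (an assumption about its config schema, not stated above):

\n
from dagster import job, op\nfrom dagster.loggers import colored_console_logger\n\n# Pre-supply config so no run config is needed for this logger at launch time.\ndebug_console = colored_console_logger.configured({"log_level": "DEBUG", "name": "dagster"})\n\n\n@op\ndef chatty_op(context):\n    context.log.debug("visible because the logger is configured at DEBUG")\n\n\n@job(logger_defs={"console": debug_console})\ndef chatty_job():\n    chatty_op()\n
\n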
\n\n
\n
\nclass dagster.InitLoggerContext(logger_config, logger_def=None, pipeline_def=None, run_id=None)[source]\u00b6
\n

Logger-specific initialization context.

\n

An instance of this class is made available as the first argument to the logger_fn decorated\nby @logger or set on a LoggerDefinition.

\n

Users should not instantiate this class.

\n
\n
\nlogger_config\u00b6
\n

The configuration data provided by the run config. The\nschema for this data is defined by config_schema on the LoggerDefinition

\n
\n
Type
\n

Any

\n
\n
\n
\n\n
\n
\npipeline_def\u00b6
\n

The pipeline/job definition currently being executed.

\n
\n
Type
\n

Optional[PipelineDefinition]

\n
\n
\n
\n\n
\n
\nlogger_def\u00b6
\n

The logger definition for the logger being constructed.

\n
\n
Type
\n

Optional[LoggerDefinition]

\n
\n
\n
\n\n
\n
\nrun_id\u00b6
\n

The ID for this run of the pipeline.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n\n
\n
\ndagster.build_init_logger_context(logger_config=None, pipeline_def=None, job_def=None)[source]\u00b6
\n

Builds logger initialization context from provided parameters.

\n

This function can be used to provide the context argument to the invocation of a logger\ndefinition.

\n

Note that you may only specify one of pipeline_def and job_def.

\n
\n
Parameters
\n
    \n
  • logger_config (Any) \u2013 The config to provide during initialization of logger.

  • \n
  • pipeline_def (Optional[PipelineDefinition]) \u2013 The pipeline definition that the logger will be\nused with.

  • \n
  • job_def (Optional[JobDefinition]) \u2013 The job definition that the logger will be used with.

  • \n
\n
\n
\n

Examples

\n
context = build_init_logger_context()\nlogger_to_init(context)\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/loggers", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../modes/", "title": "[Legacy] Modes"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../jobs/", "title": "Jobs"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/modes", "[Legacy] Modes", "N", "next"], ["sections/api/apidocs/jobs", "Jobs", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/loggers.rst.txt", "title": "Loggers", "toc": "\n"}, "memoization": {"alabaster_version": "0.7.12", "body": "
\n

Versioning and Memoization\u00b6

\n

Dagster allows for code versioning and memoization of previous outputs based upon that versioning.\nListed here are APIs related to versioning and memoization.

\n
\n

Versioning\u00b6

\n
\n
\nclass dagster.VersionStrategy[source]\u00b6
\n

Abstract class for defining a strategy to version solids and resources.

\n

When subclassing, get_solid_version must be implemented, and get_resource_version can be\noptionally implemented.

\n

get_solid_version should ingest a SolidVersionContext, and get_resource_version should ingest a\nResourceVersionContext. From these contexts, each method synthesizes a unique string called a version, which will\nbe tagged to the outputs of that solid in the pipeline. Providing a VersionStrategy instance to a\njob will enable memoization on that job, such that only steps whose outputs do not have an\nup-to-date version will run.

\n
\n\n
\n
\nclass dagster.SourceHashVersionStrategy[source]\u00b6
\n
\n\n
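\n

A minimal sketch of opting a job into memoization with the built-in source-hash strategy, assuming the @job decorator accepts a version_strategy argument in this release; the op is illustrative:

\n
from dagster import SourceHashVersionStrategy, job, op\n\n\n@op\ndef expensive_op():\n    # Re-runs are skipped while the op's source (and therefore its version) is unchanged.\n    return 42\n\n\n@job(version_strategy=SourceHashVersionStrategy())\ndef memoized_job():\n    expensive_op()\n
\n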
\n
\n

Memoization\u00b6

\n
\n
\nclass dagster.MemoizableIOManager[source]\u00b6
\n

Base class for an IO manager that can work with memoized execution. Users should implement\nthe load_input and handle_output methods described in the IOManager API, and the\nhas_output method, which returns a boolean representing whether a data object can be found.

\n
\n
\nabstract has_output(context)[source]\u00b6
\n

The user-defined method that returns whether data exists given the metadata.

\n
\n
Parameters
\n

context (OutputContext) \u2013 The context of the step performing this check.

\n
\n
Returns
\n

True if there is data present that matches the provided context. False otherwise.

\n
\n
Return type
\n

bool

\n
\n
\n
\n\n
\n\n
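\n

A hedged sketch of a filesystem-backed MemoizableIOManager; the pickle-and-path layout is illustrative rather than the library's built-in implementation:

\n
import os\nimport pickle\n\nfrom dagster import MemoizableIOManager, io_manager\n\n\nclass PickleMemoizableIOManager(MemoizableIOManager):\n    def __init__(self, base_dir):\n        self._base_dir = base_dir\n\n    def _path(self, context):\n        # The output identifier incorporates the output's version when memoization is enabled.\n        return os.path.join(self._base_dir, *context.get_output_identifier())\n\n    def handle_output(self, context, obj):\n        path = self._path(context)\n        os.makedirs(os.path.dirname(path), exist_ok=True)\n        with open(path, "wb") as f:\n            pickle.dump(obj, f)\n\n    def load_input(self, context):\n        with open(self._path(context.upstream_output), "rb") as f:\n            return pickle.load(f)\n\n    def has_output(self, context):\n        # True when a previously materialized object exists for this versioned output.\n        return os.path.exists(self._path(context))\n\n\n@io_manager(config_schema={"base_dir": str})\ndef pickle_memoizable_io_manager(init_context):\n    return PickleMemoizableIOManager(init_context.resource_config["base_dir"])\n
\n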

See also: dagster.IOManager.

\n
\n
\ndagster.MEMOIZED_RUN_TAG\u00b6
\n

Provide this tag to a run to toggle memoization on or off. {MEMOIZED_RUN_TAG: "true"} toggles memoization on, while {MEMOIZED_RUN_TAG: "false"} toggles memoization off.

\n
\n\n
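\n

A small sketch of turning memoization off for a job's runs by tagging the job itself; the op is illustrative, and any version strategy is assumed to be configured elsewhere:

\n
from dagster import MEMOIZED_RUN_TAG, job, op\n\n\n@op\ndef expensive_op():\n    return 42\n\n\n# Runs of this job skip memoization even if a version strategy is configured on it.\n@job(tags={MEMOIZED_RUN_TAG: "false"})\ndef non_memoized_job():\n    expensive_op()\n
\n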
\n
\n", "current_page_name": "sections/api/apidocs/memoization", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../libraries/dagster-airbyte/", "title": "Airbyte (dagster-airbyte)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../utilities/", "title": "Utilities"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-airbyte", "Airbyte (dagster-airbyte)", "N", "next"], ["sections/api/apidocs/utilities", "Utilities", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/memoization.rst.txt", "title": "Versioning and Memoization", "toc": "\n"}, "modes": {"alabaster_version": "0.7.12", "body": "
\n

[Legacy] Modes\u00b6

\n

Modes are only used in the creation of PipelineDefinition objects, which are now\ndeprecated in favor of JobDefinition.

\n
\n
\nclass dagster.ModeDefinition(name=None, resource_defs=None, logger_defs=None, executor_defs=None, description=None, _config_mapping=None, _partitioned_config=None)[source]\u00b6
\n

Define a mode in which a pipeline can operate.

\n

A mode provides pipelines with a set of resource implementations, loggers, system storages,\nand executors.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name of the mode. Must be unique within the\nPipelineDefinition to which the mode is attached. (default: \u201cdefault\u201d).

  • \n
  • resource_defs (Optional[Dict[str, ResourceDefinition]]) \u2013 A dictionary of string resource\nkeys to their implementations. Individual solids may require resources to be present by\nthese keys.

  • \n
  • logger_defs (Optional[Dict[str, LoggerDefinition]]) \u2013 A dictionary of string logger\nidentifiers to their implementations.

  • \n
  • executor_defs (Optional[List[ExecutorDefinition]]) \u2013 The set of executors available when\nexecuting in this mode. By default, this will be the \u2018in_process\u2019 and \u2018multiprocess\u2019\nexecutors (default_executors).

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the mode.

  • \n
  • _config_mapping (Optional[ConfigMapping]) \u2013 Only for internal use.

  • \n
  • _partitioned_config (Optional[PartitionedConfig]) \u2013 Only for internal use.

  • \n
\n
\n
\n
\n\n
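\n

A minimal sketch of attaching a mode to a legacy pipeline; the resource key and solid are hypothetical:

\n
from dagster import ModeDefinition, ResourceDefinition, pipeline, solid\n\n\n@solid(required_resource_keys={"greeting"})\ndef hello_solid(context):\n    context.log.info(context.resources.greeting)\n\n\n@pipeline(\n    mode_defs=[\n        ModeDefinition(\n            name="default",\n            resource_defs={"greeting": ResourceDefinition.hardcoded_resource("hello")},\n        )\n    ]\n)\ndef hello_pipeline():\n    hello_solid()\n
\n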
\n", "current_page_name": "sections/api/apidocs/modes", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "\n", "next": {"link": "../ops/", "title": "Ops"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../loggers/", "title": "Loggers"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/ops", "Ops", "N", "next"], ["sections/api/apidocs/loggers", "Loggers", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/modes.rst.txt", "title": "[Legacy] Modes", "toc": "\n"}, "ops": {"alabaster_version": "0.7.12", "body": "
\n

Ops\u00b6

\n

The foundational unit of computation in Dagster.

\n
\n
\n

Defining ops\u00b6

\n
\n
\n@dagster.op(name=None, description=None, ins=None, out=None, config_schema=None, required_resource_keys=None, tags=None, version=None, retry_policy=None, input_defs=None, output_defs=None)[source]\u00b6
\n

Create an op with the specified parameters from the decorated function.

\n

Ins and outs will be inferred from the type signature of the decorated function\nif not explicitly provided.

\n

The decorated function will be used as the op\u2019s compute function. The signature of the\ndecorated function is more flexible than that of the compute_fn in the core API; it may:

\n
    \n
  1. Return a value. This value will be wrapped in an Output and yielded by the compute function.

  2. Return an Output. This output will be yielded by the compute function.

  3. Yield Output or other event objects. Same as default compute behavior.

\n

Note that options 1) and 2) are incompatible with yielding other events \u2013 if you would like\nto decorate a function that yields events, it must also wrap its eventual output in an\nOutput and yield it.

\n

@op supports async def functions as well, including async generators when yielding multiple\nevents or outputs. Note that async ops will generally be run on their own unless using a custom\nExecutor implementation that supports running them together.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 Name of op. Must be unique within any GraphDefinition\nusing the op.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of this op. If not provided, and\nthe decorated function has a docstring, that docstring will be used as the description.

  • \n
  • ins (Optional[Dict[str, In]]) \u2013 Information about the inputs to the op. Information provided here will be combined\nwith what can be inferred from the function signature.

  • \n
  • out (Optional[Union[Out, Dict[str, Out]]]) \u2013 Information about the op outputs. Information provided here will be combined with\nwhat can be inferred from the return type signature if the function does not use yield.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. If set, Dagster will check\nthat config provided for the op matches this schema and fail if it does not. If not\nset, Dagster will accept any config provided for the op.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by this op.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for the op. Frameworks may\nexpect and require certain metadata to be attached to an op. Values that are not strings\nwill be JSON encoded and must meet the criteria that json.loads(json.dumps(value)) == value.

  • \n
  • version (Optional[str]) \u2013 (Experimental) The version of the op\u2019s compute_fn. Two ops should have\nthe same version if and only if they deterministically produce the same outputs when\nprovided the same inputs.

  • \n
  • retry_policy (Optional[RetryPolicy]) \u2013 The retry policy for this op.

  • \n
  • input_defs (Optional[List[InputDefinition]]) \u2013 (legacy) Preserved to ease migration from solid. Can be used in place of ins argument.

  • \n
  • output_defs (Optional[List[OutputDefinition]]) \u2013 (legacy) Preserved to ease migration from solid. Can be used in place of out argument.

  • \n
\n
\n
\n

Examples

\n
@op\ndef hello_world():\n    print('hello')\n\n@op\ndef echo(msg: str) -> str:\n    return msg\n\n@op(\n    ins={'msg': In(str)},\n    out=Out(str)\n)\ndef echo_2(msg): # same as above\n    return msg\n\n@op(\n    out={'word': Out(), 'num': Out()}\n)\ndef multi_out() -> Tuple[str, int]:\n    return 'cool', 4\n
\n
\n
\n\n
\n
\nclass dagster.OpDefinition(name, input_defs, compute_fn, output_defs, config_schema=None, description=None, tags=None, required_resource_keys=None, version=None, retry_policy=None)[source]\u00b6
\n

Defines an op, the functional unit of user-defined computation.

\n

For more details on what an op is, refer to the\nOps Overview.

\n

End users should prefer the @op decorator. OpDefinition is generally intended to be\nused by framework authors or for programmatically generated ops.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name of the op. Must be unique within any GraphDefinition or\nJobDefinition that contains the op.

  • \n
  • input_defs (List[InputDefinition]) \u2013 Inputs of the op.

  • \n
  • compute_fn (Callable) \u2013

    The core of the op, the function that performs the actual\ncomputation. The signature of this function is determined by input_defs, and\noptionally, an injected first argument, context, a collection of information\nprovided by the system.

    \n

    This function will be coerced into a generator or an async generator, which must yield\none Output for each of the op\u2019s output_defs, and additionally may\nyield other types of Dagster events, including AssetMaterialization and\nExpectationResult.

    \n

  • \n
  • output_defs (List[OutputDefinition]) \u2013 Outputs of the op.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. If set, Dagster will check\nthat the config provided for the op matches this schema and will fail if it does not. If\nnot set, Dagster will accept any config provided for the op.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of the op.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for the op. Frameworks may\nexpect and require certain metadata to be attached to an op. Users should generally\nnot set metadata directly. Values that are not strings will be JSON encoded and must meet\nthe criteria that json.loads(json.dumps(value)) == value.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by this op.

  • \n
  • version (Optional[str]) \u2013 (Experimental) The version of the op\u2019s compute_fn. Two ops should\nhave the same version if and only if they deterministically produce the same outputs\nwhen provided the same inputs.

  • \n
  • retry_policy (Optional[RetryPolicy]) \u2013 The retry policy for this op.

  • \n
\n
\n
\n

Examples

\n
def _add_one(_context, inputs):\n    yield Output(inputs["num"] + 1)\n\nOpDefinition(\n    name="add_one",\n    input_defs=[InputDefinition("num", Int)],\n    output_defs=[OutputDefinition(Int)], # default name ("result")\n    compute_fn=_add_one,\n)\n
\n
\n
\n
\nconfigured(config_or_config_fn, name, config_schema=None, description=None)\u00b6
\n

Wraps this object in an object of the same type that provides configuration to the inner\nobject.

\n
\n
Parameters
\n
    \n
  • config_or_config_fn (Union[Any, Callable[[Any], Any]]) \u2013 Either (1) Run configuration\nthat fully satisfies this object\u2019s config schema or (2) A function that accepts run\nconfiguration and returns run configuration that fully satisfies this object\u2019s\nconfig schema. In the latter case, config_schema must be specified. When\npassing a function, it\u2019s easiest to use configured().

  • \n
  • name (str) \u2013 Name of the new definition. This is a required argument, as this definition\ntype has a name uniqueness constraint.

  • \n
  • config_schema (ConfigSchema) \u2013 If config_or_config_fn is a function, the config schema\nthat its input must satisfy.

  • \n
  • description (Optional[str]) \u2013 Description of the new definition. If not specified,\ninherits the description of the definition being configured.

  • \n
\n
\n
\n

Returns (ConfigurableDefinition): A configured version of this object.

\n
\n\n
\n\n
\n
\n
\n

Ins & outs\u00b6

\n
\n
\nclass dagster.In(dagster_type=<class 'dagster.core.definitions.utils.NoValueSentinel'>, description=None, default_value=<class 'dagster.core.definitions.utils.NoValueSentinel'>, root_manager_key=None, metadata=None, asset_key=None, asset_partitions=None)[source]\u00b6
\n

Defines an argument to an op\u2019s compute function.

\n

Inputs may flow from the outputs of previous ops, or be stubbed using config. They may optionally\nbe typed using the Dagster type system.

\n
\n
Parameters
\n
    \n
  • dagster_type (Optional[Union[Type, DagsterType]]]) \u2013 The type of this input. Should only be set if the correct type can not\nbe inferred directly from the type signature of the decorated function.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of the input.

  • \n
  • default_value (Optional[Any]) \u2013 The default value to use if no input is provided.

  • \n
  • root_manager_key (Optional[str]) \u2013 (Experimental) The resource key for the\nRootInputManager used for loading this input when it is not connected to an\nupstream output.

  • \n
  • metadata (Optional[Dict[str, Any]]) \u2013 A dict of metadata for the input.

  • \n
  • asset_key (Optional[Union[AssetKey, InputContext -> AssetKey]]) \u2013 (Experimental) An AssetKey\n(or function that produces an AssetKey from the InputContext) which should be associated\nwith this In. Used for tracking lineage information through Dagster.

  • \n
  • asset_partitions (Optional[Union[Set[str], InputContext -> Set[str]]]) \u2013 (Experimental) A\nset of partitions of the given asset_key (or a function that produces this list of\npartitions from the InputContext) which should be associated with this In.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.Out(dagster_type=<class 'dagster.core.definitions.utils.NoValueSentinel'>, description=None, is_required=True, io_manager_key=None, metadata=None, asset_key=None, asset_partitions=None, asset_partitions_def=None)[source]\u00b6
\n

Defines an output from an op\u2019s compute function.

\n

Ops can have multiple outputs, in which case outputs cannot be anonymous.

\n

Many ops have only one output, in which case the user can provide a single output definition\nthat will be given the default name, \u201cresult\u201d.

\n

Outs may be typed using the Dagster type system.

\n
\n
Parameters
\n
    \n
  • dagster_type (Optional[Union[Type, DagsterType]]]) \u2013 The type of this output. Should only be set if the correct type can not\nbe inferred directly from the type signature of the decorated function.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of the output.

  • \n
  • is_required (bool) \u2013 Whether the presence of this field is required. (default: True)

  • \n
  • io_manager_key (Optional[str]) \u2013 The resource key of the output manager used for this output.\n(default: \u201cio_manager\u201d).

  • \n
  • metadata (Optional[Dict[str, Any]]) \u2013 A dict of the metadata for the output.\nFor example, users can provide a file path if the data object will be stored in a\nfilesystem, or provide information of a database table when it is going to load the data\ninto the table.

  • \n
  • asset_key (Optional[AssetKey]) \u2013 (Experimental) An AssetKey which should be associated\nwith this Out. Used for tracking lineage information through Dagster.

  • \n
  • asset_partitions (Optional[Union[Set[str], OutputContext -> Set[str]]]) \u2013 (Experimental) A\nset of partitions of the given asset_key (or a function that produces this list of\npartitions from the OutputContext) which should be associated with this Out.

  • \n
\n
\n
\n
\n\n
\n
\n
\n

Execution\u00b6

\n
\n
\nclass dagster.RetryPolicy(max_retries=1, delay=None, backoff=None, jitter=None)[source]\u00b6
\n

A declarative policy for when to request retries when an exception occurs during op execution.

\n
\n
Parameters
\n
    \n
  • max_retries (int) \u2013 The maximum number of retries to attempt. Defaults to 1.

  • \n
  • delay (Optional[Union[int,float]]) \u2013 The time in seconds to wait between the retry being requested and the next attempt\nbeing started. This unit of time can be modulated as a function of attempt number\nwith backoff and randomly with jitter.

  • \n
  • backoff (Optional[Backoff]) \u2013 A modifier for delay as a function of retry attempt number.

  • \n
  • jitter (Optional[Jitter]) \u2013 A randomizing modifier for delay, applied after backoff calculation.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.Backoff(value)[source]\u00b6
\n

A modifier for delay as a function of attempt number.

\n

LINEAR: attempt_num * delay\nEXPONENTIAL: ((2 ^ attempt_num) - 1) * delay

\n
\n\n
\n
\nclass dagster.Jitter(value)[source]\u00b6
\n

A randomizing modifier for delay, applied after backoff calculation.

\n

FULL: between 0 and the calculated delay based on backoff: random() * backoff_delay\nPLUS_MINUS: +/- the delay: backoff_delay + ((2 * (random() * delay)) - delay)

\n
\n\n
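\n

A minimal sketch combining RetryPolicy with exponential backoff and full jitter on an op; the op body is illustrative:

\n
from dagster import Backoff, Jitter, RetryPolicy, op\n\n\n@op(\n    retry_policy=RetryPolicy(\n        max_retries=3,\n        delay=0.5,  # base delay, in seconds, between attempts\n        backoff=Backoff.EXPONENTIAL,\n        jitter=Jitter.FULL,\n    )\n)\ndef call_flaky_service():\n    ...\n
\n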
\n
\n
\n

Events\u00b6

\n

The objects that can be yielded by the body of ops\u2019 compute functions to communicate with the\nDagster framework.

\n

(Note that Failure and RetryRequested are intended to be raised from ops rather than yielded.)

\n
\n

Event types\u00b6

\n
\n
\nclass dagster.Output(value, output_name='result', metadata_entries=None, metadata=None)[source]\u00b6
\n

Event corresponding to one of an op\u2019s outputs.

\n

Op compute functions must explicitly yield events of this type when they have more than\none output, or when they also yield events of other types, or when defining an op using the\nOpDefinition API directly.

\n

Outputs are values produced by ops that will be consumed by downstream ops in a job.\nThey are type-checked at op boundaries when their corresponding Out\nor the downstream In is typed.

\n
\n
Parameters
\n
    \n
  • value (Any) \u2013 The value returned by the compute function.

  • \n
  • output_name (Optional[str]) \u2013 Name of the corresponding out. (default:\n\u201cresult\u201d)

  • \n
  • metadata_entries (Optional[Union[MetadataEntry, PartitionMetadataEntry]]) \u2013 (Experimental) A set of metadata entries to attach to events related to this Output.

  • \n
  • metadata (Optional[Dict[str, Union[str, float, int, Dict, MetadataValue]]]) \u2013 Arbitrary metadata about the output. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
\n\n
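\n

A short sketch of explicitly yielding Output events from a multi-output op; the output names match the Out definitions:

\n
from dagster import Out, Output, op\n\n\n@op(out={"total": Out(int), "count": Out(int)})\ndef summarize():\n    values = [1, 2, 3]\n    # With multiple outputs, each Output must name the out it corresponds to.\n    yield Output(sum(values), output_name="total")\n    yield Output(len(values), output_name="count")\n
\n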
\n
\nclass dagster.AssetMaterialization(asset_key, description=None, metadata_entries=None, partition=None, tags=None, metadata=None)[source]\u00b6
\n

Event indicating that an op has materialized an asset.

\n

Op compute functions may yield events of this type whenever they wish to indicate to the\nDagster framework (and the end user) that they have produced a materialized value as a\nside effect of computation. Unlike outputs, asset materializations can not be passed to other\nops, and their persistence is controlled by op logic, rather than by the Dagster\nframework.

\n

Op authors should use these events to organize metadata about the side effects of their\ncomputations, enabling tooling like the Assets dashboard in Dagit.

\n
\n
Parameters
\n
    \n
  • asset_key (Union[str, List[str], AssetKey]) \u2013 A key to identify the materialized asset across job\nruns

  • \n
  • description (Optional[str]) \u2013 A longer human-readable description of the materialized value.

  • \n
  • metadata_entries (Optional[List[Union[MetadataEntry, PartitionMetadataEntry]]]) \u2013 Arbitrary metadata about the\nmaterialized value.

  • \n
  • partition (Optional[str]) \u2013 The name of the partition that was materialized.

  • \n
  • tags (Optional[Dict[str, str]]) \u2013 (Experimental) Tag metadata for a given asset\nmaterialization. Used for search and organization of the asset entry in the asset\ncatalog in Dagit.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 Arbitrary metadata about the asset. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
\n
\nstatic file(path, description=None, asset_key=None)[source]\u00b6
\n

Static constructor for standard materializations corresponding to files on disk.

\n
\n
Parameters
\n
    \n
  • path (str) \u2013 The path to the file.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the materialization.

  • \n
\n
\n
\n
\n\n
\n\n
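\n

A hedged sketch of yielding an AssetMaterialization (via the file constructor) alongside the op's regular output; the path is hypothetical:

\n
from dagster import AssetMaterialization, Output, op\n\n\n@op\ndef write_report(context):\n    path = "/tmp/nightly_report.csv"\n    # ... write the report to `path` ...\n    yield AssetMaterialization.file(path, description="Nightly report written to disk")\n    yield Output(path)\n
\n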
\n
\nclass dagster.ExpectationResult(success, label=None, description=None, metadata_entries=None, metadata=None)[source]\u00b6
\n

Event corresponding to a data quality test.

\n

Op compute functions may yield events of this type whenever they wish to indicate to the\nDagster framework (and the end user) that a data quality test has produced a (positive or\nnegative) result.

\n
\n
Parameters
\n
    \n
  • success (bool) \u2013 Whether the expectation passed or not.

  • \n
  • label (Optional[str]) \u2013 Short display name for expectation. Defaults to \u201cresult\u201d.

  • \n
  • description (Optional[str]) \u2013 A longer human-readable description of the expectation.

  • \n
  • metadata_entries (Optional[List[MetadataEntry]]) \u2013 Arbitrary metadata about the\nexpectation.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 Arbitrary metadata about the expectation. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
\n\n
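\n

A minimal sketch of yielding an ExpectationResult along with the op's output; the dataframe and check are illustrative:

\n
from dagster import ExpectationResult, Output, op\n\n\n@op\ndef validate_rows(context, df):\n    yield ExpectationResult(\n        success=len(df) > 0,\n        label="has_rows",\n        description="The input dataframe should not be empty.",\n        metadata={"row_count": len(df)},\n    )\n    yield Output(df)\n
\n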
\n
\nclass dagster.TypeCheck(success, description=None, metadata_entries=None, metadata=None)[source]\u00b6
\n

Event corresponding to the result of a typecheck.

\n

Events of this type should be returned by user-defined type checks when they need to encapsulate\nadditional metadata about a type check\u2019s success or failure. (i.e., when using\nas_dagster_type(), @usable_as_dagster_type, or the underlying\nPythonObjectDagsterType() API.)

\n

Op compute functions should generally avoid yielding events of this type to avoid confusion.

\n
\n
Parameters
\n
    \n
  • success (bool) \u2013 True if the type check succeeded, False otherwise.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the type check.

  • \n
  • metadata_entries (Optional[List[MetadataEntry]]) \u2013 Arbitrary metadata about the\ntype check.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 Arbitrary metadata about the type check. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.Failure(description=None, metadata_entries=None, metadata=None)[source]\u00b6
\n

Event indicating op failure.

\n

Raise events of this type from within op compute functions or custom type checks in order to\nindicate an unrecoverable failure in user code to the Dagster machinery and return\nstructured metadata about the failure.

\n
\n
Parameters
\n
    \n
  • description (Optional[str]) \u2013 A human-readable description of the failure.

  • \n
  • metadata_entries (Optional[List[MetadataEntry]]) \u2013 Arbitrary metadata about the\nfailure.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 Arbitrary metadata about the failure. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
\n\n
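\n

A short sketch of raising Failure with structured metadata from inside an op; the required columns are illustrative:

\n
from dagster import Failure, op\n\n\n@op\ndef enforce_schema(context, df):\n    missing = {"id", "name"} - set(df.columns)\n    if missing:\n        raise Failure(\n            description="Input is missing required columns.",\n            metadata={"missing_columns": sorted(missing)},\n        )\n    return df\n
\n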
\n
\nclass dagster.RetryRequested(max_retries=1, seconds_to_wait=None)[source]\u00b6
\n

An exception to raise from an op to indicate that it should be retried.

\n
\n
Parameters
\n
    \n
  • max_retries (Optional[int]) \u2013 The max number of retries this step should attempt before failing

  • \n
  • seconds_to_wait (Optional[Union[float,int]]) \u2013 Seconds to wait before restarting the step after putting the step\ninto the up_for_retry state.

  • \n
\n
\n
\n

Example

\n
@op\ndef flakes():\n    try:\n        flakey_operation()\n    except Exception as e:\n        raise RetryRequested(max_retries=3) from e\n
\n
\n
\n\n
\n
\n
\n

Event metadata\u00b6

\n

Dagster uses metadata to communicate arbitrary user-specified metadata about structured\nevents.

\n
\n
\nclass dagster.MetadataValue[source]\u00b6
\n

Utility class to wrap metadata values passed into Dagster events so that they can be\ndisplayed in Dagit and other tooling.

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "my_text_label": "hello",\n            "dashboard_url": MetadataValue.url("http://mycoolsite.com/my_dashboard"),\n            "num_rows": 0,\n        },\n    )\n
\n
\n
\n
\nstatic asset(asset_key)[source]\u00b6
\n

Static constructor for a metadata value referencing a Dagster asset, by key.

\n

For example:

\n
@op\ndef validate_table(context, df):\n    yield AssetMaterialization(\n        asset_key=AssetKey("my_table"),\n        metadata={\n            "Related asset": MetadataValue.asset(AssetKey('my_other_table')),\n        },\n    )\n
\n
\n
\n
Parameters
\n

asset_key (AssetKey) \u2013 The asset key referencing the asset.

\n
\n
\n
\n\n
\n
\nstatic bool(value)[source]\u00b6
\n

Static constructor for a metadata value wrapping a bool as\nBoolMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "num rows > 1000": MetadataValue.bool(len(df) > 1000),\n        },\n    )\n
\n
\n
\n
Parameters
\n

value (bool) \u2013 The bool value for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic dagster_run(run_id)[source]\u00b6
\n

Static constructor for a metadata value wrapping a reference to a Dagster run.

\n
\n
Parameters
\n

run_id (str) \u2013 The ID of the run.

\n
\n
\n
\n\n
\n
\nstatic float(value)[source]\u00b6
\n

Static constructor for a metadata value wrapping a float as\nFloatMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "size (bytes)": MetadataValue.float(calculate_bytes(df)),\n        }\n    )\n
\n
\n
\n
Parameters
\n

value (float) \u2013 The float value for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic int(value)[source]\u00b6
\n

Static constructor for a metadata value wrapping an int as\nIntMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "number of rows": MetadataValue.int(len(df)),\n        },\n    )\n
\n
\n
\n
Parameters
\n

value (int) \u2013 The int value for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic json(data)[source]\u00b6
\n

Static constructor for a metadata value wrapping JSON data as\nJsonMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context):\n    yield ExpectationResult(\n        success=not missing_things,\n        label="is_present",\n        metadata={\n            "about my dataset": MetadataValue.json({"missing_columns": missing_things})\n        },\n    )\n
\n
\n
\n
Parameters
\n

data (Dict[str, Any]) \u2013 The JSON data for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic md(data)[source]\u00b6
\n

Static constructor for a metadata value wrapping markdown data as\nMarkdownMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context, md_str):\n    yield AssetMaterialization(\n        asset_key="info",\n        metadata={\n            'Details': MetadataValue.md(md_str)\n        },\n    )\n
\n
\n
\n
Parameters
\n

md_str (str) \u2013 The markdown for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic path(path)[source]\u00b6
\n

Static constructor for a metadata value wrapping a path as\nPathMetadataValue. For example:

\n
@op\ndef emit_metadata(context):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "filepath": MetadataValue.path("path/to/file"),\n        }\n    )\n
\n
\n
\n
Parameters
\n

path (str) \u2013 The path for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic python_artifact(python_artifact)[source]\u00b6
\n

Static constructor for a metadata value wrapping a python artifact as\nPythonArtifactMetadataValue. Can be used as the value type for the\nmetadata parameter for supported events. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "class": MetadataValue.python_artifact(MyClass),\n            "function": MetadataValue.python_artifact(my_function),\n        }\n    )\n
\n
\n
\n
Parameters
\n

value (Callable) \u2013 The python class or function for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic table(records, schema=None)[source]\u00b6
\n

Static constructor for a metadata value wrapping arbitrary tabular data as\nTableMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context):\n    yield ExpectationResult(\n        success=not has_errors,\n        label="is_valid",\n        metadata={\n            "errors": MetadataValue.table(\n                records=[\n                    TableRecord(code="invalid-data-type", row=2, col="name"),\n                ],\n                schema=TableSchema(\n                    columns=[\n                        TableColumn(name="code", type="string"),\n                        TableColumn(name="row", type="int"),\n                        TableColumn(name="col", type="string"),\n                    ]\n                )\n            ),\n        },\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • records (List[TableRecord]) \u2013 The data as a list of records (i.e. rows).

  • \n
  • schema (Optional[TableSchema]) \u2013 A schema for the table.

  • \n
\n
\n
\n
\n\n
\n
\nstatic table_schema(schema)[source]\u00b6
\n

Static constructor for a metadata value wrapping a table schema as\nTableSchemaMetadataValue. Can be used as the value type\nfor the metadata parameter for supported events. For example:

\n
schema = TableSchema(\n    columns = [\n        TableColumn(name="id", type="int"),\n        TableColumn(name="status", type="bool"),\n    ]\n)\n\nDagsterType(\n    type_check_fn=some_validation_fn,\n    name='MyTable',\n    metadata={\n        'my_table_schema': MetadataValue.table_schema(schema),\n    }\n)\n
\n
\n
\n
Parameters
\n

schema (TableSchema) \u2013 The table schema for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic text(text)[source]\u00b6
\n

Static constructor for a metadata value wrapping text as\nTextMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "my_text_label": MetadataValue.text("hello")\n        },\n    )\n
\n
\n
\n
Parameters
\n

text (str) \u2013 The text string for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic url(url)[source]\u00b6
\n

Static constructor for a metadata value wrapping a URL as\nUrlMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context):\n    yield AssetMaterialization(\n        asset_key="my_dashboard",\n        metadata={\n            "dashboard_url": MetadataValue.url("http://mycoolsite.com/my_dashboard"),\n        }\n    )\n
\n
\n
\n
Parameters
\n

url (str) \u2013 The URL for a metadata entry.

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.MetadataEntry(label, description=None, entry_data=None, value=None)[source]\u00b6
\n

The standard structure for describing metadata for Dagster events.

\n

Lists of objects of this type can be passed as arguments to Dagster events and will be displayed\nin Dagit and other tooling.

\n

Should be yielded from within an IO manager to append metadata for a given input/output event.\nFor other event types, passing a dict with MetadataValue values to the metadata argument\nis preferred.

\n
\n
Parameters
\n
    \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
  • value (MetadataValue) \u2013 Typed metadata entry data. The different types allow\nfor customized display in tools like dagit.

  • \n
\n
\n
\n
\n
\nstatic asset(asset_key, label, description=None)[source]\u00b6
\n

Static constructor for a metadata entry referencing a Dagster asset, by key.

\n

For example:

\n
@op\ndef validate_table(context, df):\n    yield AssetMaterialization(\n        asset_key=AssetKey("my_table"),\n        metadata_entries=[\n             MetadataEntry.asset(AssetKey('my_other_table'), "Related asset"),\n        ],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • asset_key (AssetKey) \u2013 The asset key referencing the asset.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic float(value, label, description=None)[source]\u00b6
\n

Static constructor for a metadata entry containing a float as\nFloatMetadataValue. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata_entries=[MetadataEntry.float(calculate_bytes(df), "size (bytes)")],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • value (Optional[float]) \u2013 The float value contained by this metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic fspath(path, label=None, description=None)[source]\u00b6
\n

Static constructor for a metadata entry containing a filesystem path as\nPathMetadataValue. For example:

\n
@op\ndef emit_metadata(context):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata_entries=[MetadataEntry.fspath("path/to/file")],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • path (Optional[str]) \u2013 The path contained by this metadata entry.

  • \n
  • label (Optional[str]) \u2013 Short display label for this metadata entry. Defaults to the\nbase name of the path.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic int(value, label, description=None)[source]\u00b6
\n

Static constructor for a metadata entry containing an int as\nIntMetadataValue. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata_entries=[MetadataEntry.int(len(df), "number of rows")],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • value (Optional[int]) \u2013 The int value contained by this metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic json(data, label, description=None)[source]\u00b6
\n

Static constructor for a metadata entry containing JSON data as\nJsonMetadataValue. For example:

\n
@op\ndef emit_metadata(context):\n    yield ExpectationResult(\n        success=not missing_things,\n        label="is_present",\n        metadata_entries=[\n            MetadataEntry.json(\n                label="metadata", data={"missing_columns": missing_things},\n            )\n        ],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • data (Optional[Dict[str, Any]]) \u2013 The JSON data contained by this metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic md(md_str, label, description=None)[source]\u00b6
\n

Static constructor for a metadata entry containing markdown data as\nMarkdownMetadataValue. For example:

\n
@op\ndef emit_metadata(context, md_str):\n    yield AssetMaterialization(\n        asset_key="info",\n        metadata_entries=[MetadataEntry.md(md_str=md_str)],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • md_str (Optional[str]) \u2013 The markdown contained by this metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic path(path, label, description=None)[source]\u00b6
\n

Static constructor for a metadata entry containing a path as\nPathMetadataValue. For example:

\n
@op\ndef emit_metadata(context):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata_entries=[MetadataEntry.path("path/to/file", label="filepath")],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • path (Optional[str]) \u2013 The path contained by this metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic table(records, label, description=None, schema=None)[source]\u00b6
\n

Static constructor for a metadata entry containing tabular data as\nTableMetadataValue. For example:

\n
@op\ndef emit_metadata(context):\n    yield ExpectationResult(\n        success=not has_errors,\n        label="is_valid",\n        metadata_entries=[\n            MetadataEntry.table(\n                label="errors",\n                records=[\n                    TableRecord(code="invalid-data-type", row=2, col="name"),\n                ],\n                schema=TableSchema(\n                    columns=[\n                        TableColumn(name="code", type="string"),\n                        TableColumn(name="row", type="int"),\n                        TableColumn(name="col", type="string"),\n                    ]\n                )\n            ),\n        ],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • records (List[TableRecord]) \u2013 The data as a list of records (i.e. rows).

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
  • schema (Optional[TableSchema]) \u2013 A schema for the table. If none is provided, one will be\nautomatically generated by examining the first record. The schema will include as columns all\nfield names present in the first record, with a type of \u201cstring\u201d, \u201cint\u201d,\n\u201cbool\u201d or \u201cfloat\u201d inferred from the first record\u2019s values. If a value does\nnot directly match one of the above types, it will be treated as a string.

  • \n
\n
\n
\n
\n\n
\n
\nstatic table_schema(schema, label, description=None)[source]\u00b6
\n

Static constructor for a metadata entry containing a table schema as\nTableSchemaMetadataValue. For example:

\n
schema = TableSchema(\n    columns = [\n        TableColumn(name="id", type="int"),\n        TableColumn(name="status", type="bool"),\n    ]\n)\n\nDagsterType(\n    type_check_fn=some_validation_fn,\n    name='MyTable',\n    metadata_entries=[\n        MetadataEntry.table_schema(\n            schema,\n            label='schema',\n        )\n    ]\n)\n
\n
\n
\n
Parameters
\n
    \n
  • schema (TableSchema) \u2013 The table schema for a metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic text(text, label, description=None)[source]\u00b6
\n

Static constructor for a metadata entry containing text as\nTextMetadataValue. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata_entries=[\n            MetadataEntry.text("Text-based metadata for this event", "text_metadata")\n        ],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • text (Optional[str]) \u2013 The text of this metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic url(url, label, description=None)[source]\u00b6
\n

Static constructor for a metadata entry containing a URL as\nUrlMetadataValue. For example:

\n
@op\ndef emit_metadata(context):\n    yield AssetMaterialization(\n        asset_key="my_dashboard",\n        metadata_entries=[\n            MetadataEntry.url(\n                "http://mycoolsite.com/my_dashboard", label="dashboard_url"\n            ),\n        ],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • url (Optional[str]) \u2013 The URL contained by this metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nproperty value\u00b6
\n

Alias of entry_data.

\n
\n\n
\n\n
\n
\n

Metadata types\u00b6

\n

All metadata types inherit from MetadataValue. The following types are defined:

\n
\n
\nclass dagster.DagsterAssetMetadataValue(asset_key)[source]\u00b6
\n

Representation of a dagster asset.

\n
\n
Parameters
\n

asset_key (AssetKey) \u2013 The dagster asset key

\n
\n
\n
\n\n
\n
\nclass dagster.DagsterPipelineRunMetadataValue(run_id)[source]\u00b6
\n

Representation of a dagster pipeline run.

\n
\n
Parameters
\n

run_id (str) \u2013 The pipeline run id

\n
\n
\n
\n\n
\n
\nclass dagster.FloatMetadataValue(value)[source]\u00b6
\n

Container class for float metadata entry data.

\n
\n
Parameters
\n

value (Optional[float]) \u2013 The float value.

\n
\n
\n
\n\n
\n
\nclass dagster.IntMetadataValue(value)[source]\u00b6
\n

Container class for int metadata entry data.

\n
\n
Parameters
\n

value (Optional[int]) \u2013 The int value.

\n
\n
\n
\n\n
\n
\nclass dagster.JsonMetadataValue(data)[source]\u00b6
\n

Container class for JSON metadata entry data.

\n
\n
Parameters
\n

data (Dict[str, Any]) \u2013 The JSON data.

\n
\n
\n
\n\n
\n
\nclass dagster.MarkdownMetadataValue(md_str)[source]\u00b6
\n

Container class for markdown metadata entry data.

\n
\n
Parameters
\n

md_str (Optional[str]) \u2013 The markdown as a string.

\n
\n
\n
\n\n
\n
\nclass dagster.PathMetadataValue(path)[source]\u00b6
\n

Container class for path metadata entry data.

\n
\n
Parameters
\n

path (Optional[str]) \u2013 The path as a string or conforming to os.PathLike.

\n
\n
\n
\n\n
\n
\nclass dagster.PythonArtifactMetadataValue(module, name)[source]\u00b6
\n

Container class for python artifact metadata entry data.

\n
\n
Parameters
\n
    \n
  • module (str) \u2013 The module where the python artifact can be found

  • \n
  • name (str) \u2013 The name of the python artifact

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.TableMetadataValue(records, schema)[source]\u00b6
\n

Container class for table metadata entry data.

\n
\n
Parameters
\n
    \n
  • records (List[TableRecord]) \u2013 The data as a list of records (i.e. rows).

  • \n
  • schema (Optional[TableSchema]) \u2013 A schema for the table.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.TableSchemaMetadataValue(schema)[source]\u00b6
\n

Representation of a schema for arbitrary tabular data.

\n
\n
Parameters
\n

schema (TableSchema) \u2013 The dictionary containing the schema representation.

\n
\n
\n
\n\n
\n
\nclass dagster.TextMetadataValue(text)[source]\u00b6
\n

Container class for text metadata entry data.

\n
\n
Parameters
\n

text (Optional[str]) \u2013 The text data.

\n
\n
\n
\n\n
\n
\nclass dagster.UrlMetadataValue(url)[source]\u00b6
\n

Container class for URL metadata entry data.

\n
\n
Parameters
\n

url (Optional[str]) \u2013 The URL as a string.

\n
\n
\n
\n\n
\n
\n

Tables\u00b6

\n

These APIs provide the ability to express table schemas (TableSchema) and table rows/records (TableRecord) in Dagster. Currently the only use case for TableSchemas and TableRecords is to wrap them in their corresponding metadata classes TableMetadataValue and TableSchemaMetadataValue for attachment to events or Dagster types.

\n
\n
\nclass dagster.TableRecord(**data)[source]\u00b6
\n

Represents one record in a table. All passed keyword arguments are treated as field key/value\npairs in the record. Field keys are arbitrary strings; field values must be strings, integers,\nfloats, or bools.

\n
\n\n
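\n

A minimal sketch of building TableRecords and wrapping them for event metadata; the field names are arbitrary:

\n
from dagster import MetadataValue, TableRecord\n\nrecords = [\n    TableRecord(name="alice", score=92.5, passed=True),\n    TableRecord(name="bob", score=61.0, passed=False),\n]\n\n# Wrap the records for attachment to an event's metadata dict;\n# a TableSchema can also be passed explicitly via the schema argument.\ntable_metadata = MetadataValue.table(records)\n
\n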
\n
\nclass dagster.TableSchema(columns, constraints=None)[source]\u00b6
\n

Representation of a schema for tabular data. Schema is composed of two parts:

\n
    \n
  • A required list of columns (TableColumn). Each column specifies a\nname, type, set of constraints, and (optional) description. type\ndefaults to string if unspecified. Column constraints\n(TableColumnConstraints) consist of boolean properties unique and\nnullable, as well as a list of strings other containing string\ndescriptions of all additional constraints (e.g. \u201c<= 5\u201d).

  • \n
  • An optional list of table-level constraints (TableConstraints). A\ntable-level constraint cannot be expressed in terms of a single column,\ne.g. col a > col b. Presently, all table-level constraints must be\nexpressed as strings under the other attribute of a TableConstraints\nobject.

  • \n
\n
# example schema\nTableSchema(\n    constraints = TableConstraints(\n        other = [\n            "foo > bar",\n        ],\n    ),\n    columns = [\n        TableColumn(\n            name = "foo",\n            type = "string",\n            description = "Foo description",\n            constraints = TableColumnConstraints(\n                required = True,\n                other = [\n                    "starts with the letter 'a'",\n                ],\n            ),\n        ),\n        TableColumn(\n            name = "bar",\n            type = "string",\n        ),\n        TableColumn(\n            name = "baz",\n            type = "custom_type",\n            constraints = TableColumnConstraints(\n                unique = True,\n            )\n        ),\n    ],\n)\n
\n
\n
\n
Parameters
\n
    \n
  • columns (List[TableColumn]) \u2013 The columns of the table.

  • \n
  • constraints (Optional[TableConstraints]) \u2013 The constraints of the table.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.TableConstraints(other)[source]\u00b6
\n

Descriptor for \u201ctable-level\u201d constraints. Presently only one property,\nother is supported. This contains strings describing arbitrary\ntable-level constraints. A table-level constraint is a constraint defined\nin terms of multiple columns (e.g. col_A > col_B) or in terms of rows.

\n
\n
Parameters
\n

other (List[str]) \u2013 Descriptions of arbitrary table-level constraints.

\n
\n
\n
\n\n
\n
\nclass dagster.TableColumn(name, type='string', description=None, constraints=None)[source]\u00b6
\n

Descriptor for a table column. The only property that must be specified\nby the user is name. If no type is specified, string is assumed. If\nno constraints are specified, the column is assumed to be nullable\n(i.e. required = False) and have no other constraints beyond the data type.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of the column.

  • \n
  • type (Optional[str]) \u2013 The type of the column. Can be an arbitrary\nstring. Defaults to \u201cstring\u201d.

  • \n
  • description (Optional[str]) \u2013 Description of this column. Defaults to None.

  • \n
  • constraints (Optional[TableColumnConstraints]) \u2013 Column-level constraints.\nIf unspecified, column is nullable with no constraints.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.TableColumnConstraints(nullable=True, unique=False, other=None)[source]\u00b6
\n

Descriptor for a table column\u2019s constraints. Nullability and uniqueness are specified with\nboolean properties. All other constraints are described using arbitrary strings under the\nother property.

\n
\n
Parameters
\n
    \n
  • nullable (Optional[bool]) \u2013 If true, this column can hold null values.

  • \n
  • unique (Optional[bool]) \u2013 If true, all values in this column must be unique.

  • \n
  • other (List[str]) \u2013 Descriptions of arbitrary column-level constraints\nnot expressible by the predefined properties.

  • \n
\n
\n
\n
\n\n
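For illustration, here is a minimal sketch combining the constructors documented above; the column names and constraint strings are hypothetical, and the classes are assumed to be importable from the top-level dagster package as elsewhere on this page:

from dagster import TableColumn, TableColumnConstraints, TableConstraints, TableSchema\n\n# A two-column schema: a unique, non-nullable "id" column and a free-form "notes" column.\nexample_schema = TableSchema(\n    columns=[\n        TableColumn(\n            name="id",\n            type="int",\n            constraints=TableColumnConstraints(nullable=False, unique=True),\n        ),\n        # Defaults apply here: type="string", nullable, and no additional constraints.\n        TableColumn(name="notes"),\n    ],\n    constraints=TableConstraints(other=["at most one row per id"]),\n)\n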
\n
\n
\n

Asset key\u00b6

\n

Dagster uses AssetKey to build an index on Materialization events. Assets materialized with an AssetKey are highlighted in Dagit on the Assets dashboard.

\n
\n
\nclass dagster.AssetKey(path)[source]\u00b6
\n

Object representing the structure of an asset key. Takes in a sanitized string, list of\nstrings, or tuple of strings.

\n

Example usage:

\n
from dagster import AssetKey, AssetMaterialization, op\n\n@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key=AssetKey('flat_asset_key'),\n        metadata={"text_metadata": "Text-based metadata for this event"},\n    )\n\n@op\ndef structured_asset_key(context, df):\n    yield AssetMaterialization(\n        asset_key=AssetKey(['parent', 'child', 'grandchild']),\n        metadata={"text_metadata": "Text-based metadata for this event"},\n    )\n\n@op\ndef structured_asset_key_2(context, df):\n    yield AssetMaterialization(\n        asset_key=AssetKey(('parent', 'child', 'grandchild')),\n        metadata={"text_metadata": "Text-based metadata for this event"},\n    )\n
\n
\n
\n
Parameters
\n

path (Sequence[str]) \u2013 String, list of strings, or tuple of strings. A list of strings represents the hierarchical structure of the asset_key.

\n
\n
\n
\n
\nto_string(legacy=False)[source]\u00b6
\n

E.g. \u2018[\u201cfirst_component\u201d, \u201csecond_component\u201d]\u2019

\n
\n\n
\n
\nto_user_string()[source]\u00b6
\n

E.g. \u201cfirst_component>second_component\u201d

\n
\n\n
\n\n
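For a small illustration of the two string forms documented above (the example outputs follow the method docstrings):

from dagster import AssetKey\n\nkey = AssetKey(["parent", "child", "grandchild"])\n\nkey.to_string()       # e.g. '["parent", "child", "grandchild"]'\nkey.to_user_string()  # e.g. "parent>child>grandchild"\n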
\n
\n
\n", "current_page_name": "sections/api/apidocs/ops", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../io-managers/", "title": "IO Managers"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../modes/", "title": "[Legacy] Modes"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/io-managers", "IO Managers", "N", "next"], ["sections/api/apidocs/modes", "[Legacy] Modes", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/ops.rst.txt", "title": "Ops", "toc": "\n"}, "partitions": {"alabaster_version": "0.7.12", "body": "
\n

Partitioned Config\u00b6

\n
\n
\nclass dagster.PartitionedConfig(partitions_def, run_config_for_partition_fn, decorated_fn=None, tags_for_partition_fn=None)[source]\u00b6
\n

Defines a way of configuring a job where the job can be run on one of a discrete set of\npartitions, and each partition corresponds to run configuration for the job.

\n

Setting PartitionedConfig as the config for a job allows you to launch backfills for that job\nand view the run history across partitions.

\n
\n
\nget_run_config_for_partition_key(partition_key)[source]\u00b6
\n

Generates the run config corresponding to a partition key.

\n
\n
Parameters
\n

partition_key (str) \u2013 the key for a partition that should be used to generate a run config.

\n
\n
\n
\n\n
\n\n
\n
\ndagster.static_partitioned_config(partition_keys, tags_for_partition_fn=None)[source]\u00b6
\n

Creates a static partitioned config for a job.

\n

The provided partition_keys is a static list of strings identifying the set of partitions. The list of partitions is static, so while the run config returned by the decorated function may change over time, the list of valid partition keys does not.

\n

This has performance advantages over dynamic_partitioned_config in terms of loading different\npartition views in Dagit.

\n

The decorated function takes in a partition key and returns a valid run config for a particular\ntarget job.

\n
\n
Parameters
\n

partition_keys (List[str]) \u2013 A list of valid partition keys, which serve as the range of\nvalues that can be provided to the decorated run config function.

\n
\n
Returns
\n

PartitionedConfig

\n
\n
\n
\n\n
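For illustration, a minimal sketch of a statically partitioned job; the op, job, and partition keys are hypothetical, and the standard @op and @job decorators are assumed:

from dagster import job, op, static_partitioned_config\n\nCONTINENTS = ["Africa", "Antarctica", "Asia", "Europe", "North America", "Oceania", "South America"]\n\n@static_partitioned_config(partition_keys=CONTINENTS)\ndef continent_config(partition_key: str):\n    return {"ops": {"continent_op": {"config": {"continent_name": partition_key}}}}\n\n@op(config_schema={"continent_name": str})\ndef continent_op(context):\n    context.log.info(context.op_config["continent_name"])\n\n@job(config=continent_config)\ndef continent_job():\n    continent_op()\n\n# The run config for a single partition can also be retrieved directly:\nrun_config = continent_config.get_run_config_for_partition_key("Europe")\n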
\n
\ndagster.dynamic_partitioned_config(partition_fn, tags_for_partition_fn=None)[source]\u00b6
\n

Creates a dynamic partitioned config for a job.

\n

The provided partition_fn returns a list of strings identifying the set of partitions, given\nan optional datetime argument (representing the current time). The list of partitions returned\nmay change over time.

\n

The decorated function takes in a partition key and returns a valid run config for a particular\ntarget job.

\n
\n
Parameters
\n

partition_fn (Callable[[datetime.datetime], Sequence[str]]) \u2013 A function that generates a\nlist of valid partition keys, which serve as the range of values that can be provided\nto the decorated run config function.

\n
\n
Returns
\n

PartitionedConfig

\n
\n
\n
\n\n
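For illustration, a minimal sketch of a dynamic partitioned config; the lookup function is a hypothetical stand-in for any source of partition keys that may change over time:

from datetime import datetime\nfrom typing import List, Optional\n\nfrom dagster import dynamic_partitioned_config\n\ndef fetch_customer_ids(current_time: Optional[datetime] = None) -> List[str]:\n    # Hypothetical lookup; in practice this might query a database or an API.\n    return ["customer_1", "customer_2"]\n\n@dynamic_partitioned_config(partition_fn=fetch_customer_ids)\ndef customer_config(partition_key: str):\n    return {"ops": {"process_customer": {"config": {"customer_id": partition_key}}}}\n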
\n
\ndagster.hourly_partitioned_config(start_date, minute_offset=0, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]\u00b6
\n

Defines run config over a set of hourly partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the bounds of the hourly partition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at the start_date at midnight. The last partition in\nthe set will end before the current time, unless the end_offset argument is set to a positive\nnumber. If minute_offset is provided, the start and end times of each partition will be\nminute_offset past the hour.

\n
\n
Parameters
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions. Can\nprovide in either a datetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
\n
\n
\n
@hourly_partitioned_config(start_date=datetime(2022, 3, 12))\n# creates partitions (2022-03-12-00:00, 2022-03-12-01:00), (2022-03-12-01:00, 2022-03-12-02:00), ...\n\n@hourly_partitioned_config(start_date=datetime(2022, 3, 12), minute_offset=15)\n# creates partitions (2022-03-12-00:15, 2022-03-12-01:15), (2022-03-12-01:15, 2022-03-12-02:15), ...\n
\n
\n
\n\n
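For illustration, a fuller sketch of how the decorated function and a job fit together; the op and its config keys are hypothetical:

from datetime import datetime\n\nfrom dagster import hourly_partitioned_config, job, op\n\n@hourly_partitioned_config(start_date=datetime(2022, 3, 12))\ndef my_hourly_config(start: datetime, end: datetime):\n    # Run config for the partition spanning [start, end).\n    return {"ops": {"ingest": {"config": {"start": start.isoformat(), "end": end.isoformat()}}}}\n\n@op(config_schema={"start": str, "end": str})\ndef ingest(context):\n    context.log.info(f"ingesting {context.op_config['start']} to {context.op_config['end']}")\n\n@job(config=my_hourly_config)\ndef ingest_job():\n    ingest()\n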
\n
\ndagster.daily_partitioned_config(start_date, minute_offset=0, hour_offset=0, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]\u00b6
\n

Defines run config over a set of daily partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the bounds\nof the date partition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at the start_date at midnight. The last partition in\nthe set will end before the current time, unless the end_offset argument is set to a positive\nnumber. If minute_offset and/or hour_offset are used, the start and end times of each partition\nwill be hour_offset:minute_offset of each day.

\n
\n
Parameters
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions. Can\nprovide in either a datetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
\n
\n
\n
@daily_partitioned_config(start_date="2022-03-12")\n# creates partitions (2022-03-12-00:00, 2022-03-13-00:00), (2022-03-13-00:00, 2022-03-14-00:00), ...\n\n@daily_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=16)\n# creates partitions (2022-03-12-16:15, 2022-03-13-16:15), (2022-03-13-16:15, 2022-03-14-16:15), ...\n
\n
\n
\n\n
\n
\ndagster.weekly_partitioned_config(start_date, minute_offset=0, hour_offset=0, day_offset=0, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]\u00b6
\n

Defines run config over a set of weekly partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the bounds of the weekly partition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job. The first partition in the set will start at the start_date. The last partition in the set will end before the current time, unless the end_offset argument is set to a positive number. If day_offset is provided, the start and end date of each partition will be the day of the week corresponding to day_offset (0 indexed with Sunday as the start of the week). If minute_offset and/or hour_offset are used, the start and end times of each partition will be hour_offset:minute_offset of each day.

\n
\n
Parameters
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions will be the Sunday at midnight following start_date. Can provide in either a datetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • day_offset (int) \u2013 Day of the week to \u201csplit\u201d the partition. Defaults to 0 (Sunday).

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
\n
\n
\n
@weekly_partitioned_config(start_date="2022-03-12")\n# creates partitions (2022-03-13-00:00, 2022-03-20-00:00), (2022-03-20-00:00, 2022-03-27-00:00), ...\n\n@weekly_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=6)\n# creates partitions (2022-03-12-03:15, 2022-03-19-03:15), (2022-03-19-03:15, 2022-03-26-03:15), ...\n
\n
\n
\n\n
\n
\ndagster.monthly_partitioned_config(start_date, minute_offset=0, hour_offset=0, day_offset=1, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]\u00b6
\n

Defines run config over a set of monthly partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the bounds of the monthly partition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job. The first partition in the set will start at midnight on the soonest first of the month after start_date. The last partition in the set will end before the current time, unless the end_offset argument is set to a positive number. If day_offset is provided, the start and end date of each partition will fall on that day of the month. If minute_offset and/or hour_offset are used, the start and end times of each partition will be hour_offset:minute_offset of each day.

\n
\n
Parameters
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions will be midnight on the soonest first of the month following start_date. Can provide in either a datetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • day_offset (int) \u2013 Day of the month to \u201csplit\u201d the partition. Defaults to 1.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
\n
\n
\n
@monthly_partitioned_config(start_date="2022-03-12")\n# creates partitions (2022-04-01-00:00, 2022-05-01-00:00), (2022-05-01-00:00, 2022-06-01-00:00), ...\n\n@monthly_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=5)\n# creates partitions (2022-04-05-03:15, 2022-05-05-03:15), (2022-05-05-03:15, 2022-06-05-03:15), ...\n
\n
\n
\n\n
\n
\n

Partitions Definitions\u00b6

\n
\n
\nclass dagster.PartitionsDefinition(*args, **kwds)[source]\u00b6
\n
\n\n
\n
\nclass dagster.HourlyPartitionsDefinition(start_date, minute_offset=0, timezone=None, fmt=None, end_offset=0)[source]\u00b6
\n
\n\n
\n
\nclass dagster.DailyPartitionsDefinition(start_date, minute_offset=0, hour_offset=0, timezone=None, fmt=None, end_offset=0)[source]\u00b6
\n
\n\n
\n
\nclass dagster.WeeklyPartitionsDefinition(start_date, minute_offset=0, hour_offset=0, day_offset=0, timezone=None, fmt=None, end_offset=0)[source]\u00b6
\n
\n\n
\n
\nclass dagster.MonthlyPartitionsDefinition(start_date, minute_offset=0, hour_offset=0, day_offset=1, timezone=None, fmt=None, end_offset=0)[source]\u00b6
\n
\n\n
\n
\nclass dagster.TimeWindowPartitionsDefinition(schedule_type, start, timezone, fmt, end_offset, minute_offset=0, hour_offset=0, day_offset=None)[source]\u00b6
\n
\n\n
\n
\nclass dagster.StaticPartitionsDefinition(partition_keys)[source]\u00b6
\n
\n\n
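For illustration, constructing two of the definitions above; the arguments mirror the signatures shown, and the keys and dates are hypothetical:

from dagster import DailyPartitionsDefinition, StaticPartitionsDefinition\n\n# One partition per color.\ncolor_partitions = StaticPartitionsDefinition(["red", "green", "blue"])\n\n# One partition per day, starting 2022-03-12.\ndaily_partitions = DailyPartitionsDefinition(start_date="2022-03-12")\n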
\n
\n

Partitioned Schedules\u00b6

\n
\n
\ndagster.build_schedule_from_partitioned_job(job, description=None, name=None, minute_of_hour=None, hour_of_day=None, day_of_week=None, day_of_month=None, default_status=<DefaultScheduleStatus.STOPPED: 'STOPPED'>)[source]
\n

Creates a schedule from a time window-partitioned job.

\n

The schedule executes at the cadence specified by the partitioning of the given job.

\n
\n\n
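For illustration, a minimal sketch; ingest_job stands in for any job whose config is a time window-partitioned config, such as the hourly example above:

from dagster import build_schedule_from_partitioned_job\n\n# The schedule runs at the cadence implied by the job's partitioning.\ningest_schedule = build_schedule_from_partitioned_job(ingest_job)\n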
\n
\n

Legacy Functions\u00b6

\n

The following functions are useful for working with partitions on legacy pipelines.

\n
\n
\nclass dagster.Partition(value, name=None)[source]\u00b6
\n

A Partition represents a single slice of the entire set of a job\u2019s possible work. It consists\nof a value, which is an object that represents that partition, and an optional name, which is\nused to label the partition in a human-readable way.

\n
\n
Parameters
\n
    \n
  • value (Any) \u2013 The object for this partition

  • \n
  • name (str) \u2013 Name for this partition

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.PartitionSetDefinition(name, pipeline_name=None, partition_fn=None, solid_selection=None, mode=None, run_config_fn_for_partition=<function PartitionSetDefinition.<lambda>>, tags_fn_for_partition=<function PartitionSetDefinition.<lambda>>, partitions_def=None, job_name=None)[source]\u00b6
\n

Defines a partition set, representing the set of slices making up an axis of a pipeline.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name for this partition set

  • \n
  • pipeline_name (str) \u2013 The name of the pipeline definition

  • \n
  • partition_fn (Optional[Callable[[], List[Partition]]]) \u2013 User-provided function to define the set of valid partition objects.

  • \n
  • solid_selection (Optional[List[str]]) \u2013 A list of solid subselection (including single\nsolid names) to execute with this partition. e.g. ['*some_solid+', 'other_solid']

  • \n
  • mode (Optional[str]) \u2013 The mode to apply when executing this partition. (default: \u2018default\u2019)

  • \n
  • run_config_fn_for_partition (Callable[[Partition], Any]) \u2013 A\nfunction that takes a Partition and returns the run\nconfiguration that parameterizes the execution for this partition.

  • \n
  • tags_fn_for_partition (Callable[[Partition], Optional[dict[str, str]]]) \u2013 A function that takes a Partition and returns a dictionary of tag key/value pairs that will be added to the generated run for this partition.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 A set of parameters used to construct the set\nof valid partition objects.

  • \n
\n
\n
\n
\n
\ncreate_schedule_definition(schedule_name, cron_schedule, partition_selector, should_execute=None, environment_vars=None, execution_timezone=None, description=None, decorated_fn=None, job=None, default_status=<DefaultScheduleStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Create a ScheduleDefinition from a PartitionSetDefinition.

\n
\n
Parameters
\n
    \n
  • schedule_name (str) \u2013 The name of the schedule.

  • \n
  • cron_schedule (str) \u2013 A valid cron string for the schedule

  • \n
  • partition_selector (Callable[[ScheduleEvaluationContext, PartitionSetDefinition], Union[Partition, List[Partition]]]) \u2013 Function that determines the partition to use at a given execution time. Can return either a single Partition or a list of Partitions. For time-based partition sets, will likely be either identity_partition_selector or a selector returned by create_offset_partition_selector.

  • \n
  • should_execute (Optional[function]) \u2013 Function that runs at schedule execution time that\ndetermines whether a schedule should execute. Defaults to a function that always returns\nTrue.

  • \n
  • environment_vars (Optional[dict]) \u2013 The environment variables to set for the schedule.

  • \n
  • execution_timezone (Optional[str]) \u2013 Timezone in which the schedule should run.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the schedule.

  • \n
  • default_status (DefaultScheduleStatus) \u2013 Whether the schedule starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
\n
\n
Returns
\n

The generated PartitionScheduleDefinition for the partition selector

\n
\n
Return type
\n

PartitionScheduleDefinition

\n
\n
\n
\n\n
\n
\nget_partitions(current_time=None)[source]\u00b6
\n

Return the set of known partitions.

\n
\n
Parameters
\n

current_time (Optional[datetime]) \u2013 The evaluation time for the partition function, which\nis passed through to the partition_fn (if it accepts a parameter). Defaults to\nthe current time in UTC.

\n
\n
\n
\n\n
\n\n
\n
\ndagster.date_partition_range(start, end=None, delta_range='days', fmt=None, inclusive=False, timezone=None)[source]\u00b6
\n

Utility function that returns a partition generating function to be used in creating a\nPartitionSet definition.

\n
\n
Parameters
\n
    \n
  • start (datetime) \u2013 Datetime capturing the start of the time range.

  • \n
  • end (Optional(datetime)) \u2013 Datetime capturing the end of the partition. By default, the\ncurrent time is used. The range is not inclusive of the end\nvalue.

  • \n
  • delta_range (Optional(str)) \u2013 string representing the time duration of each partition.\nMust be a valid argument to pendulum.period.range (\u201cdays\u201d, \u201chours\u201d, \u201cmonths\u201d, etc.).

  • \n
  • fmt (Optional(str)) \u2013 Format string to represent each partition by its start time

  • \n
  • inclusive (Optional(bool)) \u2013 By default, the partition set only contains date interval\npartitions for which the end time of the interval is less than current time. In other\nwords, the partition set contains date interval partitions that are completely in the\npast. If inclusive is set to True, then the partition set will include all date\ninterval partitions for which the start time of the interval is less than the\ncurrent time.

  • \n
  • timezone (Optional(str)) \u2013 Timezone in which the partition values should be expressed.

  • \n
\n
\n
Returns
\n

Callable[[], List[Partition]]

\n
\n
\n
\n\n
\n
\ndagster.identity_partition_selector(context, partition_set_def)[source]\u00b6
\n

Utility function for supplying a partition selector when creating a schedule from a\npartition set made of datetime objects that assumes the schedule always executes at the\npartition time.

\n

It\u2019s important that the cron string passed into create_schedule_definition match\nthe partition set times. For example, a schedule created from a partition set with partitions for each day at\nmidnight would create its partition selector as follows:

\n
partition_set = PartitionSetDefinition(\n    name='hello_world_partition_set',\n    pipeline_name='hello_world_pipeline',\n    partition_fn=date_partition_range(\n        start=datetime.datetime(2021, 1, 1),\n        delta_range="days",\n        timezone="US/Central",\n    ),\n    run_config_fn_for_partition=my_run_config_fn,\n)\n\nschedule_definition = partition_set.create_schedule_definition(\n    "hello_world_daily_schedule",\n    "0 0 * * *",\n    partition_selector=identity_partition_selector,\n    execution_timezone="US/Central",\n)\n
\n
\n
\n\n
\n
\ndagster.create_offset_partition_selector(execution_time_to_partition_fn)[source]\u00b6
\n

Utility function for supplying a partition selector when creating a schedule from a\npartition set made of datetime objects that assumes a fixed time offset between the\npartition time and the time at which the schedule executes.

\n

It\u2019s important to keep the cron string that\u2019s supplied to\nPartitionSetDefinition.create_schedule_definition in sync with the offset that\u2019s\nsupplied to this function. For example, a schedule created from a partition set with\npartitions for each day at midnight that fills in the partition for day N at day N+1 at\n10:00AM would create the partition selector as follows:

\n
partition_set = PartitionSetDefinition(\n    name='hello_world_partition_set',\n    pipeline_name='hello_world_pipeline',\n    partition_fn=date_partition_range(\n        start=datetime.datetime(2021, 1, 1),\n        delta_range="days",\n        timezone="US/Central",\n    ),\n    run_config_fn_for_partition=my_run_config_fn,\n)\n\nschedule_definition = partition_set.create_schedule_definition(\n    "daily_10am_schedule",\n    "0 10 * * *",\n    partition_selector=create_offset_partition_selector(lambda d: d.subtract(hours=10, days=1)),\n    execution_timezone="US/Central",\n)\n
\n
\n
\n
Parameters
\n

execution_time_to_partition_fn (Callable[[datetime.datetime], datetime.datetime]) \u2013 A\nfunction that maps the execution time of the schedule to the partition time.

\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/partitions", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../pipeline/", "title": "[Legacy] Pipelines"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../io-managers/", "title": "IO Managers"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/pipeline", "[Legacy] Pipelines", "N", "next"], ["sections/api/apidocs/io-managers", "IO Managers", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/partitions.rst.txt", "title": "Partitioned Config", "toc": "\n"}, "pipeline": {"alabaster_version": "0.7.12", "body": "
\n

[Legacy] Pipelines\u00b6

\n

As of Dagster 0.13.0, we recommend using Jobs as an alternative to Pipelines.

\n
\n

Pipeline definitions\u00b6

\n
\n
\n@dagster.pipeline(name=None, description=None, mode_defs=None, preset_defs=None, tags=None, hook_defs=None, input_defs=None, output_defs=None, config_schema=None, config_fn=None, solid_retry_policy=None, version_strategy=None)[source]\u00b6
\n

Create a pipeline with the specified parameters from the decorated composition function.

\n

Using this decorator allows you to build up the dependency graph of the pipeline by writing a\nfunction that invokes solids and passes the output to other solids.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name of the pipeline. Must be unique within any\nRepositoryDefinition containing the pipeline.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the pipeline.

  • \n
  • mode_defs (Optional[List[ModeDefinition]]) \u2013 The set of modes in which this pipeline can\noperate. Modes are used to attach resources, custom loggers, custom system storage\noptions, and custom executors to a pipeline. Modes can be used, e.g., to vary\navailable resource and logging implementations between local test and production runs.

  • \n
  • preset_defs (Optional[List[PresetDefinition]]) \u2013 A set of preset collections of configuration\noptions that may be used to execute a pipeline. A preset consists of an environment\ndict, an optional subset of solids to execute, and a mode selection. Presets can be used\nto ship common combinations of options to pipeline end users in Python code, and can\nbe selected by tools like Dagit.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for any execution run of the pipeline.\nValues that are not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value. These tag values may be overwritten by tag\nvalues provided at invocation time.

  • \n
  • hook_defs (Optional[Set[HookDefinition]]) \u2013 A set of hook definitions applied to the\npipeline. When a hook is applied to a pipeline, it will be attached to all solid\ninstances within the pipeline.

  • \n
  • solid_retry_policy (Optional[RetryPolicy]) \u2013 The default retry policy for all solids in\nthis pipeline. Only used if retry policy is not defined on the solid definition or\nsolid invocation.

  • \n
  • version_strategy (Optional[VersionStrategy]) \u2013 The version strategy to use with this\npipeline. Providing a VersionStrategy will enable memoization on the pipeline.

  • \n
\n
\n
\n

Example

\n
@solid(output_defs=[OutputDefinition(int, "two"), OutputDefinition(int, "four")])\ndef emit_two_four(_) -> int:\n    yield Output(2, "two")\n    yield Output(4, "four")\n\n\n@lambda_solid\ndef add_one(num: int) -> int:\n    return num + 1\n\n\n@lambda_solid\ndef mult_two(num: int) -> int:\n    return num * 2\n\n\n@pipeline\ndef math_pipeline():\n    two, four = emit_two_four()\n    add_one(two)\n    mult_two(four)\n
\n
\n
\n\n
\n
\nclass dagster.PipelineDefinition(solid_defs=None, name=None, description=None, dependencies=None, mode_defs=None, preset_defs=None, tags=None, hook_defs=None, solid_retry_policy=None, graph_def=None, _parent_pipeline_def=None, version_strategy=None, asset_layer=None)[source]\u00b6
\n

Defines a Dagster pipeline.

\n

A pipeline is made up of

\n
    \n
  • Solids, each of which is a single functional unit of data computation.

  • \n
  • Dependencies, which determine how the values produced by solids as their outputs flow from\none solid to another. This tells Dagster how to arrange solids, and potentially multiple\naliased instances of solids, into a directed, acyclic graph (DAG) of compute.

  • \n
  • Modes, which can be used to attach resources, custom loggers, custom system storage\noptions, and custom executors to a pipeline, and to switch between them.

  • \n
  • Presets, which can be used to ship common combinations of pipeline config options in Python\ncode, and to switch between them.

  • \n
\n
\n
Parameters
\n
    \n
  • solid_defs (List[SolidDefinition]) \u2013 The set of solids used in this pipeline.

  • \n
  • name (str) \u2013 The name of the pipeline. Must be unique within any\nRepositoryDefinition containing the pipeline.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the pipeline.

  • \n
  • dependencies (Optional[Dict[Union[str, NodeInvocation], Dict[str, DependencyDefinition]]]) \u2013 A structure that declares the dependencies of each solid\u2019s inputs on the outputs of\nother solids in the pipeline. Keys of the top level dict are either the string names of\nsolids in the pipeline or, in the case of aliased solids,\nNodeInvocations. Values of the top level dict are\nthemselves dicts, which map input names belonging to the solid or aliased solid to\nDependencyDefinitions.

  • \n
  • mode_defs (Optional[List[ModeDefinition]]) \u2013 The set of modes in which this pipeline can\noperate. Modes are used to attach resources, custom loggers, custom system storage\noptions, and custom executors to a pipeline. Modes can be used, e.g., to vary available\nresource and logging implementations between local test and production runs.

  • \n
  • preset_defs (Optional[List[PresetDefinition]]) \u2013 A set of preset collections of configuration\noptions that may be used to execute a pipeline. A preset consists of an environment\ndict, an optional subset of solids to execute, and a mode selection. Presets can be used\nto ship common combinations of options to pipeline end users in Python code, and can\nbe selected by tools like Dagit.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for any execution run of the pipeline.\nValues that are not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value. These tag values may be overwritten by tag\nvalues provided at invocation time.

  • \n
  • hook_defs (Optional[AbstractSet[HookDefinition]]) \u2013 A set of hook definitions applied to the\npipeline. When a hook is applied to a pipeline, it will be attached to all solid\ninstances within the pipeline.

  • \n
  • solid_retry_policy (Optional[RetryPolicy]) \u2013 The default retry policy for all solids in\nthis pipeline. Only used if retry policy is not defined on the solid definition or\nsolid invocation.

  • \n
  • asset_layer (Optional[AssetLayer]) \u2013 Structured object containing all definition-time asset\ninformation for this pipeline.

  • \n
  • _parent_pipeline_def (INTERNAL ONLY) \u2013 Used for tracking pipelines created using solid subsets.

  • \n
\n
\n
\n

Examples

\n
@solid\ndef return_one(_):\n    return 1\n\n\n@solid(input_defs=[InputDefinition('num')], required_resource_keys={'op'})\ndef apply_op(context, num):\n    return context.resources.op(num)\n\n@resource(config_schema=Int)\ndef adder_resource(init_context):\n    return lambda x: x + init_context.resource_config\n\n\nadd_mode = ModeDefinition(\n    name='add_mode',\n    resource_defs={'op': adder_resource},\n    description='Mode that adds things',\n)\n\n\nadd_three_preset = PresetDefinition(\n    name='add_three_preset',\n    run_config={'resources': {'op': {'config': 3}}},\n    mode='add_mode',\n)\n\n\npipeline_def = PipelineDefinition(\n    name='basic',\n    solid_defs=[return_one, apply_op],\n    dependencies={'apply_op': {'num': DependencyDefinition('return_one')}},\n    mode_defs=[add_mode],\n    preset_defs=[add_three_preset],\n)\n
\n
\n
\n\n
\n
\n

Executing pipelines\u00b6

\n
\n
\ndagster.execute_pipeline(pipeline, run_config=None, mode=None, preset=None, tags=None, solid_selection=None, instance=None, raise_on_error=True)[source]\u00b6
\n

Execute a pipeline synchronously.

\n

Users will typically call this API when testing pipeline execution, or running standalone\nscripts.

\n
\n
Parameters
\n
    \n
  • pipeline (Union[IPipeline, PipelineDefinition]) \u2013 The pipeline to execute.

  • \n
  • run_config (Optional[dict]) \u2013 The configuration that parametrizes this run,\nas a dict.

  • \n
  • mode (Optional[str]) \u2013 The name of the pipeline mode to use. You may not set both mode\nand preset.

  • \n
  • preset (Optional[str]) \u2013 The name of the pipeline preset to use. You may not set both\nmode and preset.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary key-value pairs that will be added to pipeline\nlogs.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The instance to execute against. If this is None,\nan ephemeral instance will be used, and no artifacts will be persisted from the run.

  • \n
  • raise_on_error (Optional[bool]) \u2013 Whether or not to raise exceptions when they occur. Defaults to True, since this is the most useful behavior in tests.

  • \n
  • solid_selection (Optional[List[str]]) \u2013

    A list of solid selection queries (including single\nsolid names) to execute. For example:

    \n
      \n
    • ['some_solid']: selects some_solid itself.

    • \n
    • ['*some_solid']: select some_solid and all its ancestors (upstream dependencies).

    • \n
    • ['*some_solid+++']: select some_solid, all its ancestors, and its descendants\n(downstream dependencies) within 3 levels down.

    • \n
    • ['*some_solid', 'other_solid_a', 'other_solid_b+']: select some_solid and all its\nancestors, other_solid_a itself, and other_solid_b and its direct child solids.

    • \n
    \n

  • \n
\n
\n
Returns
\n

The result of pipeline execution.

\n
\n
Return type
\n

PipelineExecutionResult

\n
\n
\n

For the asynchronous version, see execute_pipeline_iterator().

\n
\n\n
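For illustration, a minimal sketch that executes the math_pipeline from the @pipeline example above and reads an output via the result object documented below:

from dagster import execute_pipeline\n\nresult = execute_pipeline(math_pipeline)\nassert result.success\n# add_one received the "two" output (2) and added 1 to it.\nassert result.output_for_solid("add_one") == 3\n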
\n
\ndagster.execute_pipeline_iterator(pipeline, run_config=None, mode=None, preset=None, tags=None, solid_selection=None, instance=None)[source]\u00b6
\n

Execute a pipeline iteratively.

\n

Rather than package up the result of running a pipeline into a single object, like\nexecute_pipeline(), this function yields the stream of events resulting from pipeline\nexecution.

\n

This is intended to allow the caller to handle these events on a streaming basis in whatever\nway is appropriate.

\n
\n
Parameters
\n
    \n
  • pipeline (Union[IPipeline, PipelineDefinition]) \u2013 The pipeline to execute.

  • \n
  • run_config (Optional[dict]) \u2013 The configuration that parametrizes this run,\nas a dict.

  • \n
  • mode (Optional[str]) \u2013 The name of the pipeline mode to use. You may not set both mode\nand preset.

  • \n
  • preset (Optional[str]) \u2013 The name of the pipeline preset to use. You may not set both\nmode and preset.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary key-value pairs that will be added to pipeline\nlogs.

  • \n
  • solid_selection (Optional[List[str]]) \u2013

    A list of solid selection queries (including single\nsolid names) to execute. For example:

    \n
      \n
    • ['some_solid']: selects some_solid itself.

    • \n
    • ['*some_solid']: select some_solid and all its ancestors (upstream dependencies).

    • \n
    • ['*some_solid+++']: select some_solid, all its ancestors, and its descendants\n(downstream dependencies) within 3 levels down.

    • \n
    • ['*some_solid', 'other_solid_a', 'other_solid_b+']: select some_solid and all its\nancestors, other_solid_a itself, and other_solid_b and its direct child solids.

    • \n
    \n

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The instance to execute against. If this is None,\nan ephemeral instance will be used, and no artifacts will be persisted from the run.

  • \n
\n
\n
Returns
\n

The stream of events resulting from pipeline execution.

\n
\n
Return type
\n

Iterator[DagsterEvent]

\n
\n
\n
\n\n
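For illustration, a minimal sketch that streams events as they are produced; it assumes DagsterEventType is importable from the top-level dagster package and reuses the math_pipeline example above:

from dagster import DagsterEventType, execute_pipeline_iterator\n\nfor event in execute_pipeline_iterator(math_pipeline):\n    if event.event_type == DagsterEventType.STEP_SUCCESS:\n        print(f"step succeeded: {event.step_key}")\n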
\n
\nclass dagster.PipelineExecutionResult(pipeline_def, run_id, event_list, reconstruct_context, output_capture=None)[source]\u00b6
\n

The result of executing a pipeline.

\n

Returned by execute_pipeline(). Users should not instantiate this class directly.

\n
\n
\noutput_for_solid(handle_str, output_name='result')\u00b6
\n

Get the output of a solid by its solid handle string and output name.

\n
\n
Parameters
\n
    \n
  • handle_str (str) \u2013 The string handle for the solid.

  • \n
  • output_name (str) \u2013 Optional. The name of the output; defaults to DEFAULT_OUTPUT.

  • \n
\n
\n
Returns
\n

The output value for the handle and output_name.

\n
\n
\n
\n\n
\n
\nresult_for_handle(handle)\u00b6
\n

Get the result of a solid by its solid handle.

\n

This allows indexing into top-level solids to retrieve the results of children of\ncomposite solids.

\n
\n
Parameters
\n

handle (Union[str,NodeHandle]) \u2013 The handle for the solid.

\n
\n
Returns
\n

The result of the given\nsolid.

\n
\n
Return type
\n

Union[CompositeSolidExecutionResult, SolidExecutionResult]

\n
\n
\n
\n\n
\n
\nresult_for_solid(name)\u00b6
\n

Get the result of a top level solid.

\n
\n
Parameters
\n

name (str) \u2013 The name of the top-level solid or aliased solid for which to retrieve the\nresult.

\n
\n
Returns
\n

The result of the solid\nexecution within the pipeline.

\n
\n
Return type
\n

Union[CompositeSolidExecutionResult, SolidExecutionResult]

\n
\n
\n
\n\n
\n
\nproperty solid_result_list\u00b6
\n

The results for each\ntop level solid.

\n
\n
Type
\n

List[Union[CompositeSolidExecutionResult, SolidExecutionResult]]

\n
\n
\n
\n\n
\n
\nproperty step_event_list\u00b6
\n

List[DagsterEvent] The full list of events generated by steps in the execution.

\n

Excludes events generated by the pipeline lifecycle, e.g., PIPELINE_START.

\n
\n\n
\n
\nproperty success\u00b6
\n

Whether all steps in the execution were successful.

\n
\n
Type
\n

bool

\n
\n
\n
\n\n
\n\n
\n
\ndagster.default_executors List[ExecutorDefinition]\u00b6
\n

The default executors available on any ModeDefinition that does not provide custom\nexecutors. These are currently [in_process_executor,\nmultiprocess_executor].

\n
\n\n
\n
\n

Re-executing pipelines\u00b6

\n
\n
\ndagster.reexecute_pipeline(pipeline, parent_run_id, run_config=None, step_selection=None, mode=None, preset=None, tags=None, instance=None, raise_on_error=True)[source]\u00b6
\n

Reexecute an existing pipeline run.

\n

Users will typically call this API when testing pipeline reexecution, or running standalone\nscripts.

\n
\n
Parameters
\n
    \n
  • pipeline (Union[IPipeline, PipelineDefinition]) \u2013 The pipeline to execute.

  • \n
  • parent_run_id (str) \u2013 The id of the previous run to reexecute. The run must exist in the\ninstance.

  • \n
  • run_config (Optional[dict]) \u2013 The configuration that parametrizes this run,\nas a dict.

  • \n
  • step_selection (Optional[List[str]]) \u2013

    A list of step selection queries (including single step keys) to execute. For example:

    \n
      \n
    • ['some_solid']: selects some_solid itself.

    • \n
    • ['*some_solid']: select some_solid and all its ancestors (upstream dependencies).

    • \n
    • ['*some_solid+++']: select some_solid, all its ancestors, and its descendants\n(downstream dependencies) within 3 levels down.

    • \n
    • ['*some_solid', 'other_solid_a', 'other_solid_b+']: select some_solid and all its\nancestors, other_solid_a itself, and other_solid_b and its direct child solids.

    • \n
    \n

  • \n
  • mode (Optional[str]) \u2013 The name of the pipeline mode to use. You may not set both mode\nand preset.

  • \n
  • preset (Optional[str]) \u2013 The name of the pipeline preset to use. You may not set both\nmode and preset.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary key-value pairs that will be added to pipeline\nlogs.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The instance to execute against. If this is None,\nan ephemeral instance will be used, and no artifacts will be persisted from the run.

  • \n
  • raise_on_error (Optional[bool]) \u2013 Whether or not to raise exceptions when they occur. Defaults to True, since this is the most useful behavior in tests.

  • \n
\n
\n
Returns
\n

The result of pipeline execution.

\n
\n
Return type
\n

PipelineExecutionResult

\n
\n
\n

For the asynchronous version, see reexecute_pipeline_iterator().

\n
\n\n
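For illustration, a minimal sketch that re-executes a previous run of the math_pipeline example; the two calls share a DagsterInstance so the parent run can be found:

from dagster import DagsterInstance, execute_pipeline, reexecute_pipeline\n\ninstance = DagsterInstance.ephemeral()\n\nfirst_result = execute_pipeline(math_pipeline, instance=instance)\n\nsecond_result = reexecute_pipeline(\n    math_pipeline,\n    parent_run_id=first_result.run_id,\n    instance=instance,\n    # step_selection=["add_one"] could restrict re-execution to a subset of steps,\n    # which requires intermediates persisted by an appropriate IO manager.\n)\n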
\n
\ndagster.reexecute_pipeline_iterator(pipeline, parent_run_id, run_config=None, step_selection=None, mode=None, preset=None, tags=None, instance=None)[source]\u00b6
\n

Reexecute a pipeline iteratively.

\n

Rather than package up the result of running a pipeline into a single object, like\nreexecute_pipeline(), this function yields the stream of events resulting from pipeline\nreexecution.

\n

This is intended to allow the caller to handle these events on a streaming basis in whatever\nway is appropriate.

\n
\n
Parameters
\n
    \n
  • pipeline (Union[IPipeline, PipelineDefinition]) \u2013 The pipeline to execute.

  • \n
  • parent_run_id (str) \u2013 The id of the previous run to reexecute. The run must exist in the\ninstance.

  • \n
  • run_config (Optional[dict]) \u2013 The configuration that parametrizes this run,\nas a dict.

  • \n
  • step_selection (Optional[List[str]]) \u2013

    A list of step selection queries (including single step keys) to execute. For example:

    \n
      \n
    • ['some_solid']: selects some_solid itself.

    • \n
    • ['*some_solid']: select some_solid and all its ancestors (upstream dependencies).

    • \n
    • ['*some_solid+++']: select some_solid, all its ancestors, and its descendants\n(downstream dependencies) within 3 levels down.

    • \n
    • ['*some_solid', 'other_solid_a', 'other_solid_b+']: select some_solid and all its\nancestors, other_solid_a itself, and other_solid_b and its direct child solids.

    • \n
    \n

  • \n
  • mode (Optional[str]) \u2013 The name of the pipeline mode to use. You may not set both mode\nand preset.

  • \n
  • preset (Optional[str]) \u2013 The name of the pipeline preset to use. You may not set both\nmode and preset.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary key-value pairs that will be added to pipeline\nlogs.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The instance to execute against. If this is None,\nan ephemeral instance will be used, and no artifacts will be persisted from the run.

  • \n
\n
\n
Returns
\n

The stream of events resulting from pipeline reexecution.

\n
\n
Return type
\n

Iterator[DagsterEvent]

\n
\n
\n
\n\n
\n
\n

Reconstructable pipelines\u00b6

\n
\n
\nclass dagster.reconstructable(target)[source]
\n

Create a ReconstructablePipeline from a\nfunction that returns a PipelineDefinition/JobDefinition,\nor a function decorated with @pipeline/@job.

\n

When your pipeline/job must cross process boundaries, e.g., for execution on multiple nodes or\nin different systems (like dagstermill), Dagster must know how to reconstruct the pipeline/job\non the other side of the process boundary.

\n

Passing a job created with GraphDefinition.to_job to reconstructable() requires you to wrap that job\u2019s definition in a module-scoped function and pass that function instead:

\n
from dagster import graph, reconstructable\n\n@graph\ndef my_graph():\n    ...\n\ndef define_my_job():\n    return my_graph.to_job()\n\nreconstructable(define_my_job)\n
\n
\n

This function implements a very conservative strategy for reconstruction, so that its behavior\nis easy to predict, but as a consequence it is not able to reconstruct certain kinds of pipelines\nor jobs, such as those defined by lambdas, in nested scopes (e.g., dynamically within a method\ncall), or in interactive environments such as the Python REPL or Jupyter notebooks.

\n

If you need to reconstruct objects constructed in these ways, you should use\nbuild_reconstructable_job() instead, which allows you to\nspecify your own reconstruction strategy.

\n

Examples:

\n
from dagster import job, reconstructable\n\n@job\ndef foo_job():\n    ...\n\nreconstructable_foo_job = reconstructable(foo_job)\n\n\n@graph\ndef foo():\n    ...\n\ndef make_bar_job():\n    return foo.to_job()\n\nreconstructable_bar_job = reconstructable(make_bar_job)\n
\n
\n
\n\n
\n
\nclass dagster.core.definitions.reconstruct.ReconstructablePipeline(repository, pipeline_name, solid_selection_str=None, solids_to_execute=None, asset_selection=None)[source]\u00b6
\n

Defines a reconstructable pipeline. When your pipeline/job must cross process boundaries,\nDagster must know how to reconstruct the pipeline/job on the other side of the process boundary.

\n
\n
Parameters
\n
    \n
  • repository (ReconstructableRepository) \u2013 The reconstructable representation of the repository\nthe pipeline/job belongs to.

  • \n
  • pipeline_name (str) \u2013 The name of the pipeline/job.

  • \n
  • solid_selection_str (Optional[str]) \u2013 The string value of a comma separated list of user-input\nsolid/op selection. None if no selection is specified, i.e. the entire pipeline/job will\nbe run.

  • \n
  • solids_to_execute (Optional[FrozenSet[str]]) \u2013 A set of solid/op names to execute. None if no selection\nis specified, i.e. the entire pipeline/job will be run.

  • \n
  • asset_selection (Optional[FrozenSet[AssetKey]]) \u2013 A set of asset keys to execute. None if no selection is specified, i.e. the entire job will be run.

  • \n
\n
\n
\n
\n
\nget_module()[source]\u00b6
\n

Return the module the pipeline is found in, if the origin is a module code pointer.

\n
\n\n
\n\n
\n
\n

Pipeline configuration\u00b6

\n
\n

Run Config Schema\u00b6

\n
\n

The run_config used by execute_pipeline() and\nexecute_pipeline_iterator() has the following schema:

\n
{\n  # configuration for execution, required if executors require config\n  execution: {\n    # the name of one, and only one available executor, typically 'in_process' or 'multiprocess'\n    __executor_name__: {\n      # executor-specific config, if required or permitted\n      config: {\n        ...\n      }\n    }\n  },\n\n  # configuration for loggers, required if loggers require config\n  loggers: {\n    # the name of an available logger\n    __logger_name__: {\n      # logger-specific config, if required or permitted\n      config: {\n        ...\n      }\n    },\n    ...\n  },\n\n  # configuration for resources, required if resources require config\n  resources: {\n    # the name of a resource\n    __resource_name__: {\n      # resource-specific config, if required or permitted\n      config: {\n        ...\n      }\n    },\n    ...\n  },\n\n  # configuration for solids, required if solids require config\n  solids: {\n\n    # these keys align with the names of the solids, or their alias in this pipeline\n    __solid_name__: {\n\n      # pass any data that was defined via config_field\n      config: ...,\n\n      # configurably specify input values, keyed by input name\n      inputs: {\n        __input_name__: {\n          # if an dagster_type_loader is specified, that schema must be satisfied here;\n          # scalar, built-in types will generally allow their values to be specified directly:\n          value: ...\n        }\n      },\n\n    }\n  },\n\n}\n
\n
\n
\n
\n
\n
\n

Aliases\u00b6

\n
\n
\ndagster.SolidInvocation\u00b6
\n

alias of dagster.core.definitions.dependency.NodeInvocation

\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/pipeline", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../presets/", "title": "[Legacy] Presets"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../partitions/", "title": "Partitioned Config"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/presets", "[Legacy] Presets", "N", "next"], ["sections/api/apidocs/partitions", "Partitioned Config", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/pipeline.rst.txt", "title": "[Legacy] Pipelines", "toc": "\n"}, "presets": {"alabaster_version": "0.7.12", "body": "
\n

[Legacy] Presets\u00b6

\n
\n
\nclass dagster.PresetDefinition(name, run_config=None, solid_selection=None, mode=None, tags=None)[source]\u00b6
\n

Defines a preset configuration in which a pipeline can execute.

\n

Presets can be used in Dagit to load predefined configurations into the tool.

\n

Presets may also be used from the Python API (in a script, or in test) as follows:

\n
execute_pipeline(pipeline_def, preset='example_preset')\n
\n
\n

Presets may also be used with the command line tools:

\n
$ dagster pipeline execute example_pipeline --preset example_preset\n
\n
\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of this preset. Must be unique in the presets defined on a given\npipeline.

  • \n
  • run_config (Optional[dict]) \u2013 A dict representing the config to set with the preset.\nThis is equivalent to the run_config argument to execute_pipeline().

  • \n
  • solid_selection (Optional[List[str]]) \u2013 A list of solid subselection (including single\nsolid names) to execute with the preset. e.g. ['*some_solid+', 'other_solid']

  • \n
  • mode (Optional[str]) \u2013 The mode to apply when executing this preset. (default: \u2018default\u2019)

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 The tags to apply when executing this preset.

  • \n
\n
\n
\n
\n
\nstatic from_files(name, config_files=None, solid_selection=None, mode=None, tags=None)[source]\u00b6
\n

Static constructor for presets from YAML files.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of this preset. Must be unique in the presets defined on a given\npipeline.

  • \n
  • config_files (Optional[List[str]]) \u2013 List of paths or glob patterns for yaml files\nto load and parse as the run config for this preset.

  • \n
  • solid_selection (Optional[List[str]]) \u2013 A list of solid subselection (including single\nsolid names) to execute with the preset. e.g. ['*some_solid+', 'other_solid']

  • \n
  • mode (Optional[str]) \u2013 The mode to apply when executing this preset. (default:\n\u2018default\u2019)

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 The tags to apply when executing this preset.

  • \n
\n
\n
Returns
\n

A PresetDefinition constructed from the provided YAML files.

\n
\n
Return type
\n

PresetDefinition

\n
\n
Raises
\n

DagsterInvariantViolationError \u2013 When one of the YAML files is invalid and has a parse\n error.

\n
\n
\n
\n\n
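For illustration, a minimal sketch; the YAML paths are hypothetical:

from dagster import PresetDefinition\n\ndeploy_preset = PresetDefinition.from_files(\n    name="deploy",\n    mode="prod",\n    config_files=[\n        "run_config/base.yaml",\n        "run_config/prod.yaml",\n    ],\n)\n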
\n
\nstatic from_pkg_resources(name, pkg_resource_defs=None, solid_selection=None, mode=None, tags=None)[source]\u00b6
\n

Load a preset from a package resource, using pkg_resources.resource_string().

\n

Example:

\n
PresetDefinition.from_pkg_resources(\n    name='local',\n    mode='local',\n    pkg_resource_defs=[\n        ('dagster_examples.airline_demo.environments', 'local_base.yaml'),\n        ('dagster_examples.airline_demo.environments', 'local_warehouse.yaml'),\n    ],\n)\n
\n
\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of this preset. Must be unique in the presets defined on a given\npipeline.

  • \n
  • pkg_resource_defs (Optional[List[(str, str)]]) \u2013 List of pkg_resource modules/files to\nload as the run config for this preset.

  • \n
  • solid_selection (Optional[List[str]]) \u2013 A list of solid subselection (including single\nsolid names) to execute with this partition. e.g.\n['*some_solid+', 'other_solid']

  • \n
  • mode (Optional[str]) \u2013 The mode to apply when executing this preset. (default:\n\u2018default\u2019)

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 The tags to apply when executing this preset.

  • \n
\n
\n
Returns
\n

A PresetDefinition constructed from the provided YAML strings

\n
\n
Return type
\n

PresetDefinition

\n
\n
Raises
\n

DagsterInvariantViolationError \u2013 When one of the YAML documents is invalid and has a\n parse error.

\n
\n
\n
\n\n
\n
\nstatic from_yaml_strings(name, yaml_strings=None, solid_selection=None, mode=None, tags=None)[source]\u00b6
\n

Static constructor for presets from YAML strings.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of this preset. Must be unique in the presets defined on a given\npipeline.

  • \n
  • yaml_strings (Optional[List[str]]) \u2013 List of yaml strings to parse as the environment\nconfig for this preset.

  • \n
  • solid_selection (Optional[List[str]]) \u2013 A list of solid subselection (including single\nsolid names) to execute with the preset. e.g. ['*some_solid+', 'other_solid']

  • \n
  • mode (Optional[str]) \u2013 The mode to apply when executing this preset. (default:\n\u2018default\u2019)

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 The tags to apply when executing this preset.

  • \n
\n
\n
Returns
\n

A PresetDefinition constructed from the provided YAML strings

\n
\n
Return type
\n

PresetDefinition

\n
\n
Raises
\n

DagsterInvariantViolationError \u2013 When one of the YAML documents is invalid and has a\n parse error.

\n
\n
\n
\n\n
\n
\nget_environment_yaml()[source]\u00b6
\n

Get the environment dict set on a preset as YAML.

\n
\n
Returns
\n

The environment dict as YAML.

\n
\n
Return type
\n

str

\n
\n
\n
\n\n
\n
\nwith_additional_config(run_config)[source]\u00b6
\n

Return a new PresetDefinition with additional config merged in to the existing config.

\n
\n\n
\n\n
\n", "current_page_name": "sections/api/apidocs/presets", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "\n", "next": {"link": "../repositories/", "title": "Repositories"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../pipeline/", "title": "[Legacy] Pipelines"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/repositories", "Repositories", "N", "next"], ["sections/api/apidocs/pipeline", "[Legacy] Pipelines", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/presets.rst.txt", "title": "[Legacy] Presets", "toc": "\n"}, "repositories": {"alabaster_version": "0.7.12", "body": "
\n

Repositories\u00b6

\n
\n
\ndagster.repository RepositoryDefinition[source]\u00b6
\n

Create a repository from the decorated function.

\n

The decorated function should take no arguments, and its return value should be one of:

\n

1. List[Union[JobDefinition, PipelineDefinition, PartitionSetDefinition, ScheduleDefinition, SensorDefinition]].\nUse this form when you have no need to lazy load pipelines or other definitions. This is the\ntypical use case.

\n
    \n
  2. A dict of the form:

  2. \n
\n
{\n    'jobs': Dict[str, Callable[[], JobDefinition]],\n    'pipelines': Dict[str, Callable[[], PipelineDefinition]],\n    'partition_sets': Dict[str, Callable[[], PartitionSetDefinition]],\n    'schedules': Dict[str, Callable[[], ScheduleDefinition]]\n    'sensors': Dict[str, Callable[[], SensorDefinition]]\n}\n
\n
\n

This form is intended to allow definitions to be created lazily when accessed by name,\nwhich can be helpful for performance when there are many definitions in a repository, or\nwhen constructing the definitions is costly.

\n

3. A RepositoryData. Return this object if you need fine-grained\ncontrol over the construction and indexing of definitions within the repository, e.g., to\ncreate definitions dynamically from .yaml files in a directory.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name of the repository. Defaults to the name of the decorated\nfunction.

  • \n
  • description (Optional[str]) \u2013 A string description of the repository.

  • \n
\n
\n
\n

Example:

\n
######################################################################\n# A simple repository using the first form of the decorated function\n######################################################################\n\n@op(config_schema={n: Field(Int)})\ndef return_n(context):\n    return context.op_config['n']\n\n@job\ndef simple_job():\n    return_n()\n\n@job\ndef some_job():\n    ...\n\n@sensor(job=some_job)\ndef some_sensor():\n    if foo():\n        yield RunRequest(\n            run_key= ...,\n            run_config={\n                'ops': {'return_n': {'config': {'n': bar()}}}\n            }\n        )\n\n@job\ndef my_job():\n    ...\n\nmy_schedule = ScheduleDefinition(cron_schedule="0 0 * * *", job=my_job)\n\n@repository\ndef simple_repository():\n    return [simple_job, some_sensor, my_schedule]\n\n\n######################################################################\n# A lazy-loaded repository\n######################################################################\n\ndef make_expensive_job():\n    @job\n    def expensive_job():\n        for i in range(10000):\n            return_n.alias(f'return_n_{i}')()\n\n    return expensive_job\n\ndef make_expensive_schedule():\n    @job\n    def other_expensive_job():\n        for i in range(11000):\n            return_n.alias(f'my_return_n_{i}')()\n\n    return ScheduleDefinition(cron_schedule="0 0 * * *", job=other_expensive_job)\n\n@repository\ndef lazy_loaded_repository():\n    return {\n        'jobs': {'expensive_job': make_expensive_job},\n        'schedules': {'expensive_schedule': make_expensive_schedule}\n    }\n\n\n######################################################################\n# A complex repository that lazily constructs jobs from a directory\n# of files in a bespoke YAML format\n######################################################################\n\nclass ComplexRepositoryData(RepositoryData):\n    def __init__(self, yaml_directory):\n        self._yaml_directory = yaml_directory\n\n    def get_all_pipelines(self):\n        return [\n            self._construct_job_def_from_yaml_file(\n              self._yaml_file_for_job_name(file_name)\n            )\n            for file_name in os.listdir(self._yaml_directory)\n        ]\n\n    ...\n\n@repository\ndef complex_repository():\n    return ComplexRepositoryData('some_directory')\n
\n
\n
\n\n
\n
\nclass dagster.RepositoryDefinition(name, repository_data, description=None)[source]\u00b6
\n

Define a repository that contains a group of definitions.

\n

Users should typically not create objects of this class directly. Instead, use the\n@repository() decorator.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of the repository.

  • \n
  • repository_data (RepositoryData) \u2013 Contains the definitions making up the repository.

  • \n
  • description (Optional[str]) \u2013 A string description of the repository.

  • \n
\n
\n
\n
\n
\nget_all_jobs()[source]\u00b6
\n

Return all jobs in the repository as a list.

\n

Note that this will construct any job in the lazily evaluated dictionary that has\nnot yet been constructed.

\n
\n
Returns
\n

All jobs in the repository.

\n
\n
Return type
\n

List[JobDefinition]

\n
\n
\n
\n\n
\n
\nget_job(name)[source]\u00b6
\n

Get a job by name.

\n

If this job is present in the lazily evaluated dictionary passed to the\nconstructor, but has not yet been constructed, only this job is constructed, and\nwill be cached for future calls.

\n
\n
Parameters
\n

name (str) \u2013 Name of the job to retrieve.

\n
\n
Returns
\n

The job definition corresponding to\nthe given name.

\n
\n
Return type
\n

JobDefinition

\n
\n
\n
\n\n
\n
\nhas_job(name)[source]\u00b6
\n

Check if a job with a given name is present in the repository.

\n
\n
Parameters
\n

name (str) \u2013 The name of the job.

\n
\n
Returns
\n

bool

\n
\n
\n
\n\n
\n
\nproperty job_names\u00b6
\n

Names of all jobs in the repository

\n
\n
Type
\n

List[str]

\n
\n
\n
\n\n
\n\n
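For example, a minimal sketch of looking up definitions on a repository created with the @repository decorator (simple_repository and simple_job reuse the hypothetical names from the example above):
\n
repo = simple_repository  # the @repository decorator returns a RepositoryDefinition\n\nassert repo.has_job("simple_job")\nall_jobs = repo.get_all_jobs()  # constructs any lazily defined jobs\njob_def = repo.get_job("simple_job")  # constructed (and cached) on first access\nprint(repo.job_names)\n
\n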
\n
\nclass dagster.RepositoryData[source]\u00b6
\n

Users should usually rely on the @repository decorator to create new\nrepositories, which will in turn call the static constructors on this class. However, users may\nsubclass RepositoryData for fine-grained control over access to and lazy creation\nof repository members.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/repositories", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "\n", "next": {"link": "../resources/", "title": "Resources"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../presets/", "title": "[Legacy] Presets"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/resources", "Resources", "N", "next"], ["sections/api/apidocs/presets", "[Legacy] Presets", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/repositories.rst.txt", "title": "Repositories", "toc": "\n"}, "resources": {"alabaster_version": "0.7.12", "body": "
\n

Resources\u00b6

\n
\n
\n@dagster.resource(config_schema=None, description=None, required_resource_keys=None, version=None)[source]\u00b6
\n

Define a resource.

\n

The decorated function should accept an InitResourceContext and return an instance of\nthe resource. This function will become the resource_fn of an underlying\nResourceDefinition.

\n

If the decorated function yields once rather than returning (in the manner of functions\ndecorable with @contextlib.contextmanager) then\nthe body of the function after the yield will be run after execution resolves, allowing users\nto write their own teardown/cleanup logic.

\n
\n
Parameters
\n
    \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.resource_config. If not set, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the resource.

  • \n
  • version (Optional[str]) \u2013 (Experimental) The version of a resource function. Two wrapped\nresource functions should only have the same version if they produce the same resource\ndefinition when provided with the same inputs.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Keys for the resources required by this resource.

  • \n
\n
\n
\n
\n\n
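For example, a minimal sketch of a resource that opens a connection from config and tears it down after execution using the yield form described above (db_connect is a hypothetical client helper):
\n
from dagster import resource\n\n@resource(config_schema={"connection_url": str})\ndef database_resource(init_context):\n    conn = db_connect(init_context.resource_config["connection_url"])  # hypothetical client helper\n    try:\n        yield conn  # made available on context.resources during execution\n    finally:\n        conn.close()  # runs after execution resolves\n
\n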
\n
\nclass dagster.ResourceDefinition(resource_fn, config_schema=None, description=None, required_resource_keys=None, version=None)[source]\u00b6
\n

Core class for defining resources.

\n

Resources are scoped ways to make external resources (like database connections) available\nduring job execution and to clean up after execution resolves.

\n

If resource_fn yields once rather than returning (in the manner of functions decorable with\n@contextlib.contextmanager) then the body of the\nfunction after the yield will be run after execution resolves, allowing users to write their\nown teardown/cleanup logic.

\n

Depending on your executor, resources may be instantiated and cleaned up more than once in a\njob execution.

\n
\n
Parameters
\n
    \n
  • resource_fn (Callable[[InitResourceContext], Any]) \u2013 User-provided function to instantiate\nthe resource, which will be made available during execution under its resource key on the\ncontext.resources object.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. If set, Dagster will check\nthat config provided for the resource matches this schema and fail if it does not. If\nnot set, Dagster will accept any config provided for the resource.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the resource.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Keys for the resources required by this\nresource. A DagsterInvariantViolationError will be raised during initialization if\ndependencies are cyclic.

  • \n
  • version (Optional[str]) \u2013 (Experimental) The version of the resource\u2019s definition fn. Two\nwrapped resource functions should only have the same version if they produce the same\nresource definition when provided with the same inputs.

  • \n
\n
\n
\n
\n
\nconfigured(config_or_config_fn, config_schema=None, description=None)\u00b6
\n

Wraps this object in an object of the same type that provides configuration to the inner\nobject.

\n
\n
Parameters
\n
    \n
  • config_or_config_fn (Union[Any, Callable[[Any], Any]]) \u2013 Either (1) Run configuration\nthat fully satisfies this object\u2019s config schema or (2) A function that accepts run\nconfiguration and returns run configuration that fully satisfies this object\u2019s\nconfig schema. In the latter case, config_schema must be specified. When\npassing a function, it\u2019s easiest to use configured().

  • \n
  • config_schema (ConfigSchema) \u2013 If config_or_config_fn is a function, the config schema\nthat its input must satisfy.

  • \n
  • description (Optional[str]) \u2013 Description of the new definition. If not specified,\ninherits the description of the definition being configured.

  • \n
\n
\n
\n

Returns (ConfigurableDefinition): A configured version of this object.

\n
\n\n
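For example, a minimal sketch of using configured to bind config to a resource ahead of time (the bucket resource shown is hypothetical):
\n
from dagster import resource\n\n@resource(config_schema={"bucket": str})\ndef bucket_resource(init_context):\n    return init_context.resource_config["bucket"]\n\n# A copy of the resource with its config already supplied.\nconfigured_bucket_resource = bucket_resource.configured({"bucket": "my-bucket"})\n
\n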
\n
\nstatic hardcoded_resource(value, description=None)[source]\u00b6
\n

A helper function that creates a ResourceDefinition with a hardcoded object.

\n
\n
Parameters
\n
    \n
  • value (Any) \u2013 The value that will be accessible via context.resources.resource_name.

  • \n
  • description ([Optional[str]]) \u2013 The description of the resource. Defaults to None.

  • \n
\n
\n
Returns
\n

A hardcoded resource.

\n
\n
Return type
\n

[ResourceDefinition]

\n
\n
\n
\n\n
\n
\nstatic mock_resource(description=None)[source]\u00b6
\n

A helper function that creates a ResourceDefinition which wraps a mock.MagicMock.

\n
\n
Parameters
\n

description ([Optional[str]]) \u2013 The description of the resource. Defaults to None.

\n
\n
Returns
\n

\n
A resource that creates the magic methods automatically and helps you mock existing resources.

\n
\n
\n

\n
\n
Return type
\n

[ResourceDefinition]

\n
\n
\n
\n\n
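For example, a minimal sketch of using hardcoded_resource to stand in for a real resource in a test (my_op and the db resource key are hypothetical):
\n
from dagster import ResourceDefinition, job\n\nfake_db = ResourceDefinition.hardcoded_resource({"host": "localhost"}, description="test stand-in")\n\n@job(resource_defs={"db": fake_db})\ndef test_job():\n    my_op()  # a hypothetical op that requires the "db" resource\n
\n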
\n
\nstatic none_resource(description=None)[source]\u00b6
\n

A helper function that returns a none resource.

\n
\n
Parameters
\n

description ([Optional[str]]) \u2013 The description of the resource. Defaults to None.

\n
\n
Returns
\n

A resource that does nothing.

\n
\n
Return type
\n

[ResourceDefinition]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.InitResourceContext(resource_config, resources, resource_def=None, instance=None, dagster_run=None, pipeline_run=None, log_manager=None, pipeline_def_for_backwards_compat=None)[source]\u00b6
\n

Resource-specific initialization context.

\n
\n
\nresource_config\u00b6
\n

The configuration data provided by the run config. The schema\nfor this data is defined by the config_schema argument to\nResourceDefinition.

\n
\n
Type
\n

Any

\n
\n
\n
\n\n
\n
\nresource_def\u00b6
\n

The definition of the resource currently being\nconstructed.

\n
\n
Type
\n

ResourceDefinition

\n
\n
\n
\n\n
\n
\nlog_manager\u00b6
\n

The log manager for this run of the job or pipeline

\n
\n
Type
\n

DagsterLogManager

\n
\n
\n
\n\n
\n
\nresources\u00b6
\n

The resources that are available to the resource that we are\ninitializing.

\n
\n
Type
\n

ScopedResources

\n
\n
\n
\n\n
\n
\ndagster_run\u00b6
\n

The dagster run to use. When initializing resources\noutside of execution context, this will be None.

\n
\n
Type
\n

Optional[PipelineRun]

\n
\n
\n
\n\n
\n
\nrun_id\u00b6
\n

The id for this run of the job or pipeline. When initializing resources\noutside of execution context, this will be None.

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\npipeline_run\u00b6
\n

(legacy) The dagster run to use. When initializing resources\noutside of execution context, this will be None.

\n
\n
Type
\n

Optional[PipelineRun]

\n
\n
\n
\n\n
\n\n
\n
\ndagster.make_values_resource(**kwargs)[source]\u00b6
\n

A helper function that creates a ResourceDefinition to take in user-defined values.

\n
\n

This is useful for sharing values between ops.

\n
\n
\n
Parameters
\n

**kwargs \u2013 Arbitrary keyword arguments that will be passed to the config schema of the\nreturned resource definition. If not set, Dagster will accept any config provided for\nthe resource.

\n
\n
\n

For example:

\n
@op(required_resource_keys={"globals"})\ndef my_op(context):\n    print(context.resources.globals["my_str_var"])\n\n@job(resource_defs={"globals": make_values_resource(my_str_var=str, my_int_var=int)})\ndef my_job():\n    my_op()\n
\n
\n
\n
Returns
\n

A resource that passes in user-defined values.

\n
\n
Return type
\n

ResourceDefinition

\n
\n
\n
\n\n
\n
\ndagster.build_init_resource_context(config=None, resources=None, instance=None)[source]\u00b6
\n

Builds resource initialization context from provided parameters.

\n

build_init_resource_context can be used as either a function or a context manager. If any resource\nprovided to build_init_resource_context is itself a context manager, then build_init_resource_context\nmust be used as a context manager. This function can be used to provide the context argument to the\ninvocation of a resource.

\n
\n
Parameters
\n
    \n
  • resources (Optional[Dict[str, Any]]) \u2013 The resources to provide to the context. These can be\neither values or resource definitions.

  • \n
  • config (Optional[Any]) \u2013 The resource config to provide to the context.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The dagster instance configured for the context.\nDefaults to DagsterInstance.ephemeral().

  • \n
\n
\n
\n

Examples

\n
context = build_init_resource_context()\nresource_to_init(context)\n\nwith build_init_resource_context(\n    resources={"foo": context_manager_resource}\n) as context:\n    resource_to_init(context)\n
\n
\n
\n\n
\n
\ndagster.build_resources(resources, instance=None, resource_config=None, pipeline_run=None, log_manager=None)[source]\u00b6
\n

Context manager that yields resources using provided resource definitions and run config.

\n

This API allows for using resources in an independent context. Resources will be initialized\nwith the provided resource config and, optionally, pipeline_run. The resulting resources will be\nyielded as a dictionary keyed identically to the provided resources dictionary. Upon exiting the\ncontext, resources will be torn down safely.

\n
\n
Parameters
\n
    \n
  • resources (Dict[str, Any]) \u2013 Resource instances or definitions to build. All\nrequired resource dependencies to a given resource must be contained within this\ndictionary, or the resource build will fail.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The dagster instance configured to instantiate\nresources on.

  • \n
  • resource_config (Optional[Dict[str, Any]]) \u2013 A dict representing the config to be\nprovided to each resource during initialization and teardown.

  • \n
  • pipeline_run (Optional[PipelineRun]) \u2013 The pipeline run to provide during resource\ninitialization and teardown. If the provided resources require either the pipeline_run\nor run_id attributes of the provided context during resource initialization and/or\nteardown, this must be provided, or initialization will fail.

  • \n
  • log_manager (Optional[DagsterLogManager]) \u2013 Log Manager to use during resource\ninitialization. Defaults to system log manager.

  • \n
\n
\n
\n

Examples:

\n
from dagster import resource, build_resources\n\n@resource\ndef the_resource():\n    return "foo"\n\nwith build_resources(resources={"from_def": the_resource, "from_val": "bar"}) as resources:\n    assert resources.from_def == "foo"\n    assert resources.from_val == "bar"\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/resources", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "\n", "next": {"link": "../schedules-sensors/", "title": "Run Requests"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../repositories/", "title": "Repositories"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/schedules-sensors", "Run Requests", "N", "next"], ["sections/api/apidocs/repositories", "Repositories", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/resources.rst.txt", "title": "Resources", "toc": "\n"}, "schedules-sensors": {"alabaster_version": "0.7.12", "body": "
\n

Run Requests\u00b6

\n
\n
\nclass dagster.RunRequest(run_key, run_config=None, tags=None, job_name=None)[source]\u00b6
\n

Represents all the information required to launch a single run. Must be returned by a\nSensorDefinition or ScheduleDefinition\u2019s evaluation function for a run to be launched.

\n
\n
\nrun_key\u00b6
\n

A string key to identify this launched run. For sensors, ensures that\nonly one run is created per run key across all sensor evaluations. For schedules,\nensures that one run is created per tick, across failure recoveries. Passing in a None\nvalue means that a run will always be launched per evaluation.

\n
\n
Type
\n

str | None

\n
\n
\n
\n\n
\n
\nrun_config\u00b6
\n

The config that parameterizes the run execution to\nbe launched, as a dict.

\n
\n
Type
\n

Optional[Dict]

\n
\n
\n
\n\n
\n
\ntags\u00b6
\n

A dictionary of tags (string key-value pairs) to attach\nto the launched run.

\n
\n
Type
\n

Optional[Dict[str, str]]

\n
\n
\n
\n\n
\n
\njob_name\u00b6
\n

(Experimental) The name of the job this run request will launch.\nRequired for sensors that target multiple jobs.

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.SkipReason(skip_message=None)[source]\u00b6
\n

Represents a skipped evaluation, where no runs are requested. May contain a message to indicate\nwhy no runs were requested.

\n
\n
\nskip_message\u00b6
\n

A message displayed in dagit for why this evaluation resulted\nin no requested runs.

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n\n
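For example, a minimal sketch of constructing a RunRequest directly (the op name and config keys are hypothetical):
\n
from dagster import RunRequest\n\nrequest = RunRequest(\n    run_key="2022-03-12",  # de-duplicates runs across evaluations\n    run_config={"ops": {"process_date": {"config": {"date": "2022-03-12"}}}},\n    tags={"source": "nightly_sensor"},\n    job_name="process_date_job",  # only needed for sensors that target multiple jobs\n)\n
\n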
\n
\n

Schedules\u00b6

\n
\n
\n@dagster.schedule(cron_schedule, pipeline_name=None, name=None, tags=None, tags_fn=None, solid_selection=None, mode='default', should_execute=None, environment_vars=None, execution_timezone=None, description=None, job=None, default_status=<DefaultScheduleStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Creates a schedule following the provided cron schedule and requests runs for the provided job.

\n

The decorated function takes in a ScheduleEvaluationContext as its only\nargument, and does one of the following:

\n
    \n
  1. Return a RunRequest object.

  2. Return a list of RunRequest objects.

  3. Return a SkipReason object, providing a descriptive message of why no runs were requested.

  4. Return nothing (skipping without providing a reason).

  5. Return a run config dictionary.

  6. Yield a SkipReason or yield one or more RunRequest objects.
\n

Returns a ScheduleDefinition.

\n
\n
Parameters
\n
    \n
  • cron_schedule (str) \u2013 A valid cron string specifying when the schedule will run, e.g.,\n'45 23 * * 6' for a schedule that runs at 11:45 PM every Saturday.

  • \n
  • pipeline_name (Optional[str]) \u2013 (legacy) The name of the pipeline to execute when the\nschedule runs.

  • \n
  • name (Optional[str]) \u2013 The name of the schedule to create.

  • \n
  • tags (Optional[Dict[str, str]]) \u2013 A dictionary of tags (string key-value pairs) to attach\nto the scheduled runs.

  • \n
  • tags_fn (Optional[Callable[[ScheduleEvaluationContext], Optional[Dict[str, str]]]]) \u2013 A function\nthat generates tags to attach to the schedule's runs. Takes a\nScheduleEvaluationContext and returns a dictionary of tags (string\nkey-value pairs). You may set only one of tags and tags_fn.

  • \n
  • solid_selection (Optional[List[str]]) \u2013 A list of solid subselection (including single\nsolid names) to execute when the schedule runs. e.g. ['*some_solid+', 'other_solid']

  • \n
  • mode (Optional[str]) \u2013 The pipeline mode in which to execute this schedule.\n(Default: \u2018default\u2019)

  • \n
  • should_execute (Optional[Callable[[ScheduleEvaluationContext], bool]]) \u2013 A function that runs at\nschedule execution time to determine whether a schedule should execute or skip. Takes a\nScheduleEvaluationContext and returns a boolean (True if the\nschedule should execute). Defaults to a function that always returns True.

  • \n
  • environment_vars (Optional[Dict[str, str]]) \u2013 Any environment variables to set when executing\nthe schedule.

  • \n
  • execution_timezone (Optional[str]) \u2013 Timezone in which the schedule should run.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the schedule.

  • \n
  • job (Optional[Union[GraphDefinition, JobDefinition]]) \u2013 The job that should execute when this\nschedule runs.

  • \n
  • default_status (DefaultScheduleStatus) \u2013 Whether the schedule starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
\n
\n
\n
\n\n
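For example, a minimal sketch of a schedule that requests one run of a job per day at midnight (my_job and the op config shown are hypothetical):
\n
from dagster import RunRequest, schedule\n\n@schedule(cron_schedule="0 0 * * *", job=my_job, execution_timezone="America/Los_Angeles")\ndef my_daily_schedule(context):\n    date_str = context.scheduled_execution_time.strftime("%Y-%m-%d")\n    return RunRequest(\n        run_key=date_str,  # one run per tick\n        run_config={"ops": {"process_date": {"config": {"date": date_str}}}},\n    )\n
\n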
\n
\nclass dagster.ScheduleDefinition(name=None, cron_schedule=None, pipeline_name=None, run_config=None, run_config_fn=None, tags=None, tags_fn=None, solid_selection=None, mode='default', should_execute=None, environment_vars=None, execution_timezone=None, execution_fn=None, description=None, job=None, default_status=<DefaultScheduleStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Define a schedule that targets a job

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name of the schedule to create. Defaults to the job name plus\n\u201c_schedule\u201d.

  • \n
  • cron_schedule (str) \u2013 A valid cron string specifying when the schedule will run, e.g.,\n\u201845 23 * * 6\u2019 for a schedule that runs at 11:45 PM every Saturday.

  • \n
  • pipeline_name (Optional[str]) \u2013 (legacy) The name of the pipeline to execute when the schedule runs.

  • \n
  • execution_fn (Callable[ScheduleEvaluationContext]) \u2013

    The core evaluation function for the\nschedule, which is run at an interval to determine whether a run should be launched or\nnot. Takes a ScheduleEvaluationContext.

    \n

    This function must return a generator, which must yield either a single SkipReason\nor one or more RunRequest objects.

    \n

  • \n
  • run_config (Optional[Dict]) \u2013 The config that parameterizes this execution,\nas a dict.

  • \n
  • run_config_fn (Optional[Callable[[ScheduleEvaluationContext], [Dict]]]) \u2013 A function that\ntakes a ScheduleEvaluationContext object and returns the run configuration that\nparameterizes this execution, as a dict. You may set only one of run_config,\nrun_config_fn, and execution_fn.

  • \n
  • tags (Optional[Dict[str, str]]) \u2013 A dictionary of tags (string key-value pairs) to attach\nto the scheduled runs.

  • \n
  • tags_fn (Optional[Callable[[ScheduleEvaluationContext], Optional[Dict[str, str]]]]) \u2013 A\nfunction that generates tags to attach to the schedule's runs. Takes a\nScheduleEvaluationContext and returns a dictionary of tags (string\nkey-value pairs). You may set only one of tags, tags_fn, and execution_fn.

  • \n
  • solid_selection (Optional[List[str]]) \u2013 A list of solid subselection (including single\nsolid names) to execute when the schedule runs. e.g. ['*some_solid+', 'other_solid']

  • \n
  • mode (Optional[str]) \u2013 (legacy) The mode to apply when executing this schedule. (default: \u2018default\u2019)

  • \n
  • should_execute (Optional[Callable[[ScheduleEvaluationContext], bool]]) \u2013 A function that runs\nat schedule execution time to determine whether a schedule should execute or skip. Takes\na ScheduleEvaluationContext and returns a boolean (True if the\nschedule should execute). Defaults to a function that always returns True.

  • \n
  • environment_vars (Optional[dict[str, str]]) \u2013 The environment variables to set for the\nschedule

  • \n
  • execution_timezone (Optional[str]) \u2013 Timezone in which the schedule should run.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the schedule.

  • \n
  • job (Optional[Union[GraphDefinition, JobDefinition]]) \u2013 The job that should execute when this\nschedule runs.

  • \n
  • default_status (DefaultScheduleStatus) \u2013 Whether the schedule starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
\n
\n
\n
\n\n
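For example, a minimal sketch of constructing a ScheduleDefinition directly rather than via the @schedule decorator (my_job and the run config shown are hypothetical):
\n
from dagster import ScheduleDefinition\n\nnightly_schedule = ScheduleDefinition(\n    name="nightly_schedule",\n    cron_schedule="0 2 * * *",\n    job=my_job,\n    run_config={"ops": {"process_date": {"config": {"full_refresh": True}}}},\n    execution_timezone="UTC",\n)\n
\n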
\n
\nclass dagster.ScheduleEvaluationContext(instance_ref, scheduled_execution_time)[source]\u00b6
\n

Schedule-specific execution context.

\n

An instance of this class is made available as the first argument to various ScheduleDefinition\nfunctions. It is passed as the first argument to run_config_fn, tags_fn,\nand should_execute.

\n
\n
\ninstance_ref\u00b6
\n

The serialized instance configured to run the schedule

\n
\n
Type
\n

Optional[InstanceRef]

\n
\n
\n
\n\n
\n
\nscheduled_execution_time\u00b6
\n

The time at which the execution was scheduled to happen. May differ slightly\nfrom both the actual execution time and the time at which the run config is computed.\nNot available in all schedulers - currently only set in deployments using\nDagsterDaemonScheduler.

\n
\n
Type
\n

datetime

\n
\n
\n
\n\n
\n\n
\n
\ndagster.build_schedule_context(instance=None, scheduled_execution_time=None)[source]\u00b6
\n

Builds schedule execution context using the provided parameters.

\n

The instance provided to build_schedule_context must be persistent;\nDagsterInstance.ephemeral() will result in an error.

\n
\n
Parameters
\n
    \n
  • instance (Optional[DagsterInstance]) \u2013 The dagster instance configured to run the schedule.

  • \n
  • scheduled_execution_time (datetime) \u2013 The time at which the execution was scheduled to\nhappen. May differ slightly from both the actual execution time and the time at which\nthe run config is computed.

  • \n
\n
\n
\n

Examples

\n
context = build_schedule_context(instance)\ndaily_schedule.evaluate_tick(context)\n
\n
\n
\n\n
\n
\ndagster.core.scheduler.DagsterDaemonScheduler Scheduler[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
max_catchup_runs (dagster.IntSource, optional)
\n

For partitioned schedules, controls the maximum number of past\npartitions for each schedule that will be considered when looking for missing\nruns. Generally this parameter will only come into play if the scheduler\nfalls behind or launches after experiencing downtime. This parameter will not be checked for\nschedules without partition sets (for example, schedules created using the @schedule\ndecorator) - only the most recent execution time will be considered for those schedules.

\n

Note that no matter what this value is, the scheduler will never launch a run from a time\nbefore the schedule was turned on (even if the start_date on the schedule is earlier) - if\nyou want to launch runs for earlier partitions, launch a backfill.

\n

Default Value: 5

\n
\n
max_tick_retries (dagster.IntSource, optional)
\n

For each schedule tick that raises an error, how many times to retry that tick

\n

Default Value: 0

\n
\n
\n

Default scheduler implementation that submits runs from the dagster-daemon\nlong-lived process. Periodically checks each running schedule for execution times that don\u2019t\nhave runs yet and launches them.

\n
\n\n
\n
\n

Partitioned Schedules\u00b6

\n
\n
\ndagster.build_schedule_from_partitioned_job(job, description=None, name=None, minute_of_hour=None, hour_of_day=None, day_of_week=None, day_of_month=None, default_status=<DefaultScheduleStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Creates a schedule from a time window-partitioned job.

\n

The schedule executes at the cadence specified by the partitioning of the given job.

\n
\n\n
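For example, a minimal sketch of deriving a schedule from a daily-partitioned job (my_op is a hypothetical op):
\n
from dagster import build_schedule_from_partitioned_job, daily_partitioned_config, job\n\n@daily_partitioned_config(start_date="2022-03-12")\ndef my_partitioned_config(start, _end):\n    return {"ops": {"my_op": {"config": {"date": start.strftime("%Y-%m-%d")}}}}\n\n@job(config=my_partitioned_config)\ndef my_partitioned_job():\n    my_op()\n\n# Runs once per partition, at the cadence implied by the partitioning (daily here).\nmy_daily_schedule = build_schedule_from_partitioned_job(my_partitioned_job)\n
\n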
\n
\nclass dagster.PartitionScheduleDefinition(name, cron_schedule, pipeline_name, tags_fn, solid_selection, mode, should_execute, environment_vars, partition_set, run_config_fn=None, execution_timezone=None, execution_fn=None, description=None, decorated_fn=None, job=None, default_status=<DefaultScheduleStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n
\n\n
\n
\n@dagster.hourly_partitioned_config(start_date, minute_offset=0, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]
\n

Defines run config over a set of hourly partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the date\npartition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at the start_date at midnight. The last partition in\nthe set will end before the current time, unless the end_offset argument is set to a positive\nnumber. If minute_offset is provided, the start and end times of each partition will be\nminute_offset past the hour.

\n
\n
Parameters
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions. Can\nprovide in either a datetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
\n
\n
\n
@hourly_partitioned_config(start_date=datetime(2022, 3, 12))\n# creates partitions (2022-03-12-00:00, 2022-03-12-01:00), (2022-03-12-01:00, 2022-03-12-02:00), ...\n\n@hourly_partitioned_config(start_date=datetime(2022, 3, 12), minute_offset=15)\n# creates partitions (2022-03-12-00:15, 2022-03-12-01:15), (2022-03-12-01:15, 2022-03-12-02:15), ...\n
\n
\n
\n\n
\n
\n@dagster.daily_partitioned_config(start_date, minute_offset=0, hour_offset=0, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]
\n

Defines run config over a set of daily partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the bounds\nof the date partition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at the start_date at midnight. The last partition in\nthe set will end before the current time, unless the end_offset argument is set to a positive\nnumber. If minute_offset and/or hour_offset are used, the start and end times of each partition\nwill be hour_offset:minute_offset of each day.

\n
\n
Parameters
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions. Can\nprovide in either a datetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
\n
\n
\n
@daily_partitioned_config(start_date="2022-03-12")\n# creates partitions (2022-03-12-00:00, 2022-03-13-00:00), (2022-03-13-00:00, 2022-03-14-00:00), ...\n\n@daily_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=16)\n# creates partitions (2022-03-12-16:15, 2022-03-13-16:15), (2022-03-13-16:15, 2022-03-14-16:15), ...\n
\n
\n
\n\n
\n
\n@dagster.weekly_partitioned_config(start_date, minute_offset=0, hour_offset=0, day_offset=0, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]
\n

Defines run config over a set of weekly partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the date\npartition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at the start_date. The last partition in the set will\nend before the current time, unless the end_offset argument is set to a positive number. If\nday_offset is provided, the start and end date of each partition will be day of the week\ncorresponding to day_offset (0 indexed with Sunday as the start of the week). If\nminute_offset and/or hour_offset are used, the start and end times of each partition will be\nhour_offset:minute_offset of each day.

\n
\n
Parameters
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions will be the\nSunday at midnight following start_date. Can provide in either a datetime or string\nformat.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • day_offset (int) \u2013 Day of the week to \u201csplit\u201d the partition. Defaults to 0 (Sunday).

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
\n
\n
\n
@weekly_partitioned_config(start_date="2022-03-12")\n# creates partitions (2022-03-13-00:00, 2022-03-20-00:00), (2022-03-20-00:00, 2022-03-27-00:00), ...\n\n@weekly_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=6)\n# creates partitions (2022-03-12-03:15, 2022-03-19-03:15), (2022-03-19-03:15, 2022-03-26-03:15), ...\n
\n
\n
\n\n
\n
\n@dagster.monthly_partitioned_config(start_date, minute_offset=0, hour_offset=0, day_offset=1, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]
\n

Defines run config over a set of monthly partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the date\npartition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at midnight on the soonest first of the month after\nstart_date. The last partition in the set will end before the current time, unless the\nend_offset argument is set to a positive number. If day_offset is provided, the start and end\ndate of each partition will be day_offset. If minute_offset and/or hour_offset are used, the\nstart and end times of each partition will be hour_offset:minute_offset of each day.

\n
\n
Parameters
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions will be\nmidnight on the soonest first of the month following start_date. Can provide in either a\ndatetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • day_offset (int) \u2013 Day of the month to \u201csplit\u201d the partition. Defaults to 1.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
\n
\n
\n
@monthly_partitioned_config(start_date="2022-03-12")\n# creates partitions (2022-04-01-00:00, 2022-05-01-00:00), (2022-05-01-00:00, 2022-06-01-00:00), ...\n\n@monthly_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=5)\n# creates partitions (2022-04-05-03:15, 2022-05-05-03:15), (2022-05-05-03:15, 2022-06-05-03:15), ...\n
\n
\n
\n\n
\n
\n

Sensors\u00b6

\n
\n
\n@dagster.sensor(pipeline_name=None, name=None, solid_selection=None, mode=None, minimum_interval_seconds=None, description=None, job=None, jobs=None, default_status=<DefaultSensorStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Creates a sensor where the decorated function is used as the sensor\u2019s evaluation function. The\ndecorated function may:

\n
    \n
  1. Return a RunRequest object.

  2. Return a list of RunRequest objects.

  3. Return a SkipReason object, providing a descriptive message of why no runs were requested.

  4. Return nothing (skipping without providing a reason).

  5. Yield a SkipReason or yield one or more RunRequest objects.
\n

Takes a SensorEvaluationContext.

\n
\n
Parameters
\n
    \n
  • pipeline_name (Optional[str]) \u2013 (legacy) Name of the target pipeline. Cannot be used in\nconjunction with job or jobs parameters.

  • \n
  • name (Optional[str]) \u2013 The name of the sensor. Defaults to the name of the decorated\nfunction.

  • \n
  • solid_selection (Optional[List[str]]) \u2013 (legacy) A list of solid subselection (including single\nsolid names) to execute for runs for this sensor e.g.\n['*some_solid+', 'other_solid'].\nCannot be used in conjunction with job or jobs parameters.

  • \n
  • mode (Optional[str]) \u2013 (legacy) The mode to apply when executing runs for this sensor. Cannot be used\nin conjunction with job or jobs parameters.\n(default: \u2018default\u2019)

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • job (Optional[Union[GraphDefinition, JobDefinition]]) \u2013 The job to be executed when the sensor fires.

  • \n
  • jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition]]]) \u2013 (experimental) A list of jobs to be executed when the sensor fires.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
\n
\n
\n
\n\n
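For example, a minimal sketch of a sensor that requests a run for each new file in a directory, relying on run_key de-duplication (list_new_files and process_file_job are hypothetical):
\n
from dagster import RunRequest, SkipReason, sensor\n\n@sensor(job=process_file_job, minimum_interval_seconds=60)\ndef file_sensor(context):\n    filenames = list_new_files("/data/incoming")  # hypothetical helper\n    if not filenames:\n        yield SkipReason("No new files in /data/incoming.")\n        return\n    for filename in filenames:\n        yield RunRequest(\n            run_key=filename,  # at most one run per key across evaluations\n            run_config={"ops": {"process_file": {"config": {"path": filename}}}},\n        )\n
\n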
\n
\nclass dagster.SensorDefinition(name=None, evaluation_fn=None, pipeline_name=None, solid_selection=None, mode=None, minimum_interval_seconds=None, description=None, job=None, jobs=None, default_status=<DefaultSensorStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Define a sensor that initiates a set of runs based on some external state

\n
\n
Parameters
\n
    \n
  • evaluation_fn (Callable[[SensorEvaluationContext]]) \u2013

    The core evaluation function for the\nsensor, which is run at an interval to determine whether a run should be launched or\nnot. Takes a SensorEvaluationContext.

    \n

    This function must return a generator, which must yield either a single SkipReason\nor one or more RunRequest objects.

    \n

  • \n
  • name (Optional[str]) \u2013 The name of the sensor to create. Defaults to the name of evaluation_fn.

  • \n
  • pipeline_name (Optional[str]) \u2013 (legacy) The name of the pipeline to execute when the sensor\nfires. Cannot be used in conjunction with job or jobs parameters.

  • \n
  • solid_selection (Optional[List[str]]) \u2013 (legacy) A list of solid subselection (including single\nsolid names) to execute when the sensor runs. e.g. ['*some_solid+', 'other_solid'].\nCannot be used in conjunction with job or jobs parameters.

  • \n
  • mode (Optional[str]) \u2013 (legacy) The mode to apply when executing runs triggered by this\nsensor. Cannot be used in conjunction with job or jobs parameters. (default:\n\u2018default\u2019)

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • job (Optional[GraphDefinition, JobDefinition]) \u2013 The job to execute when this sensor fires.

  • \n
  • jobs (Optional[Sequence[GraphDefinition, JobDefinition]]) \u2013 (experimental) A list of jobs to execute when this sensor fires.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.SensorEvaluationContext(instance_ref, last_completion_time, last_run_key, cursor, repository_name, instance=None)[source]\u00b6
\n

Sensor execution context.

\n

An instance of this class is made available as the first argument to the evaluation function\non SensorDefinition.

\n
\n
\ninstance_ref\u00b6
\n

The serialized instance configured to run the sensor

\n
\n
Type
\n

Optional[InstanceRef]

\n
\n
\n
\n\n
\n
\ncursor\u00b6
\n

The cursor, passed back from the last sensor evaluation via\nthe cursor attribute of SkipReason and RunRequest

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nlast_completion_time\u00b6
\n

DEPRECATED The last time that the sensor was evaluated (UTC).

\n
\n
Type
\n

float

\n
\n
\n
\n\n
\n
\nlast_run_key\u00b6
\n

DEPRECATED The run key of the RunRequest most recently created by this\nsensor. Use the preferred cursor attribute instead.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nrepository_name\u00b6
\n

The name of the repository that the sensor belongs to.

\n
\n
Type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\ninstance\u00b6
\n

The deserialized instance can also be passed in\ndirectly (primarily useful in testing contexts).

\n
\n
Type
\n

Optional[DagsterInstance]

\n
\n
\n
\n\n
\n\n
\n
\ndagster.build_sensor_context(instance=None, cursor=None, repository_name=None)[source]\u00b6
\n

Builds sensor execution context using the provided parameters.

\n

This function can be used to provide a context to the invocation of a sensor definition. If\nprovided, the dagster instance must be persistent; DagsterInstance.ephemeral() will result in an\nerror.

\n
\n
Parameters
\n
    \n
  • instance (Optional[DagsterInstance]) \u2013 The dagster instance configured to run the sensor.

  • \n
  • cursor (Optional[str]) \u2013 A cursor value to provide to the evaluation of the sensor.

  • \n
  • repository_name (Optional[str]) \u2013 The name of the repository that the sensor belongs to.

  • \n
\n
\n
\n

Examples

\n
context = build_sensor_context()\nmy_sensor(context)\n
\n
\n
\n\n
\n
\nclass dagster.AssetSensorDefinition(name, asset_key, pipeline_name, asset_materialization_fn, solid_selection=None, mode=None, minimum_interval_seconds=None, description=None, job=None, jobs=None, default_status=<DefaultSensorStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Define an asset sensor that initiates a set of runs based on the materialization of a given\nasset.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of the sensor to create.

  • \n
  • asset_key (AssetKey) \u2013 The asset_key this sensor monitors.

  • \n
  • pipeline_name (Optional[str]) \u2013 (legacy) The name of the pipeline to execute when the sensor\nfires. Cannot be used in conjunction with job or jobs parameters.

  • \n
  • asset_materialization_fn (Callable[[SensorEvaluationContext, EventLogEntry], Union[Iterator[Union[RunRequest, SkipReason]], RunRequest, SkipReason]]) \u2013

    The core\nevaluation function for the sensor, which is run at an interval to determine whether a\nrun should be launched or not. Takes a SensorEvaluationContext and\nan EventLogEntry corresponding to an AssetMaterialization event.

    \n

    This function must return a generator, which must yield either a single SkipReason\nor one or more RunRequest objects.

    \n

  • \n
  • solid_selection (Optional[List[str]]) \u2013 (legacy) A list of solid subselection (including single\nsolid names) to execute when the sensor runs. e.g. ['*some_solid+', 'other_solid'].\nCannot be used in conjunction with job or jobs parameters.

  • \n
  • mode (Optional[str]) \u2013 (legacy) The mode to apply when executing runs triggered by this sensor.\n(default: \u2018default\u2019).\nCannot be used in conjunction with job or jobs parameters.

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • job (Optional[Union[GraphDefinition, JobDefinition]]) \u2013 The job object to target with this sensor.

  • \n
  • jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition]]]) \u2013 (experimental) A list of jobs to be executed when the sensor fires.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
\n
\n
\n
\n\n
\n
\n@dagster.asset_sensor(asset_key, pipeline_name=None, name=None, solid_selection=None, mode=None, minimum_interval_seconds=None, description=None, job=None, jobs=None, default_status=<DefaultSensorStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Creates an asset sensor where the decorated function is used as the asset sensor\u2019s evaluation\nfunction. The decorated function may:

\n
    \n
  1. Return a RunRequest object.

  2. Return a list of RunRequest objects.

  3. Return a SkipReason object, providing a descriptive message of why no runs were requested.

  4. Return nothing (skipping without providing a reason).

  5. Yield a SkipReason or yield one or more RunRequest objects.
\n

Takes a SensorEvaluationContext and an EventLogEntry corresponding to an\nAssetMaterialization event.

\n
\n
Parameters
\n
    \n
  • asset_key (AssetKey) \u2013 The asset_key this sensor monitors.

  • \n
  • pipeline_name (Optional[str]) \u2013 (legacy) Name of the target pipeline. Cannot be used in conjunction with job or jobs parameters.

  • \n
  • name (Optional[str]) \u2013 The name of the sensor. Defaults to the name of the decorated\nfunction.

  • \n
  • solid_selection (Optional[List[str]]) \u2013 (legacy) A list of solid subselection (including single\nsolid names) to execute for runs for this sensor e.g.\n['*some_solid+', 'other_solid']. Cannot be used in conjunction with job or jobs\nparameters.

  • \n
  • mode (Optional[str]) \u2013 (legacy) The mode to apply when executing runs for this sensor. Cannot be used\nin conjunction with job or jobs parameters.\n(default: \u2018default\u2019)

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • job (Optional[Union[GraphDefinition, JobDefinition]]) \u2013 The job to be executed when the sensor fires.

  • \n
  • jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition]]]) \u2013 (experimental) A list of jobs to be executed when the sensor fires.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
\n
\n
\n
\n\n
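For example, a minimal sketch of an asset sensor that launches a downstream job whenever a given asset is materialized (my_downstream_job and the op config shown are hypothetical):
\n
from dagster import AssetKey, RunRequest, asset_sensor\n\n@asset_sensor(asset_key=AssetKey("my_table"), job=my_downstream_job)\ndef my_table_sensor(context, asset_event):\n    yield RunRequest(\n        run_key=context.cursor,\n        run_config={"ops": {"refresh_reports": {"config": {"table": "my_table"}}}},\n    )\n
\n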
\n
\nclass dagster.RunStatusSensorDefinition(name, pipeline_run_status, run_status_sensor_fn, pipeline_selection=None, minimum_interval_seconds=None, description=None, job_selection=None, default_status=<DefaultSensorStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Define a sensor that reacts to a given status of pipeline execution, where the decorated\nfunction will be evaluated when a run is at the given status.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of the sensor. Defaults to the name of the decorated function.

  • \n
  • pipeline_run_status (PipelineRunStatus) \u2013 The status of a run which will be\nmonitored by the sensor.

  • \n
  • run_status_sensor_fn (Callable[[RunStatusSensorContext], Union[SkipReason, PipelineRunReaction]]) \u2013 The core\nevaluation function for the sensor. Takes a RunStatusSensorContext.

  • \n
  • pipeline_selection (Optional[List[str]]) \u2013 (legacy) Names of the pipelines that will be monitored by\nthis sensor. Defaults to None, which means the alert will be sent when any pipeline in\nthe repository fails.

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • job_selection (Optional[List[Union[JobDefinition, GraphDefinition]]]) \u2013 The jobs that\nwill be monitored by this sensor. Defaults to None, which means the alert will be sent\nwhen any job in the repository fails.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.RunStatusSensorContext(sensor_name, dagster_run, dagster_event, instance)[source]\u00b6
\n

The context object available to a decorated function of run_status_sensor.

\n
\n
\nsensor_name\u00b6
\n

the name of the sensor.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\ndagster_run\u00b6
\n

the run of the job or pipeline.

\n
\n
Type
\n

DagsterRun

\n
\n
\n
\n\n
\n
\ndagster_event\u00b6
\n

the event associated with the job or pipeline run status.

\n
\n
Type
\n

DagsterEvent

\n
\n
\n
\n\n
\n
\ninstance\u00b6
\n

the current instance.

\n
\n
Type
\n

DagsterInstance

\n
\n
\n
\n\n
\n
\nfor_run_failure()[source]\u00b6
\n

Converts RunStatusSensorContext to RunFailureSensorContext.

\n
\n\n
\n\n
\n
\nclass dagster.RunFailureSensorContext(sensor_name, dagster_run, dagster_event, instance)[source]\u00b6
\n

The context object available to a decorated function of run_failure_sensor.

\n
\n
\nsensor_name\u00b6
\n

the name of the sensor.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\npipeline_run\u00b6
\n

the failed pipeline run.

\n
\n
Type
\n

PipelineRun

\n
\n
\n
\n\n
\n
\nfailure_event\u00b6
\n

the pipeline failure event.

\n
\n
Type
\n

DagsterEvent

\n
\n
\n
\n\n
\n\n
\n
\ndagster.build_run_status_sensor_context(sensor_name, dagster_event, dagster_instance, dagster_run)[source]\u00b6
\n

Builds run status sensor context from provided parameters.

\n

This function can be used to provide the context argument when directly invoking a function\ndecorated with @run_status_sensor or @run_failure_sensor, such as when writing unit tests.

\n
\n
Parameters
\n
    \n
  • sensor_name (str) \u2013 The name of the sensor the context is being constructed for.

  • \n
  • dagster_event (DagsterEvent) \u2013 A DagsterEvent with the same event type as the one that\ntriggers the run_status_sensor

  • \n
  • dagster_instance (DagsterInstance) \u2013 The dagster instance configured for the context.

  • \n
  • dagster_run (DagsterRun) \u2013 DagsterRun object from running a job

  • \n
\n
\n
\n

Examples

\n
instance = DagsterInstance.ephemeral()\nresult = my_job.execute_in_process(instance=instance)\n\ndagster_run = result.dagster_run\ndagster_event = result.get_job_success_event() # or get_job_failure_event()\n\ncontext = build_run_status_sensor_context(\n    sensor_name="run_status_sensor_to_invoke",\n    dagster_instance=instance,\n    dagster_run=dagster_run,\n    dagster_event=dagster_event,\n)\nrun_status_sensor_to_invoke(context)\n
\n
\n
\n\n
\n
\n@dagster.run_status_sensor(pipeline_run_status, pipeline_selection=None, name=None, minimum_interval_seconds=None, description=None, job_selection=None, default_status=<DefaultSensorStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Creates a sensor that reacts to a given status of pipeline execution, where the decorated\nfunction will be run when a pipeline is at the given status.

\n

Takes a RunStatusSensorContext.

\n
\n
Parameters
\n
    \n
  • pipeline_run_status (PipelineRunStatus) \u2013 The status of pipeline execution which will be\nmonitored by the sensor.

  • \n
  • pipeline_selection (Optional[List[str]]) \u2013 Names of the pipelines that will be monitored by\nthis sensor. Defaults to None, which means the alert will be sent when any pipeline in\nthe repository fails.

  • \n
  • name (Optional[str]) \u2013 The name of the sensor. Defaults to the name of the decorated function.

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • job_selection (Optional[List[Union[PipelineDefinition, GraphDefinition]]]) \u2013 Jobs that will\nbe monitored by this sensor. Defaults to None, which means the alert will be sent when\nany job in the repository fails.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
\n
\n
\n
\n\n
\n
\n@dagster.run_failure_sensor(name=None, minimum_interval_seconds=None, description=None, job_selection=None, pipeline_selection=None, default_status=<DefaultSensorStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Creates a sensor that reacts to job failure events, where the decorated function will be\nrun when a run fails.

\n

Takes a RunFailureSensorContext.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name of the job failure sensor. Defaults to the name of the\ndecorated function.

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • job_selection (Optional[List[Union[JobDefinition, GraphDefinition]]]) \u2013 The jobs that\nwill be monitored by this failure sensor. Defaults to None, which means the alert will\nbe sent when any job in the repository fails.

  • \n
  • pipeline_selection (Optional[List[str]]) \u2013 (legacy) Names of the pipelines that will be monitored by\nthis sensor. Defaults to None, which means the alert will be sent when any pipeline in\nthe repository fails.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
\n
\n
\n
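
Example

A minimal sketch, not from the official docs: it assumes the context exposes dagster_run and failure_event, and send_alert stands in for user-supplied notification logic.

\n
from dagster import RunFailureSensorContext, run_failure_sensor\n\n@run_failure_sensor\ndef alert_on_failure(context: RunFailureSensorContext):\n    # send_alert is a placeholder for user-supplied notification logic\n    send_alert(f"Run {context.dagster_run.run_id} failed: {context.failure_event.message}")\n
\n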
\n\n
\n

Legacy APIs\u00b6

\n
\n
\n@dagster.pipeline_failure_sensor(name=None, minimum_interval_seconds=None, description=None, pipeline_selection=None, default_status=<DefaultSensorStatus.STOPPED: 'STOPPED'>)[source]\u00b6
\n

Creates a sensor that reacts to pipeline failure events, where the decorated function will be\nrun when a pipeline run fails.

\n

Takes a PipelineFailureSensorContext.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 The name of the pipeline failure sensor. Defaults to the name of the\ndecorated function.

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • pipeline_selection (Optional[List[str]]) \u2013 Names of the pipelines that will be monitored by\nthis failure sensor. Defaults to None, which means the alert will be sent when any\npipeline in the repository fails.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
\n
\n
\n
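
Example

A minimal sketch using the attributes documented on PipelineFailureSensorContext below; send_email stands in for user-supplied notification logic.

\n
from dagster import PipelineFailureSensorContext, pipeline_failure_sensor\n\n@pipeline_failure_sensor\ndef email_on_failure(context: PipelineFailureSensorContext):\n    # send_email is a placeholder for user-supplied notification logic\n    send_email(f"Pipeline {context.pipeline_run.pipeline_name} failed")\n
\n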
\n\n
\n
\nclass dagster.PipelineFailureSensorContext(sensor_name, dagster_run, dagster_event, instance)[source]\u00b6
\n

The context object available to a decorated function of pipeline_failure_sensor.

\n
\n
\nsensor_name\u00b6
\n

the name of the sensor.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\npipeline_run\u00b6
\n

the failed pipeline run.

\n
\n
Type
\n

PipelineRun

\n
\n
\n
\n\n
\n
\nfailure_event\u00b6
\n

the pipeline failure event.

\n
\n
Type
\n

DagsterEvent

\n
\n
\n
\n\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/schedules-sensors", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../solids/", "title": "[Legacy] Solids"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../resources/", "title": "Resources"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/solids", "[Legacy] Solids", "N", "next"], ["sections/api/apidocs/resources", "Resources", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/schedules-sensors.rst.txt", "title": "Run Requests", "toc": "\n"}, "solids": {"alabaster_version": "0.7.12", "body": "
\n

[Legacy] Solids\u00b6

\n

As of Dagster 0.13.0, we recommend Ops as an alternative to Solids. They can generally be used\ninterchangeably.

\n
\n
\n

Defining solids\u00b6

\n
\n
\n@dagster.solid(name=None, description=None, input_defs=None, output_defs=None, config_schema=None, required_resource_keys=None, tags=None, version=None, retry_policy=None)[source]\u00b6
\n

Create a solid with the specified parameters from the decorated function.

\n

This shortcut simplifies the core SolidDefinition API by exploding arguments into\nkwargs of the decorated compute function and omitting additional parameters when they are not\nneeded.

\n

Input and output definitions will be inferred from the type signature of the decorated function\nif not explicitly provided.

\n

The decorated function will be used as the solid\u2019s compute function. The signature of the\ndecorated function is more flexible than that of the compute_fn in the core API; it may:

\n
    \n
  1. Return a value. This value will be wrapped in an Output and yielded by the compute function.

  2. Return an Output. This output will be yielded by the compute function.

  3. Yield Output or other event objects. Same as default compute behavior.
\n

Note that options 1) and 2) are incompatible with yielding other events \u2013 if you would like\nto decorate a function that yields events, it must also wrap its eventual output in an\nOutput and yield it.

\n

@solid supports async def functions as well, including async generators when yielding multiple\nevents or outputs. Note that async solids will generally be run on their own unless using a custom\nExecutor implementation that supports running them together.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 Name of solid. Must be unique within any PipelineDefinition\nusing the solid.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of this solid. If not provided, and\nthe decorated function has a docstring, that docstring will be used as the description.

  • \n
  • input_defs (Optional[List[InputDefinition]]) \u2013 Information about the inputs to the solid. Information provided here will be combined\nwith what can be inferred from the function signature, with these explicit InputDefinitions\ntaking precedence.

  • \n
  • output_defs (Optional[List[OutputDefinition]]) \u2013 Information about the solids outputs. Information provided here will be combined with\nwhat can be inferred from the return type signature if there is only one OutputDefinition\nand the function does not use yield.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. If set, Dagster will check\nthat config provided for the solid matches this schema and fail if it does not. If not\nset, Dagster will accept any config provided for the solid.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by this solid.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for the solid. Frameworks may\nexpect and require certain metadata to be attached to a solid. Values that are not strings\nwill be json encoded and must meet the criteria that json.loads(json.dumps(value)) == value.

  • \n
  • version (Optional[str]) \u2013 (Experimental) The version of the solid\u2019s compute_fn. Two solids should have\nthe same version if and only if they deterministically produce the same outputs when\nprovided the same inputs.

  • \n
  • retry_policy (Optional[RetryPolicy]) \u2013 The retry policy for this solid.

  • \n
\n
\n
\n

Examples

\n
@solid\ndef hello_world():\n    print('hello')\n\n@solid\ndef hello_world():\n    return {'foo': 'bar'}\n\n@solid\ndef hello_world():\n    return Output(value={'foo': 'bar'})\n\n@solid\ndef hello_world():\n    yield Output(value={'foo': 'bar'})\n\n@solid\ndef hello_world(foo):\n    return foo\n\n@solid(\n    input_defs=[InputDefinition(name="foo", dagster_type=str)],\n    output_defs=[OutputDefinition(str)]\n)\ndef hello_world(foo):\n    # explicitly type and name inputs and outputs\n    return foo\n\n@solid\ndef hello_world(foo: str) -> str:\n    # same as above inferred from signature\n    return foo\n\n@solid\ndef hello_world(context, foo):\n    context.log.info('log something')\n    return foo\n\n@solid(\n    config_schema={'str_value': Field(str)}\n)\ndef hello_world(context, foo):\n    # context.solid_config is a dictionary with 'str_value' key\n    return foo + context.solid_config['str_value']\n
\n
\n
\n\n
\n
\nclass dagster.SolidDefinition(name, input_defs, compute_fn, output_defs, config_schema=None, description=None, tags=None, required_resource_keys=None, version=None, retry_policy=None)[source]\u00b6
\n

The definition of a Solid that performs a user-defined computation.

\n

For more details on what a solid is, refer to the\nSolid Overview .

\n

End users should prefer the @solid and @lambda_solid\ndecorators. SolidDefinition is generally intended to be used by framework authors.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name of the solid. Must be unique within any PipelineDefinition\nusing the solid.

  • \n
  • input_defs (List[InputDefinition]) \u2013 Inputs of the solid.

  • \n
  • compute_fn (Callable) \u2013

    The core of the solid, the function that does the actual\ncomputation. The signature of this function is determined by input_defs, and\noptionally, an injected first argument, context, a collection of information provided\nby the system.

    \n

    This function will be coerced into a generator or an async generator, which must yield\none Output for each of the solid\u2019s output_defs, and additionally may\nyield other types of Dagster events, including Materialization and\nExpectationResult.

    \n

  • \n
  • output_defs (List[OutputDefinition]) \u2013 Outputs of the solid.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. If set, Dagster will check\nthat config provided for the solid matches this schema and fail if it does not. If not\nset, Dagster will accept any config provided for the solid.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of the solid.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for the solid. Frameworks may\nexpect and require certain metadata to be attached to a solid. Users should generally\nnot set metadata directly. Values that are not strings will be json encoded and must meet\nthe criteria that json.loads(json.dumps(value)) == value.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by this\nsolid.

  • \n
  • version (Optional[str]) \u2013 (Experimental) The version of the solid\u2019s compute_fn. Two solids should have\nthe same version if and only if they deterministically produce the same outputs when\nprovided the same inputs.

  • \n
  • retry_policy (Optional[RetryPolicy]) \u2013 The retry policy for this solid.

  • \n
\n
\n
\n

Examples

\n
def _add_one(_context, inputs):\n    yield Output(inputs["num"] + 1)\n\nSolidDefinition(\n    name="add_one",\n    input_defs=[InputDefinition("num", Int)],\n    output_defs=[OutputDefinition(Int)], # default name ("result")\n    compute_fn=_add_one,\n)\n
\n
\n
\n
\nconfigured(config_or_config_fn, name, config_schema=None, description=None)\u00b6
\n

Wraps this object in an object of the same type that provides configuration to the inner\nobject.

\n
\n
Parameters
\n
    \n
  • config_or_config_fn (Union[Any, Callable[[Any], Any]]) \u2013 Either (1) Run configuration\nthat fully satisfies this object\u2019s config schema or (2) A function that accepts run\nconfiguration and returns run configuration that fully satisfies this object\u2019s\nconfig schema. In the latter case, config_schema must be specified. When\npassing a function, it\u2019s easiest to use configured().

  • \n
  • name (str) \u2013 Name of the new definition. This is a required argument, as this definition\ntype has a name uniqueness constraint.

  • \n
  • config_schema (ConfigSchema) \u2013 If config_or_config_fn is a function, the config schema\nthat its input must satisfy.

  • \n
  • description (Optional[str]) \u2013 Description of the new definition. If not specified,\ninherits the description of the definition being configured.

  • \n
\n
\n
\n

Returns (ConfigurableDefinition): A configured version of this object.

\n
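
Example

For example, a minimal sketch of pre-applying config to a solid; the solid and its config key are illustrative.

\n
@solid(config_schema={"iterations": int})\ndef do_work(context):\n    for _ in range(context.solid_config["iterations"]):\n        context.log.info("working")\n\n# Pre-apply config; a new name is required because definition names must be unique.\ndo_work_three_times = do_work.configured({"iterations": 3}, name="do_work_three_times")\n
\n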
\n\n
\n\n
\n
\n
\n

Inputs & outputs\u00b6

\n
\n
\nclass dagster.InputDefinition(name=None, dagster_type=None, description=None, default_value=<class 'dagster.core.definitions.utils.NoValueSentinel'>, root_manager_key=None, metadata=None, asset_key=None, asset_partitions=None)[source]\u00b6
\n

Defines an argument to a solid\u2019s compute function.

\n

Inputs may flow from previous solids\u2019 outputs, or be stubbed using config. They may optionally\nbe typed using the Dagster type system.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 Name of the input.

  • \n
  • dagster_type (Optional[Union[Type, DagsterType]]) \u2013 The type of this input.\nUsers should provide the Python type of the objects that they expect to be passed for\nthis input, or a DagsterType that defines a runtime check that they want\nto be run on this input. Defaults to Any.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of the input.

  • \n
  • default_value (Optional[Any]) \u2013 The default value to use if no input is provided.

  • \n
  • root_manager_key (Optional[str]) \u2013 (Experimental) The resource key for the\nRootInputManager used for loading this input when it is not connected to an\nupstream output.

  • \n
  • metadata (Optional[Dict[str, Any]]) \u2013 A dict of metadata for the input.

  • \n
  • asset_key (Optional[Union[AssetKey, InputContext -> AssetKey]]) \u2013 (Experimental) An AssetKey\n(or function that produces an AssetKey from the InputContext) which should be associated\nwith this InputDefinition. Used for tracking lineage information through Dagster.

  • \n
  • asset_partitions (Optional[Union[Set[str], InputContext -> Set[str]]]) \u2013 (Experimental) A\nset of partitions of the given asset_key (or a function that produces this list of\npartitions from the InputContext) which should be associated with this InputDefinition.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.OutputDefinition(dagster_type=None, name=None, description=None, is_required=True, io_manager_key=None, metadata=None, asset_key=None, asset_partitions=None, asset_partitions_def=None)[source]\u00b6
\n

Defines an output from a solid\u2019s compute function.

\n

Solids can have multiple outputs, in which case outputs cannot be anonymous.

\n

Many solids have only one output, in which case the user can provide a single output definition\nthat will be given the default name, \u201cresult\u201d.

\n

Output definitions may be typed using the Dagster type system.

\n
\n
Parameters
\n
    \n
  • dagster_type (Optional[Union[Type, DagsterType]]) \u2013 The type of this output.\nUsers should provide the Python type of the objects that they expect the solid to yield\nfor this output, or a DagsterType that defines a runtime check that they\nwant to be run on this output. Defaults to Any.

  • \n
  • name (Optional[str]) \u2013 Name of the output. (default: \u201cresult\u201d)

  • \n
  • description (Optional[str]) \u2013 Human-readable description of the output.

  • \n
  • is_required (Optional[bool]) \u2013 Whether the presence of this field is required. (default: True)

  • \n
  • io_manager_key (Optional[str]) \u2013 The resource key of the IOManager used for storing this\noutput and loading it in downstream steps (default: \u201cio_manager\u201d).

  • \n
  • metadata (Optional[Dict[str, Any]]) \u2013 A dict of the metadata for the output.\nFor example, users can provide a file path if the data object will be stored in a\nfilesystem, or provide information of a database table when it is going to load the data\ninto the table.

  • \n
  • asset_key (Optional[AssetKey]) \u2013 (Experimental) An AssetKey which should be associated\nwith this OutputDefinition. Used for tracking lineage information through Dagster.

  • \n
  • asset_partitions (Optional[Union[Set[str], OutputContext -> Set[str]]]) \u2013 (Experimental) A\nset of partitions of the given asset_key (or a function that produces this list of\npartitions from the OutputContext) which should be associated with this OutputDefinition.

  • \n
\n
\n
\n
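
Example

A minimal sketch of a solid with explicit input and output definitions; the names are illustrative.

\n
from dagster import InputDefinition, OutputDefinition, solid\n\n@solid(\n    input_defs=[InputDefinition("num", int, description="the number to double")],\n    output_defs=[OutputDefinition(int, description="twice the input")],\n)\ndef double(num):\n    return num * 2\n
\n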
\n\n
\n
\n
\n

Retries\u00b6

\n
\n
\nclass dagster.RetryPolicy(max_retries=1, delay=None, backoff=None, jitter=None)[source]
\n

A declarative policy for when to request retries when an exception occurs during op execution.

\n
\n
Parameters
\n
    \n
  • max_retries (int) \u2013 The maximum number of retries to attempt. Defaults to 1.

  • \n
  • delay (Optional[Union[int,float]]) \u2013 The time in seconds to wait between the retry being requested and the next attempt\nbeing started. This unit of time can be modulated as a function of attempt number\nwith backoff and randomly with jitter.

  • \n
  • backoff (Optional[Backoff]) \u2013 A modifier for delay as a function of retry attempt number.

  • \n
  • jitter (Optional[Jitter]) \u2013 A randomizing modifier for delay, applied after backoff calculation.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.Backoff(value)[source]
\n

A modifier for delay as a function of attempt number.

\n

LINEAR: attempt_num * delay\nEXPONENTIAL: ((2 ^ attempt_num) - 1) * delay

\n
\n\n
\n
\nclass dagster.Jitter(value)[source]
\n

A randomizing modifier for delay, applied after backoff calculation.

\n

FULL: between 0 and the calculated delay based on backoff: random() * backoff_delay\nPLUS_MINUS: +/- the delay: backoff_delay + ((2 * (random() * delay)) - delay)

\n
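
Example

A minimal sketch of attaching a retry policy with exponential backoff and jitter to a solid; flaky_operation stands in for an unreliable call.

\n
from dagster import Backoff, Jitter, RetryPolicy, solid\n\n@solid(\n    retry_policy=RetryPolicy(\n        max_retries=3,\n        delay=2,  # seconds, modulated by backoff and jitter\n        backoff=Backoff.EXPONENTIAL,\n        jitter=Jitter.PLUS_MINUS,\n    )\n)\ndef call_flaky_service():\n    # flaky_operation is a placeholder for an unreliable call\n    return flaky_operation()\n
\n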
\n\n
\n
\n

Execution\u00b6

\n
\n
\ndagster.execute_solid(solid_def, mode_def=None, input_values=None, tags=None, run_config=None, raise_on_error=True)[source]\u00b6
\n

Execute a single solid in an ephemeral pipeline.

\n

Intended to support unit tests. Input values may be passed directly, and no pipeline need be\nspecified \u2013 an ephemeral pipeline will be constructed.

\n
\n
Parameters
\n
    \n
  • solid_def (SolidDefinition) \u2013 The solid to execute.

  • \n
  • mode_def (Optional[ModeDefinition]) \u2013 The mode within which to execute the solid. Use this\nif, e.g., custom resources, loggers, or executors are desired.

  • \n
  • input_values (Optional[Dict[str, Any]]) \u2013 A dict of input names to input values, used to\npass inputs to the solid directly. You may also use the run_config to\nconfigure any inputs that are configurable.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary key-value pairs that will be added to pipeline\nlogs.

  • \n
  • run_config (Optional[dict]) \u2013 The configuration that parameterized this\nexecution, as a dict.

  • \n
  • raise_on_error (Optional[bool]) \u2013 Whether or not to raise exceptions when they occur.\nDefaults to True, since this is the most useful behavior in test.

  • \n
\n
\n
Returns
\n

The result of executing the\nsolid.

\n
\n
Return type
\n

Union[CompositeSolidExecutionResult, SolidExecutionResult]

\n
\n
\n
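
Example

A minimal sketch of unit-testing a solid with execute_solid; the solid shown is illustrative, not part of this API.

\n
from dagster import execute_solid, solid\n\n@solid\ndef add_one(num: int) -> int:\n    return num + 1\n\nresult = execute_solid(add_one, input_values={"num": 5})\nassert result.success\nassert result.output_value() == 6\n
\n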
\n\n
\n
\ndagster.execute_solid_within_pipeline(pipeline_def, solid_name, inputs=None, run_config=None, mode=None, preset=None, tags=None, instance=None)[source]\u00b6
\n

Execute a single solid within an existing pipeline.

\n

Intended to support tests. Input values may be passed directly.

\n
\n
Parameters
\n
    \n
  • pipeline_def (PipelineDefinition) \u2013 The pipeline within which to execute the solid.

  • \n
  • solid_name (str) \u2013 The name of the solid, or the aliased solid, to execute.

  • \n
  • inputs (Optional[Dict[str, Any]]) \u2013 A dict of input names to input values, used to\npass input values to the solid directly. You may also use the run_config to\nconfigure any inputs that are configurable.

  • \n
  • run_config (Optional[dict]) \u2013 The configuration that parameterized this\nexecution, as a dict.

  • \n
  • mode (Optional[str]) \u2013 The name of the pipeline mode to use. You may not set both mode\nand preset.

  • \n
  • preset (Optional[str]) \u2013 The name of the pipeline preset to use. You may not set both\nmode and preset.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary key-value pairs that will be added to pipeline\nlogs.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The instance to execute against. If this is None,\nan ephemeral instance will be used, and no artifacts will be persisted from the run.

  • \n
\n
\n
Returns
\n

The result of executing the\nsolid.

\n
\n
Return type
\n

Union[CompositeSolidExecutionResult, SolidExecutionResult]

\n
\n
\n
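
Example

A minimal sketch; the pipeline and solids are illustrative.

\n
from dagster import execute_solid_within_pipeline, pipeline, solid\n\n@solid\ndef emit_five() -> int:\n    return 5\n\n@solid\ndef add_one(num: int) -> int:\n    return num + 1\n\n@pipeline\ndef my_pipeline():\n    add_one(emit_five())\n\n# Execute only add_one, passing its input directly instead of taking it from emit_five.\nresult = execute_solid_within_pipeline(my_pipeline, "add_one", inputs={"num": 2})\nassert result.output_value() == 3\n
\n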
\n\n
\n
\ndagster.execute_solids_within_pipeline(pipeline_def, solid_names, inputs=None, run_config=None, mode=None, preset=None, tags=None, instance=None)[source]\u00b6
\n

Execute a set of solids within an existing pipeline.

\n

Intended to support tests. Input values may be passed directly.

\n
\n
Parameters
\n
    \n
  • pipeline_def (PipelineDefinition) \u2013 The pipeline within which to execute the solid.

  • \n
  • solid_names (FrozenSet[str]) \u2013 A set of the solid names, or the aliased solids, to execute.

  • \n
  • inputs (Optional[Dict[str, Dict[str, Any]]]) \u2013 A dict keyed on solid names, whose values are\ndicts of input names to input values, used to pass input values to the solids directly.\nYou may also use the run_config to configure any inputs that are configurable.

  • \n
  • run_config (Optional[dict]) \u2013 The configuration that parameterized this\nexecution, as a dict.

  • \n
  • mode (Optional[str]) \u2013 The name of the pipeline mode to use. You may not set both mode\nand preset.

  • \n
  • preset (Optional[str]) \u2013 The name of the pipeline preset to use. You may not set both\nmode and preset.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary key-value pairs that will be added to pipeline\nlogs.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The instance to execute against. If this is None,\nan ephemeral instance will be used, and no artifacts will be persisted from the run.

  • \n
\n
\n
Returns
\n

The results of\nexecuting the solids, keyed by solid name.

\n
\n
Return type
\n

Dict[str, Union[CompositeSolidExecutionResult, SolidExecutionResult]]

\n
\n
\n
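
Example

A minimal sketch, reusing the illustrative my_pipeline from the previous example; results are keyed by solid name.

\n
from dagster import execute_solids_within_pipeline\n\nresults = execute_solids_within_pipeline(\n    my_pipeline, solid_names={"emit_five", "add_one"}\n)\nassert results["add_one"].success\n
\n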
\n\n
\n
\nclass dagster.SolidExecutionResult(solid, step_events_by_kind, reconstruct_context, pipeline_def, output_capture=None)[source]\u00b6
\n

Execution result for a leaf solid in a pipeline.

\n

Users should not instantiate this class.

\n
\n
\nproperty compute_input_event_dict\u00b6
\n

All events of type STEP_INPUT, keyed by input name.

\n
\n
Type
\n

Dict[str, DagsterEvent]

\n
\n
\n
\n\n
\n
\nproperty compute_output_events_dict\u00b6
\n

All events of type STEP_OUTPUT, keyed by output name

\n
\n
Type
\n

Dict[str, List[DagsterEvent]]

\n
\n
\n
\n\n
\n
\nproperty compute_step_events\u00b6
\n

All events generated by execution of the solid compute function.

\n
\n
Type
\n

List[DagsterEvent]

\n
\n
\n
\n\n
\n
\nproperty compute_step_failure_event\u00b6
\n

The STEP_FAILURE event, throws if it did not fail.

\n
\n
Type
\n

DagsterEvent

\n
\n
\n
\n\n
\n
\nproperty expectation_events_during_compute\u00b6
\n

All events of type STEP_EXPECTATION_RESULT.

\n
\n
Type
\n

List[DagsterEvent]

\n
\n
\n
\n\n
\n
\nproperty expectation_results_during_compute\u00b6
\n

All expectation results yielded by the solid

\n
\n
Type
\n

List[ExpectationResult]

\n
\n
\n
\n\n
\n
\nproperty failure_data\u00b6
\n

Any data corresponding to this step\u2019s failure, if it\nfailed.

\n
\n
Type
\n

Union[None, StepFailureData]

\n
\n
\n
\n\n
\n
\nget_output_event_for_compute(output_name='result')[source]\u00b6
\n

The STEP_OUTPUT event for the given output name.

\n

Throws if not present.

\n
\n
Parameters
\n

output_name (Optional[str]) \u2013 The name of the output. (default: \u2018result\u2019)

\n
\n
Returns
\n

The corresponding event.

\n
\n
Return type
\n

DagsterEvent

\n
\n
\n
\n\n
\n
\nget_output_events_for_compute(output_name='result')[source]\u00b6
\n

The STEP_OUTPUT events for the given output name.

\n

Throws if not present.

\n
\n
Parameters
\n

output_name (Optional[str]) \u2013 The name of the output. (default: \u2018result\u2019)

\n
\n
Returns
\n

The corresponding events.

\n
\n
Return type
\n

List[DagsterEvent]

\n
\n
\n
\n\n
\n
\nget_step_success_event()[source]\u00b6
\n

DagsterEvent: The STEP_SUCCESS event, throws if not present.

\n
\n\n
\n
\nproperty input_events_during_compute\u00b6
\n

All events of type STEP_INPUT.

\n
\n
Type
\n

List[DagsterEvent]

\n
\n
\n
\n\n
\n
\nproperty materialization_events_during_compute\u00b6
\n

All events of type ASSET_MATERIALIZATION.

\n
\n
Type
\n

List[DagsterEvent]

\n
\n
\n
\n\n
\n
\nproperty materializations_during_compute\u00b6
\n

All materializations yielded by the solid.

\n
\n
Type
\n

List[Materialization]

\n
\n
\n
\n\n
\n
\nproperty output_events_during_compute\u00b6
\n

All events of type STEP_OUTPUT.

\n
\n
Type
\n

List[DagsterEvent]

\n
\n
\n
\n\n
\n
\noutput_value(output_name='result')[source]\u00b6
\n

Get a computed output value.

\n

Note that calling this method will reconstruct the pipeline context (including, e.g.,\nresources) to retrieve materialized output values.

\n
\n
Parameters
\n

output_name (str) \u2013 The output name for which to retrieve the value. (default: \u2018result\u2019)

\n
\n
Returns
\n

\n
None if execution did not succeed, the output value in the normal case, and a dict of mapping keys to values in the mapped case.

\n
\n
\n

\n
\n
Return type
\n

Union[None, Any, Dict[str, Any]]

\n
\n
\n
\n\n
\n
\nproperty output_values\u00b6
\n

The computed output values.

\n

Returns None if execution did not succeed.

\n
\n
Returns a dictionary where keys are output names and the values are:
    \n
  • the output values in the normal case

  • \n
  • a dictionary from mapping key to corresponding value in the mapped case

  • \n
\n
\n
\n

Note that accessing this property will reconstruct the pipeline context (including, e.g.,\nresources) to retrieve materialized output values.

\n
\n
Type
\n

Union[None, Dict[str, Union[Any, Dict[str, Any]]]]

\n
\n
\n
\n\n
\n
\nproperty retry_attempts\u00b6
\n

The number of times this step was retried.

\n
\n\n
\n
\nproperty skipped\u00b6
\n

Whether solid execution was skipped.

\n
\n
Type
\n

bool

\n
\n
\n
\n\n
\n
\nproperty success\u00b6
\n

Whether solid execution was successful.

\n
\n
Type
\n

bool

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.CompositeSolidExecutionResult(solid, event_list, step_events_by_kind, reconstruct_context, pipeline_def, handle=None, output_capture=None)[source]\u00b6
\n

Execution result for a composite solid in a pipeline.

\n

Users should not instantiate this class directly.

\n
\n
\noutput_for_solid(handle_str, output_name='result')\u00b6
\n

Get the output of a solid by its solid handle string and output name.

\n
\n
Parameters
\n
    \n
  • handle_str (str) \u2013 The string handle for the solid.

  • \n
  • output_name (str) \u2013 Optional. The name of the output, default to DEFAULT_OUTPUT.

  • \n
\n
\n
Returns
\n

The output value for the handle and output_name.

\n
\n
\n
\n\n
\n
\nresult_for_handle(handle)\u00b6
\n

Get the result of a solid by its solid handle.

\n

This allows indexing into top-level solids to retrieve the results of children of\ncomposite solids.

\n
\n
Parameters
\n

handle (Union[str,NodeHandle]) \u2013 The handle for the solid.

\n
\n
Returns
\n

The result of the given\nsolid.

\n
\n
Return type
\n

Union[CompositeSolidExecutionResult, SolidExecutionResult]

\n
\n
\n
\n\n
\n
\nresult_for_solid(name)\u00b6
\n

Get the result of a top level solid.

\n
\n
Parameters
\n

name (str) \u2013 The name of the top-level solid or aliased solid for which to retrieve the\nresult.

\n
\n
Returns
\n

The result of the solid\nexecution within the pipeline.

\n
\n
Return type
\n

Union[CompositeSolidExecutionResult, SolidExecutionResult]

\n
\n
\n
\n\n
\n
\nproperty solid_result_list\u00b6
\n

The results for each\ntop level solid.

\n
\n
Type
\n

List[Union[CompositeSolidExecutionResult, SolidExecutionResult]]

\n
\n
\n
\n\n
\n
\nproperty step_event_list\u00b6
\n

List[DagsterEvent] The full list of events generated by steps in the execution.

\n

Excludes events generated by the pipeline lifecycle, e.g., PIPELINE_START.

\n
\n\n
\n
\nproperty success\u00b6
\n

Whether all steps in the execution were successful.

\n
\n
Type
\n

bool

\n
\n
\n
\n\n
\n\n
\n
\n

Execution context\u00b6

\n
\n
\nclass dagster.SolidExecutionContext(step_execution_context)[source]\u00b6
\n

The context object that can be made available as the first argument to a solid\u2019s compute\nfunction.

\n

The context object provides system information such as resources, config, and logging to a\nsolid\u2019s compute function. Users should not instantiate this object directly.

\n

Example:

\n
@solid\ndef hello_world(context: SolidExecutionContext):\n    context.log.info("Hello, world!")\n
\n
\n
\n
\nadd_output_metadata(metadata, output_name=None, mapping_key=None)[source]\u00b6
\n

Add metadata to one of the outputs of an op.

\n

This can only be used once per output in the body of an op. Using this method with the same output_name more than once within an op will result in an error.

\n
\n
Parameters
\n
    \n
  • metadata (Mapping[str, Any]) \u2013 The metadata to attach to the output

  • \n
  • output_name (Optional[str]) \u2013 The name of the output to attach metadata to. If there is only one output on the op, then this argument does not need to be provided. The metadata will automatically be attached to the only output.

  • \n
\n
\n
\n

Examples:

\n
from dagster import Out, op\nfrom typing import Tuple\n\n@op\ndef add_metadata(context):\n    context.add_output_metadata({"foo": "bar"})\n    return 5  # Since the default output is called "result", metadata will be attached to the output "result".\n\n@op(out={"a": Out(), "b": Out()})\ndef add_metadata_two_outputs(context) -> Tuple[str, int]:\n    context.add_output_metadata({"foo": "bar"}, output_name="b")\n    context.add_output_metadata({"baz": "bat"}, output_name="a")\n\n    return ("dog", 5)\n
\n
\n
\n\n
\n
\nconsume_events()[source]\u00b6
\n

Pops and yields all user-generated events that have been recorded from this context.

\n

If consume_events has not yet been called, this will yield all logged events since the beginning of the op\u2019s computation. If consume_events has been called, it will yield all events since the last time consume_events was called. Designed for internal use. Users should never need to invoke this method.

\n
\n\n
\n
\nget_mapping_key()[source]\u00b6
\n

Which mapping_key this execution is for if downstream of a DynamicOutput, otherwise None.

\n
\n\n
\n
\nget_tag(key)[source]\u00b6
\n

Get a logging tag.

\n
\n
Parameters
\n

key (tag) \u2013 The tag to get.

\n
\n
Returns
\n

The value of the tag, if present.

\n
\n
Return type
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nproperty has_partition_key\u00b6
\n

Whether the current run is a partitioned run

\n
\n\n
\n
\nhas_tag(key)[source]\u00b6
\n

Check if a logging tag is set.

\n
\n
Parameters
\n

key (str) \u2013 The tag to check.

\n
\n
Returns
\n

Whether the tag is set.

\n
\n
Return type
\n

bool

\n
\n
\n
\n\n
\n
\nproperty instance\u00b6
\n

The current Dagster instance

\n
\n
Type
\n

DagsterInstance

\n
\n
\n
\n\n
\n
\nproperty job_def\u00b6
\n

The currently executing job.

\n
\n
Type
\n

JobDefinition

\n
\n
\n
\n\n
\n
\nproperty job_name\u00b6
\n

The name of the currently executing job.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty log\u00b6
\n

The log manager available in the execution context.

\n
\n
Type
\n

DagsterLogManager

\n
\n
\n
\n\n
\n
\nlog_event(event)[source]\u00b6
\n

Log an AssetMaterialization, AssetObservation, or ExpectationResult from within the body of an op.

\n

Events logged with this method will appear in the list of DagsterEvents, as well as the event log.

\n
\n
Parameters
\n

event (Union[AssetMaterialization, Materialization, AssetObservation, ExpectationResult]) \u2013 The event to log.

\n
\n
\n

Examples:

\n
from dagster import op, AssetMaterialization\n\n@op\ndef log_materialization(context):\n    context.log_event(AssetMaterialization("foo"))\n
\n
\n
\n\n
\n
\nproperty mode_def\u00b6
\n

The mode of the current execution.

\n
\n
Type
\n

ModeDefinition

\n
\n
\n
\n\n
\n
\nproperty op_def\u00b6
\n

The current op definition.

\n
\n
Type
\n

OpDefinition

\n
\n
\n
\n\n
\n
\noutput_asset_partition_key(output_name='result')[source]\u00b6
\n

Returns the asset partition key for the given output. Defaults to \u201cresult\u201d, which is the\nname of the default output.

\n
\n\n
\n
\noutput_asset_partitions_time_window(output_name='result')[source]\u00b6
\n

The time window for the partitions of the output asset.

\n

Raises an error if either of the following are true:\n- The output asset has no partitioning.\n- The output asset is not partitioned with a TimeWindowPartitionsDefinition.

\n
\n\n
\n
\nproperty partition_key\u00b6
\n

The partition key for the current run.

\n

Raises an error if the current run is not a partitioned run.

\n
\n\n
\n
\nproperty partition_time_window\u00b6
\n

The partition time window for the current run.

\n

Raises an error if the current run is not a partitioned run, or if the job\u2019s partition\ndefinition is not a TimeWindowPartitionsDefinition.

\n
\n\n
\n
\nproperty pdb\u00b6
\n

Gives access to pdb debugging from within the op.

\n

Example:

\n
@op\ndef debug(context):\n    context.pdb.set_trace()\n
\n
\n
\n
Type
\n

dagster.utils.forked_pdb.ForkedPdb

\n
\n
\n
\n\n
\n
\nproperty pipeline_def\u00b6
\n

The currently executing pipeline.

\n
\n
Type
\n

PipelineDefinition

\n
\n
\n
\n\n
\n
\nproperty pipeline_name\u00b6
\n

The name of the currently executing pipeline.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty pipeline_run\u00b6
\n

The current pipeline run

\n
\n
Type
\n

PipelineRun

\n
\n
\n
\n\n
\n
\nproperty resources\u00b6
\n

The currently available resources.

\n
\n
Type
\n

Resources

\n
\n
\n
\n\n
\n
\nproperty retry_number\u00b6
\n

Which retry attempt is currently executing, i.e., 0 for the initial attempt, 1 for the first retry, etc.

\n
\n\n
\n
\nproperty run\u00b6
\n

The current run

\n
\n
Type
\n

DagsterRun

\n
\n
\n
\n\n
\n
\nproperty run_config\u00b6
\n

The run config for the current execution.

\n
\n
Type
\n

dict

\n
\n
\n
\n\n
\n
\nproperty run_id\u00b6
\n

The id of the current execution\u2019s run.

\n
\n
Type
\n

str

\n
\n
\n
\n\n
\n
\nproperty solid_config\u00b6
\n

The parsed config specific to this solid.

\n
\n\n
\n
\nproperty solid_def\u00b6
\n

The current solid definition.

\n
\n
Type
\n

SolidDefinition

\n
\n
\n
\n\n
\n
\nproperty step_launcher\u00b6
\n

The current step launcher, if any.

\n
\n
Type
\n

Optional[StepLauncher]

\n
\n
\n
\n\n
\n\n
\n
\ndagster.build_solid_context(resources=None, solid_config=None, resources_config=None, instance=None, config=None, partition_key=None, mapping_key=None)[source]\u00b6
\n

Builds solid execution context from provided parameters.

\n

build_solid_context can be used as either a function or context manager. If there is a\nprovided resource that is a context manager, then build_solid_context must be used as a\ncontext manager. This function can be used to provide the context argument when directly\ninvoking a solid.

\n
\n
Parameters
\n
    \n
  • resources (Optional[Dict[str, Any]]) \u2013 The resources to provide to the context. These can be\neither values or resource definitions.

  • \n
  • solid_config (Optional[Any]) \u2013 The solid config to provide to the context. The value provided\nhere will be available as context.solid_config.

  • \n
  • resources_config (Optional[Dict[str, Any]]) \u2013 Configuration for any resource definitions\nprovided to the resources arg. The configuration under a specific key should match the\nresource under a specific key in the resources dictionary.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The dagster instance configured for the context.\nDefaults to DagsterInstance.ephemeral().

  • \n
\n
\n
\n

Examples

\n
context = build_solid_context()\nsolid_to_invoke(context)\n\nwith build_solid_context(resources={"foo": context_manager_resource}) as context:\n    solid_to_invoke(context)\n
\n
\n
\n\n
\n
\n
\n

Composing solids\u00b6

\n
\n
\n@dagster.composite_solid(name=None, input_defs=None, output_defs=None, description=None, config_schema=None, config_fn=None)[source]\u00b6
\n

Create a composite solid with the specified parameters from the decorated composition\nfunction.

\n

Using this decorator allows you to build up the dependency graph of the composite by writing a\nfunction that invokes solids and passes the output to other solids. This is similar to the use\nof the @pipeline decorator, with the additional ability to remap inputs,\noutputs, and config across the composite boundary.

\n
\n
Parameters
\n
    \n
  • name (Optional[str]) \u2013 Name for the new composite solid. Must be unique within any\nPipelineDefinition using the solid.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of the new composite solid.

  • \n
  • input_defs (Optional[List[InputDefinition]]) \u2013

    Information about the inputs that this composite solid maps. Information provided here\nwill be combined with what can be inferred from the function signature, with these\nexplicit InputDefinitions taking precedence.

    \n

    Uses of inputs in the body of the decorated composition function will determine\nthe InputMappings passed to the underlying\nCompositeSolidDefinition.

    \n

  • \n
  • output_defs (Optional[List[OutputDefinition]]) \u2013

    Information about the outputs this composite solid maps. Information provided here\nwill be combined with what can be inferred from the return type signature if there\nis only one OutputDefinition.

    \n

    Uses of these outputs in the body of the decorated composition function, as well as the\nreturn value of the decorated function, will be used to infer the appropriate set of\nOutputMappings for the underlying\nCompositeSolidDefinition.

    \n

    To map multiple outputs, return a dictionary from the composition function.

    \n

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 If the config_fn argument is provided, this\nargument can be provided to set the schema for outer config that is passed to the\nconfig_fn. If config_fn is provided, but this argument is not provided, any config\nwill be accepted.

  • \n
  • config_fn (Callable[[dict], dict]) \u2013

    By specifying a config mapping\nfunction, you can override the configuration for the child solids contained within this\ncomposite solid. config_fn, maps the config provided to the\ncomposite solid to the config that will be provided to the child solids.

    \n

    If this argument is provided, the config_schema argument can also be provided to limit\nwhat config values can be passed to the composite solid.

    \n

  • \n
\n
\n
\n

Examples

\n
@lambda_solid\ndef add_one(num: int) -> int:\n    return num + 1\n\n@composite_solid\ndef add_two(num: int) -> int:\n    adder_1 = add_one.alias('adder_1')\n    adder_2 = add_one.alias('adder_2')\n\n    return adder_2(adder_1(num))\n
\n
\n
\n\n
\n
\nclass dagster.CompositeSolidDefinition(name, solid_defs, input_mappings=None, output_mappings=None, config_mapping=None, dependencies=None, description=None, tags=None, positional_inputs=None)[source]\u00b6
\n

The core unit of composition and abstraction, composite solids allow you to\ndefine a solid from a graph of solids.

\n

In the same way that you would refactor a block of code into a function to deduplicate, organize,\nor manage complexity, you can refactor the solids in a pipeline into a composite solid.

\n
\n
Parameters
\n
    \n
  • name (str) \u2013 The name of this composite solid. Must be unique within any\nPipelineDefinition using the solid.

  • \n
  • solid_defs (List[Union[SolidDefinition, CompositeSolidDefinition]]) \u2013 The set of solid\ndefinitions used in this composite solid. Composites may be arbitrarily nested.

  • \n
  • input_mappings (Optional[List[InputMapping]]) \u2013 Define the inputs to the composite solid,\nand how they map to the inputs of its constituent solids.

  • \n
  • output_mappings (Optional[List[OutputMapping]]) \u2013 Define the outputs of the composite solid,\nand how they map from the outputs of its constituent solids.

  • \n
  • config_mapping (Optional[ConfigMapping]) \u2013 By specifying a config mapping, you can override\nthe configuration for the child solids contained within this composite solid. Config\nmappings require both a configuration field to be specified, which is exposed as the\nconfiguration for the composite solid, and a configuration mapping function, which\nis called to map the configuration of the composite solid into the configuration that\nis applied to any child solids.

  • \n
  • dependencies (Optional[Dict[Union[str, NodeInvocation], Dict[str, DependencyDefinition]]]) \u2013 A structure that declares where each solid gets its inputs. The keys at the top\nlevel dict are either string names of solids or NodeInvocations. The values\nare dicts that map input names to DependencyDefinitions.

  • \n
  • description (Optional[str]) \u2013 Human readable description of this composite solid.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for the solid. Frameworks may\nexpect and require certain metadata to be attached to a solid. Users should generally\nnot set metadata directly. Values that are not strings will be json encoded and must meet\nthe criteria that json.loads(json.dumps(value)) == value.

  • \n
  • positional_inputs (Optional[List[str]]) \u2013 The positional order of the inputs if it\ndiffers from the order of the input mappings

  • \n
\n
\n
\n

Examples

\n
@lambda_solid\ndef add_one(num: int) -> int:\n    return num + 1\n\nadd_two = CompositeSolidDefinition(\n    'add_two',\n    solid_defs=[add_one],\n    dependencies={\n        NodeInvocation('add_one', 'adder_1'): {},\n        NodeInvocation('add_one', 'adder_2'): {'num': DependencyDefinition('adder_1')},\n    },\n    input_mappings=[InputDefinition('num', Int).mapping_to('adder_1', 'num')],\n    output_mappings=[OutputDefinition(Int).mapping_from('adder_2')],\n)\n
\n
\n
\n
\nconfigured(config_or_config_fn, name, config_schema=None, description=None)\u00b6
\n

Wraps this object in an object of the same type that provides configuration to the inner\nobject.

\n
\n
Parameters
\n
    \n
  • config_or_config_fn (Union[Any, Callable[[Any], Any]]) \u2013 Either (1) Run configuration\nthat fully satisfies this object\u2019s config schema or (2) A function that accepts run\nconfiguration and returns run configuration that fully satisfies this object\u2019s\nconfig schema. In the latter case, config_schema must be specified. When\npassing a function, it\u2019s easiest to use configured().

  • \n
  • name (str) \u2013 Name of the new definition. This is a required argument, as this definition\ntype has a name uniqueness constraint.

  • \n
  • config_schema (ConfigSchema) \u2013 If config_or_config_fn is a function, the config schema\nthat its input must satisfy.

  • \n
  • description (Optional[str]) \u2013 Description of the new definition. If not specified,\ninherits the description of the definition being configured.

  • \n
\n
\n
\n

Returns (ConfigurableDefinition): A configured version of this object.

\n
\n\n
\n\n
\n
\nclass dagster.InputMapping(definition, maps_to)[source]\u00b6
\n

Defines an input mapping for a composite solid.

\n
\n
Parameters
\n
    \n
  • definition (InputDefinition) \u2013 Defines the input to the composite solid.

  • \n
  • solid_name (str) \u2013 The name of the child solid onto which to map the input.

  • \n
  • input_name (str) \u2013 The name of the input to the child solid onto which to map the input.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.OutputMapping(definition, maps_from)[source]\u00b6
\n

Defines an output mapping for a composite solid.

\n
\n
Parameters
\n
    \n
  • definition (OutputDefinition) \u2013 Defines the output of the composite solid.

  • \n
  • solid_name (str) \u2013 The name of the child solid from which to map the output.

  • \n
  • output_name (str) \u2013 The name of the child solid\u2019s output from which to map the output.

  • \n
\n
\n
\n
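
Example

In practice these mappings are usually built with the mapping_to and mapping_from helpers shown in the CompositeSolidDefinition example above; a minimal sketch, with illustrative names:

\n
from dagster import InputDefinition, Int, OutputDefinition\n\n# Map the composite's "num" input to the "num" input of child solid "adder_1",\n# and expose the output of child solid "adder_2" as the composite's output.\ninput_mapping = InputDefinition("num", Int).mapping_to("adder_1", "num")\noutput_mapping = OutputDefinition(Int).mapping_from("adder_2")\n
\n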
\n\n
\n
\nclass dagster.ConfigMapping(config_fn, config_schema=None, receive_processed_config_values=None)[source]
\n

Defines a config mapping for a graph (or job).

\n

By specifying a config mapping function, you can override the configuration for the child\nops and graphs contained within a graph.

\n

Config mappings require the configuration schema to be specified as config_schema, which will\nbe exposed as the configuration schema for the graph, as well as a configuration mapping\nfunction, config_fn, which maps the config provided to the graph to the config\nthat will be provided to the child nodes.

\n
\n
Parameters
\n
    \n
  • config_fn (Callable[[dict], dict]) \u2013 The function that will be called\nto map the graph config to a config appropriate for the child nodes.

  • \n
  • config_schema (ConfigSchema) \u2013 The schema of the graph config.

  • \n
  • receive_processed_config_values (Optional[bool]) \u2013 If true, config values provided to the config_fn\nwill be converted to their dagster types before being passed in. For example, if this\nvalue is true, enum config passed to config_fn will be actual enums, while if false,\nthen enum config passed to config_fn will be strings.

  • \n
\n
\n
\n
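
Example

A minimal sketch, assuming GraphDefinition.to_job accepts a ConfigMapping via its config argument in this release; the op and config keys are illustrative.

\n
from dagster import ConfigMapping, graph, op\n\n@op(config_schema={"iterations": int})\ndef do_work(context):\n    context.log.info(str(context.op_config["iterations"]))\n\ndef _map_config(outer):\n    # Expand the simplified outer config into full run config for the child op.\n    return {"ops": {"do_work": {"config": {"iterations": outer["simplified"] * 2}}}}\n\n@graph\ndef my_graph():\n    do_work()\n\nsimplified_job = my_graph.to_job(\n    config=ConfigMapping(config_fn=_map_config, config_schema={"simplified": int})\n)\n
\n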
\n\n
\n
\n

Events\u00b6

\n

The objects that can be yielded by the body of solids\u2019 compute functions to communicate with the\nDagster framework.

\n

(Note that Failure and RetryRequested are intended to be raised from solids rather than yielded.)

\n
\n

Event types\u00b6

\n
\n
\nclass dagster.Output(value, output_name='result', metadata_entries=None, metadata=None)[source]
\n

Event corresponding to one of an op\u2019s outputs.

\n

Op compute functions must explicitly yield events of this type when they have more than\none output, or when they also yield events of other types, or when defining an op using the\nOpDefinition API directly.

\n

Outputs are values produced by ops that will be consumed by downstream ops in a job.\nThey are type-checked at op boundaries when their corresponding Out\nor the downstream In is typed.

\n
\n
Parameters
\n
    \n
  • value (Any) \u2013 The value returned by the compute function.

  • \n
  • output_name (Optional[str]) \u2013 Name of the corresponding out. (default:\n\u201cresult\u201d)

  • \n
  • metadata_entries (Optional[List[Union[MetadataEntry, PartitionMetadataEntry]]]) \u2013 (Experimental) A set of metadata entries to attach to events related to this Output.

  • \n
  • metadata (Optional[Dict[str, Union[str, float, int, Dict, MetadataValue]]]) \u2013 Arbitrary metadata about the output. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
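
Example

A minimal sketch of an op with multiple outputs, which must yield Output events explicitly; the names are illustrative.

\n
from dagster import Out, Output, op\n\n@op(out={"total": Out(int), "label": Out(str)})\ndef summarize():\n    yield Output(42, output_name="total")\n    yield Output("answer", output_name="label")\n
\n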
\n\n
\n
\nclass dagster.AssetMaterialization(asset_key, description=None, metadata_entries=None, partition=None, tags=None, metadata=None)[source]
\n

Event indicating that an op has materialized an asset.

\n

Op compute functions may yield events of this type whenever they wish to indicate to the\nDagster framework (and the end user) that they have produced a materialized value as a\nside effect of computation. Unlike outputs, asset materializations cannot be passed to other\nops, and their persistence is controlled by op logic, rather than by the Dagster\nframework.

\n

Op authors should use these events to organize metadata about the side effects of their\ncomputations, enabling tooling like the Assets dashboard in Dagit.

\n
\n
Parameters
\n
    \n
  • asset_key (Union[str, List[str], AssetKey]) \u2013 A key to identify the materialized asset across job\nruns

  • \n
  • description (Optional[str]) \u2013 A longer human-readable description of the materialized value.

  • \n
  • metadata_entries (Optional[List[Union[MetadataEntry, PartitionMetadataEntry]]]) \u2013 Arbitrary metadata about the\nmaterialized value.

  • \n
  • partition (Optional[str]) \u2013 The name of the partition that was materialized.

  • \n
  • tags (Optional[Dict[str, str]]) \u2013 (Experimental) Tag metadata for a given asset\nmaterialization. Used for search and organization of the asset entry in the asset\ncatalog in Dagit.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 Arbitrary metadata about the asset. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
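
Example

A minimal sketch; the asset key and path are illustrative, and Output(None) is also yielded because an op that yields events must still yield its outputs.

\n
from dagster import AssetMaterialization, MetadataValue, Output, op\n\n@op\ndef write_report(context):\n    # ... write the report somewhere, then record it as an asset\n    yield AssetMaterialization(\n        asset_key="nightly_report",\n        description="Nightly report written to disk",\n        metadata={"path": MetadataValue.path("/reports/nightly.csv")},\n    )\n    yield Output(None)\n
\n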
\n
\nstatic file(path, description=None, asset_key=None)[source]
\n

Static constructor for standard materializations corresponding to files on disk.

\n
\n
Parameters
\n
    \n
  • path (str) \u2013 The path to the file.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the materialization.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.ExpectationResult(success, label=None, description=None, metadata_entries=None, metadata=None)[source]
\n

Event corresponding to a data quality test.

\n

Op compute functions may yield events of this type whenever they wish to indicate to the\nDagster framework (and the end user) that a data quality test has produced a (positive or\nnegative) result.

\n
\n
Parameters
\n
    \n
  • success (bool) \u2013 Whether the expectation passed or not.

  • \n
  • label (Optional[str]) \u2013 Short display name for expectation. Defaults to \u201cresult\u201d.

  • \n
  • description (Optional[str]) \u2013 A longer human-readable description of the expectation.

  • \n
  • metadata_entries (Optional[List[MetadataEntry]]) \u2013 Arbitrary metadata about the\nexpectation.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 Arbitrary metadata about the expectation. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
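
Example

A minimal sketch of yielding a data quality result alongside an output; the names are illustrative.

\n
from dagster import ExpectationResult, Output, op\n\n@op\ndef validate_rows(rows):\n    yield ExpectationResult(\n        success=len(rows) > 0,\n        label="rows_present",\n        description="The input should contain at least one row",\n        metadata={"row_count": len(rows)},\n    )\n    yield Output(rows)\n
\n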
\n\n
\n
\nclass dagster.TypeCheck(success, description=None, metadata_entries=None, metadata=None)[source]
\n

Event corresponding to the result of a type check.

\n

Events of this type should be returned by user-defined type checks when they need to encapsulate\nadditional metadata about a type check\u2019s success or failure. (i.e., when using\nas_dagster_type(), @usable_as_dagster_type, or the underlying\nPythonObjectDagsterType() API.)

\n

Solid compute functions should generally avoid yielding events of this type to avoid confusion.

\n
\n
Parameters
\n
    \n
  • success (bool) \u2013 True if the type check succeeded, False otherwise.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the type check.

  • \n
  • metadata_entries (Optional[List[MetadataEntry]]) \u2013 Arbitrary metadata about the\ntype check.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 Arbitrary metadata about the type check. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
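
Example

A minimal sketch of returning a TypeCheck from a custom DagsterType check function; the type and check are illustrative.

\n
from dagster import DagsterType, TypeCheck\n\ndef _non_empty_check(_, value):\n    return TypeCheck(\n        success=len(value) > 0,\n        description="Value must be non-empty",\n        metadata={"length": len(value)},\n    )\n\nNonEmptyList = DagsterType(name="NonEmptyList", type_check_fn=_non_empty_check)\n
\n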
\n\n
\n
\nclass dagster.Failure(description=None, metadata_entries=None, metadata=None)[source]
\n

Event indicating op failure.

\n

Raise events of this type from within op compute functions or custom type checks in order to\nindicate an unrecoverable failure in user code to the Dagster machinery and return\nstructured metadata about the failure.

\n
\n
Parameters
\n
    \n
  • description (Optional[str]) \u2013 A human-readable description of the failure.

  • \n
  • metadata_entries (Optional[List[MetadataEntry]]) \u2013 Arbitrary metadata about the\nfailure.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 Arbitrary metadata about the failure. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
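
Example

A minimal sketch of raising Failure with structured metadata; the check and endpoint are illustrative.

\n
from dagster import Failure, op\n\n@op\ndef fetch_data(context):\n    response_ok = False  # stand-in for a real check of an upstream service\n    if not response_ok:\n        raise Failure(\n            description="Upstream service returned an error",\n            metadata={"endpoint": "https://example.com/api"},\n        )\n
\n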
\n\n
\n
\nclass dagster.RetryRequested(max_retries=1, seconds_to_wait=None)[source]
\n

An exception to raise from an op to indicate that it should be retried.

\n
\n
Parameters
\n
    \n
  • max_retries (Optional[int]) \u2013 The max number of retries this step should attempt before failing

  • \n
  • seconds_to_wait (Optional[Union[float,int]]) \u2013 Seconds to wait before restarting the step after putting the step\ninto the up_for_retry state.

  • \n
\n
\n
\n

Example

\n
@op\ndef flakes():\n    try:\n        flakey_operation()\n    except Exception as e:\n        raise RetryRequested(max_retries=3) from e\n
\n
\n
\n\n
\n
\n
\n

Metadata\u00b6

\n

Dagster uses metadata to communicate arbitrary user-specified metadata about structured\nevents.

\n
\n
\nclass dagster.MetadataValue[source]
\n

Utility class to wrap metadata values passed into Dagster events so that they can be\ndisplayed in Dagit and other tooling.

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "my_text_label": "hello",\n            "dashboard_url": MetadataValue.url("http://mycoolsite.com/my_dashboard"),\n            "num_rows": 0,\n        },\n    )\n
\n
\n
\n
\nstatic asset(asset_key)[source]
\n

Static constructor for a metadata value referencing a Dagster asset, by key.

\n

For example:

\n
@op\ndef validate_table(context, df):\n    yield AssetMaterialization(\n        asset_key=AssetKey("my_table"),\n        metadata={\n            "Related asset": MetadataValue.asset(AssetKey('my_other_table')),\n        },\n    )\n
\n
\n
\n
Parameters
\n

asset_key (AssetKey) \u2013 The asset key referencing the asset.

\n
\n
\n
\n\n
\n
\nstatic bool(value)[source]
\n

Static constructor for a metadata value wrapping a bool as\nBoolMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "num rows > 1000": MetadataValue.bool(len(df) > 1000),\n        },\n    )\n
\n
\n
\n
Parameters
\n

value (bool) \u2013 The bool value for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic dagster_run(run_id)[source]
\n

Static constructor for a metadata value wrapping a reference to a Dagster run.

\n
\n
Parameters
\n

run_id (str) \u2013 The ID of the run.

\n
\n
\n
\n\n
\n
\nstatic float(value)[source]
\n

Static constructor for a metadata value wrapping a float as\nFloatMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "size (bytes)": MetadataValue.float(calculate_bytes(df)),\n        }\n    )\n
\n
\n
\n
Parameters
\n

value (float) \u2013 The float value for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic int(value)[source]
\n

Static constructor for a metadata value wrapping an int as\nIntMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "number of rows": MetadataValue.int(len(df)),\n        },\n    )\n
\n
\n
\n
Parameters
\n

value (int) \u2013 The int value for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic json(data)[source]
\n

Static constructor for a metadata value wrapping JSON data as\nJsonMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context):\n    yield ExpectationResult(\n        success=not missing_things,\n        label="is_present",\n        metadata={\n            "about my dataset": MetadataValue.json({"missing_columns": missing_things})\n        },\n    )\n
\n
\n
\n
Parameters
\n

data (Dict[str, Any]) \u2013 The JSON data for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic md(data)[source]
\n

Static constructor for a metadata value wrapping markdown data as\nMarkdownMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context, md_str):\n    yield AssetMaterialization(\n        asset_key="info",\n        metadata={\n            'Details': MetadataValue.md(md_str)\n        },\n    )\n
\n
\n
\n
Parameters
\n

md_str (str) \u2013 The markdown for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic path(path)[source]
\n

Static constructor for a metadata value wrapping a path as\nPathMetadataValue. For example:

\n
@op\ndef emit_metadata(context):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "filepath": MetadataValue.path("path/to/file"),\n        }\n    )\n
\n
\n
\n
Parameters
\n

path (str) \u2013 The path for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic python_artifact(python_artifact)[source]
\n

Static constructor for a metadata value wrapping a python artifact as\nPythonArtifactMetadataValue. Can be used as the value type for the\nmetadata parameter for supported events. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "class": MetadataValue.python_artifact(MyClass),\n            "function": MetadataValue.python_artifact(my_function),\n        }\n    )\n
\n
\n
\n
Parameters
\n

python_artifact (Callable) \u2013 The python class or function for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic table(records, schema=None)[source]
\n

Static constructor for a metadata value wrapping arbitrary tabular data as\nTableMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context):\n    yield ExpectationResult(\n        success=not has_errors,\n        label="is_valid",\n        metadata={\n            "errors": MetadataValue.table(\n                records=[\n                    TableRecord(code="invalid-data-type", row=2, col="name"),\n                ],\n                schema=TableSchema(\n                    columns=[\n                        TableColumn(name="code", type="string"),\n                        TableColumn(name="row", type="int"),\n                        TableColumn(name="col", type="string"),\n                    ]\n                )\n            ),\n        },\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • records (List[TableRecord]) \u2013 The data as a list of records (i.e. rows).

  • \n
  • schema (Optional[TableSchema]) \u2013 A schema for the table.

  • \n
\n
\n
\n
\n\n
\n
\nstatic table_schema(schema)[source]
\n

Static constructor for a metadata value wrapping a table schema as\nTableSchemaMetadataValue. Can be used as the value type\nfor the metadata parameter for supported events. For example:

\n
schema = TableSchema(\n    columns = [\n        TableColumn(name="id", type="int"),\n        TableColumn(name="status", type="bool"),\n    ]\n)\n\nDagsterType(\n    type_check_fn=some_validation_fn,\n    name='MyTable',\n    metadata={\n        'my_table_schema': MetadataValue.table_schema(schema),\n    }\n)\n
\n
\n
\n
Parameters
\n

schema (TableSchema) \u2013 The table schema for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic text(text)[source]
\n

Static constructor for a metadata value wrapping text as\nTextMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "my_text_label": MetadataValue.text("hello")\n        },\n    )\n
\n
\n
\n
Parameters
\n

text (str) \u2013 The text string for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic url(url)[source]
\n

Static constructor for a metadata value wrapping a URL as\nUrlMetadataValue. Can be used as the value type for the metadata\nparameter for supported events. For example:

\n
@op\ndef emit_metadata(context):\n    yield AssetMaterialization(\n        asset_key="my_dashboard",\n        metadata={\n            "dashboard_url": MetadataValue.url("http://mycoolsite.com/my_dashboard"),\n        }\n    )\n
\n
\n
\n
Parameters
\n

url (str) \u2013 The URL for a metadata entry.

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.MetadataEntry(label, description=None, entry_data=None, value=None)[source]
\n

The standard structure for describing metadata for Dagster events.

\n

Lists of objects of this type can be passed as arguments to Dagster events and will be displayed\nin Dagit and other tooling.

\n

Should be yielded from within an IO manager to append metadata for a given input/output event.\nFor other event types, passing a dict with MetadataValue values to the metadata argument\nis preferred.

\n
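For example, a hedged sketch of the IO-manager usage described above (MyIOManager, write_to_storage, and read_from_storage are hypothetical placeholders for your own persistence logic):
\n
from dagster import IOManager, MetadataEntry\n\nclass MyIOManager(IOManager):\n    def handle_output(self, context, obj):\n        write_to_storage(obj)  # placeholder persistence logic\n        yield MetadataEntry.int(len(obj), "number of rows")\n\n    def load_input(self, context):\n        return read_from_storage(context)  # placeholder loading logic\n
\n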
\n
Parameters
\n
    \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
  • value (MetadataValue) \u2013 Typed metadata entry data. The different types allow\nfor customized display in tools like dagit.

  • \n
\n
\n
\n
\n
\nstatic asset(asset_key, label, description=None)[source]
\n

Static constructor for a metadata entry referencing a Dagster asset, by key.

\n

For example:

\n
@op\ndef validate_table(context, df):\n    yield AssetMaterialization(\n        asset_key=AssetKey("my_table"),\n        metadata_entries=[\n             MetadataEntry.asset(AssetKey('my_other_table'), "Related asset"),\n        ],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • asset_key (AssetKey) \u2013 The asset key referencing the asset.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic float(value, label, description=None)[source]
\n

Static constructor for a metadata entry containing float as\nFloatMetadataValue. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata_entries=[MetadataEntry.float(calculate_bytes(df), "size (bytes)")],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • value (Optional[float]) \u2013 The float value contained by this metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic fspath(path, label=None, description=None)[source]
\n

Static constructor for a metadata entry containing a filesystem path as\nPathMetadataValue. For example:

\n
@op\ndef emit_metadata(context):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata_entries=[MetadataEntry.fspath("path/to/file")],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • path (Optional[str]) \u2013 The path contained by this metadata entry.

  • \n
  • label (Optional[str]) \u2013 Short display label for this metadata entry. Defaults to the\nbase name of the path.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic int(value, label, description=None)[source]
\n

Static constructor for a metadata entry containing int as\nIntMetadataValue. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata_entries=[MetadataEntry.int(len(df), "number of rows")],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • value (Optional[int]) \u2013 The int value contained by this metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic json(data, label, description=None)[source]
\n

Static constructor for a metadata entry containing JSON data as\nJsonMetadataValue. For example:

\n
@op\ndef emit_metadata(context):\n    yield ExpectationResult(\n        success=not missing_things,\n        label="is_present",\n        metadata_entries=[\n            MetadataEntry.json(\n                label="metadata", data={"missing_columns": missing_things},\n            )\n        ],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • data (Optional[Dict[str, Any]]) \u2013 The JSON data contained by this metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic md(md_str, label, description=None)[source]
\n

Static constructor for a metadata entry containing markdown data as\nMarkdownMetadataValue. For example:

\n
@op\ndef emit_metadata(context, md_str):\n    yield AssetMaterialization(\n        asset_key="info",\n        metadata_entries=[MetadataEntry.md(md_str=md_str)],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • md_str (Optional[str]) \u2013 The markdown contained by this metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic path(path, label, description=None)[source]
\n

Static constructor for a metadata entry containing a path as\nPathMetadataValue. For example:

\n
@op\ndef emit_metadata(context):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata_entries=[MetadataEntry.path("path/to/file", label="filepath")],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • path (Optional[str]) \u2013 The path contained by this metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic table(records, label, description=None, schema=None)[source]
\n

Static constructor for a metadata entry containing tabular data as\nTableMetadataValue. For example:

\n
@op\ndef emit_metadata(context):\n    yield ExpectationResult(\n        success=not has_errors,\n        label="is_valid",\n        metadata_entries=[\n            MetadataEntry.table(\n                label="errors",\n                records=[\n                    TableRecord(code="invalid-data-type", row=2, col="name"),\n                ],\n                schema=TableSchema(\n                    columns=[\n                        TableColumn(name="code", type="string"),\n                        TableColumn(name="row", type="int"),\n                        TableColumn(name="col", type="string"),\n                    ]\n                )\n            ),\n        ],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • records (List[TableRecord]) \u2013 The data as a list of records (i.e. rows).

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
  • schema (Optional[TableSchema]) \u2013 A schema for the table. If none is provided, one will be\nautomatically generated by examining the first record. The schema will include as columns all\nfield names present in the first record, with a type of \u201cstring\u201d, \u201cint\u201d,\n\u201cbool\u201d or \u201cfloat\u201d inferred from the first record\u2019s values. If a value does\nnot directly match one of the above types, it will be treated as a string.

  • \n
\n
\n
\n
\n\n
\n
\nstatic table_schema(schema, label, description=None)[source]
\n

Static constructor for a metadata entry containing a table schema as\nTableSchemaMetadataValue. For example:

\n
schema = TableSchema(\n    columns = [\n        TableColumn(name="id", type="int"),\n        TableColumn(name="status", type="bool"),\n    ]\n)\n\nDagsterType(\n    type_check_fn=some_validation_fn,\n    name='MyTable',\n    metadata_entries=[\n        MetadataEntry.table_schema(\n            schema,\n            label='schema',\n        )\n    ]\n)\n
\n
\n
\n
Parameters
\n
    \n
  • schema (TableSchema) \u2013 The table schema for a metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic text(text, label, description=None)[source]
\n

Static constructor for a metadata entry containing text as\nTextMetadataValue. For example:

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata_entries=[\n            MetadataEntry.text("Text-based metadata for this event", "text_metadata")\n        ],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • text (Optional[str]) \u2013 The text of this metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nstatic url(url, label, description=None)[source]
\n

Static constructor for a metadata entry containing a URL as\nUrlMetadataValue. For example:

\n
@op\ndef emit_metadata(context):\n    yield AssetMaterialization(\n        asset_key="my_dashboard",\n        metadata_entries=[\n            MetadataEntry.url(\n                "http://mycoolsite.com/my_dashboard", label="dashboard_url"\n            ),\n        ],\n    )\n
\n
\n
\n
Parameters
\n
    \n
  • url (Optional[str]) \u2013 The URL contained by this metadata entry.

  • \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
\n
\n
\n
\n\n
\n
\nproperty value
\n

Alias of entry_data.

\n
\n\n
\n\n
\n
\n

Metadata types\u00b6

\n

All metadata types inherit from MetadataValue. The following types are defined:

\n
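These classes are rarely constructed directly; the static constructors on MetadataValue return them. An illustrative sketch:
\n
from dagster import MetadataValue, TextMetadataValue\n\nvalue = MetadataValue.text("hello")\nassert isinstance(value, TextMetadataValue)\n
\n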
\n
\nclass dagster.DagsterAssetMetadataValue(asset_key)[source]
\n

Representation of a dagster asset.

\n
\n
Parameters
\n

asset_key (AssetKey) \u2013 The dagster asset key

\n
\n
\n
\n\n
\n
\nclass dagster.DagsterPipelineRunMetadataValue(run_id)[source]
\n

Representation of a dagster pipeline run.

\n
\n
Parameters
\n

run_id (str) \u2013 The pipeline run id

\n
\n
\n
\n\n
\n
\nclass dagster.FloatMetadataValue(value)[source]
\n

Container class for float metadata entry data.

\n
\n
Parameters
\n

value (Optional[float]) \u2013 The float value.

\n
\n
\n
\n\n
\n
\nclass dagster.IntMetadataValue(value)[source]
\n

Container class for int metadata entry data.

\n
\n
Parameters
\n

value (Optional[int]) \u2013 The int value.

\n
\n
\n
\n\n
\n
\nclass dagster.JsonMetadataValue(data)[source]
\n

Container class for JSON metadata entry data.

\n
\n
Parameters
\n

data (Dict[str, Any]) \u2013 The JSON data.

\n
\n
\n
\n\n
\n
\nclass dagster.MarkdownMetadataValue(md_str)[source]
\n

Container class for markdown metadata entry data.

\n
\n
Parameters
\n

md_str (Optional[str]) \u2013 The markdown as a string.

\n
\n
\n
\n\n
\n
\nclass dagster.PathMetadataValue(path)[source]
\n

Container class for path metadata entry data.

\n
\n
Parameters
\n

path (Optional[str]) \u2013 The path as a string or conforming to os.PathLike.

\n
\n
\n
\n\n
\n
\nclass dagster.PythonArtifactMetadataValue(module, name)[source]
\n

Container class for python artifact metadata entry data.

\n
\n
Parameters
\n
    \n
  • module (str) \u2013 The module where the python artifact can be found

  • \n
  • name (str) \u2013 The name of the python artifact

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.TableMetadataValue(records, schema)[source]
\n

Container class for table metadata entry data.

\n
\n
Parameters
\n
    \n
  • records (TableRecord) \u2013 The data as a list of records (i.e. rows).

  • \n
  • schema (Optional[TableSchema]) \u2013 A schema for the table.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.TableSchemaMetadataValue(schema)[source]
\n

Representation of a schema for arbitrary tabular data.

\n
\n
Parameters
\n

schema (TableSchema) \u2013 The dictionary containing the schema representation.

\n
\n
\n
\n\n
\n
\nclass dagster.TextMetadataValue(text)[source]
\n

Container class for text metadata entry data.

\n
\n
Parameters
\n

text (Optional[str]) \u2013 The text data.

\n
\n
\n
\n\n
\n
\nclass dagster.UrlMetadataValue(url)[source]
\n

Container class for URL metadata entry data.

\n
\n
Parameters
\n

url (Optional[str]) \u2013 The URL as a string.

\n
\n
\n
\n\n
\n
\n
\n

Asset key\u00b6

\n

Dagster uses AssetKey to build an index on Materialization events.\nAssets materialized with an AssetKey are highlighted in dagit on the Assets\ndashboard.

\n
\n
\nclass dagster.AssetKey(path)[source]
\n

Object representing the structure of an asset key. Takes in a sanitized string, list of\nstrings, or tuple of strings.

\n

Example usage:

\n
from dagster import op\n\n@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key=AssetKey('flat_asset_key'),\n        metadata={"text_metadata": "Text-based metadata for this event"},\n    )\n\n@op\ndef structured_asset_key(context, df):\n    yield AssetMaterialization(\n        asset_key=AssetKey(['parent', 'child', 'grandchild']),\n        metadata={"text_metadata": "Text-based metadata for this event"},\n    )\n\n@op\ndef structured_asset_key_2(context, df):\n    yield AssetMaterialization(\n        asset_key=AssetKey(('parent', 'child', 'grandchild')),\n        metadata={"text_metadata": "Text-based metadata for this event"},\n    )\n
\n
\n
\n
Parameters
\n

path (Sequence[str]) \u2013 String, list of strings, or tuple of strings. A list of strings\nrepresents the hierarchical structure of the asset_key.

\n
\n
\n
\n
\nto_string(legacy=False)[source]
\n

E.g. \u2018[\u201cfirst_component\u201d, \u201csecond_component\u201d]\u2019

\n
\n\n
\n
\nto_user_string()[source]
\n

E.g. \u201cfirst_component>second_component\u201d

\n
\n\n
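An illustrative sketch of the two serializations above (the commented outputs follow the E.g. strings):
\n
from dagster import AssetKey\n\nkey = AssetKey(["first_component", "second_component"])\n\nkey.to_string()       # '["first_component", "second_component"]'\nkey.to_user_string()  # 'first_component>second_component'\n
\n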
\n\n
\n
\n
\n", "current_page_name": "sections/api/apidocs/solids", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../dynamic/", "title": "Dynamic Mapping & Collect"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../schedules-sensors/", "title": "Run Requests"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/dynamic", "Dynamic Mapping & Collect", "N", "next"], ["sections/api/apidocs/schedules-sensors", "Run Requests", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/solids.rst.txt", "title": "[Legacy] Solids", "toc": "\n"}, "types": {"alabaster_version": "0.7.12", "body": "
\n

Types\u00b6

\n

Dagster includes facilities for typing the input and output values of ops (\u201cruntime\u201d types).

\n
\n

Built-in types\u00b6

\n
\n
\ndagster.Nothing\u00b6
\n

Use this type only for inputs and outputs, in order to establish an execution dependency without\ncommunicating a value. Inputs of this type will not be passed to the op compute function, so\nit is necessary to use the explicit In API to define them rather than\nthe Python 3 type hint syntax.

\n

All values are considered to be instances of Nothing.

\n

Examples:

\n
@op\ndef wait(_) -> Nothing:\n    time.sleep(1)\n    return\n\n@op(\n    ins={"ready": In(dagster_type=Nothing)},\n)\ndef done(_) -> str:\n    return 'done'\n\n@job\ndef nothing_job():\n    done(wait())\n\n# Any value will pass the type check for Nothing\n@op\ndef wait_int(_) -> Int:\n    time.sleep(1)\n    return 1\n\n@job\ndef nothing_int_job():\n    done(wait_int())\n
\n
\n
\n\n
\n
\nclass dagster.FileHandle[source]\u00b6
\n

A reference to a file as manipulated by a FileManager

\n

Subclasses may handle files that are resident on the local file system, in an object store, or\nin any arbitrary place where a file can be stored.

\n

This exists to handle the very common case where you wish to write a computation that reads,\ntransforms, and writes files, but where you also want the same code to work in local development\nas well as on a cluster where the files will be stored in a globally available object store\nsuch as S3.

\n
\n
\nabstract property path_desc\u00b6
\n

A representation of the file path for display purposes only.

\n
\n\n
\n\n
\n
\nclass dagster.LocalFileHandle(path)[source]\u00b6
\n

A reference to a file on a local filesystem.

\n
\n\n
\n
\n

Making New Types\u00b6

\n
\n
\nclass dagster.DagsterType(type_check_fn, key=None, name=None, is_builtin=False, description=None, loader=None, materializer=None, required_resource_keys=None, kind=<DagsterTypeKind.REGULAR: 'REGULAR'>, typing_type=None, metadata_entries=None, metadata=None)[source]\u00b6
\n

Define a type in dagster. These can be used in the inputs and outputs of ops.

\n
\n
Parameters
\n
    \n
  • type_check_fn (Callable[[TypeCheckContext, Any], [Union[bool, TypeCheck]]]) \u2013 The function that defines the type check. It takes the value flowing\nthrough the input or output of the op. If it passes, return either\nTrue or a TypeCheck with success set to True. If it fails,\nreturn either False or a TypeCheck with success set to False.\nThe first argument must be named context (or, if unused, _, _context, or context_).\nUse required_resource_keys for access to resources.

  • \n
  • key (Optional[str]) \u2013

    The unique key to identify types programmatically.\nThe key property always has a value. If you omit the key argument\nto the init function, it receives the value of name instead. If\nneither key nor name is provided, a CheckError is thrown.

    \n

    In the case of a generic type such as List or Optional, this is\ngenerated programmatically based on the type parameters.

    \n

    For most use cases, name should be set and the key argument should\nnot be specified.

    \n

  • \n
  • name (Optional[str]) \u2013 A unique name given by a user. If key is None, key\nbecomes this value. Name is not given in a case where the user does\nnot specify a unique name for this type, such as a generic class.

  • \n
  • description (Optional[str]) \u2013 A markdown-formatted string, displayed in tooling.

  • \n
  • loader (Optional[DagsterTypeLoader]) \u2013 An instance of a class that\ninherits from DagsterTypeLoader and can map config data to a value of\nthis type. Specify this argument if you will need to shim values of this type using the\nconfig machinery. As a rule, you should use the\n@dagster_type_loader decorator to construct\nthese arguments.

  • \n
  • materializer (Optional[DagsterTypeMaterializer]) \u2013 An instance of a class\nthat inherits from DagsterTypeMaterializer and can persist values of\nthis type. As a rule, you should use the\n@dagster_type_materializer\ndecorator to construct these arguments.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Resource keys required by the type_check_fn.

  • \n
  • is_builtin (bool) \u2013 Defaults to False. This is used by tools to display or\nfilter built-in types (such as String, Int) to visually distinguish\nthem from user-defined types. Meant for internal use.

  • \n
  • kind (DagsterTypeKind) \u2013 Defaults to DagsterTypeKind.REGULAR. This is used to determine the kind of runtime type\nfor InputDefinition and OutputDefinition type checking.

  • \n
  • typing_type \u2013 Defaults to None. A valid python typing type (e.g. Optional[List[int]]) for the\nvalue contained within the DagsterType. Meant for internal use.

  • \n
\n
\n
\n
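Example (a minimal sketch; the even-integer check is purely illustrative and not part of the API):
\n
from dagster import DagsterType\n\nEvenDagsterType = DagsterType(\n    name="EvenDagsterType",\n    type_check_fn=lambda _context, value: isinstance(value, int) and value % 2 == 0,\n    description="An int that is divisible by two.",\n)\n
\n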
\n\n
\n
\ndagster.PythonObjectDagsterType(python_type, key=None, name=None, **kwargs)[source]\u00b6
\n

Define a type in dagster whose typecheck is an isinstance check.

\n

Specifically, the type can either be a single python type (e.g. int),\nor a tuple of types (e.g. (int, float)) which is treated as a union.

\n

Examples

\n
ntype = PythonObjectDagsterType(python_type=int)\nassert ntype.name == 'int'\nassert_success(ntype, 1)\nassert_failure(ntype, 'a')\n
\n
\n
ntype = PythonObjectDagsterType(python_type=(int, float))\nassert ntype.name == 'Union[int, float]'\nassert_success(ntype, 1)\nassert_success(ntype, 1.5)\nassert_failure(ntype, 'a')\n
\n
\n
\n
Parameters
\n
    \n
  • python_type (Union[Type, Tuple[Type, ...]]) \u2013 The dagster typecheck function calls isinstance on\nthis type.

  • \n
  • name (Optional[str]) \u2013 Name the type. Defaults to the name of python_type.

  • \n
  • key (Optional[str]) \u2013 Key of the type. Defaults to name.

  • \n
  • description (Optional[str]) \u2013 A markdown-formatted string, displayed in tooling.

  • \n
  • loader (Optional[DagsterTypeLoader]) \u2013 An instance of a class that\ninherits from DagsterTypeLoader and can map config data to a value of\nthis type. Specify this argument if you will need to shim values of this type using the\nconfig machinery. As a rule, you should use the\n@dagster_type_loader decorator to construct\nthese arguments.

  • \n
  • materializer (Optional[DagsterTypeMaterializer]) \u2013 An instance of a class\nthat inherits from DagsterTypeMaterializer and can persist values of\nthis type. As a rule, you should use the\n@dagster_type_materializer\ndecorator to construct these arguments.

  • \n
\n
\n
\n
\n\n
\n
\ndagster.dagster_type_loader(config_schema, required_resource_keys=None, loader_version=None, external_version_fn=None)[source]\u00b6
\n

Create a dagster type loader that maps config data to a runtime value.

\n

The decorated function should take the execution context and parsed config value and return the\nappropriate runtime value.

\n
\n
Parameters
\n
    \n
  • config_schema (ConfigSchema) \u2013 The schema for the config that\u2019s passed to the decorated\nfunction.

  • \n
  • loader_version (str) \u2013 (Experimental) The version of the decorated compute function. Two\nloading functions should have the same version if and only if they deterministically\nproduce the same outputs when provided the same inputs.

  • \n
  • external_version_fn (Callable) \u2013 (Experimental) A function that takes in the same parameters as the loader\nfunction (config_value) and returns a representation of the version of the external\nasset (str). Two external assets with identical versions are treated as identical to one\nanother.

  • \n
\n
\n
\n

Examples:

\n
@dagster_type_loader(Permissive())\ndef load_dict(_context, value):\n    return value\n
\n
\n
\n\n
\n
\nclass dagster.DagsterTypeLoader[source]\u00b6
\n

Dagster type loaders are used to load unconnected inputs of the dagster type they are attached\nto.

\n

The recommended way to define a type loader is with the\n@dagster_type_loader decorator.

\n
\n\n
\n
\ndagster.dagster_type_materializer(config_schema, required_resource_keys=None)[source]\u00b6
\n

Create an output materializer that configurably materializes a runtime\nvalue.

\n

The decorated function should take the execution context, the parsed config value, and the\nruntime value. It should materialize the runtime value, and should\nreturn an appropriate AssetMaterialization.

\n
\n
Parameters
\n

config_schema (object) \u2013 The type of the config data expected by the decorated function.

\n
\n
\n

Examples:

\n
# Takes a list of dicts such as might be read in using csv.DictReader, as well as a config\n# value (the output path), and writes the data to a csv at that path.\n@dagster_type_materializer(str)\ndef materialize_df(_context, path, value):\n    with open(path, 'w') as fd:\n        writer = csv.DictWriter(fd, fieldnames=value[0].keys())\n        writer.writeheader()\n        writer.writerows(rowdicts=value)\n\n    return AssetMaterialization.file(path)\n
\n
\n
\n\n
\n
\nclass dagster.DagsterTypeMaterializer[source]\u00b6
\n

Dagster type materializers are used to materialize outputs of the dagster type they are attached\nto.

\n

The recommended way to define a type materializer is with the\n@dagster_type_materializer decorator.

\n
\n\n
\n
\ndagster.usable_as_dagster_type(name=None, description=None, loader=None, materializer=None)[source]\u00b6
\n

Decorate a Python class to make it usable as a Dagster Type.

\n

This is intended to make it straightforward to annotate existing business logic classes to\nmake them dagster types whose typecheck is an isinstance check against that python class.

\n
\n
Parameters
\n
    \n
  • python_type (cls) \u2013 The python type to make usable as a dagster type.

  • \n
  • name (Optional[str]) \u2013 Name of the new Dagster type. If None, the name (__name__) of\nthe python_type will be used.

  • \n
  • description (Optional[str]) \u2013 A user-readable description of the type.

  • \n
  • loader (Optional[DagsterTypeLoader]) \u2013 An instance of a class that\ninherits from DagsterTypeLoader and can map config data to a value of\nthis type. Specify this argument if you will need to shim values of this type using the\nconfig machinery. As a rule, you should use the\n@dagster_type_loader decorator to construct\nthese arguments.

  • \n
  • materializer (Optional[DagsterTypeMaterializer]) \u2013 An instance of a class\nthat inherits from DagsterTypeMaterializer and can persist values of\nthis type. As a rule, you should use the\n@dagster_type_materializer\ndecorator to construct these arguments.

  • \n
\n
\n
\n

Examples:

\n
# dagster_aws.s3.file_manager.S3FileHandle\n@usable_as_dagster_type\nclass S3FileHandle(FileHandle):\n    def __init__(self, s3_bucket, s3_key):\n        self._s3_bucket = check.str_param(s3_bucket, 's3_bucket')\n        self._s3_key = check.str_param(s3_key, 's3_key')\n\n    @property\n    def s3_bucket(self):\n        return self._s3_bucket\n\n    @property\n    def s3_key(self):\n        return self._s3_key\n\n    @property\n    def path_desc(self):\n        return self.s3_path\n\n    @property\n    def s3_path(self):\n        return 's3://{bucket}/{key}'.format(bucket=self.s3_bucket, key=self.s3_key)\n
\n
\n
\n\n
\n
\ndagster.make_python_type_usable_as_dagster_type(python_type, dagster_type)[source]\u00b6
\n

Take any existing python type and map it to a dagster type (generally created with\nDagsterType). This can only be called once\non a given python type.

\n
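Example (a hedged sketch; MyBusinessObject is a hypothetical business-logic class, and its DagsterType is constructed only for illustration):
\n
from dagster import DagsterType, make_python_type_usable_as_dagster_type\n\nMyBusinessObjectDagsterType = DagsterType(\n    name="MyBusinessObject",\n    type_check_fn=lambda _context, value: isinstance(value, MyBusinessObject),\n)\n\nmake_python_type_usable_as_dagster_type(\n    python_type=MyBusinessObject,\n    dagster_type=MyBusinessObjectDagsterType,\n)\n
\n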
\n\n
\n

Testing Types\u00b6

\n
\n
\ndagster.check_dagster_type(dagster_type, value)[source]\u00b6
\n

Test a custom Dagster type.

\n
\n
Parameters
\n
    \n
  • dagster_type (Any) \u2013 The Dagster type to test. Should be one of the\nbuilt-in types, a dagster type explicitly constructed with\nas_dagster_type(), @usable_as_dagster_type, or\nPythonObjectDagsterType(), or a Python type.

  • \n
  • value (Any) \u2013 The runtime value to test.

  • \n
\n
\n
Returns
\n

The result of the type check.

\n
\n
Return type
\n

TypeCheck

\n
\n
\n

Examples

\n
assert check_dagster_type(Dict[Any, Any], {'foo': 'bar'}).success\n
\n
\n
\n\n
\n
\n
\n", "current_page_name": "sections/api/apidocs/types", "customsidebar": null, "display_toc": true, "meta": null, "metatags": "\n", "next": {"link": "../utilities/", "title": "Utilities"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dynamic/", "title": "Dynamic Mapping & Collect"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/utilities", "Utilities", "N", "next"], ["sections/api/apidocs/dynamic", "Dynamic Mapping & Collect", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/types.rst.txt", "title": "Types", "toc": "\n"}, "utilities": {"alabaster_version": "0.7.12", "body": "
\n

Utilities\u00b6

\n
\n
\ndagster.file_relative_path(dunderfile, relative_path)[source]\u00b6
\n

Get a path relative to the currently executing Python file.

\n

This function is useful when one needs to load a file that is relative to the position of\nthe current file (such as when you encode a configuration file path in a source file and want\nit to be runnable from any current working directory).

\n
\n
Parameters
\n
    \n
  • dunderfile (str) \u2013 Should always be __file__.

  • \n
  • relative_path (str) \u2013 Path to get relative to the currently executing file.

  • \n
\n
\n
\n

Examples:

\n
file_relative_path(__file__, 'path/relative/to/file')\n
\n
\n
\n\n
\n
\ndagster.config_from_files(config_files)[source]\u00b6
\n

Constructs run config from YAML files.

\n
\n
Parameters
\n

config_files (List[str]) \u2013 List of paths or glob patterns for yaml files\nto load and parse as the run config.

\n
\n
Returns
\n

A run config dictionary constructed from provided YAML files.

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
Raises
\n
\n
\n
\n
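Example (a minimal sketch; the YAML file names are placeholders):
\n
from dagster import config_from_files, file_relative_path\n\nrun_config = config_from_files(\n    [\n        file_relative_path(__file__, "run_config_base.yaml"),\n        file_relative_path(__file__, "run_config_overrides.yaml"),\n    ]\n)\n
\n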
\n\n
\n
\ndagster.config_from_pkg_resources(pkg_resource_defs)[source]\u00b6
\n

Load a run config from a package resource, using pkg_resources.resource_string().

\n

Example:

\n
config_from_pkg_resources(\n    pkg_resource_defs=[\n        ('dagster_examples.airline_demo.environments', 'local_base.yaml'),\n        ('dagster_examples.airline_demo.environments', 'local_warehouse.yaml'),\n    ],\n)\n
\n
\n
\n
Parameters
\n

pkg_resource_defs (List[(str, str)]) \u2013 List of pkg_resource modules/files to\nload as the run config.

\n
\n
Returns
\n

A run config dictionary constructed from the provided YAML strings.

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
Raises
\n

DagsterInvariantViolationError \u2013 When one of the YAML documents is invalid and has a\n parse error.

\n
\n
\n
\n\n
\n
\ndagster.config_from_yaml_strings(yaml_strings)[source]\u00b6
\n

Static constructor for run configs from YAML strings.

\n
\n
Parameters
\n

yaml_strings (List[str]) \u2013 List of yaml strings to parse as the run config.

\n
\n
Returns
\n

A run config dictionary constructed from the provided YAML strings.

\n
\n
Return type
\n

Dict[str, Any]

\n
\n
Raises
\n

DagsterInvariantViolationError \u2013 When one of the YAML documents is invalid and has a\n parse error.

\n
\n
\n
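Example (a minimal sketch; the op name and config key are placeholders, and flow-style YAML keeps the document on a single line):
\n
from dagster import config_from_yaml_strings\n\nrun_config = config_from_yaml_strings(\n    ['ops: {my_op: {config: {param: 5}}}']\n)\n
\n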
\n\n
\n
\ndagster.get_dagster_logger(name=None)[source]\u00b6
\n

Creates a python logger whose output messages will be captured and converted into Dagster log\nmessages. This means they will have structured information such as the step_key, run_id, etc.\nembedded into them, and will show up in the Dagster event log.

\n

This can be used as a more convenient alternative to context.log in most cases. If log level\nis not set explicitly, defaults to DEBUG.

\n
\n
Parameters
\n

name (Optional[str]) \u2013 If supplied, will create a logger with the name \u201cdagster.builtin.{name}\u201d,\nwith properties inherited from the base Dagster logger. If omitted, the returned logger\nwill be named \u201cdagster.builtin\u201d.

\n
\n
Returns
\n

A logger whose output will be captured by Dagster.

\n
\n
Return type
\n

logging.Logger

\n
\n
\n

Example

\n
from dagster import get_dagster_logger, op\n\n@op\ndef hello_op():\n    log = get_dagster_logger()\n    for i in range(5):\n        # do something\n        log.info(f"Did {i+1} things!")\n
\n
\n
\n\n
\n
\nclass dagster.ExperimentalWarning[source]\u00b6
\n
\n\n
\n
\nclass dagster.utils.forked_pdb.ForkedPdb(completekey='tab', stdin=None, stdout=None, skip=None, nosigint=False, readrc=True)[source]\u00b6
\n

A pdb subclass that may be used from a forked multiprocessing child process.

\n

Examples:

\n
from dagster.utils.forked_pdb import ForkedPdb\n\n@solid\ndef complex_solid(_):\n    # some complicated stuff\n\n    ForkedPdb().set_trace()\n\n    # some other complicated stuff\n
\n
\n

You can initiate pipeline execution via dagit and use the pdb debugger to examine/step through\nexecution at the breakpoint.

\n
\n\n
\n
\ndagster.utils.make_email_on_run_failure_sensor(*args, **kwargs)[source]\u00b6
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/utilities", "customsidebar": null, "display_toc": false, "meta": null, "metatags": "\n", "next": {"link": "../memoization/", "title": "Versioning and Memoization"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../types/", "title": "Types"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/memoization", "Versioning and Memoization", "N", "next"], ["sections/api/apidocs/types", "Types", "P", "previous"]], "sidebars": ["globaltoc.html", "searchbox.html"], "sourcename": "sections/api/apidocs/utilities.rst.txt", "title": "Utilities", "toc": "\n"}}}} \ No newline at end of file diff --git a/docs/content/concepts/assets/software-defined-assets.mdx b/docs/content/concepts/assets/software-defined-assets.mdx index bae3068a1ff44..9dee46e8ee25b 100644 --- a/docs/content/concepts/assets/software-defined-assets.mdx +++ b/docs/content/concepts/assets/software-defined-assets.mdx @@ -207,7 +207,7 @@ Different assets can have different IO managers: ```python file=/concepts/assets/asset_different_io_managers.py startafter=start_marker endbefore=end_marker from dagster_aws.s3 import s3_pickle_asset_io_manager, s3_resource -from dagster import AssetGroup, asset, fs_asset_io_manager +from dagster import AssetGroup, asset, fs_io_manager @asset(io_manager_key="s3_io_manager") @@ -225,7 +225,7 @@ asset_group = AssetGroup( resource_defs={ "s3_io_manager": s3_pickle_asset_io_manager, "s3": s3_resource, - "fs_io_manager": fs_asset_io_manager, + "fs_io_manager": fs_io_manager, }, ) ``` @@ -237,7 +237,7 @@ The same assets can be bound to different resources and IO managers in different ```python file=/concepts/assets/asset_io_manager_prod_local.py startafter=start_marker endbefore=end_marker from dagster_aws.s3 import s3_pickle_asset_io_manager, s3_resource -from dagster import AssetGroup, asset, fs_asset_io_manager +from dagster import AssetGroup, asset, fs_io_manager @asset @@ -257,7 +257,7 @@ prod_asset_group = AssetGroup( local_asset_group = AssetGroup( [upstream_asset, downstream_asset], - resource_defs={"io_manager": fs_asset_io_manager}, + resource_defs={"io_manager": fs_io_manager}, ) ``` diff --git a/docs/next/public/objects.inv b/docs/next/public/objects.inv index fcead88bae58e..170a92da62226 100644 Binary files a/docs/next/public/objects.inv and b/docs/next/public/objects.inv differ diff --git a/docs/sphinx/sections/api/apidocs/assets.rst b/docs/sphinx/sections/api/apidocs/assets.rst index 184d1e9ee5024..26f23590a8ff8 100644 --- a/docs/sphinx/sections/api/apidocs/assets.rst +++ b/docs/sphinx/sections/api/apidocs/assets.rst @@ -24,6 +24,3 @@ A software-defined asset combines: .. autoclass:: AssetIn .. autoclass:: SourceAsset - -.. 
autoconfigurable:: fs_asset_io_manager - :annotation: IOManagerDefinition diff --git a/examples/docs_snippets/docs_snippets/concepts/assets/asset_different_io_managers.py b/examples/docs_snippets/docs_snippets/concepts/assets/asset_different_io_managers.py index a12d694c56160..085ac07a98d61 100644 --- a/examples/docs_snippets/docs_snippets/concepts/assets/asset_different_io_managers.py +++ b/examples/docs_snippets/docs_snippets/concepts/assets/asset_different_io_managers.py @@ -2,7 +2,7 @@ # start_marker from dagster_aws.s3 import s3_pickle_asset_io_manager, s3_resource -from dagster import AssetGroup, asset, fs_asset_io_manager +from dagster import AssetGroup, asset, fs_io_manager @asset(io_manager_key="s3_io_manager") @@ -20,7 +20,7 @@ def downstream_asset(upstream_asset): resource_defs={ "s3_io_manager": s3_pickle_asset_io_manager, "s3": s3_resource, - "fs_io_manager": fs_asset_io_manager, + "fs_io_manager": fs_io_manager, }, ) diff --git a/examples/docs_snippets/docs_snippets/concepts/assets/asset_io_manager_prod_local.py b/examples/docs_snippets/docs_snippets/concepts/assets/asset_io_manager_prod_local.py index 7e0d992bb147a..2948acb847801 100644 --- a/examples/docs_snippets/docs_snippets/concepts/assets/asset_io_manager_prod_local.py +++ b/examples/docs_snippets/docs_snippets/concepts/assets/asset_io_manager_prod_local.py @@ -2,7 +2,7 @@ # start_marker from dagster_aws.s3 import s3_pickle_asset_io_manager, s3_resource -from dagster import AssetGroup, asset, fs_asset_io_manager +from dagster import AssetGroup, asset, fs_io_manager @asset @@ -22,7 +22,7 @@ def downstream_asset(upstream_asset): local_asset_group = AssetGroup( [upstream_asset, downstream_asset], - resource_defs={"io_manager": fs_asset_io_manager}, + resource_defs={"io_manager": fs_io_manager}, ) # end_marker diff --git a/python_modules/dagster-graphql/dagster_graphql_tests/graphql/snapshots/snap_test_all_snapshot_ids.py b/python_modules/dagster-graphql/dagster_graphql_tests/graphql/snapshots/snap_test_all_snapshot_ids.py index 1e41165554976..3cfedd11d9a3b 100644 --- a/python_modules/dagster-graphql/dagster_graphql_tests/graphql/snapshots/snap_test_all_snapshot_ids.py +++ b/python_modules/dagster-graphql/dagster_graphql_tests/graphql/snapshots/snap_test_all_snapshot_ids.py @@ -1532,29 +1532,6 @@ "scalar_kind": null, "type_param_keys": null }, - "Selector.2571019f1a5201853d11032145ac3e534067f214": { - "__class__": "ConfigTypeSnap", - "description": null, - "enum_values": null, - "fields": [ - { - "__class__": "ConfigFieldSnap", - "default_provided": false, - "default_value_as_json_str": null, - "description": null, - "is_required": true, - "name": "env", - "type_key": "String" - } - ], - "given_name": null, - "key": "Selector.2571019f1a5201853d11032145ac3e534067f214", - "kind": { - "__enum__": "ConfigTypeKind.SELECTOR" - }, - "scalar_kind": null, - "type_param_keys": null - }, "Selector.a9799b971d12ace70a2d8803c883c863417d0725": { "__class__": "ConfigTypeSnap", "description": null, @@ -1792,23 +1769,23 @@ "scalar_kind": null, "type_param_keys": null }, - "Shape.18b2faaf1efd505374f7f25fcb61ed59bd5be851": { + "Shape.0bb49540f1708dcf5378009c9571eba999502e19": { "__class__": "ConfigTypeSnap", "description": null, "enum_values": null, "fields": [ { "__class__": "ConfigFieldSnap", - "default_provided": false, - "default_value_as_json_str": null, + "default_provided": true, + "default_value_as_json_str": "{}", "description": null, "is_required": false, - "name": "base_dir", - "type_key": "StringSourceType" + "name": "io_manager", 
+ "type_key": "Shape.743e47901855cb245064dd633e217bfcb49a11a7" } ], "given_name": null, - "key": "Shape.18b2faaf1efd505374f7f25fcb61ed59bd5be851", + "key": "Shape.0bb49540f1708dcf5378009c9571eba999502e19", "kind": { "__enum__": "ConfigTypeKind.STRICT_SHAPE" }, @@ -1916,30 +1893,30 @@ "scalar_kind": null, "type_param_keys": null }, - "Shape.44f2a71367507edd1b8e64f739222c4312b3691b": { + "Shape.45a8f1f21db73ecbfa5b4e07b9aedc1835cef1ef": { "__class__": "ConfigTypeSnap", "description": null, "enum_values": null, "fields": [ { "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{}", - "description": null, + "default_provided": false, + "default_value_as_json_str": null, + "description": "Explicit modules to preload in the forkserver.", "is_required": false, - "name": "config", - "type_key": "Shape.18b2faaf1efd505374f7f25fcb61ed59bd5be851" + "name": "preload_modules", + "type_key": "Array.String" } ], "given_name": null, - "key": "Shape.44f2a71367507edd1b8e64f739222c4312b3691b", + "key": "Shape.45a8f1f21db73ecbfa5b4e07b9aedc1835cef1ef", "kind": { "__enum__": "ConfigTypeKind.STRICT_SHAPE" }, "scalar_kind": null, "type_param_keys": null }, - "Shape.45a8f1f21db73ecbfa5b4e07b9aedc1835cef1ef": { + "Shape.4b53b73df342381d0d05c5f36183dc99cb9676e2": { "__class__": "ConfigTypeSnap", "description": null, "enum_values": null, @@ -1948,21 +1925,21 @@ "__class__": "ConfigFieldSnap", "default_provided": false, "default_value_as_json_str": null, - "description": "Explicit modules to preload in the forkserver.", - "is_required": false, - "name": "preload_modules", - "type_key": "Array.String" + "description": null, + "is_required": true, + "name": "path", + "type_key": "String" } ], "given_name": null, - "key": "Shape.45a8f1f21db73ecbfa5b4e07b9aedc1835cef1ef", + "key": "Shape.4b53b73df342381d0d05c5f36183dc99cb9676e2", "kind": { "__enum__": "ConfigTypeKind.STRICT_SHAPE" }, "scalar_kind": null, "type_param_keys": null }, - "Shape.4b53b73df342381d0d05c5f36183dc99cb9676e2": { + "Shape.743e47901855cb245064dd633e217bfcb49a11a7": { "__class__": "ConfigTypeSnap", "description": null, "enum_values": null, @@ -1972,13 +1949,13 @@ "default_provided": false, "default_value_as_json_str": null, "description": null, - "is_required": true, - "name": "path", - "type_key": "String" + "is_required": false, + "name": "config", + "type_key": "Any" } ], "given_name": null, - "key": "Shape.4b53b73df342381d0d05c5f36183dc99cb9676e2", + "key": "Shape.743e47901855cb245064dd633e217bfcb49a11a7", "kind": { "__enum__": "ConfigTypeKind.STRICT_SHAPE" }, @@ -2040,29 +2017,6 @@ "scalar_kind": null, "type_param_keys": null }, - "Shape.c2c57770aaa8b396a9e2db0762cc977ca34ead8a": { - "__class__": "ConfigTypeSnap", - "description": null, - "enum_values": null, - "fields": [ - { - "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{\\"config\\": {}}", - "description": null, - "is_required": false, - "name": "io_manager", - "type_key": "Shape.44f2a71367507edd1b8e64f739222c4312b3691b" - } - ], - "given_name": null, - "key": "Shape.c2c57770aaa8b396a9e2db0762cc977ca34ead8a", - "kind": { - "__enum__": "ConfigTypeKind.STRICT_SHAPE" - }, - "scalar_kind": null, - "type_param_keys": null - }, "Shape.ca9d191bc601d7df07b309fbcc1a5848eafee07a": { "__class__": "ConfigTypeSnap", "description": null, @@ -2130,59 +2084,6 @@ "scalar_kind": null, "type_param_keys": null }, - "Shape.d4a721f9d08bf017662d6c0726bd4773b200c0c4": { - "__class__": "ConfigTypeSnap", - "description": null, - 
"enum_values": null, - "field_aliases": { - "ops": "solids" - }, - "fields": [ - { - "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{\\"config\\": {\\"multiprocess\\": {\\"max_concurrent\\": 0, \\"retries\\": {\\"enabled\\": {}}}}}", - "description": null, - "is_required": false, - "name": "execution", - "type_key": "Shape.7abfc8561f904c5622a52b6e52f7d99e3cb10416" - }, - { - "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{}", - "description": null, - "is_required": false, - "name": "loggers", - "type_key": "Shape.ebeaf4550c200fb540f2e1f3f2110debd8c4157c" - }, - { - "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{\\"downstream_static_partitioned_asset\\": {\\"config\\": {\\"assets\\": {}}}, \\"upstream_static_partitioned_asset\\": {\\"config\\": {\\"assets\\": {}}}}", - "description": null, - "is_required": false, - "name": "ops", - "type_key": "Shape.d1d0be17df836fd3955284404a7c6179490dbc5d" - }, - { - "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{\\"io_manager\\": {\\"config\\": {}}}", - "description": null, - "is_required": false, - "name": "resources", - "type_key": "Shape.c2c57770aaa8b396a9e2db0762cc977ca34ead8a" - } - ], - "given_name": null, - "key": "Shape.d4a721f9d08bf017662d6c0726bd4773b200c0c4", - "kind": { - "__enum__": "ConfigTypeKind.STRICT_SHAPE" - }, - "scalar_kind": null, - "type_param_keys": null - }, "Shape.da39a3ee5e6b4b0d3255bfef95601890afd80709": { "__class__": "ConfigTypeSnap", "description": null, @@ -2272,6 +2173,59 @@ "scalar_kind": null, "type_param_keys": null }, + "Shape.e8d48eaf16c5bef8233f6a7f1da87e14d971ae52": { + "__class__": "ConfigTypeSnap", + "description": null, + "enum_values": null, + "field_aliases": { + "ops": "solids" + }, + "fields": [ + { + "__class__": "ConfigFieldSnap", + "default_provided": true, + "default_value_as_json_str": "{\\"config\\": {\\"multiprocess\\": {\\"max_concurrent\\": 0, \\"retries\\": {\\"enabled\\": {}}}}}", + "description": null, + "is_required": false, + "name": "execution", + "type_key": "Shape.7abfc8561f904c5622a52b6e52f7d99e3cb10416" + }, + { + "__class__": "ConfigFieldSnap", + "default_provided": true, + "default_value_as_json_str": "{}", + "description": null, + "is_required": false, + "name": "loggers", + "type_key": "Shape.ebeaf4550c200fb540f2e1f3f2110debd8c4157c" + }, + { + "__class__": "ConfigFieldSnap", + "default_provided": true, + "default_value_as_json_str": "{\\"downstream_static_partitioned_asset\\": {\\"config\\": {\\"assets\\": {}}}, \\"upstream_static_partitioned_asset\\": {\\"config\\": {\\"assets\\": {}}}}", + "description": null, + "is_required": false, + "name": "ops", + "type_key": "Shape.d1d0be17df836fd3955284404a7c6179490dbc5d" + }, + { + "__class__": "ConfigFieldSnap", + "default_provided": true, + "default_value_as_json_str": "{\\"io_manager\\": {}}", + "description": null, + "is_required": false, + "name": "resources", + "type_key": "Shape.0bb49540f1708dcf5378009c9571eba999502e19" + } + ], + "given_name": null, + "key": "Shape.e8d48eaf16c5bef8233f6a7f1da87e14d971ae52", + "kind": { + "__enum__": "ConfigTypeKind.STRICT_SHAPE" + }, + "scalar_kind": null, + "type_param_keys": null + }, "Shape.ebeaf4550c200fb540f2e1f3f2110debd8c4157c": { "__class__": "ConfigTypeSnap", "description": null, @@ -2309,22 +2263,6 @@ "__enum__": "ConfigScalarKind.STRING" }, "type_param_keys": null - }, - "StringSourceType": { - "__class__": 
"ConfigTypeSnap", - "description": null, - "enum_values": null, - "fields": null, - "given_name": null, - "key": "StringSourceType", - "kind": { - "__enum__": "ConfigTypeKind.SCALAR_UNION" - }, - "scalar_kind": null, - "type_param_keys": [ - "String", - "Selector.2571019f1a5201853d11032145ac3e534067f214" - ] } } }, @@ -2480,18 +2418,18 @@ "__class__": "ResourceDefSnap", "config_field_snap": { "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{}", + "default_provided": false, + "default_value_as_json_str": null, "description": null, "is_required": false, "name": "config", - "type_key": "Shape.18b2faaf1efd505374f7f25fcb61ed59bd5be851" + "type_key": "Any" }, - "description": null, + "description": "The default io manager for Jobs. Uses filesystem but switches to in-memory when invoked through execute_in_process.", "name": "io_manager" } ], - "root_config_key": "Shape.d4a721f9d08bf017662d6c0726bd4773b200c0c4" + "root_config_key": "Shape.e8d48eaf16c5bef8233f6a7f1da87e14d971ae52" } ], "name": "static_partitioned_assets_job", @@ -2565,7 +2503,7 @@ "tags": {} }''' -snapshots['test_all_snapshot_ids 102'] = '2444cd6e8ee3a485042dec19acb00bed2035c789' +snapshots['test_all_snapshot_ids 102'] = '21eda44c8d0057a607c3e338a4d6d9288b17122f' snapshots['test_all_snapshot_ids 103'] = '''{ "__class__": "PipelineSnapshot", @@ -4027,7 +3965,7 @@ "scalar_kind": null, "type_param_keys": null }, - "Selector.2571019f1a5201853d11032145ac3e534067f214": { + "Selector.a9799b971d12ace70a2d8803c883c863417d0725": { "__class__": "ConfigTypeSnap", "description": null, "enum_values": null, @@ -4038,19 +3976,37 @@ "default_value_as_json_str": null, "description": null, "is_required": true, - "name": "env", - "type_key": "String" + "name": "json", + "type_key": "Shape.4b53b73df342381d0d05c5f36183dc99cb9676e2" + }, + { + "__class__": "ConfigFieldSnap", + "default_provided": false, + "default_value_as_json_str": null, + "description": null, + "is_required": true, + "name": "pickle", + "type_key": "Shape.4b53b73df342381d0d05c5f36183dc99cb9676e2" + }, + { + "__class__": "ConfigFieldSnap", + "default_provided": false, + "default_value_as_json_str": null, + "description": null, + "is_required": true, + "name": "value", + "type_key": "Int" } ], "given_name": null, - "key": "Selector.2571019f1a5201853d11032145ac3e534067f214", + "key": "Selector.a9799b971d12ace70a2d8803c883c863417d0725", "kind": { "__enum__": "ConfigTypeKind.SELECTOR" }, "scalar_kind": null, "type_param_keys": null }, - "Selector.a9799b971d12ace70a2d8803c883c863417d0725": { + "Selector.be5d518b39e86a43c5f2eecaf538c1f6c7711b59": { "__class__": "ConfigTypeSnap", "description": null, "enum_values": null, @@ -4080,18 +4036,18 @@ "description": null, "is_required": true, "name": "value", - "type_key": "Int" + "type_key": "Bool" } ], "given_name": null, - "key": "Selector.a9799b971d12ace70a2d8803c883c863417d0725", + "key": "Selector.be5d518b39e86a43c5f2eecaf538c1f6c7711b59", "kind": { "__enum__": "ConfigTypeKind.SELECTOR" }, "scalar_kind": null, "type_param_keys": null }, - "Selector.be5d518b39e86a43c5f2eecaf538c1f6c7711b59": { + "Selector.d00a37e3807d37c9f69cc62997c4a5f4a176e5c3": { "__class__": "ConfigTypeSnap", "description": null, "enum_values": null, @@ -4121,59 +4077,18 @@ "description": null, "is_required": true, "name": "value", - "type_key": "Bool" + "type_key": "Float" } ], "given_name": null, - "key": "Selector.be5d518b39e86a43c5f2eecaf538c1f6c7711b59", + "key": "Selector.d00a37e3807d37c9f69cc62997c4a5f4a176e5c3", "kind": { 
"__enum__": "ConfigTypeKind.SELECTOR" }, "scalar_kind": null, "type_param_keys": null }, - "Selector.d00a37e3807d37c9f69cc62997c4a5f4a176e5c3": { - "__class__": "ConfigTypeSnap", - "description": null, - "enum_values": null, - "fields": [ - { - "__class__": "ConfigFieldSnap", - "default_provided": false, - "default_value_as_json_str": null, - "description": null, - "is_required": true, - "name": "json", - "type_key": "Shape.4b53b73df342381d0d05c5f36183dc99cb9676e2" - }, - { - "__class__": "ConfigFieldSnap", - "default_provided": false, - "default_value_as_json_str": null, - "description": null, - "is_required": true, - "name": "pickle", - "type_key": "Shape.4b53b73df342381d0d05c5f36183dc99cb9676e2" - }, - { - "__class__": "ConfigFieldSnap", - "default_provided": false, - "default_value_as_json_str": null, - "description": null, - "is_required": true, - "name": "value", - "type_key": "Float" - } - ], - "given_name": null, - "key": "Selector.d00a37e3807d37c9f69cc62997c4a5f4a176e5c3", - "kind": { - "__enum__": "ConfigTypeKind.SELECTOR" - }, - "scalar_kind": null, - "type_param_keys": null - }, - "Selector.e04723c9d9937e3ab21206435b22247cfbe58269": { + "Selector.e04723c9d9937e3ab21206435b22247cfbe58269": { "__class__": "ConfigTypeSnap", "description": null, "enum_values": null, @@ -4287,23 +4202,23 @@ "scalar_kind": null, "type_param_keys": null }, - "Shape.18b2faaf1efd505374f7f25fcb61ed59bd5be851": { + "Shape.0bb49540f1708dcf5378009c9571eba999502e19": { "__class__": "ConfigTypeSnap", "description": null, "enum_values": null, "fields": [ { "__class__": "ConfigFieldSnap", - "default_provided": false, - "default_value_as_json_str": null, + "default_provided": true, + "default_value_as_json_str": "{}", "description": null, "is_required": false, - "name": "base_dir", - "type_key": "StringSourceType" + "name": "io_manager", + "type_key": "Shape.743e47901855cb245064dd633e217bfcb49a11a7" } ], "given_name": null, - "key": "Shape.18b2faaf1efd505374f7f25fcb61ed59bd5be851", + "key": "Shape.0bb49540f1708dcf5378009c9571eba999502e19", "kind": { "__enum__": "ConfigTypeKind.STRICT_SHAPE" }, @@ -4411,29 +4326,6 @@ "scalar_kind": null, "type_param_keys": null }, - "Shape.44f2a71367507edd1b8e64f739222c4312b3691b": { - "__class__": "ConfigTypeSnap", - "description": null, - "enum_values": null, - "fields": [ - { - "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{}", - "description": null, - "is_required": false, - "name": "config", - "type_key": "Shape.18b2faaf1efd505374f7f25fcb61ed59bd5be851" - } - ], - "given_name": null, - "key": "Shape.44f2a71367507edd1b8e64f739222c4312b3691b", - "kind": { - "__enum__": "ConfigTypeKind.STRICT_SHAPE" - }, - "scalar_kind": null, - "type_param_keys": null - }, "Shape.45a8f1f21db73ecbfa5b4e07b9aedc1835cef1ef": { "__class__": "ConfigTypeSnap", "description": null, @@ -4515,6 +4407,29 @@ "scalar_kind": null, "type_param_keys": null }, + "Shape.743e47901855cb245064dd633e217bfcb49a11a7": { + "__class__": "ConfigTypeSnap", + "description": null, + "enum_values": null, + "fields": [ + { + "__class__": "ConfigFieldSnap", + "default_provided": false, + "default_value_as_json_str": null, + "description": null, + "is_required": false, + "name": "config", + "type_key": "Any" + } + ], + "given_name": null, + "key": "Shape.743e47901855cb245064dd633e217bfcb49a11a7", + "kind": { + "__enum__": "ConfigTypeKind.STRICT_SHAPE" + }, + "scalar_kind": null, + "type_param_keys": null + }, "Shape.7abfc8561f904c5622a52b6e52f7d99e3cb10416": { "__class__": 
"ConfigTypeSnap", "description": null, @@ -4570,7 +4485,7 @@ "scalar_kind": null, "type_param_keys": null }, - "Shape.c048443824ba12bebc8209078f139654748959fa": { + "Shape.c32ecdf3803737dc99db4dced6ad9cf858e6eeb8": { "__class__": "ConfigTypeSnap", "description": null, "enum_values": null, @@ -4608,38 +4523,15 @@ { "__class__": "ConfigFieldSnap", "default_provided": true, - "default_value_as_json_str": "{\\"io_manager\\": {\\"config\\": {}}}", + "default_value_as_json_str": "{\\"io_manager\\": {}}", "description": null, "is_required": false, "name": "resources", - "type_key": "Shape.c2c57770aaa8b396a9e2db0762cc977ca34ead8a" - } - ], - "given_name": null, - "key": "Shape.c048443824ba12bebc8209078f139654748959fa", - "kind": { - "__enum__": "ConfigTypeKind.STRICT_SHAPE" - }, - "scalar_kind": null, - "type_param_keys": null - }, - "Shape.c2c57770aaa8b396a9e2db0762cc977ca34ead8a": { - "__class__": "ConfigTypeSnap", - "description": null, - "enum_values": null, - "fields": [ - { - "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{\\"config\\": {}}", - "description": null, - "is_required": false, - "name": "io_manager", - "type_key": "Shape.44f2a71367507edd1b8e64f739222c4312b3691b" + "type_key": "Shape.0bb49540f1708dcf5378009c9571eba999502e19" } ], "given_name": null, - "key": "Shape.c2c57770aaa8b396a9e2db0762cc977ca34ead8a", + "key": "Shape.c32ecdf3803737dc99db4dced6ad9cf858e6eeb8", "kind": { "__enum__": "ConfigTypeKind.STRICT_SHAPE" }, @@ -4804,22 +4696,6 @@ "__enum__": "ConfigScalarKind.STRING" }, "type_param_keys": null - }, - "StringSourceType": { - "__class__": "ConfigTypeSnap", - "description": null, - "enum_values": null, - "fields": null, - "given_name": null, - "key": "StringSourceType", - "kind": { - "__enum__": "ConfigTypeKind.SCALAR_UNION" - }, - "scalar_kind": null, - "type_param_keys": [ - "String", - "Selector.2571019f1a5201853d11032145ac3e534067f214" - ] } } }, @@ -4975,18 +4851,18 @@ "__class__": "ResourceDefSnap", "config_field_snap": { "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{}", + "default_provided": false, + "default_value_as_json_str": null, "description": null, "is_required": false, "name": "config", - "type_key": "Shape.18b2faaf1efd505374f7f25fcb61ed59bd5be851" + "type_key": "Any" }, - "description": null, + "description": "The default io manager for Jobs. 
Uses filesystem but switches to in-memory when invoked through execute_in_process.", "name": "io_manager" } ], - "root_config_key": "Shape.c048443824ba12bebc8209078f139654748959fa" + "root_config_key": "Shape.c32ecdf3803737dc99db4dced6ad9cf858e6eeb8" } ], "name": "time_partitioned_assets_job", @@ -5060,7 +4936,7 @@ "tags": {} }''' -snapshots['test_all_snapshot_ids 106'] = '1f77c9ddd70dc8b2494cc6fdb77f57ad90a83ecf' +snapshots['test_all_snapshot_ids 106'] = '0267d39827d9712d98e95b5c3c6b4e97ea774b48' snapshots['test_all_snapshot_ids 107'] = '''{ "__class__": "PipelineSnapshot", @@ -5328,29 +5204,6 @@ "scalar_kind": null, "type_param_keys": null }, - "Selector.2571019f1a5201853d11032145ac3e534067f214": { - "__class__": "ConfigTypeSnap", - "description": null, - "enum_values": null, - "fields": [ - { - "__class__": "ConfigFieldSnap", - "default_provided": false, - "default_value_as_json_str": null, - "description": null, - "is_required": true, - "name": "env", - "type_key": "String" - } - ], - "given_name": null, - "key": "Selector.2571019f1a5201853d11032145ac3e534067f214", - "kind": { - "__enum__": "ConfigTypeKind.SELECTOR" - }, - "scalar_kind": null, - "type_param_keys": null - }, "Selector.a9799b971d12ace70a2d8803c883c863417d0725": { "__class__": "ConfigTypeSnap", "description": null, @@ -5588,23 +5441,76 @@ "scalar_kind": null, "type_param_keys": null }, - "Shape.18b2faaf1efd505374f7f25fcb61ed59bd5be851": { + "Shape.0bb49540f1708dcf5378009c9571eba999502e19": { "__class__": "ConfigTypeSnap", "description": null, "enum_values": null, "fields": [ { "__class__": "ConfigFieldSnap", - "default_provided": false, - "default_value_as_json_str": null, + "default_provided": true, + "default_value_as_json_str": "{}", "description": null, "is_required": false, - "name": "base_dir", - "type_key": "StringSourceType" + "name": "io_manager", + "type_key": "Shape.743e47901855cb245064dd633e217bfcb49a11a7" } ], "given_name": null, - "key": "Shape.18b2faaf1efd505374f7f25fcb61ed59bd5be851", + "key": "Shape.0bb49540f1708dcf5378009c9571eba999502e19", + "kind": { + "__enum__": "ConfigTypeKind.STRICT_SHAPE" + }, + "scalar_kind": null, + "type_param_keys": null + }, + "Shape.16e12bae6ff7814aaed1bebfa46941a05d9a5f29": { + "__class__": "ConfigTypeSnap", + "description": null, + "enum_values": null, + "field_aliases": { + "ops": "solids" + }, + "fields": [ + { + "__class__": "ConfigFieldSnap", + "default_provided": true, + "default_value_as_json_str": "{\\"config\\": {\\"multiprocess\\": {\\"max_concurrent\\": 0, \\"retries\\": {\\"enabled\\": {}}}}}", + "description": null, + "is_required": false, + "name": "execution", + "type_key": "Shape.7abfc8561f904c5622a52b6e52f7d99e3cb10416" + }, + { + "__class__": "ConfigFieldSnap", + "default_provided": true, + "default_value_as_json_str": "{}", + "description": null, + "is_required": false, + "name": "loggers", + "type_key": "Shape.ebeaf4550c200fb540f2e1f3f2110debd8c4157c" + }, + { + "__class__": "ConfigFieldSnap", + "default_provided": true, + "default_value_as_json_str": "{\\"asset_one\\": {\\"config\\": {\\"assets\\": {}}}, \\"asset_two\\": {\\"config\\": {\\"assets\\": {}}}}", + "description": null, + "is_required": false, + "name": "ops", + "type_key": "Shape.c00b7646bc5339cff1920a8df0cfddf2fcde42e2" + }, + { + "__class__": "ConfigFieldSnap", + "default_provided": true, + "default_value_as_json_str": "{\\"io_manager\\": {}}", + "description": null, + "is_required": false, + "name": "resources", + "type_key": "Shape.0bb49540f1708dcf5378009c9571eba999502e19" + } + ], + 
"given_name": null, + "key": "Shape.16e12bae6ff7814aaed1bebfa46941a05d9a5f29", "kind": { "__enum__": "ConfigTypeKind.STRICT_SHAPE" }, @@ -5712,29 +5618,6 @@ "scalar_kind": null, "type_param_keys": null }, - "Shape.44f2a71367507edd1b8e64f739222c4312b3691b": { - "__class__": "ConfigTypeSnap", - "description": null, - "enum_values": null, - "fields": [ - { - "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{}", - "description": null, - "is_required": false, - "name": "config", - "type_key": "Shape.18b2faaf1efd505374f7f25fcb61ed59bd5be851" - } - ], - "given_name": null, - "key": "Shape.44f2a71367507edd1b8e64f739222c4312b3691b", - "kind": { - "__enum__": "ConfigTypeKind.STRICT_SHAPE" - }, - "scalar_kind": null, - "type_param_keys": null - }, "Shape.45a8f1f21db73ecbfa5b4e07b9aedc1835cef1ef": { "__class__": "ConfigTypeSnap", "description": null, @@ -5781,6 +5664,29 @@ "scalar_kind": null, "type_param_keys": null }, + "Shape.743e47901855cb245064dd633e217bfcb49a11a7": { + "__class__": "ConfigTypeSnap", + "description": null, + "enum_values": null, + "fields": [ + { + "__class__": "ConfigFieldSnap", + "default_provided": false, + "default_value_as_json_str": null, + "description": null, + "is_required": false, + "name": "config", + "type_key": "Any" + } + ], + "given_name": null, + "key": "Shape.743e47901855cb245064dd633e217bfcb49a11a7", + "kind": { + "__enum__": "ConfigTypeKind.STRICT_SHAPE" + }, + "scalar_kind": null, + "type_param_keys": null + }, "Shape.7abfc8561f904c5622a52b6e52f7d99e3cb10416": { "__class__": "ConfigTypeSnap", "description": null, @@ -5871,29 +5777,6 @@ "scalar_kind": null, "type_param_keys": null }, - "Shape.c2c57770aaa8b396a9e2db0762cc977ca34ead8a": { - "__class__": "ConfigTypeSnap", - "description": null, - "enum_values": null, - "fields": [ - { - "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{\\"config\\": {}}", - "description": null, - "is_required": false, - "name": "io_manager", - "type_key": "Shape.44f2a71367507edd1b8e64f739222c4312b3691b" - } - ], - "given_name": null, - "key": "Shape.c2c57770aaa8b396a9e2db0762cc977ca34ead8a", - "kind": { - "__enum__": "ConfigTypeKind.STRICT_SHAPE" - }, - "scalar_kind": null, - "type_param_keys": null - }, "Shape.ca9d191bc601d7df07b309fbcc1a5848eafee07a": { "__class__": "ConfigTypeSnap", "description": null, @@ -5926,59 +5809,6 @@ "scalar_kind": null, "type_param_keys": null }, - "Shape.d05a0b7c58b83745b3811ee98957c2e77edc67d3": { - "__class__": "ConfigTypeSnap", - "description": null, - "enum_values": null, - "field_aliases": { - "ops": "solids" - }, - "fields": [ - { - "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{\\"config\\": {\\"multiprocess\\": {\\"max_concurrent\\": 0, \\"retries\\": {\\"enabled\\": {}}}}}", - "description": null, - "is_required": false, - "name": "execution", - "type_key": "Shape.7abfc8561f904c5622a52b6e52f7d99e3cb10416" - }, - { - "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{}", - "description": null, - "is_required": false, - "name": "loggers", - "type_key": "Shape.ebeaf4550c200fb540f2e1f3f2110debd8c4157c" - }, - { - "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{\\"asset_one\\": {\\"config\\": {\\"assets\\": {}}}, \\"asset_two\\": {\\"config\\": {\\"assets\\": {}}}}", - "description": null, - "is_required": false, - "name": "ops", - "type_key": 
"Shape.c00b7646bc5339cff1920a8df0cfddf2fcde42e2" - }, - { - "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{\\"io_manager\\": {\\"config\\": {}}}", - "description": null, - "is_required": false, - "name": "resources", - "type_key": "Shape.c2c57770aaa8b396a9e2db0762cc977ca34ead8a" - } - ], - "given_name": null, - "key": "Shape.d05a0b7c58b83745b3811ee98957c2e77edc67d3", - "kind": { - "__enum__": "ConfigTypeKind.STRICT_SHAPE" - }, - "scalar_kind": null, - "type_param_keys": null - }, "Shape.da39a3ee5e6b4b0d3255bfef95601890afd80709": { "__class__": "ConfigTypeSnap", "description": null, @@ -6105,22 +5935,6 @@ "__enum__": "ConfigScalarKind.STRING" }, "type_param_keys": null - }, - "StringSourceType": { - "__class__": "ConfigTypeSnap", - "description": null, - "enum_values": null, - "fields": null, - "given_name": null, - "key": "StringSourceType", - "kind": { - "__enum__": "ConfigTypeKind.SCALAR_UNION" - }, - "scalar_kind": null, - "type_param_keys": [ - "String", - "Selector.2571019f1a5201853d11032145ac3e534067f214" - ] } } }, @@ -6276,18 +6090,18 @@ "__class__": "ResourceDefSnap", "config_field_snap": { "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{}", + "default_provided": false, + "default_value_as_json_str": null, "description": null, "is_required": false, "name": "config", - "type_key": "Shape.18b2faaf1efd505374f7f25fcb61ed59bd5be851" + "type_key": "Any" }, - "description": null, + "description": "The default io manager for Jobs. Uses filesystem but switches to in-memory when invoked through execute_in_process.", "name": "io_manager" } ], - "root_config_key": "Shape.d05a0b7c58b83745b3811ee98957c2e77edc67d3" + "root_config_key": "Shape.16e12bae6ff7814aaed1bebfa46941a05d9a5f29" } ], "name": "two_assets_job", @@ -6361,7 +6175,7 @@ "tags": {} }''' -snapshots['test_all_snapshot_ids 108'] = '220f9b72d9a9eda6978d06e2b0fd4e0377ef00b7' +snapshots['test_all_snapshot_ids 108'] = 'b5348738039801022047d1ccd6056bb70debe6e7' snapshots['test_all_snapshot_ids 109'] = '''{ "__class__": "PipelineSnapshot", @@ -19313,29 +19127,6 @@ "scalar_kind": null, "type_param_keys": null }, - "Selector.2571019f1a5201853d11032145ac3e534067f214": { - "__class__": "ConfigTypeSnap", - "description": null, - "enum_values": null, - "fields": [ - { - "__class__": "ConfigFieldSnap", - "default_provided": false, - "default_value_as_json_str": null, - "description": null, - "is_required": true, - "name": "env", - "type_key": "String" - } - ], - "given_name": null, - "key": "Selector.2571019f1a5201853d11032145ac3e534067f214", - "kind": { - "__enum__": "ConfigTypeKind.SELECTOR" - }, - "scalar_kind": null, - "type_param_keys": null - }, "Selector.a9799b971d12ace70a2d8803c883c863417d0725": { "__class__": "ConfigTypeSnap", "description": null, @@ -19573,76 +19364,23 @@ "scalar_kind": null, "type_param_keys": null }, - "Shape.0929a132e623f4a614df1b772407d84539d88f32": { + "Shape.0bb49540f1708dcf5378009c9571eba999502e19": { "__class__": "ConfigTypeSnap", "description": null, "enum_values": null, - "field_aliases": { - "ops": "solids" - }, "fields": [ - { - "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{\\"config\\": {\\"retries\\": {\\"enabled\\": {}}}}", - "description": null, - "is_required": false, - "name": "execution", - "type_key": "Shape.ca5906d9a0377218b4ee7d940ad55957afa73d1b" - }, { "__class__": "ConfigFieldSnap", "default_provided": true, "default_value_as_json_str": "{}", "description": 
null, "is_required": false, - "name": "loggers", - "type_key": "Shape.ebeaf4550c200fb540f2e1f3f2110debd8c4157c" - }, - { - "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{\\"asset_1\\": {\\"config\\": {\\"assets\\": {}}}, \\"asset_2\\": {\\"config\\": {\\"assets\\": {}}}, \\"asset_3\\": {\\"config\\": {\\"assets\\": {}}}}", - "description": null, - "is_required": false, - "name": "ops", - "type_key": "Shape.8a2efcb8020b1315994e06ca4f6f1a9714b5c564" - }, - { - "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{\\"io_manager\\": {\\"config\\": {}}}", - "description": null, - "is_required": false, - "name": "resources", - "type_key": "Shape.c2c57770aaa8b396a9e2db0762cc977ca34ead8a" - } - ], - "given_name": null, - "key": "Shape.0929a132e623f4a614df1b772407d84539d88f32", - "kind": { - "__enum__": "ConfigTypeKind.STRICT_SHAPE" - }, - "scalar_kind": null, - "type_param_keys": null - }, - "Shape.18b2faaf1efd505374f7f25fcb61ed59bd5be851": { - "__class__": "ConfigTypeSnap", - "description": null, - "enum_values": null, - "fields": [ - { - "__class__": "ConfigFieldSnap", - "default_provided": false, - "default_value_as_json_str": null, - "description": null, - "is_required": false, - "name": "base_dir", - "type_key": "StringSourceType" + "name": "io_manager", + "type_key": "Shape.743e47901855cb245064dd633e217bfcb49a11a7" } ], "given_name": null, - "key": "Shape.18b2faaf1efd505374f7f25fcb61ed59bd5be851", + "key": "Shape.0bb49540f1708dcf5378009c9571eba999502e19", "kind": { "__enum__": "ConfigTypeKind.STRICT_SHAPE" }, @@ -19750,30 +19488,30 @@ "scalar_kind": null, "type_param_keys": null }, - "Shape.44f2a71367507edd1b8e64f739222c4312b3691b": { + "Shape.4b53b73df342381d0d05c5f36183dc99cb9676e2": { "__class__": "ConfigTypeSnap", "description": null, "enum_values": null, "fields": [ { "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{}", + "default_provided": false, + "default_value_as_json_str": null, "description": null, - "is_required": false, - "name": "config", - "type_key": "Shape.18b2faaf1efd505374f7f25fcb61ed59bd5be851" + "is_required": true, + "name": "path", + "type_key": "String" } ], "given_name": null, - "key": "Shape.44f2a71367507edd1b8e64f739222c4312b3691b", + "key": "Shape.4b53b73df342381d0d05c5f36183dc99cb9676e2", "kind": { "__enum__": "ConfigTypeKind.STRICT_SHAPE" }, "scalar_kind": null, "type_param_keys": null }, - "Shape.4b53b73df342381d0d05c5f36183dc99cb9676e2": { + "Shape.743e47901855cb245064dd633e217bfcb49a11a7": { "__class__": "ConfigTypeSnap", "description": null, "enum_values": null, @@ -19783,13 +19521,13 @@ "default_provided": false, "default_value_as_json_str": null, "description": null, - "is_required": true, - "name": "path", - "type_key": "String" + "is_required": false, + "name": "config", + "type_key": "Any" } ], "given_name": null, - "key": "Shape.4b53b73df342381d0d05c5f36183dc99cb9676e2", + "key": "Shape.743e47901855cb245064dd633e217bfcb49a11a7", "kind": { "__enum__": "ConfigTypeKind.STRICT_SHAPE" }, @@ -19872,23 +19610,53 @@ "scalar_kind": null, "type_param_keys": null }, - "Shape.c2c57770aaa8b396a9e2db0762cc977ca34ead8a": { + "Shape.be0226c78e2c46d790762d39e34c3a69fcac2ed7": { "__class__": "ConfigTypeSnap", "description": null, "enum_values": null, + "field_aliases": { + "ops": "solids" + }, "fields": [ { "__class__": "ConfigFieldSnap", "default_provided": true, - "default_value_as_json_str": "{\\"config\\": {}}", + 
"default_value_as_json_str": "{\\"config\\": {\\"retries\\": {\\"enabled\\": {}}}}", "description": null, "is_required": false, - "name": "io_manager", - "type_key": "Shape.44f2a71367507edd1b8e64f739222c4312b3691b" + "name": "execution", + "type_key": "Shape.ca5906d9a0377218b4ee7d940ad55957afa73d1b" + }, + { + "__class__": "ConfigFieldSnap", + "default_provided": true, + "default_value_as_json_str": "{}", + "description": null, + "is_required": false, + "name": "loggers", + "type_key": "Shape.ebeaf4550c200fb540f2e1f3f2110debd8c4157c" + }, + { + "__class__": "ConfigFieldSnap", + "default_provided": true, + "default_value_as_json_str": "{\\"asset_1\\": {\\"config\\": {\\"assets\\": {}}}, \\"asset_2\\": {\\"config\\": {\\"assets\\": {}}}, \\"asset_3\\": {\\"config\\": {\\"assets\\": {}}}}", + "description": null, + "is_required": false, + "name": "ops", + "type_key": "Shape.8a2efcb8020b1315994e06ca4f6f1a9714b5c564" + }, + { + "__class__": "ConfigFieldSnap", + "default_provided": true, + "default_value_as_json_str": "{\\"io_manager\\": {}}", + "description": null, + "is_required": false, + "name": "resources", + "type_key": "Shape.0bb49540f1708dcf5378009c9571eba999502e19" } ], "given_name": null, - "key": "Shape.c2c57770aaa8b396a9e2db0762cc977ca34ead8a", + "key": "Shape.be0226c78e2c46d790762d39e34c3a69fcac2ed7", "kind": { "__enum__": "ConfigTypeKind.STRICT_SHAPE" }, @@ -20035,22 +19803,6 @@ "__enum__": "ConfigScalarKind.STRING" }, "type_param_keys": null - }, - "StringSourceType": { - "__class__": "ConfigTypeSnap", - "description": null, - "enum_values": null, - "fields": null, - "given_name": null, - "key": "StringSourceType", - "kind": { - "__enum__": "ConfigTypeKind.SCALAR_UNION" - }, - "scalar_kind": null, - "type_param_keys": [ - "String", - "Selector.2571019f1a5201853d11032145ac3e534067f214" - ] } } }, @@ -20227,18 +19979,18 @@ "__class__": "ResourceDefSnap", "config_field_snap": { "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{}", + "default_provided": false, + "default_value_as_json_str": null, "description": null, "is_required": false, "name": "config", - "type_key": "Shape.18b2faaf1efd505374f7f25fcb61ed59bd5be851" + "type_key": "Any" }, - "description": null, + "description": "The default io manager for Jobs. 
Uses filesystem but switches to in-memory when invoked through execute_in_process.", "name": "io_manager" } ], - "root_config_key": "Shape.0929a132e623f4a614df1b772407d84539d88f32" + "root_config_key": "Shape.be0226c78e2c46d790762d39e34c3a69fcac2ed7" } ], "name": "failure_assets_job", @@ -20346,7 +20098,7 @@ "tags": {} }''' -snapshots['test_all_snapshot_ids 28'] = '5be5753d8de18f51e5a6fa4f8c0335e382668d09' +snapshots['test_all_snapshot_ids 28'] = '0b8039ca48374969758913e48592da484fdb2066' snapshots['test_all_snapshot_ids 29'] = '''{ "__class__": "PipelineSnapshot", @@ -45699,29 +45451,6 @@ "scalar_kind": null, "type_param_keys": null }, - "Selector.2571019f1a5201853d11032145ac3e534067f214": { - "__class__": "ConfigTypeSnap", - "description": null, - "enum_values": null, - "fields": [ - { - "__class__": "ConfigFieldSnap", - "default_provided": false, - "default_value_as_json_str": null, - "description": null, - "is_required": true, - "name": "env", - "type_key": "String" - } - ], - "given_name": null, - "key": "Selector.2571019f1a5201853d11032145ac3e534067f214", - "kind": { - "__enum__": "ConfigTypeKind.SELECTOR" - }, - "scalar_kind": null, - "type_param_keys": null - }, "Selector.a9799b971d12ace70a2d8803c883c863417d0725": { "__class__": "ConfigTypeSnap", "description": null, @@ -45959,23 +45688,23 @@ "scalar_kind": null, "type_param_keys": null }, - "Shape.18b2faaf1efd505374f7f25fcb61ed59bd5be851": { + "Shape.0bb49540f1708dcf5378009c9571eba999502e19": { "__class__": "ConfigTypeSnap", "description": null, "enum_values": null, "fields": [ { "__class__": "ConfigFieldSnap", - "default_provided": false, - "default_value_as_json_str": null, + "default_provided": true, + "default_value_as_json_str": "{}", "description": null, "is_required": false, - "name": "base_dir", - "type_key": "StringSourceType" + "name": "io_manager", + "type_key": "Shape.743e47901855cb245064dd633e217bfcb49a11a7" } ], "given_name": null, - "key": "Shape.18b2faaf1efd505374f7f25fcb61ed59bd5be851", + "key": "Shape.0bb49540f1708dcf5378009c9571eba999502e19", "kind": { "__enum__": "ConfigTypeKind.STRICT_SHAPE" }, @@ -46083,29 +45812,6 @@ "scalar_kind": null, "type_param_keys": null }, - "Shape.44f2a71367507edd1b8e64f739222c4312b3691b": { - "__class__": "ConfigTypeSnap", - "description": null, - "enum_values": null, - "fields": [ - { - "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{}", - "description": null, - "is_required": false, - "name": "config", - "type_key": "Shape.18b2faaf1efd505374f7f25fcb61ed59bd5be851" - } - ], - "given_name": null, - "key": "Shape.44f2a71367507edd1b8e64f739222c4312b3691b", - "kind": { - "__enum__": "ConfigTypeKind.STRICT_SHAPE" - }, - "scalar_kind": null, - "type_param_keys": null - }, "Shape.4b53b73df342381d0d05c5f36183dc99cb9676e2": { "__class__": "ConfigTypeSnap", "description": null, @@ -46155,53 +45861,23 @@ "scalar_kind": null, "type_param_keys": null }, - "Shape.80012d09e3ecdb07bcfce83cdcb67abfb679be6a": { + "Shape.743e47901855cb245064dd633e217bfcb49a11a7": { "__class__": "ConfigTypeSnap", "description": null, "enum_values": null, - "field_aliases": { - "ops": "solids" - }, "fields": [ { "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{\\"config\\": {\\"retries\\": {\\"enabled\\": {}}}}", - "description": null, - "is_required": false, - "name": "execution", - "type_key": "Shape.ca5906d9a0377218b4ee7d940ad55957afa73d1b" - }, - { - "__class__": "ConfigFieldSnap", - "default_provided": true, - 
"default_value_as_json_str": "{}", - "description": null, - "is_required": false, - "name": "loggers", - "type_key": "Shape.ebeaf4550c200fb540f2e1f3f2110debd8c4157c" - }, - { - "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{\\"asset_yields_observation\\": {\\"config\\": {\\"assets\\": {}}}}", - "description": null, - "is_required": false, - "name": "ops", - "type_key": "Shape.4f9ad6532eed9c82c3769f22d887430c0c39efdb" - }, - { - "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{\\"io_manager\\": {\\"config\\": {}}}", + "default_provided": false, + "default_value_as_json_str": null, "description": null, "is_required": false, - "name": "resources", - "type_key": "Shape.c2c57770aaa8b396a9e2db0762cc977ca34ead8a" + "name": "config", + "type_key": "Any" } ], "given_name": null, - "key": "Shape.80012d09e3ecdb07bcfce83cdcb67abfb679be6a", + "key": "Shape.743e47901855cb245064dd633e217bfcb49a11a7", "kind": { "__enum__": "ConfigTypeKind.STRICT_SHAPE" }, @@ -46240,23 +45916,53 @@ "scalar_kind": null, "type_param_keys": null }, - "Shape.c2c57770aaa8b396a9e2db0762cc977ca34ead8a": { + "Shape.b5735ccfd76d0e04168df7bfb62ad9b02dc6a61a": { "__class__": "ConfigTypeSnap", "description": null, "enum_values": null, + "field_aliases": { + "ops": "solids" + }, "fields": [ { "__class__": "ConfigFieldSnap", "default_provided": true, - "default_value_as_json_str": "{\\"config\\": {}}", + "default_value_as_json_str": "{\\"config\\": {\\"retries\\": {\\"enabled\\": {}}}}", "description": null, "is_required": false, - "name": "io_manager", - "type_key": "Shape.44f2a71367507edd1b8e64f739222c4312b3691b" + "name": "execution", + "type_key": "Shape.ca5906d9a0377218b4ee7d940ad55957afa73d1b" + }, + { + "__class__": "ConfigFieldSnap", + "default_provided": true, + "default_value_as_json_str": "{}", + "description": null, + "is_required": false, + "name": "loggers", + "type_key": "Shape.ebeaf4550c200fb540f2e1f3f2110debd8c4157c" + }, + { + "__class__": "ConfigFieldSnap", + "default_provided": true, + "default_value_as_json_str": "{\\"asset_yields_observation\\": {\\"config\\": {\\"assets\\": {}}}}", + "description": null, + "is_required": false, + "name": "ops", + "type_key": "Shape.4f9ad6532eed9c82c3769f22d887430c0c39efdb" + }, + { + "__class__": "ConfigFieldSnap", + "default_provided": true, + "default_value_as_json_str": "{\\"io_manager\\": {}}", + "description": null, + "is_required": false, + "name": "resources", + "type_key": "Shape.0bb49540f1708dcf5378009c9571eba999502e19" } ], "given_name": null, - "key": "Shape.c2c57770aaa8b396a9e2db0762cc977ca34ead8a", + "key": "Shape.b5735ccfd76d0e04168df7bfb62ad9b02dc6a61a", "kind": { "__enum__": "ConfigTypeKind.STRICT_SHAPE" }, @@ -46403,22 +46109,6 @@ "__enum__": "ConfigScalarKind.STRING" }, "type_param_keys": null - }, - "StringSourceType": { - "__class__": "ConfigTypeSnap", - "description": null, - "enum_values": null, - "fields": null, - "given_name": null, - "key": "StringSourceType", - "kind": { - "__enum__": "ConfigTypeKind.SCALAR_UNION" - }, - "scalar_kind": null, - "type_param_keys": [ - "String", - "Selector.2571019f1a5201853d11032145ac3e534067f214" - ] } } }, @@ -46553,18 +46243,18 @@ "__class__": "ResourceDefSnap", "config_field_snap": { "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{}", + "default_provided": false, + "default_value_as_json_str": null, "description": null, "is_required": false, "name": "config", - "type_key": 
"Shape.18b2faaf1efd505374f7f25fcb61ed59bd5be851" + "type_key": "Any" }, - "description": null, + "description": "The default io manager for Jobs. Uses filesystem but switches to in-memory when invoked through execute_in_process.", "name": "io_manager" } ], - "root_config_key": "Shape.80012d09e3ecdb07bcfce83cdcb67abfb679be6a" + "root_config_key": "Shape.b5735ccfd76d0e04168df7bfb62ad9b02dc6a61a" } ], "name": "observation_job", @@ -46604,7 +46294,7 @@ "tags": {} }''' -snapshots['test_all_snapshot_ids 66'] = 'b2364440a2f5697d036689317aec67ded278b3d1' +snapshots['test_all_snapshot_ids 66'] = '39351c77de767109e211227ed4d22de35568433f' snapshots['test_all_snapshot_ids 67'] = '''{ "__class__": "PipelineSnapshot", @@ -46793,29 +46483,6 @@ "scalar_kind": null, "type_param_keys": null }, - "Selector.2571019f1a5201853d11032145ac3e534067f214": { - "__class__": "ConfigTypeSnap", - "description": null, - "enum_values": null, - "fields": [ - { - "__class__": "ConfigFieldSnap", - "default_provided": false, - "default_value_as_json_str": null, - "description": null, - "is_required": true, - "name": "env", - "type_key": "String" - } - ], - "given_name": null, - "key": "Selector.2571019f1a5201853d11032145ac3e534067f214", - "kind": { - "__enum__": "ConfigTypeKind.SELECTOR" - }, - "scalar_kind": null, - "type_param_keys": null - }, "Selector.a9799b971d12ace70a2d8803c883c863417d0725": { "__class__": "ConfigTypeSnap", "description": null, @@ -46928,185 +46595,91 @@ "description": null, "is_required": true, "name": "value", - "type_key": "Float" - } - ], - "given_name": null, - "key": "Selector.d00a37e3807d37c9f69cc62997c4a5f4a176e5c3", - "kind": { - "__enum__": "ConfigTypeKind.SELECTOR" - }, - "scalar_kind": null, - "type_param_keys": null - }, - "Selector.e04723c9d9937e3ab21206435b22247cfbe58269": { - "__class__": "ConfigTypeSnap", - "description": null, - "enum_values": null, - "fields": [ - { - "__class__": "ConfigFieldSnap", - "default_provided": false, - "default_value_as_json_str": null, - "description": null, - "is_required": true, - "name": "json", - "type_key": "Shape.4b53b73df342381d0d05c5f36183dc99cb9676e2" - }, - { - "__class__": "ConfigFieldSnap", - "default_provided": false, - "default_value_as_json_str": null, - "description": null, - "is_required": true, - "name": "pickle", - "type_key": "Shape.4b53b73df342381d0d05c5f36183dc99cb9676e2" - }, - { - "__class__": "ConfigFieldSnap", - "default_provided": false, - "default_value_as_json_str": null, - "description": null, - "is_required": true, - "name": "value", - "type_key": "String" - } - ], - "given_name": null, - "key": "Selector.e04723c9d9937e3ab21206435b22247cfbe58269", - "kind": { - "__enum__": "ConfigTypeKind.SELECTOR" - }, - "scalar_kind": null, - "type_param_keys": null - }, - "Selector.e52fa3afbe531d9522fae1206f3ae9d248775742": { - "__class__": "ConfigTypeSnap", - "description": null, - "enum_values": null, - "fields": [ - { - "__class__": "ConfigFieldSnap", - "default_provided": false, - "default_value_as_json_str": null, - "description": null, - "is_required": true, - "name": "json", - "type_key": "Shape.4b53b73df342381d0d05c5f36183dc99cb9676e2" - }, - { - "__class__": "ConfigFieldSnap", - "default_provided": false, - "default_value_as_json_str": null, - "description": null, - "is_required": true, - "name": "pickle", - "type_key": "Shape.4b53b73df342381d0d05c5f36183dc99cb9676e2" - } - ], - "given_name": null, - "key": "Selector.e52fa3afbe531d9522fae1206f3ae9d248775742", - "kind": { - "__enum__": "ConfigTypeKind.SELECTOR" - }, - 
"scalar_kind": null, - "type_param_keys": null - }, - "Selector.f2fe6dfdc60a1947a8f8e7cd377a012b47065bc4": { - "__class__": "ConfigTypeSnap", - "description": null, - "enum_values": null, - "fields": [ - { - "__class__": "ConfigFieldSnap", - "default_provided": false, - "default_value_as_json_str": null, - "description": null, - "is_required": true, - "name": "json", - "type_key": "Shape.4b53b73df342381d0d05c5f36183dc99cb9676e2" - }, - { - "__class__": "ConfigFieldSnap", - "default_provided": false, - "default_value_as_json_str": null, - "description": null, - "is_required": true, - "name": "pickle", - "type_key": "Shape.4b53b73df342381d0d05c5f36183dc99cb9676e2" - }, - { - "__class__": "ConfigFieldSnap", - "default_provided": false, - "default_value_as_json_str": null, - "description": null, - "is_required": true, - "name": "value", - "type_key": "Any" + "type_key": "Float" } ], "given_name": null, - "key": "Selector.f2fe6dfdc60a1947a8f8e7cd377a012b47065bc4", + "key": "Selector.d00a37e3807d37c9f69cc62997c4a5f4a176e5c3", "kind": { "__enum__": "ConfigTypeKind.SELECTOR" }, "scalar_kind": null, "type_param_keys": null }, - "Shape.010fcd6801d5ab6c9cd17d68100e8657d84005c6": { + "Selector.e04723c9d9937e3ab21206435b22247cfbe58269": { "__class__": "ConfigTypeSnap", "description": null, "enum_values": null, - "field_aliases": { - "ops": "solids" - }, "fields": [ { "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{\\"config\\": {\\"retries\\": {\\"enabled\\": {}}}}", + "default_provided": false, + "default_value_as_json_str": null, "description": null, - "is_required": false, - "name": "execution", - "type_key": "Shape.ca5906d9a0377218b4ee7d940ad55957afa73d1b" + "is_required": true, + "name": "json", + "type_key": "Shape.4b53b73df342381d0d05c5f36183dc99cb9676e2" }, { "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{}", + "default_provided": false, + "default_value_as_json_str": null, "description": null, - "is_required": false, - "name": "loggers", - "type_key": "Shape.ebeaf4550c200fb540f2e1f3f2110debd8c4157c" + "is_required": true, + "name": "pickle", + "type_key": "Shape.4b53b73df342381d0d05c5f36183dc99cb9676e2" }, { "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{\\"yield_partition_materialization\\": {\\"config\\": {\\"assets\\": {}}}}", + "default_provided": false, + "default_value_as_json_str": null, "description": null, - "is_required": false, - "name": "ops", - "type_key": "Shape.5592faa25ec1a64904a69cac388ede7438da6e21" + "is_required": true, + "name": "value", + "type_key": "String" + } + ], + "given_name": null, + "key": "Selector.e04723c9d9937e3ab21206435b22247cfbe58269", + "kind": { + "__enum__": "ConfigTypeKind.SELECTOR" + }, + "scalar_kind": null, + "type_param_keys": null + }, + "Selector.e52fa3afbe531d9522fae1206f3ae9d248775742": { + "__class__": "ConfigTypeSnap", + "description": null, + "enum_values": null, + "fields": [ + { + "__class__": "ConfigFieldSnap", + "default_provided": false, + "default_value_as_json_str": null, + "description": null, + "is_required": true, + "name": "json", + "type_key": "Shape.4b53b73df342381d0d05c5f36183dc99cb9676e2" }, { "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{\\"io_manager\\": {\\"config\\": {}}}", + "default_provided": false, + "default_value_as_json_str": null, "description": null, - "is_required": false, - "name": "resources", - "type_key": 
"Shape.c2c57770aaa8b396a9e2db0762cc977ca34ead8a" + "is_required": true, + "name": "pickle", + "type_key": "Shape.4b53b73df342381d0d05c5f36183dc99cb9676e2" } ], "given_name": null, - "key": "Shape.010fcd6801d5ab6c9cd17d68100e8657d84005c6", + "key": "Selector.e52fa3afbe531d9522fae1206f3ae9d248775742", "kind": { - "__enum__": "ConfigTypeKind.STRICT_SHAPE" + "__enum__": "ConfigTypeKind.SELECTOR" }, "scalar_kind": null, "type_param_keys": null }, - "Shape.18b2faaf1efd505374f7f25fcb61ed59bd5be851": { + "Selector.f2fe6dfdc60a1947a8f8e7cd377a012b47065bc4": { "__class__": "ConfigTypeSnap", "description": null, "enum_values": null, @@ -47116,13 +46689,54 @@ "default_provided": false, "default_value_as_json_str": null, "description": null, + "is_required": true, + "name": "json", + "type_key": "Shape.4b53b73df342381d0d05c5f36183dc99cb9676e2" + }, + { + "__class__": "ConfigFieldSnap", + "default_provided": false, + "default_value_as_json_str": null, + "description": null, + "is_required": true, + "name": "pickle", + "type_key": "Shape.4b53b73df342381d0d05c5f36183dc99cb9676e2" + }, + { + "__class__": "ConfigFieldSnap", + "default_provided": false, + "default_value_as_json_str": null, + "description": null, + "is_required": true, + "name": "value", + "type_key": "Any" + } + ], + "given_name": null, + "key": "Selector.f2fe6dfdc60a1947a8f8e7cd377a012b47065bc4", + "kind": { + "__enum__": "ConfigTypeKind.SELECTOR" + }, + "scalar_kind": null, + "type_param_keys": null + }, + "Shape.0bb49540f1708dcf5378009c9571eba999502e19": { + "__class__": "ConfigTypeSnap", + "description": null, + "enum_values": null, + "fields": [ + { + "__class__": "ConfigFieldSnap", + "default_provided": true, + "default_value_as_json_str": "{}", + "description": null, "is_required": false, - "name": "base_dir", - "type_key": "StringSourceType" + "name": "io_manager", + "type_key": "Shape.743e47901855cb245064dd633e217bfcb49a11a7" } ], "given_name": null, - "key": "Shape.18b2faaf1efd505374f7f25fcb61ed59bd5be851", + "key": "Shape.0bb49540f1708dcf5378009c9571eba999502e19", "kind": { "__enum__": "ConfigTypeKind.STRICT_SHAPE" }, @@ -47230,29 +46844,6 @@ "scalar_kind": null, "type_param_keys": null }, - "Shape.44f2a71367507edd1b8e64f739222c4312b3691b": { - "__class__": "ConfigTypeSnap", - "description": null, - "enum_values": null, - "fields": [ - { - "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{}", - "description": null, - "is_required": false, - "name": "config", - "type_key": "Shape.18b2faaf1efd505374f7f25fcb61ed59bd5be851" - } - ], - "given_name": null, - "key": "Shape.44f2a71367507edd1b8e64f739222c4312b3691b", - "kind": { - "__enum__": "ConfigTypeKind.STRICT_SHAPE" - }, - "scalar_kind": null, - "type_param_keys": null - }, "Shape.4b53b73df342381d0d05c5f36183dc99cb9676e2": { "__class__": "ConfigTypeSnap", "description": null, @@ -47302,7 +46893,7 @@ "scalar_kind": null, "type_param_keys": null }, - "Shape.979b3d2fece4f3eb92e90f2ec9fb4c85efe9ea5c": { + "Shape.743e47901855cb245064dd633e217bfcb49a11a7": { "__class__": "ConfigTypeSnap", "description": null, "enum_values": null, @@ -47313,44 +46904,44 @@ "default_value_as_json_str": null, "description": null, "is_required": false, - "name": "marker_to_close", - "type_key": "String" - }, - { - "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{\\"enabled\\": {}}", - "description": null, - "is_required": false, - "name": "retries", - "type_key": "Selector.1bfb167aea90780aa679597800c71bd8c65ed0b2" + "name": 
"config", + "type_key": "Any" } ], "given_name": null, - "key": "Shape.979b3d2fece4f3eb92e90f2ec9fb4c85efe9ea5c", + "key": "Shape.743e47901855cb245064dd633e217bfcb49a11a7", "kind": { "__enum__": "ConfigTypeKind.STRICT_SHAPE" }, "scalar_kind": null, "type_param_keys": null }, - "Shape.c2c57770aaa8b396a9e2db0762cc977ca34ead8a": { + "Shape.979b3d2fece4f3eb92e90f2ec9fb4c85efe9ea5c": { "__class__": "ConfigTypeSnap", "description": null, "enum_values": null, "fields": [ + { + "__class__": "ConfigFieldSnap", + "default_provided": false, + "default_value_as_json_str": null, + "description": null, + "is_required": false, + "name": "marker_to_close", + "type_key": "String" + }, { "__class__": "ConfigFieldSnap", "default_provided": true, - "default_value_as_json_str": "{\\"config\\": {}}", + "default_value_as_json_str": "{\\"enabled\\": {}}", "description": null, "is_required": false, - "name": "io_manager", - "type_key": "Shape.44f2a71367507edd1b8e64f739222c4312b3691b" + "name": "retries", + "type_key": "Selector.1bfb167aea90780aa679597800c71bd8c65ed0b2" } ], "given_name": null, - "key": "Shape.c2c57770aaa8b396a9e2db0762cc977ca34ead8a", + "key": "Shape.979b3d2fece4f3eb92e90f2ec9fb4c85efe9ea5c", "kind": { "__enum__": "ConfigTypeKind.STRICT_SHAPE" }, @@ -47483,6 +47074,59 @@ "scalar_kind": null, "type_param_keys": null }, + "Shape.f014bade5376c4ab1a52a091b6e67e4d2963d478": { + "__class__": "ConfigTypeSnap", + "description": null, + "enum_values": null, + "field_aliases": { + "ops": "solids" + }, + "fields": [ + { + "__class__": "ConfigFieldSnap", + "default_provided": true, + "default_value_as_json_str": "{\\"config\\": {\\"retries\\": {\\"enabled\\": {}}}}", + "description": null, + "is_required": false, + "name": "execution", + "type_key": "Shape.ca5906d9a0377218b4ee7d940ad55957afa73d1b" + }, + { + "__class__": "ConfigFieldSnap", + "default_provided": true, + "default_value_as_json_str": "{}", + "description": null, + "is_required": false, + "name": "loggers", + "type_key": "Shape.ebeaf4550c200fb540f2e1f3f2110debd8c4157c" + }, + { + "__class__": "ConfigFieldSnap", + "default_provided": true, + "default_value_as_json_str": "{\\"yield_partition_materialization\\": {\\"config\\": {\\"assets\\": {}}}}", + "description": null, + "is_required": false, + "name": "ops", + "type_key": "Shape.5592faa25ec1a64904a69cac388ede7438da6e21" + }, + { + "__class__": "ConfigFieldSnap", + "default_provided": true, + "default_value_as_json_str": "{\\"io_manager\\": {}}", + "description": null, + "is_required": false, + "name": "resources", + "type_key": "Shape.0bb49540f1708dcf5378009c9571eba999502e19" + } + ], + "given_name": null, + "key": "Shape.f014bade5376c4ab1a52a091b6e67e4d2963d478", + "kind": { + "__enum__": "ConfigTypeKind.STRICT_SHAPE" + }, + "scalar_kind": null, + "type_param_keys": null + }, "String": { "__class__": "ConfigTypeSnap", "description": "", @@ -47497,22 +47141,6 @@ "__enum__": "ConfigScalarKind.STRING" }, "type_param_keys": null - }, - "StringSourceType": { - "__class__": "ConfigTypeSnap", - "description": null, - "enum_values": null, - "fields": null, - "given_name": null, - "key": "StringSourceType", - "kind": { - "__enum__": "ConfigTypeKind.SCALAR_UNION" - }, - "scalar_kind": null, - "type_param_keys": [ - "String", - "Selector.2571019f1a5201853d11032145ac3e534067f214" - ] } } }, @@ -47647,18 +47275,18 @@ "__class__": "ResourceDefSnap", "config_field_snap": { "__class__": "ConfigFieldSnap", - "default_provided": true, - "default_value_as_json_str": "{}", + "default_provided": false, + 
"default_value_as_json_str": null, "description": null, "is_required": false, "name": "config", - "type_key": "Shape.18b2faaf1efd505374f7f25fcb61ed59bd5be851" + "type_key": "Any" }, - "description": null, + "description": "The default io manager for Jobs. Uses filesystem but switches to in-memory when invoked through execute_in_process.", "name": "io_manager" } ], - "root_config_key": "Shape.010fcd6801d5ab6c9cd17d68100e8657d84005c6" + "root_config_key": "Shape.f014bade5376c4ab1a52a091b6e67e4d2963d478" } ], "name": "partition_materialization_job", @@ -47698,7 +47326,7 @@ "tags": {} }''' -snapshots['test_all_snapshot_ids 68'] = '08370a1d564bbe74f1b8a5ca7420ca3ae849b7cf' +snapshots['test_all_snapshot_ids 68'] = '8ac85f8770650b51171ad614037402a1c9f8708c' snapshots['test_all_snapshot_ids 69'] = '''{ "__class__": "PipelineSnapshot", diff --git a/python_modules/dagster/dagster/__init__.py b/python_modules/dagster/dagster/__init__.py index ad23386414f5d..821064d278c94 100644 --- a/python_modules/dagster/dagster/__init__.py +++ b/python_modules/dagster/dagster/__init__.py @@ -233,7 +233,6 @@ RunShardedEventsCursor, ) from dagster.core.storage.file_manager import FileHandle, LocalFileHandle, local_file_manager -from dagster.core.storage.fs_asset_io_manager import fs_asset_io_manager from dagster.core.storage.fs_io_manager import custom_path_fs_io_manager, fs_io_manager from dagster.core.storage.io_manager import IOManager, IOManagerDefinition, io_manager from dagster.core.storage.mem_io_manager import mem_io_manager @@ -645,7 +644,6 @@ def __dir__() -> typing.List[str]: "RootInputManager", "RootInputManagerDefinition", "root_input_manager", - "fs_asset_io_manager", "fs_io_manager", "mem_io_manager", "custom_path_fs_io_manager", diff --git a/python_modules/dagster/dagster/core/asset_defs/asset_group.py b/python_modules/dagster/dagster/core/asset_defs/asset_group.py index 7a51e6adc8ed3..cd6729cc0d318 100644 --- a/python_modules/dagster/dagster/core/asset_defs/asset_group.py +++ b/python_modules/dagster/dagster/core/asset_defs/asset_group.py @@ -26,7 +26,7 @@ from dagster.core.errors import DagsterUnmetExecutorRequirementsError from dagster.core.execution.execute_in_process_result import ExecuteInProcessResult from dagster.core.selector.subset_selector import AssetSelectionData -from dagster.core.storage.fs_asset_io_manager import fs_asset_io_manager +from dagster.core.storage.fs_io_manager import fs_io_manager from dagster.utils import merge_dicts from dagster.utils.backcompat import ExperimentalWarning @@ -118,7 +118,7 @@ def __init__( # In the case of collisions, merge_dicts takes values from the # dictionary latest in the list, so we place the user provided resource # defs after the defaults. 
- resource_defs = merge_dicts({"io_manager": fs_asset_io_manager}, resource_defs) + resource_defs = merge_dicts({"io_manager": fs_io_manager}, resource_defs) _validate_resource_reqs_for_asset_group( asset_list=assets, source_assets=source_assets, resource_defs=resource_defs diff --git a/python_modules/dagster/dagster/core/asset_defs/assets_job.py b/python_modules/dagster/dagster/core/asset_defs/assets_job.py index 46fb29794d029..82b73622833a1 100644 --- a/python_modules/dagster/dagster/core/asset_defs/assets_job.py +++ b/python_modules/dagster/dagster/core/asset_defs/assets_job.py @@ -22,9 +22,7 @@ from dagster.core.definitions.resource_definition import ResourceDefinition from dagster.core.errors import DagsterInvalidDefinitionError from dagster.core.selector.subset_selector import AssetSelectionData -from dagster.core.storage.fs_asset_io_manager import fs_asset_io_manager from dagster.utils.backcompat import experimental -from dagster.utils.merger import merge_dicts from .asset_partitions import get_upstream_partitions_for_partition_range from .assets import AssetsDefinition @@ -121,7 +119,7 @@ def asset2(asset1): resolved_source_assets.append(asset) return graph.to_job( - resource_defs=merge_dicts({"io_manager": fs_asset_io_manager}, all_resource_defs), + resource_defs=all_resource_defs, config=config or partitioned_config, tags=tags, executor_def=executor_def, diff --git a/python_modules/dagster/dagster/core/definitions/job_definition.py b/python_modules/dagster/dagster/core/definitions/job_definition.py index 71e29ce945e61..93c2d659e8d48 100644 --- a/python_modules/dagster/dagster/core/definitions/job_definition.py +++ b/python_modules/dagster/dagster/core/definitions/job_definition.py @@ -40,7 +40,6 @@ OpSelectionData, parse_op_selection, ) -from dagster.core.storage.fs_asset_io_manager import fs_asset_io_manager from dagster.core.utils import str_format_set from dagster.utils import merge_dicts @@ -489,7 +488,7 @@ def _swap_default_io_man(resources: Dict[str, ResourceDefinition], job: Pipeline if ( # pylint: disable=comparison-with-callable - resources.get("io_manager") in [default_job_io_manager, fs_asset_io_manager] + resources.get("io_manager") in [default_job_io_manager] and job.version_strategy is None ): updated_resources = dict(resources) diff --git a/python_modules/dagster/dagster/core/storage/fs_asset_io_manager.py b/python_modules/dagster/dagster/core/storage/fs_asset_io_manager.py deleted file mode 100644 index d1f9d8fb292b8..0000000000000 --- a/python_modules/dagster/dagster/core/storage/fs_asset_io_manager.py +++ /dev/null @@ -1,88 +0,0 @@ -import os -from typing import Union - -from dagster.config import Field -from dagster.config.source import StringSource -from dagster.core.execution.context.input import InputContext -from dagster.core.execution.context.output import OutputContext -from dagster.core.storage.io_manager import io_manager - -from .fs_io_manager import PickledObjectFilesystemIOManager - - -@io_manager(config_schema={"base_dir": Field(StringSource, is_required=False)}) -def fs_asset_io_manager(init_context): - """IO manager that stores values on the local filesystem, serializing them with pickle. - - Each asset is assigned to a single filesystem path, at "/". If the asset - key has multiple components, the final component is used as the name of the file, and the - preceding components as parent directories under the base_dir. - - Subsequent materializations of an asset will overwrite previous materializations of that asset. 
- - If not provided via configuration, the base dir is the local_artifact_storage in your - dagster.yaml file. That will be a temporary directory if not explicitly set. - - So, with a base directory of "/my/base/path", an asset with key - `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory - with path "/my/base/path/one/two/". - - Example usage: - - 1. Specify a collection-level IO manager using the reserved resource key ``"io_manager"``, - which will set the given IO manager on all assets in the collection. - - .. code-block:: python - - from dagster import AssetGroup, asset, fs_asset_io_manager - - @asset - def asset1(): - # create df ... - return df - - @asset - def asset2(asset1): - return df[:5] - - asset_group = AssetGroup( - [asset1, asset2], - resource_defs={ - "io_manager": fs_asset_io_manager.configured({"base_path": "/my/base/path"}) - }, - ) - - 2. Specify IO manager on the asset, which allows the user to set different IO managers on - different assets. - - .. code-block:: python - - from dagster import fs_io_manager, job, op, Out - - @asset(io_manager_key="my_io_manager") - def asset1(): - # create df ... - return df - - @asset - def asset2(asset1): - return df[:5] - - asset_group = AssetGroup( - [asset1, asset2], - resource_defs={ - "my_io_manager": fs_asset_io_manager.configured({"base_path": "/my/base/path"}) - }, - ) - """ - base_dir = init_context.resource_config.get( - "base_dir", init_context.instance.storage_directory() - ) - - return AssetPickledObjectFilesystemIOManager(base_dir=base_dir) - - -class AssetPickledObjectFilesystemIOManager(PickledObjectFilesystemIOManager): - def _get_path(self, context: Union[InputContext, OutputContext]) -> str: - identifier = context.get_asset_identifier() - return os.path.join(self.base_dir, *identifier) diff --git a/python_modules/dagster/dagster/core/storage/fs_io_manager.py b/python_modules/dagster/dagster/core/storage/fs_io_manager.py index 22e6b78ca96c9..c5589055d4c63 100644 --- a/python_modules/dagster/dagster/core/storage/fs_io_manager.py +++ b/python_modules/dagster/dagster/core/storage/fs_io_manager.py @@ -25,7 +25,17 @@ def fs_io_manager(init_context): your dagster.yaml file (which will be a temporary directory if not explicitly set). Serializes and deserializes output values using pickling and automatically constructs - the filepaths for the assets. + the filepaths for ops and assets. + + Assigns each op output to a unique filepath containing run ID, step key, and output name. + Assigns each asset to a single filesystem path, at "/". If the asset key + has multiple components, the final component is used as the name of the file, and the preceding + components as parent directories under the base_dir. + + Subsequent materializations of an asset will overwrite previous materializations of that asset. + So, with a base directory of "/my/base/path", an asset with key + `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory + with path "/my/base/path/one/two/". 
Example usage: @@ -97,8 +107,12 @@ def __init__(self, base_dir=None): def _get_path(self, context: Union[InputContext, OutputContext]) -> str: """Automatically construct filepath.""" - identifier = context.get_identifier() - return os.path.join(self.base_dir, *identifier) + if context.has_asset_key: + path = context.get_asset_identifier() + else: + path = context.get_identifier() + + return os.path.join(self.base_dir, *path) def has_output(self, context): filepath = self._get_path(context) diff --git a/python_modules/dagster/dagster_tests/core_tests/asset_defs_tests/test_asset_group.py b/python_modules/dagster/dagster_tests/core_tests/asset_defs_tests/test_asset_group.py index 567efecb7d5ed..cbf952758dcfb 100644 --- a/python_modules/dagster/dagster_tests/core_tests/asset_defs_tests/test_asset_group.py +++ b/python_modules/dagster/dagster_tests/core_tests/asset_defs_tests/test_asset_group.py @@ -15,7 +15,7 @@ Out, Output, ResourceDefinition, - fs_asset_io_manager, + fs_io_manager, graph, in_process_executor, io_manager, @@ -750,7 +750,7 @@ def asset_foo(): group = AssetGroup(assets=[asset_foo]) assert ( group.resource_defs["io_manager"] # pylint: disable=comparison-with-callable - == fs_asset_io_manager + == fs_io_manager ) diff --git a/python_modules/dagster/dagster_tests/core_tests/asset_defs_tests/test_assets_job.py b/python_modules/dagster/dagster_tests/core_tests/asset_defs_tests/test_assets_job.py index dfbf089f342fd..84f79bae3abe5 100644 --- a/python_modules/dagster/dagster_tests/core_tests/asset_defs_tests/test_assets_job.py +++ b/python_modules/dagster/dagster_tests/core_tests/asset_defs_tests/test_assets_job.py @@ -16,7 +16,9 @@ Output, ResourceDefinition, StaticPartitionsDefinition, + execute_pipeline, graph, + in_process_executor, io_manager, multi_asset, op, @@ -1248,3 +1250,41 @@ def my_derived_asset(my_source_asset): result = source_asset_job.execute_in_process(asset_selection=[AssetKey("my_derived_asset")]) assert result.success + + +def test_op_outputs_with_default_asset_io_mgr(): + @op + def return_stuff(): + return 12 + + @op + def transform(data): + assert data == 12 + return data * 2 + + @op + def one_more_transformation(transformed_data): + assert transformed_data == 24 + return transformed_data + 1 + + @graph( + out={ + "asset_1": GraphOut(), + "asset_2": GraphOut(), + }, + ) + def complicated_graph(): + result = return_stuff() + return one_more_transformation(transform(result)), transform(result) + + @asset + def my_asset(asset_1): + assert asset_1 == 25 + return asset_1 + + my_job = AssetGroup( + [AssetsDefinition.from_graph(complicated_graph), my_asset], + ).build_job("my_job", executor_def=in_process_executor) + + result = execute_pipeline(my_job) + assert result.success diff --git a/python_modules/dagster/dagster_tests/core_tests/storage_tests/test_fs_asset_io_manager.py b/python_modules/dagster/dagster_tests/core_tests/storage_tests/test_fs_asset_io_manager.py deleted file mode 100644 index 5cf4ca2cdef55..0000000000000 --- a/python_modules/dagster/dagster_tests/core_tests/storage_tests/test_fs_asset_io_manager.py +++ /dev/null @@ -1,123 +0,0 @@ -import os -import pickle -import tempfile - -from dagster import AssetKey, DailyPartitionsDefinition, Out, Output, StaticPartitionsDefinition -from dagster.core.asset_defs import AssetGroup, AssetIn, asset, build_assets_job, multi_asset -from dagster.core.storage.fs_asset_io_manager import fs_asset_io_manager - - -def get_assets_job(io_manager_def, partitions_def=None): - asset1_namespace = ["one", "two", "three"] - - 
@asset(namespace=["one", "two", "three"], partitions_def=partitions_def) - def asset1(): - return [1, 2, 3] - - @asset( - namespace=["four", "five"], - ins={"asset1": AssetIn(namespace=asset1_namespace)}, - partitions_def=partitions_def, - ) - def asset2(asset1): - return asset1 + [4] - - return build_assets_job( - name="a", assets=[asset1, asset2], resource_defs={"io_manager": io_manager_def} - ) - - -def test_fs_asset_io_manager(): - with tempfile.TemporaryDirectory() as tmpdir_path: - io_manager_def = fs_asset_io_manager.configured({"base_dir": tmpdir_path}) - job_def = get_assets_job(io_manager_def) - - result = job_def.execute_in_process() - assert result.success - - handled_output_events = list( - filter(lambda evt: evt.is_handled_output, result.all_node_events) - ) - assert len(handled_output_events) == 2 - - filepath_a = os.path.join(tmpdir_path, "one", "two", "three", "asset1") - assert os.path.isfile(filepath_a) - with open(filepath_a, "rb") as read_obj: - assert pickle.load(read_obj) == [1, 2, 3] - - loaded_input_events = list(filter(lambda evt: evt.is_loaded_input, result.all_node_events)) - assert len(loaded_input_events) == 1 - assert loaded_input_events[0].event_specific_data.upstream_step_key.endswith("asset1") - - filepath_b = os.path.join(tmpdir_path, "four", "five", "asset2") - assert os.path.isfile(filepath_b) - with open(filepath_b, "rb") as read_obj: - assert pickle.load(read_obj) == [1, 2, 3, 4] - - -def test_fs_asset_io_manager_partitioned(): - with tempfile.TemporaryDirectory() as tmpdir_path: - io_manager_def = fs_asset_io_manager.configured({"base_dir": tmpdir_path}) - job_def = get_assets_job( - io_manager_def, partitions_def=DailyPartitionsDefinition(start_date="2020-02-01") - ) - - result = job_def.execute_in_process(partition_key="2020-05-03") - assert result.success - - handled_output_events = list( - filter(lambda evt: evt.is_handled_output, result.all_node_events) - ) - assert len(handled_output_events) == 2 - - filepath_a = os.path.join(tmpdir_path, "one", "two", "three", "asset1", "2020-05-03") - assert os.path.isfile(filepath_a) - with open(filepath_a, "rb") as read_obj: - assert pickle.load(read_obj) == [1, 2, 3] - - loaded_input_events = list(filter(lambda evt: evt.is_loaded_input, result.all_node_events)) - assert len(loaded_input_events) == 1 - assert loaded_input_events[0].event_specific_data.upstream_step_key.endswith("asset1") - - filepath_b = os.path.join(tmpdir_path, "four", "five", "asset2", "2020-05-03") - assert os.path.isfile(filepath_b) - with open(filepath_b, "rb") as read_obj: - assert pickle.load(read_obj) == [1, 2, 3, 4] - - -def test_fs_asset_io_manager_partitioned_multi_asset(): - with tempfile.TemporaryDirectory() as tmpdir_path: - io_manager_def = fs_asset_io_manager.configured({"base_dir": tmpdir_path}) - - partitions = StaticPartitionsDefinition(["A"]) - - @multi_asset( - partitions_def=partitions, - outs={ - "out_1": Out(asset_key=AssetKey("upstream_asset_1")), - "out_2": Out(asset_key=AssetKey("upstream_asset_2")), - }, - ) - def upstream_asset(): - return (Output(1, output_name="out_1"), Output(2, output_name="out_2")) - - @asset( - partitions_def=partitions, - ) - def downstream_asset(upstream_asset_1: int) -> int: - del upstream_asset_1 - return 2 - - group = AssetGroup( - [upstream_asset, downstream_asset], resource_defs={"io_manager": io_manager_def} - ) - - job = group.build_job(name="TheJob") - - result = job.execute_in_process(partition_key="A") - assert result.success - - handled_output_events = list( - filter(lambda 
evt: evt.is_handled_output, result.all_node_events) - ) - assert len(handled_output_events) == 3 diff --git a/python_modules/dagster/dagster_tests/core_tests/storage_tests/test_fs_io_manager.py b/python_modules/dagster/dagster_tests/core_tests/storage_tests/test_fs_io_manager.py index 2cba240d5a6f2..4000c82838ccd 100644 --- a/python_modules/dagster/dagster_tests/core_tests/storage_tests/test_fs_io_manager.py +++ b/python_modules/dagster/dagster_tests/core_tests/storage_tests/test_fs_io_manager.py @@ -4,7 +4,21 @@ import pytest -from dagster import MetadataValue, ModeDefinition, execute_pipeline, graph, op, pipeline, solid +from dagster import ( + AssetKey, + DailyPartitionsDefinition, + MetadataValue, + ModeDefinition, + Out, + Output, + StaticPartitionsDefinition, + execute_pipeline, + graph, + op, + pipeline, + solid, +) +from dagster.core.asset_defs import AssetGroup, AssetIn, asset, build_assets_job, multi_asset from dagster.core.definitions.version_strategy import VersionStrategy from dagster.core.errors import DagsterInvariantViolationError from dagster.core.execution.api import create_execution_plan @@ -178,3 +192,119 @@ def recursion_limit_graph(): match=r"Object .* exceeds recursion limit and is not picklable. .*", ): recursion_job.execute_in_process(instance=instance) + + +def get_assets_job(io_manager_def, partitions_def=None): + asset1_namespace = ["one", "two", "three"] + + @asset(namespace=["one", "two", "three"], partitions_def=partitions_def) + def asset1(): + return [1, 2, 3] + + @asset( + namespace=["four", "five"], + ins={"asset1": AssetIn(namespace=asset1_namespace)}, + partitions_def=partitions_def, + ) + def asset2(asset1): + return asset1 + [4] + + return build_assets_job( + name="a", assets=[asset1, asset2], resource_defs={"io_manager": io_manager_def} + ) + + +def test_fs_io_manager_handles_assets(): + with tempfile.TemporaryDirectory() as tmpdir_path: + io_manager_def = fs_io_manager.configured({"base_dir": tmpdir_path}) + job_def = get_assets_job(io_manager_def) + + result = job_def.execute_in_process() + assert result.success + + handled_output_events = list( + filter(lambda evt: evt.is_handled_output, result.all_node_events) + ) + assert len(handled_output_events) == 2 + + filepath_a = os.path.join(tmpdir_path, "one", "two", "three", "asset1") + assert os.path.isfile(filepath_a) + with open(filepath_a, "rb") as read_obj: + assert pickle.load(read_obj) == [1, 2, 3] + + loaded_input_events = list(filter(lambda evt: evt.is_loaded_input, result.all_node_events)) + assert len(loaded_input_events) == 1 + assert loaded_input_events[0].event_specific_data.upstream_step_key.endswith("asset1") + + filepath_b = os.path.join(tmpdir_path, "four", "five", "asset2") + assert os.path.isfile(filepath_b) + with open(filepath_b, "rb") as read_obj: + assert pickle.load(read_obj) == [1, 2, 3, 4] + + +def test_fs_io_manager_partitioned(): + with tempfile.TemporaryDirectory() as tmpdir_path: + io_manager_def = fs_io_manager.configured({"base_dir": tmpdir_path}) + job_def = get_assets_job( + io_manager_def, partitions_def=DailyPartitionsDefinition(start_date="2020-02-01") + ) + + result = job_def.execute_in_process(partition_key="2020-05-03") + assert result.success + + handled_output_events = list( + filter(lambda evt: evt.is_handled_output, result.all_node_events) + ) + assert len(handled_output_events) == 2 + + filepath_a = os.path.join(tmpdir_path, "one", "two", "three", "asset1", "2020-05-03") + assert os.path.isfile(filepath_a) + with open(filepath_a, "rb") as read_obj: + assert 
pickle.load(read_obj) == [1, 2, 3] + + loaded_input_events = list(filter(lambda evt: evt.is_loaded_input, result.all_node_events)) + assert len(loaded_input_events) == 1 + assert loaded_input_events[0].event_specific_data.upstream_step_key.endswith("asset1") + + filepath_b = os.path.join(tmpdir_path, "four", "five", "asset2", "2020-05-03") + assert os.path.isfile(filepath_b) + with open(filepath_b, "rb") as read_obj: + assert pickle.load(read_obj) == [1, 2, 3, 4] + + +def test_fs_io_manager_partitioned_multi_asset(): + with tempfile.TemporaryDirectory() as tmpdir_path: + io_manager_def = fs_io_manager.configured({"base_dir": tmpdir_path}) + + partitions = StaticPartitionsDefinition(["A"]) + + @multi_asset( + partitions_def=partitions, + outs={ + "out_1": Out(asset_key=AssetKey("upstream_asset_1")), + "out_2": Out(asset_key=AssetKey("upstream_asset_2")), + }, + ) + def upstream_asset(): + return (Output(1, output_name="out_1"), Output(2, output_name="out_2")) + + @asset( + partitions_def=partitions, + ) + def downstream_asset(upstream_asset_1: int) -> int: + del upstream_asset_1 + return 2 + + group = AssetGroup( + [upstream_asset, downstream_asset], resource_defs={"io_manager": io_manager_def} + ) + + job = group.build_job(name="TheJob") + + result = job.execute_in_process(partition_key="A") + assert result.success + + handled_output_events = list( + filter(lambda evt: evt.is_handled_output, result.all_node_events) + ) + assert len(handled_output_events) == 3
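With fs_asset_io_manager removed, fs_io_manager now serves both op outputs and software-defined assets, and the behavioral pivot in this patch is the context.has_asset_key branch added to _get_path; the updated tests pin down the resulting on-disk layout. The snippet below is a minimal, stand-alone sketch of that routing under simplified assumptions: resolve_path and its identifier arguments are hypothetical stand-ins for what context.get_asset_identifier() and context.get_identifier() return, not Dagster's actual classes.

.. code-block:: python

    # Sketch only: mirrors the branch added to fs_io_manager._get_path in this
    # patch. The identifier arguments are simplified stand-ins for the values
    # returned by context.get_asset_identifier() / context.get_identifier();
    # they are illustrative, not Dagster API.
    import os
    from typing import Optional, Sequence


    def resolve_path(
        base_dir: str,
        asset_identifier: Optional[Sequence[str]] = None,
        op_identifier: Optional[Sequence[str]] = None,
    ) -> str:
        # Prefer the asset identifier when the context carries an asset key
        # (the context.has_asset_key branch); otherwise fall back to the
        # run-scoped op-output identifier.
        parts = asset_identifier if asset_identifier is not None else op_identifier
        if not parts:
            raise ValueError("expected either an asset or an op identifier")
        return os.path.join(base_dir, *parts)


    # Asset key ["one", "two", "three", "asset1"] -> /my/base/path/one/two/three/asset1
    print(resolve_path("/my/base/path", asset_identifier=["one", "two", "three", "asset1"]))

    # Partitioned assets append the partition key, matching the updated tests
    # (e.g. .../one/two/three/asset1/2020-05-03).
    print(
        resolve_path(
            "/my/base/path",
            asset_identifier=["one", "two", "three", "asset1", "2020-05-03"],
        )
    )

    # Plain op outputs keep the run_id/step_key/output_name layout.
    print(resolve_path("/my/base/path", op_identifier=["some_run_id", "transform", "result"]))

Because asset paths are derived only from the asset key (plus the partition key, when partitioned), repeated materializations overwrite the same file, while op outputs stay scoped to run ID, step key, and output name, as described in the revised fs_io_manager docstring.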