[WIP] Refactored weight compression for further unification. #2181

Closed
27 changes: 0 additions & 27 deletions nncf/openvino/quantization/quantize_model.py
@@ -24,8 +24,6 @@
from nncf.openvino.quantization.backend_parameters import BackendParameters
from nncf.openvino.quantization.backend_parameters import is_weight_compression_needed
from nncf.openvino.quantization.quantize_ifmodel import apply_algorithm_if_bodies
from nncf.openvino.quantization.weights_compression import insert_pre_compression_operations
from nncf.parameters import CompressWeightsMode
from nncf.parameters import DropType
from nncf.parameters import ModelType
from nncf.parameters import TargetDevice
@@ -437,28 +435,3 @@ def quantize_with_accuracy_control_impl(
advanced_quantization_parameters,
advanced_accuracy_restorer_parameters,
)


def compress_weights_impl(
model: ov.Model,
mode: CompressWeightsMode = CompressWeightsMode.INT8,
ratio: Optional[float] = None,
group_size: Optional[int] = None,
) -> ov.Model:
"""
Implementation of the `compress_weights()` method for the OpenVINO backend.

:param model: an OpenVINO model for compression.
:param mode: Defines a mode for weight compression.
INT8 stands for 8-bit integer quantization of all weights.
NF4 stands for a mixed-precision weights quantization to NF4 data type. The first and last layers
are always compressed to a backup precision which is 8-bit integer by default. All others are quantized whether
to NF4 or to a backup precision depending on criteria and the given ratio.
:param ratio: the ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to NF4 and
the rest to INT8).
:param group_size: number of weights (e.g. 128) in the channel dimension that share quantization parameters (scale).
The value -1 means no grouping.
:return: The non-trainable model with compressed weights and dequantization operations.
"""
insert_pre_compression_operations(model, mode, ratio, group_size)
return model
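
For context, a minimal usage sketch of the public entry point that the removed backend implementation served. It assumes nncf.compress_weights() exposes the same mode/ratio/group_size parameters documented above; the model path is a placeholder.

# Usage sketch (illustrative): nncf.compress_weights() is assumed to mirror the
# mode/ratio/group_size parameters documented in the removed implementation above.
import openvino.runtime as ov

import nncf
from nncf import CompressWeightsMode

# "model.xml" is a placeholder path for this example.
model = ov.Core().read_model("model.xml")

# Mixed-precision compression: roughly 90% of weight layers go to NF4, the rest
# stay in the 8-bit integer backup precision; groups of 128 weights along the
# channel dimension share one quantization scale.
compressed_model = nncf.compress_weights(
    model,
    mode=CompressWeightsMode.NF4,
    ratio=0.9,
    group_size=128,
)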
131 changes: 131 additions & 0 deletions nncf/quantization/algorithms/weight_compression/algorithm.py
@@ -0,0 +1,131 @@
# Copyright (c) 2023 Intel Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, List, Optional, TypeVar

from nncf import Dataset
from nncf.common.graph.graph import NNCFGraph
from nncf.common.graph.graph import NNCFNode
from nncf.common.tensor_statistics.statistic_point import StatisticPointsContainer
from nncf.common.utils.backend import BackendType
from nncf.common.utils.backend import get_backend
from nncf.parameters import CompressWeightsMode
from nncf.quantization.algorithms.algorithm import Algorithm
from nncf.quantization.algorithms.weight_compression.backend import ALGO_BACKENDS

TModel = TypeVar("TModel")
TTensor = TypeVar("TTensor")


class WeightCompression(Algorithm):
"""
Post-training Weight Compression algorithm implementation.

Compresses weights of Linear and Embedding layers to 8-bit integer or
to NF4, depending on the mode, ratio and group size.
"""

def __init__(
self,
mode: CompressWeightsMode,
ratio: Optional[float] = None,
group_size: Optional[int] = None,
):
"""
:param mode: Defines a mode for weight compression.
INT8 stands for 8-bit integer quantization of all weights.
NF4 stands for mixed-precision weight quantization to the NF4 data type. The first and last layers
are always compressed to a backup precision, which is 8-bit integer by default. All other layers are
quantized either to NF4 or to the backup precision, depending on the criteria and the given ratio.
:param ratio: The ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to NF4
and the rest to INT8).
:param group_size: Number of weights (e.g. 128) in the channel dimension
that share quantization parameters (scale). The value -1 means no grouping.
"""
super().__init__()
self._mode = mode
self._group_size = group_size
self._ratio = ratio
self._backend_entity = None
self._algorithm_key = f"CW_{hash(self)}"

@property
def available_backends(self) -> Dict[str, BackendType]:
return ALGO_BACKENDS.registry_dict

def _set_backend_entity(self, model: TModel) -> None:
"""
Creates a helper class with backend-specific logic of the algorithm.

:param model: Backend-specific input model.
"""
model_backend = get_backend(model)
if model_backend == BackendType.OPENVINO:
from nncf.quantization.algorithms.weight_compression.openvino_backend import OVWeightCompressionAlgoBackend

self._backend_entity = OVWeightCompressionAlgoBackend()
elif model_backend == BackendType.TORCH:
from nncf.quantization.algorithms.weight_compression.torch_backend import PTWeightCompressionAlgoBackend

self._backend_entity = PTWeightCompressionAlgoBackend()
else:
raise RuntimeError(
"Cannot return backend-specific entity because {} is not supported!".format(model_backend)
)

def apply(
self,
model: TModel,
graph: NNCFGraph,
statistic_points: Optional[StatisticPointsContainer] = None,
dataset: Optional[Dataset] = None,
) -> TModel:
self._set_backend_entity(model)
self._backend_entity.validate_params(self._mode)
nodes_to_compress = self._get_nodes_to_compress(graph)
transformed_model = self._backend_entity.do_compression(
model, nodes_to_compress, self._mode, self._ratio, self._group_size
)
return transformed_model

def _get_nodes_to_compress(self, nncf_graph: NNCFGraph) -> List[NNCFNode]:
"""
Collects nodes in the model's graph corresponding to the layers for weight compression.

:param nncf_graph: NNCFGraph instance.
:return: Ordered list of nodes whose weights should be compressed.
"""
weighted_metatypes = self._backend_entity.weighted_metatypes
ordered_nodes_to_compress = []
for node in nncf_graph.topological_sort():
is_node_with_weights = self._backend_entity.is_node_with_weights(node)
if node.metatype in weighted_metatypes and is_node_with_weights:
ordered_nodes_to_compress.append(node)
return ordered_nodes_to_compress

def get_statistic_points(self, model: TModel, graph: NNCFGraph) -> StatisticPointsContainer:
"""
Returns statistic points, for which StatisticsCollector should collect statistics.

:param model: Model for statistics collection.
:param graph: Model graph.
:return: Statistic points, for which StatisticsCollector should collect statistics.
"""
82 changes: 82 additions & 0 deletions nncf/quantization/algorithms/weight_compression/backend.py
@@ -0,0 +1,82 @@
# Copyright (c) 2023 Intel Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC
from abc import abstractmethod
from typing import List, Optional, TypeVar

from nncf.common.graph import NNCFNode
from nncf.common.graph.operator_metatypes import OperatorMetatype
from nncf.common.utils.registry import Registry
from nncf.parameters import CompressWeightsMode

TModel = TypeVar("TModel")
ALGO_BACKENDS = Registry("algo_backends")


class WeightCompressionAlgoBackend(ABC):
@property
@abstractmethod
def weighted_metatypes(self) -> List[OperatorMetatype]:
"""
Property for the backend-specific metatypes.
"""

@staticmethod
@abstractmethod
def is_node_with_weights(node: NNCFNode) -> bool:
"""
Checks whether the node has weights.

:param node: NNCFNode to check.
:return: boolean indicating whether the node has weights or not.
"""

@staticmethod
@abstractmethod
def validate_params(mode: CompressWeightsMode) -> None:
"""
Validates the algorithm's parameters and raises an error for unsupported configurations.
Should be called at an early algorithm step to prevent execution of time-consuming operations.

:param mode: Defines a mode for weight compression.
INT8 stands for 8-bit integer quantization of all weights.
NF4 stands for mixed-precision weight quantization to the NF4 data type. The first and last layers
are always compressed to a backup precision, which is 8-bit integer by default. All other layers are
quantized either to NF4 or to the backup precision, depending on the criteria and the given ratio.
"""

@staticmethod
@abstractmethod
def do_compression(
model: TModel,
nodes_to_compress: List[NNCFNode],
mode: CompressWeightsMode,
ratio: Optional[float] = None,
group_size: Optional[int] = None,
) -> TModel:
"""
Compress weights of Linear and Embedding layers to 8-bit integer or to NF4,
depending on the mode, ratio and group size.

:param model: Model for applying weight compression.
:param nodes_to_compress: List of nodes in the model's graph,
corresponding to the layers for weight compression.
:param mode: Defines a mode for weight compression.
INT8 stands for 8-bit integer quantization of all weights.
NF4 stands for mixed-precision weight quantization to the NF4 data type. The first and last layers
are always compressed to a backup precision, which is 8-bit integer by default. All other layers are
quantized either to NF4 or to the backup precision, depending on the criteria and the given ratio.
:param ratio: The ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to NF4
and the rest to INT8).
:param group_size: Number of weights (e.g. 128) in the channel dimension
that share quantization parameters (scale). The value -1 means no grouping.
"""