diff --git a/python/tvm/ansor/__init__.py b/python/tvm/ansor/__init__.py index edade490018c..9cce63b2840d 100644 --- a/python/tvm/ansor/__init__.py +++ b/python/tvm/ansor/__init__.py @@ -22,24 +22,15 @@ from . import serialization from . import loop_state from . import utils -from . import feature from . import workload_registry -from . import task_scheduler # Shortcut -from .compute_dag import ComputeDAG, LayoutRewriteLevel +from .compute_dag import ComputeDAG from .auto_schedule import SearchTask, SketchSearchPolicy, TuneOption, HardwareParams, \ PreloadMeasuredStates, PreloadCustomSketchRule, auto_schedule from .measure import MeasureInput, LocalBuilder, LocalRunner, RPCRunner, LocalRPCMeasureContext from .cost_model import RandomModel -from .cost_model.xgb_model import XGBModel from .serialization import LogToFile, LogReader, best_measure_pair_in_file, \ load_from_file, write_measure_records_to_file from .workload_registry import register_workload_func, \ workload_key_to_dag, make_workload_key_func -from .task_scheduler import TaskScheduler, SimpleTaskScheduler -from .dispatcher import DispatchContext, ApplyConfig, ApplyHistoryBest as apply_history_best, \ - FallbackContext -from .relay_integration import extract_from_program, extract_from_multiple_program, \ - finish_layout_rewrite, prepare_layout_rewrite, auto_schedule_topi -from .env import GLOBAL_SCOPE diff --git a/python/tvm/ansor/auto_schedule.py b/python/tvm/ansor/auto_schedule.py index 4497bb400703..37e622018658 100644 --- a/python/tvm/ansor/auto_schedule.py +++ b/python/tvm/ansor/auto_schedule.py @@ -82,96 +82,10 @@ def set_verbose(self, verbose): def run_callbacks(self, callbacks): _ffi_api.SearchPolicyRunCallbacks(self, callbacks) - -@tvm._ffi.register_object("ansor.SketchSearchPolicy") -class SketchSearchPolicy(SearchPolicy): - """ The search policy that searches in a hierarchical search space defined by sketches. - The policy randomly samples programs from the space defined by sketches - and use evolutionary search to fine-tune them. - - Parameters - ---------- - program_cost_model: CostModel - Cost model for programs - params: int - Parameters of the search policy. See `src/ansor/search_policy/sketch_search_policy.h` - to find the definitions. See code below to find the default values - seed: int - Random seed - """ - def __init__(self, - program_cost_model, - params=None, - seed=None): - # set default parameters - default_params = { - "eps_greedy": 0.05, - - 'evolutionary_search_population': 2048, - 'evolutionary_search_num_iters': 15, - "evolutionary_search_mutation_prob": 0.85, - "evolutionary_search_use_measured_ratio": 0.2, - - 'cpu_multi_level_tiling_structure': 'SSRSRS', - 'gpu_multi_level_tiling_structure': 'SSSRRSRS', - - 'disable_change_compute_location': 0, - } - - if params is None: - params = default_params - else: - for key, value in default_params.items(): - if key not in params: - params[key] = value - - self.__init_handle_by_constructor__( - _ffi_api.SketchSearchPolicy, program_cost_model, params, - seed or random.randint(1, 1 << 30)) - - @tvm._ffi.register_object("ansor.SearchCallback") class SearchCallback(Object): """Callback function before or after search process""" - -@tvm._ffi.register_object("ansor.PreloadMeasuredStates") -class PreloadMeasuredStates(SearchCallback): - """ A SearchCallback to load measured states from the log file for a search policy. - This can resume the state of the search policy. 
- - Parameters - ---------- - filename: str - """ - def __init__(self, filename: str): - self.__init_handle_by_constructor__( - _ffi_api.PreloadMeasuredStates, filename) - - -@tvm._ffi.register_object("ansor.PreloadCustomSketchRule") -class PreloadCustomSketchRule(SearchCallback): - """ - A SearchCallback for SketchSearchPolicy that allowing users to add - custom sketch rule. - - Notes - ----- - This is an advanced feature. Make sure you're clear how it - works and this should only be used in SketchSearchPolicy. - - Parameters - ---------- - meet_condition_func: Function - A function with `(policy, state, stage_id) -> int` - apply_func: Function - A function with `(policy, state, stage_id) -> [[State, int], ...]` - """ - def __init__(self, meet_condition_func, apply_func): - self.__init_handle_by_constructor__( - _ffi_api.PreloadCustomSketchRule, meet_condition_func, apply_func) - - @tvm._ffi.register_object("ansor.TuneOption") class TuneOption(Object): """ The options for tuning diff --git a/python/tvm/ansor/compute_dag.py b/python/tvm/ansor/compute_dag.py index 6304c7bb0e0a..994c3ae3ab97 100644 --- a/python/tvm/ansor/compute_dag.py +++ b/python/tvm/ansor/compute_dag.py @@ -23,13 +23,6 @@ from . import _ffi_api -class LayoutRewriteLevel(object): - NO_REWRITE = 0 # No layout rewrite - PLACEHOLDER_REWRITE = 1 # Only rewrite layout of placeholder in the compute dag - COMPUTE_REWRITE = 2 # Only rewrite compute body for new layout in the compute dag - BOTH_REWRITE = 3 # Rewrite both placeholder and compute body in the compute dag - - @tvm._ffi.register_object("ansor.ComputeDAG") class ComputeDAG(Object): """ @@ -97,17 +90,3 @@ def infer_bound_from_state(self, state): """ state_obj = state if isinstance(state, StateObject) else state.state_object return State(_ffi_api.ComputeDAGInferBoundFromState(self, state_obj), self) - - def rewrite_layout_from_state(self, state: State): - """ - Rewrite the layout according to the transform steps in the history of a state - - Parameters - ---------- - state : StateObject - - Returns - ------- - state : StateObject - """ - return _ffi_api.ComputeDAGRewriteLayoutFromState(self, state) diff --git a/python/tvm/ansor/cost_model/__init__.py b/python/tvm/ansor/cost_model/__init__.py index 56e4a5f9128b..1454da451b61 100644 --- a/python/tvm/ansor/cost_model/__init__.py +++ b/python/tvm/ansor/cost_model/__init__.py @@ -17,5 +17,4 @@ # pylint: disable=unused-import, redefined-builtin """ Cost model that estimates the performance of programs """ -from .cost_model import RandomModel -from .xgb_model import XGBModel +from .cost_model import RandomModel \ No newline at end of file diff --git a/python/tvm/ansor/cost_model/cost_model.py b/python/tvm/ansor/cost_model/cost_model.py index fbfc8242488b..605db14c19c3 100644 --- a/python/tvm/ansor/cost_model/cost_model.py +++ b/python/tvm/ansor/cost_model/cost_model.py @@ -44,34 +44,3 @@ def random_number(n, return_ptr): return_ptr = ctypes.cast(return_ptr, ctypes.POINTER(ctypes.c_float)) array_wrapper = np.ctypeslib.as_array(return_ptr, shape=(n,)) array_wrapper[:] = np.random.uniform(0, 1, (n,)) - - -@tvm._ffi.register_object("ansor.PythonBasedModel") -class PythonBasedModel(CostModel): - """Base class for cost models implemented in python""" - def __init__(self): - def update_func(inputs, results): - self.update(inputs, results) - - def predict_func(task, states, return_ptr): - return_ptr = ctypes.cast(return_ptr, ctypes.POINTER(ctypes.c_float)) - array_wrapper = np.ctypeslib.as_array(return_ptr, shape=(len(states),)) - 
array_wrapper[:] = self.predict(task, states) - - def predict_stage_func(task, states, return_ptr): - ret = self.predict_stages(task, states) - return_ptr = ctypes.cast(return_ptr, ctypes.POINTER(ctypes.c_float)) - array_wrapper = np.ctypeslib.as_array(return_ptr, shape=ret.shape) - array_wrapper[:] = ret - - self.__init_handle_by_constructor__(_ffi_api.PythonBasedModel, update_func, - predict_func, predict_stage_func) - - def update(self, inputs, results): - raise NotImplementedError - - def predict(self, task, states): - raise NotImplementedError - - def predict_stages(self, task, states): - raise NotImplementedError diff --git a/python/tvm/ansor/cost_model/xgb_model.py b/python/tvm/ansor/cost_model/xgb_model.py deleted file mode 100644 index 42af17daae2c..000000000000 --- a/python/tvm/ansor/cost_model/xgb_model.py +++ /dev/null @@ -1,474 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Cost model based on xgboost""" -import multiprocessing -import logging -from collections import defaultdict - -import numpy as np -import xgboost as xgb - -from tvm.autotvm.tuner.xgboost_cost_model import get_rank, recall_curve, max_curve -from .cost_model import PythonBasedModel -from ..feature import get_per_stmt_features_from_measure_pairs, get_per_stmt_features_from_states -from ..serialization import LogReader - -logger = logging.getLogger('ansor') - -class XGBDMatrixContext: - """Context to hold additional attributes of xgb.DMatrix""" - def __init__(self): - self.context_dict = defaultdict(dict) - - def get(self, key, matrix, default=None): - return self.context_dict[key].get(matrix.handle.value, default) - - def put(self, key, matrix, value): - self.context_dict[key][matrix.handle.value] = value - -dmatrix_context = XGBDMatrixContext() - -class XGBModel(PythonBasedModel): - """Train a XGBoost model to predict the runtime cost of a program. - The cost of a program = the sum of the costs of all stages in this program. - i.e. Cost(p) = cost_s0 + cost_s1 + ... + cost_sn, where cost_si is the cost of Stage i - - The xgboost model makes prediction per stage, then we sum them up. - The final predction made by this class is normalized throughtput (from 0 to 1, larger is better) - - To support this stage decomposition, we have to implement a custom loss function for - XGBoost, which is the `pack_sum` in the code below. 
- """ - def __init__(self, verbose_eval=25, num_warmup_sample=100, seed=None): - self.xgb_params = { - 'max_depth': 10, - 'gamma': 0.001, - 'min_child_weight': 0, - 'eta': 0.2, - # todo(lmzheng): automatically decrease learning rate when the loss is too large - - 'n_gpus': 0, - 'nthread': multiprocessing.cpu_count() // 2, - 'verbosity': 0, - 'seed': seed or 43, - 'disable_default_eval_metric': 1 - } - self.bst = None - self.plan_size = 32 - self.num_warmup_sample = num_warmup_sample - self.verbose_eval = verbose_eval - - super().__init__() - - # measurement input/result pairs - self.inputs = [] - self.results = [] - self.inputs_feature_cache = [] - - def update(self, inputs, results): - if len(inputs) <= 0: - return - - self.inputs.extend(inputs) - self.results.extend(results) - - # extract feature - n_cached = len(self.inputs_feature_cache) - features, normalized_throughputs, task_ids = \ - get_per_stmt_features_from_measure_pairs(self.inputs, self.results, - skip_first_n_feature_extraction=n_cached) - if n_cached > 0: - features = list(features) - features[:n_cached] = self.inputs_feature_cache - features = np.array(features) - self.inputs_feature_cache = features - dtrain = pack_sum_xgbmatrix(features, normalized_throughputs, - task_ids, normalized_throughputs) - - # train xgb model - self.bst = xgb.train(self.xgb_params, dtrain, - num_boost_round=10000, - obj=pack_sum_square_error, - callbacks=[custom_callback( - stopping_rounds=50, - metric='tr-p-rmse', - fevals=[ - pack_sum_rmse, pack_sum_average_peak_score(self.plan_size), - ], - evals=[(dtrain, 'tr')], - maximize=False, - verbose_eval=self.verbose_eval)]) - - def predict(self, task, states): - features = get_per_stmt_features_from_states(states, task) - if self.bst is not None and len(self.inputs) > self.num_warmup_sample: - dtest, pack_ids = pack_sum_xgbmatrix_for_prediction(features) - raw_preds = self.bst.predict(dtest) - ret = pack_sum_predict_throughput(raw_preds, pack_ids) - else: - ret = np.random.uniform(0, 1, (len(states),)) - - # Predict 0 for invalid states that failed to be lowered. - for idx, feature in enumerate(features): - if feature.min() == feature.max() == 0: - ret[idx] = float('-inf') - - return ret - - def predict_stages(self, task, states): - # Format: (s0 score, ..., sN score, s0 n_stage, s0 stage 0, ..., s1 n_stage, s1 stage 0,) - features = get_per_stmt_features_from_states(states, task) - if self.bst is not None and len(self.inputs) > self.num_warmup_sample: - dtest, pack_ids = pack_sum_xgbmatrix_for_prediction(features) - raw_preds = self.bst.predict(dtest) - breakdown = pack_sum_predict_throughput(raw_preds, pack_ids) - stage_scores = [[] for _ in range(len(states))] - for pred, pack_id in zip(raw_preds, pack_ids): - stage_scores[pack_id].append(pred) - for idx, stage_score in enumerate(stage_scores): - breakdown = np.append(breakdown, len(stage_score)) - breakdown = np.concatenate((breakdown, -np.array(stage_score))) - else: - breakdown = np.concatenate( - (np.random.uniform(0, 1, (len(states), )), np.zeros(len(states), ))) - - # Predict 0 for invalid states that failed to be lowered. 
- for idx, feature in enumerate(features): - if feature.min() == feature.max() == 0: - breakdown[idx] = float('-inf') - - return breakdown - - def load_log_file(self, file_name, n_lines=-1): - inputs, results = LogReader(file_name).read_lines(n_lines) - logger.info("XGBModel: Loaded %s lines of history log from %s", len(inputs), file_name) - self.update(inputs, results) - - def save(self, file_name: str): - self.bst.save_model(file_name) - - def load(self, file_name: str): - if self.bst is None: - self.bst = xgb.Booster(self.xgb_params) - self.bst.load_model(file_name) - self.num_warmup_sample = -1 - - -def pack_sum_xgbmatrix_for_prediction(xs): - x_flatten = [] - pack_ids = [] - - for ct, x in enumerate(xs): - for row in x: - x_flatten.append(row) - pack_ids.append(ct) - - return xgb.DMatrix(np.array(x_flatten)), pack_ids - - -def pack_sum_xgbmatrix(xs, ys, gids=None, weights=None): - if gids is not None: - # sort by group - indices = gids.argsort() - xs, ys = xs[indices], ys[indices] - group_sizes = np.bincount(gids) - if weights is not None: - weights = weights[indices] - else: - # assume it has only one group - group_sizes = [len(xs)] - - x_flatten = [] - y_flatten = [] - weights_flatten = [] - pack_ids = [] - - if weights is not None: - for ct, (x, y, w) in enumerate(zip(xs, ys, weights)): - for row in x: - x_flatten.append(row) - y_flatten.append(y) - weights_flatten.append(w) - pack_ids.append(ct) - else: - for ct, (x, y) in enumerate(zip(xs, ys)): - for row in x: - x_flatten.append(row) - y_flatten.append(y) - pack_ids.append(ct) - - ret = xgb.DMatrix(np.array(x_flatten), y_flatten) - if weights is not None: - ret.set_weight(weights_flatten) - dmatrix_context.put('pack_ids', ret, np.array(pack_ids)) - dmatrix_context.put('group_sizes', ret, group_sizes) - return ret - -LOSS_TYPE = 3 - -# Type 0 -# The model predicts cost. Use square error of throughput as loss -# loss = 1/2 * (1 / sum(x_i) - y) ^ 2 -# -# Type 1 -# The model predicts cost. Use square error of cost as loss -# loss = 1/2 * (sum(x_i) - 1 / y) ^ 2 -# -# Type 2 -# The model predicts throughput. Use square error of throughput as loss. -# loss = 1/2 * (1 / sum(1 / x_i) - y) ^ 2 -# -# Type 3 -# The model predicts throughput. Use square error of throughput as loss. -# But approximate 1 / (1 / a_1 + 1 / a_2 + ... + 1 / a_n) with -(b_1 + b_2 + b_3) -# loss = 1/2 * (-sum(x_i) - y) ^ 2 -# -# Type 4 -# The model predicts throughput. Use square error of throughput as loss. -# But approximate 1 / (1 / a_1 + 1 / a_2 + ... 
+ 1 / a_n) with -(b_1 + b_2 + b_3) -# Also add a sigmoid to force the prediction to be within the range of (0, 1) -# loss = 1/2 * (sigmoid(-sum(x_i)) - y) ^ 2 -# - -def pack_sum_predict_throughput(raw_preds, pack_ids): - if LOSS_TYPE == 0: - sum_pred = np.bincount(pack_ids, weights=raw_preds) - return 1 / sum_pred - elif LOSS_TYPE == 1: - sum_pred = np.bincount(pack_ids, weights=raw_preds) - return 1 / sum_pred - elif LOSS_TYPE == 2: - sum_inverse_preds = np.bincount(pack_ids, weights=1 / raw_preds) - return 1 / sum_inverse_preds - elif LOSS_TYPE == 3: - sum_pred = np.bincount(pack_ids, weights=raw_preds) - return - sum_pred # pylint: disable=invalid-unary-operand-type - elif LOSS_TYPE == 4: - sum_pred = np.bincount(pack_ids, weights=raw_preds) - return 1 / (1 + np.exp(sum_pred)) - else: - raise ValueError("Invalid loss type: " + LOSS_TYPE) - -def pack_sum_square_error(preds, dtrain): - pack_ids = dmatrix_context.get("pack_ids", dtrain) - weight = dtrain.get_weight() - - if LOSS_TYPE == 0: - sum_pred = np.bincount(pack_ids, weights=preds) - x = sum_pred[pack_ids] - y = dtrain.get_label() - gradient = (x * y - 1) / np.power(x, 3) - hessian = (3 - 2 * x * y) / np.power(x, 4) - elif LOSS_TYPE == 1: - sum_pred = np.bincount(pack_ids, weights=preds) - x = sum_pred[pack_ids] - y = dtrain.get_label() - gradient = x - 1 / np.minimum(y, 1e6) - hessian = np.ones_like(gradient) - elif LOSS_TYPE == 2: - sum_inverse_preds = np.bincount(pack_ids, weights=1 / preds)[pack_ids] - y = dtrain.get_label() - gradient = (1 / sum_inverse_preds - y) / (np.power(preds * sum_inverse_preds, 2)) - hessian = (2 * preds * y * np.power(sum_inverse_preds, 2) - 2 * y * sum_inverse_preds - 2 * preds * sum_inverse_preds + 3) / (np.power(preds * sum_inverse_preds, 4)) - elif LOSS_TYPE == 3: - sum_pred = np.bincount(pack_ids, weights=preds) - x = sum_pred[pack_ids] - y = dtrain.get_label() - gradient = x + y - hessian = np.ones_like(gradient) - elif LOSS_TYPE == 4: - sum_pred = np.bincount(pack_ids, weights=preds) - exp_x = np.exp(sum_pred[pack_ids]) - exp_2x = np.power(exp_x, 2) - y = dtrain.get_label() - gradient = exp_x * (exp_x * y + y - 1) / np.power(exp_x + 1, 3) - hessian = exp_x * (-exp_2x * y + 2 * exp_x + y - 1) / np.power(exp_x + 1, 4) - else: - raise ValueError("Invalid loss type: " + LOSS_TYPE) - - if len(weight) == 0: - return gradient, hessian - else: - return gradient * weight, hessian * weight - -def pack_sum_rmse(raw_preds, dtrain): - pack_ids = dmatrix_context.get("pack_ids", dtrain) - preds = pack_sum_predict_throughput(raw_preds, pack_ids)[pack_ids] - return 'p-rmse', np.sqrt(np.mean(np.square((preds - dtrain.get_label())))) - -def pack_sum_average_peak_score(N): - """Evaluate pack sum average peak score for xgb""" - - def feval(preds, labels): - group_sizes = dmatrix_context.get('group_sizes', labels, [len(preds)]) - pack_ids = dmatrix_context.get("pack_ids", labels) - - preds = pack_sum_predict_throughput(preds, pack_ids) - labels = (np.bincount(pack_ids, weights=labels.get_label()) - / np.unique(pack_ids, return_counts=True)[1]) - - scores = [] - offset = 0 - for size in group_sizes: - preds_group = preds[offset:offset + size] - labels_group = labels[offset:offset + size] - offset += size - - trials = np.argsort(preds_group)[::-1][:N] - trial_scores = labels_group[trials] - curve = max_curve(trial_scores) / np.max(labels_group) - scores.append(np.mean(curve)) - return "a-peak@%d" % N, np.mean(scores) - return feval - -def pack_sum_average_recall_score(N): - """Evaluate average recall score for xgb""" - 
- def feval(preds, labels): - group_sizes = dmatrix_context.get('group_sizes', labels, [len(preds)]) - pack_ids = dmatrix_context.get("pack_ids", labels) - - preds = pack_sum_predict_throughput(preds, pack_ids) - labels = (np.bincount(pack_ids, weights=labels.get_label()) - / np.unique(pack_ids, return_counts=True)[1]) - - scores = [] - offset = 0 - for size in group_sizes: - preds_group = preds[offset:offset + size] - labels_group = labels[offset:offset + size] - offset += size - - trials = np.argsort(preds_group)[::-1] - ranks = get_rank(labels_group[trials])[:N] - curve = recall_curve(ranks) - scores.append(np.mean(curve)) - return "a-recall@%d" % N, np.mean(scores) - return feval - - -def custom_callback(stopping_rounds, metric, fevals, evals=(), log_file=None, - maximize=False, verbose_eval=True, skip_every=2): - """Callback function for xgboost to support multiple custom evaluation functions""" - from xgboost.core import EarlyStopException - from xgboost.callback import _fmt_metric - from xgboost.training import aggcv - - state = {} - metric_shortname = metric.split("-")[1] - - def init(env): - """internal function""" - bst = env.model - - state['maximize_score'] = maximize - state['best_iteration'] = 0 - if maximize: - state['best_score'] = float('-inf') - else: - state['best_score'] = float('inf') - - if bst is not None: - if bst.attr('best_score') is not None: - state['best_score'] = float(bst.attr('best_score')) - state['best_iteration'] = int(bst.attr('best_iteration')) - state['best_msg'] = bst.attr('best_msg') - else: - bst.set_attr(best_iteration=str(state['best_iteration'])) - bst.set_attr(best_score=str(state['best_score'])) - else: - assert env.cvfolds is not None - - def callback(env): - """internal function""" - if not state: - init(env) - - bst = env.model - i = env.iteration - cvfolds = env.cvfolds - - res_dict = {} - - if i % skip_every == 1: - return - - ##### evaluation ##### - if cvfolds is not None: - for feval in fevals: - tmp = aggcv([f.eval(i, feval) for f in cvfolds]) - for k, mean, std in tmp: - res_dict[k] = [mean, std] - else: - for feval in fevals: - bst_eval = bst.eval_set(evals, i, feval) - res = [x.split(':') for x in bst_eval.split()] - for kv in res[1:]: - res_dict[kv[0]] = [float(kv[1])] - - eval_res = [] - keys = list(res_dict.keys()) - keys.sort(key=lambda x: x if metric_shortname not in x else "a" + x) - for key in keys: - v = res_dict[key] - eval_res.append([key] + v) - - ##### print eval result ##### - if not isinstance(verbose_eval, bool) and verbose_eval and i % verbose_eval == 0: - infos = ["XGB iter: %3d" % i] - for item in eval_res: - if 'null' in item[0]: - continue - infos.append("%s: %.6f" % (item[0], item[1])) - - logger.debug("\t".join(infos)) - if log_file: - with open(log_file, "a") as fout: - fout.write("\t".join(infos) + '\n') - - ##### choose score and do early stopping ##### - score = None - for item in eval_res: - if item[0] == metric: - score = item[1] - break - assert score is not None - - best_score = state['best_score'] - best_iteration = state['best_iteration'] - maximize_score = state['maximize_score'] - if (maximize_score and score > best_score) or \ - (not maximize_score and score < best_score): - msg = '[%d] %s' % ( - env.iteration, - '\t'.join([_fmt_metric(x) for x in eval_res])) - state['best_msg'] = msg - state['best_score'] = score - state['best_iteration'] = env.iteration - # save the property to attributes, so they will occur in checkpoint. 
- if env.model is not None: - env.model.set_attr(best_score=str(state['best_score']), - best_iteration=str(state['best_iteration']), - best_msg=state['best_msg']) - elif env.iteration - best_iteration >= stopping_rounds: - best_msg = state['best_msg'] - if verbose_eval and env.rank == 0: - logger.debug("XGB stopped. Best iteration: %s ", best_msg) - raise EarlyStopException(best_iteration) - - return callback diff --git a/python/tvm/ansor/dispatcher.py b/python/tvm/ansor/dispatcher.py deleted file mode 100644 index 3a5dc4e9e206..000000000000 --- a/python/tvm/ansor/dispatcher.py +++ /dev/null @@ -1,299 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -The global context that dispatches best configurations to workloads -""" -# pylint: disable=invalid-name - -from __future__ import absolute_import as _abs - -import logging - -import numpy as np - -from tvm.tir.expr import FloatImm - -logger = logging.getLogger('auto_scheduler') - - -class DispatchContext(object): - """ - Base class of dispatch context. - """ - current = None - - def __init__(self): - self._old_ctx = DispatchContext.current - - def query(self, target, workload): - """ - Query the context to get the specific config for a workload. - If cannot find the result inside this context, this function will query it - from the upper contexts. - - Parameters - ---------- - target: Target - The current target - workload : str - The current workload - - Returns - ------- - cfg : State - The schedule configuration for the workload - """ - ret = self._query_inside(target, workload) - return ret - - def update(self, target, workload, cfg): - """ - Update the config for a workload - - Parameters - ---------- - target: Target - The current target - workload : Workload - The current workload. - cfg : State - The schedule configuration for the workload - """ - raise NotImplementedError() - - def _query_inside(self, target, workload): - """ - Query the context to get the specific config for a workload. - This function only query config inside this context. - - Parameters - ---------- - target: Target - The current target - workload : Workload - The current workload. - - Returns - ------- - cfg : State or str - The schedule configuration for the workload - """ - raise NotImplementedError() - - def __enter__(self): - self._old_ctx = DispatchContext.current - DispatchContext.current = self - return self - - def __exit__(self, ptype, value, trace): - DispatchContext.current = self._old_ctx - - -class ApplyConfig(DispatchContext): - """Apply a deterministic config for all queries. 
- - Parameters - ---------- - config : State - The schedule configuration - """ - def __init__(self, config): - super(ApplyConfig, self).__init__() - self._config = config - self.workload = None - - def _query_inside(self, target, workload): - """Override query""" - self.workload = workload - return self._config - - def update(self, target, workload, cfg): - """Override update""" - self.workload = workload - self._config = cfg - - -class ApplyHistoryBest(DispatchContext): - """ - Apply the history best config - - Parameters - ---------- - records : str or iterator of (MeasureInput, MeasureResult) - Collection of tuning records. - If is str, then it should be the filename of a records log file. - Each row of this file is an encoded record pair. - Otherwise, it is an iterator. - n_lines: int (optional) - if it is not None, only load the first `n_lines` lines of log - """ - def __init__(self, records, n_lines=None): - super(ApplyHistoryBest, self).__init__() - - self.best_by_targetkey = {} - self.best_by_model = {} - self._best_user_defined = {} - - if records: - self.load(records, n_lines) - - def load(self, records, n_lines=None): - """Load records to this dispatch context - - Parameters - ---------- - records : str or iterator of (MeasureInput, MeasureResult) - Collection of tuning records. - If is str, then it should be the filename of a records log file. - Each row of this file is an encoded record pair. - Otherwise, it is an iterator. - n_lines: int (optional) - if it is not None, only load the first `n_lines` lines of log - """ - from pathlib import Path - from . import load_from_file - - if isinstance(records, Path): - records = str(records) - - if isinstance(records, str): - records = load_from_file(records) - if not records: - return - - best_by_targetkey = self.best_by_targetkey - best_by_model = self.best_by_model - - counter = 0 - for inp, res in records: - if n_lines is not None and counter >= n_lines: - break - counter += 1 - if res.error_no != 0: - continue - - # use target keys in tvm target system as key to build best map - for k in inp.task.target.keys: - key = (k, inp.task.workload_key) - if key not in best_by_targetkey: - best_by_targetkey[key] = (inp, res) - else: - _, other_res = best_by_targetkey[key] - other_costs = [x.value for x in other_res.costs if isinstance(x, FloatImm)] - costs = [x.value for x in res.costs if isinstance(x, FloatImm)] - if np.mean(other_costs) > np.mean(costs): - best_by_targetkey[key] = (inp, res) - - # use model as key to build best map - key = (inp.task.target.model, inp.task.workload_key) - if key not in best_by_model: - if inp.task.target.model != 'unknown': - best_by_model[key] = (inp, res) - else: - _, other_res = best_by_model[key] - other_costs = [x.value for x in other_res.costs if isinstance(x, FloatImm)] - costs = [x.value for x in res.costs if isinstance(x, FloatImm)] - if np.mean(other_costs) > np.mean(costs): - best_by_model[key] = (inp, res) - - logger.debug("Finish loading %d records", counter) - - def _query_inside(self, target, workload): - if target is None: - raise RuntimeError("Need a target context to find the history best. " - "Hint: If your target is llvm, use `with tvm.target.create('llvm'):`" - " above the dispatcher call. So does other target. 
") - - # first try matching by model - key = (target.model, workload) - if key in self._best_user_defined: - return self._best_user_defined[key] - if key in self.best_by_model: - return self.best_by_model[key][0].state - - # then try matching by target key - for k in target.keys: - key = (k, workload) - if key in self._best_user_defined: - return self._best_user_defined[key] - if key in self.best_by_targetkey: - return self.best_by_targetkey[key][0].state - - return None - - def update(self, target, workload, state): - model = target.model - key = (model, workload) - self._best_user_defined[key] = state - - for k in target.keys: - key = (k, workload) - self._best_user_defined[key] = state - - -class FallbackContext(DispatchContext): - """ - A fallback dispatch context. - This is used as the root context. - """ - - def __init__(self): - super(FallbackContext, self).__init__() - self.memory = {} - self.silent = False - - # a set to prevent print duplicated message - self.messages = set() - - def _query_inside(self, target, workload): - key = (str(target), workload) - if key in self.memory: - return self.memory[key] - - if not self.silent: - msg = "Cannot find config for target=%s, workload=%s. A fallback configuration "\ - "is used, which may bring great performance regression." % (target, workload) - if msg not in self.messages: - self.messages.add(msg) - logger.warning(msg) - cfg = None - - # cache this config to avoid duplicated warning message - self.memory[key] = cfg - return cfg - - def clear_cache(self, target, workload): - """Clear fallback cache. Pass the same argument as _query_inside to this function - to clean the cache. - - Parameters - ---------- - target: Target - The current target - workload : Workload - The current workload. - """ - key = (str(target), workload) - if key in self.memory: - del self.memory[key] - - def update(self, target, workload, cfg): - key = (str(target), workload) - self.memory[key] = cfg - - -DispatchContext.current = FallbackContext() diff --git a/python/tvm/ansor/env.py b/python/tvm/ansor/env.py deleted file mode 100644 index 56e76e26ee4f..000000000000 --- a/python/tvm/ansor/env.py +++ /dev/null @@ -1,25 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" The scope to store global variables in ansor """ - - -class AutoschedulerGlobalScope(object): - def __init__(self): - self.topi_in_compute_rewrite_mode = False - -GLOBAL_SCOPE = AutoschedulerGlobalScope() diff --git a/python/tvm/ansor/feature.py b/python/tvm/ansor/feature.py deleted file mode 100644 index fa1b2cb07dcc..000000000000 --- a/python/tvm/ansor/feature.py +++ /dev/null @@ -1,150 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""" -Python API for Feature extraction. -""" - -from typing import List, Tuple -import struct -import numpy as np - -from .loop_state import State, StateObject -from .measure import MeasureInput, MeasureResult -from . import _ffi_api - - -# Maximum number of buffers for one statement to extract feature for -DEFAULT_MAX_N_BUFS = 5 - -# The length of the feature vector -DEFAULT_FEATURE_VEC_LEN = 164 - - -def unpack_feature(byte_arr: bytearray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: - """Unpack the encoded feature (in byte array format) of from c++""" - size_of_int = 4 - size_of_float = 4 - - # The format for n records is: - # { - # int n; - # int[n+2] sizes - - # float[sizes[0]] feature for record 1 - # float[sizes[1]] feature for record 2 - # ... feature for record i... - # float[sizes[n-1]] feature for record n - - # float[sizes[n]] normalized throughput for n records - # int[sizes[n+1]] task id for n records - # } - - vec_len = DEFAULT_FEATURE_VEC_LEN - - # unpack sizes - offset = 0 - n = struct.unpack_from("1i", byte_arr, offset=offset)[0] - offset += size_of_int - - sizes = struct.unpack_from("%di" % (n+2), byte_arr, offset=offset) - offset += size_of_int * (n+2) - - # unpack features - features = [] - for size in sizes[:-2]: - row = [] - - # Now we need to unpack the feature for multiple statements. - # The format is: - # { - # int n_stmts - # float[n_stmt][vec_len] feature_vecs - # } - # where vec_len can be calculated by `(size - 1) / n_stmts` - - if size == 0: - # failed during lowering - features.append(np.zeros((1, vec_len))) - else: - n_stmts = struct.unpack_from("f", byte_arr, offset=offset) - offset += size_of_float - - n_stmts = int(n_stmts[0] + 0.5) - tmp_vec_len = (size - 1) // n_stmts - assert tmp_vec_len == vec_len, "The lenght of feature vector is wrong. " \ - "Expected %d but got %d." 
% (vec_len, tmp_vec_len) - assert (size - 1) % n_stmts == 0 - for _ in range(n_stmts): - x = struct.unpack_from("%df" % vec_len, byte_arr, offset=offset) - offset += vec_len * size_of_float - row.append(x) - - features.append(np.array(row)) - - # unpack normalized_throughputs - m = sizes[-2] - normalized_throughputs = struct.unpack_from("%df" % m, byte_arr, offset=offset) - offset += m * size_of_int - - # unpack task_ids - m = sizes[-1] - task_ids = struct.unpack_from("%di" % m, byte_arr, offset=offset) - offset += m * size_of_int - - assert offset == len(byte_arr), "%d vs %d" % (offset, len(byte_arr)) - return np.array(features), np.array(normalized_throughputs), np.array(task_ids) - - -def get_per_stmt_features_from_file(filename: str, - n_lines: int, - max_n_bufs: int = None) \ - -> Tuple[np.ndarray, np.ndarray, np.ndarray]: - """Get per_stmt features from a log file""" - byte_arr = _ffi_api.GetPerStmtFeaturesFromFile( - filename, n_lines, max_n_bufs or DEFAULT_MAX_N_BUFS) - return unpack_feature(byte_arr) - - -def get_per_stmt_features_from_measure_pairs(inputs: List[MeasureInput], - results: List[MeasureResult], - skip_first_n_feature_extraction: int = 0, - max_n_bufs: int = None) \ - -> Tuple[np.ndarray, np.ndarray, np.ndarray]: - """Get per_stmt features from measurement pairs""" - byte_arr = _ffi_api.GetPerStmtFeaturesFromMeasurePairs( - inputs, results, skip_first_n_feature_extraction, max_n_bufs or DEFAULT_MAX_N_BUFS) - return unpack_feature(byte_arr) - - -def get_per_stmt_features_from_states(states, - task: "SearchTask", - max_n_bufs: int = None) -> List[np.ndarray]: - """Get per_stmt features from states""" - if isinstance(states[0], State): - state_objects = [s.state_object for s in states] - elif isinstance(states[0], StateObject): - state_objects = states - byte_arr = _ffi_api.GetPerStmtFeaturesFromStates( - state_objects, task, max_n_bufs or DEFAULT_MAX_N_BUFS) - return unpack_feature(byte_arr)[0] - - -def get_per_stmt_feature_names(max_n_bufs: int = None) -> List[str]: - """Get names for the elements in the flatten feature vector""" - return [x for x in - _ffi_api.GetPerStmtFeatureNames(max_n_bufs or DEFAULT_MAX_N_BUFS)] diff --git a/python/tvm/ansor/loop_state.py b/python/tvm/ansor/loop_state.py index 7aa5de0e9c1d..470ae40f5278 100644 --- a/python/tvm/ansor/loop_state.py +++ b/python/tvm/ansor/loop_state.py @@ -152,345 +152,6 @@ def split(self, stage_id, iterator, lengths, inner_to_outer=True): self._clear_cache() return res - def follow_split(self, stage_id, iterator, src_step_id, n_split): - """ - Parameters - ---------- - stage_id : Union[int, Operation, Tensor] - The index of the stage to split - iterator : Iterator - The iterator to split - src_step_id : int - The index of the split step to follow in the history - n_split : int - The number of split level - - Returns - ------- - res_its : List[Iterator] - The splitted new Iterators - """ - stage_id = self._resolve_stage_id(stage_id) - - self.state_object, res = _ffi_api.StateFollowSplit(self.state_object, stage_id, iterator, - src_step_id, n_split) - self._clear_cache() - return res - - def follow_fused_split(self, stage_id, iterator, src_step_ids, level, - factor_or_nparts): - """ - Parameters - ---------- - stage_id : Union[int, Operation, Tensor] - The index of the stage to split - iterator : Iterator - The iterator to split - src_step_ids : List[int] - The indices of the split steps to follow in the history - level : int - Use the length in this split level - factor_or_nparts : bool - True to use `factor` for 
split from inner to outer, - False to use `nparts` for split from outer to inner - - Returns - ------- - res_its : List[Iterator] - The splitted new Iterators - """ - stage_id = self._resolve_stage_id(stage_id) - - self.state_object, res = _ffi_api.StateFollowFusedSplit(self.state_object, stage_id, - iterator, src_step_ids, level, - factor_or_nparts) - self._clear_cache() - return res - - def fuse(self, stage_id, iters): - """ - Parameters - ---------- - stage_id : Union[int, Operation, Tensor] - The index of the stage to fuse - iters : List[Iterator] - The iterators to be fused - - Returns - ------- - res_it : Iterator - The fused Iterator - """ - stage_id = self._resolve_stage_id(stage_id) - - self.state_object, res = _ffi_api.StateFuse(self.state_object, stage_id, iters) - self._clear_cache() - return res - - def vectorize(self, stage_id, iterator): - """ - Parameters - ---------- - stage_id : Union[int, Operation, Tensor] - The index of the stage to vectorize - iterator : Iterator - The iterator to be vectorized - - Returns - ------- - res_it : Iterator - The vectorized Iterator - """ - stage_id = self._resolve_stage_id(stage_id) - - self.state_object, res = _ffi_api.StateVectorize(self.state_object, stage_id, iterator) - self._clear_cache() - return res - - def parallel(self, stage_id, iterator): - """ - Parameters - ---------- - stage_id : Union[int, Operation, Tensor] - The index of the stage to parallel - iterator : Iterator - The iterator to be parallelized - - Returns - ------- - res_it : Iterator - The parallelized Iterator - """ - stage_id = self._resolve_stage_id(stage_id) - - self.state_object, res = _ffi_api.StateParallel(self.state_object, stage_id, iterator) - self._clear_cache() - return res - - def unroll(self, stage_id, iterator, max_unroll=-1): - """ - Parameters - ---------- - stage_id : Union[int, Operation, Tensor] - The index of the stage to unroll - iterator : Iterator - The iterator to be unrolled - max_unroll: int - The maximum length of the iterator that can be unrolled - - Returns - ------- - res_it : Iterator - The unrolled Iterator - """ - stage_id = self._resolve_stage_id(stage_id) - - self.state_object, res = _ffi_api.StateUnroll(self.state_object, stage_id, iterator, - max_unroll) - self._clear_cache() - return res - - def bind_thread(self, stage_id, iterator, thread_name): - """ - Parameters - ---------- - stage_id : Union[int, Operation, Tensor] - The index of the stage to bind - iterator : Iterator - The iterator to be bound - thread_name : str - The name of the thread (e.g. 
"blockIdx.x", "threadIdx.y", "vthread") - - Returns - ------- - res_it : Iterator - The bound Iterator - """ - trans_table = { - "vthread": 4, - "blockIdx.x": 5, - "threadIdx.x": 6, - "blockIdx.y": 7, - "threadIdx.y": 8, - } - thread_id = trans_table[thread_name] - - stage_id = self._resolve_stage_id(stage_id) - - self.state_object, res = _ffi_api.StateBindThread(self.state_object, stage_id, iterator, - thread_id) - self._clear_cache() - return res - - def compute_at(self, stage_id, target_stage_id, target_iter): - """ - Parameters - ---------- - stage_id : Union[int, Operation, Tensor] - The index of source stage - target_stage_id : Union[int, Operation, Tensor] - The index of the target stage of compute_at - target_iter : Iterator - The target Iterator of compute_at - """ - stage_id = self._resolve_stage_id(stage_id) - target_stage_id = self._resolve_stage_id(target_stage_id) - - self.state_object = _ffi_api.StateComputeAt(self.state_object, stage_id, - target_stage_id, target_iter) - self._clear_cache() - - def compute_root(self, stage_id): - """ - Parameters - ---------- - stage_id : Union[int, Operation, Tensor] - The index of the stage to compute root - """ - stage_id = self._resolve_stage_id(stage_id) - - self.state_object = _ffi_api.StateComputeRoot(self.state_object, stage_id) - self._clear_cache() - - def compute_inline(self, stage_id): - """ - Parameters - ---------- - stage_id : Union[int, Operation, Tensor] - The index of the stage to compute inline - """ - stage_id = self._resolve_stage_id(stage_id) - - self.state_object = _ffi_api.StateComputeInline(self.state_object, stage_id) - self._clear_cache() - - def cache_read(self, stage_id, scope_name, reader_stage_ids): - """ - Parameters - ---------- - stage_id : Union[int, Operation, Tensor] - The index of the stage to do cache_read - scope_name : str - reader_stage_ids : List[int] - - Returns - ------- - new_stage_id : int - The added staged id - """ - stage_id = self._resolve_stage_id(stage_id) - - if isinstance(reader_stage_ids, list): - tmp_list = [] - for reader_stage_id in reader_stage_ids: - tmp_list.append(self._resolve_stage_id(reader_stage_id)) - reader_stage_ids = tmp_list - else: - raise ValueError("reader_stage_ids must be list of Tensor or int") - - self.state_object, new_stage_id = _ffi_api.StateCacheRead(self.state_object, stage_id, - scope_name, reader_stage_ids, - self.compute_dag) - return self._insert_new_stage(new_stage_id) - - def cache_write(self, stage_id, scope_name): - """ - Parameters - ---------- - stage_id : Union[int, Operation, Tensor] - The index of the stage to do cache read - scope_name : str - - Returns - ------- - new_stage_id : int - The added staged id - """ - stage_id = self._resolve_stage_id(stage_id) - - self.state_object, new_stage_id = _ffi_api.StateCacheWrite(self.state_object, stage_id, - scope_name, self.compute_dag) - return self._insert_new_stage(new_stage_id) - - def pragma(self, stage_id, iterator, pragma_type): - """ - Parameters - ---------- - stage_id : Union[int, Operation, Tensor] - The index of the stage to add pragma - iterator : Iterator - The iterator to add pragma - pragma_type : str - """ - stage_id = self._resolve_stage_id(stage_id) - - self.state_object = _ffi_api.StatePragma(self.state_object, stage_id, iterator, - pragma_type) - self._clear_cache() - - def rfactor(self, stage_id, iterator, factor_iter_id): - """ - Parameters - ---------- - stage_id : Union[int, Operation, Tensor] - The index of the stage to do reduction factor - iterator : Iterator - factor_iter_id 
: int - - Returns - ------- - new_stage_id : int - The added staged id - """ - stage_id = self._resolve_stage_id(stage_id) - - self.state_object, new_stage_id = _ffi_api.StateRfactor(self.state_object, stage_id, - iterator, factor_iter_id, - self.compute_dag) - return self._insert_new_stage(new_stage_id) - - def storage_align(self, stage_id, iterator, factor, offset): - """ - Parameters - ---------- - stage_id : Union[int, Operation, Tensor] - The index of the stage to do storage align - iterator : Iterator - factor : int - offset : int - """ - stage_id = self._resolve_stage_id(stage_id) - - self.state_object = _ffi_api.StateStorageAlign(self.state_object, stage_id, iterator, - factor, offset) - self._clear_cache() - - def tensorize(self, stage_id, iterator, ti_func_name): - """ The `ti_func_name` corresponds to a global registered funcion - that returns a Tensorintrin - - Parameters - ---------- - stage_id : Union[int, Operation, Tensor] - The index of the stage to do storage align - iterator : Iterator - The iterator to be tensorized - ti_func_name : str - Tensorize intrinsic function name - - Returns - ------- - res_it : Iterator - The tensorized Iterator - """ - stage_id = self._resolve_stage_id(stage_id) - - self.state_object, res = _ffi_api.StateTensorize(self.state_object, - stage_id, iterator, - ti_func_name) - self._clear_cache() - return res - def _resolve_stage_id(self, stage_id): if isinstance(stage_id, Operation): return self.stage_id_map[stage_id] diff --git a/python/tvm/ansor/measure.py b/python/tvm/ansor/measure.py index be7d69e5ed3a..46c3e3aabd5d 100644 --- a/python/tvm/ansor/measure.py +++ b/python/tvm/ansor/measure.py @@ -42,7 +42,6 @@ from . import _ffi_api from .utils import get_const_tuple, NoDaemonPool, call_func_with_timeout, request_remote, \ check_remote -from .compute_dag import LayoutRewriteLevel LOGGER = logging.getLogger('ansor') @@ -331,7 +330,7 @@ def timed_func(): try: sch, args = task.compute_dag.apply_steps_from_state( - inp.state, LayoutRewriteLevel.BOTH_REWRITE) + inp.state) except Exception: error_no = MeasureErrorNo.INSTANTIATION_ERROR error_msg = make_error_msg() diff --git a/python/tvm/ansor/relay_integration.py b/python/tvm/ansor/relay_integration.py deleted file mode 100644 index f2873f8c72fd..000000000000 --- a/python/tvm/ansor/relay_integration.py +++ /dev/null @@ -1,241 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=unused-variable,invalid-name - -""" -Integrate ansor into relay. It implements the following items: -1. Extract search tasks from a relay program -2. 
Provide auto-scheduling for all TOPI compute functions -""" -import os -import json -import threading - -import tvm -from tvm import te, transform -from tvm.te.tensor import PlaceholderOp, ComputeOp -from .dispatcher import DispatchContext -from .workload_registry import register_workload_bufs, compute_dag_hash -from .compute_dag import ComputeDAG, LayoutRewriteLevel -from .env import GLOBAL_SCOPE - -def call_all_topi_funcs(mod, target, params, target_host=None): - """Call all TOPI compute + schedule to extract tasks in a relay program""" - # pylint: disable=import-outside-toplevel - from tvm import relay - - with transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}): - bld_mod = relay.build_module.BuildModule() - bld_mod.call_all_topi_funcs(mod, target=target, params=params, target_host=target_host) - -def extract_from_program(mod, params, target, target_host=None): - """ Extract tuning tasks from a relay program. - - This function is the single program version of extract_from_multiple_program. - - Parameters - ---------- - mod : relay.Module - The module to extract. - params: dict of str to numpy array - The associated parameters of the program - ops: List of relay op - List of relay ops to be tuned - target: tvm.target.Target - The compilation target - target_host: tvm.target.Target - The host compilation target - - Returns - ------- - workloads: Array of Tuple(wkl_key, target) - """ - return extract_from_multiple_program([mod], [params], target, target_host) - -def extract_from_multiple_program(mods, params, target, target_host=None): - """ Extract tuning tasks from multiple relay programs. - - Parameters - ---------- - mods : List of relay.Module - The modules to extract. - params: List of dict of str to numpy array - The associated parameters of the programs - ops: List of relay op - List of relay ops to be tuned - target: tvm.target.Target - The compilation target - target_host: tvm.target.Target - The host compilation target - - Returns - ------- - workloads: Array of Tuple(wkl_key, target) - """ - # pylint: disable=import-outside-toplevel - from tvm import relay - - env = TracingEnvironment(TracingMode.EXTRACT_TASK) - with env: - # run compiler to collect all TOPI calls during compilation - for mod, param in zip(mods, params): - # wrap build call in a new thread to avoid the conflict - # between python's multiprocessing and tvm's thread pool - build_thread = threading.Thread(target=call_all_topi_funcs, - args=(mod, target, param, target_host)) - build_thread.start() - build_thread.join() - relay.backend.compile_engine.get().clear() - - # create tasks for target - wkl_keys = [] - wkl_weights = [] - for wkl_key, wkl_weight in env.wkl_key_collection.items(): - wkl_keys.append(wkl_key) - wkl_weights.append(wkl_weight) - - return wkl_keys, wkl_weights - - -def prepare_layout_rewrite(mod, params, target): - """ - Prepare for kernel layout rewrite. This function will write layout infos to a global static - variable. - Then these layout info will be used by a relay pass `kernel_layout_transform`. 
- """ - # pylint: disable=import-outside-toplevel - from tvm import relay - - env = TracingEnvironment(TracingMode.PREPARE_LAYOUT_REWRITE) - with env: - # wrap build call in a new thread to avoid the conflict - # between python's multiprocessing and tvm's thread pool - build_thread = threading.Thread(target=call_all_topi_funcs, - args=(mod, target, params)) - build_thread.start() - build_thread.join() - relay.backend.compile_engine.get().clear() - - if env.layout_rewrite_success_ct > 0: - GLOBAL_SCOPE.topi_in_compute_rewrite_mode = True - -def finish_layout_rewrite(): - """Clear the global flag for layout rewrite""" - GLOBAL_SCOPE.topi_in_compute_rewrite_mode = False - - -class TracingMode: - """Two modes for tracing""" - EXTRACT_TASK = 0 # trace all topi calls to extract tasks - PREPARE_LAYOUT_REWRITE = 1 # trace all topi calls to prepare layout rewrite - -class TracingEnvironment: - """Global environment for tracing all topi function calls""" - current = None - - def __init__(self, tracing_mode): - self.tracing_mode = tracing_mode - self.relay_disable_build_cache = "false" - self.layout_rewrite_success_ct = 0 - self.wkl_key_collection = {} - - def __enter__(self): - self.relay_disable_build_cache = os.environ.get("TVM_RELAY_DISABLE_BUILD_CACHE", "false") - os.environ["TVM_RELAY_DISABLE_BUILD_CACHE"] = "true" - TracingEnvironment.current = self - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - os.environ["TVM_RELAY_DISABLE_BUILD_CACHE"] = self.relay_disable_build_cache - TracingEnvironment.current = None - - def add_workload_key(self, key): - """Add the workload key of an Ansor search task - - Parameters - ---------- - key: str - """ - if key in self.wkl_key_collection: - self.wkl_key_collection[key] += 1 - else: - self.wkl_key_collection[key] = 1 - - -def traverse_to_get_io_tensors(outs): - """Traverse from a list of output tensors to get a whole computational DAG""" - layout_free_ops = [] - inputs = [] - - visited = set() - - def traverse(t): - if t in visited: - return - if isinstance(t.op, PlaceholderOp): - inputs.append(t) - elif isinstance(t.op, ComputeOp): - if "layout_free_placeholders" in t.op.attrs: - layout_free_ops.append(t.op) - for x in t.op.input_tensors: - traverse(x) - visited.add(t) - - for t in outs: - traverse(t) - - has_layout_free = (len(layout_free_ops) > 0) - return inputs + [t for t in outs], has_layout_free - - -def auto_schedule_topi(outs): - """ Use ansor to auto-schedule a topi compute declaration """ - io_tensors, has_layout_free = traverse_to_get_io_tensors(outs) - key = register_workload_bufs(io_tensors) - - env = TracingEnvironment.current - if env is None: # in the final build mode - state = DispatchContext.current.query(tvm.target.Target.current(), key) - if state is None: - return te.create_schedule([x.op for x in outs]) - - dag = ComputeDAG(io_tensors) - # Only update compute body, layout_rewrite_level = LayoutRewriteLevel.COMPUTE_REWRITE, - # Since kernel layout has already been rewritten in relay pass - schedule, _ = dag.apply_steps_from_state( - state, layout_rewrite_level=LayoutRewriteLevel.COMPUTE_REWRITE) - return schedule - if env.tracing_mode == TracingMode.EXTRACT_TASK: # in the task extraction mode - env.add_workload_key(key) - return te.create_schedule([x.op for x in outs]) - if env.tracing_mode == TracingMode.PREPARE_LAYOUT_REWRITE: - # in prepare_layout_rewrite mode - if has_layout_free: - # Rewrite the DAG and update the transform history for - # the new dag in DispatchContext - dispatch_ctx = DispatchContext.current - 
tgt = tvm.target.Target.current() - state = dispatch_ctx.query(tgt, key) - assert state is not None - dag = ComputeDAG(outs) - new_dag = dag.rewrite_layout_from_state(state) - new_key = json.dumps((compute_dag_hash(new_dag),)) - dispatch_ctx.update(tgt, new_key, state) - if new_key != key: - env.layout_rewrite_success_ct += 1 - return te.create_schedule([x.op for x in outs]) - raise ValueError("Invalid tracing mode: " + env.tracing_mode) diff --git a/python/tvm/ansor/task_scheduler.py b/python/tvm/ansor/task_scheduler.py deleted file mode 100644 index 5b916ed39769..000000000000 --- a/python/tvm/ansor/task_scheduler.py +++ /dev/null @@ -1,299 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""TaskScheduler that allocates the time resources when tuning multiple tasks together""" -from typing import List, Union, Callable -import time - -import numpy as np - -from .auto_schedule import SearchTask, SearchPolicy, SketchSearchPolicy, TuneOption -from .cost_model import RandomModel, XGBModel -from .measure import ProgramMeasurer -from .utils import array_mean, to_str_round - - -class TaskScheduler: - """Allocate the time resources when tuning multiple tasks together""" - def __init__(self, - tasks: List[SearchTask], - objective_func: Callable = None): - self.tasks = tasks - self.objective_func = objective_func or sum - - def compute_score(self, costs: List[float]) -> float: - return self.objective_func(costs) - - -def get_search_policies(search_policy: Union[str, List[SearchPolicy]], tasks: List[SearchTask], - num_measure_per_iter, load_model_file=None, load_log_file=None): - """ ... 
- """ - if search_policy == 'default': - search_policy = 'sketch.xgb' - - if isinstance(search_policy, str): - policy_type, model_type = search_policy.split('.') - if model_type == 'xgb': - cost_model = XGBModel(num_warmup_sample=len(tasks) * num_measure_per_iter) - if load_model_file: - print("Load pretrained model...") - cost_model.load(load_model_file) - elif load_log_file: - cost_model.load_log_file(load_log_file) - elif model_type == 'random': - cost_model = RandomModel() - else: - raise ValueError("Invalid search policy: " + search_policy) - - if policy_type == 'sketch': - search_policies = [SketchSearchPolicy(cost_model) for _ in range(len(tasks))] - elif policy_type == 'limit-space': - search_policies = [SketchSearchPolicy(cost_model, - params={'cpu_multi_level_tiling_structure': 'SRS', - 'disable_change_compute_location': 1}) - for _ in range(len(tasks))] - elif policy_type == 'beam-search': - search_policies = [SketchSearchPolicy(cost_model, - params={'use_beam_search': 1}) - for _ in range(len(tasks))] - else: - raise ValueError("Invalid search policy: " + search_policy) - else: - # check type - assert isinstance(search_policy, (tuple, list)) - for item in search_policy: - assert isinstance(item, SearchPolicy) - search_policies = search_policy - - return search_policies - - -class SimpleTaskScheduler(TaskScheduler): - """The default task scheduler with several strategies - - Parameters - ---------- - tasks: List[SearchTask] - All workloads to tune - weights: List[float] - Weights of tasks (i.e. the number of occurrence of a task in the whole network) - strategy: str - The joint tuning strategy. - "sequential" : Tune tasks sequentially. Divide n_trials equally to every task. - "round-robin": Tune tasks in round robin order. - "gradient" : Tune tasks with gradient descent. - load_log_file: str - Load history log file to pre-train cost model - eps-random: float - Always allocate this percent of n_trials to select tasks randomly. - This is for encouraging exploration. - verbose: int - The level of verbosity. 0 means silent. - alpha: float - The parameter used for 'gradient' strategy - beta: float - The parameter used for 'gradient' strategy - backward_window_size: int - The parameter used for 'gradient' strategy - """ - def __init__(self, - tasks: List[SearchTask], - objective_func: Callable = None, - strategy: str = 'gradient', - load_log_file: str = None, - load_model_file: str = None, - eps_random: float = 0.05, - verbose: int = 1, - alpha: float = 0.2, - beta: float = 2, - gamma: float = 0.5, - backward_window_size: int = 3, - use_debug_measurement_simulator=None): - super().__init__(tasks, objective_func) - self.strategy = strategy - self.eps_random = eps_random - self.verbose = verbose - self.load_log_file = load_log_file - self.load_model_file = load_model_file - self.alpha = alpha - self.beta = beta - self.gamma = gamma - self.backward_window_size = backward_window_size - self.use_debug_measurement_simulator = use_debug_measurement_simulator - - assert self.strategy in ['round-robin', 'gradient'] - - self.task_cts = [] - self.task_costs_history = [] - self.best_costs = self.cur_score = None - self.tune_option = self.measurer = self.search_policies = self.ct = self.tic = None - self.num_measure_per_iter = None - self.dead_tasks = set() - self.sequential_now_task_idx = 0 - self.sequential_now_task_begin_ct = 0 - - def tune(self, tune_option: TuneOption, - search_policy: Union[str, List[SearchPolicy]] = 'default'): - """ Tune tasks. 
- - Notice: This method does not have return value, make sure to set `LogToFile` - measure callback in `tune_option`. - - Parameters - ---------- - tune_option: TuneOption - search_policy: Str or List[SearchPolicy] - """ - # init members - self.task_cts = [0 for _ in range(len(self.tasks))] - self.task_costs_history = [[] for _ in range(len(self.tasks))] - self.best_costs = 1e10 * np.ones(len(self.tasks)) - self.cur_score = self.compute_score(self.best_costs) - self.tune_option = tune_option - if self.use_debug_measurement_simulator is None: - self.measurer = ProgramMeasurer(tune_option.builder, tune_option.runner, - tune_option.measure_callbacks, tune_option.verbose) - self.ct = 0 - self.tic = time.time() - # reset num_measure_per_iter to make sure every task is tuned at least once - self.num_measure_per_iter = min(tune_option.num_measure_per_iter, - tune_option.n_trials // len(self.tasks)) - self.search_policies = get_search_policies(search_policy, self.tasks, - self.num_measure_per_iter, - self.load_model_file, - self.load_log_file) - self.dead_tasks = set() - self.sequential_now_task_idx = 0 - self.sequential_now_task_begin_ct = 0 - - for i in range(len(self.tasks)): - search_policy = self.search_policies[i] - task = self.tasks[i] - search_policy.set_task(task) - search_policy.set_verbose(tune_option.verbose) - search_policy.run_callbacks(tune_option.pre_search_callbacks) - - # do a round robin first - if self.strategy != 'sequential': - for i in range(len(self.tasks)): - self.tune_task(i) - - # use the specific strategy to choose workload to tune - task_idx = -1 - while self.ct < tune_option.n_trials and len(self.dead_tasks) < len(self.tasks): - if self.strategy == 'sequential': - allocated_total_ct = ((tune_option.n_trials - self.sequential_now_task_begin_ct) - / (len(self.tasks) - self.sequential_now_task_idx)) - used_ct = self.ct - self.sequential_now_task_begin_ct - - if self.sequential_now_task_idx in self.dead_tasks or used_ct >= allocated_total_ct: - self.sequential_now_task_idx += 1 - self.sequential_now_task_begin_ct = self.ct - task_idx = self.sequential_now_task_idx - if task_idx >= len(self.tasks): - break - elif self.strategy == 'round-robin': - task_idx = (task_idx + 1) % len(self.tasks) - while task_idx in self.dead_tasks: - task_idx = (task_idx + 1) % len(self.tasks) - elif self.strategy == 'gradient': - gradients = [] - for i in range(len(self.tasks)): - if i in self.dead_tasks: - gradients.append(0) - continue - - # compute gradient from chain rule : (delta f / delta g_i) - delta = 1e-7 - new_costs = list(self.best_costs) - new_costs[i] -= delta - chain_grad = (self.compute_score(self.best_costs) - self.compute_score(new_costs)) / delta - - # compute (g_i(t_i) - g(t_i - \Delta t)) / (\Delta t) - if self.task_cts[i] - 1 - self.backward_window_size >= 0: - backward_grad = (self.task_costs_history[i][self.task_cts[i] - 1] - - self.task_costs_history[i][self.task_cts[i] - 1 - self.backward_window_size]) \ - / self.backward_window_size - else: - backward_grad = 0 - - # compute (g_i(t_i + \Delta t) - g(t_i)) / (\Delta t) - g_next_1 = self.best_costs[i] - (self.best_costs[i] / self.task_cts[i]) - # todo(lmzheng): this needs adding attribute to topi.compute for similarity check - g_next_2 = self.beta * 1e20 - g_next = min(g_next_1, g_next_2) - forward_grad = g_next - self.best_costs[i] - - # combine all grads - grad = chain_grad * (self.alpha * backward_grad + (1 - self.alpha) * forward_grad) - assert grad <= 0 - gradients.append(grad) - - if max(gradients) == 
min(gradients): - task_idx = np.random.choice(len(gradients)) - else: - task_idx = np.argmin(gradients) - else: - raise ValueError("Invalid strategy: " + self.strategy) - - if self.verbose >= 1: - print("Next tuning task: %d" % task_idx) - self.tune_task(task_idx) - - def tune_task(self, task_idx): - """ ... - """ - if self.use_debug_measurement_simulator is not None: - measure_inputs, measure_results = \ - self.use_debug_measurement_simulator.get_next_batch( - self.tasks[task_idx], - self.num_measure_per_iter, - ) - else: - measure_inputs, measure_results = \ - self.search_policies[task_idx].continue_search( - self.tasks[task_idx], - self.num_measure_per_iter, - self.tune_option.verbose, - self.measurer) - - for inp, res in zip(measure_inputs, measure_results): - cost = array_mean(res.costs) - if cost < self.best_costs[task_idx]: - self.best_costs[task_idx] = cost - - if len(measure_inputs) == 0: - self.dead_tasks.add(task_idx) - - self.task_cts[task_idx] += 1 - self.task_costs_history[task_idx].append(self.best_costs[task_idx]) - - self.ct += len(measure_inputs) - self.cur_score = self.compute_score(self.best_costs) - - if self.verbose >= 1: - print(("TaskScheduler\tct: %d\testimated cost (ms): %.3f\ttime elapsed: %.2f\t" + - "best_costs (ms): %s\ttask_ct: %s") % - (self.ct, self.cur_score * 1e3, time.time() - self.tic, - to_str_round(self.best_costs * 1e3, decimal=3), - self.task_cts)) - - def remove_dead_task(self, prob): - for idx in self.dead_tasks: - prob[idx] = 0 - return prob / prob.sum() diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py index b6bedb411540..8e6698e4a164 100644 --- a/python/tvm/relay/backend/compile_engine.py +++ b/python/tvm/relay/backend/compile_engine.py @@ -18,7 +18,6 @@ """Backend code generation engine.""" from __future__ import absolute_import -import os import logging import numpy as np import tvm @@ -142,6 +141,7 @@ def get_valid_implementations(op, attrs, inputs, out_type, target): ret.append(impl) return ret + def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True): """Select the best implementation from the op strategy. @@ -179,9 +179,6 @@ def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True) ret : tuple(relay.op.OpImplementation, List[tvm.te.Tensor]) The best op implementation and the corresponding output tensors. 
""" - if os.environ.get('TVM_USE_AUTOTVM', 'false') == 'false': - use_autotvm = False - all_impls = get_valid_implementations(op, attrs, inputs, out_type, target) best_plevel_impl = None diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index d1a39ceb630e..30c5971e32b9 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -72,7 +72,6 @@ def __init__(self): self._get_module = self.mod["get_module"] self._build = self.mod["build"] self._optimize = self.mod["optimize"] - self._call_all_topi_funcs = self.mod["call_all_topi_funcs"] self._set_params_func = self.mod["set_params"] self._get_params_func = self.mod["get_params"] @@ -161,12 +160,6 @@ def optimize(self, mod, target=None, params=None): return mod, params - def call_all_topi_funcs(self, mod, target=None, target_host=None, params=None): - """Call all topi compute and schedule used in a relay function""" - target = _update_target(target) - if params: - self._set_params(params) - self._call_all_topi_funcs(mod, target, target_host) def _set_params(self, params): self._set_params_func(_convert_param_map(params)) diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index 41bd10cabe3e..d104c1b1c2f8 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -74,8 +74,6 @@ def compute_strided_set(attrs, inputs, output_type): # layout_transform _reg.register_injective_schedule("layout_transform") _reg.register_pattern("layout_transform", OpPattern.INJECTIVE) -_reg.register_injective_schedule("kernel_layout_transform") -_reg.register_pattern("kernel_layout_transform", OpPattern.INJECTIVE) # argwhere @_reg.register_compute("argwhere") diff --git a/python/tvm/relay/op/op_attrs.py b/python/tvm/relay/op/op_attrs.py index 58b9269a4c48..486d63c36ff0 100644 --- a/python/tvm/relay/op/op_attrs.py +++ b/python/tvm/relay/op/op_attrs.py @@ -261,9 +261,6 @@ class ClipAttrs(Attrs): class LayoutTransformAttrs(Attrs): """Attributes for transform.layout_transform""" -@tvm._ffi.register_object("relay.attrs.KernelLayoutTransformAttrs") -class KernelLayoutTransformAttrs(Attrs): - """Attributes for transform.kernel_layout_transform""" @tvm._ffi.register_object("relay.attrs.ShapeOfAttrs") class ShapeOfAttrs(Attrs): diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 3453b089f373..b02db416bdc8 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -16,15 +16,14 @@ # under the License. """Definition of x86 operator strategy.""" # pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import +import logging +import re +import topi from tvm.te import SpecializedCondition -from tvm import ansor from .generic import * from .. 
import op as _op -# Set the priority level to use the Ansor auto-scheduler -ansor_plevel = 11 - logger = logging.getLogger('strategy') _NCHWc_matcher = re.compile("^NCHW[0-9]+c$") @@ -40,7 +39,7 @@ def schedule_injective_cpu(attrs, outs, target): def schedule_reduce_cpu(attrs, outs, target): """schedule reduction ops for x86""" with target: - return ansor.auto_schedule_topi(outs) + return topi.x86.schedule_reduce(outs) @schedule_concatenate.register("cpu") def schedule_concatenate_cpu(attrs, outs, target): @@ -52,13 +51,13 @@ def schedule_concatenate_cpu(attrs, outs, target): def schedule_pool_cpu(attrs, outs, target): """schedule pooling ops for x86""" with target: - return ansor.auto_schedule_topi(outs) + return topi.x86.schedule_pool(outs, attrs.layout) @schedule_adaptive_pool.register("cpu") def schedule_adaptive_pool_cpu(attrs, outs, target): """schedule adaptive pooling ops for x86""" with target: - return ansor.auto_schedule_topi(outs) + return topi.x86.schedule_adaptive_pool(outs) @softmax_strategy.register("cpu") def softmax_strategy_cpu(attrs, inputs, out_type, target): @@ -66,15 +65,15 @@ def softmax_strategy_cpu(attrs, inputs, out_type, target): strategy = _op.OpStrategy() strategy.add_implementation( wrap_compute_softmax(topi.nn.softmax), - wrap_topi_schedule(ansor.auto_schedule_topi), - name="ansor") + wrap_topi_schedule(topi.x86.schedule_softmax), + name="softmax.x86") return strategy @schedule_log_softmax.register("cpu") def schedule_log_softmax_cpu(attrs, outs, target): """schedule log_softmax op for x86""" with target: - return ansor.auto_schedule_topi(outs) + return topi.x86.schedule_softmax(outs) @conv2d_strategy.register("cpu") def conv2d_strategy_cpu(attrs, inputs, out_type, target): @@ -106,18 +105,18 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): return conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target) elif layout == "NHWC": assert kernel_layout == "HWIO" - #logger.warning("For x86 target, NCHW layout is recommended for conv2d.") + logger.warning("For x86 target, NCHW layout is recommended for conv2d.") strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_nhwc), - wrap_topi_schedule(ansor.auto_schedule_topi), - name="ansor") + wrap_topi_schedule(topi.x86.schedule_conv2d_nhwc), + name="conv2d_nhwc.x86") elif layout == "HWCN": assert kernel_layout == "HWIO" - #logger.warning("conv2d HWCN layout is not optimized for x86.") + logger.warning("conv2d HWCN layout is not optimized for x86.") strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_hwcn), - wrap_topi_schedule(ansor.auto_schedule_topi), - name="ansor") + wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn), + name="conv2d_hwcn.generic") else: raise RuntimeError("Unsupported conv2d layout {} for x86".format(layout)) elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): @@ -144,8 +143,8 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): logger.warning("depthwise_conv2d NHWC layout is not optimized for x86.") strategy.add_implementation( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), - wrap_topi_schedule(ansor.auto_schedule_topi), - name="ansor") + wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nhwc), + name="depthwise_conv2d_nhwc.generic") else: raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout)) else: # group_conv2d @@ -154,8 +153,8 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): logger.warning("group_conv2d is not optimized for x86.") strategy.add_implementation( 
wrap_compute_conv2d(topi.nn.group_conv2d_nchw, has_groups=True), - wrap_topi_schedule(ansor.auto_schedule_topi), - name="ansor") + wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw), + name="group_conv2d_nchw.generic") else: raise RuntimeError("Unsupported group_conv2d layout {}".format(layout)) return strategy @@ -232,8 +231,8 @@ def conv3d_strategy_cpu(attrs, inputs, out_type, target): name="conv3d_ncdhw.x86") elif layout == "NDHWC": strategy.add_implementation(wrap_compute_conv3d(topi.x86.conv3d_ndhwc), - wrap_topi_schedule(ansor.auto_schedule_topi), - name="ansor") + wrap_topi_schedule(topi.x86.schedule_conv3d_ndhwc), + name="conv3d_ndhwc.x86") else: raise ValueError("Not support this layout {} yet".format(layout)) return strategy @@ -252,8 +251,8 @@ def conv1d_strategy_cpu(attrs, inputs, out_type, target): name="conv1d_ncw.x86") elif layout == "NWC": strategy.add_implementation(wrap_compute_conv1d(topi.nn.conv1d_nwc), - wrap_topi_schedule(ansor.auto_schedule_topi), - name="ansor") + wrap_topi_schedule(topi.x86.schedule_conv1d_nwc), + name="conv1d_nwc.x86") else: raise ValueError("Unsupported conv1d layout {}".format(layout)) return strategy @@ -262,23 +261,16 @@ def conv1d_strategy_cpu(attrs, inputs, out_type, target): def dense_strategy_cpu(attrs, inputs, out_type, target): """dense x86 strategy""" strategy = _op.OpStrategy() - - strategy.add_implementation(wrap_compute_dense(topi.nn.dense), - wrap_topi_schedule(ansor.auto_schedule_topi), - name='ansor', - plevel=ansor_plevel) - + m, _ = inputs[0].shape strategy.add_implementation(wrap_compute_dense(topi.x86.dense_nopack), wrap_topi_schedule(topi.x86.schedule_dense_nopack), name="dense_nopack.x86", plevel=10) - if "cblas" in target.libs: strategy.add_implementation(wrap_compute_dense(topi.x86.dense_cblas), wrap_topi_schedule(topi.x86.schedule_dense_cblas), name="dense_cblas.x86", plevel=15) - m, _ = inputs[0].shape with SpecializedCondition(m >= 16): # this implementation may not be well-optimized, so use plevel=8 for now. strategy.add_implementation(wrap_compute_dense(topi.x86.dense_pack), @@ -291,12 +283,6 @@ def dense_strategy_cpu(attrs, inputs, out_type, target): def batch_matmul_strategy_cpu(attrs, inputs, out_type, target): """batch_matmul x86 strategy""" strategy = _op.OpStrategy() - - strategy.add_implementation(wrap_compute_dense(topi.nn.batch_matmul), - wrap_topi_schedule(ansor.auto_schedule_topi), - name='ansor', - plevel=ansor_plevel) - strategy.add_implementation(wrap_compute_batch_matmul(topi.x86.batch_matmul), wrap_topi_schedule(topi.x86.schedule_batch_matmul), name="batch_matmul.x86", diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index f2fa2b5f5b90..a37226ea4f58 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -815,27 +815,6 @@ def layout_transform(data, src_layout, dst_layout): """ return _make.layout_transform(data, src_layout, dst_layout) -def kernel_layout_transform(data, src_layout, dst_layout): - """Transform the layout of a kernel - - Parameters - ---------- - data : relay.Expr - The source tensor to be transformed - - src_layout: str - The source layout. (e.g 1N32C112H112W) - - dst_layout: str - The destination layout. (e.g. 1N2C112H112W16c) - - Returns - ------- - ret : relay.Expr - The transformed tensor. 
- """ - return _make.kernel_layout_transform(data, src_layout, dst_layout) - def reverse_reshape(data, newshape): """Reshapes the input array where the special values are inferred from diff --git a/python/tvm/relay/testing/dqn.py b/python/tvm/relay/testing/dqn.py index 3d6883362c9b..10da37001f12 100644 --- a/python/tvm/relay/testing/dqn.py +++ b/python/tvm/relay/testing/dqn.py @@ -26,32 +26,27 @@ from . import layers from .init import create_workload -def get_net(batch_size, num_actions=18, image_shape=(4, 84, 84), dtype="float32", layout="NCHW"): +def get_net(batch_size, num_actions=18, image_shape=(4, 84, 84), dtype="float32"): """get symbol of nature dqn""" data_shape = (batch_size,) + image_shape data = relay.var("data", shape=data_shape, dtype=dtype) - bias_axis = layout.index('C') - conv1_bias = relay.var("conv1_bias") conv1 = layers.conv2d(data, kernel_size=(8, 8), strides=(4, 4), padding=(0, 0), - channels=32, name="conv1", data_layout=layout, - kernel_layout=layers.conv_kernel_layout(layout)) - conv1 = relay.nn.bias_add(conv1, conv1_bias, bias_axis) + channels=32, name="conv1") + conv1 = relay.nn.bias_add(conv1, conv1_bias) relu1 = relay.nn.relu(conv1) conv2_bias = relay.var("conv2_bias") conv2 = layers.conv2d(relu1, kernel_size=(4, 4), strides=(2, 2), padding=(0, 0), - channels=64, name="conv2", data_layout=layout, - kernel_layout=layers.conv_kernel_layout(layout)) - conv2 = relay.nn.bias_add(conv2, conv2_bias, bias_axis) + channels=64, name="conv2") + conv2 = relay.nn.bias_add(conv2, conv2_bias) relu2 = relay.nn.relu(conv2) conv3_bias = relay.var("conv3_bias") conv3 = layers.conv2d(relu2, kernel_size=(3, 3), strides=(1, 1), padding=(0, 0), - channels=64, name="conv3", data_layout=layout, - kernel_layout=layers.conv_kernel_layout(layout)) - conv3 = relay.nn.bias_add(conv3, conv3_bias, bias_axis) + channels=64, name="conv3") + conv3 = relay.nn.bias_add(conv3, conv3_bias) relu3 = relay.nn.relu(conv3) bf1 = relay.nn.batch_flatten(relu3) @@ -63,8 +58,7 @@ def get_net(batch_size, num_actions=18, image_shape=(4, 84, 84), dtype="float32" return relay.Function(args, dense2) -def get_workload(batch_size, num_actions=18, image_shape=(4, 84, 84), dtype="float32", - layout="NCHW"): +def get_workload(batch_size, num_actions=18, image_shape=(4, 84, 84), dtype="float32"): """Get benchmark workload for a Deep Q Network Parameters ---------- @@ -78,11 +72,10 @@ def get_workload(batch_size, num_actions=18, image_shape=(4, 84, 84), dtype="flo The data type Returns ------- - mod : tvm.relay.Module + mod : tvm.IRModule The relay module that contains a DQN network. params : dict of str to NDArray The parameters. 
""" - net = get_net(batch_size, num_actions=num_actions, image_shape=image_shape, dtype=dtype, - layout=layout) + net = get_net(batch_size, num_actions=num_actions, image_shape=image_shape, dtype=dtype) return create_workload(net) diff --git a/python/tvm/relay/testing/resnet.py b/python/tvm/relay/testing/resnet.py index ac63afde4cba..b431dd096f9d 100644 --- a/python/tvm/relay/testing/resnet.py +++ b/python/tvm/relay/testing/resnet.py @@ -59,11 +59,9 @@ def residual_unit(data, name : str Base name of the operators """ - bn_axis = data_layout.index('C') if bottle_neck: bn1 = layers.batch_norm_infer(data=data, epsilon=2e-5, - axis=bn_axis, name=name + '_bn1') act1 = relay.nn.relu(data=bn1) conv1 = layers.conv2d( @@ -75,13 +73,13 @@ def residual_unit(data, name=name + '_conv1', data_layout=data_layout, kernel_layout=kernel_layout) - bn2 = layers.batch_norm_infer(data=conv1, epsilon=2e-5, axis=bn_axis, name=name + '_bn2') + bn2 = layers.batch_norm_infer(data=conv1, epsilon=2e-5, name=name + '_bn2') act2 = relay.nn.relu(data=bn2) conv2 = layers.conv2d( data=act2, channels=int(num_filter*0.25), kernel_size=(3, 3), strides=(1, 1), padding=(1, 1), name=name + '_conv2', data_layout=data_layout, kernel_layout=kernel_layout) - bn3 = layers.batch_norm_infer(data=conv2, epsilon=2e-5, axis=bn_axis, name=name + '_bn3') + bn3 = layers.batch_norm_infer(data=conv2, epsilon=2e-5, name=name + '_bn3') act3 = relay.nn.relu(data=bn3) conv3 = layers.conv2d( data=act3, channels=num_filter, kernel_size=(1, 1), @@ -96,13 +94,13 @@ def residual_unit(data, data_layout=data_layout, kernel_layout=kernel_layout) return relay.add(conv3, shortcut) - bn1 = layers.batch_norm_infer(data=data, epsilon=2e-5, axis=bn_axis, name=name + '_bn1') + bn1 = layers.batch_norm_infer(data=data, epsilon=2e-5, name=name + '_bn1') act1 = relay.nn.relu(data=bn1) conv1 = layers.conv2d( data=act1, channels=num_filter, kernel_size=(3, 3), strides=stride, padding=(1, 1), name=name + '_conv1', data_layout=data_layout, kernel_layout=kernel_layout) - bn2 = layers.batch_norm_infer(data=conv1, epsilon=2e-5, axis=bn_axis, name=name + '_bn2') + bn2 = layers.batch_norm_infer(data=conv1, epsilon=2e-5, name=name + '_bn2') act2 = relay.nn.relu(data=bn2) conv2 = layers.conv2d( data=act2, channels=num_filter, kernel_size=(3, 3), @@ -158,16 +156,12 @@ def resnet(units, data_layout = layout kernel_layout = "OIHW" if layout == "NCHW" else "HWIO" - bn_axis = data_layout.index('C') num_unit = len(units) assert num_unit == num_stages data = relay.var("data", shape=data_shape, dtype=dtype) - data = layers.batch_norm_infer(data=data, epsilon=2e-5, axis=bn_axis, scale=False, - name='bn_data') + data = layers.batch_norm_infer(data=data, epsilon=2e-5, scale=False, name='bn_data') (_, _, height, _) = data_shape - if layout == "NHWC": - (_, height, _, _) = data_shape if height <= 32: # such as cifar10 body = layers.conv2d( data=data, channels=filter_list[0], kernel_size=(3, 3), @@ -178,7 +172,7 @@ def resnet(units, data=data, channels=filter_list[0], kernel_size=(7, 7), strides=(2, 2), padding=(3, 3), name="conv0", data_layout=data_layout, kernel_layout=kernel_layout) - body = layers.batch_norm_infer(data=body, epsilon=2e-5, axis=bn_axis, name='bn0') + body = layers.batch_norm_infer(data=body, epsilon=2e-5, name='bn0') body = relay.nn.relu(data=body) body = relay.nn.max_pool2d(data=body, pool_size=(3, 3), strides=(2, 2), padding=(1, 1), layout=data_layout) @@ -193,7 +187,7 @@ def resnet(units, body, filter_list[i+1], (1, 1), True, name='stage%d_unit%d' % (i + 1, j + 2), 
bottle_neck=bottle_neck, data_layout=data_layout, kernel_layout=kernel_layout) - bn1 = layers.batch_norm_infer(data=body, epsilon=2e-5, axis=bn_axis, name='bn1') + bn1 = layers.batch_norm_infer(data=body, epsilon=2e-5, name='bn1') relu1 = relay.nn.relu(data=bn1) # Although kernel is not used here when global_pool=True, we should put one pool1 = relay.nn.global_avg_pool2d(data=relu1, layout=data_layout) @@ -215,8 +209,6 @@ def get_net(batch_size, Original author Wei Wu """ (_, height, _) = image_shape - if layout == "NHWC": - (height, _, _) = image_shape data_shape = (batch_size,) + image_shape if height <= 28: num_stages = 3 diff --git a/python/tvm/runtime/ndarray.py b/python/tvm/runtime/ndarray.py index 967bfcdd3cde..060673dc19c6 100644 --- a/python/tvm/runtime/ndarray.py +++ b/python/tvm/runtime/ndarray.py @@ -279,39 +279,6 @@ def empty(shape, dtype="float32", ctx=context(1, 0)): return _make_array(handle, False, False) -def non_empty(shape, dtype="float32", ctx=context(1, 0)): - """Create an non-empty array given shape and device - - Parameters - ---------- - shape : tuple of int - The shape of the array - - dtype : type or str - The data type of the array. - - ctx : TVMContext - The context of the array - - Returns - ------- - arr : tvm.nd.NDArray - The array tvm supported. - """ - shape = c_array(tvm_shape_index_t, shape) - ndim = ctypes.c_int(len(shape)) - handle = TVMArrayHandle() - dtype = DataType(dtype) - check_call(_LIB.TVMArrayAllocNonEmpty( - shape, ndim, - ctypes.c_int(dtype.type_code), - ctypes.c_int(dtype.bits), - ctypes.c_int(dtype.lanes), - ctx.device_type, - ctx.device_id, - ctypes.byref(handle))) - return _make_array(handle, False, False) - def from_dlpack(dltensor): """Produce an array from a DLPack tensor without memory copy. Retreives the underlying DLPack tensor's pointer to create an array from the diff --git a/python/tvm/te/tensor.py b/python/tvm/te/tensor.py index 6a2120817eb1..7d73bf42ab7d 100644 --- a/python/tvm/te/tensor.py +++ b/python/tvm/te/tensor.py @@ -56,11 +56,9 @@ class Tensor(DataProducer, _expr.ExprOp): """Tensor object, to construct, see function.Tensor""" def __call__(self, *indices): - # ndim = self.ndim - # After ansor kernel layout rewrite, len(indices) <= ndim, - # and the indices will get modified by Ansor during schedule generation. - # if len(indices) != ndim: - # raise ValueError("Need to provide %d index in tensor slice" % ndim) + ndim = self.ndim + if len(indices) != ndim: + raise ValueError("Need to provide %d index in tensor slice" % ndim) indices = convert_to_object(indices) args = [] for x in indices: diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index a8cd1d3c2462..34c3487e3ef2 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -153,11 +153,6 @@ class RelayBuildModule : public runtime::ModuleNode { CHECK_EQ(args.num_args, 2); *rv = this->Optimize(args[0], args[1], this->params_); }); - } else if (name == "call_all_topi_funcs") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue *rv) { - CHECK_EQ(args.num_args, 3); - this->CallAllTopiFuncs(args[0], args[1], args[2]); - }); } else { LOG(FATAL) << "Unknown packed function: " << name; return PackedFunc([sptr_to_self, name](TVMArgs args, TVMRetValue* rv) {}); @@ -232,21 +227,6 @@ class RelayBuildModule : public runtime::ModuleNode { BuildRelay(mod, params_); } - /*! 
\brief Call all used TOPI compute and schedule in a relay function */ - void CallAllTopiFuncs(IRModule mod, - const TargetsMap& targets, - const tvm::Target& target_host) { - targets_ = targets; - target_host_ = target_host; - - IRModule relay_module = Optimize(mod, targets_, params_); - auto func = Downcast(relay_module->Lookup("main")); - - graph_codegen_ = std::unique_ptr(new GraphCodegen()); - graph_codegen_->Init(nullptr, targets_); - graph_codegen_->Codegen(func); - } - protected: /*! * \brief Optimize a Relay IRModule. @@ -335,18 +315,6 @@ class RelayBuildModule : public runtime::ModuleNode { // Fuse the operations if it is needed. relay_module = transform::FuseOps()(relay_module); - - if (targets.size() == 1) { - pass_seqs.push_back(transform::KernelLayoutTransform()); - pass_seqs.push_back(transform::DeFuseOps()); - pass_seqs.push_back(transform::FoldConstant()); - transform::Pass seq = transform::Sequential(pass_seqs); - const auto& it = targets.begin(); - With tctx((*it).second); - relay_module = seq(relay_module); - relay_module = transform::FuseOps()(relay_module); - } - relay_module = transform::InferType()(relay_module); // Inline the functions that have been lifted by the module scope. // diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index fde880b10f1d..2aae8546248f 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -68,11 +68,6 @@ CCacheKey::CCacheKey(Function source_func, Target target) { auto n = make_object(); n->source_func = std::move(source_func); n->target = std::move(target); - n->disabled = false; - char* envar = getenv("TVM_RELAY_DISABLE_BUILD_CACHE"); - if (envar != nullptr && strcmp(envar, "true") == 0) { - n->disabled = true; - } data_ = std::move(n); } diff --git a/src/relay/backend/compile_engine.h b/src/relay/backend/compile_engine.h index b290462a4b22..a5f3f6359f89 100644 --- a/src/relay/backend/compile_engine.h +++ b/src/relay/backend/compile_engine.h @@ -115,8 +115,6 @@ class CCacheKeyNode : public Object { /*! \brief The hardware target.*/ Target target; - bool disabled; - void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("source_func", &source_func); v->Visit("target", &target); @@ -261,7 +259,6 @@ inline size_t CCacheKeyNode::Hash() const { } inline bool CCacheKeyNode::Equal(const CCacheKeyNode* other) const { - if (disabled) return false; if (Hash() != other->Hash()) return false; return this->target->str() == other->target->str() && tvm::StructuralEqual()(this->source_func, other->source_func);
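The densest removal above is task_scheduler.py, and its "gradient" strategy is easy to lose in the diff. The selection rule it implemented combines a finite-difference chain-rule term with a backward-looking and a forward-looking estimate of per-task improvement, then tunes the task whose combined gradient is most negative. The sketch below restates that rule as a standalone function; the name pick_next_task and its flat argument list are illustrative only, and it assumes at least one live task that has already been tuned once (the removed tune() loop guarantees this with an initial round-robin pass).

    import numpy as np

    def pick_next_task(best_costs, task_cts, costs_history, dead_tasks,
                       objective_func=sum, alpha=0.2, beta=2.0, window=3):
        """Sketch of the task-selection rule from the removed 'gradient' strategy."""
        delta = 1e-7
        gradients = []
        for i in range(len(best_costs)):
            if i in dead_tasks:
                # tasks that stopped producing measurements are never selected again
                gradients.append(0.0)
                continue
            # chain-rule term: sensitivity of the objective to task i's best cost
            new_costs = list(best_costs)
            new_costs[i] -= delta
            chain_grad = (objective_func(best_costs) - objective_func(new_costs)) / delta
            # backward finite difference over the last `window` tuning rounds
            if task_cts[i] - 1 - window >= 0:
                backward_grad = (costs_history[i][task_cts[i] - 1]
                                 - costs_history[i][task_cts[i] - 1 - window]) / window
            else:
                backward_grad = 0.0
            # optimistic forward estimate of the next improvement
            g_next = min(best_costs[i] - best_costs[i] / task_cts[i], beta * 1e20)
            forward_grad = g_next - best_costs[i]
            gradients.append(chain_grad * (alpha * backward_grad + (1 - alpha) * forward_grad))
        if max(gradients) == min(gradients):
            # no task stands out; fall back to a random pick, as the removed code did
            return int(np.random.choice(len(gradients)))
        # the most negative gradient promises the largest drop in the objective
        return int(np.argmin(gradients))

In the removed tune() loop, this index was passed to self.tune_task, which ran one batch of measurements and then updated best_costs and the per-task cost history that the next selection reads.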
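The Python TracingEnvironment and the C++ CCacheKey constructor removed above cooperated through one environment variable, TVM_RELAY_DISABLE_BUILD_CACHE: tracing every topi call only works if each lowering request actually reaches the compile engine instead of hitting its cache. The save-set-restore pattern is small enough to restate as a context manager; the name disable_relay_build_cache is hypothetical, and, like the removed __exit__, this sketch restores the string "false" rather than unsetting a variable that was never set.

    import os
    from contextlib import contextmanager

    @contextmanager
    def disable_relay_build_cache():
        """Temporarily ask the compile engine to treat every cache key as a miss."""
        saved = os.environ.get("TVM_RELAY_DISABLE_BUILD_CACHE", "false")
        os.environ["TVM_RELAY_DISABLE_BUILD_CACHE"] = "true"
        try:
            yield
        finally:
            os.environ["TVM_RELAY_DISABLE_BUILD_CACHE"] = saved

With the C++ side of this patch applied, the variable is no longer consulted at all, so the context manager above only documents the old behavior; it has no effect on the post-patch compile engine.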
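The removed tvm.nd.non_empty allocated an NDArray without initializing its contents, via a TVMArrayAllocNonEmpty call in the runtime; its apparent purpose was cheap creation of buffers whose values do not matter, such as measurement inputs. Callers that only need an array with arbitrary contents can approximate it with the surviving tvm.nd.empty plus an explicit fill. The sketch below gives up the allocation-speed benefit the original was after, and the helper name non_empty_like is illustrative only.

    import numpy as np
    import tvm

    def non_empty_like(shape, dtype="float32", ctx=tvm.cpu(0)):
        """Allocate an array and fill it with arbitrary (here random) values."""
        arr = tvm.nd.empty(shape, dtype, ctx)
        arr.copyfrom(np.random.uniform(size=shape).astype(dtype))
        return arr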