[ci] [python-package] enable ruff-format on tests and examples #6317

Merged: 8 commits, Feb 21, 2024
14 changes: 8 additions & 6 deletions .pre-commit-config.yaml
@@ -7,19 +7,21 @@ exclude: |
)$

repos:
- repo: https://github.com/pycqa/isort
rev: 5.13.2
hooks:
- id: isort
name: isort (python)
args: ["--settings-path", "python-package/pyproject.toml"]
PR author (Collaborator) commented:
Proposing moving isort up above ruff because the two hooks were fighting each other. I think it makes sense to have isort go first, since it may do things like remove duplicate imports or re-group them, and ruff can then run afterwards and simply reformat the result.

PR author (Collaborator) commented:
relevant conversation: astral-sh/ruff#8926 (comment)

@jameslamb (Collaborator, PR author) commented on Feb 16, 2024:
Ah, never mind this one: it looks like this was a result of me adding profile = "black" to the isort config while testing. Reverted that in 007ed34, and the python-package/ changes dropped out of the diff.

Nope, guess we DO need to include python-package/, and a few other changes to the isort configuration to make it and ruff stop fighting each other (see the configuration sketch after this file's diff). Sorry, I was running this on the wrong branch before 😅

- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.2.1
hooks:
# Run the linter.
- id: ruff
args: ["--config", "python-package/pyproject.toml"]
types_or: [python, jupyter]
# Run the formatter.
- id: ruff-format
args: ["--config", "python-package/pyproject.toml"]
- repo: https://github.com/pycqa/isort
rev: 5.13.2
hooks:
- id: isort
name: isort (python)
args: ["--settings-path", "python-package/pyproject.toml"]
types_or: [python, jupyter]
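
As mentioned in the comment thread above, below is a minimal sketch of the kind of isort settings that keep isort and ruff-format from fighting over import formatting. This is an illustration under assumptions: profile = "black" is the option named in the comments, while the line_length value is a placeholder and is not necessarily what python-package/pyproject.toml in this PR actually contains.

# Hypothetical sketch of [tool.isort] settings in python-package/pyproject.toml;
# values are illustrative assumptions, not the PR's final configuration.
[tool.isort]
# Match Black-style import formatting, which is also what ruff-format produces,
# so the two hooks stop rewriting each other's output.
profile = "black"
# Keep the wrap width aligned with the formatter's line length so neither tool
# re-wraps import lines the other has already formatted.
line_length = 120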
180 changes: 88 additions & 92 deletions examples/python-guide/advanced_example.py
@@ -10,13 +10,13 @@

import lightgbm as lgb

print('Loading data...')
print("Loading data...")
# load or create your dataset
binary_example_dir = Path(__file__).absolute().parents[1] / 'binary_classification'
df_train = pd.read_csv(str(binary_example_dir / 'binary.train'), header=None, sep='\t')
df_test = pd.read_csv(str(binary_example_dir / 'binary.test'), header=None, sep='\t')
W_train = pd.read_csv(str(binary_example_dir / 'binary.train.weight'), header=None)[0]
W_test = pd.read_csv(str(binary_example_dir / 'binary.test.weight'), header=None)[0]
binary_example_dir = Path(__file__).absolute().parents[1] / "binary_classification"
df_train = pd.read_csv(str(binary_example_dir / "binary.train"), header=None, sep="\t")
df_test = pd.read_csv(str(binary_example_dir / "binary.test"), header=None, sep="\t")
W_train = pd.read_csv(str(binary_example_dir / "binary.train.weight"), header=None)[0]
W_test = pd.read_csv(str(binary_example_dir / "binary.test.weight"), header=None)[0]

y_train = df_train[0]
y_test = df_test[0]
@@ -27,72 +27,72 @@

# create dataset for lightgbm
# if you want to re-use data, remember to set free_raw_data=False
lgb_train = lgb.Dataset(X_train, y_train,
weight=W_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,
weight=W_test, free_raw_data=False)
lgb_train = lgb.Dataset(X_train, y_train, weight=W_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, weight=W_test, free_raw_data=False)

# specify your configurations as a dict
params = {
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': 'binary_logloss',
'num_leaves': 31,
'learning_rate': 0.05,
'feature_fraction': 0.9,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'verbose': 0
"boosting_type": "gbdt",
"objective": "binary",
"metric": "binary_logloss",
"num_leaves": 31,
"learning_rate": 0.05,
"feature_fraction": 0.9,
"bagging_fraction": 0.8,
"bagging_freq": 5,
"verbose": 0,
}

# generate feature names
feature_name = [f'feature_{col}' for col in range(num_feature)]
feature_name = [f"feature_{col}" for col in range(num_feature)]

print('Starting training...')
print("Starting training...")
# feature_name and categorical_feature
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
valid_sets=lgb_train, # eval training data
feature_name=feature_name,
categorical_feature=[21])

print('Finished first 10 rounds...')
gbm = lgb.train(
params,
lgb_train,
num_boost_round=10,
valid_sets=lgb_train, # eval training data
feature_name=feature_name,
categorical_feature=[21],
)

print("Finished first 10 rounds...")
# check feature name
print(f'7th feature name is: {lgb_train.feature_name[6]}')
print(f"7th feature name is: {lgb_train.feature_name[6]}")

print('Saving model...')
print("Saving model...")
# save model to file
gbm.save_model('model.txt')
gbm.save_model("model.txt")

print('Dumping model to JSON...')
print("Dumping model to JSON...")
# dump model to JSON (and save to file)
model_json = gbm.dump_model()

with open('model.json', 'w+') as f:
with open("model.json", "w+") as f:
json.dump(model_json, f, indent=4)

# feature names
print(f'Feature names: {gbm.feature_name()}')
print(f"Feature names: {gbm.feature_name()}")

# feature importances
print(f'Feature importances: {list(gbm.feature_importance())}')
print(f"Feature importances: {list(gbm.feature_importance())}")

print('Loading model to predict...')
print("Loading model to predict...")
# load model to predict
bst = lgb.Booster(model_file='model.txt')
bst = lgb.Booster(model_file="model.txt")
# can only predict with the best iteration (or the saving iteration)
y_pred = bst.predict(X_test)
# eval with loaded model
auc_loaded_model = roc_auc_score(y_test, y_pred)
print(f"The ROC AUC of loaded model's prediction is: {auc_loaded_model}")

print('Dumping and loading model with pickle...')
print("Dumping and loading model with pickle...")
# dump model with pickle
with open('model.pkl', 'wb') as fout:
with open("model.pkl", "wb") as fout:
pickle.dump(gbm, fout)
# load model with pickle to predict
with open('model.pkl', 'rb') as fin:
with open("model.pkl", "rb") as fin:
pkl_bst = pickle.load(fin)
# can predict with any iteration when loaded in pickle way
y_pred = pkl_bst.predict(X_test, num_iteration=7)
@@ -104,46 +104,46 @@
# init_model accepts:
# 1. model file name
# 2. Booster()
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
init_model='model.txt',
valid_sets=lgb_eval)
gbm = lgb.train(params, lgb_train, num_boost_round=10, init_model="model.txt", valid_sets=lgb_eval)

print('Finished 10 - 20 rounds with model file...')
print("Finished 10 - 20 rounds with model file...")

# decay learning rates
# reset_parameter callback accepts:
# 1. list with length = num_boost_round
# 2. function(curr_iter)
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
init_model=gbm,
valid_sets=lgb_eval,
callbacks=[lgb.reset_parameter(learning_rate=lambda iter: 0.05 * (0.99 ** iter))])
gbm = lgb.train(
params,
lgb_train,
num_boost_round=10,
init_model=gbm,
valid_sets=lgb_eval,
callbacks=[lgb.reset_parameter(learning_rate=lambda iter: 0.05 * (0.99**iter))],
)

print('Finished 20 - 30 rounds with decay learning rates...')
print("Finished 20 - 30 rounds with decay learning rates...")

# change other parameters during training
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
init_model=gbm,
valid_sets=lgb_eval,
callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])
gbm = lgb.train(
params,
lgb_train,
num_boost_round=10,
init_model=gbm,
valid_sets=lgb_eval,
callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)],
)

print('Finished 30 - 40 rounds with changing bagging_fraction...')
print("Finished 30 - 40 rounds with changing bagging_fraction...")


# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss
def loglikelihood(preds, train_data):
labels = train_data.get_label()
preds = 1. / (1. + np.exp(-preds))
preds = 1.0 / (1.0 + np.exp(-preds))
grad = preds - labels
hess = preds * (1. - preds)
hess = preds * (1.0 - preds)
return grad, hess


@@ -156,22 +156,19 @@ def loglikelihood(preds, train_data):
# Keep this in mind when you use the customization
def binary_error(preds, train_data):
labels = train_data.get_label()
preds = 1. / (1. + np.exp(-preds))
return 'error', np.mean(labels != (preds > 0.5)), False
preds = 1.0 / (1.0 + np.exp(-preds))
return "error", np.mean(labels != (preds > 0.5)), False


# Pass custom objective function through params
params_custom_obj = copy.deepcopy(params)
params_custom_obj['objective'] = loglikelihood
params_custom_obj["objective"] = loglikelihood

gbm = lgb.train(params_custom_obj,
lgb_train,
num_boost_round=10,
init_model=gbm,
feval=binary_error,
valid_sets=lgb_eval)
gbm = lgb.train(
params_custom_obj, lgb_train, num_boost_round=10, init_model=gbm, feval=binary_error, valid_sets=lgb_eval
)

print('Finished 40 - 50 rounds with self-defined objective function and eval metric...')
print("Finished 40 - 50 rounds with self-defined objective function and eval metric...")


# another self-defined eval metric
@@ -183,42 +183,41 @@ def binary_error(preds, train_data):
# Keep this in mind when you use the customization
def accuracy(preds, train_data):
labels = train_data.get_label()
preds = 1. / (1. + np.exp(-preds))
return 'accuracy', np.mean(labels == (preds > 0.5)), True
preds = 1.0 / (1.0 + np.exp(-preds))
return "accuracy", np.mean(labels == (preds > 0.5)), True


# Pass custom objective function through params
params_custom_obj = copy.deepcopy(params)
params_custom_obj['objective'] = loglikelihood
params_custom_obj["objective"] = loglikelihood

gbm = lgb.train(params_custom_obj,
lgb_train,
num_boost_round=10,
init_model=gbm,
feval=[binary_error, accuracy],
valid_sets=lgb_eval)
gbm = lgb.train(
params_custom_obj,
lgb_train,
num_boost_round=10,
init_model=gbm,
feval=[binary_error, accuracy],
valid_sets=lgb_eval,
)

print('Finished 50 - 60 rounds with self-defined objective function and multiple self-defined eval metrics...')
print("Finished 50 - 60 rounds with self-defined objective function and multiple self-defined eval metrics...")

print('Starting a new training job...')
print("Starting a new training job...")


# callback
def reset_metrics():
def callback(env):
lgb_eval_new = lgb.Dataset(X_test, y_test, reference=lgb_train)
if env.iteration - env.begin_iteration == 5:
print('Add a new valid dataset at iteration 5...')
env.model.add_valid(lgb_eval_new, 'new_valid')
print("Add a new valid dataset at iteration 5...")
env.model.add_valid(lgb_eval_new, "new_valid")

callback.before_iteration = True
callback.order = 0
return callback


gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
valid_sets=lgb_train,
callbacks=[reset_metrics()])
gbm = lgb.train(params, lgb_train, num_boost_round=10, valid_sets=lgb_train, callbacks=[reset_metrics()])

print('Finished first 10 rounds with callback function...')
print("Finished first 10 rounds with callback function...")
23 changes: 6 additions & 17 deletions examples/python-guide/dask/ranking.py
@@ -10,9 +10,9 @@
if __name__ == "__main__":
print("loading data")

rank_example_dir = Path(__file__).absolute().parents[2] / 'lambdarank'
X, y = load_svmlight_file(str(rank_example_dir / 'rank.train'))
group = np.loadtxt(str(rank_example_dir / 'rank.train.query'))
rank_example_dir = Path(__file__).absolute().parents[2] / "lambdarank"
X, y = load_svmlight_file(str(rank_example_dir / "rank.train"))
group = np.loadtxt(str(rank_example_dir / "rank.train.query"))

print("initializing a Dask cluster")

@@ -32,25 +32,14 @@
# a sparse boundary to partition the data
X = X.toarray()

dX = da.from_array(
x=X,
chunks=[
(rows_in_part1, rows_in_part2),
(num_features,)
]
)
dX = da.from_array(x=X, chunks=[(rows_in_part1, rows_in_part2), (num_features,)])
dy = da.from_array(
x=y,
chunks=[
(rows_in_part1, rows_in_part2),
]
)
dg = da.from_array(
x=group,
chunks=[
(100, group.size - 100)
]
],
)
dg = da.from_array(x=group, chunks=[(100, group.size - 100)])

print("beginning training")
