Save huggingface checkpoint as artifact in mlflow callback #17686

Merged (7 commits) on Jun 17, 2022
16 changes: 12 additions & 4 deletions src/transformers/integrations.py
@@ -787,7 +787,7 @@ def setup(self, args, state, model):
         Environment:
             HF_MLFLOW_LOG_ARTIFACTS (`str`, *optional*):
                 Whether to use MLflow .log_artifact() facility to log artifacts. This only makes sense if logging to a
-                remote server, e.g. s3 or GCS. If set to `True` or *1*, will copy whatever is in
+                remote server, e.g. s3 or GCS. If set to `True` or *1*, will copy each saved checkpoint on each save in
                 [`TrainingArguments`]'s `output_dir` to the local or remote artifact storage. Using it without a remote
                 storage will just copy the files to your artifact location.
             MLFLOW_EXPERIMENT_NAME (`str`, *optional*):
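For orientation, a minimal sketch of enabling this behavior from a training script; the output directory and save interval below are illustrative placeholders, not taken from this PR:

```python
import os

# MLflowCallback reads this env variable in setup(), which runs at the start
# of training, so it must be set before trainer.train() is called.
os.environ["HF_MLFLOW_LOG_ARTIFACTS"] = "1"

from transformers import TrainingArguments

# Every checkpoint-<step> folder written to output_dir (here every 500 steps)
# will also be uploaded to the active MLflow run when it is saved.
args = TrainingArguments(output_dir="outputs", save_steps=500)
# trainer = Trainer(model=model, args=args, train_dataset=train_dataset)
# trainer.train()
```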
@@ -872,12 +872,20 @@ def on_log(self, args, state, control, logs, model=None, **kwargs):

     def on_train_end(self, args, state, control, **kwargs):
         if self._initialized and state.is_world_process_zero:
-            if self._log_artifacts:
-                logger.info("Logging artifacts. This may take time.")
-                self._ml_flow.log_artifacts(args.output_dir)
             if self._auto_end_run and self._ml_flow.active_run():
                 self._ml_flow.end_run()
 
+    def on_save(self, args, state, control, **kwargs):
+        if self._initialized and state.is_world_process_zero and self._log_artifacts:
Collaborator (review comment on the `on_save` hunk): I think this new behavior (uploading the saved model at each save) should be documented above with the HF_MLFLOW_LOG_ARTIFACTS env variable.

Contributor Author: fixed the documentation - thanks for your review!

+            ckpt_dir = f"checkpoint-{state.global_step}"
+            artifact_path = os.path.join(args.output_dir, ckpt_dir)
+            logger.info(f"Logging checkpoint artifacts in {ckpt_dir}. This may take time.")
+            self._ml_flow.pyfunc.log_model(
+                ckpt_dir,
+                artifacts={"model_path": artifact_path},
+                python_model=self._ml_flow.pyfunc.PythonModel(),
+            )
+
     def __del__(self):
         # if the previous run is not terminated correctly, the fluent API will
         # not let you start a new run before the previous one is killed
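With this change, each checkpoint is logged through `mlflow.pyfunc.log_model` at save time instead of a single `log_artifacts` call at the end of training, so every `checkpoint-<step>` appears as its own pyfunc model inside the run. A hedged sketch of retrieving one afterwards; the run id is a placeholder, and the exact `artifacts/` nesting inside the logged model can vary with the MLflow version, so inspect the downloaded directory if needed:

```python
from mlflow.tracking import MlflowClient
from transformers import AutoModel

client = MlflowClient()
run_id = "<run-id-from-the-mlflow-ui>"  # placeholder, not from the PR

# Download the pyfunc model that on_save logged for a given step.
local_dir = client.download_artifacts(run_id, "checkpoint-500")

# The Hugging Face checkpoint files were attached under the "model_path"
# artifact key; with a local output_dir they typically land in an
# "artifacts/checkpoint-500" subfolder of the downloaded model directory.
model = AutoModel.from_pretrained(f"{local_dir}/artifacts/checkpoint-500")
```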