diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 513e720fd099b..6b1fb36010999 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -94,7 +94,7 @@ static void CheckTensorNANOrInf(const std::string& name,
 }
 
 void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
-                               int block_id) {
+                               int block_id) const {
   auto& global_block = pdesc.Block(block_id);
 
   const Scope* ancestor_scope = scope;
@@ -131,7 +131,7 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
 }
 
 void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
-                   bool create_local_scope, bool create_vars) {
+                   bool create_local_scope, bool create_vars) const {
   platform::RecordBlock b(block_id);
   auto ctx = Prepare(pdesc, block_id);
   RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars);
@@ -226,10 +226,10 @@ static bool has_fetch_operators(
 }
 
 void Executor::Run(const ProgramDesc& program, Scope* scope,
-                   std::map<std::string, const LoDTensor*>& feed_targets,
-                   std::map<std::string, LoDTensor*>& fetch_targets,
+                   const std::map<std::string, const LoDTensor*>& feed_targets,
+                   const std::map<std::string, LoDTensor*>& fetch_targets,
                    bool create_vars, const std::string& feed_holder_name,
-                   const std::string& fetch_holder_name) {
+                   const std::string& fetch_holder_name) const {
   platform::RecordBlock b(kProgramId);
   bool has_feed_ops =
       has_feed_operators(program.Block(0), feed_targets, feed_holder_name);
@@ -321,7 +321,8 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
 }
 
 void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
-                                  bool create_local_scope, bool create_vars) {
+                                  bool create_local_scope,
+                                  bool create_vars) const {
   Scope* local_scope = scope;
   if (create_vars) {
     if (create_local_scope) {
@@ -361,9 +362,10 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
 
 void Executor::RunPreparedContext(
     ExecutorPrepareContext* ctx, Scope* scope,
-    std::map<std::string, const LoDTensor*>& feed_targets,
-    std::map<std::string, LoDTensor*>& fetch_targets, bool create_vars,
-    const std::string& feed_holder_name, const std::string& fetch_holder_name) {
+    const std::map<std::string, const LoDTensor*>& feed_targets,
+    const std::map<std::string, LoDTensor*>& fetch_targets, bool create_vars,
+    const std::string& feed_holder_name,
+    const std::string& fetch_holder_name) const {
   auto& global_block = ctx->prog_.Block(ctx->block_id_);
 
   PADDLE_ENFORCE(
@@ -378,8 +380,8 @@ void Executor::RunPreparedContext(
     if (op->Type() == kFeedOpType) {
       std::string feed_target_name = op->Output("Out")[0];
       int idx = boost::get<int>(op->GetAttr("col"));
-      SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name,
-                      idx);
+      SetFeedVariable(scope, *feed_targets.at(feed_target_name),
+                      feed_holder_name, idx);
     }
   }
 
@@ -390,7 +392,7 @@ void Executor::RunPreparedContext(
     if (op->Type() == kFetchOpType) {
       std::string fetch_target_name = op->Input("X")[0];
       int idx = boost::get<int>(op->GetAttr("col"));
-      *fetch_targets[fetch_target_name] =
+      *fetch_targets.at(fetch_target_name) =
           GetFetchVariable(*scope, fetch_holder_name, idx);
     }
   }
diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h
index 43defdacf2a1c..d70306b0cc0b1 100644
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -52,14 +52,14 @@ class Executor {
    * Scope
    */
   void Run(const ProgramDesc& prog, Scope* scope, int block_id,
-           bool create_local_scope = true, bool create_vars = true);
+           bool create_local_scope = true, bool create_vars = true) const;
 
   void Run(const ProgramDesc& program, Scope* scope,
-           std::map<std::string, const LoDTensor*>& feed_targets,
-           std::map<std::string, LoDTensor*>& fetch_targets,
+           const std::map<std::string, const LoDTensor*>& feed_targets,
+           const std::map<std::string, LoDTensor*>& fetch_targets,
            bool create_vars = true, const std::string& feed_holder_name = "feed",
-           const std::string& fetch_holder_name = "fetch");
+           const std::string& fetch_holder_name = "fetch") const;
 
   static std::unique_ptr<ExecutorPrepareContext> Prepare(
       const ProgramDesc& program, int block_id);
 
@@ -67,18 +67,19 @@ class Executor {
   static std::vector<std::shared_ptr<ExecutorPrepareContext>> Prepare(
       const ProgramDesc& program, const std::vector<int>& block_ids);
 
-  void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id);
+  void CreateVariables(const ProgramDesc& pdesc, Scope* scope,
+                       int block_id) const;
 
   void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
                           bool create_local_scope = true,
-                          bool create_vars = true);
-
-  void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
-                          std::map<std::string, const LoDTensor*>& feed_targets,
-                          std::map<std::string, LoDTensor*>& fetch_targets,
-                          bool create_vars = true,
-                          const std::string& feed_holder_name = "feed",
-                          const std::string& fetch_holder_name = "fetch");
+                          bool create_vars = true) const;
+
+  void RunPreparedContext(
+      ExecutorPrepareContext* ctx, Scope* scope,
+      const std::map<std::string, const LoDTensor*>& feed_targets,
+      const std::map<std::string, LoDTensor*>& fetch_targets,
+      bool create_vars = true, const std::string& feed_holder_name = "feed",
+      const std::string& fetch_holder_name = "fetch") const;
 
  private:
   const platform::Place place_;
diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
index 3b58019db6e55..f598e99a713fb 100644
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -14,7 +14,9 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/io.h"
+#include <algorithm>
 #include <fstream>
+
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -27,14 +29,14 @@ namespace inference {
 // linking the inference shared library.
 void Init(bool init_p2p) { framework::InitDevices(init_p2p); }
 
-void ReadBinaryFile(const std::string& filename, std::string& contents) {
+void ReadBinaryFile(const std::string& filename, std::string* contents) {
   std::ifstream fin(filename, std::ios::in | std::ios::binary);
   PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
   fin.seekg(0, std::ios::end);
-  contents.clear();
-  contents.resize(fin.tellg());
+  contents->clear();
+  contents->resize(fin.tellg());
   fin.seekg(0, std::ios::beg);
-  fin.read(&contents[0], contents.size());
+  fin.read(&(contents->at(0)), contents->size());
   fin.close();
 }
 
@@ -47,7 +49,8 @@ bool IsPersistable(const framework::VarDesc* var) {
   return false;
 }
 
-void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
+void LoadPersistables(const framework::Executor& executor,
+                      framework::Scope* scope,
                       const framework::ProgramDesc& main_program,
                       const std::string& dirname,
                       const std::string& param_filename) {
@@ -92,18 +95,18 @@ void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
     op->CheckAttrs();
   }
 
-  executor.Run(*load_program, &scope, 0, true, true);
+  executor.Run(*load_program, scope, 0, true, true);
 
   delete load_program;
 }
 
-std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
-                                             framework::Scope& scope,
-                                             const std::string& dirname) {
+std::unique_ptr<framework::ProgramDesc> Load(
+    const framework::Executor& executor, framework::Scope* scope,
+    const std::string& dirname) {
   std::string model_filename = dirname + "/__model__";
   std::string program_desc_str;
   VLOG(3) << "loading model from " << model_filename;
-  ReadBinaryFile(model_filename, program_desc_str);
+  ReadBinaryFile(model_filename, &program_desc_str);
 
   std::unique_ptr<framework::ProgramDesc> main_program(
       new framework::ProgramDesc(program_desc_str));
@@ -113,11 +116,11 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
 }
 
 std::unique_ptr<framework::ProgramDesc> Load(
-    framework::Executor& executor, framework::Scope& scope,
+    const framework::Executor& executor, framework::Scope* scope,
     const std::string& prog_filename, const std::string& param_filename) {
   std::string model_filename = prog_filename;
   std::string program_desc_str;
-  ReadBinaryFile(model_filename, program_desc_str);
+  ReadBinaryFile(model_filename, &program_desc_str);
 
   std::unique_ptr<framework::ProgramDesc> main_program(
       new framework::ProgramDesc(program_desc_str));
diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h
index 756c936b33ad5..d94df8daa37ee 100644
--- a/paddle/fluid/inference/io.h
+++ b/paddle/fluid/inference/io.h
@@ -27,19 +27,19 @@ namespace inference {
 
 void Init(bool init_p2p);
 
-void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
+void LoadPersistables(const framework::Executor& executor,
+                      framework::Scope* scope,
                       const framework::ProgramDesc& main_program,
                       const std::string& dirname,
                       const std::string& param_filename);
 
-std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
-                                             framework::Scope& scope,
-                                             const std::string& dirname);
+std::unique_ptr<framework::ProgramDesc> Load(
+    const framework::Executor& executor, framework::Scope* scope,
+    const std::string& dirname);
 
-std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
-                                             framework::Scope& scope,
-                                             const std::string& prog_filename,
-                                             const std::string& param_filename);
+std::unique_ptr<framework::ProgramDesc> Load(
+    const framework::Executor& executor, framework::Scope* scope,
+    const std::string& prog_filename, const std::string& param_filename);
 
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt
index 97d9f03f88ad3..4edcdab99a85a 100644
--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
@@ -13,12 +13,25 @@ function(inference_test TARGET_NAME)
   else()
     list(APPEND arg_list "_")
   endif()
+
+  set(use_float16 "")
+  if(${TARGET_NAME} MATCHES "^float16")
+    if(${TARGET_NAME} MATCHES "image_classification")
+      set(use_float16 "--use_float16=true")
+    endif()
+    set(book_dir "book_float16")
+  else()
+    set(book_dir "book")
+  endif()
+
+  set(SOURCE_NAME "")
+  string(REGEX REPLACE "^float16_" "" SOURCE_NAME "${TARGET_NAME}")
+
   foreach(arg ${arg_list})
     string(REGEX REPLACE "^_$" "" arg "${arg}")
     cc_test(test_inference_${TARGET_NAME}${arg}
-            SRCS test_inference_${TARGET_NAME}.cc
+            SRCS test_inference_${SOURCE_NAME}.cc
             DEPS paddle_fluid
-            ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model)
+            ARGS --dirname=${PYTHON_TESTS_DIR}/${book_dir}/${TARGET_NAME}${arg}.inference.model ${use_float16})
     set_tests_properties(test_inference_${TARGET_NAME}${arg}
                          PROPERTIES DEPENDS test_${TARGET_NAME})
   endforeach()
@@ -27,6 +40,7 @@ endfunction(inference_test)
 # This unittest is buggy!
 #inference_test(fit_a_line)
 inference_test(image_classification ARGS vgg resnet)
+inference_test(float16_image_classification ARGS vgg resnet)
 inference_test(label_semantic_roles)
 inference_test(recognize_digits ARGS mlp conv)
 inference_test(recommender_system)
diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
index 1e6555bb02033..4c5d03a28e1ef 100644
--- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
@@ -15,12 +15,16 @@ limitations under the License. */
 
 #include "gflags/gflags.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
+#include "paddle/fluid/platform/float16.h"
 
 DEFINE_string(dirname, "", "Directory of the inference model.");
 DEFINE_int32(batch_size, 1, "Batch size of input data");
 DEFINE_int32(repeat, 1, "Running the inference program repeat times");
+DEFINE_bool(use_float16, false, "Running inference in float16 mode or not");
 
 TEST(inference, image_classification) {
+  using float16 = paddle::platform::float16;
+
   if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) {
     LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model "
                   "--batch_size=1 --repeat=1";
@@ -35,20 +39,28 @@ TEST(inference, image_classification) {
   paddle::framework::LoDTensor input;
   // Use normilized image pixels as input data,
   // which should be in the range [0.0, 1.0].
-  SetupTensor<float>(&input, {FLAGS_batch_size, 3, 32, 32},
-                     static_cast<float>(0), static_cast<float>(1));
+  if (!FLAGS_use_float16) {
+    SetupTensor<float>(&input, {FLAGS_batch_size, 3, 32, 32},
+                       static_cast<float>(0), static_cast<float>(1));
+  } else {
+    SetupTensor<float16>(&input, {FLAGS_batch_size, 3, 32, 32},
+                         static_cast<float16>(0), static_cast<float16>(1));
+  }
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&input);
 
+  // float16 inference is currently not supported on CPU
   paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
-  cpu_fetchs1.push_back(&output1);
+  if (!FLAGS_use_float16) {
+    std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+    cpu_fetchs1.push_back(&output1);
 
-  // Run inference on CPU
-  LOG(INFO) << "--- CPU Runs: ---";
-  TestInference<paddle::platform::CPUPlace>(
-      dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat);
-  LOG(INFO) << output1.dims();
+    // Run inference on CPU
+    LOG(INFO) << "--- CPU Runs: ---";
+    TestInference<paddle::platform::CPUPlace>(
+        dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat);
+    LOG(INFO) << output1.dims();
+  }
 
 #ifdef PADDLE_WITH_CUDA
   paddle::framework::LoDTensor output2;
@@ -61,6 +73,8 @@ TEST(inference, image_classification) {
       dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat);
   LOG(INFO) << output2.dims();
 
-  CheckError(output1, output2);
+  if (!FLAGS_use_float16) {
+    CheckError(output1, output2);
+  }
 #endif
 }
diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h
index c3a8d0889c6a6..ae4be004f6c93 100644
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -31,7 +31,9 @@ void SetupTensor(paddle::framework::LoDTensor* input,
   T* input_ptr = input->mutable_data<T>(dims, paddle::platform::CPUPlace());
   for (int i = 0; i < input->numel(); ++i) {
-    input_ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
+    input_ptr[i] =
+        static_cast<T>(uniform_dist(rng) * static_cast<double>(upper - lower) +
+                       static_cast<double>(lower));
   }
 }
 
@@ -133,12 +135,12 @@ void TestInference(const std::string& dirname,
       std::string prog_filename = "__model_combined__";
       std::string param_filename = "__params_combined__";
       inference_program = paddle::inference::Load(
-          executor, *scope, dirname + "/" + prog_filename,
+          executor, scope, dirname + "/" + prog_filename,
           dirname + "/" + param_filename);
     } else {
       // Parameters are saved in separate files sited in the specified
       // `dirname`.
-      inference_program = paddle::inference::Load(executor, *scope, dirname);
+      inference_program = paddle::inference::Load(executor, scope, dirname);
     }
   }
   // Disable the profiler and print the timing information
diff --git a/paddle/fluid/inference/tests/test_multi_thread_helper.h b/paddle/fluid/inference/tests/test_multi_thread_helper.h
index 56745f115db23..033afc7a1248a 100644
--- a/paddle/fluid/inference/tests/test_multi_thread_helper.h
+++ b/paddle/fluid/inference/tests/test_multi_thread_helper.h
@@ -73,7 +73,7 @@ void TestMultiThreadInference(
   // 2. Initialize the inference_program and load parameters
   std::unique_ptr<paddle::framework::ProgramDesc> inference_program =
-      paddle::inference::Load(executor, *scope, dirname);
+      paddle::inference::Load(executor, scope, dirname);
 
   std::vector<std::thread*> threads;
   for (int i = 0; i < num_threads; ++i) {
diff --git a/paddle/fluid/operators/save_load_op_test.cc b/paddle/fluid/operators/save_load_op_test.cc
index a7ba1e0ae1d22..0cfb7fb730587 100644
--- a/paddle/fluid/operators/save_load_op_test.cc
+++ b/paddle/fluid/operators/save_load_op_test.cc
@@ -61,3 +61,35 @@ TEST(SaveLoadOp, CPU) {
     }
   }
 }
+
+TEST(SaveLoadFP16Op, CPU) {
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+
+  auto var = scope.Var("test_var");
+  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
+  tensor->Resize({3, 10});
+
+  float* expect = tensor->mutable_data<float>(place);
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
+    expect[i] = static_cast<float>(i);
+  }
+
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"file_path", std::string("tensor.save")});
+  attrs.insert({"save_as_fp16", true});
+
+  auto save_op = paddle::framework::OpRegistry::CreateOp(
+      "save", {{"X", {"test_var"}}}, {}, attrs);
+  save_op->Run(scope, place);
+
+  auto load_var = scope.Var("out_var");
+  auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
+  auto load_op = paddle::framework::OpRegistry::CreateOp(
+      "load", {}, {{"Out", {"out_var"}}}, attrs);
+  load_op->Run(scope, place);
+
+  paddle::platform::float16* actual = target->data<paddle::platform::float16>();
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
+    EXPECT_EQ(expect[i], static_cast<float>(actual[i]));
+  }
+}
diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc
index 4a715c4baab2d..f45d07ed90d52 100644
--- a/paddle/fluid/operators/save_op.cc
+++ b/paddle/fluid/operators/save_op.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <fstream>
 
 #include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -68,6 +69,7 @@ class SaveOp : public framework::OperatorBase {
                const platform::Place &place) const override {
     auto filename = Attr<std::string>("file_path");
     auto overwrite = Attr<bool>("overwrite");
+    auto save_as_fp16 = Attr<bool>("save_as_fp16");
 
     if (FileExists(filename) && !overwrite) {
       PADDLE_THROW("%s is existed, cannot save to it when overwrite=false",
@@ -96,7 +98,18 @@ class SaveOp : public framework::OperatorBase {
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(place);
 
-    framework::SerializeToStream(fout, tensor, dev_ctx);
+    auto in_dtype = framework::ToDataType(tensor.type());
+    auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
+
+    if (in_dtype != out_dtype) {
+      auto in_kernel_type = framework::OpKernelType(in_dtype, place);
+      auto out_kernel_type = framework::OpKernelType(out_dtype, place);
+      framework::LoDTensor out;
+      framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
+      framework::SerializeToStream(fout, out, dev_ctx);
+    } else {
+      framework::SerializeToStream(fout, tensor, dev_ctx);
+    }
   }
 };
 
@@ -114,6 +127,12 @@ This operator will serialize and write a tensor variable to file on disk.
             "(boolean, default true)"
             "Overwrite the output file if exist")
        .SetDefault(true);
+    AddAttr<bool>("save_as_fp16",
+                  "(boolean, default false)"
+                  "If true, the tensor will be converted to float16 data "
+                  "type and then saved. Otherwise, the tensor will be "
+                  "directly saved without data type conversion.")
+        .SetDefault(false);
     AddAttr<std::string>("file_path",
                          "(string)"
                          "The \"file_path\" where the variable will be saved.")
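
Note (not part of the patch): the new save_as_fp16 attribute can also be set when a save op is appended to a program block from Python, which is essentially what the fluid.io.save_vars change later in this patch does. A minimal sketch with hypothetical names (w, /tmp/w.save):

    import paddle.fluid as fluid

    prog = fluid.Program()
    block = prog.global_block()
    # Hypothetical float32 persistable variable to be saved in float16.
    w = block.create_var(
        name='w', shape=[10], dtype='float32', persistable=True)
    block.append_op(
        type='save',
        inputs={'X': [w]},
        outputs={},
        attrs={
            'file_path': '/tmp/w.save',
            'save_as_fp16': True  # cast to float16 before serializing
        })
    # Running `prog` with a fluid.Executor would then write `w` to disk in float16.
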
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index a1e8ff6399f08..869b577bfde9b 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -409,9 +409,9 @@ All parameter, weight, gradient are variables in Paddle.
 
   py::class_<Executor>(m, "Executor")
       .def(py::init<const platform::Place &>())
-      .def("run",
-           (void (Executor::*)(const ProgramDesc &, Scope *, int, bool, bool)) &
-               Executor::Run);
+      .def("run", (void (Executor::*)(const ProgramDesc &, Scope *, int, bool,
+                                      bool) const) &
+                      Executor::Run);
   m.def("init_gflags", framework::InitGflags);
   m.def("init_glog", framework::InitGLOG);
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 4b841ef31dcb6..57f0724e701dd 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -23,16 +23,9 @@
 import unique_name
 
 __all__ = [
-    'Block',
-    'Variable',
-    'Program',
-    'Operator',
-    'default_startup_program',
-    'default_main_program',
-    'program_guard',
-    'switch_startup_program',
-    'switch_main_program',
-    'get_var',
+    'Block', 'Variable', 'Program', 'Operator', 'default_startup_program',
+    'default_main_program', 'program_guard', 'switch_startup_program',
+    'switch_main_program', 'get_var', 'np_dtype_to_fluid_dtype'
 ]
 
 EMPTY_VAR_NAME = core.kEmptyVarName()
@@ -41,6 +34,28 @@
 ZERO_VAR_SUFFIX = core.kZeroVarSuffix()
 
 
+def np_dtype_to_fluid_dtype(input):
+    """Change the dtype of float16 numpy array
+
+    numpy float16 is binded to paddle::platform::float16
+    in tensor_py.h via the help of uint16 data type since
+    the internal memory representation of float16 is
+    uint16_t in paddle and np.uint16 in numpy, which are
+    themselves binded together by pybind.
+
+    Args:
+        input: input numpy array
+
+    Returns:
+        input: The dtype of input will be changed to np.uint16 if
+            it is originally np.float16, such that the internal memory
+            of input will be reinterpreted as of dtype np.uint16.
+    """
+    if input.dtype == np.float16:
+        input.dtype = np.uint16
+    return input
+
+
 def grad_var_name(var_name):
     """
     return gradient name for a certain var name
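
A note on usage (not part of the patch): np_dtype_to_fluid_dtype only relabels the numpy buffer, it does not copy or convert any data. A minimal sketch of feeding float16 input, mirroring the infer() function of the new test at the end of this patch:

    import numpy as np
    import paddle.fluid as fluid

    tensor_img = np.random.rand(1, 3, 32, 32).astype(np.float16)
    # Reinterpret the float16 buffer as uint16 so that pybind maps it onto
    # paddle::platform::float16; the underlying bytes are left unchanged.
    tensor_img = fluid.np_dtype_to_fluid_dtype(tensor_img)
    assert tensor_img.dtype == np.uint16
    # tensor_img can now be passed in the feed dict of Executor.run().
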
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 1c0f1f6eb415b..87898aa190c5b 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -63,12 +63,40 @@ def _clone_var_in_block_(block, var):
         persistable=True)
 
 
+def _get_no_fp16_coversion_var_names_(program):
+    """
+    Get the set of input variable names that shouldn't be converted to float16.
+
+    When we want to save the trained parameters for float16 inference, most
+    parameters need to be firstly converted to float16 and then saved by the
+    save op. However, there are some parameters that shouldn't be converted to
+    float16 because the corresponding operator requires float32 parameters even
+    in float16 mode (when the input data is of float16 data type). Currently,
+    the only operator that has this exclusion is the batch norm op.
+
+    :param program: program to get the variable names
+    :type program: Program
+    :return: set of input variable names
+    :type var_names: set
+    """
+    op_names = {'batch_norm'}
+    var_names = set()
+    for block in program.blocks:
+        for op in block.ops:
+            if op.type in op_names:
+                input_names = op.input_arg_names
+                for in_name in input_names:
+                    var_names.add(in_name)
+    return var_names
+
+
 def save_vars(executor,
               dirname,
               main_program=None,
               vars=None,
               predicate=None,
-              filename=None):
+              filename=None,
+              use_float16=False):
     """
     Save variables to directory by executor.
 
@@ -85,33 +113,46 @@ def save_vars(executor,
     :return: None
     """
-    if vars is None:
-        if main_program is None:
-            main_program = default_main_program()
-        if not isinstance(main_program, Program):
-            raise TypeError("program should be as Program type or None")
+    if main_program is None:
+        main_program = default_main_program()
+    if not isinstance(main_program, Program):
+        raise TypeError("program should be as Program type or None")
 
+    if vars is None:
         save_vars(
             executor,
             dirname=dirname,
             vars=filter(predicate, main_program.list_vars()),
-            filename=filename)
+            filename=filename,
+            use_float16=use_float16)
     else:
         save_program = Program()
         save_block = save_program.global_block()
-        save_var_map = {}
+
+        # Get the names of variables that shouldn't be converted to float16 in
+        # float16 saving mode, right now it is limited to batch norm input weights.
+        no_conversion_var_names = _get_no_fp16_coversion_var_names_(
+            main_program)
+
         for each_var in vars:
             # NOTE: don't save the variable which type is RAW
            if each_var.type == core.VarDesc.VarType.RAW:
                 continue
+
             new_var = _clone_var_in_block_(save_block, each_var)
+            # Determine if a variable needed to be converted to float16 before saving
+            save_as_fp16 = use_float16 and new_var.name not in no_conversion_var_names
+
             if filename is None:
                 save_block.append_op(
                     type='save',
                     inputs={'X': [new_var]},
                     outputs={},
-                    attrs={'file_path': os.path.join(dirname, new_var.name)})
+                    attrs={
+                        'file_path': os.path.join(dirname, new_var.name),
+                        'save_as_fp16': save_as_fp16
+                    })
             else:
                 save_var_map[new_var.name] = new_var
 
@@ -129,7 +170,11 @@ def save_vars(executor,
         executor.run(save_program)
 
 
-def save_params(executor, dirname, main_program=None, filename=None):
+def save_params(executor,
+                dirname,
+                main_program=None,
+                filename=None,
+                use_float16=False):
     """
     Save all parameters to directory with executor.
     """
@@ -139,10 +184,15 @@ def save_params(executor, dirname, main_program=None, filename=None):
         main_program=main_program,
         vars=None,
         predicate=is_parameter,
-        filename=filename)
+        filename=filename,
+        use_float16=use_float16)
 
 
-def save_persistables(executor, dirname, main_program=None, filename=None):
+def save_persistables(executor,
+                      dirname,
+                      main_program=None,
+                      filename=None,
+                      use_float16=False):
     """
     Save all persistables to directory with executor.
     """
@@ -152,7 +202,8 @@ def save_persistables(executor, dirname, main_program=None, filename=None):
         main_program=main_program,
         vars=None,
         predicate=is_persistable,
-        filename=filename)
+        filename=filename,
+        use_float16=use_float16)
 
 
 def load_vars(executor,
@@ -301,7 +352,8 @@ def save_inference_model(dirname,
                          executor,
                          main_program=None,
                          model_filename=None,
-                         params_filename=None):
+                         params_filename=None,
+                         use_float16=False):
     """
     Build a model especially for inference, and save it to directory by the executor.
@@ -359,7 +411,12 @@ def save_inference_model(dirname,
     with open(model_filename, "wb") as f:
         f.write(inference_program.desc.serialize_to_string())
 
-    save_persistables(executor, dirname, inference_program, params_filename)
+    save_persistables(
+        executor,
+        dirname,
+        inference_program,
+        params_filename,
+        use_float16=use_float16)
 
 
 def get_feed_targets_names(program):
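
With the io.py changes above, saving a float16 inference model becomes a one-argument change at the call site. This is how the new book_float16 test below uses it (save_dirname, predict and exe come from an ordinary float32 training program):

    fluid.io.save_inference_model(
        save_dirname, ["pixel"], [predict],
        exe,
        use_float16=True)
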
diff --git a/python/paddle/fluid/tests/CMakeLists.txt b/python/paddle/fluid/tests/CMakeLists.txt
index d24417bbacb50..6a80da41b24ff 100644
--- a/python/paddle/fluid/tests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/CMakeLists.txt
@@ -8,3 +8,4 @@ endforeach()
 add_subdirectory(unittests)
 add_subdirectory(book)
 add_subdirectory(book_memory_optimization)
+add_subdirectory(book_float16)
diff --git a/python/paddle/fluid/tests/book_float16/.gitignore b/python/paddle/fluid/tests/book_float16/.gitignore
new file mode 100644
index 0000000000000..dd28d354f4160
--- /dev/null
+++ b/python/paddle/fluid/tests/book_float16/.gitignore
@@ -0,0 +1 @@
+*.inference.model
diff --git a/python/paddle/fluid/tests/book_float16/CMakeLists.txt b/python/paddle/fluid/tests/book_float16/CMakeLists.txt
new file mode 100644
index 0000000000000..673c965b662a0
--- /dev/null
+++ b/python/paddle/fluid/tests/book_float16/CMakeLists.txt
@@ -0,0 +1,7 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+# default test
+foreach(src ${TEST_OPS})
+  py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/fluid/tests/book_float16/test_float16_image_classification.py b/python/paddle/fluid/tests/book_float16/test_float16_image_classification.py
new file mode 100644
index 0000000000000..f41a5a281ea2c
--- /dev/null
+++ b/python/paddle/fluid/tests/book_float16/test_float16_image_classification.py
@@ -0,0 +1,240 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import paddle.fluid as fluid
+import contextlib
+import math
+import sys
+import numpy as np
+import unittest
+import os
+
+
+def resnet_cifar10(input, depth=32):
+    def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
+        tmp = fluid.layers.conv2d(
+            input=input,
+            filter_size=filter_size,
+            num_filters=ch_out,
+            stride=stride,
+            padding=padding,
+            act=None,
+            bias_attr=False)
+        return fluid.layers.batch_norm(input=tmp, act=act)
+
+    def shortcut(input, ch_in, ch_out, stride):
+        if ch_in != ch_out:
+            return conv_bn_layer(input, ch_out, 1, stride, 0, None)
+        else:
+            return input
+
+    def basicblock(input, ch_in, ch_out, stride):
+        tmp = conv_bn_layer(input, ch_out, 3, stride, 1)
+        tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None)
+        short = shortcut(input, ch_in, ch_out, stride)
+        return fluid.layers.elementwise_add(x=tmp, y=short, act='relu')
+
+    def layer_warp(block_func, input, ch_in, ch_out, count, stride):
+        tmp = block_func(input, ch_in, ch_out, stride)
+        for i in range(1, count):
+            tmp = block_func(tmp, ch_out, ch_out, 1)
+        return tmp
+
+    assert (depth - 2) % 6 == 0
+    n = (depth - 2) / 6
+    conv1 = conv_bn_layer(
+        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
+    res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
+    res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
+    res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
+    pool = fluid.layers.pool2d(
+        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
+    return pool
+
+
+def vgg16_bn_drop(input):
+    def conv_block(input, num_filter, groups, dropouts):
+        return fluid.nets.img_conv_group(
+            input=input,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act='relu',
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_type='max')
+
+    conv1 = conv_block(input, 64, 2, [0.3, 0])
+    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+    fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
+    bn = fluid.layers.batch_norm(input=fc1, act='relu')
+    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+    fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
+    return fc2
+
+
+def train(net_type, save_dirname):
+    classdim = 10
+    data_shape = [3, 32, 32]
+
+    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    if net_type == "vgg":
+        print("train vgg net")
+        net = vgg16_bn_drop(images)
+    elif net_type == "resnet":
+        print("train resnet")
+        net = resnet_cifar10(images, 32)
+    else:
+        raise ValueError("%s network is not supported" % net_type)
+
+    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(cost)
+    acc = fluid.layers.accuracy(input=predict, label=label)
+
+    # Test program
+    test_program = fluid.default_main_program().clone(for_test=True)
+
+    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
+    optimize_ops, params_grads = optimizer.minimize(avg_cost)
+
+    BATCH_SIZE = 128
+    PASS_NUM = 1
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.cifar.train10(), buf_size=128 * 10),
+        batch_size=BATCH_SIZE)
+
+    test_reader = paddle.batch(
+        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
+
+    place = fluid.CPUPlace()
+    if fluid.core.is_compiled_with_cuda():
+        place = fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
+
+    main_program = fluid.default_main_program()
+    exe.run(fluid.default_startup_program())
+    loss = 0.0
+    for pass_id in range(PASS_NUM):
+        for batch_id, data in enumerate(train_reader()):
+            exe.run(main_program, feed=feeder.feed(data))
+
+            if (batch_id % 10) == 0:
+                acc_list = []
+                avg_loss_list = []
+                for tid, test_data in enumerate(test_reader()):
+                    loss_t, acc_t = exe.run(program=test_program,
+                                            feed=feeder.feed(test_data),
+                                            fetch_list=[avg_cost, acc])
+                    if math.isnan(float(loss_t)):
+                        sys.exit("got NaN loss, training failed.")
+                    acc_list.append(float(acc_t))
+                    avg_loss_list.append(float(loss_t))
+                    break  # Use 1 segment for speeding up CI
+
+                acc_value = np.array(acc_list).mean()
+                avg_loss_value = np.array(avg_loss_list).mean()
+
+                print(
+                    'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
+                    format(pass_id, batch_id + 1,
+                           float(avg_loss_value), float(acc_value)))
+
+                if acc_value > 0.01:  # Low threshold for speeding up CI
+                    fluid.io.save_inference_model(
+                        save_dirname, ["pixel"], [predict],
+                        exe,
+                        use_float16=True)
+                    return
+
+
+def infer(save_dirname):
+    # float16 inference is currently only supported on CUDA GPU
+    if not fluid.core.is_compiled_with_cuda():
+        return
+
+    place = fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+
+    inference_scope = fluid.core.Scope()
+    with fluid.scope_guard(inference_scope):
+        # Use fluid.io.load_inference_model to obtain the inference program desc,
+        # the feed_target_names (the names of variables that will be feeded
+        # data using feed operators), and the fetch_targets (variables that
+        # we want to obtain data from using fetch operators).
+        [inference_program, feed_target_names,
+         fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+        # The input's dimension of conv should be 4-D or 5-D.
+        # Use normilized image pixels as input data, which should be in the range [0, 1.0].
+        batch_size = 1
+        # The input is of numpy float16 data type
+        tensor_img = np.random.rand(batch_size, 3, 32, 32).astype(np.float16)
+        # Construct feed as a dictionary of {feed_target_name: feed_target_data}
+        # and results will contain a list of data corresponding to fetch_targets.
+        # Use np_dtype_to_fluid_dtype to bind tensor_img of numpy float16 data type
+        # with fluid float16 data type so that it will invoke the inference engine
+        # to run in float16 mode.
+        results = exe.run(inference_program,
+                          feed={
+                              feed_target_names[0]:
+                              fluid.np_dtype_to_fluid_dtype(tensor_img)
+                          },
+                          fetch_list=fetch_targets)
+        print("infer results: ", results[0])
+
+
+def main(net_type):
+    # Directory for saving the trained model
+    save_dirname = "float16_image_classification_" + net_type + ".inference.model"
+
+    train(net_type, save_dirname)
+    infer(save_dirname)
+
+
+class TestFP16ImageClassification(unittest.TestCase):
+    def test_vgg(self):
+        with self.scope_prog_guard():
+            main('vgg')
+
+    def test_resnet(self):
+        with self.scope_prog_guard():
+            main('resnet')
+
+    @contextlib.contextmanager
+    def scope_prog_guard(self):
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
+            with fluid.program_guard(prog, startup_prog):
+                yield
+
+
+if __name__ == '__main__':
+    unittest.main()