From 81d54e60809700e0d2a6abadb0e8807f77505ea0 Mon Sep 17 00:00:00 2001
From: Nicky <nicky@baidu.com>
Date: Wed, 28 Mar 2018 12:08:44 -0700
Subject: [PATCH 1/6] Create Audio Feature in SDK * Add apis to record audio in
 SDK * Add corresponding apis in pybind, storage.py, sdk.h * Implement
 reservoir sampling when collecting audio samples

---
 visualdl/logic/pybind.cc       |  77 ++++++++++++++++++++--
 visualdl/logic/sdk.cc          | 111 +++++++++++++++++++++++++++++--
 visualdl/logic/sdk.h           | 117 +++++++++++++++++++++++++++++++++
 visualdl/python/storage.py     |  26 ++++++++
 visualdl/storage/storage.proto |   1 +
 visualdl/storage/tablet.h      |   4 ++
 6 files changed, 325 insertions(+), 11 deletions(-)

diff --git a/visualdl/logic/pybind.cc b/visualdl/logic/pybind.cc
index f5e292f57..1ac8bd2d2 100644
--- a/visualdl/logic/pybind.cc
+++ b/visualdl/logic/pybind.cc
@@ -74,14 +74,17 @@ PYBIND11_MODULE(core, m) {
       #undef READER_ADD_HISTOGRAM
 
       // clang-format on
-      .def("get_image",
-           [](vs::LogReader& self, const std::string& tag) {
-             auto tablet = self.tablet(tag);
-             return vs::components::ImageReader(self.mode(), tablet);
-           })
+      .def("get_image", [](vs::LogReader& self, const std::string& tag) {
+         auto tablet = self.tablet(tag);
+         return vs::components::ImageReader(self.mode(), tablet);
+      })
       .def("get_text", [](vs::LogReader& self, const std::string& tag) {
         auto tablet = self.tablet(tag);
         return vs::components::TextReader(tablet);
+      })
+      .def("get_audio", [](vs::LogReader& self, const std::string& tag) {
+        auto tablet = self.tablet(tag);
+        return vs::components::AudioReader(self.mode(), tablet);
       });
 
   // clang-format on
@@ -122,7 +125,15 @@ PYBIND11_MODULE(core, m) {
       .def("new_text", [](vs::LogWriter& self, const std::string& tag) {
         auto tablet = self.AddTablet(tag);
         return vs::components::Text(tablet);
-      });
+       })
+      .def("new_audio",
+            [](vs::LogWriter& self,
+               const std::string& tag,
+               int num_samples,
+               int step_cycle) {
+               auto tablet = self.AddTablet(tag);
+               return vs::components::Audio(tablet, num_samples, step_cycle);
+            });
 
 //------------------- components --------------------
 #define ADD_SCALAR_READER(T)                               \
@@ -219,6 +230,60 @@ PYBIND11_MODULE(core, m) {
       .def("total_records", &cp::TextReader::total_records)
       .def("size", &cp::TextReader::size);
 
+
+    py::class_<cp::Audio>(m, "AudioWriter", R"pbdoc(
+            PyBind class. Must instantiate through the LogWriter.
+          )pbdoc")
+    .def("set_caption", &cp::Audio::SetCaption, R"pbdoc(
+            PyBind class. Must instantiate through the LogWriter.
+          )pbdoc")
+    .def("start_sampling", &cp::Audio::StartSampling, R"pbdoc(
+            Start a sampling period, this interface will start a new reservoir sampling phase.
+          )pbdoc")
+    .def("is_sample_taken", &cp::Audio::IsSampleTaken, R"pbdoc(
+            Will this sample be taken, this interface is introduced to reduce the cost
+            of copy audio data, by testing whether this audio will be sampled, and only
+            copy data when it should be sampled. In that way, most of un-sampled audio
+            data need not be copied or processed at all.
+
+            :return: Index
+            :rtype: integer
+                  )pbdoc")
+    .def("finish_sampling", &cp::Audio::FinishSampling, R"pbdoc(
+            End a sampling period, it will clear all states for reservoir sampling.
+          )pbdoc")
+    .def("set_sample", &cp::Audio::SetSample, R"pbdoc(
+            Store the flatten audio data with sample rate specified.
+
+            :param index:
+            :type index: integer
+            :param sample_rate: Sample rate of audio
+            :type sample_rate: integer
+            :param audio_data: Flatten audio data
+            :type audio_data: list
+                  )pbdoc")
+    .def("add_sample", &cp::Audio::AddSample, R"pbdoc(
+            A combined interface for is_sample_taken and set_sample, simpler but is less efficient.
+
+            :param sample_rate: Sample rate of audio
+            :type sample_rate: integer
+            :param audio_data: Flatten audio data
+            :type audio_data: list
+                  )pbdoc");
+
+    py::class_<cp::AudioReader::AudioRecord>(m, "AudioRecord")
+    // TODO(ChunweiYan) make these copyless.
+    .def("data",        [](cp::AudioReader::AudioRecord& self) { return self.data; })
+    .def("sample_rate", [](cp::AudioReader::AudioRecord& self) { return self.sample_rate; })
+    .def("step_id",     [](cp::AudioReader::AudioRecord& self) { return self.step_id; });
+
+    py::class_<cp::AudioReader>(m, "AudioReader")
+    .def("caption", &cp::AudioReader::caption)
+    .def("num_records", &cp::AudioReader::num_records)
+    .def("num_samples", &cp::AudioReader::num_samples)
+    .def("record", &cp::AudioReader::record)
+    .def("timestamp", &cp::AudioReader::timestamp);
+
 #define ADD_HISTOGRAM_WRITER(T)                                          \
   py::class_<cp::Histogram<T>>(m, "HistogramWriter__" #T, \ 
    R"pbdoc(PyBind class. Must instantiate through the LogWriter.)pbdoc") \
diff --git a/visualdl/logic/sdk.cc b/visualdl/logic/sdk.cc
index db58e73ea..a61f9f235 100644
--- a/visualdl/logic/sdk.cc
+++ b/visualdl/logic/sdk.cc
@@ -222,11 +222,6 @@ void Image::SetSample(int index,
   CHECK_LT(index, num_samples_);
   CHECK_LE(index, num_records_);
 
-  // trick to store int8 to protobuf
-  std::vector<byte_t> data_str(data.size());
-  for (int i = 0; i < data.size(); i++) {
-    data_str[i] = data[i];
-  }
   Uint8Image image(new_shape[2], new_shape[0] * new_shape[1]);
   NormalizeImage(&image, &data[0], new_shape[0] * new_shape[1], new_shape[2]);
 
@@ -352,6 +347,112 @@ std::string TextReader::caption() const {
 
 size_t TextReader::size() const { return reader_.total_records(); }
 
+
+void Audio::StartSampling() {
+  if (!ToSampleThisStep()) return;
+
+  step_ = writer_.AddRecord();
+  step_.SetId(step_id_);
+
+  time_t time = std::time(nullptr);
+  step_.SetTimeStamp(time);
+
+  // resize record
+  for (int i = 0; i < num_samples_; i++) {
+    step_.AddData();
+  }
+  num_records_ = 0;
+}
+
+int Audio::IsSampleTaken() {
+  if (!ToSampleThisStep()) return -1;
+  num_records_++;
+  if (num_records_ <= num_samples_) {
+    return num_records_ - 1;
+  }
+  float prob = float(num_samples_) / num_records_;
+  float randv = (float)rand() / RAND_MAX;
+  if (randv < prob) {
+    // take this sample
+    int index = rand() % num_samples_;
+    return index;
+  }
+  return -1;
+}
+
+void Audio::FinishSampling() {
+  step_id_++;
+  if (ToSampleThisStep()) {
+    writer_.parent()->PersistToDisk();
+  }
+}
+
+void Audio::AddSample(int sample_rate,
+                      const std::vector<value_t>& data) {
+  auto idx = IsSampleTaken();
+  if (idx >= 0) {
+    SetSample(idx, sample_rate, data);
+  }
+}
+
+void Audio::SetSample(int index,
+                      int sample_rate,
+                      const std::vector<value_t>& data) {
+  CHECK_GT(sample_rate, 0) << "sample rate should be something like 6000, 8000 or 44100";
+  CHECK_LT(index, num_samples_);
+  CHECK_LE(index, num_records_);
+
+    //convert float vector to char vector
+  std::vector<char> data_str(data.size());
+  for (int i = 0; i < data.size(); i++) {
+    data_str[i] = data[i];
+  }
+
+  BinaryRecord brcd(
+          GenBinaryRecordDir(step_.parent()->dir()), std::string(data_str.data()));
+  brcd.tofile();
+
+  auto entry = step_.MutableData<std::vector<byte_t>>(index);
+  // update record
+  auto old_hash = entry.reader().GetRaw();
+  if (!old_hash.empty()) {
+    std::string old_path =
+            GenBinaryRecordDir(step_.parent()->dir()) + "/" + old_hash;
+    CHECK_EQ(std::remove(old_path.c_str()), 0) << "delete old binary record "
+                                               << old_path << " failed";
+  }
+  entry.SetRaw(brcd.filename());
+}
+
+std::string AudioReader::caption() {
+  CHECK_EQ(reader_.captions().size(), 1);
+  auto caption = reader_.captions().front();
+  if (LogReader::TagMatchMode(caption, mode_)) {
+    return LogReader::GenReadableTag(mode_, caption);
+  }
+  string::TagDecode(caption);
+  return caption;
+}
+
+AudioReader::AudioRecord AudioReader::record(int offset, int index) {
+  AudioRecord res;
+  auto record = reader_.record(offset);
+  auto entry = record.data(index);
+  auto filename = entry.GetRaw();
+  CHECK(!g_log_dir.empty())
+          << "g_log_dir should be set in LogReader construction";
+  BinaryRecordReader brcd(GenBinaryRecordDir(g_log_dir), filename);
+
+  std::transform(brcd.data.begin(),
+                 brcd.data.end(),
+                 std::back_inserter(res.data),
+                 [](byte_t i) { return (int)(i); });
+  res.step_id = record.id();
+  return res;
+}
+
+
+
 }  // namespace components
 
 }  // namespace visualdl
diff --git a/visualdl/logic/sdk.h b/visualdl/logic/sdk.h
index d1e08d0c9..04ca7b023 100644
--- a/visualdl/logic/sdk.h
+++ b/visualdl/logic/sdk.h
@@ -326,6 +326,123 @@ struct TextReader {
   TabletReader reader_;
 };
 
+
+
+
+/*
+ * Image component writer.
+ */
+struct Audio {
+    using value_t = float;
+
+    /*
+     * step_cycle: store every `step_cycle` as a record.
+     * num_samples: how many samples to take in a step.
+     */
+    Audio(Tablet tablet, int num_samples, int step_cycle)
+            : writer_(tablet), num_samples_(num_samples), step_cycle_(step_cycle) {
+      CHECK_GT(step_cycle, 0);
+      CHECK_GT(num_samples, 0);
+
+      writer_.SetType(Tablet::Type::kAudio);
+      // make audio's tag as the default caption.
+      writer_.SetNumSamples(num_samples);
+      SetCaption(tablet.reader().tag());
+    }
+
+    void SetCaption(const std::string& c) {
+      writer_.SetCaptions(std::vector<std::string>({c}));
+    }
+
+    /*
+     * Start a sampling period, this interface will start a new reservior sampling
+     * phase.
+     */
+    void StartSampling();
+    /*
+     * End a sampling period, it will clear all states for reservior sampling.
+     */
+    void FinishSampling();
+
+    /*
+     * A combined interface for IsSampleTaken and SetSample, simpler but might be
+     * low effience.
+     */
+    void AddSample(int sample_rate,
+                   const std::vector<value_t>& data);
+
+    /*
+     * Will this sample be taken, this interface is introduced to reduce the cost
+     * of copy audio data, by testing whether this audio will be sampled, and only
+     * copy data when it should be sampled. In that way, most of unsampled audio
+     * data need not be copied or processed at all.
+     */
+    int IsSampleTaken();
+    /*
+     * Store audio data with sample rate
+     */
+    void SetSample(int index,
+                   int sample_rate,
+                   const std::vector<value_t>& data);
+
+protected:
+    bool ToSampleThisStep() { return step_id_ % step_cycle_ == 0; }
+
+private:
+    Tablet writer_;
+    Record step_;
+    int num_records_{0};
+    int num_samples_{0};
+    int step_id_{0};
+    int step_cycle_;
+};
+
+/*
+* Audio reader.
+*/
+struct AudioReader {
+    using value_t = typename Audio::value_t;
+
+    struct AudioRecord {
+        int step_id;
+        int sample_rate;
+        std::vector<int> data;
+    };
+
+    AudioReader(const std::string& mode, TabletReader tablet)
+            : reader_(tablet), mode_{mode} {}
+
+    std::string caption();
+
+    // number of steps.
+    int num_records() { return reader_.total_records(); }
+
+    int num_samples() { return reader_.num_samples(); }
+
+    int64_t timestamp(int step) { return reader_.record(step).timestamp(); }
+
+    /*
+     * offset: offset of a step.
+     * index: index of a sample.
+     */
+    AudioRecord record(int offset, int index);
+
+    /*
+     * offset: offset of a step.
+     * index: index of a sample.
+     */
+    std::vector<value_t> data(int offset, int index);
+
+    int stepid(int offset, int index);
+
+private:
+    TabletReader reader_;
+    std::string mode_;
+};
+
+
+
+
 }  // namespace components
 }  // namespace visualdl
 
diff --git a/visualdl/python/storage.py b/visualdl/python/storage.py
index 4b519dd66..e537bd6d2 100644
--- a/visualdl/python/storage.py
+++ b/visualdl/python/storage.py
@@ -119,6 +119,16 @@ def text(self, tag):
         check_tag_name_valid(tag)
         return self.reader.get_text(tag)
 
+    def audio(self, tag):
+        """
+        Get a audio reader with tag
+
+        :param tag:  The reader will read the audio data marked with tag
+        :type tag: basestring
+        """
+        check_tag_name_valid(tag)
+        return self.reader.get_audio(tag)
+
     def __enter__(self):
         return self
 
@@ -226,6 +236,22 @@ def histogram(self, tag, num_buckets, type='float'):
         }
         return types[type](tag, num_buckets)
 
+    def audio(self, tag, num_samples, step_cycle=1):
+        """
+        Create an audio writer that used to write audio data.
+
+        :param tag: The audio writer will label the audio with tag
+        :type tag: basestring
+        :param num_samples: how many samples to take in a step.
+        :type num_samples: integer
+        :param step_cycle: store every `step_cycle` as a record.
+        :type step_cycle: integer
+        :return: A audio writer to sample audio
+        :rtype: AudioWriter
+        """
+        check_tag_name_valid(tag)
+        return self.writer.new_audio(tag, num_samples, step_cycle)
+
     def text(self, tag):
         check_tag_name_valid(tag)
         return self.writer.new_text(tag)
diff --git a/visualdl/storage/storage.proto b/visualdl/storage/storage.proto
index d6e827217..f822de014 100644
--- a/visualdl/storage/storage.proto
+++ b/visualdl/storage/storage.proto
@@ -108,6 +108,7 @@ message Tablet {
     kHistogram = 1;
     kImage = 2;
     kText = 3;
+    kAudio = 4;
   }
   // The unique identification for this `Tablet`. VisualDL will have no the
   // concept of FileWriter like TB. It will store all the tablets in a single
diff --git a/visualdl/storage/tablet.h b/visualdl/storage/tablet.h
index d314966d8..882659317 100644
--- a/visualdl/storage/tablet.h
+++ b/visualdl/storage/tablet.h
@@ -34,6 +34,7 @@ struct Tablet {
     kHistogram = 1,
     kImage = 2,
     kText = 3,
+    kAudio = 4,
     kUnknown = -1
   };
 
@@ -55,6 +56,9 @@ struct Tablet {
     if (name == "text") {
       return kText;
     }
+    if (name == "audio") {
+      return kAudio;
+    }
     LOG(ERROR) << "unknown component: " << name;
     return kUnknown;
   }

From 0be2512cd89f66acf2216bcdfd946a8cfbc83b69 Mon Sep 17 00:00:00 2001
From: Nicky <nicky@baidu.com>
Date: Wed, 28 Mar 2018 15:42:13 -0700
Subject: [PATCH 2/6] fix clang format and update based on comment

---
 visualdl/logic/pybind.cc   |  82 +++++++++---------
 visualdl/logic/sdk.cc      |  27 +++---
 visualdl/logic/sdk.h       | 173 ++++++++++++++++++-------------------
 visualdl/python/storage.py |   2 +-
 4 files changed, 139 insertions(+), 145 deletions(-)

diff --git a/visualdl/logic/pybind.cc b/visualdl/logic/pybind.cc
index 1ac8bd2d2..a3ebc48ef 100644
--- a/visualdl/logic/pybind.cc
+++ b/visualdl/logic/pybind.cc
@@ -74,14 +74,16 @@ PYBIND11_MODULE(core, m) {
       #undef READER_ADD_HISTOGRAM
 
       // clang-format on
-      .def("get_image", [](vs::LogReader& self, const std::string& tag) {
-         auto tablet = self.tablet(tag);
-         return vs::components::ImageReader(self.mode(), tablet);
-      })
-      .def("get_text", [](vs::LogReader& self, const std::string& tag) {
-        auto tablet = self.tablet(tag);
-        return vs::components::TextReader(tablet);
-      })
+      .def("get_image",
+           [](vs::LogReader& self, const std::string& tag) {
+             auto tablet = self.tablet(tag);
+             return vs::components::ImageReader(self.mode(), tablet);
+           })
+      .def("get_text",
+           [](vs::LogReader& self, const std::string& tag) {
+             auto tablet = self.tablet(tag);
+             return vs::components::TextReader(tablet);
+           })
       .def("get_audio", [](vs::LogReader& self, const std::string& tag) {
         auto tablet = self.tablet(tag);
         return vs::components::AudioReader(self.mode(), tablet);
@@ -122,18 +124,19 @@ PYBIND11_MODULE(core, m) {
              auto tablet = self.AddTablet(tag);
              return vs::components::Image(tablet, num_samples, step_cycle);
            })
-      .def("new_text", [](vs::LogWriter& self, const std::string& tag) {
-        auto tablet = self.AddTablet(tag);
-        return vs::components::Text(tablet);
-       })
+      .def("new_text",
+           [](vs::LogWriter& self, const std::string& tag) {
+             auto tablet = self.AddTablet(tag);
+             return vs::components::Text(tablet);
+           })
       .def("new_audio",
-            [](vs::LogWriter& self,
-               const std::string& tag,
-               int num_samples,
-               int step_cycle) {
-               auto tablet = self.AddTablet(tag);
-               return vs::components::Audio(tablet, num_samples, step_cycle);
-            });
+           [](vs::LogWriter& self,
+              const std::string& tag,
+              int num_samples,
+              int step_cycle) {
+             auto tablet = self.AddTablet(tag);
+             return vs::components::Audio(tablet, num_samples, step_cycle);
+           });
 
 //------------------- components --------------------
 #define ADD_SCALAR_READER(T)                               \
@@ -230,17 +233,16 @@ PYBIND11_MODULE(core, m) {
       .def("total_records", &cp::TextReader::total_records)
       .def("size", &cp::TextReader::size);
 
-
-    py::class_<cp::Audio>(m, "AudioWriter", R"pbdoc(
+  py::class_<cp::Audio>(m, "AudioWriter", R"pbdoc(
             PyBind class. Must instantiate through the LogWriter.
           )pbdoc")
-    .def("set_caption", &cp::Audio::SetCaption, R"pbdoc(
+      .def("set_caption", &cp::Audio::SetCaption, R"pbdoc(
             PyBind class. Must instantiate through the LogWriter.
           )pbdoc")
-    .def("start_sampling", &cp::Audio::StartSampling, R"pbdoc(
+      .def("start_sampling", &cp::Audio::StartSampling, R"pbdoc(
             Start a sampling period, this interface will start a new reservoir sampling phase.
           )pbdoc")
-    .def("is_sample_taken", &cp::Audio::IsSampleTaken, R"pbdoc(
+      .def("is_sample_taken", &cp::Audio::IsSampleTaken, R"pbdoc(
             Will this sample be taken, this interface is introduced to reduce the cost
             of copy audio data, by testing whether this audio will be sampled, and only
             copy data when it should be sampled. In that way, most of un-sampled audio
@@ -249,10 +251,10 @@ PYBIND11_MODULE(core, m) {
             :return: Index
             :rtype: integer
                   )pbdoc")
-    .def("finish_sampling", &cp::Audio::FinishSampling, R"pbdoc(
+      .def("finish_sampling", &cp::Audio::FinishSampling, R"pbdoc(
             End a sampling period, it will clear all states for reservoir sampling.
           )pbdoc")
-    .def("set_sample", &cp::Audio::SetSample, R"pbdoc(
+      .def("set_sample", &cp::Audio::SetSample, R"pbdoc(
             Store the flatten audio data with sample rate specified.
 
             :param index:
@@ -262,7 +264,7 @@ PYBIND11_MODULE(core, m) {
             :param audio_data: Flatten audio data
             :type audio_data: list
                   )pbdoc")
-    .def("add_sample", &cp::Audio::AddSample, R"pbdoc(
+      .def("add_sample", &cp::Audio::AddSample, R"pbdoc(
             A combined interface for is_sample_taken and set_sample, simpler but is less efficient.
 
             :param sample_rate: Sample rate of audio
@@ -271,18 +273,20 @@ PYBIND11_MODULE(core, m) {
             :type audio_data: list
                   )pbdoc");
 
-    py::class_<cp::AudioReader::AudioRecord>(m, "AudioRecord")
-    // TODO(ChunweiYan) make these copyless.
-    .def("data",        [](cp::AudioReader::AudioRecord& self) { return self.data; })
-    .def("sample_rate", [](cp::AudioReader::AudioRecord& self) { return self.sample_rate; })
-    .def("step_id",     [](cp::AudioReader::AudioRecord& self) { return self.step_id; });
-
-    py::class_<cp::AudioReader>(m, "AudioReader")
-    .def("caption", &cp::AudioReader::caption)
-    .def("num_records", &cp::AudioReader::num_records)
-    .def("num_samples", &cp::AudioReader::num_samples)
-    .def("record", &cp::AudioReader::record)
-    .def("timestamp", &cp::AudioReader::timestamp);
+  py::class_<cp::AudioReader::AudioRecord>(m, "AudioRecord")
+      // TODO(ChunweiYan) make these copyless.
+      .def("data", [](cp::AudioReader::AudioRecord& self) { return self.data; })
+      .def("sample_rate",
+           [](cp::AudioReader::AudioRecord& self) { return self.sample_rate; })
+      .def("step_id",
+           [](cp::AudioReader::AudioRecord& self) { return self.step_id; });
+
+  py::class_<cp::AudioReader>(m, "AudioReader")
+      .def("caption", &cp::AudioReader::caption)
+      .def("num_records", &cp::AudioReader::num_records)
+      .def("num_samples", &cp::AudioReader::num_samples)
+      .def("record", &cp::AudioReader::record)
+      .def("timestamp", &cp::AudioReader::timestamp);
 
 #define ADD_HISTOGRAM_WRITER(T)                                          \
   py::class_<cp::Histogram<T>>(m, "HistogramWriter__" #T, \ 
diff --git a/visualdl/logic/sdk.cc b/visualdl/logic/sdk.cc
index a61f9f235..24dba325c 100644
--- a/visualdl/logic/sdk.cc
+++ b/visualdl/logic/sdk.cc
@@ -347,7 +347,6 @@ std::string TextReader::caption() const {
 
 size_t TextReader::size() const { return reader_.total_records(); }
 
-
 void Audio::StartSampling() {
   if (!ToSampleThisStep()) return;
 
@@ -387,8 +386,7 @@ void Audio::FinishSampling() {
   }
 }
 
-void Audio::AddSample(int sample_rate,
-                      const std::vector<value_t>& data) {
+void Audio::AddSample(int sample_rate, const std::vector<value_t>& data) {
   auto idx = IsSampleTaken();
   if (idx >= 0) {
     SetSample(idx, sample_rate, data);
@@ -398,18 +396,21 @@ void Audio::AddSample(int sample_rate,
 void Audio::SetSample(int index,
                       int sample_rate,
                       const std::vector<value_t>& data) {
-  CHECK_GT(sample_rate, 0) << "sample rate should be something like 6000, 8000 or 44100";
-  CHECK_LT(index, num_samples_);
-  CHECK_LE(index, num_records_);
-
-    //convert float vector to char vector
+  CHECK_GT(sample_rate, 0)
+      << "sample rate should be something like 6000, 8000 or 44100";
+  CHECK_LT(index, num_samples_)
+      << "index should be less than number of samples";
+  CHECK_LE(index, num_records_)
+      << "index should be less than or equal to number of records";
+
+  // convert float vector to char vector
   std::vector<char> data_str(data.size());
   for (int i = 0; i < data.size(); i++) {
     data_str[i] = data[i];
   }
 
-  BinaryRecord brcd(
-          GenBinaryRecordDir(step_.parent()->dir()), std::string(data_str.data()));
+  BinaryRecord brcd(GenBinaryRecordDir(step_.parent()->dir()),
+                    std::string(data_str.data()));
   brcd.tofile();
 
   auto entry = step_.MutableData<std::vector<byte_t>>(index);
@@ -417,7 +418,7 @@ void Audio::SetSample(int index,
   auto old_hash = entry.reader().GetRaw();
   if (!old_hash.empty()) {
     std::string old_path =
-            GenBinaryRecordDir(step_.parent()->dir()) + "/" + old_hash;
+        GenBinaryRecordDir(step_.parent()->dir()) + "/" + old_hash;
     CHECK_EQ(std::remove(old_path.c_str()), 0) << "delete old binary record "
                                                << old_path << " failed";
   }
@@ -440,7 +441,7 @@ AudioReader::AudioRecord AudioReader::record(int offset, int index) {
   auto entry = record.data(index);
   auto filename = entry.GetRaw();
   CHECK(!g_log_dir.empty())
-          << "g_log_dir should be set in LogReader construction";
+      << "g_log_dir should be set in LogReader construction";
   BinaryRecordReader brcd(GenBinaryRecordDir(g_log_dir), filename);
 
   std::transform(brcd.data.begin(),
@@ -451,8 +452,6 @@ AudioReader::AudioRecord AudioReader::record(int offset, int index) {
   return res;
 }
 
-
-
 }  // namespace components
 
 }  // namespace visualdl
diff --git a/visualdl/logic/sdk.h b/visualdl/logic/sdk.h
index 04ca7b023..44a5d68a3 100644
--- a/visualdl/logic/sdk.h
+++ b/visualdl/logic/sdk.h
@@ -171,7 +171,7 @@ struct Image {
 
   /*
    * A combined interface for IsSampleTaken and SetSample, simpler but might be
-   * low effience.
+   * low efficiency.
    */
   void AddSample(const std::vector<shape_t>& shape,
                  const std::vector<value_t>& data);
@@ -326,123 +326,114 @@ struct TextReader {
   TabletReader reader_;
 };
 
-
-
-
 /*
  * Image component writer.
  */
 struct Audio {
-    using value_t = float;
-
-    /*
-     * step_cycle: store every `step_cycle` as a record.
-     * num_samples: how many samples to take in a step.
-     */
-    Audio(Tablet tablet, int num_samples, int step_cycle)
-            : writer_(tablet), num_samples_(num_samples), step_cycle_(step_cycle) {
-      CHECK_GT(step_cycle, 0);
-      CHECK_GT(num_samples, 0);
-
-      writer_.SetType(Tablet::Type::kAudio);
-      // make audio's tag as the default caption.
-      writer_.SetNumSamples(num_samples);
-      SetCaption(tablet.reader().tag());
-    }
-
-    void SetCaption(const std::string& c) {
-      writer_.SetCaptions(std::vector<std::string>({c}));
-    }
-
-    /*
-     * Start a sampling period, this interface will start a new reservior sampling
-     * phase.
-     */
-    void StartSampling();
-    /*
-     * End a sampling period, it will clear all states for reservior sampling.
-     */
-    void FinishSampling();
-
-    /*
-     * A combined interface for IsSampleTaken and SetSample, simpler but might be
-     * low effience.
-     */
-    void AddSample(int sample_rate,
-                   const std::vector<value_t>& data);
-
-    /*
-     * Will this sample be taken, this interface is introduced to reduce the cost
-     * of copy audio data, by testing whether this audio will be sampled, and only
-     * copy data when it should be sampled. In that way, most of unsampled audio
-     * data need not be copied or processed at all.
-     */
-    int IsSampleTaken();
-    /*
-     * Store audio data with sample rate
-     */
-    void SetSample(int index,
-                   int sample_rate,
-                   const std::vector<value_t>& data);
+  using value_t = float;
+
+  /*
+   * step_cycle: store every `step_cycle` as a record.
+   * num_samples: how many samples to take in a step.
+   */
+  Audio(Tablet tablet, int num_samples, int step_cycle)
+      : writer_(tablet), num_samples_(num_samples), step_cycle_(step_cycle) {
+    CHECK_GT(step_cycle, 0);
+    CHECK_GT(num_samples, 0);
+
+    writer_.SetType(Tablet::Type::kAudio);
+    // make audio's tag as the default caption.
+    writer_.SetNumSamples(num_samples);
+    SetCaption(tablet.reader().tag());
+  }
+
+  void SetCaption(const std::string& c) {
+    writer_.SetCaptions(std::vector<std::string>({c}));
+  }
+
+  /*
+   * Start a sampling period, this interface will start a new reservior sampling
+   * phase.
+   */
+  void StartSampling();
+  /*
+   * End a sampling period, it will clear all states for reservior sampling.
+   */
+  void FinishSampling();
+
+  /*
+   * A combined interface for IsSampleTaken and SetSample, simpler but might be
+   * low efficiency.
+   */
+  void AddSample(int sample_rate, const std::vector<value_t>& data);
+
+  /*
+   * Will this sample be taken, this interface is introduced to reduce the cost
+   * of copy audio data, by testing whether this audio will be sampled, and only
+   * copy data when it should be sampled. In that way, most of unsampled audio
+   * data need not be copied or processed at all.
+   */
+  int IsSampleTaken();
+  /*
+   * Store audio data with sample rate
+   */
+  void SetSample(int index, int sample_rate, const std::vector<value_t>& data);
 
 protected:
-    bool ToSampleThisStep() { return step_id_ % step_cycle_ == 0; }
+  bool ToSampleThisStep() { return step_id_ % step_cycle_ == 0; }
 
 private:
-    Tablet writer_;
-    Record step_;
-    int num_records_{0};
-    int num_samples_{0};
-    int step_id_{0};
-    int step_cycle_;
+  Tablet writer_;
+  Record step_;
+  int num_records_{0};
+  int num_samples_{0};
+  int step_id_{0};
+  int step_cycle_;
 };
 
 /*
 * Audio reader.
 */
 struct AudioReader {
-    using value_t = typename Audio::value_t;
+  using value_t = typename Audio::value_t;
 
-    struct AudioRecord {
-        int step_id;
-        int sample_rate;
-        std::vector<int> data;
-    };
+  struct AudioRecord {
+    int step_id;
+    int sample_rate;
+    std::vector<int> data;
+  };
 
-    AudioReader(const std::string& mode, TabletReader tablet)
-            : reader_(tablet), mode_{mode} {}
+  AudioReader(const std::string& mode, TabletReader tablet)
+      : reader_(tablet), mode_{mode} {}
 
-    std::string caption();
+  std::string caption();
 
-    // number of steps.
-    int num_records() { return reader_.total_records(); }
+  // number of steps.
+  int num_records() { return reader_.total_records(); }
 
-    int num_samples() { return reader_.num_samples(); }
+  int num_samples() { return reader_.num_samples(); }
 
-    int64_t timestamp(int step) { return reader_.record(step).timestamp(); }
+  int64_t timestamp(int step) { return reader_.record(step).timestamp(); }
 
-    /*
-     * offset: offset of a step.
-     * index: index of a sample.
-     */
-    AudioRecord record(int offset, int index);
+  /*
+   * offset: offset of a step.
+   * index: index of a sample.
+   */
+  AudioRecord record(int offset, int index);
 
-    /*
-     * offset: offset of a step.
-     * index: index of a sample.
-     */
-    std::vector<value_t> data(int offset, int index);
+  /*
+   * offset: offset of a step.
+   * index: index of a sample.
+   */
+  std::vector<value_t> data(int offset, int index);
 
-    int stepid(int offset, int index);
+  int stepid(int offset, int index);
 
 private:
-    TabletReader reader_;
-    std::string mode_;
+  TabletReader reader_;
+  std::string mode_;
 };
 
-
-
-
 }  // namespace components
 }  // namespace visualdl
 
diff --git a/visualdl/python/storage.py b/visualdl/python/storage.py
index e537bd6d2..912e8c7ea 100644
--- a/visualdl/python/storage.py
+++ b/visualdl/python/storage.py
@@ -121,7 +121,7 @@ def text(self, tag):
 
     def audio(self, tag):
         """
-        Get a audio reader with tag
+        Get an audio reader with tag
 
         :param tag:  The reader will read the audio data marked with tag
         :type tag: basestring

From d843de7cae994061d936b9ee78179d900300323e Mon Sep 17 00:00:00 2001
From: Nicky <nicky@baidu.com>
Date: Wed, 28 Mar 2018 16:28:47 -0700
Subject: [PATCH 3/6] use int8_t for reading records and convert string directly
 from vector

---
 visualdl/logic/sdk.cc | 11 ++---------
 visualdl/logic/sdk.h  |  2 +-
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/visualdl/logic/sdk.cc b/visualdl/logic/sdk.cc
index 24dba325c..0adc3d185 100644
--- a/visualdl/logic/sdk.cc
+++ b/visualdl/logic/sdk.cc
@@ -403,14 +403,7 @@ void Audio::SetSample(int index,
   CHECK_LE(index, num_records_)
       << "index should be less than or equal to number of records";
 
-  // convert float vector to char vector
-  std::vector<char> data_str(data.size());
-  for (int i = 0; i < data.size(); i++) {
-    data_str[i] = data[i];
-  }
-
-  BinaryRecord brcd(GenBinaryRecordDir(step_.parent()->dir()),
-                    std::string(data_str.data()));
+  BinaryRecord brcd(GenBinaryRecordDir(step_.parent()->dir()), std::string(data.begin(),data.end()));
   brcd.tofile();
 
   auto entry = step_.MutableData<std::vector<byte_t>>(index);
@@ -447,7 +440,7 @@ AudioReader::AudioRecord AudioReader::record(int offset, int index) {
   std::transform(brcd.data.begin(),
                  brcd.data.end(),
                  std::back_inserter(res.data),
-                 [](byte_t i) { return (int)(i); });
+                 [](byte_t i) { return (int8_t)(i); });
   res.step_id = record.id();
   return res;
 }
diff --git a/visualdl/logic/sdk.h b/visualdl/logic/sdk.h
index 44a5d68a3..37d2b308e 100644
--- a/visualdl/logic/sdk.h
+++ b/visualdl/logic/sdk.h
@@ -400,7 +400,7 @@ struct AudioReader {
   struct AudioRecord {
     int step_id;
     int sample_rate;
-    std::vector<int> data;
+    std::vector<int8_t> data;
   };
 
   AudioReader(const std::string& mode, TabletReader tablet)

From 5d1b040eebee72e3066af3548e7b984ac89ec22d Mon Sep 17 00:00:00 2001
From: Nicky <nicky@baidu.com>
Date: Wed, 28 Mar 2018 17:01:45 -0700
Subject: [PATCH 4/6] refactor IsSampleTaken to IndexOfSampleTaken

---
 visualdl/logic/pybind.cc | 6 +++---
 visualdl/logic/sdk.cc    | 8 ++++----
 visualdl/logic/sdk.h     | 8 ++++----
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/visualdl/logic/pybind.cc b/visualdl/logic/pybind.cc
index a3ebc48ef..f9582b267 100644
--- a/visualdl/logic/pybind.cc
+++ b/visualdl/logic/pybind.cc
@@ -175,7 +175,7 @@ PYBIND11_MODULE(core, m) {
       .def("start_sampling", &cp::Image::StartSampling, R"pbdoc(
         Start a sampling period, this interface will start a new reservoir sampling phase.
       )pbdoc")
-      .def("is_sample_taken", &cp::Image::IsSampleTaken, R"pbdoc(
+      .def("is_sample_taken", &cp::Image::IndexOfSampleTaken, R"pbdoc(
         Will this sample be taken, this interface is introduced to reduce the cost
         of copy image data, by testing whether this image will be sampled, and only
         copy data when it should be sampled. In that way, most of un-sampled image
@@ -242,7 +242,7 @@ PYBIND11_MODULE(core, m) {
       .def("start_sampling", &cp::Audio::StartSampling, R"pbdoc(
             Start a sampling period, this interface will start a new reservoir sampling phase.
           )pbdoc")
-      .def("is_sample_taken", &cp::Audio::IsSampleTaken, R"pbdoc(
+      .def("is_sample_taken", &cp::Audio::IndexOfSampleTaken, R"pbdoc(
             Will this sample be taken, this interface is introduced to reduce the cost
             of copy audio data, by testing whether this audio will be sampled, and only
             copy data when it should be sampled. In that way, most of un-sampled audio
@@ -274,7 +274,7 @@ PYBIND11_MODULE(core, m) {
                   )pbdoc");
 
   py::class_<cp::AudioReader::AudioRecord>(m, "AudioRecord")
-      // TODO(ChunweiYan) make these copyless.
+      // TODO(Nicky) make these copyless.
       .def("data", [](cp::AudioReader::AudioRecord& self) { return self.data; })
       .def("sample_rate",
            [](cp::AudioReader::AudioRecord& self) { return self.sample_rate; })
diff --git a/visualdl/logic/sdk.cc b/visualdl/logic/sdk.cc
index 0adc3d185..8a193cc6b 100644
--- a/visualdl/logic/sdk.cc
+++ b/visualdl/logic/sdk.cc
@@ -160,7 +160,7 @@ void Image::StartSampling() {
   num_records_ = 0;
 }
 
-int Image::IsSampleTaken() {
+int Image::IndexOfSampleTaken() {
   if (!ToSampleThisStep()) return -1;
   num_records_++;
   if (num_records_ <= num_samples_) {
@@ -195,7 +195,7 @@ struct is_same_type<T, T> {
 
 void Image::AddSample(const std::vector<shape_t>& shape,
                       const std::vector<value_t>& data) {
-  auto idx = IsSampleTaken();
+  auto idx = IndexOfSampleTaken();
   if (idx >= 0) {
     SetSample(idx, shape, data);
   }
@@ -363,7 +363,7 @@ void Audio::StartSampling() {
   num_records_ = 0;
 }
 
-int Audio::IsSampleTaken() {
+int Audio::IndexOfSampleTaken() {
   if (!ToSampleThisStep()) return -1;
   num_records_++;
   if (num_records_ <= num_samples_) {
@@ -387,7 +387,7 @@ void Audio::FinishSampling() {
 }
 
 void Audio::AddSample(int sample_rate, const std::vector<value_t>& data) {
-  auto idx = IsSampleTaken();
+  auto idx = IndexOfSampleTaken();
   if (idx >= 0) {
     SetSample(idx, sample_rate, data);
   }
diff --git a/visualdl/logic/sdk.h b/visualdl/logic/sdk.h
index 37d2b308e..e721242af 100644
--- a/visualdl/logic/sdk.h
+++ b/visualdl/logic/sdk.h
@@ -170,7 +170,7 @@ struct Image {
   void FinishSampling();
 
   /*
-   * A combined interface for IsSampleTaken and SetSample, simpler but might be
+   * A combined interface for IndexOfSampleTaken and SetSample, simpler but might be
    * low efficiency.
    */
   void AddSample(const std::vector<shape_t>& shape,
@@ -182,7 +182,7 @@ struct Image {
    * copy data when it should be sampled. In that way, most of unsampled image
    * data need not be copied or processed at all.
    */
-  int IsSampleTaken();
+  int IndexOfSampleTaken();
   /*
    * Just store a tensor with nothing to do with image format.
    */
@@ -362,7 +362,7 @@ struct Audio {
   void FinishSampling();
 
   /*
-   * A combined interface for IsSampleTaken and SetSample, simpler but might be
+   * A combined interface for IndexOfSampleTaken and SetSample, simpler but might be
    * low efficiency.
    */
   void AddSample(int sample_rate, const std::vector<value_t>& data);
@@ -373,7 +373,7 @@ struct Audio {
    * copy data when it should be sampled. In that way, most of unsampled audio
    * data need not be copied or processed at all.
    */
-  int IsSampleTaken();
+  int IndexOfSampleTaken();
   /*
    * Store audio data with sample rate
    */

From 3de9514ba208213f330c977bb299feb14c861138 Mon Sep 17 00:00:00 2001
From: Nicky <nicky@baidu.com>
Date: Wed, 28 Mar 2018 17:25:48 -0700
Subject: [PATCH 5/6] fix clang format again

---
 visualdl/logic/sdk.cc      |  3 ++-
 visualdl/logic/sdk.h       |  6 ++++--
 visualdl/logic/sdk_test.cc | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/visualdl/logic/sdk.cc b/visualdl/logic/sdk.cc
index 8a193cc6b..04944556d 100644
--- a/visualdl/logic/sdk.cc
+++ b/visualdl/logic/sdk.cc
@@ -403,7 +403,8 @@ void Audio::SetSample(int index,
   CHECK_LE(index, num_records_)
       << "index should be less than or equal to number of records";
 
-  BinaryRecord brcd(GenBinaryRecordDir(step_.parent()->dir()), std::string(data.begin(),data.end()));
+  BinaryRecord brcd(GenBinaryRecordDir(step_.parent()->dir()),
+                    std::string(data.begin(), data.end()));
   brcd.tofile();
 
   auto entry = step_.MutableData<std::vector<byte_t>>(index);
diff --git a/visualdl/logic/sdk.h b/visualdl/logic/sdk.h
index e721242af..6f3fece24 100644
--- a/visualdl/logic/sdk.h
+++ b/visualdl/logic/sdk.h
@@ -170,7 +170,8 @@ struct Image {
   void FinishSampling();
 
   /*
-   * A combined interface for IndexOfSampleTaken and SetSample, simpler but might be
+   * A combined interface for IndexOfSampleTaken and SetSample, simpler but
+   * might be
    * low efficiency.
    */
   void AddSample(const std::vector<shape_t>& shape,
@@ -362,7 +363,8 @@ struct Audio {
   void FinishSampling();
 
   /*
-   * A combined interface for IndexOfSampleTaken and SetSample, simpler but might be
+   * A combined interface for IndexOfSampleTaken and SetSample, simpler but
+   * might be
    * low efficiency.
    */
   void AddSample(int sample_rate, const std::vector<value_t>& data);
diff --git a/visualdl/logic/sdk_test.cc b/visualdl/logic/sdk_test.cc
index 0a42ff23c..eabc9b354 100644
--- a/visualdl/logic/sdk_test.cc
+++ b/visualdl/logic/sdk_test.cc
@@ -132,6 +132,40 @@ TEST(Image, add_sample_test) {
   CHECK_EQ(image2read.num_records(), num_steps);
 }
 
+// Image round trip through the combined AddSample() path. The name must differ
+// from the existing TEST(Image, add_sample_test): gtest names must be unique.
+TEST(Image, add_sample_roundtrip_test) {
+  const auto dir = "./tmp/sdk_test.image";
+  LogWriter writer__(dir, 4);
+  auto writer = writer__.AsMode("train");
+  auto tablet = writer.AddTablet("image0");
+  components::Image image(tablet, 3, 1);
+  const int num_steps = 10;
+
+  LOG(INFO) << "write images";
+  image.SetCaption("this is an image");
+  for (int step = 0; step < num_steps; step++) {
+    image.StartSampling();
+    for (int i = 0; i < 7; i++) {
+      vector<int64_t> shape({5, 5, 3});
+      vector<float> data;
+      for (int j = 0; j < 3 * 5 * 5; j++) {
+        data.push_back(float(rand()) / RAND_MAX);
+      }
+      image.AddSample(shape, data);
+    }
+    image.FinishSampling();
+  }
+
+  LOG(INFO) << "read images";
+  LogReader reader__(dir);
+  auto reader = reader__.AsMode("train");
+  auto tablet2read = reader.tablet("image0");
+  components::ImageReader image2read("train", tablet2read);
+  CHECK_EQ(image2read.caption(), "this is an image");
+  CHECK_EQ(image2read.num_records(), num_steps);
+}
+
 TEST(Histogram, AddRecord) {
   const auto dir = "./tmp/sdk_test.histogram";
   LogWriter writer__(dir, 1);

From c8b29e38aa0aaee26debf6b686c00789dacbe64a Mon Sep 17 00:00:00 2001
From: Nicky <nicky@baidu.com>
Date: Wed, 28 Mar 2018 20:14:52 -0700
Subject: [PATCH 6/6] add sdk unit test for audio api

---
 visualdl/logic/sdk_test.cc | 71 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 70 insertions(+), 1 deletion(-)

diff --git a/visualdl/logic/sdk_test.cc b/visualdl/logic/sdk_test.cc
index eabc9b354..818a3a92e 100644
--- a/visualdl/logic/sdk_test.cc
+++ b/visualdl/logic/sdk_test.cc
@@ -80,7 +80,7 @@ TEST(Image, test) {
       for (int j = 0; j < 3 * 5 * 5; j++) {
         data.push_back(float(rand()) / RAND_MAX);
       }
-      int index = image.IsSampleTaken();
+      int index = image.IndexOfSampleTaken();
       if (index != -1) {
         image.SetSample(index, shape, data);
       }
@@ -166,6 +166,75 @@ TEST(Image, add_sample_test) {
   CHECK_EQ(image2read.num_records(), num_steps);
 }
 
+// Audio round trip using the manual IndexOfSampleTaken() + SetSample() path.
+TEST(Audio, test) {
+  const auto dir = "./tmp/sdk_test.audio";
+  LogWriter writer__(dir, 4);
+  auto writer = writer__.AsMode("train");
+
+  auto tablet = writer.AddTablet("audio0");
+  components::Audio audio(tablet, 3, 1);
+  const int num_steps = 10;
+
+  LOG(INFO) << "write audio";
+  audio.SetCaption("this is an audio");
+  for (int step = 0; step < num_steps; step++) {
+    audio.StartSampling();
+    for (int i = 0; i < 7; i++) {
+      vector<float> data;
+      for (int j = 0; j < 3 * 5 * 5; j++) {
+        data.push_back(float(rand()) / RAND_MAX);
+      }
+      int index = audio.IndexOfSampleTaken();
+      if (index != -1) {
+        audio.SetSample(index, 16000, data);
+      }
+    }
+    audio.FinishSampling();
+  }
+
+  LOG(INFO) << "read audio";
+  // Read the log back and verify the caption and the record count.
+  LogReader reader__(dir);
+  auto reader = reader__.AsMode("train");
+  auto tablet2read = reader.tablet("audio0");
+  components::AudioReader audio2read("train", tablet2read);
+  CHECK_EQ(audio2read.caption(), "this is an audio");
+  CHECK_EQ(audio2read.num_records(), num_steps);
+}
+
+// Audio round trip through the combined AddSample() path, then read it back.
+TEST(Audio, add_sample_test) {
+  const auto dir = "./tmp/sdk_test.audio";
+  LogWriter writer__(dir, 4);
+  auto writer = writer__.AsMode("train");
+  auto tablet = writer.AddTablet("audio0");
+  components::Audio audio(tablet, 3, 1);
+  const int num_steps = 10;
+
+  LOG(INFO) << "write audio";
+  audio.SetCaption("this is an audio");
+  for (int step = 0; step < num_steps; step++) {
+    audio.StartSampling();
+    for (int i = 0; i < 7; i++) {
+      vector<float> data;
+      for (int j = 0; j < 3 * 5 * 5; j++) {
+        data.push_back(float(rand()) / RAND_MAX);
+      }
+      audio.AddSample(16000, data);
+    }
+    audio.FinishSampling();
+  }
+
+  LOG(INFO) << "read audio";
+  LogReader reader__(dir);
+  auto reader = reader__.AsMode("train");
+  auto tablet2read = reader.tablet("audio0");
+  components::AudioReader audio2read("train", tablet2read);
+  CHECK_EQ(audio2read.caption(), "this is an audio");
+  CHECK_EQ(audio2read.num_records(), num_steps);
+}
+
 TEST(Histogram, AddRecord) {
   const auto dir = "./tmp/sdk_test.histogram";
   LogWriter writer__(dir, 1);